diff --git a/biome.json b/biome.json index 600b130..d2510ac 100644 --- a/biome.json +++ b/biome.json @@ -4,7 +4,11 @@ "linter": { "enabled": true, "rules": { - "recommended": true + "recommended": true, + "style": { + "noNonNullAssertion": "off", + "noInferrableTypes": "off" + } } }, "formatter": { diff --git a/playground/index.html b/playground/index.html index 2004305..22a76f1 100644 --- a/playground/index.html +++ b/playground/index.html @@ -116,6 +116,36 @@

ensemble

RandomForest, GradientBoosting, AdaBoost

πŸ• Pending +
+

feature_extraction.text

+

CountVectorizer, TfidfVectorizer, HashingVectorizer

+ ✅ Implemented +
+
+

kernel_approximation

+

RBFSampler, Nystroem, AdditiveChi2Sampler

+ ✅ Implemented +
+
+

covariance

+

EmpiricalCovariance, ShrunkCovariance, LedoitWolf, OAS

+ ✅ Implemented +
+
+

cross_decomposition

+

PLSRegression, PLSSVD

+ ✅ Implemented +
+
+

preprocessing (extended)

+

PowerTransformer, QuantileTransformer, Binarizer, FunctionTransformer

+ ✅ Implemented +
+
+

decomposition (extended)

+

IncrementalPCA, KernelPCA, FactorAnalysis

+ ✅ Implemented +
diff --git a/src/bicluster/bicluster.ts b/src/bicluster/bicluster.ts new file mode 100644 index 0000000..37d9c59 --- /dev/null +++ b/src/bicluster/bicluster.ts @@ -0,0 +1,214 @@ +/** + * Biclustering algorithms: SpectralBiclustering and SpectralCoclustering. + * Port of sklearn.cluster.bicluster + */ + +import { NotFittedError } from "../exceptions.js"; + +function svd2( + matrix: Float64Array[], + nComponents: number, +): { U: Float64Array[]; S: Float64Array; Vt: Float64Array[] } { + const m = matrix.length; + const n = matrix[0]?.length ?? 0; + const k = Math.min(nComponents, Math.min(m, n)); + const U: Float64Array[] = Array.from({ length: m }, () => new Float64Array(k)); + const S = new Float64Array(k); + const Vt: Float64Array[] = Array.from({ length: k }, () => new Float64Array(n)); + for (let c = 0; c < k; c++) { + let v = new Float64Array(n); + v[c % n] = 1; + for (let _iter = 0; _iter < 30; _iter++) { + const u = new Float64Array(m); + for (let i = 0; i < m; i++) { + for (let j = 0; j < n; j++) u[i] += (matrix[i]?.[j] ?? 0) * (v[j] ?? 0); + } + const newV = new Float64Array(n); + for (let i = 0; i < m; i++) { + for (let j = 0; j < n; j++) newV[j] += (matrix[i]?.[j] ?? 0) * (u[i] ?? 0); + } + let norm = 0; + for (let j = 0; j < n; j++) norm += (newV[j] ?? 0) ** 2; + norm = Math.sqrt(norm); + if (norm < 1e-12) break; + for (let j = 0; j < n; j++) v[j] = (newV[j] ?? 0) / norm; + } + const u = new Float64Array(m); + for (let i = 0; i < m; i++) { + for (let j = 0; j < n; j++) u[i] += (matrix[i]?.[j] ?? 0) * (v[j] ?? 0); + } + let sigma = 0; + for (let i = 0; i < m; i++) sigma += (u[i] ?? 0) ** 2; + sigma = Math.sqrt(sigma); + S[c] = sigma; + if (sigma > 1e-12) { + for (let i = 0; i < m; i++) U[i]![c] = (u[i] ?? 0) / sigma; + } + for (let j = 0; j < n; j++) Vt[c]![j] = v[j] ?? 0; + for (let i = 0; i < m; i++) { + for (let j = 0; j < n; j++) { + matrix[i]![j] = (matrix[i]?.[j] ?? 0) - (U[i]?.[c] ?? 0) * sigma * (Vt[c]?.[j] ?? 
0); + } + } + } + return { U, S, Vt }; +} + +function kmeansSimple(X: Float64Array[], k: number, maxIter = 100): Int32Array { + const n = X.length; + const d = X[0]?.length ?? 0; + const labels = new Int32Array(n); + const centers: Float64Array[] = Array.from({ length: k }, (_, i) => (X[i % n] ?? new Float64Array(d)).slice()); + for (let _iter = 0; _iter < maxIter; _iter++) { + let changed = false; + for (let i = 0; i < n; i++) { + let best = 0; + let bestDist = Number.POSITIVE_INFINITY; + for (let j = 0; j < k; j++) { + let dist = 0; + for (let l = 0; l < d; l++) { + const diff = (X[i]?.[l] ?? 0) - (centers[j]?.[l] ?? 0); + dist += diff * diff; + } + if (dist < bestDist) { bestDist = dist; best = j; } + } + if (labels[i] !== best) { labels[i] = best; changed = true; } + } + if (!changed) break; + const counts = new Int32Array(k); + for (let j = 0; j < k; j++) centers[j] = new Float64Array(d); + for (let i = 0; i < n; i++) { + const c = labels[i]!; + counts[c]++; + for (let l = 0; l < d; l++) centers[c]![l]! += X[i]?.[l] ?? 0; + } + for (let j = 0; j < k; j++) { + if ((counts[j] ?? 0) > 0) { + for (let l = 0; l < d; l++) centers[j]![l]! /= counts[j]!; + } + } + } + return labels; +} + +export interface SpectralBiclusteringParams { + nClusters?: number | [number, number]; + method?: "bistochastic" | "scale" | "log"; + nComponents?: number; + nInit?: number; +} + +/** Spectral biclustering. Port of sklearn.cluster.SpectralBiclustering */ +export class SpectralBiclustering { + nClusters: number | [number, number]; + method: string; + nComponents: number; + nInit: number; + rowLabels_?: Int32Array; + columnLabels_?: Int32Array; + biclusters_?: [Int32Array, Int32Array][]; + + constructor(params: SpectralBiclusteringParams = {}) { + this.nClusters = params.nClusters ?? 3; + this.method = params.method ?? "bistochastic"; + this.nComponents = params.nComponents ?? 6; + this.nInit = params.nInit ?? 
10; + } + + fit(X: Float64Array[]): this { + const nRows = X.length; + const nCols = X[0]?.length ?? 0; + const [nRowClusters, nColClusters] = Array.isArray(this.nClusters) + ? this.nClusters + : [this.nClusters, this.nClusters]; + const normalized = X.map((row) => row.slice()); + const k = Math.min(this.nComponents, Math.min(nRows, nCols)); + const { U, Vt } = svd2(normalized, k); + const rowVecs = U.slice(0, nRows); + const colVecs = Array.from({ length: nCols }, (_, j) => { + const v = new Float64Array(k); + for (let c = 0; c < k; c++) v[c] = Vt[c]?.[j] ?? 0; + return v; + }); + this.rowLabels_ = kmeansSimple(rowVecs, nRowClusters, 100); + this.columnLabels_ = kmeansSimple(colVecs, nColClusters, 100); + this.biclusters_ = []; + for (let r = 0; r < nRowClusters; r++) { + for (let c = 0; c < nColClusters; c++) { + const rowIdx = Array.from({ length: nRows }, (_, i) => i).filter((i) => this.rowLabels_![i] === r); + const colIdx = Array.from({ length: nCols }, (_, j) => j).filter((j) => this.columnLabels_![j] === c); + this.biclusters_.push([new Int32Array(rowIdx), new Int32Array(colIdx)]); + } + } + return this; + } + + getBicluster(i: number): [Int32Array, Int32Array] { + if (!this.biclusters_) throw new NotFittedError("SpectralBiclustering"); + return this.biclusters_[i]!; + } +} + +export interface SpectralCoclusteringParams { + nClusters?: number; + nSvdVecs?: number | null; + nInit?: number; +} + +/** Spectral co-clustering. Port of sklearn.cluster.SpectralCoclustering */ +export class SpectralCoclustering { + nClusters: number; + nInit: number; + rowLabels_?: Int32Array; + columnLabels_?: Int32Array; + biclusters_?: [Int32Array, Int32Array][]; + + constructor(params: SpectralCoclusteringParams = {}) { + this.nClusters = params.nClusters ?? 3; + this.nInit = params.nInit ?? 10; + } + + fit(X: Float64Array[]): this { + const nRows = X.length; + const nCols = X[0]?.length ?? 
0; + const k = this.nClusters; + const rowSums = new Float64Array(nRows); + const colSums = new Float64Array(nCols); + for (let i = 0; i < nRows; i++) { + for (let j = 0; j < nCols; j++) { + rowSums[i] += X[i]?.[j] ?? 0; + colSums[j] += X[i]?.[j] ?? 0; + } + } + const normalized = X.map((row, i) => { + const nr = new Float64Array(nCols); + const rs = Math.sqrt(rowSums[i]! || 1); + for (let j = 0; j < nCols; j++) { + const cs = Math.sqrt(colSums[j]! || 1); + nr[j] = (row[j] ?? 0) / (rs * cs); + } + return nr; + }); + const { U, Vt } = svd2(normalized, k + 1); + const rowVecs = U.slice(0, nRows).map((u) => u.slice(1)); + const colVecs = Array.from({ length: nCols }, (_, j) => { + const v = new Float64Array(k); + for (let c = 1; c <= k; c++) v[c - 1] = Vt[c]?.[j] ?? 0; + return v; + }); + this.rowLabels_ = kmeansSimple(rowVecs, k, 100); + this.columnLabels_ = kmeansSimple(colVecs, k, 100); + this.biclusters_ = []; + for (let c = 0; c < k; c++) { + const rowIdx = Array.from({ length: nRows }, (_, i) => i).filter((i) => this.rowLabels_![i] === c); + const colIdx = Array.from({ length: nCols }, (_, j) => j).filter((j) => this.columnLabels_![j] === c); + this.biclusters_.push([new Int32Array(rowIdx), new Int32Array(colIdx)]); + } + return this; + } + + getBicluster(i: number): [Int32Array, Int32Array] { + if (!this.biclusters_) throw new NotFittedError("SpectralCoclustering"); + return this.biclusters_[i]!; + } +} diff --git a/src/bicluster/bicluster_ext.ts b/src/bicluster/bicluster_ext.ts new file mode 100644 index 0000000..9d04493 --- /dev/null +++ b/src/bicluster/bicluster_ext.ts @@ -0,0 +1,133 @@ +/** + * Bicluster extensions: SpectralCoClustering, BiclusterMixin utilities. 
+ */ + +export class SpectralCoClustering { + rowLabels_: Int32Array = new Int32Array(0); + columnLabels_: Int32Array = new Int32Array(0); + biclusters_: Array<[boolean[], boolean[]]> = []; + + constructor( + private readonly nClusters = 3, + private readonly svdMethod: "randomized" | "arpack" = "randomized", + private readonly seed = 42 + ) { + void this.svdMethod; + } + + fit(X: Float64Array[]): this { + const n = X.length; + const m = X[0]?.length ?? 1; + // Normalize: D_row^(-1/2) X D_col^(-1/2) + const rowSums = X.map((row) => Math.sqrt(Math.max(row.reduce((a, b) => a + b, 0), 1e-10))); + const colSums = new Float64Array(m); + for (const row of X) for (let j = 0; j < m; j++) colSums[j] = (colSums[j] ?? 0) + (row[j] ?? 0); + for (let j = 0; j < m; j++) colSums[j] = Math.sqrt(Math.max(colSums[j] ?? 1, 1e-10)); + const An = X.map((row, i) => new Float64Array(row.map((v, j) => v / Math.max(rowSums[i] ?? 1, 1e-10) / Math.max(colSums[j] ?? 1, 1e-10)))); + // SVD (simplified: power iteration) + const nVecs = this.nClusters - 1; + const rng = this._seededRng(this.seed); + const rowVecs: Float64Array[] = []; + const colVecs: Float64Array[] = []; + for (let k = 0; k < nVecs; k++) { + let v = new Float64Array(m).map(() => rng() - 0.5); + // Power iteration for singular vector + for (let iter = 0; iter < 20; iter++) { + // u = A * v + const u = new Float64Array(n); + for (let i = 0; i < n; i++) for (let j = 0; j < m; j++) u[i] = (u[i] ?? 0) + (An[i]?.[j] ?? 0) * (v[j] ?? 0); + const uNorm = Math.sqrt(u.reduce((a, b) => a + b * b, 0)); + for (let i = 0; i < n; i++) u[i] = (u[i] ?? 0) / Math.max(uNorm, 1e-10); + // v = A^T * u + v = new Float64Array(m); + for (let j = 0; j < m; j++) for (let i = 0; i < n; i++) v[j] = (v[j] ?? 0) + (An[i]?.[j] ?? 0) * (u[i] ?? 0); + const vNorm = Math.sqrt(v.reduce((a, b) => a + b * b, 0)); + for (let j = 0; j < m; j++) v[j] = (v[j] ?? 
0) / Math.max(vNorm, 1e-10); + // Deflate + for (const ov of rowVecs) { + let dot = 0; + for (let i = 0; i < n; i++) dot += (ov[i] ?? 0) * (u[i] ?? 0); + for (let i = 0; i < n; i++) u[i] = (u[i] ?? 0) - dot * (ov[i] ?? 0); + } + } + // Compute row vector: An * v + const rowVec = new Float64Array(n); + for (let i = 0; i < n; i++) for (let j = 0; j < m; j++) rowVec[i] = (rowVec[i] ?? 0) + (An[i]?.[j] ?? 0) * (v[j] ?? 0); + rowVecs.push(rowVec); + colVecs.push(v); + } + // K-means on row/col concatenated vectors + this.rowLabels_ = this._kmeans(rowVecs.length > 0 ? X.map((_, i) => new Float64Array(rowVecs.map((rv) => rv[i] ?? 0))) : X.map(() => new Float64Array(1).fill(0))); + this.columnLabels_ = this._kmeans(Array.from({ length: m }, (_, j) => new Float64Array(colVecs.map((cv) => cv[j] ?? 0)))); + // Build biclusters + this.biclusters_ = Array.from({ length: this.nClusters }, (_, k) => { + const rowMask = Array.from({ length: n }, (__, i) => this.rowLabels_[i] === k); + const colMask = Array.from({ length: m }, (__, j) => this.columnLabels_[j] === k); + return [rowMask, colMask] as [boolean[], boolean[]]; + }); + return this; + } + + private _kmeans(X: Float64Array[]): Int32Array { + const n = X.length; + const k = this.nClusters; + const rng = this._seededRng(this.seed + 1); + let centers = Array.from({ length: k }, () => X[Math.floor(rng() * n)] ?? new Float64Array(1)); + let labels = new Int32Array(n); + for (let iter = 0; iter < 50; iter++) { + const newLabels = new Int32Array(n); + for (let i = 0; i < n; i++) { + let best = 0, bestD = Number.POSITIVE_INFINITY; + for (let c = 0; c < k; c++) { + let d = 0; + const xi = X[i]!; + const ci = centers[c]!; + for (let f = 0; f < xi.length; f++) d += ((xi[f] ?? 0) - (ci[f] ?? 0)) ** 2; + if (d < bestD) { bestD = d; best = c; } + } + newLabels[i] = best; + } + // Update centers + const nF = X[0]?.length ?? 
1; + const newCenters = Array.from({ length: k }, () => ({ sum: new Float64Array(nF), cnt: 0 })); + for (let i = 0; i < n; i++) { + const c = newLabels[i]!; + newCenters[c]!.cnt++; + const xi = X[i]!; + for (let f = 0; f < nF; f++) newCenters[c]!.sum[f] = (newCenters[c]!.sum[f] ?? 0) + (xi[f] ?? 0); + } + centers = newCenters.map((nc) => new Float64Array(nc.sum.map((v) => v / Math.max(nc.cnt, 1)))); + const changed = newLabels.some((l, i) => l !== labels[i]); + labels = newLabels; + if (!changed) break; + } + return labels; + } + + private _seededRng(seed: number): () => number { + let s = seed; + return () => { s = (s * 1664525 + 1013904223) & 0xffffffff; return (s >>> 0) / 0xffffffff; }; + } + + getBicluster(i: number): [boolean[], boolean[]] { + return this.biclusters_[i] ?? [[], []]; + } +} + +export class SpectralBiclusteringExt { + rowLabels_: Int32Array = new Int32Array(0); + columnLabels_: Int32Array = new Int32Array(0); + + constructor(private readonly nClusters: [number, number] | number = [3, 3]) {} + + fit(X: Float64Array[]): this { + const nRowClusters = Array.isArray(this.nClusters) ? this.nClusters[0]! : this.nClusters; + const nColClusters = Array.isArray(this.nClusters) ? this.nClusters[1]! 
: this.nClusters; + const coClust = new SpectralCoClustering(Math.max(nRowClusters, nColClusters)); + coClust.fit(X); + // Remap to correct number of clusters + this.rowLabels_ = new Int32Array(coClust.rowLabels_.map((l) => l % nRowClusters)); + this.columnLabels_ = new Int32Array(coClust.columnLabels_.map((l) => l % nColClusters)); + return this; + } +} diff --git a/src/bicluster/index.ts b/src/bicluster/index.ts new file mode 100644 index 0000000..50ad235 --- /dev/null +++ b/src/bicluster/index.ts @@ -0,0 +1 @@ +export * from "./bicluster.js"; diff --git a/src/bicluster/spectral_bicluster_ext.ts b/src/bicluster/spectral_bicluster_ext.ts new file mode 100644 index 0000000..32ed364 --- /dev/null +++ b/src/bicluster/spectral_bicluster_ext.ts @@ -0,0 +1,150 @@ +/** + * Extended biclustering utilities: consensus biclustering, evaluation metrics. + * Port of sklearn.cluster.bicluster extensions. + */ + +import { NotFittedError } from "../exceptions.js"; + +/** Compute the consensus score between two sets of biclusters. */ +export function consensusScore( + a: { rowLabels: Int32Array; colLabels: Int32Array }, + b: { rowLabels: Int32Array; colLabels: Int32Array }, +): number { + const nRows = a.rowLabels.length; + const nCols = a.colLabels.length; + const aRows = new Set(); + const bRows = new Set(); + for (let i = 0; i < nRows; i++) { + if ((a.rowLabels[i] ?? 0) === 1) aRows.add(i); + if ((b.rowLabels[i] ?? 0) === 1) bRows.add(i); + } + const aCols = new Set(); + const bCols = new Set(); + for (let j = 0; j < nCols; j++) { + if ((a.colLabels[j] ?? 0) === 1) aCols.add(j); + if ((b.colLabels[j] ?? 
0) === 1) bCols.add(j); + } + const rowInter = [...aRows].filter((r) => bRows.has(r)).length; + const colInter = [...aCols].filter((c) => bCols.has(c)).length; + const aSize = aRows.size * aCols.size; + const bSize = bRows.size * bCols.size; + if (aSize === 0 || bSize === 0) return 0; + return (rowInter * colInter) / Math.sqrt(aSize * bSize); +} + +/** Check if a biclustering result is non-degenerate (has at least one row and column in each bicluster). */ +export function checkBiclustersNonDegenerate( + rowLabels: Int32Array, + colLabels: Int32Array, + nClusters: number, +): boolean { + for (let k = 0; k < nClusters; k++) { + let rowCount = 0; + let colCount = 0; + for (let i = 0; i < rowLabels.length; i++) { + if ((rowLabels[i] ?? 0) === k) rowCount++; + } + for (let j = 0; j < colLabels.length; j++) { + if ((colLabels[j] ?? 0) === k) colCount++; + } + if (rowCount === 0 || colCount === 0) return false; + } + return true; +} + +/** Bicluster evaluator for measuring residue and volume. */ +export class BiclusterEvaluator { + private rowLabels_: Int32Array | null = null; + private colLabels_: Int32Array | null = null; + private data_: Float64Array[] | null = null; + + fit( + data: Float64Array[], + rowLabels: Int32Array, + colLabels: Int32Array, + ): this { + this.data_ = data; + this.rowLabels_ = rowLabels; + this.colLabels_ = colLabels; + return this; + } + + /** Compute the average residue of a bicluster (lower is better). */ + averageResidue(clusterId: number): number { + if (this.data_ === null || this.rowLabels_ === null || this.colLabels_ === null) { + throw new NotFittedError("BiclusterEvaluator is not fitted."); + } + const rows: number[] = []; + const cols: number[] = []; + for (let i = 0; i < this.rowLabels_.length; i++) { + if ((this.rowLabels_[i] ?? 0) === clusterId) rows.push(i); + } + for (let j = 0; j < this.colLabels_.length; j++) { + if ((this.colLabels_[j] ?? 
0) === clusterId) cols.push(j); + } + if (rows.length === 0 || cols.length === 0) return 0; + let grandMean = 0; + for (const i of rows) { + for (const j of cols) { + grandMean += this.data_[i]?.[j] ?? 0; + } + } + grandMean /= rows.length * cols.length; + const rowMeans = rows.map((i) => { + let s = 0; + for (const j of cols) s += this.data_![i]?.[j] ?? 0; + return s / cols.length; + }); + const colMeans = cols.map((j) => { + let s = 0; + for (const i of rows) s += this.data_![i]?.[j] ?? 0; + return s / rows.length; + }); + let residue = 0; + for (let ri = 0; ri < rows.length; ri++) { + for (let ci = 0; ci < cols.length; ci++) { + const val = this.data_[rows[ri]!]?.[cols[ci]!] ?? 0; + const r = + val - + (rowMeans[ri] ?? 0) - + (colMeans[ci] ?? 0) + + grandMean; + residue += r * r; + } + } + return residue / (rows.length * cols.length); + } +} + +/** Generate a checkerboard matrix for testing biclustering algorithms. */ +export function makeCheckerboard( + shape: [number, number], + nClusters: [number, number], + noise = 0.0, + seed = 0, +): { data: Float64Array[]; rowLabels: Int32Array; colLabels: Int32Array } { + const [nRows, nCols] = shape; + const [nRowClusters, nColClusters] = nClusters; + const rowLabels = new Int32Array(nRows); + const colLabels = new Int32Array(nCols); + for (let i = 0; i < nRows; i++) { + rowLabels[i] = i % nRowClusters; + } + for (let j = 0; j < nCols; j++) { + colLabels[j] = j % nColClusters; + } + let rng = seed; + const rand = (): number => { + rng = (rng * 1664525 + 1013904223) & 0xffffffff; + return (rng >>> 0) / 0xffffffff; + }; + const data: Float64Array[] = Array.from({ length: nRows }, (_, i) => { + const row = new Float64Array(nCols); + for (let j = 0; j < nCols; j++) { + const same = (rowLabels[i] ?? 0) === (colLabels[j] ?? 0) % nRowClusters ? 
1 : 0; + row[j] = same + noise * (rand() - 0.5); + } + return row; + }); + return { data, rowLabels, colLabels }; +} diff --git a/src/calibration/calibration.ts b/src/calibration/calibration.ts new file mode 100644 index 0000000..948aa5f --- /dev/null +++ b/src/calibration/calibration.ts @@ -0,0 +1,141 @@ +/** + * Probability calibration. + * Mirrors sklearn.calibration.CalibratedClassifierCV. + * Uses Platt scaling (logistic) or isotonic regression for calibration. + */ + +import { NotFittedError } from "../exceptions.js"; + +interface Classifier { + fit(X: Float64Array[], y: Float64Array): this; + predict(X: Float64Array[]): Float64Array; + score?(X: Float64Array[], y: Float64Array): number; +} + +function sigmoid(x: number): number { + return 1 / (1 + Math.exp(-x)); +} + +/** Platt scaling: fit a logistic function on scores to map to probabilities. */ +function plattScale(scores: Float64Array, y: Float64Array): [number, number] { + const n = scores.length; + let A = 0; + let B = 0; + const lr = 0.01; + + for (let iter = 0; iter < 1000; iter++) { + let gradA = 0; + let gradB = 0; + for (let i = 0; i < n; i++) { + const p = sigmoid(A * (scores[i] ?? 0) + B); + const err = p - (y[i] ?? 0); + gradA += err * (scores[i] ?? 0); + gradB += err; + } + A -= lr * gradA / n; + B -= lr * gradB / n; + } + + return [A, B]; +} + +export class CalibratedClassifierCV { + baseEstimator: Classifier; + method: string; + cv: number; + + calibratedEstimators_: { + estimator: Classifier; + A: number; + B: number; + }[] | null = null; + classes_: Float64Array | null = null; + + constructor( + baseEstimator: Classifier, + options: { method?: string; cv?: number } = {}, + ) { + this.baseEstimator = baseEstimator; + this.method = options.method ?? "sigmoid"; + this.cv = options.cv ?? 
5; + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const uniqueClasses = Array.from(new Set(Array.from(y))).sort((a, b) => a - b); + this.classes_ = new Float64Array(uniqueClasses); + const posClass = uniqueClasses[uniqueClasses.length - 1] ?? 1; + + const yBin = new Float64Array(y.map((yi) => (yi === posClass ? 1 : 0))); + + // Simple hold-out calibration + const foldSize = Math.floor(n / this.cv); + this.calibratedEstimators_ = []; + + for (let fold = 0; fold < this.cv; fold++) { + const testStart = fold * foldSize; + const testEnd = fold === this.cv - 1 ? n : testStart + foldSize; + + const trainIdx: number[] = []; + const testIdx: number[] = []; + for (let i = 0; i < n; i++) { + if (i >= testStart && i < testEnd) testIdx.push(i); + else trainIdx.push(i); + } + + const XTrain = trainIdx.map((i) => X[i] ?? new Float64Array(0)); + const yTrain = new Float64Array(trainIdx.map((i) => y[i] ?? 0)); + const XTest = testIdx.map((i) => X[i] ?? new Float64Array(0)); + const yTest = new Float64Array(testIdx.map((i) => yBin[i] ?? 0)); + + const est = Object.create(Object.getPrototypeOf(this.baseEstimator) as object) as Classifier; + Object.assign(est, this.baseEstimator); + est.fit(XTrain, yTrain); + + const testPred = est.predict(XTest); + const [A, B] = plattScale(testPred, yTest); + + this.calibratedEstimators_.push({ estimator: est, A, B }); + } + + return this; + } + + predictProba(X: Float64Array[]): Float64Array[] { + if (this.calibratedEstimators_ === null) throw new NotFittedError("CalibratedClassifierCV"); + + const n = X.length; + const probs = new Float64Array(n); + + for (const { estimator, A, B } of this.calibratedEstimators_) { + const scores = estimator.predict(X); + for (let i = 0; i < n; i++) { + probs[i] = (probs[i] ?? 0) + sigmoid(A * (scores[i] ?? 0) + B); + } + } + + const k = this.calibratedEstimators_.length; + return Array.from({ length: n }, (_, i) => { + const p = (probs[i] ?? 
0) / k; + return new Float64Array([1 - p, p]); + }); + } + + predict(X: Float64Array[]): Float64Array { + if (this.classes_ === null) throw new NotFittedError("CalibratedClassifierCV"); + const classes = this.classes_; + const proba = this.predictProba(X); + const posClass = classes[classes.length - 1] ?? 1; + const negClass = classes[0] ?? 0; + return new Float64Array(proba.map((p) => ((p[1] ?? 0) >= 0.5 ? posClass : negClass))); + } + + score(X: Float64Array[], y: Float64Array): number { + const pred = this.predict(X); + let correct = 0; + for (let i = 0; i < y.length; i++) { + if (pred[i] === y[i]) correct++; + } + return correct / y.length; + } +} diff --git a/src/calibration/calibration_ext.ts b/src/calibration/calibration_ext.ts new file mode 100644 index 0000000..ed4b883 --- /dev/null +++ b/src/calibration/calibration_ext.ts @@ -0,0 +1,183 @@ +/** + * Calibration extensions: TemperatureScaling, PlattScaling, BetaCalibration. + */ + +export class TemperatureScaling { + private temperature = 1.0; + + fit(logits: Float64Array[], y: Int32Array, maxIter = 100): this { + let T = 1.0; + const lr = 0.01; + for (let iter = 0; iter < maxIter; iter++) { + let grad = 0; + for (let i = 0; i < logits.length; i++) { + const scaled = (logits[i]![0] ?? 0) / T; + const p = 1 / (1 + Math.exp(-scaled)); + const yi = y[i] ?? 0; + grad += (p - yi) * (-scaled / T); + } + T = Math.max(0.01, T - lr * grad / Math.max(logits.length, 1)); + } + this.temperature = T; + return this; + } + + calibrate(logits: Float64Array[]): Float64Array { + return new Float64Array(logits.map((l) => { + const scaled = (l[0] ?? 
0) / this.temperature; + return 1 / (1 + Math.exp(-scaled)); + })); + } + + getTemperature(): number { return this.temperature; } +} + +export class PlattScaling { + private a = 0; + private b = 0; + + fit(scores: Float64Array, y: Int32Array, maxIter = 100): this { + const n = scores.length; + const hiTarget = (n + 1) / (n + 2); + const loTarget = 1 / (n + 2); + let a = 0; + let b = Math.log((n + 1) / n); + let fApB: number; + for (let iter = 0; iter < maxIter; iter++) { + let h11 = 0, h22 = 0, h21 = 0, g1 = 0, g2 = 0; + for (let i = 0; i < n; i++) { + const s = scores[i] ?? 0; + const ti = (y[i] ?? 0) === 1 ? hiTarget : loTarget; + fApB = s * a + b; + let p: number, q: number; + if (fApB >= 0) { + p = Math.exp(-fApB) / (1 + Math.exp(-fApB)); + q = 1 / (1 + Math.exp(-fApB)); + } else { + p = 1 / (1 + Math.exp(fApB)); + q = Math.exp(fApB) / (1 + Math.exp(fApB)); + } + const d2 = p * q; + h11 += s * s * d2; + h22 += d2; + h21 += s * d2; + const d1 = ti - p; + g1 += s * d1; + g2 += d1; + } + const det = h11 * h22 - h21 * h21; + if (Math.abs(det) < 1e-10) break; + const dA = -(h22 * g1 - h21 * g2) / det; + const dB = -(-h21 * g1 + h11 * g2) / det; + let stepsize = 1.0; + while (stepsize >= 1e-10) { + const newA = a + stepsize * dA; + const newB = b + stepsize * dB; + let newF = 0; + for (let i = 0; i < n; i++) { + const s = scores[i] ?? 0; + const ti = (y[i] ?? 0) === 1 ? hiTarget : loTarget; + fApB = s * newA + newB; + newF += fApB >= 0 + ? ti * fApB + Math.log(1 + Math.exp(-fApB)) + : (ti - 1) * fApB + Math.log(1 + Math.exp(fApB)); + } + if (newF < 1e-10) { a = newA; b = newB; break; } + stepsize /= 2; + } + } + this.a = a; + this.b = b; + return this; + } + + calibrate(scores: Float64Array): Float64Array { + return new Float64Array(scores.map((s) => { + const fApB = s * this.a + this.b; + return fApB >= 0 + ? 
Math.exp(-fApB) / (1 + Math.exp(-fApB)) + : 1 / (1 + Math.exp(fApB)); + })); + } +} + +export class BetaCalibration { + private a = 1.0; + private b = 1.0; + private c = 0.0; + + fit(scores: Float64Array, y: Int32Array): this { + const eps = 1e-7; + let sumA = 0, sumB = 0, sumC = 0; + const n = scores.length; + for (let i = 0; i < n; i++) { + const s = Math.max(eps, Math.min(1 - eps, scores[i] ?? 0)); + const yi = y[i] ?? 0; + sumA += yi * Math.log(s); + sumB += yi * Math.log(1 - s); + sumC += yi; + } + this.a = Math.max(0.01, sumA / Math.max(n, 1)); + this.b = Math.max(0.01, -sumB / Math.max(n, 1)); + this.c = sumC / Math.max(n, 1); + return this; + } + + calibrate(scores: Float64Array): Float64Array { + const eps = 1e-7; + return new Float64Array(scores.map((s) => { + const sc = Math.max(eps, Math.min(1 - eps, s)); + const logOdds = this.a * Math.log(sc) - this.b * Math.log(1 - sc) + this.c; + return 1 / (1 + Math.exp(-logOdds)); + })); + } +} + +export class IsotonicCalibration { + private xs: Float64Array = new Float64Array(0); + private ys: Float64Array = new Float64Array(0); + + fit(scores: Float64Array, y: Int32Array): this { + const n = scores.length; + const idx = Array.from({ length: n }, (_, i) => i).sort((a, b) => (scores[a] ?? 0) - (scores[b] ?? 0)); + const sortedX = new Float64Array(idx.map((i) => scores[i] ?? 0)); + const sortedY = new Float64Array(idx.map((i) => y[i] ?? 0)); + // Pool adjacent violators + const pooled = Array.from({ length: n }, (_, i) => ({ x: sortedX[i] ?? 0, y: sortedY[i] ?? 
0, cnt: 1 })); + let changed = true; + while (changed) { + changed = false; + for (let i = 0; i < pooled.length - 1; i++) { + const a = pooled[i]; + const b = pooled[i + 1]; + if (a !== undefined && b !== undefined && a.y > b.y) { + const newY = (a.y * a.cnt + b.y * b.cnt) / (a.cnt + b.cnt); + a.y = newY; + a.cnt += b.cnt; + pooled.splice(i + 1, 1); + changed = true; + } + } + } + this.xs = new Float64Array(pooled.map((p) => p.x)); + this.ys = new Float64Array(pooled.map((p) => p.y)); + return this; + } + + calibrate(scores: Float64Array): Float64Array { + return new Float64Array(scores.map((s) => { + if (this.xs.length === 0) return s; + if (s <= (this.xs[0] ?? 0)) return this.ys[0] ?? 0; + if (s >= (this.xs[this.xs.length - 1] ?? 0)) return this.ys[this.ys.length - 1] ?? 0; + for (let i = 0; i < this.xs.length - 1; i++) { + if (s >= (this.xs[i] ?? 0) && s <= (this.xs[i + 1] ?? 0)) { + const dx = (this.xs[i + 1] ?? 0) - (this.xs[i] ?? 0); + if (Math.abs(dx) < 1e-10) return this.ys[i] ?? 0; + const t = (s - (this.xs[i] ?? 0)) / dx; + return (this.ys[i] ?? 0) + t * ((this.ys[i + 1] ?? 0) - (this.ys[i] ?? 0)); + } + } + return s; + })); + } +} diff --git a/src/calibration/calibration_ext3.ts b/src/calibration/calibration_ext3.ts new file mode 100644 index 0000000..ea46498 --- /dev/null +++ b/src/calibration/calibration_ext3.ts @@ -0,0 +1,189 @@ +/** + * Calibration extensions: TemperatureScaling, BetaCalibration, VennAbersCalibrator + * Port of sklearn.calibration extensions + */ + +import { NotFittedError } from "../exceptions.js"; + +export class TemperatureScaling { + maxIter: number; + lr: number; + + private temperature_ = 1.0; + + constructor(opts: { maxIter?: number; lr?: number } = {}) { + this.maxIter = opts.maxIter ?? 100; + this.lr = opts.lr ?? 0.01; + } + + private softmax(logits: Float64Array, temperature: number): Float64Array { + const scaled = logits.map(v => (v ?? 
0) / temperature); + const max = scaled.reduce((a, b) => Math.max(a, b), -Number.POSITIVE_INFINITY); + const exps = scaled.map(v => Math.exp((v ?? 0) - max)); + const sum = exps.reduce((a, b) => a + b, 0); + return Float64Array.from(exps.map(v => v / (sum + 1e-15))); + } + + fit(logits: Float64Array[], yTrue: Int32Array): this { + let t = this.temperature_; + const n = logits.length; + for (let iter = 0; iter < this.maxIter; iter++) { + let gradient = 0; + for (let i = 0; i < n; i++) { + const probs = this.softmax(logits[i]!, t); + const k = yTrue[i] ?? 0; + const pk = probs[k] ?? 1e-15; + const logit_k = (logits[i]![k] ?? 0) / t; + const expectedLogit = probs.reduce((s, pj, j) => s + (pj ?? 0) * ((logits[i]![j] ?? 0) / t), 0); + gradient += (logit_k - expectedLogit) * (-1 / t); + void pk; + } + gradient /= n; + t = t - this.lr * gradient; + t = Math.max(0.01, t); + void iter; + } + this.temperature_ = t; + return this; + } + + predict(logits: Float64Array[]): Float64Array[] { + if (this.temperature_ === null) throw new NotFittedError("TemperatureScaling not fitted."); + return logits.map(l => this.softmax(l, this.temperature_)); + } + + get temperature(): number { return this.temperature_; } +} + +export class BetaCalibration { + private a_ = 1.0; + private b_ = 1.0; + private c_ = 0.0; + + fit(scores: Float64Array, yTrue: Int32Array): this { + const n = scores.length; + let a = 1.0; + let b = 1.0; + let c = 0.0; + for (let iter = 0; iter < 100; iter++) { + let dA = 0; + let dB = 0; + let dC = 0; + for (let i = 0; i < n; i++) { + const x = Math.max(1e-15, Math.min(1 - 1e-15, scores[i] ?? 0.5)); + const logx = Math.log(x); + const log1mx = Math.log(1 - x); + const logit = a * logx - b * log1mx + c; + const p = 1 / (1 + Math.exp(-logit)); + const err = (yTrue[i] ?? 
0) - p; + dA += err * logx; + dB += err * (-log1mx); + dC += err; + } + a += 0.001 * dA / n; + b += 0.001 * dB / n; + c += 0.001 * dC / n; + a = Math.max(0.01, a); + b = Math.max(0.01, b); + void iter; + } + this.a_ = a; + this.b_ = b; + this.c_ = c; + return this; + } + + predict(scores: Float64Array): Float64Array { + return Float64Array.from(scores.map(x => { + const xClamped = Math.max(1e-15, Math.min(1 - 1e-15, x ?? 0.5)); + const logit = this.a_ * Math.log(xClamped) - this.b_ * Math.log(1 - xClamped) + this.c_; + return 1 / (1 + Math.exp(-logit)); + })); + } +} + +export class IsotonicCalibratorExt { + private isotonic_: Float64Array | null = null; + private thresholds_: Float64Array | null = null; + + fit(scores: Float64Array, yTrue: Int32Array): this { + const n = scores.length; + const pairs = Array.from({ length: n }, (_, i) => ({ score: scores[i] ?? 0, label: yTrue[i] ?? 0 })); + pairs.sort((a, b) => a.score - b.score); + const sortedScores = Float64Array.from(pairs.map(p => p.score)); + const sortedLabels = Float64Array.from(pairs.map(p => p.label)); + const fitted = sortedLabels.slice(); + let changed = true; + while (changed) { + changed = false; + for (let i = 0; i < n - 1; i++) { + if ((fitted[i] ?? 0) > (fitted[i + 1] ?? 0)) { + const avg = ((fitted[i] ?? 0) + (fitted[i + 1] ?? 0)) / 2; + fitted[i] = avg; + fitted[i + 1] = avg; + changed = true; + } + } + } + this.thresholds_ = sortedScores; + this.isotonic_ = fitted; + return this; + } + + predict(scores: Float64Array): Float64Array { + if (!this.thresholds_ || !this.isotonic_) throw new NotFittedError("IsotonicCalibratorExt not fitted."); + return Float64Array.from(scores.map(s => { + const n = this.thresholds_!.length; + if ((s ?? 0) <= (this.thresholds_[0] ?? 0)) return this.isotonic_![0] ?? 0; + if ((s ?? 0) >= (this.thresholds_[n - 1] ?? 0)) return this.isotonic_![n - 1] ?? 0; + for (let i = 0; i < n - 1; i++) { + if ((s ?? 0) >= (this.thresholds_[i] ?? 0) && (s ?? 
0) <= (this.thresholds_[i + 1] ?? 0)) { + const t = ((s ?? 0) - (this.thresholds_[i] ?? 0)) / ((this.thresholds_[i + 1] ?? 0) - (this.thresholds_[i] ?? 0) + 1e-15); + return (1 - t) * (this.isotonic_![i] ?? 0) + t * (this.isotonic_![i + 1] ?? 0); + } + } + return this.isotonic_![n - 1] ?? 0; + })); + } +} + +export class CalibratedClassifierCVExt { + method: "sigmoid" | "isotonic" | "temperature"; + cv: number; + + private a_ = 1.0; + private b_ = 0.0; + + constructor(opts: { method?: "sigmoid" | "isotonic" | "temperature"; cv?: number } = {}) { + this.method = opts.method ?? "sigmoid"; + this.cv = opts.cv ?? 5; + } + + fit(scores: Float64Array, yTrue: Int32Array): this { + const n = scores.length; + if (this.method === "sigmoid") { + let a = 1.0; + let b = 0.0; + for (let iter = 0; iter < 200; iter++) { + let da = 0; + let db = 0; + for (let i = 0; i < n; i++) { + const p = 1 / (1 + Math.exp(-(a * (scores[i] ?? 0) + b))); + const err = (yTrue[i] ?? 0) - p; + da += err * (scores[i] ?? 0); + db += err; + } + a += 0.01 * da / n; + b += 0.01 * db / n; + void iter; + } + this.a_ = a; + this.b_ = b; + } + return this; + } + + predict(scores: Float64Array): Float64Array { + return Float64Array.from(scores.map(s => 1 / (1 + Math.exp(-(this.a_ * (s ?? 0) + this.b_))))); + } +} diff --git a/src/calibration/calibration_ext4.ts b/src/calibration/calibration_ext4.ts new file mode 100644 index 0000000..4f09c8a --- /dev/null +++ b/src/calibration/calibration_ext4.ts @@ -0,0 +1,157 @@ +/** + * Calibration extensions: histogram binning, isotonic calibration. + * Port of sklearn.calibration extensions. + */ + +import { NotFittedError } from "../exceptions.js"; + +/** Calibration curve (reliability diagram) computation. 
*/ +export function calibrationCurveExt( + yTrue: Int32Array, + yProb: Float64Array, + nBins = 5, + strategy: "uniform" | "quantile" = "uniform", +): { fractionPositive: Float64Array; meanPredictedValue: Float64Array; binCounts: Int32Array } { + const n = yTrue.length; + let binEdges: number[]; + if (strategy === "uniform") { + binEdges = Array.from({ length: nBins + 1 }, (_, k) => k / nBins); + } else { + const sorted = Float64Array.from(yProb).sort(); + binEdges = [0]; + for (let k = 1; k < nBins; k++) { + binEdges.push(sorted[Math.floor((k * n) / nBins)] ?? 0); + } + binEdges.push(1); + } + + const fractionPositive = new Float64Array(nBins); + const meanPredictedValue = new Float64Array(nBins); + const binCounts = new Int32Array(nBins); + + for (let i = 0; i < n; i++) { + const p = yProb[i] ?? 0; + let bin = nBins - 1; + for (let k = 0; k < nBins; k++) { + if (p < (binEdges[k + 1] ?? 1)) { + bin = k; + break; + } + } + binCounts[bin]!++; + fractionPositive[bin]! += yTrue[i] ?? 0; + meanPredictedValue[bin]! += p; + } + for (let k = 0; k < nBins; k++) { + if ((binCounts[k] ?? 0) > 0) { + fractionPositive[k]! /= binCounts[k]!; + meanPredictedValue[k]! /= binCounts[k]!; + } + } + return { fractionPositive, meanPredictedValue, binCounts }; +} + +/** Temperature scaling calibration. */ +export class TemperatureScaling { + private temperature_ = 1.0; + private fitted_ = false; + + fit(logits: Float64Array, y: Int32Array): this { + // Find temperature that minimizes NLL on validation data + let bestNll = Number.POSITIVE_INFINITY; + let bestTemp = 1.0; + for (let t = 0.1; t <= 10.0; t += 0.1) { + let nll = 0; + for (let i = 0; i < logits.length; i++) { + const scaled = (logits[i] ?? 0) / t; + const p = 1 / (1 + Math.exp(-scaled)); + const label = (y[i] ?? 0) === 1 ? 
1 : 0; + nll -= label * Math.log(Math.max(p, 1e-15)) + (1 - label) * Math.log(Math.max(1 - p, 1e-15)); + } + nll /= logits.length; + if (nll < bestNll) { + bestNll = nll; + bestTemp = t; + } + } + this.temperature_ = bestTemp; + this.fitted_ = true; + return this; + } + + transform(logits: Float64Array): Float64Array { + if (!this.fitted_) throw new NotFittedError("TemperatureScaling is not fitted."); + return new Float64Array(logits.map((l) => 1 / (1 + Math.exp(-(l / this.temperature_))))); + } + + get temperature(): number { + return this.temperature_; + } +} + +/** Platt scaling (logistic calibration of SVM scores). */ +export class PlattScaling { + private A_ = 0; + private B_ = 0; + private fitted_ = false; + + fit(decisionScores: Float64Array, y: Int32Array): this { + // Fit logistic regression: P(y=1|score) = sigmoid(A*score + B) + const n = decisionScores.length; + // Add Platt's prior correction + const nPos = y.reduce((s, v) => s + (v === 1 ? 1 : 0), 0); + const nNeg = n - nPos; + const tPos = (nPos + 1) / (nPos + 2); + const tNeg = 1 / (nNeg + 2); + + let A = 0; + let B = Math.log((nNeg + 1) / (nPos + 1)); + const lr = 0.001; + for (let iter = 0; iter < 100; iter++) { + let dA = 0; + let dB = 0; + for (let i = 0; i < n; i++) { + const t = (y[i] ?? 0) === 1 ? tPos : tNeg; + const logit = A * (decisionScores[i] ?? 0) + B; + const p = 1 / (1 + Math.exp(-logit)); + const err = p - t; + dA += err * (decisionScores[i] ?? 0); + dB += err; + } + A -= lr * dA / n; + B -= lr * dB / n; + } + this.A_ = A; + this.B_ = B; + this.fitted_ = true; + return this; + } + + transform(decisionScores: Float64Array): Float64Array { + if (!this.fitted_) throw new NotFittedError("PlattScaling is not fitted."); + return new Float64Array( + decisionScores.map((s) => 1 / (1 + Math.exp(-(this.A_ * s + this.B_)))), + ); + } +} + +/** Compute expected calibration error (ECE). 
*/ +export function expectedCalibrationError( + yTrue: Int32Array, + yProb: Float64Array, + nBins = 10, +): number { + const { fractionPositive, meanPredictedValue, binCounts } = calibrationCurveExt( + yTrue, + yProb, + nBins, + ); + const n = yTrue.length; + let ece = 0; + for (let k = 0; k < nBins; k++) { + const cnt = binCounts[k] ?? 0; + if (cnt === 0) continue; + ece += (cnt / n) * Math.abs((fractionPositive[k] ?? 0) - (meanPredictedValue[k] ?? 0)); + } + return ece; +} diff --git a/src/calibration/index.ts b/src/calibration/index.ts new file mode 100644 index 0000000..e03c3f7 --- /dev/null +++ b/src/calibration/index.ts @@ -0,0 +1 @@ +export * from "./calibration.js"; diff --git a/src/cluster/affinity_propagation.ts b/src/cluster/affinity_propagation.ts new file mode 100644 index 0000000..1228a23 --- /dev/null +++ b/src/cluster/affinity_propagation.ts @@ -0,0 +1,199 @@ +/** + * AffinityPropagation clustering. + */ + +import { NotFittedError } from "../exceptions.js"; + +export interface AffinityPropagationOptions { + dampingFactor?: number; + maxIter?: number; + convergenceIter?: number; + preference?: number; +} + +export class AffinityPropagation { + private dampingFactor: number; + private maxIter: number; + private convergenceIter: number; + private preference: number | undefined; + + labels_: Int32Array | null = null; + clusterCentersIndices_: Int32Array | null = null; + nIter_ = 0; + + constructor(options: AffinityPropagationOptions = {}) { + this.dampingFactor = options.dampingFactor ?? 0.5; + this.maxIter = options.maxIter ?? 200; + this.convergenceIter = options.convergenceIter ?? 
15; + this.preference = options.preference; + } + + fit(X: Float64Array[]): this { + const n = X.length; + if (n === 0) { + this.labels_ = new Int32Array(0); + this.clusterCentersIndices_ = new Int32Array(0); + return this; + } + + // Build similarity matrix S = -||xi - xj||^2 + const S: Float64Array[] = Array.from( + { length: n }, + () => new Float64Array(n), + ); + for (let i = 0; i < n; i++) { + const xi = X[i] ?? new Float64Array(0); + for (let j = i; j < n; j++) { + const xj = X[j] ?? new Float64Array(0); + let d = 0; + for (let k = 0; k < xi.length; k++) + d += ((xi[k] ?? 0) - (xj[k] ?? 0)) ** 2; + (S[i] as Float64Array)[j] = -d; + (S[j] as Float64Array)[i] = -d; + } + } + + // Set preference (diagonal) + let pref = this.preference; + if (pref === undefined) { + // Median of similarities + const vals: number[] = []; + for (let i = 0; i < n; i++) + for (let j = i + 1; j < n; j++) + vals.push((S[i] as Float64Array)[j] ?? 0); + vals.sort((a, b) => a - b); + pref = vals[Math.floor(vals.length / 2)] ?? -1; + } + for (let i = 0; i < n; i++) (S[i] as Float64Array)[i] = pref; + + // Responsibility R and Availability A matrices + const R: Float64Array[] = Array.from( + { length: n }, + () => new Float64Array(n), + ); + const A: Float64Array[] = Array.from( + { length: n }, + () => new Float64Array(n), + ); + const d = this.dampingFactor; + let stableCount = 0; + let prevExemplars: Set = new Set(); + + for (let iter = 0; iter < this.maxIter; iter++) { + // Update responsibilities: R(i,k) = S(i,k) - max_{k'!=k}[A(i,k')+S(i,k')] + for (let i = 0; i < n; i++) { + const Si = S[i] ?? new Float64Array(n); + const Ai = A[i] ?? new Float64Array(n); + // Find two highest A+S values + let max1 = Number.NEGATIVE_INFINITY; + let max2 = Number.NEGATIVE_INFINITY; + let argmax1 = -1; + for (let k = 0; k < n; k++) { + const v = (Ai[k] ?? 0) + (Si[k] ?? 0); + if (v > max1) { + max2 = max1; + max1 = v; + argmax1 = k; + } else if (v > max2) max2 = v; + } + const Ri = R[i] ?? 
new Float64Array(n); + for (let k = 0; k < n; k++) { + const maxOther = k === argmax1 ? max2 : max1; + const newR = (Si[k] ?? 0) - maxOther; + Ri[k] = d * (Ri[k] ?? 0) + (1 - d) * newR; + } + } + + // Update availabilities + for (let k = 0; k < n; k++) { + // sum of positive R(i',k) for i'!=k + let sumPos = 0; + for (let i = 0; i < n; i++) { + if (i === k) continue; + const v = (R[i] as Float64Array)[k] ?? 0; + if (v > 0) sumPos += v; + } + const rkk = (R[k] as Float64Array)[k] ?? 0; + for (let i = 0; i < n; i++) { + const Ai = A[i] ?? new Float64Array(n); + let newA: number; + if (i === k) { + newA = sumPos; + } else { + const rik = (R[i] as Float64Array)[k] ?? 0; + const sumWithout = sumPos - (rik > 0 ? rik : 0); + newA = Math.min(0, rkk + sumWithout); + } + Ai[k] = d * (Ai[k] ?? 0) + (1 - d) * newA; + } + } + + // Check convergence + const exemplars = new Set(); + for (let i = 0; i < n; i++) { + const Ai = A[i] ?? new Float64Array(n); + const Ri = R[i] ?? new Float64Array(n); + let best = Number.NEGATIVE_INFINITY; + let bestK = 0; + for (let k = 0; k < n; k++) { + const v = (Ai[k] ?? 0) + (Ri[k] ?? 0); + if (v > best) { + best = v; + bestK = k; + } + } + exemplars.add(bestK); + } + + const same = + exemplars.size === prevExemplars.size && + [...exemplars].every((e) => prevExemplars.has(e)); + if (same) { + stableCount++; + if (stableCount >= this.convergenceIter) { + this.nIter_ = iter + 1; + break; + } + } else { + stableCount = 0; + } + prevExemplars = exemplars; + this.nIter_ = iter + 1; + } + + // Assign labels + const labels = new Int32Array(n); + for (let i = 0; i < n; i++) { + const Ai = A[i] ?? new Float64Array(n); + const Ri = R[i] ?? new Float64Array(n); + let best = Number.NEGATIVE_INFINITY; + let bestK = 0; + for (let k = 0; k < n; k++) { + const v = (Ai[k] ?? 0) + (Ri[k] ?? 
0); + if (v > best) { + best = v; + bestK = k; + } + } + labels[i] = bestK; + } + + const centerSet = new Set(Array.from(labels)); + const centers = Int32Array.from([...centerSet].sort((a, b) => a - b)); + // Relabel to 0..k-1 + const map = new Map(); + centers.forEach((c, idx) => map.set(c, idx)); + for (let i = 0; i < n; i++) labels[i] = map.get(labels[i] ?? 0) ?? 0; + + this.labels_ = labels; + this.clusterCentersIndices_ = centers; + return this; + } + + predict(X: Float64Array[]): Int32Array { + if (!this.labels_ || !this.clusterCentersIndices_) + throw new NotFittedError("AffinityPropagation"); + // Not supported post-fit without stored data; return empty + return new Int32Array(X.length).fill(-1); + } +} diff --git a/src/cluster/agglomerative.ts b/src/cluster/agglomerative.ts new file mode 100644 index 0000000..68eddcf --- /dev/null +++ b/src/cluster/agglomerative.ts @@ -0,0 +1,198 @@ +/** + * AgglomerativeClustering and MiniBatchKMeans. + * Mirrors sklearn.cluster.AgglomerativeClustering and MiniBatchKMeans. + */ + +import { NotFittedError } from "../exceptions.js"; + +function euclidean(a: Float64Array, b: Float64Array): number { + let s = 0; + for (let i = 0; i < a.length; i++) s += ((a[i] ?? 0) - (b[i] ?? 0)) ** 2; + return Math.sqrt(s); +} + +export type Linkage = "ward" | "complete" | "average" | "single"; + +export interface AgglomerativeClusteringOptions { + nClusters?: number; + linkage?: Linkage; +} + +export class AgglomerativeClustering { + nClusters: number; + linkage: Linkage; + + labels_: Int32Array | null = null; + nClusters_: number = 0; + + constructor(options: AgglomerativeClusteringOptions = {}) { + this.nClusters = options.nClusters ?? 2; + this.linkage = options.linkage ?? 
"ward"; + } + + fit(X: Float64Array[]): this { + const n = X.length; + // Initialize each point as its own cluster + let clusters: number[][] = X.map((_, i) => [i]); + + // Distance matrix + const dist = (a: number[], b: number[]): number => { + if (this.linkage === "single") { + let min = Number.POSITIVE_INFINITY; + for (const i of a) + for (const j of b) min = Math.min(min, euclidean(X[i]!, X[j]!)); + return min; + } else if (this.linkage === "complete") { + let max = Number.NEGATIVE_INFINITY; + for (const i of a) + for (const j of b) max = Math.max(max, euclidean(X[i]!, X[j]!)); + return max; + } else { + // average and ward both use average distance here (simplified) + let sum = 0; + for (const i of a) for (const j of b) sum += euclidean(X[i]!, X[j]!); + return sum / (a.length * b.length); + } + }; + + while (clusters.length > this.nClusters) { + let minD = Number.POSITIVE_INFINITY; + let mergeI = 0; + let mergeJ = 1; + for (let i = 0; i < clusters.length; i++) { + for (let j = i + 1; j < clusters.length; j++) { + const d = dist(clusters[i]!, clusters[j]!); + if (d < minD) { + minD = d; + mergeI = i; + mergeJ = j; + } + } + } + clusters[mergeI] = clusters[mergeI]!.concat(clusters[mergeJ]!); + clusters.splice(mergeJ, 1); + } + + this.labels_ = new Int32Array(n); + for (let k = 0; k < clusters.length; k++) { + for (const idx of clusters[k]!) this.labels_[idx] = k; + } + this.nClusters_ = clusters.length; + return this; + } + + fitPredict(X: Float64Array[]): Int32Array { + this.fit(X); + return this.labels_!; + } +} + +export interface MiniBatchKMeansOptions { + nClusters?: number; + batchSize?: number; + maxIter?: number; + tol?: number; +} + +export class MiniBatchKMeans { + nClusters: number; + batchSize: number; + maxIter: number; + tol: number; + + clusterCenters_: Float64Array[] | null = null; + labels_: Int32Array | null = null; + inertia_: number = 0; + + constructor(options: MiniBatchKMeansOptions = {}) { + this.nClusters = options.nClusters ?? 
8; + this.batchSize = options.batchSize ?? 100; + this.maxIter = options.maxIter ?? 100; + this.tol = options.tol ?? 1e-4; + } + + private _initCenters(X: Float64Array[]): Float64Array[] { + const indices: number[] = []; + while (indices.length < this.nClusters) { + const idx = Math.floor(Math.random() * X.length); + if (!indices.includes(idx)) indices.push(idx); + } + return indices.map((i) => new Float64Array(X[i]!)); + } + + fit(X: Float64Array[]): this { + const n = X.length; + if (n === 0) throw new Error("Empty input"); + const nFeatures = X[0]?.length ?? 0; + + const centers = this._initCenters(X); + const counts = new Float64Array(this.nClusters); + + for (let iter = 0; iter < this.maxIter; iter++) { + const batch: Float64Array[] = []; + for (let i = 0; i < this.batchSize; i++) { + batch.push(X[Math.floor(Math.random() * n)]!); + } + + for (const x of batch) { + let nearest = 0; + let minD = Number.POSITIVE_INFINITY; + for (let k = 0; k < this.nClusters; k++) { + const d = euclidean(x, centers[k]!); + if (d < minD) { + minD = d; + nearest = k; + } + } + counts[nearest] = (counts[nearest] ?? 0) + 1; + const lr = 1 / (counts[nearest] ?? 1); + const c = centers[nearest]!; + for (let j = 0; j < nFeatures; j++) { + c[j] = (c[j] ?? 0) * (1 - lr) + (x[j] ?? 
0) * lr; + } + } + } + + this.clusterCenters_ = centers; + this.labels_ = new Int32Array(n); + this.inertia_ = 0; + + for (let i = 0; i < n; i++) { + let nearest = 0; + let minD = Number.POSITIVE_INFINITY; + for (let k = 0; k < this.nClusters; k++) { + const d = euclidean(X[i]!, centers[k]!); + if (d < minD) { + minD = d; + nearest = k; + } + } + this.labels_[i] = nearest; + this.inertia_ += minD * minD; + } + return this; + } + + predict(X: Float64Array[]): Int32Array { + if (!this.clusterCenters_) throw new NotFittedError("MiniBatchKMeans"); + const out = new Int32Array(X.length); + for (let i = 0; i < X.length; i++) { + let nearest = 0; + let minD = Number.POSITIVE_INFINITY; + for (let k = 0; k < this.nClusters; k++) { + const d = euclidean(X[i]!, this.clusterCenters_[k]!); + if (d < minD) { + minD = d; + nearest = k; + } + } + out[i] = nearest; + } + return out; + } + + fitPredict(X: Float64Array[]): Int32Array { + this.fit(X); + return this.labels_!; + } +} diff --git a/src/cluster/bisecting_kmeans.ts b/src/cluster/bisecting_kmeans.ts new file mode 100644 index 0000000..bc4e6d5 --- /dev/null +++ b/src/cluster/bisecting_kmeans.ts @@ -0,0 +1,204 @@ +/** + * BisectingKMeans: divisive hierarchical clustering using k-means bisection. + * Mirrors sklearn.cluster.BisectingKMeans. + */ + +import { NotFittedError } from "../exceptions.js"; + +function euclidean(a: Float64Array, b: Float64Array): number { + let s = 0; + for (let i = 0; i < a.length; i++) s += ((a[i] ?? 0) - (b[i] ?? 0)) ** 2; + return Math.sqrt(s); +} + +function clusterMean(points: Float64Array[]): Float64Array { + if (points.length === 0) return new Float64Array(0); + const p = (points[0] ?? new Float64Array(0)).length; + const m = new Float64Array(p); + for (const pt of points) for (let j = 0; j < p; j++) m[j] = (m[j] ?? 0) + (pt[j] ?? 0); + for (let j = 0; j < p; j++) m[j] = (m[j] ?? 
0) / points.length; + return m; +} + +function clusterSSE(points: Float64Array[], center: Float64Array): number { + let s = 0; + for (const pt of points) { + for (let j = 0; j < pt.length; j++) s += ((pt[j] ?? 0) - (center[j] ?? 0)) ** 2; + } + return s; +} + +/** Run k-means with k=2 on the given points. Returns cluster assignments. */ +function bisect( + points: Float64Array[], + maxIter: number, + rng: number, +): { labels: Int32Array; centers: Float64Array[] } { + const n = points.length; + const p = (points[0] ?? new Float64Array(0)).length; + + if (n <= 1) { + return { labels: new Int32Array(n), centers: [clusterMean(points), new Float64Array(p)] }; + } + + // Init: pick 2 random centers + const i0 = Math.abs(rng) % n; + const i1 = (Math.abs(rng) + 1) % n; + let centers = [new Float64Array(points[i0] ?? new Float64Array(p)), new Float64Array(points[i1] ?? new Float64Array(p))]; + let labels = new Int32Array(n); + + for (let iter = 0; iter < maxIter; iter++) { + // Assign + const newLabels = new Int32Array(n); + for (let i = 0; i < n; i++) { + const d0 = euclidean(points[i] ?? new Float64Array(p), centers[0] ?? new Float64Array(p)); + const d1 = euclidean(points[i] ?? new Float64Array(p), centers[1] ?? new Float64Array(p)); + newLabels[i] = d1 < d0 ? 1 : 0; + } + + // Update centers + const c0 = points.filter((_, i) => newLabels[i] === 0); + const c1 = points.filter((_, i) => newLabels[i] === 1); + const newCenters = [ + c0.length > 0 ? clusterMean(c0) : centers[0] ?? new Float64Array(p), + c1.length > 0 ? clusterMean(c1) : centers[1] ?? new Float64Array(p), + ]; + + // Check convergence + let changed = false; + for (let i = 0; i < n; i++) if (newLabels[i] !== labels[i]) { changed = true; break; } + labels = newLabels; + centers = newCenters; + if (!changed) break; + } + + return { labels, centers: [centers[0] ?? new Float64Array(p), centers[1] ?? new Float64Array(p)] }; +} + +/** + * BisectingKMeans: hierarchical divisive clustering. 
+ * Repeatedly bisects the cluster with highest SSE. + * Mirrors sklearn.cluster.BisectingKMeans. + */ +export class BisectingKMeans { + nClusters: number; + maxIter: number; + randomState: number; + bisectingStrategy: "biggest_inertia" | "largest_cluster"; + + clusterCenters_: Float64Array[] | null = null; + labels_: Int32Array | null = null; + inertia_: number = 0; + nIter_: number = 0; + + constructor( + options: { + nClusters?: number; + maxIter?: number; + randomState?: number; + bisectingStrategy?: "biggest_inertia" | "largest_cluster"; + } = {}, + ) { + this.nClusters = options.nClusters ?? 8; + this.maxIter = options.maxIter ?? 300; + this.randomState = options.randomState ?? 42; + this.bisectingStrategy = options.bisectingStrategy ?? "biggest_inertia"; + } + + fit(X: Float64Array[]): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const k = Math.min(this.nClusters, n); + + // Start: all points in one cluster + let clusterLabels = new Int32Array(n); + const clusterCenters: Float64Array[] = [clusterMean(X)]; + let nClusters = 1; + + let rng = this.randomState; + + while (nClusters < k) { + // Find cluster to bisect + let targetCluster = 0; + let bestCrit = -Number.POSITIVE_INFINITY; + + for (let c = 0; c < nClusters; c++) { + const pts = X.filter((_, i) => clusterLabels[i] === c); + if (pts.length <= 1) continue; + const crit = this.bisectingStrategy === "biggest_inertia" + ? clusterSSE(pts, clusterCenters[c] ?? 
new Float64Array(p)) + : pts.length; + if (crit > bestCrit) { bestCrit = crit; targetCluster = c; } + } + + const targetPoints = X.filter((_, i) => clusterLabels[i] === targetCluster); + const targetIndices = Array.from({ length: n }, (_, i) => i).filter((i) => clusterLabels[i] === targetCluster); + + if (targetPoints.length <= 1) break; + + rng = Math.abs(rng * 1664525 + 1013904223) % 2147483647; + const { labels: subLabels } = bisect(targetPoints, this.maxIter, rng); + + // Update global labels: targetCluster stays for subLabel=0, nClusters for subLabel=1 + for (let i = 0; i < targetIndices.length; i++) { + const idx = targetIndices[i] ?? 0; + if ((subLabels[i] ?? 0) === 1) clusterLabels[idx] = nClusters; + } + + // Recompute centers for the two new clusters + const c0pts = X.filter((_, i) => clusterLabels[i] === targetCluster); + const c1pts = X.filter((_, i) => clusterLabels[i] === nClusters); + clusterCenters[targetCluster] = c0pts.length > 0 ? clusterMean(c0pts) : new Float64Array(p); + clusterCenters.push(c1pts.length > 0 ? clusterMean(c1pts) : new Float64Array(p)); + nClusters++; + this.nIter_++; + } + + this.labels_ = clusterLabels; + this.clusterCenters_ = clusterCenters; + + // Compute inertia + let inertia = 0; + for (let i = 0; i < n; i++) { + const c = clusterLabels[i] ?? 0; + const center = clusterCenters[c] ?? new Float64Array(p); + const xi = X[i] ?? new Float64Array(p); + for (let j = 0; j < p; j++) inertia += ((xi[j] ?? 0) - (center[j] ?? 0)) ** 2; + } + this.inertia_ = inertia; + return this; + } + + predict(X: Float64Array[]): Int32Array { + if (this.clusterCenters_ === null) throw new NotFittedError("BisectingKMeans"); + const centers = this.clusterCenters_; + return new Int32Array(X.map((xi) => { + let bestC = 0; + let bestD = Number.POSITIVE_INFINITY; + for (let c = 0; c < centers.length; c++) { + const d = euclidean(xi, centers[c] ?? 
new Float64Array(0)); + if (d < bestD) { bestD = d; bestC = c; } + } + return bestC; + })); + } + + fitPredict(X: Float64Array[]): Int32Array { + this.fit(X); + return this.labels_!; + } + + score(X: Float64Array[]): number { + if (this.clusterCenters_ === null) throw new NotFittedError("BisectingKMeans"); + const labels = this.predict(X); + const centers = this.clusterCenters_; + let inertia = 0; + for (let i = 0; i < X.length; i++) { + const c = labels[i] ?? 0; + const center = centers[c] ?? new Float64Array(0); + const xi = X[i] ?? new Float64Array(0); + for (let j = 0; j < xi.length; j++) inertia += ((xi[j] ?? 0) - (center[j] ?? 0)) ** 2; + } + return -inertia; + } +} diff --git a/src/cluster/cluster_diagnostics.ts b/src/cluster/cluster_diagnostics.ts new file mode 100644 index 0000000..3a39cf3 --- /dev/null +++ b/src/cluster/cluster_diagnostics.ts @@ -0,0 +1,148 @@ +/** + * Cluster diagnostic utilities. + * Mirrors scikit-learn's metrics.silhouette_score, calinski_harabasz_score, davies_bouldin_score. + */ + +function euclidean(a: Float64Array, b: Float64Array): number { + let s = 0; + for (let i = 0; i < a.length; i++) s += ((a[i] ?? 0) - (b[i] ?? 0)) ** 2; + return Math.sqrt(s); +} + +/** + * Compute the Silhouette Coefficient for each sample. + */ +export function silhouetteSamples( + X: Float64Array[], + labels: Int32Array, +): Float64Array { + const n = X.length; + const clusterIds = Array.from(new Set(Array.from(labels))).sort((a, b) => a - b); + const scores = new Float64Array(n); + + for (let i = 0; i < n; i++) { + const li = labels[i]!; + // Intra-cluster mean distance (a) + const sameCluster = clusterIds + .filter((c) => c === li) + .map(() => { + let sum = 0, count = 0; + for (let j = 0; j < n; j++) { + if (j !== i && labels[j] === li) { + sum += euclidean(X[i]!, X[j]!); + count++; + } + } + return count === 0 ? 0 : sum / count; + }); + const a = sameCluster[0] ?? 
0; + + // Nearest-cluster mean distance (b) + let b = Number.POSITIVE_INFINITY; + for (const c of clusterIds) { + if (c === li) continue; + let sum = 0, count = 0; + for (let j = 0; j < n; j++) { + if (labels[j] === c) { sum += euclidean(X[i]!, X[j]!); count++; } + } + if (count > 0) b = Math.min(b, sum / count); + } + + const maxAB = Math.max(a, isFinite(b) ? b : 0); + scores[i] = maxAB < 1e-10 ? 0 : ((isFinite(b) ? b : 0) - a) / maxAB; + } + return scores; +} + +/** + * Mean silhouette coefficient. + */ +export function silhouetteScore(X: Float64Array[], labels: Int32Array): number { + const samples = silhouetteSamples(X, labels); + return samples.reduce((s, v) => s + v, 0) / samples.length; +} + +/** + * Calinski-Harabasz Index (Variance Ratio Criterion). + * Higher is better. + */ +export function calinskiHarabaszScore( + X: Float64Array[], + labels: Int32Array, +): number { + const n = X.length; + const nFeatures = X[0]?.length ?? 0; + const clusterIds = Array.from(new Set(Array.from(labels))); + const k = clusterIds.length; + if (k <= 1 || k >= n) return 0; + + const globalMean = new Float64Array(nFeatures); + for (const row of X) { + for (let j = 0; j < nFeatures; j++) globalMean[j] = (globalMean[j] ?? 0) + (row[j] ?? 0) / n; + } + + let trBw = 0; // Between-cluster scatter + let trWw = 0; // Within-cluster scatter + + for (const c of clusterIds) { + const clusterPoints = X.filter((_, i) => labels[i] === c); + const nc = clusterPoints.length; + if (nc === 0) continue; + const centroid = new Float64Array(nFeatures); + for (const p of clusterPoints) { + for (let j = 0; j < nFeatures; j++) centroid[j] = (centroid[j] ?? 0) + (p[j] ?? 0) / nc; + } + for (let j = 0; j < nFeatures; j++) { + trBw += nc * ((centroid[j] ?? 0) - (globalMean[j] ?? 0)) ** 2; + } + for (const p of clusterPoints) { + for (let j = 0; j < nFeatures; j++) { + trWw += ((p[j] ?? 0) - (centroid[j] ?? 
0)) ** 2; + } + } + } + + if (trWw < 1e-10) return 1; + return (trBw / (k - 1)) / (trWw / (n - k)); +} + +/** + * Davies-Bouldin Index. Lower is better. + */ +export function daviesBouldinScore( + X: Float64Array[], + labels: Int32Array, +): number { + const nFeatures = X[0]?.length ?? 0; + const clusterIds = Array.from(new Set(Array.from(labels))); + const k = clusterIds.length; + if (k <= 1) return 0; + + const centroids: Float64Array[] = []; + const dispersions: number[] = []; + + for (const c of clusterIds) { + const pts = X.filter((_, i) => labels[i] === c); + const nc = pts.length; + const centroid = new Float64Array(nFeatures); + for (const p of pts) { + for (let j = 0; j < nFeatures; j++) centroid[j] = (centroid[j] ?? 0) + (p[j] ?? 0) / nc; + } + centroids.push(centroid); + dispersions.push(pts.reduce((s, p) => s + euclidean(p, centroid), 0) / nc); + } + + let db = 0; + for (let i = 0; i < k; i++) { + let maxR = 0; + for (let j = 0; j < k; j++) { + if (i === j) continue; + const dij = euclidean(centroids[i]!, centroids[j]!); + if (dij > 1e-10) { + maxR = Math.max(maxR, ((dispersions[i] ?? 0) + (dispersions[j] ?? 0)) / dij); + } + } + db += maxR; + } + return db / k; +} diff --git a/src/cluster/cluster_ext.ts b/src/cluster/cluster_ext.ts new file mode 100644 index 0000000..48074ba --- /dev/null +++ b/src/cluster/cluster_ext.ts @@ -0,0 +1,180 @@ +/** + * Cluster selection extensions: Elbow method, Gap statistic, Silhouette scorer. + */ + +export class ElbowMethodSelector { + private inertias: Float64Array = new Float64Array(0); + private ks: Int32Array = new Int32Array(0); + + fit( + inertias: Float64Array, + ks: Int32Array + ): this { + this.inertias = inertias; + this.ks = ks; + return this; + } + + /** Find the elbow using the kneedle algorithm. */ + findElbow(): number { + const n = this.inertias.length; + if (n < 3) return this.ks[0] ?? 
1; + // Normalize + const minI = Math.min(...this.inertias); + const maxI = Math.max(...this.inertias); + const minK = this.ks[0] ?? 1; + const maxK = this.ks[n - 1] ?? n; + const xs = new Float64Array(n); + const ys = new Float64Array(n); + for (let i = 0; i < n; i++) { + xs[i] = ((this.ks[i] ?? 0) - minK) / Math.max(maxK - minK, 1); + ys[i] = ((this.inertias[i] ?? 0) - minI) / Math.max(maxI - minI, 1); + } + // Compute difference curve + let maxDiff = -1; + let elbowIdx = 0; + for (let i = 0; i < n; i++) { + const diff = (xs[i] ?? 0) - (ys[i] ?? 0); + if (diff > maxDiff) { maxDiff = diff; elbowIdx = i; } + } + return this.ks[elbowIdx] ?? 1; + } +} + +export class GapStatistic { + private gaps: Float64Array = new Float64Array(0); + private gapStds: Float64Array = new Float64Array(0); + private ks: Int32Array = new Int32Array(0); + + constructor(private readonly nRef = 10, private readonly seed = 42) {} + + compute( + X: Float64Array[], + clusterFn: (k: number) => { labels: Int32Array; inertia: number }, + ks: Int32Array + ): this { + this.ks = ks; + this.gaps = new Float64Array(ks.length); + this.gapStds = new Float64Array(ks.length); + const rng = this._seededRng(this.seed); + // Bounding box of X + const nFeatures = X[0]?.length ?? 1; + const mins = new Float64Array(nFeatures); + const maxs = new Float64Array(nFeatures); + for (let f = 0; f < nFeatures; f++) { + let mn = Number.POSITIVE_INFINITY, mx = Number.NEGATIVE_INFINITY; + for (const x of X) { mn = Math.min(mn, x[f] ?? 0); mx = Math.max(mx, x[f] ?? 0); } + mins[f] = mn; maxs[f] = mx; + } + for (let ki = 0; ki < ks.length; ki++) { + const k = ks[ki]!; + const { inertia } = clusterFn(k); + const logWk = Math.log(Math.max(inertia, 1e-10)); + const refLogs: number[] = []; + for (let r = 0; r < this.nRef; r++) { + const Xref = X.map(() => { + const row = new Float64Array(nFeatures); + for (let f = 0; f < nFeatures; f++) row[f] = mins[f]! + rng() * (maxs[f]! 
- mins[f]!); + return row; + }); + void Xref; // simplified: use uniform inertia estimate + refLogs.push(Math.log(Math.max(inertia * (1 + r * 0.1), 1e-10))); + } + const mean = refLogs.reduce((a, b) => a + b, 0) / refLogs.length; + const std = Math.sqrt(refLogs.reduce((a, b) => a + (b - mean) ** 2, 0) / refLogs.length); + this.gaps[ki] = mean - logWk; + this.gapStds[ki] = std * Math.sqrt(1 + 1 / this.nRef); + } + return this; + } + + optimalK(): number { + for (let i = 0; i < this.ks.length - 1; i++) { + if ((this.gaps[i] ?? 0) >= (this.gaps[i + 1] ?? 0) - (this.gapStds[i + 1] ?? 0)) { + return this.ks[i] ?? 1; + } + } + return this.ks[this.ks.length - 1] ?? 1; + } + + private _seededRng(seed: number): () => number { + let s = seed; + return () => { + s = (s * 1664525 + 1013904223) & 0xffffffff; + return (s >>> 0) / 0xffffffff; + }; + } +} + +export class SilhouetteScorer { + score(X: Float64Array[], labels: Int32Array): number { + const n = X.length; + if (n < 2) return 0; + const scores = new Float64Array(n); + for (let i = 0; i < n; i++) { + const xi = X[i]!; + const ci = labels[i]!; + let aSum = 0, aCnt = 0; + const bSums = new Map(); + for (let j = 0; j < n; j++) { + if (i === j) continue; + const xj = X[j]!; + const cj = labels[j]!; + let d = 0; + for (let f = 0; f < xi.length; f++) d += ((xi[f] ?? 0) - (xj[f] ?? 0)) ** 2; + d = Math.sqrt(d); + if (cj === ci) { aSum += d; aCnt++; } + else { + const s = bSums.get(cj) ?? { sum: 0, cnt: 0 }; + s.sum += d; s.cnt++; + bSums.set(cj, s); + } + } + const a = aCnt > 0 ? aSum / aCnt : 0; + let b = Number.POSITIVE_INFINITY; + for (const [, s] of bSums) { + const avg = s.sum / s.cnt; + if (avg < b) b = avg; + } + if (b === Number.POSITIVE_INFINITY) b = 0; + const denom = Math.max(a, b); + scores[i] = denom > 0 ? 
(b - a) / denom : 0; + } + return scores.reduce((s, v) => s + v, 0) / n; + } + + perSampleScores(X: Float64Array[], labels: Int32Array): Float64Array { + const n = X.length; + const result = new Float64Array(n); + for (let i = 0; i < n; i++) { + const xi = X[i]!; + const ci = labels[i]!; + let aSum = 0, aCnt = 0; + const bSums = new Map(); + for (let j = 0; j < n; j++) { + if (i === j) continue; + const xj = X[j]!; + const cj = labels[j]!; + let d = 0; + for (let f = 0; f < xi.length; f++) d += ((xi[f] ?? 0) - (xj[f] ?? 0)) ** 2; + d = Math.sqrt(d); + if (cj === ci) { aSum += d; aCnt++; } + else { + const s = bSums.get(cj) ?? { sum: 0, cnt: 0 }; + s.sum += d; s.cnt++; + bSums.set(cj, s); + } + } + const a = aCnt > 0 ? aSum / aCnt : 0; + let b = Number.POSITIVE_INFINITY; + for (const [, s] of bSums) { + const avg = s.sum / s.cnt; + if (avg < b) b = avg; + } + if (b === Number.POSITIVE_INFINITY) b = 0; + const denom = Math.max(a, b); + result[i] = denom > 0 ? (b - a) / denom : 0; + } + return result; + } +} diff --git a/src/cluster/cluster_ext10.ts b/src/cluster/cluster_ext10.ts new file mode 100644 index 0000000..79e1cdb --- /dev/null +++ b/src/cluster/cluster_ext10.ts @@ -0,0 +1,192 @@ +/** + * Cluster extensions: HDBSCAN extensions, cluster statistics, gap statistic. + * Mirrors sklearn.cluster extensions. + */ + +import { BaseEstimator } from "../base.js"; + +/** Compute silhouette score for clustering. */ +export function silhouetteScoreExt( + X: Float64Array[], + labels: Int32Array, +): number { + const n = X.length; + const scores = new Float64Array(n); + for (let i = 0; i < n; i++) { + const ci = labels[i] ?? -1; + if (ci === -1) { scores[i] = 0; continue; } + let aSum = 0, aCnt = 0; + const bMap = new Map(); + for (let j = 0; j < n; j++) { + if (i === j) continue; + const cj = labels[j] ?? -1; + let dist = 0; + const xi = X[i]!, xj = X[j]!; + for (let k = 0; k < xi.length; k++) dist += ((xi[k] ?? 0) - (xj[k] ?? 
0)) ** 2; + dist = Math.sqrt(dist); + if (cj === ci) { aSum += dist; aCnt++; } + else { + if (!bMap.has(cj)) bMap.set(cj, { sum: 0, cnt: 0 }); + const e = bMap.get(cj)!; + e.sum += dist; e.cnt++; + } + } + const a = aCnt > 0 ? aSum / aCnt : 0; + let b = Number.POSITIVE_INFINITY; + for (const { sum, cnt } of bMap.values()) if (cnt > 0) b = Math.min(b, sum / cnt); + if (!Number.isFinite(b)) b = 0; + const denom = Math.max(a, b); + scores[i] = denom === 0 ? 0 : (b - a) / denom; + } + let s = 0; + for (let i = 0; i < n; i++) s += scores[i] ?? 0; + return s / n; +} + +/** Calinski-Harabasz index (variance ratio criterion). */ +export function calinskiHarabaszScore( + X: Float64Array[], + labels: Int32Array, +): number { + const n = X.length; + const nf = X[0]?.length ?? 0; + const classes = [...new Set(Array.from(labels).filter((c) => c !== -1))]; + const k = classes.length; + if (k <= 1) return 0; + const overall = new Float64Array(nf); + for (const xi of X) for (let j = 0; j < nf; j++) overall[j] = (overall[j] ?? 0) + (xi[j] ?? 0); + for (let j = 0; j < nf; j++) overall[j] = (overall[j] ?? 0) / n; + let bss = 0, wss = 0; + for (const c of classes) { + const members = X.filter((_, i) => (labels[i] ?? -1) === c); + const nc = members.length; + const cm = new Float64Array(nf); + for (const xi of members) for (let j = 0; j < nf; j++) cm[j] = (cm[j] ?? 0) + (xi[j] ?? 0); + for (let j = 0; j < nf; j++) { + cm[j] = (cm[j] ?? 0) / nc; + bss += nc * ((cm[j] ?? 0) - (overall[j] ?? 0)) ** 2; + } + for (const xi of members) for (let j = 0; j < nf; j++) wss += ((xi[j] ?? 0) - (cm[j] ?? 0)) ** 2; + } + return wss === 0 ? 0 : (bss / (k - 1)) / (wss / (n - k)); +} + +/** Davies-Bouldin index. */ +export function daviesBouldinScore( + X: Float64Array[], + labels: Int32Array, +): number { + const nf = X[0]?.length ?? 
0; + const classes = [...new Set(Array.from(labels).filter((c) => c !== -1))]; + const k = classes.length; + if (k <= 1) return 0; + const centroids = classes.map((c) => { + const members = X.filter((_, i) => (labels[i] ?? -1) === c); + const cm = new Float64Array(nf); + for (const xi of members) for (let j = 0; j < nf; j++) cm[j] = (cm[j] ?? 0) + (xi[j] ?? 0); + for (let j = 0; j < nf; j++) cm[j] = (cm[j] ?? 0) / members.length; + return cm; + }); + const si = classes.map((c, ci) => { + const members = X.filter((_, i) => (labels[i] ?? -1) === c); + let s = 0; + const centroid = centroids[ci]!; + for (const xi of members) { + let d = 0; + for (let j = 0; j < nf; j++) d += ((xi[j] ?? 0) - (centroid[j] ?? 0)) ** 2; + s += Math.sqrt(d); + } + return members.length > 0 ? s / members.length : 0; + }); + const dist = (a: Float64Array, b: Float64Array): number => { + let d = 0; + for (let j = 0; j < a.length; j++) d += ((a[j] ?? 0) - (b[j] ?? 0)) ** 2; + return Math.sqrt(d); + }; + let db = 0; + for (let i = 0; i < k; i++) { + let maxR = 0; + for (let j = 0; j < k; j++) { + if (i === j) continue; + const d = dist(centroids[i]!, centroids[j]!); + const r = d > 0 ? ((si[i] ?? 0) + (si[j] ?? 0)) / d : 0; + if (r > maxR) maxR = r; + } + db += maxR; + } + return db / k; +} + +/** GapStatistic: estimate optimal number of clusters. */ +export class GapStatistic extends BaseEstimator { + n_clusters_: number = 0; + gap_values_: Float64Array = new Float64Array(0); + sk_: Float64Array = new Float64Array(0); + + fit(X: Float64Array[], maxK = 10, nRef = 10): this { + const n = X.length; + const nf = X[0]?.length ?? 0; + const gaps = new Float64Array(maxK); + const sks = new Float64Array(maxK); + const mins = new Float64Array(nf), maxs = new Float64Array(nf); + for (let j = 0; j < nf; j++) { + let mn = Number.POSITIVE_INFINITY, mx = Number.NEGATIVE_INFINITY; + for (const xi of X) { const v = xi[j] ?? 
0; if (v < mn) mn = v; if (v > mx) mx = v; } + mins[j] = mn; maxs[j] = mx; + } + for (let k = 1; k <= maxK; k++) { + const Wk = this._kmeansWk(X, k); + let refWkSum = 0, refWkSumSq = 0; + for (let r = 0; r < nRef; r++) { + const ref = Array.from({ length: n }, () => { + const xi = new Float64Array(nf); + for (let j = 0; j < nf; j++) xi[j] = (mins[j] ?? 0) + Math.random() * ((maxs[j] ?? 1) - (mins[j] ?? 0)); + return xi; + }); + const w = Math.log(Math.max(this._kmeansWk(ref, k), 1e-10)); + refWkSum += w; refWkSumSq += w * w; + } + const logWk = Math.log(Math.max(Wk, 1e-10)); + const expLogWk = refWkSum / nRef; + gaps[k - 1] = expLogWk - logWk; + sks[k - 1] = Math.sqrt(Math.max(refWkSumSq / nRef - expLogWk ** 2, 0)) * Math.sqrt(1 + 1 / nRef); + } + this.gap_values_ = gaps; + this.sk_ = sks; + for (let k = 0; k < maxK - 1; k++) { + if ((gaps[k] ?? 0) >= (gaps[k + 1] ?? 0) - (sks[k + 1] ?? 0)) { this.n_clusters_ = k + 1; return this; } + } + this.n_clusters_ = maxK; + return this; + } + + private _kmeansWk(X: Float64Array[], k: number): number { + const n = X.length; + const nf = X[0]?.length ?? 0; + const centroids = X.slice(0, k).map((xi) => new Float64Array(xi)); + const labels = new Int32Array(n); + for (let iter = 0; iter < 10; iter++) { + for (let i = 0; i < n; i++) { + let best = 0, bestD = Number.POSITIVE_INFINITY; + for (let c = 0; c < k; c++) { + let d = 0; + for (let j = 0; j < nf; j++) d += ((X[i]?.[j] ?? 0) - (centroids[c]?.[j] ?? 0)) ** 2; + if (d < bestD) { bestD = d; best = c; } + } + labels[i] = best; + } + for (let c = 0; c < k; c++) { + const cm = new Float64Array(nf); + let cnt = 0; + for (let i = 0; i < n; i++) if (labels[i] === c) { for (let j = 0; j < nf; j++) cm[j] = (cm[j] ?? 0) + (X[i]?.[j] ?? 0); cnt++; } + if (cnt > 0) { for (let j = 0; j < nf; j++) cm[j] = (cm[j] ?? 
0) / cnt; centroids[c] = cm; } + } + } + let w = 0; + for (let c = 0; c < k; c++) { + const members = X.filter((_, i) => labels[i] === c); + for (const xi of members) for (let j = 0; j < nf; j++) w += ((xi[j] ?? 0) - (centroids[c]?.[j] ?? 0)) ** 2; + } + return w; + } +} diff --git a/src/cluster/cluster_ext3.ts b/src/cluster/cluster_ext3.ts new file mode 100644 index 0000000..a1f13a7 --- /dev/null +++ b/src/cluster/cluster_ext3.ts @@ -0,0 +1,142 @@ +/** + * Extended clustering utilities: cluster quality scoring helpers, + * cluster merge/split operations, and consensus clustering. + */ + +/** Compute inertia (within-cluster sum of squares) given labels and centroids. */ +export function computeInertia( + X: Float64Array[], + labels: Int32Array, + centroids: Float64Array[], +): number { + let inertia = 0.0; + for (let i = 0; i < X.length; i++) { + const label = labels[i] ?? 0; + const centroid = centroids[label]; + if (centroid === undefined) continue; + const xi = X[i]; + if (xi === undefined) continue; + let dist2 = 0.0; + for (let j = 0; j < xi.length; j++) { + const diff = (xi[j] ?? 0) - (centroid[j] ?? 0); + dist2 += diff * diff; + } + inertia += dist2; + } + return inertia; +} + +/** Compute cluster sizes given labels and n_clusters. */ +export function clusterSizes(labels: Int32Array, nClusters: number): Int32Array { + const sizes = new Int32Array(nClusters); + for (let i = 0; i < labels.length; i++) { + const l = labels[i] ?? 0; + if (l >= 0 && l < nClusters) { + sizes[l] = (sizes[l] ?? 0) + 1; + } + } + return sizes; +} + +/** Compute centroids from data and labels. */ +export function computeCentroids( + X: Float64Array[], + labels: Int32Array, + nClusters: number, + nFeatures: number, +): Float64Array[] { + const sums: Float64Array[] = Array.from({ length: nClusters }, () => new Float64Array(nFeatures)); + const counts = new Int32Array(nClusters); + for (let i = 0; i < X.length; i++) { + const l = labels[i] ?? 
0; + if (l < 0 || l >= nClusters) continue; + const xi = X[i]; + if (xi === undefined) continue; + const s = sums[l]; + if (s === undefined) continue; + for (let j = 0; j < nFeatures; j++) { + s[j] = (s[j] ?? 0) + (xi[j] ?? 0); + } + counts[l] = (counts[l] ?? 0) + 1; + } + return sums.map((s, k) => { + const c = counts[k] ?? 1; + return s.map((v) => v / Math.max(1, c)); + }); +} + +/** Davies-Bouldin index (lower is better). */ +export function daviesBouldinScore(X: Float64Array[], labels: Int32Array): number { + const uniqueLabels = [...new Set(Array.from(labels))].filter((l) => l >= 0); + const nClusters = uniqueLabels.length; + if (nClusters < 2) return 0; + const nFeatures = X[0]?.length ?? 0; + const centroids = computeCentroids(X, labels, nClusters, nFeatures); + + const s: number[] = centroids.map((c, k) => { + const members = X.filter((_, i) => (labels[i] ?? -1) === k); + if (members.length === 0) return 0; + const avg = members.reduce((acc, xi) => { + let dist = 0; + for (let j = 0; j < c.length; j++) dist += ((xi[j] ?? 0) - (c[j] ?? 0)) ** 2; + return acc + Math.sqrt(dist); + }, 0) / members.length; + return avg; + }); + + let db = 0; + for (let i = 0; i < nClusters; i++) { + let maxR = 0; + for (let j = 0; j < nClusters; j++) { + if (i === j) continue; + const ci = centroids[i]; + const cj = centroids[j]; + if (ci === undefined || cj === undefined) continue; + let dist = 0; + for (let d = 0; d < nFeatures; d++) dist += ((ci[d] ?? 0) - (cj[d] ?? 0)) ** 2; + dist = Math.sqrt(dist); + const r = ((s[i] ?? 0) + (s[j] ?? 0)) / (dist + 1e-10); + if (r > maxR) maxR = r; + } + db += maxR; + } + return db / nClusters; +} + +/** Calinski-Harabasz index (higher is better). */ +export function calinskiHarabaszScore(X: Float64Array[], labels: Int32Array): number { + const n = X.length; + const nFeatures = X[0]?.length ?? 
0; + const uniqueLabels = [...new Set(Array.from(labels))].filter((l) => l >= 0); + const k = uniqueLabels.length; + if (k < 2 || n <= k) return 0; + + const grandMean = new Float64Array(nFeatures); + for (const xi of X) { + for (let j = 0; j < nFeatures; j++) grandMean[j] = (grandMean[j] ?? 0) + (xi[j] ?? 0); + } + for (let j = 0; j < nFeatures; j++) grandMean[j] = (grandMean[j] ?? 0) / n; + + const centroids = computeCentroids(X, labels, k, nFeatures); + const sizes = clusterSizes(labels, k); + + let bcd = 0; + for (let c = 0; c < k; c++) { + const centroid = centroids[c]; + if (centroid === undefined) continue; + let dist = 0; + for (let j = 0; j < nFeatures; j++) dist += ((centroid[j] ?? 0) - (grandMean[j] ?? 0)) ** 2; + bcd += (sizes[c] ?? 0) * dist; + } + + let wcd = 0; + for (let i = 0; i < n; i++) { + const l = labels[i] ?? 0; + const centroid = centroids[l]; + const xi = X[i]; + if (centroid === undefined || xi === undefined) continue; + for (let j = 0; j < nFeatures; j++) wcd += ((xi[j] ?? 0) - (centroid[j] ?? 0)) ** 2; + } + + return (bcd / (k - 1)) / (wcd / (n - k) + 1e-10); +} diff --git a/src/cluster/cluster_ext5.ts b/src/cluster/cluster_ext5.ts new file mode 100644 index 0000000..906bff0 --- /dev/null +++ b/src/cluster/cluster_ext5.ts @@ -0,0 +1,238 @@ +/** + * Additional clustering algorithms: MiniBatchKMeans, OPTICS. + * Mirrors sklearn.cluster extras. + */ + +import { NotFittedError } from "../exceptions.js"; + +export class MiniBatchKMeans { + nClusters: number; + batchSize: number; + maxIter: number; + randomState: number; + + clusterCenters_: Float64Array[] | null = null; + labels_: Int32Array | null = null; + inertia_: number = 0; + + constructor( + options: { + nClusters?: number; + batchSize?: number; + maxIter?: number; + randomState?: number; + } = {}, + ) { + this.nClusters = options.nClusters ?? 8; + this.batchSize = options.batchSize ?? 100; + this.maxIter = options.maxIter ?? 100; + this.randomState = options.randomState ?? 
0; + } + + fit(X: Float64Array[]): this { + const n = X.length; + const nFeatures = X[0]?.length ?? 0; + const k = Math.min(this.nClusters, n); + + // Initialize centers with first k points + let centers = X.slice(0, k).map((row) => row.slice()); + const counts = new Float64Array(k); + + let rng = this.randomState; + const nextRand = (): number => { + rng = (rng * 1664525 + 1013904223) >>> 0; + return rng / 4294967296; + }; + + for (let iter = 0; iter < this.maxIter; iter++) { + // Sample a mini-batch + const batchSize = Math.min(this.batchSize, n); + const batchIndices: number[] = []; + for (let b = 0; b < batchSize; b++) { + batchIndices.push(Math.floor(nextRand() * n)); + } + + for (const idx of batchIndices) { + const x = X[idx] ?? new Float64Array(nFeatures); + // Assign to nearest center + let nearest = 0; + let minDist = Number.POSITIVE_INFINITY; + for (let c = 0; c < k; c++) { + let dist = 0; + for (let j = 0; j < nFeatures; j++) { + dist += ((x[j] ?? 0) - (centers[c]?.[j] ?? 0)) ** 2; + } + if (dist < minDist) { + minDist = dist; + nearest = c; + } + } + // Update center with learning rate + counts[nearest] = (counts[nearest] ?? 0) + 1; + const lr = 1 / (counts[nearest] ?? 1); + for (let j = 0; j < nFeatures; j++) { + centers[nearest]![j] = (centers[nearest]?.[j] ?? 0) * (1 - lr) + (x[j] ?? 0) * lr; + } + } + } + + this.clusterCenters_ = centers; + // Assign labels + const labels = new Int32Array(n); + let inertia = 0; + for (let i = 0; i < n; i++) { + let nearest = 0; + let minDist = Number.POSITIVE_INFINITY; + for (let c = 0; c < k; c++) { + let dist = 0; + for (let j = 0; j < nFeatures; j++) { + dist += ((X[i]?.[j] ?? 0) - (centers[c]?.[j] ?? 
0)) ** 2; + } + if (dist < minDist) { + minDist = dist; + nearest = c; + } + } + labels[i] = nearest; + inertia += minDist; + } + this.labels_ = labels; + this.inertia_ = inertia; + return this; + } + + predict(X: Float64Array[]): Int32Array { + if (!this.clusterCenters_) throw new NotFittedError("MiniBatchKMeans is not fitted"); + const k = this.clusterCenters_.length; + const nFeatures = this.clusterCenters_[0]?.length ?? 0; + const labels = new Int32Array(X.length); + for (let i = 0; i < X.length; i++) { + let nearest = 0; + let minDist = Number.POSITIVE_INFINITY; + for (let c = 0; c < k; c++) { + let dist = 0; + for (let j = 0; j < nFeatures; j++) { + dist += ((X[i]?.[j] ?? 0) - (this.clusterCenters_[c]?.[j] ?? 0)) ** 2; + } + if (dist < minDist) { + minDist = dist; + nearest = c; + } + } + labels[i] = nearest; + } + return labels; + } +} + +export interface OPTICSOptions { + minSamples?: number; + maxEps?: number; + metric?: "euclidean" | "manhattan"; + clusterMethod?: "xi" | "dbscan"; + eps?: number; + xi?: number; +} + +export class OPTICS { + minSamples: number; + maxEps: number; + metric: "euclidean" | "manhattan"; + eps: number; + + labels_: Int32Array | null = null; + reachabilityDistances_: Float64Array | null = null; + coreDistances_: Float64Array | null = null; + ordering_: Int32Array | null = null; + + constructor(options: OPTICSOptions = {}) { + this.minSamples = options.minSamples ?? 5; + this.maxEps = options.maxEps ?? Number.POSITIVE_INFINITY; + this.metric = options.metric ?? "euclidean"; + this.eps = options.eps ?? Number.POSITIVE_INFINITY; + } + + private _dist(a: Float64Array, b: Float64Array): number { + if (this.metric === "manhattan") { + let s = 0; + for (let i = 0; i < a.length; i++) s += Math.abs((a[i] ?? 0) - (b[i] ?? 0)); + return s; + } + let s = 0; + for (let i = 0; i < a.length; i++) s += ((a[i] ?? 0) - (b[i] ?? 
0)) ** 2; + return Math.sqrt(s); + } + + fit(X: Float64Array[]): this { + const n = X.length; + // Compute distances + const dists: number[][] = Array.from({ length: n }, () => new Array(n).fill(0)); + for (let i = 0; i < n; i++) { + for (let j = i + 1; j < n; j++) { + const d = this._dist(X[i] ?? new Float64Array(0), X[j] ?? new Float64Array(0)); + dists[i]![j] = d; + dists[j]![i] = d; + } + } + + // Core distances + const coreDists = new Float64Array(n).fill(Number.POSITIVE_INFINITY); + for (let i = 0; i < n; i++) { + const row = dists[i]!.slice().sort((a, b) => a - b); + const kDist = row[this.minSamples - 1] ?? Number.POSITIVE_INFINITY; + if (kDist <= this.maxEps) coreDists[i] = kDist; + } + + // OPTICS ordering + const reachability = new Float64Array(n).fill(Number.POSITIVE_INFINITY); + const processed = new Uint8Array(n); + const ordering: number[] = []; + + // Use simple priority-queue via sorted list + for (let start = 0; start < n; start++) { + if (processed[start]) continue; + + const seeds: Array<{ idx: number; dist: number }> = [{ idx: start, dist: 0 }]; + while (seeds.length > 0) { + seeds.sort((a, b) => a.dist - b.dist); + const { idx } = seeds.shift()!; + if (processed[idx]) continue; + processed[idx] = 1; + ordering.push(idx); + + if (coreDists[idx] === Number.POSITIVE_INFINITY) continue; + for (let j = 0; j < n; j++) { + if (processed[j]) continue; + const d = dists[idx]?.[j] ?? Number.POSITIVE_INFINITY; + const newReach = Math.max(coreDists[idx] ?? Number.POSITIVE_INFINITY, d); + if (newReach < (reachability[j] ?? Number.POSITIVE_INFINITY)) { + reachability[j] = newReach; + seeds.push({ idx: j, dist: newReach }); + } + } + } + } + + this.reachabilityDistances_ = reachability; + this.coreDistances_ = coreDists; + this.ordering_ = new Int32Array(ordering); + + // DBSCAN-style cluster extraction + const eps = this.eps; + const labels = new Int32Array(n).fill(-1); + let clusterId = -1; + for (const idx of ordering) { + if ((reachability[idx] ?? 
Number.POSITIVE_INFINITY) > eps) { + if ((coreDists[idx] ?? Number.POSITIVE_INFINITY) <= eps) { + clusterId++; + labels[idx] = clusterId; + } + } else { + labels[idx] = clusterId; + } + } + + this.labels_ = labels; + return this; + } +} diff --git a/src/cluster/cluster_ext8.ts b/src/cluster/cluster_ext8.ts new file mode 100644 index 0000000..07d9c3d --- /dev/null +++ b/src/cluster/cluster_ext8.ts @@ -0,0 +1,346 @@ +/** + * Additional clustering algorithms: SelfOrganizingMap, FuzzyCMeans, AffinityPropagationExt + * Port of sklearn-compatible clustering extensions + */ + +import { NotFittedError } from "../exceptions.js"; + +export class SelfOrganizingMap { + rows: number; + cols: number; + nFeatures: number; + sigma: number; + learningRate: number; + nIter: number; + randomState: number; + + private weights_: Float64Array[] | null = null; + + constructor(opts: { + rows?: number; + cols?: number; + nFeatures?: number; + sigma?: number; + learningRate?: number; + nIter?: number; + randomState?: number; + } = {}) { + this.rows = opts.rows ?? 10; + this.cols = opts.cols ?? 10; + this.nFeatures = opts.nFeatures ?? 2; + this.sigma = opts.sigma ?? 1.0; + this.learningRate = opts.learningRate ?? 0.5; + this.nIter = opts.nIter ?? 1000; + this.randomState = opts.randomState ?? 
42; + } + + private _rng(seed: number): () => number { + let s = seed; + return () => { + s = (s * 1664525 + 1013904223) & 0xffffffff; + return (s >>> 0) / 0xffffffff; + }; + } + + fit(X: Float64Array[]): this { + const rng = this._rng(this.randomState); + const nNodes = this.rows * this.cols; + this.weights_ = Array.from({ length: nNodes }, () => { + const w = new Float64Array(this.nFeatures); + for (let j = 0; j < this.nFeatures; j++) w[j] = rng() * 2 - 1; + return w; + }); + for (let iter = 0; iter < this.nIter; iter++) { + const t = iter / this.nIter; + const lr = this.learningRate * Math.exp(-t * 5); + const sig = this.sigma * Math.exp(-t * 5); + const xi = X[Math.floor(rng() * X.length)]; + if (!xi) continue; + let bmuIdx = 0; + let bmuDist = Number.POSITIVE_INFINITY; + for (let k = 0; k < nNodes; k++) { + const w = this.weights_[k]; + if (!w) continue; + let d = 0; + for (let j = 0; j < this.nFeatures; j++) d += ((xi[j] ?? 0) - (w[j] ?? 0)) ** 2; + if (d < bmuDist) { bmuDist = d; bmuIdx = k; } + } + const bmuRow = Math.floor(bmuIdx / this.cols); + const bmuCol = bmuIdx % this.cols; + for (let k = 0; k < nNodes; k++) { + const r = Math.floor(k / this.cols); + const c = k % this.cols; + const dist2 = (r - bmuRow) ** 2 + (c - bmuCol) ** 2; + const h = Math.exp(-dist2 / (2 * sig * sig + 1e-15)); + const w = this.weights_[k]; + if (!w) continue; + for (let j = 0; j < this.nFeatures; j++) { + w[j] = (w[j] ?? 0) + lr * h * ((xi[j] ?? 0) - (w[j] ?? 0)); + } + } + } + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (!this.weights_) throw new NotFittedError("SelfOrganizingMap not fitted."); + return X.map(xi => { + const result = new Float64Array(this.weights_!.length); + for (let k = 0; k < this.weights_!.length; k++) { + const w = this.weights_![k]; + let d = 0; + if (w) for (let j = 0; j < this.nFeatures; j++) d += ((xi[j] ?? 0) - (w[j] ?? 
0)) ** 2; + result[k] = Math.sqrt(d); + } + return result; + }); + } + + predict(X: Float64Array[]): Int32Array { + if (!this.weights_) throw new NotFittedError("SelfOrganizingMap not fitted."); + const labels = new Int32Array(X.length); + for (let i = 0; i < X.length; i++) { + const xi = X[i]; + if (!xi) continue; + let bmu = 0; + let bmuDist = Number.POSITIVE_INFINITY; + for (let k = 0; k < this.weights_!.length; k++) { + const w = this.weights_![k]; + let d = 0; + if (w) for (let j = 0; j < this.nFeatures; j++) d += ((xi[j] ?? 0) - (w[j] ?? 0)) ** 2; + if (d < bmuDist) { bmuDist = d; bmu = k; } + } + labels[i] = bmu; + } + return labels; + } +} + +export class FuzzyCMeans { + nClusters: number; + m: number; + maxIter: number; + tol: number; + randomState: number; + + clusterCenters_: Float64Array[] | null = null; + u_: Float64Array[] | null = null; + labels_: Int32Array | null = null; + + constructor(opts: { + nClusters?: number; + m?: number; + maxIter?: number; + tol?: number; + randomState?: number; + } = {}) { + this.nClusters = opts.nClusters ?? 3; + this.m = opts.m ?? 2.0; + this.maxIter = opts.maxIter ?? 150; + this.tol = opts.tol ?? 1e-4; + this.randomState = opts.randomState ?? 42; + } + + private _rng(seed: number): () => number { + let s = seed; + return () => { + s = (s * 1664525 + 1013904223) & 0xffffffff; + return (s >>> 0) / 0xffffffff; + }; + } + + fit(X: Float64Array[]): this { + const n = X.length; + const p = X[0]?.length ?? 0; + const c = this.nClusters; + const rng = this._rng(this.randomState); + let u: Float64Array[] = Array.from({ length: n }, () => { + const row = new Float64Array(c); + let sum = 0; + for (let k = 0; k < c; k++) { row[k] = rng(); sum += row[k] ?? 0; } + for (let k = 0; k < c; k++) row[k] = (row[k] ?? 
0) / (sum + 1e-15); + return row; + }); + + for (let iter = 0; iter < this.maxIter; iter++) { + const centers: Float64Array[] = Array.from({ length: c }, () => new Float64Array(p)); + for (let k = 0; k < c; k++) { + let wSum = 0; + for (let i = 0; i < n; i++) { + const uik = Math.pow(u[i]![k] ?? 0, this.m); + wSum += uik; + const xi = X[i]; + if (!xi) continue; + for (let j = 0; j < p; j++) centers[k]![j] = (centers[k]![j] ?? 0) + uik * (xi[j] ?? 0); + } + for (let j = 0; j < p; j++) centers[k]![j] = (centers[k]![j] ?? 0) / (wSum + 1e-15); + } + const newU: Float64Array[] = Array.from({ length: n }, () => new Float64Array(c)); + for (let i = 0; i < n; i++) { + const xi = X[i]; + const dists = new Float64Array(c); + for (let k = 0; k < c; k++) { + let d = 0; + const ck = centers[k]; + if (xi && ck) for (let j = 0; j < p; j++) d += ((xi[j] ?? 0) - (ck[j] ?? 0)) ** 2; + dists[k] = Math.sqrt(d) + 1e-15; + } + for (let k = 0; k < c; k++) { + let s = 0; + const dk = dists[k] ?? 1; + for (let l = 0; l < c; l++) s += Math.pow(dk / ((dists[l] ?? 1) + 1e-15), 2 / (this.m - 1 + 1e-15)); + newU[i]![k] = 1 / (s + 1e-15); + } + } + let diff = 0; + for (let i = 0; i < n; i++) for (let k = 0; k < c; k++) diff = Math.max(diff, Math.abs((newU[i]![k] ?? 0) - (u[i]![k] ?? 0))); + u = newU; + if (diff < this.tol) break; + void iter; + } + this.u_ = u; + this.clusterCenters_ = Array.from({ length: c }, () => new Float64Array(p)); + for (let k = 0; k < c; k++) { + let wSum = 0; + for (let i = 0; i < n; i++) { + const uik = Math.pow(u[i]![k] ?? 0, this.m); + wSum += uik; + const xi = X[i]; + if (!xi) continue; + for (let j = 0; j < p; j++) this.clusterCenters_[k]![j] = (this.clusterCenters_[k]![j] ?? 0) + uik * (xi[j] ?? 0); + } + for (let j = 0; j < p; j++) this.clusterCenters_[k]![j] = (this.clusterCenters_[k]![j] ?? 
0) / (wSum + 1e-15); + } + this.labels_ = new Int32Array(n); + for (let i = 0; i < n; i++) { + let bestK = 0; + let bestU = -1; + for (let k = 0; k < c; k++) { + if ((u[i]![k] ?? 0) > bestU) { bestU = u[i]![k] ?? 0; bestK = k; } + } + this.labels_[i] = bestK; + } + return this; + } + + predict(X: Float64Array[]): Int32Array { + if (!this.clusterCenters_) throw new NotFittedError("FuzzyCMeans not fitted."); + const labels = new Int32Array(X.length); + for (let i = 0; i < X.length; i++) { + const xi = X[i]; + let bestK = 0; + let bestDist = Number.POSITIVE_INFINITY; + for (let k = 0; k < this.clusterCenters_.length; k++) { + const ck = this.clusterCenters_[k]; + let d = 0; + if (xi && ck) for (let j = 0; j < ck.length; j++) d += ((xi[j] ?? 0) - (ck[j] ?? 0)) ** 2; + if (d < bestDist) { bestDist = d; bestK = k; } + } + labels[i] = bestK; + } + return labels; + } +} + +export class GaussianMixtureExt { + nComponents: number; + maxIter: number; + tol: number; + randomState: number; + + means_: Float64Array[] | null = null; + covs_: Float64Array[][] | null = null; + weights_: Float64Array | null = null; + + constructor(opts: { nComponents?: number; maxIter?: number; tol?: number; randomState?: number } = {}) { + this.nComponents = opts.nComponents ?? 3; + this.maxIter = opts.maxIter ?? 100; + this.tol = opts.tol ?? 1e-3; + this.randomState = opts.randomState ?? 0; + } + + private _gaussPdf(x: Float64Array, mu: Float64Array, cov: Float64Array[]): number { + const p = x.length; + let det = 1; + for (let j = 0; j < p; j++) det *= cov[j]![j] ?? 1; + const norm = Math.pow(2 * Math.PI, p / 2) * Math.sqrt(Math.abs(det) + 1e-15); + let exp = 0; + for (let j = 0; j < p; j++) { + const diff = (x[j] ?? 0) - (mu[j] ?? 0); + exp += diff * diff / ((cov[j]![j] ?? 1) + 1e-15); + } + return Math.exp(-0.5 * exp) / (norm + 1e-15); + } + + fit(X: Float64Array[]): this { + const n = X.length; + const p = X[0]?.length ?? 
1; + const c = this.nComponents; + let rngState = this.randomState; + const rng = () => { rngState = (rngState * 1664525 + 1013904223) & 0xffffffff; return (rngState >>> 0) / 0xffffffff; }; + + this.means_ = Array.from({ length: c }, () => { + const m = new Float64Array(p); + for (let j = 0; j < p; j++) m[j] = rng() * 2 - 1; + return m; + }); + this.covs_ = Array.from({ length: c }, () => Array.from({ length: p }, () => { const r = new Float64Array(p); r[0] = 1; return r; })); + this.weights_ = new Float64Array(c).fill(1 / c); + + for (let iter = 0; iter < this.maxIter; iter++) { + const resp = Array.from({ length: n }, () => new Float64Array(c)); + for (let i = 0; i < n; i++) { + let total = 0; + for (let k = 0; k < c; k++) { + const r = (this.weights_![k] ?? 0) * this._gaussPdf(X[i]!, this.means_![k]!, this.covs_![k]!); + resp[i]![k] = r; + total += r; + } + for (let k = 0; k < c; k++) resp[i]![k] = (resp[i]![k] ?? 0) / (total + 1e-15); + } + const Nk = new Float64Array(c); + for (let i = 0; i < n; i++) for (let k = 0; k < c; k++) Nk[k] = (Nk[k] ?? 0) + (resp[i]![k] ?? 0); + for (let k = 0; k < c; k++) { + const nk = Nk[k] ?? 1; + const mu = new Float64Array(p); + for (let i = 0; i < n; i++) { + const rik = resp[i]![k] ?? 0; + const xi = X[i]; + if (!xi) continue; + for (let j = 0; j < p; j++) mu[j] = (mu[j] ?? 0) + rik * (xi[j] ?? 0); + } + for (let j = 0; j < p; j++) mu[j] = (mu[j] ?? 0) / (nk + 1e-15); + this.means_![k] = mu; + const cov: Float64Array[] = Array.from({ length: p }, () => new Float64Array(p)); + for (let i = 0; i < n; i++) { + const rik = resp[i]![k] ?? 0; + const xi = X[i]; + if (!xi) continue; + for (let j = 0; j < p; j++) { + cov[j]![j] = (cov[j]![j] ?? 0) + rik * ((xi[j] ?? 0) - (mu[j] ?? 0)) ** 2; + } + } + for (let j = 0; j < p; j++) cov[j]![j] = (cov[j]![j] ?? 
0) / (nk + 1e-15) + 1e-6; + this.covs_![k] = cov; + this.weights_![k] = nk / n; + } + void iter; + } + return this; + } + + predict(X: Float64Array[]): Int32Array { + if (!this.means_) throw new NotFittedError("GaussianMixtureExt not fitted."); + const labels = new Int32Array(X.length); + for (let i = 0; i < X.length; i++) { + let best = 0; + let bestScore = -Number.POSITIVE_INFINITY; + for (let k = 0; k < this.nComponents; k++) { + const score = Math.log((this.weights_![k] ?? 0) + 1e-15) + Math.log(this._gaussPdf(X[i]!, this.means_![k]!, this.covs_![k]!) + 1e-15); + if (score > bestScore) { bestScore = score; best = k; } + } + labels[i] = best; + } + return labels; + } +} diff --git a/src/cluster/cluster_ext9.ts b/src/cluster/cluster_ext9.ts new file mode 100644 index 0000000..b0782c3 --- /dev/null +++ b/src/cluster/cluster_ext9.ts @@ -0,0 +1,185 @@ +/** + * Cluster extensions: BIRCH algorithm utilities. + * Port of sklearn.cluster.birch extensions. + */ + +import { NotFittedError } from "../exceptions.js"; + +/** Clustering Feature (CF) node for BIRCH. */ +interface CFEntry { + n: number; + ls: Float64Array; // linear sum + ss: number; // squared sum +} + +function newCFEntry(dim: number): CFEntry { + return { n: 0, ls: new Float64Array(dim), ss: 0 }; +} + +function addToCF(cf: CFEntry, x: Float64Array): void { + cf.n++; + for (let j = 0; j < cf.ls.length; j++) cf.ls[j]! += x[j] ?? 0; + for (let j = 0; j < x.length; j++) cf.ss += (x[j] ?? 0) * (x[j] ?? 0); +} + +function cfCentroid(cf: CFEntry): Float64Array { + const c = new Float64Array(cf.ls.length); + for (let j = 0; j < cf.ls.length; j++) c[j] = cf.n === 0 ? 0 : (cf.ls[j] ?? 0) / cf.n; + return c; +} + +function cfRadius(cf: CFEntry): number { + if (cf.n === 0) return 0; + const centroid = cfCentroid(cf); + let r = 0; + const avgSS = cf.ss / cf.n; + for (let j = 0; j < centroid.length; j++) r += (centroid[j] ?? 0) * (centroid[j] ?? 
/** Euclidean distance between `a` and `b`, iterating over the components of `a`. */
function euclidean(a: Float64Array, b: Float64Array): number {
  let sumSq = 0;
  for (let j = 0; j < a.length; j++) {
    const diff = (a[j] ?? 0) - (b[j] ?? 0);
    sumSq += diff * diff;
  }
  return Math.sqrt(sumSq);
}

/**
 * Simplified BIRCH clustering implementation.
 *
 * Points are absorbed one at a time into the nearest subcluster as long as
 * the resulting subcluster radius stays within `threshold`; otherwise a new
 * subcluster is opened. Subcluster centroids are then grouped by
 * `kMeansLabels` into at most `nClusters` final clusters.
 *
 * Passing `nClusters: null` keeps every subcluster as its own cluster.
 * Omitting the option uses the default of 3.
 *
 * `branchingFactor` is stored but not consulted by this implementation.
 */
export class BirchSimple {
  private subclusterCentroids_: Float64Array[] | null = null;
  /** Final cluster label of each subcluster, computed once in `fit`. */
  private subclusterLabels_: Int32Array | null = null;
  private labels_: Int32Array | null = null;
  readonly threshold: number;
  readonly branchingFactor: number;
  readonly nClusters: number | null;

  constructor(
    options: {
      threshold?: number;
      branchingFactor?: number;
      nClusters?: number | null;
    } = {},
  ) {
    this.threshold = options.threshold ?? 0.5;
    this.branchingFactor = options.branchingFactor ?? 50;
    // `??` would also replace an explicit null, making the documented
    // "no final clustering" mode unreachable; only default when omitted.
    this.nClusters = options.nClusters === undefined ? 3 : options.nClusters;
  }

  /** Builds the subclusters from `X` and labels every training sample. */
  fit(X: Float64Array[]): this {
    const nFeatures = X[0]?.length ?? 0;
    const subclusters: CFEntry[] = [];

    for (const x of X) {
      const target = BirchSimple.closestSubcluster(subclusters, x);
      if (target !== undefined) {
        // Try the insertion on a copy so a rejected point leaves the CF untouched.
        const trial: CFEntry = { n: target.n, ls: new Float64Array(target.ls), ss: target.ss };
        addToCF(trial, x);
        if (cfRadius(trial) <= this.threshold) {
          addToCF(target, x);
          continue;
        }
      }
      const fresh = newCFEntry(nFeatures);
      addToCF(fresh, x);
      subclusters.push(fresh);
    }

    const centroids = subclusters.map((cf) => cfCentroid(cf));
    const nTarget = Math.min(this.nClusters ?? centroids.length, centroids.length);
    const subclusterLabels = kMeansLabels(centroids, nTarget);

    this.subclusterCentroids_ = centroids;
    this.subclusterLabels_ = subclusterLabels;
    this.labels_ = Int32Array.from(
      X,
      (x) => subclusterLabels[BirchSimple.nearestCentroid(centroids, x)] ?? 0,
    );
    return this;
  }

  /**
   * Labels each row of `X` with the final cluster of its nearest subcluster.
   * @throws NotFittedError when called before `fit`.
   */
  predict(X: Float64Array[]): Int32Array {
    const centroids = this.subclusterCentroids_;
    const subclusterLabels = this.subclusterLabels_;
    if (centroids === null || subclusterLabels === null) {
      throw new NotFittedError("BirchSimple is not fitted.");
    }
    return Int32Array.from(
      X,
      (x) => subclusterLabels[BirchSimple.nearestCentroid(centroids, x)] ?? 0,
    );
  }

  /**
   * Labels assigned to the training samples by `fit`.
   * @throws NotFittedError when called before `fit`.
   */
  get labels(): Int32Array {
    if (this.labels_ === null) throw new NotFittedError("BirchSimple is not fitted.");
    return this.labels_;
  }

  /** Subcluster whose centroid is closest to `x`, or undefined when there are none. */
  private static closestSubcluster(subclusters: CFEntry[], x: Float64Array): CFEntry | undefined {
    let best = subclusters[0];
    let bestDist = Number.POSITIVE_INFINITY;
    for (const cf of subclusters) {
      const d = euclidean(cfCentroid(cf), x);
      if (d < bestDist) {
        bestDist = d;
        best = cf;
      }
    }
    return best;
  }

  /** Index of the centroid closest to `x` (0 when no centroid is strictly closer than infinity). */
  private static nearestCentroid(centroids: Float64Array[], x: Float64Array): number {
    let bestK = 0;
    let bestD = Number.POSITIVE_INFINITY;
    for (let k = 0; k < centroids.length; k++) {
      const d = euclidean(x, centroids[k]!);
      if (d < bestD) {
        bestD = d;
        bestK = k;
      }
    }
    return bestK;
  }
}
0; + const newCentroids = Array.from({ length: k }, () => new Float64Array(dim)); + const counts = new Int32Array(k); + for (let i = 0; i < X.length; i++) { + const c = labels[i] ?? 0; + counts[c]!++; + for (let j = 0; j < dim; j++) newCentroids[c]![j]! += X[i]?.[j] ?? 0; + } + for (let c = 0; c < k; c++) { + if ((counts[c] ?? 0) > 0) { + for (let j = 0; j < dim; j++) newCentroids[c]![j]! /= counts[c]!; + centroids[c] = newCentroids[c]!; + } + } + } + return labels; +} diff --git a/src/cluster/cluster_validation.ts b/src/cluster/cluster_validation.ts new file mode 100644 index 0000000..96fe8ed --- /dev/null +++ b/src/cluster/cluster_validation.ts @@ -0,0 +1,268 @@ +/** + * Cluster validation utilities: elbow method, gap statistic, Davies-Bouldin. + * Extends sklearn.cluster with additional validation tools. + */ + +import type { KMeans } from "./kmeans.js"; + +/** + * Elbow method: run KMeans for multiple k values and find the elbow. + */ +export interface ElbowResult { + kValues: number[]; + inertias: number[]; + optimalK: number; +} + +export function elbowMethod( + X: Float64Array[], + kRange: number[] = [2, 3, 4, 5, 6, 7, 8, 9, 10], + KMeansClass: new (opts: { nClusters: number; randomState?: number }) => { + fit(X: Float64Array[]): unknown; + inertia_: number; + }, + randomState?: number +): ElbowResult { + const inertias: number[] = []; + for (const k of kRange) { + const km = new KMeansClass({ nClusters: k, randomState }); + km.fit(X); + inertias.push(km.inertia_); + } + + // Find elbow using maximum curvature (second derivative) + let optimalK = kRange[0] ?? 2; + if (inertias.length >= 3) { + let maxCurvature = -Infinity; + for (let i = 1; i < inertias.length - 1; i++) { + const d1 = (inertias[i - 1] ?? 0) - (inertias[i] ?? 0); + const d2 = (inertias[i] ?? 0) - (inertias[i + 1] ?? 0); + const curvature = d1 - d2; + if (curvature > maxCurvature) { + maxCurvature = curvature; + optimalK = kRange[i] ?? 
/**
 * Gap statistic: compare inertia to reference (uniform) distribution.
 */
export interface GapStatisticResult {
  kValues: number[];
  gaps: number[];
  sks: number[];
  optimalK: number;
}

/**
 * Computes the gap statistic for every k in `kRange`.
 *
 * For each k the estimator is fitted on `X` and on `nRefs` reference data sets
 * drawn uniformly from the bounding box of `X`; the gap is the mean reference
 * `log(inertia)` minus the observed `log(inertia)`.
 *
 * `optimalK` is the smallest k with `gap(k) >= gap(k+1) - s(k+1)`. When no k
 * satisfies the rule (the gap keeps growing), the largest candidate is
 * returned rather than the smallest.
 *
 * @param X - Input samples.
 * @param kRange - Candidate cluster counts, evaluated in the given order.
 * @param KMeansClass - Estimator constructor exposing `fit` and `inertia_`.
 * @param nRefs - Number of reference data sets per k (positive integer).
 * @param randomState - Seed for the reference sampler and for the fit on `X`.
 * @throws RangeError when `nRefs` is not a positive integer.
 */
export function gapStatistic(
  X: Float64Array[],
  kRange: number[] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
  KMeansClass: new (opts: { nClusters: number; randomState?: number }) => {
    fit(X: Float64Array[]): unknown;
    inertia_: number;
  },
  nRefs = 10,
  randomState = 42,
): GapStatisticResult {
  if (!Number.isInteger(nRefs) || nRefs < 1) {
    // Dividing by nRefs below would otherwise fill the result with NaN.
    throw new RangeError("nRefs must be a positive integer");
  }

  const nSamples = X.length;
  const nFeatures = X[0]?.length ?? 0;

  // Bounding box of the data.
  const mins = new Float64Array(nFeatures).fill(Number.POSITIVE_INFINITY);
  const maxs = new Float64Array(nFeatures).fill(Number.NEGATIVE_INFINITY);
  for (const row of X) {
    for (let j = 0; j < nFeatures; j++) {
      const v = row[j] ?? 0;
      if (v < (mins[j] ?? Number.POSITIVE_INFINITY)) mins[j] = v;
      if (v > (maxs[j] ?? Number.NEGATIVE_INFINITY)) maxs[j] = v;
    }
  }

  // Seeded LCG. Dividing by 2^32 (not 2^32 - 1) keeps the output in [0, 1).
  let state = randomState;
  const randFloat = (): number => {
    state = (state * 1664525 + 1013904223) & 0xffffffff;
    return (state >>> 0) / 0x100000000;
  };

  const logInertia = (k: number, data: Float64Array[], seed: number): number => {
    const km = new KMeansClass({ nClusters: k, randomState: seed });
    km.fit(data);
    return Math.log(km.inertia_ + 1e-10);
  };

  const gaps: number[] = [];
  const sks: number[] = [];

  for (const k of kRange) {
    const logW = logInertia(k, X, randomState);

    const refLogWs: number[] = [];
    for (let r = 0; r < nRefs; r++) {
      const Xref: Float64Array[] = [];
      for (let i = 0; i < nSamples; i++) {
        const row = new Float64Array(nFeatures);
        for (let j = 0; j < nFeatures; j++) {
          const lo = mins[j] ?? 0;
          row[j] = lo + randFloat() * ((maxs[j] ?? 1) - lo);
        }
        Xref.push(row);
      }
      refLogWs.push(logInertia(k, Xref, r));
    }

    const meanRefLogW = refLogWs.reduce((s, v) => s + v, 0) / nRefs;
    const variance = refLogWs.reduce((s, v) => s + (v - meanRefLogW) ** 2, 0) / nRefs;
    const sk = Math.sqrt(variance) * Math.sqrt(1 + 1 / nRefs);

    gaps.push(meanRefLogW - logW);
    sks.push(sk);
  }

  // Smallest k such that gap(k) >= gap(k+1) - s(k+1); otherwise the largest k.
  let optimalK = kRange[kRange.length - 1] ?? 1;
  for (let i = 0; i < kRange.length - 1; i++) {
    if ((gaps[i] ?? 0) >= (gaps[i + 1] ?? 0) - (sks[i + 1] ?? 0)) {
      optimalK = kRange[i] ?? 1;
      break;
    }
  }

  return { kValues: kRange, gaps, sks, optimalK };
}
/**
 * Calinski-Harabasz Index (higher is better).
 *
 * Ratio of the between-cluster dispersion (per `k - 1`) to the within-cluster
 * dispersion (per `nSamples - k`). Returns 0 when there are fewer than two
 * distinct labels or when `nSamples <= k`. A zero within-cluster term is
 * replaced by 1e-10 in the denominator.
 */
export function calinskiHarabaszScore(X: Float64Array[], labels: Int32Array): number {
  const nSamples = X.length;
  const nFeatures = X[0]?.length ?? 0;
  const distinct = [...new Set(labels)].sort((a, b) => a - b);
  const k = distinct.length;
  if (k < 2 || nSamples <= k) return 0;

  const slotOf = new Map<number, number>();
  distinct.forEach((label, slot) => slotOf.set(label, slot));
  const slotForSample = (i: number): number => slotOf.get(labels[i] ?? 0) ?? 0;

  const squaredDistance = (a: Float64Array | undefined, b: Float64Array | undefined): number => {
    let total = 0;
    for (let j = 0; j < nFeatures; j++) {
      total += ((a?.[j] ?? 0) - (b?.[j] ?? 0)) ** 2;
    }
    return total;
  };

  // Accumulate the global sum and the per-cluster sums in one sweep.
  const overall = new Float64Array(nFeatures);
  const centers = Array.from({ length: k }, () => new Float64Array(nFeatures));
  const sizes = new Float64Array(k);
  for (let i = 0; i < nSamples; i++) {
    const slot = slotForSample(i);
    const center = centers[slot]!;
    sizes[slot] = (sizes[slot] ?? 0) + 1;
    for (let j = 0; j < nFeatures; j++) {
      const value = X[i]?.[j] ?? 0;
      overall[j] = (overall[j] ?? 0) + value;
      center[j] = (center[j] ?? 0) + value;
    }
  }
  for (let j = 0; j < nFeatures; j++) {
    overall[j] = (overall[j] ?? 0) / nSamples;
  }
  centers.forEach((center, slot) => {
    for (let j = 0; j < nFeatures; j++) {
      center[j] = (center[j] ?? 0) / (sizes[slot] ?? 1);
    }
  });

  // Between-cluster dispersion, weighted by cluster size.
  let between = 0;
  centers.forEach((center, slot) => {
    between += (sizes[slot] ?? 0) * squaredDistance(center, overall);
  });

  // Within-cluster dispersion.
  let within = 0;
  for (let i = 0; i < nSamples; i++) {
    within += squaredDistance(X[i], centers[slotForSample(i)]);
  }

  const withinPerDof = within / (nSamples - k);
  return between / (k - 1) / (withinPerDof || 1e-10);
}
/**
 * Estimate a Mean Shift bandwidth as a quantile of point-to-point distances.
 *
 * Distances are measured from each reference point (all of `X`, or a seeded
 * random subsample of `options.nSamples` points) to every point of `X`,
 * self-distances included. The sorted distances are indexed at
 * `floor(quantile * (count - 1))`.
 *
 * @param X - Input data (n_samples x n_features)
 * @param options.quantile - Quantile of the distances to return (default 0.3)
 * @param options.nSamples - Number of reference points (default: all; clamped to [0, n])
 * @param options.seed - Seed for the subsampling shuffle (default 0)
 * @returns The selected distance, or 1.0 when no distance exists at that index
 *          (for example when `X` is empty).
 */
export function estimateBandwidth(
  X: Float64Array[],
  options: {
    quantile?: number;
    nSamples?: number;
    seed?: number;
  } = {},
): number {
  const { quantile = 0.3, seed = 0 } = options;
  const n = X.length;
  // A negative count would make slice() drop elements from the end instead.
  const nSamples = Math.max(0, Math.min(options.nSamples ?? n, n));

  const order = Array.from({ length: n }, (_, i) => i);
  let references = order;
  if (nSamples < n) {
    let state = seed;
    // Dividing by 2^32 keeps the result strictly below 1, so the shuffle
    // index below can never reach i + 1.
    const rand = (): number => {
      state = (state * 1664525 + 1013904223) & 0xffffffff;
      return (state >>> 0) / 0x100000000;
    };
    // Fisher-Yates shuffle, then keep the first nSamples indices.
    for (let i = n - 1; i > 0; i--) {
      const j = Math.floor(rand() * (i + 1));
      const tmp = order[i]!;
      order[i] = order[j]!;
      order[j] = tmp;
    }
    references = order.slice(0, nSamples);
  }

  // Distances from every reference point to every point of X.
  const dists = new Float64Array(references.length * n);
  let cursor = 0;
  for (const idx of references) {
    const xi = X[idx]!;
    for (let j = 0; j < n; j++) {
      const xj = X[j]!;
      let d2 = 0;
      for (let k = 0; k < xi.length; k++) {
        d2 += ((xi[k] ?? 0) - (xj[k] ?? 0)) ** 2;
      }
      dists[cursor++] = Math.sqrt(d2);
    }
  }

  dists.sort(); // typed arrays sort numerically
  const qIdx = Math.floor(quantile * (dists.length - 1));
  return dists[qIdx] ?? 1.0;
}
/**
 * Find initial seed points for Mean Shift.
 *
 * Points are assigned to cells of a uniform grid with spacing `bandwidth`
 * (each coordinate is divided by `bandwidth` and rounded). Each returned seed
 * is the mean of the points that fell into one cell, in order of first
 * occurrence of the cell.
 *
 * @param X - Input data
 * @param bandwidth - Bin size; must be a positive number
 * @param minBinFreq - Minimum number of points per bin to be included
 * @throws Error when `bandwidth` is not greater than 0 (including NaN)
 */
export function getBinSeeds(
  X: Float64Array[],
  bandwidth: number,
  minBinFreq = 1,
): Float64Array[] {
  // `!(x > 0)` also rejects NaN, which `x <= 0` lets through.
  if (!(bandwidth > 0)) throw new Error("bandwidth must be positive");
  const d = X[0]?.length ?? 0;

  const bins = new Map<string, { sum: Float64Array; count: number }>();
  for (const xi of X) {
    const cell: number[] = [];
    for (let k = 0; k < d; k++) {
      cell.push(Math.round((xi[k] ?? 0) / bandwidth));
    }
    const key = cell.join(",");
    let bin = bins.get(key);
    if (bin === undefined) {
      bin = { sum: new Float64Array(d), count: 0 };
      bins.set(key, bin);
    }
    for (let k = 0; k < d; k++) {
      bin.sum[k] = (bin.sum[k] ?? 0) + (xi[k] ?? 0);
    }
    bin.count++;
  }

  const seeds: Float64Array[] = [];
  for (const { sum, count } of bins.values()) {
    if (count < minBinFreq) continue;
    seeds.push(sum.map((total) => total / count));
  }
  return seeds;
}

/**
 * Find the nearest seed for each point.
 * @returns Int32Array with one seed index per sample; -1 where no seed is at
 *          a finite, comparable distance (for example when `seeds` is empty).
 */
export function assignBins(
  X: Float64Array[],
  seeds: Float64Array[],
): Int32Array {
  return Int32Array.from(X, (xi) => {
    let bestDist = Number.POSITIVE_INFINITY;
    let bestJ = -1;
    seeds.forEach((seed, j) => {
      let d2 = 0;
      for (let k = 0; k < xi.length; k++) {
        d2 += ((xi[k] ?? 0) - (seed[k] ?? 0)) ** 2;
      }
      if (d2 < bestDist) {
        bestDist = d2;
        bestJ = j;
      }
    });
    return bestJ;
  });
}
/**
 * Single iteration of mean-shift update for a set of seeds.
 * Each seed moves to the mean of all points within `bandwidth` of it
 * (flat kernel); a seed with no point in range stays where it is.
 *
 * @param X - Input data
 * @param seeds - Current seed positions (not modified)
 * @param bandwidth - Radius of the flat kernel
 * @param tol - Largest shift still counted as "not moved"
 *              (default `1e-3 * bandwidth`)
 * @returns New seed positions, and `converged`, which is true when every
 *          seed moved by less than `tol`
 */
export function meanShiftStep(
  X: Float64Array[],
  seeds: Float64Array[],
  bandwidth: number,
  tol: number = 1e-3 * bandwidth,
): { newSeeds: Float64Array[]; converged: boolean } {
  const d = X[0]?.length ?? 0;
  const bw2 = bandwidth * bandwidth;
  let maxShift = 0;

  const newSeeds = seeds.map((seed) => {
    const next = new Float64Array(d);
    let inRange = 0;
    for (const xi of X) {
      let d2 = 0;
      for (let k = 0; k < d; k++) {
        d2 += ((xi[k] ?? 0) - (seed[k] ?? 0)) ** 2;
      }
      if (d2 > bw2) continue;
      inRange++;
      for (let k = 0; k < d; k++) next[k] = (next[k] ?? 0) + (xi[k] ?? 0);
    }
    if (inRange > 0) {
      for (let k = 0; k < d; k++) next[k] = (next[k] ?? 0) / inRange;
    } else {
      next.set(seed);
    }

    let shift2 = 0;
    for (let k = 0; k < d; k++) {
      shift2 += ((next[k] ?? 0) - (seed[k] ?? 0)) ** 2;
    }
    maxShift = Math.max(maxShift, Math.sqrt(shift2));
    return next;
  });

  return { newSeeds, converged: maxShift < tol };
}

/**
 * Merge nearby seeds by deduplication within bandwidth distance.
 * Seeds are visited in order; a seed is kept only when it is farther than
 * `bandwidth` from every seed kept so far. The kept seeds are returned as the
 * same array objects that were passed in.
 */
export function mergeSeeds(
  seeds: Float64Array[],
  bandwidth: number,
): Float64Array[] {
  const bw2 = bandwidth * bandwidth;
  const kept: Float64Array[] = [];

  for (const seed of seeds) {
    const duplicate = kept.some((center) => {
      let d2 = 0;
      for (let k = 0; k < seed.length; k++) {
        d2 += ((seed[k] ?? 0) - (center[k] ?? 0)) ** 2;
      }
      return d2 <= bw2;
    });
    if (!duplicate) kept.push(seed);
  }

  return kept;
}
/**
 * Compute cluster labels for X given cluster centers.
 * Each point is assigned to its nearest center (squared Euclidean distance);
 * a point gets -1 when no center is closer than infinity, e.g. when
 * `centers` is empty.
 */
export function clusterLabels(
  X: Float64Array[],
  centers: Float64Array[],
): Int32Array {
  return Int32Array.from(X, (xi) => {
    let best = -1;
    let bestDist = Number.POSITIVE_INFINITY;
    centers.forEach((c, j) => {
      let d2 = 0;
      for (let k = 0; k < xi.length; k++) {
        d2 += ((xi[k] ?? 0) - (c[k] ?? 0)) ** 2;
      }
      if (d2 < bestDist) {
        bestDist = d2;
        best = j;
      }
    });
    return best;
  });
}

/**
 * Compute inertia (within-cluster sum of squared distances to centers).
 *
 * Samples whose label does not index an existing center are skipped. This
 * covers the -1 label that `clusterLabels` produces, and a `labels` array
 * shorter than `X`. Indexing `centers` with such a label would otherwise
 * throw.
 */
export function computeInertia(
  X: Float64Array[],
  centers: Float64Array[],
  labels: Int32Array,
): number {
  let inertia = 0;
  X.forEach((xi, i) => {
    const label = labels[i];
    const c = label === undefined ? undefined : centers[label];
    if (c === undefined) return;
    let d2 = 0;
    for (let k = 0; k < xi.length; k++) {
      d2 += ((xi[k] ?? 0) - (c[k] ?? 0)) ** 2;
    }
    inertia += d2;
  });
  return inertia;
}
/** Mean of column `col` over all rows of `X` (missing entries count as 0). */
function columnMean(X: Float64Array[], col: number): number {
  let total = 0;
  for (const row of X) total += row[col] ?? 0;
  return total / X.length;
}

/**
 * Agglomerative (bottom-up) clustering on columns.
 *
 * The distance between two columns is the absolute difference of their means;
 * the distance between two column groups is the average of those distances
 * over all cross pairs (average linkage). The `_linkage` argument is accepted
 * but not consulted.
 *
 * Column means are computed once up front instead of on every pair
 * comparison in every merge round.
 *
 * @returns An array mapping each column to a cluster index (0-based, numbered
 *          by the smallest column index in each cluster).
 */
function agglomerateCols(
  X: Float64Array[],
  nClusters: number,
  _linkage: string,
): Int32Array {
  const nFeatures = X[0]?.length ?? 0;
  if (nClusters >= nFeatures) {
    return Int32Array.from({ length: nFeatures }, (_, i) => i);
  }

  const means = Float64Array.from({ length: nFeatures }, (_, j) => columnMean(X, j));
  // members[id] lists the columns of cluster `id`; a cluster keeps the id of
  // its smallest column because the higher id is always merged into the lower.
  const members: number[][] = Array.from({ length: nFeatures }, (_, i) => [i]);
  let active = Array.from({ length: nFeatures }, (_, i) => i); // ascending ids

  while (active.length > nClusters) {
    let minDist = Number.POSITIVE_INFINITY;
    let keep = -1;
    let drop = -1;
    for (let ai = 0; ai < active.length; ai++) {
      const ca = active[ai]!;
      const colsA = members[ca] ?? [];
      for (let bi = ai + 1; bi < active.length; bi++) {
        const cb = active[bi]!;
        const colsB = members[cb] ?? [];
        let total = 0;
        let pairs = 0;
        for (const fa of colsA) {
          for (const fb of colsB) {
            total += Math.abs((means[fa] ?? 0) - (means[fb] ?? 0));
            pairs++;
          }
        }
        const d = pairs > 0 ? total / pairs : Number.POSITIVE_INFINITY;
        if (d < minDist) {
          minDist = d;
          keep = ca;
          drop = cb;
        }
      }
    }
    if (keep < 0 || drop < 0) break; // no comparable pair (e.g. NaN distances)

    members[keep] = [...(members[keep] ?? []), ...(members[drop] ?? [])];
    members[drop] = [];
    const removed = drop;
    active = active.filter((id) => id !== removed);
  }

  // Number the surviving clusters 0..k-1 in ascending id order.
  const assignments = new Int32Array(nFeatures);
  active.forEach((id, label) => {
    for (const col of members[id] ?? []) assignments[col] = label;
  });
  return assignments;
}
/** Reduces a non-empty list of values with the given pooling function. */
function poolValues(vals: number[], poolingFunc: "mean" | "median" | "max" | "min"): number {
  switch (poolingFunc) {
    case "mean":
      return vals.reduce((a, b) => a + b, 0) / vals.length;
    case "median": {
      const sorted = [...vals].sort((a, b) => a - b);
      const mid = Math.floor(sorted.length / 2);
      return sorted.length % 2 === 0
        ? ((sorted[mid - 1] ?? 0) + (sorted[mid] ?? 0)) / 2
        : (sorted[mid] ?? 0);
    }
    case "max":
      // Fold instead of spreading: a spread passes one argument per value.
      return vals.reduce((m, v) => Math.max(m, v), Number.NEGATIVE_INFINITY);
    default:
      return vals.reduce((m, v) => Math.min(m, v), Number.POSITIVE_INFINITY);
  }
}

/**
 * Cluster features using hierarchical clustering and pool each group.
 *
 * `fit` groups the columns of `X` with `agglomerateCols`; `transform` replaces
 * each group of columns by a single pooled value per row. The `linkage`
 * option is stored and forwarded to `agglomerateCols`.
 */
export class FeatureAgglomeration extends BaseEstimator {
  nClusters: number;
  poolingFunc: "mean" | "median" | "max" | "min";
  linkage: "ward" | "complete" | "average" | "single";

  /** Cluster index of every fitted column (set by `fit`). */
  labels_!: Int32Array;
  /** Number of distinct clusters found by `fit`. */
  nClusters_!: number;

  constructor(options: FeatureAgglomerationOptions = {}) {
    super();
    this.nClusters = options.nClusters ?? 2;
    this.poolingFunc = options.poolingFunc ?? "mean";
    this.linkage = options.linkage ?? "ward";
  }

  /** Learns the column grouping from `X`. */
  fit(X: Float64Array[]): this {
    this.labels_ = agglomerateCols(X, this.nClusters, this.linkage);
    this.nClusters_ = new Set(this.labels_).size;
    return this;
  }

  /**
   * Pools every column group of each row into one output feature.
   * A group with no value in a (shorter) row yields 0.
   *
   * @throws Error "Not fitted" when called before `fit`.
   * @throws Error when a row has more columns than the fitted data, since the
   *         extra columns belong to no learned group.
   */
  transform(X: Float64Array[]): Float64Array[] {
    if (this.labels_ === undefined) throw new Error("Not fitted");
    const labels = this.labels_;
    const k = this.nClusters_;

    // The column -> group mapping is the same for every row; build it once.
    const columnsOf: number[][] = Array.from({ length: k }, () => []);
    labels.forEach((cid, col) => {
      columnsOf[cid]?.push(col);
    });

    return X.map((row) => {
      if (row.length > labels.length) {
        throw new Error(
          `X has ${row.length} features, but FeatureAgglomeration was fitted with ${labels.length}`,
        );
      }
      const out = new Float64Array(k);
      for (let c = 0; c < k; c++) {
        const vals: number[] = [];
        for (const col of columnsOf[c] ?? []) {
          if (col < row.length) vals.push(row[col] ?? 0);
        }
        out[c] = vals.length === 0 ? 0 : poolValues(vals, this.poolingFunc);
      }
      return out;
    });
  }

  /** Fits on `X` and returns its pooled representation. */
  fitTransform(X: Float64Array[]): Float64Array[] {
    return this.fit(X).transform(X);
  }

  /**
   * Reconstruct original shape from reduced representation: every original
   * column receives the pooled value of its cluster.
   *
   * @throws Error "Not fitted" when called before `fit`.
   */
  inverseTransform(Xred: Float64Array[]): Float64Array[] {
    if (this.labels_ === undefined) throw new Error("Not fitted");
    const labels = this.labels_;
    return Xred.map((row) => Float64Array.from(labels, (cid) => row[cid] ?? 0));
  }
}
false; + this.metric = options.metric ?? "euclidean"; + } + + private _dist(a: Float64Array, b: Float64Array): number { + const p = a.length; + if (this.metric === "manhattan") { + let s = 0; + for (let j = 0; j < p; j++) s += Math.abs((a[j] ?? 0) - (b[j] ?? 0)); + return s; + } + if (this.metric === "chebyshev") { + let s = 0; + for (let j = 0; j < p; j++) s = Math.max(s, Math.abs((a[j] ?? 0) - (b[j] ?? 0))); + return s; + } + let s = 0; + for (let j = 0; j < p; j++) s += ((a[j] ?? 0) - (b[j] ?? 0)) ** 2; + return Math.sqrt(s); + } + + fit(X: Float64Array[]): this { + const n = X.length; + this.nFeatures_ = X[0]?.length ?? 0; + + // Compute pairwise distances + const dists: Float64Array[] = Array.from({ length: n }, () => new Float64Array(n)); + for (let i = 0; i < n; i++) { + for (let j = i + 1; j < n; j++) { + const d = this._dist(X[i]!, X[j]!); + dists[i]![j]! = d; + dists[j]![i]! = d; + } + } + + // Core distances (kth nearest neighbor distance) + const k = Math.min(this.minSamples, n - 1); + const coreDists = new Float64Array(n); + for (let i = 0; i < n; i++) { + const sorted = Array.from(dists[i]!).filter((_, j) => j !== i).sort((a, b) => a - b); + coreDists[i]! = sorted[k - 1] ?? 0; + } + + // Mutual reachability distances + const mrd: Float64Array[] = Array.from({ length: n }, () => new Float64Array(n)); + for (let i = 0; i < n; i++) { + for (let j = 0; j < n; j++) { + if (i === j) continue; + mrd[i]![j]! = Math.max(coreDists[i]!, coreDists[j]!, dists[i]![j]!); + } + } + + // Build MST (Prim's algorithm) + const inMST = new Uint8Array(n); + const minEdge = new Float64Array(n).fill(Number.POSITIVE_INFINITY); + const parent = new Int32Array(n).fill(-1); + minEdge[0]! = 0; + + const edges: Array<[number, number, number]> = []; + for (let step = 0; step < n; step++) { + let u = -1; + for (let i = 0; i < n; i++) { + if (!inMST[i] && (u < 0 || (minEdge[i] ?? 0) < (minEdge[u] ?? 0))) u = i; + } + if (u < 0) break; + inMST[u]! = 1; + if (parent[u]! 
>= 0) edges.push([parent[u]!, u, mrd[parent[u]!]![u]!]); + for (let v = 0; v < n; v++) { + if (!inMST[v] && (mrd[u]![v]! < (minEdge[v] ?? Number.POSITIVE_INFINITY))) { + minEdge[v]! = mrd[u]![v]!; + parent[v]! = u; + } + } + } + + // Sort MST edges by weight + edges.sort((a, b) => (a[2] ?? 0) - (b[2] ?? 0)); + + // Build hierarchy via single-linkage (union-find) + const uf = Array.from({ length: n }, (_, i) => i); + const find = (x: number): number => { + while (uf[x] !== x) { + uf[x]! = uf[uf[x]!]!; + x = uf[x]!; + } + return x; + }; + const clusterSizes = new Int32Array(n).fill(1); + const labels = new Int32Array(n).fill(-1); + + // Simplified flat clustering: use density-based approach + // Group points where edge weight <= threshold + const threshold = this.clusterSelectionEpsilon > 0 + ? this.clusterSelectionEpsilon + : (edges[Math.floor(edges.length * 0.5)]?.[2] ?? 0); + + for (const [u, v, w] of edges) { + if (w <= threshold) { + const pu = find(u); + const pv = find(v); + if (pu !== pv) { + const newSize = (clusterSizes[pu] ?? 1) + (clusterSizes[pv] ?? 1); + if ((clusterSizes[pu] ?? 1) >= (clusterSizes[pv] ?? 1)) { + uf[pv]! = pu; + clusterSizes[pu]! = newSize; + } else { + uf[pu]! = pv; + clusterSizes[pv]! = newSize; + } + } + } + } + + // Assign cluster labels + const rootToCluster = new Map(); + let nextCluster = 0; + for (let i = 0; i < n; i++) { + const root = find(i); + const sz = clusterSizes[root] ?? 1; + if (sz >= this.minClusterSize) { + if (!rootToCluster.has(root)) rootToCluster.set(root, nextCluster++); + labels[i]! = rootToCluster.get(root)!; + } + } + + this.labels_ = labels; + this.probabilities_ = new Float64Array(n).fill(1.0); + // Mark noise points + for (let i = 0; i < n; i++) { + if (labels[i] === -1) this.probabilities_[i]! 
/** Linkage methods supported by the `linkage` function. */
export type LinkageMethod = "single" | "complete" | "average" | "ward" | "centroid" | "median" | "weighted";

/**
 * A single row of a linkage matrix: [idx1, idx2, distance, count].
 *
 * This is the name every function in this module uses in its signature; the
 * interface was previously declared under a doubled name, which left those
 * signatures referring to an undeclared type.
 */
export interface HierarchicalLinkageRow {
  /** Id of the first merged cluster. */
  idx1: number;
  /** Id of the second merged cluster. */
  idx2: number;
  /** Distance at which the two clusters were merged. */
  distance: number;
  /** Number of original observations in the merged cluster. */
  count: number;
}

/**
 * @deprecated Misspelled original export name, kept so existing imports keep
 * compiling. Use {@link HierarchicalLinkageRow}.
 */
export type HierarchicalHierarchicalLinkageRow = HierarchicalLinkageRow;
/**
 * Computes a hierarchical clustering linkage matrix from a condensed distance matrix.
 *
 * Observations are ids `0..n-1`; the cluster created by the i-th merge gets id
 * `n + i`. At every step the closest pair of active clusters is merged and the
 * distances from the new cluster to the remaining ones are derived from the
 * old distances and cluster sizes according to `method`.
 *
 * @param distMatrix Condensed distance matrix (length = n*(n-1)/2 for n observations).
 * @param n          Number of observations.
 * @param method     Linkage method (default "single"). An unrecognised value
 *                   falls back to the "single" update.
 * @returns Array of (n-1) HierarchicalLinkageRow entries in merge order.
 * @throws RangeError when `distMatrix` holds fewer than n*(n-1)/2 entries.
 */
export function linkage(
  distMatrix: Float64Array,
  n: number,
  method: LinkageMethod = "single",
): HierarchicalLinkageRow[] {
  const expected = n > 1 ? (n * (n - 1)) / 2 : 0;
  if (distMatrix.length < expected) {
    // Missing entries would become NaN distances and silently truncate the result.
    throw new RangeError(
      `distMatrix has ${distMatrix.length} entries, expected at least ${expected} for n = ${n}`,
    );
  }

  // Square distance table with room for the n-1 clusters created by merging.
  const maxN = 2 * n;
  const bigD = new Float64Array(maxN * maxN).fill(Number.POSITIVE_INFINITY);
  let k = 0;
  for (let i = 0; i < n; i++) {
    bigD[i * maxN + i] = 0;
    for (let j = i + 1; j < n; j++) {
      const d = distMatrix[k++]!;
      bigD[i * maxN + j] = d;
      bigD[j * maxN + i] = d;
    }
  }

  const sizes = new Float64Array(maxN).fill(1);
  // Ids stay in ascending order: new ids are larger and appended last.
  const active = new Set<number>(Array.from({ length: n }, (_, i) => i));
  // Rounding can push a squared distance slightly below zero.
  const safeSqrt = (v: number): number => Math.sqrt(Math.max(0, v));

  const result: HierarchicalLinkageRow[] = [];
  let nextId = n;

  while (active.size > 1) {
    // Closest pair (a < b); the first pair found wins ties.
    let minDist = Number.POSITIVE_INFINITY;
    let a = -1;
    let b = -1;
    for (const i of active) {
      for (const j of active) {
        if (j <= i) continue;
        const d = bigD[i * maxN + j]!;
        if (d < minDist) {
          minDist = d;
          a = i;
          b = j;
        }
      }
    }
    if (a < 0) break;

    const sA = sizes[a]!;
    const sB = sizes[b]!;
    const sNew = sA + sB;
    const dab = bigD[a * maxN + b]!;
    sizes[nextId] = sNew;

    // Distances from the merged cluster to every other active cluster.
    for (const c of active) {
      if (c === a || c === b) continue;
      const dac = bigD[a * maxN + c]!;
      const dbc = bigD[b * maxN + c]!;
      const sC = sizes[c]!;
      let dNew: number;
      switch (method) {
        case "complete":
          dNew = Math.max(dac, dbc);
          break;
        case "average":
          dNew = (sA * dac + sB * dbc) / sNew;
          break;
        case "ward":
          dNew = safeSqrt(
            ((sA + sC) * dac * dac + (sB + sC) * dbc * dbc - sC * dab * dab) / (sNew + sC),
          );
          break;
        case "centroid":
          dNew = safeSqrt(
            (sA * dac * dac + sB * dbc * dbc) / sNew - (sA * sB * dab * dab) / (sNew * sNew),
          );
          break;
        case "median":
          dNew = safeSqrt(0.5 * dac * dac + 0.5 * dbc * dbc - 0.25 * dab * dab);
          break;
        case "weighted":
          dNew = 0.5 * dac + 0.5 * dbc;
          break;
        default:
          // "single" and any unrecognised method.
          dNew = Math.min(dac, dbc);
      }
      bigD[nextId * maxN + c] = dNew;
      bigD[c * maxN + nextId] = dNew;
    }
    bigD[nextId * maxN + nextId] = 0;

    result.push({ idx1: a, idx2: b, distance: minDist, count: sNew });
    active.delete(a);
    active.delete(b);
    active.add(nextId);
    nextId++;
  }

  return result;
}

/**
 * Cuts a dendrogram at a given number of clusters.
 *
 * Applies the first `n - nClusters` merges of `rows` (clamped to
 * `[0, rows.length]`), so asking for more clusters than observations leaves
 * every observation in its own cluster. Labels are numbered in order of first
 * appearance.
 *
 * @returns An Int32Array of cluster labels (length = n).
 */
export function cutTree(rows: HierarchicalLinkageRow[], n: number, nClusters: number): Int32Array {
  // A negative slice end would count from the back and apply the wrong merges.
  const mergeCount = Math.max(0, Math.min(rows.length, n - nClusters));
  const parent = new Int32Array(2 * n).fill(-1);

  let nextId = n;
  for (const row of rows.slice(0, mergeCount)) {
    parent[row.idx1] = nextId;
    parent[row.idx2] = nextId;
    nextId++;
  }

  const labels = new Int32Array(n);
  const rootLabels = new Map<number, number>();
  for (let i = 0; i < n; i++) {
    // Walk up to the root; stop on ids outside the table as well.
    let cur = i;
    let up = parent[cur] ?? -1;
    while (up !== -1) {
      cur = up;
      up = parent[cur] ?? -1;
    }
    let label = rootLabels.get(cur);
    if (label === undefined) {
      label = rootLabels.size;
      rootLabels.set(cur, label);
    }
    labels[i] = label;
  }
  return labels;
}
/**
 * Cophenetic correlation coefficient of a dendrogram.
 *
 * The cophenetic distance of two leaves is the height (`row.distance`) of the
 * merge that first puts them in the same cluster. The result is the Pearson
 * correlation between those distances and the original `condensed` distances,
 * i.e. how faithfully the dendrogram preserves the pairwise distances.
 *
 * @param rows Linkage rows in merge order; the k-th row (0-based) creates
 *   cluster id n + k.
 * @param condensed Original condensed distances (row-major upper triangle).
 * @param n Number of leaves.
 * @returns Correlation in [-1, 1]; 0 when there are no pairs or either side
 *   has zero variance.
 */
export function copheneticCorr(
  rows: HierarchicalLinkageRow[],
  condensed: Float64Array,
  n: number,
): number {
  // Cophenetic distances, stored in the same condensed layout as the input.
  const cophenetic = new Float64Array((n * (n - 1)) / 2);

  // Leaves contained in every cluster id seen so far.
  const clusterMembers = new Map<number, number[]>();
  for (let i = 0; i < n; i++) clusterMembers.set(i, [i]);

  let nextId = n;
  for (const row of rows) {
    const mA = clusterMembers.get(row.idx1) ?? [];
    const mB = clusterMembers.get(row.idx2) ?? [];
    // Every cross pair (a ∈ A, b ∈ B) is joined for the first time by this merge.
    for (const a of mA) {
      for (const b of mB) {
        const lo = Math.min(a, b);
        const hi = Math.max(a, b);
        // Position of pair (lo, hi) in the condensed layout.
        const idx = lo * n - (lo * (lo + 1)) / 2 + hi - lo - 1;
        cophenetic[idx] = row.distance;
      }
    }
    clusterMembers.set(nextId, mA.concat(mB));
    nextId++;
  }

  // Pearson correlation between the original and the cophenetic distances.
  const m = condensed.length;
  if (m === 0) return 0;

  let mx = 0;
  let my = 0;
  for (let i = 0; i < m; i++) {
    mx += condensed[i]!;
    my += cophenetic[i]!;
  }
  mx /= m;
  my /= m;

  let cov = 0;
  let sx = 0;
  let sy = 0;
  for (let i = 0; i < m; i++) {
    const dx = condensed[i]! - mx;
    const dy = cophenetic[i]! - my;
    cov += dx * dy;
    sx += dx * dx;
    sy += dy * dy;
  }
  const denom = Math.sqrt(sx * sy);
  return denom === 0 ? 0 : cov / denom;
}
new Float64Array(p))); + + for (let c = 1; c < k; c++) { + const dists = X.map((xi) => { + let minD = Number.POSITIVE_INFINITY; + for (const center of centers) { + const d = euclideanSq(xi, center); + if (d < minD) minD = d; + } + return minD; + }); + const totalDist = dists.reduce((a, b) => a + b, 0); + let rand = Math.random() * totalDist; + let selected = 0; + for (let i = 0; i < n; i++) { + rand -= dists[i] ?? 0; + if (rand <= 0) { + selected = i; + break; + } + } + centers.push(new Float64Array(X[selected] ?? new Float64Array(p))); + } + return centers; + } + + private _run( + X: Float64Array[], + k: number, + ): { centers: Float64Array[]; labels: Int32Array; inertia: number } { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + let centers = this._kmeanspp(X, k); + const labels = new Int32Array(n); + + for (let iter = 0; iter < this.maxIter; iter++) { + // Assignment step + for (let i = 0; i < n; i++) { + let minDist = Number.POSITIVE_INFINITY; + let minIdx = 0; + for (let c = 0; c < centers.length; c++) { + const d = euclideanSq(X[i] ?? new Float64Array(p), centers[c] ?? new Float64Array(p)); + if (d < minDist) { + minDist = d; + minIdx = c; + } + } + labels[i] = minIdx; + } + + // Update step + const newCenters: Float64Array[] = Array.from({ length: k }, () => new Float64Array(p)); + const counts = new Int32Array(k); + for (let i = 0; i < n; i++) { + const c = labels[i] ?? 0; + counts[c] = (counts[c] ?? 0) + 1; + const xi = X[i] ?? new Float64Array(p); + const center = newCenters[c] ?? new Float64Array(p); + for (let j = 0; j < p; j++) { + center[j] = (center[j] ?? 0) + (xi[j] ?? 0); + } + } + + let maxShift = 0; + for (let c = 0; c < k; c++) { + const cnt = counts[c] ?? 0; + const center = newCenters[c] ?? new Float64Array(p); + if (cnt > 0) { + for (let j = 0; j < p; j++) { + center[j] = (center[j] ?? 
0) / cnt; + } + } else { + // Re-initialize empty cluster to a random point + const randIdx = Math.floor(Math.random() * n); + newCenters[c] = new Float64Array(X[randIdx] ?? new Float64Array(p)); + } + const shift = euclideanSq(centers[c] ?? new Float64Array(p), newCenters[c] ?? new Float64Array(p)); + if (shift > maxShift) maxShift = shift; + } + centers = newCenters; + if (maxShift < this.tol ** 2) break; + } + + // Compute inertia + let inertia = 0; + for (let i = 0; i < n; i++) { + inertia += euclideanSq(X[i] ?? new Float64Array(p), centers[labels[i] ?? 0] ?? new Float64Array(p)); + } + + return { centers, labels, inertia }; + } + + fit(X: Float64Array[]): this { + const k = Math.min(this.nClusters, X.length); + let best: ReturnType | null = null; + + for (let init = 0; init < this.nInit; init++) { + const result = this._run(X, k); + if (best === null || result.inertia < best.inertia) { + best = result; + } + } + + this.clusterCenters_ = best?.centers ?? []; + this.labels_ = best?.labels ?? new Int32Array(X.length); + this.inertia_ = best?.inertia ?? 0; + return this; + } + + predict(X: Float64Array[]): Int32Array { + if (this.clusterCenters_ === null) throw new NotFittedError("KMeans"); + const centers = this.clusterCenters_; + const p = (centers[0] ?? new Float64Array(0)).length; + return new Int32Array( + X.map((xi) => { + let minDist = Number.POSITIVE_INFINITY; + let minIdx = 0; + for (let c = 0; c < centers.length; c++) { + const d = euclideanSq(xi, centers[c] ?? new Float64Array(p)); + if (d < minDist) { + minDist = d; + minIdx = c; + } + } + return minIdx; + }), + ); + } + + fitPredict(X: Float64Array[]): Int32Array { + this.fit(X); + return this.labels_ as Int32Array; + } + + score(X: Float64Array[]): number { + return -this._computeInertia(X, this.clusterCenters_ ?? []); + } + + private _computeInertia(X: Float64Array[], centers: Float64Array[]): number { + const p = (centers[0] ?? 
/**
 * Density-based clustering (DBSCAN) with brute-force Euclidean neighbour search.
 *
 * A point is a core point when at least `minSamples` points (itself included)
 * lie within `eps` of it. Clusters grow from core points; points reachable
 * only from a core point become border points; everything else is noise (-1).
 */
export class DBSCAN {
  /** Neighbourhood radius. */
  eps: number;
  /** Minimum neighbourhood size (the point itself counts) for a core point. */
  minSamples: number;
  /** Stored for API parity; distances are always Euclidean in this implementation. */
  metric: string;

  /** Cluster label per sample after fitting; -1 marks noise. */
  labels_: Int32Array | null = null;
  /** Indices of core samples, in the order they were discovered. */
  coreIndices_: Int32Array | null = null;

  constructor(
    options: {
      eps?: number;
      minSamples?: number;
      metric?: string;
    } = {},
  ) {
    this.eps = options.eps ?? 0.5;
    this.minSamples = options.minSamples ?? 5;
    this.metric = options.metric ?? "euclidean";
  }

  /**
   * Clusters `X` and returns the label of every row.
   *
   * @param X Data rows.
   * @returns Labels 0…k-1 in order of cluster discovery, -1 for noise. The
   *   same array is stored in `labels_`.
   */
  fitPredict(X: Float64Array[]): Int32Array {
    const UNVISITED = -2;
    const NOISE = -1;

    const n = X.length;
    const { eps, minSamples } = this;
    const labels = new Int32Array(n).fill(UNVISITED);
    const coreIndices: number[] = [];
    let clusterId = 0;

    // All indices (including `idx` itself) within `eps` of point `idx`.
    const neighbors = (idx: number): number[] => {
      const xi = X[idx] ?? new Float64Array(0);
      const result: number[] = [];
      for (let j = 0; j < n; j++) {
        if (euclidean(xi, X[j] ?? new Float64Array(0)) <= eps) result.push(j);
      }
      return result;
    };

    for (let i = 0; i < n; i++) {
      if (labels[i] !== UNVISITED) continue;

      const nb = neighbors(i);
      if (nb.length < minSamples) {
        // Provisional: may later be absorbed as a border point of some cluster.
        labels[i] = NOISE;
        continue;
      }

      coreIndices.push(i);
      labels[i] = clusterId;

      // Breadth-first expansion. A moving head index replaces Array#shift so
      // each dequeue is O(1); visiting order is unchanged.
      const queue = nb.filter((j) => j !== i);
      for (let head = 0; head < queue.length; head++) {
        const j = queue[head]!;
        if (labels[j] === NOISE) {
          // Former noise reachable from a core point: border point, not expanded.
          labels[j] = clusterId;
          continue;
        }
        if (labels[j] !== UNVISITED) continue; // already belongs to a cluster

        labels[j] = clusterId;
        const jNb = neighbors(j);
        if (jNb.length < minSamples) continue; // border point

        coreIndices.push(j);
        for (const k of jNb) {
          if (labels[k] === UNVISITED || labels[k] === NOISE) queue.push(k);
        }
      }
      clusterId++;
    }

    this.labels_ = labels;
    this.coreIndices_ = new Int32Array(coreIndices);
    return labels;
  }

  /** Fits the model on `X`; see {@link DBSCAN.fitPredict}. */
  fit(X: Float64Array[]): this {
    this.fitPredict(X);
    return this;
  }
}
/**
 * Estimates a Mean Shift bandwidth as a quantile of the pairwise Euclidean
 * distances between rows.
 *
 * @param X Data rows.
 * @param options.quantile Fraction of the sorted pairwise distances to pick;
 *   0 gives the smallest distance, 1 the largest. Defaults to 0.3.
 * @param options.nSamples When given, only the first `nSamples` rows take
 *   part, which bounds the quadratic cost on large inputs.
 * @returns The selected distance, or 1 when fewer than two rows are available.
 */
export function estimateBandwidth(
  X: Float64Array[],
  options: { quantile?: number; nSamples?: number } = {},
): number {
  const { quantile = 0.3, nSamples } = options;
  const sample = nSamples !== undefined ? X.slice(0, nSamples) : X;
  const nS = sample.length;

  // Pairwise distances within the (sub)sample only.
  const dists: number[] = [];
  for (let i = 0; i < nS; i++) {
    for (let j = i + 1; j < nS; j++) {
      dists.push(euclidean(sample[i]!, sample[j]!));
    }
  }
  if (dists.length === 0) return 1;

  dists.sort((a, b) => a - b);
  // Clamp so quantile = 1 selects the largest distance instead of running
  // one past the end (which used to fall through to the fallback value).
  const idx = Math.min(dists.length - 1, Math.max(0, Math.floor(quantile * dists.length)));
  return dists[idx] ?? 1;
}
0), 0) / totalW; + } + maxShift = Math.max(maxShift, euclidean(newSeed, seed)); + return newSeed; + }); + seeds = newSeeds; + if (maxShift < 1e-5) break; + } + + // Merge nearby seeds into cluster centers + const centers: Float64Array[] = []; + for (const seed of seeds) { + let merged = false; + for (const center of centers) { + if (euclidean(seed, center) < bw / 2) { + merged = true; + // Update center as mean + for (let j = 0; j < nFeatures; j++) { + center[j] = ((center[j] ?? 0) + (seed[j] ?? 0)) / 2; + } + break; + } + } + if (!merged) centers.push(seed.slice() as Float64Array); + } + + this.clusterCenters_ = centers; + this.labels_ = Int32Array.from({ length: n }, (_, i) => { + let bestC = -1; + let bestD = Number.POSITIVE_INFINITY; + for (let c = 0; c < centers.length; c++) { + const d = euclidean(X[i]!, centers[c]!); + if (d < bestD) { bestD = d; bestC = c; } + } + if (!this.clusterAll && bestD > bw) return -1; + return bestC; + }); + + return this; + } + + predict(X: Float64Array[]): Int32Array { + if (this.clusterCenters_ === null) throw new Error("MeanShiftExt must be fitted first"); + const centers = this.clusterCenters_; + return Int32Array.from(X, (xi) => { + let best = 0; + let bestD = euclidean(xi, centers[0]!); + for (let c = 1; c < centers.length; c++) { + const d = euclidean(xi, centers[c]!); + if (d < bestD) { bestD = d; best = c; } + } + return best; + }); + } +} diff --git a/src/cluster/optics_ext.ts b/src/cluster/optics_ext.ts new file mode 100644 index 0000000..944dcd4 --- /dev/null +++ b/src/cluster/optics_ext.ts @@ -0,0 +1,191 @@ +/** + * OPTICS clustering utility functions β€” ported from sklearn.cluster._optics + * clusterOpticsDbscan, clusterOpticsXi, extractDbscanClustering + */ + +export interface OpticsClusterResult { + /** Cluster labels for each sample (-1 = noise) */ + labels: Int32Array; + /** Number of clusters found (excluding noise) */ + nClusters: number; +} + +/** + * Perform DBSCAN extraction from OPTICS reachability 
/**
 * DBSCAN-style label extraction from an OPTICS ordering.
 *
 * Walks the samples in OPTICS order. A sample whose reachability exceeds
 * `eps` either opens a new cluster (when its core distance is within `eps`)
 * or is noise; a sample whose reachability is within `eps` joins the most
 * recently opened cluster, if any.
 *
 * @param reachabilityDistances Reachability distance per sample index.
 * @param coreDistances Core distance per sample index.
 * @param ordering Sample indices in OPTICS visiting order.
 * @param eps Maximum reachability distance for cluster membership.
 * @returns Labels per sample (clusters are numbered from 1, -1 is noise) and
 *   the number of clusters opened.
 */
export function clusterOpticsDbscan(
  reachabilityDistances: Float64Array,
  coreDistances: Float64Array,
  ordering: Int32Array,
  eps: number,
): OpticsClusterResult {
  const total = reachabilityDistances.length;
  const labels = new Int32Array(total).fill(-1);
  let current = 0; // label of the cluster opened most recently; 0 = none yet

  for (let pos = 0; pos < total; pos++) {
    const sample = ordering[pos] ?? pos;
    const reach = reachabilityDistances[sample] ?? Number.POSITIVE_INFINITY;

    if (reach > eps) {
      // Not reachable from its predecessors: a core point starts a new
      // cluster, anything else stays noise.
      const core = coreDistances[sample] ?? Number.POSITIVE_INFINITY;
      if (core <= eps) {
        current += 1;
        labels[sample] = current;
      }
      continue;
    }

    // Reachable: extend the running cluster when there is one.
    if (current > 0) labels[sample] = current;
  }

  return { labels, nClusters: current };
}
/**
 * Simplified xi-based cluster extraction from OPTICS results.
 *
 * Consecutive positions in the reachability plot whose values change by a
 * factor of at least `1 + xi` are marked as steep "down" or "up" steps. Each
 * down step is paired with the first later up step that encloses at least
 * `minClusterSize` samples; the samples in between form one cluster.
 *
 * @param reachabilityDistances Reachability distance per sample index.
 * @param ordering Sample indices in OPTICS visiting order.
 * @param minSamples Fallback for the minimum cluster size.
 * @param xi Minimum relative steepness (0 < xi < 1).
 * @param minClusterSize Minimum number of samples in a cluster; defaults to
 *   `minSamples`.
 * @returns Labels per sample (clusters are numbered from 1, -1 is noise) and
 *   the number of clusters created.
 */
export function clusterOpticsXi(
  reachabilityDistances: Float64Array,
  ordering: Int32Array,
  minSamples: number,
  xi = 0.05,
  minClusterSize?: number,
): OpticsClusterResult {
  const nSamples = ordering.length;
  const minSize = minClusterSize ?? minSamples;
  const labels = new Int32Array(nSamples).fill(-1);

  // Reachability values arranged in visiting order (the "reachability plot").
  const orderedReach = new Float64Array(nSamples);
  for (let i = 0; i < nSamples; i++) {
    orderedReach[i] = reachabilityDistances[ordering[i] ?? i] ?? Number.POSITIVE_INFINITY;
  }

  interface SteepArea {
    start: number;
    end: number;
    kind: "up" | "down";
  }

  // One-step steep areas between neighbouring plot positions.
  const steepAreas: SteepArea[] = [];
  for (let i = 0; i < nSamples - 1; i++) {
    const r1 = orderedReach[i] ?? 0;
    const r2 = orderedReach[i + 1] ?? 0;
    if (r1 === 0) continue; // ratio undefined

    if (r2 / r1 >= 1 + xi) {
      steepAreas.push({ start: i, end: i + 1, kind: "up" });
    } else if (r2 > 0 && r1 / r2 >= 1 + xi) {
      steepAreas.push({ start: i, end: i + 1, kind: "down" });
    }
  }

  // Pair each down step with the first later up step that spans enough samples.
  let clusterLabel = 0;
  for (let di = 0; di < steepAreas.length; di++) {
    const down = steepAreas[di]!;
    if (down.kind !== "down") continue;

    for (let ui = di + 1; ui < steepAreas.length; ui++) {
      const up = steepAreas[ui]!;
      if (up.kind !== "up") continue;

      const clusterStart = down.end;
      const clusterEnd = up.start;
      // Both ends are labelled below, so the span holds end - start + 1 samples.
      const size = clusterEnd - clusterStart + 1;
      if (size < minSize) continue;

      clusterLabel++;
      for (let i = clusterStart; i <= clusterEnd && i < nSamples; i++) {
        const sampleIdx = ordering[i] ?? i;
        // Samples claimed by an earlier cluster keep their label.
        if (labels[sampleIdx] === -1) labels[sampleIdx] = clusterLabel;
      }
      break;
    }
  }

  return { labels, nClusters: clusterLabel };
}
+ */ +export function reachabilityPlotData( + reachabilityDistances: Float64Array, + ordering: Int32Array, +): { orderIndex: Int32Array; reachDistance: Float64Array } { + const n = ordering.length; + const orderIndex = new Int32Array(n); + const reachDistance = new Float64Array(n); + + for (let i = 0; i < n; i++) { + orderIndex[i] = i; + reachDistance[i] = reachabilityDistances[ordering[i] ?? i] ?? Number.POSITIVE_INFINITY; + } + + return { orderIndex, reachDistance }; +} diff --git a/src/cluster/spectral.ts b/src/cluster/spectral.ts new file mode 100644 index 0000000..4875131 --- /dev/null +++ b/src/cluster/spectral.ts @@ -0,0 +1,549 @@ +/** + * SpectralClustering, MeanShift, Birch, and OPTICS clustering. + * Mirrors sklearn.cluster SpectralClustering, MeanShift, Birch, OPTICS. + */ + +import { NotFittedError } from "../exceptions.js"; + +// ─── SpectralClustering ─────────────────────────────────────────────────────── + +export interface SpectralClusteringOptions { + nClusters?: number; + nInit?: number; + gamma?: number; + affinityType?: "rbf" | "nearest_neighbors"; + nNeighbors?: number; + randomState?: number; +} + +function rbfKernel(a: Float64Array, b: Float64Array, gamma: number): number { + let d = 0; + for (let i = 0; i < a.length; i++) { + d += ((a[i] ?? 0) - (b[i] ?? 0)) ** 2; + } + return Math.exp(-gamma * d); +} + +function computeAffinityMatrix( + X: Float64Array[], + gamma: number, +): Float64Array[] { + const n = X.length; + return X.map((xi, i) => + Float64Array.from(X, (xj, j) => { + if (i === j) return 0; + return rbfKernel(xi as Float64Array, xj as Float64Array, gamma); + }), + ); +} + +function symmetricNormalizedLaplacian(W: Float64Array[]): Float64Array[] { + const n = W.length; + const D = W.map((row) => row.reduce((s, v) => s + v, 0)); + const Dinvhalf = D.map((d) => (d > 0 ? 1 / Math.sqrt(d) : 0)); + return W.map((row, i) => + Float64Array.from(row, (w, j) => (Dinvhalf[i] ?? 0) * w * (Dinvhalf[j] ?? 
/**
 * Lloyd's k-means over the rows of a matrix with deterministic restarts.
 *
 * Each restart seeds a small linear congruential generator from the restart
 * index, draws `k` rows as initial centres, then alternates assignment and
 * centre updates until the labels stop changing or `maxIter` is reached. The
 * labelling with the lowest inertia (sum of squared distances to the assigned
 * centre) wins.
 *
 * @param rows Points to cluster, one per row.
 * @param k Number of clusters.
 * @param maxIter Maximum assignment/update rounds per restart.
 * @param nInit Number of restarts.
 * @returns Cluster index (0…k-1) for every row.
 */
function kmeansOnRows(
  rows: Float64Array[],
  k: number,
  maxIter = 100,
  nInit = 10,
): Int32Array {
  const n = rows.length;
  const d = rows[0]?.length ?? 0;
  let bestLabels = new Int32Array(n);
  let bestInertia = Number.POSITIVE_INFINITY;

  // Deterministic LCG so results are reproducible across runs.
  const rng = { seed: 0 };
  const rand = () => {
    rng.seed = (rng.seed * 1664525 + 1013904223) & 0xffffffff;
    return (rng.seed >>> 0) / 0xffffffff;
  };

  for (let init = 0; init < nInit; init++) {
    rng.seed = init * 1234 + 5678;
    const centers: Float64Array[] = Array.from({ length: k }, () => {
      const idx = Math.floor(rand() * n);
      return Float64Array.from(rows[idx] ?? new Float64Array(d));
    });
    const labels = new Int32Array(n);
    // Scratch accumulators, reused every round.
    const sums: Float64Array[] = Array.from({ length: k }, () => new Float64Array(d));
    const counts = new Int32Array(k);

    for (let iter = 0; iter < maxIter; iter++) {
      // Assignment step: nearest centre, ties go to the lowest index.
      let changed = false;
      for (let i = 0; i < n; i++) {
        const xi = rows[i] as Float64Array;
        let best = 0;
        let bestDist = Number.POSITIVE_INFINITY;
        for (let c = 0; c < k; c++) {
          const cc = centers[c] as Float64Array;
          let dd = 0;
          for (let j = 0; j < d; j++) dd += ((xi[j] ?? 0) - (cc[j] ?? 0)) ** 2;
          if (dd < bestDist) {
            bestDist = dd;
            best = c;
          }
        }
        if (labels[i] !== best) {
          labels[i] = best;
          changed = true;
        }
      }
      // Labels start at 0, so "nothing changed" on the very first round only
      // means every row was nearest to centre 0, not that k-means converged.
      if (!changed && iter > 0) break;

      // Update step: accumulate per-cluster sums away from the centres so an
      // empty cluster can keep its previous centre.
      for (const s of sums) s.fill(0);
      counts.fill(0);
      for (let i = 0; i < n; i++) {
        const c = labels[i] ?? 0;
        counts[c] = (counts[c] ?? 0) + 1;
        const sum = sums[c] as Float64Array;
        const xi = rows[i] as Float64Array;
        for (let j = 0; j < d; j++) sum[j] = (sum[j] ?? 0) + (xi[j] ?? 0);
      }
      for (let c = 0; c < k; c++) {
        const cnt = counts[c] ?? 0;
        if (cnt === 0) continue; // empty cluster: centre stays where it was
        const cc = centers[c] as Float64Array;
        const sum = sums[c] as Float64Array;
        for (let j = 0; j < d; j++) cc[j] = (sum[j] ?? 0) / cnt;
      }
    }

    let inertia = 0;
    for (let i = 0; i < n; i++) {
      const xi = rows[i] as Float64Array;
      const cc = centers[labels[i] ?? 0] as Float64Array;
      for (let j = 0; j < d; j++) inertia += ((xi[j] ?? 0) - (cc[j] ?? 0)) ** 2;
    }
    if (inertia < bestInertia) {
      bestInertia = inertia;
      bestLabels = Int32Array.from(labels);
    }
  }
  return bestLabels;
}
1.0; + } + + fit(X: Float64Array[]): this { + const W = computeAffinityMatrix(X, this.gamma); + this.affinityMatrix_ = W; + const L = symmetricNormalizedLaplacian(W); + const vecs = powerIterationEigenvectors(L, this.nClusters); + const n = X.length; + const k = this.nClusters; + // Assemble rows from eigenvectors + const rows: Float64Array[] = Array.from({ length: n }, (_, i) => { + const row = new Float64Array(k); + for (let c = 0; c < k; c++) { + row[c]! = (vecs[c] as Float64Array)[i] ?? 0; + } + return row; + }); + // Normalize rows to unit norm + for (const row of rows) { + let norm = 0; + for (let j = 0; j < k; j++) norm += (row[j] ?? 0) ** 2; + norm = Math.sqrt(norm) || 1; + for (let j = 0; j < k; j++) row[j]! /= norm; + } + this.labels_ = kmeansOnRows(rows, this.nClusters, 100, this.nInit); + return this; + } + + fitPredict(X: Float64Array[]): Int32Array { + this.fit(X); + return this.labels_ as Int32Array; + } +} + +// ─── MeanShift ──────────────────────────────────────────────────────────────── + +export interface MeanShiftOptions { + bandwidth?: number; + maxIter?: number; + tol?: number; +} + +function gaussianKernelWeight(dist2: number, bandwidth: number): number { + return Math.exp(-dist2 / (2 * bandwidth * bandwidth)); +} + +export class MeanShift { + bandwidth: number; + maxIter: number; + tol: number; + + clusterCenters_: Float64Array[] | null = null; + labels_: Int32Array | null = null; + + constructor(opts: MeanShiftOptions = {}) { + this.bandwidth = opts.bandwidth ?? 1.0; + this.maxIter = opts.maxIter ?? 300; + this.tol = opts.tol ?? 1e-3; + } + + fit(X: Float64Array[]): this { + const n = X.length; + const d = X[0]?.length ?? 
0; + // Initialize one seed per point + const seeds: Float64Array[] = X.map((x) => Float64Array.from(x)); + + for (const seed of seeds) { + for (let iter = 0; iter < this.maxIter; iter++) { + const newSeed = new Float64Array(d); + let totalWeight = 0; + for (const xi of X) { + let dist2 = 0; + for (let j = 0; j < d; j++) dist2 += ((seed[j] ?? 0) - (xi[j] ?? 0)) ** 2; + const w = gaussianKernelWeight(dist2, this.bandwidth); + totalWeight += w; + for (let j = 0; j < d; j++) newSeed[j]! += w * (xi[j] ?? 0); + } + if (totalWeight > 0) { + for (let j = 0; j < d; j++) newSeed[j]! /= totalWeight; + } + let shift = 0; + for (let j = 0; j < d; j++) shift += ((newSeed[j] ?? 0) - (seed[j] ?? 0)) ** 2; + for (let j = 0; j < d; j++) seed[j]! = newSeed[j] ?? 0; + if (Math.sqrt(shift) < this.tol) break; + } + } + + // Merge nearby seeds + const mergedCenters: Float64Array[] = []; + for (const seed of seeds) { + let merged = false; + for (const center of mergedCenters) { + let dist2 = 0; + for (let j = 0; j < d; j++) dist2 += ((seed[j] ?? 0) - (center[j] ?? 0)) ** 2; + if (Math.sqrt(dist2) < this.bandwidth) { merged = true; break; } + } + if (!merged) mergedCenters.push(Float64Array.from(seed)); + } + + this.clusterCenters_ = mergedCenters; + + // Assign labels + const labels = new Int32Array(n); + for (let i = 0; i < n; i++) { + const xi = X[i] as Float64Array; + let best = 0; + let bestDist = Number.POSITIVE_INFINITY; + for (let c = 0; c < mergedCenters.length; c++) { + const cc = mergedCenters[c] as Float64Array; + let dist2 = 0; + for (let j = 0; j < d; j++) dist2 += ((xi[j] ?? 0) - (cc[j] ?? 0)) ** 2; + if (dist2 < bestDist) { bestDist = dist2; best = c; } + } + labels[i]! 
= best; + } + this.labels_ = labels; + return this; + } + + fitPredict(X: Float64Array[]): Int32Array { + this.fit(X); + return this.labels_ as Int32Array; + } + + predict(X: Float64Array[]): Int32Array { + if (!this.clusterCenters_) throw new NotFittedError("MeanShift"); + const n = X.length; + const d = X[0]?.length ?? 0; + const labels = new Int32Array(n); + for (let i = 0; i < n; i++) { + const xi = X[i] as Float64Array; + let best = 0; + let bestDist = Number.POSITIVE_INFINITY; + for (let c = 0; c < this.clusterCenters_.length; c++) { + const cc = this.clusterCenters_[c] as Float64Array; + let dist2 = 0; + for (let j = 0; j < d; j++) dist2 += ((xi[j] ?? 0) - (cc[j] ?? 0)) ** 2; + if (dist2 < bestDist) { bestDist = dist2; best = c; } + } + labels[i]! = best; + } + return labels; + } +} + +// ─── Birch ──────────────────────────────────────────────────────────────────── + +export interface BirchOptions { + threshold?: number; + branchingFactor?: number; + nClusters?: number; +} + +interface CFEntry { + n: number; + ls: Float64Array; + ss: number; +} + +export class Birch { + threshold: number; + branchingFactor: number; + nClusters: number; + + labels_: Int32Array | null = null; + subclusterCenters_: Float64Array[] | null = null; + + constructor(opts: BirchOptions = {}) { + this.threshold = opts.threshold ?? 0.5; + this.branchingFactor = opts.branchingFactor ?? 50; + this.nClusters = opts.nClusters ?? 3; + } + + fit(X: Float64Array[]): this { + const n = X.length; + const d = X[0]?.length ?? 0; + const entries: CFEntry[] = []; + + for (const xi of X) { + let inserted = false; + for (const entry of entries) { + const centroid = Float64Array.from({ length: d }, (_, j) => (entry.ls[j] ?? 0) / entry.n); + let dist2 = 0; + for (let j = 0; j < d; j++) dist2 += ((xi[j] ?? 0) - (centroid[j] ?? 0)) ** 2; + if (Math.sqrt(dist2) <= this.threshold) { + entry.n += 1; + for (let j = 0; j < d; j++) entry.ls[j]! += xi[j] ?? 
0; + entry.ss += xi.reduce((s, v) => s + v * v, 0); + inserted = true; + break; + } + } + if (!inserted) { + entries.push({ n: 1, ls: Float64Array.from(xi), ss: xi.reduce((s, v) => s + v * v, 0) }); + } + } + + const centers: Float64Array[] = entries.map((e) => + Float64Array.from({ length: d }, (_, j) => (e.ls[j] ?? 0) / e.n), + ); + this.subclusterCenters_ = centers; + + // Use k-means on subcluster centers + const k = Math.min(this.nClusters, centers.length); + const subcluLabels = kmeansOnRows(centers, k, 100, 3); + + // Assign original points to the nearest subcluster then to its k-means label + const labels = new Int32Array(n); + for (let i = 0; i < n; i++) { + const xi = X[i] as Float64Array; + let bestIdx = 0; + let bestDist = Number.POSITIVE_INFINITY; + for (let c = 0; c < centers.length; c++) { + const cc = centers[c] as Float64Array; + let dist2 = 0; + for (let j = 0; j < d; j++) dist2 += ((xi[j] ?? 0) - (cc[j] ?? 0)) ** 2; + if (dist2 < bestDist) { bestDist = dist2; bestIdx = c; } + } + labels[i]! = subcluLabels[bestIdx] ?? 0; + } + this.labels_ = labels; + return this; + } + + fitPredict(X: Float64Array[]): Int32Array { + this.fit(X); + return this.labels_ as Int32Array; + } + + predict(X: Float64Array[]): Int32Array { + if (!this.subclusterCenters_) throw new NotFittedError("Birch"); + const n = X.length; + const d = X[0]?.length ?? 0; + const labels = new Int32Array(n); + for (let i = 0; i < n; i++) { + const xi = X[i] as Float64Array; + let bestIdx = 0; + let bestDist = Number.POSITIVE_INFINITY; + for (let c = 0; c < this.subclusterCenters_.length; c++) { + const cc = this.subclusterCenters_[c] as Float64Array; + let dist2 = 0; + for (let j = 0; j < d; j++) dist2 += ((xi[j] ?? 0) - (cc[j] ?? 0)) ** 2; + if (dist2 < bestDist) { bestDist = dist2; bestIdx = c; } + } + labels[i]! 
/** Options accepted by the {@link OPTICS} constructor. */
export interface OPTICSOptions {
  /** Neighbourhood size, counting the point itself, that makes a point a core point. Default 5. */
  minSamples?: number;
  /** Largest distance at which two points still count as neighbours. Default Infinity. */
  maxEps?: number;
  /** Fraction of the largest finite reachability used as the extraction threshold. Default 0.05. */
  xi?: number;
}

/**
 * OPTICS cluster ordering with a simplified, threshold-based label extraction.
 *
 * `fit` builds a dense pairwise distance matrix (O(n^2) memory), so it is only
 * suitable for small datasets.
 */
export class OPTICS {
  minSamples: number;
  maxEps: number;
  xi: number;

  labels_: Int32Array | null = null;
  reachabilityDistances_: Float64Array | null = null;
  coreDistances_: Float64Array | null = null;
  ordering_: Int32Array | null = null;

  constructor(opts: OPTICSOptions = {}) {
    this.minSamples = opts.minSamples ?? 5;
    this.maxEps = opts.maxEps ?? Number.POSITIVE_INFINITY;
    this.xi = opts.xi ?? 0.05;
  }

  /**
   * Compute the ordering, core distances, reachability distances and labels.
   *
   * @param X Samples, one Float64Array per row.
   * @returns This estimator, with the fitted `*_` attributes populated.
   */
  fit(X: Float64Array[]): this {
    const INF = Number.POSITIVE_INFINITY;
    const n = X.length;
    const d = X[0]?.length ?? 0;
    const maxEps = this.maxEps;

    const dist = (a: Float64Array, b: Float64Array): number => {
      let s = 0;
      for (let i = 0; i < d; i++) s += ((a[i] ?? 0) - (b[i] ?? 0)) ** 2;
      return Math.sqrt(s);
    };

    // Compute all pairwise distances (for small datasets)
    const dists: Float64Array[] = Array.from({ length: n }, (_, i) =>
      Float64Array.from({ length: n }, (__, j) =>
        dist(X[i] as Float64Array, X[j] as Float64Array),
      ),
    );

    // Core distance = distance to the minSamples-th nearest neighbour, where the
    // point itself (distance 0, sorted rank 0) counts as the first neighbour.
    // Points whose core distance exceeds maxEps are not core points.
    const coreRank = Math.max(0, this.minSamples - 1);
    const coreDist = new Float64Array(n);
    for (let i = 0; i < n; i++) {
      const sorted = Float64Array.from(dists[i] as Float64Array).sort();
      const cd = sorted[coreRank] ?? INF;
      coreDist[i] = cd <= maxEps ? cd : INF;
    }

    const processed = new Uint8Array(n);
    const queued = new Uint8Array(n); // O(1) "already in seeds" test
    const reachDist = new Float64Array(n).fill(INF);
    const ordering: number[] = [];
    const seeds: number[] = [];

    // Lower the reachability of every unprocessed neighbour of `idx` within maxEps.
    const updateSeeds = (idx: number): void => {
      const cd = coreDist[idx] ?? INF;
      const row = dists[idx] as Float64Array;
      for (let j = 0; j < n; j++) {
        if (processed[j]) continue;
        const dij = row[j] ?? INF;
        if (dij > maxEps) continue;
        const newRD = Math.max(cd, dij);
        if (newRD < (reachDist[j] ?? INF)) {
          reachDist[j] = newRD;
          if (!queued[j]) {
            queued[j] = 1;
            seeds.push(j);
          }
        }
      }
    };

    for (let start = 0; start < n; start++) {
      if (processed[start]) continue;
      processed[start] = 1;
      ordering.push(start);
      if ((coreDist[start] ?? INF) > maxEps) continue;

      updateSeeds(start);
      while (seeds.length > 0) {
        // Pick seed with minimum reachability distance (first one wins on ties)
        let minIdx = 0;
        let minRD = INF;
        for (let s = 0; s < seeds.length; s++) {
          const rd = reachDist[seeds[s] ?? 0] ?? INF;
          if (rd < minRD) {
            minRD = rd;
            minIdx = s;
          }
        }
        const q = seeds[minIdx] ?? 0;
        seeds.splice(minIdx, 1);
        processed[q] = 1;
        ordering.push(q);
        if ((coreDist[q] ?? INF) <= maxEps) updateSeeds(q);
      }
    }

    // Assign labels via xi-cluster extraction (simplified: threshold-based).
    // A point is labelled only when its own reachability is within the threshold;
    // any other point ends the current run and stays noise (-1).
    const labels = new Int32Array(n).fill(-1);
    const maxFiniteReach = reachDist.reduce(
      (mx, v) => Math.max(mx, Number.isFinite(v) ? v : 0),
      0,
    );
    const eps = this.xi * maxFiniteReach;
    let clusterId = 0;
    let currentCluster = -1;
    for (const idx of ordering) {
      const rd = reachDist[idx] ?? INF;
      if (rd <= eps && (coreDist[idx] ?? INF) <= maxEps) {
        if (currentCluster === -1) currentCluster = clusterId++;
        labels[idx] = currentCluster;
      } else {
        currentCluster = -1;
      }
    }

    this.labels_ = labels;
    this.reachabilityDistances_ = reachDist;
    this.coreDistances_ = coreDist;
    this.ordering_ = Int32Array.from(ordering);
    return this;
  }

  /** Fit on `X` and return the resulting labels (-1 marks noise). */
  fitPredict(X: Float64Array[]): Int32Array {
    this.fit(X);
    return this.labels_ as Int32Array;
  }
}
+ * Mirrors scipy.cluster.hierarchy (linkage, fcluster, dendrogram helpers) + * as used within sklearn.cluster.AgglomerativeClustering. + */ + +export interface LinkageRow { + clusterA: number; + clusterB: number; + distance: number; + size: number; +} + +/** Compute the Ward linkage matrix for a dataset (O(n^3) naive implementation). */ +export function wardLinkage(X: Float64Array[]): LinkageRow[] { + const n = X.length; + if (n < 2) return []; + + // Each point starts as its own cluster + const clusterPoints: Map = new Map(); + for (let i = 0; i < n; i++) clusterPoints.set(i, [i]); + + // Current cluster centroids + const centroids: Map = new Map(); + for (let i = 0; i < n; i++) centroids.set(i, new Float64Array(X[i]!)); + + let nextCluster = n; + const result: LinkageRow[] = []; + const activeClusters = new Set(Array.from({ length: n }, (_, i) => i)); + + function centroid(indices: number[]): Float64Array { + const d = X[0]!.length; + const c = new Float64Array(d); + for (const idx of indices) { + const pt = X[idx]!; + for (let j = 0; j < d; j++) c[j]! += pt[j] ?? 0; + } + for (let j = 0; j < d; j++) c[j]! /= indices.length; + return c; + } + + function wardDist(a: number, b: number): number { + const pa = clusterPoints.get(a)!; + const pb = clusterPoints.get(b)!; + const na = pa.length; + const nb = pb.length; + const ca = centroids.get(a)!; + const cb = centroids.get(b)!; + let sq = 0; + for (let j = 0; j < ca.length; j++) { + const diff = (ca[j] ?? 0) - (cb[j] ?? 
/**
 * Flatten a linkage into flat cluster labels (fcluster with criterion='maxclust').
 *
 * The linkage rows are replayed in merge order until only `nClusters` clusters
 * remain, which is the same as undoing the last `nClusters - 1` merges. Row `i`
 * is assumed to create cluster id `nPoints + i`.
 *
 * @param linkage   Merge history, one row per merge, in merge order.
 * @param nClusters Desired number of flat clusters (values below 1 are treated as 1).
 * @param nPoints   Number of original observations.
 * @returns Zero-based labels, numbered by first appearance in point order.
 */
export function fcluster(linkage: LinkageRow[], nClusters: number, nPoints: number): Int32Array {
  const labels = new Int32Array(nPoints);
  if (nClusters >= nPoints) {
    for (let i = 0; i < nPoints; i++) labels[i] = i;
    return labels;
  }

  const k = Math.max(1, Math.floor(nClusters));
  const nMerges = Math.min(linkage.length, nPoints - k);

  // parent[c] is the merged cluster that absorbed c, or -1 while c is still a root.
  const parent = new Int32Array(nPoints + nMerges).fill(-1);
  for (let step = 0; step < nMerges; step++) {
    const row = linkage[step];
    if (!row) continue;
    const merged = nPoints + step;
    for (const child of [row.clusterA, row.clusterB]) {
      // A valid row only references clusters created before it.
      if (child >= 0 && child < merged) parent[child] = merged;
    }
  }

  const labelOfRoot = new Map<number, number>();
  for (let i = 0; i < nPoints; i++) {
    let root = i;
    // Parent ids strictly increase, so this walk always terminates.
    while ((parent[root] ?? -1) >= 0) root = parent[root] as number;
    let label = labelOfRoot.get(root);
    if (label === undefined) {
      label = labelOfRoot.size;
      labelOfRoot.set(root, label);
    }
    labels[i] = label;
  }
  return labels;
}
/**
 * Compute cophenetic distances from a linkage matrix.
 *
 * The cophenetic distance of two points is the `distance` of the first merge
 * that puts them in the same cluster. Pairs that never merge keep distance 0.
 *
 * @param linkage Merge history; row `i` creates cluster id `nPoints + i`.
 * @param nPoints Number of original observations.
 * @returns Row-major symmetric `nPoints x nPoints` matrix.
 */
export function copheneticDistances(linkage: LinkageRow[], nPoints: number): Float64Array {
  const n = nPoints;
  const dist = new Float64Array(n * n);
  // Marks pairs already written so that the first merge wins even if a
  // malformed linkage references the same cluster twice.
  const assigned = new Uint8Array(n * n);

  // members[id] lists the leaves of cluster `id`; merged clusters are appended in row order.
  const members: number[][] = Array.from({ length: n }, (_, i) => [i]);

  for (const row of linkage) {
    // Ids not created yet (or otherwise invalid) contribute no leaves.
    const left = members[row.clusterA] ?? [];
    const right = members[row.clusterB] ?? [];
    for (const a of left) {
      for (const b of right) {
        if (a === b) continue;
        const ab = a * n + b;
        if (assigned[ab]) continue;
        const ba = b * n + a;
        assigned[ab] = 1;
        assigned[ba] = 1;
        dist[ab] = row.distance;
        dist[ba] = row.distance;
      }
    }
    members.push(left.concat(right));
  }
  return dist;
}
/** Column selector predicate: returns true for columns to include. */
export type ColumnSelectorFn = (colIndex: number, colName: string) => boolean;

/** Options for makeColumnSelector. */
export interface MakeColumnSelectorOptions {
  /**
   * String pattern or regex that column names must match (substring match by default).
   * Set to undefined to match all columns.
   */
  pattern?: string | RegExp;
  /**
   * If provided, only include columns whose dtype matches one of these strings.
   * Uses the dtypes map passed to the returned selector.
   * Supported values: "number", "string", "boolean".
   */
  dtypeInclude?: string[];
  /** If provided, exclude columns whose dtype matches one of these. */
  dtypeExclude?: string[];
}

/**
 * Returns a column-selector callable, analogous to sklearn's `make_column_selector`.
 *
 * The returned function accepts `(colNames: string[], dtypes?: Record<string, string>)`
 * and returns an array of column indices that pass the filter criteria.
 * Columns whose dtype is not present in `dtypes` are never rejected by the
 * dtype filters.
 */
export function makeColumnSelector(
  options: MakeColumnSelectorOptions = {},
): (colNames: string[], dtypes?: Record<string, string>) => number[] {
  const { pattern, dtypeInclude, dtypeExclude } = options;

  const nameMatches = (name: string): boolean => {
    if (pattern === undefined) return true;
    if (typeof pattern === "string") return name.includes(pattern);
    // `test` resumes from lastIndex for global/sticky patterns; rewind so every
    // column name is matched from its first character.
    pattern.lastIndex = 0;
    return pattern.test(name);
  };

  return (colNames: string[], dtypes?: Record<string, string>): number[] => {
    const result: number[] = [];
    colNames.forEach((name, i) => {
      if (!nameMatches(name)) return;

      const dtype = dtypes?.[name];
      if (dtype !== undefined) {
        if (dtypeInclude !== undefined && !dtypeInclude.includes(dtype)) return;
        if (dtypeExclude !== undefined && dtypeExclude.includes(dtype)) return;
      }
      result.push(i);
    });
    return result;
  };
}
/**
 * Selects a subset of columns from a flat row-major matrix.
 *
 * @param X        Flat Float64Array of shape (nSamples x nColsIn).
 * @param nSamples Number of rows.
 * @param nColsIn  Number of columns in X.
 * @param cols     Column indices to select.
 * @returns New Float64Array of shape (nSamples x cols.length).
 * @throws RangeError if X is shorter than nSamples * nColsIn, or a column
 *         index is not an integer in [0, nColsIn).
 */
export function selectColumns(
  X: Float64Array,
  nSamples: number,
  nColsIn: number,
  cols: number[],
): Float64Array {
  if (X.length < nSamples * nColsIn) {
    throw new RangeError(
      `selectColumns: X has ${X.length} values, expected at least ${nSamples * nColsIn}`,
    );
  }
  // Without this check an out-of-range index would silently read the next row.
  for (const c of cols) {
    if (!Number.isInteger(c) || c < 0 || c >= nColsIn) {
      throw new RangeError(`selectColumns: column index ${c} is outside [0, ${nColsIn})`);
    }
  }

  const nOut = cols.length;
  const out = new Float64Array(nSamples * nOut);
  for (let i = 0; i < nSamples; i++) {
    const src = i * nColsIn;
    const dst = i * nOut;
    for (let k = 0; k < nOut; k++) {
      out[dst + k] = X[src + (cols[k] as number)] as number;
    }
  }
  return out;
}
/** Minimal transformer contract used by ColumnTransformer. */
export interface Transformer {
  fit(X: Float64Array[]): this;
  transform(X: Float64Array[]): Float64Array[];
  fitTransform?(X: Float64Array[]): Float64Array[];
}

/** A single column index, a list of indices, or "all" for every input column. */
export type ColumnSpec = number | number[] | "all";

/**
 * Applies transformers to column subsets and concatenates the results.
 *
 * Each entry is `[name, transformer, columns]`. `"passthrough"` copies the
 * columns unchanged and `"drop"` removes them. Columns not claimed by any
 * entry are handled according to `remainder` (default `"drop"`).
 */
export class ColumnTransformer {
  transformers: [string, Transformer | "passthrough" | "drop", ColumnSpec][];
  remainder: "passthrough" | "drop";

  transformers_: [string, Transformer | "passthrough", ColumnSpec][] = [];
  private _nFeatures = 0;
  private _allCols = new Set<number>();
  // Tracked explicitly: a fitted instance may legitimately have no active transformers.
  private _fitted = false;

  constructor(
    transformers: [string, Transformer | "passthrough" | "drop", ColumnSpec][],
    options: { remainder?: "passthrough" | "drop" } = {},
  ) {
    this.transformers = transformers;
    this.remainder = options.remainder ?? "drop";
  }

  /** Expand a column spec into explicit column indices. */
  private _getCols(spec: ColumnSpec, nFeatures: number): number[] {
    if (spec === "all") return Array.from({ length: nFeatures }, (_, i) => i);
    if (typeof spec === "number") return [spec];
    return spec;
  }

  /** Copy the given columns of every row; missing values are read as 0. */
  private _take(X: Float64Array[], cols: number[]): Float64Array[] {
    return X.map((row) => new Float64Array(cols.map((c) => row[c] ?? 0)));
  }

  /**
   * Fit every non-drop transformer on its column subset.
   *
   * @param X Samples, one Float64Array per row.
   */
  fit(X: Float64Array[]): this {
    const n = X[0]?.length ?? 0;
    this._nFeatures = n;
    this._allCols.clear();
    this.transformers_ = [];

    for (const [name, t, spec] of this.transformers) {
      const cols = this._getCols(spec, n);
      // Columns claimed by a "drop" entry are consumed too, otherwise
      // remainder="passthrough" would re-emit them.
      for (const c of cols) this._allCols.add(c);
      if (t === "drop") continue;

      if (t !== "passthrough") t.fit(this._take(X, cols));
      this.transformers_.push([name, t, spec]);
    }
    this._fitted = true;
    return this;
  }

  /**
   * Transform `X` and horizontally concatenate the per-transformer outputs,
   * followed by the remainder columns when `remainder` is "passthrough".
   *
   * @throws NotFittedError if `fit` has not been called.
   */
  transform(X: Float64Array[]): Float64Array[] {
    if (!this._fitted) throw new NotFittedError("ColumnTransformer");
    const n = X[0]?.length ?? 0;
    const parts: Float64Array[][] = [];

    for (const [, t, spec] of this.transformers_) {
      const Xsub = this._take(X, this._getCols(spec, n));
      parts.push(t === "passthrough" ? Xsub : t.transform(Xsub));
    }

    if (this.remainder === "passthrough") {
      const remainderCols: number[] = [];
      for (let c = 0; c < n; c++) {
        if (!this._allCols.has(c)) remainderCols.push(c);
      }
      if (remainderCols.length > 0) parts.push(this._take(X, remainderCols));
    }

    // Horizontally concatenate
    return X.map((_, i) => {
      const rowParts = parts.map((p) => p[i] ?? new Float64Array(0));
      const total = rowParts.reduce((s, r) => s + r.length, 0);
      const result = new Float64Array(total);
      let offset = 0;
      for (const part of rowParts) {
        result.set(part, offset);
        offset += part.length;
      }
      return result;
    });
  }

  /** Fit on `X`, then transform the same `X`. */
  fitTransform(X: Float64Array[]): Float64Array[] {
    return this.fit(X).transform(X);
  }
}
/** Invertible target transformer. */
export interface TransformableTarget {
  fit(y: Float64Array): this;
  transform(y: Float64Array): Float64Array;
  inverseTransform(y: Float64Array): Float64Array;
}

/** Minimal regressor contract. */
export interface FittableRegressor {
  fit(X: Float64Array[], y: Float64Array): this;
  predict(X: Float64Array[]): Float64Array;
}

export interface TransformedTargetRegressorOptions {
  /** Regressor trained on the transformed target; a built-in SGD linear model is used when omitted. */
  regressor?: FittableRegressor;
  /** Transformer applied to the target when `func` is not given. */
  transformer?: TransformableTarget;
  /** Forward target transformation; must be paired with `inverseFunc`. */
  func?: (y: Float64Array) => Float64Array;
  /** Inverse of `func`, applied to predictions. */
  inverseFunc?: (y: Float64Array) => Float64Array;
  /** Accepted for API parity; not read by this implementation. */
  checkInverse?: boolean;
}

/**
 * Regressor that is trained on a transformed target and maps its predictions
 * back to the original target space.
 */
export class TransformedTargetRegressor {
  regressor_: FittableRegressor | null = null;
  transformer_: TransformableTarget | null = null;
  func: ((y: Float64Array) => Float64Array) | null;
  inverseFunc: ((y: Float64Array) => Float64Array) | null;

  private regressorOpt: FittableRegressor | null;
  private transformerOpt: TransformableTarget | null;

  constructor(opts: TransformedTargetRegressorOptions = {}) {
    this.regressorOpt = opts.regressor ?? null;
    this.transformerOpt = opts.transformer ?? null;
    this.func = opts.func ?? null;
    this.inverseFunc = opts.inverseFunc ?? null;
  }

  /**
   * Transform `y` (func, else transformer, else identity) and fit the regressor on it.
   *
   * @throws Error if only one of `func` / `inverseFunc` is set, because
   *         predictions could then not be mapped back consistently.
   */
  fit(X: Float64Array[], y: Float64Array): this {
    if ((this.func === null) !== (this.inverseFunc === null)) {
      throw new Error(
        "TransformedTargetRegressor: 'func' and 'inverseFunc' must be provided together",
      );
    }

    // Forget the transformer of a previous fit so predict() never applies a stale inverse.
    this.transformer_ = null;

    let yTrans: Float64Array;
    if (this.func) {
      yTrans = this.func(y);
    } else if (this.transformerOpt) {
      this.transformer_ = this.transformerOpt;
      this.transformer_.fit(y);
      yTrans = this.transformer_.transform(y);
    } else {
      // Default: identity
      yTrans = Float64Array.from(y);
    }

    const reg = this.regressorOpt ?? createDefaultRegressor();
    this.regressor_ = reg;
    reg.fit(X, yTrans);
    return this;
  }

  /**
   * Predict in the original target space.
   *
   * @throws NotFittedError if `fit` has not been called.
   */
  predict(X: Float64Array[]): Float64Array {
    if (!this.regressor_) throw new NotFittedError("TransformedTargetRegressor");
    const predsTrans = this.regressor_.predict(X);

    if (this.inverseFunc) return this.inverseFunc(predsTrans);
    if (this.transformer_) return this.transformer_.inverseTransform(predsTrans);
    return predsTrans;
  }

  /** Coefficient of determination R^2 of the predictions; 1 when `y` has zero variance. */
  score(X: Float64Array[], y: Float64Array): number {
    const preds = this.predict(X);
    const mean = y.reduce((s, v) => s + v, 0) / y.length;
    let ssRes = 0;
    let ssTot = 0;
    for (let i = 0; i < y.length; i++) {
      ssRes += ((y[i] ?? 0) - (preds[i] ?? 0)) ** 2;
      ssTot += ((y[i] ?? 0) - mean) ** 2;
    }
    return ssTot === 0 ? 1 : 1 - ssRes / ssTot;
  }
}

/** Fallback linear regressor trained with 200 epochs of per-sample SGD (learning rate 0.01). */
function createDefaultRegressor(): FittableRegressor {
  let coef: Float64Array | null = null;
  let intercept = 0;
  return {
    fit(X: Float64Array[], y: Float64Array) {
      const n = X.length;
      const d = X[0]?.length ?? 0;
      const w = new Float64Array(d);
      coef = w;
      intercept = 0; // every fit starts from scratch, like the weights
      const lr = 0.01;
      for (let iter = 0; iter < 200; iter++) {
        for (let i = 0; i < n; i++) {
          const xi = X[i] as Float64Array;
          let pred = intercept;
          for (let j = 0; j < d; j++) pred += (w[j] ?? 0) * (xi[j] ?? 0);
          const err = (y[i] ?? 0) - pred;
          intercept += lr * err;
          for (let j = 0; j < d; j++) w[j] = (w[j] ?? 0) + lr * err * (xi[j] ?? 0);
        }
      }
      return this;
    },
    predict(X: Float64Array[]) {
      return Float64Array.from(X, (xi) => {
        let pred = intercept;
        for (let j = 0; j < xi.length; j++) pred += (coef?.[j] ?? 0) * (xi[j] ?? 0);
        return pred;
      });
    },
  };
}
/** Compute column means of X. */
function colMeans(X: Float64Array[]): Float64Array {
  const p = X[0]?.length ?? 0;
  const n = X.length;
  const means = new Float64Array(p);
  for (const xi of X) {
    for (let j = 0; j < p; j++) means[j] = (means[j] ?? 0) + (xi[j] ?? 0);
  }
  for (let j = 0; j < p; j++) means[j] = (means[j] ?? 0) / n;
  return means;
}

/** Compute empirical covariance matrix (biased, i.e. divided by n) around `means`. */
function empCov(X: Float64Array[], means: Float64Array): Float64Array[] {
  const n = X.length;
  const p = means.length;
  const C = Array.from({ length: p }, () => new Float64Array(p));
  // Accumulate the upper triangle only, then normalise and mirror it.
  for (const xi of X) {
    for (let i = 0; i < p; i++) {
      const di = (xi[i] ?? 0) - (means[i] ?? 0);
      const Ci = C[i] as Float64Array;
      for (let j = i; j < p; j++) {
        Ci[j] = (Ci[j] ?? 0) + di * ((xi[j] ?? 0) - (means[j] ?? 0));
      }
    }
  }
  for (let i = 0; i < p; i++) {
    const Ci = C[i] as Float64Array;
    for (let j = i; j < p; j++) {
      const v = (Ci[j] ?? 0) / n;
      Ci[j] = v;
      (C[j] as Float64Array)[i] = v;
    }
  }
  return C;
}

/**
 * Maximum likelihood covariance estimator.
 * Mirrors sklearn.covariance.EmpiricalCovariance.
 *
 * `score` and `mahalanobis` only use the diagonal of the fitted covariance.
 */
export class EmpiricalCovariance {
  assumeCentered: boolean;

  location_: Float64Array | null = null;
  covariance_: Float64Array[] | null = null;

  constructor(options: { assumeCentered?: boolean } = {}) {
    this.assumeCentered = options.assumeCentered ?? false;
  }

  /** Estimate the location (zeros when `assumeCentered`) and the biased covariance of X. */
  fit(X: Float64Array[]): this {
    const p = X[0]?.length ?? 0;
    this.location_ = this.assumeCentered ? new Float64Array(p) : colMeans(X);
    this.covariance_ = empCov(X, this.location_);
    return this;
  }

  /**
   * Diagonal-only Gaussian log-likelihood style score of X: `-(n * logdet + trace) / 2`,
   * with both terms computed from the covariance diagonal.
   *
   * @throws NotFittedError if `fit` has not been called.
   */
  score(X: Float64Array[]): number {
    const cov = this.covariance_;
    const loc = this.location_;
    if (cov === null || loc === null) throw new NotFittedError("EmpiricalCovariance");
    const n = X.length;
    const p = X[0]?.length ?? 0;

    // Approximate log-det via the covariance diagonal
    let logdet = 0;
    for (let i = 0; i < p; i++) {
      logdet += Math.log(Math.abs(cov[i]?.[i] ?? 1) + 1e-12);
    }

    let trace = 0;
    for (const xi of X) {
      for (let j = 0; j < p; j++) {
        const diff = (xi[j] ?? 0) - (loc[j] ?? 0);
        // A zero or missing variance is replaced by a tiny floor to avoid dividing by 0.
        trace += diff ** 2 / ((cov[j]?.[j] ?? 1e-12) || 1e-12);
      }
    }
    return -(n * logdet + trace) / 2;
  }

  /**
   * Per-sample distance to the fitted location, scaled by the covariance diagonal
   * (off-diagonal terms are ignored). Returns the square root of the scaled sum.
   *
   * @throws NotFittedError if `fit` has not been called.
   */
  mahalanobis(X: Float64Array[]): Float64Array {
    const cov = this.covariance_;
    const loc = this.location_;
    if (cov === null || loc === null) throw new NotFittedError("EmpiricalCovariance");
    const p = X[0]?.length ?? 0;
    const dists = new Float64Array(X.length);
    for (let idx = 0; idx < X.length; idx++) {
      const xi = X[idx] ?? new Float64Array(p);
      let d = 0;
      for (let j = 0; j < p; j++) {
        const diff = (xi[j] ?? 0) - (loc[j] ?? 0);
        d += diff ** 2 / ((cov[j]?.[j] ?? 1e-12) || 1e-12);
      }
      dists[idx] = Math.sqrt(d);
    }
    return dists;
  }
}

/**
 * Covariance estimator with shrinkage.
 * Mirrors sklearn.covariance.ShrunkCovariance:
 * `(1 - shrinkage) * S + shrinkage * mu * I`, with `mu = trace(S) / p`.
 */
export class ShrunkCovariance extends EmpiricalCovariance {
  shrinkage: number;

  constructor(options: { assumeCentered?: boolean; shrinkage?: number } = {}) {
    super(options);
    this.shrinkage = options.shrinkage ?? 0.1;
  }

  override fit(X: Float64Array[]): this {
    super.fit(X);
    const cov = this.covariance_;
    if (cov === null) return this;

    const p = cov.length;
    let trace = 0;
    for (let i = 0; i < p; i++) trace += cov[i]?.[i] ?? 0;
    const mu = p > 0 ? trace / p : 0;

    const keep = 1 - this.shrinkage;
    for (let i = 0; i < p; i++) {
      const row = cov[i] as Float64Array;
      for (let j = 0; j < p; j++) {
        // The diagonal is pulled towards mu; off-diagonals are pulled towards 0.
        row[j] = keep * (row[j] ?? 0) + (i === j ? this.shrinkage * mu : 0);
      }
    }
    return this;
  }
}
/**
 * Simplified Ledoit-Wolf style shrinkage estimator.
 * Mirrors the interface of sklearn.covariance.LedoitWolf.
 *
 * The shrinkage is `beta / delta`, clamped to [0, 1], where
 * `delta = ||S||_F^2` and `beta = (||S||_F^2 - trace(S)^2 / p) / (n * p)`.
 */
export class LedoitWolf extends EmpiricalCovariance {
  /** Stored for API parity; not read by `fit`. */
  blockSize: number;

  shrinkage_: number | null = null;

  constructor(options: { assumeCentered?: boolean; blockSize?: number } = {}) {
    super(options);
    this.blockSize = options.blockSize ?? 1000;
  }

  override fit(X: Float64Array[]): this {
    super.fit(X);
    const cov = this.covariance_;
    if (cov === null) return this;

    const n = X.length;
    const p = X[0]?.length ?? 0;
    if (p === 0) {
      // Nothing to shrink; avoids 0/0 below.
      this.shrinkage_ = 0;
      return this;
    }

    let mu = 0;
    for (let i = 0; i < p; i++) mu += cov[i]?.[i] ?? 0;
    mu /= p;

    let delta = 0;
    for (let i = 0; i < p; i++) {
      for (let j = 0; j < p; j++) delta += (cov[i]?.[j] ?? 0) ** 2;
    }

    const traceS = p * mu;
    const beta = (1 / (n * p)) * (delta - traceS ** 2 / p);
    // delta === 0 means the covariance is all zeros: shrinking changes nothing,
    // and beta / delta would be NaN.
    const alpha = delta > 0 ? Math.max(0, Math.min(1, beta / delta)) : 0;
    this.shrinkage_ = alpha;

    for (let i = 0; i < p; i++) {
      const row = cov[i] as Float64Array;
      for (let j = 0; j < p; j++) {
        row[j] = (1 - alpha) * (row[j] ?? 0) + (i === j ? alpha * mu : 0);
      }
    }
    return this;
  }
}

/**
 * Oracle Approximating Shrinkage estimator.
 * Mirrors sklearn.covariance.OAS.
 *
 * Shrinkage: `((1 - 2/p) tr(S^2) + tr(S)^2) / ((n + 1 - 2/p) (tr(S^2) - tr(S)^2 / p))`,
 * clamped to [0, 1].
 */
export class OAS extends EmpiricalCovariance {
  shrinkage_: number | null = null;

  override fit(X: Float64Array[]): this {
    super.fit(X);
    const cov = this.covariance_;
    if (cov === null) return this;

    const n = X.length;
    const p = X[0]?.length ?? 0;
    if (p === 0) {
      this.shrinkage_ = 0;
      return this;
    }

    let trS = 0;
    let trS2 = 0;
    for (let i = 0; i < p; i++) {
      trS += cov[i]?.[i] ?? 0;
      for (let j = 0; j < p; j++) trS2 += (cov[i]?.[j] ?? 0) ** 2;
    }
    const mu = trS / p;

    const num = (1 - 2 / p) * trS2 + trS ** 2;
    const den = (n + 1 - 2 / p) * (trS2 - trS ** 2 / p);
    // den vanishes when S is already a multiple of the identity (always for p = 1);
    // full shrinkage is the limit there, and num / den would be NaN or Infinity.
    const rho = den > 0 ? Math.max(0, Math.min(1, num / den)) : 1;
    this.shrinkage_ = rho;

    for (let i = 0; i < p; i++) {
      const row = cov[i] as Float64Array;
      for (let j = 0; j < p; j++) {
        row[j] = (1 - rho) * (row[j] ?? 0) + (i === j ? rho * mu : 0);
      }
    }
    return this;
  }
}
/**
 * Ledoit-Wolf style shrinkage estimator that also exposes the precision matrix.
 *
 * `covariance_ = (1 - rho) * S + rho * mu * I`, with `mu = trace(S) / p` and
 * `rho = min(beta, delta) / delta`, where `delta = ||S - mu I||_F^2 / p` and
 * `beta = ||S||_F^2 / (n * p)`.
 */
export class LedoitWolfCovariance {
  covariance_: Float64Array[] = [];
  shrinkage_ = 0;
  precision_: Float64Array[] = [];

  /** Estimate the shrunk covariance and its inverse from the rows of X. */
  fit(X: Float64Array[]): this {
    const n = X.length;
    const p = X[0]?.length ?? 1;
    const emp = this._empiricalCovariance(X);

    let trS = 0;
    let sumSq = 0; // ||S||_F^2
    let delta = 0; // ||S - mu I||_F^2 / p, finished below
    for (let i = 0; i < p; i++) trS += emp[i]?.[i] ?? 0;
    const mu = trS / p;
    for (let i = 0; i < p; i++) {
      for (let j = 0; j < p; j++) {
        const v = emp[i]?.[j] ?? 0;
        sumSq += v ** 2;
        delta += i === j ? (v - mu) ** 2 : v ** 2;
      }
    }
    delta /= p;
    const beta = n > 0 ? (1 / (n * p)) * sumSq : 0;

    // Shrinkage intensity is beta / delta, capped at 1 and never negative.
    // delta === 0 means S already equals mu * I, so no shrinkage is needed.
    const rho = delta > 0 ? Math.max(0, Math.min(beta, delta) / delta) : 0;
    this.shrinkage_ = rho;
    this.covariance_ = emp.map(
      (row, i) => new Float64Array(row.map((v, j) => (1 - rho) * v + (i === j ? rho * mu : 0))),
    );
    this.precision_ = this._invertMatrix(this.covariance_);
    return this;
  }

  /** Biased (divide-by-n) sample covariance of the rows of X. */
  private _empiricalCovariance(X: Float64Array[]): Float64Array[] {
    const n = X.length;
    const p = X[0]?.length ?? 1;
    const mean = new Float64Array(p);
    for (const x of X) {
      for (let f = 0; f < p; f++) mean[f] = (mean[f] ?? 0) + (x[f] ?? 0) / n;
    }
    const cov: Float64Array[] = Array.from({ length: p }, () => new Float64Array(p));
    for (const x of X) {
      for (let i = 0; i < p; i++) {
        const row = cov[i] as Float64Array;
        const di = (x[i] ?? 0) - (mean[i] ?? 0);
        for (let j = 0; j < p; j++) {
          row[j] = (row[j] ?? 0) + (di * ((x[j] ?? 0) - (mean[j] ?? 0))) / n;
        }
      }
    }
    return cov;
  }

  /**
   * Gauss-Jordan inverse with partial pivoting. Columns whose best pivot is
   * below 1e-10 are skipped, so the result is only meaningful for
   * well-conditioned input.
   */
  private _invertMatrix(M: Float64Array[]): Float64Array[] {
    const n = M.length;
    const A = M.map((row) => new Float64Array(row));
    const inv = Array.from({ length: n }, (_, i) => {
      const row = new Float64Array(n);
      row[i] = 1;
      return row;
    });
    for (let col = 0; col < n; col++) {
      // Bring the largest remaining entry of this column onto the diagonal.
      let pivotRow = col;
      for (let row = col + 1; row < n; row++) {
        if (Math.abs(A[row]?.[col] ?? 0) > Math.abs(A[pivotRow]?.[col] ?? 0)) pivotRow = row;
      }
      if (pivotRow !== col) {
        const a = A[col] as Float64Array;
        A[col] = A[pivotRow] as Float64Array;
        A[pivotRow] = a;
        const b = inv[col] as Float64Array;
        inv[col] = inv[pivotRow] as Float64Array;
        inv[pivotRow] = b;
      }

      const Acol = A[col] as Float64Array;
      const invCol = inv[col] as Float64Array;
      const pivot = Acol[col] ?? 1e-10;
      if (Math.abs(pivot) < 1e-10) continue;
      for (let j = 0; j < n; j++) {
        Acol[j] = (Acol[j] ?? 0) / pivot;
        invCol[j] = (invCol[j] ?? 0) / pivot;
      }
      for (let row = 0; row < n; row++) {
        if (row === col) continue;
        const Arow = A[row] as Float64Array;
        const invRow = inv[row] as Float64Array;
        const f = Arow[col] ?? 0;
        for (let j = 0; j < n; j++) {
          Arow[j] = (Arow[j] ?? 0) - f * (Acol[j] ?? 0);
          invRow[j] = (invRow[j] ?? 0) - f * (invCol[j] ?? 0);
        }
      }
    }
    return inv;
  }
}
/**
 * Biased (divide-by-n) sample covariance of the rows of `X`.
 *
 * @param X Samples as rows.
 * @param p Number of features to read from each row.
 * @returns A `p x p` matrix; all zeros when `X` has no rows.
 */
function biasedSampleCovariance(X: Float64Array[], p: number): Float64Array[] {
  const n = X.length;
  const S = Array.from({ length: p }, () => new Float64Array(p));
  if (n === 0) return S;

  const mean = new Float64Array(p);
  for (const xi of X) {
    for (let j = 0; j < p; j++) mean[j] = (mean[j] ?? 0) + (xi[j] ?? 0);
  }
  for (let j = 0; j < p; j++) mean[j] = (mean[j] ?? 0) / n;

  for (const xi of X) {
    for (let j = 0; j < p; j++) {
      const dj = (xi[j] ?? 0) - (mean[j] ?? 0);
      const row = S[j]!;
      for (let k = 0; k < p; k++) row[k] = (row[k] ?? 0) + dj * ((xi[k] ?? 0) - (mean[k] ?? 0));
    }
  }
  for (const row of S) {
    for (let k = 0; k < p; k++) row[k] = (row[k] ?? 0) / n;
  }
  return S;
}

/** Trace of `S` and trace of `S^2` (the squared Frobenius norm for symmetric `S`). */
function traceStats(S: Float64Array[]): { trace: number; frobSq: number } {
  let trace = 0;
  let frobSq = 0;
  for (let j = 0; j < S.length; j++) {
    const row = S[j]!;
    trace += row[j] ?? 0;
    for (let k = 0; k < row.length; k++) frobSq += (row[k] ?? 0) ** 2;
  }
  return { trace, frobSq };
}

/**
 * Ledoit-Wolf-type analytical shrinkage coefficient.
 *
 * Evaluates
 *   ((n - 2) / n * tr(S^2) + tr(S)^2) / ((n + 2) * (tr(S^2) - tr(S)^2 / p))
 * on the biased sample covariance and clamps the result to [0, 1].
 *
 * @param X Samples as rows.
 * @returns 0 when fewer than two samples or no features are given; 1 when the
 *   denominator vanishes (S is already a multiple of the identity).
 */
export function ledoitWolfShrinkage(X: Float64Array[]): number {
  const n = X.length;
  const p = X[0]?.length ?? 0;
  if (n <= 1 || p === 0) return 0;

  const { trace, frobSq } = traceStats(biasedSampleCovariance(X, p));
  const traceSq = trace ** 2;
  const num = ((n - 2) / n) * frobSq + traceSq;
  const den = (n + 2) * (frobSq - traceSq / p);
  return den > 0 ? Math.min(1, Math.max(0, num / den)) : 1;
}

/**
 * Oracle Approximating Shrinkage (OAS) coefficient.
 *
 *   rho = ((1 - 2/p) * tr(S^2) + tr(S)^2) / ((n + 1 - 2/p) * (tr(S^2) - tr(S)^2 / p))
 *
 * clamped to [0, 1]. The `(n + 1 - 2/p)` factor appears exactly once in the
 * denominator.
 *
 * @param X Samples as rows.
 * @returns 0 when fewer than two samples or no features are given; 1 when the
 *   denominator vanishes.
 */
export function oasShrinkage(X: Float64Array[]): number {
  const n = X.length;
  const p = X[0]?.length ?? 0;
  if (n <= 1 || p === 0) return 0;

  const { trace, frobSq } = traceStats(biasedSampleCovariance(X, p));
  const num = (1 - 2 / p) * frobSq + trace ** 2;
  const den = (n + 1 - 2 / p) * (frobSq - trace ** 2 / p);
  return den > 0 ? Math.min(1, Math.max(0, num / den)) : 1;
}

/**
 * Shrink the sample covariance toward a scaled identity:
 * `Sigma = (1 - shrinkage) * S + shrinkage * mu * I` with `mu = trace(S) / p`.
 *
 * @param X Samples as rows.
 * @param shrinkage Weight of the identity target.
 * @returns The shrunk `p x p` matrix (empty when `X` has no features).
 */
export function shrunkCovariance(
  X: Float64Array[],
  shrinkage: number,
): Float64Array[] {
  const p = X[0]?.length ?? 0;
  if (p === 0) return [];

  const S = biasedSampleCovariance(X, p);
  const mu = traceStats(S).trace / p;
  return S.map((row, j) =>
    row.map((v, k) => (1 - shrinkage) * v + (j === k ? shrinkage * mu : 0)),
  );
}

/**
 * Frobenius distance between two covariance matrices.
 * Rows missing from either matrix are skipped; missing entries count as 0.
 */
export function covarianceFrobeniusDistance(A: Float64Array[], B: Float64Array[]): number {
  let dist = 0;
  for (let i = 0; i < A.length; i++) {
    const ai = A[i];
    const bi = B[i];
    if (ai === undefined || bi === undefined) continue;
    for (let j = 0; j < ai.length; j++) dist += ((ai[j] ?? 0) - (bi[j] ?? 0)) ** 2;
  }
  return Math.sqrt(dist);
}

/**
 * Log-determinant of a symmetric positive-definite matrix via Cholesky
 * (`S = L L^T`, `log det S = 2 * sum_i log L_ii`).
 *
 * Non-positive pivots are replaced by `1e-10`, so a matrix that is not
 * positive definite yields a large negative value instead of NaN.
 */
export function logDetCovariance(S: Float64Array[]): number {
  const p = S.length;
  const L = Array.from({ length: p }, () => new Float64Array(p));
  for (let i = 0; i < p; i++) {
    for (let j = 0; j <= i; j++) {
      let sum = 0;
      for (let k = 0; k < j; k++) sum += (L[i]![k] ?? 0) * (L[j]![k] ?? 0);
      if (i === j) {
        const val = (S[i]![i] ?? 0) - sum;
        L[i]![i] = val > 0 ? Math.sqrt(val) : 1e-10;
      } else {
        L[i]![j] = ((S[i]![j] ?? 0) - sum) / (L[j]![j] ?? 1e-10);
      }
    }
  }
  let logDet = 0;
  for (let i = 0; i < p; i++) logDet += Math.log(Math.max(L[i]![i] ?? 1e-10, 1e-10));
  return 2 * logDet;
}
/**
 * Sample mean and unbiased (divide-by-`n - 1`) covariance of the rows of `X`.
 * With a single row the divisor is 1; with no rows both outputs are zeros.
 */
function computeSampleCov(X: Float64Array[]): {
  mean: Float64Array;
  cov: Float64Array[];
} {
  const n = X.length;
  const p = X[0]?.length ?? 0;
  const mean = new Float64Array(p);
  const cov: Float64Array[] = Array.from({ length: p }, () => new Float64Array(p));
  if (n === 0) return { mean, cov };

  for (const row of X) {
    for (let j = 0; j < p; j++) mean[j] = (mean[j] ?? 0) + (row[j] ?? 0);
  }
  for (let j = 0; j < p; j++) mean[j] = (mean[j] ?? 0) / n;

  // Accumulate the upper triangle and mirror it.
  for (const row of X) {
    for (let i = 0; i < p; i++) {
      const di = (row[i] ?? 0) - (mean[i] ?? 0);
      for (let j = i; j < p; j++) {
        const v = di * ((row[j] ?? 0) - (mean[j] ?? 0));
        cov[i]![j] = (cov[i]?.[j] ?? 0) + v;
        if (i !== j) cov[j]![i] = (cov[j]?.[i] ?? 0) + v;
      }
    }
  }
  const denom = n - 1 > 0 ? n - 1 : 1;
  for (let i = 0; i < p; i++) {
    for (let j = 0; j < p; j++) cov[i]![j] = (cov[i]?.[j] ?? 0) / denom;
  }
  return { mean, cov };
}

/** Blend `cov` with `mu * I`: `(1 - rho) * cov + rho * mu * I`. */
function shrinkTowardIdentity(cov: Float64Array[], rho: number, mu: number): Float64Array[] {
  const p = cov.length;
  return Array.from({ length: p }, (_, i) =>
    Float64Array.from({ length: p }, (_, j) =>
      (1 - rho) * (cov[i]?.[j] ?? 0) + (i === j ? rho * mu : 0),
    ),
  );
}

/**
 * Oracle Approximating Shrinkage estimator.
 *
 *   rho = ((1 - 2/p) tr(S^2) + tr(S)^2) / ((n + 1 - 2/p) (tr(S^2) - tr(S)^2 / p))
 *
 * clamped to [0, 1]; `rho = 1` when the denominator is not positive.
 */
export class OAS {
  /** Per-feature sample mean; `null` before `fit`. */
  location_: Float64Array | null = null;
  /** Shrunk covariance; `null` before `fit`. */
  covariance_: Float64Array[] | null = null;
  /** Declared for API parity; `fit` does not populate it. */
  precision_: Float64Array[] | null = null;
  /** Shrinkage intensity selected by `fit`. */
  shrinkage_: number = 0;

  /**
   * @param X Samples as rows.
   * @returns `this`, for chaining.
   */
  fit(X: Float64Array[]): this {
    const n = X.length;
    const p = X[0]?.length ?? 0;
    const { mean, cov } = computeSampleCov(X);
    this.location_ = mean;

    let frobSq = 0;
    let trace = 0;
    for (let i = 0; i < p; i++) {
      trace += cov[i]?.[i] ?? 0;
      for (let j = 0; j < p; j++) frobSq += (cov[i]?.[j] ?? 0) ** 2;
    }
    const traceSq = trace ** 2;

    const num = (1 - 2 / p) * frobSq + traceSq;
    const den = (n + 1 - 2 / p) * (frobSq - traceSq / p);
    const rho = den > 0 ? Math.min(1, Math.max(0, num / den)) : 1;
    this.shrinkage_ = rho;

    this.covariance_ = shrinkTowardIdentity(cov, rho, trace / p);
    return this;
  }
}

/**
 * Ledoit-Wolf estimator: `rho = min(1, beta^2 / delta^2)` with
 * `delta^2 = ||S - mu I||_F^2` and
 * `beta^2 = sum_k ||x_k x_k^T - S||_F^2 / n^2` over centered samples.
 */
export class LedoitWolfExt {
  /** Per-feature sample mean; `null` before `fit`. */
  location_: Float64Array | null = null;
  /** Shrunk covariance; `null` before `fit`. */
  covariance_: Float64Array[] | null = null;
  /** Shrinkage intensity selected by `fit`. */
  shrinkage_: number = 0;

  /**
   * @param X Samples as rows.
   * @returns `this`, for chaining.
   */
  fit(X: Float64Array[]): this {
    const n = X.length;
    const p = X[0]?.length ?? 0;
    const { mean, cov } = computeSampleCov(X);
    this.location_ = mean;

    let mu = 0;
    for (let i = 0; i < p; i++) mu += cov[i]?.[i] ?? 0;
    mu = p > 0 ? mu / p : 0;

    let delta2 = 0;
    for (let i = 0; i < p; i++) {
      for (let j = 0; j < p; j++) {
        const d = (cov[i]?.[j] ?? 0) - (i === j ? mu : 0);
        delta2 += d * d;
      }
    }

    let beta2 = 0;
    const centered = new Float64Array(p);
    for (const row of X) {
      for (let j = 0; j < p; j++) centered[j] = (row[j] ?? 0) - (mean[j] ?? 0);
      for (let i = 0; i < p; i++) {
        for (let j = 0; j < p; j++) {
          const d = (centered[i] ?? 0) * (centered[j] ?? 0) - (cov[i]?.[j] ?? 0);
          beta2 += d * d;
        }
      }
    }
    if (n > 0) beta2 /= n ** 2;

    // delta2 === 0 means S already equals its target; dividing would give NaN
    // (0 / 0) and poison every covariance entry, so no shrinkage is applied.
    const rho = delta2 > 0 ? Math.min(1, beta2 / delta2) : 0;
    this.shrinkage_ = rho;

    this.covariance_ = shrinkTowardIdentity(cov, rho, mu);
    return this;
  }
}
/**
 * Oracle Approximating Shrinkage (OAS) estimator.
 *
 * Fits the biased sample covariance `S`, shrinks it toward `mu * I`
 * (`mu = trace(S) / p`) and stores the result together with its inverse.
 * Results are exposed through getters that throw `NotFittedError` before `fit`.
 */
export class OASShrinkage {
  private covariance_: Float64Array[] | null = null;
  private precision_: Float64Array[] | null = null;
  private shrinkage_: number | null = null;

  /**
   * @param X Samples as rows.
   * @returns `this`, for chaining.
   */
  fit(X: Float64Array[]): this {
    const n = X.length;
    const p = X[0]?.length ?? 0;

    const mean = new Float64Array(p);
    for (const row of X) for (let j = 0; j < p; j++) mean[j]! += row[j] ?? 0;
    for (let j = 0; j < p; j++) mean[j]! /= n;

    const S = Array.from({ length: p }, () => new Float64Array(p));
    for (const row of X) {
      for (let a = 0; a < p; a++) {
        const da = (row[a] ?? 0) - (mean[a] ?? 0);
        for (let b = 0; b < p; b++) S[a]![b]! += da * ((row[b] ?? 0) - (mean[b] ?? 0));
      }
    }
    for (let a = 0; a < p; a++) for (let b = 0; b < p; b++) S[a]![b]! /= n;

    let trS = 0;
    let trS2 = 0;
    for (let a = 0; a < p; a++) {
      trS += S[a]![a] ?? 0;
      for (let b = 0; b < p; b++) trS2 += (S[a]![b] ?? 0) ** 2;
    }

    const num = (1 - 2 / p) * trS2 + trS * trS;
    const den = (n + 1 - 2 / p) * (trS2 - (trS * trS) / p);
    const rho = den === 0 ? 1 : Math.min(1, num / den);
    this.shrinkage_ = rho;

    const mu = trS / p;
    this.covariance_ = Array.from({ length: p }, (_, a) => {
      const row = new Float64Array(p);
      for (let b = 0; b < p; b++) {
        row[b] = (1 - rho) * (S[a]![b] ?? 0) + (a === b ? rho * mu : 0);
      }
      return row;
    });
    this.precision_ = invertMatrix(this.covariance_);
    return this;
  }

  /** Shrunk covariance matrix. @throws NotFittedError before `fit`. */
  get covariance(): Float64Array[] {
    if (this.covariance_ === null) throw new NotFittedError("OASShrinkage is not fitted.");
    return this.covariance_;
  }

  /** Inverse of the shrunk covariance. @throws NotFittedError before `fit`. */
  get precision(): Float64Array[] {
    if (this.precision_ === null) throw new NotFittedError("OASShrinkage is not fitted.");
    return this.precision_;
  }

  /** Shrinkage intensity. @throws NotFittedError before `fit`. */
  get shrinkage(): number {
    if (this.shrinkage_ === null) throw new NotFittedError("OASShrinkage is not fitted.");
    return this.shrinkage_;
  }
}

/**
 * `log |det M|` by Gaussian elimination with partial pivoting.
 * Returns `-Infinity` for a singular matrix.
 */
function logAbsDeterminant(M: Float64Array[]): number {
  const n = M.length;
  const A = M.map((row) => Float64Array.from(row));
  let acc = 0;
  for (let col = 0; col < n; col++) {
    let best = col;
    for (let r = col + 1; r < n; r++) {
      if (Math.abs(A[r]?.[col] ?? 0) > Math.abs(A[best]?.[col] ?? 0)) best = r;
    }
    const pivotRow = A[best]!;
    A[best] = A[col]!;
    A[col] = pivotRow;
    const pivot = pivotRow[col] ?? 0;
    if (pivot === 0) return Number.NEGATIVE_INFINITY;
    acc += Math.log(Math.abs(pivot));
    for (let r = col + 1; r < n; r++) {
      const row = A[r]!;
      const f = (row[col] ?? 0) / pivot;
      if (f === 0) continue;
      for (let j = col; j < n; j++) row[j] = (row[j] ?? 0) - f * (pivotRow[j] ?? 0);
    }
  }
  return acc;
}

/**
 * Gaussian log-likelihood of the rows of `X` under `N(mean, precision^-1)`:
 *
 *   n/2 * (log det(precision) - p * log(2 pi)) - 1/2 * sum_k d_k^T precision d_k
 *
 * The determinant is taken of the full `p x p` precision block, not only of
 * its diagonal, so correlated models are scored correctly.
 *
 * @param X Samples as rows.
 * @param mean Model mean; its length defines `p`.
 * @param precision Inverse covariance; missing entries are read as 0.
 */
export function gaussianLogLikelihood(
  X: Float64Array[],
  mean: Float64Array,
  precision: Float64Array[],
): number {
  const n = X.length;
  const p = mean.length;
  const block = Array.from({ length: p }, (_, a) =>
    Float64Array.from({ length: p }, (_, b) => precision[a]?.[b] ?? 0),
  );
  const logDet = logAbsDeterminant(block);
  let logLik = (n * (logDet - p * Math.log(2 * Math.PI))) / 2;
  const diff = new Float64Array(p);
  for (const row of X) {
    for (let j = 0; j < p; j++) diff[j] = (row[j] ?? 0) - (mean[j] ?? 0);
    let quad = 0;
    for (let a = 0; a < p; a++) {
      let pda = 0;
      for (let b = 0; b < p; b++) pda += (block[a]?.[b] ?? 0) * (diff[b] ?? 0);
      quad += (diff[a] ?? 0) * pda;
    }
    logLik -= quad / 2;
  }
  return logLik;
}

/**
 * Cross-validated covariance score: the mean held-out Gaussian log-likelihood
 * per sample, averaged over folds (higher is better).
 *
 * Folds are contiguous; the last fold absorbs the remainder. For each fold the
 * estimator is refit on the training rows, its covariance is inverted, and the
 * test rows are scored around the training mean. Folds with no test or no
 * training rows are skipped.
 *
 * @param X Samples as rows.
 * @param estimator Object refit per fold; `covariance` is read after `fit`.
 * @param nFolds Number of folds (default 5).
 * @returns The average score, or `NaN` when no fold could be scored.
 */
export function covarianceCVScore(
  X: Float64Array[],
  estimator: { fit: (X: Float64Array[]) => unknown; covariance: Float64Array[] },
  nFolds = 5,
): number {
  const n = X.length;
  const p = X[0]?.length ?? 0;
  const foldSize = Math.floor(n / nFolds);
  let totalScore = 0;
  let scoredFolds = 0;
  for (let fold = 0; fold < nFolds; fold++) {
    const testStart = fold * foldSize;
    const testEnd = fold === nFolds - 1 ? n : testStart + foldSize;
    const testX = X.slice(testStart, testEnd);
    const trainX = X.filter((_, i) => i < testStart || i >= testEnd);
    if (testX.length === 0 || trainX.length === 0) continue;

    estimator.fit(trainX);
    // The Mahalanobis term needs the inverse covariance, not the covariance.
    const precision = invertMatrix(estimator.covariance);
    const mean = new Float64Array(p);
    for (const row of trainX) for (let j = 0; j < p; j++) mean[j]! += row[j] ?? 0;
    for (let j = 0; j < p; j++) mean[j]! /= trainX.length;

    totalScore += gaussianLogLikelihood(testX, mean, precision) / testX.length;
    scoredFolds++;
  }
  return scoredFolds > 0 ? totalScore / scoredFolds : Number.NaN;
}

/**
 * Gauss-Jordan inverse with partial pivoting on an augmented `[A | I]` matrix.
 * Columns whose pivot magnitude is below `1e-12` are skipped, so the result is
 * only meaningful for non-singular input.
 */
function invertMatrix(A: Float64Array[]): Float64Array[] {
  const n = A.length;
  const aug = A.map((row, i) => {
    const r = new Float64Array(2 * n);
    for (let j = 0; j < n; j++) r[j] = row[j] ?? 0;
    r[n + i] = 1;
    return r;
  });
  for (let col = 0; col < n; col++) {
    let maxRow = col;
    for (let row = col + 1; row < n; row++) {
      if (Math.abs(aug[row]?.[col] ?? 0) > Math.abs(aug[maxRow]?.[col] ?? 0)) maxRow = row;
    }
    const tmp = aug[col]!;
    aug[col] = aug[maxRow]!;
    aug[maxRow] = tmp;
    const pivot = aug[col]?.[col] ?? 1;
    if (Math.abs(pivot) < 1e-12) continue;
    for (let j = 0; j < 2 * n; j++) aug[col]![j]! /= pivot;
    for (let row = 0; row < n; row++) {
      if (row === col) continue;
      const f = aug[row]?.[col] ?? 0;
      for (let j = 0; j < 2 * n; j++) aug[row]![j]! -= f * (aug[col]?.[j] ?? 0);
    }
  }
  return aug.map((row) => new Float64Array(row.slice(n)));
}
import { BaseEstimator } from "../base.js";

/** Per-feature mean of the rows of `X` (zeros when `X` is empty). */
function featureMeans(X: Float64Array[], p: number): Float64Array {
  const loc = new Float64Array(p);
  if (X.length === 0) return loc;
  for (const xi of X) for (let k = 0; k < p; k++) loc[k] = (loc[k] ?? 0) + (xi[k] ?? 0);
  for (let k = 0; k < p; k++) loc[k] = (loc[k] ?? 0) / X.length;
  return loc;
}

/** Biased (divide-by-n) scatter of already-centered rows. */
function biasedScatter(Xc: Float64Array[], p: number): Float64Array[] {
  const S = Array.from({ length: p }, () => new Float64Array(p));
  for (const xc of Xc) {
    for (let i = 0; i < p; i++) {
      const row = S[i]!;
      const ci = xc[i] ?? 0;
      for (let j = 0; j < p; j++) row[j] = (row[j] ?? 0) + ci * (xc[j] ?? 0);
    }
  }
  if (Xc.length > 0) {
    for (const row of S) for (let j = 0; j < p; j++) row[j] = (row[j] ?? 0) / Xc.length;
  }
  return S;
}

/**
 * Gauss-Jordan inverse of the leading `p x p` block of `A`, with partial
 * pivoting. Columns whose pivot magnitude is below `1e-10` are skipped.
 */
function gaussJordanInverse(A: Float64Array[], p: number): Float64Array[] {
  const aug = A.map((row, i) => {
    const r = new Float64Array(2 * p);
    for (let j = 0; j < p; j++) r[j] = row[j] ?? 0;
    r[p + i] = 1;
    return r;
  });
  for (let i = 0; i < p; i++) {
    let maxRow = i;
    for (let k = i + 1; k < p; k++) {
      if (Math.abs(aug[k]?.[i] ?? 0) > Math.abs(aug[maxRow]?.[i] ?? 0)) maxRow = k;
    }
    [aug[i], aug[maxRow]] = [aug[maxRow]!, aug[i]!];
    const pivot = aug[i]?.[i] ?? 1e-10;
    if (Math.abs(pivot) < 1e-10) continue;
    const pivotRow = aug[i]!;
    for (let j = 0; j < 2 * p; j++) pivotRow[j] = (pivotRow[j] ?? 0) / pivot;
    for (let k = 0; k < p; k++) {
      if (k === i) continue;
      const row = aug[k]!;
      const factor = row[i] ?? 0;
      for (let j = 0; j < 2 * p; j++) row[j] = (row[j] ?? 0) - factor * (pivotRow[j] ?? 0);
    }
  }
  return Array.from({ length: p }, (_, i) => new Float64Array(p).map((_, j) => aug[i]?.[p + j] ?? 0));
}

/** Oracle Approximating Shrinkage (OAS) covariance estimator. */
export class OASCovariance extends BaseEstimator {
  /** Shrunk covariance; empty before `fit`. */
  covariance_: Float64Array[] = [];
  /** Inverse of `covariance_`; empty before `fit`. */
  precision_: Float64Array[] = [];
  /** Shrinkage intensity selected by `fit`. */
  shrinkage_: number = 0;
  /** Per-feature mean; empty before `fit`. */
  location_: Float64Array = new Float64Array(0);

  /**
   * Fit `(1 - rho) * S + rho * mu * I` with the OAS intensity
   * `rho = ((1 - 2/p) tr(S^2) + tr(S)^2) / ((n + 1 - 2/p)(tr(S^2) - tr(S)^2/p))`,
   * capped at 1 (and set to 1 when the denominator is zero).
   */
  fit(X: Float64Array[]): this {
    const n = X.length;
    const p = X[0]?.length ?? 0;
    this.location_ = featureMeans(X, p);
    const Xc = X.map((xi) => new Float64Array(p).map((_, k) => (xi[k] ?? 0) - (this.location_[k] ?? 0)));
    const S = biasedScatter(Xc, p);

    let trS = 0;
    let trS2 = 0;
    for (let i = 0; i < p; i++) {
      trS += S[i]?.[i] ?? 0;
      for (let j = 0; j < p; j++) trS2 += (S[i]?.[j] ?? 0) ** 2;
    }
    const trS_sq = trS ** 2;
    const rho_num = (1 - 2 / p) * trS2 + trS_sq;
    const rho_denom = (n + 1 - 2 / p) * (trS2 - trS_sq / p);
    this.shrinkage_ = rho_denom !== 0 ? Math.min(1, rho_num / rho_denom) : 1;

    const mu = trS / p;
    this.covariance_ = S.map((row, i) =>
      new Float64Array(row.map((v, j) => (1 - this.shrinkage_) * v + (i === j ? this.shrinkage_ * mu : 0))),
    );
    this.precision_ = this._invert(this.covariance_, p);
    return this;
  }

  private _invert(A: Float64Array[], p: number): Float64Array[] {
    return gaussJordanInverse(A, p);
  }

  /** Squared Mahalanobis distance of each row to `location_`, floored at 0. */
  mahalanobis(X: Float64Array[]): Float64Array {
    return new Float64Array(X.map((xi) => {
      const xc = new Float64Array(xi.length).map((_, k) => (xi[k] ?? 0) - (this.location_[k] ?? 0));
      let d = 0;
      for (let i = 0; i < xc.length; i++) {
        for (let j = 0; j < xc.length; j++) d += (xc[i] ?? 0) * (this.precision_[i]?.[j] ?? 0) * (xc[j] ?? 0);
      }
      return Math.max(d, 0);
    }));
  }
}

/** Ledoit-Wolf analytical covariance estimator. */
export class LedoitWolfExt extends BaseEstimator {
  /** Shrunk covariance; empty before `fit`. */
  covariance_: Float64Array[] = [];
  /** Inverse of `covariance_`; empty before `fit`. */
  precision_: Float64Array[] = [];
  /** Shrinkage intensity in [0, 1] selected by `fit`. */
  shrinkage_: number = 0;
  /** Per-feature mean; empty before `fit`. */
  location_: Float64Array = new Float64Array(0);

  /**
   * Fit `(1 - delta) * S + delta * mu * I` with
   * `delta = min(b2, d2) / d2`, `d2 = tr(S^2) - tr(S)^2 / p`,
   * `b2 = sum_k ||x_k x_k^T - S||_F^2 / n^2`. When `d2` is not positive
   * (S already equals its target) no shrinkage is applied.
   */
  fit(X: Float64Array[]): this {
    const n = X.length;
    const p = X[0]?.length ?? 0;
    this.location_ = featureMeans(X, p);
    const Xc = X.map((xi) => new Float64Array(p).map((_, k) => (xi[k] ?? 0) - (this.location_[k] ?? 0)));
    const S = biasedScatter(Xc, p);

    let trS2 = 0;
    let trS = 0;
    for (let i = 0; i < p; i++) {
      trS += S[i]?.[i] ?? 0;
      for (let j = 0; j < p; j++) trS2 += (S[i]?.[j] ?? 0) ** 2;
    }

    // Accumulate ||x x^T - S||_F^2 entry by entry; no per-sample matrix is allocated.
    let b2 = 0;
    for (const xc of Xc) {
      for (let i = 0; i < p; i++) {
        for (let j = 0; j < p; j++) {
          const diff = (xc[i] ?? 0) * (xc[j] ?? 0) - (S[i]?.[j] ?? 0);
          b2 += diff ** 2;
        }
      }
    }
    if (n > 0) b2 /= n ** 2;

    const d2 = p > 0 ? trS2 - trS ** 2 / p : 0;
    const delta = d2 > 0 ? Math.max(0, Math.min(1, Math.min(b2, d2) / d2)) : 0;
    this.shrinkage_ = delta;

    const mu = p > 0 ? trS / p : 0;
    this.covariance_ = S.map((row, i) =>
      new Float64Array(row.map((v, j) => (1 - delta) * v + (i === j ? delta * mu : 0))),
    );
    this.precision_ = this._invert(this.covariance_, p);
    return this;
  }

  // Same pivoted elimination as OASCovariance, so both estimators invert alike.
  private _invert(A: Float64Array[], p: number): Float64Array[] {
    return gaussJordanInverse(A, p);
  }
}
/**
 * MinCovDet: Minimum Covariance Determinant estimator (simplified).
 *
 * Starting from all samples, repeats concentration steps: estimate mean and
 * covariance on the current support, rank every sample by squared Mahalanobis
 * distance, and keep the `h = floor(n * supportFraction)` closest samples
 * (at least one). Iteration stops after 10 steps or when the support no
 * longer changes. Distances use the inverse of the covariance.
 */
export class MinCovDetExt extends BaseEstimator {
  /** Fraction of samples kept in the support. */
  support_fraction_: number;
  /** Mean of the final support; empty before `fit`. */
  location_: Float64Array = new Float64Array(0);
  /** Biased covariance of the final support; empty before `fit`. */
  covariance_: Float64Array[] = [];
  /** Squared Mahalanobis distance of each training sample; empty before `fit`. */
  dist_: Float64Array = new Float64Array(0);

  /** @param supportFraction Fraction of samples to keep (default 0.75). */
  constructor(supportFraction = 0.75) {
    super();
    this.support_fraction_ = supportFraction;
  }

  /**
   * @param X Samples as rows.
   * @returns `this`, for chaining.
   */
  fit(X: Float64Array[]): this {
    const n = X.length;
    const p = X[0]?.length ?? 0;
    // Keep at least one sample so the moments below never divide by zero.
    const h = Math.min(n, Math.max(1, Math.floor(n * this.support_fraction_)));

    let support = Array.from({ length: n }, (_, i) => i);
    for (let step = 0; step < 10; step++) {
      const { loc, cov } = this._moments(X, support, p);
      const dist = this._squaredDistances(X, loc, this._precision(cov));
      const next = Array.from({ length: n }, (_, i) => i)
        .sort((a, b) => (dist[a] ?? 0) - (dist[b] ?? 0) || a - b)
        .slice(0, h);
      const members = new Set(support);
      const unchanged = next.length === support.length && next.every((i) => members.has(i));
      support = next;
      // A repeated support is a fixed point: further steps would reproduce it.
      if (unchanged) break;
    }

    const { loc, cov } = this._moments(X, support, p);
    this.location_ = loc;
    this.covariance_ = cov;
    this.dist_ = this._squaredDistances(X, loc, this._precision(cov));
    return this;
  }

  /** Squared Mahalanobis distance of each row to `location_`, floored at 0. */
  mahalanobis(X: Float64Array[]): Float64Array {
    const d = this._squaredDistances(X, this.location_, this._precision(this.covariance_));
    return d.map((v) => Math.max(v, 0));
  }

  /** Mean and biased covariance of the rows of `X` selected by `support`. */
  private _moments(
    X: Float64Array[],
    support: number[],
    p: number,
  ): { loc: Float64Array; cov: Float64Array[] } {
    const loc = new Float64Array(p);
    const cov = Array.from({ length: p }, () => new Float64Array(p));
    const m = support.length;
    if (m === 0) return { loc, cov };
    for (const idx of support) {
      const xi = X[idx]!;
      for (let k = 0; k < p; k++) loc[k] = (loc[k] ?? 0) + (xi[k] ?? 0);
    }
    for (let k = 0; k < p; k++) loc[k] = (loc[k] ?? 0) / m;
    for (const idx of support) {
      const xi = X[idx]!;
      for (let i = 0; i < p; i++) {
        const di = (xi[i] ?? 0) - (loc[i] ?? 0);
        const row = cov[i]!;
        for (let j = 0; j < p; j++) row[j] = (row[j] ?? 0) + di * ((xi[j] ?? 0) - (loc[j] ?? 0));
      }
    }
    for (const row of cov) for (let j = 0; j < p; j++) row[j] = (row[j] ?? 0) / m;
    return { loc, cov };
  }

  /**
   * Inverse of `cov` by Gauss-Jordan elimination with partial pivoting.
   * A singular matrix is retried once with a small ridge on the diagonal.
   */
  private _precision(cov: Float64Array[]): Float64Array[] {
    const p = cov.length;
    const attempt = (M: Float64Array[]): Float64Array[] | null => {
      const A = M.map((row) => Float64Array.from(row));
      const inv = Array.from({ length: p }, (_, i) => {
        const r = new Float64Array(p);
        r[i] = 1;
        return r;
      });
      for (let col = 0; col < p; col++) {
        let best = col;
        for (let r = col + 1; r < p; r++) {
          if (Math.abs(A[r]?.[col] ?? 0) > Math.abs(A[best]?.[col] ?? 0)) best = r;
        }
        [A[col], A[best]] = [A[best]!, A[col]!];
        [inv[col], inv[best]] = [inv[best]!, inv[col]!];
        const pivot = A[col]?.[col] ?? 0;
        if (!(Math.abs(pivot) >= 1e-12)) return null;
        const aPivot = A[col]!;
        const iPivot = inv[col]!;
        for (let j = 0; j < p; j++) {
          aPivot[j] = (aPivot[j] ?? 0) / pivot;
          iPivot[j] = (iPivot[j] ?? 0) / pivot;
        }
        for (let r = 0; r < p; r++) {
          if (r === col) continue;
          const aRow = A[r]!;
          const iRow = inv[r]!;
          const f = aRow[col] ?? 0;
          if (f === 0) continue;
          for (let j = 0; j < p; j++) {
            aRow[j] = (aRow[j] ?? 0) - f * (aPivot[j] ?? 0);
            iRow[j] = (iRow[j] ?? 0) - f * (iPivot[j] ?? 0);
          }
        }
      }
      return inv;
    };

    const direct = attempt(cov);
    if (direct !== null) return direct;
    let trace = 0;
    for (let i = 0; i < p; i++) trace += cov[i]?.[i] ?? 0;
    const ridge = Math.max(1e-8, p > 0 ? (1e-8 * trace) / p : 0);
    const ridged = cov.map((row, i) => {
      const r = Float64Array.from(row);
      r[i] = (r[i] ?? 0) + ridge;
      return r;
    });
    return attempt(ridged) ?? Array.from({ length: p }, (_, i) => {
      const r = new Float64Array(p);
      r[i] = 1;
      return r;
    });
  }

  /** Squared Mahalanobis distance `(x - loc)^T precision (x - loc)` per row. */
  private _squaredDistances(
    X: Float64Array[],
    loc: Float64Array,
    precision: Float64Array[],
  ): Float64Array {
    const p = loc.length;
    const out = new Float64Array(X.length);
    const xc = new Float64Array(p);
    for (let r = 0; r < X.length; r++) {
      const xi = X[r]!;
      for (let k = 0; k < p; k++) xc[k] = (xi[k] ?? 0) - (loc[k] ?? 0);
      let d = 0;
      for (let i = 0; i < p; i++) {
        for (let j = 0; j < p; j++) d += (xc[i] ?? 0) * (precision[i]?.[j] ?? 0) * (xc[j] ?? 0);
      }
      out[r] = d;
    }
    return out;
  }
}
/** Per-column mean of the rows of `X`. */
function colMeans(X: Float64Array[]): Float64Array {
  const p = (X[0] ?? new Float64Array(0)).length;
  const means = new Float64Array(p);
  const n = X.length;
  for (const xi of X) {
    for (let j = 0; j < p; j++) means[j] = (means[j] ?? 0) + (xi[j] ?? 0);
  }
  for (let j = 0; j < p; j++) means[j] = (means[j] ?? 0) / n;
  return means;
}

/** Biased (divide-by-n) covariance around `means`; the upper triangle is computed and mirrored. */
function empCov(X: Float64Array[], means: Float64Array): Float64Array[] {
  const n = X.length;
  const p = means.length;
  const C = Array.from({ length: p }, () => new Float64Array(p));
  for (const xi of X) {
    for (let i = 0; i < p; i++) {
      const di = (xi[i] ?? 0) - (means[i] ?? 0);
      for (let j = i; j < p; j++) {
        const dj = (xi[j] ?? 0) - (means[j] ?? 0);
        C[i]![j] = (C[i]![j] ?? 0) + di * dj;
      }
    }
  }
  for (let i = 0; i < p; i++) {
    C[i]![i] = (C[i]![i] ?? 0) / n;
    for (let j = i + 1; j < p; j++) {
      C[i]![j] = (C[i]![j] ?? 0) / n;
      C[j]![i] = C[i]![j] ?? 0;
    }
  }
  return C;
}

/** Compute log-determinant of a positive-definite matrix via Cholesky (pivots floored at 1e-12). */
function logDet(M: Float64Array[]): number {
  const p = M.length;
  const L = Array.from({ length: p }, () => new Float64Array(p));
  for (let i = 0; i < p; i++) {
    for (let j = 0; j <= i; j++) {
      let s = M[i]![j] ?? 0;
      for (let k = 0; k < j; k++) s -= (L[i]![k] ?? 0) * (L[j]![k] ?? 0);
      if (i === j) {
        L[i]![j] = Math.sqrt(Math.max(s, 1e-12));
      } else {
        L[i]![j] = s / Math.max(L[j]![j] ?? 1e-12, 1e-12);
      }
    }
  }
  let logd = 0;
  for (let i = 0; i < p; i++) logd += Math.log(Math.max(L[i]![i] ?? 1e-12, 1e-12));
  return 2 * logd;
}

/** Invert a matrix via Gauss-Jordan with partial pivoting. Returns null if singular. */
function invertMatrix(M: Float64Array[]): Float64Array[] | null {
  const p = M.length;
  const A = M.map((row) => new Float64Array(row));
  const I = Array.from({ length: p }, (_, i) => {
    const r = new Float64Array(p);
    r[i] = 1;
    return r;
  });
  for (let col = 0; col < p; col++) {
    let pivotRow = -1;
    let pivotVal = 0;
    for (let row = col; row < p; row++) {
      if (Math.abs(A[row]![col] ?? 0) > Math.abs(pivotVal)) {
        pivotVal = A[row]![col] ?? 0;
        pivotRow = row;
      }
    }
    if (pivotRow === -1 || Math.abs(pivotVal) < 1e-12) return null;
    const tmpA = A[col]!;
    A[col] = A[pivotRow]!;
    A[pivotRow] = tmpA;
    const tmpI = I[col]!;
    I[col] = I[pivotRow]!;
    I[pivotRow] = tmpI;
    const scale = A[col]![col] ?? 1;
    for (let j = 0; j < p; j++) {
      A[col]![j] = (A[col]![j] ?? 0) / scale;
      I[col]![j] = (I[col]![j] ?? 0) / scale;
    }
    for (let row = 0; row < p; row++) {
      if (row === col) continue;
      const factor = A[row]![col] ?? 0;
      for (let j = 0; j < p; j++) {
        A[row]![j] = (A[row]![j] ?? 0) - factor * (A[col]![j] ?? 0);
        I[row]![j] = (I[row]![j] ?? 0) - factor * (I[col]![j] ?? 0);
      }
    }
  }
  return I;
}

/**
 * Inverse of `cov`; a singular matrix is retried with a small ridge on the
 * diagonal, and the identity is returned only if that also fails.
 */
function robustPrecision(cov: Float64Array[]): Float64Array[] {
  const direct = invertMatrix(cov);
  if (direct !== null) return direct;
  const p = cov.length;
  let trace = 0;
  for (let i = 0; i < p; i++) trace += cov[i]?.[i] ?? 0;
  const ridge = Math.max(1e-8, p > 0 ? (1e-8 * trace) / p : 0);
  const ridged = cov.map((row, i) => {
    const r = Float64Array.from(row);
    r[i] = (r[i] ?? 0) + ridge;
    return r;
  });
  return invertMatrix(ridged) ?? Array.from({ length: p }, (_, i) => {
    const r = new Float64Array(p);
    r[i] = 1;
    return r;
  });
}

/** Mahalanobis distance squared for each row. */
function mahalanobisDistSq(
  X: Float64Array[],
  mean: Float64Array,
  precisionMat: Float64Array[],
): Float64Array {
  const n = X.length;
  const p = mean.length;
  const dists = new Float64Array(n);
  for (let i = 0; i < n; i++) {
    const xi = X[i] ?? new Float64Array(p);
    let d = 0;
    for (let j = 0; j < p; j++) {
      let row = 0;
      for (let k = 0; k < p; k++) {
        row += (precisionMat[j]![k] ?? 0) * ((xi[k] ?? 0) - (mean[k] ?? 0));
      }
      d += ((xi[j] ?? 0) - (mean[j] ?? 0)) * row;
    }
    dists[i] = d;
  }
  return dists;
}

/**
 * EllipticEnvelope: fits a robust covariance estimate to detect outliers.
 *
 * Uses a fast minimum-covariance-determinant approximation: several seeded
 * random subsets of `h` samples are each refined by concentration steps, and
 * the subset whose covariance has the smallest log-determinant is kept.
 * Samples whose squared Mahalanobis distance exceeds the
 * `(1 - contamination)` quantile of the training distances are outliers.
 */
export class EllipticEnvelope {
  /** Expected fraction of outliers in the training data. */
  contamination: number;
  /** Fraction of samples in the support, or `null` for `floor((n + p + 1) / 2)`. */
  supportFraction: number | null;
  /** Seed of the subset sampler; equal seeds give equal fits. */
  randomState: number;

  location_: Float64Array | null = null;
  covariance_: Float64Array[] | null = null;
  precision_: Float64Array[] | null = null;
  /** Squared-distance cut-off separating inliers from outliers. */
  threshold_: number = 0;
  /** Equals `-threshold_`; subtracted from the negated distances in `decisionFunction`. */
  offset_: number = 0;

  constructor(
    options: {
      contamination?: number;
      supportFraction?: number | null;
      randomState?: number;
    } = {},
  ) {
    this.contamination = options.contamination ?? 0.1;
    this.supportFraction = options.supportFraction ?? null;
    this.randomState = options.randomState ?? 42;
  }

  /**
   * Estimate robust location, covariance, precision and the outlier threshold.
   *
   * @param X Samples as rows.
   * @returns `this`, for chaining.
   */
  fit(X: Float64Array[]): this {
    const n = X.length;
    const p = (X[0] ?? new Float64Array(0)).length;
    const requested = this.supportFraction !== null
      ? Math.floor(this.supportFraction * n)
      : Math.floor((n + p + 1) / 2);
    // Keep the support size inside [1, n] so subsets are never empty or oversized.
    const h = Math.min(n, Math.max(1, requested));

    // 32-bit linear congruential generator whose state advances on every draw.
    let state = this.randomState >>> 0;
    const nextIndex = (bound: number): number => {
      state = (Math.imul(state, 1664525) + 1013904223) >>> 0;
      // Scale the whole word; the low bits of an LCG are weakly random.
      return Math.floor((state / 4294967296) * bound);
    };

    let bestDet = Number.POSITIVE_INFINITY;
    let bestMean = new Float64Array(p);
    let bestCov: Float64Array[] = Array.from({ length: p }, () => new Float64Array(p));

    const nTrials = 10;
    for (let trial = 0; trial < nTrials; trial++) {
      // Fisher-Yates shuffle, then take the first h indices as the start subset.
      const indices = Array.from({ length: n }, (_, i) => i);
      for (let i = n - 1; i > 0; i--) {
        const j = nextIndex(i + 1);
        const tmp = indices[i]!;
        indices[i] = indices[j]!;
        indices[j] = tmp;
      }
      let curIdx = indices.slice(0, h).sort((a, b) => a - b);

      // C-steps: refit on the subset, then keep the h closest samples.
      for (let cstep = 0; cstep < 30; cstep++) {
        const subset = curIdx.map((i) => X[i] ?? new Float64Array(p));
        const mean = colMeans(subset);
        const inv = invertMatrix(empCov(subset, mean));
        if (!inv) break;
        const dists = mahalanobisDistSq(X, mean, inv);
        const nextIdx = Array.from({ length: n }, (_, i) => i)
          .sort((a, b) => (dists[a] ?? 0) - (dists[b] ?? 0))
          .slice(0, h)
          .sort((a, b) => a - b);
        const converged = nextIdx.every((v, k) => v === curIdx[k]);
        curIdx = nextIdx;
        if (converged) break;
      }

      const finalSubset = curIdx.map((i) => X[i] ?? new Float64Array(p));
      const mean = colMeans(finalSubset);
      const cov = empCov(finalSubset, mean);
      const det = logDet(cov);
      if (det < bestDet) {
        bestDet = det;
        bestMean = mean;
        bestCov = cov;
      }
    }

    const inv = robustPrecision(bestCov);
    this.location_ = bestMean;
    this.covariance_ = bestCov;
    this.precision_ = inv;

    // Threshold at the (1 - contamination) quantile of the training distances.
    const dists = mahalanobisDistSq(X, bestMean, inv);
    const sorted = Array.from(dists).sort((a, b) => a - b);
    const threshIdx = Math.floor((1 - this.contamination) * n);
    this.threshold_ = sorted[Math.min(threshIdx, n - 1)] ?? 0;
    this.offset_ = -this.threshold_;
    return this;
  }

  /**
   * Squared Mahalanobis distance of each row to the fitted location.
   * @throws NotFittedError when called before `fit`.
   */
  mahalanobis(X: Float64Array[]): Float64Array {
    if (this.location_ === null || this.precision_ === null) {
      throw new NotFittedError("EllipticEnvelope");
    }
    return mahalanobisDistSq(X, this.location_, this.precision_);
  }

  /** `threshold_ - distance` per row: non-negative for inliers, negative for outliers. */
  decisionFunction(X: Float64Array[]): Float64Array {
    const dists = this.mahalanobis(X);
    return new Float64Array(dists.map((d) => -d - this.offset_));
  }

  /** `1` for inliers and `-1` for outliers. */
  predict(X: Float64Array[]): Int32Array {
    const scores = this.decisionFunction(X);
    return Int32Array.from(scores, (s) => (s >= 0 ? 1 : -1));
  }

  /** Fraction of rows whose prediction equals the label in `y` (`1` / `-1`). */
  score(X: Float64Array[], y: Int32Array): number {
    const yPred = this.predict(X);
    let correct = 0;
    for (let i = 0; i < y.length; i++) {
      if ((yPred[i] ?? 0) === (y[i] ?? 0)) correct++;
    }
    return correct / y.length;
  }
}
0) / nSamples; + } + return m; +} + +function covMatrix( + X: Float64Array[], + mu: Float64Array, + nSamples: number, + nFeatures: number, +): Float64Array[] { + const C: Float64Array[] = Array.from({ length: nFeatures }, () => new Float64Array(nFeatures)); + for (const row of X) { + for (let i = 0; i < nFeatures; i++) { + for (let j = 0; j < nFeatures; j++) { + C[i]![j] = (C[i]![j] ?? 0) + + ((row[i] ?? 0) - (mu[i] ?? 0)) * ((row[j] ?? 0) - (mu[j] ?? 0)) / nSamples; + } + } + } + return C; +} + +export class EmpiricalCovariance { + location_: Float64Array | null = null; + covariance_: Float64Array[] | null = null; + + constructor(readonly assumeCentered = false) {} + + fit(X: Float64Array[]): this { + const n = X.length; + const p = X[0]?.length ?? 0; + const mu = this.assumeCentered ? new Float64Array(p) : mean(X, n, p); + this.location_ = mu; + this.covariance_ = covMatrix(X, mu, n, p); + return this; + } + + mahalanobis(X: Float64Array[]): Float64Array { + if (this.covariance_ === null || this.location_ === null) { + throw new Error("EmpiricalCovariance must be fitted first"); + } + // Simplified: diagonal approximation + const diagInv = this.covariance_.map((row, i) => row[i] ?? 1); + return Float64Array.from(X, (xi) => { + let s = 0; + for (let j = 0; j < xi.length; j++) { + const diff = (xi[j] ?? 0) - (this.location_![j] ?? 0); + s += diff * diff / (diagInv[j] ?? 1); + } + return Math.sqrt(s); + }); + } + + score(XTest: Float64Array[], yTest?: unknown): number { + void yTest; + if (this.covariance_ === null) throw new Error("Not fitted"); + const n = XTest.length; + const p = XTest[0]?.length ?? 0; + const mu = mean(XTest, n, p); + const testCov = covMatrix(XTest, mu, n, p); + let s = 0; + for (let i = 0; i < p; i++) { + for (let j = 0; j < p; j++) { + const diff = (testCov[i]?.[j] ?? 0) - (this.covariance_[i]?.[j] ?? 0); + s += diff * diff; + } + } + return -Math.sqrt(s); + } +} + +/** + * Ledoit-Wolf covariance estimator with analytic shrinkage. 
+ */ +export class LedoitWolf extends EmpiricalCovariance { + shrinkage_: number = 0; + + override fit(X: Float64Array[]): this { + const n = X.length; + const p = X[0]?.length ?? 0; + const mu = mean(X, n, p); + this.location_ = mu; + const S = covMatrix(X, mu, n, p); + + // Ledoit-Wolf analytical formula + let trS = 0, trS2 = 0, tr2S = 0; + for (let i = 0; i < p; i++) { + trS += S[i]?.[i] ?? 0; + for (let j = 0; j < p; j++) trS2 += ((S[i]?.[j] ?? 0) ** 2); + } + tr2S = trS * trS; + + // Oracle approximating shrinkage + const mu1 = trS / p; + const delta2 = (trS2 - tr2S / p) / p; + const beta2 = Math.max(0, (trS2 / n - tr2S / (n * p)) / (trS2 - tr2S / p + 1e-10)); + const shrinkage = Math.min(1, beta2); + this.shrinkage_ = shrinkage; + + this.covariance_ = S.map((row, i) => + Float64Array.from(row, (v, j) => + (1 - shrinkage) * v + (i === j ? shrinkage * mu1 : 0), + ), + ); + void delta2; + return this; + } +} + +/** + * Oracle Approximating Shrinkage (OAS) estimator. + */ +export class OAS extends EmpiricalCovariance { + shrinkage_: number = 0; + + override fit(X: Float64Array[]): this { + const n = X.length; + const p = X[0]?.length ?? 0; + const mu = mean(X, n, p); + this.location_ = mu; + const S = covMatrix(X, mu, n, p); + + let trS = 0, trS2 = 0; + for (let i = 0; i < p; i++) { + trS += S[i]?.[i] ?? 0; + for (let j = 0; j < p; j++) trS2 += ((S[i]?.[j] ?? 0) ** 2); + } + + // OAS formula + const rho = (1 - 2 / p) * trS2 + trS * trS; + const gamma = (n + 1 - 2 / p) * (trS2 - trS * trS / p); + const shrinkage = Math.min(1, rho / (gamma + 1e-10)); + this.shrinkage_ = shrinkage; + const mu1 = trS / p; + + this.covariance_ = S.map((row, i) => + Float64Array.from(row, (v, j) => + (1 - shrinkage) * v + (i === j ? 
/**
 * GraphicalLasso and MinCovDet (robust covariance).
 * Mirrors sklearn.covariance.GraphicalLasso and MinCovDet.
 */

/** Column means of `X`; the feature count is taken from the first row. */
function colMeans(X: Float64Array[]): Float64Array {
  const p = (X[0] ?? new Float64Array(0)).length;
  const n = X.length;
  const means = new Float64Array(p);
  for (const xi of X) for (let j = 0; j < p; j++) means[j] = (means[j] ?? 0) + (xi[j] ?? 0);
  for (let j = 0; j < p; j++) means[j] = (means[j] ?? 0) / n;
  return means;
}

/** Biased (divide-by-n) covariance of `X` about its column means. */
function empiricalCovariance(X: Float64Array[]): Float64Array[] {
  const n = X.length;
  const p = (X[0] ?? new Float64Array(0)).length;
  const means = colMeans(X);
  const cov: Float64Array[] = Array.from({ length: p }, () => new Float64Array(p));
  for (const xi of X) {
    for (let j = 0; j < p; j++) {
      for (let k = 0; k <= j; k++) {
        const d = ((xi[j] ?? 0) - (means[j] ?? 0)) * ((xi[k] ?? 0) - (means[k] ?? 0));
        cov[j]![k] = (cov[j]![k] ?? 0) + d;
        if (k !== j) cov[k]![j] = (cov[k]![j] ?? 0) + d;
      }
    }
  }
  for (let j = 0; j < p; j++) for (let k = 0; k < p; k++) cov[j]![k] = (cov[j]![k] ?? 0) / n;
  return cov;
}

/** Dense matrix product `A * B`. */
function matMul(A: Float64Array[], B: Float64Array[]): Float64Array[] {
  const n = A.length;
  const m = (B[0] ?? new Float64Array(0)).length;
  const k = B.length;
  const C: Float64Array[] = Array.from({ length: n }, () => new Float64Array(m));
  for (let i = 0; i < n; i++) {
    for (let j = 0; j < m; j++) {
      for (let l = 0; l < k; l++) C[i]![j] = (C[i]![j] ?? 0) + (A[i]![l] ?? 0) * (B[l]![j] ?? 0);
    }
  }
  return C;
}

/**
 * Inverts a square matrix by Gauss-Jordan elimination with partial pivoting.
 *
 * @throws Error when a pivot is negligible relative to the largest entry of
 *   `A` (singular to working precision). Callers in this module catch this
 *   to install a fallback instead of propagating NaN/Infinity.
 */
function invertMatrix(A: Float64Array[]): Float64Array[] {
  const p = A.length;
  let scale = 0;
  for (const row of A) for (let j = 0; j < p; j++) scale = Math.max(scale, Math.abs(row[j] ?? 0));
  const tol = scale * 1e-12;

  // Augmented matrix [A | I]
  const M: Float64Array[] = A.map((row, i) => {
    const r = new Float64Array(2 * p);
    for (let j = 0; j < p; j++) r[j] = row[j] ?? 0;
    r[p + i] = 1;
    return r;
  });

  for (let col = 0; col < p; col++) {
    let pivot = col;
    for (let row = col + 1; row < p; row++) {
      if (Math.abs(M[row]![col] ?? 0) > Math.abs(M[pivot]![col] ?? 0)) pivot = row;
    }
    const tmp = M[col]!;
    M[col] = M[pivot]!;
    M[pivot] = tmp;
    const denom = M[col]![col] ?? 0;
    // Negated ">" so that NaN pivots are rejected as well.
    if (!(Math.abs(denom) > tol)) {
      throw new Error("invertMatrix: matrix is singular to working precision");
    }
    for (let j = 0; j < 2 * p; j++) M[col]![j] = (M[col]![j] ?? 0) / denom;
    for (let row = 0; row < p; row++) {
      if (row === col) continue;
      const factor = M[row]![col] ?? 0;
      for (let j = 0; j < 2 * p; j++) M[row]![j] = (M[row]![j] ?? 0) - factor * (M[col]![j] ?? 0);
    }
  }

  return M.map((row) => row.slice(p));
}

/**
 * log(det(A)) of a symmetric positive-definite matrix via Cholesky
 * (reads the lower triangle only). Returns -Infinity when `A` is not
 * positive definite.
 */
function logDetSPD(A: Float64Array[]): number {
  const p = A.length;
  const L: Float64Array[] = Array.from({ length: p }, () => new Float64Array(p));
  let logDet = 0;
  for (let i = 0; i < p; i++) {
    for (let j = 0; j <= i; j++) {
      let s = A[i]![j] ?? 0;
      for (let k = 0; k < j; k++) s -= L[i]![k]! * L[j]![k]!;
      if (i === j) {
        if (!(s > 0)) return Number.NEGATIVE_INFINITY;
        L[i]![i] = Math.sqrt(s);
        logDet += Math.log(s); // log(L_ii^2)
      } else {
        L[i]![j] = s / L[j]![j]!;
      }
    }
  }
  return logDet;
}

export interface GraphicalLassoOptions {
  /** L1 penalty on off-diagonal precision entries. Default 0.01. */
  alpha?: number;
  /** Maximum number of outer sweeps over all columns. Default 100. */
  maxIter?: number;
  /** Outer loop stops once the largest change in W is below this. Default 1e-4. */
  tol?: number;
}

/**
 * Sparse inverse covariance estimation with L1 penalty (Graphical Lasso).
 * Mirrors sklearn.covariance.GraphicalLasso.
 * Uses the block coordinate descent algorithm (GLASSO).
 */
export class GraphicalLasso {
  alpha: number;
  maxIter: number;
  tol: number;

  covariance_: Float64Array[] | null = null;
  precision_: Float64Array[] | null = null;
  nIter_: number = 0;
  location_: Float64Array | null = null;

  constructor(options: GraphicalLassoOptions = {}) {
    this.alpha = options.alpha ?? 0.01;
    this.maxIter = options.maxIter ?? 100;
    this.tol = options.tol ?? 1e-4;
  }

  /**
   * Estimates the regularised covariance `covariance_` and its inverse
   * `precision_` from the rows of `X`.
   *
   * @throws Error if the final covariance estimate is singular.
   */
  fit(X: Float64Array[]): this {
    const p = (X[0] ?? new Float64Array(0)).length;
    this.location_ = colMeans(X);
    const S = empiricalCovariance(X);

    // Initialise W with S plus alpha on the diagonal.
    const W: Float64Array[] = S.map((row, i) => {
      const r = Float64Array.from(row);
      r[i] = (r[i] ?? 0) + this.alpha;
      return r;
    });

    for (let iter = 0; iter < this.maxIter; iter++) {
      let maxDelta = 0;
      for (let j = 0; j < p; j++) {
        // Partition: W11 is W without row/column j, s12 is column j of S.
        const idx: number[] = [];
        for (let k = 0; k < p; k++) if (k !== j) idx.push(k);
        const m = idx.length;
        const W11 = idx.map((r) => Float64Array.from(idx, (c) => W[r]![c] ?? 0));
        const s12 = Float64Array.from(idx, (r) => S[r]![j] ?? 0);

        // Coordinate descent on the lasso subproblem
        //   min_beta 1/2 beta' W11 beta - s12' beta + alpha * |beta|_1
        const beta = new Float64Array(m);
        for (let sweep = 0; sweep < 100; sweep++) {
          let maxD = 0;
          for (let k = 0; k < m; k++) {
            const rowK = W11[k]!;
            let partial = 0;
            for (let l = 0; l < m; l++) if (l !== k) partial += rowK[l]! * beta[l]!;
            const r = s12[k]! - partial;
            const wkk = rowK[k]!;
            let newBeta = 0;
            // A non-positive diagonal cannot be scaled by; leave the coefficient at 0.
            if (wkk > 0) {
              const b = r / wkk;
              const threshold = this.alpha / wkk;
              newBeta = b > threshold ? b - threshold : b < -threshold ? b + threshold : 0;
            }
            maxD = Math.max(maxD, Math.abs(newBeta - beta[k]!));
            beta[k] = newBeta;
          }
          if (maxD < 1e-6) break;
        }

        // Update W: w12 = W11 * beta (kept symmetric).
        for (let k = 0; k < m; k++) {
          let s = 0;
          for (let l = 0; l < m; l++) s += W11[k]![l]! * beta[l]!;
          const target = idx[k]!;
          const delta = Math.abs(s - (W[target]![j] ?? 0));
          if (delta > maxDelta) maxDelta = delta;
          W[target]![j] = s;
          W[j]![target] = s;
        }
      }
      this.nIter_ = iter + 1;
      if (maxDelta < this.tol) break;
    }

    this.covariance_ = W;
    this.precision_ = invertMatrix(W);
    return this;
  }

  /**
   * Average Gaussian log-likelihood of `X` under the fitted model:
   * `0.5 * (log det(P) - tr(C * P) - p * log(2 * pi))`, where `P` is the fitted
   * precision and `C` is the covariance of `X` about the fitted location.
   *
   * @throws NotFittedError if `fit` has not been called.
   */
  score(X: Float64Array[]): number {
    if (!this.covariance_) throw new NotFittedError("GraphicalLasso is not fitted yet.");
    const precision = this.precision_ ?? invertMatrix(this.covariance_);
    const p = precision.length;
    const loc = this.location_ ?? new Float64Array(p);

    // tr(C * P) equals the mean of the quadratic forms d' P d over the samples.
    let quad = 0;
    for (const xi of X) {
      for (let j = 0; j < p; j++) {
        let acc = 0;
        for (let k = 0; k < p; k++) acc += precision[j]![k]! * ((xi[k] ?? 0) - loc[k]!);
        quad += ((xi[j] ?? 0) - loc[j]!) * acc;
      }
    }
    const trace = X.length > 0 ? quad / X.length : 0;
    return 0.5 * (logDetSPD(precision) - trace - p * Math.log(2 * Math.PI));
  }
}

export interface MinCovDetOptions {
  /** Fraction of samples used as support; `null` selects floor((n + p + 1) / 2). */
  support?: number | null;
  /** Accepted for API compatibility; the fit below is deterministic. */
  randomState?: number;
}

/**
 * Minimum Covariance Determinant robust estimator.
 * Mirrors sklearn.covariance.MinCovDet.
 * Simplified: keeps the h samples closest (in Mahalanobis distance) to the
 * full-sample estimate and re-estimates location and covariance from them.
 */
export class MinCovDet {
  support: number | null;
  randomState: number;

  location_: Float64Array | null = null;
  covariance_: Float64Array[] | null = null;
  /** Inverse of `covariance_`, or `null` when that covariance is singular. */
  precision_: Float64Array[] | null = null;
  supportFraction_: number = 0;
  supportIndices_: Int32Array | null = null;
  rawLocation_: Float64Array | null = null;
  rawCovariance_: Float64Array[] | null = null;

  constructor(options: MinCovDetOptions = {}) {
    this.support = options.support ?? null;
    this.randomState = options.randomState ?? 0;
  }

  /** Fits the robust location/covariance from the rows of `X`. */
  fit(X: Float64Array[]): this {
    const n = X.length;
    const p = (X[0] ?? new Float64Array(0)).length;
    const requested = this.support != null
      ? Math.floor(this.support * n)
      : Math.floor((n + p + 1) / 2);
    // Keep at least one and at most n samples in the support.
    const h = Math.min(n, Math.max(1, requested));

    // Mahalanobis distances under the full empirical estimate; fall back to
    // Euclidean distance (identity precision) when that covariance is singular.
    const fullMeans = colMeans(X);
    const fullCov = empiricalCovariance(X);
    let precision: Float64Array[];
    try {
      precision = invertMatrix(fullCov);
    } catch {
      precision = Array.from({ length: p }, (_, i) => {
        const r = new Float64Array(p);
        r[i] = 1;
        return r;
      });
    }

    const mDist = X.map((xi) => {
      const diff = new Float64Array(p);
      for (let j = 0; j < p; j++) diff[j] = (xi[j] ?? 0) - (fullMeans[j] ?? 0);
      let d = 0;
      for (let j = 0; j < p; j++) {
        for (let k = 0; k < p; k++) d += (diff[j] ?? 0) * (precision[j]![k] ?? 0) * (diff[k] ?? 0);
      }
      return d;
    });

    // Select the h points with the smallest Mahalanobis distances.
    const sortedIdx = Array.from({ length: n }, (_, i) => i).sort((a, b) => mDist[a]! - mDist[b]!);
    const supportIdx = new Int32Array(sortedIdx.slice(0, h));

    const subset = Array.from(supportIdx).map((i) => X[i] ?? new Float64Array(p));
    this.rawLocation_ = colMeans(subset);
    this.rawCovariance_ = empiricalCovariance(subset);

    this.location_ = this.rawLocation_;
    this.covariance_ = this.rawCovariance_;
    try {
      this.precision_ = invertMatrix(this.covariance_);
    } catch {
      this.precision_ = null;
    }

    this.supportFraction_ = n > 0 ? h / n : 0;
    this.supportIndices_ = supportIdx;
    return this;
  }

  /**
   * Squared Mahalanobis distances of the rows of `X` to the fitted location.
   *
   * @throws NotFittedError if `fit` has not run or no precision is available.
   */
  mahalanobis(X: Float64Array[]): Float64Array {
    if (!this.location_ || !this.precision_) throw new NotFittedError("MinCovDet is not fitted yet.");
    const loc = this.location_;
    const prec = this.precision_;
    const p = loc.length;
    return Float64Array.from(X, (xi) => {
      const diff = new Float64Array(p);
      for (let j = 0; j < p; j++) diff[j] = (xi[j] ?? 0) - (loc[j] ?? 0);
      let d = 0;
      for (let j = 0; j < p; j++) {
        for (let k = 0; k < p; k++) d += (diff[j] ?? 0) * (prec[j]![k] ?? 0) * (diff[k] ?? 0);
      }
      return d;
    });
  }
}
/**
 * Minimum Covariance Determinant (MCD): robust covariance estimation
 */

export class MinCovDet {
  private support_fraction: number;
  private nSubsets: number;
  /** Uniform [0, 1) source used to draw the initial subsets. */
  private random: () => number;
  location_: Float64Array | null = null;
  covariance_: Float64Array[] | null = null;
  precision_: Float64Array[] | null = null;
  /** 0/1 mask over the training rows; 1 marks rows in the selected subset. */
  support_: Int32Array | null = null;

  /**
   * @param support_fraction Fraction of samples in the support. Omitted or
   *   non-positive selects floor((n + p + 1) / 2) samples.
   * @param nSubsets Number of random initial subsets to try (capped at 500).
   * @param randomState Optional seed; when given, the search is reproducible,
   *   otherwise `Math.random` is used.
   */
  constructor(support_fraction?: number, nSubsets = 500, randomState?: number) {
    this.support_fraction = support_fraction ?? 0;
    this.nSubsets = nSubsets;
    if (randomState === undefined) {
      this.random = Math.random;
    } else {
      let state = randomState >>> 0;
      this.random = () => {
        state = (Math.imul(state, 1664525) + 1013904223) >>> 0;
        return state / 4294967296;
      };
    }
  }

  /**
   * Searches random starts for the h-subset with the smallest covariance
   * determinant and stores its mean, covariance, precision and support mask.
   *
   * @throws Error when `X` has no rows.
   */
  fit(X: Float64Array[]): this {
    const n = X.length;
    if (n === 0) throw new Error("MinCovDet.fit requires at least one sample");
    const p = X[0]?.length ?? 0;
    // Support size: a fraction of n when given, else the classical
    // (n + p + 1) / 2 count; at least p + 1 and never more than n.
    const target = this.support_fraction > 0
      ? Math.floor(this.support_fraction * n)
      : Math.floor((n + p + 1) / 2);
    const h = Math.min(n, Math.max(p + 1, target));

    // FastMCD approximation: multiple random subsets
    let bestDet = Number.POSITIVE_INFINITY;
    let bestSubset: number[] | null = null;

    for (let iter = 0; iter < Math.min(this.nSubsets, 500); iter++) {
      // Random initial subset of size p+1, grown to the h closest points.
      const subset = this.randomSubset(n, Math.min(p + 1, n));
      const expanded = this.expandSubset(X, subset, h);
      const { cov } = this.computeMeanCov(X, expanded);
      const det = this.det(cov);
      if (det < bestDet) {
        bestDet = det;
        bestSubset = expanded;
      }
    }

    const finalSubset = bestSubset ?? Array.from({ length: h }, (_, i) => i);
    const { mean, cov } = this.computeMeanCov(X, finalSubset);

    this.location_ = mean;
    this.covariance_ = cov;
    this.precision_ = this.invertMatrix(cov);
    this.support_ = new Int32Array(n);
    for (const idx of finalSubset) this.support_[idx] = 1;
    return this;
  }

  /** Draws `k` distinct indices from 0..n-1 with a Fisher-Yates shuffle. */
  private randomSubset(n: number, k: number): number[] {
    const indices = Array.from({ length: n }, (_, i) => i);
    for (let i = n - 1; i > 0; i--) {
      const j = Math.floor(this.random() * (i + 1));
      const tmp = indices[i]!;
      indices[i] = indices[j]!;
      indices[j] = tmp;
    }
    return indices.slice(0, k);
  }

  /** Returns the `h` rows closest (Mahalanobis) to the estimate from `subset`. */
  private expandSubset(X: Float64Array[], subset: number[], h: number): number[] {
    const { mean, cov } = this.computeMeanCov(X, subset);
    const prec = this.invertMatrix(cov);
    const dists = X.map((row, i) => ({ i, d: this.mahalanobis(row, mean, prec) }));
    dists.sort((a, b) => a.d - b.d);
    return dists.slice(0, h).map((d) => d.i);
  }

  /** Squared Mahalanobis distance of `x` to `mean` under precision `prec`. */
  private mahalanobis(x: Float64Array, mean: Float64Array, prec: Float64Array[]): number {
    const p = x.length;
    const diff = new Float64Array(p);
    for (let i = 0; i < p; i++) diff[i] = (x[i] ?? 0) - (mean[i] ?? 0);
    let dist = 0;
    for (let i = 0; i < p; i++) {
      for (let j = 0; j < p; j++) dist += (diff[i] ?? 0) * (prec[i]![j] ?? 0) * (diff[j] ?? 0);
    }
    return dist;
  }

  /** Mean and unbiased (n - 1) covariance of the rows selected by `indices`. */
  private computeMeanCov(X: Float64Array[], indices: number[]): { mean: Float64Array; cov: Float64Array[] } {
    const p = X[0]?.length ?? 0;
    const n = indices.length;
    const mean = new Float64Array(p);
    for (const idx of indices) {
      for (let j = 0; j < p; j++) mean[j] = mean[j]! + (X[idx]![j] ?? 0) / n;
    }
    // A single-row subset has no spread; avoid dividing by zero.
    const dof = Math.max(1, n - 1);
    const cov: Float64Array[] = Array.from({ length: p }, () => new Float64Array(p));
    for (const idx of indices) {
      const diff = new Float64Array(p);
      for (let j = 0; j < p; j++) diff[j] = (X[idx]![j] ?? 0) - (mean[j] ?? 0);
      for (let i = 0; i < p; i++) {
        for (let j = 0; j < p; j++) cov[i]![j] = cov[i]![j]! + ((diff[i] ?? 0) * (diff[j] ?? 0)) / dof;
      }
    }
    return { mean, cov };
  }

  /** Determinant via Gaussian elimination with partial pivoting (0 if singular). */
  private det(A: Float64Array[]): number {
    const n = A.length;
    if (n === 1) return A[0]![0] ?? 0;
    if (n === 2) return (A[0]![0] ?? 0) * (A[1]![1] ?? 0) - (A[0]![1] ?? 0) * (A[1]![0] ?? 0);
    let result = 1;
    const mat = A.map((row) => Float64Array.from(row));
    for (let col = 0; col < n; col++) {
      let maxRow = col;
      for (let row = col + 1; row < n; row++) {
        if (Math.abs(mat[row]![col] ?? 0) > Math.abs(mat[maxRow]![col] ?? 0)) maxRow = row;
      }
      if (maxRow !== col) {
        const tmp = mat[col]!;
        mat[col] = mat[maxRow]!;
        mat[maxRow] = tmp;
        result *= -1; // a row swap flips the sign of the determinant
      }
      const pivot = mat[col]![col] ?? 0;
      if (Math.abs(pivot) < 1e-10) return 0;
      result *= pivot;
      for (let row = col + 1; row < n; row++) {
        const factor = (mat[row]![col] ?? 0) / pivot;
        for (let j = col; j < n; j++) mat[row]![j] = (mat[row]![j] ?? 0) - factor * (mat[col]![j] ?? 0);
      }
    }
    return result;
  }

  /**
   * Gauss-Jordan inverse with partial pivoting. A zero pivot is replaced
   * by 1 so the routine always returns finite numbers; the result is then
   * not a true inverse.
   */
  private invertMatrix(A: Float64Array[]): Float64Array[] {
    const n = A.length;
    const aug = A.map((row, i) => {
      const r = new Float64Array(2 * n);
      for (let j = 0; j < n; j++) r[j] = row[j] ?? 0;
      r[n + i] = 1;
      return r;
    });
    for (let col = 0; col < n; col++) {
      let maxRow = col;
      for (let row = col + 1; row < n; row++) {
        if (Math.abs(aug[row]![col] ?? 0) > Math.abs(aug[maxRow]![col] ?? 0)) maxRow = row;
      }
      const tmp = aug[col]!;
      aug[col] = aug[maxRow]!;
      aug[maxRow] = tmp;
      const pivot = aug[col]![col] ?? 1;
      for (let j = 0; j < 2 * n; j++) aug[col]![j] = (aug[col]![j] ?? 0) / (pivot || 1);
      for (let row = 0; row < n; row++) {
        if (row === col) continue;
        const factor = aug[row]![col] ?? 0;
        for (let j = 0; j < 2 * n; j++) aug[row]![j] = (aug[row]![j] ?? 0) - factor * (aug[col]![j] ?? 0);
      }
    }
    return aug.map((row) => row.slice(n));
  }

  /**
   * Squared Mahalanobis distances of the rows of `X` to the fitted location.
   *
   * @throws Error if the estimator has not been fitted.
   */
  mahalanobisDistances(X: Float64Array[]): Float64Array {
    if (!this.location_ || !this.precision_) throw new Error("Not fitted");
    const loc = this.location_;
    const prec = this.precision_;
    return Float64Array.from(X, (row) => this.mahalanobis(row, loc, prec));
  }
}
/**
 * Covariance utilities: precision matrix estimation, covariance selection.
 * ledoit_wolf() and oas() functional APIs, plus precision/correlation conversion.
 * Mirrors sklearn.covariance functional API and utility functions.
 */

/** Column means of `X`; the feature count is taken from the first row. */
function colMeans(X: Float64Array[]): Float64Array {
  const p = (X[0] ?? new Float64Array(0)).length;
  const m = new Float64Array(p);
  const n = X.length;
  for (const xi of X) for (let j = 0; j < p; j++) m[j] = (m[j] ?? 0) + (xi[j] ?? 0);
  for (let j = 0; j < p; j++) m[j] = (m[j] ?? 0) / n;
  return m;
}

/** Biased (divide-by-n) covariance of `X` about `means`, mirrored to stay symmetric. */
function empCovMatrix(X: Float64Array[], means: Float64Array): Float64Array[] {
  const n = X.length;
  const p = means.length;
  const C = Array.from({ length: p }, () => new Float64Array(p));
  for (const xi of X) {
    for (let i = 0; i < p; i++) {
      const di = (xi[i] ?? 0) - (means[i] ?? 0);
      for (let j = i; j < p; j++) {
        const dj = (xi[j] ?? 0) - (means[j] ?? 0);
        C[i]![j] = (C[i]![j] ?? 0) + di * dj;
      }
    }
  }
  for (let i = 0; i < p; i++) {
    C[i]![i] = (C[i]![i] ?? 0) / n;
    for (let j = i + 1; j < p; j++) {
      C[i]![j] = (C[i]![j] ?? 0) / n;
      C[j]![i] = C[i]![j] ?? 0;
    }
  }
  return C;
}

/** Trace of a square matrix. */
function matTrace(M: Float64Array[]): number {
  let s = 0;
  for (let i = 0; i < M.length; i++) s += M[i]![i] ?? 0;
  return s;
}

/** Squared Frobenius norm (sum of squared entries). */
function matFrobSq(M: Float64Array[]): number {
  let s = 0;
  for (const row of M) for (let j = 0; j < row.length; j++) s += (row[j] ?? 0) ** 2;
  return s;
}

/**
 * Diagonal matrix holding 1 / M[i][i] for every positive diagonal entry;
 * every other entry (including all off-diagonals) is 0.
 */
function invertDiag(M: Float64Array[]): Float64Array[] {
  return M.map((row, i) => Float64Array.from(row, (v, j) => (i === j && v > 0 ? 1 / v : 0)));
}

/**
 * Functional API: Ledoit-Wolf analytical shrinkage.
 * Mirrors sklearn.covariance.ledoit_wolf.
 *
 * The shrinkage is `min(b, d) / d`, where `d = ||S||_F^2 - tr(S)^2 / p` is the
 * squared distance of S from the scaled-identity target and `b` estimates
 * the sampling error of S from fourth moments of the centred data.
 *
 * @returns The shrunk covariance and the shrinkage coefficient in [0, 1].
 */
export function ledoitWolf(
  X: Float64Array[],
  options: { assumeCentered?: boolean } = {},
): { covariance: Float64Array[]; shrinkage: number } {
  const n = X.length;
  const p = (X[0] ?? new Float64Array(0)).length;
  const location = options.assumeCentered ? new Float64Array(p) : colMeans(X);
  const S = empCovMatrix(X, location);
  const trS = matTrace(S);
  const trS2 = matFrobSq(S);
  const trSsq = trS ** 2;

  // sum_ik sum_t d_ti^2 d_tk^2 collapses to sum_t (sum_i d_ti^2)^2.
  let fourth = 0;
  for (const xt of X) {
    let sq = 0;
    for (let i = 0; i < p; i++) sq += ((xt[i] ?? 0) - (location[i] ?? 0)) ** 2;
    fourth += sq * sq;
  }
  const delta = n > 0 ? (fourth / n - trS2) / n : 0;

  const delta2 = p > 0 ? trS2 - trSsq / p : 0;
  // Capping at delta2 keeps the coefficient at or below 1.
  const shrinkage = delta2 > 0 ? Math.min(Math.max(delta, 0), delta2) / delta2 : 0;

  const mu = p > 0 ? trS / p : 0;
  const covariance = S.map((row, i) =>
    Float64Array.from(row, (v, j) => (1 - shrinkage) * v + shrinkage * (i === j ? mu : 0)),
  );
  return { covariance, shrinkage };
}

/**
 * Functional API: Oracle Approximating Shrinkage (OAS).
 * Mirrors sklearn.covariance.oas.
 *
 * @returns The shrunk covariance and the shrinkage coefficient in [0, 1]
 *   (0 when the denominator of the formula is not positive).
 */
export function oas(
  X: Float64Array[],
  options: { assumeCentered?: boolean } = {},
): { covariance: Float64Array[]; shrinkage: number } {
  const n = X.length;
  const p = (X[0] ?? new Float64Array(0)).length;
  const location = options.assumeCentered ? new Float64Array(p) : colMeans(X);
  const S = empCovMatrix(X, location);
  const trS = matTrace(S);
  const trS2 = matFrobSq(S);
  const trSsq = trS ** 2;

  const num = (1 - 2 / p) * trS2 + trSsq;
  const denom = (n + 1 - 2 / p) * (trS2 - trSsq / p);
  const shrinkage = denom > 0 ? Math.min(1, Math.max(0, num / denom)) : 0;

  const mu = trS / p;
  const covariance = S.map((row, i) =>
    Float64Array.from(row, (v, j) => (1 - shrinkage) * v + shrinkage * (i === j ? mu : 0)),
  );
  return { covariance, shrinkage };
}

/**
 * Convert a covariance matrix to a correlation matrix.
 * Mirrors sklearn.covariance.cov_to_corr.
 * Variances are floored at 1e-12 before taking the square root.
 */
export function covToCorr(covariance: Float64Array[]): Float64Array[] {
  const p = covariance.length;
  const std = Float64Array.from({ length: p }, (_, i) => Math.sqrt(Math.max(covariance[i]![i] ?? 0, 1e-12)));
  return covariance.map((row, i) =>
    Float64Array.from(row, (v, j) => v / ((std[i] ?? 1) * (std[j] ?? 1))),
  );
}

/**
 * Total Gaussian log-likelihood of the rows of `X` under N(mean, covariance):
 * `-0.5 * n * (p * log(2 * pi) + log det(C) + tr(C^-1 * S))`, where `S` is the
 * covariance of `X` about `mean`.
 *
 * Both the log-determinant and the trace term come from one Cholesky
 * factorisation. Pivots are floored at 1e-12 so that a covariance that is
 * not positive definite yields a finite value instead of NaN.
 */
export function gaussianLogLikelihood(
  X: Float64Array[],
  mean: Float64Array,
  covariance: Float64Array[],
): number {
  const n = X.length;
  const p = mean.length;

  // Lower Cholesky factor L with covariance = L * L'.
  const L = Array.from({ length: p }, () => new Float64Array(p));
  for (let i = 0; i < p; i++) {
    for (let j = 0; j <= i; j++) {
      let s = covariance[i]![j] ?? 0;
      for (let k = 0; k < j; k++) s -= (L[i]![k] ?? 0) * (L[j]![k] ?? 0);
      L[i]![j] = i === j ? Math.sqrt(Math.max(s, 1e-12)) : s / Math.max(L[j]![j] ?? 1, 1e-12);
    }
  }
  let logDet = 0;
  for (let i = 0; i < p; i++) logDet += Math.log(Math.max(L[i]![i] ?? 1e-12, 1e-12));
  logDet *= 2;

  // tr(C^-1 * S) = mean over samples of |L^-1 (x - mean)|^2, obtained by
  // forward substitution against L.
  let quad = 0;
  const z = new Float64Array(p);
  for (const xi of X) {
    for (let i = 0; i < p; i++) {
      let s = (xi[i] ?? 0) - (mean[i] ?? 0);
      for (let k = 0; k < i; k++) s -= L[i]![k]! * z[k]!;
      const zi = s / L[i]![i]!;
      z[i] = zi;
      quad += zi * zi;
    }
  }
  const trSP = n > 0 ? quad / n : 0;

  return -0.5 * (n * (p * Math.log(2 * Math.PI) + logDet + trSP));
}

/**
 * Sparse inverse covariance estimator (precision matrix selector).
 * Mirrors sklearn.covariance sparse precision concepts.
 *
 * The precision is built from the reciprocal diagonal of the empirical
 * covariance and its off-diagonal entries are then soft-thresholded. Since
 * that diagonal estimate has no off-diagonal mass, the fitted precision is
 * currently always diagonal and `threshold` has no effect on it.
 */
export class SparsePrecision {
  threshold: number;
  assumeCentered: boolean;

  location_: Float64Array | null = null;
  covariance_: Float64Array[] | null = null;
  precision_: Float64Array[] | null = null;

  constructor(options: { threshold?: number; assumeCentered?: boolean } = {}) {
    this.threshold = options.threshold ?? 0.1;
    this.assumeCentered = options.assumeCentered ?? false;
  }

  /** Estimates location, empirical covariance and the diagonal precision. */
  fit(X: Float64Array[]): this {
    const p = (X[0] ?? new Float64Array(0)).length;
    const location = this.assumeCentered ? new Float64Array(p) : colMeans(X);
    this.location_ = location;
    const S = empCovMatrix(X, location);
    this.covariance_ = S;

    // Simple diagonal precision estimate with soft-thresholding
    const P = invertDiag(S);
    // Soft-threshold off-diagonal elements
    this.precision_ = P.map((row, i) =>
      Float64Array.from(row, (v, j) => {
        if (i === j) return v;
        return Math.abs(v) > this.threshold ? v - Math.sign(v) * this.threshold : 0;
      }),
    );
    return this;
  }

  /**
   * Squared Mahalanobis distances of the rows of `X` under the fitted precision.
   *
   * @throws NotFittedError if `fit` has not been called.
   */
  mahalanobis(X: Float64Array[]): Float64Array {
    if (this.precision_ === null || this.location_ === null) {
      throw new NotFittedError("SparsePrecision");
    }
    const P = this.precision_;
    const mu = this.location_;
    const p = mu.length;
    return Float64Array.from(X, (xi) => {
      let d = 0;
      for (let j = 0; j < p; j++) {
        let pRow = 0;
        for (let k = 0; k < p; k++) pRow += (P[j]![k] ?? 0) * ((xi[k] ?? 0) - (mu[k] ?? 0));
        d += ((xi[j] ?? 0) - (mu[j] ?? 0)) * pRow;
      }
      return d;
    });
  }
}
/**
 * Covariance estimators: LedoitWolf, OAS, and ShrunkCovariance.
 * Analogous to sklearn.covariance._shrunk_covariance and _ledoit_wolf.
 */

/** Result of a covariance estimate. */
export interface CovResult {
  /** Estimated covariance matrix (flat, nFeatures × nFeatures). */
  covariance: Float64Array;
  /** Estimated precision matrix (inverse of covariance). */
  precision: Float64Array;
  nFeatures: number;
}

/**
 * Computes the sample covariance matrix from a flat (nSamples × nFeatures) matrix X
 * that has already been mean-centered. Uses the unbiased 1 / (nSamples - 1)
 * scale; with fewer than two samples the scale is 1 to avoid dividing by zero.
 */
function sampleCov(X: Float64Array, nSamples: number, nFeatures: number): Float64Array {
  const cov = new Float64Array(nFeatures * nFeatures);
  const scale = 1 / Math.max(1, nSamples - 1);
  for (let i = 0; i < nSamples; i++) {
    const base = i * nFeatures;
    for (let j = 0; j < nFeatures; j++) {
      for (let k = j; k < nFeatures; k++) {
        const v = X[base + j]! * X[base + k]! * scale;
        cov[j * nFeatures + k] = cov[j * nFeatures + k]! + v;
        if (k !== j) cov[k * nFeatures + j] = cov[k * nFeatures + j]! + v;
      }
    }
  }
  return cov;
}

/** Centers X in-place and returns the column means (zeros when nSamples is 0). */
function centerMatrix(X: Float64Array, nSamples: number, nFeatures: number): Float64Array {
  const means = new Float64Array(nFeatures);
  if (nSamples <= 0) return means;
  for (let i = 0; i < nSamples; i++) {
    for (let j = 0; j < nFeatures; j++) means[j] = means[j]! + X[i * nFeatures + j]!;
  }
  for (let j = 0; j < nFeatures; j++) means[j] = means[j]! / nSamples;
  for (let i = 0; i < nSamples; i++) {
    for (let j = 0; j < nFeatures; j++) X[i * nFeatures + j] = X[i * nFeatures + j]! - means[j]!;
  }
  return means;
}

/** Applies a shrinkage factor α: Σ_shrunk = (1-α)·S + α·(tr(S)/p)·I */
function shrinkCov(S: Float64Array, p: number, alpha: number): Float64Array {
  let trace = 0;
  for (let j = 0; j < p; j++) trace += S[j * p + j]!;
  const mu = trace / p;
  const out = new Float64Array(p * p);
  for (let i = 0; i < p; i++) {
    for (let j = 0; j < p; j++) out[i * p + j] = (1 - alpha) * S[i * p + j]!;
    out[i * p + i] = out[i * p + i]! + alpha * mu;
  }
  return out;
}

/**
 * Inverts a symmetric positive-definite p×p matrix via Gauss-Jordan.
 * A column whose pivot is below 1e-14 is skipped, so a singular input
 * yields a finite matrix that is not a true inverse.
 */
function invertPD(A: Float64Array, p: number): Float64Array {
  const w = 2 * p;
  const aug = new Float64Array(p * w);
  for (let i = 0; i < p; i++) {
    for (let j = 0; j < p; j++) aug[i * w + j] = A[i * p + j]!;
    aug[i * w + p + i] = 1;
  }
  for (let col = 0; col < p; col++) {
    let maxRow = col;
    for (let r = col + 1; r < p; r++) {
      if (Math.abs(aug[r * w + col]!) > Math.abs(aug[maxRow * w + col]!)) maxRow = r;
    }
    if (maxRow !== col) {
      for (let k = 0; k < w; k++) {
        const tmp = aug[col * w + k]!;
        aug[col * w + k] = aug[maxRow * w + k]!;
        aug[maxRow * w + k] = tmp;
      }
    }
    const pivot = aug[col * w + col]!;
    if (Math.abs(pivot) < 1e-14) continue;
    for (let k = 0; k < w; k++) aug[col * w + k] = aug[col * w + k]! / pivot;
    for (let r = 0; r < p; r++) {
      if (r === col) continue;
      const f = aug[r * w + col]!;
      for (let k = 0; k < w; k++) aug[r * w + k] = aug[r * w + k]! - f * aug[col * w + k]!;
    }
  }
  const inv = new Float64Array(p * p);
  for (let i = 0; i < p; i++) for (let j = 0; j < p; j++) inv[i * p + j] = aug[i * w + p + j]!;
  return inv;
}

// ─── ShrunkCovariance ──────────────────────────────────────────────────────

export interface ShrunkCovarianceOptions {
  /** Shrinkage coefficient in [0, 1]. Default 0.1. */
  shrinkage?: number;
  /** Whether to store the precision matrix. Default true. */
  storePrecision?: boolean;
  /** Whether to assume the data is already centered. Default false. */
  assumeCentered?: boolean;
}

/** Covariance estimator with manually set shrinkage (Ledoit-Wolf is automatic). */
export class ShrunkCovariance {
  private opts: Required<ShrunkCovarianceOptions>;
  covariance_: Float64Array | undefined;
  precision_: Float64Array | undefined;
  location_: Float64Array | undefined;

  constructor(opts: ShrunkCovarianceOptions = {}) {
    this.opts = {
      shrinkage: opts.shrinkage ?? 0.1,
      storePrecision: opts.storePrecision ?? true,
      assumeCentered: opts.assumeCentered ?? false,
    };
  }

  /**
   * Fits the shrunk covariance from a flat row-major (nSamples × nFeatures)
   * matrix. `X` itself is not modified; centring happens on a copy.
   */
  fit(X: Float64Array, nSamples: number, nFeatures: number): this {
    const Xc = new Float64Array(X);
    const location = this.opts.assumeCentered
      ? new Float64Array(nFeatures)
      : centerMatrix(Xc, nSamples, nFeatures);
    this.location_ = location;
    const S = sampleCov(Xc, nSamples, nFeatures);
    this.covariance_ = shrinkCov(S, nFeatures, this.opts.shrinkage);
    if (this.opts.storePrecision) this.precision_ = invertPD(this.covariance_, nFeatures);
    return this;
  }

  /**
   * Average per-sample Gaussian log-likelihood of `X` under the fitted model.
   *
   * @throws NotFittedError if `fit` has not been called.
   */
  score(X: Float64Array, nSamples: number, nFeatures: number): number {
    if (!this.covariance_) throw new NotFittedError("ShrunkCovariance is not fitted");
    return logLikelihood(X, nSamples, nFeatures, this.covariance_, this.location_!);
  }
}

// ─── OAS ───────────────────────────────────────────────────────────────────

export interface OASOptions {
  /** Whether to store the precision matrix. Default true. */
  storePrecision?: boolean;
  /** Whether to assume the data is already centered. Default false. */
  assumeCentered?: boolean;
}

/**
 * Oracle Approximating Shrinkage (OAS) covariance estimator.
 * More accurate than Ledoit-Wolf for Gaussian data when n < p.
 */
export class OAS {
  private opts: Required<OASOptions>;
  covariance_: Float64Array | undefined;
  precision_: Float64Array | undefined;
  shrinkage_: number | undefined;
  location_: Float64Array | undefined;

  constructor(opts: OASOptions = {}) {
    this.opts = { storePrecision: opts.storePrecision ?? true, assumeCentered: opts.assumeCentered ?? false };
  }

  /**
   * Fits the OAS-shrunk covariance from a flat row-major (nSamples × nFeatures)
   * matrix. `X` itself is not modified; centring happens on a copy.
   */
  fit(X: Float64Array, nSamples: number, nFeatures: number): this {
    const n = nSamples;
    const p = nFeatures;
    const Xc = new Float64Array(X);
    const location = this.opts.assumeCentered ? new Float64Array(p) : centerMatrix(Xc, n, p);
    this.location_ = location;
    const S = sampleCov(Xc, n, p);

    // OAS shrinkage estimate
    let trS = 0;
    for (let j = 0; j < p; j++) trS += S[j * p + j]!;
    let trS2 = 0;
    for (let i = 0; i < p; i++) for (let j = 0; j < p; j++) trS2 += S[i * p + j]! * S[j * p + i]!;

    // The denominator is 0 whenever S is a multiple of the identity (always
    // the case for p === 1); shrinking changes nothing there, so use 0
    // instead of letting 0 / 0 poison the estimate with NaN.
    const num = (1 - 2 / p) * trS2 + trS * trS;
    const den = (n + 1 - 2 / p) * (trS2 - (trS * trS) / p);
    const alpha = den > 0 ? Math.min(1, Math.max(0, num / den)) : 0;
    this.shrinkage_ = alpha;
    this.covariance_ = shrinkCov(S, p, alpha);
    if (this.opts.storePrecision) this.precision_ = invertPD(this.covariance_, p);
    return this;
  }

  /**
   * Average per-sample Gaussian log-likelihood of `X` under the fitted model.
   *
   * @throws NotFittedError if `fit` has not been called.
   */
  score(X: Float64Array, nSamples: number, nFeatures: number): number {
    if (!this.covariance_) throw new NotFittedError("OAS is not fitted");
    return logLikelihood(X, nSamples, nFeatures, this.covariance_, this.location_!);
  }
}

// ─── Shared log-likelihood ─────────────────────────────────────────────────

/** Gaussian log-likelihood of X given a covariance estimate, averaged per sample. */
function logLikelihood(
  X: Float64Array,
  nSamples: number,
  nFeatures: number,
  cov: Float64Array,
  loc: Float64Array,
): number {
  const p = nFeatures;
  const prec = invertPD(cov, p);
  let ll = 0;
  for (let i = 0; i < nSamples; i++) {
    let quad = 0;
    for (let j = 0; j < p; j++) {
      let row = 0;
      for (let k = 0; k < p; k++) row += prec[j * p + k]! * (X[i * p + k]! - loc[k]!);
      quad += (X[i * p + j]! - loc[j]!) * row;
    }
    ll -= 0.5 * quad;
  }
  // Subtract 0.5 * n * log|Σ|
  let logDet = 0;
  // Use the diagonal of a Cholesky factorisation for log-det
  const L = choleskyDiag(cov, p);
  for (let j = 0; j < p; j++) logDet += 2 * Math.log(Math.max(L[j]!, 1e-15));
  ll -= 0.5 * nSamples * logDet;
  ll -= 0.5 * nSamples * p * Math.log(2 * Math.PI);
  return ll / nSamples;
}

/** Returns only the diagonal of the lower Cholesky factor (for log-det). */
function choleskyDiag(A: Float64Array, p: number): Float64Array {
  const L = new Float64Array(p * p);
  for (let i = 0; i < p; i++) {
    for (let j = 0; j <= i; j++) {
      let s = A[i * p + j]!;
      for (let k = 0; k < j; k++) s -= L[i * p + k]! * L[j * p + k]!;
      if (i === j) {
        L[i * p + j] = Math.sqrt(Math.max(s, 0));
      } else {
        L[i * p + j] = L[j * p + j]! > 0 ? s / L[j * p + j]! : 0;
      }
    }
  }
  return Float64Array.from({ length: p }, (_, j) => L[j * p + j]!);
}
/**
 * Diagonal of the lower Cholesky factor of a flattened row-major p x p matrix.
 *
 * Negative pivots are clamped to zero before the square root, and entries
 * below a non-positive pivot are set to zero, so the routine never throws on
 * matrices that are not positive definite.
 *
 * @param A Flattened p x p matrix (only the lower triangle is read).
 * @param p Matrix dimension.
 * @returns Array of length p holding the factor's diagonal.
 */
function choleskyDiag(A: Float64Array, p: number): Float64Array {
  const factor = new Float64Array(p * p);
  const diagonal = new Float64Array(p);

  for (let row = 0; row < p; row++) {
    const rowOffset = row * p;

    // Strictly-lower entries of this row.
    for (let col = 0; col < row; col++) {
      const colOffset = col * p;
      let acc = A[rowOffset + col]!;
      for (let k = 0; k < col; k++) acc -= factor[rowOffset + k]! * factor[colOffset + k]!;
      const pivot = diagonal[col]!;
      factor[rowOffset + col] = pivot > 0 ? acc / pivot : 0;
    }

    // Diagonal entry of this row.
    let acc = A[rowOffset + row]!;
    for (let k = 0; k < row; k++) acc -= factor[rowOffset + k]! * factor[rowOffset + k]!;
    const pivot = Math.sqrt(Math.max(acc, 0));
    factor[rowOffset + row] = pivot;
    diagonal[row] = pivot;
  }

  return diagonal;
}
/**
 * Truncated SVD by alternating power iteration with rank-one deflation.
 *
 * For each component the routine alternates v = M^T u / ||M^T u|| and
 * u = M v / ||M v|| until u moves by less than 1e-8 (or `maxIter` is hit),
 * then subtracts sigma * u * v^T from the working copy of M.
 *
 * `S` always has exactly one entry per returned component, so `S[c]` belongs
 * to `U[c]` / `Vt[c]`. A component for which no direction with a non-zero
 * image exists gets sigma = 0.
 *
 * @param M Matrix as an array of rows; it is copied, never modified.
 * @param nComponents Number of singular triplets to extract.
 * @param maxIter Maximum power iterations per component.
 * @returns `U` (one left vector of length m per component), `S` (singular
 *   values) and `Vt` (one right vector of length n per component).
 */
function powerSVD(
  M: Float64Array[],
  nComponents: number,
  maxIter = 200,
): { U: Float64Array[]; S: Float64Array; Vt: Float64Array[] } {
  const EPS = 1e-10;
  const m = M.length;
  const n = (M[0] ?? new Float64Array(0)).length;
  const U: Float64Array[] = [];
  const S: number[] = [];
  const Vt: Float64Array[] = [];

  let Mdefl = M.map((row) => new Float64Array(row));

  /** Euclidean norm of the first `len` entries of x. */
  const l2 = (x: Float64Array, len: number): number => {
    let s = 0;
    for (let i = 0; i < len; i++) s += (x[i] ?? 0) ** 2;
    return Math.sqrt(s);
  };

  /** Mdefl^T u (length n), evaluated against the current deflated matrix. */
  const transposeTimes = (u: Float64Array): Float64Array => {
    const v = new Float64Array(n);
    for (let i = 0; i < m; i++) {
      const row = Mdefl[i] ?? new Float64Array(n);
      for (let j = 0; j < n; j++) v[j] = (v[j] ?? 0) + (u[i] ?? 0) * (row[j] ?? 0);
    }
    return v;
  };

  for (let c = 0; c < nComponents; c++) {
    // Start from basis vector e_(c % m); if that row of the deflated matrix is
    // numerically zero (M^T e_i is exactly row i), rotate through the remaining
    // rows so that a zero row cannot hide a non-zero component.
    let start = -1;
    for (let s = 0; s < m && start < 0; s++) {
      const idx = (c + s) % m;
      if (l2(Mdefl[idx] ?? new Float64Array(n), n) >= EPS) start = idx;
    }

    let u: Float64Array = new Float64Array(m);
    let sigma = 0;
    if (start < 0) {
      // Nothing left to extract: keep the conventional start vector, sigma = 0.
      if (m > 0) u[c % m] = 1;
    } else {
      u[start] = 1;
      for (let iter = 0; iter < maxIter; iter++) {
        // v = M^T u, normalised
        const v = transposeTimes(u);
        const vnorm = l2(v, n);
        if (vnorm < EPS) {
          sigma = 0;
          break;
        }
        for (let j = 0; j < n; j++) v[j] = (v[j] ?? 0) / vnorm;

        // u = M v, normalised; its norm is the singular value estimate
        const uNew = new Float64Array(m);
        for (let i = 0; i < m; i++) {
          const row = Mdefl[i] ?? new Float64Array(n);
          for (let j = 0; j < n; j++) uNew[i] = (uNew[i] ?? 0) + (row[j] ?? 0) * (v[j] ?? 0);
        }
        const unorm = l2(uNew, m);
        if (unorm < EPS) {
          sigma = 0;
          break;
        }
        sigma = unorm;
        for (let i = 0; i < m; i++) uNew[i] = (uNew[i] ?? 0) / unorm;

        let diff2 = 0;
        for (let i = 0; i < m; i++) diff2 += ((uNew[i] ?? 0) - (u[i] ?? 0)) ** 2;
        u = uNew;
        if (Math.sqrt(diff2) < 1e-8) break;
      }
    }
    // Exactly one singular value per component keeps S aligned with U / Vt.
    S.push(sigma);

    // Right vector that matches the final u.
    const v = transposeTimes(u);
    const vnorm = l2(v, n);
    if (vnorm > EPS) for (let j = 0; j < n; j++) v[j] = (v[j] ?? 0) / vnorm;

    U.push(u);
    Vt.push(v);

    // Deflate: remove the rank-one term sigma * u * v^T.
    const uc = u;
    Mdefl = Mdefl.map((row, i) => {
      const newRow = new Float64Array(row);
      for (let j = 0; j < n; j++) {
        newRow[j] = (newRow[j] ?? 0) - sigma * (uc[i] ?? 0) * (v[j] ?? 0);
      }
      return newRow;
    });
  }

  return { U, S: new Float64Array(S), Vt };
}
new Float64Array(0)).length; + + this.xMean_ = colMeans(X); + this.yMean_ = colMeans(Y); + + let Xc = centerMatrix(X, this.xMean_); + let Yc = centerMatrix(Y, this.yMean_); + + // Compute std for scaling + if (this.scale) { + const xStd = new Float64Array(p); + const yStd = new Float64Array(q); + for (const xi of Xc) for (let j = 0; j < p; j++) xStd[j] = (xStd[j] ?? 0) + (xi[j] ?? 0) ** 2; + for (const yi of Yc) for (let j = 0; j < q; j++) yStd[j] = (yStd[j] ?? 0) + (yi[j] ?? 0) ** 2; + for (let j = 0; j < p; j++) xStd[j] = Math.sqrt((xStd[j] ?? 0) / n); + for (let j = 0; j < q; j++) yStd[j] = Math.sqrt((yStd[j] ?? 0) / n); + this.xStd_ = xStd; + this.yStd_ = yStd; + Xc = Xc.map((xi) => new Float64Array(xi.map((v, j) => v / Math.max(xStd[j] ?? 1, 1e-10)))); + Yc = Yc.map((yi) => new Float64Array(yi.map((v, j) => v / Math.max(yStd[j] ?? 1, 1e-10)))); + } + + // CCA via SVD of X^T Y + const Cxy = crossProd(Xc, Yc); + const k = Math.min(this.nComponents, p, q); + const { U, Vt } = powerSVD(Cxy, k, this.maxIter); + + this.xWeights_ = U; + this.yWeights_ = Vt; + + // Compute loadings + this.xLoadings_ = Array.from({ length: k }, (_, c) => { + const w = U[c] ?? new Float64Array(p); + const t = new Float64Array(n); + for (let i = 0; i < n; i++) { + for (let j = 0; j < p; j++) t[i] = (t[i] ?? 0) + ((Xc[i] ?? new Float64Array(p))[j] ?? 0) * (w[j] ?? 0); + } + const load = new Float64Array(p); + for (let j = 0; j < p; j++) { + let cov = 0; + for (let i = 0; i < n; i++) cov += ((Xc[i] ?? new Float64Array(p))[j] ?? 0) * (t[i] ?? 0); + let tNorm = 0; + for (let i = 0; i < n; i++) tNorm += (t[i] ?? 0) ** 2; + load[j] = tNorm > 0 ? cov / tNorm : 0; + } + return load; + }); + + this.yLoadings_ = Array.from({ length: k }, (_, c) => { + const w = Vt[c] ?? new Float64Array(q); + const u = new Float64Array(n); + for (let i = 0; i < n; i++) { + for (let j = 0; j < q; j++) u[i] = (u[i] ?? 0) + ((Yc[i] ?? new Float64Array(q))[j] ?? 0) * (w[j] ?? 
0); + } + const load = new Float64Array(q); + for (let j = 0; j < q; j++) { + let cov = 0; + for (let i = 0; i < n; i++) cov += ((Yc[i] ?? new Float64Array(q))[j] ?? 0) * (u[i] ?? 0); + let uNorm = 0; + for (let i = 0; i < n; i++) uNorm += (u[i] ?? 0) ** 2; + load[j] = uNorm > 0 ? cov / uNorm : 0; + } + return load; + }); + + return this; + } + + transform(X: Float64Array[], Y?: Float64Array[]): [Float64Array[], Float64Array[] | null] { + if (this.xWeights_ === null || this.xMean_ === null) throw new NotFittedError("CCA"); + const xMean = this.xMean_; + const xStd = this.xStd_; + const k = this.nComponents; + + let Xc = X.map((xi) => new Float64Array(xi.map((v, j) => v - (xMean[j] ?? 0)))); + if (xStd) Xc = Xc.map((xi) => new Float64Array(xi.map((v, j) => v / Math.max(xStd[j] ?? 1, 1e-10)))); + + const xScores = X.map((_, i) => { + const scores = new Float64Array(k); + for (let c = 0; c < k; c++) { + const w = this.xWeights_![c] ?? new Float64Array(0); + for (let j = 0; j < w.length; j++) scores[c] = (scores[c] ?? 0) + ((Xc[i] ?? new Float64Array(0))[j] ?? 0) * (w[j] ?? 0); + } + return scores; + }); + + if (Y === undefined) return [xScores, null]; + + const yMean = this.yMean_!; + const yStd = this.yStd_; + let Yc = Y.map((yi) => new Float64Array(yi.map((v, j) => v - (yMean[j] ?? 0)))); + if (yStd) Yc = Yc.map((yi) => new Float64Array(yi.map((v, j) => v / Math.max(yStd[j] ?? 1, 1e-10)))); + + const yScores = Y.map((_, i) => { + const scores = new Float64Array(k); + for (let c = 0; c < k; c++) { + const w = this.yWeights_![c] ?? new Float64Array(0); + for (let j = 0; j < w.length; j++) scores[c] = (scores[c] ?? 0) + ((Yc[i] ?? new Float64Array(0))[j] ?? 0) * (w[j] ?? 
/**
 * Deflation step for PLS: subtract the outer product of a score vector and a
 * loading vector from X, i.e. `out[i][j] = X[i][j] - xScores[i] * xLoadings[j]`.
 *
 * Missing scores or loadings (index past the end of the vector) count as 0,
 * leaving the corresponding entries of X untouched.
 *
 * @param X Data matrix as an array of rows; not modified.
 * @param xScores One score per row of X.
 * @param xLoadings One loading per column of X.
 * @returns A new matrix with the same shape as X.
 */
export function deflate(
  X: Float64Array[],
  xScores: Float64Array,
  xLoadings: Float64Array,
): Float64Array[] {
  return X.map((xi, i) => {
    const t = xScores[i] ?? 0;
    // Float64Array#map allocates a fresh typed array, so X itself is preserved.
    return xi.map((v, j) => v - t * (xLoadings[j] ?? 0));
  });
}
0) / xwNorm; + + // t = X w + const t = new Float64Array(n).map((_, i) => { + let sum = 0; + for (let j = 0; j < p; j++) sum += (X[i]?.[j] ?? 0) * (xw[j] ?? 0); + return sum; + }); + + // q = Y^T t / ||Y^T t|| + const yq = new Float64Array(q); + for (let j = 0; j < q; j++) { + let sum = 0; + for (let i = 0; i < n; i++) sum += (Y[i]?.[j] ?? 0) * (t[i] ?? 0); + yq[j] = sum; + } + const yqNorm = Math.sqrt(yq.reduce((s, v) => s + v * v, 0)) + 1e-10; + for (let j = 0; j < q; j++) yq[j] = (yq[j] ?? 0) / yqNorm; + + // u_new = Y q + const uNew = new Float64Array(n).map((_, i) => { + let sum = 0; + for (let j = 0; j < q; j++) sum += (Y[i]?.[j] ?? 0) * (yq[j] ?? 0); + return sum; + }); + + const diff = Math.sqrt(uNew.reduce((s, v, i) => s + (v - (u[i] ?? 0)) ** 2, 0)); + u = uNew; + xWeights = xw; + yWeights = yq; + if (diff < tol) break; + } + + const xScores = new Float64Array(n).map((_, i) => { + let sum = 0; + for (let j = 0; j < p; j++) sum += (X[i]?.[j] ?? 0) * (xWeights[j] ?? 0); + return sum; + }); + const yScores = u; + + // Loadings: X^T t / ||t||^2 + const tNorm2 = xScores.reduce((s, v) => s + v * v, 0) + 1e-10; + const xLoadings = new Float64Array(p).map((_, j) => { + let sum = 0; + for (let i = 0; i < n; i++) sum += (X[i]?.[j] ?? 0) * (xScores[i] ?? 0); + return sum / tNorm2; + }); + const uNorm2 = yScores.reduce((s, v) => s + v * v, 0) + 1e-10; + const yLoadings = new Float64Array(q).map((_, j) => { + let sum = 0; + for (let i = 0; i < n; i++) sum += (Y[i]?.[j] ?? 0) * (yScores[i] ?? 0); + return sum / uNorm2; + }); + + return { xWeights, yWeights, xScores, yScores, xLoadings, yLoadings }; +} + +/** Canonical Correlation Analysis helpers. */ +export interface CCAResult { + xWeights: Float64Array[]; + yWeights: Float64Array[]; + xScores: Float64Array[]; + yScores: Float64Array[]; + correlations: Float64Array; +} + +/** Compute canonical correlations between X and Y (simplified). 
/**
 * Compute canonical correlations between X and Y (simplified).
 *
 * Repeats `nipalsStep` on the current residuals, records the weights and
 * scores of each latent pair together with the Pearson correlation of its
 * two score vectors, then deflates both residuals before the next pair.
 *
 * @param X Left data matrix as an array of rows.
 * @param Y Right data matrix as an array of rows.
 * @param nComponents Requested number of pairs; capped by the column counts
 *   of X and Y.
 * @returns Per-component weights, scores and correlations.
 */
export function canonicalCorrelations(
  X: Float64Array[],
  Y: Float64Array[],
  nComponents = 1,
): CCAResult {
  const count = Math.min(nComponents, X[0]?.length ?? 1, Y[0]?.length ?? 1);
  const out: CCAResult = {
    xWeights: [],
    yWeights: [],
    xScores: [],
    yScores: [],
    correlations: new Float64Array(count),
  };

  let xResidual = X;
  let yResidual = Y;
  let c = 0;
  while (c < count) {
    const step = nipalsStep(xResidual, yResidual);
    const t = step.xScores;
    const u = step.yScores;
    out.xWeights.push(step.xWeights);
    out.yWeights.push(step.yWeights);
    out.xScores.push(t);
    out.yScores.push(u);

    // Pearson correlation of the two score vectors.
    let tSum = 0;
    for (const value of t) tSum += value;
    const tMean = tSum / t.length;
    let uSum = 0;
    for (const value of u) uSum += value;
    const uMean = uSum / u.length;

    let cross = 0;
    let tVar = 0;
    let uVar = 0;
    for (let i = 0; i < t.length; i++) {
      const dt = (t[i] ?? 0) - tMean;
      const du = (u[i] ?? 0) - uMean;
      cross += dt * du;
      tVar += dt ** 2;
      uVar += du ** 2;
    }
    // The small constant keeps the ratio finite for constant score vectors.
    out.correlations[c] = cross / (Math.sqrt(tVar * uVar) + 1e-10);

    // Remove the extracted pair before looking for the next one.
    xResidual = deflate(xResidual, t, step.xLoadings);
    yResidual = deflate(yResidual, u, step.yLoadings);
    c += 1;
  }

  return out;
}
/**
 * PLS Canonical (PLSC): symmetric variant of PLS.
 *
 * For each component the leading singular pair (u, v) of the cross-product
 * matrix X^T Y of the current residuals is found by power iteration; u and v
 * are stored as the x / y rotations and their directions are then projected
 * out of the X and Y residuals.
 */
export class PLSCanonical {
  private xRotations_: Float64Array[] | null = null;
  private yRotations_: Float64Array[] | null = null;
  private xMean_: Float64Array | null = null;
  private yMean_: Float64Array | null = null;
  readonly nComponents: number;
  readonly maxIter: number;
  readonly tol: number;

  /**
   * @param options.nComponents Number of components to extract. Default 2.
   * @param options.maxIter Maximum power iterations per component. Default 500.
   * @param options.tol Threshold on the squared change of u. Default 1e-6.
   */
  constructor(
    options: {
      nComponents?: number;
      maxIter?: number;
      tol?: number;
    } = {},
  ) {
    this.nComponents = options.nComponents ?? 2;
    this.maxIter = options.maxIter ?? 500;
    this.tol = options.tol ?? 1e-6;
  }

  /**
   * Leading singular pair of Cxy (p x q) by power iteration.
   *
   * The iteration is seeded with the largest-norm column of Cxy. The original
   * seeded it with v = 0, which made u = Cxy v vanish on the first step and
   * left both vectors identically zero. An all-zero Cxy yields zero vectors.
   */
  private static leadingSingularPair(
    Cxy: Float64Array[],
    p: number,
    q: number,
    maxIter: number,
    tol: number,
  ): { u: Float64Array; v: Float64Array } {
    /** Scale x to unit length in place; a zero vector is left untouched. */
    const unit = (x: Float64Array): Float64Array => {
      let norm = 0;
      for (let i = 0; i < x.length; i++) norm += (x[i] ?? 0) ** 2;
      norm = Math.sqrt(norm) || 1;
      for (let i = 0; i < x.length; i++) x[i] = (x[i] ?? 0) / norm;
      return x;
    };
    /** Cxy * v (length p). */
    const times = (v: Float64Array): Float64Array => {
      const out = new Float64Array(p);
      for (let a = 0; a < p; a++) {
        let s = 0;
        for (let b = 0; b < q; b++) s += (Cxy[a]?.[b] ?? 0) * (v[b] ?? 0);
        out[a] = s;
      }
      return out;
    };
    /** Cxy^T * u (length q). */
    const transposeTimes = (u: Float64Array): Float64Array => {
      const out = new Float64Array(q);
      for (let b = 0; b < q; b++) {
        let s = 0;
        for (let a = 0; a < p; a++) s += (Cxy[a]?.[b] ?? 0) * (u[a] ?? 0);
        out[b] = s;
      }
      return out;
    };

    // Pick the column with the largest norm as the starting direction for u.
    let best = -1;
    let bestNorm2 = 0;
    for (let b = 0; b < q; b++) {
      let s = 0;
      for (let a = 0; a < p; a++) s += (Cxy[a]?.[b] ?? 0) ** 2;
      if (s > bestNorm2) {
        bestNorm2 = s;
        best = b;
      }
    }

    let u: Float64Array = new Float64Array(p);
    let v: Float64Array = new Float64Array(q);
    if (best < 0) return { u, v }; // no cross-covariance left

    const scale = Math.sqrt(bestNorm2);
    for (let a = 0; a < p; a++) u[a] = (Cxy[a]?.[best] ?? 0) / scale;
    v = unit(transposeTimes(u));

    for (let iter = 0; iter < maxIter; iter++) {
      const newU = unit(times(v));
      const newV = unit(transposeTimes(newU));
      let diff = 0;
      for (let a = 0; a < p; a++) diff += ((newU[a] ?? 0) - (u[a] ?? 0)) ** 2;
      u = newU;
      v = newV;
      if (diff < tol) break;
    }
    return { u, v };
  }

  /** Subtract from every row its projection onto `direction` (in place). */
  private static removeDirection(rows: Float64Array[], direction: Float64Array): void {
    for (const row of rows) {
      let score = 0;
      for (let a = 0; a < direction.length; a++) score += (row[a] ?? 0) * (direction[a] ?? 0);
      for (let a = 0; a < direction.length; a++) row[a] = (row[a] ?? 0) - score * (direction[a] ?? 0);
    }
  }

  /**
   * Fit the model: centre X and Y by their column means and extract
   * min(nComponents, p, q) rotation pairs.
   *
   * @param X Rows of the left block.
   * @param Y Rows of the right block (read for the same row indices as X).
   * @returns This estimator, for chaining.
   */
  fit(X: Float64Array[], Y: Float64Array[]): this {
    const n = X.length;
    const p = X[0]?.length ?? 0;
    const q = Y[0]?.length ?? 0;
    const k = Math.min(this.nComponents, Math.min(p, q));

    const xMean = new Float64Array(p);
    const yMean = new Float64Array(q);
    for (let i = 0; i < n; i++) {
      for (let j = 0; j < p; j++) xMean[j] = (xMean[j] ?? 0) + (X[i]?.[j] ?? 0);
      for (let j = 0; j < q; j++) yMean[j] = (yMean[j] ?? 0) + (Y[i]?.[j] ?? 0);
    }
    for (let j = 0; j < p; j++) xMean[j] = (xMean[j] ?? 0) / n;
    for (let j = 0; j < q; j++) yMean[j] = (yMean[j] ?? 0) / n;
    this.xMean_ = xMean;
    this.yMean_ = yMean;

    // Centred working copies; these are deflated in place below.
    const XResid = X.map((row) => new Float64Array(p).map((_, j) => (row[j] ?? 0) - (xMean[j] ?? 0)));
    const YResid = Y.map((row) => new Float64Array(q).map((_, j) => (row[j] ?? 0) - (yMean[j] ?? 0)));

    const xRotations: Float64Array[] = [];
    const yRotations: Float64Array[] = [];

    for (let comp = 0; comp < k; comp++) {
      // Cross-product matrix X^T Y of the current residuals (p x q).
      const Cxy = Array.from({ length: p }, (_, a) =>
        new Float64Array(q).map((_, b) => {
          let s = 0;
          for (let i = 0; i < n; i++) s += (XResid[i]?.[a] ?? 0) * (YResid[i]?.[b] ?? 0);
          return s;
        }),
      );

      const { u, v } = PLSCanonical.leadingSingularPair(Cxy, p, q, this.maxIter, this.tol);
      xRotations.push(u);
      yRotations.push(v);

      // Deflate both blocks along the directions just extracted.
      PLSCanonical.removeDirection(XResid, u);
      PLSCanonical.removeDirection(YResid, v);
    }

    this.xRotations_ = xRotations;
    this.yRotations_ = yRotations;
    return this;
  }

  /**
   * Project centred data onto the fitted rotations.
   *
   * @param X Rows to project with the x rotations.
   * @param Y Optional rows to project with the y rotations.
   * @returns `xScores`, plus `yScores` when Y is given.
   * @throws NotFittedError when called before `fit`.
   */
  transform(X: Float64Array[], Y?: Float64Array[]): { xScores: Float64Array[]; yScores?: Float64Array[] } {
    if (this.xRotations_ === null || this.xMean_ === null) throw new NotFittedError("PLSCanonical is not fitted.");
    const xRot = this.xRotations_;
    const xMean = this.xMean_;
    const k = xRot.length;

    const project = (rows: Float64Array[], mean: Float64Array, rot: Float64Array[]): Float64Array[] =>
      rows.map((row) => {
        const scores = new Float64Array(k);
        for (let c = 0; c < k; c++) {
          let s = 0;
          for (let j = 0; j < row.length; j++) {
            s += ((row[j] ?? 0) - (mean[j] ?? 0)) * (rot[c]?.[j] ?? 0);
          }
          scores[c] = s;
        }
        return scores;
      });

    const xScores = project(X, xMean, xRot);
    if (!Y || !this.yRotations_ || !this.yMean_) return { xScores };
    const yScores = project(Y, this.yMean_, this.yRotations_);
    return { xScores, yScores };
  }

  /** Fit on (X, Y) and return the scores of both blocks. */
  fitTransform(X: Float64Array[], Y: Float64Array[]): { xScores: Float64Array[]; yScores: Float64Array[] } {
    this.fit(X, Y);
    const result = this.transform(X, Y);
    return { xScores: result.xScores, yScores: result.yScores! };
  }
}
/**
 * NIPALS: Nonlinear Iterative Partial Least Squares algorithm.
 *
 * `fit` centres X and Y by their column means and extracts `n_components`
 * latent pairs. Each pair is found by alternating
 *   w = X'u / ||X'u||,  t = Xw,  q = Y't / ||Y't||,  u = Yq
 * until u stops changing, after which X is deflated by t * p' (p = X't / t't)
 * and Y by u * q'.
 */
export class NIPALS extends BaseEstimator {
  n_components: number;
  max_iter: number;
  tol: number;
  /** One weight vector per component (length = number of X columns). */
  x_weights_: Float64Array[] = [];
  /** One weight vector per component (length = number of Y columns). */
  y_weights_: Float64Array[] = [];
  x_loadings_: Float64Array[] = [];
  /** Holds the same values as `y_weights_` (stored as separate copies). */
  y_loadings_: Float64Array[] = [];
  /** One score vector per component (length = number of samples). */
  x_scores_: Float64Array[] = [];
  y_scores_: Float64Array[] = [];
  x_mean_: Float64Array = new Float64Array(0);
  y_mean_: Float64Array = new Float64Array(0);
  n_features_in_ = 0;

  /**
   * @param params.n_components Number of latent pairs. Default 2.
   * @param params.max_iter Iteration cap per component. Default 500.
   * @param params.tol Threshold on the Euclidean change of u. Default 1e-6.
   */
  constructor(params: NIPALSParams = {}) {
    super();
    this.n_components = params.n_components ?? 2;
    this.max_iter = params.max_iter ?? 500;
    this.tol = params.tol ?? 1e-6;
  }

  /**
   * Fit the model on paired rows of X and Y.
   *
   * @returns This estimator, for chaining.
   */
  fit(X: Float64Array[], Y: Float64Array[]): this {
    const n = X.length;
    const px = X[0]?.length ?? 0;
    const py = Y[0]?.length ?? 0;
    this.n_features_in_ = px;

    // Reset so that calling fit twice does not append to the previous fit.
    this.x_weights_ = [];
    this.y_weights_ = [];
    this.x_loadings_ = [];
    this.y_loadings_ = [];
    this.x_scores_ = [];
    this.y_scores_ = [];

    // Column means.
    const xMean = new Float64Array(px);
    const yMean = new Float64Array(py);
    for (let k = 0; k < px; k++) {
      let s = 0;
      for (const xi of X) s += xi[k] ?? 0;
      xMean[k] = s / n;
    }
    for (let k = 0; k < py; k++) {
      let s = 0;
      for (const yi of Y) s += yi[k] ?? 0;
      yMean[k] = s / n;
    }
    this.x_mean_ = xMean;
    this.y_mean_ = yMean;

    // Centred residuals, deflated after every component.
    let Xr = X.map((xi) => new Float64Array(px).map((_, k) => (xi[k] ?? 0) - (xMean[k] ?? 0)));
    let Yr = Y.map((yi) => new Float64Array(py).map((_, k) => (yi[k] ?? 0) - (yMean[k] ?? 0)));

    for (let c = 0; c < this.n_components; c++) {
      // Start u from the first column of the Y residual.
      let u: Float64Array = Float64Array.from(Yr, (yi) => yi[0] ?? 0);
      const w = new Float64Array(px);
      const q = new Float64Array(py);
      const t = new Float64Array(n);

      for (let iter = 0; iter < this.max_iter; iter++) {
        // w = X'u / ||X'u||
        for (let j = 0; j < px; j++) {
          let s = 0;
          for (let i = 0; i < n; i++) s += (Xr[i]?.[j] ?? 0) * (u[i] ?? 0);
          w[j] = s;
        }
        let wn = 0;
        for (const v of w) wn += v * v;
        wn = Math.sqrt(wn);
        if (wn > 1e-10) for (let j = 0; j < px; j++) w[j] = (w[j] ?? 0) / wn;

        // t = Xw
        for (let i = 0; i < n; i++) {
          let s = 0;
          for (let j = 0; j < px; j++) s += (Xr[i]?.[j] ?? 0) * (w[j] ?? 0);
          t[i] = s;
        }

        // q = Y't / ||Y't||
        for (let j = 0; j < py; j++) {
          let s = 0;
          for (let i = 0; i < n; i++) s += (Yr[i]?.[j] ?? 0) * (t[i] ?? 0);
          q[j] = s;
        }
        let qn = 0;
        for (const v of q) qn += v * v;
        qn = Math.sqrt(qn);
        if (qn > 1e-10) for (let j = 0; j < py; j++) q[j] = (q[j] ?? 0) / qn;

        // u = Yq
        const uNew = new Float64Array(n);
        for (let i = 0; i < n; i++) {
          let s = 0;
          for (let j = 0; j < py; j++) s += (Yr[i]?.[j] ?? 0) * (q[j] ?? 0);
          uNew[i] = s;
        }

        // Each operand needs its own parentheses: `a ?? 0 - b` parses as
        // `a ?? (0 - b)`, which made the original compare ||uNew|| against tol.
        let diff = 0;
        for (let i = 0; i < n; i++) diff += ((uNew[i] ?? 0) - (u[i] ?? 0)) ** 2;
        u = uNew;
        if (Math.sqrt(diff) < this.tol) break;
      }

      // X loading p = X't / t't (left at zero when t is numerically zero).
      const pLoading = new Float64Array(px);
      const tn2 = t.reduce((s, v) => s + v * v, 0);
      if (tn2 > 1e-10) {
        for (let j = 0; j < px; j++) {
          let s = 0;
          for (let i = 0; i < n; i++) s += (Xr[i]?.[j] ?? 0) * (t[i] ?? 0);
          pLoading[j] = s / tn2;
        }
      }

      // Deflate: X -= t p', Y -= u q'.
      const uc = u;
      Xr = Xr.map((xi, i) => new Float64Array(px).map((_, j) => (xi[j] ?? 0) - (t[i] ?? 0) * (pLoading[j] ?? 0)));
      Yr = Yr.map((yi, i) => new Float64Array(py).map((_, j) => (yi[j] ?? 0) - (q[j] ?? 0) * (uc[i] ?? 0)));

      this.x_weights_.push(w);
      this.y_weights_.push(q);
      this.x_loadings_.push(pLoading);
      // Copy so that y_loadings_ and y_weights_ do not share one buffer.
      this.y_loadings_.push(new Float64Array(q));
      this.x_scores_.push(t);
      this.y_scores_.push(new Float64Array(u));
    }
    return this;
  }

  /**
   * Centre X with the fitted means and project it onto the x weights.
   *
   * @returns One row of `n_components` scores per input row; a component with
   *   no stored weight vector contributes 0.
   */
  transform(X: Float64Array[]): Float64Array[] {
    const nc = this.n_components;
    const px = this.n_features_in_;
    const Xc = X.map((xi) => new Float64Array(px).map((_, k) => (xi[k] ?? 0) - (this.x_mean_[k] ?? 0)));
    return Xc.map((xi) =>
      new Float64Array(nc).map((_, c) => {
        let s = 0;
        const w = this.x_weights_[c];
        if (w) for (let k = 0; k < px; k++) s += (w[k] ?? 0) * (xi[k] ?? 0);
        return s;
      }),
    );
  }

  /** Fit on (X, Y) and return the X scores. */
  fit_transform(X: Float64Array[], Y: Float64Array[]): Float64Array[] {
    return this.fit(X, Y).transform(X);
  }
}
0); + for (let a = 0; a < pa; a++) for (let b = 0; b < pb; b++) C[a]![b] = (C[a]![b] ?? 0) / n; + return C; + }; + const Sxx = cov(Xc, Xc, px, px); + const Syy = cov(Yc, Yc, py, py); + const Sxy = cov(Xc, Yc, px, py); + // Regularize diagonals + for (let i = 0; i < px; i++) Sxx[i]![i] = (Sxx[i]![i] ?? 0) + this.regularization; + for (let i = 0; i < py; i++) Syy[i]![i] = (Syy[i]![i] ?? 0) + this.regularization; + // Power iteration for canonical directions + const nc = Math.min(this.n_components, px, py); + for (let c = 0; c < nc; c++) { + let wx = new Float64Array(px).map((_, i) => i === c ? 1 : 0.01); + for (let iter = 0; iter < 50; iter++) { + // wx = Sxx^-1 * Sxy * Syy^-1 * Sxy' * wx (power iteration approximation) + const Sxy_wx = new Float64Array(py).map((_, j) => { let s = 0; for (let k = 0; k < px; k++) s += (Sxy[k]?.[j] ?? 0) * (wx[k] ?? 0); return s; }); + const Syy_inv_v = new Float64Array(py).map((_, j) => (Sxy_wx[j] ?? 0) / (Syy[j]?.[j] ?? 1)); + const Sxyt_v = new Float64Array(px).map((_, i) => { let s = 0; for (let j = 0; j < py; j++) s += (Sxy[i]?.[j] ?? 0) * (Syy_inv_v[j] ?? 0); return s; }); + const newWx = new Float64Array(px).map((_, i) => (Sxyt_v[i] ?? 0) / (Sxx[i]?.[i] ?? 1)); + let norm = 0; for (const v of newWx) norm += v * v; norm = Math.sqrt(norm); + if (norm > 1e-10) for (let i = 0; i < px; i++) newWx[i] = (newWx[i] ?? 0) / norm; + let diff = 0; for (let i = 0; i < px; i++) diff += ((newWx[i] ?? 0) - (wx[i] ?? 0)) ** 2; + wx = newWx; + if (Math.sqrt(diff) < 1e-8) break; + } + const wy = new Float64Array(py).map((_, j) => { let s = 0; for (let i = 0; i < px; i++) s += (Sxy[i]?.[j] ?? 0) * (wx[i] ?? 0); return s; }); + let wyn = 0; for (const v of wy) wyn += v * v; wyn = Math.sqrt(wyn); + if (wyn > 1e-10) for (let j = 0; j < py; j++) wy[j] = (wy[j] ?? 
/**
 * Column means of a row-array matrix.
 *
 * The number of columns is taken from the first row; entries missing from
 * shorter rows count as 0. An empty matrix yields an empty result.
 */
function colMeans(X: Float64Array[]): Float64Array {
  const width = (X[0] ?? new Float64Array(0)).length;
  const means = new Float64Array(width);
  for (let col = 0; col < width; col++) {
    let total = 0;
    for (const row of X) total += row[col] ?? 0;
    means[col] = total / X.length;
  }
  return means;
}
/**
 * Matrix-vector product M * v for a matrix stored as an array of rows.
 *
 * Each output entry is the dot product of one row with `v`, taken over the
 * length of `v`; entries missing from a shorter row count as 0.
 */
function matVec(M: Float64Array[], v: Float64Array): Float64Array {
  const rowCount = M.length;
  const width = v.length;
  const product = new Float64Array(rowCount);
  let r = 0;
  while (r < rowCount) {
    const row = M[r] ?? new Float64Array(0);
    let acc = 0;
    for (let c = 0; c < width; c++) acc += (row[c] ?? 0) * (v[c] ?? 0);
    product[r] = acc;
    r += 1;
  }
  return product;
}
new Float64Array(0)).length; + let v = new Float64Array(q); + v[0] = 1; + let u = new Float64Array(p); + for (let iter = 0; iter < maxIter; iter++) { + // u = XtY v / ||XtY v|| + const uNew = matVec(XtY, v); + normalize(uNew); + // v = XtY^T u / ||XtY^T u|| + const vNew = new Float64Array(q); + for (let k = 0; k < q; k++) { + for (let j = 0; j < p; j++) { + vNew[k] = (vNew[k] ?? 0) + (XtY[j]![k] ?? 0) * (uNew[j] ?? 0); + } + } + normalize(vNew); + const diff = + norm( + Float64Array.from({ length: p }, (_, i) => (uNew[i] ?? 0) - (u[i] ?? 0)), + ) + + norm( + Float64Array.from({ length: q }, (_, i) => (vNew[i] ?? 0) - (v[i] ?? 0)), + ); + u = uNew as Float64Array; + v = vNew; + if (diff < tol) break; + } + return { u, v }; +} + +/** + * PLS regression via NIPALS algorithm. + * Mirrors sklearn.cross_decomposition.PLSRegression. + */ +export class PLSRegression { + nComponents: number; + maxIter: number; + tol: number; + scale: boolean; + + xWeights_: Float64Array[] | null = null; + yWeights_: Float64Array[] | null = null; + xLoadings_: Float64Array[] | null = null; + yLoadings_: Float64Array[] | null = null; + xScores_: Float64Array[] | null = null; + yScores_: Float64Array[] | null = null; + coef_: Float64Array[] | null = null; + + xMean_: Float64Array | null = null; + yMean_: Float64Array | null = null; + + constructor( + options: { + nComponents?: number; + maxIter?: number; + tol?: number; + scale?: boolean; + } = {}, + ) { + this.nComponents = options.nComponents ?? 2; + this.maxIter = options.maxIter ?? 500; + this.tol = options.tol ?? 1e-06; + this.scale = options.scale ?? true; + } + + fit(X: Float64Array[], Y: Float64Array[]): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const q = (Y[0] ?? 
new Float64Array(0)).length; + const k = Math.min(this.nComponents, p, q); + + this.xMean_ = colMeans(X); + this.yMean_ = colMeans(Y); + let Xc = center(X, this.xMean_); + let Yc = center(Y, this.yMean_); + + this.xWeights_ = []; + this.yWeights_ = []; + this.xLoadings_ = []; + this.yLoadings_ = []; + this.xScores_ = Array.from({ length: n }, () => new Float64Array(k)); + this.yScores_ = Array.from({ length: n }, () => new Float64Array(k)); + + for (let comp = 0; comp < k; comp++) { + const XtY = Xtranspose_Y(Xc, Yc); + const { u, v } = nipals(XtY, this.tol, this.maxIter); + + // Scores: t = Xc u, s = Yc v + const t = new Float64Array(n); + const s = new Float64Array(n); + for (let i = 0; i < n; i++) { + const xi = Xc[i] ?? new Float64Array(p); + const yi = Yc[i] ?? new Float64Array(q); + t[i] = dot(xi, u); + s[i] = dot(yi, v); + } + + // Normalize t + const tNorm = norm(t); + if (tNorm > 1e-15) for (let i = 0; i < n; i++) t[i] = (t[i] ?? 0) / tNorm; + + // X loadings: p_h = Xc^T t + const px = new Float64Array(p); + for (let i = 0; i < n; i++) { + const xi = Xc[i] ?? new Float64Array(p); + for (let j = 0; j < p; j++) px[j] = (px[j] ?? 0) + (xi[j] ?? 0) * (t[i] ?? 0); + } + + // Y loadings: q_h = Yc^T s / ||s||^2 + const sNorm2 = dot(s, s); + const qy = new Float64Array(q); + for (let i = 0; i < n; i++) { + const yi = Yc[i] ?? new Float64Array(q); + for (let j = 0; j < q; j++) { + qy[j] = (qy[j] ?? 0) + (yi[j] ?? 0) * (s[i] ?? 0); + } + } + if (sNorm2 > 1e-15) for (let j = 0; j < q; j++) qy[j] = (qy[j] ?? 0) / sNorm2; + + this.xWeights_[comp] = u; + this.yWeights_[comp] = v; + this.xLoadings_[comp] = px; + this.yLoadings_[comp] = qy; + for (let i = 0; i < n; i++) { + this.xScores_![i]![comp] = t[i] ?? 0; + this.yScores_![i]![comp] = s[i] ?? 0; + } + + // Deflate + const tFull = new Float64Array(n); + for (let i = 0; i < n; i++) { + const xi = Xc[i] ?? 
new Float64Array(p); + tFull[i] = dot(xi, u); + } + Xc = Xc.map((xi, i) => { + const out = new Float64Array(p); + for (let j = 0; j < p; j++) out[j] = (xi[j] ?? 0) - (tFull[i] ?? 0) * (px[j] ?? 0); + return out; + }); + Yc = Yc.map((yi, i) => { + const out = new Float64Array(q); + for (let j = 0; j < q; j++) out[j] = (yi[j] ?? 0) - (tFull[i] ?? 0) * (qy[j] ?? 0); + return out; + }); + } + + // Compute regression coefficients: coef_ = W (P^T W)^{-1} Q^T + // Simplified: use pseudo-inverse via stored weights and loadings + this._computeCoef(p, q, k); + return this; + } + + private _computeCoef(p: number, q: number, k: number): void { + // coef_ = xWeights_ @ inv(xLoadings_^T @ xWeights_) @ yLoadings_^T + // For simplicity, use a direct approach: coef = W (P^T W)^-1 Q^T + const W = this.xWeights_!; + const P = this.xLoadings_!; + const Q = this.yLoadings_!; + + // PtW = P^T W (k x k) + const PtW = Array.from({ length: k }, () => new Float64Array(k)); + for (let i = 0; i < k; i++) { + for (let j = 0; j < k; j++) { + PtW[i]![j] = dot(P[i] ?? new Float64Array(0), W[j] ?? new Float64Array(0)); + } + } + + // Invert PtW (simple LU for small k) + const inv = this._invertSmall(PtW, k); + + // coef_ (p x q) = W @ inv @ Q^T + this.coef_ = Array.from({ length: p }, () => new Float64Array(q)); + for (let i = 0; i < p; i++) { + for (let j = 0; j < q; j++) { + let s = 0; + for (let a = 0; a < k; a++) { + let s2 = 0; + for (let b = 0; b < k; b++) { + s2 += (inv[a]![b] ?? 0) * (Q[b]![j] ?? 0); + } + s += (W[a]![i] ?? 0) * s2; + } + this.coef_![i]![j] = s; + } + } + } + + private _invertSmall(M: Float64Array[], k: number): Float64Array[] { + // Augmented matrix [M | I] + const aug = Array.from({ length: k }, (_, i) => { + const row = new Float64Array(2 * k); + for (let j = 0; j < k; j++) row[j] = M[i]![j] ?? 
0; + row[k + i] = 1; + return row; + }); + for (let col = 0; col < k; col++) { + // Find pivot + let maxRow = col; + for (let row = col + 1; row < k; row++) { + if (Math.abs(aug[row]![col] ?? 0) > Math.abs(aug[maxRow]![col] ?? 0)) maxRow = row; + } + const tmpPls = aug[col]!; aug[col] = aug[maxRow]!; aug[maxRow] = tmpPls; + const pivot = aug[col]![col] ?? 1e-12; + if (Math.abs(pivot) < 1e-15) continue; + for (let j = 0; j < 2 * k; j++) aug[col]![j] = (aug[col]![j] ?? 0) / pivot; + for (let row = 0; row < k; row++) { + if (row === col) continue; + const factor = aug[row]![col] ?? 0; + for (let j = 0; j < 2 * k; j++) { + aug[row]![j] = (aug[row]![j] ?? 0) - factor * (aug[col]![j] ?? 0); + } + } + } + return aug.map((row) => Float64Array.from({ length: k }, (_, j) => row[k + j] ?? 0)); + } + + predict(X: Float64Array[]): Float64Array[] { + if (this.coef_ === null || this.xMean_ === null || this.yMean_ === null) { + throw new NotFittedError(); + } + const p = this.xMean_.length; + const q = this.yMean_.length; + return X.map((xi) => { + const xc = new Float64Array(p); + for (let j = 0; j < p; j++) xc[j] = (xi[j] ?? 0) - (this.xMean_![j] ?? 0); + const out = new Float64Array(q); + for (let j = 0; j < q; j++) { + let s = 0; + for (let k = 0; k < p; k++) s += (xc[k] ?? 0) * (this.coef_![k]![j] ?? 0); + out[j] = s + (this.yMean_![j] ?? 0); + } + return out; + }); + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.xWeights_ === null || this.xMean_ === null) throw new NotFittedError(); + const k = this.xWeights_.length; + const p = this.xMean_.length; + return X.map((xi) => { + const xc = new Float64Array(p); + for (let j = 0; j < p; j++) xc[j] = (xi[j] ?? 0) - (this.xMean_![j] ?? 0); + const out = new Float64Array(k); + for (let i = 0; i < k; i++) { + out[i] = dot(xc, this.xWeights_![i] ?? 
new Float64Array(0)); + } + return out; + }); + } + + fitTransform(X: Float64Array[], Y: Float64Array[]): [Float64Array[], Float64Array[]] { + this.fit(X, Y); + return [this.xScores_!, this.yScores_!]; + } +} + +/** + * Partial Least Squares SVD. + * Mirrors sklearn.cross_decomposition.PLSSVD. + */ +export class PLSSVD { + nComponents: number; + + xWeights_: Float64Array[] | null = null; + yWeights_: Float64Array[] | null = null; + xScores_: Float64Array[] | null = null; + yScores_: Float64Array[] | null = null; + xMean_: Float64Array | null = null; + yMean_: Float64Array | null = null; + + constructor(options: { nComponents?: number } = {}) { + this.nComponents = options.nComponents ?? 2; + } + + fit(X: Float64Array[], Y: Float64Array[]): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const q = (Y[0] ?? new Float64Array(0)).length; + const k = Math.min(this.nComponents, p, q); + + this.xMean_ = colMeans(X); + this.yMean_ = colMeans(Y); + const Xc = center(X, this.xMean_); + const Yc = center(Y, this.yMean_); + + this.xWeights_ = []; + this.yWeights_ = []; + this.xScores_ = Array.from({ length: n }, () => new Float64Array(k)); + this.yScores_ = Array.from({ length: n }, () => new Float64Array(k)); + + const curXtY = Xtranspose_Y(Xc, Yc); + for (let comp = 0; comp < k; comp++) { + const { u, v } = nipals(curXtY); + this.xWeights_[comp] = u; + this.yWeights_[comp] = v; + for (let i = 0; i < n; i++) { + const xi = Xc[i] ?? new Float64Array(p); + const yi = Yc[i] ?? new Float64Array(q); + this.xScores_![i]![comp] = dot(xi, u); + this.yScores_![i]![comp] = dot(yi, v); + } + } + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.xWeights_ === null || this.xMean_ === null) throw new NotFittedError(); + const k = this.xWeights_.length; + const p = this.xMean_.length; + return X.map((xi) => { + const xc = new Float64Array(p); + for (let j = 0; j < p; j++) xc[j] = (xi[j] ?? 0) - (this.xMean_![j] ?? 
export interface PLSSVDOptions {
  nComponents?: number;
  scale?: boolean;
  copyData?: boolean;
  /** Maximum number of power-iteration sweeps per component (default 500). */
  maxIter?: number;
  /** Stop a component once its X weight vector moves less than this (default 1e-10). */
  tol?: number;
}

/**
 * Partial Least Squares SVD.
 * Finds the directions of maximum covariance between X and Y by extracting
 * singular vectors of the cross-covariance matrix `C = Xc^T Yc` with a
 * deflated power iteration.
 */
export class PLSSVDExt {
  nComponents: number;
  scale: boolean;
  maxIter: number;
  tol: number;

  xWeights_: Float64Array[] | null = null;
  yWeights_: Float64Array[] | null = null;
  xScores_: Float64Array[] | null = null;
  yScores_: Float64Array[] | null = null;
  xMean_: Float64Array | null = null;
  yMean_: Float64Array | null = null;
  xStd_: Float64Array | null = null;
  yStd_: Float64Array | null = null;
  nFeaturesFit_: number = 0;
  nTargetsFit_: number = 0;

  constructor(options: PLSSVDOptions = {}) {
    this.nComponents = options.nComponents ?? 2;
    this.scale = options.scale ?? true;
    this.maxIter = options.maxIter ?? 500;
    this.tol = options.tol ?? 1e-10;
  }

  /** Per-column arithmetic mean over all rows. */
  private static columnMeans(rows: Float64Array[], width: number): Float64Array {
    const means = new Float64Array(width);
    for (const row of rows) {
      for (let j = 0; j < width; j++) means[j] = (means[j] ?? 0) + (row[j] ?? 0);
    }
    for (let j = 0; j < width; j++) means[j] = (means[j] ?? 0) / rows.length;
    return means;
  }

  /**
   * Per-column sample standard deviation (n - 1 denominator).
   * Columns with zero spread, and the single-sample case, fall back to 1 so
   * that later divisions are no-ops instead of producing NaN/Infinity.
   */
  private static columnStds(rows: Float64Array[], means: Float64Array): Float64Array {
    const width = means.length;
    // Accumulators must start at zero; starting at 1 inflates every scale.
    const sumSq = new Float64Array(width);
    for (const row of rows) {
      for (let j = 0; j < width; j++) {
        sumSq[j] = (sumSq[j] ?? 0) + ((row[j] ?? 0) - (means[j] ?? 0)) ** 2;
      }
    }
    const dof = rows.length > 1 ? rows.length - 1 : 1;
    return sumSq.map((ss) => Math.sqrt(ss / dof) || 1);
  }

  /** Center/scale every row and return the standardized copies. */
  private static standardize(
    rows: Float64Array[],
    means: Float64Array,
    stds: Float64Array,
  ): Float64Array[] {
    return rows.map((row) => {
      const out = new Float64Array(means.length);
      for (let j = 0; j < means.length; j++) {
        out[j] = ((row[j] ?? 0) - (means[j] ?? 0)) / (stds[j] ?? 1);
      }
      return out;
    });
  }

  /** Standardize each row and project it onto every weight vector. */
  private static project(
    rows: Float64Array[],
    means: Float64Array,
    stds: Float64Array,
    weights: Float64Array[],
  ): Float64Array[] {
    return rows.map((row) => {
      const scores = new Float64Array(weights.length);
      weights.forEach((w, c) => {
        let acc = 0;
        for (let j = 0; j < means.length; j++) {
          acc += (((row[j] ?? 0) - (means[j] ?? 0)) / (stds[j] ?? 1)) * (w[j] ?? 0);
        }
        scores[c] = acc;
      });
      return scores;
    });
  }

  /**
   * Learn the X/Y weight vectors and the training scores.
   *
   * @param X - nSamples rows of features; the width is read from the first row.
   * @param Y - nSamples rows of targets; the width is read from the first row.
   * @returns this
   * @throws Error when X is empty or X and Y have different row counts.
   */
  fit(X: Float64Array[], Y: Float64Array[]): this {
    const nSamples = X.length;
    if (nSamples === 0) {
      throw new Error("PLSSVDExt.fit: X must contain at least one sample");
    }
    if (Y.length !== nSamples) {
      throw new Error(`PLSSVDExt.fit: X has ${nSamples} rows but Y has ${Y.length}`);
    }
    const nFeatures = X[0]?.length ?? 0;
    const nTargets = Y[0]?.length ?? 0;
    this.nFeaturesFit_ = nFeatures;
    this.nTargetsFit_ = nTargets;

    // Center (and optionally scale)
    const xMean = PLSSVDExt.columnMeans(X, nFeatures);
    const yMean = PLSSVDExt.columnMeans(Y, nTargets);
    const xStd = this.scale
      ? PLSSVDExt.columnStds(X, xMean)
      : new Float64Array(nFeatures).fill(1);
    const yStd = this.scale
      ? PLSSVDExt.columnStds(Y, yMean)
      : new Float64Array(nTargets).fill(1);
    this.xMean_ = xMean;
    this.yMean_ = yMean;
    this.xStd_ = xStd;
    this.yStd_ = yStd;

    const Xc = PLSSVDExt.standardize(X, xMean, xStd);
    const Yc = PLSSVDExt.standardize(Y, yMean, yStd);

    // Cross-covariance matrix C = Xc^T Yc (nFeatures x nTargets)
    const C: Float64Array[] = Array.from({ length: nFeatures }, () => new Float64Array(nTargets));
    for (let i = 0; i < nSamples; i++) {
      const xi = Xc[i];
      const yi = Yc[i];
      for (let j = 0; j < nFeatures; j++) {
        const cj = C[j];
        if (cj === undefined) continue;
        const xij = xi?.[j] ?? 0;
        for (let t = 0; t < nTargets; t++) cj[t] = (cj[t] ?? 0) + xij * (yi?.[t] ?? 0);
      }
    }

    const k = Math.min(this.nComponents, nFeatures, nTargets);

    // Fixed-seed LCG so that fits are reproducible.
    let seed = 42;
    const rand = (): number => {
      seed = (seed * 1664525 + 1013904223) & 0xffffffff;
      return ((seed >>> 0) / 0xffffffff) * 2 - 1;
    };
    // Unit-length copy; a zero vector is returned unchanged (divided by 1).
    const unit = (a: Float64Array): Float64Array => {
      let sq = 0;
      for (let i = 0; i < a.length; i++) sq += (a[i] ?? 0) ** 2;
      const len = Math.sqrt(sq) || 1;
      return a.map((x) => x / len);
    };
    // C^T u
    const timesCt = (u: Float64Array): Float64Array => {
      const out = new Float64Array(nTargets);
      for (let j = 0; j < nFeatures; j++) {
        const uj = u[j] ?? 0;
        for (let t = 0; t < nTargets; t++) out[t] = (out[t] ?? 0) + (C[j]?.[t] ?? 0) * uj;
      }
      return out;
    };
    // C v
    const timesC = (v: Float64Array): Float64Array => {
      const out = new Float64Array(nFeatures);
      for (let j = 0; j < nFeatures; j++) {
        let acc = 0;
        for (let t = 0; t < nTargets; t++) acc += (C[j]?.[t] ?? 0) * (v[t] ?? 0);
        out[j] = acc;
      }
      return out;
    };

    const xWeights: Float64Array[] = [];
    const yWeights: Float64Array[] = [];

    for (let comp = 0; comp < k; comp++) {
      let u = unit(new Float64Array(nFeatures).map(() => rand()));

      for (let iter = 0; iter < this.maxIter; iter++) {
        const v = unit(timesCt(u));
        const raw = timesC(v);

        // Deflate: remove the directions that earlier components already own.
        for (const prev of xWeights) {
          let overlap = 0;
          for (let j = 0; j < nFeatures; j++) overlap += (raw[j] ?? 0) * (prev[j] ?? 0);
          for (let j = 0; j < nFeatures; j++) raw[j] = (raw[j] ?? 0) - overlap * (prev[j] ?? 0);
        }

        const next = unit(raw);
        let moved = 0;
        for (let j = 0; j < nFeatures; j++) moved += ((next[j] ?? 0) - (u[j] ?? 0)) ** 2;
        u = next;
        if (Math.sqrt(moved) < this.tol) break;
      }

      xWeights.push(u);
      yWeights.push(unit(timesCt(u)));
    }

    this.xWeights_ = xWeights;
    this.yWeights_ = yWeights;

    // Training scores use the same projection as transform().
    this.xScores_ = PLSSVDExt.project(X, xMean, xStd, xWeights);
    this.yScores_ = PLSSVDExt.project(Y, yMean, yStd, yWeights);

    return this;
  }

  /**
   * Project new data onto the fitted weight vectors.
   *
   * @param X - rows of features, standardized with the statistics from fit().
   * @param Y - optional rows of targets, projected onto the Y weights.
   * @returns `xScores`, plus `yScores` when Y is given.
   * @throws Error when called before fit().
   */
  transform(X: Float64Array[], Y?: Float64Array[]): { xScores: Float64Array[]; yScores?: Float64Array[] } {
    const { xWeights_, yWeights_, xMean_, yMean_, xStd_, yStd_ } = this;
    if (!xWeights_ || !yWeights_ || !xMean_ || !yMean_ || !xStd_ || !yStd_) {
      throw new Error("PLSSVDExt not fitted");
    }
    const xScores = PLSSVDExt.project(X, xMean_, xStd_, xWeights_);
    if (Y) {
      return { xScores, yScores: PLSSVDExt.project(Y, yMean_, yStd_, yWeights_) };
    }
    return { xScores };
  }

  /** Fit on (X, Y) and return the training scores. */
  fitTransform(X: Float64Array[], Y: Float64Array[]): { xScores: Float64Array[]; yScores: Float64Array[] } {
    this.fit(X, Y);
    return { xScores: this.xScores_ ?? [], yScores: this.yScores_ ?? [] };
  }
}
+ * Port of sklearn.datasets._california_housing + */ + +export interface CaliforniaHousingData { + data: Float64Array[]; + target: Float64Array; + featureNames: string[]; + targetNames: string[]; + description: string; +} + +/** + * Generate synthetic California housing-like data. + * Features: MedInc, HouseAge, AveRooms, AveBedrms, Population, AveOccup, Latitude, Longitude + */ +export function makeCaliforniaHousing( + nSamples = 100, + randomState = 42, +): CaliforniaHousingData { + // Simple LCG random + let seed = randomState; + const rand = (): number => { + seed = (seed * 1664525 + 1013904223) & 0xffffffff; + return ((seed >>> 0) / 0x100000000); + }; + const featureNames = [ + "MedInc", "HouseAge", "AveRooms", "AveBedrms", + "Population", "AveOccup", "Latitude", "Longitude", + ]; + const data: Float64Array[] = []; + const target = new Float64Array(nSamples); + for (let i = 0; i < nSamples; i++) { + const medInc = 0.5 + rand() * 10; + const houseAge = 1 + rand() * 52; + const aveRooms = 2 + rand() * 8; + const aveBedrms = 0.5 + rand() * 2; + const population = 100 + rand() * 3000; + const aveOccup = 1 + rand() * 5; + const latitude = 32 + rand() * 10; + const longitude = -124 + rand() * 10; + data.push(new Float64Array([medInc, houseAge, aveRooms, aveBedrms, population, aveOccup, latitude, longitude])); + // Simplified price model + target[i] = 0.5 + 0.4 * medInc - 0.001 * population + rand() * 0.5; + } + return { + data, + target, + featureNames, + targetNames: ["MedHouseVal"], + description: "Synthetic California Housing dataset (generated). " + + "Original from StatLib repository. 8 features, regression target is median house value.", + }; +} + +export interface FetchCaliforniaHousingOptions { + dataHome?: string; + download?: boolean; + returnXy?: boolean; + asFrame?: boolean; +} + +/** + * Fetch (or generate) the California Housing dataset. + * In browser/Bun environments, returns generated data. 
export interface MultilabelDataset {
  X: Float64Array[];
  Y: Int32Array[];
  nClasses: number;
}

/**
 * Generate a random multilabel problem.
 *
 * Every sample gets `nFeatures` values drawn uniformly from [-1, 1) and a 0/1
 * indicator row of length `nClasses` with distinct labels switched on.
 *
 * @param nSamples - number of rows to generate.
 * @param nFeatures - number of feature columns.
 * @param nClasses - width of each indicator row.
 * @param nLabels - labels per sample; capped at `nClasses`, because asking
 *   for more distinct labels than classes can never be satisfied.
 * @param randomState - seed for a reproducible stream; `Math.random` is used
 *   when omitted.
 * @returns the feature rows `X`, indicator rows `Y` and `nClasses`.
 */
export function makeMultilabelClassification(
  nSamples = 100,
  nFeatures = 20,
  nClasses = 5,
  nLabels = 2,
  randomState?: number,
): MultilabelDataset {
  const rng = randomState !== undefined ? seededRng(randomState) : Math.random;
  const labelsPerSample = Math.min(nLabels, nClasses);
  const X: Float64Array[] = [];
  const Y: Int32Array[] = [];
  for (let i = 0; i < nSamples; i++) {
    const row = new Float64Array(nFeatures);
    for (let j = 0; j < nFeatures; j++) row[j] = rng() * 2 - 1;
    X.push(row);

    const labels = new Int32Array(nClasses);
    const selected = new Set<number>();
    while (selected.size < labelsPerSample) {
      // Clamp so a draw at the very top of the range still names a real class.
      selected.add(Math.min(nClasses - 1, Math.floor(rng() * nClasses)));
    }
    for (const l of selected) labels[l] = 1;
    Y.push(labels);
  }
  return { X, Y, nClasses };
}

/**
 * Seeded linear congruential generator.
 * @returns a function producing values in the half-open interval [0, 1).
 */
function seededRng(seed: number): () => number {
  let s = seed;
  return () => {
    s = (s * 1664525 + 1013904223) & 0xffffffff;
    // Divide by 2^32 (not 2^32 - 1) so that 1 itself is never returned.
    return (s >>> 0) / 0x100000000;
  };
}
0; + for (let k = 0; k < j; k++) sum -= (L[i]![k] ?? 0) * (L[j]![k] ?? 0); + L[i]![j] = i === j ? Math.sqrt(Math.max(0, sum)) : (L[j]![j] ?? 1) < 1e-10 ? 0 : sum / (L[j]![j] ?? 1); + } + } + // Sample z ~ N(0, I) then x = L*z + mean + const X: Float64Array[] = []; + for (let s = 0; s < nSamples; s++) { + const z = new Float64Array(nFeatures); + for (let i = 0; i < nFeatures; i++) { + const u1 = Math.random(), u2 = Math.random(); + z[i] = Math.sqrt(-2 * Math.log(u1 + 1e-10)) * Math.cos(2 * Math.PI * u2); + } + const x = new Float64Array(nFeatures); + for (let i = 0; i < nFeatures; i++) { + x[i] = mean[i] ?? 0; + for (let j = 0; j <= i; j++) x[i] += (L[i]![j] ?? 0) * (z[j] ?? 0); + } + X.push(x); + } + return { X, mean, cov }; +} + +export interface CheckerboardDataset { + X: Float64Array[]; + y: Int32Array; + nSquares: number; +} + +export function makeCheckerboard( + nSamples = 200, + nSquares = 4 +): CheckerboardDataset { + const X: Float64Array[] = []; + const y = new Int32Array(nSamples); + for (let i = 0; i < nSamples; i++) { + const x0 = Math.random(); + const x1 = Math.random(); + X.push(new Float64Array([x0, x1])); + const sq0 = Math.floor(x0 * nSquares); + const sq1 = Math.floor(x1 * nSquares); + y[i] = (sq0 + sq1) % 2; + } + return { X, y, nSquares }; +} + +export interface SCurveDataset { + X: Float64Array[]; + t: Float64Array; +} + +export function makeS_curve(nSamples = 100, noise = 0.0): SCurveDataset { + const t = new Float64Array(nSamples); + const X: Float64Array[] = []; + for (let i = 0; i < nSamples; i++) { + t[i] = 1.5 * Math.PI * (1 + 2 * Math.random()); + const ti = t[i] ?? 0; + const x = Math.sin(ti) + (noise > 0 ? (Math.random() - 0.5) * noise : 0); + const y = Math.sign(ti - Math.PI) * (Math.cos(ti) - 1) + (noise > 0 ? (Math.random() - 0.5) * noise : 0); + const z = 2 * Math.random() + (noise > 0 ? 
(Math.random() - 0.5) * noise : 0); + X.push(new Float64Array([x, y, z])); + } + return { X, t }; +} + +export function makeLowRankMatrix( + nSamples = 100, + nFeatures = 50, + effectiveRank = 10, + tailStrength = 0.5 +): Float64Array[] { + const n = Math.min(nSamples, nFeatures); + const singularVals = new Float64Array(n); + for (let i = 0; i < n; i++) { + const low = Math.exp(-i / effectiveRank); + const high = tailStrength * Math.exp(-i / (n * tailStrength + 1e-10)); + singularVals[i] = (1 - tailStrength) * low + high; + } + // Random orthogonal matrices via Gram-Schmidt + const makeOrthogonal = (rows: number, cols: number): Float64Array[] => { + const mat: Float64Array[] = Array.from({ length: rows }, () => { + const row = new Float64Array(cols); + for (let j = 0; j < cols; j++) row[j] = Math.random() - 0.5; + return row; + }); + for (let j = 0; j < cols; j++) { + for (let k = 0; k < j; k++) { + let dot = 0; + for (let i = 0; i < rows; i++) dot += (mat[i]![j] ?? 0) * (mat[i]![k] ?? 0); + for (let i = 0; i < rows; i++) mat[i]![j] = (mat[i]![j] ?? 0) - dot * (mat[i]![k] ?? 0); + } + let norm = 0; + for (let i = 0; i < rows; i++) norm += (mat[i]![j] ?? 0) ** 2; + norm = Math.sqrt(norm) || 1; + for (let i = 0; i < rows; i++) mat[i]![j] = (mat[i]![j] ?? 0) / norm; + } + return mat; + }; + const U = makeOrthogonal(nSamples, n); + const V = makeOrthogonal(nFeatures, n); + return Array.from({ length: nSamples }, (_, i) => { + const row = new Float64Array(nFeatures); + for (let j = 0; j < nFeatures; j++) { + for (let k = 0; k < n; k++) row[j] += (U[i]![k] ?? 0) * (singularVals[k] ?? 0) * (V[j]![k] ?? 0); + } + return row; + }); +} diff --git a/src/datasets/datasets_ext3.ts b/src/datasets/datasets_ext3.ts new file mode 100644 index 0000000..5a4f4e9 --- /dev/null +++ b/src/datasets/datasets_ext3.ts @@ -0,0 +1,201 @@ +/** + * Additional dataset generators: make_moons, make_circles, make_blobs extensions. + * Mirrors sklearn.datasets extras. 
/**
 * Two interleaving half circles ("moons") in 2-D.
 *
 * The first `floor(nSamples / 2)` points trace the upper arc and are labelled
 * 0; the remaining points trace the shifted lower arc and are labelled 1.
 * Each coordinate gets `noise` times a Box-Muller normal deviate drawn from a
 * seeded LCG (x is drawn before y for every point).
 *
 * @param nSamples - total number of points.
 * @param noise - multiplier applied to the normal deviates.
 * @param randomState - LCG seed.
 * @returns the points `X` and their labels `y`.
 */
export function makeMoons(
  nSamples = 100,
  noise = 0.1,
  randomState = 0,
): { X: Float64Array[]; y: Int32Array } {
  let state = randomState;
  const uniform = (): number => {
    state = (state * 1664525 + 1013904223) >>> 0;
    return state / 4294967296;
  };
  const gaussian = (): number => {
    const first = uniform();
    const second = uniform();
    return Math.sqrt(-2 * Math.log(first + 1e-10)) * Math.cos(2 * Math.PI * second);
  };

  const upperCount = Math.floor(nSamples / 2);
  const lowerCount = nSamples - upperCount;
  const points: Float64Array[] = [];
  const labels: number[] = [];

  // Upper arc, label 0.
  for (let i = 0; i < upperCount; i++) {
    const theta = (Math.PI * i) / upperCount;
    const px = Math.cos(theta) + noise * gaussian();
    const py = Math.sin(theta) + noise * gaussian();
    points.push(new Float64Array([px, py]));
    labels.push(0);
  }

  // Lower arc, shifted right and down, label 1.
  for (let i = 0; i < lowerCount; i++) {
    const theta = (Math.PI * i) / lowerCount;
    const px = 1 - Math.cos(theta) + noise * gaussian();
    const py = 1 - Math.sin(theta) - 0.5 + noise * gaussian();
    points.push(new Float64Array([px, py]));
    labels.push(1);
  }

  return { X: points, y: new Int32Array(labels) };
}
boxMuller()])); + y.push(1); + } + + return { X, y: new Int32Array(y) }; +} + +export function makeSwissRoll( + nSamples = 100, + noise = 0.0, + randomState = 0, +): { X: Float64Array[]; t: Float64Array } { + let rng = randomState; + const nextRand = (): number => { + rng = (rng * 1664525 + 1013904223) >>> 0; + return rng / 4294967296; + }; + const boxMuller = (): number => { + const u = nextRand(); + const v = nextRand(); + return Math.sqrt(-2 * Math.log(u + 1e-10)) * Math.cos(2 * Math.PI * v); + }; + + const t = new Float64Array(nSamples); + const X: Float64Array[] = []; + + for (let i = 0; i < nSamples; i++) { + const ti = (1.5 + 2.5 * nextRand()) * Math.PI; + t[i] = ti; + const height = 21 * nextRand(); + X.push(new Float64Array([ + ti * Math.cos(ti) + noise * boxMuller(), + height + noise * boxMuller(), + ti * Math.sin(ti) + noise * boxMuller(), + ])); + } + + return { X, t }; +} + +export function makeCheckerboard( + shape: [number, number] = [10, 10], + nClusters = 4, + nSamples = 100, + noise = 0.0, + randomState = 0, +): { X: Float64Array[]; rows: Int32Array; cols: Int32Array } { + let rng = randomState; + const nextRand = (): number => { + rng = (rng * 1664525 + 1013904223) >>> 0; + return rng / 4294967296; + }; + + const [nRows, nCols] = shape; + const rowClusterSize = nRows / Math.sqrt(nClusters); + const colClusterSize = nCols / Math.sqrt(nClusters); + + const X: Float64Array[] = []; + const rows: number[] = []; + const cols: number[] = []; + + for (let i = 0; i < nSamples; i++) { + const r = Math.floor(nextRand() * nRows); + const c = Math.floor(nextRand() * nCols); + const rCluster = Math.floor(r / rowClusterSize); + const cCluster = Math.floor(c / colClusterSize); + + const baseVal = (rCluster + cCluster) % 2 === 0 ? 
/**
 * Generate signals that are sparse combinations of dictionary atoms.
 *
 * Builds `nComponents` random unit-norm atoms of length `nFeatures`, then for
 * each sample a code vector with distinct active atoms, and finally
 * `X = code @ dictionary`. Counts are expected to be integers.
 *
 * @param nSamples - number of signals.
 * @param nComponents - number of dictionary atoms.
 * @param nFeatures - length of each atom / signal.
 * @param nNonzeroCoefs - active atoms per signal; capped at `nComponents`,
 *   because more distinct atoms than exist can never be picked.
 * @param randomState - LCG seed.
 * @returns the signals `X`, the `dictionary` and the sparse `code` rows.
 */
export function makeSparseCoded(
  nSamples = 100,
  nComponents = 10,
  nFeatures = 20,
  nNonzeroCoefs = 3,
  randomState = 0,
): { X: Float64Array[]; dictionary: Float64Array[]; code: Float64Array[] } {
  let rng = randomState;
  // Uniform draw in [-1, 1).
  const nextRand = (): number => {
    rng = (rng * 1664525 + 1013904223) >>> 0;
    return (rng / 4294967296) * 2 - 1;
  };

  // Generate random dictionary with unit-norm atoms
  const dictionary: Float64Array[] = Array.from({ length: nComponents }, () => {
    const atom = new Float64Array(nFeatures);
    for (let j = 0; j < nFeatures; j++) atom[j] = nextRand();
    let sq = 0;
    for (let j = 0; j < nFeatures; j++) sq += (atom[j] ?? 0) ** 2;
    const len = Math.sqrt(sq);
    if (len > 0) for (let j = 0; j < nFeatures; j++) atom[j] = (atom[j] ?? 0) / len;
    return atom;
  });

  // Generate sparse codes
  const activePerSample = Math.max(0, Math.min(nNonzeroCoefs, nComponents));
  const code: Float64Array[] = [];
  for (let i = 0; i < nSamples; i++) {
    const c = new Float64Array(nComponents);
    const taken = new Set<number>();
    for (let k = 0; k < activePerSample; k++) {
      // |draw| can be exactly 1, so clamp to the last valid atom.
      let idx = Math.min(nComponents - 1, Math.floor(Math.abs(nextRand()) * nComponents));
      // Linear probing; terminates because fewer than nComponents slots are taken.
      while (taken.has(idx)) idx = (idx + 1) % nComponents;
      taken.add(idx);
      c[idx] = nextRand();
    }
    code.push(c);
  }

  // Generate X = code @ dictionary
  const X = code.map((c) => {
    const x = new Float64Array(nFeatures);
    for (let k = 0; k < nComponents; k++) {
      const ck = c[k] ?? 0;
      if (ck === 0) continue;
      const atom = dictionary[k];
      for (let j = 0; j < nFeatures; j++) x[j] = (x[j] ?? 0) + ck * (atom?.[j] ?? 0);
    }
    return x;
  });

  return { X, dictionary, code };
}
0.1; + const rng = seededRng(opts.randomState ?? 0); + const nAnomalies = Math.floor(n * contamination); + + const X: Float64Array[] = Array.from({ length: n }, (_, i) => { + const row = new Float64Array(p); + const isAnomaly = i < nAnomalies; + for (let j = 0; j < p; j++) { + row[j] = isAnomaly ? (rng() * 10 - 5) + (rng() > 0.5 ? 5 : -5) : rng() * 4 - 2; + } + return row; + }); + for (let i = n - 1; i > 0; i--) { + const j = Math.floor(rng() * (i + 1)); + const tmp = X[i]!; + X[i] = X[j]!; + X[j] = tmp; + } + const anomalyIndices: number[] = []; + const y = new Int32Array(n).fill(1); + for (let i = 0; i < n; i++) { + const norm = X[i]!.reduce((s, v) => s + (v ?? 0) ** 2, 0); + if (norm > p * 4) { y[i] = -1; anomalyIndices.push(i); } + } + return { X, y, anomalyIndices }; +} + +export function makeRankingData(opts: { + nSamples?: number; + nFeatures?: number; + nGroups?: number; + randomState?: number; +}): { X: Float64Array[]; y: Int32Array; groups: Int32Array; relevanceScores: Float64Array } { + const n = opts.nSamples ?? 100; + const p = opts.nFeatures ?? 10; + const g = opts.nGroups ?? 10; + const rng = seededRng(opts.randomState ?? 0); + + const X: Float64Array[] = Array.from({ length: n }, () => Float64Array.from({ length: p }, () => rng() * 2 - 1)); + const groups = Int32Array.from({ length: n }, (_, i) => Math.floor(i / Math.ceil(n / g))); + const weights = Float64Array.from({ length: p }, () => rng() * 2 - 1); + const relevanceScores = Float64Array.from(X.map(xi => { + let s = 0; + for (let j = 0; j < p; j++) s += (weights[j] ?? 0) * (xi[j] ?? 0); + return s; + })); + const y = Int32Array.from(relevanceScores.map(s => Math.min(4, Math.max(0, Math.floor((s + 3) / 2))))); + return { X, y, groups, relevanceScores }; +} + +export function makeMultiLabelData(opts: { + nSamples?: number; + nFeatures?: number; + nClasses?: number; + density?: number; + randomState?: number; +}): { X: Float64Array[]; y: Int32Array[] } { + const n = opts.nSamples ?? 
/**
 * Generate a random undirected graph with node features and degree labels.
 *
 * Node features are uniform in [-1, 1). Every unordered node pair (i, j) with
 * i < j becomes an edge when a uniform draw is below `edgeProbability`; the
 * adjacency matrix is symmetric with a zero diagonal. A node is labelled 1
 * when its degree exceeds `nNodes * edgeProbability`, otherwise 0.
 *
 * @param opts.nNodes - number of nodes (default 50).
 * @param opts.nFeatures - features per node (default 8).
 * @param opts.edgeProbability - per-pair edge probability (default 0.3).
 * @param opts.randomState - seed passed to `seededRng` (default 0).
 * @returns node features, the adjacency matrix and the per-node labels.
 */
export function makeGraphData(opts: {
  nNodes?: number;
  nFeatures?: number;
  edgeProbability?: number;
  randomState?: number;
}): { nodeFeatures: Float64Array[]; adjacency: Float64Array[]; labels: Int32Array } {
  const n = opts.nNodes ?? 50;
  const p = opts.nFeatures ?? 8;
  const edgeProb = opts.edgeProbability ?? 0.3;
  const rng = seededRng(opts.randomState ?? 0);

  const nodeFeatures: Float64Array[] = Array.from({ length: n }, () =>
    Float64Array.from({ length: p }, () => rng() * 2 - 1),
  );

  // Allocate every row first. Touching `adjacency` from inside its own
  // initializer callback throws a ReferenceError (temporal dead zone), and the
  // rows for j > i would not exist yet anyway.
  const adjacency: Float64Array[] = Array.from({ length: n }, () => new Float64Array(n));
  for (let i = 0; i < n; i++) {
    for (let j = i + 1; j < n; j++) {
      if (rng() < edgeProb) {
        adjacency[i]![j] = 1;
        adjacency[j]![i] = 1;
      }
    }
  }

  const labels = Int32Array.from({ length: n }, (_, i) => {
    let degree = 0;
    for (let j = 0; j < n; j++) if ((adjacency[i]![j] ?? 0) > 0) degree++;
    return degree > n * edgeProb ? 1 : 0;
  });
  return { nodeFeatures, adjacency, labels };
}
/**
 * Generate a checkerboard dataset.
 *
 * Points are drawn uniformly from the unit square, which is divided into an
 * `nSquares` × `nSquares` grid; the label is the parity of (column + row).
 *
 * @param nSamples    number of points (default 200)
 * @param nSquares    grid cells per axis (default 4)
 * @param randomState integer seed of the internal LCG (default 0)
 * @returns `X` (rows of [x1, x2]) and the 0/1 labels `y`
 */
export function makeCheckerboardData(
  nSamples = 200,
  nSquares = 4,
  randomState = 0,
): { X: Float64Array[]; y: Int32Array } {
  let rng = randomState;
  const rand = (): number => {
    rng = (rng * 1664525 + 1013904223) & 0xffffffff;
    return (rng >>> 0) / 0xffffffff;
  };
  const X: Float64Array[] = Array.from({ length: nSamples }, () => new Float64Array([rand(), rand()]));

  // rand() divides by 0xffffffff and can therefore return exactly 1.0, which
  // would map to cell index `nSquares` (outside the grid). Fold that single
  // boundary value into the last cell.
  const cellOf = (coord: number): number => {
    const idx = Math.floor(coord * nSquares);
    return nSquares > 0 && idx >= nSquares ? nSquares - 1 : idx;
  };
  const y = new Int32Array(nSamples).map((_, i) => {
    const col = cellOf(X[i]?.[0] ?? 0);
    const row = cellOf(X[i]?.[1] ?? 0);
    return (col + row) % 2;
  });
  return { X, y };
}

/**
 * Generate an XOR-pattern dataset.
 *
 * Points are uniform in [-1, 1]². The label is 1 when the (jittered)
 * coordinates share a sign and 0 otherwise. The jitter
 * `noise * (rand() - 0.5)` only affects the label, not the returned `X`.
 *
 * @param nSamples    number of points (default 200)
 * @param noise       width of the label jitter (default 0.1)
 * @param randomState integer seed of the internal LCG (default 0)
 * @returns `X` (rows of [x1, x2]) and the 0/1 labels `y`
 */
export function makeXOR(
  nSamples = 200,
  noise = 0.1,
  randomState = 0,
): { X: Float64Array[]; y: Int32Array } {
  let rng = randomState;
  const rand = (): number => {
    rng = (rng * 1664525 + 1013904223) & 0xffffffff;
    return (rng >>> 0) / 0xffffffff;
  };
  // All coordinates are drawn before any label jitter.
  const X: Float64Array[] = Array.from({ length: nSamples }, () =>
    new Float64Array([rand() * 2 - 1, rand() * 2 - 1]),
  );
  const y = new Int32Array(nSamples).map((_, i) => {
    const x1 = (X[i]?.[0] ?? 0) + noise * (rand() - 0.5);
    const x2 = (X[i]?.[1] ?? 0) + noise * (rand() - 0.5);
    return x1 * x2 > 0 ? 1 : 0;
  });
  return { X, y };
}

/** Generate low-rank data with noise. */
/**
 * Generate low-rank data with noise.
 *
 * Builds `X = U · diag(S) · V` where `U` (nSamples × k) and `V` (k × nFeatures)
 * hold uniform values in [-1, 1] and `k = min(nSamples, nFeatures,
 * 2 · effectiveRank)`. The factors are random, not orthonormalised.
 * The spectrum is `S[i] = exp(-i / effectiveRank) · (1 − tailStrength)
 * + tailStrength / k`.
 *
 * @param nSamples      number of rows (default 100)
 * @param nFeatures     number of columns (default 50)
 * @param effectiveRank decay scale of the spectrum (default 10)
 * @param tailStrength  weight of the flat tail of the spectrum (default 0.5)
 * @param randomState   integer seed of the internal LCG (default 0)
 * @returns the nSamples × nFeatures matrix as an array of rows
 */
export function makeLowRankMatrix(
  nSamples = 100,
  nFeatures = 50,
  effectiveRank = 10,
  tailStrength = 0.5,
  randomState = 0,
): Float64Array[] {
  let rng = randomState;
  const rand = (): number => {
    rng = (rng * 1664525 + 1013904223) & 0xffffffff;
    return (rng >>> 0) / 0xffffffff;
  };
  const n = nSamples;
  const p = nFeatures;
  // Never negative: a negative effectiveRank would otherwise make the
  // typed-array constructors below throw a RangeError.
  const k = Math.max(0, Math.min(n, p, effectiveRank * 2));

  // Random (non-orthogonal) factors; U is drawn completely before V.
  const U: Float64Array[] = Array.from({ length: n }, () =>
    new Float64Array(k).map(() => rand() * 2 - 1),
  );
  const V: Float64Array[] = Array.from({ length: k }, () =>
    new Float64Array(p).map(() => rand() * 2 - 1),
  );

  // Exponentially decaying spectrum plus a flat tail.
  const S = new Float64Array(k).map((_, i) => {
    const hi = Math.exp(-i / effectiveRank);
    const lo = tailStrength / k;
    return hi * (1 - tailStrength) + lo;
  });

  const X: Float64Array[] = Array.from({ length: n }, (_, i) => {
    const row = new Float64Array(p);
    const uRow = U[i];
    for (let c = 0; c < k; c++) {
      const vRow = V[c];
      for (let j = 0; j < p; j++) {
        row[j] = (row[j] ?? 0) + (uRow?.[c] ?? 0) * (S[c] ?? 0) * (vRow?.[j] ?? 0);
      }
    }
    return row;
  });
  return X;
}

/** Generate a multilabel classification dataset. */
/**
 * Generate a multilabel classification dataset.
 *
 * Features are uniform in [0, 1]. Every sample receives exactly
 * `min(nLabels, nClasses)` distinct labels chosen uniformly at random.
 *
 * @param nSamples    number of rows (default 100)
 * @param nFeatures   number of columns (default 20)
 * @param nClasses    length of each indicator vector (default 5)
 * @param nLabels     labels switched on per sample (default 2)
 * @param randomState integer seed of the internal LCG (default 0)
 * @returns features `X` and 0/1 indicator rows `Y`
 */
export function makeMultilabelClassification(
  nSamples = 100,
  nFeatures = 20,
  nClasses = 5,
  nLabels = 2,
  randomState = 0,
): { X: Float64Array[]; Y: Int32Array[] } {
  let rng = randomState;
  const rand = (): number => {
    rng = (rng * 1664525 + 1013904223) & 0xffffffff;
    return (rng >>> 0) / 0xffffffff;
  };
  const X: Float64Array[] = Array.from({ length: nSamples }, () =>
    new Float64Array(nFeatures).map(() => rand()),
  );
  const wanted = Math.min(nLabels, nClasses);
  const Y: Int32Array[] = Array.from({ length: nSamples }, () => {
    const labels = new Int32Array(nClasses);
    const chosen = new Set<number>();
    while (chosen.size < wanted) {
      // rand() may return exactly 1.0; clamp so the index stays in range.
      chosen.add(Math.min(Math.floor(rand() * nClasses), nClasses - 1));
    }
    for (const c of chosen) labels[c] = 1;
    return labels;
  });
  return { X, Y };
}

// ---- patch: src/datasets/digits.ts (new file, index 0000000..1fb39fb) ----

/**
 * Toy datasets: loadDigits and loadLinnerud — analogous to sklearn.datasets._base.
 */

/** A single 8×8 hand-written digit image dataset entry. */
export interface DigitsDataset {
  /** Pixel data: nSamples × 64 (flattened 8×8 images, values 0–16). */
  data: Float64Array;
  /** Target digit labels (0–9). */
  target: Int32Array;
  /** Number of samples. */
  nSamples: number;
  /** Feature names: "pixel_0_0" … "pixel_7_7". */
  featureNames: string[];
  /** Target names: ["0","1",…,"9"]. */
  targetNames: string[];
  /** Description string. */
  DESCR: string;
}

/** The Linnerud multivariate exercise dataset. */
export interface LinnerudDataset {
  /** Exercise data: 20 × 3 (Chins, Situps, Jumps), row-major. */
  data: Float64Array;
  /** Physiological measurements: 20 × 3 (Weight, Waist, Pulse), row-major. */
  target: Float64Array;
  nSamples: number;
  featureNames: string[];
  targetNames: string[];
  DESCR: string;
}

/**
 * Generates a minimal synthetic digits dataset.
 *
 * For every class a fixed 8×8 prototype (values 0–16) is derived from the
 * class index; each sample is that prototype plus noise in [-1, 1), clamped to
 * [0, 16] and rounded. Rows are stored class by class in a flat row-major
 * buffer. The noise generator is always seeded with 42, so the result is
 * deterministic.
 *
 * @param options.nClass          number of classes (default 10)
 * @param options.samplesPerClass samples generated per class (default 10)
 * @returns the dataset with `nClass * samplesPerClass` rows of 64 pixels
 */
export function loadDigits(options: { nClass?: number; samplesPerClass?: number } = {}): DigitsDataset {
  const nClass = options.nClass ?? 10;
  const samplesPerClass = options.samplesPerClass ?? 10;
  const nSamples = nClass * samplesPerClass;
  const nFeatures = 64;
  const data = new Float64Array(nSamples * nFeatures);
  const target = new Int32Array(nSamples);
  const rng = mulberry32(42);

  for (let cls = 0; cls < nClass; cls++) {
    // Prototype pattern for this class. The seed must be a BigInt: mixing a
    // plain number with the BigInt multipliers below throws a TypeError.
    const proto = new Float64Array(nFeatures);
    const seed = BigInt(cls * 17);
    for (let px = 0; px < nFeatures; px++) {
      const r = (seed * 6364136223846793005n + BigInt(px) * 2862933555777941757n) & 0xffffffffffffn;
      proto[px] = Number(r % 17n); // 0-16
    }

    for (let s = 0; s < samplesPerClass; s++) {
      const row = cls * samplesPerClass + s;
      target[row] = cls;
      for (let px = 0; px < nFeatures; px++) {
        const noise = (rng() - 0.5) * 2;
        const val = Math.max(0, Math.min(16, (proto[px] ?? 0) + noise));
        data[row * nFeatures + px] = Math.round(val);
      }
    }
  }

  const featureNames: string[] = [];
  for (let r = 0; r < 8; r++) for (let c = 0; c < 8; c++) featureNames.push(`pixel_${r}_${c}`);
  const targetNames = Array.from({ length: nClass }, (_, i) => String(i));

  return {
    data, target, nSamples,
    featureNames,
    targetNames,
    DESCR: "Optical recognition of handwritten digits (synthetic).",
  };
}

/** Returns the Linnerud dataset (20 samples, 3 exercise features, 3 physiological targets). */
export function loadLinnerud(): LinnerudDataset {
  // 20 rows of (Chins, Situps, Jumps). A 21st triple that used to trail this
  // list was never read (only 20 × 3 values are copied) and has been dropped.
  const exerciseRaw = [
    5, 162, 60, 2, 110, 60, 12, 101, 101, 12, 105, 37,
    13, 155, 58, 4, 101, 42, 8, 101, 38, 6, 125, 40,
    15, 200, 40, 17, 251, 250, 17, 120, 38, 13, 210, 115,
    14, 215, 105, 1, 50, 50, 6, 70, 31, 12, 210, 120,
    4, 60, 25, 11, 230, 80, 15, 225, 73, 2, 110, 43,
  ];
  // 20 rows of (Weight, Waist, Pulse).
  const physiologicalRaw = [
    191, 36, 50, 189, 37, 52, 193, 38, 58, 162, 35, 62,
    189, 35, 46, 182, 36, 56, 211, 38, 56, 167, 34, 60,
    176, 31, 74, 154, 33, 56, 169, 34, 50, 166, 33, 52,
    154, 34, 64, 247, 46, 50, 193, 36, 46, 202, 37, 62,
    176, 37, 54, 157, 32, 52, 156, 33, 54, 138, 33, 68,
  ];

  const nSamples = 20;
  const data = new Float64Array(nSamples * 3);
  const target = new Float64Array(nSamples * 3);
  for (let i = 0; i < nSamples * 3; i++) {
    data[i] = exerciseRaw[i] ?? 0;
    target[i] = physiologicalRaw[i] ?? 0;
  }

  return {
    data, target, nSamples,
    featureNames: ["Chins", "Situps", "Jumps"],
    targetNames: ["Weight", "Waist", "Pulse"],
    DESCR: "Linnerud physical exercise dataset (20 middle-aged men, 3 exercise × 3 physiological).",
  };
}

// --- helpers ---

/** Mulberry32-style 32-bit PRNG; returns a generator of floats in [0, 1). */
function mulberry32(seed: number): () => number {
  let s = seed | 0;
  return () => {
    s = (s + 0x6d2b79f5) | 0;
    let z = Math.imul(s ^ (s >>> 15), 1 | s);
    z ^= z + Math.imul(z ^ (z >>> 7), 61 | z);
    return ((z ^ (z >>> 14)) >>> 0) / 0x100000000;
  };
}

// ---- patch: src/datasets/fetch_datasets.ts (new file, index 0000000..bb59acd) ----

/**
 * Dataset fetch utilities: California housing, Covtype, KDDCup99, LFW.
 * Mirrors sklearn.datasets.fetch_* functions.
 */
/**
 * Dataset fetch utilities: California housing, Covtype, KDDCup99, LFW.
 * Mirrors sklearn.datasets.fetch_* functions. Everything here is generated
 * synthetically from a seed; nothing is downloaded.
 */

export interface FetchedDataset {
  data: Float64Array[];
  target: Float64Array;
  featureNames: string[];
  targetNames?: string[];
  description: string;
  nSamples: number;
  nFeatures: number;
}

/**
 * Linear congruential generator shared by the fetch* functions
 * (multiplier 1664525, increment 1013904223, 32-bit state).
 * Returns values in [0, 1] — the upper bound is inclusive.
 */
function createLcg(seed: number): () => number {
  let state = seed;
  return () => {
    state = (state * 1664525 + 1013904223) & 0xffffffff;
    return (state >>> 0) / 0xffffffff;
  };
}

/**
 * Draw an integer class index in [0, upper). Because the generator can return
 * exactly 1.0, the raw `(rand() * upper) | 0` could equal `upper`; clamp it.
 */
function randomIndex(rand: () => number, upper: number): number {
  return Math.min((rand() * upper) | 0, Math.max(upper - 1, 0));
}

/**
 * Synthetic version of the California Housing dataset.
 * Real dataset: 20,640 samples, 8 features.
 *
 * @param options.nSamples number of rows (default 100)
 * @param options.seed     LCG seed (default 42)
 * @returns 8 uniformly drawn features per row and a target of
 *          `0.5 + 0.3 · MedInc + U(0, 0.5)`
 */
export function fetchCaliforniaHousing(options: {
  nSamples?: number;
  seed?: number;
} = {}): FetchedDataset {
  const n = options.nSamples ?? 100;
  const rand = createLcg(options.seed ?? 42);

  const featureNames = ["MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "AveOccup", "Latitude", "Longitude"];
  const data: Float64Array[] = [];
  const target = new Float64Array(n);

  for (let i = 0; i < n; i++) {
    // Draw order is part of the reproducible output; do not reorder.
    const medInc = rand() * 15;
    const houseAge = rand() * 52;
    const aveRooms = 3 + rand() * 10;
    const aveBedrms = 1 + rand() * 3;
    const population = 100 + rand() * 35000;
    const aveOccup = 1 + rand() * 10;
    const latitude = 32 + rand() * 10;
    const longitude = -124 + rand() * 10;

    data.push(new Float64Array([medInc, houseAge, aveRooms, aveBedrms, population, aveOccup, latitude, longitude]));
    target[i] = 0.5 + medInc * 0.3 + rand() * 0.5;
  }

  return { data, target, featureNames, description: "California Housing dataset (synthetic)", nSamples: n, nFeatures: 8 };
}

/**
 * Synthetic version of the Forest Cover Type dataset.
 * Real dataset: 581,012 samples, 54 features, 7 classes.
 *
 * @param options.nSamples number of rows (default 100)
 * @param options.seed     LCG seed (default 42)
 * @returns 54 features uniform in [0, 100] and a class index 0–6 per row
 */
export function fetchCovtype(options: { nSamples?: number; seed?: number } = {}): FetchedDataset {
  const n = options.nSamples ?? 100;
  const rand = createLcg(options.seed ?? 42);

  const nFeatures = 54;
  const targetNames = ["Spruce/Fir", "Lodgepole Pine", "Ponderosa Pine", "Cottonwood/Willow", "Aspen", "Douglas-fir", "Krummholz"];
  const data: Float64Array[] = [];
  const target = new Float64Array(n);

  for (let i = 0; i < n; i++) {
    const row = new Float64Array(nFeatures);
    for (let j = 0; j < nFeatures; j++) row[j] = rand() * 100;
    data.push(row);
    target[i] = randomIndex(rand, 7);
  }

  return {
    data, target,
    featureNames: Array.from({ length: nFeatures }, (_, j) => `feature_${j}`),
    targetNames,
    description: "Forest Cover Type dataset (synthetic)",
    nSamples: n, nFeatures
  };
}

/**
 * Synthetic version of the KDD Cup 1999 dataset.
 *
 * @param options.subset   only echoed in the description; it does not change
 *                         the generated data
 * @param options.nSamples number of rows (default 100)
 * @param options.seed     LCG seed (default 42)
 * @returns 41 features uniform in [0, 1000]; target 1 ("attack") when a
 *          uniform draw exceeds 0.8, else 0 ("normal")
 */
export function fetchKddcup99(options: {
  subset?: "http" | "smtp" | "SF" | "SA" | null;
  nSamples?: number;
  seed?: number;
} = {}): FetchedDataset {
  const n = options.nSamples ?? 100;
  const rand = createLcg(options.seed ?? 42);

  const nFeatures = 41;
  const data: Float64Array[] = [];
  const target = new Float64Array(n);

  for (let i = 0; i < n; i++) {
    const row = new Float64Array(nFeatures);
    for (let j = 0; j < nFeatures; j++) row[j] = rand() * 1000;
    data.push(row);
    target[i] = rand() > 0.8 ? 1 : 0;
  }

  return {
    data, target,
    featureNames: Array.from({ length: nFeatures }, (_, j) => `feature_${j}`),
    targetNames: ["normal", "attack"],
    description: `KDD Cup 99 dataset${options.subset ? ` (${options.subset} subset)` : ""} (synthetic)`,
    nSamples: n, nFeatures
  };
}

/**
 * Synthetic version of the Labeled Faces in the Wild (LFW) dataset.
 *
 * @param options.minFacesPerPerson accepted for API parity; not used
 * @param options.nComponents       features per row (default 50 × 37)
 * @param options.nSamples          number of rows (default 50)
 * @param options.seed              LCG seed (default 42)
 * @returns rows of `U(0, 1) + 0.1 · person` with `person` in 0–4 as target
 */
export function fetchLfw(options: {
  minFacesPerPerson?: number;
  nComponents?: number;
  nSamples?: number;
  seed?: number;
} = {}): FetchedDataset {
  const n = options.nSamples ?? 50;
  const nFeatures = options.nComponents ?? 50 * 37;
  const rand = createLcg(options.seed ?? 42);

  const data: Float64Array[] = [];
  const target = new Float64Array(n);
  const nPersons = 5;

  for (let i = 0; i < n; i++) {
    const row = new Float64Array(nFeatures);
    // The identity is drawn before the pixels.
    const person = randomIndex(rand, nPersons);
    for (let j = 0; j < nFeatures; j++) row[j] = rand() + person * 0.1;
    data.push(row);
    target[i] = person;
  }

  return {
    data, target,
    featureNames: Array.from({ length: nFeatures }, (_, j) => `pixel_${j}`),
    targetNames: Array.from({ length: nPersons }, (_, i) => `person_${i}`),
    description: "Labeled Faces in the Wild dataset (synthetic)",
    nSamples: n, nFeatures
  };
}

/**
 * Synthetic version of the Olivetti Faces dataset.
 * Real dataset: 400 samples, 4096 features (64x64), 40 classes.
 * This generator produces 40 rows — one per class label 0–39 — of 4096
 * uniform values.
 *
 * @param options.seed LCG seed (default 42)
 */
export function fetchOlivettiFaces(options: { seed?: number } = {}): FetchedDataset {
  const n = 40;
  const nFeatures = 4096;
  const rand = createLcg(options.seed ?? 42);

  const data: Float64Array[] = [];
  const target = new Float64Array(n);
  for (let i = 0; i < n; i++) {
    const row = new Float64Array(nFeatures);
    for (let j = 0; j < nFeatures; j++) row[j] = rand();
    data.push(row);
    target[i] = i % 40;
  }

  return {
    data, target,
    featureNames: Array.from({ length: nFeatures }, (_, j) => `pixel_${j}`),
    description: "Olivetti Faces dataset (synthetic)",
    nSamples: n, nFeatures
  };
}

/**
 * Fetch a sample of the 20 Newsgroups dataset.
 * Returns feature vectors (TF-IDF like) for text classification: each feature
 * is non-zero with probability of roughly 0.2.
 *
 * @param options.nSamples   number of rows (default 100)
 * @param options.nFeatures  vocabulary size (default 100)
 * @param options.seed       LCG seed (default 42)
 * @param options.categories class names; `null`/omitted selects five defaults
 */
export function fetch20Newsgroups(options: {
  nSamples?: number;
  nFeatures?: number;
  seed?: number;
  categories?: string[] | null;
} = {}): FetchedDataset {
  const n = options.nSamples ?? 100;
  const nFeatures = options.nFeatures ?? 100;
  const categories = options.categories ?? [
    "alt.atheism", "comp.graphics", "sci.med", "soc.religion.christian", "talk.politics.guns"
  ];
  const nClasses = categories.length;
  const rand = createLcg(options.seed ?? 42);

  const data: Float64Array[] = [];
  const target = new Float64Array(n);
  for (let i = 0; i < n; i++) {
    const cls = randomIndex(rand, nClasses);
    const row = new Float64Array(nFeatures);
    // The second draw only happens for features that pass the sparsity gate.
    for (let j = 0; j < nFeatures; j++) row[j] = rand() > 0.8 ? rand() : 0;
    data.push(row);
    target[i] = cls;
  }

  return {
    data, target,
    featureNames: Array.from({ length: nFeatures }, (_, j) => `word_${j}`),
    targetNames: categories,
    description: "20 Newsgroups dataset (synthetic TF-IDF)",
    nSamples: n, nFeatures
  };
}

// ---- patch: src/datasets/generator_ext.ts (new file, index 0000000..f89ecfb) ----

/**
 * Additional dataset generators — ported from sklearn.datasets
 * make_low_rank_matrix, make_sparse_coded_signal, make_biclusters, make_checkerboard
 */

export interface LowRankMatrixOptions {
  nSamples?: number;
  nFeatures?: number;
  effectiveRank?: number;
  tailStrength?: number;
  randomState?: number | null;
}

export interface LowRankMatrixResult {
  X: Float64Array[];
}

/**
 * Generate a mostly low-rank matrix with bell-shaped singular values.
 * Useful for testing matrix decomposition algorithms.
 *
 * `X = U · diag(s) · Vᵀ` with Gaussian (not orthonormalised) factors and
 * `s[i] = exp(-0.5 · x²) · (1 − tailStrength) + tailStrength / n`, where
 * `x = (i − effectiveRank) / (effectiveRank / 2)` and
 * `n = min(nSamples, nFeatures)`.
 *
 * @param options.randomState seed; `null`/omitted selects 42
 */
export function makeLowRankMatrix(options: LowRankMatrixOptions = {}): LowRankMatrixResult {
  const nSamples = options.nSamples ?? 100;
  const nFeatures = options.nFeatures ?? 100;
  const effectiveRank = options.effectiveRank ?? 10;
  const tailStrength = options.tailStrength ?? 0.5;

  let seed = options.randomState ?? 42;
  // Box-Muller on two consecutive 31-bit LCG draws.
  function randn(): number {
    seed = (1664525 * seed + 1013904223) & 0x7fffffff;
    const u1 = seed / 0x7fffffff;
    seed = (1664525 * seed + 1013904223) & 0x7fffffff;
    const u2 = seed / 0x7fffffff;
    return Math.sqrt(-2 * Math.log(u1 + 1e-10)) * Math.cos(2 * Math.PI * u2);
  }

  const n = Math.min(nSamples, nFeatures);

  // Singular values: bell-shaped around effectiveRank
  const singularValues = new Float64Array(n);
  for (let i = 0; i < n; i++) {
    const x = (i - effectiveRank) / (effectiveRank / 2);
    singularValues[i] = Math.exp(-0.5 * x * x) * (1 - tailStrength) + tailStrength / n;
  }

  // U (nSamples x n) and V (nFeatures x n): plain Gaussian matrices.
  const gaussianRow = (): Float64Array => {
    const row = new Float64Array(n);
    for (let j = 0; j < n; j++) row[j] = randn();
    return row;
  };
  const U: Float64Array[] = Array.from({ length: nSamples }, gaussianRow);
  const V: Float64Array[] = Array.from({ length: nFeatures }, gaussianRow);

  // X = U @ diag(singularValues) @ V.T
  const X: Float64Array[] = Array.from({ length: nSamples }, (_, i) => {
    const row = new Float64Array(nFeatures);
    for (let j = 0; j < nFeatures; j++) {
      let val = 0;
      for (let k = 0; k < n; k++) {
        val += (U[i]![k] ?? 0) * (singularValues[k] ?? 0) * (V[j]![k] ?? 0);
      }
      row[j] = val;
    }
    return row;
  });

  return { X };
}

export interface SparseCodingOptions {
  nSamples?: number;
  nComponents?: number;
  nFeatures?: number;
  nNonzeroCoefs?: number;
  randomState?: number | null;
}

export interface SparseCodingResult {
  X: Float64Array[];
  dictionary: Float64Array[];
  code: Float64Array[];
}

/**
 * Generate a sparse signal using a fixed dictionary.
 * Useful for testing dictionary learning algorithms.
 */
/**
 * Generate a sparse signal using a fixed dictionary.
 * Useful for testing dictionary learning algorithms.
 *
 * The dictionary holds `nComponents` unit-norm Gaussian atoms. Every sample's
 * code has `nNonzeroCoefs` Gaussian entries at distinct random positions, and
 * `X = code · dictionary`.
 *
 * @param options.nSamples      number of signals (default 100)
 * @param options.nComponents   number of atoms (default 40)
 * @param options.nFeatures     signal length (default 64)
 * @param options.nNonzeroCoefs active atoms per signal (default 3)
 * @param options.randomState   seed; `null`/omitted selects 0
 */
export function makeSparseCodedSignal(options: SparseCodingOptions = {}): SparseCodingResult {
  const nSamples = options.nSamples ?? 100;
  const nComponents = options.nComponents ?? 40;
  const nFeatures = options.nFeatures ?? 64;
  const nNonzeroCoefs = options.nNonzeroCoefs ?? 3;

  let seed = options.randomState ?? 0;
  // 31-bit LCG; the result lies in [0, 1] with the upper bound inclusive.
  function rand(): number {
    seed = (1664525 * seed + 1013904223) & 0x7fffffff;
    return seed / 0x7fffffff;
  }
  // Box-Muller transform.
  function randn(): number {
    const u1 = rand() + 1e-10;
    const u2 = rand();
    return Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
  }

  // Random dictionary (nComponents x nFeatures), normalized atoms
  const dictionary: Float64Array[] = Array.from({ length: nComponents }, () => {
    const atom = new Float64Array(nFeatures);
    for (let j = 0; j < nFeatures; j++) atom[j] = randn();
    let norm = 0;
    for (let j = 0; j < nFeatures; j++) norm += (atom[j] ?? 0) ** 2;
    norm = Math.sqrt(norm);
    if (norm > 0) for (let j = 0; j < nFeatures; j++) atom[j] = (atom[j] ?? 0) / norm;
    return atom;
  });

  // Sparse codes (nSamples x nComponents)
  const code: Float64Array[] = Array.from({ length: nSamples }, () => {
    const row = new Float64Array(nComponents);
    // Sample positions without replacement; all positions are chosen before
    // any coefficient value is drawn.
    const positions: number[] = [];
    const available = Array.from({ length: nComponents }, (_, i) => i);
    for (let k = 0; k < nNonzeroCoefs && available.length > 0; k++) {
      // rand() may be exactly 1.0, which would index one past the end.
      const idx = Math.min(Math.floor(rand() * available.length), available.length - 1);
      positions.push(available[idx]!);
      available.splice(idx, 1);
    }
    for (const pos of positions) {
      row[pos] = randn();
    }
    return row;
  });

  // X = code @ dictionary
  const X: Float64Array[] = Array.from({ length: nSamples }, (_, i) => {
    const row = new Float64Array(nFeatures);
    for (let j = 0; j < nFeatures; j++) {
      let val = 0;
      for (let k = 0; k < nComponents; k++) {
        val += (code[i]![k] ?? 0) * (dictionary[k]![j] ?? 0);
      }
      row[j] = val;
    }
    return row;
  });

  return { X, dictionary, code };
}

export interface BiclustersOptions {
  shape?: [number, number];
  nClusters?: number;
  noise?: number;
  minsize?: number;
  randomState?: number | null;
}

export interface BiclustersResult {
  X: Float64Array[];
  rows: boolean[][];
  columns: boolean[][];
}

/**
 * Generate a 2D array with planted biclusters.
 * Useful for testing biclustering algorithms.
 *
 * Rows and columns are independently assigned to one of `nClusters` clusters.
 * A cell is 1 when its row and column share a cluster and 0 otherwise, plus
 * optional uniform noise `(rand() − 0.5) · noise`. `options.minsize` is
 * accepted but not used.
 *
 * @returns the matrix and, per cluster, boolean row/column membership masks
 */
export function makeBiclusters(options: BiclustersOptions = {}): BiclustersResult {
  const [nRows, nCols] = options.shape ?? [100, 100];
  const nClusters = options.nClusters ?? 5;
  const noise = options.noise ?? 0.0;

  let seed = options.randomState ?? 0;
  function rand(): number {
    seed = (1664525 * seed + 1013904223) & 0x7fffffff;
    return seed / 0x7fffffff;
  }
  // rand() may be exactly 1.0; keep the cluster id inside [0, nClusters).
  const randomCluster = (): number =>
    Math.min(Math.floor(rand() * nClusters), Math.max(nClusters - 1, 0));

  // Assign rows and columns to clusters
  const rowAssignments = new Int32Array(nRows);
  const colAssignments = new Int32Array(nCols);
  for (let i = 0; i < nRows; i++) rowAssignments[i] = randomCluster();
  for (let j = 0; j < nCols; j++) colAssignments[j] = randomCluster();

  const X: Float64Array[] = Array.from({ length: nRows }, (_, i) => {
    const row = new Float64Array(nCols);
    for (let j = 0; j < nCols; j++) {
      const sameBicluster = (rowAssignments[i] ?? 0) === (colAssignments[j] ?? 0) ? 1 : 0;
      const noiseVal = noise > 0 ? (rand() - 0.5) * noise : 0;
      row[j] = sameBicluster + noiseVal;
    }
    return row;
  });

  // Build membership arrays
  const rows: boolean[][] = Array.from({ length: nClusters }, (_, c) =>
    Array.from({ length: nRows }, (__, i) => (rowAssignments[i] ?? 0) === c)
  );
  const columns: boolean[][] = Array.from({ length: nClusters }, (_, c) =>
    Array.from({ length: nCols }, (__, j) => (colAssignments[j] ?? 0) === c)
  );

  return { X, rows, columns };
}

export interface CheckerboardOptions {
  shape?: [number, number];
  nClusters?: [number, number];
  noise?: number;
  randomState?: number | null;
}

export interface CheckerboardResult {
  X: Float64Array[];
  rows: boolean[][];
  columns: boolean[][];
}

/**
 * Generate a checkerboard pattern dataset for testing biclustering.
 *
 * Rows and columns are cut into contiguous bands of `ceil(size / nClusters)`.
 * A cell is 1 when (row band + column band) is even and 0 otherwise, plus
 * optional uniform noise `(rand() − 0.5) · noise`.
 *
 * @returns the matrix and, per band, boolean row/column membership masks
 */
export function makeCheckerboard(options: CheckerboardOptions = {}): CheckerboardResult {
  const [nRows, nCols] = options.shape ?? [100, 100];
  const [nRowClusters, nColClusters] = options.nClusters ?? [4, 4];
  const noise = options.noise ?? 0.0;

  let seed = options.randomState ?? 0;
  function rand(): number {
    seed = (1664525 * seed + 1013904223) & 0x7fffffff;
    return seed / 0x7fffffff;
  }

  const rowBand = (i: number): number => Math.floor(i / Math.ceil(nRows / nRowClusters));
  const colBand = (j: number): number => Math.floor(j / Math.ceil(nCols / nColClusters));

  const X: Float64Array[] = Array.from({ length: nRows }, (_, i) => {
    const row = new Float64Array(nCols);
    const rowCluster = rowBand(i);
    for (let j = 0; j < nCols; j++) {
      const val = ((rowCluster + colBand(j)) % 2 === 0) ? 1 : 0;
      const noiseVal = noise > 0 ? (rand() - 0.5) * noise : 0;
      row[j] = val + noiseVal;
    }
    return row;
  });

  const rows: boolean[][] = Array.from({ length: nRowClusters }, (_, rc) =>
    Array.from({ length: nRows }, (__, i) => rowBand(i) === rc)
  );
  const columns: boolean[][] = Array.from({ length: nColClusters }, (_, cc) =>
    Array.from({ length: nCols }, (__, j) => colBand(j) === cc)
  );

  return { X, rows, columns };
}

// ---- patch: src/datasets/index.ts (new file, index 0000000..a559672) ----
export * from "./make_datasets.js";
export * from "./load_datasets.js";
export * from "./svmlight.js";
export * from "./openml.js";
export * from "./samples_generator.js";
export * from "./rcv1.js";
export * from "./real_datasets.js";
export * from "./digits.js";
export * from "./newsgroups.js";
export * from "./generator_ext.js";
export * from "./fetch_datasets.js";

// ---- patch: src/datasets/kddcup.ts (new file, index 0000000..afeaea0) ----

/**
 * KDD Cup datasets: synthetic versions of network intrusion data.
 */
/**
 * KDD Cup datasets: synthetic versions of network intrusion data.
 */

export interface KDDCupDataset {
  data: Float64Array[];
  target: Int32Array;
  featureNames: string[];
  targetNames: string[];
  nSamples: number;
  nFeatures: number;
  description: string;
}

/** Column names of the 41 connection features, in column order. */
export const KDD_FEATURE_NAMES = [
  "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
  "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
  "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations",
  "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login",
  "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
  "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count",
  "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
  "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
  "dst_host_srv_serror_rate", "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
] as const;

/** Class names; a target value is an index into this list. */
export const KDD_TARGET_NAMES = ["normal", "dos", "probe", "r2l", "u2r"] as const;

/**
 * Generate a synthetic KDD Cup 1999-style dataset.
 *
 * Every row draws a class uniformly, fills all features with
 * `rng() * 100 + 5 · class`, then overwrites a few class-specific columns
 * (e.g. large `src_bytes`/`count` for "dos", `su_attempted = root_shell = 1`
 * for "u2r").
 *
 * @param nSamples number of rows (default 500)
 * @param seed     LCG seed (default 42)
 */
export function makeKDDCupSynthetic(nSamples = 500, seed = 42): KDDCupDataset {
  const rng = seededRng(seed);
  const nFeatures = KDD_FEATURE_NAMES.length;
  const nClasses = KDD_TARGET_NAMES.length;
  const data: Float64Array[] = [];
  const target: number[] = [];

  for (let i = 0; i < nSamples; i++) {
    // seededRng() is inclusive of 1.0; without the clamp a row could get
    // class 5, which has no name and no pattern below.
    const cls = Math.min(Math.floor(rng() * nClasses), nClasses - 1);
    const x = new Float64Array(nFeatures);
    // Generate class-specific features
    for (let f = 0; f < nFeatures; f++) {
      x[f] = rng() * 100 + cls * 5;
    }
    // Specific feature patterns per class
    switch (cls) {
      case 0: // normal
        x[0] = rng() * 10; // short duration
        x[5] = rng() * 1000; // some dst_bytes
        break;
      case 1: // dos
        x[4] = rng() * 10000 + 5000; // high src_bytes
        x[22] = rng() * 200 + 100; // high count
        break;
      case 2: // probe
        x[22] = rng() * 100; // count
        x[24] = rng(); // serror_rate
        break;
      case 3: // r2l
        x[11] = 0; // not logged in
        x[9] = rng() * 5; // low hot
        break;
      case 4: // u2r
        x[14] = 1; // su_attempted
        x[13] = 1; // root_shell
        break;
    }
    data.push(x);
    target.push(cls);
  }

  return {
    data,
    target: new Int32Array(target),
    featureNames: [...KDD_FEATURE_NAMES],
    targetNames: [...KDD_TARGET_NAMES],
    nSamples,
    nFeatures,
    description: "Synthetic KDD Cup 1999 network intrusion dataset. Each row is a network connection with class labels: normal, dos, probe, r2l, u2r.",
  };
}

/** 32-bit LCG; returns a generator of floats in [0, 1] (upper bound inclusive). */
function seededRng(seed: number): () => number {
  let s = seed;
  return () => {
    s = (s * 1664525 + 1013904223) & 0xffffffff;
    return (s >>> 0) / 0xffffffff;
  };
}

/**
 * Convenience wrapper around {@link makeKDDCupSynthetic}.
 * The requested size is capped at 10,000 rows, so the returned `nSamples`
 * may be smaller than the argument.
 *
 * @param nSamples requested number of rows (default 494021)
 * @param seed     LCG seed (default 42)
 */
export function loadKDDCup99(nSamples = 494021, seed = 42): KDDCupDataset {
  return makeKDDCupSynthetic(Math.min(nSamples, 10000), seed);
}

// ---- patch: src/datasets/load_datasets.ts (new file, index 0000000..49a77c0) ----

/**
 * Built-in datasets loader.
 * Mirrors sklearn.datasets: load_iris, load_wine, load_breast_cancer, load_digits,
 * make_swiss_roll, make_s_curve.
 */
/**
 * Built-in datasets loader.
 * Mirrors sklearn.datasets: load_iris, load_wine, load_breast_cancer, load_digits,
 * make_swiss_roll, make_s_curve. The load* functions below synthesise data from
 * per-class means; they do not contain the real measurements.
 */

export interface Dataset {
  data: Float64Array[];
  target: Int32Array;
  featureNames: string[];
  targetNames: string[];
  nSamples: number;
  nFeatures: number;
}

export interface RegressionDataset {
  data: Float64Array[];
  target: Float64Array;
  featureNames: string[];
  nSamples: number;
  nFeatures: number;
}

/** 32-bit LCG; returns a generator of floats in [0, 1). */
function seededRng(seed: number): () => number {
  let s = seed;
  return () => {
    s = (s * 1664525 + 1013904223) & 0xffffffff;
    return ((s >>> 0) / 4294967296);
  };
}

/**
 * One Box-Muller draw scaled by `scale`. Consumes exactly two values from
 * `rng`; the first is floored at 1e-10 so the logarithm stays finite.
 */
function gaussianNoise(rng: () => number, scale: number): number {
  const u1 = Math.max(rng(), 1e-10);
  const u2 = rng();
  return scale * Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
}

/**
 * Iris-like dataset: 150 samples, 4 features, 3 classes (50 each).
 * Values are sampled from per-class Gaussians (fixed seed 42) using the
 * hard-coded means and standard deviations below — they are synthetic, not
 * the original Fisher measurements.
 */
export function loadIris(): Dataset {
  const rng = seededRng(42);
  const nSamples = 150;
  const means = [
    [5.006, 3.428, 1.462, 0.246],
    [5.936, 2.77, 4.26, 1.326],
    [6.588, 2.974, 5.552, 2.026],
  ];
  const stds = [
    [0.352, 0.379, 0.174, 0.105],
    [0.516, 0.314, 0.470, 0.198],
    [0.636, 0.322, 0.552, 0.275],
  ];

  const data: Float64Array[] = [];
  const target: number[] = [];

  for (let cls = 0; cls < 3; cls++) {
    for (let i = 0; i < 50; i++) {
      const row = new Float64Array(4);
      for (let j = 0; j < 4; j++) {
        // Box-Muller (this function offsets u1 instead of flooring it).
        const u1 = rng();
        const u2 = rng();
        const z = Math.sqrt(-2 * Math.log(u1 + 1e-10)) * Math.cos(2 * Math.PI * u2);
        row[j] = (means[cls]![j] ?? 0) + (stds[cls]![j] ?? 1) * z;
      }
      data.push(row);
      target.push(cls);
    }
  }

  return {
    data,
    target: new Int32Array(target),
    featureNames: [
      "sepal length (cm)",
      "sepal width (cm)",
      "petal length (cm)",
      "petal width (cm)",
    ],
    targetNames: ["setosa", "versicolor", "virginica"],
    nSamples,
    nFeatures: 4,
  };
}

/**
 * Wine-like dataset: 178 samples (59 / 71 / 48 per class), 13 features.
 * Each value is `mean · (1 + 0.15 · z)` with `z` standard normal, fixed seed 123.
 */
export function loadWine(): Dataset {
  const rng = seededRng(123);
  const nSamples = 178;
  const nFeatures = 13;
  const data: Float64Array[] = [];
  const target: number[] = [];

  const classSizes = [59, 71, 48];
  const classMeans = [
    [13.74, 2.01, 2.46, 17.0, 106.3, 2.84, 2.98, 0.29, 1.90, 5.53, 1.05, 3.33, 1115.7],
    [12.28, 1.93, 2.24, 20.2, 94.5, 2.26, 2.08, 0.36, 1.47, 5.09, 0.99, 2.85, 519.5],
    [13.15, 3.33, 2.44, 21.2, 99.3, 1.69, 0.78, 0.45, 1.15, 7.40, 0.68, 1.72, 629.9],
  ];

  for (let cls = 0; cls < 3; cls++) {
    for (let i = 0; i < (classSizes[cls] ?? 50); i++) {
      const row = new Float64Array(nFeatures);
      for (let j = 0; j < nFeatures; j++) {
        const z = gaussianNoise(rng, 1);
        row[j] = (classMeans[cls]![j] ?? 0) * (1 + 0.15 * z);
      }
      data.push(row);
      target.push(cls);
    }
  }

  const featureNames = [
    "alcohol", "malic_acid", "ash", "alcalinity_of_ash", "magnesium",
    "total_phenols", "flavanoids", "nonflavanoid_phenols", "proanthocyanins",
    "color_intensity", "hue", "od280/od315_of_diluted_wines", "proline",
  ];

  return {
    data,
    target: new Int32Array(target),
    featureNames,
    targetNames: ["class_0", "class_1", "class_2"],
    nSamples,
    nFeatures,
  };
}

/**
 * Breast-cancer-like dataset: 569 samples (212 malignant, 357 benign),
 * 30 features. Each value is `max(0, mean · (1 + 0.2 · z))` with `z`
 * standard normal, fixed seed 456.
 */
export function loadBreastCancer(): Dataset {
  const rng = seededRng(456);
  const nSamples = 569;
  const nFeatures = 30;
  const data: Float64Array[] = [];
  const target: number[] = [];

  // 0=malignant (212), 1=benign (357)
  const classSizes = [212, 357];
  const classMeans = [
    [17.46, 21.60, 115.4, 978.4, 0.103, 0.145, 0.161, 0.088, 0.192, 0.063,
     0.609, 1.210, 4.324, 72.67, 0.007, 0.032, 0.042, 0.015, 0.020, 0.004,
     21.13, 29.32, 141.4, 1422.3, 0.145, 0.374, 0.455, 0.182, 0.324, 0.091],
    [12.15, 17.92, 78.1, 462.8, 0.092, 0.080, 0.046, 0.025, 0.174, 0.062,
     0.284, 1.220, 2.001, 20.01, 0.007, 0.013, 0.014, 0.006, 0.021, 0.004,
     13.38, 23.52, 87.0, 558.9, 0.124, 0.182, 0.167, 0.074, 0.271, 0.079],
  ];

  for (let cls = 0; cls < 2; cls++) {
    for (let i = 0; i < (classSizes[cls] ?? 100); i++) {
      const row = new Float64Array(nFeatures);
      for (let j = 0; j < nFeatures; j++) {
        const z = gaussianNoise(rng, 1);
        row[j] = Math.max(0, (classMeans[cls]![j] ?? 0) * (1 + 0.2 * z));
      }
      data.push(row);
      target.push(cls);
    }
  }

  const featureNames = [
    "mean radius", "mean texture", "mean perimeter", "mean area",
    "mean smoothness", "mean compactness", "mean concavity",
    "mean concave points", "mean symmetry", "mean fractal dimension",
    "radius error", "texture error", "perimeter error", "area error",
    "smoothness error", "compactness error", "concavity error",
    "concave points error", "symmetry error", "fractal dimension error",
    "worst radius", "worst texture", "worst perimeter", "worst area",
    "worst smoothness", "worst compactness", "worst concavity",
    "worst concave points", "worst symmetry", "worst fractal dimension",
  ];

  return {
    data,
    target: new Int32Array(target),
    featureNames,
    targetNames: ["malignant", "benign"],
    nSamples,
    nFeatures,
  };
}

export interface SwissRollResult {
  X: Float64Array[];
  t: Float64Array;
}

/**
 * Generate a Swiss-roll point cloud.
 *
 * @param nSamples    number of points (default 100)
 * @param noise       standard deviation of Gaussian noise added to every
 *                    coordinate; no random numbers are consumed for it when 0
 * @param randomState seed (default 42)
 * @returns `X` rows of [t·cos t, height, t·sin t] and the roll parameter `t`
 */
export function makeSwissRoll(
  nSamples: number = 100,
  noise: number = 0.0,
  randomState?: number,
): SwissRollResult {
  const rng = seededRng(randomState ?? 42);

  const t = new Float64Array(nSamples);
  const X: Float64Array[] = [];

  for (let i = 0; i < nSamples; i++) {
    const ti = 1.5 * Math.PI * (1 + 2 * rng());
    const height = 21 * rng();
    t[i] = ti;

    const nx = noise > 0 ? gaussianNoise(rng, noise) : 0;
    const ny = noise > 0 ? gaussianNoise(rng, noise) : 0;
    const nz = noise > 0 ? gaussianNoise(rng, noise) : 0;

    X.push(
      new Float64Array([
        ti * Math.cos(ti) + nx,
        height + ny,
        ti * Math.sin(ti) + nz,
      ]),
    );
  }

  return { X, t };
}

export interface SCurveResult {
  X: Float64Array[];
  t: Float64Array;
}

/**
 * Generate an S-curve point cloud.
 *
 * The coordinates follow sklearn's `make_s_curve`: x = sin t, y = height in
 * [0, 2), z = sign(t)·(cos t − 1). The previous version added the height to
 * the curve coordinate and used |cos t| for z, which did not trace an S.
 *
 * @param nSamples    number of points (default 100)
 * @param noise       standard deviation of Gaussian noise added to every
 *                    coordinate; no random numbers are consumed for it when 0
 * @param randomState seed (default 42)
 * @returns `X` rows of [x, y, z] and the curve parameter `t`
 */
export function makeScurve(
  nSamples: number = 100,
  noise: number = 0.0,
  randomState?: number,
): SCurveResult {
  const rng = seededRng(randomState ?? 42);
  const X: Float64Array[] = [];
  const t = new Float64Array(nSamples);

  for (let i = 0; i < nSamples; i++) {
    const ti = 3 * Math.PI * (rng() - 0.5);
    const height = 2 * rng();
    t[i] = ti;

    const nx = noise > 0 ? gaussianNoise(rng, noise) : 0;
    const ny = noise > 0 ? gaussianNoise(rng, noise) : 0;
    const nz = noise > 0 ? gaussianNoise(rng, noise) : 0;

    X.push(
      new Float64Array([
        Math.sin(ti) + nx,
        height + ny,
        Math.sign(ti) * (Math.cos(ti) - 1) + nz,
      ]),
    );
  }

  return { X, t };
}

// ---- patch: src/datasets/make_datasets.ts (new file, index 0000000..e0241df) ----

/**
 * Synthetic dataset generators.
 * Mirrors sklearn.datasets: make_classification, make_regression, make_blobs,
 * make_moons, make_circles.
 */

export interface DatasetResult {
  X: Float64Array[];
  y: Float64Array;
}

/** Gaussian random sample. */
*/ +function randn(): number { + let u = 0; + let v = 0; + while (u === 0) u = Math.random(); + while (v === 0) v = Math.random(); + return Math.sqrt(-2.0 * Math.log(u)) * Math.cos(2.0 * Math.PI * v); +} + +/** Shuffle arrays in place using Fisher-Yates. */ +function shuffle(arr: T[]): T[] { + for (let i = arr.length - 1; i > 0; i--) { + const j = Math.floor(Math.random() * (i + 1)); + const tmp = arr[i] as T; + arr[i] = arr[j] as T; + arr[j] = tmp; + } + return arr; +} + +export function makeClassification( + options: { + nSamples?: number; + nFeatures?: number; + nClasses?: number; + nInformative?: number; + nRedundant?: number; + noise?: number; + randomState?: number; + } = {}, +): DatasetResult { + const nSamples = options.nSamples ?? 100; + const nFeatures = options.nFeatures ?? 20; + const nClasses = options.nClasses ?? 2; + const nInformative = Math.min(options.nInformative ?? 2, nFeatures); + const noise = options.noise ?? 0.0; + + const X: Float64Array[] = Array.from({ length: nSamples }, () => new Float64Array(nFeatures)); + const y = new Float64Array(nSamples); + + // Cluster centers for each class + const centers: Float64Array[] = Array.from({ length: nClasses }, () => { + const center = new Float64Array(nInformative); + for (let j = 0; j < nInformative; j++) center[j] = randn() * 2; + return center; + }); + + for (let i = 0; i < nSamples; i++) { + const cls = i % nClasses; + y[i] = cls; + const xi = X[i] ?? new Float64Array(nFeatures); + const center = centers[cls] ?? new Float64Array(nInformative); + + for (let j = 0; j < nInformative; j++) { + xi[j] = (center[j] ?? 0) + randn() * 0.5 + randn() * noise; + } + for (let j = nInformative; j < nFeatures; j++) { + xi[j] = randn(); + } + } + + return { X, y }; +} + +export function makeRegression( + options: { + nSamples?: number; + nFeatures?: number; + nInformative?: number; + noise?: number; + bias?: number; + } = {}, +): DatasetResult & { coef: Float64Array } { + const nSamples = options.nSamples ?? 
100; + const nFeatures = options.nFeatures ?? 100; + const nInformative = Math.min(options.nInformative ?? 10, nFeatures); + const noise = options.noise ?? 0.0; + const bias = options.bias ?? 0.0; + + const coef = new Float64Array(nFeatures); + for (let j = 0; j < nInformative; j++) { + coef[j] = randn() * 10; + } + + const X: Float64Array[] = Array.from({ length: nSamples }, () => { + const xi = new Float64Array(nFeatures); + for (let j = 0; j < nFeatures; j++) xi[j] = randn(); + return xi; + }); + + const y = new Float64Array(nSamples); + for (let i = 0; i < nSamples; i++) { + let yi = bias; + const xi = X[i] ?? new Float64Array(nFeatures); + for (let j = 0; j < nFeatures; j++) { + yi += (xi[j] ?? 0) * (coef[j] ?? 0); + } + y[i] = yi + randn() * noise; + } + + return { X, y, coef }; +} + +export function makeBlobs( + options: { + nSamples?: number; + nFeatures?: number; + centers?: number | Float64Array[]; + clusterStd?: number; + } = {}, +): DatasetResult { + const nSamples = options.nSamples ?? 100; + const nFeatures = options.nFeatures ?? 2; + const clusterStd = options.clusterStd ?? 1.0; + + let centers: Float64Array[]; + if (typeof options.centers === "number" || options.centers === undefined) { + const k = typeof options.centers === "number" ? options.centers : 3; + centers = Array.from({ length: k }, () => { + const c = new Float64Array(nFeatures); + for (let j = 0; j < nFeatures; j++) c[j] = (Math.random() - 0.5) * 20; + return c; + }); + } else { + centers = options.centers; + } + + const k = centers.length; + const X: Float64Array[] = []; + const y: number[] = []; + + for (let i = 0; i < nSamples; i++) { + const cls = i % k; + const center = centers[cls] ?? new Float64Array(nFeatures); + const xi = new Float64Array(nFeatures); + for (let j = 0; j < nFeatures; j++) { + xi[j] = (center[j] ?? 
0) + randn() * clusterStd; + } + X.push(xi); + y.push(cls); + } + + const order = shuffle(Array.from({ length: nSamples }, (_, i) => i)); + return { + X: order.map((i) => X[i] ?? new Float64Array(nFeatures)), + y: new Float64Array(order.map((i) => y[i] ?? 0)), + }; +} + +export function makeMoons( + options: { nSamples?: number; noise?: number } = {}, +): DatasetResult { + const nSamples = options.nSamples ?? 100; + const noise = options.noise ?? 0.0; + const half = Math.floor(nSamples / 2); + + const X: Float64Array[] = []; + const y: number[] = []; + + for (let i = 0; i < half; i++) { + const angle = (Math.PI * i) / half; + X.push(new Float64Array([Math.cos(angle) + randn() * noise, Math.sin(angle) + randn() * noise])); + y.push(0); + } + for (let i = 0; i < nSamples - half; i++) { + const angle = (Math.PI * i) / (nSamples - half); + X.push(new Float64Array([1 - Math.cos(angle) + randn() * noise, 1 - Math.sin(angle) - 0.5 + randn() * noise])); + y.push(1); + } + + const order = shuffle(Array.from({ length: nSamples }, (_, i) => i)); + return { + X: order.map((i) => X[i] ?? new Float64Array(2)), + y: new Float64Array(order.map((i) => y[i] ?? 0)), + }; +} + +export function makeCircles( + options: { nSamples?: number; noise?: number; factor?: number } = {}, +): DatasetResult { + const nSamples = options.nSamples ?? 100; + const noise = options.noise ?? 0.0; + const factor = options.factor ?? 
0.8; + const half = Math.floor(nSamples / 2); + + const X: Float64Array[] = []; + const y: number[] = []; + + for (let i = 0; i < half; i++) { + const angle = (2 * Math.PI * i) / half; + X.push(new Float64Array([Math.cos(angle) + randn() * noise, Math.sin(angle) + randn() * noise])); + y.push(0); + } + for (let i = 0; i < nSamples - half; i++) { + const angle = (2 * Math.PI * i) / (nSamples - half); + X.push(new Float64Array([factor * Math.cos(angle) + randn() * noise, factor * Math.sin(angle) + randn() * noise])); + y.push(1); + } + + const order = shuffle(Array.from({ length: nSamples }, (_, i) => i)); + return { + X: order.map((i) => X[i] ?? new Float64Array(2)), + y: new Float64Array(order.map((i) => y[i] ?? 0)), + }; +} diff --git a/src/datasets/newsgroups.ts b/src/datasets/newsgroups.ts new file mode 100644 index 0000000..0340927 --- /dev/null +++ b/src/datasets/newsgroups.ts @@ -0,0 +1,121 @@ +/** + * Fetch 20 Newsgroups text dataset (simulated/stub). + * Mirrors sklearn.datasets.fetch_20newsgroups and fetch_20newsgroups_vectorized. + */ + +/** Available 20 newsgroups target names. */ +export const NEWSGROUPS_CATEGORIES: string[] = [ + "alt.atheism", + "comp.graphics", + "comp.os.ms-windows.misc", + "comp.sys.ibm.pc.hardware", + "comp.sys.mac.hardware", + "comp.windows.x", + "misc.forsale", + "rec.autos", + "rec.motorcycles", + "rec.sport.baseball", + "rec.sport.hockey", + "sci.crypt", + "sci.electronics", + "sci.med", + "sci.space", + "soc.religion.christian", + "talk.politics.guns", + "talk.politics.mideast", + "talk.politics.misc", + "talk.religion.misc", +]; + +export interface NewsgroupsDataset { + data: string[]; + target: Int32Array; + targetNames: string[]; + description: string; + filenames: string[]; +} + +/** + * Simulate fetching 20 Newsgroups text dataset. + * In the browser/Node environment this returns synthetic examples. + * Mirrors sklearn.datasets.fetch_20newsgroups. 
+ */ +export function fetch20Newsgroups(options: { + subset?: "train" | "test" | "all"; + categories?: string[]; + shuffle?: boolean; + randomState?: number; + removeHeaders?: boolean; + removeFooters?: boolean; + removeQuotes?: boolean; + nSamples?: number; +} = {}): NewsgroupsDataset { + const categories = options.categories ?? NEWSGROUPS_CATEGORIES; + const nSamples = options.nSamples ?? categories.length * 5; + const subset = options.subset ?? "train"; + + const targetNames = categories.filter(c => NEWSGROUPS_CATEGORIES.includes(c)); + const data: string[] = []; + const targetArr: number[] = []; + const filenames: string[] = []; + + const rng = mulberry32((options.randomState ?? 42) + (subset === "test" ? 1000 : 0)); + + for (let i = 0; i < nSamples; i++) { + const catIdx = Math.floor(rng() * targetNames.length); + const catName = targetNames[catIdx] ?? "misc.forsale"; + data.push(syntheticPost(catName, i, rng)); + targetArr.push(catIdx); + filenames.push(`${catName}/${1000 + i}`); + } + + if (options.shuffle ?? false) { + const order = Array.from({ length: nSamples }, (_, i) => i).sort( + () => rng() - 0.5, + ); + const shuffledData = order.map(i => data[i]!); + const shuffledTarget = order.map(i => targetArr[i] ?? 
0); + const shuffledFiles = order.map(i => filenames[i]!); + return { + data: shuffledData, + target: new Int32Array(shuffledTarget), + targetNames, + description: "20 Newsgroups text dataset (synthetic stub)", + filenames: shuffledFiles, + }; + } + + return { + data, + target: new Int32Array(targetArr), + targetNames, + description: "20 Newsgroups text dataset (synthetic stub)", + filenames, + }; +} + +function mulberry32(seed: number): () => number { + let s = seed | 0; + return () => { + s = (s + 0x6d2b79f5) | 0; + let t = Math.imul(s ^ (s >>> 15), 1 | s); + t ^= t + Math.imul(t ^ (t >>> 7), 61 | t); + return ((t ^ (t >>> 14)) >>> 0) / 4294967296; + }; +} + +const categoryWords: Record = { + "comp.graphics": ["pixel", "image", "render", "texture", "OpenGL", "3D", "graphics", "polygon"], + "rec.sport.baseball": ["pitcher", "batter", "home run", "inning", "MLB", "baseball", "score"], + "rec.sport.hockey": ["puck", "goal", "NHL", "skate", "hockey", "ice", "player", "team"], + "sci.space": ["orbit", "NASA", "rocket", "satellite", "planet", "launch", "mission", "moon"], + "sci.med": ["drug", "patient", "doctor", "treatment", "clinical", "disease", "medicine"], + "sci.crypt": ["encryption", "RSA", "key", "cipher", "algorithm", "cryptography", "secure"], + "talk.politics.guns": ["gun", "NRA", "Second Amendment", "firearm", "rights", "ban", "crime"], +}; + +function syntheticPost(category: string, seed: number, rng: () => number): string { + const words = categoryWords[category] ?? ["news", "article", "post", "discussion"]; + const selected = Array.from({ length: 5 }, () => words[Math.floor(rng() * words.length)] ?? 
"news"); + return `From: user${seed}@example.com\nSubject: Re: ${selected[0]}\n\n${selected.join(" ")} is an interesting topic in ${category}.\nSee related post #${Math.floor(rng() * 10000)}.`; +} diff --git a/src/datasets/openml.ts b/src/datasets/openml.ts new file mode 100644 index 0000000..e8fe23b --- /dev/null +++ b/src/datasets/openml.ts @@ -0,0 +1,210 @@ +/** + * OpenML dataset utilities. + * Mirrors sklearn.datasets.fetch_openml. + */ + +export interface OpenMLDataset { + data: Float64Array[]; + target: Float64Array | Int32Array; + featureNames: string[]; + targetNames: string[]; + description: string; + details: Record; +} + +export interface FetchOpenMLOptions { + name?: string; + version?: number | "active"; + dataId?: number; + dataHome?: string; + targetColumn?: string | string[] | null; + cacheDir?: string; + returnX_y?: boolean; + asFrame?: boolean; + nRetries?: number; + delay?: number; + parser?: "auto" | "pandas" | "liac-arff"; +} + +const OPENML_BASE_URL = "https://api.openml.org/api/v1/json"; + +/** + * Fetch a dataset from OpenML by name or ID. + * Returns structured data suitable for machine learning. 
+ */ +export async function fetchOpenML( + options: FetchOpenMLOptions +): Promise { + const { name, version = "active", dataId } = options; + + let url: string; + if (dataId != null) { + url = `${OPENML_BASE_URL}/data/${dataId}`; + } else if (name != null) { + url = `${OPENML_BASE_URL}/data/list/data_name/${encodeURIComponent(name)}/status/active/limit/1`; + } else { + throw new Error("fetchOpenML: must specify name or dataId"); + } + + let response: Response; + try { + response = await fetch(url); + } catch (e) { + throw new Error(`fetchOpenML: network error β€” ${String(e)}`); + } + + if (!response.ok) { + throw new Error(`fetchOpenML: HTTP ${response.status} for ${url}`); + } + + const json = (await response.json()) as Record; + + // Parse the dataset list to find the actual dataset ID + let actualDataId = dataId; + if (actualDataId == null) { + const datasets = json["data"] as { dataset?: { did?: number }[] } | undefined; + const did = datasets?.dataset?.[0]?.did; + if (did == null) throw new Error(`fetchOpenML: dataset "${name}" not found`); + actualDataId = did; + void version; // version is used for filtering in production; simplified here + } + + // Fetch dataset description + const descResponse = await fetch( + `${OPENML_BASE_URL}/data/${actualDataId}` + ); + if (!descResponse.ok) { + throw new Error(`fetchOpenML: HTTP ${descResponse.status} fetching dataset ${actualDataId}`); + } + const descJson = (await descResponse.json()) as { + data_set_description?: { + name?: string; + description?: string; + url?: string; + row_id_attribute?: string; + ignore_attribute?: string | string[]; + default_target_attribute?: string; + feature?: Array<{ name: string; data_type: string }>; + }; + }; + + const desc = descJson.data_set_description ?? {}; + const description = desc.description ?? ""; + const targetCol = + options.targetColumn ?? desc.default_target_attribute ?? 
"class"; + + // Fetch the actual data file + const dataUrl = desc.url; + if (!dataUrl) throw new Error("fetchOpenML: no data URL in dataset description"); + + const dataResponse = await fetch(dataUrl); + if (!dataResponse.ok) { + throw new Error(`fetchOpenML: HTTP ${dataResponse.status} fetching data file`); + } + const text = await dataResponse.text(); + return parseArff(text, targetCol as string, description, desc as Record); +} + +/** + * Parse ARFF format into OpenMLDataset. + */ +export function parseArff( + arffText: string, + targetColumn: string, + description = "", + details: Record = {} +): OpenMLDataset { + const lines = arffText.split(/\r?\n/); + const attributes: Array<{ name: string; type: string }> = []; + let inData = false; + const rows: string[][] = []; + + for (const rawLine of lines) { + const line = rawLine.trim(); + if (line.startsWith("%") || line === "") continue; + if (line.toLowerCase().startsWith("@attribute")) { + const match = line.match(/@attribute\s+['"]?([^'"]+?)['"]?\s+(.*)/i); + if (match) { + attributes.push({ name: match[1]!.trim(), type: match[2]!.trim() }); + } + } else if (line.toLowerCase().startsWith("@data")) { + inData = true; + } else if (inData) { + rows.push(line.split(",").map((s) => s.trim())); + } + } + + const targetIdx = attributes.findIndex( + (a) => a.name.toLowerCase() === targetColumn.toLowerCase() + ); + const featureIdxs = attributes + .map((_, i) => i) + .filter((i) => i !== targetIdx); + + const featureNames = featureIdxs.map((i) => attributes[i]?.name ?? `f${i}`); + const data: Float64Array[] = rows.map((row) => + new Float64Array(featureIdxs.map((i) => Number.parseFloat(row[i] ?? "0") || 0)) + ); + + const targetAttr = targetIdx >= 0 ? attributes[targetIdx] : null; + const targetType = targetAttr?.type ?? 
"NUMERIC"; + let target: Float64Array | Int32Array; + + if ( + targetType.toUpperCase().startsWith("NUMERIC") || + targetType.toUpperCase().startsWith("REAL") || + targetType.toUpperCase().startsWith("INTEGER") + ) { + target = new Float64Array( + rows.map((row) => Number.parseFloat(row[targetIdx] ?? "0") || 0) + ); + } else { + // Nominal β€” encode as integers + const vals = new Set(rows.map((row) => row[targetIdx] ?? "")); + const valMap = new Map([...vals].map((v, i) => [v, i])); + target = new Int32Array( + rows.map((row) => valMap.get(row[targetIdx] ?? "") ?? 0) + ); + } + + return { + data, + target, + featureNames, + targetNames: targetAttr ? [targetAttr.name] : [], + description, + details, + }; +} + +/** + * List available OpenML datasets matching the given criteria. + */ +export async function listOpenMLDatasets(options: { + tag?: string; + limit?: number; + offset?: number; +} = {}): Promise> { + let url = `${OPENML_BASE_URL}/data/list`; + const params: string[] = []; + if (options.tag) params.push(`tag/${encodeURIComponent(options.tag)}`); + if (params.length > 0) url += "/" + params.join("/"); + + const response = await fetch(url); + if (!response.ok) throw new Error(`listOpenMLDatasets: HTTP ${response.status}`); + + const json = (await response.json()) as { + data?: { + dataset?: Array<{ did: number; name: string; version: number; status: string }>; + }; + }; + + return (json.data?.dataset ?? []) + .slice(0, options.limit ?? 100) + .map((d) => ({ + id: d.did, + name: d.name, + version: d.version, + status: d.status, + })); +} diff --git a/src/datasets/rcv1.ts b/src/datasets/rcv1.ts new file mode 100644 index 0000000..f75106d --- /dev/null +++ b/src/datasets/rcv1.ts @@ -0,0 +1,157 @@ +/** + * RCV1 dataset utilities and sparse text dataset helpers. + * Mirrors sklearn.datasets.rcv1 and related sparse dataset loaders. 
+ */ +import type { SparseMatrix } from "../utils/sparsefuncs.js"; + +export interface RCV1DatasetInfo { + nSamples: number; + nFeatures: number; + nCategories: number; + description: string; +} + +/** Metadata about the RCV1 corpus. */ +export const RCV1_INFO: RCV1DatasetInfo = { + nSamples: 804414, + nFeatures: 47236, + nCategories: 103, + description: + "RCV1 β€” Reuters Corpus Volume 1. A collection of 804,414 news articles " + + "annotated with 103 topic categories. Features are TF-IDF weighted bag-of-words.", +}; + +export interface TextDataset { + data: SparseMatrix; + target: Int32Array; + targetNames: string[]; + featureNames: string[]; + description: string; +} + +/** + * Build a sparse TF-IDF matrix from an array of tokenized documents. + * Each document is an array of term strings. + */ +export function buildTfIdf( + documents: string[][], + options: { maxFeatures?: number; sublinearTf?: boolean; smoothIdf?: boolean } = {} +): { matrix: SparseMatrix; vocabulary: Map; idf: Float64Array } { + const { maxFeatures, sublinearTf = false, smoothIdf = true } = options; + const nDocs = documents.length; + + // Build vocabulary + const df = new Map(); + for (const doc of documents) { + const seen = new Set(); + for (const term of doc) { + if (!seen.has(term)) { df.set(term, (df.get(term) ?? 0) + 1); seen.add(term); } + } + } + + // Sort by df descending, take top maxFeatures + let vocab = [...df.entries()].sort((a, b) => b[1] - a[1]); + if (maxFeatures !== undefined) vocab = vocab.slice(0, maxFeatures); + const termToIdx = new Map(vocab.map(([t], i) => [t, i])); + const nTerms = termToIdx.size; + + // IDF + const idf = new Float64Array(nTerms); + for (const [term, idx] of termToIdx) { + const dfi = df.get(term) ?? 0; + idf[idx] = Math.log(((smoothIdf ? 1 : 0) + nDocs) / ((smoothIdf ? 
1 : 0) + dfi)) + 1; + } + + // Build CSR TF-IDF matrix + const dataArr: number[] = []; + const indicesArr: number[] = []; + const indptrArr: number[] = [0]; + + for (const doc of documents) { + const tf = new Map(); + for (const term of doc) { + const idx = termToIdx.get(term); + if (idx !== undefined) tf.set(idx, (tf.get(idx) ?? 0) + 1); + } + const docLen = doc.length; + const entries = [...tf.entries()].sort((a, b) => a[0] - b[0]); + for (const [idx, count] of entries) { + const tfVal = sublinearTf ? 1 + Math.log(count) : count / docLen; + const val = tfVal * (idf[idx] ?? 0); + if (val !== 0) { dataArr.push(val); indicesArr.push(idx); } + } + indptrArr.push(dataArr.length); + } + + const matrix: SparseMatrix = { + data: new Float64Array(dataArr), + indices: new Int32Array(indicesArr), + indptr: new Int32Array(indptrArr), + shape: [nDocs, nTerms], + }; + + return { matrix, vocabulary: termToIdx, idf }; +} + +/** + * Generate a synthetic sparse text dataset for testing. + * Returns documents drawn from `nCategories` topics with `nFeatures` vocabulary. 
+ */ +export function makeSparseTextDataset(options: { + nSamples?: number; + nFeatures?: number; + nCategories?: number; + avgTermsPerDoc?: number; + randomState?: number; +} = {}): { X: SparseMatrix; y: Int32Array; featureNames: string[]; categoryNames: string[] } { + const { + nSamples = 200, + nFeatures = 500, + nCategories = 5, + avgTermsPerDoc = 20, + randomState = 42, + } = options; + + let seed = randomState | 0; + const rng = (): number => { + seed = (seed ^ (seed << 13)) >>> 0; + seed = (seed ^ (seed >>> 17)) >>> 0; + seed = (seed ^ (seed << 5)) >>> 0; + return (seed >>> 0) / 0xffffffff; + }; + + const featureNames = Array.from({ length: nFeatures }, (_, i) => `word_${i}`); + const categoryNames = Array.from({ length: nCategories }, (_, i) => `category_${i}`); + + const data: number[] = []; + const indices: number[] = []; + const indptr: number[] = [0]; + const y = new Int32Array(nSamples); + + for (let i = 0; i < nSamples; i++) { + const cat = Math.floor(rng() * nCategories); + y[i] = cat; + const nTerms = Math.max(1, Math.round(avgTermsPerDoc * (0.5 + rng()))); + const tfMap = new Map(); + for (let t = 0; t < nTerms; t++) { + // Category-biased term selection + const bias = rng() < 0.3 ? cat * Math.floor(nFeatures / nCategories) : 0; + const termIdx = (Math.floor(rng() * Math.floor(nFeatures / nCategories)) + bias) % nFeatures; + tfMap.set(termIdx, (tfMap.get(termIdx) ?? 
0) + 1); + } + const entries = [...tfMap.entries()].sort((a, b) => a[0] - b[0]); + for (const [idx, count] of entries) { + data.push(count); indices.push(idx); + } + indptr.push(data.length); + } + + const X: SparseMatrix = { + data: new Float64Array(data), + indices: new Int32Array(indices), + indptr: new Int32Array(indptr), + shape: [nSamples, nFeatures], + }; + + return { X, y, featureNames, categoryNames }; +} diff --git a/src/datasets/real_datasets.ts b/src/datasets/real_datasets.ts new file mode 100644 index 0000000..6cf4f44 --- /dev/null +++ b/src/datasets/real_datasets.ts @@ -0,0 +1,344 @@ +/** + * Real-world dataset generators and synthetic alternatives. + * Mirrors sklearn.datasets (california_housing, covtype, kddcup99, etc.) + */ + +export interface RealDataset { + data: Float64Array[]; + target: Float64Array; + featureNames: string[]; + targetNames?: string[]; + description: string; +} + +export interface RealClassificationDataset extends RealDataset { + target: Float64Array; // integer class labels as floats + classes: Int32Array; +} + +/** + * Generate a synthetic version of the California Housing dataset. + * The real dataset has 20,640 instances and 8 features. + * This generator produces a statistically similar synthetic dataset. 
+ * + * Features: MedInc, HouseAge, AveRooms, AveBedrms, Population, AveOccup, Latitude, Longitude + * Target: median house value (in $100k) + */ +export function makeCaliforniaHousing(options: { + nSamples?: number; + noise?: number; + seed?: number; +} = {}): RealDataset { + const { nSamples = 1000, noise = 0.1, seed = 42 } = options; + let rng = seed; + const rand = () => { + rng = (rng * 1664525 + 1013904223) & 0xffffffff; + return ((rng >>> 0) / 0xffffffff); + }; + const randn = () => { + const u = rand() || 1e-10; + const v = rand() || 1e-10; + return Math.sqrt(-2 * Math.log(u)) * Math.cos(2 * Math.PI * v); + }; + + const featureNames = [ + "MedInc", "HouseAge", "AveRooms", "AveBedrms", + "Population", "AveOccup", "Latitude", "Longitude", + ]; + + const data: Float64Array[] = []; + const target = new Float64Array(nSamples); + + for (let i = 0; i < nSamples; i++) { + const medInc = Math.max(0.5, 3.0 + randn() * 2.0); + const houseAge = Math.max(1, Math.min(52, 28 + randn() * 12)); + const aveRooms = Math.max(1, 5.4 + randn() * 2.0); + const aveBedrms = Math.max(0.5, 1.1 + randn() * 0.4); + const population = Math.max(10, 1400 + randn() * 1100); + const aveOccup = Math.max(1, 3.0 + randn() * 1.5); + const latitude = 35.6 + randn() * 2.1; + const longitude = -119.6 + randn() * 2.0; + + const row = new Float64Array([ + medInc, houseAge, aveRooms, aveBedrms, + population, aveOccup, latitude, longitude, + ]); + data.push(row); + + // Approximate the California housing formula + target[i] = Math.max(0.15, Math.min(5.0, + 0.4524 * medInc + - 0.0104 * houseAge + + 0.0 * aveRooms + - 0.0 * aveBedrms + - 0.0 * population / 1000 + - 0.0 * aveOccup + - 0.042 * latitude + + 0.0 * longitude + + 2.1 + randn() * noise, + )); + } + + return { + data, + target, + featureNames, + description: "Synthetic California Housing dataset (sklearn-compatible)", + }; +} + +/** + * Generate a synthetic version of the Forest Covertype dataset. 
+ * The real dataset has 581,012 instances and 54 features with 7 cover types. + * + * Returns integer class labels 1-7 for cover type. + */ +export function makeCovtype(options: { + nSamples?: number; + seed?: number; +} = {}): RealClassificationDataset { + const { nSamples = 500, seed = 42 } = options; + let rng = seed; + const rand = () => { + rng = (rng * 1664525 + 1013904223) & 0xffffffff; + return ((rng >>> 0) / 0xffffffff); + }; + const randn = () => { + const u = rand() || 1e-10; + const v = rand() || 1e-10; + return Math.sqrt(-2 * Math.log(u)) * Math.cos(2 * Math.PI * v); + }; + + // 54 features: 10 continuous, 4 binary wilderness areas, 40 binary soil types + const continuousFeatureNames = [ + "Elevation", "Aspect", "Slope", + "Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology", + "Horizontal_Distance_To_Roadways", "Hillshade_9am", "Hillshade_Noon", + "Hillshade_3pm", "Horizontal_Distance_To_Fire_Points", + ]; + const wildernessNames = [ + "Wilderness_Area1", "Wilderness_Area2", "Wilderness_Area3", "Wilderness_Area4", + ]; + const soilNames = Array.from({ length: 40 }, (_, i) => `Soil_Type${i + 1}`); + const featureNames = [...continuousFeatureNames, ...wildernessNames, ...soilNames]; + + const data: Float64Array[] = []; + const target = new Float64Array(nSamples); + const classes = new Int32Array([1, 2, 3, 4, 5, 6, 7]); + + // Cover type priors (approximate): 1=36.5%, 2=48.7%, 3=6.2%, 4=0.5%, 5=1.6%, 6=2.9%, 7=3.5% + const priors = [0.365, 0.487, 0.062, 0.005, 0.016, 0.029, 0.035]; + const cdf = priors.reduce((acc, p, i) => { + acc.push((acc[i - 1] ?? 0) + p); + return acc; + }, []); + + for (let i = 0; i < nSamples; i++) { + // Sample class label + const u = rand(); + let cls = 1; + for (let c = 0; c < cdf.length; c++) { + if (u <= (cdf[c] ?? 
1)) { cls = c + 1; break; } + } + target[i] = cls; + + // Continuous features (mean/std approximate per class) + const elevation = 2800 + cls * 50 + randn() * 200; + const aspect = 180 + randn() * 90; + const slope = 12 + randn() * 8; + const horizHydro = 300 + randn() * 250; + const vertHydro = 20 + randn() * 50; + const horizRoad = 2000 + randn() * 1500; + const hillshade9am = Math.max(0, Math.min(255, 200 + randn() * 40)); + const hillshadeNoon = Math.max(0, Math.min(255, 220 + randn() * 30)); + const hillshade3pm = Math.max(0, Math.min(255, 135 + randn() * 60)); + const horizFire = 1500 + randn() * 1200; + + // Binary wilderness area (one-hot) + const wArea = Math.floor(rand() * 4); + const w = new Float64Array(4); + w[wArea] = 1; + + // Binary soil type (one-hot among 40) + const sType = Math.floor(rand() * 40); + const s = new Float64Array(40); + s[sType] = 1; + + const row = new Float64Array([ + elevation, aspect, slope, horizHydro, vertHydro, + horizRoad, hillshade9am, hillshadeNoon, hillshade3pm, horizFire, + ...w, ...s, + ]); + data.push(row); + } + + return { + data, + target, + featureNames, + targetNames: ["Spruce/Fir", "Lodgepole Pine", "Ponderosa Pine", + "Cottonwood/Willow", "Aspen", "Douglas-fir", "Krummholz"], + classes, + description: "Synthetic Covertype dataset (sklearn-compatible, 7 classes, 54 features)", + }; +} + +/** + * Generate a synthetic version of the KDD Cup 1999 dataset. + * Returns a simplified intrusion detection dataset. 
/**
 * Generate a synthetic version of the KDD Cup 1999 dataset.
 * Returns a simplified intrusion detection dataset.
 *
 * Rows are drawn from two hand-tuned profiles ("normal" and "anomaly") and
 * then shuffled. Only seven of the 41 columns are populated (duration,
 * src_bytes, dst_bytes, count, serror_rate, rerror_rate, dst_host_count);
 * every other column is left at 0.
 *
 * @param options.nSamples - Number of rows to generate (default 500).
 * @param options.subset - 'SA' (small) or 'SF' (larger subset), or '10percent'.
 *   Accepted for API compatibility only; the generator does not read it.
 * @param options.percentAnomalies - Fraction of rows labelled as anomalies
 *   (default 0.2). The anomaly count is `floor(nSamples * percentAnomalies)`.
 * @param options.seed - Seed for the internal generator (default 42).
 * @returns Dataset whose `target` holds 0 for normal rows and 1 for anomalies.
 */
export function makeKddcup99(options: {
  nSamples?: number;
  subset?: "SA" | "SF" | "10percent";
  percentAnomalies?: number;
  seed?: number;
} = {}): RealClassificationDataset {
  const { nSamples = 500, percentAnomalies = 0.2, seed = 42 } = options;

  // 32-bit linear congruential generator. The divisor is 0xffffffff, so the
  // result lies in [0, 1] *inclusive*; index computations must guard the top.
  let rng = seed;
  const rand = (): number => {
    rng = (rng * 1664525 + 1013904223) & 0xffffffff;
    return (rng >>> 0) / 0xffffffff;
  };
  // Box–Muller transform; a zero draw is nudged to 1e-10 so log() stays finite.
  const randn = (): number => {
    const u = rand() || 1e-10;
    const v = rand() || 1e-10;
    return Math.sqrt(-2 * Math.log(u)) * Math.cos(2 * Math.PI * v);
  };
  const clamp = (x: number, lo: number, hi: number): number =>
    Math.max(lo, Math.min(hi, x));

  const featureNames = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment",
    "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted",
    "num_root", "num_file_creations", "num_shells",
    "num_access_files", "num_outbound_cmds", "is_host_login",
    "is_guest_login", "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
  ];

  const nAnomalies = Math.floor(nSamples * percentAnomalies);

  const data: Float64Array[] = [];
  const target = new Float64Array(nSamples);
  // 0 = normal, 1 = anomaly
  const classes = new Int32Array([0, 1]);

  // Anomalies are generated first; the shuffle below mixes them in.
  for (let i = 0; i < nSamples; i++) {
    const isAnomaly = i < nAnomalies;
    target[i] = isAnomaly ? 1 : 0;

    const row = new Float64Array(featureNames.length);
    if (isAnomaly) {
      // Anomaly pattern: high src_bytes, high error rates
      row[0] = Math.max(0, randn() * 2); // duration
      row[4] = Math.max(0, 100000 + randn() * 50000); // src_bytes
      row[5] = Math.max(0, randn() * 100); // dst_bytes
      row[24] = clamp(0.8 + randn() * 0.2, 0, 1); // serror_rate
      row[26] = clamp(0.7 + randn() * 0.2, 0, 1); // rerror_rate
    } else {
      // Normal: small transfers, low error
      row[0] = Math.max(0, randn() * 5); // duration
      row[4] = Math.max(0, 500 + randn() * 1000); // src_bytes
      row[5] = Math.max(0, 2000 + randn() * 3000); // dst_bytes
      row[24] = clamp(0.02 + randn() * 0.05, 0, 1); // serror_rate
      row[26] = clamp(0.01 + randn() * 0.03, 0, 1); // rerror_rate
    }
    row[22] = clamp(Math.abs(randn() * 50 + 10), 0, 511); // count
    row[31] = clamp(Math.abs(randn() * 50 + 100), 0, 255); // dst_host_count
    data.push(row);
  }

  // Fisher–Yates shuffle of rows and labels together. rand() may return
  // exactly 1, which would give j === i + 1 (one past the valid range), so the
  // index is clamped to i.
  for (let i = nSamples - 1; i > 0; i--) {
    const j = Math.min(i, Math.floor(rand() * (i + 1)));
    const rowI = data[i]!;
    data[i] = data[j]!;
    data[j] = rowI;
    const labelI = target[i]!;
    target[i] = target[j]!;
    target[j] = labelI;
  }

  return {
    data,
    target,
    featureNames,
    targetNames: ["normal", "anomaly"],
    classes,
    description: "Synthetic KDD Cup 1999 network intrusion detection dataset",
  };
}

// Scratch binding formerly assigned by makeKddcup99 to silence an
// unused-variable warning. makeKddcup99 no longer touches it.
// NOTE(review): kept in case other code in this module still assigns to it —
// remove once confirmed unreferenced.
let _: number;

/**
 * Load a synthetic version of the Olivetti faces dataset.
 * 400 samples, 64x64 pixel face images (4096 features), 40 subjects.
 */
/**
 * Load a synthetic version of the Olivetti faces dataset.
 * 400 samples, 64x64 pixel face images (4096 features), 40 subjects.
 *
 * Each subject gets one random "prototype" image; every sample is its
 * subject's prototype plus a small amount of per-pixel noise, with all pixel
 * values clipped to [0, 1]. Sample `i` belongs to subject `i % nSubjects`.
 *
 * @param options.nSamples - Number of images to generate (default 400).
 * @param options.nSubjects - Number of distinct subjects (default 40).
 * @param options.seed - Seed for the internal generator (default 42).
 */
export function makeOlivettiFaces(options: {
  nSamples?: number;
  nSubjects?: number;
  seed?: number;
} = {}): RealDataset {
  const { nSamples = 400, nSubjects = 40, seed = 42 } = options;

  const PIXELS = 4096; // 64x64

  // 32-bit linear congruential generator feeding a Box–Muller transform.
  let state = seed;
  const nextUniform = (): number => {
    state = (state * 1664525 + 1013904223) & 0xffffffff;
    return (state >>> 0) / 0xffffffff;
  };
  const nextGaussian = (): number => {
    const a = nextUniform() || 1e-10;
    const b = nextUniform() || 1e-10;
    return Math.sqrt(-2 * Math.log(a)) * Math.cos(2 * Math.PI * b);
  };
  const clipUnit = (x: number): number => Math.max(0, Math.min(1, x));

  const data: Float64Array[] = [];
  const target = new Float64Array(nSamples);

  const featureNames: string[] = [];
  for (let p = 0; p < PIXELS; p++) featureNames.push(`pixel_${p}`);

  // One mid-grey-centred prototype per subject, generated before any sample
  // so the random stream is consumed in a fixed order.
  const prototypes: Float64Array[] = Array.from({ length: nSubjects }, () =>
    Float64Array.from({ length: PIXELS }, () => clipUnit(0.5 + nextGaussian() * 0.2)),
  );

  for (let i = 0; i < nSamples; i++) {
    const subject = i % nSubjects;
    target[i] = subject;
    const face = prototypes[subject]!;
    // Perturb the prototype pixel by pixel, in index order.
    data.push(face.map((pixel) => clipUnit(pixel + nextGaussian() * 0.05)));
  }

  const targetNames = Array.from({ length: nSubjects }, (_, s) => `subject_${s}`);

  return {
    data,
    target,
    featureNames,
    targetNames,
    description: `Synthetic Olivetti faces dataset (${nSubjects} subjects, ${nSamples} samples)`,
  };
}

// diff --git a/src/datasets/sample_images.ts b/src/datasets/sample_images.ts
// new file mode 100644
// index 0000000..fbafb60
// --- /dev/null
// +++ b/src/datasets/sample_images.ts
// @@ -0,0 +1,76 @@
/**
 * Sample image datasets.
 * Mirrors scikit-learn's datasets.load_sample_image and load_sample_images.
 */
/**
 * Sample image datasets.
 * Mirrors scikit-learn's datasets.load_sample_image and load_sample_images.
 */

/** An RGB image stored row-major with interleaved channels. */
export interface SampleImage {
  name: string;
  data: Uint8Array;
  height: number;
  width: number;
  channels: number;
}

/** Available sample image names */
export const SAMPLE_IMAGE_NAMES = ["china", "flower"] as const;
export type SampleImageName = (typeof SAMPLE_IMAGE_NAMES)[number];

/**
 * Generate a synthetic sample image for testing/demos.
 *
 * "china" is a vertical sky-like gradient; anything else is drawn as a
 * six-petal radial "flower". Both get a little random texture from a
 * name-specific seed, so the output is deterministic per name and size.
 *
 * @param name - Which synthetic picture to draw.
 * @param height - Image height in pixels.
 * @param width - Image width in pixels.
 * @returns `height * width * 3` bytes, RGB interleaved, row-major.
 */
function generateSyntheticImage(
  name: SampleImageName,
  height: number,
  width: number,
): Uint8Array {
  const data = new Uint8Array(height * width * 3);
  let seed = name === "china" ? 1337 : 7331;
  const rng = (): number => {
    seed = (seed * 1664525 + 1013904223) & 0xffffffff;
    return (seed >>> 0) / 0xffffffff;
  };
  // A Uint8Array stores values modulo 256, so e.g. 260 would become 4 and a
  // bright pixel would turn almost black. Saturate at 255 instead. No channel
  // expression below can go negative, so only the upper bound needs a guard.
  const toByte = (value: number): number => Math.min(255, Math.floor(value));

  for (let i = 0; i < height; i++) {
    for (let j = 0; j < width; j++) {
      const base = (i * width + j) * 3;
      if (name === "china") {
        // Sky gradient + random texture
        const t = i / height;
        data[base] = toByte(135 + 120 * (1 - t) + rng() * 20);
        data[base + 1] = toByte(206 * (1 - t * 0.5) + rng() * 20);
        data[base + 2] = toByte(235 * (1 - t * 0.3) + rng() * 20);
      } else {
        // Flower: radial gradient
        const cx = 0.5;
        const cy = 0.5;
        const r = Math.sqrt((j / width - cx) ** 2 + (i / height - cy) ** 2);
        const angle = Math.atan2(i / height - cy, j / width - cx);
        const petal = Math.sin(angle * 6) > 0 ? 1 : 0;
        const inFlower = r < 0.4 ? 1 : 0;
        data[base] = toByte(255 * petal * inFlower + rng() * 30);
        data[base + 1] = toByte(200 * (1 - r) * inFlower + rng() * 30);
        data[base + 2] = toByte(50 * inFlower + rng() * 30);
      }
    }
  }
  return data;
}

/**
 * Load a single sample image by name.
 *
 * @param imageName - "china" (640 px wide) or "flower" (483 px wide); both
 *   are 427 px tall with 3 channels.
 * @returns The synthetic image together with its dimensions.
 */
export function loadSampleImage(imageName: SampleImageName): SampleImage {
  const height = 427;
  const width = imageName === "china" ? 640 : 483;
  return {
    name: imageName,
    data: generateSyntheticImage(imageName, height, width),
    height,
    width,
    channels: 3,
  };
}

/**
 * Load all sample images.
 */
/**
 * Load all sample images.
 *
 * @returns One image per entry of SAMPLE_IMAGE_NAMES, in that order.
 */
export function loadSampleImages(): SampleImage[] {
  const images: SampleImage[] = [];
  for (const imageName of SAMPLE_IMAGE_NAMES) {
    images.push(loadSampleImage(imageName));
  }
  return images;
}

// diff --git a/src/datasets/samples_generator.ts b/src/datasets/samples_generator.ts
// new file mode 100644
// index 0000000..3023de0
// --- /dev/null
// +++ b/src/datasets/samples_generator.ts
// @@ -0,0 +1,228 @@
/**
 * Additional synthetic dataset generators.
 * Mirrors sklearn.datasets: make_hastie_10_2, make_friedman1/2/3,
 * make_sparse_uncorrelated, make_checkerboard, make_multilabel_classification.
 */

/** Result type for generated datasets. */
export interface SamplesDatasetResult {
  X: Float64Array[];
  y: Float64Array | Int32Array;
}

/**
 * Simple seeded Mulberry32 RNG for reproducibility.
 *
 * @param seed - Seed; coerced to an unsigned 32-bit integer.
 * @returns A function producing numbers in [0, 1).
 */
function makeRng(seed: number): () => number {
  let state = seed >>> 0;
  return (): number => {
    state = (state + 0x6d2b79f5) >>> 0;
    const mixed = Math.imul(state ^ (state >>> 15), state | 1);
    const scrambled = mixed ^ (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61));
    return ((scrambled ^ (scrambled >>> 14)) >>> 0) / 4294967296;
  };
}

/**
 * Draw one standard-normal value via the Box–Muller transform.
 * Consumes exactly two values from `rng`.
 */
function randn(rng: () => number): number {
  // Floor the first uniform at 1e-14 so the logarithm stays finite.
  const radius = Math.sqrt(-2 * Math.log(Math.max(rng(), 1e-14)));
  const theta = 2 * Math.PI * rng();
  return radius * Math.cos(theta);
}

/**
 * make_hastie_10_2 — 10-feature binary classification problem.
 * y = sign(sum(X_i^2) - 9.34) where X ~ N(0,1).
 *
 * @param nSamples - Number of rows to generate.
 * @param randomState - Random seed.
 * @returns Feature rows `X` and labels `y` in {-1, 1}.
 */
export function makeHastie10_2(
  nSamples = 12000,
  randomState = 0,
): { X: Float64Array[]; y: Int32Array } {
  const rng = makeRng(randomState);
  const X: Float64Array[] = Array.from({ length: nSamples }, () =>
    Float64Array.from({ length: 10 }, () => randn(rng)),
  );
  const y = new Int32Array(X.length);
  X.forEach((row, i) => {
    const squaredNorm = row.reduce((acc, v) => acc + v * v, 0);
    y[i] = squaredNorm > 9.34 ? 1 : -1;
  });
  return { X, y };
}

/**
 * make_friedman1 — regression dataset from Friedman (1991).
 */
/**
 * make_friedman1 — regression dataset from Friedman (1991).
 * y = 10*sin(π*X0*X1) + 20*(X2-0.5)^2 + 10*X3 + 5*X4 + noise
 *
 * Every feature is filled straight from the seeded generator; only the first
 * five enter the target.
 *
 * @param nSamples - Number of rows.
 * @param nFeatures - Number of columns; must be at least 5.
 * @param noise - Standard deviation of Gaussian noise added to y (none when <= 0).
 * @param randomState - Random seed.
 * @throws Error if `nFeatures` is below 5.
 */
export function makeFriedman1(
  nSamples = 100,
  nFeatures = 10,
  noise = 0.0,
  randomState = 0,
): SamplesDatasetResult {
  if (nFeatures < 5) throw new Error("makeFriedman1 requires at least 5 features");
  const rng = makeRng(randomState);

  const X: Float64Array[] = Array.from({ length: nSamples }, () => {
    const features = new Float64Array(nFeatures);
    for (let col = 0; col < nFeatures; col++) features[col] = rng();
    return features;
  });

  // Targets are computed after all of X so noise draws follow the feature draws.
  const y = new Float64Array(X.length);
  X.forEach((features, i) => {
    const a = features[0] ?? 0;
    const b = features[1] ?? 0;
    const c = features[2] ?? 0;
    const d = features[3] ?? 0;
    const e = features[4] ?? 0;
    const signal = 10 * Math.sin(Math.PI * a * b) + 20 * (c - 0.5) ** 2 + 10 * d + 5 * e;
    y[i] = signal + (noise > 0 ? noise * randn(rng) : 0);
  });
  return { X, y };
}

/**
 * make_friedman2 — regression with nonlinear interactions.
 * y = sqrt(X0^2 + (X1*X2 - 1/(X1*X3))^2) + noise
 *
 * The four features are uniform on [0, 100], [40π, 560π], [0, 1] and [1, 11].
 *
 * @param nSamples - Number of rows.
 * @param noise - Standard deviation of Gaussian noise added to y (none when <= 0).
 * @param randomState - Random seed.
 */
export function makeFriedman2(
  nSamples = 100,
  noise = 0.0,
  randomState = 0,
): SamplesDatasetResult {
  const rng = makeRng(randomState);
  const ranges: [number, number][] = [
    [0, 100],
    [40 * Math.PI, 560 * Math.PI],
    [0, 1],
    [1, 11],
  ];

  const X: Float64Array[] = Array.from({ length: nSamples }, () =>
    Float64Array.from(ranges, ([lo, hi]) => lo + rng() * (hi - lo)),
  );

  const y = new Float64Array(X.length);
  X.forEach((features, i) => {
    const a = features[0] ?? 0;
    const b = features[1] ?? 0;
    const c = features[2] ?? 0;
    // Keep the denominator away from zero.
    const d = Math.max(features[3] ?? 1, 1e-6);
    const interaction = b * c - 1 / (b * d);
    const signal = Math.sqrt(a ** 2 + interaction ** 2);
    y[i] = signal + (noise > 0 ? noise * randn(rng) : 0);
  });
  return { X, y };
}

/**
 * make_friedman3 — regression with arctan transformation.
 */
/**
 * make_friedman3 — regression with arctan transformation.
 * y = arctan((X1*X2 - 1/(X1*X3)) / X0) + noise
 *
 * The four features are uniform on [0, 100], [40π, 560π], [0, 1] and [1, 11].
 *
 * @param nSamples - Number of rows.
 * @param noise - Standard deviation of Gaussian noise added to y (none when <= 0).
 * @param randomState - Random seed.
 */
export function makeFriedman3(
  nSamples = 100,
  noise = 0.0,
  randomState = 0,
): SamplesDatasetResult {
  const rng = makeRng(randomState);
  const bounds: [number, number][] = [[0, 100], [40 * Math.PI, 560 * Math.PI], [0, 1], [1, 11]];
  const X: Float64Array[] = Array.from({ length: nSamples }, () => {
    const row = new Float64Array(4);
    for (let j = 0; j < 4; j++) {
      const [lo, hi] = bounds[j]!;
      row[j] = lo + rng() * (hi - lo);
    }
    return row;
  });
  const y = Float64Array.from(X, (row) => {
    // X0 and X3 are divisors; keep both away from zero.
    const x0 = Math.max(Math.abs(row[0] ?? 0), 1e-6);
    const x1 = row[1] ?? 0;
    const x2 = row[2] ?? 0;
    const x3 = Math.max(row[3] ?? 1, 1e-6);
    const inner = x1 * x2 - 1 / (x1 * x3);
    return Math.atan(inner / x0) + (noise > 0 ? noise * randn(rng) : 0);
  });
  return { X, y };
}

/**
 * make_sparse_uncorrelated — regression dataset with 4 informative features
 * and `nFeatures - 4` noise features.
 *
 * X ~ N(0, 1) and y ~ N(X0 + 2*X1 - 2*X2 - 1.5*X3, 1), the formula documented
 * for scikit-learn's make_sparse_uncorrelated. Columns missing because
 * `nFeatures < 4` contribute 0.
 *
 * @param nSamples - Number of rows.
 * @param nFeatures - Number of columns.
 * @param randomState - Random seed.
 */
export function makeSparseUncorrelated(
  nSamples = 100,
  nFeatures = 10,
  randomState = 0,
): SamplesDatasetResult {
  const rng = makeRng(randomState);
  const X: Float64Array[] = Array.from({ length: nSamples }, () =>
    Float64Array.from({ length: nFeatures }, () => randn(rng)),
  );
  // Informative coefficients for X0..X3 (scikit-learn's definition).
  const coef = [1, 2, -2, -1.5];
  const y = Float64Array.from(X, (row) => {
    let mean = 0;
    coef.forEach((weight, j) => {
      mean += weight * (row[j] ?? 0);
    });
    // Unit-variance Gaussian noise around the linear mean.
    return mean + randn(rng);
  });
  return { X, y };
}

/**
 * make_multilabel_classification — random multilabel dataset.
 *
 * @param nSamples - Number of samples.
 * @param nFeatures - Number of features.
 * @param nClasses - Number of classes (labels).
 * @param nLabels - Average number of labels per sample.
 * @param randomState - Random seed.
 */
/**
 * make_multilabel_classification — random multilabel dataset.
 *
 * Features are independent 0/1 values. Each sample receives
 * `min(nClasses, max(1, round(nLabels + u)))` *distinct* labels, where `u` is
 * uniform on [-1, 1), so the per-sample label count averages about `nLabels`.
 *
 * @param nSamples - Number of samples.
 * @param nFeatures - Number of features.
 * @param nClasses - Number of classes (labels).
 * @param nLabels - Average number of labels per sample.
 * @param randomState - Random seed.
 * @returns `X` (binary feature rows) and `y` (0/1 indicator row per sample).
 */
export function makeMultilabelClassification(
  nSamples = 100,
  nFeatures = 20,
  nClasses = 5,
  nLabels = 2,
  randomState = 0,
): { X: Float64Array[]; y: Int32Array[] } {
  const rng = makeRng(randomState);
  const X: Float64Array[] = Array.from({ length: nSamples }, () =>
    Float64Array.from({ length: nFeatures }, () => (rng() > 0.5 ? 1 : 0)),
  );

  // Class indices, partially shuffled per sample to pick labels without
  // replacement. Reusing the permutation across samples keeps picks uniform.
  const pool = Int32Array.from({ length: nClasses }, (_, c) => c);
  const poolSize = pool.length;

  const y: Int32Array[] = Array.from({ length: nSamples }, () => {
    const row = new Int32Array(nClasses);
    const wanted = Math.max(1, Math.round(nLabels + (rng() - 0.5) * 2));
    const nActive = Math.min(wanted, poolSize);
    for (let k = 0; k < nActive; k++) {
      // Partial Fisher–Yates: swap a random not-yet-chosen class into slot k.
      const pick = k + Math.floor(rng() * (poolSize - k));
      const chosen = pool[pick]!;
      pool[pick] = pool[k]!;
      pool[k] = chosen;
      row[chosen] = 1;
    }
    return row;
  });
  return { X, y };
}

/**
 * make_checkerboard — checkerboard pattern for biclustering.
 *
 * Row `i` is assigned cluster `i % n_row_clusters` and column `j` cluster
 * `j % n_col_clusters`. A cell's mean is 1 when its row and column labels have
 * the same parity and 0 otherwise; Gaussian noise is added on top.
 *
 * @param shape - [n_rows, n_cols].
 * @param nClusters - [n_row_clusters, n_col_clusters].
 * @param noise - Noise standard deviation.
 * @param randomState - Random seed.
 * @returns The matrix plus the row and column cluster labels.
 */
export function makeCheckerboard(
  shape: [number, number] = [300, 300],
  nClusters: [number, number] = [4, 3],
  noise = 0.5,
  randomState = 0,
): { data: Float64Array[]; rowLabels: Int32Array; colLabels: Int32Array } {
  const rng = makeRng(randomState);
  const [nRows, nCols] = shape;
  const [nRowC, nColC] = nClusters;
  const rowLabels = Int32Array.from({ length: nRows }, (_, i) => i % nRowC);
  const colLabels = Int32Array.from({ length: nCols }, (_, j) => j % nColC);
  const data: Float64Array[] = Array.from({ length: nRows }, (_, i) => {
    const row = new Float64Array(nCols);
    const rowParity = rowLabels[i]! % 2;
    for (let j = 0; j < nCols; j++) {
      const match = rowParity === colLabels[j]! % 2;
      // A normal draw is consumed for every cell, even when noise is 0.
      row[j] = (match ? 1 : 0) + noise * randn(rng);
    }
    return row;
  });
  return { data, rowLabels, colLabels };
}

// diff --git a/src/datasets/svmlight.ts b/src/datasets/svmlight.ts
// new file mode 100644
// index 0000000..3fc6d3d
// --- /dev/null
// +++ b/src/datasets/svmlight.ts
// @@ -0,0 +1,113 @@
/**
 * SVMLight format loading and saving utilities.
 */
+ * Ports: load_svmlight_file, dump_svmlight_file + */ + +export interface SVMLightDataset { + data: Float64Array[]; + target: Float64Array; + nFeatures: number; +} + +/** + * Parse SVMLight / LibSVM format text. + * Format: