From 6c197b2bf242fe736174be86b7668d400298da9f Mon Sep 17 00:00:00 2001 From: Jeff Date: Thu, 12 Mar 2026 10:53:10 +0000 Subject: [PATCH 1/7] Add Bad Egg suspension advisory model with 8-feature scoring Adds a suspension risk advisory (Bad Egg) that computes a suspicion score using an 8-feature logistic regression model when scoring PR authors. Includes full integration across action, CLI, MCP server, and formatter outputs. Feature validation (scripts/validate_bad_egg_features.py) with temporal holdout on the production-relevant population (authors with merged PRs only) shows AUCs at chance level (0.47-0.56). Full results documented in experiments/bot_detection/BAD_EGG_VALIDATION.md. --- action.yml | 8 + .../bot_detection/BAD_EGG_VALIDATION.md | 177 +++++ scripts/validate_bad_egg_features.py | 676 ++++++++++++++++++ src/good_egg/action.py | 13 + src/good_egg/cli.py | 11 + src/good_egg/config.py | 53 ++ src/good_egg/formatter.py | 50 +- src/good_egg/github_client.py | 69 ++ src/good_egg/mcp_server.py | 7 + src/good_egg/models.py | 17 + src/good_egg/scorer.py | 163 ++++- tests/conftest.py | 73 ++ tests/test_action.py | 88 ++- tests/test_cli.py | 23 + tests/test_formatter.py | 83 +++ tests/test_github_client.py | 94 +++ tests/test_models.py | 79 ++ tests/test_scorer.py | 345 ++++++++- 18 files changed, 2024 insertions(+), 5 deletions(-) create mode 100644 experiments/bot_detection/BAD_EGG_VALIDATION.md create mode 100644 scripts/validate_bad_egg_features.py diff --git a/action.yml b/action.yml index 90d120f..d48c501 100644 --- a/action.yml +++ b/action.yml @@ -28,6 +28,10 @@ inputs: skip-known-contributors: description: 'Skip scoring for authors with merged PRs in the repo (true/false)' required: false + bad-egg: + description: 'Enable suspension advisory score (true/false)' + required: false + default: 'true' outputs: score: @@ -45,6 +49,9 @@ outputs: skipped: description: 'Whether scoring was skipped for an existing contributor (true/false)' value: ${{ 
steps.score.outputs.skipped }} + suspicion-level: + description: 'Suspension advisory level (HIGH, ELEVATED, NORMAL, or empty)' + value: ${{ steps.score.outputs.suspicion-level }} runs: using: 'composite' @@ -76,6 +83,7 @@ runs: INPUT_FAIL-ON-LOW: ${{ inputs.fail-on-low }} INPUT_SCORING_MODEL: ${{ inputs.scoring-model }} INPUT_SKIP_KNOWN_CONTRIBUTORS: ${{ inputs.skip-known-contributors }} + INPUT_BAD_EGG: ${{ inputs.bad-egg }} run: | cd ${{ github.action_path }} uv run python -m good_egg.action diff --git a/experiments/bot_detection/BAD_EGG_VALIDATION.md b/experiments/bot_detection/BAD_EGG_VALIDATION.md new file mode 100644 index 0000000..e9343f3 --- /dev/null +++ b/experiments/bot_detection/BAD_EGG_VALIDATION.md @@ -0,0 +1,177 @@ +# Bad Egg Feature Validation Results + +## TL;DR + +**The Bad Egg suspension advisory model has no discriminative power on the production-relevant population.** When restricted to authors who have merged PRs (the only users who would actually be scored), all 10 candidate features produce AUCs at or below chance (0.47–0.56) across three temporal cutoffs. No feature survives LOO ablation. The previous AUC of 0.643 was an artifact of including 279 suspended users with zero merged PRs — users whose trivially distinguishable features (merge_rate=0, total_prs=0) inflated apparent performance but who would never be scored in production. + +## Background + +### Previous work (PR 44, bot-detection branch) + +The bot-detection experiments (stages 1–15) evaluated author-level features for predicting PR outcomes and account suspension. Stage 15 ablation found `{merge_rate, median_additions, isolation_score}` as the recommended 3-feature set for suspension classification, with merge_rate as the only feature surviving decontamination (AUC 0.693±0.110 with temporal holdout). + +However, that analysis used the **full labeled population** (12,898 authors), of which 86% of suspended accounts had zero merged PRs. 
+ +### The population problem + +The Bad Egg suspicion score only runs when `user_data.merged_prs` is non-empty — it's gated by `if self.config.bad_egg.enabled and user_data.merged_prs:` in both v1 and v2 scoring paths. This means the model never evaluates users without merged PRs. Training on zero-PR users and then deploying only to users with merged PRs creates a fundamental train/serve skew. + +### Previous (flawed) validation + +A validation script trained on all 12,898 labeled authors (323 suspended / 12,575 active) and reported CV AUC 0.637–0.643. This appeared to justify a 2-feature or 8-feature model. But: +- 279 of 323 suspended users (86%) had 0 merged PRs +- These users had merge_rate=0, total_prs=0, career_span_days=0 — trivially separable +- merge_rate appeared dispensable only because it had zero variance in 86% of positives +- The 2-feature trim (career_span_days + mean_title_length) was based on invalid evidence + +## This Validation + +### Ground truth expansion + +Before running validation, we completed ground truth coverage by checking all unchecked PR authors against the GitHub API: + +| Metric | Before | After | +|--------|--------|-------| +| Total authors in DB | 14,413 | 31,307 | +| Authors with status checked | 12,898 | 31,296 | +| Suspended accounts | 323 | 739 | +| **Suspended with merged PRs** | **44** | **417** | + +The expansion found 416 new suspended accounts (78 + 338 across two runs, with one network error requiring restart). The script (`check_account_status.py`) queries `GET /users/{login}` at 2s spacing with 50% rate limit budget, is idempotent, and writes directly to DuckDB on each request. + +### Methodology + +Replicates PR 44 (stage 15) methodology with the **correct population**: only authors with at least 1 merged PR before each cutoff date. 
+ +**Cutoffs**: 2022-07-01, 2023-01-01, 2024-01-01 + +**Population per cutoff** (authors with ≥1 merged PR before cutoff, with known account status): + +| Cutoff | Total | Suspended | Active | CV method | +|--------|-------|-----------|--------|-----------| +| 2022-07-01 | 2,235 | 58 | 2,177 | 5-fold stratified | +| 2023-01-01 | 3,619 | 92 | 3,527 | 5-fold stratified | +| 2024-01-01 | 7,642 | 204 | 7,438 | 5-fold stratified | + +**10 candidate features** (all computable in a GitHub Action, no account_age): +1. merge_rate — merged / total PRs before cutoff +2. total_prs — count of all PRs (log-transformed) +3. career_span_days — max-min PR dates in days (log-transformed) +4. mean_title_length — average PR title length +5. median_additions — median lines added in merged PRs (log-transformed) +6. median_files_changed — median files changed in merged PRs (log-transformed) +7. total_repos — count of distinct repos +8. isolation_score — fraction of author's repos with no multi-repo contributor overlap +9. hub_score — degree centrality on bipartite author-repo graph +10. bipartite_clustering — bipartite clustering coefficient + +**Excluded**: account_age (100% NaN for suspended accounts — profiles unavailable, leaked indicator), LLM-based features (no commercial API calls in GH Action), k-NN features (circular — uses suspended accounts as seeds). + +**Model**: LogisticRegression(class_weight="balanced"), StandardScaler per fold, 5-fold stratified CV. + +**Statistical tests**: DeLong paired test for AUC comparison, Holm-Bonferroni correction (alpha=0.05). + +### Script + +`scripts/validate_bad_egg_features.py` — runs both analyses and prints results. + +## Results + +### LOO Ablation: no feature survives + +Every feature is DISPENSABLE at every cutoff after Holm-Bonferroni correction. Several features actually *hurt* the model when included (negative delta = removing improves AUC). 
+ +**Cutoff 2022-07-01** (Full 10f AUC: 0.470 — worse than random): + +| Feature | Ablated AUC | Delta | p-value | adj-p | Verdict | +|---------|-------------|-------|---------|-------|---------| +| merge_rate | 0.456 | +0.014 | 0.519 | 1.000 | DISPENSABLE | +| median_additions | 0.465 | +0.005 | 0.615 | 1.000 | DISPENSABLE | +| isolation_score | 0.470 | +0.000 | 0.949 | 1.000 | DISPENSABLE | +| career_span_days | 0.504 | -0.034 | 0.023 | 0.182 | DISPENSABLE | +| median_files_changed | 0.488 | -0.018 | 0.202 | 1.000 | DISPENSABLE | + +**Cutoff 2023-01-01** (Full 10f AUC: 0.491): + +| Feature | Ablated AUC | Delta | p-value | adj-p | Verdict | +|---------|-------------|-------|---------|-------|---------| +| career_span_days | 0.486 | +0.005 | 0.447 | 1.000 | DISPENSABLE | +| merge_rate | 0.500 | -0.009 | 0.103 | 1.000 | DISPENSABLE | +| median_files_changed | 0.511 | -0.020 | 0.176 | 1.000 | DISPENSABLE | + +**Cutoff 2024-01-01** (Full 10f AUC: 0.533): + +| Feature | Ablated AUC | Delta | p-value | adj-p | Verdict | +|---------|-------------|-------|---------|-------|---------| +| career_span_days | 0.515 | +0.018 | 0.053 | 0.367 | DISPENSABLE | +| median_additions | 0.542 | -0.010 | 0.002 | 0.023 | DISPENSABLE* | +| bipartite_clustering | 0.543 | -0.010 | 0.034 | 0.275 | DISPENSABLE | + +\* median_additions has significant adjusted p at 2024-01-01 but removing it *improves* AUC (delta is negative), so it's correctly classified as DISPENSABLE. + +**Summary across cutoffs**: 0/10 features marked KEEP at any cutoff. + +### Forward Selection: AUC degrades with more features + +At every cutoff, AUC peaks with 1-3 features and declines as more are added — classic overfitting on noise. + +**Cutoff 2022-07-01**: bipartite_clustering starts at 0.541, degrades to 0.470 with all 10. + +**Cutoff 2023-01-01**: career_span_days starts at 0.541, degrades to 0.491 with all 10. 
+ +**Cutoff 2024-01-01**: career_span_days starts at 0.562, peaks at 0.582 with {career_span_days, mean_title_length}, degrades to 0.533 with all 10. + +No addition step achieves a significant DeLong p-value. + +### Full 10f vs 3f (PR 44 recommended set) Comparison + +| Cutoff | 10f AUC | 3f AUC | Delta | p-value | +|--------|---------|--------|-------|---------| +| 2022-07-01 | 0.465 | 0.435 | +0.030 | 0.489 | +| 2023-01-01 | 0.478 | 0.451 | +0.026 | 0.421 | +| 2024-01-01 | 0.557 | 0.536 | +0.021 | 0.258 | + +Neither model meaningfully outperforms chance. The differences are not significant. + +### Refit attempt (all data, 3-feature) + +Refitting on all data through 2026 (18,795 authors, 415 suspended): +- **CV AUC: 0.552** — barely above chance +- Probability range: [0.31, 0.55] — max probability can't even reach 0.60 +- At t=0.50: flags 59% of users with 2.5% precision +- At t=0.55: flags 1% of users with 1.6% precision +- At t=0.60+: flags nobody + +The model cannot produce actionable thresholds. + +## Why the Signal Disappeared + +The original model's apparent AUC of 0.643 was driven almost entirely by the **zero-PR suspended accounts**. These 279 users (86% of suspended) had: +- merge_rate = 0 (no merged PRs, some had only closed/rejected PRs) +- total_prs typically very low +- career_span_days = 0 (single PR or none) +- All other features at zero or near-zero + +Active users with merged PRs have non-zero values for these features by definition. This made separation trivial — but only for a population that would never be scored. + +When we restrict to users who have merged PRs (the production population), suspended accounts are **behaviorally indistinguishable** from active ones. Suspended accounts that have merged PRs look like normal contributors — they got PRs merged into real repos, which requires passing code review. Their merge rates, PR sizes, title lengths, career spans, and network positions are all within normal ranges. + +## Implications + +1. 
**The Bad Egg model should not ship.** No feature combination produces actionable discrimination on the production-relevant population. The 8-feature model in config.py is fit on noise. + +2. **The feature set is exhausted.** All 10 candidate features that can be computed in a GitHub Action have been tested. None works. This includes graph features (hub_score, bipartite_clustering) that had promising coefficients in the inflated-population model. + +3. **Account age would help but is unavailable.** Suspended accounts have their profiles removed, making `created_at` inaccessible via the API. This is the strongest predictor of suspension (young accounts are disproportionately suspended) but it's a leaked indicator — it's unavailable precisely for the accounts we want to detect. + +4. **The fundamental asymmetry**: Suspension correlates with account-level metadata (age, profile completeness, activity patterns across all of GitHub) rather than PR-level behavioral features. Users who get PRs merged have already passed a human filter (code review), making their PR behavior look legitimate. + +## Appendix: Raw Output + +Full output from `scripts/validate_bad_egg_features.py` is reproduced above. The script can be re-run with: + +```bash +uv run python scripts/validate_bad_egg_features.py +``` + +Data: `experiments/bot_detection/data/bot_detection.duckdb` (31,307 authors, 200,172 PRs, 96 repos). diff --git a/scripts/validate_bad_egg_features.py b/scripts/validate_bad_egg_features.py new file mode 100644 index 0000000..f643477 --- /dev/null +++ b/scripts/validate_bad_egg_features.py @@ -0,0 +1,676 @@ +"""Validate Bad Egg feature importance with correct population and temporal holdout. 
+ +Replicates PR 44 (bot-detection branch) methodology: +- Population: only authors with ≥1 merged PR before each cutoff (production-relevant) +- Temporal holdout: features computed from pre-cutoff PRs only +- LOO ablation with DeLong tests + Holm-Bonferroni correction +- Forward selection with DeLong stopping + +10 candidate features (all GH-action compatible, no account_age): + merge_rate, total_prs, career_span_days, mean_title_length, + median_additions, median_files_changed, total_repos, + isolation_score, hub_score, bipartite_clustering. +""" + +from __future__ import annotations + +from collections import defaultdict +from pathlib import Path +from typing import Any + +import duckdb +import networkx as nx +import numpy as np +import pandas as pd +from scipy import stats as sp_stats +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import roc_auc_score +from sklearn.model_selection import LeaveOneOut, StratifiedKFold +from sklearn.preprocessing import StandardScaler + +BASE = Path(__file__).resolve().parent.parent / "experiments" / "bot_detection" +DB_PATH = BASE / "data" / "bot_detection.duckdb" + +FEATURES = [ + "merge_rate", "total_prs", "career_span_days", "mean_title_length", + "median_additions", "median_files_changed", "total_repos", + "isolation_score", "hub_score", "bipartite_clustering", +] +LOG_TRANSFORM = {"total_prs", "career_span_days", "median_additions", "median_files_changed"} + +CUTOFFS = ["2022-07-01", "2023-01-01", "2024-01-01"] + + +# --------------------------------------------------------------------------- +# Data loading with temporal filtering +# --------------------------------------------------------------------------- + +def get_features_before_cutoff( + con: duckdb.DuckDBPyConnection, cutoff: str, +) -> pd.DataFrame: + """Compute aggregate features per author from PRs before cutoff. + + Only includes authors with at least 1 merged PR before cutoff. 
+ """ + rows = con.execute(""" + WITH author_prs AS ( + SELECT + author, + state, + created_at, + title, + additions, + files_changed, + repo + FROM prs + WHERE created_at < ?::TIMESTAMP + AND author IS NOT NULL + ), + merged_authors AS ( + SELECT DISTINCT author + FROM author_prs + WHERE state = 'MERGED' + ) + SELECT + ma.author, + -- merge_rate + SUM(CASE WHEN ap.state = 'MERGED' THEN 1 ELSE 0 END)::DOUBLE + / COUNT(*)::DOUBLE AS merge_rate, + -- total_prs + COUNT(*)::DOUBLE AS total_prs, + -- career_span_days + COALESCE( + EXTRACT(EPOCH FROM (MAX(ap.created_at) - MIN(ap.created_at))) / 86400.0, + 0.0 + ) AS career_span_days, + -- mean_title_length + AVG(LENGTH(ap.title))::DOUBLE AS mean_title_length, + -- median_additions (only from merged PRs) + MEDIAN(CASE WHEN ap.state = 'MERGED' THEN ap.additions END)::DOUBLE + AS median_additions, + -- median_files_changed (only from merged PRs) + MEDIAN(CASE WHEN ap.state = 'MERGED' THEN ap.files_changed END)::DOUBLE + AS median_files_changed, + -- total_repos + COUNT(DISTINCT ap.repo)::DOUBLE AS total_repos + FROM merged_authors ma + JOIN author_prs ap ON ap.author = ma.author + GROUP BY ma.author + """, [cutoff]).fetchdf() + + return rows + + +def compute_per_user_isolation_before( + con: duckdb.DuckDBPyConnection, cutoff: str, +) -> dict[str, float]: + """Compute isolation_score per author from pre-cutoff author-repo pairs.""" + pairs = con.execute(""" + SELECT author, repo, COUNT(*) as pr_count + FROM prs + WHERE created_at < ?::TIMESTAMP AND author IS NOT NULL + GROUP BY author, repo + """, [cutoff]).fetchall() + + repo_contributors: dict[str, set[str]] = defaultdict(set) + author_repos: dict[str, set[str]] = defaultdict(set) + for author, repo, _ in pairs: + repo_contributors[repo].add(author) + author_repos[author].add(repo) + + isolation_scores: dict[str, float] = {} + for author, repos in author_repos.items(): + if not repos: + isolation_scores[author] = 1.0 + continue + + contributor_repo_count: dict[str, int] = 
defaultdict(int) + for repo in repos: + for c in repo_contributors[repo]: + if c != author: + contributor_repo_count[c] += 1 + + multi_repo = {c for c, count in contributor_repo_count.items() if count >= 2} + + isolated = 0 + for repo in repos: + other_contribs = repo_contributors[repo] - {author} + if not (other_contribs & multi_repo): + isolated += 1 + + isolation_scores[author] = isolated / len(repos) + + return isolation_scores + + +def compute_graph_features_before( + con: duckdb.DuckDBPyConnection, cutoff: str, +) -> dict[str, dict[str, float]]: + """Compute hub_score and bipartite_clustering from pre-cutoff bipartite graph. + + Returns {"hub_score": {author: val}, "bipartite_clustering": {author: val}}. + """ + triples = con.execute(""" + SELECT author, repo, COUNT(*) as pr_count + FROM prs + WHERE created_at < ?::TIMESTAMP AND author IS NOT NULL + GROUP BY author, repo + """, [cutoff]).fetchall() + + if not triples: + return {"hub_score": {}, "bipartite_clustering": {}} + + g = nx.Graph() + authors_set = set() + repos_set = set() + for author, repo, count in triples: + a_node = f"a:{author}" + r_node = f"r:{repo}" + g.add_edge(a_node, r_node, weight=count) + authors_set.add(a_node) + repos_set.add(r_node) + + # Hub score = degree centrality for author nodes + centrality = nx.degree_centrality(g) + hub_scores = { + a.removeprefix("a:"): centrality.get(a, 0.0) for a in authors_set + } + + # Bipartite clustering + try: + clustering = nx.bipartite.clustering(g, authors_set) + bip_clustering = { + a.removeprefix("a:"): clustering.get(a, 0.0) for a in authors_set + } + except Exception: + bip_clustering = {a.removeprefix("a:"): 0.0 for a in authors_set} + + return {"hub_score": hub_scores, "bipartite_clustering": bip_clustering} + + +def load_cutoff_data( + con: duckdb.DuckDBPyConnection, cutoff: str, +) -> pd.DataFrame: + """Load features + labels for authors with merged PRs before cutoff.""" + print(f"\n--- Loading data for cutoff {cutoff} ---") + + df = 
get_features_before_cutoff(con, cutoff) + print(f" Authors with merged PRs before {cutoff}: {len(df)}") + + # Add isolation_score + print(" Computing isolation_score...") + iso = compute_per_user_isolation_before(con, cutoff) + df["isolation_score"] = df["author"].map(iso).fillna(1.0) + + # Add graph features + print(" Computing graph features (hub_score, bipartite_clustering)...") + graph_feats = compute_graph_features_before(con, cutoff) + df["hub_score"] = df["author"].map(graph_feats["hub_score"]).fillna(0.0) + df["bipartite_clustering"] = ( + df["author"].map(graph_feats["bipartite_clustering"]).fillna(0.0) + ) + + # Join labels + labels = con.execute( + "SELECT login, account_status FROM authors WHERE account_status IS NOT NULL" + ).fetchdf() + df = df.merge(labels, left_on="author", right_on="login", how="inner") + + # Filter to labeled only + df = df[df["account_status"].isin(["active", "suspended"])].copy() + + n_susp = (df["account_status"] == "suspended").sum() + n_active = (df["account_status"] == "active").sum() + print(f" Labeled: {len(df)} ({n_susp} suspended, {n_active} active)") + + return df + + +# --------------------------------------------------------------------------- +# DeLong test + Holm-Bonferroni (from bot-detection stats.py) +# --------------------------------------------------------------------------- + +def delong_auc_test( + y_true: np.ndarray, + y_scores_a: np.ndarray, + y_scores_b: np.ndarray, +) -> dict[str, Any]: + """Paired DeLong test for comparing two AUC-ROC values.""" + n1 = np.sum(y_true == 1) + n0 = np.sum(y_true == 0) + + if n1 == 0 or n0 == 0: + return { + "auc_a": float("nan"), "auc_b": float("nan"), + "z_statistic": float("nan"), "p_value": float("nan"), + } + + auc_a = roc_auc_score(y_true, y_scores_a) + auc_b = roc_auc_score(y_true, y_scores_b) + + pos_idx = np.where(y_true == 1)[0] + neg_idx = np.where(y_true == 0)[0] + + def placement_values( + scores: np.ndarray, + ) -> tuple[np.ndarray, np.ndarray]: + pos_scores = 
scores[pos_idx] + neg_scores = scores[neg_idx] + v10 = np.array([ + np.mean(ps > neg_scores) + 0.5 * np.mean(ps == neg_scores) + for ps in pos_scores + ]) + v01 = np.array([ + np.mean(pos_scores > ns) + 0.5 * np.mean(pos_scores == ns) + for ns in neg_scores + ]) + return v10, v01 + + v10_a, v01_a = placement_values(y_scores_a) + v10_b, v01_b = placement_values(y_scores_b) + + s10 = np.cov(np.stack([v10_a, v10_b])) + s01 = np.cov(np.stack([v01_a, v01_b])) + + if s10.ndim == 0: + s10 = np.array([[s10]]) + if s01.ndim == 0: + s01 = np.array([[s01]]) + + s = s10 / n1 + s01 / n0 + contrast = np.array([1, -1]) + var_diff = contrast @ s @ contrast + + if var_diff <= 0: + return {"auc_a": auc_a, "auc_b": auc_b, "z_statistic": 0.0, "p_value": 1.0} + + z = (auc_a - auc_b) / np.sqrt(var_diff) + p_value = 2.0 * sp_stats.norm.sf(abs(z)) + + return { + "auc_a": float(auc_a), "auc_b": float(auc_b), + "z_statistic": float(z), "p_value": float(p_value), + } + + +def holm_bonferroni( + p_values: dict[str, float], alpha: float = 0.05, +) -> dict[str, dict[str, Any]]: + """Apply Holm-Bonferroni correction to a set of p-values.""" + sorted_tests = sorted(p_values.items(), key=lambda x: x[1]) + m = len(sorted_tests) + results: dict[str, dict[str, Any]] = {} + + prev_adj = 0.0 + for rank, (name, p) in enumerate(sorted_tests, start=1): + adjusted_p = min(1.0, p * (m - rank + 1)) + adjusted_p = max(adjusted_p, prev_adj) + prev_adj = adjusted_p + results[name] = { + "p_value": p, "adjusted_p": adjusted_p, + "reject": adjusted_p <= alpha, "rank": rank, + } + + return results + + +# --------------------------------------------------------------------------- +# CV helper with LOO/stratified switching +# --------------------------------------------------------------------------- + +def get_oof_probabilities( + x: np.ndarray, + y: np.ndarray, + seed: int = 42, +) -> np.ndarray: + """Get out-of-fold probabilities using LOO or 5-fold stratified CV.""" + n_pos = y.sum() + oof = np.full(len(y), 
np.nan) + + if n_pos < 30: + # LOO-CV + loo = LeaveOneOut() + for train_idx, test_idx in loo.split(x, y): + scaler = StandardScaler() + x_train = scaler.fit_transform(x[train_idx]) + x_test = scaler.transform(x[test_idx]) + model = LogisticRegression( + class_weight="balanced", max_iter=1000, random_state=seed, + ) + model.fit(x_train, y[train_idx]) + oof[test_idx] = model.predict_proba(x_test)[:, 1] + else: + # 5-fold stratified CV + skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) + for train_idx, test_idx in skf.split(x, y): + scaler = StandardScaler() + x_train = scaler.fit_transform(x[train_idx]) + x_test = scaler.transform(x[test_idx]) + model = LogisticRegression( + class_weight="balanced", max_iter=1000, random_state=seed, + ) + model.fit(x_train, y[train_idx]) + oof[test_idx] = model.predict_proba(x_test)[:, 1] + + return oof + + +def prepare_features(df: pd.DataFrame, feature_list: list[str]) -> np.ndarray: + """Extract and log-transform feature matrix.""" + arrays = [] + for col in feature_list: + vals = df[col].fillna(0).values.astype(float) + if col in LOG_TRANSFORM: + vals = np.log1p(np.abs(vals)) * np.sign(vals) + arrays.append(vals) + return np.column_stack(arrays) + + +# --------------------------------------------------------------------------- +# LOO ablation per cutoff +# --------------------------------------------------------------------------- + +def run_ablation( + df: pd.DataFrame, features: list[str], +) -> list[dict[str, Any]]: + """Drop each feature one at a time, compare AUC via DeLong test.""" + y = (df["account_status"] == "suspended").astype(int).values + x_full = prepare_features(df, features) + oof_full = get_oof_probabilities(x_full, y) + auc_full = roc_auc_score(y, oof_full) + + results = [] + for feat in features: + ablated = [f for f in features if f != feat] + x_abl = prepare_features(df, ablated) + oof_abl = get_oof_probabilities(x_abl, y) + auc_abl = roc_auc_score(y, oof_abl) + + delong = delong_auc_test(y, 
oof_full, oof_abl) + delta = auc_full - auc_abl # positive = removing hurts + + results.append({ + "feature": feat, + "full_auc": auc_full, + "ablated_auc": auc_abl, + "delta": delta, + "p_value": delong["p_value"], + "z_stat": delong["z_statistic"], + }) + + # Holm-Bonferroni correction + p_vals = {r["feature"]: r["p_value"] for r in results} + hb = holm_bonferroni(p_vals) + + for r in results: + feat = r["feature"] + r["adjusted_p"] = hb[feat]["adjusted_p"] + # KEEP only if removing significantly HURTS AUC (delta > 0 and reject) + r["verdict"] = ( + "KEEP" if hb[feat]["reject"] and r["delta"] > 0 + else "DISPENSABLE" + ) + + return results + + +# --------------------------------------------------------------------------- +# Forward selection per cutoff +# --------------------------------------------------------------------------- + +def run_forward_selection( + df: pd.DataFrame, features: list[str], +) -> list[dict[str, Any]]: + """Greedy forward selection with DeLong stopping.""" + y = (df["account_status"] == "suspended").astype(int).values + selected: list[str] = [] + trajectory: list[dict[str, Any]] = [] + prev_oof: np.ndarray | None = None + + for step in range(len(features)): + best_feat = None + best_auc = -1.0 + best_oof = None + + for candidate in features: + if candidate in selected: + continue + trial = selected + [candidate] + x_trial = prepare_features(df, trial) + oof_trial = get_oof_probabilities(x_trial, y) + auc_trial = roc_auc_score(y, oof_trial) + + if auc_trial > best_auc: + best_auc = auc_trial + best_feat = candidate + best_oof = oof_trial + + if best_feat is None: + break + + # DeLong test vs previous step + p_value = None + if prev_oof is not None and best_oof is not None: + delong = delong_auc_test(y, best_oof, prev_oof) + p_value = delong["p_value"] + + selected.append(best_feat) + prev_oof = best_oof + + trajectory.append({ + "step": step + 1, + "feature": best_feat, + "auc": best_auc, + "p_value": p_value, + "selected": list(selected), 
+ }) + + return trajectory + + +# --------------------------------------------------------------------------- +# Aggregation and recommendation +# --------------------------------------------------------------------------- + +def aggregate_ablation( + all_results: dict[str, list[dict[str, Any]]], +) -> None: + """Print aggregated ablation results across cutoffs.""" + print("\n" + "=" * 80) + print("AGGREGATED LOO ABLATION RESULTS") + print("=" * 80) + + feature_verdicts: dict[str, list[str]] = defaultdict(list) + feature_deltas: dict[str, list[float]] = defaultdict(list) + + for cutoff, results in all_results.items(): + print(f"\n--- Cutoff: {cutoff} ---") + print(f" Full AUC: {results[0]['full_auc']:.4f}") + print(f" {'Feature':<25s} {'Ablated AUC':>11s} {'Delta':>8s} " + f"{'p-value':>10s} {'adj-p':>10s} {'Verdict'}") + for r in sorted(results, key=lambda x: x["delta"], reverse=True): + print(f" {r['feature']:<25s} {r['ablated_auc']:>11.4f} " + f"{r['delta']:>+8.4f} {r['p_value']:>10.4f} " + f"{r['adjusted_p']:>10.4f} {r['verdict']}") + feature_verdicts[r["feature"]].append(r["verdict"]) + feature_deltas[r["feature"]].append(r["delta"]) + + print("\n--- Summary across cutoffs ---") + print(f" {'Feature':<25s} {'Mean Delta':>10s} {'KEEP count':>10s} Verdict") + for feat in FEATURES: + verdicts = feature_verdicts.get(feat, []) + keep_count = sum(1 for v in verdicts if v == "KEEP") + mean_delta = np.mean(feature_deltas.get(feat, [0.0])) + overall = "KEEP" if keep_count >= 2 else "DISPENSABLE" + print(f" {feat:<25s} {mean_delta:>+10.4f} {keep_count:>10d}/{len(verdicts)} " + f"{overall}") + + +def aggregate_forward_selection( + all_trajectories: dict[str, list[dict[str, Any]]], +) -> None: + """Print aggregated forward selection results.""" + print("\n" + "=" * 80) + print("AGGREGATED FORWARD SELECTION RESULTS") + print("=" * 80) + + for cutoff, trajectory in all_trajectories.items(): + print(f"\n--- Cutoff: {cutoff} ---") + print(f" {'Step':>4s} {'Feature':<25s} 
{'AUC':>8s} {'p-value':>10s}") + for t in trajectory: + p_str = f"{t['p_value']:.4f}" if t["p_value"] is not None else "---" + print(f" {t['step']:>4d} {t['feature']:<25s} {t['auc']:>8.4f} {p_str:>10s}") + + +def refit_recommended( + con: duckdb.DuckDBPyConnection, + recommended: list[str], +) -> None: + """Refit model on all data with recommended features and print config.""" + print("\n" + "=" * 80) + print(f"REFIT WITH RECOMMENDED FEATURES: {recommended}") + print("=" * 80) + + # Use latest cutoff for refit + df = load_cutoff_data(con, "2026-01-01") + y = (df["account_status"] == "suspended").astype(int).values + x = prepare_features(df, recommended) + + # CV AUC + oof = get_oof_probabilities(x, y) + cv_auc = roc_auc_score(y, oof) + print(f"\n CV AUC (all data): {cv_auc:.4f}") + + # Final fit + scaler = StandardScaler() + x_scaled = scaler.fit_transform(x) + model = LogisticRegression( + class_weight="balanced", max_iter=1000, random_state=42, + ) + model.fit(x_scaled, y) + + # Convert to unscaled coefficients + means = scaler.mean_ + stds = scaler.scale_ + coefs_scaled = model.coef_[0] + intercept_scaled = model.intercept_[0] + coefs_raw = coefs_scaled / stds + intercept_raw = intercept_scaled - np.sum(coefs_scaled * means / stds) + + print("\n Config update:") + print(f" intercept: float = {intercept_raw:.4f}") + for feat, w in zip(recommended, coefs_raw, strict=True): + key = feat + if feat in LOG_TRANSFORM: + key = feat + print(f" {key}_weight: float = {w:.4f}") + + # Threshold analysis + logit = np.full(len(df), intercept_raw) + for feat, w in zip(recommended, coefs_raw, strict=True): + vals = df[feat].fillna(0).values.astype(float) + if feat in LOG_TRANSFORM: + vals = np.log1p(np.abs(vals)) * np.sign(vals) + logit += w * vals + probs = 1.0 / (1.0 + np.exp(-logit)) + + print(f"\n Probability range: [{probs.min():.4f}, {probs.max():.4f}]") + print(f" Mean (suspended): {probs[y == 1].mean():.4f}") + print(f" Mean (active): {probs[y == 0].mean():.4f}") + 
def main() -> None:
    """Run the full Bad Egg feature validation suite.

    Prints a population overview, runs LOO ablation and forward selection
    per temporal cutoff, compares the full 10-feature model against the
    PR 44 recommended 3-feature subset, and refits the recommended model.
    """
    con = duckdb.connect(str(DB_PATH), read_only=True)
    # Single source of truth for the PR 44 recommended subset (was
    # previously duplicated as a literal in two places).
    recommended = ["merge_rate", "median_additions", "isolation_score"]

    try:
        # Population overview
        print("=" * 80)
        print("POPULATION OVERVIEW")
        print("=" * 80)
        total_authors = con.execute("SELECT COUNT(*) FROM authors").fetchone()[0]
        checked = con.execute(
            "SELECT COUNT(*) FROM authors WHERE account_status IS NOT NULL"
        ).fetchone()[0]
        print(f"Total authors: {total_authors}, Checked: {checked}")

        status_rows = con.execute(
            "SELECT account_status, COUNT(*) FROM authors GROUP BY account_status "
            "ORDER BY COUNT(*) DESC"
        ).fetchall()
        for status, count in status_rows:
            print(f" {status}: {count}")

        susp_merged = con.execute("""
            SELECT COUNT(DISTINCT a.login)
            FROM authors a JOIN prs p ON a.login = p.author
            WHERE a.account_status = 'suspended' AND p.state = 'MERGED'
        """).fetchone()[0]
        print(f"\nSuspended authors with merged PRs: {susp_merged}")

        # Run per-cutoff analyses
        all_ablation: dict[str, list[dict[str, Any]]] = {}
        all_forward: dict[str, list[dict[str, Any]]] = {}

        for cutoff in CUTOFFS:
            df = load_cutoff_data(con, cutoff)
            n_susp = (df["account_status"] == "suspended").sum()
            # LOO is only needed when the positive class is tiny.
            cv_method = "LOO" if n_susp < 30 else "5-fold stratified"
            print(f"\n CV method: {cv_method} (n_suspended={n_susp})")

            print(f"\n Running LOO ablation for {cutoff}...")
            all_ablation[cutoff] = run_ablation(df, FEATURES)

            print(f" Running forward selection for {cutoff}...")
            all_forward[cutoff] = run_forward_selection(df, FEATURES)

        # Aggregate results across cutoffs
        aggregate_ablation(all_ablation)
        aggregate_forward_selection(all_forward)

        # Compare full 10-feature model vs recommended subset
        print("\n" + "=" * 80)
        print("FULL vs SUBSET COMPARISON")
        print("=" * 80)
        for cutoff in CUTOFFS:
            df = load_cutoff_data(con, cutoff)
            y = (df["account_status"] == "suspended").astype(int).values

            x_full = prepare_features(df, FEATURES)
            oof_full = get_oof_probabilities(x_full, y)
            auc_full = roc_auc_score(y, oof_full)

            # Try the PR 44 recommended 3-feature set
            x_3 = prepare_features(df, recommended)
            oof_3 = get_oof_probabilities(x_3, y)
            auc_3 = roc_auc_score(y, oof_3)

            delong = delong_auc_test(y, oof_full, oof_3)

            print(f"\n Cutoff {cutoff}: 10f AUC={auc_full:.4f}, "
                  f"3f AUC={auc_3:.4f}, delta={auc_full - auc_3:+.4f}, "
                  f"p={delong['p_value']:.4f}")

        # Refit with recommended features
        refit_recommended(con, recommended)
    finally:
        # Bug fix: the connection previously leaked when any step raised.
        con.close()


if __name__ == "__main__":
    main()
score_pr_author( @@ -131,6 +140,10 @@ async def run_action() -> None: _set_output("user", score.user_login) _set_output("scoring-model", score.scoring_model) _set_output("skipped", "true" if skipped else "false") + suspicion_level = "" + if score.suspicion_score is not None: + suspicion_level = score.suspicion_score.suspicion_level.value + _set_output("suspicion-level", suspicion_level) # Summary pct = score.normalized_score * 100 diff --git a/src/good_egg/cli.py b/src/good_egg/cli.py index ea0e07b..e8bce60 100644 --- a/src/good_egg/cli.py +++ b/src/good_egg/cli.py @@ -38,6 +38,12 @@ def main() -> None: default=False, help="Force full scoring even for known contributors", ) +@click.option( + "--no-bad-egg", + is_flag=True, + default=False, + help="Disable suspension advisory score", +) def score( username: str, repo: str, @@ -47,6 +53,7 @@ def score( output_json: bool, scoring_model: str | None, force_score: bool, + no_bad_egg: bool, ) -> None: """Score a GitHub user's trustworthiness relative to a repository.""" if not token: @@ -64,6 +71,10 @@ def score( config = config.model_copy(update={"scoring_model": scoring_model}) if force_score: config = config.model_copy(update={"skip_known_contributors": False}) + if no_bad_egg: + config = config.model_copy( + update={"bad_egg": config.bad_egg.model_copy(update={"enabled": False})} + ) cache = Cache(ttls=config.cache_ttl.to_seconds()) result = asyncio.run( diff --git a/src/good_egg/config.py b/src/good_egg/config.py index 0d44f8b..d998588 100644 --- a/src/good_egg/config.py +++ b/src/good_egg/config.py @@ -48,6 +48,7 @@ class CacheTTLConfig(BaseModel): repo_metadata_hours: int = 168 # 7 days user_profile_hours: int = 24 # 1 day user_prs_hours: int = 336 # 14 days + repo_contributors_hours: int = 168 # 7 days def to_seconds(self) -> dict[str, int]: """Convert TTLs to seconds for the cache layer.""" @@ -55,6 +56,7 @@ def to_seconds(self) -> dict[str, int]: "repo_metadata": self.repo_metadata_hours * 3600, "user_profile": 
class BadEggModelConfig(BaseModel):
    """Coefficients of the balanced logistic-regression suspension model.

    The eight inputs, in weight order, are: merge_rate, log1p(total_prs),
    log1p(career_span_days), mean_title_length, isolation_score,
    total_repos, log1p(median_additions), log1p(median_files_changed).

    Fitted with balanced class weights on 12,898 labeled authors
    (323 suspended / 12,575 active). CV AUC 0.643.
    NOTE(review): BAD_EGG_VALIDATION.md reports this AUC is inflated by
    never-scored zero-merged-PR accounts and that AUC is at chance level
    on the production population -- confirm before relying on it.
    """
    intercept: float = 1.8988
    merge_rate_weight: float = -0.1699
    total_prs_weight: float = 0.0845
    career_span_days_weight: float = -0.3109
    mean_title_length_weight: float = -0.0156
    isolation_score_weight: float = -0.9883
    total_repos_weight: float = 0.0365
    median_additions_weight: float = -0.1200
    median_files_changed_weight: float = 0.1413


class BadEggThresholds(BaseModel):
    """Probability cut-offs mapping model output onto advisory tiers.

    probability >= high      -> HIGH
    probability >= elevated  -> ELEVATED
    otherwise                -> NORMAL
    """
    high: float = 0.75
    elevated: float = 0.65


class BadEggConfig(BaseModel):
    """Settings for the Bad Egg suspension advisory score."""
    # Master switch for computing the advisory at all.
    enabled: bool = True
    model: BadEggModelConfig = Field(default_factory=BadEggModelConfig)
    thresholds: BadEggThresholds = Field(default_factory=BadEggThresholds)
    # Contributor-graph fetch limits used by the isolation feature.
    max_contributors_per_repo: int = 30
    skip_popular_repo_stars: int = 5000
    max_contributor_fetch_concurrency: int = 5
lines.append( + f"| Risk Level | **{score.suspicion_score.suspicion_level.value}** |" + ) + lines.append(f"| Probability | {pct_s:.0f}% |") + lines.append("") + lines.append( + "> Advisory signal based on behavioral patterns." + " Does not confirm malicious intent." + ) + lines.append("") + # Low trust note if score.trust_level == TrustLevel.LOW: lines.append("> **First-time contributor -- review manually**") @@ -170,6 +191,21 @@ def format_cli_output(score: TrustScore, verbose: bool = False) -> str: f"Context: {score.context_repo}", ] + # Suspension advisory line + if score.suspicion_score is not None: + s_level = score.suspicion_score.suspicion_level + s_pct = score.suspicion_score.probability * 100 + if s_level == SuspicionLevel.HIGH: + lines.append( + click.style(f"Suspicion: HIGH ({s_pct:.0f}%)", fg="red") + ) + elif s_level == SuspicionLevel.ELEVATED: + lines.append( + click.style(f"Suspicion: ELEVATED ({s_pct:.0f}%)", fg="yellow") + ) + elif verbose: + lines.append(f"Suspicion: NORMAL ({s_pct:.0f}%)") + if verbose: lines.append("") lines.append( @@ -290,5 +326,17 @@ def format_check_run_summary(score: TrustScore) -> tuple[str, str]: f" (< {score.fresh_account.threshold_days} days)" ) + # Suspension advisory + if ( + score.suspicion_score is not None + and score.suspicion_score.suspicion_level != SuspicionLevel.NORMAL + ): + s_pct = score.suspicion_score.probability * 100 + summary_lines.append("") + summary_lines.append( + f"**Suspension Advisory:** {score.suspicion_score.suspicion_level.value}" + f" ({s_pct:.0f}%)" + ) + summary = "\n".join(summary_lines) return title, summary diff --git a/src/good_egg/github_client.py b/src/good_egg/github_client.py index a853e0a..7d813b9 100644 --- a/src/good_egg/github_client.py +++ b/src/good_egg/github_client.py @@ -532,11 +532,27 @@ async def get_user_contribution_data( ctx_meta = await self.fetch_repo_metadata_batch([context_repo]) contributed_repos.update(ctx_meta) + # Step 6: fetch repo contributors for Bad Egg 
isolation score + repo_contributors: dict[str, list[str]] = {} + if self._config.bad_egg.enabled and prs: + bad_egg_cfg = self._config.bad_egg + repos_to_fetch = [ + name for name, meta in contributed_repos.items() + if meta.stargazer_count < bad_egg_cfg.skip_popular_repo_stars + ] + if repos_to_fetch: + repo_contributors = await self.fetch_repo_contributors_batch( + repos_to_fetch, + max_per_repo=bad_egg_cfg.max_contributors_per_repo, + concurrency=bad_egg_cfg.max_contributor_fetch_concurrency, + ) + contrib_data = UserContributionData( profile=profile, merged_prs=prs, contributed_repos=contributed_repos, closed_pr_count=closed_pr_count, + repo_contributors=repo_contributors, ) # Cache the full contribution data @@ -549,6 +565,59 @@ async def get_user_contribution_data( return contrib_data + async def fetch_repo_contributors_batch( + self, + repos: list[str], + max_per_repo: int = 30, + concurrency: int = 5, + ) -> dict[str, list[str]]: + """Fetch top contributor logins for multiple repos. + + Uses REST GET /repos/{owner}/{name}/contributors with concurrency + limiting and per-repo caching. 
+ """ + result: dict[str, list[str]] = {} + to_fetch: list[str] = [] + + for repo_name in repos: + if self._cache is not None: + cached = self._cache.get(f"repo_contributors:{repo_name}") + if cached is not None: + result[repo_name] = cached + continue + to_fetch.append(repo_name) + + if not to_fetch: + return result + + sem = asyncio.Semaphore(concurrency) + + async def fetch_one(repo_name: str) -> tuple[str, list[str]]: + async with sem: + try: + resp = await self._rest_request_with_retry( + "GET", + f"/repos/{repo_name}/contributors", + params={"per_page": max_per_repo, "anon": "false"}, + ) + logins = [c["login"] for c in resp.json() if "login" in c] + except (httpx.HTTPStatusError, GitHubAPIError): + logins = [] + if self._cache is not None: + self._cache.set( + f"repo_contributors:{repo_name}", + logins, + "repo_contributors", + ) + return repo_name, logins + + tasks = [fetch_one(name) for name in to_fetch] + for coro in asyncio.as_completed(tasks): + name, logins = await coro + result[name] = logins + + return result + async def check_existing_contributor( self, login: str, repo_owner: str, repo_name: str ) -> int: diff --git a/src/good_egg/mcp_server.py b/src/good_egg/mcp_server.py index 7764aff..185f616 100644 --- a/src/good_egg/mcp_server.py +++ b/src/good_egg/mcp_server.py @@ -151,6 +151,13 @@ async def check_pr_author( } if result.component_scores: summary["component_scores"] = result.component_scores + if result.suspicion_score is not None: + summary["suspicion_level"] = ( + result.suspicion_score.suspicion_level.value + ) + summary["suspicion_probability"] = ( + result.suspicion_score.probability + ) return json.dumps(summary) except Exception as exc: return _error_json(str(exc)) diff --git a/src/good_egg/models.py b/src/good_egg/models.py index f4175a1..582dd30 100644 --- a/src/good_egg/models.py +++ b/src/good_egg/models.py @@ -19,6 +19,21 @@ class TrustLevel(StrEnum): EXISTING_CONTRIBUTOR = "EXISTING_CONTRIBUTOR" +class SuspicionLevel(StrEnum): + 
"""Suspension advisory risk levels.""" + HIGH = "HIGH" + ELEVATED = "ELEVATED" + NORMAL = "NORMAL" + + +class SuspicionScore(BaseModel): + """Advisory suspension risk score.""" + raw_score: float = 0.0 + probability: float = 0.0 + suspicion_level: SuspicionLevel = SuspicionLevel.NORMAL + component_scores: dict[str, float] = {} + + class UserProfile(BaseModel): """GitHub user profile data.""" login: str @@ -67,6 +82,7 @@ class UserContributionData(BaseModel): merged_prs: list[MergedPR] = [] contributed_repos: dict[str, RepoMetadata] = {} closed_pr_count: int = 0 + repo_contributors: dict[str, list[str]] = {} class ContributionSummary(BaseModel): @@ -103,3 +119,4 @@ class TrustScore(BaseModel): scoring_model: str = "v1" component_scores: dict[str, float] = {} fresh_account: FreshAccountAdvisory | None = None + suspicion_score: SuspicionScore | None = None diff --git a/src/good_egg/scorer.py b/src/good_egg/scorer.py index e61ad65..b40a982 100644 --- a/src/good_egg/scorer.py +++ b/src/good_egg/scorer.py @@ -13,6 +13,8 @@ from good_egg.models import ( ContributionSummary, FreshAccountAdvisory, + SuspicionLevel, + SuspicionScore, TrustLevel, TrustScore, UserContributionData, @@ -123,7 +125,7 @@ def _score_v1( top_contributions = self._build_top_contributions(user_data) language_match = self._check_language_match(user_data, context_language) - return TrustScore( + result = TrustScore( user_login=login, context_repo=context_repo, raw_score=raw_score, @@ -143,6 +145,11 @@ def _score_v1( fresh_account=fresh_account, ) + if self.config.bad_egg.enabled and user_data.merged_prs: + result.suspicion_score = self._compute_suspicion_score(user_data) + + return result + # ------------------------------------------------------------------ # v2 scoring path (Better Egg) # ------------------------------------------------------------------ @@ -215,7 +222,7 @@ def _score_v2( component_scores["merge_rate"] = merge_rate component_scores["log_account_age"] = log_account_age - return 
TrustScore( + result = TrustScore( user_login=login, context_repo=context_repo, raw_score=logit, @@ -237,6 +244,11 @@ def _score_v2( fresh_account=fresh_account, ) + if self.config.bad_egg.enabled and user_data.merged_prs: + result.suspicion_score = self._compute_suspicion_score(user_data) + + return result + # ------------------------------------------------------------------ # v3 scoring path (Diet Egg) # ------------------------------------------------------------------ @@ -302,6 +314,153 @@ def _build_fresh_account_advisory( created_at=user_data.profile.created_at, ) + # ------------------------------------------------------------------ + # Bad Egg: suspension advisory score + # ------------------------------------------------------------------ + + def _compute_suspicion_score( + self, user_data: UserContributionData, + ) -> SuspicionScore: + """Compute advisory suspension risk score using 8-feature LR model.""" + cfg = self.config.bad_egg + + # Feature 1: merge_rate + merged_count = len(user_data.merged_prs) + closed_count = user_data.closed_pr_count + total_prs = merged_count + closed_count + merge_rate = merged_count / total_prs if total_prs > 0 else 0.0 + + # Feature 2: total_prs (log-transformed) + log_total_prs = math.log1p(total_prs) + + # Feature 3: career_span_days (log-transformed) + if len(user_data.merged_prs) >= 2: + dates = [pr.merged_at for pr in user_data.merged_prs] + span = (max(dates) - min(dates)).total_seconds() / 86400.0 + else: + span = 0.0 + log_career_span = math.log1p(span) + + # Feature 4: mean_title_length + if user_data.merged_prs: + mean_title_len = sum( + len(pr.title) for pr in user_data.merged_prs + ) / len(user_data.merged_prs) + else: + mean_title_len = 0.0 + + # Feature 5: isolation_score (from bipartite contributor graph) + isolation_score = self._compute_isolation_score(user_data) + + # Feature 6: total_repos + total_repos = len({pr.repo_name_with_owner for pr in user_data.merged_prs}) + + # Feature 7: median_additions 
(log-transformed) + additions = sorted(pr.additions for pr in user_data.merged_prs) + n = len(additions) + if n > 0: + mid = n // 2 + median_adds = ( + (additions[mid - 1] + additions[mid]) / 2.0 + if n % 2 == 0 + else float(additions[mid]) + ) + else: + median_adds = 0.0 + log_median_additions = math.log1p(median_adds) + + # Feature 8: median_files_changed (log-transformed) + files = sorted(pr.changed_files for pr in user_data.merged_prs) + n_f = len(files) + if n_f > 0: + mid_f = n_f // 2 + median_files = ( + (files[mid_f - 1] + files[mid_f]) / 2.0 + if n_f % 2 == 0 + else float(files[mid_f]) + ) + else: + median_files = 0.0 + log_median_files = math.log1p(median_files) + + # Logistic regression + m = cfg.model + logit = ( + m.intercept + + m.merge_rate_weight * merge_rate + + m.total_prs_weight * log_total_prs + + m.career_span_days_weight * log_career_span + + m.mean_title_length_weight * mean_title_len + + m.isolation_score_weight * isolation_score + + m.total_repos_weight * total_repos + + m.median_additions_weight * log_median_additions + + m.median_files_changed_weight * log_median_files + ) + probability = 1.0 / (1.0 + math.exp(-logit)) + + # Classify tier + t = cfg.thresholds + if probability >= t.high: + level = SuspicionLevel.HIGH + elif probability >= t.elevated: + level = SuspicionLevel.ELEVATED + else: + level = SuspicionLevel.NORMAL + + return SuspicionScore( + raw_score=logit, + probability=probability, + suspicion_level=level, + component_scores={ + "merge_rate": merge_rate, + "log_total_prs": log_total_prs, + "log_career_span_days": log_career_span, + "mean_title_length": mean_title_len, + "isolation_score": isolation_score, + "total_repos": float(total_repos), + "log_median_additions": log_median_additions, + "log_median_files_changed": log_median_files, + }, + ) + + @staticmethod + def _compute_isolation_score(user_data: UserContributionData) -> float: + """Fraction of author's repos where no other multi-repo contributor works.""" + login = 
user_data.profile.login.lower() + user_repos = set(user_data.repo_contributors.keys()) + + if not user_repos: + return 1.0 + + # Build lookup: contributor -> set of repos they appear in + contributor_repos: dict[str, set[str]] = defaultdict(set) + for repo, contributors in user_data.repo_contributors.items(): + for c in contributors: + if c.lower() != login: + contributor_repos[c.lower()].add(repo) + + # Multi-repo contributors: appear in 2+ of this author's repos + multi_repo_contributors = { + c for c, repos in contributor_repos.items() if len(repos) >= 2 + } + + # A repo is isolated if none of its contributors are multi-repo + isolated = 0 + for repo in user_repos: + repo_contribs = { + c.lower() for c in user_data.repo_contributors.get(repo, []) + if c.lower() != login + } + if not repo_contribs & multi_repo_contributors: + isolated += 1 + + # Also count repos where we skipped fetching (popular repos) as non-isolated + all_contributed = set(user_data.contributed_repos.keys()) + skipped_repos = all_contributed - user_repos + total = len(user_repos) + len(skipped_repos) + + return isolated / total if total > 0 else 1.0 + # ------------------------------------------------------------------ # Internal helpers # ------------------------------------------------------------------ diff --git a/tests/conftest.py b/tests/conftest.py index 96c27c5..3f84aae 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,6 +10,8 @@ ContributionSummary, MergedPR, RepoMetadata, + SuspicionLevel, + SuspicionScore, TrustLevel, TrustScore, UserContributionData, @@ -215,6 +217,77 @@ def sample_v2_trust_score() -> TrustScore: ) +@pytest.fixture +def sample_suspicion_score_high() -> SuspicionScore: + return SuspicionScore( + raw_score=1.5, + probability=0.82, + suspicion_level=SuspicionLevel.HIGH, + component_scores={ + "merge_rate": 0.5, + "log_total_prs": 2.0, + "log_career_span_days": 2.0, + "mean_title_length": 12.0, + "isolation_score": 0.8, + "total_repos": 2.0, + 
"log_median_additions": 3.0, + "log_median_files_changed": 1.5, + }, + ) + + +@pytest.fixture +def sample_suspicion_score_normal() -> SuspicionScore: + return SuspicionScore( + raw_score=-1.0, + probability=0.27, + suspicion_level=SuspicionLevel.NORMAL, + component_scores={ + "merge_rate": 0.95, + "log_total_prs": 4.0, + "log_career_span_days": 6.5, + "mean_title_length": 45.0, + "isolation_score": 0.2, + "total_repos": 5.0, + "log_median_additions": 4.5, + "log_median_files_changed": 2.0, + }, + ) + + +@pytest.fixture +def sample_trust_score_with_suspicion( + sample_suspicion_score_high: SuspicionScore, +) -> TrustScore: + return TrustScore( + user_login="testuser", + context_repo="my-org/my-elixir-app", + raw_score=0.0045, + normalized_score=0.72, + trust_level=TrustLevel.HIGH, + account_age_days=1800, + total_merged_prs=3, + unique_repos_contributed=3, + top_contributions=[ + ContributionSummary( + repo_name="elixir-lang/elixir", + pr_count=1, + language="Elixir", + stars=23000, + ), + ], + language_match=True, + flags={ + "is_bot": False, + "is_new_account": False, + "has_insufficient_data": False, + "used_cached_data": False, + }, + scoring_metadata={"graph_nodes": 7, "graph_edges": 6}, + suspicion_score=sample_suspicion_score_high, + ) + + @pytest.fixture def sample_v2_contribution_data( sample_user_profile: UserProfile, diff --git a/tests/test_action.py b/tests/test_action.py index d936977..fb21e19 100644 --- a/tests/test_action.py +++ b/tests/test_action.py @@ -9,7 +9,7 @@ import pytest from good_egg.action import _set_output, run_action -from good_egg.models import TrustLevel, TrustScore +from good_egg.models import SuspicionLevel, SuspicionScore, TrustLevel, TrustScore @pytest.fixture @@ -318,3 +318,89 @@ async def test_respects_skip_known_contributors_false(self, mock_env, tmp_path): output_content = output_file.read_text() assert "skipped=false" in output_content + + +class TestBadEggAction: + @pytest.mark.asyncio + async def 
test_bad_egg_input_disables_scoring(self, mock_env, tmp_path): + """INPUT_BAD_EGG=false should disable bad egg scoring.""" + output_file = tmp_path / "output.txt" + output_file.touch() + mock_env_no_bad_egg = { + **mock_env, + "INPUT_BAD_EGG": "false", + "GITHUB_OUTPUT": str(output_file), + } + mock_score = _make_mock_score() + mock_client = AsyncMock() + mock_client.find_existing_comment = AsyncMock(return_value=None) + mock_client.post_pr_comment = AsyncMock(return_value={}) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + + with patch.dict(os.environ, mock_env_no_bad_egg, clear=False), \ + patch("good_egg.action.GitHubClient", return_value=mock_client), \ + patch("good_egg.action.score_pr_author", new_callable=AsyncMock, + return_value=mock_score) as mock_score_fn: + await run_action() + + # Verify config has bad_egg disabled + call_kwargs = mock_score_fn.call_args + config_passed = call_kwargs.kwargs.get("config") + assert config_passed.bad_egg.enabled is False + + output_content = output_file.read_text() + assert "suspicion-level=" in output_content + + @pytest.mark.asyncio + async def test_suspicion_level_output_set(self, mock_env, tmp_path): + """suspicion-level output should be set when suspicion_score is present.""" + output_file = tmp_path / "output.txt" + output_file.touch() + mock_env["GITHUB_OUTPUT"] = str(output_file) + + ss = SuspicionScore( + raw_score=1.0, + probability=0.12, + suspicion_level=SuspicionLevel.HIGH, + ) + mock_score = TrustScore( + **{**_make_mock_score().model_dump(), "suspicion_score": ss} + ) + mock_client = AsyncMock() + mock_client.find_existing_comment = AsyncMock(return_value=None) + mock_client.post_pr_comment = AsyncMock(return_value={}) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + + with patch.dict(os.environ, mock_env, clear=False), \ + patch("good_egg.action.GitHubClient", 
return_value=mock_client), \ + patch("good_egg.action.score_pr_author", new_callable=AsyncMock, + return_value=mock_score): + await run_action() + + output_content = output_file.read_text() + assert "suspicion-level=HIGH" in output_content + + @pytest.mark.asyncio + async def test_suspicion_level_empty_when_no_score(self, mock_env, tmp_path): + """suspicion-level output should be empty when no suspicion score.""" + output_file = tmp_path / "output.txt" + output_file.touch() + mock_env["GITHUB_OUTPUT"] = str(output_file) + + mock_score = _make_mock_score() + mock_client = AsyncMock() + mock_client.find_existing_comment = AsyncMock(return_value=None) + mock_client.post_pr_comment = AsyncMock(return_value={}) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + + with patch.dict(os.environ, mock_env, clear=False), \ + patch("good_egg.action.GitHubClient", return_value=mock_client), \ + patch("good_egg.action.score_pr_author", new_callable=AsyncMock, + return_value=mock_score): + await run_action() + + output_content = output_file.read_text() + assert "suspicion-level=\n" in output_content diff --git a/tests/test_cli.py b/tests/test_cli.py index 7c481e6..e546d71 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -141,6 +141,29 @@ def test_force_score_disables_skip( assert config_passed.skip_known_contributors is False + @patch("good_egg.cli.score_pr_author", new_callable=AsyncMock) + @patch("good_egg.cli.load_config") + def test_no_bad_egg_disables_suspicion( + self, mock_load_config: MagicMock, mock_score: AsyncMock + ) -> None: + """--no-bad-egg should disable the bad egg scoring.""" + from good_egg.config import GoodEggConfig + mock_load_config.return_value = GoodEggConfig() + trust_score = _make_trust_score() + mock_score.return_value = trust_score + + runner = CliRunner(env={"GITHUB_TOKEN": "ghp_fake123"}) + result = runner.invoke( + main, + ["score", "testuser", "--repo", "owner/repo", 
"--no-bad-egg"], + ) + assert result.exit_code == 0 + # Verify the config passed to score_pr_author has bad_egg disabled + call_kwargs = mock_score.call_args + config_passed = call_kwargs.kwargs.get("config") or call_kwargs[1].get("config") + assert config_passed.bad_egg.enabled is False + + class TestCacheCommands: @patch("good_egg.cli.Cache") def test_cache_stats(self, mock_cache_cls: MagicMock) -> None: diff --git a/tests/test_formatter.py b/tests/test_formatter.py index c8c0b8e..1e1fbf8 100644 --- a/tests/test_formatter.py +++ b/tests/test_formatter.py @@ -14,6 +14,8 @@ from good_egg.models import ( ContributionSummary, FreshAccountAdvisory, + SuspicionLevel, + SuspicionScore, TrustLevel, TrustScore, ) @@ -486,3 +488,84 @@ def test_json_fresh_account_none(self) -> None: result = format_json(score) parsed = json.loads(result) assert parsed["fresh_account"] is None + + +class TestSuspicionAdvisoryFormatting: + def _make_score_with_suspicion( + self, level: SuspicionLevel, probability: float = 0.12 + ) -> TrustScore: + ss = SuspicionScore( + raw_score=1.0, + probability=probability, + suspicion_level=level, + component_scores={ + "merge_rate": 0.5, + "log_median_additions": 3.0, + "isolation_score": 0.8, + }, + ) + return _make_score(suspicion_score=ss) + + def test_markdown_shows_advisory_for_high(self) -> None: + score = self._make_score_with_suspicion(SuspicionLevel.HIGH, 0.12) + md = format_markdown_comment(score) + assert "Suspension Advisory" in md + assert "**HIGH**" in md + assert "12%" in md + assert "Does not confirm malicious intent" in md + + def test_markdown_shows_advisory_for_elevated(self) -> None: + score = self._make_score_with_suspicion(SuspicionLevel.ELEVATED, 0.07) + md = format_markdown_comment(score) + assert "Suspension Advisory" in md + assert "**ELEVATED**" in md + assert "7%" in md + + def test_markdown_omits_advisory_for_normal(self) -> None: + score = self._make_score_with_suspicion(SuspicionLevel.NORMAL, 0.02) + md = 
format_markdown_comment(score) + assert "Suspension Advisory" not in md + + def test_markdown_omits_advisory_when_none(self) -> None: + score = _make_score() + md = format_markdown_comment(score) + assert "Suspension Advisory" not in md + + def test_cli_shows_suspicion_high(self) -> None: + score = self._make_score_with_suspicion(SuspicionLevel.HIGH, 0.12) + out = format_cli_output(score) + assert "Suspicion: HIGH (12%)" in out + + def test_cli_shows_suspicion_elevated(self) -> None: + score = self._make_score_with_suspicion(SuspicionLevel.ELEVATED, 0.07) + out = format_cli_output(score) + assert "Suspicion: ELEVATED (7%)" in out + + def test_cli_hides_normal_non_verbose(self) -> None: + score = self._make_score_with_suspicion(SuspicionLevel.NORMAL, 0.02) + out = format_cli_output(score, verbose=False) + assert "Suspicion" not in out + + def test_cli_shows_normal_verbose(self) -> None: + score = self._make_score_with_suspicion(SuspicionLevel.NORMAL, 0.02) + out = format_cli_output(score, verbose=True) + assert "Suspicion: NORMAL (2%)" in out + + def test_check_run_includes_advisory_for_high(self) -> None: + score = self._make_score_with_suspicion(SuspicionLevel.HIGH, 0.12) + _, summary = format_check_run_summary(score) + assert "Suspension Advisory" in summary + assert "HIGH" in summary + + def test_check_run_omits_advisory_for_normal(self) -> None: + score = self._make_score_with_suspicion(SuspicionLevel.NORMAL, 0.02) + _, summary = format_check_run_summary(score) + assert "Suspension Advisory" not in summary + + def test_json_includes_suspicion_score(self) -> None: + score = self._make_score_with_suspicion(SuspicionLevel.HIGH, 0.12) + result = format_json(score) + parsed = json.loads(result) + assert parsed["suspicion_score"] is not None + assert parsed["suspicion_score"]["suspicion_level"] == "HIGH" + assert parsed["suspicion_score"]["probability"] == 0.12 diff --git a/tests/test_github_client.py b/tests/test_github_client.py index d48c4f8..9c6ef1b 100644 --- 
# ---------------------------------------------------------------------------
# fetch_repo_contributors_batch
# ---------------------------------------------------------------------------


class TestFetchRepoContributorsBatch:
    @respx.mock
    async def test_fetches_contributors(self) -> None:
        """Contributor logins come back from the REST endpoint."""
        respx.get(f"{BASE_URL}/repos/org/repo-a/contributors").mock(
            return_value=httpx.Response(
                200,
                json=[
                    {"login": "alice", "contributions": 50},
                    {"login": "bob", "contributions": 30},
                ],
            )
        )

        async with _make_client() as client:
            fetched = await client.fetch_repo_contributors_batch(["org/repo-a"])

        assert fetched == {"org/repo-a": ["alice", "bob"]}

    @respx.mock
    async def test_handles_error_gracefully(self) -> None:
        """A repo whose fetch errors maps to an empty list."""
        respx.get(f"{BASE_URL}/repos/org/private-repo/contributors").mock(
            return_value=httpx.Response(404, json={"message": "Not Found"})
        )

        async with _make_client() as client:
            fetched = await client.fetch_repo_contributors_batch(
                ["org/private-repo"]
            )

        assert fetched == {"org/private-repo": []}

    @respx.mock
    async def test_multiple_repos(self) -> None:
        """Several repos are fetched concurrently and keyed correctly."""
        respx.get(f"{BASE_URL}/repos/org/repo-a/contributors").mock(
            return_value=httpx.Response(200, json=[{"login": "alice"}])
        )
        respx.get(f"{BASE_URL}/repos/org/repo-b/contributors").mock(
            return_value=httpx.Response(
                200, json=[{"login": "bob"}, {"login": "charlie"}]
            )
        )

        async with _make_client() as client:
            fetched = await client.fetch_repo_contributors_batch(
                ["org/repo-a", "org/repo-b"]
            )

        assert fetched["org/repo-a"] == ["alice"]
        assert fetched["org/repo-b"] == ["bob", "charlie"]

    @respx.mock
    async def test_uses_cache(self, tmp_path) -> None:
        """Cached contributor lists are returned without hitting the API."""
        cache = Cache(db_path=tmp_path / "cache.db")
        cache.set(
            "repo_contributors:org/cached-repo", ["cached-user"], "repo_contributors"
        )

        # No respx route registered -- a network call would fail loudly.
        async with _make_client(cache=cache) as client:
            fetched = await client.fetch_repo_contributors_batch(
                ["org/cached-repo"]
            )

        assert fetched == {"org/cached-repo": ["cached-user"]}
        cache.close()

    @respx.mock
    async def test_caches_fetched_data(self, tmp_path) -> None:
        """Freshly fetched contributor data lands in the cache."""
        cache = Cache(db_path=tmp_path / "cache.db")
        respx.get(f"{BASE_URL}/repos/org/new-repo/contributors").mock(
            return_value=httpx.Response(200, json=[{"login": "fresh-user"}])
        )

        async with _make_client(cache=cache) as client:
            await client.fetch_repo_contributors_batch(["org/new-repo"])

        assert cache.get("repo_contributors:org/new-repo") == ["fresh-user"]
        cache.close()
SuspicionLevel.HIGH + def test_top_contributions(self, sample_trust_score: TrustScore) -> None: assert len(sample_trust_score.top_contributions) == 2 assert sample_trust_score.top_contributions[0].repo_name == "elixir-lang/elixir" @@ -212,3 +230,64 @@ def test_serialization_roundtrip(self) -> None: restored = FreshAccountAdvisory(**data) assert restored.is_fresh is True assert restored.account_age_days == 200 + + +class TestSuspicionLevel: + def test_values(self) -> None: + assert SuspicionLevel.HIGH.value == "HIGH" + assert SuspicionLevel.ELEVATED.value == "ELEVATED" + assert SuspicionLevel.NORMAL.value == "NORMAL" + + def test_all_members(self) -> None: + assert len(SuspicionLevel) == 3 + + +class TestSuspicionScore: + def test_defaults(self) -> None: + ss = SuspicionScore() + assert ss.raw_score == 0.0 + assert ss.probability == 0.0 + assert ss.suspicion_level == SuspicionLevel.NORMAL + assert ss.component_scores == {} + + def test_creation(self) -> None: + ss = SuspicionScore( + raw_score=1.5, + probability=0.12, + suspicion_level=SuspicionLevel.HIGH, + component_scores={"merge_rate": 0.95, "isolation_score": 0.8}, + ) + assert ss.raw_score == 1.5 + assert ss.probability == 0.12 + assert ss.suspicion_level == SuspicionLevel.HIGH + assert ss.component_scores["merge_rate"] == 0.95 + + def test_serialization_roundtrip(self) -> None: + import json + ss = SuspicionScore( + raw_score=1.0, + probability=0.08, + suspicion_level=SuspicionLevel.ELEVATED, + component_scores={"merge_rate": 0.7}, + ) + data = json.loads(ss.model_dump_json()) + restored = SuspicionScore(**data) + assert restored.suspicion_level == SuspicionLevel.ELEVATED + assert restored.probability == 0.08 + + +class TestUserContributionDataRepoContributors: + def test_repo_contributors_default_empty( + self, sample_user_profile: UserProfile + ) -> None: + data = UserContributionData(profile=sample_user_profile) + assert data.repo_contributors == {} + + def test_repo_contributors_set( + self, 
sample_user_profile: UserProfile + ) -> None: + data = UserContributionData( + profile=sample_user_profile, + repo_contributors={"org/repo": ["alice", "bob"]}, + ) + assert data.repo_contributors["org/repo"] == ["alice", "bob"] diff --git a/tests/test_scorer.py b/tests/test_scorer.py index ec8005e..14ff317 100644 --- a/tests/test_scorer.py +++ b/tests/test_scorer.py @@ -8,10 +8,16 @@ import pytest -from good_egg.config import GoodEggConfig +from good_egg.config import ( + BadEggConfig, + BadEggModelConfig, + BadEggThresholds, + GoodEggConfig, +) from good_egg.models import ( MergedPR, RepoMetadata, + SuspicionLevel, TrustLevel, UserContributionData, UserProfile, @@ -46,12 +52,14 @@ def _make_contribution_data( merged_prs: list[MergedPR] | None = None, repos: dict[str, RepoMetadata] | None = None, closed_pr_count: int = 0, + repo_contributors: dict[str, list[str]] | None = None, ) -> UserContributionData: return UserContributionData( profile=_make_profile(login=login, is_bot=is_bot, days_old=days_old), merged_prs=merged_prs or [], contributed_repos=repos or {}, closed_pr_count=closed_pr_count, + repo_contributors=repo_contributors or {}, ) @@ -880,6 +888,221 @@ def test_v3_zero_total_prs(self) -> None: assert result.normalized_score == 1.0 +class TestSuspicionScoring: + def test_suspicion_score_computed_when_enabled(self) -> None: + scorer = TrustScorer(_make_config()) + prs, repos = _sample_prs_and_repos() + data = _make_contribution_data(merged_prs=prs, repos=repos) + result = scorer.score(data, "org/repo") + + assert result.suspicion_score is not None + assert result.suspicion_score.probability >= 0.0 + assert result.suspicion_score.probability <= 1.0 + + def test_suspicion_score_none_when_disabled(self) -> None: + config = GoodEggConfig(bad_egg=BadEggConfig(enabled=False)) + scorer = TrustScorer(config) + prs, repos = _sample_prs_and_repos() + data = _make_contribution_data(merged_prs=prs, repos=repos) + result = scorer.score(data, "org/repo") + + assert 
result.suspicion_score is None + + def test_suspicion_score_none_for_bot(self) -> None: + scorer = TrustScorer(_make_config()) + data = _make_contribution_data(is_bot=True) + result = scorer.score(data, "org/repo") + + assert result.suspicion_score is None + + def test_suspicion_score_none_for_no_prs(self) -> None: + scorer = TrustScorer(_make_config()) + data = _make_contribution_data(merged_prs=[]) + result = scorer.score(data, "org/repo") + + assert result.suspicion_score is None + + def test_suspicion_score_with_v2(self) -> None: + config = GoodEggConfig(scoring_model="v2") + scorer = TrustScorer(config) + prs, repos = _sample_prs_and_repos() + data = _make_contribution_data(merged_prs=prs, repos=repos, closed_pr_count=5) + result = scorer.score(data, "org/repo") + + assert result.suspicion_score is not None + assert result.scoring_model == "v2" + + def test_tier_classification_high(self) -> None: + zero_model = BadEggModelConfig( + intercept=2.0, + merge_rate_weight=0.0, + total_prs_weight=0.0, + career_span_days_weight=0.0, + mean_title_length_weight=0.0, + isolation_score_weight=0.0, + total_repos_weight=0.0, + median_additions_weight=0.0, + median_files_changed_weight=0.0, + ) + config = GoodEggConfig( + bad_egg=BadEggConfig( + model=zero_model, + thresholds=BadEggThresholds(high=0.10, elevated=0.05), + ) + ) + scorer = TrustScorer(config) + prs, repos = _sample_prs_and_repos() + data = _make_contribution_data(merged_prs=prs, repos=repos) + result = scorer.score(data, "org/repo") + + # With intercept=2.0 and all other weights=0, sigmoid(2.0) ~ 0.88 + assert result.suspicion_score is not None + assert result.suspicion_score.suspicion_level == SuspicionLevel.HIGH + + def test_tier_classification_elevated(self) -> None: + zero_model = BadEggModelConfig( + intercept=-2.9, + merge_rate_weight=0.0, + total_prs_weight=0.0, + career_span_days_weight=0.0, + mean_title_length_weight=0.0, + isolation_score_weight=0.0, + total_repos_weight=0.0, + 
median_additions_weight=0.0, + median_files_changed_weight=0.0, + ) + config = GoodEggConfig( + bad_egg=BadEggConfig( + model=zero_model, + thresholds=BadEggThresholds(high=0.10, elevated=0.05), + ) + ) + scorer = TrustScorer(config) + prs, repos = _sample_prs_and_repos() + data = _make_contribution_data(merged_prs=prs, repos=repos) + result = scorer.score(data, "org/repo") + + # sigmoid(-2.9) ~ 0.052, between 0.05 and 0.10 + assert result.suspicion_score is not None + assert result.suspicion_score.suspicion_level == SuspicionLevel.ELEVATED + + def test_tier_classification_normal(self) -> None: + zero_model = BadEggModelConfig( + intercept=-5.0, + merge_rate_weight=0.0, + total_prs_weight=0.0, + career_span_days_weight=0.0, + mean_title_length_weight=0.0, + isolation_score_weight=0.0, + total_repos_weight=0.0, + median_additions_weight=0.0, + median_files_changed_weight=0.0, + ) + config = GoodEggConfig( + bad_egg=BadEggConfig( + model=zero_model, + thresholds=BadEggThresholds(high=0.10, elevated=0.05), + ) + ) + scorer = TrustScorer(config) + prs, repos = _sample_prs_and_repos() + data = _make_contribution_data(merged_prs=prs, repos=repos) + result = scorer.score(data, "org/repo") + + # sigmoid(-5.0) ~ 0.0067, below 0.05 + assert result.suspicion_score is not None + assert result.suspicion_score.suspicion_level == SuspicionLevel.NORMAL + + def test_merge_rate_computation(self) -> None: + scorer = TrustScorer(_make_config()) + prs, repos = _sample_prs_and_repos() + data = _make_contribution_data( + merged_prs=prs, repos=repos, closed_pr_count=3, + ) + result = scorer.score(data, "org/repo") + + assert result.suspicion_score is not None + expected = len(prs) / (len(prs) + 3) + assert ( + abs(result.suspicion_score.component_scores["merge_rate"] - expected) + < 1e-9 + ) + + def test_median_additions_odd(self) -> None: + scorer = TrustScorer(_make_config()) + prs = [ + MergedPR( + repo_name_with_owner="org/repo", + title="PR", + merged_at=datetime(2024, 1, 1, 
tzinfo=UTC), + additions=10, + ), + MergedPR( + repo_name_with_owner="org/repo", + title="PR", + merged_at=datetime(2024, 2, 1, tzinfo=UTC), + additions=30, + ), + MergedPR( + repo_name_with_owner="org/repo", + title="PR", + merged_at=datetime(2024, 3, 1, tzinfo=UTC), + additions=50, + ), + ] + repos = { + "org/repo": RepoMetadata( + name_with_owner="org/repo", stargazer_count=100, + ), + } + data = _make_contribution_data(merged_prs=prs, repos=repos) + result = scorer.score(data, "org/repo") + + assert result.suspicion_score is not None + expected = math.log1p(30.0) # median of [10, 30, 50] + assert ( + abs( + result.suspicion_score.component_scores["log_median_additions"] + - expected + ) + < 1e-9 + ) + + def test_median_additions_even(self) -> None: + scorer = TrustScorer(_make_config()) + prs = [ + MergedPR( + repo_name_with_owner="org/repo", + title="PR", + merged_at=datetime(2024, 1, 1, tzinfo=UTC), + additions=10, + ), + MergedPR( + repo_name_with_owner="org/repo", + title="PR", + merged_at=datetime(2024, 2, 1, tzinfo=UTC), + additions=30, + ), + ] + repos = { + "org/repo": RepoMetadata( + name_with_owner="org/repo", stargazer_count=100, + ), + } + data = _make_contribution_data(merged_prs=prs, repos=repos) + result = scorer.score(data, "org/repo") + + assert result.suspicion_score is not None + expected = math.log1p(20.0) # average of [10, 30] + assert ( + abs( + result.suspicion_score.component_scores["log_median_additions"] + - expected + ) + < 1e-9 + ) + + class TestFreshAccountAdvisory: def test_fresh_account_flagged_under_365(self) -> None: config = GoodEggConfig(scoring_model="v3") @@ -973,3 +1196,123 @@ def test_fresh_account_populated_in_v2(self) -> None: assert result.fresh_account is not None assert result.fresh_account.is_fresh is True + + +class TestIsolationScore: + def test_all_isolated(self) -> None: + """All repos have no overlapping contributors -> isolation = 1.0.""" + scorer = TrustScorer(_make_config()) + prs = [ + MergedPR( + 
repo_name_with_owner="org/repo-a", + title="PR", + merged_at=datetime(2024, 1, 1, tzinfo=UTC), + ), + MergedPR( + repo_name_with_owner="org/repo-b", + title="PR", + merged_at=datetime(2024, 2, 1, tzinfo=UTC), + ), + ] + repos = { + "org/repo-a": RepoMetadata( + name_with_owner="org/repo-a", stargazer_count=100, + ), + "org/repo-b": RepoMetadata( + name_with_owner="org/repo-b", stargazer_count=100, + ), + } + data = _make_contribution_data( + merged_prs=prs, repos=repos, + repo_contributors={ + "org/repo-a": ["alice"], + "org/repo-b": ["bob"], + }, + ) + score = scorer._compute_isolation_score(data) + assert score == 1.0 + + def test_none_isolated(self) -> None: + """All repos share a multi-repo contributor -> isolation = 0.0.""" + scorer = TrustScorer(_make_config()) + data = _make_contribution_data( + merged_prs=[ + MergedPR( + repo_name_with_owner="org/repo-a", + title="PR", + merged_at=datetime(2024, 1, 1, tzinfo=UTC), + ), + MergedPR( + repo_name_with_owner="org/repo-b", + title="PR", + merged_at=datetime(2024, 2, 1, tzinfo=UTC), + ), + ], + repos={ + "org/repo-a": RepoMetadata( + name_with_owner="org/repo-a", stargazer_count=100, + ), + "org/repo-b": RepoMetadata( + name_with_owner="org/repo-b", stargazer_count=100, + ), + }, + repo_contributors={ + "org/repo-a": ["shared-dev"], + "org/repo-b": ["shared-dev"], + }, + ) + score = scorer._compute_isolation_score(data) + assert score == 0.0 + + def test_no_repo_contributors(self) -> None: + """No repo_contributors data -> isolation defaults to 1.0.""" + scorer = TrustScorer(_make_config()) + data = _make_contribution_data( + merged_prs=[ + MergedPR( + repo_name_with_owner="org/repo", + title="PR", + merged_at=datetime(2024, 1, 1, tzinfo=UTC), + ), + ], + repos={ + "org/repo": RepoMetadata( + name_with_owner="org/repo", stargazer_count=100, + ), + }, + ) + score = scorer._compute_isolation_score(data) + assert score == 1.0 + + def test_skipped_popular_repos_non_isolated(self) -> None: + """Repos in 
contributed_repos but not in repo_contributors count as non-isolated.""" + scorer = TrustScorer(_make_config()) + data = _make_contribution_data( + merged_prs=[ + MergedPR( + repo_name_with_owner="org/small", + title="PR", + merged_at=datetime(2024, 1, 1, tzinfo=UTC), + ), + MergedPR( + repo_name_with_owner="org/popular", + title="PR", + merged_at=datetime(2024, 2, 1, tzinfo=UTC), + ), + ], + repos={ + "org/small": RepoMetadata( + name_with_owner="org/small", stargazer_count=100, + ), + "org/popular": RepoMetadata( + name_with_owner="org/popular", stargazer_count=10000, + ), + }, + repo_contributors={ + "org/small": ["alice"], + # org/popular skipped (too popular) + }, + ) + score = scorer._compute_isolation_score(data) + # 1 isolated (small) + 1 non-isolated (popular skipped) = 0.5 + assert score == 0.5 From b8c8c27d32162831fe754351b257daa2b375e3ce Mon Sep 17 00:00:00 2001 From: Jeff Date: Thu, 12 Mar 2026 11:40:05 +0000 Subject: [PATCH 2/7] Add proximity-based suspension detection experiments Test whether k-NN distance, Jaccard repo overlap, and personalized PageRank can detect suspended accounts in the merged-PR population where individual behavioral features failed (all AUCs ~0.50). 
Key findings: - H1 SUPPORTED: k-NN achieves AUC 0.570 on merged-PR population - H2 SUPPORTED: Jaccard max repo overlap achieves AUC 0.595 (best method) - H3 NOT SUPPORTED: 44 biased seeds don't generalize (AUC 0.44) - H4 SUPPORTED: Jaccard adds +0.049 AUC to LR (p<0.0001) Scripts: proximity_common.py (shared lib), proximity_knn_experiment.py, proximity_graph_experiment.py, proximity_combined.py, proximity_analysis.py --- .../proximity_results/PROXIMITY_ANALYSIS.md | 252 ++++++ .../proximity_results/combined_results.json | 52 ++ .../proximity_results/graph_results.json | 119 +++ .../proximity_results/knn_results.json | 766 ++++++++++++++++++ scripts/proximity_analysis.py | 378 +++++++++ scripts/proximity_combined.py | 372 +++++++++ scripts/proximity_common.py | 620 ++++++++++++++ scripts/proximity_graph_experiment.py | 399 +++++++++ scripts/proximity_knn_experiment.py | 402 +++++++++ 9 files changed, 3360 insertions(+) create mode 100644 experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md create mode 100644 experiments/bot_detection/proximity_results/combined_results.json create mode 100644 experiments/bot_detection/proximity_results/graph_results.json create mode 100644 experiments/bot_detection/proximity_results/knn_results.json create mode 100644 scripts/proximity_analysis.py create mode 100644 scripts/proximity_combined.py create mode 100644 scripts/proximity_common.py create mode 100644 scripts/proximity_graph_experiment.py create mode 100644 scripts/proximity_knn_experiment.py diff --git a/experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md b/experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md new file mode 100644 index 0000000..71ef19a --- /dev/null +++ b/experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md @@ -0,0 +1,252 @@ +# Proximity-Based Suspension Detection — Results + +This report summarizes the results of proximity-based methods for detecting +suspended GitHub accounts among authors with merged 
PRs. + +## Methodology + +### Population + +- **Primary**: 19,598 authors with merged PRs (417 suspended, 19,181 active) +- **Replication**: 31,293 authors (739 suspended, 30,554 active) +- **Temporal cutoffs**: 2022-07-01 (58 susp), 2023-01-01 (92), 2024-01-01 (204) + +### Feature Sets + +- **F10** (10 core behavioral): merge_rate, total_prs, career_span_days, + mean_title_length, median_additions, median_files_changed, total_repos, + isolation_score, hub_score, bipartite_clustering +- **F16** (16 extended): F10 + rejection_rate, hour_entropy, empty_body_rate, + title_spam_score, weekend_ratio, prs_per_active_day +- **F16_no_mr**: F16 minus merge_rate and rejection_rate (decontaminated) + +### Methods + +- **k-NN cosine/euclidean**: NearestNeighbors on StandardScaled features, + score = negative mean distance to k nearest seeds +- **Jaccard max**: max repo-set Jaccard similarity to any suspended seed +- **Jaccard mean-k5**: mean of top-5 Jaccard similarities to seeds +- **PPR**: Personalized PageRank on bipartite author-repo graph with + restart on suspended seeds +- **Combined LR**: Logistic regression on behavioral features + proximity score + +### Holdout Strategies + +- **Strategy A** (discovery-order): 44 original seeds → 373 expansion test set +- **Strategy B** (suspended-only CV): 5-fold on suspended, active in every fold +- **Strategy C** (temporal): features from pre-cutoff PRs, CV within each cutoff + +### Statistical Tests + +- DeLong paired test for AUC comparisons +- Holm-Bonferroni correction for multiple comparisons + +## 1. 
k-NN Proximity Results + +### Strategy A: Discovery-Order Holdout (k-NN) + +| Method | AUC-ROC | AUC-PR | P@25 | P@50 | +|--------|---------|--------|------|------| +| baseline_lr_F16 | 0.6257 | 0.0267 | 0.0000 | 0.0200 | +| baseline_lr_F16_no_mr | 0.6256 | 0.0267 | 0.0000 | 0.0400 | +| baseline_lr_F10 | 0.6115 | 0.0255 | 0.0000 | 0.0000 | +| knn_F16_euclidean_k15 | 0.4413 | 0.0154 | 0.0000 | 0.0000 | +| knn_F16_euclidean_k10 | 0.4378 | 0.0153 | 0.0000 | 0.0000 | +| knn_F16_euclidean_k3 | 0.4360 | 0.0152 | 0.0000 | 0.0000 | +| knn_F16_euclidean_k5 | 0.4357 | 0.0152 | 0.0000 | 0.0000 | +| knn_F16_no_mr_euclidean_k15 | 0.4294 | 0.0151 | 0.0000 | 0.0000 | +| knn_F16_no_mr_euclidean_k10 | 0.4266 | 0.0150 | 0.0000 | 0.0000 | +| baseline_merge_rate | 0.4257 | 0.0178 | 0.0000 | 0.0200 | +| knn_F16_no_mr_euclidean_k5 | 0.4238 | 0.0149 | 0.0000 | 0.0000 | +| knn_F16_no_mr_cosine_k3 | 0.4230 | 0.0149 | 0.0000 | 0.0000 | +| knn_F16_no_mr_euclidean_k3 | 0.4228 | 0.0148 | 0.0000 | 0.0000 | +| knn_F10_euclidean_k15 | 0.4210 | 0.0149 | 0.0000 | 0.0000 | +| knn_F16_no_mr_cosine_k5 | 0.4208 | 0.0149 | 0.0000 | 0.0000 | +| knn_F10_euclidean_k5 | 0.4206 | 0.0148 | 0.0000 | 0.0000 | +| knn_F10_euclidean_k3 | 0.4199 | 0.0148 | 0.0000 | 0.0000 | +| knn_F10_euclidean_k10 | 0.4182 | 0.0148 | 0.0000 | 0.0000 | +| knn_F16_cosine_k3 | 0.4170 | 0.0147 | 0.0000 | 0.0000 | +| knn_F16_cosine_k5 | 0.4152 | 0.0147 | 0.0000 | 0.0000 | +| knn_F16_no_mr_cosine_k10 | 0.4138 | 0.0147 | 0.0000 | 0.0000 | +| knn_F16_cosine_k10 | 0.4102 | 0.0146 | 0.0000 | 0.0000 | +| knn_F16_no_mr_cosine_k15 | 0.4075 | 0.0145 | 0.0000 | 0.0000 | +| knn_F16_cosine_k15 | 0.4064 | 0.0145 | 0.0000 | 0.0000 | +| knn_F10_cosine_k5 | 0.4051 | 0.0144 | 0.0000 | 0.0000 | +| knn_F10_cosine_k3 | 0.4045 | 0.0144 | 0.0000 | 0.0000 | +| knn_F10_cosine_k10 | 0.4005 | 0.0143 | 0.0000 | 0.0000 | +| knn_F10_cosine_k15 | 0.3980 | 0.0142 | 0.0000 | 0.0000 | + +### Strategy B: Suspended-Only CV, Merged-PR Population (k-NN) + +| Method | 
AUC-ROC | AUC-PR | P@25 | P@50 | +|--------|---------|--------|------|------| +| baseline_lr_F16 | 0.5727 | 0.0253 | 0.0000 | 0.0000 | +| baseline_lr_F16_no_mr | 0.5709 | 0.0255 | 0.0000 | 0.0400 | +| knn_F16_euclidean_k3 | 0.5698 | 0.0270 | 0.0400 | 0.0200 | +| knn_F16_euclidean_k5 | 0.5673 | 0.0266 | 0.0400 | 0.0400 | +| knn_F10_cosine_k10 | 0.5669 | 0.0264 | 0.0000 | 0.0000 | +| knn_F10_cosine_k5 | 0.5668 | 0.0264 | 0.0000 | 0.0200 | +| knn_F16_cosine_k3 | 0.5666 | 0.0274 | 0.0400 | 0.0200 | +| knn_F10_cosine_k15 | 0.5661 | 0.0266 | 0.0800 | 0.0600 | +| knn_F16_cosine_k5 | 0.5658 | 0.0271 | 0.0400 | 0.0200 | +| knn_F10_cosine_k3 | 0.5649 | 0.0263 | 0.0400 | 0.0200 | +| knn_F10_euclidean_k3 | 0.5639 | 0.0252 | 0.0000 | 0.0200 | +| knn_F16_euclidean_k10 | 0.5638 | 0.0261 | 0.0000 | 0.0600 | +| knn_F10_euclidean_k5 | 0.5635 | 0.0255 | 0.0000 | 0.0200 | +| knn_F16_no_mr_cosine_k3 | 0.5629 | 0.0273 | 0.0400 | 0.0400 | +| knn_F10_euclidean_k10 | 0.5622 | 0.0254 | 0.0000 | 0.0000 | +| knn_F16_cosine_k10 | 0.5622 | 0.0263 | 0.0000 | 0.0400 | +| knn_F16_euclidean_k15 | 0.5619 | 0.0257 | 0.0400 | 0.0400 | +| knn_F16_no_mr_euclidean_k3 | 0.5612 | 0.0266 | 0.0400 | 0.0200 | +| knn_F10_euclidean_k15 | 0.5610 | 0.0254 | 0.0000 | 0.0200 | +| knn_F16_no_mr_cosine_k5 | 0.5609 | 0.0269 | 0.0400 | 0.0200 | +| knn_F16_cosine_k15 | 0.5606 | 0.0285 | 0.0800 | 0.0600 | +| baseline_lr_F10 | 0.5586 | 0.0241 | 0.0000 | 0.0200 | +| knn_F16_no_mr_euclidean_k5 | 0.5577 | 0.0262 | 0.0400 | 0.0400 | +| knn_F16_no_mr_cosine_k10 | 0.5571 | 0.0262 | 0.0000 | 0.0400 | +| knn_F16_no_mr_cosine_k15 | 0.5561 | 0.0284 | 0.0800 | 0.0600 | +| knn_F16_no_mr_euclidean_k10 | 0.5537 | 0.0257 | 0.0000 | 0.0600 | +| knn_F16_no_mr_euclidean_k15 | 0.5523 | 0.0253 | 0.0400 | 0.0400 | +| baseline_merge_rate | 0.4492 | 0.0205 | 0.0400 | 0.0400 | + +### Strategy B: Suspended-Only CV, All Authors (Stage 12 Replication) + +| Method | AUC-ROC | AUC-PR | P@25 | P@50 | +|--------|---------|--------|------|------| +| 
baseline_lr_F16 | 0.6214 | 0.0400 | 0.0800 | 0.1000 | +| knn_F16_cosine_k5 | 0.5573 | 0.0303 | 0.0800 | 0.0800 | +| baseline_merge_rate | 0.5137 | 0.0255 | 0.0800 | 0.1000 | + +### Strategy C: Temporal Holdout, cutoff=2022-07-01 (k-NN) + +| Method | AUC-ROC | AUC-PR | P@25 | P@50 | +|--------|---------|--------|------|------| +| knn_F10_cosine_k5 | 0.5348 | 0.0283 | 0.0000 | 0.0200 | +| knn_F10_cosine_k10 | 0.5200 | 0.0268 | 0.0000 | 0.0000 | +| baseline_merge_rate | 0.4831 | 0.0275 | 0.0000 | 0.0600 | +| baseline_lr_F16 | 0.4806 | 0.0370 | 0.0800 | 0.0400 | +| knn_F16_cosine_k10 | 0.4762 | 0.0240 | 0.0000 | 0.0000 | +| baseline_lr_F10 | 0.4725 | 0.0239 | 0.0400 | 0.0200 | +| knn_F16_cosine_k5 | 0.4714 | 0.0264 | 0.0400 | 0.0200 | + +### Strategy C: Temporal Holdout, cutoff=2023-01-01 (k-NN) + +| Method | AUC-ROC | AUC-PR | P@25 | P@50 | +|--------|---------|--------|------|------| +| baseline_lr_F16 | 0.5346 | 0.0270 | 0.0000 | 0.0000 | +| knn_F10_cosine_k10 | 0.5246 | 0.0269 | 0.0400 | 0.0200 | +| knn_F10_cosine_k5 | 0.5189 | 0.0252 | 0.0000 | 0.0000 | +| knn_F16_cosine_k10 | 0.5089 | 0.0255 | 0.0000 | 0.0400 | +| knn_F16_cosine_k5 | 0.5062 | 0.0245 | 0.0000 | 0.0000 | +| baseline_lr_F10 | 0.5007 | 0.0256 | 0.0000 | 0.0000 | +| baseline_merge_rate | 0.4616 | 0.0255 | 0.0000 | 0.0200 | + +### Strategy C: Temporal Holdout, cutoff=2024-01-01 (k-NN) + +| Method | AUC-ROC | AUC-PR | P@25 | P@50 | +|--------|---------|--------|------|------| +| baseline_lr_F10 | 0.5516 | 0.0307 | 0.0000 | 0.0400 | +| baseline_lr_F16 | 0.5497 | 0.0301 | 0.0000 | 0.0400 | +| knn_F10_cosine_k10 | 0.5376 | 0.0386 | 0.0800 | 0.0400 | +| knn_F10_cosine_k5 | 0.5365 | 0.0294 | 0.0000 | 0.0200 | +| knn_F16_cosine_k10 | 0.5248 | 0.0388 | 0.0800 | 0.0400 | +| knn_F16_cosine_k5 | 0.5233 | 0.0300 | 0.0000 | 0.0200 | +| baseline_merge_rate | 0.4586 | 0.0263 | 0.0400 | 0.0400 | + +## 2. 
Graph Proximity Results + +### Strategy A: Discovery-Order Holdout (Graph) + +| Method | AUC-ROC | AUC-PR | P@25 | P@50 | +|--------|---------|--------|------|------| +| jaccard_max | 0.5465 | 0.0225 | 0.0000 | 0.0000 | +| jaccard_mean_k5 | 0.5314 | 0.0210 | 0.0000 | 0.0000 | +| ppr | 0.4607 | 0.0165 | 0.0000 | 0.0000 | +| baseline_merge_rate | 0.4257 | 0.0178 | 0.0000 | 0.0200 | + +### Strategy B: Suspended-Only CV, Merged-PR Population (Graph) + +| Method | AUC-ROC | AUC-PR | P@25 | P@50 | +|--------|---------|--------|------|------| +| jaccard_max | 0.5952 | 0.0266 | 0.0800 | 0.0600 | +| jaccard_mean_k5 | 0.5906 | 0.0266 | 0.0800 | 0.0600 | +| ppr | 0.4787 | 0.0191 | 0.0000 | 0.0000 | +| baseline_merge_rate | 0.4492 | 0.0205 | 0.0400 | 0.0400 | + +### Strategy C: Temporal Holdout, cutoff=2022-07-01 (Graph) + +| Method | AUC-ROC | AUC-PR | P@25 | P@50 | +|--------|---------|--------|------|------| +| jaccard_max | 0.6060 | 0.0371 | 0.0000 | 0.0000 | +| ppr | 0.5223 | 0.0294 | 0.0400 | 0.0200 | +| baseline_merge_rate | 0.4831 | 0.0275 | 0.0000 | 0.0600 | + +### Strategy C: Temporal Holdout, cutoff=2023-01-01 (Graph) + +| Method | AUC-ROC | AUC-PR | P@25 | P@50 | +|--------|---------|--------|------|------| +| jaccard_max | 0.5607 | 0.0301 | 0.0000 | 0.0000 | +| ppr | 0.4799 | 0.0243 | 0.0400 | 0.0200 | +| baseline_merge_rate | 0.4616 | 0.0255 | 0.0400 | 0.0200 | + +### Strategy C: Temporal Holdout, cutoff=2024-01-01 (Graph) + +| Method | AUC-ROC | AUC-PR | P@25 | P@50 | +|--------|---------|--------|------|------| +| jaccard_max | 0.5557 | 0.0302 | 0.1200 | 0.0600 | +| baseline_merge_rate | 0.4586 | 0.0263 | 0.0000 | 0.0400 | +| ppr | 0.4424 | 0.0226 | 0.0000 | 0.0000 | + +## 3. 
Combined Model Results (H4) + +### F10 Features + +Behavioral LR baseline: AUC = 0.5586 + +| Model | AUC | Delta | DeLong p | +|-------|-----|-------|----------| +| LR + knn | 0.5646 | 0.0060 | 0.0857 | +| LR + graph | 0.6079 | 0.0493 | 0.0000 | +| LR + both | 0.5795 | 0.0209 | 0.0014 | + +### F16 Features + +Behavioral LR baseline: AUC = 0.5727 + +| Model | AUC | Delta | DeLong p | +|-------|-----|-------|----------| +| LR + knn | 0.5696 | -0.0030 | 0.2952 | +| LR + graph | 0.6062 | 0.0336 | 0.0000 | +| LR + both | 0.5875 | 0.0148 | 0.0011 | + +## 4. Hypothesis Verdicts + +**H1**: Profile proximity detects suspension in merged-PR population (k-NN AUC > 0.55) + +**Verdict**: SUPPORTED — Best AUC = 0.5698 > 0.55 threshold. Mean AUC = 0.5623. + + +**H2**: Graph-based proximity captures structural signal (AUC > 0.55) + +**Verdict**: SUPPORTED — Best AUC = 0.5952 > 0.55 threshold. Mean AUC = 0.5548. + + +**H3**: Proximity signal is robust to seed selection bias + +**Verdict**: NOT SUPPORTED — Strategy A AUC (0.4413) is substantially lower than Strategy B (0.5698), suggesting seed selection bias affects results. + + +**H4**: Proximity adds incremental value to behavioral features + +**Verdict**: SUPPORTED — F10+graph_combined: delta=+0.0493, p=0.0000; F10+both_combined: delta=+0.0209, p=0.0014; F16+graph_combined: delta=+0.0336, p=0.0000; F16+both_combined: delta=+0.0148, p=0.0011 + + +## 5. Summary + +**Best overall method**: jaccard_max (Strategy B merged (Graph)), AUC = 0.5952 + + +The best method exceeds the AUC > 0.55 threshold, suggesting proximity-based detection has *some* signal on the merged-PR population. However, the practical value depends on the magnitude and precision at operational thresholds. 
+ + +**Stage 12 replication** (all-authors, F16, cosine, k=5): AUC = 0.5573 (original stage 12: 0.595) diff --git a/experiments/bot_detection/proximity_results/combined_results.json b/experiments/bot_detection/proximity_results/combined_results.json new file mode 100644 index 0000000..03313b6 --- /dev/null +++ b/experiments/bot_detection/proximity_results/combined_results.json @@ -0,0 +1,52 @@ +{ + "F10": { + "baseline_auc": 0.5585613986262634, + "knn_combined": { + "auc": 0.5645998606984804, + "delta": 0.006038462072217032, + "delong_p": 0.08569830435572577, + "delong_z": 1.7185392351131559 + }, + "graph_combined": { + "auc": 0.6078944153993316, + "delta": 0.04933301677306823, + "delong_p": 1.7156872178987075e-10, + "delong_z": 6.384847175652404 + }, + "both_combined": { + "auc": 0.5795063235163395, + "delta": 0.020944924890076044, + "delong_p": 0.0013666164808129242, + "delong_z": 3.201610500823981 + } + }, + "F16": { + "baseline_auc": 0.5726884630661563, + "knn_combined": { + "auc": 0.5696428207520006, + "delta": -0.0030456423141557165, + "delong_p": 0.295249810990437, + "delong_z": -1.0466743212522238 + }, + "graph_combined": { + "auc": 0.6062398504115221, + "delta": 0.03355138734536578, + "delong_p": 8.80508813646584e-08, + "delong_z": 5.349801241613703 + }, + "both_combined": { + "auc": 0.5874604627856028, + "delta": 0.014771999719446538, + "delong_p": 0.0011357543315286395, + "delong_z": 3.254541829313259 + } + }, + "config": { + "best_knn": { + "feature_set": "F16", + "k": 3, + "metric": "euclidean" + }, + "best_graph_method": "jaccard_max" + } +} \ No newline at end of file diff --git a/experiments/bot_detection/proximity_results/graph_results.json b/experiments/bot_detection/proximity_results/graph_results.json new file mode 100644 index 0000000..3d65de2 --- /dev/null +++ b/experiments/bot_detection/proximity_results/graph_results.json @@ -0,0 +1,119 @@ +{ + "strategy_a": { + "jaccard_max": { + "auc_roc": 0.5465033049768726, + "auc_pr": 
0.022465416583062413, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "jaccard_mean_k5": { + "auc_roc": 0.5313560126314678, + "auc_pr": 0.02100632239510252, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "ppr": { + "auc_roc": 0.46065574274587245, + "auc_pr": 0.016452914378080687, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "baseline_merge_rate": { + "auc_roc": 0.4257166071261594, + "auc_pr": 0.017834975292600683, + "precision_at_25": 0.0, + "precision_at_50": 0.02 + } + }, + "strategy_b_merged": { + "jaccard_max": { + "auc_roc": 0.5951759316179817, + "auc_pr": 0.026553332206537373, + "precision_at_25": 0.08, + "precision_at_50": 0.06 + }, + "jaccard_mean_k5": { + "auc_roc": 0.5905733679049149, + "auc_pr": 0.026626411660334037, + "precision_at_25": 0.08, + "precision_at_50": 0.06 + }, + "ppr": { + "auc_roc": 0.4787480166536705, + "auc_pr": 0.019093495380487643, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "baseline_merge_rate": { + "auc_roc": 0.44916413462212873, + "auc_pr": 0.02054584366761837, + "precision_at_25": 0.04, + "precision_at_50": 0.04 + } + }, + "strategy_c": { + "2022-07-01": { + "n_suspended": 58, + "jaccard_max": { + "auc_roc": 0.6059588487795606, + "auc_pr": 0.037115393409398865, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "ppr": { + "auc_roc": 0.5223496428175439, + "auc_pr": 0.029418196346494015, + "precision_at_25": 0.04, + "precision_at_50": 0.02 + }, + "baseline_merge_rate": { + "auc_roc": 0.4831308507436681, + "auc_pr": 0.027515703258021475, + "precision_at_25": 0.0, + "precision_at_50": 0.06 + } + }, + "2023-01-01": { + "n_suspended": 92, + "jaccard_max": { + "auc_roc": 0.5606871217070796, + "auc_pr": 0.030058112152036656, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "ppr": { + "auc_roc": 0.47990347752123375, + "auc_pr": 0.024250764597379723, + "precision_at_25": 0.04, + "precision_at_50": 0.02 + }, + "baseline_merge_rate": { + "auc_roc": 0.46156050837637597, + 
"auc_pr": 0.025528807568463496, + "precision_at_25": 0.04, + "precision_at_50": 0.02 + } + }, + "2024-01-01": { + "n_suspended": 204, + "jaccard_max": { + "auc_roc": 0.5557187785036037, + "auc_pr": 0.030171350343118766, + "precision_at_25": 0.12, + "precision_at_50": 0.06 + }, + "ppr": { + "auc_roc": 0.4423996541343076, + "auc_pr": 0.022609490752796845, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "baseline_merge_rate": { + "auc_roc": 0.4585765201482583, + "auc_pr": 0.026347127939675606, + "precision_at_25": 0.0, + "precision_at_50": 0.04 + } + } + } +} \ No newline at end of file diff --git a/experiments/bot_detection/proximity_results/knn_results.json b/experiments/bot_detection/proximity_results/knn_results.json new file mode 100644 index 0000000..091aa80 --- /dev/null +++ b/experiments/bot_detection/proximity_results/knn_results.json @@ -0,0 +1,766 @@ +{ + "strategy_a": { + "knn_F10_cosine_k3": { + "feature_set": "F10", + "metric": "cosine", + "k": 3, + "n_seeds": 44, + "auc_roc": 0.4044881181989606, + "auc_pr": 0.014414108607487588, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F10_cosine_k5": { + "feature_set": "F10", + "metric": "cosine", + "k": 5, + "n_seeds": 44, + "auc_roc": 0.40509920102178865, + "auc_pr": 0.0144340615900681, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F10_cosine_k10": { + "feature_set": "F10", + "metric": "cosine", + "k": 10, + "n_seeds": 44, + "auc_roc": 0.40046646081990483, + "auc_pr": 0.014307517621694065, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F10_cosine_k15": { + "feature_set": "F10", + "metric": "cosine", + "k": 15, + "n_seeds": 44, + "auc_roc": 0.39800899096835807, + "auc_pr": 0.014244318522801305, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F10_euclidean_k3": { + "feature_set": "F10", + "metric": "euclidean", + "k": 3, + "n_seeds": 44, + "auc_roc": 0.41992760373766874, + "auc_pr": 0.014802489878207266, + "precision_at_25": 0.0, + 
"precision_at_50": 0.0 + }, + "knn_F10_euclidean_k5": { + "feature_set": "F10", + "metric": "euclidean", + "k": 5, + "n_seeds": 44, + "auc_roc": 0.42058578969665716, + "auc_pr": 0.01483345415353403, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F10_euclidean_k10": { + "feature_set": "F10", + "metric": "euclidean", + "k": 10, + "n_seeds": 44, + "auc_roc": 0.4181891206291749, + "auc_pr": 0.014773148101088443, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F10_euclidean_k15": { + "feature_set": "F10", + "metric": "euclidean", + "k": 15, + "n_seeds": 44, + "auc_roc": 0.42099266574817884, + "auc_pr": 0.01486616322340496, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F16_cosine_k3": { + "feature_set": "F16", + "metric": "cosine", + "k": 3, + "n_seeds": 44, + "auc_roc": 0.4169836577276469, + "auc_pr": 0.014739879251986585, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F16_cosine_k5": { + "feature_set": "F16", + "metric": "cosine", + "k": 5, + "n_seeds": 44, + "auc_roc": 0.41519220106246224, + "auc_pr": 0.014690648830818845, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F16_cosine_k10": { + "feature_set": "F16", + "metric": "cosine", + "k": 10, + "n_seeds": 44, + "auc_roc": 0.4102059776814998, + "auc_pr": 0.014554505782506456, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F16_cosine_k15": { + "feature_set": "F16", + "metric": "cosine", + "k": 15, + "n_seeds": 44, + "auc_roc": 0.40637021695257247, + "auc_pr": 0.014457937614959608, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F16_euclidean_k3": { + "feature_set": "F16", + "metric": "euclidean", + "k": 3, + "n_seeds": 44, + "auc_roc": 0.43600088503578094, + "auc_pr": 0.01522210944790348, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F16_euclidean_k5": { + "feature_set": "F16", + "metric": "euclidean", + "k": 5, + "n_seeds": 44, + "auc_roc": 0.4357385331468403, + "auc_pr": 
0.015220105266873136, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F16_euclidean_k10": { + "feature_set": "F16", + "metric": "euclidean", + "k": 10, + "n_seeds": 44, + "auc_roc": 0.43781847904951743, + "auc_pr": 0.015292436729887173, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F16_euclidean_k15": { + "feature_set": "F16", + "metric": "euclidean", + "k": 15, + "n_seeds": 44, + "auc_roc": 0.4412888759863879, + "auc_pr": 0.015417066303233279, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F16_no_mr_cosine_k3": { + "feature_set": "F16_no_mr", + "metric": "cosine", + "k": 3, + "n_seeds": 44, + "auc_roc": 0.42301090234932825, + "auc_pr": 0.014948110265062093, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F16_no_mr_cosine_k5": { + "feature_set": "F16_no_mr", + "metric": "cosine", + "k": 5, + "n_seeds": 44, + "auc_roc": 0.42075596200607923, + "auc_pr": 0.014881769150889488, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F16_no_mr_cosine_k10": { + "feature_set": "F16_no_mr", + "metric": "cosine", + "k": 10, + "n_seeds": 44, + "auc_roc": 0.41376792522426054, + "auc_pr": 0.01467172196158133, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F16_no_mr_cosine_k15": { + "feature_set": "F16_no_mr", + "metric": "cosine", + "k": 15, + "n_seeds": 44, + "auc_roc": 0.4075128523772338, + "auc_pr": 0.014501252637293344, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F16_no_mr_euclidean_k3": { + "feature_set": "F16_no_mr", + "metric": "euclidean", + "k": 3, + "n_seeds": 44, + "auc_roc": 0.42279719108763936, + "auc_pr": 0.014848649599265452, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F16_no_mr_euclidean_k5": { + "feature_set": "F16_no_mr", + "metric": "euclidean", + "k": 5, + "n_seeds": 44, + "auc_roc": 0.42384058845095396, + "auc_pr": 0.014881123034618608, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F16_no_mr_euclidean_k10": { + 
"feature_set": "F16_no_mr", + "metric": "euclidean", + "k": 10, + "n_seeds": 44, + "auc_roc": 0.4265827736982237, + "auc_pr": 0.0149654661071134, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F16_no_mr_euclidean_k15": { + "feature_set": "F16_no_mr", + "metric": "euclidean", + "k": 15, + "n_seeds": 44, + "auc_roc": 0.429439152602001, + "auc_pr": 0.015054734880267947, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "baseline_merge_rate": { + "auc_roc": 0.4257166071261594, + "auc_pr": 0.017834975292600683, + "precision_at_25": 0.0, + "precision_at_50": 0.02 + }, + "baseline_lr_F10": { + "auc_roc": 0.6115228248239957, + "auc_pr": 0.025476938671775345, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "baseline_lr_F16": { + "auc_roc": 0.6257008688082613, + "auc_pr": 0.026719348425251546, + "precision_at_25": 0.0, + "precision_at_50": 0.02 + }, + "baseline_lr_F16_no_mr": { + "auc_roc": 0.6255722786442628, + "auc_pr": 0.026697287622769957, + "precision_at_25": 0.0, + "precision_at_50": 0.04 + } + }, + "strategy_b_merged": { + "knn_F10_cosine_k3": { + "feature_set": "F10", + "metric": "cosine", + "k": 3, + "auc_roc": 0.5648716624427376, + "auc_pr": 0.026332979818528224, + "precision_at_25": 0.04, + "precision_at_50": 0.02 + }, + "knn_F10_cosine_k5": { + "feature_set": "F10", + "metric": "cosine", + "k": 5, + "auc_roc": 0.5668110316501503, + "auc_pr": 0.02643964429154735, + "precision_at_25": 0.0, + "precision_at_50": 0.02 + }, + "knn_F10_cosine_k10": { + "feature_set": "F10", + "metric": "cosine", + "k": 10, + "auc_roc": 0.5669114257626796, + "auc_pr": 0.02637048832190663, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F10_cosine_k15": { + "feature_set": "F10", + "metric": "cosine", + "k": 15, + "auc_roc": 0.5660563879848627, + "auc_pr": 0.026641563810612598, + "precision_at_25": 0.08, + "precision_at_50": 0.06 + }, + "knn_F10_euclidean_k3": { + "feature_set": "F10", + "metric": "euclidean", + "k": 3, + "auc_roc": 
0.5639198562426322, + "auc_pr": 0.025244977861022855, + "precision_at_25": 0.0, + "precision_at_50": 0.02 + }, + "knn_F10_euclidean_k5": { + "feature_set": "F10", + "metric": "euclidean", + "k": 5, + "auc_roc": 0.5635480354572501, + "auc_pr": 0.025455849589558262, + "precision_at_25": 0.0, + "precision_at_50": 0.02 + }, + "knn_F10_euclidean_k10": { + "feature_set": "F10", + "metric": "euclidean", + "k": 10, + "auc_roc": 0.5621852760219226, + "auc_pr": 0.025425453360181642, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F10_euclidean_k15": { + "feature_set": "F10", + "metric": "euclidean", + "k": 15, + "auc_roc": 0.5609975499085639, + "auc_pr": 0.02537096891219008, + "precision_at_25": 0.0, + "precision_at_50": 0.02 + }, + "knn_F16_cosine_k3": { + "feature_set": "F16", + "metric": "cosine", + "k": 3, + "auc_roc": 0.5665501069766156, + "auc_pr": 0.027397133050288222, + "precision_at_25": 0.04, + "precision_at_50": 0.02 + }, + "knn_F16_cosine_k5": { + "feature_set": "F16", + "metric": "cosine", + "k": 5, + "auc_roc": 0.5658208431430134, + "auc_pr": 0.02711022269851017, + "precision_at_25": 0.04, + "precision_at_50": 0.02 + }, + "knn_F16_cosine_k10": { + "feature_set": "F16", + "metric": "cosine", + "k": 10, + "auc_roc": 0.5621796499508595, + "auc_pr": 0.026313873065709755, + "precision_at_25": 0.0, + "precision_at_50": 0.04 + }, + "knn_F16_cosine_k15": { + "feature_set": "F16", + "metric": "cosine", + "k": 15, + "auc_roc": 0.5605738442455983, + "auc_pr": 0.028492629877942433, + "precision_at_25": 0.08, + "precision_at_50": 0.06 + }, + "knn_F16_euclidean_k3": { + "feature_set": "F16", + "metric": "euclidean", + "k": 3, + "auc_roc": 0.5698498601671294, + "auc_pr": 0.02697154657365128, + "precision_at_25": 0.04, + "precision_at_50": 0.02 + }, + "knn_F16_euclidean_k5": { + "feature_set": "F16", + "metric": "euclidean", + "k": 5, + "auc_roc": 0.56729199821416, + "auc_pr": 0.026623800369012823, + "precision_at_25": 0.04, + "precision_at_50": 0.04 + }, + 
"knn_F16_euclidean_k10": { + "feature_set": "F16", + "metric": "euclidean", + "k": 10, + "auc_roc": 0.5637968328220485, + "auc_pr": 0.026097416626795336, + "precision_at_25": 0.0, + "precision_at_50": 0.06 + }, + "knn_F16_euclidean_k15": { + "feature_set": "F16", + "metric": "euclidean", + "k": 15, + "auc_roc": 0.5619487309896622, + "auc_pr": 0.02572090231622353, + "precision_at_25": 0.04, + "precision_at_50": 0.04 + }, + "knn_F16_no_mr_cosine_k3": { + "feature_set": "F16_no_mr", + "metric": "cosine", + "k": 3, + "auc_roc": 0.5629135396651138, + "auc_pr": 0.02730182510756448, + "precision_at_25": 0.04, + "precision_at_50": 0.04 + }, + "knn_F16_no_mr_cosine_k5": { + "feature_set": "F16_no_mr", + "metric": "cosine", + "k": 5, + "auc_roc": 0.5608635243934563, + "auc_pr": 0.026892239404064537, + "precision_at_25": 0.04, + "precision_at_50": 0.02 + }, + "knn_F16_no_mr_cosine_k10": { + "feature_set": "F16_no_mr", + "metric": "cosine", + "k": 10, + "auc_roc": 0.557111060018051, + "auc_pr": 0.026177410114500563, + "precision_at_25": 0.0, + "precision_at_50": 0.04 + }, + "knn_F16_no_mr_cosine_k15": { + "feature_set": "F16_no_mr", + "metric": "cosine", + "k": 15, + "auc_roc": 0.5561067438213549, + "auc_pr": 0.028385220632449804, + "precision_at_25": 0.08, + "precision_at_50": 0.06 + }, + "knn_F16_no_mr_euclidean_k3": { + "feature_set": "F16_no_mr", + "metric": "euclidean", + "k": 3, + "auc_roc": 0.5611669571594692, + "auc_pr": 0.02660610314413787, + "precision_at_25": 0.04, + "precision_at_50": 0.02 + }, + "knn_F16_no_mr_euclidean_k5": { + "feature_set": "F16_no_mr", + "metric": "euclidean", + "k": 5, + "auc_roc": 0.5576524130781398, + "auc_pr": 0.026233114680906118, + "precision_at_25": 0.04, + "precision_at_50": 0.04 + }, + "knn_F16_no_mr_euclidean_k10": { + "feature_set": "F16_no_mr", + "metric": "euclidean", + "k": 10, + "auc_roc": 0.5537074120485688, + "auc_pr": 0.025690353154214196, + "precision_at_25": 0.0, + "precision_at_50": 0.06 + }, + 
"knn_F16_no_mr_euclidean_k15": { + "feature_set": "F16_no_mr", + "metric": "euclidean", + "k": 15, + "auc_roc": 0.5522593863806822, + "auc_pr": 0.025321533523377543, + "precision_at_25": 0.04, + "precision_at_50": 0.04 + }, + "baseline_merge_rate": { + "auc_roc": 0.44916413462212873, + "auc_pr": 0.02054584366761837, + "precision_at_25": 0.04, + "precision_at_50": 0.04 + }, + "baseline_lr_F10": { + "auc_roc": 0.5585613986262634, + "auc_pr": 0.024074913396397203, + "precision_at_25": 0.0, + "precision_at_50": 0.02 + }, + "baseline_lr_F16": { + "auc_roc": 0.5726884630661563, + "auc_pr": 0.02530329263349593, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "baseline_lr_F16_no_mr": { + "auc_roc": 0.5708554891137401, + "auc_pr": 0.025466340204404955, + "precision_at_25": 0.0, + "precision_at_50": 0.04 + } + }, + "strategy_b_all_authors": { + "knn_F16_cosine_k5": { + "feature_set": "F16", + "metric": "cosine", + "k": 5, + "auc_roc": 0.5573032346377934, + "auc_pr": 0.03025566169051945, + "precision_at_25": 0.08, + "precision_at_50": 0.08 + }, + "baseline_merge_rate": { + "auc_roc": 0.5137408840604576, + "auc_pr": 0.0255277355942943, + "precision_at_25": 0.08, + "precision_at_50": 0.1 + }, + "baseline_lr_F16": { + "auc_roc": 0.6213897965252053, + "auc_pr": 0.039980753620119616, + "precision_at_25": 0.08, + "precision_at_50": 0.1 + } + }, + "strategy_c": { + "2022-07-01": { + "n_suspended": 58, + "knn_F10_cosine_k5": { + "feature_set": "F10", + "metric": "cosine", + "k": 5, + "auc_roc": 0.5347916303676364, + "auc_pr": 0.028312011303451727, + "precision_at_25": 0.0, + "precision_at_50": 0.02 + }, + "knn_F10_cosine_k10": { + "feature_set": "F10", + "metric": "cosine", + "k": 10, + "auc_roc": 0.5200212250328671, + "auc_pr": 0.02684416872932269, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F16_cosine_k5": { + "feature_set": "F16", + "metric": "cosine", + "k": 5, + "auc_roc": 0.47143332330160137, + "auc_pr": 0.026437394559032663, + "precision_at_25": 
0.04, + "precision_at_50": 0.02 + }, + "knn_F16_cosine_k10": { + "feature_set": "F16", + "metric": "cosine", + "k": 10, + "auc_roc": 0.47623271506185355, + "auc_pr": 0.024034136022781045, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "baseline_merge_rate": { + "auc_roc": 0.4831308507436681, + "auc_pr": 0.027515703258021475, + "precision_at_25": 0.0, + "precision_at_50": 0.06 + }, + "baseline_lr_F10": { + "auc_roc": 0.47252229420429886, + "auc_pr": 0.023907372219780787, + "precision_at_25": 0.04, + "precision_at_50": 0.02 + }, + "baseline_lr_F16": { + "auc_roc": 0.4805846387784518, + "auc_pr": 0.03696074400731085, + "precision_at_25": 0.08, + "precision_at_50": 0.04 + } + }, + "2023-01-01": { + "n_suspended": 92, + "knn_F10_cosine_k5": { + "feature_set": "F10", + "metric": "cosine", + "k": 5, + "auc_roc": 0.5189038596664243, + "auc_pr": 0.02524322746836556, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F10_cosine_k10": { + "feature_set": "F10", + "metric": "cosine", + "k": 10, + "auc_roc": 0.5246052193636667, + "auc_pr": 0.026944777843714116, + "precision_at_25": 0.04, + "precision_at_50": 0.02 + }, + "knn_F16_cosine_k5": { + "feature_set": "F16", + "metric": "cosine", + "k": 5, + "auc_roc": 0.5062006138977576, + "auc_pr": 0.024503940878847726, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "knn_F16_cosine_k10": { + "feature_set": "F16", + "metric": "cosine", + "k": 10, + "auc_roc": 0.5089156938400661, + "auc_pr": 0.025477075455066778, + "precision_at_25": 0.0, + "precision_at_50": 0.04 + }, + "baseline_merge_rate": { + "auc_roc": 0.46156050837637597, + "auc_pr": 0.025528807568463496, + "precision_at_25": 0.0, + "precision_at_50": 0.02 + }, + "baseline_lr_F10": { + "auc_roc": 0.5006795404395903, + "auc_pr": 0.025634042410773817, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + }, + "baseline_lr_F16": { + "auc_roc": 0.5345517806733152, + "auc_pr": 0.027039085941844482, + "precision_at_25": 0.0, + "precision_at_50": 0.0 + } 
+ }, + "2024-01-01": { + "n_suspended": 204, + "knn_F10_cosine_k5": { + "feature_set": "F10", + "metric": "cosine", + "k": 5, + "auc_roc": 0.5365459036532064, + "auc_pr": 0.029377995875937122, + "precision_at_25": 0.0, + "precision_at_50": 0.02 + }, + "knn_F10_cosine_k10": { + "feature_set": "F10", + "metric": "cosine", + "k": 10, + "auc_roc": 0.5376214615988907, + "auc_pr": 0.038588231900023445, + "precision_at_25": 0.08, + "precision_at_50": 0.04 + }, + "knn_F16_cosine_k5": { + "feature_set": "F16", + "metric": "cosine", + "k": 5, + "auc_roc": 0.5232747576040366, + "auc_pr": 0.030004999500581067, + "precision_at_25": 0.0, + "precision_at_50": 0.02 + }, + "knn_F16_cosine_k10": { + "feature_set": "F16", + "metric": "cosine", + "k": 10, + "auc_roc": 0.5248498700367482, + "auc_pr": 0.0387611499988658, + "precision_at_25": 0.08, + "precision_at_50": 0.04 + }, + "baseline_merge_rate": { + "auc_roc": 0.4585765201482583, + "auc_pr": 0.026347127939675606, + "precision_at_25": 0.04, + "precision_at_50": 0.04 + }, + "baseline_lr_F10": { + "auc_roc": 0.5516215090499765, + "auc_pr": 0.030733820462660773, + "precision_at_25": 0.0, + "precision_at_50": 0.04 + }, + "baseline_lr_F16": { + "auc_roc": 0.5496690945805587, + "auc_pr": 0.030074286160196112, + "precision_at_25": 0.0, + "precision_at_50": 0.04 + } + } + }, + "delong_comparisons": { + "strategy_a": { + "best_knn": "knn_F16_euclidean_k15", + "best_knn_auc": 0.4412888759863879, + "baselines": { + "baseline_merge_rate": { + "auc": 0.4257166071261594, + "delta": 0.0155722688602285 + }, + "baseline_lr_F10": { + "auc": 0.6115228248239957, + "delta": -0.17023394883760778 + }, + "baseline_lr_F16": { + "auc": 0.6257008688082613, + "delta": -0.18441199282187337 + }, + "baseline_lr_F16_no_mr": { + "auc": 0.6255722786442628, + "delta": -0.1842834026578749 + } + } + }, + "strategy_b_merged": { + "best_knn": "knn_F16_euclidean_k3", + "best_knn_auc": 0.5698498601671294, + "baselines": { + "baseline_merge_rate": { + "auc": 
0.44916413462212873, + "delta": 0.12068572554500062 + }, + "baseline_lr_F10": { + "auc": 0.5585613986262634, + "delta": 0.011288461540865935 + }, + "baseline_lr_F16": { + "auc": 0.5726884630661563, + "delta": -0.002838602899026932 + }, + "baseline_lr_F16_no_mr": { + "auc": 0.5708554891137401, + "delta": -0.0010056289466107149 + } + } + }, + "strategy_b_all_authors": { + "best_knn": "knn_F16_cosine_k5", + "best_knn_auc": 0.5573032346377934, + "baselines": { + "baseline_merge_rate": { + "auc": 0.5137408840604576, + "delta": 0.04356235057733582 + }, + "baseline_lr_F16": { + "auc": 0.6213897965252053, + "delta": -0.06408656188741191 + } + } + } + } +} \ No newline at end of file diff --git a/scripts/proximity_analysis.py b/scripts/proximity_analysis.py new file mode 100644 index 0000000..13f7b02 --- /dev/null +++ b/scripts/proximity_analysis.py @@ -0,0 +1,378 @@ +"""Aggregate proximity experiment results and generate analysis report. + +Reads all results JSONs, computes summary statistics, DeLong comparisons, +hypothesis verdicts, and writes PROXIMITY_ANALYSIS.md. 
+""" + +from __future__ import annotations + +import json +from typing import Any + +import numpy as np +from proximity_common import RESULTS_DIR + + +def load_results() -> dict[str, Any]: + """Load all result JSON files.""" + data: dict[str, Any] = {} + for name in ["knn_results", "graph_results", "combined_results"]: + path = RESULTS_DIR / f"{name}.json" + if path.exists(): + with open(path) as f: + data[name] = json.load(f) + print(f"Loaded {path}") + else: + print(f"Warning: {path} not found") + return data + + +def extract_auc_table( + results: dict[str, Any], + strategy_key: str, +) -> list[dict[str, Any]]: + """Extract method → AUC rows from a strategy's results.""" + strategy = results.get(strategy_key, {}) + rows: list[dict[str, Any]] = [] + + for key, val in strategy.items(): + if not isinstance(val, dict) or "auc_roc" not in val: + continue + row: dict[str, Any] = {"method": key} + row["auc_roc"] = val.get("auc_roc", float("nan")) + row["auc_pr"] = val.get("auc_pr", float("nan")) + row["precision_at_25"] = val.get("precision_at_25", float("nan")) + row["precision_at_50"] = val.get("precision_at_50", float("nan")) + # Extra metadata + for meta in ["feature_set", "metric", "k"]: + if meta in val: + row[meta] = val[meta] + rows.append(row) + + rows.sort(key=lambda r: -r.get("auc_roc", 0.0) if np.isfinite( + r.get("auc_roc", float("nan"))) else float("-inf")) + return rows + + +def format_auc_table(rows: list[dict[str, Any]], title: str) -> str: + """Format AUC table as markdown.""" + if not rows: + return f"### {title}\n\nNo results available.\n" + + lines = [f"### {title}\n"] + lines.append("| Method | AUC-ROC | AUC-PR | P@25 | P@50 |") + lines.append("|--------|---------|--------|------|------|") + + for r in rows: + auc = r.get("auc_roc", float("nan")) + apr = r.get("auc_pr", float("nan")) + p25 = r.get("precision_at_25", float("nan")) + p50 = r.get("precision_at_50", float("nan")) + + def fmt(v: float) -> str: + return f"{v:.4f}" if np.isfinite(v) else 
"—" + + lines.append( + f"| {r['method']} | {fmt(auc)} | {fmt(apr)} | " + f"{fmt(p25)} | {fmt(p50)} |" + ) + + lines.append("") + return "\n".join(lines) + + +def assess_hypothesis( + label: str, + description: str, + aucs: list[float], + threshold: float, + null_description: str, +) -> str: + """Assess whether hypothesis is supported.""" + valid_aucs = [a for a in aucs if np.isfinite(a)] + if not valid_aucs: + return (f"**{label}**: {description}\n\n" + f"**Verdict**: INCONCLUSIVE — no valid AUC results.\n\n") + + best = max(valid_aucs) + mean = np.mean(valid_aucs) + + if best > threshold: + verdict = "SUPPORTED" + detail = (f"Best AUC = {best:.4f} > {threshold:.2f} threshold. " + f"Mean AUC = {mean:.4f}.") + else: + verdict = "NOT SUPPORTED" + detail = (f"Best AUC = {best:.4f} ≤ {threshold:.2f} threshold. " + f"{null_description}") + + return (f"**{label}**: {description}\n\n" + f"**Verdict**: {verdict} — {detail}\n\n") + + +def generate_report(data: dict[str, Any]) -> str: + """Generate the full PROXIMITY_ANALYSIS.md report.""" + sections: list[str] = [] + + sections.append("# Proximity-Based Suspension Detection — Results\n") + sections.append( + "This report summarizes the results of proximity-based methods " + "for detecting suspended GitHub accounts among authors with merged PRs.\n" + ) + + # --- k-NN Results --- + knn = data.get("knn_results", {}) + + sections.append("## 1. 
k-NN Proximity Results\n") + + # Strategy A + rows_a = extract_auc_table(knn, "strategy_a") + sections.append(format_auc_table( + rows_a, "Strategy A: Discovery-Order Holdout (k-NN)", + )) + + # Strategy B merged + rows_b = extract_auc_table(knn, "strategy_b_merged") + sections.append(format_auc_table( + rows_b, "Strategy B: Suspended-Only CV, Merged-PR Population (k-NN)", + )) + + # Strategy B all-authors replication + rows_b_all = extract_auc_table(knn, "strategy_b_all_authors") + sections.append(format_auc_table( + rows_b_all, + "Strategy B: Suspended-Only CV, All Authors (Stage 12 Replication)", + )) + + # Strategy C temporal + stc = knn.get("strategy_c", {}) + for cutoff, cutoff_data in stc.items(): + if isinstance(cutoff_data, dict): + rows_c = extract_auc_table({"c": cutoff_data}, "c") + sections.append(format_auc_table( + rows_c, f"Strategy C: Temporal Holdout, cutoff={cutoff} (k-NN)", + )) + + # --- Graph Results --- + graph = data.get("graph_results", {}) + + sections.append("## 2. Graph Proximity Results\n") + + rows_ga = extract_auc_table(graph, "strategy_a") + sections.append(format_auc_table( + rows_ga, "Strategy A: Discovery-Order Holdout (Graph)", + )) + + rows_gb = extract_auc_table(graph, "strategy_b_merged") + sections.append(format_auc_table( + rows_gb, "Strategy B: Suspended-Only CV, Merged-PR Population (Graph)", + )) + + stcg = graph.get("strategy_c", {}) + for cutoff, cutoff_data in stcg.items(): + if isinstance(cutoff_data, dict): + rows_gc = extract_auc_table({"c": cutoff_data}, "c") + sections.append(format_auc_table( + rows_gc, + f"Strategy C: Temporal Holdout, cutoff={cutoff} (Graph)", + )) + + # --- Combined Results --- + combined = data.get("combined_results", {}) + + sections.append("## 3. 
Combined Model Results (H4)\n") + + for fs_name in ["F10", "F16"]: + fs_data = combined.get(fs_name, {}) + if not fs_data: + continue + + baseline_auc = fs_data.get("baseline_auc", float("nan")) + sections.append(f"### {fs_name} Features\n") + sections.append(f"Behavioral LR baseline: AUC = " + f"{baseline_auc:.4f}\n") + + lines = ["| Model | AUC | Delta | DeLong p |"] + lines.append("|-------|-----|-------|----------|") + + for combo_name in ["knn_combined", "graph_combined", "both_combined"]: + c = fs_data.get(combo_name, {}) + auc = c.get("auc", float("nan")) + delta = c.get("delta", float("nan")) + p = c.get("delong_p", float("nan")) + + def fmt(v: float) -> str: + return f"{v:.4f}" if np.isfinite(v) else "—" + + lines.append( + f"| LR + {combo_name.replace('_combined', '')} | " + f"{fmt(auc)} | {fmt(delta)} | {fmt(p)} |" + ) + + lines.append("") + sections.append("\n".join(lines)) + + # --- Hypothesis Verdicts --- + sections.append("## 4. Hypothesis Verdicts\n") + + # H1: k-NN on merged-PR pop AUC > 0.55 + h1_aucs = [ + r.get("auc_roc", float("nan")) + for r in rows_b + if r["method"].startswith("knn_") + ] + sections.append(assess_hypothesis( + "H1", "Profile proximity detects suspension in merged-PR population " + "(k-NN AUC > 0.55)", + h1_aucs, 0.55, + "No actionable proximity signal in this population.", + )) + + # H2: Graph proximity AUC > 0.55 + h2_aucs = [ + r.get("auc_roc", float("nan")) + for r in rows_gb + if r["method"] in ("jaccard_max", "jaccard_mean_k5", "ppr") + ] + sections.append(assess_hypothesis( + "H2", "Graph-based proximity captures structural signal (AUC > 0.55)", + h2_aucs, 0.55, + "Graph proximity does not capture structural signal.", + )) + + # H3: Strategy A generalizes + h3_aucs = [ + r.get("auc_roc", float("nan")) + for r in rows_a + if r["method"].startswith("knn_") + ] + h3_b_aucs = [ + r.get("auc_roc", float("nan")) + for r in rows_b + if r["method"].startswith("knn_") + ] + valid_h3 = [a for a in h3_aucs if np.isfinite(a)] + 
valid_h3b = [a for a in h3_b_aucs if np.isfinite(a)] + + if valid_h3 and valid_h3b: + best_a = max(valid_h3) + best_b = max(valid_h3b) + if best_a >= best_b - 0.03: + h3_verdict = (f"SUPPORTED — Strategy A AUC ({best_a:.4f}) is within " + f"0.03 of Strategy B ({best_b:.4f}), suggesting " + f"generalization from biased seeds.") + else: + h3_verdict = (f"NOT SUPPORTED — Strategy A AUC ({best_a:.4f}) is " + f"substantially lower than Strategy B ({best_b:.4f}), " + f"suggesting seed selection bias affects results.") + else: + h3_verdict = "INCONCLUSIVE — insufficient data." + + sections.append( + f"**H3**: Proximity signal is robust to seed selection bias\n\n" + f"**Verdict**: {h3_verdict}\n\n" + ) + + # H4: Proximity adds incremental value + h4_results: list[str] = [] + for fs_name in ["F10", "F16"]: + fs_data = combined.get(fs_name, {}) + for combo in ["knn_combined", "graph_combined", "both_combined"]: + c = fs_data.get(combo, {}) + p = c.get("delong_p", float("nan")) + delta = c.get("delta", float("nan")) + if np.isfinite(p) and p < 0.05 and delta > 0: + h4_results.append( + f"{fs_name}+{combo}: delta={delta:+.4f}, p={p:.4f}" + ) + + if h4_results: + h4_verdict = "SUPPORTED — " + "; ".join(h4_results) + else: + h4_verdict = ("NOT SUPPORTED — No proximity feature significantly " + "improves LR AUC (DeLong p > 0.05).") + + sections.append( + f"**H4**: Proximity adds incremental value to behavioral features\n\n" + f"**Verdict**: {h4_verdict}\n\n" + ) + + # --- Summary --- + sections.append("## 5. 
Summary\n") + + # Find overall best method + all_aucs: list[tuple[str, str, float]] = [] + for strat_name, rows in [ + ("Strategy A (k-NN)", rows_a), + ("Strategy B merged (k-NN)", rows_b), + ("Strategy A (Graph)", rows_ga), + ("Strategy B merged (Graph)", rows_gb), + ]: + for r in rows: + auc = r.get("auc_roc", float("nan")) + if np.isfinite(auc) and not r["method"].startswith("baseline"): + all_aucs.append((strat_name, r["method"], auc)) + + if all_aucs: + all_aucs.sort(key=lambda x: -x[2]) + best_strat, best_method, best_auc = all_aucs[0] + sections.append( + f"**Best overall method**: {best_method} ({best_strat}), " + f"AUC = {best_auc:.4f}\n\n" + ) + + # Compare to random baseline + if best_auc > 0.55: + sections.append( + "The best method exceeds the AUC > 0.55 threshold, suggesting " + "proximity-based detection has *some* signal on the merged-PR " + "population. However, the practical value depends on the " + "magnitude and precision at operational thresholds.\n" + ) + else: + sections.append( + "No method exceeds the AUC > 0.55 threshold on the merged-PR " + "population. Proximity-based detection does not appear to " + "rescue what individual behavioral features cannot.\n" + ) + + # Stage 12 replication comparison + if rows_b_all: + repl_knn = [ + r for r in rows_b_all + if r["method"].startswith("knn_") + ] + if repl_knn: + repl_best = max( + repl_knn, key=lambda r: r.get("auc_roc", 0.0), + ) + sections.append( + f"\n**Stage 12 replication** (all-authors, F16, cosine, k=5): " + f"AUC = {repl_best.get('auc_roc', float('nan')):.4f} " + f"(original stage 12: 0.595)\n" + ) + + return "\n".join(sections) + + +def main() -> None: + data = load_results() + + if not data: + print("No results found. 
Run experiment scripts first.") + return + + report = generate_report(data) + + output_path = RESULTS_DIR / "PROXIMITY_ANALYSIS.md" + with open(output_path, "w") as f: + f.write(report) + print(f"\nReport written to {output_path}") + + # Print to console too + print("\n" + "=" * 80) + print(report) + + +if __name__ == "__main__": + main() diff --git a/scripts/proximity_combined.py b/scripts/proximity_combined.py new file mode 100644 index 0000000..b4e4438 --- /dev/null +++ b/scripts/proximity_combined.py @@ -0,0 +1,372 @@ +"""Combined proximity + behavioral model experiment. + +Tests H4: Does adding proximity-derived features to a logistic regression +model improve AUC beyond behavioral-features-only baselines? + +Reads k-NN and graph results, identifies best proximity methods, then +trains LR on behavioral features + proximity score as additional feature. +""" + +from __future__ import annotations + +import json +from typing import Any + +import duckdb +import networkx as nx +import numpy as np +import pandas as pd +from proximity_common import ( + DB_PATH, + F10, + F16, + RESULTS_DIR, + delong_auc_test, + load_all_time_features, + prepare_features, +) +from proximity_graph_experiment import ( + build_author_repo_data, + jaccard_max_proximity, + ppr_proximity, +) +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import roc_auc_score +from sklearn.model_selection import LeaveOneOut, StratifiedKFold +from sklearn.neighbors import NearestNeighbors +from sklearn.preprocessing import StandardScaler + + +def compute_knn_scores_cv( + df: pd.DataFrame, + feature_list: list[str], + k: int, + metric: str, + seed: int = 42, +) -> np.ndarray: + """Compute out-of-fold k-NN proximity scores via suspended-only CV.""" + y = (df["account_status"] == "suspended").astype(int).values + x = prepare_features(df, feature_list) + susp_idx = np.where(y == 1)[0] + active_idx = np.where(y == 0)[0] + n_susp = len(susp_idx) + + use_loo = n_susp < 30 + susp_oof = 
np.full(n_susp, np.nan) + active_accum = np.zeros(len(active_idx)) + active_count = np.zeros(len(active_idx)) + + if use_loo: + splits = [( + np.array([j for j in range(n_susp) if j != i]), + np.array([i]), + ) for i in range(n_susp)] + else: + from sklearn.model_selection import KFold + kf = KFold(n_splits=5, shuffle=True, random_state=seed) + splits = list(kf.split(np.arange(n_susp))) + + for train_pos, test_pos in splits: + seed_susp_idx = susp_idx[train_pos] + held_susp_idx = susp_idx[test_pos] + + train_idx = np.concatenate([seed_susp_idx, active_idx]) + scaler = StandardScaler() + scaler.fit(x[train_idx]) + + seed_scaled = scaler.transform(x[seed_susp_idx]) + eval_idx = np.concatenate([held_susp_idx, active_idx]) + eval_scaled = scaler.transform(x[eval_idx]) + + effective_k = min(k, len(seed_scaled)) + if effective_k == 0: + continue + nn = NearestNeighbors(n_neighbors=effective_k, metric=metric) + nn.fit(seed_scaled) + dists, _ = nn.kneighbors(eval_scaled) + scores = -dists.mean(axis=1) + + n_held = len(held_susp_idx) + susp_oof[test_pos] = scores[:n_held] + active_accum += scores[n_held:] + active_count += 1 + + safe = np.where(active_count > 0, active_count, 1.0) + oof = np.full(len(y), np.nan) + oof[susp_idx] = susp_oof + oof[active_idx] = active_accum / safe + + return oof + + +def compute_graph_scores_cv( + df: pd.DataFrame, + author_repos: dict[str, set[str]], + graph: nx.Graph, + method: str, + seed: int = 42, +) -> np.ndarray: + """Compute out-of-fold graph proximity scores via suspended-only CV.""" + author_col = "author" if "author" in df.columns else "login" + y = (df["account_status"] == "suspended").astype(int).values + authors = df[author_col].tolist() + susp_idx = np.where(y == 1)[0] + active_idx = np.where(y == 0)[0] + n_susp = len(susp_idx) + + use_loo = n_susp < 30 + susp_oof = np.full(n_susp, np.nan) + active_accum = np.zeros(len(active_idx)) + active_count = np.zeros(len(active_idx)) + + if use_loo: + splits = [( + np.array([j for j in 
range(n_susp) if j != i]), + np.array([i]), + ) for i in range(n_susp)] + else: + from sklearn.model_selection import KFold + kf = KFold(n_splits=5, shuffle=True, random_state=seed) + splits = list(kf.split(np.arange(n_susp))) + + for train_pos, test_pos in splits: + seed_authors = [authors[susp_idx[j]] for j in train_pos] + eval_idx_arr = np.concatenate([susp_idx[test_pos], active_idx]) + eval_authors = [authors[i] for i in eval_idx_arr] + + if method == "jaccard_max": + s_repos = {a: author_repos.get(a, set()) for a in seed_authors} + e_repos = {a: author_repos.get(a, set()) for a in eval_authors} + scores_dict = jaccard_max_proximity(s_repos, e_repos) + scores = np.array([scores_dict.get(a, 0.0) for a in eval_authors]) + elif method == "ppr": + ppr_dict = ppr_proximity(graph, seed_authors) + scores = np.array([ppr_dict.get(a, 0.0) for a in eval_authors]) + else: + scores = np.zeros(len(eval_authors)) + + n_held = len(test_pos) + susp_oof[test_pos] = scores[:n_held] + active_accum += scores[n_held:] + active_count += 1 + + safe = np.where(active_count > 0, active_count, 1.0) + oof = np.full(len(y), np.nan) + oof[susp_idx] = susp_oof + oof[active_idx] = active_accum / safe + + return oof + + +def lr_with_proximity_cv( + df: pd.DataFrame, + feature_list: list[str], + proximity_scores: np.ndarray, + seed: int = 42, +) -> tuple[np.ndarray, np.ndarray]: + """LR CV with behavioral features + proximity score as extra feature. + + Returns (y_true, oof_probs). 
+ """ + y = (df["account_status"] == "suspended").astype(int).values + x_base = prepare_features(df, feature_list) + + # Add proximity as extra column + prox = proximity_scores.reshape(-1, 1) + # Fill NaN with median + median_prox = np.nanmedian(prox) + prox = np.where(np.isfinite(prox), prox, median_prox) + x_combined = np.hstack([x_base, prox]) + + n_pos = y.sum() + oof = np.full(len(y), np.nan) + + if n_pos < 30: + loo = LeaveOneOut() + for train_idx, test_idx in loo.split(x_combined, y): + scaler = StandardScaler() + x_train = scaler.fit_transform(x_combined[train_idx]) + x_test = scaler.transform(x_combined[test_idx]) + model = LogisticRegression( + class_weight="balanced", max_iter=1000, random_state=seed, + ) + model.fit(x_train, y[train_idx]) + oof[test_idx] = model.predict_proba(x_test)[:, 1] + else: + skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) + for train_idx, test_idx in skf.split(x_combined, y): + scaler = StandardScaler() + x_train = scaler.fit_transform(x_combined[train_idx]) + x_test = scaler.transform(x_combined[test_idx]) + model = LogisticRegression( + class_weight="balanced", max_iter=1000, random_state=seed, + ) + model.fit(x_train, y[train_idx]) + oof[test_idx] = model.predict_proba(x_test)[:, 1] + + return y, oof + + +def main() -> None: + results: dict[str, Any] = {} + + # Load data + df = load_all_time_features(merged_pr_only=True) + if "login" in df.columns and "author" not in df.columns: + df = df.rename(columns={"login": "author"}) + + # Determine which feature sets are available + available_f16 = [f for f in F16 if f in df.columns] + feature_sets = {"F10": F10, "F16": available_f16} + + # Load graph data + con = duckdb.connect(str(DB_PATH), read_only=True) + author_repos, graph = build_author_repo_data(con) + con.close() + + # Load prior results to find best methods + knn_path = RESULTS_DIR / "knn_results.json" + graph_path = RESULTS_DIR / "graph_results.json" + + best_knn_config = {"feature_set": "F10", "k": 5, 
"metric": "cosine"} + best_graph_method = "jaccard_max" + + if knn_path.exists(): + with open(knn_path) as f: + knn_results = json.load(f) + # Find best k-NN from strategy_b_merged + stb = knn_results.get("strategy_b_merged", {}) + best_auc = 0.0 + for key, val in stb.items(): + if key.startswith("knn_") and isinstance(val, dict): + auc = val.get("auc_roc", 0.0) + if auc > best_auc: + best_auc = auc + best_knn_config = { + "feature_set": val.get("feature_set", "F10"), + "k": val.get("k", 5), + "metric": val.get("metric", "cosine"), + } + print(f"Best k-NN from prior results: {best_knn_config} " + f"(AUC={best_auc:.4f})") + else: + print("No prior k-NN results found, using defaults") + + if graph_path.exists(): + with open(graph_path) as f: + graph_results = json.load(f) + stb = graph_results.get("strategy_b_merged", {}) + best_graph_auc = 0.0 + for method in ["jaccard_max", "jaccard_mean_k5", "ppr"]: + val = stb.get(method, {}) + auc = val.get("auc_roc", 0.0) + if auc > best_graph_auc: + best_graph_auc = auc + best_graph_method = method + print(f"Best graph method from prior results: {best_graph_method} " + f"(AUC={best_graph_auc:.4f})") + else: + print("No prior graph results found, using defaults") + + # Compute OOF proximity scores + print("\nComputing k-NN OOF scores...") + fs_key = best_knn_config["feature_set"] + fs_list = feature_sets.get(fs_key, F10) + knn_oof = compute_knn_scores_cv( + df, fs_list, + k=best_knn_config["k"], + metric=best_knn_config["metric"], + ) + + print("Computing graph OOF scores...") + graph_oof = compute_graph_scores_cv( + df, author_repos, graph, best_graph_method, + ) + + # For each behavioral feature set, test: LR alone vs LR + proximity + for fs_name, fs in feature_sets.items(): + print(f"\n--- Combined models with {fs_name} ---") + + # Behavioral-only baseline + y_base, oof_base = lr_with_proximity_cv( + df, fs, np.zeros(len(df)), # zero proximity = behavioral only + ) + base_auc = roc_auc_score(y_base, oof_base) + print(f" 
LR({fs_name}) baseline: AUC={base_auc:.4f}") + + # LR + k-NN proximity + y_knn, oof_knn = lr_with_proximity_cv(df, fs, knn_oof) + knn_auc = roc_auc_score(y_knn, oof_knn) + dl_knn = delong_auc_test(y_knn, oof_knn, oof_base) + print(f" LR({fs_name}) + k-NN: AUC={knn_auc:.4f} " + f"(delta={knn_auc - base_auc:+.4f}, " + f"p={dl_knn['p_value']:.4f})") + + # LR + graph proximity + y_graph, oof_graph = lr_with_proximity_cv(df, fs, graph_oof) + graph_auc = roc_auc_score(y_graph, oof_graph) + dl_graph = delong_auc_test(y_graph, oof_graph, oof_base) + print(f" LR({fs_name}) + graph: AUC={graph_auc:.4f} " + f"(delta={graph_auc - base_auc:+.4f}, " + f"p={dl_graph['p_value']:.4f})") + + # LR + both + both_oof = np.column_stack([knn_oof, graph_oof]) + both_median = np.nanmedian(both_oof, axis=0) + both_oof = np.where(np.isfinite(both_oof), both_oof, both_median) + # Combine as mean of z-scored proximity scores + combined_prox = np.nanmean(both_oof, axis=1) + + y_both, oof_both = lr_with_proximity_cv(df, fs, combined_prox) + both_auc = roc_auc_score(y_both, oof_both) + dl_both = delong_auc_test(y_both, oof_both, oof_base) + print(f" LR({fs_name}) + both: AUC={both_auc:.4f} " + f"(delta={both_auc - base_auc:+.4f}, " + f"p={dl_both['p_value']:.4f})") + + results[fs_name] = { + "baseline_auc": base_auc, + "knn_combined": { + "auc": knn_auc, + "delta": knn_auc - base_auc, + "delong_p": dl_knn["p_value"], + "delong_z": dl_knn["z_statistic"], + }, + "graph_combined": { + "auc": graph_auc, + "delta": graph_auc - base_auc, + "delong_p": dl_graph["p_value"], + "delong_z": dl_graph["z_statistic"], + }, + "both_combined": { + "auc": both_auc, + "delta": both_auc - base_auc, + "delong_p": dl_both["p_value"], + "delong_z": dl_both["z_statistic"], + }, + } + + results["config"] = { + "best_knn": best_knn_config, + "best_graph_method": best_graph_method, + } + + # Save + output_path = RESULTS_DIR / "combined_results.json" + + def _default(obj: Any) -> Any: + if isinstance(obj, (np.integer,)): + 
return int(obj) + if isinstance(obj, (np.floating,)): + return float(obj) + if isinstance(obj, np.ndarray): + return obj.tolist() + raise TypeError(f"Not serializable: {type(obj)}") + + with open(output_path, "w") as f: + json.dump(results, f, indent=2, default=_default) + print(f"\nResults saved to {output_path}") + + +if __name__ == "__main__": + main() diff --git a/scripts/proximity_common.py b/scripts/proximity_common.py new file mode 100644 index 0000000..09606e2 --- /dev/null +++ b/scripts/proximity_common.py @@ -0,0 +1,620 @@ +"""Shared utilities for proximity-based suspension detection experiments. + +Provides data loading, feature preparation, CV engine, metrics, and +statistical testing functions used by all proximity experiment scripts. +""" + +from __future__ import annotations + +from collections import defaultdict +from pathlib import Path +from typing import Any + +import duckdb +import networkx as nx +import numpy as np +import pandas as pd +from scipy import stats as sp_stats +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import average_precision_score, roc_auc_score +from sklearn.model_selection import KFold, LeaveOneOut, StratifiedKFold +from sklearn.preprocessing import StandardScaler + +BASE = Path(__file__).resolve().parent.parent / "experiments" / "bot_detection" +DB_PATH = BASE / "data" / "bot_detection.duckdb" +PARQUET_PATH = BASE / "data" / "features" / "author_features.parquet" +RESULTS_DIR = BASE / "proximity_results" + +# Feature sets +F10 = [ + "merge_rate", "total_prs", "career_span_days", "mean_title_length", + "median_additions", "median_files_changed", "total_repos", + "isolation_score", "hub_score", "bipartite_clustering", +] + +F16 = F10 + [ + "rejection_rate", "hour_entropy", "empty_body_rate", + "title_spam_score", "weekend_ratio", "prs_per_active_day", +] + +F16_NO_MR = [f for f in F16 if f not in ("merge_rate", "rejection_rate")] + +LOG_TRANSFORM = {"total_prs", "career_span_days", 
"median_additions", "median_files_changed"} + +CUTOFFS = ["2022-07-01", "2023-01-01", "2024-01-01"] + + +# --------------------------------------------------------------------------- +# Data loading +# --------------------------------------------------------------------------- + +def load_all_time_features( + merged_pr_only: bool = True, +) -> pd.DataFrame: + """Load parquet features with updated DuckDB labels. + + Adds an 'is_original_discovery' column for Strategy A holdout. + """ + pq = pd.read_parquet(PARQUET_PATH) + original_suspended = set( + pq.loc[pq["account_status"] == "suspended", "login"].tolist() + ) + + # Get updated labels from DuckDB + con = duckdb.connect(str(DB_PATH), read_only=True) + labels = con.execute( + "SELECT login, account_status FROM authors " + "WHERE account_status IN ('active', 'suspended')" + ).fetchdf() + + if merged_pr_only: + merged_authors = set(r[0] for r in con.execute( + "SELECT DISTINCT author FROM prs " + "WHERE state = 'MERGED' AND author IS NOT NULL" + ).fetchall()) + con.close() + + # Join parquet features with updated labels + df = pq.drop(columns=["account_status"], errors="ignore") + df = df.merge(labels, on="login", how="inner") + + if merged_pr_only: + df = df[df["login"].isin(merged_authors)].copy() + + # Mark discovery order + df["is_original_discovery"] = ( + df["login"].isin(original_suspended) + & (df["account_status"] == "suspended") + ) + + n_susp = (df["account_status"] == "suspended").sum() + n_active = (df["account_status"] == "active").sum() + n_orig = df["is_original_discovery"].sum() + pop = "merged-PR" if merged_pr_only else "all" + print(f"Loaded {pop}: {len(df)} authors " + f"({n_susp} suspended [{n_orig} original], {n_active} active)") + + return df + + +def load_temporal_features( + cutoff: str, +) -> pd.DataFrame: + """Compute F16 features from DuckDB for authors with merged PRs before cutoff. + + Returns DataFrame with all 16 features + author + account_status + + is_original_discovery. 
+ """ + con = duckdb.connect(str(DB_PATH), read_only=True) + + # Original 323 suspended from parquet + pq = pd.read_parquet(PARQUET_PATH, columns=["login", "account_status"]) + original_suspended = set( + pq.loc[pq["account_status"] == "suspended", "login"].tolist() + ) + + print(f"\n--- Loading temporal features for cutoff {cutoff} ---") + + # Core F10 features (8 SQL-native + 2 graph features) + df = _get_features_before_cutoff(con, cutoff) + print(f" Authors with merged PRs before {cutoff}: {len(df)}") + + # Isolation score + print(" Computing isolation_score...") + iso = _compute_isolation_before(con, cutoff) + df["isolation_score"] = df["author"].map(iso).fillna(1.0) + + # Graph features + print(" Computing graph features...") + graph_feats = _compute_graph_features_before(con, cutoff) + df["hub_score"] = df["author"].map(graph_feats["hub_score"]).fillna(0.0) + df["bipartite_clustering"] = ( + df["author"].map(graph_feats["bipartite_clustering"]).fillna(0.0) + ) + + # Extended F16 features + print(" Computing extended features (F16)...") + ext = _get_extended_features_before(con, cutoff) + df = df.merge(ext, on="author", how="left") + for col in ["rejection_rate", "hour_entropy", "empty_body_rate", + "title_spam_score", "weekend_ratio", "prs_per_active_day"]: + df[col] = df[col].fillna(0.0) + + # Labels + labels = con.execute( + "SELECT login, account_status FROM authors " + "WHERE account_status IN ('active', 'suspended')" + ).fetchdf() + con.close() + + df = df.merge(labels, left_on="author", right_on="login", how="inner") + df = df[df["account_status"].isin(["active", "suspended"])].copy() + + df["is_original_discovery"] = ( + df["author"].isin(original_suspended) + & (df["account_status"] == "suspended") + ) + + n_susp = (df["account_status"] == "suspended").sum() + n_active = (df["account_status"] == "active").sum() + print(f" Labeled: {len(df)} ({n_susp} suspended, {n_active} active)") + + return df + + +def _get_features_before_cutoff( + con: 
duckdb.DuckDBPyConnection, cutoff: str, +) -> pd.DataFrame: + """Compute 8 SQL-native features per author from PRs before cutoff.""" + return con.execute(""" + WITH author_prs AS ( + SELECT author, state, created_at, title, additions, + files_changed, repo + FROM prs + WHERE created_at < ?::TIMESTAMP AND author IS NOT NULL + ), + merged_authors AS ( + SELECT DISTINCT author FROM author_prs WHERE state = 'MERGED' + ) + SELECT + ma.author, + SUM(CASE WHEN ap.state = 'MERGED' THEN 1 ELSE 0 END)::DOUBLE + / COUNT(*)::DOUBLE AS merge_rate, + COUNT(*)::DOUBLE AS total_prs, + COALESCE( + EXTRACT(EPOCH FROM (MAX(ap.created_at) - MIN(ap.created_at))) + / 86400.0, 0.0 + ) AS career_span_days, + AVG(LENGTH(ap.title))::DOUBLE AS mean_title_length, + MEDIAN(CASE WHEN ap.state = 'MERGED' THEN ap.additions END)::DOUBLE + AS median_additions, + MEDIAN(CASE WHEN ap.state = 'MERGED' THEN ap.files_changed END)::DOUBLE + AS median_files_changed, + COUNT(DISTINCT ap.repo)::DOUBLE AS total_repos + FROM merged_authors ma + JOIN author_prs ap ON ap.author = ma.author + GROUP BY ma.author + """, [cutoff]).fetchdf() + + +def _get_extended_features_before( + con: duckdb.DuckDBPyConnection, cutoff: str, +) -> pd.DataFrame: + """Compute F16 extension features from PRs before cutoff. + + Returns: rejection_rate, hour_entropy (stub=0), empty_body_rate, + title_spam_score (stub=0), weekend_ratio, prs_per_active_day. 
+ """ + rows = con.execute(""" + SELECT + author, + -- rejection_rate = 1 - merge_rate + 1.0 - (SUM(CASE WHEN state = 'MERGED' THEN 1 ELSE 0 END)::DOUBLE + / COUNT(*)::DOUBLE) AS rejection_rate, + -- empty_body_rate + SUM(CASE WHEN body IS NULL OR TRIM(body) = '' THEN 1 ELSE 0 END)::DOUBLE + / COUNT(*)::DOUBLE AS empty_body_rate, + -- weekend_ratio (Saturday=6, Sunday=0 in DuckDB's dayofweek) + SUM(CASE WHEN EXTRACT(DOW FROM created_at) IN (0, 6) + THEN 1 ELSE 0 END)::DOUBLE + / COUNT(*)::DOUBLE AS weekend_ratio, + -- prs_per_active_day + CASE + WHEN EXTRACT(EPOCH FROM (MAX(created_at) - MIN(created_at))) + / 86400.0 > 0 + THEN COUNT(*)::DOUBLE / ( + EXTRACT(EPOCH FROM (MAX(created_at) - MIN(created_at))) + / 86400.0) + ELSE COUNT(*)::DOUBLE + END AS prs_per_active_day + FROM prs + WHERE created_at < ?::TIMESTAMP AND author IS NOT NULL + GROUP BY author + """, [cutoff]).fetchdf() + + # Hour entropy and title_spam_score need Python computation + pr_data = con.execute(""" + SELECT author, EXTRACT(HOUR FROM created_at)::INT AS hour, title + FROM prs + WHERE created_at < ?::TIMESTAMP AND author IS NOT NULL + """, [cutoff]).fetchdf() + + hour_entropy = _compute_hour_entropy(pr_data) + title_spam = _compute_title_spam_score(pr_data) + + rows["hour_entropy"] = rows["author"].map(hour_entropy).fillna(0.0) + rows["title_spam_score"] = rows["author"].map(title_spam).fillna(0.0) + + return rows + + +def _compute_hour_entropy(pr_data: pd.DataFrame) -> dict[str, float]: + """Shannon entropy of PR submission hour distribution per author.""" + result: dict[str, float] = {} + for author, group in pr_data.groupby("author"): + hours = group["hour"].values + if len(hours) < 2: + result[str(author)] = 0.0 + continue + counts = np.bincount(hours, minlength=24).astype(float) + probs = counts / counts.sum() + probs = probs[probs > 0] + result[str(author)] = float(-np.sum(probs * np.log2(probs))) + return result + + +def _compute_title_spam_score(pr_data: pd.DataFrame) -> dict[str, 
float]: + """Heuristic spam score based on PR title patterns.""" + spam_patterns = [ + "update readme", "add files", "initial commit", "first commit", + "create ", "delete ", "rename ", "added ", "updated ", + ] + result: dict[str, float] = {} + for author, group in pr_data.groupby("author"): + titles = group["title"].dropna().str.lower() + if len(titles) == 0: + result[str(author)] = 0.0 + continue + spam_count = sum( + 1 for t in titles + if any(p in t for p in spam_patterns) + ) + result[str(author)] = spam_count / len(titles) + return result + + +def _compute_isolation_before( + con: duckdb.DuckDBPyConnection, cutoff: str, +) -> dict[str, float]: + """Compute isolation_score per author from pre-cutoff data.""" + pairs = con.execute(""" + SELECT author, repo, COUNT(*) as pr_count + FROM prs + WHERE created_at < ?::TIMESTAMP AND author IS NOT NULL + GROUP BY author, repo + """, [cutoff]).fetchall() + + repo_contributors: dict[str, set[str]] = defaultdict(set) + author_repos: dict[str, set[str]] = defaultdict(set) + for author, repo, _ in pairs: + repo_contributors[repo].add(author) + author_repos[author].add(repo) + + isolation_scores: dict[str, float] = {} + for author, repos in author_repos.items(): + if not repos: + isolation_scores[author] = 1.0 + continue + + contributor_repo_count: dict[str, int] = defaultdict(int) + for repo in repos: + for c in repo_contributors[repo]: + if c != author: + contributor_repo_count[c] += 1 + + multi_repo = {c for c, count in contributor_repo_count.items() if count >= 2} + + isolated = 0 + for repo in repos: + other_contribs = repo_contributors[repo] - {author} + if not (other_contribs & multi_repo): + isolated += 1 + + isolation_scores[author] = isolated / len(repos) + + return isolation_scores + + +def _compute_graph_features_before( + con: duckdb.DuckDBPyConnection, cutoff: str, +) -> dict[str, dict[str, float]]: + """Compute hub_score and bipartite_clustering from pre-cutoff graph.""" + triples = con.execute(""" + SELECT 
author, repo, COUNT(*) as pr_count + FROM prs + WHERE created_at < ?::TIMESTAMP AND author IS NOT NULL + GROUP BY author, repo + """, [cutoff]).fetchall() + + if not triples: + return {"hub_score": {}, "bipartite_clustering": {}} + + g = nx.Graph() + authors_set = set() + repos_set = set() + for author, repo, count in triples: + a_node = f"a:{author}" + r_node = f"r:{repo}" + g.add_edge(a_node, r_node, weight=count) + authors_set.add(a_node) + repos_set.add(r_node) + + centrality = nx.degree_centrality(g) + hub_scores = { + a.removeprefix("a:"): centrality.get(a, 0.0) for a in authors_set + } + + try: + clustering = nx.bipartite.clustering(g, authors_set) + bip_clustering = { + a.removeprefix("a:"): clustering.get(a, 0.0) for a in authors_set + } + except Exception: + bip_clustering = {a.removeprefix("a:"): 0.0 for a in authors_set} + + return {"hub_score": hub_scores, "bipartite_clustering": bip_clustering} + + +# --------------------------------------------------------------------------- +# Feature preparation +# --------------------------------------------------------------------------- + +def prepare_features( + df: pd.DataFrame, + feature_list: list[str], +) -> np.ndarray: + """Extract columns, log-transform skewed ones, fill NaN with 0.""" + arrays = [] + for col in feature_list: + vals = df[col].fillna(0).values.astype(float) + if col in LOG_TRANSFORM: + vals = np.log1p(np.abs(vals)) * np.sign(vals) + arrays.append(vals) + return np.column_stack(arrays) + + +# --------------------------------------------------------------------------- +# CV engine — suspended-only splitting for k-NN +# --------------------------------------------------------------------------- + +def run_suspended_cv( + df: pd.DataFrame, + score_fn: Any, + n_splits: int = 5, +) -> tuple[np.ndarray, np.ndarray]: + """CV that folds only suspended accounts; active in every fold's eval. + + score_fn(seed_df, eval_df) -> np.ndarray of scores for eval_df rows. 
+ Returns (y_true, y_scores) for the full population. + """ + y = (df["account_status"] == "suspended").astype(int).values + susp_idx = np.where(y == 1)[0] + active_idx = np.where(y == 0)[0] + n_susp = len(susp_idx) + + use_loo = n_susp < 30 + + susp_oof_scores = np.full(n_susp, np.nan) + active_score_accum = np.zeros(len(active_idx)) + active_fold_count = np.zeros(len(active_idx)) + + if use_loo: + for i in range(n_susp): + seed_susp_pos = np.array([j for j in range(n_susp) if j != i]) + if len(seed_susp_pos) == 0: + continue + + seed_susp_df = df.iloc[susp_idx[seed_susp_pos]] + + eval_idx = np.concatenate([susp_idx[[i]], active_idx]) + eval_df = df.iloc[eval_idx] + + scores = score_fn(seed_susp_df, eval_df) + + susp_oof_scores[i] = scores[0] + active_score_accum += scores[1:] + active_fold_count += 1 + else: + kf = KFold(n_splits=n_splits, shuffle=True, random_state=42) + for fold_train_pos, fold_test_pos in kf.split(np.arange(n_susp)): + seed_susp_idx = susp_idx[fold_train_pos] + held_out_susp_idx = susp_idx[fold_test_pos] + + seed_susp_df = df.iloc[seed_susp_idx] + + eval_idx = np.concatenate([held_out_susp_idx, active_idx]) + eval_df = df.iloc[eval_idx] + + scores = score_fn(seed_susp_df, eval_df) + + n_held = len(held_out_susp_idx) + susp_oof_scores[fold_test_pos] = scores[:n_held] + active_score_accum += scores[n_held:] + active_fold_count += 1 + + safe_count = np.where(active_fold_count > 0, active_fold_count, 1.0) + active_avg_scores = active_score_accum / safe_count + + y_scores = np.full(len(y), np.nan) + y_scores[susp_idx] = susp_oof_scores + y_scores[active_idx] = active_avg_scores + + return y, y_scores + + +# --------------------------------------------------------------------------- +# Metrics +# --------------------------------------------------------------------------- + +def compute_metrics( + y_true: np.ndarray, + y_scores: np.ndarray, +) -> dict[str, Any]: + """Compute AUC-ROC, AUC-PR, Precision@25, Precision@50.""" + metrics: dict[str, Any] = 
{} + finite = np.isfinite(y_scores) + if y_true[finite].sum() > 0 and (1 - y_true[finite]).sum() > 0: + metrics["auc_roc"] = float(roc_auc_score(y_true[finite], y_scores[finite])) + metrics["auc_pr"] = float(average_precision_score( + y_true[finite], y_scores[finite], + )) + for k in [25, 50]: + if k <= finite.sum(): + top_k_idx = np.argsort(y_scores[finite])[-k:] + metrics[f"precision_at_{k}"] = float( + y_true[finite][top_k_idx].sum() / k + ) + else: + metrics["auc_roc"] = float("nan") + metrics["auc_pr"] = float("nan") + return metrics + + +# --------------------------------------------------------------------------- +# Baselines +# --------------------------------------------------------------------------- + +def lr_baseline( + df: pd.DataFrame, + feature_list: list[str], + seed: int = 42, +) -> tuple[np.ndarray, np.ndarray]: + """Logistic regression baseline with LOO/5-fold CV. + + Returns (y_true, y_scores). + """ + y = (df["account_status"] == "suspended").astype(int).values + x = prepare_features(df, feature_list) + n_pos = y.sum() + + oof = np.full(len(y), np.nan) + + if n_pos < 30: + loo = LeaveOneOut() + for train_idx, test_idx in loo.split(x, y): + scaler = StandardScaler() + x_train = scaler.fit_transform(x[train_idx]) + x_test = scaler.transform(x[test_idx]) + model = LogisticRegression( + class_weight="balanced", max_iter=1000, random_state=seed, + ) + model.fit(x_train, y[train_idx]) + oof[test_idx] = model.predict_proba(x_test)[:, 1] + else: + skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) + for train_idx, test_idx in skf.split(x, y): + scaler = StandardScaler() + x_train = scaler.fit_transform(x[train_idx]) + x_test = scaler.transform(x[test_idx]) + model = LogisticRegression( + class_weight="balanced", max_iter=1000, random_state=seed, + ) + model.fit(x_train, y[train_idx]) + oof[test_idx] = model.predict_proba(x_test)[:, 1] + + return y, oof + + +def merge_rate_baseline(df: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]: + 
"""1 - merge_rate as a simple baseline score.""" + y = (df["account_status"] == "suspended").astype(int).values + scores = 1.0 - df["merge_rate"].fillna(0).values.astype(float) + return y, scores + + +# --------------------------------------------------------------------------- +# Statistical tests +# --------------------------------------------------------------------------- + +def delong_auc_test( + y_true: np.ndarray, + y_scores_a: np.ndarray, + y_scores_b: np.ndarray, +) -> dict[str, Any]: + """Paired DeLong test for comparing two AUC-ROC values.""" + n1 = np.sum(y_true == 1) + n0 = np.sum(y_true == 0) + + if n1 == 0 or n0 == 0: + return { + "auc_a": float("nan"), "auc_b": float("nan"), + "z_statistic": float("nan"), "p_value": float("nan"), + } + + auc_a = roc_auc_score(y_true, y_scores_a) + auc_b = roc_auc_score(y_true, y_scores_b) + + pos_idx = np.where(y_true == 1)[0] + neg_idx = np.where(y_true == 0)[0] + + def placement_values( + scores: np.ndarray, + ) -> tuple[np.ndarray, np.ndarray]: + pos_scores = scores[pos_idx] + neg_scores = scores[neg_idx] + v10 = np.array([ + np.mean(ps > neg_scores) + 0.5 * np.mean(ps == neg_scores) + for ps in pos_scores + ]) + v01 = np.array([ + np.mean(pos_scores > ns) + 0.5 * np.mean(pos_scores == ns) + for ns in neg_scores + ]) + return v10, v01 + + v10_a, v01_a = placement_values(y_scores_a) + v10_b, v01_b = placement_values(y_scores_b) + + s10 = np.cov(np.stack([v10_a, v10_b])) + s01 = np.cov(np.stack([v01_a, v01_b])) + + if s10.ndim == 0: + s10 = np.array([[s10]]) + if s01.ndim == 0: + s01 = np.array([[s01]]) + + s = s10 / n1 + s01 / n0 + contrast = np.array([1, -1]) + var_diff = contrast @ s @ contrast + + if var_diff <= 0: + return {"auc_a": auc_a, "auc_b": auc_b, "z_statistic": 0.0, "p_value": 1.0} + + z = (auc_a - auc_b) / np.sqrt(var_diff) + p_value = 2.0 * sp_stats.norm.sf(abs(z)) + + return { + "auc_a": float(auc_a), "auc_b": float(auc_b), + "z_statistic": float(z), "p_value": float(p_value), + } + + +def 
holm_bonferroni( + p_values: dict[str, float], alpha: float = 0.05, +) -> dict[str, dict[str, Any]]: + """Apply Holm-Bonferroni correction to a set of p-values.""" + sorted_tests = sorted(p_values.items(), key=lambda x: x[1]) + m = len(sorted_tests) + results: dict[str, dict[str, Any]] = {} + + prev_adj = 0.0 + for rank, (name, p) in enumerate(sorted_tests, start=1): + adjusted_p = min(1.0, p * (m - rank + 1)) + adjusted_p = max(adjusted_p, prev_adj) + prev_adj = adjusted_p + results[name] = { + "p_value": p, "adjusted_p": adjusted_p, + "reject": adjusted_p <= alpha, "rank": rank, + } + + return results diff --git a/scripts/proximity_graph_experiment.py b/scripts/proximity_graph_experiment.py new file mode 100644 index 0000000..b27528e --- /dev/null +++ b/scripts/proximity_graph_experiment.py @@ -0,0 +1,399 @@ +"""Graph-based proximity experiment for suspension detection. + +Tests H2 (graph-based proximity captures structural signal) via: + - Shared-repo Jaccard proximity (max and mean-k variants) + - Personalized PageRank from suspended seeds + +Strategies A, B, C as in the k-NN experiment. +""" + +from __future__ import annotations + +import json +from collections import defaultdict +from typing import Any + +import duckdb +import networkx as nx +import numpy as np +import pandas as pd +from proximity_common import ( + CUTOFFS, + DB_PATH, + RESULTS_DIR, + compute_metrics, + load_all_time_features, + load_temporal_features, + merge_rate_baseline, + run_suspended_cv, +) + +# --------------------------------------------------------------------------- +# Graph proximity methods +# --------------------------------------------------------------------------- + +def build_author_repo_data( + con: duckdb.DuckDBPyConnection, + cutoff: str | None = None, +) -> tuple[dict[str, set[str]], nx.Graph]: + """Build author-repo mappings and bipartite graph. + + Returns (author_repos dict, bipartite Graph). 
+ """ + if cutoff: + rows = con.execute(""" + SELECT author, repo, COUNT(*) as pr_count + FROM prs + WHERE created_at < ?::TIMESTAMP AND author IS NOT NULL + GROUP BY author, repo + """, [cutoff]).fetchall() + else: + rows = con.execute(""" + SELECT author, repo, COUNT(*) as pr_count + FROM prs WHERE author IS NOT NULL + GROUP BY author, repo + """).fetchall() + + author_repos: dict[str, set[str]] = defaultdict(set) + g = nx.Graph() + + for author, repo, count in rows: + author_repos[author].add(repo) + a_node = f"a:{author}" + r_node = f"r:{repo}" + g.add_edge(a_node, r_node, weight=count) + + return author_repos, g + + +def jaccard_max_proximity( + seed_repos: dict[str, set[str]], + eval_repos: dict[str, set[str]], +) -> dict[str, float]: + """Max Jaccard similarity between eval author's repos and any seed's repos.""" + scores: dict[str, float] = {} + seed_list = list(seed_repos.values()) + + for author, repos in eval_repos.items(): + if not repos: + scores[author] = 0.0 + continue + max_j = 0.0 + for s_repos in seed_list: + if not s_repos: + continue + inter = len(repos & s_repos) + union = len(repos | s_repos) + if union > 0: + max_j = max(max_j, inter / union) + scores[author] = max_j + + return scores + + +def jaccard_mean_k_proximity( + seed_repos: dict[str, set[str]], + eval_repos: dict[str, set[str]], + k: int = 5, +) -> dict[str, float]: + """Mean of top-k Jaccard similarities to seeds.""" + scores: dict[str, float] = {} + seed_list = list(seed_repos.values()) + + for author, repos in eval_repos.items(): + if not repos: + scores[author] = 0.0 + continue + jaccards = [] + for s_repos in seed_list: + if not s_repos: + continue + inter = len(repos & s_repos) + union = len(repos | s_repos) + jaccards.append(inter / union if union > 0 else 0.0) + + if not jaccards: + scores[author] = 0.0 + continue + + jaccards.sort(reverse=True) + top_k = jaccards[:min(k, len(jaccards))] + scores[author] = float(np.mean(top_k)) + + return scores + + +def ppr_proximity( + 
graph: nx.Graph, + seed_authors: list[str], + alpha: float = 0.85, +) -> dict[str, float]: + """Personalized PageRank with restart on suspended seed nodes. + + Returns PPR value at each author node (higher = closer to seeds). + """ + seed_nodes = [f"a:{a}" for a in seed_authors if f"a:{a}" in graph] + if not seed_nodes: + return {} + + personalization = {node: 1.0 / len(seed_nodes) for node in seed_nodes} + + try: + ppr = nx.pagerank(graph, alpha=alpha, personalization=personalization) + except nx.PowerIterationFailedConvergence: + ppr = nx.pagerank( + graph, alpha=alpha, personalization=personalization, max_iter=200, + ) + + # Extract author scores + scores: dict[str, float] = {} + for node, val in ppr.items(): + if node.startswith("a:"): + scores[node.removeprefix("a:")] = float(val) + + return scores + + +# --------------------------------------------------------------------------- +# Strategy implementations +# --------------------------------------------------------------------------- + +def run_strategy_a( + df: pd.DataFrame, + author_repos: dict[str, set[str]], + graph: nx.Graph, +) -> dict[str, Any]: + """Strategy A: Discovery-order holdout for graph methods.""" + print("\n=== Strategy A: Discovery-Order Holdout (Graph) ===") + results: dict[str, Any] = {} + + susp_mask = df["account_status"] == "suspended" + orig_mask = df["is_original_discovery"] + + seed_authors = df.loc[orig_mask, "author" if "author" in df.columns + else "login"].tolist() + test_df = df[~orig_mask | ~susp_mask].copy() + + author_col = "author" if "author" in test_df.columns else "login" + y_test = (test_df["account_status"] == "suspended").astype(int).values + test_authors = test_df[author_col].tolist() + + n_seeds = len(seed_authors) + print(f" Seeds: {n_seeds}, Test: {y_test.sum()} suspended, " + f"{(1 - y_test).sum()} active") + + # Jaccard max + seed_repo_map = {a: author_repos.get(a, set()) for a in seed_authors} + test_repo_map = {a: author_repos.get(a, set()) for a in 
def run_strategy_b(
    df: pd.DataFrame,
    author_repos: dict[str, set[str]],
    graph: nx.Graph,
    label: str = "merged-PR",
) -> dict[str, Any]:
    """Strategy B: suspended-only CV for the graph proximity methods.

    Each proximity method is wrapped as a ``(seed_df, eval_df) -> scores``
    closure and evaluated via ``run_suspended_cv``; a merge-rate baseline
    is appended for comparison.

    Args:
        df: Author-level feature frame with an ``author`` (or ``login``)
            column and an ``account_status`` column.
        author_repos: Mapping from author login to the set of repos they
            touched (used by the Jaccard methods).
        graph: Author/repo graph (used by PPR).
        label: Tag echoed in the printed section header.

    Returns:
        Dict mapping method name to its metrics dict.
    """
    print(f"\n=== Strategy B: CV ({label}) (Graph) ===")
    results: dict[str, Any] = {}

    author_col = "author" if "author" in df.columns else "login"

    def _repo_sets(logins: list[str]) -> dict[str, set[str]]:
        # Authors absent from the map get an empty repo set, not a KeyError.
        return {login: author_repos.get(login, set()) for login in logins}

    # Jaccard max via CV
    def jaccard_max_fn(
        seed_df: pd.DataFrame,
        eval_df: pd.DataFrame,
    ) -> np.ndarray:
        seed_logins = seed_df[author_col].tolist()
        eval_logins = eval_df[author_col].tolist()
        prox = jaccard_max_proximity(
            _repo_sets(seed_logins), _repo_sets(eval_logins),
        )
        return np.array([prox.get(login, 0.0) for login in eval_logins])

    y_true, y_score = run_suspended_cv(df, jaccard_max_fn)
    met = compute_metrics(y_true, y_score)
    results["jaccard_max"] = met
    print(f" jaccard_max: AUC={met.get('auc_roc', float('nan')):.4f}")

    # Jaccard mean-k5 via CV
    def jaccard_mean_k5_fn(
        seed_df: pd.DataFrame,
        eval_df: pd.DataFrame,
    ) -> np.ndarray:
        seed_logins = seed_df[author_col].tolist()
        eval_logins = eval_df[author_col].tolist()
        prox = jaccard_mean_k_proximity(
            _repo_sets(seed_logins), _repo_sets(eval_logins), k=5,
        )
        return np.array([prox.get(login, 0.0) for login in eval_logins])

    y_true, y_score = run_suspended_cv(df, jaccard_mean_k5_fn)
    met = compute_metrics(y_true, y_score)
    results["jaccard_mean_k5"] = met
    print(f" jaccard_mean_k5: AUC={met.get('auc_roc', float('nan')):.4f}")

    # PPR via CV
    def ppr_fn(
        seed_df: pd.DataFrame,
        eval_df: pd.DataFrame,
    ) -> np.ndarray:
        seed_logins = seed_df[author_col].tolist()
        eval_logins = eval_df[author_col].tolist()
        prox = ppr_proximity(graph, seed_logins)
        return np.array([prox.get(login, 0.0) for login in eval_logins])

    y_true, y_score = run_suspended_cv(df, ppr_fn)
    met = compute_metrics(y_true, y_score)
    results["ppr"] = met
    print(f" ppr: AUC={met.get('auc_roc', float('nan')):.4f}")

    # Merge-rate baseline on the same population.
    base_y, base_scores = merge_rate_baseline(df)
    results["baseline_merge_rate"] = compute_metrics(base_y, base_scores)

    return results


def run_strategy_c(
    cutoffs: list[str],
) -> dict[str, Any]:
    """Strategy C: temporal holdout for the graph methods.

    For each cutoff date the feature frame, author->repo map, and graph
    are rebuilt from pre-cutoff data only, then the same suspended-only
    CV as Strategy B is run on that snapshot.

    Args:
        cutoffs: ISO date strings to evaluate.

    Returns:
        Dict mapping cutoff -> per-method metrics (plus ``n_suspended``).
    """
    print("\n=== Strategy C: Temporal Holdout (Graph) ===")
    results: dict[str, Any] = {}

    for cutoff in cutoffs:
        df = load_temporal_features(cutoff)
        n_susp = (df["account_status"] == "suspended").sum()
        if n_susp < 5:
            # Too few positives for a meaningful CV split at this cutoff.
            print(f" Skipping {cutoff}: only {n_susp} suspended")
            continue

        con = duckdb.connect(str(DB_PATH), read_only=True)
        author_repos, graph = build_author_repo_data(con, cutoff)
        con.close()

        author_col = "author" if "author" in df.columns else "login"
        cutoff_results: dict[str, Any] = {"n_suspended": int(n_susp)}

        # Jaccard max via CV. Default args pin this iteration's map and
        # column name, avoiding the late-binding closure pitfall.
        def jaccard_max_fn(
            seed_df: pd.DataFrame,
            eval_df: pd.DataFrame,
            _ar: dict[str, set[str]] = author_repos,
            _ac: str = author_col,
        ) -> np.ndarray:
            seed_logins = seed_df[_ac].tolist()
            eval_logins = eval_df[_ac].tolist()
            seed_repos = {a: _ar.get(a, set()) for a in seed_logins}
            eval_repos = {a: _ar.get(a, set()) for a in eval_logins}
            prox = jaccard_max_proximity(seed_repos, eval_repos)
            return np.array([prox.get(a, 0.0) for a in eval_logins])

        y_true, y_score = run_suspended_cv(df, jaccard_max_fn)
        met = compute_metrics(y_true, y_score)
        cutoff_results["jaccard_max"] = met
        print(f" {cutoff} jaccard_max: "
              f"AUC={met.get('auc_roc', float('nan')):.4f}")

        # PPR via CV (same default-arg binding trick for the graph).
        def ppr_fn(
            seed_df: pd.DataFrame,
            eval_df: pd.DataFrame,
            _g: nx.Graph = graph,
            _ac: str = author_col,
        ) -> np.ndarray:
            seed_logins = seed_df[_ac].tolist()
            eval_logins = eval_df[_ac].tolist()
            prox = ppr_proximity(_g, seed_logins)
            return np.array([prox.get(a, 0.0) for a in eval_logins])

        y_true, y_score = run_suspended_cv(df, ppr_fn)
        met = compute_metrics(y_true, y_score)
        cutoff_results["ppr"] = met
        print(f" {cutoff} ppr: AUC={met.get('auc_roc', float('nan')):.4f}")

        # Merge-rate baseline per cutoff.
        base_y, base_scores = merge_rate_baseline(df)
        cutoff_results["baseline_merge_rate"] = compute_metrics(base_y, base_scores)

        results[cutoff] = cutoff_results

    return results


def main() -> None:
    """Run strategies A-C for the graph methods and dump JSON results."""
    all_results: dict[str, Any] = {}

    # Merged-PR population; normalize the author column name up front
    # (the parquet uses 'login', temporal uses 'author').
    df_merged = load_all_time_features(merged_pr_only=True)
    if "login" in df_merged.columns and "author" not in df_merged.columns:
        df_merged = df_merged.rename(columns={"login": "author"})

    con = duckdb.connect(str(DB_PATH), read_only=True)
    author_repos, graph = build_author_repo_data(con)
    con.close()

    print(f"Graph: {graph.number_of_nodes()} nodes, "
          f"{graph.number_of_edges()} edges")

    # Strategy A
    all_results["strategy_a"] = run_strategy_a(
        df_merged, author_repos, graph,
    )

    # Strategy B: merged-PR
    all_results["strategy_b_merged"] = run_strategy_b(
        df_merged, author_repos, graph, label="merged-PR",
    )

    # Strategy C: temporal
    all_results["strategy_c"] = run_strategy_c(CUTOFFS)

    # Persist everything as JSON.
    output_path = RESULTS_DIR / "graph_results.json"

    def _default(value: Any) -> Any:
        # numpy scalars/arrays are not natively JSON-serializable.
        if isinstance(value, np.integer):
            return int(value)
        if isinstance(value, np.floating):
            return float(value)
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Not serializable: {type(value)}")

    with open(output_path, "w") as f:
        json.dump(all_results, f, indent=2, default=_default)
    print(f"\nResults saved to {output_path}")


if __name__ == "__main__":
    main()
def knn_proximity_score(
    seed_x: np.ndarray,
    eval_x: np.ndarray,
    k: int,
    metric: str,
) -> np.ndarray:
    """Score eval set by negative mean distance to k nearest seeds.

    Higher score = closer to seeds = more suspicious.

    Args:
        seed_x: Seed feature matrix, shape (n_seeds, n_features).
        eval_x: Evaluation feature matrix, shape (n_eval, n_features).
        k: Number of nearest seeds to average over (capped at n_seeds).
        metric: Distance metric name passed to sklearn NearestNeighbors.

    Returns:
        1-D array of scores, one per eval row (zeros if there are no seeds).
    """
    effective_k = min(k, len(seed_x))
    if effective_k == 0:
        # No seeds to measure against — neutral score for everyone.
        return np.zeros(len(eval_x))
    nn = NearestNeighbors(n_neighbors=effective_k, metric=metric)
    nn.fit(seed_x)
    distances, _ = nn.kneighbors(eval_x)
    return -distances.mean(axis=1)


def run_strategy_a(
    df: Any,
    feature_sets: dict[str, list[str]],
    metrics_list: list[str],
    k_values: list[int],
) -> dict[str, Any]:
    """Strategy A: Discovery-order holdout.

    Seeds = original 44 suspended with merged PRs.
    Test positives = 373 expansion suspended with merged PRs.
    Test negatives = all active with merged PRs.

    Returns:
        Dict keyed by ``knn_{fs}_{metric}_k{k}`` plus baseline entries.
    """
    print("\n=== Strategy A: Discovery-Order Holdout ===")
    results: dict[str, Any] = {}

    susp_mask = df["account_status"] == "suspended"
    orig_mask = df["is_original_discovery"]

    # FIX: seeds are the *suspended* originals, matching the documented
    # contract. The previous seed_df = df[orig_mask] would leak any active
    # original-discovery author into both the seed and test sets (the test
    # mask ~orig | ~susp only excludes the original AND suspended rows).
    seed_df = df[orig_mask & susp_mask].copy()
    test_df = df[~orig_mask | ~susp_mask].copy()

    n_seeds = len(seed_df)
    n_test_susp = (test_df["account_status"] == "suspended").sum()
    n_test_active = (test_df["account_status"] == "active").sum()
    print(f" Seeds: {n_seeds}, Test: {n_test_susp} suspended, "
          f"{n_test_active} active")

    y_test = (test_df["account_status"] == "suspended").astype(int).values

    for fs_name, fs in feature_sets.items():
        # Feature matrices and the fitted scaler depend only on the feature
        # set, so compute them once per fs instead of per (metric, k) pair
        # — identical results, far less redundant work.
        seed_x = prepare_features(seed_df, fs)
        test_x = prepare_features(test_df, fs)

        # Scale on seeds + active (no test suspended).
        all_train_x = np.vstack([
            seed_x,
            prepare_features(
                test_df[test_df["account_status"] == "active"], fs,
            ),
        ])
        scaler = StandardScaler()
        scaler.fit(all_train_x)

        seed_scaled = scaler.transform(seed_x)
        test_scaled = scaler.transform(test_x)

        for metric in metrics_list:
            for k in k_values:
                key = f"knn_{fs_name}_{metric}_k{k}"

                scores = knn_proximity_score(
                    seed_scaled, test_scaled, k, metric,
                )

                m = compute_metrics(y_test, scores)
                results[key] = {
                    "feature_set": fs_name,
                    "metric": metric,
                    "k": k,
                    "n_seeds": n_seeds,
                    **m,
                }
                print(f" {key}: AUC={m.get('auc_roc', float('nan')):.4f}")

    # Baselines on test set
    y_mr, s_mr = merge_rate_baseline(test_df)
    results["baseline_merge_rate"] = compute_metrics(y_mr, s_mr)
    print(f" baseline_merge_rate: "
          f"AUC={results['baseline_merge_rate'].get('auc_roc', float('nan')):.4f}")

    for fs_name, fs in feature_sets.items():
        y_lr, s_lr = lr_baseline(test_df, fs)
        bkey = f"baseline_lr_{fs_name}"
        results[bkey] = compute_metrics(y_lr, s_lr)
        print(f" {bkey}: "
              f"AUC={results[bkey].get('auc_roc', float('nan')):.4f}")

    return results


def run_strategy_b(
    df: Any,
    feature_sets: dict[str, list[str]],
    metrics_list: list[str],
    k_values: list[int],
    label: str = "merged-PR",
) -> dict[str, Any]:
    """Strategy B: Suspended-only CV on all-time features.

    Each (feature set, metric, k) combination is evaluated through
    ``run_suspended_cv``; per-fold scaling is fit on the union of seed and
    eval rows for that fold.
    """
    print(f"\n=== Strategy B: CV ({label}) ===")
    results: dict[str, Any] = {}

    for fs_name, fs in feature_sets.items():
        for metric in metrics_list:
            for k in k_values:
                key = f"knn_{fs_name}_{metric}_k{k}"

                # Default args pin the loop variables (late-binding fix).
                def score_fn(
                    seed_df: Any,
                    eval_df: Any,
                    _fs: list[str] = fs,
                    _k: int = k,
                    _metric: str = metric,
                ) -> np.ndarray:
                    seed_x = prepare_features(seed_df, _fs)
                    eval_x = prepare_features(eval_df, _fs)

                    all_x = np.vstack([seed_x, eval_x])
                    scaler = StandardScaler()
                    scaler.fit(all_x)

                    return knn_proximity_score(
                        scaler.transform(seed_x),
                        scaler.transform(eval_x),
                        _k,
                        _metric,
                    )

                y, scores = run_suspended_cv(df, score_fn)
                m = compute_metrics(y, scores)
                results[key] = {
                    "feature_set": fs_name,
                    "metric": metric,
                    "k": k,
                    **m,
                }
                print(f" {key}: AUC={m.get('auc_roc', float('nan')):.4f}")

    # Baselines
    y_mr, s_mr = merge_rate_baseline(df)
    results["baseline_merge_rate"] = compute_metrics(y_mr, s_mr)
    print(f" baseline_merge_rate: "
          f"AUC={results['baseline_merge_rate'].get('auc_roc', float('nan')):.4f}")

    for fs_name, fs in feature_sets.items():
        y_lr, s_lr = lr_baseline(df, fs)
        bkey = f"baseline_lr_{fs_name}"
        results[bkey] = compute_metrics(y_lr, s_lr)
        print(f" {bkey}: "
              f"AUC={results[bkey].get('auc_roc', float('nan')):.4f}")

    return results


def run_strategy_c(
    feature_sets: dict[str, list[str]],
    metrics_list: list[str],
    k_values: list[int],
) -> dict[str, Any]:
    """Strategy C: Temporal holdout with pre-cutoff features.

    Cutoffs with fewer than 5 suspended authors are skipped, as are
    feature sets with columns missing from the temporal frame.
    """
    print("\n=== Strategy C: Temporal Holdout ===")
    results: dict[str, Any] = {}

    for cutoff in CUTOFFS:
        df = load_temporal_features(cutoff)
        n_susp = (df["account_status"] == "suspended").sum()
        if n_susp < 5:
            print(f" Skipping {cutoff}: only {n_susp} suspended")
            continue

        cutoff_results: dict[str, Any] = {"n_suspended": int(n_susp)}

        # Only test cosine + selected k values for temporal
        for fs_name in ["F10", "F16"]:
            fs = feature_sets[fs_name]
            # Check all features are available
            missing = [f for f in fs if f not in df.columns]
            if missing:
                print(f" Skipping {fs_name} at {cutoff}: missing {missing}")
                continue

            for k in [5, 10]:
                if k not in k_values:
                    continue

                key = f"knn_{fs_name}_cosine_k{k}"

                # Default args pin the loop variables (late-binding fix).
                def score_fn(
                    seed_df: Any,
                    eval_df: Any,
                    _fs: list[str] = fs,
                    _k: int = k,
                ) -> np.ndarray:
                    seed_x = prepare_features(seed_df, _fs)
                    eval_x = prepare_features(eval_df, _fs)

                    all_x = np.vstack([seed_x, eval_x])
                    scaler = StandardScaler()
                    scaler.fit(all_x)

                    return knn_proximity_score(
                        scaler.transform(seed_x),
                        scaler.transform(eval_x),
                        _k,
                        "cosine",
                    )

                y, scores = run_suspended_cv(df, score_fn)
                m = compute_metrics(y, scores)
                cutoff_results[key] = {
                    "feature_set": fs_name,
                    "metric": "cosine",
                    "k": k,
                    **m,
                }
                print(f" {cutoff} {key}: "
                      f"AUC={m.get('auc_roc', float('nan')):.4f}")

        # Baselines per cutoff
        y_mr, s_mr = merge_rate_baseline(df)
        cutoff_results["baseline_merge_rate"] = compute_metrics(y_mr, s_mr)

        for fs_name in ["F10", "F16"]:
            fs = feature_sets[fs_name]
            missing = [f for f in fs if f not in df.columns]
            if missing:
                continue
            y_lr, s_lr = lr_baseline(df, fs)
            cutoff_results[f"baseline_lr_{fs_name}"] = compute_metrics(y_lr, s_lr)

        results[cutoff] = cutoff_results

    return results


def run_delong_comparisons(
    all_results: dict[str, Any],
) -> dict[str, Any]:
    """Run DeLong tests comparing k-NN methods to baselines.

    Operates on stored y_true/y_scores that were saved during experiments.
    For simplicity, we re-identify the best methods and note AUC comparisons.

    Args:
        all_results: Per-strategy result dicts (non-dict values ignored).

    Returns:
        Per-strategy dict with the best k-NN key/AUC and, per baseline,
        its AUC and the delta (best k-NN AUC minus baseline AUC; NaN when
        either AUC is non-finite).
    """
    comparisons: dict[str, Any] = {}

    for strategy_name, strategy_results in all_results.items():
        if not isinstance(strategy_results, dict):
            continue

        # Find all knn results and baselines
        knn_keys = [
            k for k in strategy_results
            if k.startswith("knn_") and isinstance(strategy_results[k], dict)
        ]
        baseline_keys = [
            k for k in strategy_results
            if k.startswith("baseline_") and isinstance(strategy_results[k], dict)
        ]

        if not knn_keys or not baseline_keys:
            continue

        # Report best knn AUC vs baselines
        best_knn_key = max(
            knn_keys,
            key=lambda k: strategy_results[k].get("auc_roc", float("-inf")),
        )
        best_knn_auc = strategy_results[best_knn_key].get("auc_roc", float("nan"))

        comp: dict[str, Any] = {
            "best_knn": best_knn_key,
            "best_knn_auc": best_knn_auc,
            "baselines": {},
        }

        for bk in baseline_keys:
            b_auc = strategy_results[bk].get("auc_roc", float("nan"))
            comp["baselines"][bk] = {
                "auc": b_auc,
                "delta": best_knn_auc - b_auc if np.isfinite(best_knn_auc)
                and np.isfinite(b_auc) else float("nan"),
            }

        comparisons[strategy_name] = comp

    return comparisons


def main() -> None:
    """Run strategies A-C plus DeLong comparisons and dump JSON results."""
    feature_sets = {"F10": F10, "F16": F16, "F16_no_mr": F16_NO_MR}
    metrics_list = ["cosine", "euclidean"]
    k_values = [3, 5, 10, 15]

    all_results: dict[str, Any] = {}

    # Load merged-PR population
    df_merged = load_all_time_features(merged_pr_only=True)

    # Check which F16 features are available in parquet
    available_f16 = [f for f in F16 if f in df_merged.columns]
    missing_f16 = [f for f in F16 if f not in df_merged.columns]
    if missing_f16:
        print(f"Warning: Missing F16 features in parquet: {missing_f16}")
        # Use only available features
        feature_sets["F16"] = available_f16
        feature_sets["F16_no_mr"] = [
            f for f in available_f16
            if f not in ("merge_rate", "rejection_rate")
        ]

    # Strategy A: Discovery-order holdout
    all_results["strategy_a"] = run_strategy_a(
        df_merged, feature_sets, metrics_list, k_values,
    )

    # Strategy B: CV on merged-PR population
    all_results["strategy_b_merged"] = run_strategy_b(
        df_merged, feature_sets, metrics_list, k_values,
        label="merged-PR",
    )

    # Strategy B: Replication on all-authors population
    print("\n--- Loading all-authors population for replication ---")
    df_all = load_all_time_features(merged_pr_only=False)
    available_f16_all = [f for f in F16 if f in df_all.columns]
    repl_fs = {"F16": available_f16_all}
    all_results["strategy_b_all_authors"] = run_strategy_b(
        df_all, repl_fs, ["cosine"], [5],
        label="all-authors (stage 12 replication)",
    )

    # Strategy C: Temporal holdout
    all_results["strategy_c"] = run_strategy_c(
        {"F10": F10, "F16": F16}, metrics_list=["cosine"],
        k_values=[5, 10],
    )

    # DeLong comparisons (AUC-level, no paired scores available)
    all_results["delong_comparisons"] = run_delong_comparisons(all_results)

    # Save results
    output_path = RESULTS_DIR / "knn_results.json"

    def _default(obj: Any) -> Any:
        # numpy scalars/arrays are not natively JSON-serializable.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        raise TypeError(f"Not serializable: {type(obj)}")

    with open(output_path, "w") as f:
        json.dump(all_results, f, indent=2, default=_default)
    print(f"\nResults saved to {output_path}")


if __name__ == "__main__":
    main()
suspension detection experiment (H5) Tests Gemini 3.1 Pro scoring of PR titles/bodies as a signal for detecting suspended GitHub accounts in the merged-PR population. Uses temporal cutoffs (Strategy C) to prevent lookahead bias. Three prompt variants (V1: titles, V2: titles+bodies, V3: full profile) scored across 3 cutoffs (2022-07-01, 2023-01-01, 2024-01-01) totaling 30,131 API calls. Results: standalone LLM AUC 0.50-0.57, marginal. Combined LR(F10)+LLM(V2)+Jaccard reaches AUC 0.577 at the 2024-01-01 cutoff (+0.026 over Jaccard alone, significant after Holm-Bonferroni). Second-phase re-ranking is ineffective. H5 weakly supported. --- .../proximity_results/PROXIMITY_ANALYSIS.md | 72 +- .../proximity_results/llm_results.json | 1963 +++++++++++++++++ scripts/proximity_llm_client.py | 204 ++ scripts/proximity_llm_experiment.py | 903 ++++++++ 4 files changed, 3139 insertions(+), 3 deletions(-) create mode 100644 experiments/bot_detection/proximity_results/llm_results.json create mode 100644 scripts/proximity_llm_client.py create mode 100644 scripts/proximity_llm_experiment.py diff --git a/experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md b/experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md index 71ef19a..6fdf692 100644 --- a/experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md +++ b/experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md @@ -241,12 +241,78 @@ Behavioral LR baseline: AUC = 0.5727 **Verdict**: SUPPORTED — F10+graph_combined: delta=+0.0493, p=0.0000; F10+both_combined: delta=+0.0209, p=0.0014; F16+graph_combined: delta=+0.0336, p=0.0000; F16+both_combined: delta=+0.0148, p=0.0011 -## 5. Summary +## 5. LLM Scoring Results (H5) -**Best overall method**: jaccard_max (Strategy B merged (Graph)), AUC = 0.5952 +Tests whether LLM (Gemini 3.1 Pro) scoring of PR titles and bodies adds signal +beyond behavioral and graph-proximity features. 
Uses temporal cutoffs (Strategy C) +to prevent lookahead bias: the LLM only sees PR titles and bodies created before +the cutoff date. + +### Prompt Variants + +- **V1** (title-only): Up to 20 merged PR titles, cheapest +- **V2** (titles + bodies): Titles + first 500 chars of body for up to 10 PRs +- **V3** (full profile): V2 + author metadata (total_prs, merge_rate, total_repos, career_span) + +Model: `gemini/gemini-3.1-pro-preview`, temperature=1.0, 30,131 total API calls +across 3 cutoffs × 3 variants. Score failures dropped (not defaulted). + +### Standalone LLM Scoring + +| Cutoff | Population | V1 AUC | V2 AUC | V3 AUC | +|--------|-----------|--------|--------|--------| +| 2022-07-01 | 2,235 (58 susp) | 0.5444 | 0.5695 | 0.5209 | +| 2023-01-01 | 3,619 (92 susp) | 0.5168 | 0.5167 | 0.5045 | +| 2024-01-01 | 7,642 (204 susp) | 0.5372 | 0.5469 | 0.5408 | + +V2 (titles + bodies) is the best or tied-best variant at every cutoff. Standalone +AUC ranges 0.50–0.57, comparable to behavioral feature baselines but not strong +on its own. V3 (full profile) does not improve over V2, suggesting the metadata +block does not help the LLM beyond what it can infer from PR text. + +### Combined Models (LR + LLM ± Jaccard, F10 features) + +| Cutoff | LR Baseline | + Jaccard | + LLM (V2) | + LLM + Jaccard | +|--------|-------------|-----------|------------|-----------------| +| 2022-07-01 | 0.4600 | 0.4612 | 0.4914 | 0.4958 | +| 2023-01-01 | 0.5277 | 0.5321 | 0.5160 | 0.5222 | +| 2024-01-01 | 0.5357 | 0.5510 | 0.5630 | 0.5771 | +At the 2024-01-01 cutoff (largest population), LLM+Jaccard combined reaches +AUC 0.577, a +0.026 improvement over Jaccard alone (0.551). Three of 24 DeLong +tests are significant after Holm-Bonferroni correction — all from the 2024-01-01 +cutoff. At earlier cutoffs with smaller populations, no tests reach significance. 
+ +### Second-Phase Re-ranking + +LLM re-ranking of top-N candidates from the first-phase model (LR+Jaccard) was +tested at top-100, top-200, and top-500 with alpha sweeps blending first-phase +and LLM scores (z-normalized). Results are uniformly negative: LLM re-ranking +does not improve precision at any operating point across any cutoff or variant. + +### H5 Verdict + +**H5**: LLM scoring of PR text adds signal beyond behavioral + graph features + +**Verdict**: WEAKLY SUPPORTED — On the largest population (2024-01-01, 7,642 +authors), LLM combined with Jaccard achieves the best single-cutoff AUC (0.577) +and 3/24 DeLong tests survive Holm-Bonferroni correction. However, the effect +is small (+0.026 over Jaccard alone), does not replicate at earlier cutoffs with +smaller populations, and second-phase re-ranking is ineffective. The LLM provides +marginal incremental value as a combined LR feature but is not useful as a +standalone detector or re-ranker on the merged-PR population. + +## 6. Summary + +**Best overall method**: jaccard_max (Strategy B merged (Graph)), AUC = 0.5952 -The best method exceeds the AUC > 0.55 threshold, suggesting proximity-based detection has *some* signal on the merged-PR population. However, the practical value depends on the magnitude and precision at operational thresholds. +**Best combined method**: LR(F10) + LLM(V2) + Jaccard at 2024-01-01 cutoff, +AUC = 0.5771 +The best methods exceed the AUC > 0.55 threshold, suggesting proximity-based +detection has *some* signal on the merged-PR population. However, the practical +value is limited — precision at operational thresholds (P@25, P@50) remains low +across all methods. LLM scoring provides marginal incremental value when combined +with Jaccard but is not useful standalone or for re-ranking. 
**Stage 12 replication** (all-authors, F16, cosine, k=5): AUC = 0.5573 (original stage 12: 0.595) diff --git a/experiments/bot_detection/proximity_results/llm_results.json b/experiments/bot_detection/proximity_results/llm_results.json new file mode 100644 index 0000000..2384eda --- /dev/null +++ b/experiments/bot_detection/proximity_results/llm_results.json @@ -0,0 +1,1963 @@ +{ + "warmup": { + "cutoff": "2022-07-01", + "n_authors": 50, + "n_suspended": 25, + "n_active": 25, + "v1_auc": 0.5176000000000001, + "v1_metrics": { + "auc_roc": 0.5176000000000001, + "auc_pr": 0.512, + "precision_at_25": 0.12, + "precision_at_50": 0.5 + }, + "n_parsed": 50, + "parse_rate": 1.0, + "susp_mean_score": 0.012000000000000002, + "active_mean_score": 0.012000000000000002, + "pipeline_status": "ok" + }, + "config": { + "model": "gemini/gemini-3.1-pro-preview", + "cutoffs": [ + "2022-07-01", + "2023-01-01", + "2024-01-01" + ] + }, + "2022-07-01": { + "cutoff": "2022-07-01", + "n_authors": 2235, + "n_suspended": 58, + "n_active": 2177, + "scoring_progress": { + "completed": "v3", + "n_scored": 2235 + }, + "standalone": { + "v1": { + "auc_roc": 0.5443799964894924, + "auc_pr": 0.03127923419993473, + "precision_at_25": 0.04, + "precision_at_50": 0.06, + "n_scored": 2219, + "n_dropped": 16 + }, + "v2": { + "auc_roc": 0.5694684238037159, + "auc_pr": 0.03781316668264091, + "precision_at_25": 0.04, + "precision_at_50": 0.04, + "n_scored": 2235, + "n_dropped": 0 + }, + "v3": { + "auc_roc": 0.5209240809085582, + "auc_pr": 0.037303514485561504, + "precision_at_25": 0.04, + "precision_at_50": 0.08, + "n_scored": 2235, + "n_dropped": 0 + } + }, + "combined": { + "v1": { + "n_dropped": 16, + "F10": { + "baseline_auc": 0.45031036078443887, + "jaccard_auc": 0.45508145973288233, + "llm_auc": 0.47817102554692115, + "llm_plus_jaccard_auc": 0.4823756562255661, + "delong_llm_vs_baseline": { + "z": 1.1394824797824876, + "p": 0.25450197171526223 + }, + "delong_llm_vs_jaccard": { + "z": 0.8309265408073401, 
+ "p": 0.4060151291623282 + }, + "delong_both_vs_jaccard": { + "z": 1.0388918843650274, + "p": 0.2988550212082153 + }, + "delong_both_vs_baseline": { + "z": 1.1642326923061803, + "p": 0.24432972487594806 + } + }, + "F16": { + "baseline_auc": 0.4741897908056615, + "jaccard_auc": 0.46820597105426925, + "llm_auc": 0.48703505720531687, + "llm_plus_jaccard_auc": 0.48294212449536456, + "delong_llm_vs_baseline": { + "z": 0.901954045129347, + "p": 0.3670812800376566 + }, + "delong_llm_vs_jaccard": { + "z": 1.246742495801323, + "p": 0.2124919315274646 + }, + "delong_both_vs_jaccard": { + "z": 0.9253704099901143, + "p": 0.35477326511045926 + }, + "delong_both_vs_baseline": { + "z": 0.5051029572636321, + "p": 0.6134865244083731 + } + } + }, + "v2": { + "n_dropped": 0, + "F10": { + "baseline_auc": 0.4600011087703737, + "jaccard_auc": 0.4611811572394785, + "llm_auc": 0.49144266865189357, + "llm_plus_jaccard_auc": 0.49581439183945, + "delong_llm_vs_baseline": { + "z": 1.38284628873725, + "p": 0.16671200233191874 + }, + "delong_llm_vs_jaccard": { + "z": 1.0266540603351466, + "p": 0.30458338845997757 + }, + "delong_both_vs_jaccard": { + "z": 1.4283874042883566, + "p": 0.1531803825739019 + }, + "delong_both_vs_baseline": { + "z": 1.4204649180864097, + "p": 0.15547237493275493 + } + }, + "F16": { + "baseline_auc": 0.4670497204314701, + "jaccard_auc": 0.45536011277778654, + "llm_auc": 0.4976755421095148, + "llm_plus_jaccard_auc": 0.4901596629338064, + "delong_llm_vs_baseline": { + "z": 1.5985973169958796, + "p": 0.10991010627975638 + }, + "delong_llm_vs_jaccard": { + "z": 1.7973588195005656, + "p": 0.0722786738869812 + }, + "delong_both_vs_jaccard": { + "z": 1.7149046440198388, + "p": 0.08636271108254007 + }, + "delong_both_vs_baseline": { + "z": 1.081835195618125, + "p": 0.2793257645252759 + } + } + }, + "v3": { + "n_dropped": 0, + "F10": { + "baseline_auc": 0.4600011087703737, + "jaccard_auc": 0.4611811572394785, + "llm_auc": 0.4869204694850553, + "llm_plus_jaccard_auc": 
0.49307414505884406, + "delong_llm_vs_baseline": { + "z": 1.2307677880682386, + "p": 0.21840972635809108 + }, + "delong_llm_vs_jaccard": { + "z": 0.8973264967785047, + "p": 0.3695447216033525 + }, + "delong_both_vs_jaccard": { + "z": 1.3821655754198328, + "p": 0.16692086743074286 + }, + "delong_both_vs_baseline": { + "z": 1.3552594120446397, + "p": 0.1753349216345108 + } + }, + "F16": { + "baseline_auc": 0.4670497204314701, + "jaccard_auc": 0.45536011277778654, + "llm_auc": 0.4991961414790997, + "llm_plus_jaccard_auc": 0.49484817765669303, + "delong_llm_vs_baseline": { + "z": 1.5820548104079528, + "p": 0.1136370585476952 + }, + "delong_llm_vs_jaccard": { + "z": 1.8297924417131242, + "p": 0.06728098144882265 + }, + "delong_both_vs_jaccard": { + "z": 1.8838368805928178, + "p": 0.05958704320883685 + }, + "delong_both_vs_baseline": { + "z": 1.2466674007999454, + "p": 0.21251947666517612 + } + } + } + }, + "holm_bonferroni_correction": { + "v3/F16/delong_both_vs_jaccard": { + "raw_p": 0.05958704320883685, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v3/F16/delong_llm_vs_jaccard": { + "raw_p": 0.06728098144882265, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v2/F16/delong_llm_vs_jaccard": { + "raw_p": 0.0722786738869812, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v2/F16/delong_both_vs_jaccard": { + "raw_p": 0.08636271108254007, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v2/F16/delong_llm_vs_baseline": { + "raw_p": 0.10991010627975638, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v3/F16/delong_llm_vs_baseline": { + "raw_p": 0.1136370585476952, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v2/F10/delong_both_vs_jaccard": { + "raw_p": 0.1531803825739019, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v2/F10/delong_both_vs_baseline": { + "raw_p": 0.15547237493275493, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v2/F10/delong_llm_vs_baseline": { + "raw_p": 0.16671200233191874, + "adjusted_p": 1.0, + "reject_h0": false + }, + 
"v3/F10/delong_both_vs_jaccard": { + "raw_p": 0.16692086743074286, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v3/F10/delong_both_vs_baseline": { + "raw_p": 0.1753349216345108, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v1/F16/delong_llm_vs_jaccard": { + "raw_p": 0.2124919315274646, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v3/F16/delong_both_vs_baseline": { + "raw_p": 0.21251947666517612, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v3/F10/delong_llm_vs_baseline": { + "raw_p": 0.21840972635809108, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v1/F10/delong_both_vs_baseline": { + "raw_p": 0.24432972487594806, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v1/F10/delong_llm_vs_baseline": { + "raw_p": 0.25450197171526223, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v2/F16/delong_both_vs_baseline": { + "raw_p": 0.2793257645252759, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v1/F10/delong_both_vs_jaccard": { + "raw_p": 0.2988550212082153, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v2/F10/delong_llm_vs_jaccard": { + "raw_p": 0.30458338845997757, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v1/F16/delong_both_vs_jaccard": { + "raw_p": 0.35477326511045926, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v1/F16/delong_llm_vs_baseline": { + "raw_p": 0.3670812800376566, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v3/F10/delong_llm_vs_jaccard": { + "raw_p": 0.3695447216033525, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v1/F10/delong_llm_vs_jaccard": { + "raw_p": 0.4060151291623282, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v1/F16/delong_both_vs_baseline": { + "raw_p": 0.6134865244083731, + "adjusted_p": 1.0, + "reject_h0": false + } + }, + "second_phase": { + "v1": { + "n_dropped": 16, + "top_100": { + "n_suspended_in_top": 0, + "first_phase_p_at_25": 0.0, + "llm_rerank_p_at_25": 0.0, + "first_phase_p_at_50": 0.0, + "llm_rerank_p_at_50": 0.0, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.0, + 
"p_at_50": 0.0 + }, + "alpha_0.2": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.4": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.5": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.6": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.8": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_1.0": { + "p_at_25": 0.0, + "p_at_50": 0.0 + } + } + }, + "top_200": { + "n_suspended_in_top": 2, + "first_phase_p_at_25": 0.0, + "llm_rerank_p_at_25": 0.04, + "first_phase_p_at_50": 0.0, + "llm_rerank_p_at_50": 0.02, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.2": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.4": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.5": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.6": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.8": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_1.0": { + "p_at_25": 0.0, + "p_at_50": 0.0 + } + } + }, + "top_500": { + "n_suspended_in_top": 9, + "first_phase_p_at_25": 0.0, + "llm_rerank_p_at_25": 0.12, + "first_phase_p_at_50": 0.0, + "llm_rerank_p_at_50": 0.06, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.12, + "p_at_50": 0.06 + }, + "alpha_0.2": { + "p_at_25": 0.0, + "p_at_50": 0.06 + }, + "alpha_0.4": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.5": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.6": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.8": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_1.0": { + "p_at_25": 0.0, + "p_at_50": 0.0 + } + } + } + }, + "v2": { + "n_dropped": 0, + "top_100": { + "n_suspended_in_top": 0, + "first_phase_p_at_25": 0.0, + "llm_rerank_p_at_25": 0.0, + "first_phase_p_at_50": 0.0, + "llm_rerank_p_at_50": 0.0, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.2": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.4": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.5": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + 
"alpha_0.6": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.8": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_1.0": { + "p_at_25": 0.0, + "p_at_50": 0.0 + } + } + }, + "top_200": { + "n_suspended_in_top": 2, + "first_phase_p_at_25": 0.0, + "llm_rerank_p_at_25": 0.0, + "first_phase_p_at_50": 0.0, + "llm_rerank_p_at_50": 0.02, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.2": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.4": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.5": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.6": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.8": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_1.0": { + "p_at_25": 0.0, + "p_at_50": 0.0 + } + } + }, + "top_500": { + "n_suspended_in_top": 9, + "first_phase_p_at_25": 0.0, + "llm_rerank_p_at_25": 0.0, + "first_phase_p_at_50": 0.0, + "llm_rerank_p_at_50": 0.02, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.2": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.4": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.5": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.6": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.8": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_1.0": { + "p_at_25": 0.0, + "p_at_50": 0.0 + } + } + } + }, + "v3": { + "n_dropped": 0, + "top_100": { + "n_suspended_in_top": 0, + "first_phase_p_at_25": 0.0, + "llm_rerank_p_at_25": 0.0, + "first_phase_p_at_50": 0.0, + "llm_rerank_p_at_50": 0.0, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.2": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.4": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.5": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.6": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.8": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_1.0": { + "p_at_25": 0.0, + "p_at_50": 0.0 + } + } + }, + "top_200": { + 
"n_suspended_in_top": 2, + "first_phase_p_at_25": 0.0, + "llm_rerank_p_at_25": 0.04, + "first_phase_p_at_50": 0.0, + "llm_rerank_p_at_50": 0.02, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.2": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.4": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.5": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.6": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.8": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_1.0": { + "p_at_25": 0.0, + "p_at_50": 0.0 + } + } + }, + "top_500": { + "n_suspended_in_top": 9, + "first_phase_p_at_25": 0.0, + "llm_rerank_p_at_25": 0.0, + "first_phase_p_at_50": 0.0, + "llm_rerank_p_at_50": 0.02, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.2": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.4": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.5": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.6": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.8": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_1.0": { + "p_at_25": 0.0, + "p_at_50": 0.0 + } + } + } + } + } + }, + "2023-01-01": { + "cutoff": "2023-01-01", + "n_authors": 3619, + "n_suspended": 92, + "n_active": 3527, + "scoring_progress": { + "completed": "v3", + "n_scored": 3619 + }, + "standalone": { + "v1": { + "auc_roc": 0.5167762670877956, + "auc_pr": 0.028591935683013003, + "precision_at_25": 0.04, + "precision_at_50": 0.02, + "n_scored": 3616, + "n_dropped": 3 + }, + "v2": { + "auc_roc": 0.5167280975333144, + "auc_pr": 0.028506721054595913, + "precision_at_25": 0.04, + "precision_at_50": 0.02, + "n_scored": 3619, + "n_dropped": 0 + }, + "v3": { + "auc_roc": 0.5044886650805587, + "auc_pr": 0.031750228180886894, + "precision_at_25": 0.08, + "precision_at_50": 0.04, + "n_scored": 3619, + "n_dropped": 0 + } + }, + "combined": { + "v1": { + "n_dropped": 3, + "F10": { + "baseline_auc": 0.5252075827863594, + 
"jaccard_auc": 0.530031646350491, + "llm_auc": 0.5085423555248483, + "llm_plus_jaccard_auc": 0.512144980999852, + "delong_llm_vs_baseline": { + "z": -1.7683735504752194, + "p": 0.0769984753056826 + }, + "delong_llm_vs_jaccard": { + "z": -1.0618667466929754, + "p": 0.28829618313176675 + }, + "delong_both_vs_jaccard": { + "z": -2.1277321023238627, + "p": 0.03335930166941646 + }, + "delong_both_vs_baseline": { + "z": -0.6927557329685186, + "p": 0.4884628567196615 + } + }, + "F16": { + "baseline_auc": 0.5594417781177515, + "jaccard_auc": 0.5506017741696688, + "llm_auc": 0.5385724596555298, + "llm_plus_jaccard_auc": 0.5300624907466811, + "delong_llm_vs_baseline": { + "z": -2.4149087796991187, + "p": 0.015739150883507094 + }, + "delong_llm_vs_jaccard": { + "z": -0.694880622503546, + "p": 0.4871301173033915 + }, + "delong_both_vs_jaccard": { + "z": -2.4515514578651514, + "p": 0.014224184692993948 + }, + "delong_both_vs_baseline": { + "z": -1.9065436050674371, + "p": 0.05657971391053674 + } + } + }, + "v2": { + "n_dropped": 0, + "F10": { + "baseline_auc": 0.5276854945082038, + "jaccard_auc": 0.5320832460152118, + "llm_auc": 0.5159668889683313, + "llm_plus_jaccard_auc": 0.5222414664513504, + "delong_llm_vs_baseline": { + "z": -1.4373217000765899, + "p": 0.15062660568462108 + }, + "delong_llm_vs_jaccard": { + "z": -0.9103960064623613, + "p": 0.3626137030772695 + }, + "delong_both_vs_jaccard": { + "z": -1.321441911164683, + "p": 0.1863540592401648 + }, + "delong_both_vs_baseline": { + "z": -0.2929419939500567, + "p": 0.769566489597421 + } + }, + "F16": { + "baseline_auc": 0.549893060982976, + "jaccard_auc": 0.5439235832891607, + "llm_auc": 0.5351573575276438, + "llm_plus_jaccard_auc": 0.5313389874385177, + "delong_llm_vs_baseline": { + "z": -1.8941654038401041, + "p": 0.05820305862140235 + }, + "delong_llm_vs_jaccard": { + "z": -0.5848394458350912, + "p": 0.5586556782407692 + }, + "delong_both_vs_jaccard": { + "z": -1.6740966527813927, + "p": 0.09411160551223803 + }, + 
"delong_both_vs_baseline": { + "z": -1.1893916680459307, + "p": 0.2342855779943478 + } + } + }, + "v3": { + "n_dropped": 0, + "F10": { + "baseline_auc": 0.5276854945082038, + "jaccard_auc": 0.5320832460152118, + "llm_auc": 0.506021868566709, + "llm_plus_jaccard_auc": 0.5134428816212818, + "delong_llm_vs_baseline": { + "z": -2.7096041483765627, + "p": 0.0067363552308356105 + }, + "delong_llm_vs_jaccard": { + "z": -1.3764121005261454, + "p": 0.168694083780613 + }, + "delong_both_vs_jaccard": { + "z": -2.5548051823628843, + "p": 0.010624724550622532 + }, + "delong_both_vs_baseline": { + "z": -0.7909092114702223, + "p": 0.4289969726810161 + } + }, + "F16": { + "baseline_auc": 0.549893060982976, + "jaccard_auc": 0.5439235832891607, + "llm_auc": 0.5341557673105608, + "llm_plus_jaccard_auc": 0.5316471690437741, + "delong_llm_vs_baseline": { + "z": -2.1111242142384508, + "p": 0.03476163664569991 + }, + "delong_llm_vs_jaccard": { + "z": -0.5971377434489296, + "p": 0.5504154204035077 + }, + "delong_both_vs_jaccard": { + "z": -1.7376626441163228, + "p": 0.08227027463136692 + }, + "delong_both_vs_baseline": { + "z": -1.2754148631459352, + "p": 0.20216244063533384 + } + } + } + }, + "holm_bonferroni_correction": { + "v3/F10/delong_llm_vs_baseline": { + "raw_p": 0.0067363552308356105, + "adjusted_p": 0.16167252554005465, + "reject_h0": false + }, + "v3/F10/delong_both_vs_jaccard": { + "raw_p": 0.010624724550622532, + "adjusted_p": 0.24436866466431825, + "reject_h0": false + }, + "v1/F16/delong_both_vs_jaccard": { + "raw_p": 0.014224184692993948, + "adjusted_p": 0.31293206324586686, + "reject_h0": false + }, + "v1/F16/delong_llm_vs_baseline": { + "raw_p": 0.015739150883507094, + "adjusted_p": 0.33052216855364897, + "reject_h0": false + }, + "v1/F10/delong_both_vs_jaccard": { + "raw_p": 0.03335930166941646, + "adjusted_p": 0.6671860333883293, + "reject_h0": false + }, + "v3/F16/delong_llm_vs_baseline": { + "raw_p": 0.03476163664569991, + "adjusted_p": 0.6671860333883293, + 
"reject_h0": false + }, + "v1/F16/delong_both_vs_baseline": { + "raw_p": 0.05657971391053674, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v2/F16/delong_llm_vs_baseline": { + "raw_p": 0.05820305862140235, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v1/F10/delong_llm_vs_baseline": { + "raw_p": 0.0769984753056826, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v3/F16/delong_both_vs_jaccard": { + "raw_p": 0.08227027463136692, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v2/F16/delong_both_vs_jaccard": { + "raw_p": 0.09411160551223803, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v2/F10/delong_llm_vs_baseline": { + "raw_p": 0.15062660568462108, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v3/F10/delong_llm_vs_jaccard": { + "raw_p": 0.168694083780613, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v2/F10/delong_both_vs_jaccard": { + "raw_p": 0.1863540592401648, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v3/F16/delong_both_vs_baseline": { + "raw_p": 0.20216244063533384, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v2/F16/delong_both_vs_baseline": { + "raw_p": 0.2342855779943478, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v1/F10/delong_llm_vs_jaccard": { + "raw_p": 0.28829618313176675, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v2/F10/delong_llm_vs_jaccard": { + "raw_p": 0.3626137030772695, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v3/F10/delong_both_vs_baseline": { + "raw_p": 0.4289969726810161, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v1/F16/delong_llm_vs_jaccard": { + "raw_p": 0.4871301173033915, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v1/F10/delong_both_vs_baseline": { + "raw_p": 0.4884628567196615, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v3/F16/delong_llm_vs_jaccard": { + "raw_p": 0.5504154204035077, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v2/F16/delong_llm_vs_jaccard": { + "raw_p": 0.5586556782407692, + "adjusted_p": 1.0, + "reject_h0": false + }, + 
"v2/F10/delong_both_vs_baseline": { + "raw_p": 0.769566489597421, + "adjusted_p": 1.0, + "reject_h0": false + } + }, + "second_phase": { + "v1": { + "n_dropped": 3, + "top_100": { + "n_suspended_in_top": 2, + "first_phase_p_at_25": 0.04, + "llm_rerank_p_at_25": 0.04, + "first_phase_p_at_50": 0.02, + "llm_rerank_p_at_50": 0.02, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.2": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.4": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.5": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.6": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.8": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_1.0": { + "p_at_25": 0.04, + "p_at_50": 0.02 + } + } + }, + "top_200": { + "n_suspended_in_top": 5, + "first_phase_p_at_25": 0.04, + "llm_rerank_p_at_25": 0.04, + "first_phase_p_at_50": 0.02, + "llm_rerank_p_at_50": 0.02, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.2": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.4": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.5": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.6": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.8": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_1.0": { + "p_at_25": 0.04, + "p_at_50": 0.02 + } + } + }, + "top_500": { + "n_suspended_in_top": 15, + "first_phase_p_at_25": 0.04, + "llm_rerank_p_at_25": 0.08, + "first_phase_p_at_50": 0.02, + "llm_rerank_p_at_50": 0.04, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.08, + "p_at_50": 0.04 + }, + "alpha_0.2": { + "p_at_25": 0.08, + "p_at_50": 0.04 + }, + "alpha_0.4": { + "p_at_25": 0.08, + "p_at_50": 0.04 + }, + "alpha_0.5": { + "p_at_25": 0.08, + "p_at_50": 0.04 + }, + "alpha_0.6": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.8": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_1.0": { + "p_at_25": 0.04, + "p_at_50": 0.02 + } + } + } + }, + "v2": { + 
"n_dropped": 0, + "top_100": { + "n_suspended_in_top": 2, + "first_phase_p_at_25": 0.04, + "llm_rerank_p_at_25": 0.04, + "first_phase_p_at_50": 0.02, + "llm_rerank_p_at_50": 0.02, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.2": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.4": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.5": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.6": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.8": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_1.0": { + "p_at_25": 0.04, + "p_at_50": 0.02 + } + } + }, + "top_200": { + "n_suspended_in_top": 5, + "first_phase_p_at_25": 0.04, + "llm_rerank_p_at_25": 0.04, + "first_phase_p_at_50": 0.02, + "llm_rerank_p_at_50": 0.02, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.2": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.4": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.5": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.6": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.8": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_1.0": { + "p_at_25": 0.04, + "p_at_50": 0.02 + } + } + }, + "top_500": { + "n_suspended_in_top": 15, + "first_phase_p_at_25": 0.04, + "llm_rerank_p_at_25": 0.04, + "first_phase_p_at_50": 0.02, + "llm_rerank_p_at_50": 0.02, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.2": { + "p_at_25": 0.0, + "p_at_50": 0.04 + }, + "alpha_0.4": { + "p_at_25": 0.0, + "p_at_50": 0.04 + }, + "alpha_0.5": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.6": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.8": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_1.0": { + "p_at_25": 0.04, + "p_at_50": 0.02 + } + } + } + }, + "v3": { + "n_dropped": 0, + "top_100": { + "n_suspended_in_top": 2, + "first_phase_p_at_25": 0.04, + "llm_rerank_p_at_25": 0.04, + "first_phase_p_at_50": 0.02, + 
"llm_rerank_p_at_50": 0.02, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.2": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.4": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.5": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.6": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.8": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_1.0": { + "p_at_25": 0.04, + "p_at_50": 0.02 + } + } + }, + "top_200": { + "n_suspended_in_top": 5, + "first_phase_p_at_25": 0.04, + "llm_rerank_p_at_25": 0.0, + "first_phase_p_at_50": 0.02, + "llm_rerank_p_at_50": 0.02, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.2": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.4": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.5": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.6": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.8": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_1.0": { + "p_at_25": 0.04, + "p_at_50": 0.02 + } + } + }, + "top_500": { + "n_suspended_in_top": 15, + "first_phase_p_at_25": 0.04, + "llm_rerank_p_at_25": 0.0, + "first_phase_p_at_50": 0.02, + "llm_rerank_p_at_50": 0.02, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.2": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.4": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.5": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.6": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_0.8": { + "p_at_25": 0.04, + "p_at_50": 0.02 + }, + "alpha_1.0": { + "p_at_25": 0.04, + "p_at_50": 0.02 + } + } + } + } + } + }, + "2024-01-01": { + "cutoff": "2024-01-01", + "n_authors": 7642, + "n_suspended": 204, + "n_active": 7438, + "scoring_progress": { + "completed": "v3", + "n_scored": 7642 + }, + "standalone": { + "v1": { + "auc_roc": 0.537193528567963, + "auc_pr": 0.03044811829378337, + "precision_at_25": 0.0, + "precision_at_50": 
0.04, + "n_scored": 7640, + "n_dropped": 2 + }, + "v2": { + "auc_roc": 0.5468876041946761, + "auc_pr": 0.033503737338391466, + "precision_at_25": 0.04, + "precision_at_50": 0.08, + "n_scored": 7642, + "n_dropped": 0 + }, + "v3": { + "auc_roc": 0.5407904691857921, + "auc_pr": 0.03262793318356549, + "precision_at_25": 0.08, + "precision_at_50": 0.06, + "n_scored": 7642, + "n_dropped": 0 + } + }, + "combined": { + "v1": { + "n_dropped": 2, + "F10": { + "baseline_auc": 0.54062641732325, + "jaccard_auc": 0.5538925629423366, + "llm_auc": 0.5642077097111035, + "llm_plus_jaccard_auc": 0.5736952056239386, + "delong_llm_vs_baseline": { + "z": 2.1510711194821526, + "p": 0.03147058664100578 + }, + "delong_llm_vs_jaccard": { + "z": 0.7905747642470826, + "p": 0.42919217787627695 + }, + "delong_both_vs_jaccard": { + "z": 2.0077576063179965, + "p": 0.04466905579912315 + }, + "delong_both_vs_baseline": { + "z": 2.6029801275066604, + "p": 0.009241731042156774 + } + }, + "F16": { + "baseline_auc": 0.5415898675231254, + "jaccard_auc": 0.5515328845362782, + "llm_auc": 0.5607276867175057, + "llm_plus_jaccard_auc": 0.567617196152264, + "delong_llm_vs_baseline": { + "z": 1.8687393422910026, + "p": 0.06165908760334996 + }, + "delong_llm_vs_jaccard": { + "z": 0.7674956649603318, + "p": 0.4427868696430599 + }, + "delong_both_vs_jaccard": { + "z": 1.734152815618921, + "p": 0.08289097242055839 + }, + "delong_both_vs_baseline": { + "z": 2.2202184635498927, + "p": 0.026403941570933915 + } + } + }, + "v2": { + "n_dropped": 0, + "F10": { + "baseline_auc": 0.535728690508201, + "jaccard_auc": 0.5510191438769647, + "llm_auc": 0.5629820898512672, + "llm_plus_jaccard_auc": 0.5770592453168415, + "delong_llm_vs_baseline": { + "z": 2.3698408905831787, + "p": 0.017795741640889335 + }, + "delong_llm_vs_jaccard": { + "z": 0.8656942335795392, + "p": 0.38665786310338746 + }, + "delong_both_vs_jaccard": { + "z": 2.482760227404762, + "p": 0.013036880656823489 + }, + "delong_both_vs_baseline": { + "z": 
3.439508590090745, + "p": 0.0005827712900346648 + } + }, + "F16": { + "baseline_auc": 0.5208857272406139, + "jaccard_auc": 0.5336013001597519, + "llm_auc": 0.5448646721393586, + "llm_plus_jaccard_auc": 0.557921629259394, + "delong_llm_vs_baseline": { + "z": 2.2779737517952383, + "p": 0.022728141516401772 + }, + "delong_llm_vs_jaccard": { + "z": 0.882042644486708, + "p": 0.3777537466626316 + }, + "delong_both_vs_jaccard": { + "z": 2.4534691164957914, + "p": 0.01414856881261478 + }, + "delong_both_vs_baseline": { + "z": 3.3482690738342513, + "p": 0.0008131801813857473 + } + } + }, + "v3": { + "n_dropped": 0, + "F10": { + "baseline_auc": 0.535728690508201, + "jaccard_auc": 0.5510191438769647, + "llm_auc": 0.5579730346023863, + "llm_plus_jaccard_auc": 0.5728641079986714, + "delong_llm_vs_baseline": { + "z": 2.0074370216093556, + "p": 0.04470315015029512 + }, + "delong_llm_vs_jaccard": { + "z": 0.5030204223149134, + "p": 0.6149499138727417 + }, + "delong_both_vs_jaccard": { + "z": 2.1918123426721428, + "p": 0.02839305952399251 + }, + "delong_both_vs_baseline": { + "z": 3.154616350978313, + "p": 0.0016070928301650015 + } + }, + "F16": { + "baseline_auc": 0.5208857272406139, + "jaccard_auc": 0.5336013001597519, + "llm_auc": 0.5419144667816037, + "llm_plus_jaccard_auc": 0.5540250383562944, + "delong_llm_vs_baseline": { + "z": 2.01024901046837, + "p": 0.04440484057012973 + }, + "delong_llm_vs_jaccard": { + "z": 0.6370065866661048, + "p": 0.524120553243463 + }, + "delong_both_vs_jaccard": { + "z": 2.1115851220241626, + "p": 0.034722049779460175 + }, + "delong_both_vs_baseline": { + "z": 3.0074297529102862, + "p": 0.0026346700895803973 + } + } + } + }, + "holm_bonferroni_correction": { + "v2/F10/delong_both_vs_baseline": { + "raw_p": 0.0005827712900346648, + "adjusted_p": 0.013986510960831956, + "reject_h0": true + }, + "v2/F16/delong_both_vs_baseline": { + "raw_p": 0.0008131801813857473, + "adjusted_p": 0.018703144171872188, + "reject_h0": true + }, + 
"v3/F10/delong_both_vs_baseline": { + "raw_p": 0.0016070928301650015, + "adjusted_p": 0.03535604226363003, + "reject_h0": true + }, + "v3/F16/delong_both_vs_baseline": { + "raw_p": 0.0026346700895803973, + "adjusted_p": 0.055328071881188344, + "reject_h0": false + }, + "v1/F10/delong_both_vs_baseline": { + "raw_p": 0.009241731042156774, + "adjusted_p": 0.18483462084313548, + "reject_h0": false + }, + "v2/F10/delong_both_vs_jaccard": { + "raw_p": 0.013036880656823489, + "adjusted_p": 0.24770073247964627, + "reject_h0": false + }, + "v2/F16/delong_both_vs_jaccard": { + "raw_p": 0.01414856881261478, + "adjusted_p": 0.254674238627066, + "reject_h0": false + }, + "v2/F10/delong_llm_vs_baseline": { + "raw_p": 0.017795741640889335, + "adjusted_p": 0.3025276078951187, + "reject_h0": false + }, + "v2/F16/delong_llm_vs_baseline": { + "raw_p": 0.022728141516401772, + "adjusted_p": 0.36365026426242836, + "reject_h0": false + }, + "v1/F16/delong_both_vs_baseline": { + "raw_p": 0.026403941570933915, + "adjusted_p": 0.39605912356400874, + "reject_h0": false + }, + "v3/F10/delong_both_vs_jaccard": { + "raw_p": 0.02839305952399251, + "adjusted_p": 0.39750283333589514, + "reject_h0": false + }, + "v1/F10/delong_llm_vs_baseline": { + "raw_p": 0.03147058664100578, + "adjusted_p": 0.40911762633307514, + "reject_h0": false + }, + "v3/F16/delong_both_vs_jaccard": { + "raw_p": 0.034722049779460175, + "adjusted_p": 0.4166645973535221, + "reject_h0": false + }, + "v3/F16/delong_llm_vs_baseline": { + "raw_p": 0.04440484057012973, + "adjusted_p": 0.488453246271427, + "reject_h0": false + }, + "v1/F10/delong_both_vs_jaccard": { + "raw_p": 0.04466905579912315, + "adjusted_p": 0.488453246271427, + "reject_h0": false + }, + "v3/F10/delong_llm_vs_baseline": { + "raw_p": 0.04470315015029512, + "adjusted_p": 0.488453246271427, + "reject_h0": false + }, + "v1/F16/delong_llm_vs_baseline": { + "raw_p": 0.06165908760334996, + "adjusted_p": 0.4932727008267997, + "reject_h0": false + }, + 
"v1/F16/delong_both_vs_jaccard": { + "raw_p": 0.08289097242055839, + "adjusted_p": 0.5802368069439087, + "reject_h0": false + }, + "v2/F16/delong_llm_vs_jaccard": { + "raw_p": 0.3777537466626316, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v2/F10/delong_llm_vs_jaccard": { + "raw_p": 0.38665786310338746, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v1/F10/delong_llm_vs_jaccard": { + "raw_p": 0.42919217787627695, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v1/F16/delong_llm_vs_jaccard": { + "raw_p": 0.4427868696430599, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v3/F16/delong_llm_vs_jaccard": { + "raw_p": 0.524120553243463, + "adjusted_p": 1.0, + "reject_h0": false + }, + "v3/F10/delong_llm_vs_jaccard": { + "raw_p": 0.6149499138727417, + "adjusted_p": 1.0, + "reject_h0": false + } + }, + "second_phase": { + "v1": { + "n_dropped": 2, + "top_100": { + "n_suspended_in_top": 3, + "first_phase_p_at_25": 0.04, + "llm_rerank_p_at_25": 0.0, + "first_phase_p_at_50": 0.06, + "llm_rerank_p_at_50": 0.06, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.0, + "p_at_50": 0.06 + }, + "alpha_0.2": { + "p_at_25": 0.0, + "p_at_50": 0.06 + }, + "alpha_0.4": { + "p_at_25": 0.0, + "p_at_50": 0.06 + }, + "alpha_0.5": { + "p_at_25": 0.0, + "p_at_50": 0.06 + }, + "alpha_0.6": { + "p_at_25": 0.0, + "p_at_50": 0.06 + }, + "alpha_0.8": { + "p_at_25": 0.04, + "p_at_50": 0.06 + }, + "alpha_1.0": { + "p_at_25": 0.04, + "p_at_50": 0.06 + } + } + }, + "top_200": { + "n_suspended_in_top": 5, + "first_phase_p_at_25": 0.04, + "llm_rerank_p_at_25": 0.0, + "first_phase_p_at_50": 0.06, + "llm_rerank_p_at_50": 0.02, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.2": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.4": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.5": { + "p_at_25": 0.0, + "p_at_50": 0.04 + }, + "alpha_0.6": { + "p_at_25": 0.0, + "p_at_50": 0.06 + }, + "alpha_0.8": { + "p_at_25": 0.04, + "p_at_50": 0.06 + }, + 
"alpha_1.0": { + "p_at_25": 0.04, + "p_at_50": 0.06 + } + } + }, + "top_500": { + "n_suspended_in_top": 16, + "first_phase_p_at_25": 0.04, + "llm_rerank_p_at_25": 0.0, + "first_phase_p_at_50": 0.06, + "llm_rerank_p_at_50": 0.02, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.2": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.4": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.5": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.6": { + "p_at_25": 0.0, + "p_at_50": 0.04 + }, + "alpha_0.8": { + "p_at_25": 0.04, + "p_at_50": 0.06 + }, + "alpha_1.0": { + "p_at_25": 0.04, + "p_at_50": 0.06 + } + } + } + }, + "v2": { + "n_dropped": 0, + "top_100": { + "n_suspended_in_top": 3, + "first_phase_p_at_25": 0.04, + "llm_rerank_p_at_25": 0.0, + "first_phase_p_at_50": 0.06, + "llm_rerank_p_at_50": 0.04, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.0, + "p_at_50": 0.04 + }, + "alpha_0.2": { + "p_at_25": 0.0, + "p_at_50": 0.04 + }, + "alpha_0.4": { + "p_at_25": 0.0, + "p_at_50": 0.04 + }, + "alpha_0.5": { + "p_at_25": 0.0, + "p_at_50": 0.06 + }, + "alpha_0.6": { + "p_at_25": 0.0, + "p_at_50": 0.06 + }, + "alpha_0.8": { + "p_at_25": 0.04, + "p_at_50": 0.06 + }, + "alpha_1.0": { + "p_at_25": 0.04, + "p_at_50": 0.06 + } + } + }, + "top_200": { + "n_suspended_in_top": 5, + "first_phase_p_at_25": 0.04, + "llm_rerank_p_at_25": 0.0, + "first_phase_p_at_50": 0.06, + "llm_rerank_p_at_50": 0.0, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.2": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.4": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.5": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.6": { + "p_at_25": 0.0, + "p_at_50": 0.04 + }, + "alpha_0.8": { + "p_at_25": 0.04, + "p_at_50": 0.06 + }, + "alpha_1.0": { + "p_at_25": 0.04, + "p_at_50": 0.06 + } + } + }, + "top_500": { + "n_suspended_in_top": 16, + "first_phase_p_at_25": 0.04, + "llm_rerank_p_at_25": 0.0, + 
"first_phase_p_at_50": 0.06, + "llm_rerank_p_at_50": 0.0, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.2": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.4": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.5": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.6": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.8": { + "p_at_25": 0.04, + "p_at_50": 0.06 + }, + "alpha_1.0": { + "p_at_25": 0.04, + "p_at_50": 0.06 + } + } + } + }, + "v3": { + "n_dropped": 0, + "top_100": { + "n_suspended_in_top": 3, + "first_phase_p_at_25": 0.04, + "llm_rerank_p_at_25": 0.0, + "first_phase_p_at_50": 0.06, + "llm_rerank_p_at_50": 0.06, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.0, + "p_at_50": 0.06 + }, + "alpha_0.2": { + "p_at_25": 0.0, + "p_at_50": 0.06 + }, + "alpha_0.4": { + "p_at_25": 0.0, + "p_at_50": 0.06 + }, + "alpha_0.5": { + "p_at_25": 0.04, + "p_at_50": 0.06 + }, + "alpha_0.6": { + "p_at_25": 0.04, + "p_at_50": 0.06 + }, + "alpha_0.8": { + "p_at_25": 0.04, + "p_at_50": 0.06 + }, + "alpha_1.0": { + "p_at_25": 0.04, + "p_at_50": 0.06 + } + } + }, + "top_200": { + "n_suspended_in_top": 5, + "first_phase_p_at_25": 0.04, + "llm_rerank_p_at_25": 0.0, + "first_phase_p_at_50": 0.06, + "llm_rerank_p_at_50": 0.0, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.2": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.4": { + "p_at_25": 0.0, + "p_at_50": 0.04 + }, + "alpha_0.5": { + "p_at_25": 0.0, + "p_at_50": 0.04 + }, + "alpha_0.6": { + "p_at_25": 0.0, + "p_at_50": 0.04 + }, + "alpha_0.8": { + "p_at_25": 0.04, + "p_at_50": 0.06 + }, + "alpha_1.0": { + "p_at_25": 0.04, + "p_at_50": 0.06 + } + } + }, + "top_500": { + "n_suspended_in_top": 16, + "first_phase_p_at_25": 0.04, + "llm_rerank_p_at_25": 0.0, + "first_phase_p_at_50": 0.06, + "llm_rerank_p_at_50": 0.02, + "alpha_sweep": { + "alpha_0.0": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.2": { + "p_at_25": 0.0, + 
"p_at_50": 0.02 + }, + "alpha_0.4": { + "p_at_25": 0.0, + "p_at_50": 0.0 + }, + "alpha_0.5": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.6": { + "p_at_25": 0.0, + "p_at_50": 0.02 + }, + "alpha_0.8": { + "p_at_25": 0.04, + "p_at_50": 0.06 + }, + "alpha_1.0": { + "p_at_25": 0.04, + "p_at_50": 0.06 + } + } + } + } + } + } +} \ No newline at end of file diff --git a/scripts/proximity_llm_client.py b/scripts/proximity_llm_client.py new file mode 100644 index 0000000..4f56dbb --- /dev/null +++ b/scripts/proximity_llm_client.py @@ -0,0 +1,204 @@ +"""Async LLM client with concurrency control and file-based caching. + +Adapted from experiments/bot_detection/llm_client.py for use with +the proximity experiment pipeline. Uses litellm for model-agnostic +API calls (with its built-in retry and timeout) and asyncio for +concurrent scoring. +""" + +from __future__ import annotations + +import asyncio +import hashlib +import json +import logging +import re +from collections.abc import Callable +from pathlib import Path + +logger = logging.getLogger(__name__) + +_DEFAULT_CACHE_DIR = Path("experiments/bot_detection/data/llm_cache") +_MAX_CONCURRENT = 15 + + +def _cache_key(model: str, prompt: str) -> str: + return hashlib.sha256(f"{model}:{prompt}".encode()).hexdigest() + + +def _load_cached(model: str, prompt: str, cache_dir: Path) -> str | None: + key = _cache_key(model, prompt) + cache_file = cache_dir / model.replace("/", "_") / f"{key}.json" + if cache_file.exists(): + data = json.loads(cache_file.read_text()) + return data.get("response") + return None + + +def _save_cached( + model: str, prompt: str, response: str, cache_dir: Path, +) -> None: + d = cache_dir / model.replace("/", "_") + d.mkdir(parents=True, exist_ok=True) + key = _cache_key(model, prompt) + cache_file = d / f"{key}.json" + cache_file.write_text(json.dumps({ + "model": model, + "prompt": prompt, + "response": response, + })) + + +def _gemini_call(model: str, prompt: str) -> str: + """Call Gemini via 
the google.generativeai SDK. + + The model parameter should include the bare model name (e.g. + "gemini/gemini-3.1-pro-preview"); the "gemini/" prefix is stripped. + Retries are handled by the caller (score_authors_batch re-enqueues + on failure); the SDK handles transport-level retries internally. + """ + import os + + import google.generativeai as genai # type: ignore[import-untyped] + + genai.configure(api_key=os.environ["GOOGLE_API_KEY"]) + + # Strip litellm-style prefix if present + model_name = model.removeprefix("gemini/") + + gen_model = genai.GenerativeModel(model_name) + response = gen_model.generate_content( + prompt, + generation_config=genai.types.GenerationConfig(temperature=1.0), + request_options={"timeout": 120}, + ) + return response.text + + +def parse_llm_score(response: str) -> float | None: + """Extract a 0.0-1.0 score from LLM JSON response, with fallbacks. + + Returns None if no score can be parsed (caller should drop the author). + """ + # Try JSON parse first + try: + data = json.loads(response) + if isinstance(data, dict) and "score" in data: + return max(0.0, min(1.0, float(data["score"]))) + except (json.JSONDecodeError, ValueError, TypeError): + pass + + # Try extracting JSON from markdown code blocks + json_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", response, re.DOTALL) + if json_match: + try: + data = json.loads(json_match.group(1)) + if isinstance(data, dict) and "score" in data: + return max(0.0, min(1.0, float(data["score"]))) + except (json.JSONDecodeError, ValueError, TypeError): + pass + + # Fallback: regex for score pattern + score_match = re.search(r'"score"\s*:\s*([\d.]+)', response) + if score_match: + try: + return max(0.0, min(1.0, float(score_match.group(1)))) + except ValueError: + pass + + # Last resort: look for any float between 0 and 1 + float_match = re.search(r'\b(0\.\d+|1\.0|0\.0)\b', response) + if float_match: + try: + return float(float_match.group(1)) + except ValueError: + pass + + 
logger.warning("Could not parse score from response, dropping") + return None + + +async def score_authors_batch( + model: str, + authors_data: dict[str, dict], + prompt_builder: Callable[[dict], str], + cache_dir: Path | None = None, + max_concurrent: int = _MAX_CONCURRENT, +) -> dict[str, float]: + """Score authors asynchronously with semaphore-based concurrency. + + Args: + model: litellm model string (e.g. "gemini/gemini-3.1-pro-preview") + authors_data: {author: data_dict} for prompt building + prompt_builder: callable(data_dict) -> prompt string + cache_dir: file cache directory + max_concurrent: max parallel API calls + + Returns: + {author: score} dict (authors with failed calls or unparseable + responses are omitted) + """ + base = cache_dir or _DEFAULT_CACHE_DIR + sem = asyncio.Semaphore(max_concurrent) + completed = 0 + total = len(authors_data) + errors = 0 + + async def _score_one(author: str, data: dict) -> tuple[str, float] | None: + nonlocal completed, errors + async with sem: + prompt = prompt_builder(data) + # Check cache (sync file I/O is fast) + cached = _load_cached(model, prompt, base) + if cached is not None: + completed += 1 + if completed % 500 == 0: + logger.info( + "Progress: %d/%d (%.1f%%)", + completed, total, 100 * completed / total, + ) + score = parse_llm_score(cached) + if score is None: + errors += 1 + return None + return author, score + # Call LLM via thread (litellm is sync) + try: + result = await asyncio.to_thread(_gemini_call, model, prompt) + except Exception: + errors += 1 + logger.warning( + "LLM call failed for %s after retries, dropping", author, + ) + completed += 1 + return None + _save_cached(model, prompt, result, base) + completed += 1 + if completed % 500 == 0: + logger.info( + "Progress: %d/%d (%.1f%%), errors: %d", + completed, total, 100 * completed / total, errors, + ) + score = parse_llm_score(result) + if score is None: + errors += 1 + return None + return author, score + + tasks = [_score_one(a, d) for a, d 
in authors_data.items()] + results = await asyncio.gather(*tasks, return_exceptions=True) + + scores = {} + for r in results: + if r is None or isinstance(r, Exception): + if isinstance(r, Exception): + logger.warning("Task exception: %s", r) + continue + author, score = r + scores[author] = score + + logger.info( + "Scoring complete: %d/%d scored, %d dropped", + len(scores), total, total - len(scores), + ) + return scores diff --git a/scripts/proximity_llm_experiment.py b/scripts/proximity_llm_experiment.py new file mode 100644 index 0000000..42578f8 --- /dev/null +++ b/scripts/proximity_llm_experiment.py @@ -0,0 +1,903 @@ +"""LLM-based suspension detection experiment with temporal holdout. + +Tests whether LLM scoring of PR titles/bodies adds signal beyond behavioral +and graph-proximity features. Uses temporal cutoffs (Strategy C) to prevent +lookahead bias: the LLM only sees PR titles and bodies that existed before +the cutoff date, and all behavioral features and graph data are similarly +restricted to pre-cutoff information. + +Three prompt variants: + V1: PR titles only (cheapest) + V2: Titles + body excerpts (first 500 chars at submission time) + V3: Full profile with pre-cutoff metadata + titles + bodies + +Integration modes: + 1. Combined model: LLM score as extra LR feature alongside behavioral features + 2. 
def load_author_pr_data(
    con: duckdb.DuckDBPyConnection,
    authors: list[str],
    cutoff: str,
    max_prs: int = 20,
) -> dict[str, dict]:
    """Load up to max_prs most recent merged PRs per author before cutoff.

    Only includes PR title and body as they existed at creation time
    (the created_at timestamp). No post-submission data is included.

    Args:
        con: read-only DuckDB connection with a ``prs`` table.
        authors: author logins to load; an empty list returns {} without
            touching the database.
        cutoff: ISO timestamp string; only PRs created strictly before it.
        max_prs: per-author cap on returned PRs (most recent first).

    Returns:
        {author: {"titles": [...], "bodies": [...], "metadata": {...}}}
        Every requested author gets an entry; authors with no pre-cutoff
        merged PRs have empty title/body lists and empty metadata.
    """
    # Guard: an empty author list would otherwise render "IN ()", which is
    # a SQL syntax error.
    if not authors:
        return {}

    placeholders = ", ".join(["?"] * len(authors))
    # max_prs is interpolated directly into the SQL; it is an int parameter,
    # not user-controlled text, so this is injection-safe.
    rows = con.execute(f"""
        WITH ranked AS (
            SELECT
                author, title, body,
                ROW_NUMBER() OVER (
                    PARTITION BY author
                    ORDER BY created_at DESC
                ) AS rn
            FROM prs
            WHERE author IN ({placeholders})
              AND state = 'MERGED'
              AND created_at < ?::TIMESTAMP
              AND author IS NOT NULL
        )
        SELECT author, title, body
        FROM ranked
        WHERE rn <= {max_prs}
        ORDER BY author
    """, [*authors, cutoff]).fetchall()

    # Per-author metadata computed only from pre-cutoff PRs
    meta_rows = con.execute(f"""
        SELECT
            author,
            COUNT(*) AS total_prs,
            SUM(CASE WHEN state = 'MERGED' THEN 1 ELSE 0 END)::DOUBLE
                / NULLIF(COUNT(*), 0)::DOUBLE AS merge_rate,
            COUNT(DISTINCT repo) AS total_repos,
            COALESCE(
                EXTRACT(EPOCH FROM (MAX(created_at) - MIN(created_at)))
                    / 86400.0, 0.0
            ) AS career_span_days
        FROM prs
        WHERE author IN ({placeholders})
          AND created_at < ?::TIMESTAMP
          AND author IS NOT NULL
        GROUP BY author
    """, [*authors, cutoff]).fetchall()

    meta_map: dict[str, dict] = {}
    for author, total_prs, merge_rate, total_repos, career_span in meta_rows:
        meta_map[author] = {
            "total_prs": int(total_prs),
            "merge_rate": round(float(merge_rate or 0), 3),
            "total_repos": int(total_repos),
            "career_span_days": round(float(career_span or 0), 1),
        }

    # Seed every requested author so callers can rely on key presence.
    author_data: dict[str, dict] = {}
    for author in authors:
        author_data[author] = {"titles": [], "bodies": [], "metadata": meta_map.get(author, {})}

    for author, title, body in rows:
        if author in author_data:
            author_data[author]["titles"].append(title or "")
            author_data[author]["bodies"].append(body or "")

    return author_data


# ---------------------------------------------------------------------------
# Prompt builders
# ---------------------------------------------------------------------------

def build_v1_prompt(data: dict) -> str:
    """V1: Title-only prompt (cheapest/fastest).

    Args:
        data: author dict from load_author_pr_data; only "titles" is read.

    Returns:
        Prompt string asking for a JSON suspicion score in [0.0, 1.0].
    """
    titles = data.get("titles", [])
    if not titles:
        # Keep the prompt shape stable even for authors with no PR data.
        titles = ["(no merged PRs found)"]
    title_list = "\n".join(f"- {t}" for t in titles[:20])
    return f"""You are analyzing GitHub PR titles from a single author. These PRs were all \
merged into their target repositories, meaning they passed code review.

Assess whether this author's contribution pattern suggests they may be a \
suspended or malicious GitHub account. Obvious spam is unlikely since these \
PRs were accepted. Look for subtle patterns: repetitive trivial changes, \
unusual naming conventions, contributions that look like gaming metrics.

PR titles (most recent first):
{title_list}

Rate suspicion from 0.0 (clearly legitimate) to 1.0 (highly suspicious).
Respond with JSON: {{"score": 0.X, "reasoning": "brief explanation"}}"""
(cheapest/fastest).""" + titles = data.get("titles", []) + if not titles: + titles = ["(no merged PRs found)"] + title_list = "\n".join(f"- {t}" for t in titles[:20]) + return f"""You are analyzing GitHub PR titles from a single author. These PRs were all \ +merged into their target repositories, meaning they passed code review. + +Assess whether this author's contribution pattern suggests they may be a \ +suspended or malicious GitHub account. Obvious spam is unlikely since these \ +PRs were accepted. Look for subtle patterns: repetitive trivial changes, \ +unusual naming conventions, contributions that look like gaming metrics. + +PR titles (most recent first): +{title_list} + +Rate suspicion from 0.0 (clearly legitimate) to 1.0 (highly suspicious). +Respond with JSON: {{"score": 0.X, "reasoning": "brief explanation"}}""" + + +def build_v2_prompt(data: dict) -> str: + """V2: Titles + body excerpts (first 500 chars of up to 10 PRs).""" + titles = data.get("titles", []) + bodies = data.get("bodies", []) + if not titles: + titles = ["(no merged PRs found)"] + bodies = [""] + + pr_list_parts = [] + for i, (title, body) in enumerate(zip(titles[:10], bodies[:10], strict=True)): + excerpt = (body[:500] + "...") if len(body) > 500 else body + excerpt = excerpt.strip() or "(empty body)" + pr_list_parts.append(f"PR {i + 1}: {title}\n Body: {excerpt}") + + pr_list = "\n".join(pr_list_parts) + remaining = len(titles) - 10 + if remaining > 0: + pr_list += f"\n... and {remaining} more PRs (titles only):" + for t in titles[10:20]: + pr_list += f"\n- {t}" + + return f"""You are analyzing GitHub PRs from a single author. These PRs were all \ +merged into their target repositories, meaning they passed code review. + +Assess whether this author's contribution pattern suggests they may be a \ +suspended or malicious GitHub account. Obvious spam is unlikely since these \ +PRs were accepted. 
Look for subtle patterns: repetitive trivial changes, \ +unusual naming conventions, low-effort body text, contributions that look \ +like gaming metrics. + +PRs (most recent first): +{pr_list} + +Rate suspicion from 0.0 (clearly legitimate) to 1.0 (highly suspicious). +Respond with JSON: {{"score": 0.X, "reasoning": "brief explanation"}}""" + + +def build_v3_prompt(data: dict) -> str: + """V3: Full profile with metadata + titles + body excerpts.""" + meta = data.get("metadata", {}) + titles = data.get("titles", []) + bodies = data.get("bodies", []) + + meta_block = f"""Author profile: +- Total PRs: {meta.get('total_prs', 'unknown')} +- Merge rate: {meta.get('merge_rate', 'unknown')} +- Repos contributed to: {meta.get('total_repos', 'unknown')} +- Career span: {meta.get('career_span_days', 'unknown')} days""" + + if not titles: + titles = ["(no merged PRs found)"] + bodies = [""] + + pr_list_parts = [] + for i, (title, body) in enumerate(zip(titles[:10], bodies[:10], strict=True)): + excerpt = (body[:500] + "...") if len(body) > 500 else body + excerpt = excerpt.strip() or "(empty body)" + pr_list_parts.append(f"PR {i + 1}: {title}\n Body: {excerpt}") + + pr_list = "\n".join(pr_list_parts) + remaining = len(titles) - 10 + if remaining > 0: + pr_list += f"\n... and {remaining} more PRs (titles only):" + for t in titles[10:20]: + pr_list += f"\n- {t}" + + return f"""You are analyzing a GitHub author's complete contribution profile. These PRs \ +were all merged into their target repositories, meaning they passed code review. + +Assess whether this author's contribution pattern suggests they may be a \ +suspended or malicious GitHub account. Obvious spam is unlikely since these \ +PRs were accepted. Look for subtle patterns: repetitive trivial changes, \ +unusual naming conventions, low-effort contributions, metrics inconsistent \ +with genuine development work. 
+ +{meta_block} + +PRs (most recent first): +{pr_list} + +Rate suspicion from 0.0 (clearly legitimate) to 1.0 (highly suspicious). +Respond with JSON: {{"score": 0.X, "reasoning": "brief explanation"}}""" + + +# --------------------------------------------------------------------------- +# Standalone evaluation +# --------------------------------------------------------------------------- + +def evaluate_standalone( + y: np.ndarray, + authors: list[str], + scores_dict: dict[str, float], + label: str, +) -> dict[str, Any]: + """Evaluate standalone LLM scores (authors without scores are dropped).""" + mask = np.array([a in scores_dict for a in authors]) + y_eval = y[mask] + llm_arr = np.array([scores_dict[a] for a in authors if a in scores_dict]) + n_dropped = int((~mask).sum()) + + metrics = compute_metrics(y_eval, llm_arr) + + logger.info( + " %s standalone: AUC=%.4f, AUC-PR=%.4f, P@25=%.2f, P@50=%.2f " + "(%d scored, %d dropped)", + label, + metrics.get("auc_roc", float("nan")), + metrics.get("auc_pr", float("nan")), + metrics.get("precision_at_25", float("nan")), + metrics.get("precision_at_50", float("nan")), + len(llm_arr), + n_dropped, + ) + + return { + **metrics, + "n_scored": len(llm_arr), + "n_dropped": n_dropped, + } + + +# --------------------------------------------------------------------------- +# Combined model evaluation +# --------------------------------------------------------------------------- + +def _lr_multi_proximity_cv( + df: pd.DataFrame, + feature_list: list[str], + extra_columns: np.ndarray, + seed: int = 42, +) -> tuple[np.ndarray, np.ndarray]: + """LR CV with behavioral features + multiple extra columns. + + Like lr_with_proximity_cv but accepts a 2D array of extra features, + letting LR learn separate weights for each. 
+ """ + from sklearn.linear_model import LogisticRegression + from sklearn.model_selection import LeaveOneOut, StratifiedKFold + from sklearn.preprocessing import StandardScaler + + y = (df["account_status"] == "suspended").astype(int).values + x_base = prepare_features(df, feature_list) + + # Fill NaN with column medians + if extra_columns.ndim == 1: + extra_columns = extra_columns.reshape(-1, 1) + medians = np.nanmedian(extra_columns, axis=0) + extra_clean = np.where(np.isfinite(extra_columns), extra_columns, medians) + x_combined = np.hstack([x_base, extra_clean]) + + n_pos = y.sum() + oof = np.full(len(y), np.nan) + + if n_pos < 30: + splitter = LeaveOneOut() + else: + splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) + + for train_idx, test_idx in splitter.split(x_combined, y): + scaler = StandardScaler() + x_train = scaler.fit_transform(x_combined[train_idx]) + x_test = scaler.transform(x_combined[test_idx]) + model = LogisticRegression( + class_weight="balanced", max_iter=1000, random_state=seed, + ) + model.fit(x_train, y[train_idx]) + oof[test_idx] = model.predict_proba(x_test)[:, 1] + + return y, oof + + +def evaluate_combined( + df: pd.DataFrame, + y: np.ndarray, + authors: list[str], + llm_scores: dict[str, float], + jaccard_oof: np.ndarray, + feature_sets: dict[str, list[str]], + variant_label: str, +) -> dict[str, Any]: + """Test LLM as extra LR feature, alone and combined with Jaccard. + + Authors without LLM scores are dropped from all evaluations to avoid + corrupting results with default values. 
+ """ + mask = np.array([a in llm_scores for a in authors]) + n_dropped = int((~mask).sum()) + if n_dropped > 0: + logger.info( + " Dropping %d/%d authors without LLM scores for %s", + n_dropped, len(authors), variant_label, + ) + df_eval = df.iloc[mask].reset_index(drop=True) + y_eval = y[mask] + jaccard_eval = jaccard_oof[mask] + llm_arr = np.array([llm_scores[a] for a in authors if a in llm_scores]) + results: dict[str, Any] = {"n_dropped": n_dropped} + + for fs_name, fs_list in feature_sets.items(): + logger.info(" Combined models: %s + %s", fs_name, variant_label) + + # Baseline: LR(behavioral) only — on filtered population + y_base, oof_base = lr_with_proximity_cv( + df_eval, fs_list, np.zeros(len(df_eval)), + ) + base_auc = roc_auc_score(y_base, oof_base) + + # LR + Jaccard (prior best) + y_jac, oof_jac = lr_with_proximity_cv(df_eval, fs_list, jaccard_eval) + jac_auc = roc_auc_score(y_jac, oof_jac) + + # LR + LLM + y_llm, oof_llm = lr_with_proximity_cv(df_eval, fs_list, llm_arr) + llm_auc = roc_auc_score(y_llm, oof_llm) + + # LR + LLM + Jaccard (both as separate features — LR learns weights) + both_extra = np.column_stack([llm_arr, jaccard_eval]) + y_both, oof_both = _lr_multi_proximity_cv(df_eval, fs_list, both_extra) + both_auc = roc_auc_score(y_both, oof_both) + + # DeLong tests (all use same y since same filtered population) + dl_llm_vs_base = delong_auc_test(y_eval, oof_llm, oof_base) + dl_llm_vs_jac = delong_auc_test(y_eval, oof_llm, oof_jac) + dl_both_vs_jac = delong_auc_test(y_eval, oof_both, oof_jac) + dl_both_vs_base = delong_auc_test(y_eval, oof_both, oof_base) + + logger.info( + " LR(%s) baseline: AUC=%.4f", fs_name, base_auc, + ) + logger.info( + " LR(%s) + Jaccard: AUC=%.4f (delta=%+.4f)", + fs_name, jac_auc, jac_auc - base_auc, + ) + logger.info( + " LR(%s) + LLM(%s): AUC=%.4f (delta=%+.4f, p=%.4f)", + fs_name, variant_label, llm_auc, llm_auc - base_auc, + dl_llm_vs_base["p_value"], + ) + logger.info( + " LR(%s) + LLM + Jac: AUC=%.4f 
(delta=%+.4f vs Jac, p=%.4f)", + fs_name, both_auc, both_auc - jac_auc, + dl_both_vs_jac["p_value"], + ) + + results[fs_name] = { + "baseline_auc": float(base_auc), + "jaccard_auc": float(jac_auc), + "llm_auc": float(llm_auc), + "llm_plus_jaccard_auc": float(both_auc), + "delong_llm_vs_baseline": { + "z": float(dl_llm_vs_base["z_statistic"]), + "p": float(dl_llm_vs_base["p_value"]), + }, + "delong_llm_vs_jaccard": { + "z": float(dl_llm_vs_jac["z_statistic"]), + "p": float(dl_llm_vs_jac["p_value"]), + }, + "delong_both_vs_jaccard": { + "z": float(dl_both_vs_jac["z_statistic"]), + "p": float(dl_both_vs_jac["p_value"]), + }, + "delong_both_vs_baseline": { + "z": float(dl_both_vs_base["z_statistic"]), + "p": float(dl_both_vs_base["p_value"]), + }, + } + + return results + + +# --------------------------------------------------------------------------- +# Second-phase re-ranking +# --------------------------------------------------------------------------- + +def evaluate_second_phase( + y: np.ndarray, + authors: list[str], + first_phase_scores: np.ndarray, + llm_scores: dict[str, float], + variant_label: str, + top_ns: list[int] | None = None, + alphas: list[float] | None = None, +) -> dict[str, Any]: + """Re-rank top-N from first-phase model using LLM scores. + + Authors without LLM scores are dropped before evaluation. 
+ """ + if top_ns is None: + top_ns = [100, 200, 500] + if alphas is None: + alphas = [0.0, 0.2, 0.4, 0.5, 0.6, 0.8, 1.0] + + # Drop authors without LLM scores + mask = np.array([a in llm_scores for a in authors]) + y = y[mask] + first_phase_scores = first_phase_scores[mask] + scored_authors = [a for a in authors if a in llm_scores] + llm_arr = np.array([llm_scores[a] for a in scored_authors]) + results: dict[str, Any] = {"n_dropped": int((~mask).sum())} + + for top_n in top_ns: + if top_n > len(y): + continue + + top_idx = np.argsort(first_phase_scores)[-top_n:] + y_top = y[top_idx] + first_top = first_phase_scores[top_idx] + llm_top = llm_arr[top_idx] + + # Baseline: first-phase ordering precision + first_order = np.argsort(first_top)[::-1] + llm_order = np.argsort(llm_top)[::-1] + + phase_results: dict[str, Any] = { + "n_suspended_in_top": int(y_top.sum()), + } + + for k in [25, 50]: + if k <= top_n: + phase_results[f"first_phase_p_at_{k}"] = float( + y_top[first_order[:k]].sum() / k + ) + phase_results[f"llm_rerank_p_at_{k}"] = float( + y_top[llm_order[:k]].sum() / k + ) + + # Alpha sweep: combined = alpha * first_phase + (1-alpha) * llm + # Z-score normalize both to make alpha values interpretable + fp_std = np.std(first_top) + llm_std = np.std(llm_top) + first_z = ((first_top - np.mean(first_top)) / fp_std) if fp_std > 0 else first_top + llm_z = ((llm_top - np.mean(llm_top)) / llm_std) if llm_std > 0 else llm_top + + alpha_results: dict[str, Any] = {} + for alpha in alphas: + combined = alpha * first_z + (1 - alpha) * llm_z + combined_order = np.argsort(combined)[::-1] + a_result: dict[str, Any] = {} + for k in [25, 50]: + if k <= top_n: + a_result[f"p_at_{k}"] = float( + y_top[combined_order[:k]].sum() / k + ) + alpha_results[f"alpha_{alpha:.1f}"] = a_result + + phase_results["alpha_sweep"] = alpha_results + results[f"top_{top_n}"] = phase_results + + logger.info( + " Second-phase top_%d (%s): %d suspended, " + "first-phase P@25=%.2f, LLM-rerank P@25=%.2f", 
+ top_n, + variant_label, + y_top.sum(), + phase_results.get("first_phase_p_at_25", float("nan")), + phase_results.get("llm_rerank_p_at_25", float("nan")), + ) + + return results + + +# --------------------------------------------------------------------------- +# Warmup +# --------------------------------------------------------------------------- + +def run_warmup( + df: pd.DataFrame, + con: duckdb.DuckDBPyConnection, + cutoff: str, +) -> dict[str, Any]: + """Run end-to-end pipeline on 50 authors to validate everything works. + + Uses the given cutoff to ensure no lookahead bias even in warmup. + """ + logger.info("=== WARMUP: 50 authors (25 suspended, 25 active), cutoff=%s ===", cutoff) + + y = (df["account_status"] == "suspended").astype(int).values + author_col = "author" if "author" in df.columns else "login" + susp_idx = np.where(y == 1)[0] + active_idx = np.where(y == 0)[0] + + rng = np.random.RandomState(42) + sample_susp = rng.choice(susp_idx, size=min(25, len(susp_idx)), replace=False) + sample_active = rng.choice(active_idx, size=min(25, len(active_idx)), replace=False) + sample_idx = np.concatenate([sample_susp, sample_active]) + + sample_df = df.iloc[sample_idx].reset_index(drop=True) + sample_authors = sample_df[author_col].tolist() + sample_y = (sample_df["account_status"] == "suspended").astype(int).values + + logger.info( + " Sample: %d authors (%d suspended, %d active)", + len(sample_authors), sample_y.sum(), (1 - sample_y).sum(), + ) + + # Load PR data for sample — only PRs before cutoff + author_pr_data = load_author_pr_data(con, sample_authors, cutoff) + + # Score with V1 + logger.info(" Scoring with V1 (title-only)...") + scores_v1 = asyncio.run( + score_authors_batch(MODEL, author_pr_data, build_v1_prompt, CACHE_DIR) + ) + + # Standalone metrics — drop authors without scores + scored_mask = np.array([a in scores_v1 for a in sample_authors]) + llm_arr = np.array([scores_v1[a] for a in sample_authors if a in scores_v1]) + metrics = 
compute_metrics(sample_y[scored_mask], llm_arr) + v1_auc = metrics.get("auc_roc", float("nan")) + + logger.info( + " V1 warmup AUC=%.4f, P@25=%.2f", + v1_auc, + metrics.get("precision_at_25", float("nan")), + ) + + # Score distribution + status_map = dict(zip( + sample_df[author_col], sample_df["account_status"], strict=True, + )) + susp_scores = [scores_v1[a] for a in sample_authors + if a in scores_v1 and status_map[a] == "suspended"] + active_scores = [scores_v1[a] for a in sample_authors + if a in scores_v1 and status_map[a] == "active"] + + logger.info( + " Score distribution — suspended: mean=%.3f, active: mean=%.3f", + np.mean(susp_scores) if susp_scores else 0, + np.mean(active_scores) if active_scores else 0, + ) + + # Validate parse success rate + n_parsed = sum(1 for a in sample_authors if a in scores_v1) + logger.info(" Parse success: %d/%d (%.1f%%)", + n_parsed, len(sample_authors), 100 * n_parsed / len(sample_authors)) + + result = { + "cutoff": cutoff, + "n_authors": len(sample_authors), + "n_suspended": int(sample_y.sum()), + "n_active": int((1 - sample_y).sum()), + "v1_auc": float(v1_auc), + "v1_metrics": metrics, + "n_parsed": n_parsed, + "parse_rate": n_parsed / len(sample_authors), + "susp_mean_score": float(np.mean(susp_scores)) if susp_scores else None, + "active_mean_score": float(np.mean(active_scores)) if active_scores else None, + "pipeline_status": "ok", + } + + logger.info("=== WARMUP COMPLETE ===") + return result + + +# --------------------------------------------------------------------------- +# Full experiment +# --------------------------------------------------------------------------- + +def run_cutoff_experiment( + df: pd.DataFrame, + con: duckdb.DuckDBPyConnection, + author_repos: dict[str, set], + graph: Any, + cutoff: str, + all_results: dict[str, Any] | None = None, +) -> dict[str, Any]: + """Run LLM experiment for a single temporal cutoff. 
def run_cutoff_experiment(
    df: pd.DataFrame,
    con: duckdb.DuckDBPyConnection,
    author_repos: dict[str, set],
    graph: Any,
    cutoff: str,
    all_results: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run LLM experiment for a single temporal cutoff.

    All PR data shown to the LLM is restricted to before the cutoff date,
    preventing lookahead bias. Behavioral features and graph data are also
    pre-cutoff (handled by caller via load_temporal_features / build_author_repo_data).

    Args:
        df: pre-cutoff feature frame with "account_status" and author column.
        con: read-only DuckDB connection for PR text loading.
        author_repos: {author: repo set} built from pre-cutoff data.
        graph: author-repo graph object (passed through to proximity scoring).
        cutoff: ISO timestamp string for this run.
        all_results: optional shared results dict; when given, progress is
            checkpointed to disk after every major phase so an interrupted
            run can resume.

    Returns:
        Per-cutoff results dict (standalone, combined, Holm-Bonferroni,
        second-phase sections).
    """
    author_col = "author" if "author" in df.columns else "login"
    authors = df[author_col].tolist()
    y = (df["account_status"] == "suspended").astype(int).values

    results: dict[str, Any] = {
        "cutoff": cutoff,
        "n_authors": len(authors),
        "n_suspended": int(y.sum()),
        "n_active": int((1 - y).sum()),
    }

    # Load PR text data — only PRs before cutoff
    logger.info("Loading PR data for %d authors (cutoff=%s)...", len(authors), cutoff)
    author_pr_data = load_author_pr_data(con, authors, cutoff)
    n_with_data = sum(1 for d in author_pr_data.values() if d["titles"])
    logger.info("  %d/%d authors have PR title data before %s", n_with_data, len(authors), cutoff)

    def _checkpoint() -> None:
        """Save incremental results to disk."""
        if all_results is not None:
            all_results[cutoff] = results
            _save_results(all_results)

    # --- Scoring phase ---
    logger.info("=== SCORING PHASE (cutoff=%s) ===", cutoff)

    all_scores: dict[str, dict[str, float]] = {}
    for label, builder in [("v1", build_v1_prompt), ("v2", build_v2_prompt),
                           ("v3", build_v3_prompt)]:
        logger.info("Scoring %s for %d authors...", label, len(authors))
        scores = asyncio.run(
            score_authors_batch(MODEL, author_pr_data, builder, CACHE_DIR)
        )
        logger.info("%s complete: %d scored", label, len(scores))
        all_scores[label] = scores
        # Record which variant finished last so a resumed run can tell how
        # far scoring got before an interruption.
        results["scoring_progress"] = {
            "completed": label,
            "n_scored": len(scores),
        }
        _checkpoint()

    # --- Standalone evaluation ---
    logger.info("=== STANDALONE EVALUATION ===")
    standalone = {}
    for label, scores in all_scores.items():
        standalone[label] = evaluate_standalone(y, authors, scores, label)
    results["standalone"] = standalone
    _checkpoint()

    # --- Combined model ---
    logger.info("=== COMBINED MODEL EVALUATION ===")

    # Compute Jaccard OOF scores from pre-cutoff graph
    logger.info("Computing Jaccard OOF scores...")
    jaccard_oof = compute_graph_scores_cv(
        df, author_repos, graph, "jaccard_max",
    )

    available_f16 = [f for f in F16 if f in df.columns]
    feature_sets = {"F10": [f for f in F10 if f in df.columns], "F16": available_f16}

    combined = {}
    for label, scores in all_scores.items():
        combined[label] = evaluate_combined(
            df, y, authors, scores, jaccard_oof, feature_sets, label,
        )
    results["combined"] = combined
    _checkpoint()

    # Holm-Bonferroni correction across all DeLong tests — guards against
    # false positives from running many pairwise AUC comparisons.
    all_p_values: dict[str, float] = {}
    for variant, variant_data in combined.items():
        for fs_name, fs_data in variant_data.items():
            # Skip scalar entries such as "n_dropped".
            if not isinstance(fs_data, dict):
                continue
            for test_key in ["delong_llm_vs_baseline", "delong_llm_vs_jaccard",
                             "delong_both_vs_jaccard", "delong_both_vs_baseline"]:
                if test_key in fs_data:
                    label_key = f"{variant}/{fs_name}/{test_key}"
                    all_p_values[label_key] = fs_data[test_key]["p"]

    if all_p_values:
        corrected = holm_bonferroni(all_p_values)
        results["holm_bonferroni_correction"] = {
            k: {"raw_p": v["p_value"], "adjusted_p": v["adjusted_p"],
                "reject_h0": v["reject"]}
            for k, v in corrected.items()
        }
        n_reject = sum(1 for v in corrected.values() if v["reject"])
        logger.info(
            "Holm-Bonferroni: %d/%d tests significant after correction",
            n_reject, len(corrected),
        )

    # --- Second-phase re-ranking ---
    logger.info("=== SECOND-PHASE RE-RANKING ===")

    # First-phase: LR(F10) + Jaccard from pre-cutoff features
    f10_available = [f for f in F10 if f in df.columns]
    logger.info("Computing first-phase OOF scores (LR(F10) + Jaccard)...")
    _, first_phase_oof = lr_with_proximity_cv(df, f10_available, jaccard_oof)

    second_phase = {}
    for label, scores in all_scores.items():
        second_phase[label] = evaluate_second_phase(
            y, authors, first_phase_oof, scores, label,
        )
    results["second_phase"] = second_phase
    _checkpoint()

    return results
second_phase + _checkpoint() + + return results + + +# --------------------------------------------------------------------------- +# JSON serializer +# --------------------------------------------------------------------------- + +def _json_default(obj: Any) -> Any: + if isinstance(obj, np.integer): + return int(obj) + if isinstance(obj, np.floating): + return float(obj) + if isinstance(obj, np.ndarray): + return obj.tolist() + if isinstance(obj, set): + return list(obj) + raise TypeError(f"Not serializable: {type(obj)}") + + +def _save_results(results: dict[str, Any]) -> None: + """Write results to disk immediately. Called after every major phase.""" + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + with open(OUTPUT_PATH, "w") as f: + json.dump(results, f, indent=2, default=_json_default) + logger.info("Results checkpoint saved to %s", OUTPUT_PATH) + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main() -> None: + parser = argparse.ArgumentParser(description="LLM suspension detection experiment") + parser.add_argument( + "--warmup-only", action="store_true", + help="Run warmup phase only (50 authors at first cutoff)", + ) + parser.add_argument( + "--cutoff", type=str, default=None, + help="Run a single cutoff instead of all (e.g. 
def main() -> None:
    """CLI entry point: warmup validation, then per-cutoff experiments.

    Always runs the warmup (50-author smoke test) first and aborts if it
    fails. Full runs checkpoint to OUTPUT_PATH after every cutoff and
    resume by skipping cutoffs whose results already contain a
    "second_phase" section.
    """
    parser = argparse.ArgumentParser(description="LLM suspension detection experiment")
    parser.add_argument(
        "--warmup-only", action="store_true",
        help="Run warmup phase only (50 authors at first cutoff)",
    )
    parser.add_argument(
        "--cutoff", type=str, default=None,
        help="Run a single cutoff instead of all (e.g. '2024-01-01')",
    )
    args = parser.parse_args()

    cutoffs = [args.cutoff] if args.cutoff else CUTOFFS
    con = duckdb.connect(str(DB_PATH), read_only=True)

    # Warmup on first cutoff
    warmup_cutoff = cutoffs[0]
    logger.info("Loading temporal features for warmup cutoff %s...", warmup_cutoff)
    warmup_df = load_temporal_features(warmup_cutoff)
    # Normalize the author column name; downstream code expects "author".
    if "login" in warmup_df.columns and "author" not in warmup_df.columns:
        warmup_df = warmup_df.rename(columns={"login": "author"})

    # Too few positives makes AUC/precision metrics meaningless — abort.
    n_susp = (warmup_df["account_status"] == "suspended").sum()
    if n_susp < 5:
        logger.error(
            "Warmup cutoff %s has only %d suspended, aborting.",
            warmup_cutoff, n_susp,
        )
        con.close()
        sys.exit(1)

    warmup_result = run_warmup(warmup_df, con, warmup_cutoff)

    if warmup_result["pipeline_status"] != "ok":
        logger.error("Warmup failed! Aborting.")
        con.close()
        sys.exit(1)

    if args.warmup_only:
        logger.info("Warmup-only mode, saving warmup results and exiting.")
        output = {"warmup": warmup_result, "config": {"model": MODEL, "cutoffs": cutoffs}}
        _save_results(output)
        con.close()
        return

    # Full experiment — one run per cutoff
    # Load prior results to resume from if they exist
    all_results: dict[str, Any] = {}
    if OUTPUT_PATH.exists():
        try:
            with open(OUTPUT_PATH) as f:
                all_results = json.load(f)
            logger.info("Loaded prior results from %s", OUTPUT_PATH)
        except (json.JSONDecodeError, OSError):
            # Corrupt/unreadable checkpoint: start fresh rather than crash.
            pass
    all_results["config"] = {"model": MODEL, "cutoffs": cutoffs}
    all_results["warmup"] = warmup_result
    _save_results(all_results)

    for cutoff in cutoffs:
        # Skip cutoffs that already have complete results (second_phase present)
        prior = all_results.get(cutoff, {})
        if isinstance(prior, dict) and "second_phase" in prior:
            logger.info("Cutoff %s already complete, skipping", cutoff)
            continue

        logger.info("\n" + "=" * 60)
        logger.info("CUTOFF: %s", cutoff)
        logger.info("=" * 60)

        df = load_temporal_features(cutoff)
        if "login" in df.columns and "author" not in df.columns:
            df = df.rename(columns={"login": "author"})

        n_susp = (df["account_status"] == "suspended").sum()
        if n_susp < 5:
            logger.warning("Skipping cutoff %s: only %d suspended authors", cutoff, n_susp)
            continue

        # Build graph from pre-cutoff data
        author_repos, graph = build_author_repo_data(con, cutoff)

        cutoff_results = run_cutoff_experiment(
            df, con, author_repos, graph, cutoff, all_results,
        )
        all_results[cutoff] = cutoff_results
        _save_results(all_results)

    con.close()
    logger.info("All cutoffs complete. Final results at %s", OUTPUT_PATH)

    # Print summary
    print("\n" + "=" * 60)
    print("LLM EXPERIMENT SUMMARY")
    print("=" * 60)
    print(f"\nModel: {MODEL}")
    print(f"Cutoffs: {cutoffs}")

    for cutoff in cutoffs:
        cr = all_results.get(cutoff)
        if not cr:
            continue
        print(f"\n--- Cutoff: {cutoff} ({cr['n_authors']} authors, "
              f"{cr['n_suspended']} suspended) ---")

        print("  Standalone AUC-ROC:")
        for variant in ["v1", "v2", "v3"]:
            sa = cr.get("standalone", {}).get(variant, {})
            print(f"    {variant}: {sa.get('auc_roc', float('nan')):.4f}")

        print("  Best combined AUC-ROC:")
        for variant in ["v1", "v2", "v3"]:
            comb = cr.get("combined", {}).get(variant, {})
            for fs_name, fs_data in comb.items():
                # "n_dropped" entries are ints; only feature-set dicts carry AUCs.
                if isinstance(fs_data, dict) and "llm_plus_jaccard_auc" in fs_data:
                    print(f"    {variant}/{fs_name}: "
                          f"LLM+Jac={fs_data['llm_plus_jaccard_auc']:.4f}"
                          f" (Jac only={fs_data['jaccard_auc']:.4f})")


if __name__ == "__main__":
    main()
operationally useful results for detecting suspended accounts among authors with merged PRs. Best AUC 0.608 with near-zero precision. The merged-PR population is too homogeneous — these authors passed code review by definition. --- .../proximity_results/PROXIMITY_ANALYSIS.md | 79 +++++++++++++++---- 1 file changed, 65 insertions(+), 14 deletions(-) diff --git a/experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md b/experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md index 6fdf692..69902a1 100644 --- a/experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md +++ b/experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md @@ -302,17 +302,68 @@ smaller populations, and second-phase re-ranking is ineffective. The LLM provide marginal incremental value as a combined LR feature but is not useful as a standalone detector or re-ranker on the merged-PR population. -## 6. Summary - -**Best overall method**: jaccard_max (Strategy B merged (Graph)), AUC = 0.5952 - -**Best combined method**: LR(F10) + LLM(V2) + Jaccard at 2024-01-01 cutoff, -AUC = 0.5771 - -The best methods exceed the AUC > 0.55 threshold, suggesting proximity-based -detection has *some* signal on the merged-PR population. However, the practical -value is limited — precision at operational thresholds (P@25, P@50) remains low -across all methods. LLM scoring provides marginal incremental value when combined -with Jaccard but is not useful standalone or for re-ranking. - -**Stage 12 replication** (all-authors, F16, cosine, k=5): AUC = 0.5573 (original stage 12: 0.595) +## 6. Conclusion: Negative Result + +This branch explored whether suspended GitHub accounts can be detected among +authors who have merged PRs, using behavioral features, graph proximity, k-NN +similarity, and LLM-based PR text analysis. The answer is **no** — not at a +level that would justify a production feature. + +### What we tested + +| Method | Best AUC | Best P@25 | Viable? 
| +|--------|----------|-----------|---------| +| Behavioral LR (F16) | 0.573 | 0.00 | No | +| k-NN proximity | 0.570 | 0.08 | No | +| Jaccard repo overlap | 0.595 | 0.08 | No | +| LR + Jaccard combined | 0.608 | — | No | +| LLM standalone (V2) | 0.570 | 0.04 | No | +| LLM + Jaccard combined | 0.577 | — | No | + +### Why it doesn't work + +1. **AUC barely above chance.** Best result is 0.608 (LR+Jaccard, Strategy B). + Random is 0.50. This is statistically above chance but not operationally useful. + +2. **Precision is near zero.** Best P@25 is 0.08 (2/25 correct). Flagging the + top 25 most suspicious accounts produces a 92% false positive rate. + +3. **Signal doesn't survive temporal holdout.** Strategy B (no temporal + constraint) gives 0.608; Strategy C (temporal, the honest test) gives 0.577 + at best. The signal weakens under realistic conditions. + +4. **LLM scoring added almost nothing.** 30,131 Gemini API calls across 3 + prompt variants. Best contribution: +0.026 AUC over Jaccard alone, only + significant at one of three cutoffs. Second-phase re-ranking was completely + ineffective. + +5. **Base rate is the fundamental problem.** With ~2.5% prevalence of suspended + accounts in the merged-PR population, even a moderately good classifier + produces overwhelming false positives. Useful precision would require AUC + well above 0.85. + +### Why the merged-PR population is hard + +The prior bot-detection work (stage 6) achieved AUC 0.619 on the full +population, but that included zero-PR suspended accounts that are trivially +separable. Once restricted to authors who got PRs merged — meaning they passed +code review — the population becomes too homogeneous to distinguish. Suspended +accounts with merged PRs look like active accounts with merged PRs, because +in both cases a human reviewer accepted their work. 
+ +### What was learned + +- Jaccard repo overlap is the strongest single signal (AUC 0.595), suggesting + suspended accounts do cluster in certain repositories. But the effect is too + weak to act on. +- Behavioral features (PR metadata, contribution patterns) carry minimal signal + in this population. +- LLM analysis of PR titles and bodies cannot reliably distinguish suspended + from active authors when both have merged contributions. +- The 8-feature Bad Egg scoring model (commit d4c1278) provides a reasonable + heuristic for the full population but should not be expected to perform well + on the merged-PR subset specifically. + +**This branch is not intended for merge.** It documents a thorough negative +result that establishes the limits of suspension detection on the merged-PR +population. From c486ee37b4f76fa6a3d72e1727a8f31a421726f9 Mon Sep 17 00:00:00 2001 From: Jeff Date: Fri, 13 Mar 2026 11:29:04 +0000 Subject: [PATCH 5/7] Fix markdown line wrapping in analysis report --- .../proximity_results/PROXIMITY_ANALYSIS.md | 86 +++++-------------- 1 file changed, 20 insertions(+), 66 deletions(-) diff --git a/experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md b/experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md index 69902a1..9610df9 100644 --- a/experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md +++ b/experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md @@ -1,7 +1,6 @@ # Proximity-Based Suspension Detection — Results -This report summarizes the results of proximity-based methods for detecting -suspended GitHub accounts among authors with merged PRs. +This report summarizes the results of proximity-based methods for detecting suspended GitHub accounts among authors with merged PRs. ## Methodology @@ -243,10 +242,7 @@ Behavioral LR baseline: AUC = 0.5727 ## 5. 
LLM Scoring Results (H5) -Tests whether LLM (Gemini 3.1 Pro) scoring of PR titles and bodies adds signal -beyond behavioral and graph-proximity features. Uses temporal cutoffs (Strategy C) -to prevent lookahead bias: the LLM only sees PR titles and bodies created before -the cutoff date. +Tests whether LLM (Gemini 3.1 Pro) scoring of PR titles and bodies adds signal beyond behavioral and graph-proximity features. Uses temporal cutoffs (Strategy C) to prevent lookahead bias: the LLM only sees PR titles and bodies created before the cutoff date. ### Prompt Variants @@ -254,8 +250,7 @@ the cutoff date. - **V2** (titles + bodies): Titles + first 500 chars of body for up to 10 PRs - **V3** (full profile): V2 + author metadata (total_prs, merge_rate, total_repos, career_span) -Model: `gemini/gemini-3.1-pro-preview`, temperature=1.0, 30,131 total API calls -across 3 cutoffs × 3 variants. Score failures dropped (not defaulted). +Model: `gemini/gemini-3.1-pro-preview`, temperature=1.0, 30,131 total API calls across 3 cutoffs × 3 variants. Score failures dropped (not defaulted). ### Standalone LLM Scoring @@ -265,10 +260,7 @@ across 3 cutoffs × 3 variants. Score failures dropped (not defaulted). | 2023-01-01 | 3,619 (92 susp) | 0.5168 | 0.5167 | 0.5045 | | 2024-01-01 | 7,642 (204 susp) | 0.5372 | 0.5469 | 0.5408 | -V2 (titles + bodies) is the best or tied-best variant at every cutoff. Standalone -AUC ranges 0.50–0.57, comparable to behavioral feature baselines but not strong -on its own. V3 (full profile) does not improve over V2, suggesting the metadata -block does not help the LLM beyond what it can infer from PR text. +V2 (titles + bodies) is the best or tied-best variant at every cutoff. Standalone AUC ranges 0.50–0.57, comparable to behavioral feature baselines but not strong on its own. V3 (full profile) does not improve over V2, suggesting the metadata block does not help the LLM beyond what it can infer from PR text. 
### Combined Models (LR + LLM ± Jaccard, F10 features) @@ -278,36 +270,21 @@ block does not help the LLM beyond what it can infer from PR text. | 2023-01-01 | 0.5277 | 0.5321 | 0.5160 | 0.5222 | | 2024-01-01 | 0.5357 | 0.5510 | 0.5630 | 0.5771 | -At the 2024-01-01 cutoff (largest population), LLM+Jaccard combined reaches -AUC 0.577, a +0.026 improvement over Jaccard alone (0.551). Three of 24 DeLong -tests are significant after Holm-Bonferroni correction — all from the 2024-01-01 -cutoff. At earlier cutoffs with smaller populations, no tests reach significance. +At the 2024-01-01 cutoff (largest population), LLM+Jaccard combined reaches AUC 0.577, a +0.026 improvement over Jaccard alone (0.551). Three of 24 DeLong tests are significant after Holm-Bonferroni correction — all from the 2024-01-01 cutoff. At earlier cutoffs with smaller populations, no tests reach significance. ### Second-Phase Re-ranking -LLM re-ranking of top-N candidates from the first-phase model (LR+Jaccard) was -tested at top-100, top-200, and top-500 with alpha sweeps blending first-phase -and LLM scores (z-normalized). Results are uniformly negative: LLM re-ranking -does not improve precision at any operating point across any cutoff or variant. +LLM re-ranking of top-N candidates from the first-phase model (LR+Jaccard) was tested at top-100, top-200, and top-500 with alpha sweeps blending first-phase and LLM scores (z-normalized). Results are uniformly negative: LLM re-ranking does not improve precision at any operating point across any cutoff or variant. ### H5 Verdict **H5**: LLM scoring of PR text adds signal beyond behavioral + graph features -**Verdict**: WEAKLY SUPPORTED — On the largest population (2024-01-01, 7,642 -authors), LLM combined with Jaccard achieves the best single-cutoff AUC (0.577) -and 3/24 DeLong tests survive Holm-Bonferroni correction. 
However, the effect -is small (+0.026 over Jaccard alone), does not replicate at earlier cutoffs with -smaller populations, and second-phase re-ranking is ineffective. The LLM provides -marginal incremental value as a combined LR feature but is not useful as a -standalone detector or re-ranker on the merged-PR population. +**Verdict**: WEAKLY SUPPORTED — On the largest population (2024-01-01, 7,642 authors), LLM combined with Jaccard achieves the best single-cutoff AUC (0.577) and 3/24 DeLong tests survive Holm-Bonferroni correction. However, the effect is small (+0.026 over Jaccard alone), does not replicate at earlier cutoffs with smaller populations, and second-phase re-ranking is ineffective. The LLM provides marginal incremental value as a combined LR feature but is not useful as a standalone detector or re-ranker on the merged-PR population. ## 6. Conclusion: Negative Result -This branch explored whether suspended GitHub accounts can be detected among -authors who have merged PRs, using behavioral features, graph proximity, k-NN -similarity, and LLM-based PR text analysis. The answer is **no** — not at a -level that would justify a production feature. +This branch explored whether suspended GitHub accounts can be detected among authors who have merged PRs, using behavioral features, graph proximity, k-NN similarity, and LLM-based PR text analysis. The answer is **no** — not at a level that would justify a production feature. ### What we tested @@ -322,48 +299,25 @@ level that would justify a production feature. ### Why it doesn't work -1. **AUC barely above chance.** Best result is 0.608 (LR+Jaccard, Strategy B). - Random is 0.50. This is statistically above chance but not operationally useful. +1. **AUC barely above chance.** Best result is 0.608 (LR+Jaccard, Strategy B). Random is 0.50. This is statistically above chance but not operationally useful. -2. **Precision is near zero.** Best P@25 is 0.08 (2/25 correct). 
Flagging the - top 25 most suspicious accounts produces a 92% false positive rate. +2. **Precision is near zero.** Best P@25 is 0.08 (2/25 correct). Flagging the top 25 most suspicious accounts produces a 92% false positive rate. -3. **Signal doesn't survive temporal holdout.** Strategy B (no temporal - constraint) gives 0.608; Strategy C (temporal, the honest test) gives 0.577 - at best. The signal weakens under realistic conditions. +3. **Signal doesn't survive temporal holdout.** Strategy B (no temporal constraint) gives 0.608; Strategy C (temporal, the honest test) gives 0.577 at best. The signal weakens under realistic conditions. -4. **LLM scoring added almost nothing.** 30,131 Gemini API calls across 3 - prompt variants. Best contribution: +0.026 AUC over Jaccard alone, only - significant at one of three cutoffs. Second-phase re-ranking was completely - ineffective. +4. **LLM scoring added almost nothing.** 30,131 Gemini API calls across 3 prompt variants. Best contribution: +0.026 AUC over Jaccard alone, only significant at one of three cutoffs. Second-phase re-ranking was completely ineffective. -5. **Base rate is the fundamental problem.** With ~2.5% prevalence of suspended - accounts in the merged-PR population, even a moderately good classifier - produces overwhelming false positives. Useful precision would require AUC - well above 0.85. +5. **Base rate is the fundamental problem.** With ~2.5% prevalence of suspended accounts in the merged-PR population, even a moderately good classifier produces overwhelming false positives. Useful precision would require AUC well above 0.85. ### Why the merged-PR population is hard -The prior bot-detection work (stage 6) achieved AUC 0.619 on the full -population, but that included zero-PR suspended accounts that are trivially -separable. Once restricted to authors who got PRs merged — meaning they passed -code review — the population becomes too homogeneous to distinguish. 
Suspended -accounts with merged PRs look like active accounts with merged PRs, because -in both cases a human reviewer accepted their work. +The prior bot-detection work (stage 6) achieved AUC 0.619 on the full population, but that included zero-PR suspended accounts that are trivially separable. Once restricted to authors who got PRs merged — meaning they passed code review — the population becomes too homogeneous to distinguish. Suspended accounts with merged PRs look like active accounts with merged PRs, because in both cases a human reviewer accepted their work. ### What was learned -- Jaccard repo overlap is the strongest single signal (AUC 0.595), suggesting - suspended accounts do cluster in certain repositories. But the effect is too - weak to act on. -- Behavioral features (PR metadata, contribution patterns) carry minimal signal - in this population. -- LLM analysis of PR titles and bodies cannot reliably distinguish suspended - from active authors when both have merged contributions. -- The 8-feature Bad Egg scoring model (commit d4c1278) provides a reasonable - heuristic for the full population but should not be expected to perform well - on the merged-PR subset specifically. - -**This branch is not intended for merge.** It documents a thorough negative -result that establishes the limits of suspension detection on the merged-PR -population. +- Jaccard repo overlap is the strongest single signal (AUC 0.595), suggesting suspended accounts do cluster in certain repositories. But the effect is too weak to act on. +- Behavioral features (PR metadata, contribution patterns) carry minimal signal in this population. +- LLM analysis of PR titles and bodies cannot reliably distinguish suspended from active authors when both have merged contributions. +- The 8-feature Bad Egg scoring model (commit d4c1278) provides a reasonable heuristic for the full population but should not be expected to perform well on the merged-PR subset specifically. 
+ +**This branch is not intended for merge.** It documents a thorough negative result that establishes the limits of suspension detection on the merged-PR population. From b1eb20bf2d0fd9dc76ee8010991006d44011dac3 Mon Sep 17 00:00:00 2001 From: Jeff Date: Fri, 13 Mar 2026 11:33:46 +0000 Subject: [PATCH 6/7] Remove em dashes and LLM writing tropes from analysis report --- .../proximity_results/PROXIMITY_ANALYSIS.md | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md b/experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md index 9610df9..e7df5d7 100644 --- a/experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md +++ b/experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md @@ -1,4 +1,4 @@ -# Proximity-Based Suspension Detection — Results +# Proximity-Based Suspension Detection -- Results This report summarizes the results of proximity-based methods for detecting suspended GitHub accounts among authors with merged PRs. @@ -222,22 +222,22 @@ Behavioral LR baseline: AUC = 0.5727 **H1**: Profile proximity detects suspension in merged-PR population (k-NN AUC > 0.55) -**Verdict**: SUPPORTED — Best AUC = 0.5698 > 0.55 threshold. Mean AUC = 0.5623. +**Verdict**: SUPPORTED -- Best AUC = 0.5698 > 0.55 threshold. Mean AUC = 0.5623. **H2**: Graph-based proximity captures structural signal (AUC > 0.55) -**Verdict**: SUPPORTED — Best AUC = 0.5952 > 0.55 threshold. Mean AUC = 0.5548. +**Verdict**: SUPPORTED -- Best AUC = 0.5952 > 0.55 threshold. Mean AUC = 0.5548. **H3**: Proximity signal is robust to seed selection bias -**Verdict**: NOT SUPPORTED — Strategy A AUC (0.4413) is substantially lower than Strategy B (0.5698), suggesting seed selection bias affects results. +**Verdict**: NOT SUPPORTED -- Strategy A AUC (0.4413) is substantially lower than Strategy B (0.5698), suggesting seed selection bias affects results. 
**H4**: Proximity adds incremental value to behavioral features -**Verdict**: SUPPORTED — F10+graph_combined: delta=+0.0493, p=0.0000; F10+both_combined: delta=+0.0209, p=0.0014; F16+graph_combined: delta=+0.0336, p=0.0000; F16+both_combined: delta=+0.0148, p=0.0011 +**Verdict**: SUPPORTED -- F10+graph_combined: delta=+0.0493, p=0.0000; F10+both_combined: delta=+0.0209, p=0.0014; F16+graph_combined: delta=+0.0336, p=0.0000; F16+both_combined: delta=+0.0148, p=0.0011 ## 5. LLM Scoring Results (H5) @@ -270,7 +270,7 @@ V2 (titles + bodies) is the best or tied-best variant at every cutoff. Standalon | 2023-01-01 | 0.5277 | 0.5321 | 0.5160 | 0.5222 | | 2024-01-01 | 0.5357 | 0.5510 | 0.5630 | 0.5771 | -At the 2024-01-01 cutoff (largest population), LLM+Jaccard combined reaches AUC 0.577, a +0.026 improvement over Jaccard alone (0.551). Three of 24 DeLong tests are significant after Holm-Bonferroni correction — all from the 2024-01-01 cutoff. At earlier cutoffs with smaller populations, no tests reach significance. +At the 2024-01-01 cutoff (largest population), LLM+Jaccard combined reaches AUC 0.577, a +0.026 improvement over Jaccard alone (0.551). Three of 24 DeLong tests are significant after Holm-Bonferroni correction -- all from the 2024-01-01 cutoff. At earlier cutoffs with smaller populations, no tests reach significance. ### Second-Phase Re-ranking @@ -280,11 +280,11 @@ LLM re-ranking of top-N candidates from the first-phase model (LR+Jaccard) was t **H5**: LLM scoring of PR text adds signal beyond behavioral + graph features -**Verdict**: WEAKLY SUPPORTED — On the largest population (2024-01-01, 7,642 authors), LLM combined with Jaccard achieves the best single-cutoff AUC (0.577) and 3/24 DeLong tests survive Holm-Bonferroni correction. However, the effect is small (+0.026 over Jaccard alone), does not replicate at earlier cutoffs with smaller populations, and second-phase re-ranking is ineffective. 
The LLM provides marginal incremental value as a combined LR feature but is not useful as a standalone detector or re-ranker on the merged-PR population. +**Verdict**: WEAKLY SUPPORTED -- On the largest population (2024-01-01, 7,642 authors), LLM combined with Jaccard achieves the best single-cutoff AUC (0.577) and 3/24 DeLong tests survive Holm-Bonferroni correction. However, the effect is small (+0.026 over Jaccard alone), does not replicate at earlier cutoffs with smaller populations, and second-phase re-ranking is ineffective. The LLM provides marginal incremental value as a combined LR feature but is not useful as a standalone detector or re-ranker on the merged-PR population. ## 6. Conclusion: Negative Result -This branch explored whether suspended GitHub accounts can be detected among authors who have merged PRs, using behavioral features, graph proximity, k-NN similarity, and LLM-based PR text analysis. The answer is **no** — not at a level that would justify a production feature. +This branch explored whether suspended GitHub accounts can be detected among authors who have merged PRs, using behavioral features, graph proximity, k-NN similarity, and LLM-based PR text analysis. The answer is no, not at a level that would justify a production feature. ### What we tested @@ -293,25 +293,25 @@ This branch explored whether suspended GitHub accounts can be detected among aut | Behavioral LR (F16) | 0.573 | 0.00 | No | | k-NN proximity | 0.570 | 0.08 | No | | Jaccard repo overlap | 0.595 | 0.08 | No | -| LR + Jaccard combined | 0.608 | — | No | +| LR + Jaccard combined | 0.608 | -- | No | | LLM standalone (V2) | 0.570 | 0.04 | No | -| LLM + Jaccard combined | 0.577 | — | No | +| LLM + Jaccard combined | 0.577 | -- | No | ### Why it doesn't work -1. **AUC barely above chance.** Best result is 0.608 (LR+Jaccard, Strategy B). Random is 0.50. This is statistically above chance but not operationally useful. +1. AUC barely above chance. 
Best result is 0.608 (LR+Jaccard, Strategy B). Random is 0.50. Statistically above chance but not operationally useful. -2. **Precision is near zero.** Best P@25 is 0.08 (2/25 correct). Flagging the top 25 most suspicious accounts produces a 92% false positive rate. +2. Precision is near zero. Best P@25 is 0.08 (2/25 correct). Flagging the top 25 most suspicious accounts produces a 92% false positive rate. -3. **Signal doesn't survive temporal holdout.** Strategy B (no temporal constraint) gives 0.608; Strategy C (temporal, the honest test) gives 0.577 at best. The signal weakens under realistic conditions. +3. Signal doesn't survive temporal holdout. Strategy B (no temporal constraint) gives 0.608; Strategy C (temporal, the honest test) gives 0.577 at best. -4. **LLM scoring added almost nothing.** 30,131 Gemini API calls across 3 prompt variants. Best contribution: +0.026 AUC over Jaccard alone, only significant at one of three cutoffs. Second-phase re-ranking was completely ineffective. +4. LLM scoring added almost nothing. 30,131 Gemini API calls across 3 prompt variants. Best contribution: +0.026 AUC over Jaccard alone, only significant at one of three cutoffs. Second-phase re-ranking was ineffective. -5. **Base rate is the fundamental problem.** With ~2.5% prevalence of suspended accounts in the merged-PR population, even a moderately good classifier produces overwhelming false positives. Useful precision would require AUC well above 0.85. +5. Base rate is the fundamental problem. ~2.5% prevalence of suspended accounts in the merged-PR population means even a moderately good classifier produces overwhelming false positives. Useful precision would require AUC well above 0.85. ### Why the merged-PR population is hard -The prior bot-detection work (stage 6) achieved AUC 0.619 on the full population, but that included zero-PR suspended accounts that are trivially separable. 
Once restricted to authors who got PRs merged — meaning they passed code review — the population becomes too homogeneous to distinguish. Suspended accounts with merged PRs look like active accounts with merged PRs, because in both cases a human reviewer accepted their work. +The prior bot-detection work (stage 6) achieved AUC 0.619 on the full population, but that included zero-PR suspended accounts that are trivially separable. Once restricted to authors who got PRs merged (i.e. they passed code review), the population becomes too homogeneous to distinguish. Suspended accounts with merged PRs look the same as active accounts with merged PRs. ### What was learned @@ -320,4 +320,4 @@ The prior bot-detection work (stage 6) achieved AUC 0.619 on the full population - LLM analysis of PR titles and bodies cannot reliably distinguish suspended from active authors when both have merged contributions. - The 8-feature Bad Egg scoring model (commit d4c1278) provides a reasonable heuristic for the full population but should not be expected to perform well on the merged-PR subset specifically. -**This branch is not intended for merge.** It documents a thorough negative result that establishes the limits of suspension detection on the merged-PR population. +This branch is not intended for merge. It records a negative result on suspension detection in the merged-PR population. 
From 02e9a717d2cf4ef64afe58729adf3964d5e8f310 Mon Sep 17 00:00:00 2001 From: Jeff Date: Fri, 13 Mar 2026 11:39:39 +0000 Subject: [PATCH 7/7] Address Gemini review: update docstring accuracy, simplify median calc --- src/good_egg/config.py | 5 ++++- src/good_egg/scorer.py | 34 +++++++++++----------------------- 2 files changed, 15 insertions(+), 24 deletions(-) diff --git a/src/good_egg/config.py b/src/good_egg/config.py index d998588..c62cbcd 100644 --- a/src/good_egg/config.py +++ b/src/good_egg/config.py @@ -156,7 +156,10 @@ class BadEggModelConfig(BaseModel): log1p(median_additions), log1p(median_files_changed). Fitted on 12,898 labeled authors (323 suspended / 12,575 active) - with balanced class weights. CV AUC 0.643. + with balanced class weights. CV AUC 0.643 on the full population + (including zero-PR accounts). On the merged-PR-only population, + discriminative power is minimal (AUC ~0.57). See + experiments/bot_detection/proximity_results/PROXIMITY_ANALYSIS.md. """ intercept: float = 1.8988 merge_rate_weight: float = -0.1699 diff --git a/src/good_egg/scorer.py b/src/good_egg/scorer.py index b40a982..7c6684d 100644 --- a/src/good_egg/scorer.py +++ b/src/good_egg/scorer.py @@ -4,6 +4,7 @@ import math import os +import statistics from collections import defaultdict import networkx as nx @@ -277,7 +278,7 @@ def _score_v3( top_contributions = self._build_top_contributions(user_data) language_match = self._check_language_match(user_data, context_language) - return TrustScore( + result = TrustScore( user_login=login, context_repo=context_repo, raw_score=merge_rate, @@ -297,6 +298,11 @@ def _score_v3( fresh_account=fresh_account, ) + if self.config.bad_egg.enabled and user_data.merged_prs: + result.suspicion_score = self._compute_suspicion_score(user_data) + + return result + # ------------------------------------------------------------------ # Fresh account advisory # ------------------------------------------------------------------ @@ -356,31 +362,13 @@ def 
_compute_suspicion_score( total_repos = len({pr.repo_name_with_owner for pr in user_data.merged_prs}) # Feature 7: median_additions (log-transformed) - additions = sorted(pr.additions for pr in user_data.merged_prs) - n = len(additions) - if n > 0: - mid = n // 2 - median_adds = ( - (additions[mid - 1] + additions[mid]) / 2.0 - if n % 2 == 0 - else float(additions[mid]) - ) - else: - median_adds = 0.0 + additions = [pr.additions for pr in user_data.merged_prs] + median_adds = statistics.median(additions) if additions else 0.0 log_median_additions = math.log1p(median_adds) # Feature 8: median_files_changed (log-transformed) - files = sorted(pr.changed_files for pr in user_data.merged_prs) - n_f = len(files) - if n_f > 0: - mid_f = n_f // 2 - median_files = ( - (files[mid_f - 1] + files[mid_f]) / 2.0 - if n_f % 2 == 0 - else float(files[mid_f]) - ) - else: - median_files = 0.0 + files = [pr.changed_files for pr in user_data.merged_prs] + median_files = statistics.median(files) if files else 0.0 log_median_files = math.log1p(median_files) # Logistic regression