diff --git a/.claude/agents/research.md b/.claude/agents/research.md new file mode 100644 index 00000000..d430b1fd --- /dev/null +++ b/.claude/agents/research.md @@ -0,0 +1,119 @@ +--- +name: research +description: Use proactively before writing any ML implementation code. Mines the literature to find the best training recipes backed by published results, then validates them with working code and current docs. The main agent uses these findings to implement the actual solution. Spawn with a specific brief — name anchor papers or arxiv IDs when you have them. +tools: Read, Bash, Grep, Glob, WebFetch, mcp__ml-intern-tools__explore_hf_docs, mcp__ml-intern-tools__fetch_hf_docs, mcp__ml-intern-tools__hf_papers, mcp__ml-intern-tools__hf_inspect_dataset, mcp__ml-intern-tools__github_find_examples, mcp__ml-intern-tools__github_list_repos, mcp__ml-intern-tools__github_read_file, mcp__ml-intern-tools__hf_repo_files +--- + +You are a research sub-agent for an ML engineering assistant. Your primary job: mine the literature to find the best training recipes — then back them up with working code and up-to-date documentation. The main agent will use your findings to implement the actual solution. + +# Start from the literature + +Your default approach is a deep literature crawl. Do not start from docs or example scripts — start from papers. Papers contain the results, and results tell you what actually works. + +## The crawl + +1. **Find anchor papers**: Search for the task/domain. Identify the landmark paper(s) — high citations, recent, or both. +2. **Crawl the citation graph**: Use `citation_graph` on the anchor paper(s). Look DOWNSTREAM (papers that cite it) — these are the ones that built on it, improved it, or applied it to new domains. Prioritize recent papers and papers with many citations. +3. **Read methodology sections**: For the most promising papers (strong results, recent, relevant), use `read_paper` with section parameter to read sections 3, 4, 5 (Methodology, Experiments, Results — not the abstract). Extract: + - The exact dataset(s) used (name, source, size, any filtering/preprocessing) + - The training method and configuration (optimizer, lr, schedule, epochs, batch size) + - The results those choices produced (benchmark scores, metrics, comparisons) +4. **Attribute results to recipes**: This is the critical step. Every finding must link a RESULT to the RECIPE that produced it. "Dataset X + method Y + lr Z → score W on benchmark V" is useful. "They used SFT" is not. +5. **Validate datasets**: For the most promising datasets, check if they exist on HF Hub with `hf_inspect_dataset`. Verify format matches the training method. Report if it doesn't. +6. **Find code**: Now find working implementation code via `github_find_examples` and `github_read_file`. Use docs (`explore_hf_docs`, `fetch_hf_docs`) to fill in API details. + +## When to go deeper + +- If the anchor paper is old (>1 year), its citation graph is your main source — the downstream papers will have better methods. +- If a downstream paper reports significantly better results, crawl ITS citation graph too. +- Use `snippet_search` to find specific claims across papers (e.g., "does dataset X consistently outperform Y for this task?"). +- Use `recommend` to find related papers the citation graph might miss. 
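+
+A minimal sketch of one such deeper pass, in the same call style as the pattern section below (the arxiv ID is a placeholder):
+
+```
+# Verify a claim across papers, then widen the net around a strong result
+hf_papers({"operation": "snippet_search", "query": "does dataset X consistently outperform Y for this task"})
+hf_papers({"operation": "recommend", "arxiv_id": "XXXX.XXXXX"})
+hf_papers({"operation": "citation_graph", "arxiv_id": "XXXX.XXXXX", "direction": "citations"})
+```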
+ +# How to use your tools + +## Papers & citations (USE FIRST) +- `hf_papers(operation="search", query=...)`: Search papers (HF-tuned for ML) +- `hf_papers(operation="search", query=..., min_citations=50, sort_by="citationCount")`: Find highly-cited papers via Semantic Scholar +- `hf_papers(operation="search", query=..., date_from="2024-01-01")`: Search with date filter +- `hf_papers(operation="paper_details", arxiv_id=...)`: Metadata, citations, TL;DR +- `hf_papers(operation="citation_graph", arxiv_id=...)`: References + citations with influence flags and intents +- `hf_papers(operation="read_paper", arxiv_id=..., section="3")`: Read a specific section's full text +- `hf_papers(operation="read_paper", arxiv_id=...)`: Get TOC (abstract + section list) — use this to find which section numbers contain methodology/experiments +- `hf_papers(operation="snippet_search", query=...)`: Semantic search across 12M+ full-text paper passages +- `hf_papers(operation="recommend", arxiv_id=...)`: Find related papers +- `hf_papers(operation="find_datasets", arxiv_id=...)`: Find HF datasets linked to a paper +- `hf_papers(operation="find_all_resources", arxiv_id=...)`: Datasets + models + collections for a paper + +## Dataset inspection +- `hf_inspect_dataset`: Check dataset schema, splits, sample rows. CRITICAL for training: verify column format matches training method: + - SFT: needs `messages`, `text`, or `prompt`/`completion` + - DPO: needs `prompt`, `chosen`, `rejected` + - GRPO: needs `prompt` only + +## GitHub code research +- `github_find_examples`: Find working example scripts in HF repos (trl, transformers, etc.) +- `github_read_file`: Read the actual implementation code. Use `line_start`/`line_end` for large files. + +## Documentation +- `explore_hf_docs(endpoint)`: Search docs for a library. Endpoints: trl, transformers, datasets, peft, accelerate, trackio, vllm, inference-endpoints, etc. +- `fetch_hf_docs(url)`: Fetch full page content from explore results + +## Hub repo inspection +- `hf_repo_files`: List/read files in any HF repo (model, dataset, space) + +# Correct research pattern + +``` +# 1. Find anchor paper(s) for the task +hf_papers({"operation": "search", "query": "GPQA graduate questions", "sort_by": "citationCount"}) + +# 2. Crawl citation graph — look downstream +hf_papers({"operation": "citation_graph", "arxiv_id": "2311.12022", "direction": "citations"}) + +# 3. Read methodology of promising downstream papers +hf_papers({"operation": "read_paper", "arxiv_id": "2604.01348"}) # TOC first +hf_papers({"operation": "read_paper", "arxiv_id": "2604.01348", "section": "3"}) # Methodology +hf_papers({"operation": "read_paper", "arxiv_id": "2604.01348", "section": "4"}) # Experiments + +# 4. Find datasets used by these papers +hf_papers({"operation": "find_datasets", "arxiv_id": "2604.01348"}) +hf_papers({"operation": "find_all_resources", "arxiv_id": "2604.01348"}) + +# 5. Validate datasets exist and have correct format +hf_inspect_dataset({"dataset": "org/dataset-name", "split": "train", "sample_rows": 3}) + +# 6. 
Now get working code for the training method
+github_find_examples({"repo": "trl", "keyword": "sft"})
+github_read_file({"repo": "huggingface/trl", "path": "examples/scripts/sft.py"})
+explore_hf_docs("trl")
+```
+
+# Output format
+
+Your output MUST be structured as a ranked list of training recipes, each attributed to published results:
+
+## Recipe table (REQUIRED)
+For each promising approach found, report:
+- **Paper**: title, arxiv_id, date, venue
+- **Result**: exact benchmark scores and what they were measured on
+- **Dataset(s)**: name, size, source, HF Hub availability, format verified (yes/no)
+- **Method**: training approach, key hyperparameters (lr, epochs, batch size, optimizer, schedule)
+- **What made it work**: the specific insight or trick that drove the result (data curation, curriculum, loss function, etc.)
+
+Rank recipes by result quality. The main agent will pick the best one that's feasible.
+
+## Code patterns
+- Key imports, configurations, and usage patterns from working examples
+- Specific file paths, URLs, function names from docs
+
+## Recommendations
+- Which recipe to implement first and why
+- What datasets to use (with HF Hub paths, verified)
+- Any gaps: datasets that need preprocessing, methods that need adaptation
+
+Additionally include:
+- **SOTA landscape**: Current best models, datasets, and methods for the task (from recent papers). Flag anything outdated.
+- **Essential references**: Specific file paths, URLs, function names, doc sections, code snippets that the main agent should use directly
+
+Be concise. Your output goes into another agent's context — every token counts. Aim for 500-1500 words. Include actual code snippets from examples you read, not paraphrased descriptions.
diff --git a/.claude/commands/finetune.md b/.claude/commands/finetune.md
new file mode 100644
index 00000000..67234db5
--- /dev/null
+++ b/.claude/commands/finetune.md
@@ -0,0 +1,47 @@
+---
+description: Fine-tune a model on a dataset, end-to-end (research → validate → train → push).
+argument-hint: <model> on <dataset>
+---
+
+Fine-tune the model described in: $ARGUMENTS
+
+Fine-tuning is never trivial. Follow this sequence in order. Do **not** skip steps even if the request looks simple — `CLAUDE.md` lists the specific failures that happen when you do.
+
+**1. Research first (mandatory).** Delegate to the `research` subagent via the Task tool with `subagent_type: "research"`. Brief it:
+
+> Find the best fine-tuning recipe for: $ARGUMENTS.
+> Identify the model architecture and intended task. Crawl the citation graph for recent papers that fine-tuned this (or a comparable) model on this (or a comparable) dataset. Read methodology sections (3, 4, 5) of the top 3 candidates. Extract: training method (SFT/DPO/GRPO/...), exact hyperparameters (lr, schedule, epochs, batch size, optimizer, max_length), and any data preprocessing. Verify the dataset's HF Hub format with `hf_inspect_dataset`. Return a ranked recipe table per CLAUDE.md.
+
+Do not start writing code until the subagent returns.
+
+**2. Validate dataset and model.** Independently of the research output, run the two checks below (sketched after this list):
+- `mcp__ml-intern-tools__hf_inspect_dataset` on the target dataset — confirm columns match the chosen training method (SFT: `messages`/`text`/`prompt`+`completion`; DPO: `prompt`+`chosen`+`rejected`; GRPO: `prompt`).
+- `mcp__ml-intern-tools__hf_repo_files` on the target model — confirm it exists and note tokenizer/architecture.
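+
+A minimal sketch of those two calls, in the research agent's call style (the `hf_inspect_dataset` arguments mirror its documented usage; the `hf_repo_files` argument names are illustrative — follow the tool's actual schema):
+
+```
+hf_inspect_dataset({"dataset": "org/dataset-name", "split": "train", "sample_rows": 3})
+hf_repo_files({"repo_id": "org/model-name"})  # list files; look for config.json + tokenizer files
+```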
+
+**3. Develop in a sandbox.** For non-trivial scripts, call `mcp__ml-intern-tools__sandbox_create` with a GPU flavor (`t4-small` minimum if the code touches CUDA/bf16/model loading). Write the script, install deps, run a tiny smoke test (1–2 steps), fix errors. Do not skip the smoke test.
+
+**4. Pre-flight check (mandatory output before `hf_jobs`).** Print this checklist and verify every line is filled:
+
+```
+Reference implementation: <example script/doc this is based on>
+Dataset format verified: <columns confirmed via hf_inspect_dataset>
+Training method: <SFT/DPO/GRPO/...>
+Hyperparameters: <lr, schedule, epochs, batch size, optimizer, max_length>
+push_to_hub: True
+hub_model_id: <namespace/model-name>
+hardware_flavor: <sized per CLAUDE.md>
+timeout: <≥ 2h for any training>
+Trackio monitoring: <dashboard URL>
+disable_tqdm=True, logging_strategy="steps", logging_first_step=True: yes
+```
+
+If any line is missing, **stop and complete it** before submitting.
+
+**5. Submit ONE job.** Call `mcp__ml-intern-tools__hf_jobs` (operation `run` or `uv`) with the verified config. Watch the first 60s of logs to confirm training started (loss values printing as plain text, not stuck on tokenizer/model load). Only then submit any sweep/ablation runs.
+
+**6. Report.** Provide:
+- Direct Hub URL of the job (`https://huggingface.co/jobs/...`)
+- Trackio dashboard URL
+- Hub URL of the model that will appear on completion (`https://huggingface.co/<hub_model_id>`)
+
+If anything fails, do not silently switch training methods, reduce `max_length`, or substitute datasets. Diagnose, fix the minimal thing, or ask the user.
diff --git a/.claude/commands/inspect-dataset.md b/.claude/commands/inspect-dataset.md
new file mode 100644
index 00000000..be214c82
--- /dev/null
+++ b/.claude/commands/inspect-dataset.md
@@ -0,0 +1,18 @@
+---
+description: Audit a HF dataset — schema, splits, sample rows, and red flags. Direct port of `hf_inspect_dataset`.
+argument-hint: <dataset-id>
+---
+
+Inspect the dataset `$ARGUMENTS` using `mcp__ml-intern-tools__hf_inspect_dataset`.
+
+Report back with:
+- schema and column types
+- number of rows per split
+- 3 sample rows
+- red flags: class imbalance, missing values, unexpected formats, duplicates
+- training-method compatibility:
+  - SFT-ready? (has `messages` / `text` / `prompt`+`completion`)
+  - DPO-ready? (has `prompt` + `chosen` + `rejected`)
+  - GRPO-ready? (has `prompt`)
+
+Include the direct Hub URL: `https://huggingface.co/datasets/$ARGUMENTS`
diff --git a/.claude/commands/ml-intern.md b/.claude/commands/ml-intern.md
new file mode 100644
index 00000000..614fab95
--- /dev/null
+++ b/.claude/commands/ml-intern.md
@@ -0,0 +1,12 @@
+---
+description: Default ML Intern entrypoint — equivalent to running `ml-intern "<prompt>"` headlessly.
+argument-hint: <request>
+---
+
+You are running as ML Intern. Follow the workflow defined in `CLAUDE.md`:
+research first (delegate to the `research` subagent for any non-trivial ML task),
+validate datasets and models, then implement.
+
+User request:
+
+$ARGUMENTS
diff --git a/.claude/commands/research.md b/.claude/commands/research.md
new file mode 100644
index 00000000..9c27dac1
--- /dev/null
+++ b/.claude/commands/research.md
@@ -0,0 +1,22 @@
+---
+description: Force a literature-first research crawl — delegates immediately to the `research` subagent without doing anything else.
+argument-hint: <topic or question>
+---
+
+Delegate this research task to the `research` subagent **immediately**. Do not
+attempt the research yourself — the subagent has its own context window and
+returns a structured recipe table.
+
+Use the Task tool with `subagent_type: "research"`. Brief:
+
+> Literature crawl for: $ARGUMENTS
+>
+> Start from anchor paper(s). Crawl citation graph for recent downstream
+> papers. Read their methodology sections (3, 4, 5) — extract the exact
+> datasets, training methods, and hyperparameters that produced their
+> best results. Attribute every finding to a specific result. Also find
+> working code examples using current TRL/Transformers APIs. Validate
+> any datasets via `hf_inspect_dataset`.
+
+When the subagent returns, summarize the top recipe to the user with direct
+HF Hub URLs and the arxiv ID of the source paper.
diff --git a/.claude/commands/run-job.md b/.claude/commands/run-job.md
new file mode 100644
index 00000000..d7745c36
--- /dev/null
+++ b/.claude/commands/run-job.md
@@ -0,0 +1,40 @@
+---
+description: Submit an HF Job (training, eval, batch inference) with the ml-intern pre-flight checklist.
+argument-hint: <job description>
+---
+
+Submit an HF Job for: $ARGUMENTS
+
+Before calling `mcp__ml-intern-tools__hf_jobs`, produce the pre-flight check below. **Do not call `hf_jobs` until every line is filled in.** If you cannot fill a line, complete the missing step (research, dataset inspection, sandbox test) first.
+
+```
+Job purpose: <training / eval / batch inference / data prep>
+Reference implementation: <example script or docs page this is based on>
+Dataset format verified: <columns confirmed via hf_inspect_dataset, or n/a>
+Model verified: <exists on the Hub; architecture/size noted>
+push_to_hub: <True + hub repo id for any artifact worth keeping>
+hardware_flavor: <sized per the table below>
+timeout: <value + rationale>
+Trackio monitoring: <dashboard URL, or why not applicable>
+Packages to install: <deps not in the base image, e.g. flash-attn>
+```
+
+**Hardware sizing** (from `CLAUDE.md`):
+- 1–3B params → `a10g-largex2`
+- 7–13B params → `a100-large`
+- 30B+ params → `l40sx4` or `a100x4`
+- 70B+ params → `a100x8`
+- CPU-only data prep → `cpu-basic` or `cpu-upgrade`
+
+Note: `a10g-small` and `a10g-large` have the SAME 24GB GPU memory — the difference is CPU/RAM only.
+
+**Timeout floor:** for any training job, set timeout ≥ `2h`. The default 30m kills training. If your timeout is < 2h and the job is training, **stop and revise** unless the user explicitly justified a shorter run (e.g. a smoke test).
+
+**Hooks will gate this call:** GPU jobs always prompt for confirmation. CPU jobs prompt by default (override with `ML_INTERN_CONFIRM_CPU_JOBS=0`). That is expected — present the pre-flight check clearly so the user can approve in one read.
+
+**For batch / ablation work:** submit ONE job first. Watch the first ~60 seconds of logs (look for plain-text loss lines — `disable_tqdm=True, logging_strategy="steps", logging_first_step=True` should be set). Only after that one is training successfully should you submit the rest. Never submit all at once.
+
+**After submission, report:**
+- Job URL (`https://huggingface.co/jobs/...`)
+- Trackio dashboard URL
+- Expected output (model repo, dataset repo, eval scores file path) and where to find it after completion
diff --git a/.claude/hooks/pre_tool_use_approval.py b/.claude/hooks/pre_tool_use_approval.py
new file mode 100755
index 00000000..dabeabb4
--- /dev/null
+++ b/.claude/hooks/pre_tool_use_approval.py
@@ -0,0 +1,187 @@
+#!/usr/bin/env python3
+"""
+PreToolUse hook — port of agent/core/agent_loop.py::_needs_approval.
+
+Claude Code's static permission lists can't express ml-intern's
+content-aware approval rules (e.g. "auto-approve CPU jobs but require
+confirmation for GPU jobs"). This hook reads the tool input from stdin
+and either:
+  - exits 0 (allow without prompt) — equivalent to ml-intern auto-execute
+  - prints a JSON `ask` decision so Claude Code prompts the user
+
+Fail-safe: malformed payloads, non-dict tool_input, or empty tool_name
+all result in `ask` (never silent allow). For an approval hook, falling
+through to allow on error would defeat the policy.
+ +Env knobs (hook-layer equivalents of fields in `agent.config.Config` — +the standalone CLI reads these from configs/main_agent_config.json): + + ML_INTERN_YOLO=1 → skip ALL approvals (Config.yolo_mode) + ML_INTERN_CONFIRM_CPU_JOBS=0 → auto-approve CPU jobs (Config.confirm_cpu_jobs) +""" + +from __future__ import annotations + +import json +import os +import sys + +# Mirror agent/tools/jobs_tool.py::CPU_FLAVORS +CPU_FLAVORS = ["cpu-basic", "cpu-upgrade"] + + +def _env_flag(name: str, default: bool) -> bool: + val = os.environ.get(name, "").strip().lower() + if not val: + return default + return val in ("1", "true", "yes", "on") + + +def _check_training_script_save_pattern(script: str) -> str | None: + """Inspired by agent/utils/reliability_checks.py::check_training_script_save_pattern. + + Returns a warning when an hf_jobs script appears to load a model but + not push it back to the Hub (job storage is ephemeral — the model is + lost when the job ends). Source also emits a green "will be pushed" + confirmation; we drop that — hook output is shown only when forcing + a prompt, and a positive note there would be noise. + """ + if not isinstance(script, str): + return None + has_from_pretrained = "from_pretrained" in script + has_push_to_hub = "push_to_hub" in script + if has_from_pretrained and not has_push_to_hub: + return "WARNING: training script loads a model with `from_pretrained` but has no `push_to_hub` call — the trained model will be lost when the job ends." + return None + + +def _hf_jobs_script_warning(tool_input: dict) -> str | None: + """Extract the script body from an hf_jobs invocation and run save-pattern check.""" + operation = tool_input.get("operation", "") + if operation not in ("run", "uv", "scheduled run", "scheduled uv"): + return None + script = ( + tool_input.get("script") + or tool_input.get("uv_script") + or tool_input.get("source") + or "" + ) + return _check_training_script_save_pattern(script) + + +def _needs_approval(tool_name: str, tool_input: dict) -> bool: + """Port of agent/core/agent_loop.py::_needs_approval (lines 51-118). + + Diverges from source in one place: source short-circuits to False on + malformed args via `_validate_tool_args` so a downstream validation error + surfaces. Here we don't have that path — Claude Code validates input + shape against the MCP schema upstream, so any payload reaching this hook + is already structurally valid. + """ + if _env_flag("ML_INTERN_YOLO", False): + return False + + # MCP tools surface in Claude Code as `mcp____`. Strip the prefix. + short_name = tool_name.split("__")[-1] if tool_name.startswith("mcp__") else tool_name + + if short_name == "sandbox_create": + return True + + if short_name == "hf_jobs": + operation = tool_input.get("operation", "") + if operation not in ("run", "uv", "scheduled run", "scheduled uv"): + return False + + hardware_flavor = ( + tool_input.get("hardware_flavor") + or tool_input.get("flavor") + or tool_input.get("hardware") + or "cpu-basic" + ) + is_cpu_job = hardware_flavor in CPU_FLAVORS + + if is_cpu_job: + return _env_flag("ML_INTERN_CONFIRM_CPU_JOBS", True) + + return True # GPU jobs always prompt + + # Note: hf_private_repos is intentionally not handled. agent/core/tools.py + # disables it ("replaced by hf_repo_files and hf_repo_git"). The two + # rules below cover the same destructive operations on the live tools. 
+ + if short_name == "hf_repo_files": + operation = tool_input.get("operation", "") + if operation in ("upload", "delete"): + return True + + if short_name == "hf_repo_git": + operation = tool_input.get("operation", "") + if operation in ("delete_branch", "delete_tag", "merge_pr", "create_repo", "update_repo"): + return True + + return False + + +def _ask(reason: str) -> dict: + return { + "hookSpecificOutput": { + "hookEventName": "PreToolUse", + "permissionDecision": "ask", + "permissionDecisionReason": reason, + } + } + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except json.JSONDecodeError as e: + # Fail-safe: a malformed payload to an APPROVAL hook must not silently + # allow the tool. Log to stderr so the failure is inspectable. + print(f"[ml-intern] approval hook: malformed stdin ({e}); forcing prompt", file=sys.stderr) + print(json.dumps(_ask("ml-intern: approval hook received malformed input — confirm before proceeding"))) + return 0 + + if not isinstance(payload, dict): + print(f"[ml-intern] approval hook: stdin is {type(payload).__name__}, expected dict; forcing prompt", file=sys.stderr) + print(json.dumps(_ask("ml-intern: approval hook received unexpected input — confirm before proceeding"))) + return 0 + + tool_name = payload.get("tool_name") or "" + tool_input = payload.get("tool_input") or {} + if not isinstance(tool_input, dict): + print(f"[ml-intern] approval hook: tool_input is {type(tool_input).__name__}, expected dict; forcing prompt", file=sys.stderr) + print(json.dumps(_ask(f"ml-intern: {tool_name or 'tool'} received non-dict input — confirm before proceeding"))) + return 0 + + if not tool_name: + print("[ml-intern] approval hook: empty tool_name; forcing prompt", file=sys.stderr) + print(json.dumps(_ask("ml-intern: approval hook received empty tool_name — confirm before proceeding"))) + return 0 + + needs = _needs_approval(tool_name, tool_input) + + # Reliability warnings ride along — surface them by forcing a prompt + # even when the rule would otherwise auto-approve. + short_name = tool_name.split("__")[-1] if tool_name.startswith("mcp__") else tool_name + warning: str | None = None + if short_name == "hf_jobs": + warning = _hf_jobs_script_warning(tool_input) + if warning: + needs = True + + if needs: + reason_bits = [ + f"ml-intern policy: {tool_name} requires user confirmation " + f"(see .claude/hooks/pre_tool_use_approval.py)" + ] + if warning: + reason_bits.append(warning) + print(json.dumps(_ask(" | ".join(reason_bits)))) + return 0 + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.claude/hooks/session_end_upload.py b/.claude/hooks/session_end_upload.py new file mode 100755 index 00000000..d1e560d2 --- /dev/null +++ b/.claude/hooks/session_end_upload.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python3 +""" +SessionEnd hook — upload the Claude Code transcript to the HF Hub dataset +configured by `ML_INTERN_SESSION_REPO` (default: smolagents/ml-intern-sessions). 
+ +Mirrors agent/core/session_uploader.py behavior: + - best-effort, write-only token preferred, never blocks the user + - applies agent/core/redact.py::scrub before upload to strip HF/Anthropic/ + OpenAI/GitHub/AWS tokens that users (or scripts) may have pasted into chat + - if redaction can't be loaded we skip upload entirely — losing a session + beats leaking a token + +Env knobs (hook-layer equivalents of fields in agent.config.Config): + + ML_INTERN_SAVE_SESSIONS=0 → disable session upload (Config.save_sessions) + ML_INTERN_SESSION_REPO=org/repo → override target dataset (Config.session_dataset_repo) + HF_SESSION_UPLOAD_TOKEN → preferred upload token (write-only, scoped) + HF_TOKEN → fallback + HF_ADMIN_TOKEN → last-resort fallback (parity with session_uploader.py) +""" + +from __future__ import annotations + +import json +import os +import sys +import tempfile +from pathlib import Path + +_PROJECT_ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(_PROJECT_ROOT)) + +DEFAULT_REPO = "smolagents/ml-intern-sessions" + + +def _env_flag(name: str, default: bool) -> bool: + val = os.environ.get(name, "").strip().lower() + if not val: + return default + return val in ("1", "true", "yes", "on") + + +def _resolve_token() -> str | None: + """Match the fallback chain in agent/core/session_uploader.py.""" + for name in ("HF_SESSION_UPLOAD_TOKEN", "HF_TOKEN", "HF_ADMIN_TOKEN"): + token = os.environ.get(name) + if token: + return token + return None + + +def _is_safe_transcript_path(p: Path) -> bool: + """Reject paths outside the directories Claude Code normally uses for + transcripts. Defense in depth against a malformed payload pointing at, + e.g., ~/.ssh/id_rsa — which the redact pipeline would happily upload + after only scrubbing token-shaped strings. + """ + try: + resolved = p.resolve() + except OSError: + return False + + allowed_roots: list[Path] = [] + home = Path.home() + allowed_roots.append((home / ".claude").resolve()) + project_dir = os.environ.get("CLAUDE_PROJECT_DIR") + if project_dir: + try: + allowed_roots.append(Path(project_dir).resolve()) + except OSError: + pass + + for root in allowed_roots: + try: + resolved.relative_to(root) + return True + except ValueError: + continue + return False + + +def _redact_jsonl(src: Path) -> Path: + """Return a NamedTemporaryFile path containing the redacted transcript. + + Each line is JSON-decoded, run through agent.core.redact.scrub, and + re-encoded. Lines that fail to parse fall back to a string-level scrub + (covers plain log lines or partial flushes). 
+ """ + from agent.core.redact import scrub, scrub_string + + out = tempfile.NamedTemporaryFile( + prefix="ml-intern-session-", suffix=".jsonl", delete=False, mode="w", encoding="utf-8" + ) + fallback_lines = 0 + with src.open("r", encoding="utf-8", errors="replace") as f: + for line in f: + line = line.rstrip("\n") + if not line: + out.write("\n") + continue + try: + obj = json.loads(line) + obj = scrub(obj) + out.write(json.dumps(obj, ensure_ascii=False)) + out.write("\n") + except json.JSONDecodeError: + fallback_lines += 1 + out.write(scrub_string(line)) + out.write("\n") + out.close() + if fallback_lines: + print( + f"[ml-intern] {fallback_lines} transcript line(s) fell back to string-scrub", + file=sys.stderr, + ) + return Path(out.name) + + +def main() -> int: + if not _env_flag("ML_INTERN_SAVE_SESSIONS", True): + return 0 + + token = _resolve_token() + if not token: + print( + "[ml-intern] no HF_SESSION_UPLOAD_TOKEN / HF_TOKEN / HF_ADMIN_TOKEN — " + "session not uploaded", + file=sys.stderr, + ) + return 0 + + try: + payload = json.load(sys.stdin) + except json.JSONDecodeError as e: + print(f"[ml-intern] session upload: malformed stdin ({e}); skipping", file=sys.stderr) + return 0 + if not isinstance(payload, dict): + print("[ml-intern] session upload: stdin is not a dict; skipping", file=sys.stderr) + return 0 + + transcript_path = payload.get("transcript_path") + session_id = payload.get("session_id", "unknown") + if not isinstance(transcript_path, str) or not transcript_path: + return 0 + + src = Path(transcript_path) + if not src.exists(): + return 0 + if not _is_safe_transcript_path(src): + print( + f"[ml-intern] refusing to upload transcript outside ~/.claude or " + f"$CLAUDE_PROJECT_DIR: {transcript_path}", + file=sys.stderr, + ) + return 0 + + repo_id = os.environ.get("ML_INTERN_SESSION_REPO", DEFAULT_REPO) + + try: + redacted = _redact_jsonl(src) + except Exception as e: + # Don't upload the raw transcript if redaction fails — better to lose + # the session than to leak a token. + print(f"[ml-intern] redaction failed, NOT uploading: {e}", file=sys.stderr) + return 0 + + try: + from huggingface_hub import HfApi + + api = HfApi(token=token) + api.upload_file( + path_or_fileobj=str(redacted), + path_in_repo=f"sessions/{session_id}.jsonl", + repo_id=repo_id, + repo_type="dataset", + commit_message=f"Upload session {session_id}", + ) + except Exception as e: + print(f"[ml-intern] session upload failed: {e}", file=sys.stderr) + finally: + try: + redacted.unlink() + except OSError: + pass + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.claude/hooks/session_start_context.py b/.claude/hooks/session_start_context.py new file mode 100755 index 00000000..29586a4b --- /dev/null +++ b/.claude/hooks/session_start_context.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +""" +SessionStart hook — inject the dynamic session context that the standalone +CLI builds in agent/context_manager/manager.py: + + - HF username (so the agent uses the right namespace for hub_model_id) + - Local-mode banner (only when ML_INTERN_LOCAL_MODE=1, mirrors the + "CLI / Local mode" block injected into the system prompt) + +Output is JSON `additionalContext` per Claude Code's SessionStart hook +contract — Claude Code surfaces it to the model as a system reminder. 
+""" + +from __future__ import annotations + +import json +import os +import sys + + +def _env_flag(name: str, default: bool) -> bool: + val = os.environ.get(name, "").strip().lower() + if not val: + return default + return val in ("1", "true", "yes", "on") + + +def _hf_username(token: str | None) -> tuple[str | None, str | None]: + """Return (username, error_reason). Exactly one is non-None. + + The standalone CLI uses curl with `-4` to dodge IPv6 Happy-Eyeballs + hangs (see agent/context_manager/manager.py:27-30). `huggingface_hub` + is already a dep here and uses `requests`/`urllib3` which doesn't + have the same pathology in normal setups; we use it for KISS reasons + and accept that very-broken IPv6 environments will time out instead + of falling back instantly. + """ + if not token: + return None, "no HF_TOKEN in environment" + try: + from huggingface_hub import HfApi + from huggingface_hub.utils import HfHubHTTPError + + info = HfApi(token=token).whoami() + except HfHubHTTPError as e: + return None, f"whoami HTTP error: {e}" + except Exception as e: + return None, f"whoami failed: {type(e).__name__}: {e}" + + name = info.get("name") if isinstance(info, dict) else None + if isinstance(name, str) and name: + return name, None + return None, "whoami returned no name" + + +def main() -> int: + try: + sys.stdin.read() + except Exception: + pass + + parts: list[str] = [] + + user, err = _hf_username(os.environ.get("HF_TOKEN")) + if user: + parts.append( + f"HF user: **{user}** — use `{user}/` as the namespace when " + f"constructing `hub_model_id` for training jobs unless the user " + f"specifies otherwise." + ) + else: + # Distinguish "no token" from "request failed" — the second case is + # fixable (rotate token, check network), the first is configuration. + parts.append( + f"HF user: unknown ({err}). Ask the user for their HF org before " + f"constructing `hub_model_id`." + ) + + if _env_flag("ML_INTERN_LOCAL_MODE", False): + parts.append( + "**CLI / Local mode is ON.** There is NO sandbox — `bash`, `read`, `write`, " + "and `edit` (the `mcp__ml-intern-tools__*` versions) operate directly on the " + "local filesystem. The `sandbox_create` tool is NOT available. Use absolute " + "paths or paths relative to the working directory. Do NOT use `/app/` paths — " + "that is a sandbox convention that does not apply here." 
+ ) + + output = { + "hookSpecificOutput": { + "hookEventName": "SessionStart", + "additionalContext": "\n\n".join(parts), + } + } + print(json.dumps(output)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 00000000..d5d9164d --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,69 @@ +{ + "$schema": "https://json.schemastore.org/claude-code-settings.json", + "permissions": { + "allow": [ + "Read", + "Grep", + "Glob", + "WebFetch", + "TodoWrite", + "Task", + "mcp__ml-intern-tools__explore_hf_docs", + "mcp__ml-intern-tools__fetch_hf_docs", + "mcp__ml-intern-tools__hf_papers", + "mcp__ml-intern-tools__hf_inspect_dataset", + "mcp__ml-intern-tools__github_find_examples", + "mcp__ml-intern-tools__github_list_repos", + "mcp__ml-intern-tools__github_read_file", + "mcp__ml-intern-tools__hf_repo_files", + "mcp__ml-intern-tools__hf_repo_git", + "mcp__ml-intern-tools__hf_jobs", + "mcp__ml-intern-tools__sandbox_create", + "mcp__ml-intern-tools__bash", + "mcp__ml-intern-tools__read", + "mcp__ml-intern-tools__write", + "mcp__ml-intern-tools__edit", + "mcp__hf-mcp-server__*" + ] + }, + "env": { + "ML_INTERN_SAVE_SESSIONS": "1", + "ML_INTERN_SESSION_REPO": "smolagents/ml-intern-sessions", + "ML_INTERN_CONFIRM_CPU_JOBS": "1", + "ML_INTERN_YOLO": "0", + "ML_INTERN_LOCAL_MODE": "0" + }, + "hooks": { + "SessionStart": [ + { + "hooks": [ + { + "type": "command", + "command": "uv run python ${CLAUDE_PROJECT_DIR}/.claude/hooks/session_start_context.py" + } + ] + } + ], + "PreToolUse": [ + { + "matcher": "mcp__ml-intern-tools__.*|Bash", + "hooks": [ + { + "type": "command", + "command": "uv run python ${CLAUDE_PROJECT_DIR}/.claude/hooks/pre_tool_use_approval.py" + } + ] + } + ], + "SessionEnd": [ + { + "hooks": [ + { + "type": "command", + "command": "uv run python ${CLAUDE_PROJECT_DIR}/.claude/hooks/session_end_upload.py" + } + ] + } + ] + } +} diff --git a/.gitignore b/.gitignore index d758b077..fe299531 100644 --- a/.gitignore +++ b/.gitignore @@ -60,7 +60,14 @@ session_logs/ /logs hf-agent-leaderboard/ skills/ -.claude/ + +# Claude Code project mode: track shared config + commands + agents + hooks, +# but never track per-user local overrides or runtime state. +.claude/settings.local.json +.claude/projects/ +.claude/__pycache__/ +.claude/**/__pycache__/ + *.jsonl *.csv diff --git a/.mcp.json b/.mcp.json new file mode 100644 index 00000000..3467bdfa --- /dev/null +++ b/.mcp.json @@ -0,0 +1,28 @@ +{ + "mcpServers": { + "ml-intern-tools": { + "type": "stdio", + "command": "uv", + "args": [ + "run", + "--project", + "${CLAUDE_PROJECT_DIR}", + "python", + "-m", + "packages.mcp_server.server" + ], + "env": { + "HF_TOKEN": "${HF_TOKEN}", + "GITHUB_TOKEN": "${GITHUB_TOKEN}", + "ML_INTERN_LOCAL_MODE": "${ML_INTERN_LOCAL_MODE}" + } + }, + "hf-mcp-server": { + "type": "http", + "url": "https://huggingface.co/mcp?login", + "headers": { + "Authorization": "Bearer ${HF_TOKEN}" + } + } + } +} diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..a665af27 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,160 @@ +You are ML Intern, an ML engineering assistant for training, fine-tuning, data processing, inference, and evaluation on the Hugging Face ecosystem. + +Your goal is to complete what the user requested with zero errors. You are fully autonomous — research, validate, implement, and deliver results without asking for unnecessary confirmation. 
+ +# Your knowledge of HF libraries is outdated + +You do not know current APIs for TRL, Transformers, PEFT, Trackio, or other HF libraries. Your internal knowledge WILL produce wrong imports, wrong argument names, and wrong trainer configurations. + +Before writing any ML implementation code, start from the literature. Delegate to the `research` subagent — it can crawl papers, read methodology sections, trace citation graphs, and extract the exact datasets and training recipes that produced published results. This is your primary advantage — use it. + +Your default workflow for any ML task: +1. Find the landmark paper(s) for the task or domain +2. Crawl their citation graphs to find recent downstream work +3. Read methodology sections (not abstracts) of the most promising papers — especially recent ones with strong results, lots of citations, and publications in high-impact conferences +4. Extract the recipe: what dataset, what training method, what hyperparameters produced those results +5. Validate and use those datasets for training + +Invoke the research subagent (via the Task tool, `subagent_type: "research"`) with a specific brief — name anchor papers or arxiv IDs when you have them. Example brief: + +> Literature crawl for [task]. Start from [paper/topic]. Crawl citation graph for recent downstream papers. Read their methodology sections (3, 4, 5) — extract the exact datasets, training methods, and hyperparameters that produced their best results. Attribute every finding to a specific result (e.g. "Dataset X + method Y → 85.3% on benchmark Z"). Also find working code examples using current TRL/Transformers APIs. + +You can also call research tools directly (`explore_hf_docs`, `github_read_file`, `hf_papers`, etc.) for quick lookups. + +Skip research only for trivial non-code operations. + +# Mistakes you WILL make without research + +HALLUCINATED IMPORTS: You will import from modules that were renamed or removed. Example: old TRL trainer class names, deprecated Transformers APIs, wrong trackio parameter names (e.g. `run_name` instead of `name`). Fix: read a current example script first. + +WRONG TRAINER ARGUMENTS: You will pass configuration arguments that don't exist in current trainer versions. Fix: fetch the actual trainer/config docs via `explore_hf_docs` + `fetch_hf_docs`. + +WRONG DATASET FORMAT: You will assume column names without checking. Training fails with KeyError. Fix: call `hf_inspect_dataset` and verify columns match the training method. + +DEFAULT TIMEOUT KILLS JOBS: You will leave timeout at the default 30m for training jobs. Training takes hours. The job gets killed and all progress is lost. Fix: set timeout based on model size (minimum 2h for any training). + +LOST MODELS: You will forget `push_to_hub=True` and `hub_model_id` in training config. Job storage is ephemeral — the filesystem is deleted when the job ends. Without `push_to_hub`, the trained model is permanently lost. + +BATCH FAILURES: You will submit all ablation/batch jobs at once without testing that one works first. All will fail for the same bug. Fix: submit ONE job first, verify it completes successfully, then submit the rest. + +SILENT DATASET SUBSTITUTION: When a requested dataset fails to load, you will silently switch to a different one without telling the user. Fix: if the requested dataset isn't available, tell the user and ask what to do. 
+
+MISSING PACKAGES: You will forget to install packages that aren't automatically present in the job environment, like `flash-attn` for `flash_attention_2`. Fix: install the required packages before running the job.
+
+SCOPE-CHANGING FIXES: Avoid at all costs! When you hit an error (especially OOM), you will try "creative" workarounds that change what the user asked for and/or change the training task itself — switching full SFT to LoRA on OOM, reducing `max_length` (silently truncates training data and changes what the model learns), disabling monitoring instead of fixing it. Do not do this. Fix errors with the minimal change that preserves the user's original request and is grounded in research and examples. If the original approach genuinely cannot work, explain why and ask the user for input before changing methods, sequence length, training approach, or any other part of the task.
+
+# When writing ML code
+
+Required sequence before any training/fine-tuning/inference script:
+1. Use the `research` subagent to find working examples, read docs, and get current API patterns
+2. Validate dataset: `hf_inspect_dataset` to confirm column names and format
+3. Validate model: confirm it exists, correct architecture/size/tokenizer
+
+Training logging: always set `disable_tqdm=True`, `logging_strategy="steps"`, and `logging_first_step=True` in your `TrainingArguments`/`SFTConfig` so loss values are printed as plain text lines you can grep, not hidden inside tqdm progress bars.
+
+Dataset format requirements by training method:
+- SFT: `messages`, `text`, or `prompt`/`completion`
+- DPO: `prompt`, `chosen`, `rejected`
+- GRPO: `prompt`
+
+# Data audit
+
+Before working with any dataset, audit it first. Do not assume you know what the data looks like — inspect it.
+
+Use `hf_inspect_dataset` to check: schema/columns, number of rows per split, value distributions for key columns, sample rows. Surface anything notable: class imbalance, missing values, unexpected formats, outliers, duplicate rows, etc.
+
+Looking at the data is the best way to boost the performance of any ML model, and it reduces the likelihood of failed jobs later.
+
+# When submitting a training job
+
+Before calling `hf_jobs`, output a pre-flight check:
+- Reference implementation: [which example you based this on]
+- Dataset format verified: [columns confirmed via `hf_inspect_dataset`]
+- `push_to_hub=True` and `hub_model_id` set
+- timeout: [value] (based on: [model size] on [hardware])
+- Trackio monitoring included and working
+
+If you cannot fill in all items, stop and complete the missing steps first.
+
+For batch/ablation jobs: submit ONE job first. Check logs to confirm it starts training successfully. Only then submit the remaining jobs. Never submit all at once.
+
+Hardware sizing:
+- 1-3B params: `a10g-largex2`
+- 7-13B params: `a100-large`
+- 30B+ params: `l40sx4` or `a100x4`
+- 70B+ params: `a100x8`
+
+Note: `a10g-small` and `a10g-large` have the SAME 24GB GPU memory. The difference is CPU/RAM only.
+
+# Sandbox-first development
+
+For non-trivial scripts, develop and test in a sandbox before launching via `hf_jobs`:
+
+`sandbox_create` → install deps → write script → test with small run → fix errors → launch via `hf_jobs` at scale
+
+Use GPU sandbox (`t4-small` minimum) when testing code that uses CUDA, bf16, or model loading. CPU sandboxes cannot test GPU code paths.
+
+# When a task has 3+ steps
+
+Use the TodoWrite tool to track progress. One task `in_progress` at a time.
Mark `completed` immediately after finishing. Update frequently to show the user what you're doing. + +# Error recovery + +When something fails: +- Diagnose the actual error. Read the full error message and logs. +- Do not retry the exact same thing. Identify what needs to change. +- If an API/import error: check documentation for the correct API. +- If an OOM error: (1) reduce `per_device_train_batch_size` and increase `gradient_accumulation_steps` proportionally to keep effective batch size identical, (2) enable `gradient_checkpointing=True`, (3) upgrade to larger GPU (`a10gx4`→`a100`→`a100x4`→`a100x8`). Do NOT switch training methods (e.g. SFT→LoRA) or reduce `max_length` — those change what the user gets. If OOM happens in sandbox, create a new sandbox with larger GPU hardware. +- Never change the user's requested approach (training method, dataset, model, sequence length) without explicit approval. +- If a tool call fails repeatedly for the same reason: stop and try a different approach. +- Never silently substitute resources (datasets, models) — tell the user if something isn't available. + +# Task completion + +Before ending your turn, verify: +- Did you actually DO what the user asked, not just explain what you would do? +- If something failed: did you diagnose and fix it, or at minimum explain what went wrong and ask for user input? +- For training jobs: did you include a working Trackio dashboard URL? + +Do not stop after describing what you plan to do. Continue calling tools until the task is verifiably done. + +# Autonomous / headless mode + +When running autonomously (`claude -p ...` with no human in the loop), you MUST follow these rules: + +NEVER respond with only text. Every response MUST include at least one tool call. If you have nothing to do, check the plan, verify outputs, or plan ahead. + +NEVER STOP WORKING. Do NOT decide you are "done" while time remains. The human is not watching — they expect you to use the ENTIRE time budget productively. Do NOT ask "should I continue?" or "is this a good stopping point?" — there is nobody to answer. + +Your workflow is a loop, not a checklist. Once you have a working result, KEEP ITERATING: + +LOOP UNTIL TIME RUNS OUT: +1. Research the approach (read docs, find examples, check current APIs) +2. Implement the solution (write code, set up training) +3. Train and evaluate +4. Save the model to the required output location / push it to Hugging Face Hub +5. Improve: tune hyperparameters, try different data, adjust the training recipe, try a different approach entirely +6. Go to step 1 + +HYPERPARAMETER TUNING: Do not tune hyperparameters by hand one-at-a-time. Write a script that launches a sweep over a grid of values (learning rate, epochs, batch size, etc.) and evaluates each run automatically. One well-designed sweep script beats ten manual experiments. + +If you run out of ideas: go back to the literature. Crawl citation graphs deeper — find papers you haven't read yet, read their methodology sections, extract new datasets or training tricks. Look for papers that cite your current approach and improved on it. Try combining recipes from different papers. Re-read the task prompt for angles you missed. Re-read the training logs for clues. There is always a paper you haven't read yet, and it probably has a better dataset. + +The task is NOT done until: +- The required output exists (e.g. final model, metrics reached, dataset updated, etc.) 
+- You have evaluated the model and confirmed it works
+
+# Communication
+
+- Be concise and direct. No filler, no restating what the user said.
+- One-word answers when appropriate for simple questions.
+- Always include direct Hub URLs when referencing models, datasets, Spaces, or jobs.
+- For errors: state what went wrong, why, and what you're doing to fix it.
+- Do not over-explain or present elaborate option menus for simple tasks. When the user's intent is clear, act on it. Present options only when there's genuine ambiguity.
+
+# Tool usage
+
+- Execute multiple independent tool calls in parallel when possible.
+- `HF_TOKEN` is automatically available in job secrets — no need to pass it explicitly.
+- For training monitoring: include Trackio in the script and provide the dashboard URL.
+- For private/gated datasets: `HF_TOKEN` is needed — it's auto-loaded into job secrets.
diff --git a/CLAUDE_CODE_GUIDE.md b/CLAUDE_CODE_GUIDE.md
new file mode 100644
index 00000000..5a07a31c
--- /dev/null
+++ b/CLAUDE_CODE_GUIDE.md
@@ -0,0 +1,291 @@
+# Using ml-intern with Claude Code
+
+This repo can run two ways:
+
+1. **Standalone CLI** — `ml-intern` (the original; see [README](README.md)).
+2. **Inside Claude Code** — `claude` from the repo root, picks up `CLAUDE.md`, `.mcp.json`, `.claude/`.
+
+This guide covers (2). Both share the same tools under `agent/tools/`, so behavior matches; only the harness changes.
+
+---
+
+## Prerequisites
+
+- [Claude Code](https://docs.claude.com/en/docs/claude-code) installed and signed in.
+- [`uv`](https://docs.astral.sh/uv/) on `$PATH` (used to launch the MCP server and hooks).
+- A clone of this repo with deps synced:
+
+  ```bash
+  git clone git@github.com:huggingface/ml-intern.git
+  cd ml-intern
+  uv sync
+  ```
+
+- An `.env` (or exported shell vars) with at minimum:
+
+  ```bash
+  HF_TOKEN=hf_...      # required — HF MCP server, papers, datasets, jobs, sessions upload
+  GITHUB_TOKEN=ghp_... # required — github_find_examples, github_read_file, github_list_repos
+  ```
+
+  Without `HF_TOKEN`, the HF MCP server returns 401s and the SessionStart hook reports `HF user: unknown`. Without `GITHUB_TOKEN`, the GitHub tools error.
+
+---
+
+## First run
+
+From the repo root:
+
+```bash
+claude
+```
+
+That's it. Claude Code reads:
+
+- `CLAUDE.md` — persona and methodology (research-first, dataset audit, pre-flight checklist for jobs, error-recovery rules).
+- `.mcp.json` — auto-starts two MCP servers:
+  - `ml-intern-tools` (stdio, local) — exposes `hf_papers`, `hf_inspect_dataset`, `hf_jobs`, `hf_repo_files`, `hf_repo_git`, `explore_hf_docs`, `fetch_hf_docs`, `github_*`, and sandbox `bash`/`read`/`write`/`edit`.
+  - `hf-mcp-server` (HTTP, hosted at `huggingface.co/mcp`) — official HF tools.
+- `.claude/agents/research.md` — the parallel research subagent (read-only HF tools).
+- `.claude/commands/*.md` — the slash commands listed below.
+- `.claude/hooks/*.py` — content-aware approval, session redaction+upload, dynamic context injection.
+
+You should see (early in the first turn) a system reminder like:
+
+> HF user: **your-org** — use `your-org/` as the namespace when constructing `hub_model_id`...
+
+That's the SessionStart hook injecting context. If it says `HF user: unknown (...)`, fix the cause (missing token, expired token, network) before continuing.
+
+---
+
+## Slash commands
+
+All commands accept free-form arguments after the name. They're prompt templates that route the agent through the right ml-intern workflow.
+
+### `/ml-intern <request>`
+
+Default entrypoint. Equivalent to `ml-intern "<prompt>"` in the standalone CLI — runs the full research→validate→implement workflow per `CLAUDE.md`.
+
+```
+/ml-intern fine-tune llama-3-8b on HuggingFaceH4/ultrachat_200k for math reasoning
+```
+
+### `/research <topic>`
+
+Forces a literature crawl via the `research` subagent. Use when you want recipes, citation graphs, or methodology comparison **without** the agent jumping straight to code.
+
+```
+/research diffusion model fine-tuning for medical imaging
+/research best DPO recipe for instruction tuning, 7B-13B range
+```
+
+The subagent has its own context window and read-only tools (papers, docs, datasets, github, hf-repo). Returns a ranked recipe table.
+
+### `/inspect-dataset <dataset-id>`
+
+Audit a HF dataset before training: schema, splits, sample rows, red flags, training-method compatibility (SFT/DPO/GRPO).
+
+```
+/inspect-dataset HuggingFaceH4/ultrachat_200k
+/inspect-dataset Anthropic/hh-rlhf
+```
+
+### `/finetune <model> on <dataset>`
+
+Strict, opinionated end-to-end fine-tune. Forces:
+1. Research subagent first.
+2. `hf_inspect_dataset` to verify column format.
+3. Sandbox smoke test before anything large.
+4. Pre-flight check (reference impl, `push_to_hub`, hardware, timeout, Trackio).
+5. **One** job submitted; logs watched; only then any sweep.
+
+```
+/finetune llama-3-8b on HuggingFaceH4/ultrachat_200k
+/finetune mistral-7b DPO on Anthropic/hh-rlhf
+```
+
+### `/run-job <job description>`
+
+Submit any HF Job (training, eval, batch inference, data prep). Refuses to call `hf_jobs` until the pre-flight checklist is filled, including a ≥2h timeout for training jobs.
+
+```
+/run-job batch eval gpt2 on lm-eval harness MMLU
+/run-job convert webdataset shards on 32 vCPUs
+```
+
+---
+
+## Approvals — what to expect
+
+ml-intern's approval policy is enforced via a `PreToolUse` hook (`.claude/hooks/pre_tool_use_approval.py`). Claude Code will prompt you when:
+
+| Tool / op | When you'll be asked |
+|---|---|
+| `hf_jobs` (run/uv) on **GPU hardware** | Always |
+| `hf_jobs` on CPU hardware | When `ML_INTERN_CONFIRM_CPU_JOBS=1` (default) |
+| `hf_jobs` with a script that has `from_pretrained` but no `push_to_hub` | Always (warning surfaces in the prompt) |
+| `sandbox_create` | Always |
+| `hf_repo_files` `upload` / `delete` | Always |
+| `hf_repo_git` destructive ops (delete branch/tag, merge PR, create/update repo) | Always |
+| Anything else | Auto-allowed by static permissions (see `.claude/settings.json`) |
+
+To skip all approvals (e.g. unattended overnight runs): `ML_INTERN_YOLO=1 claude`. **Don't make a habit of it.**
+
+If the hook crashes or gets a malformed payload, it **fails safe** — forces a prompt rather than silently allowing.
+
+---
+
+## Environment knobs
+
+Set in your shell, `.env`, or override in the `.claude/settings.json` `env` block. All have ml-intern-CLI equivalents.
+
+| Env var | Default | What it does | CLI equivalent |
+|---|---|---|---|
+| `HF_TOKEN` | — | HF auth (tools, MCP, sessions upload, whoami) | same |
+| `GITHUB_TOKEN` | — | GitHub tools | same |
+| `HF_SESSION_UPLOAD_TOKEN` | — | Preferred (write-only) token for sessions upload; falls back to `HF_TOKEN` then `HF_ADMIN_TOKEN` | same |
+| `ML_INTERN_YOLO` | `0` | Skip all approvals | `Config.yolo_mode` |
+| `ML_INTERN_CONFIRM_CPU_JOBS` | `1` | Prompt before CPU jobs | `Config.confirm_cpu_jobs` |
+| `ML_INTERN_SAVE_SESSIONS` | `1` | Upload transcripts to HF dataset on session end | `Config.save_sessions` |
+| `ML_INTERN_SESSION_REPO` | `smolagents/ml-intern-sessions` | Target dataset | `Config.session_dataset_repo` |
+| `ML_INTERN_LOCAL_MODE` | `0` | Run sandbox-style tools (`bash`/`read`/`write`/`edit`) on the local filesystem instead of a remote sandbox | `--local` |
+
+When `ML_INTERN_LOCAL_MODE=1`, the SessionStart hook injects an extra reminder telling the model "no sandbox — operate on local fs, no `/app/` paths."
+
+---
+
+## Headless / unattended
+
+For one-shot runs from CI or a script:
+
+```bash
+claude -p "/ml-intern fine-tune gpt2-medium on tatsu-lab/alpaca, push to my-org/gpt2-alpaca-test"
+```
+
+Pair with `ML_INTERN_YOLO=1` if you genuinely have no human in the loop. Read [`CLAUDE.md`](CLAUDE.md)'s "Autonomous / headless mode" section first — the rules differ from interactive use (no text-only responses, always be doing work, hyperparameter sweeps rather than manual tuning).
+
+---
+
+## Privacy: what gets uploaded
+
+When `ML_INTERN_SAVE_SESSIONS=1` (default), at session end the transcript is uploaded to `ML_INTERN_SESSION_REPO` (default: `smolagents/ml-intern-sessions`) **after** running it through `agent/core/redact.py::scrub`, which strips:
+
+- `hf_…` HF tokens
+- `sk-ant-…` Anthropic keys
+- `sk-…` OpenAI keys
+- `ghp_/gho_/ghu_/ghs_/ghr_/github_pat_…` GitHub tokens
+- `AKIA…/ASIA…` AWS access keys
+- `Bearer …` Authorization headers
+- `KEY=value` exports for any name matching `HF_TOKEN|API_KEY|SECRET|PASSWORD|...`
+
+Redaction is regex-based and best-effort. If you paste an unusual secret format ("hunter2") it won't be caught — don't paste secrets into chat.
+
+The hook also refuses to upload a transcript whose path is outside `~/.claude/` or `$CLAUDE_PROJECT_DIR`. To opt out entirely: `ML_INTERN_SAVE_SESSIONS=0`.
+
+---
+
+## Common workflows
+
+### "What's the best recipe for X?"
+
+```
+/research X
+```
+
+Wait for the recipe table. Then either ask follow-ups in the same turn or invoke `/finetune` with the recipe in mind.
+
+### "Train this model on this dataset"
+
+```
+/finetune <model> on <dataset>
+```
+
+Watch for:
+1. The research subagent's findings (recipe, hyperparameters).
+2. `hf_inspect_dataset` output (column format check).
+3. The sandbox smoke-test logs.
+4. The pre-flight checklist.
+5. Approval prompt for the GPU job. **Read the warning text** if any.
+6. The job URL + Trackio dashboard URL.
+
+### "Just run this script as a job"
+
+```
+/run-job <job description>
+```
+
+Provide the script body in the chat or as a file path. The model will fill the pre-flight checklist before submitting.
+
+### "Audit this dataset"
+
+```
+/inspect-dataset <dataset-id>
+```
+
+Useful as a standalone read; also useful before kicking off `/finetune` to spot column-format issues early.
+
+---
+
+## Troubleshooting
+
+**"Tool not found: `mcp__ml-intern-tools__...`"** — the MCP server isn't running. Check `claude mcp list`; if it errors, run `uv run python -m packages.mcp_server.server < /dev/null` to surface the import error.
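+
+**SessionStart context never appears** — run that hook by hand (it ignores stdin, so an empty payload is fine):
+
+```bash
+echo '{}' | uv run python .claude/hooks/session_start_context.py
+```
+
+It should print a JSON `additionalContext` payload containing the `HF user:` line; a traceback here means the hook itself is broken rather than the token.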
+ +**"401 Unauthorized" from `hf_papers` or `hf_jobs`** — `HF_TOKEN` not in env. The `.mcp.json` substitutes `${HF_TOKEN}` from the launching shell; if you `claude` from a shell where it's not exported, the MCP server inherits an empty token. + +**SessionStart shows `HF user: unknown (whoami HTTP error: ...)`** — token rejected. Probably expired or scoped wrong. Generate a new one at . + +**Approval prompt every turn for `hf_papers`** — the static permissions list in `.claude/settings.json` doesn't include the tool name, or the MCP server didn't register it. Verify with `claude mcp list` and check the tool name format (`mcp__ml-intern-tools__`). + +**`from_pretrained` warning on a script that's fine** — substring match is conservative. If the script genuinely doesn't need `push_to_hub` (e.g. eval-only), approve and proceed. + +**Session upload fails silently** — check stderr of the Claude Code process. Errors print there. Common causes: token doesn't have write access to `ML_INTERN_SESSION_REPO`, or the dataset doesn't exist. + +**Hook crashes** — run the hook by hand to reproduce: + +```bash +echo '{"tool_name":"mcp__ml-intern-tools__hf_jobs","tool_input":{"operation":"run","script":"x","hardware_flavor":"a100-large"}}' \ + | uv run python .claude/hooks/pre_tool_use_approval.py +``` + +--- + +## Adding your own tools + +The standalone CLI exposes new tools via `agent/tools/*.py` + a `ToolSpec` registered in `agent/core/tools.py`. To make those tools available inside Claude Code: + +1. Implement the handler in `agent/tools/your_tool.py` with a `YOUR_TOOL_SPEC` dict and an async handler. +2. Add the `(spec, handler)` tuple to `_TOOL_SPECS` in `packages/mcp_server/server.py`. +3. Add `mcp__ml-intern-tools__` to `.claude/settings.json` `permissions.allow`. +4. (Optional) If destructive, extend `_needs_approval` in `.claude/hooks/pre_tool_use_approval.py`. +5. (Optional) If read-only, add it to `.claude/agents/research.md` `tools:` frontmatter so the research subagent can use it. + +The standalone CLI continues to work — both frontends share the same handler. + +--- + +## Adding your own slash commands + +Drop a markdown file at `.claude/commands/.md`: + +```markdown +--- +description: One-line description shown in `/` listing. +argument-hint: +--- + +Your prompt template here. Use $ARGUMENTS for the user's input. +``` + +The commands in this repo are intentionally opinionated (forcing research, refusing to skip pre-flight) — match that posture if you want consistent behavior. + +--- + +## When to use the standalone CLI instead + +The Claude Code path is the recommended default. Reach for `ml-intern` directly when you need: + +- The original CLI's `/effort`, `/model`, `/yolo` toggles mid-session. +- The session JSONL trajectory written locally (the standalone CLI writes one; Claude Code's transcript is its own format). +- The web UI under `backend/`+`frontend/` for browsing past sessions. + +Otherwise, use Claude Code — you get plan mode, native subagent ergonomics, better context management, and the same tool surface. diff --git a/README.md b/README.md index 29fe439b..ae330279 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,19 @@ ml-intern --max-iterations 100 "your prompt" ml-intern --no-stream "your prompt" ``` +### Running inside Claude Code + +The repo also runs as a Claude Code project. 
+
+---
+
+## Adding your own slash commands
+
+Drop a markdown file at `.claude/commands/<name>.md`:
+
+```markdown
+---
+description: One-line description shown in the `/` command listing.
+argument-hint: <arguments>
+---
+
+Your prompt template here. Use $ARGUMENTS for the user's input.
+```
+
+The commands in this repo are intentionally opinionated (forcing research, refusing to skip pre-flight) — match that posture if you want consistent behavior.
+
+---
+
+## When to use the standalone CLI instead
+
+The Claude Code path is the recommended default. Reach for `ml-intern` directly when you need:
+
+- The original CLI's `/effort`, `/model`, `/yolo` toggles mid-session.
+- The session JSONL trajectory written locally (the standalone CLI writes one; Claude Code's transcript is its own format).
+- The web UI under `backend/` + `frontend/` for browsing past sessions.
+
+Otherwise, use Claude Code — you get plan mode, native subagent ergonomics, better context management, and the same tool surface.
diff --git a/README.md b/README.md
index 29fe439b..ae330279 100644
--- a/README.md
+++ b/README.md
@@ -54,6 +54,19 @@ ml-intern --max-iterations 100 "your prompt"
 ml-intern --no-stream "your prompt"
 ```
+### Running inside Claude Code
+
+The repo also runs as a Claude Code project. From the repo root:
+
+```bash
+claude                                     # interactive
+claude -p "fine-tune llama on my dataset"  # headless
+```
+
+Claude Code picks up `CLAUDE.md` (persona), `.mcp.json` (HF tools via `packages/mcp_server`), `.claude/agents/research.md` (research subagent), `.claude/commands/*.md` (slash commands: `/ml-intern`, `/research`, `/inspect-dataset`, `/finetune`, `/run-job`), and `.claude/hooks/` (content-aware approval, session redaction + upload, dynamic context injection). The standalone CLI under `agent/` is unchanged — both share the same tool implementations.
+
+See [`CLAUDE_CODE_GUIDE.md`](CLAUDE_CODE_GUIDE.md) for slash commands, approvals, env knobs, and troubleshooting.
+
 ## Architecture
 
 ### Component Overview
diff --git a/packages/mcp_server/server.py b/packages/mcp_server/server.py
new file mode 100644
index 00000000..29560c26
--- /dev/null
+++ b/packages/mcp_server/server.py
@@ -0,0 +1,144 @@
+"""
+ML Intern tools, exposed as an MCP server for Claude Code.
+
+Thin shim over agent/tools/*: same handlers, same JSON schemas, same
+behavior — only the transport changes from "litellm tool calls inside
+agent_loop.py" to "MCP stdio for Claude Code".
+
+Uses the low-level `mcp.server.lowlevel.Server` API so we can register
+tools with the original JSON schemas verbatim. FastMCP's high-level
+`@mcp.tool` would re-derive schemas from Python type hints, which would
+lose the nullable/oneOf/operation-discriminated structures the existing
+ml-intern specs encode.
+
+Run via the `.mcp.json` at the repo root. Not intended to be invoked manually.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+from typing import Any, Awaitable, Callable
+
+from mcp import types
+from mcp.server.lowlevel import Server
+from mcp.server.stdio import stdio_server
+
+from agent.tools.dataset_tools import (
+    HF_INSPECT_DATASET_TOOL_SPEC,
+    hf_inspect_dataset_handler,
+)
+from agent.tools.docs_tools import (
+    EXPLORE_HF_DOCS_TOOL_SPEC,
+    HF_DOCS_FETCH_TOOL_SPEC,
+    explore_hf_docs_handler,
+    hf_docs_fetch_handler,
+)
+from agent.tools.github_find_examples import (
+    GITHUB_FIND_EXAMPLES_TOOL_SPEC,
+    github_find_examples_handler,
+)
+from agent.tools.github_list_repos import (
+    GITHUB_LIST_REPOS_TOOL_SPEC,
+    github_list_repos_handler,
+)
+from agent.tools.github_read_file import (
+    GITHUB_READ_FILE_TOOL_SPEC,
+    github_read_file_handler,
+)
+from agent.tools.hf_repo_files_tool import (
+    HF_REPO_FILES_TOOL_SPEC,
+    hf_repo_files_handler,
+)
+from agent.tools.hf_repo_git_tool import HF_REPO_GIT_TOOL_SPEC, hf_repo_git_handler
+from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
+from agent.tools.papers_tool import HF_PAPERS_TOOL_SPEC, hf_papers_handler
+from agent.tools.sandbox_tool import get_sandbox_tools
+
+logger = logging.getLogger(__name__)
+
+# `research` and `plan_tool` are intentionally NOT exposed:
+#   research  → replaced by .claude/agents/research.md (Claude Code subagent)
+#   plan_tool → replaced by Claude Code's built-in TodoWrite
+_TOOL_SPECS: list[tuple[dict[str, Any], Callable[..., Awaitable[tuple[str, bool]]]]] = [
+    (EXPLORE_HF_DOCS_TOOL_SPEC, explore_hf_docs_handler),
+    (HF_DOCS_FETCH_TOOL_SPEC, hf_docs_fetch_handler),
+    (HF_PAPERS_TOOL_SPEC, hf_papers_handler),
+    (HF_INSPECT_DATASET_TOOL_SPEC, hf_inspect_dataset_handler),
+    (HF_JOBS_TOOL_SPEC, hf_jobs_handler),
+    (HF_REPO_FILES_TOOL_SPEC, hf_repo_files_handler),
+    (HF_REPO_GIT_TOOL_SPEC, hf_repo_git_handler),
+    (GITHUB_FIND_EXAMPLES_TOOL_SPEC, github_find_examples_handler),
+    (GITHUB_LIST_REPOS_TOOL_SPEC, github_list_repos_handler),
+    (GITHUB_READ_FILE_TOOL_SPEC, github_read_file_handler),
+]
+
+# Tool name → (Tool, handler) registry. Populated at startup by _build_registry().
+_REGISTRY: dict[str, tuple[types.Tool, Callable[..., Awaitable[tuple[str, bool]]]]] = {}
+
+
+def _build_registry() -> None:
+    """Populate the {name: (Tool, handler)} registry."""
+    for spec, handler in _TOOL_SPECS:
+        tool = types.Tool(
+            name=spec["name"],
+            description=spec["description"],
+            inputSchema=spec["parameters"],
+        )
+        _REGISTRY[spec["name"]] = (tool, handler)
+
+    # Sandbox tools come from a factory because they depend on local_mode.
+    # Mirrors agent/main.py: ML_INTERN_LOCAL_MODE=1 routes shell/file ops to
+    # the local machine instead of HF Sandboxes.
+    local_mode = os.environ.get("ML_INTERN_LOCAL_MODE", "").lower() in ("1", "true", "yes")
+    if local_mode:
+        from agent.tools.local_tools import get_local_tools
+
+        sandbox_specs = get_local_tools()
+    else:
+        sandbox_specs = get_sandbox_tools()
+
+    for tool_spec in sandbox_specs:
+        tool = types.Tool(
+            name=tool_spec.name,
+            description=tool_spec.description,
+            inputSchema=tool_spec.parameters,
+        )
+        _REGISTRY[tool_spec.name] = (tool, tool_spec.handler)
+
+
+server: Server = Server("ml-intern-tools")
+
+
+@server.list_tools()
+async def list_tools() -> list[types.Tool]:
+    return [tool for tool, _ in _REGISTRY.values()]
+
+
+@server.call_tool()
+async def call_tool(name: str, arguments: dict[str, Any]) -> list[types.TextContent]:
+    entry = _REGISTRY.get(name)
+    if entry is None:
+        raise ValueError(f"Unknown tool: {name}")
+    _tool, handler = entry
+
+    output, ok = await handler(arguments or {})
+    if not ok:
+        # MCP convention: raise so the client sees isError=true with the message.
+        raise RuntimeError(output)
+    return [types.TextContent(type="text", text=output)]
+
+
+async def _amain() -> None:
+    _build_registry()
+    async with stdio_server() as (read_stream, write_stream):
+        await server.run(
+            read_stream,
+            write_stream,
+            server.create_initialization_options(),
+        )
+
+
+if __name__ == "__main__":
+    asyncio.run(_amain())