From ca4a18cbfdb47ab83e99367c217dda42c6833ff4 Mon Sep 17 00:00:00 2001 From: Professor Synapse <131487882+ProfSynapse@users.noreply.github.com> Date: Fri, 10 Apr 2026 11:50:19 -0400 Subject: [PATCH 1/2] Add Docker-first local runtime bootstrap and eval flow --- .agents/skills/fine-tuning/SKILL.md | 32 ++ .claude/skills/fine-tuning/SKILL.md | 32 ++ .skills/fine-tuning/SKILL.md | 32 ++ Evaluator/vllm_setup.py | 86 +-- docker/bucket-helper/Dockerfile | 12 + docker/bucket-helper/requirements.txt | 3 + docs/plans/local-docker-runtime-plan.md | 399 ++++++++++++++ shared/utilities/bucket_artifacts.py | 74 ++- shared/utilities/env.py | 8 + shared/utilities/paths.py | 82 +++ tuner/backends/evaluation/unsloth_backend.py | 45 +- tuner/cli/main.py | 8 + tuner/cli/parser.py | 34 +- tuner/cli/router.py | 179 ++++-- tuner/cloud/hf_jobs.py | 10 +- tuner/discovery/training_runs.py | 19 +- tuner/handlers/__init__.py | 61 +-- tuner/handlers/base.py | 4 +- tuner/handlers/bucket_handler.py | 192 ++++++- tuner/handlers/docker_handler.py | 540 +++++++++++++++++++ tuner/handlers/eval_handler.py | 415 +++++++++++++- tuner/handlers/train_handler.py | 130 ++++- tuner/utils/docker_runtime.py | 206 +++++++ 23 files changed, 2406 insertions(+), 197 deletions(-) create mode 100644 docker/bucket-helper/Dockerfile create mode 100644 docker/bucket-helper/requirements.txt create mode 100644 docs/plans/local-docker-runtime-plan.md create mode 100644 tuner/handlers/docker_handler.py create mode 100644 tuner/utils/docker_runtime.py diff --git a/.agents/skills/fine-tuning/SKILL.md b/.agents/skills/fine-tuning/SKILL.md index 829d0066..2378edcc 100644 --- a/.agents/skills/fine-tuning/SKILL.md +++ b/.agents/skills/fine-tuning/SKILL.md @@ -13,6 +13,13 @@ Train language models with SFT, KTO, and GRPO locally or on supported cloud prov | Task | Command | |------|---------| | Interactive menu | `./run.sh` → Train | +| Local Docker status | `python tuner.py docker status` | +| Bootstrap local Docker runtime | 
`python tuner.py docker bootstrap --docker-target all` | +| Build Docker bucket helper | `python tuner.py docker build --docker-target bucket` | +| Pull local Docker runtime | `python tuner.py docker pull --docker-target unsloth` | +| Smoke test local Docker runtime | `python tuner.py docker smoke --docker-target all` | +| Local Docker training | `python tuner.py train --runtime docker` | +| Local Docker evaluation | `python tuner.py eval --runtime docker` | | SFT training | `cd Trainers/rtx3090_sft && python train_sft.py --model-size 7b` | | KTO training | `cd Trainers/rtx3090_kto && python train_kto.py --model-size 7b` | | GRPO training | `cd Trainers/grpo && python train_grpo.py` | @@ -89,6 +96,9 @@ Use `--tier` on the local SFT and KTO trainers when you want a preset instead of - Treat `loss_summary.json` as a supporting artifact, not the canonical final loss metadata file. - The ledger should accumulate real model-size / hardware / timing / cost data so future hardware planning can optimize against observed evidence instead of memory. - For local trainer iteration, use the checked-in `train_sft.py`, `train_kto.py`, and `train_grpo.py` entrypoints. +- For Windows local GPU work, prefer Docker Desktop plus `python tuner.py docker smoke --docker-target all` as the first environment check before debugging conda or package drift. +- For first-time local Docker setup, prefer `python tuner.py docker bootstrap --docker-target all`. It should tell you whether Docker Desktop is installed/running, pull or build the required images, and finish with smoke tests. +- Prefer `python tuner.py train --runtime docker` and `python tuner.py eval --runtime docker` when you want the CLI to stay Docker-first locally while reusing the checked-in trainer and evaluator entrypoints. - For canonical HF experiments, prefer `python tuner.py cloud-pipeline ...` over `cloud-run`. 
- For full train → eval → exact loss → analysis → recommendation runs, prefer `python tuner.py run-experiment ...`. - Evolutionary SFT is experimental but now first-class in the cloud experiment path. Prefer a checked-in experiment spec or `cloud-pipeline --train-evolutionary-*` overrides over editing trainer YAMLs by hand. @@ -102,7 +112,9 @@ Use `--tier` on the local SFT and KTO trainers when you want a preset instead of - For in-flight cloud-run health checks, inspect the bucket-backed artifacts first (`training_latest.jsonl`, `stage_summary.json`, `training_lineage.json`, eval/loss partials). Use raw HF logs only as a fallback when the bucket prefix has not started writing yet. - For quick bucket spot checks, use `python tuner.py bucket read ...` or `python tuner.py bucket list ...` instead of manual `hf buckets cp` commands. - For local inspection or offline diffing, use `python tuner.py bucket pull ...` to sync a bucket-relative path into the current workspace while preserving its relative path. +- Pulled cloud adapters under `toolset-training-artifacts/runs/...` or `runs/...` should be treated as first-class local runs by `train`/`eval` discovery. Do not spin up a one-off container just because a run originated in HF Jobs. - For one-off uploads back into the HF artifact bucket, use `python tuner.py bucket push ...` instead of ad hoc `sync_bucket` snippets. +- If the active Python lacks modern HF Buckets support, `python tuner.py bucket ...` should fall back to the checked-in Docker bucket helper instead of mutating the main Unsloth environment. Prebuild it with `python tuner.py docker build --docker-target bucket` when you want the fallback path ready ahead of time. - For `a100-large` or larger tiers, bias toward aggressive packing. Do not lower batch just because the adapter recipe changed. Start from the highest known-good packed shape for the same model family and only back off after a real OOM or clear instability signal. 
- Treat large unused VRAM on `a100-large` as a mistake, not a comfort margin. If `training_lineage.json` shows tens of GB of reserved headroom, the run is underpacked and the next iteration should push batch size harder even if that risks OOM. - For vLLM eval on multi-GPU hardware, prequantized BitsAndBytes base models (for example `*-bnb-4bit`) cannot use tensor parallelism. Do not assume `x4` means vLLM will shard generation across all GPUs; in this path, eval may need to fall back to single-GPU while exact loss still fans out across all visible GPUs afterward. @@ -172,6 +184,26 @@ See `reference/lora-techniques.md` for full details, integration status, and com ## Common Patterns +**Bootstrap local Docker on a fresh machine:** +```bash +python tuner.py docker bootstrap --docker-target all +``` +Use this before touching conda if the goal is local GPU training/eval through Docker Desktop. The command should: +- tell you if Docker Desktop is missing or not running +- prepare `unsloth`, `vllm`, and the Buckets helper image +- run smoke tests so you know GPU containers actually work + +**Pull a cloud adapter and evaluate it locally through Docker:** +```bash +python tuner.py bucket pull \ + --path runs/hf_jobs/sft/<run_id>/final_model \ + --dest toolset-training-artifacts + +python tuner.py eval --runtime docker +``` +Gotcha: +- The pulled adapter should now appear in the normal local eval discovery flow. If it does not, inspect where the pull landed and keep it under `toolset-training-artifacts/runs/...` or `runs/...` inside the repo. 
+ **Quick SFT test run:** ```bash cd Trainers/rtx3090_sft diff --git a/.claude/skills/fine-tuning/SKILL.md b/.claude/skills/fine-tuning/SKILL.md index 829d0066..2378edcc 100644 --- a/.claude/skills/fine-tuning/SKILL.md +++ b/.claude/skills/fine-tuning/SKILL.md @@ -13,6 +13,13 @@ Train language models with SFT, KTO, and GRPO locally or on supported cloud prov | Task | Command | |------|---------| | Interactive menu | `./run.sh` → Train | +| Local Docker status | `python tuner.py docker status` | +| Bootstrap local Docker runtime | `python tuner.py docker bootstrap --docker-target all` | +| Build Docker bucket helper | `python tuner.py docker build --docker-target bucket` | +| Pull local Docker runtime | `python tuner.py docker pull --docker-target unsloth` | +| Smoke test local Docker runtime | `python tuner.py docker smoke --docker-target all` | +| Local Docker training | `python tuner.py train --runtime docker` | +| Local Docker evaluation | `python tuner.py eval --runtime docker` | | SFT training | `cd Trainers/rtx3090_sft && python train_sft.py --model-size 7b` | | KTO training | `cd Trainers/rtx3090_kto && python train_kto.py --model-size 7b` | | GRPO training | `cd Trainers/grpo && python train_grpo.py` | @@ -89,6 +96,9 @@ Use `--tier` on the local SFT and KTO trainers when you want a preset instead of - Treat `loss_summary.json` as a supporting artifact, not the canonical final loss metadata file. - The ledger should accumulate real model-size / hardware / timing / cost data so future hardware planning can optimize against observed evidence instead of memory. - For local trainer iteration, use the checked-in `train_sft.py`, `train_kto.py`, and `train_grpo.py` entrypoints. +- For Windows local GPU work, prefer Docker Desktop plus `python tuner.py docker smoke --docker-target all` as the first environment check before debugging conda or package drift. +- For first-time local Docker setup, prefer `python tuner.py docker bootstrap --docker-target all`. 
It should tell you whether Docker Desktop is installed/running, pull or build the required images, and finish with smoke tests. +- Prefer `python tuner.py train --runtime docker` and `python tuner.py eval --runtime docker` when you want the CLI to stay Docker-first locally while reusing the checked-in trainer and evaluator entrypoints. - For canonical HF experiments, prefer `python tuner.py cloud-pipeline ...` over `cloud-run`. - For full train → eval → exact loss → analysis → recommendation runs, prefer `python tuner.py run-experiment ...`. - Evolutionary SFT is experimental but now first-class in the cloud experiment path. Prefer a checked-in experiment spec or `cloud-pipeline --train-evolutionary-*` overrides over editing trainer YAMLs by hand. @@ -102,7 +112,9 @@ Use `--tier` on the local SFT and KTO trainers when you want a preset instead of - For in-flight cloud-run health checks, inspect the bucket-backed artifacts first (`training_latest.jsonl`, `stage_summary.json`, `training_lineage.json`, eval/loss partials). Use raw HF logs only as a fallback when the bucket prefix has not started writing yet. - For quick bucket spot checks, use `python tuner.py bucket read ...` or `python tuner.py bucket list ...` instead of manual `hf buckets cp` commands. - For local inspection or offline diffing, use `python tuner.py bucket pull ...` to sync a bucket-relative path into the current workspace while preserving its relative path. +- Pulled cloud adapters under `toolset-training-artifacts/runs/...` or `runs/...` should be treated as first-class local runs by `train`/`eval` discovery. Do not spin up a one-off container just because a run originated in HF Jobs. - For one-off uploads back into the HF artifact bucket, use `python tuner.py bucket push ...` instead of ad hoc `sync_bucket` snippets. 
+- If the active Python lacks modern HF Buckets support, `python tuner.py bucket ...` should fall back to the checked-in Docker bucket helper instead of mutating the main Unsloth environment. Prebuild it with `python tuner.py docker build --docker-target bucket` when you want the fallback path ready ahead of time. - For `a100-large` or larger tiers, bias toward aggressive packing. Do not lower batch just because the adapter recipe changed. Start from the highest known-good packed shape for the same model family and only back off after a real OOM or clear instability signal. - Treat large unused VRAM on `a100-large` as a mistake, not a comfort margin. If `training_lineage.json` shows tens of GB of reserved headroom, the run is underpacked and the next iteration should push batch size harder even if that risks OOM. - For vLLM eval on multi-GPU hardware, prequantized BitsAndBytes base models (for example `*-bnb-4bit`) cannot use tensor parallelism. Do not assume `x4` means vLLM will shard generation across all GPUs; in this path, eval may need to fall back to single-GPU while exact loss still fans out across all visible GPUs afterward. @@ -172,6 +184,26 @@ See `reference/lora-techniques.md` for full details, integration status, and com ## Common Patterns +**Bootstrap local Docker on a fresh machine:** +```bash +python tuner.py docker bootstrap --docker-target all +``` +Use this before touching conda if the goal is local GPU training/eval through Docker Desktop. The command should: +- tell you if Docker Desktop is missing or not running +- prepare `unsloth`, `vllm`, and the Buckets helper image +- run smoke tests so you know GPU containers actually work + +**Pull a cloud adapter and evaluate it locally through Docker:** +```bash +python tuner.py bucket pull \ + --path runs/hf_jobs/sft/<run_id>/final_model \ + --dest toolset-training-artifacts + +python tuner.py eval --runtime docker +``` +Gotcha: +- The pulled adapter should now appear in the normal local eval discovery flow. 
If it does not, inspect where the pull landed and keep it under `toolset-training-artifacts/runs/...` or `runs/...` inside the repo. + **Quick SFT test run:** ```bash cd Trainers/rtx3090_sft diff --git a/.skills/fine-tuning/SKILL.md b/.skills/fine-tuning/SKILL.md index 829d0066..2378edcc 100644 --- a/.skills/fine-tuning/SKILL.md +++ b/.skills/fine-tuning/SKILL.md @@ -13,6 +13,13 @@ Train language models with SFT, KTO, and GRPO locally or on supported cloud prov | Task | Command | |------|---------| | Interactive menu | `./run.sh` → Train | +| Local Docker status | `python tuner.py docker status` | +| Bootstrap local Docker runtime | `python tuner.py docker bootstrap --docker-target all` | +| Build Docker bucket helper | `python tuner.py docker build --docker-target bucket` | +| Pull local Docker runtime | `python tuner.py docker pull --docker-target unsloth` | +| Smoke test local Docker runtime | `python tuner.py docker smoke --docker-target all` | +| Local Docker training | `python tuner.py train --runtime docker` | +| Local Docker evaluation | `python tuner.py eval --runtime docker` | | SFT training | `cd Trainers/rtx3090_sft && python train_sft.py --model-size 7b` | | KTO training | `cd Trainers/rtx3090_kto && python train_kto.py --model-size 7b` | | GRPO training | `cd Trainers/grpo && python train_grpo.py` | @@ -89,6 +96,9 @@ Use `--tier` on the local SFT and KTO trainers when you want a preset instead of - Treat `loss_summary.json` as a supporting artifact, not the canonical final loss metadata file. - The ledger should accumulate real model-size / hardware / timing / cost data so future hardware planning can optimize against observed evidence instead of memory. - For local trainer iteration, use the checked-in `train_sft.py`, `train_kto.py`, and `train_grpo.py` entrypoints. +- For Windows local GPU work, prefer Docker Desktop plus `python tuner.py docker smoke --docker-target all` as the first environment check before debugging conda or package drift. 
+- For first-time local Docker setup, prefer `python tuner.py docker bootstrap --docker-target all`. It should tell you whether Docker Desktop is installed/running, pull or build the required images, and finish with smoke tests. +- Prefer `python tuner.py train --runtime docker` and `python tuner.py eval --runtime docker` when you want the CLI to stay Docker-first locally while reusing the checked-in trainer and evaluator entrypoints. - For canonical HF experiments, prefer `python tuner.py cloud-pipeline ...` over `cloud-run`. - For full train → eval → exact loss → analysis → recommendation runs, prefer `python tuner.py run-experiment ...`. - Evolutionary SFT is experimental but now first-class in the cloud experiment path. Prefer a checked-in experiment spec or `cloud-pipeline --train-evolutionary-*` overrides over editing trainer YAMLs by hand. @@ -102,7 +112,9 @@ Use `--tier` on the local SFT and KTO trainers when you want a preset instead of - For in-flight cloud-run health checks, inspect the bucket-backed artifacts first (`training_latest.jsonl`, `stage_summary.json`, `training_lineage.json`, eval/loss partials). Use raw HF logs only as a fallback when the bucket prefix has not started writing yet. - For quick bucket spot checks, use `python tuner.py bucket read ...` or `python tuner.py bucket list ...` instead of manual `hf buckets cp` commands. - For local inspection or offline diffing, use `python tuner.py bucket pull ...` to sync a bucket-relative path into the current workspace while preserving its relative path. +- Pulled cloud adapters under `toolset-training-artifacts/runs/...` or `runs/...` should be treated as first-class local runs by `train`/`eval` discovery. Do not spin up a one-off container just because a run originated in HF Jobs. - For one-off uploads back into the HF artifact bucket, use `python tuner.py bucket push ...` instead of ad hoc `sync_bucket` snippets. 
+- If the active Python lacks modern HF Buckets support, `python tuner.py bucket ...` should fall back to the checked-in Docker bucket helper instead of mutating the main Unsloth environment. Prebuild it with `python tuner.py docker build --docker-target bucket` when you want the fallback path ready ahead of time. - For `a100-large` or larger tiers, bias toward aggressive packing. Do not lower batch just because the adapter recipe changed. Start from the highest known-good packed shape for the same model family and only back off after a real OOM or clear instability signal. - Treat large unused VRAM on `a100-large` as a mistake, not a comfort margin. If `training_lineage.json` shows tens of GB of reserved headroom, the run is underpacked and the next iteration should push batch size harder even if that risks OOM. - For vLLM eval on multi-GPU hardware, prequantized BitsAndBytes base models (for example `*-bnb-4bit`) cannot use tensor parallelism. Do not assume `x4` means vLLM will shard generation across all GPUs; in this path, eval may need to fall back to single-GPU while exact loss still fans out across all visible GPUs afterward. @@ -172,6 +184,26 @@ See `reference/lora-techniques.md` for full details, integration status, and com ## Common Patterns +**Bootstrap local Docker on a fresh machine:** +```bash +python tuner.py docker bootstrap --docker-target all +``` +Use this before touching conda if the goal is local GPU training/eval through Docker Desktop. The command should: +- tell you if Docker Desktop is missing or not running +- prepare `unsloth`, `vllm`, and the Buckets helper image +- run smoke tests so you know GPU containers actually work + +**Pull a cloud adapter and evaluate it locally through Docker:** +```bash +python tuner.py bucket pull \ + --path runs/hf_jobs/sft/<run_id>/final_model \ + --dest toolset-training-artifacts + +python tuner.py eval --runtime docker +``` +Gotcha: +- The pulled adapter should now appear in the normal local eval discovery flow. 
If it does not, inspect where the pull landed and keep it under `toolset-training-artifacts/runs/...` or `runs/...` inside the repo. + **Quick SFT test run:** ```bash cd Trainers/rtx3090_sft diff --git a/Evaluator/vllm_setup.py b/Evaluator/vllm_setup.py index 85770f68..c218850f 100644 --- a/Evaluator/vllm_setup.py +++ b/Evaluator/vllm_setup.py @@ -21,7 +21,7 @@ import requests -from shared.utilities.paths import iter_training_output_dirs +from shared.utilities.paths import iter_training_run_dirs # --------------------------------------------------------------------------- # Constants @@ -52,6 +52,7 @@ class TrainingRun: has_merged_16bit: bool has_lora: bool model_size: Optional[str] = None + source: str = "local_training" @property def display_name(self) -> str: @@ -238,44 +239,38 @@ def discover_training_runs(base_dir: Optional[Path] = None) -> List[TrainingRun] repo_root = base_dir.parent if base_dir.name == "Trainers" else base_dir for trainer_type in TRAINING_METHODS: - for output_dir in iter_training_output_dirs(trainer_type, repo_root): - if not output_dir.exists(): + for run_dir in iter_training_run_dirs(trainer_type, repo_root): + if not re.match(r"\d{8}_\d{6}", run_dir.name): continue - for run_dir in output_dir.iterdir(): - if not run_dir.is_dir(): - continue - - if not re.match(r"\d{8}_\d{6}", run_dir.name): - continue - - has_final_model = (run_dir / "final_model").exists() - has_merged_16bit = False - has_lora = False - - for subdir in run_dir.iterdir(): - if subdir.is_dir(): - if (subdir / "merged-16bit").exists(): - has_merged_16bit = True - if (subdir / "lora").exists(): - has_lora = True - - if has_final_model: - adapter_config = run_dir / "final_model" / "adapter_config.json" - has_lora = has_lora or adapter_config.exists() - - model_size = _detect_model_size(run_dir) - - runs.append(TrainingRun( - path=run_dir, - name=run_dir.name, - timestamp=run_dir.name, - trainer_type=trainer_type, - has_final_model=has_final_model, - 
has_merged_16bit=has_merged_16bit, - has_lora=has_lora, - model_size=model_size, - )) + has_final_model = (run_dir / "final_model").exists() + has_merged_16bit = False + has_lora = False + + for subdir in run_dir.iterdir(): + if subdir.is_dir(): + if (subdir / "merged-16bit").exists(): + has_merged_16bit = True + if (subdir / "lora").exists(): + has_lora = True + + if has_final_model: + adapter_config = run_dir / "final_model" / "adapter_config.json" + has_lora = has_lora or adapter_config.exists() + + model_size = _detect_model_size(run_dir) + + runs.append(TrainingRun( + path=run_dir, + name=run_dir.name, + timestamp=run_dir.name, + trainer_type=trainer_type, + has_final_model=has_final_model, + has_merged_16bit=has_merged_16bit, + has_lora=has_lora, + model_size=model_size, + source=_detect_run_source(run_dir), + )) # Sort by timestamp (newest first) runs.sort(key=lambda r: r.timestamp, reverse=True) @@ -299,15 +294,24 @@ def _detect_model_size(run_dir: Path) -> Optional[str]: with open(adapter_config) as f: config = json.load(f) base_model = config.get("base_model_name_or_path", "") - # Extract size from model name - for size in ["3b", "7b", "13b", "20b", "70b"]: - if size in base_model.lower(): - return size.upper() + match = re.search(r"(\d+(?:\.\d+)?)\s*([bm])", base_model.lower()) + if match: + return f"{match.group(1)}{match.group(2).upper()}" except Exception: pass return None +def _detect_run_source(run_dir: Path) -> str: + """Identify whether a run came from local training or imported artifacts.""" + parts = {part.lower() for part in run_dir.parts} + if "toolset-training-artifacts" in parts: + return "bucket_pull" + if "runs" in parts and "trainers" not in parts: + return "cloud_artifact" + return "local_training" + + def discover_huggingface_models() -> List[str]: """Return list of recommended base models from HuggingFace. 
diff --git a/docker/bucket-helper/Dockerfile b/docker/bucket-helper/Dockerfile new file mode 100644 index 00000000..ea0d99e9 --- /dev/null +++ b/docker/bucket-helper/Dockerfile @@ -0,0 +1,12 @@ +FROM python:3.11-slim + +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 + +WORKDIR /opt/bucket-helper + +COPY requirements.txt /opt/bucket-helper/requirements.txt + +RUN pip install --no-cache-dir -r /opt/bucket-helper/requirements.txt + +ENTRYPOINT ["python"] diff --git a/docker/bucket-helper/requirements.txt b/docker/bucket-helper/requirements.txt new file mode 100644 index 00000000..debda414 --- /dev/null +++ b/docker/bucket-helper/requirements.txt @@ -0,0 +1,3 @@ +huggingface_hub>=1.5.0,<2.0 +python-dotenv>=1.0.0,<2.0 +PyYAML>=6.0,<7.0 diff --git a/docs/plans/local-docker-runtime-plan.md b/docs/plans/local-docker-runtime-plan.md new file mode 100644 index 00000000..23da4567 --- /dev/null +++ b/docs/plans/local-docker-runtime-plan.md @@ -0,0 +1,399 @@ +# Implementation Plan: Local Docker Runtime + +> Generated by `/PACT:plan-mode` on 2026-04-09 +> Status: PENDING APPROVAL + + + + + +## Summary + +Introduce a first-class local Docker runtime for the repo so users can run the main GPU-heavy workflows through pinned container images instead of the fragile `unsloth_latest` conda environment. The target state is: + +- Docker-first local training using official `unsloth/unsloth` images +- Docker-first local evaluation using either direct Unsloth or a dedicated `vllm/vllm-openai` container +- host-side CLI orchestration that mounts the repo into containers and writes artifacts back into the existing workspace layout +- canonical skill/docs updated to recommend Docker Desktop as the default local setup path for Windows users with NVIDIA GPUs + +This is not a proposal to containerize every lightweight repo task. 
The design goal is narrower and more pragmatic: + +- containerize GPU execution +- keep orchestration and editing on the host +- reduce dependency drift +- align local and cloud runtime behavior + +--- + +## Specialist Perspectives + +### 📋 Preparation Phase +**Effort**: Medium + +#### Research Needed +- [x] Current local environment setup path in `setup_env.ps1`, `setup_env.sh`, and `Trainers/activate_unsloth_latest.*` +- [x] Current cloud image/profile abstraction in `Trainers/cloud/cloud_config.yaml` and `tuner/backends/training/cloud/base_cloud.py` +- [x] Current local evaluation/inference backends in `tuner/handlers/eval_handler.py` and `tuner/handlers/inference_handler.py` +- [x] Existing repo guidance around Docker and Unsloth images in `.skills/fine-tuning/*` and `docs/prep/*` +- [x] Current machine state: Docker engine reachable, NVIDIA RTX 3090 available +- [ ] Confirm best current pinned local training image tag for first rollout +- [ ] Confirm best current pinned local vLLM image tag for first rollout + +#### Current State Findings +- Local GPU execution is still centered on the `unsloth_latest` conda environment: + - `setup_env.ps1` + - `Trainers/activate_unsloth_latest.ps1` + - `tuner/utils/conda.py` +- Cloud execution already has a mature image abstraction: + - training image profiles: `stable`, `next` + - eval image profiles: `stable_unsloth`, `latest_unsloth`, `fast_vllm` +- The repo already assumes official Docker images are the clean dependency boundary for cloud workflows. +- Local vLLM is not a first-class runtime yet; current inference guidance is still manual/server-centric. +- Docker engine is available locally, but long `docker pull` operations need better observability so users can distinguish slow layers from a hung session. 
+ +#### Key External References +- Official Unsloth Docker image: `unsloth/unsloth` +- Official vLLM Docker image: `vllm/vllm-openai` +- Docker Desktop for Windows +- NVIDIA Container Toolkit / Docker GPU support docs + +#### Questions to Resolve +- [ ] Should Docker become the default recommendation for all local GPU users, while conda remains fallback? +- [ ] Should the first local Docker rollout reuse cloud image profiles directly or introduce a separate `local_image_profiles` section? +- [ ] Should local Docker training support both interactive shell access and one-shot command execution in v1? +- [ ] Should local eval default to direct Unsloth or vLLM when both are available? +- [ ] Should local Docker support WSL path mounts explicitly, or standardize on Windows host mounts first? + +--- + +### 🏗️ Architecture Phase +**Effort**: High + +#### Components Affected + +| Component | Change Type | Impact | +|----------|-------------|--------| +| `tuner/cli/parser.py` | Modify | Add local Docker runtime/provider flags and commands | +| `tuner/cli/router.py` | Modify | Route new local Docker flows | +| `tuner/handlers/train_handler.py` | Modify | Add Docker-backed local training path | +| `tuner/handlers/eval_handler.py` | Modify | Add Docker-backed local evaluation backend selection | +| `tuner/handlers/inference_handler.py` | Modify | Replace manual vLLM guidance with managed Docker path | +| `tuner/utils/conda.py` | Modify later | Demote conda from default local GPU path | +| `Trainers/cloud/cloud_config.yaml` | Modify | Add reusable local image/runtime config | +| `shared/` local runtime helpers | New | Common Docker command assembly, mount handling, log streaming | +| `.skills/fine-tuning/SKILL.md` | Modify | Recommend Docker-first local setup | +| `.agents/skills/fine-tuning/SKILL.md` | Sync | Mirror canonical skill | +| `.claude/skills/fine-tuning/SKILL.md` | Sync | Mirror canonical skill | +| `docs/prep/*` | Modify selectively | Update installation guidance 
to prefer Docker | + +#### Design Approach + +The repo should treat local Docker as a new local execution runtime, not as a special case hacked into cloud code. + +Core split: + +1. **Host orchestration** +- CLI argument parsing +- config resolution +- workspace path selection +- output directory management +- user messaging + +2. **Container execution** +- training stack inside `unsloth/unsloth:*` +- direct LoRA eval inside `unsloth/unsloth:*` +- vLLM serving/eval inside `vllm/vllm-openai:*` + +3. **Shared contracts** +- image profile resolution +- mounted workspace path contract +- canonical output directories +- structured logs and smoke-test health checks + +#### Core Principle: Containerize GPU Execution, Not Everything + +The repo should not force every operation through Docker. The goal is to containerize the unstable, GPU-heavy dependency surface: + +- `torch` +- CUDA userspace +- `transformers` +- `trl` +- `peft` +- `unsloth` +- `vllm` + +Keep lightweight operations on the host: + +- editing configs +- generating plan/docs +- orchestration logic +- artifact inspection +- non-GPU tests + +#### Runtime Model + +```text +Host CLI + ├── resolve config + image profile + ├── prepare bind mounts / output dirs + ├── launch Docker container + ├── stream logs with plain progress + └── persist outputs to repo workspace + +Docker Runtime + ├── training image: unsloth/unsloth:<tag> + │ └── run repo trainer entrypoints against mounted workspace + └── eval image: vllm/vllm-openai:<tag> or unsloth/unsloth:<tag> + ├── start managed local server if needed + └── run evaluator against local endpoint or direct runtime +``` + +#### Key Decisions + +| Decision | Options | Recommendation | Rationale | +|----------|---------|----------------|-----------| +| Local execution default | A) Conda B) Docker C) Hybrid | **C) Hybrid, Docker-first** | Lowest migration risk, best dependency control | +| Image source | A) Custom repo Dockerfiles B) Official upstream images C) Mixed | **B) Official upstream 
images first** | Lower maintenance, already used in cloud | +| Config reuse | A) Separate local config B) Reuse cloud profiles C) Full unification | **B) Reuse cloud profiles with local overrides** | Avoid drift while keeping local-specific knobs | +| Training runtime | A) Unsloth image B) vLLM image C) Host Python | **A) Unsloth image** | Matches trainer stack | +| Fast eval runtime | A) Unsloth direct B) vLLM only C) Both | **C) Both** | Stable fallback plus fast path | +| Docker recommendation | A) Optional docs only B) Default docs path C) Mandatory | **B) Default docs path** | Strong recommendation without blocking edge cases | +| Pull observability | A) Default docker output B) Plain progress + health checks C) Silent background | **B) Plain progress + health checks** | Users need to see real progress | + +#### Interface Contracts + +**1. Local runtime selection** +```text +python tuner.py train --runtime docker ... +python tuner.py eval --runtime docker ... +python tuner.py infer --runtime docker --backend vllm ... +``` + +**2. Image profile resolution** +- Reuse `cloud_config.yaml` image-profile semantics +- Allow local commands to reference named profiles rather than raw tags + +**3. Mount contract** +- Repo root mounted read/write into container +- model cache mounted separately if needed +- outputs written back into canonical repo-relative paths + +**4. 
Logging contract** +- Pulls and runs must stream plain progress +- CLI should detect common stuck states: + - waiting for image pull + - no GPU visible in container + - image downloaded but container failed immediately + - vLLM server boot timeout + +--- + +### 💻 Code Phase +**Effort**: High + +#### Files to Modify + +| File | Changes | +|------|---------| +| `tuner/cli/parser.py` | Add local Docker flags / commands | +| `tuner/cli/router.py` | Route local Docker flows | +| `tuner/handlers/train_handler.py` | Support Docker runtime for local training | +| `tuner/handlers/eval_handler.py` | Add Docker-backed local eval path | +| `tuner/handlers/inference_handler.py` | Add managed local Docker vLLM flow | +| `Trainers/cloud/cloud_config.yaml` | Add local image/runtime config knobs | +| `.skills/fine-tuning/SKILL.md` | Update local setup guidance | +| `docs/prep/README.md` | Promote Docker-first path | +| `docs/prep/UNSLOTH_WINDOWS_INSTALLATION_GUIDE.md` | Align with new repo-standard runtime | + +#### Files to Create + +| File | Purpose | +|------|---------| +| `tuner/utils/docker_runtime.py` | Shared Docker command construction and process handling | +| `tuner/handlers/local_docker_handler.py` or equivalent helper | Centralize local Docker orchestration | +| `scripts/docker/` helpers as needed | Smoke tests, health checks, or wrapper entrypoints | + +#### Implementation Sequence +1. Add a small Docker utility layer for process execution, mounts, GPU args, and log streaming. +2. Add a local Docker smoke command for Unsloth image viability. +3. Add a local Docker smoke command for vLLM image viability. +4. Extend local train flow to run trainers through Docker. +5. Extend local eval flow to run direct Unsloth eval through Docker. +6. Extend local inference/eval flow to support managed vLLM Docker server lifecycle. +7. Update canonical skill and prep docs. +8. Sync skill trees with `.skills/scripts/sync_skill_trees.py`. 
+ +#### Critical Note: Do Not Collapse Cloud and Local Handlers Prematurely +Cloud and local both use Docker images, but they do not have the same lifecycle: + +- cloud launches remote jobs against provider SDKs +- local launches host-managed Docker processes with bind mounts + +Shared pieces should be: + +- image profile resolution +- dependency/runtime policy +- helper command fragments where appropriate + +Not shared: + +- provider launch orchestration +- secrets injection semantics +- artifact transport semantics + +#### Critical Note: Windows Path Handling +The first local Docker implementation must treat Windows path mounting and log rendering as first-class concerns. A design that only works cleanly from Linux-style repo paths is not sufficient for this repo's current local usage. + +--- + +### 🧪 Test Phase +**Effort**: Medium-High + +#### Test Scenarios + +| Scenario | Type | Priority | +|----------|------|----------| +| Image profile resolves correctly for local Docker runtime | Unit | P0 | +| Docker command builder mounts repo paths correctly on Windows | Unit | P0 | +| Local training runtime selects Unsloth image | Unit | P0 | +| Local eval runtime selects Unsloth or vLLM image correctly | Unit | P0 | +| Docker log streamer surfaces pull/run progress clearly | Unit | P0 | +| Managed local vLLM startup waits for health before eval | Unit | P0 | +| Unsloth image smoke test imports expected modules | Manual | P0 | +| vLLM image smoke test starts server with GPU visible | Manual | P0 | +| Local train dry-run writes outputs into mounted workspace | Manual | P1 | +| Local eval against adapter path works through Docker | Manual | P1 | + +#### Coverage Targets +- New Docker utility layer: 90%+ +- New local runtime branch logic: 85%+ +- Preserve current handler coverage for non-Docker paths + +#### Manual Smoke Test Goals +- `docker run --gpus all ... 
nvidia-smi` succeeds +- Unsloth image can import `unsloth`, `transformers`, and trainer dependencies +- vLLM image can boot OpenAI-compatible server locally +- repo-mounted trainer/evaluator commands can execute without path or permission failures + +--- + +## Synthesized Implementation Roadmap + +### Phase Sequence + +```text +PREPARE + ├── lock first pinned image tags + ├── validate Docker Desktop + GPU assumptions + └── define local runtime CLI surface + ↓ +ARCHITECT + ├── finalize host/container boundary + ├── finalize config reuse strategy + └── finalize logging/health contracts + ↓ +CODE + ├── docker utility layer + ├── local smoke commands + ├── local train runtime + ├── local eval runtime + ├── local vLLM runtime + └── docs + skill updates + ↓ +TEST + ├── unit coverage for command builders and config + ├── manual image smoke tests + └── end-to-end dry-run validation +``` + +### Proposed Commit Sequence +1. `feat(local-docker): add shared docker runtime utilities` +2. `feat(local-docker): add local unsloth smoke runtime` +3. `feat(local-docker): add local docker training path` +4. `feat(local-docker): add local docker eval and vllm runtime` +5. 
`docs(local-docker): make docker-first local setup the default guidance` + +--- + +## Cross-Cutting Concerns + +| Concern | Status | Notes | +|---------|--------|-------| +| Dependency stability | Strong upside | Main reason for this effort | +| Windows UX | Needs attention | Mount paths and logs must be excellent | +| GPU visibility | Needs explicit checks | Must fail fast if Docker GPU support is broken | +| Disk usage | Significant | Large images and model caches need planning | +| Startup latency | Significant | Pull time and cold boot should be made visible | +| Observability | Critical | Users need better progress than silent long-running pulls | +| Security | Moderate | Local Docker is lower risk than remote, but mounted secrets still matter | + +--- + +## Open Questions + +### Require User Decision +- [ ] Should Docker-first local runtime become the default recommendation immediately, or ship first as an opt-in beta? +- [ ] Should conda remain documented as a full fallback path or only as an escape hatch for maintainers? + +### Require Further Research +- [ ] Which exact pinned Unsloth image tag should be the first local-default training image? +- [ ] Which exact pinned vLLM image tag should be the first local-default fast-eval image? 
+- [ ] Whether Windows host mounts into Docker Desktop are performant enough for full training from `F:\Code\Toolset-Training`, or whether WSL-path mounting should become the recommended path +- [ ] Best local model cache mount strategy to avoid repeated downloads across containers +- [ ] Best CLI pattern for interactive shell vs one-shot execution in containers + +--- + +## Risk Assessment + +| Risk | Likelihood | Impact | Mitigation | +|------|------------|--------|------------| +| Docker GPU runtime works inconsistently across user machines | Medium | High | Add explicit smoke command and fail-fast diagnostics | +| Large image pulls feel hung or get interrupted | High | Medium | Force plain progress output and add guidance about expected size/time | +| Windows bind-mount performance is poor for training | Medium | High | Benchmark; recommend WSL-backed mounts if needed | +| Local Docker logic duplicates too much cloud logic | Medium | Medium | Share only image/profile resolution and helper utilities | +| vLLM local lifecycle is flaky | Medium | Medium | Keep direct Unsloth eval as fallback | +| Docs overpromise “Docker solves everything” | Medium | Medium | Frame Docker as dependency control, not total infra elimination | + +--- + +## Scope Assessment + +### In Scope +- First-class local Docker runtime for GPU-heavy repo workflows +- Local training via Unsloth container +- Local direct-adapter eval via Unsloth container +- Local fast eval / serving via vLLM container +- Canonical skill/docs update to prefer Docker-first setup +- Better pull/runtime logs and smoke diagnostics + +### Out of Scope +- Replacing cloud providers with local Docker +- Rewriting trainer logic to be Docker-native internally +- Eliminating all host Python paths +- Building and maintaining custom training images in v1 +- Full Kubernetes or multi-service local orchestration + +--- + +## Success Criteria + +- A Windows user with Docker Desktop and a supported NVIDIA GPU can run a 
repo-provided local smoke command and verify Unsloth container viability without touching conda. +- A Windows user can run local training and local eval through repo CLI commands that use Docker under the hood. +- A Windows user can start a managed local vLLM container for eval/inference without assembling manual commands. +- Canonical skills/docs recommend Docker as the default local GPU runtime. +- Long-running pulls and container startups provide enough progress detail that “slow” is visibly different from “stuck.” diff --git a/shared/utilities/bucket_artifacts.py b/shared/utilities/bucket_artifacts.py index 8c7bd6cc..84b3df8c 100644 --- a/shared/utilities/bucket_artifacts.py +++ b/shared/utilities/bucket_artifacts.py @@ -7,15 +7,63 @@ import json import shutil from pathlib import Path +from posixpath import normpath from typing import Any, Iterable, TextIO from huggingface_hub import HfFileSystem -from huggingface_hub import sync_bucket -from shared.cloud_artifacts import sync_directory_to_hf_bucket, sync_file_to_hf_bucket from shared.utilities.env import get_hf_token +def _load_sync_bucket(): + """Import sync_bucket lazily so list/read still work on older Hub builds.""" + try: + from huggingface_hub import sync_bucket as _sync_bucket + except ImportError: + return None + return _sync_bucket + + +def _strip_hf_scheme(path: str) -> str: + normalized = str(path or "").strip() + if normalized.startswith("hf://"): + return normalized[len("hf://") :] + return normalized + + +def _hf_relative_child(root: str, child: str) -> str: + root_norm = normpath(_strip_hf_scheme(root)).rstrip("/") + child_norm = normpath(_strip_hf_scheme(child)).rstrip("/") + prefix = f"{root_norm}/" + if child_norm.startswith(prefix): + return child_norm[len(prefix) :] + return Path(child_norm).name + + +def _copy_hf_tree(fs: HfFileSystem, artifact_path: str, target: Path) -> None: + """Fallback pull implementation when sync_bucket is unavailable.""" + info = fs.info(artifact_path) + 
entry_type = info.get("type", "file") + + if entry_type == "directory": + target.mkdir(parents=True, exist_ok=True) + raw_entries = fs.find(artifact_path, detail=True) + iterator = raw_entries.items() if isinstance(raw_entries, dict) else [] + for remote_path, details in iterator: + if details.get("type") == "directory": + continue + relative = _hf_relative_child(artifact_path, remote_path) + destination = target / relative + destination.parent.mkdir(parents=True, exist_ok=True) + with fs.open(remote_path, "rb") as src, open(destination, "wb") as dst: + shutil.copyfileobj(src, dst) + return + + target.parent.mkdir(parents=True, exist_ok=True) + with fs.open(artifact_path, "rb") as src, open(target, "wb") as dst: + shutil.copyfileobj(src, dst) + + def build_artifact_path(path: str, *, bucket_id: str | None = None) -> str: """Return a local path or fully-qualified HF bucket URI.""" normalized = str(path or "").strip() @@ -34,8 +82,17 @@ def _artifact_relative_path(path: str, *, bucket_id: str | None = None) -> Path: artifact_path = build_artifact_path(path, bucket_id=bucket_id) if artifact_path.startswith("hf://buckets/"): remainder = artifact_path[len("hf://buckets/") :] - parts = remainder.split("/", 1) - relative = parts[1] if len(parts) > 1 else "" + normalized_bucket = str(bucket_id or "").strip("/") + if normalized_bucket and remainder.startswith(f"{normalized_bucket}/"): + relative = remainder[len(normalized_bucket) + 1 :] + else: + parts = remainder.split("/") + if len(parts) >= 3: + relative = "/".join(parts[2:]) + elif len(parts) >= 2: + relative = "/".join(parts[1:]) + else: + relative = "" return Path(relative) local_path = Path(path) if local_path.is_absolute(): @@ -178,7 +235,12 @@ def pull_artifacts( target.parent.mkdir(parents=True, exist_ok=True) if artifact_path.startswith("hf://"): - sync_bucket(artifact_path, str(target), token=get_hf_token()) + sync_bucket = _load_sync_bucket() + if sync_bucket is not None: + sync_bucket(artifact_path, 
str(target), token=get_hf_token()) + else: + fs = HfFileSystem(token=get_hf_token()) + _copy_hf_tree(fs, artifact_path, target) return target source = Path(artifact_path) @@ -226,6 +288,8 @@ def push_artifacts( bucket_id: str, destination: str | None = None, ) -> str: + from shared.cloud_artifacts import sync_directory_to_hf_bucket, sync_file_to_hf_bucket + source = Path(path).resolve() if not source.exists(): raise FileNotFoundError(str(source)) diff --git a/shared/utilities/env.py b/shared/utilities/env.py index c8a3ea18..1d7febd6 100644 --- a/shared/utilities/env.py +++ b/shared/utilities/env.py @@ -73,6 +73,14 @@ def get_hf_token() -> Optional[str]: Returns: HuggingFace token or None """ + for key in ("HF_TOKEN", "HF_API_KEY"): + value = os.environ.get(key) + if value is None: + continue + value = value.strip() + if value: + return value + load_env_file() for key in ("HF_TOKEN", "HF_API_KEY"): value = os.environ.get(key) if value is None: diff --git a/shared/utilities/paths.py b/shared/utilities/paths.py index 7358d4fc..2deff751 100644 --- a/shared/utilities/paths.py +++ b/shared/utilities/paths.py @@ -15,6 +15,10 @@ CANONICAL_OUTPUT_DIRS = {method: f"{method}_output" for method in TRAINING_METHODS} LEGACY_OUTPUT_DIRS = {method: f"{method}_output_rtx3090" for method in TRAINING_METHODS} +LOCAL_ARTIFACT_RUN_ROOTS = ( + Path("toolset-training-artifacts") / "runs", + Path("runs"), +) def get_project_root() -> Path: @@ -135,6 +139,84 @@ def iter_training_output_dirs(method: str, repo_root: Optional[Path] = None) -> return [preferred_trainer_dir / get_canonical_output_dir_name(normalized)] +def get_local_artifact_run_roots(repo_root: Optional[Path] = None) -> list[Path]: + """ + Return repo-local roots that may contain pulled cloud training artifacts. + + These roots match the relative path preserved by ``python tuner.py bucket pull``. 
+ """ + root = repo_root or get_project_root() + candidates: list[Path] = [] + for relative_root in LOCAL_ARTIFACT_RUN_ROOTS: + candidate = root / relative_root + if candidate not in candidates: + candidates.append(candidate) + return candidates + + +def iter_imported_training_run_dirs(method: str, repo_root: Optional[Path] = None) -> list[Path]: + """ + Return imported cloud run directories for a method. + + Expected artifact layout: + <root>/runs/<provider>/<method>/<run_id>/ + where ``<root>`` is either ``toolset-training-artifacts`` or the repo root. + """ + normalized = normalize_trainer_method(method) + runs: list[Path] = [] + seen: set[Path] = set() + + for runs_root in get_local_artifact_run_roots(repo_root): + if not runs_root.exists(): + continue + + for provider_dir in runs_root.iterdir(): + if not provider_dir.is_dir(): + continue + + method_dir = provider_dir / normalized + if not method_dir.exists(): + continue + + for run_dir in method_dir.iterdir(): + if run_dir.is_dir(): + resolved = run_dir.resolve() + if resolved not in seen: + seen.add(resolved) + runs.append(run_dir) + + return runs + + +def iter_training_run_dirs(method: str, repo_root: Optional[Path] = None) -> list[Path]: + """ + Return all locally discoverable run directories for a method. + + This includes native trainer outputs and imported cloud/bucket artifacts. 
+ """ + normalized = normalize_trainer_method(method) + runs: list[Path] = [] + seen: set[Path] = set() + + for output_dir in iter_training_output_dirs(normalized, repo_root): + if not output_dir.exists(): + continue + for run_dir in output_dir.iterdir(): + if run_dir.is_dir(): + resolved = run_dir.resolve() + if resolved not in seen: + seen.add(resolved) + runs.append(run_dir) + + for run_dir in iter_imported_training_run_dirs(normalized, repo_root): + resolved = run_dir.resolve() + if resolved not in seen: + seen.add(resolved) + runs.append(run_dir) + + return runs + + def get_primary_training_output_dir(method: str, repo_root: Optional[Path] = None) -> Path: """ Get the preferred output directory for new runs of a method. diff --git a/tuner/backends/evaluation/unsloth_backend.py b/tuner/backends/evaluation/unsloth_backend.py index dee0d5c7..f7a982bb 100644 --- a/tuner/backends/evaluation/unsloth_backend.py +++ b/tuner/backends/evaluation/unsloth_backend.py @@ -18,7 +18,7 @@ from pathlib import Path from typing import List, Optional, Tuple -from shared.utilities.paths import iter_training_output_dirs +from shared.utilities.paths import iter_training_run_dirs from .base import IEvaluationBackend @@ -62,12 +62,10 @@ def list_models(self) -> List[str]: models = [] for method in ("sft", "kto", "grpo"): - for output_dir in iter_training_output_dirs(method, self._repo_root): - if not output_dir.exists(): - continue - - for adapter_config in output_dir.rglob("final_model/adapter_config.json"): - adapter_dir = adapter_config.parent + for run_dir in iter_training_run_dirs(method, self._repo_root): + adapter_dir = run_dir / "final_model" + adapter_config = adapter_dir / "adapter_config.json" + if adapter_config.exists(): models.append(str(adapter_dir.resolve())) # Sort by modification time (newest first) @@ -131,13 +129,8 @@ def get_model_info(self, adapter_path: str) -> dict: size_mb = round(adapter_file.stat().st_size / (1024 ** 2), 1) # Detect trainer type from path - 
trainer_type = "unknown" - if "sft_output" in str(path): - trainer_type = "sft" - elif "kto_output" in str(path): - trainer_type = "kto" - elif "grpo_output" in str(path): - trainer_type = "grpo" + trainer_type = self._detect_trainer_type(path) + source = self._detect_source(path) # Extract run timestamp from parent directory timestamp = path.parent.name if path.parent else "unknown" @@ -153,7 +146,31 @@ def get_model_info(self, adapter_path: str) -> dict: "base_model_short": base_model_short, "size_mb": size_mb, "trainer_type": trainer_type, + "source": source, "timestamp": timestamp, "r": config.get("r"), # LoRA rank "lora_alpha": config.get("lora_alpha"), } + + @staticmethod + def _detect_trainer_type(path: Path) -> str: + parts = [part.lower() for part in path.parts] + markers = { + "sft": {"sft_output", "sft_output_rtx3090", "rtx3090_sft", "sft"}, + "kto": {"kto_output", "kto_output_rtx3090", "rtx3090_kto", "kto"}, + "grpo": {"grpo_output", "grpo_output_rtx3090", "rtx3090_grpo", "grpo"}, + } + for trainer_type, candidates in markers.items(): + if any(candidate in parts for candidate in candidates): + if trainer_type in {"sft", "kto", "grpo"}: + return trainer_type + return "unknown" + + @staticmethod + def _detect_source(path: Path) -> str: + parts = {part.lower() for part in path.parts} + if "toolset-training-artifacts" in parts: + return "bucket_pull" + if "runs" in parts and "trainers" not in parts: + return "cloud_artifact" + return "local_training" diff --git a/tuner/cli/main.py b/tuner/cli/main.py index 13c22adc..3deefcaa 100644 --- a/tuner/cli/main.py +++ b/tuner/cli/main.py @@ -7,12 +7,20 @@ """ import sys +import io from pathlib import Path from tuner.utils import load_env_file from .parser import create_parser from .router import route_command +if sys.platform == "win32": + if hasattr(sys.stdout, "buffer"): + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8") + if hasattr(sys.stderr, "buffer"): + sys.stderr = 
io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8") + + def main(): """ Main CLI entry point. diff --git a/tuner/cli/parser.py b/tuner/cli/parser.py index 5aff3805..90f828d9 100644 --- a/tuner/cli/parser.py +++ b/tuner/cli/parser.py @@ -67,6 +67,7 @@ def create_parser() -> argparse.ArgumentParser: Commands: (none) Interactive menu train Training workflow (SFT, KTO, GRPO) + docker Local Docker runtime helper (status, bootstrap, pull, smoke, build) cloud Cloud training (HF Jobs, Modal, RunPod) cloud-run Config-driven HF cloud job cloud-jobs Inspect or manage live HF Jobs @@ -109,6 +110,11 @@ def create_parser() -> argparse.ArgumentParser: Examples: python tuner.py # Interactive mode python tuner.py train # Go directly to training + python tuner.py docker status + python tuner.py docker bootstrap --docker-target all + python tuner.py docker build --docker-target bucket + python tuner.py docker pull --docker-target unsloth + python tuner.py docker smoke --docker-target vllm python tuner.py cloud # Cloud training submenu python tuner.py eval # Go directly to evaluation python tuner.py synthchat # Generate or improve data @@ -138,7 +144,7 @@ def create_parser() -> argparse.ArgumentParser: parser.add_argument( "command", nargs="?", - choices=["train", "cloud", "cloud-run", "cloud-jobs", "plan-hardware", "cloud-pipeline", "cloud-eval", "cloud-gym", "cloud-inspect", "bucket", "run-experiment", "analyze-experiment", "eval", "synthchat", "modelops", "ml", "flywheel", "experiment-loop", "surgery", "status", "doctor", "list", "list-runs", "compute-losses", "compare-runs", "judge-sample", "create-experiment", "cloud-compare", "download-experiment"], + choices=["train", "docker", "cloud", "cloud-run", "cloud-jobs", "plan-hardware", "cloud-pipeline", "cloud-eval", "cloud-gym", "cloud-inspect", "bucket", "run-experiment", "analyze-experiment", "eval", "synthchat", "modelops", "ml", "flywheel", "experiment-loop", "surgery", "status", "doctor", "list", "list-runs", 
"compute-losses", "compare-runs", "judge-sample", "create-experiment", "cloud-compare", "download-experiment"], help="Command to run (optional, defaults to interactive menu)" ) @@ -150,7 +156,7 @@ def create_parser() -> argparse.ArgumentParser: "subcommand", nargs="?", default=None, - help="Sub-command (e.g., 'datasets' for list, 'train' for ml)" + help="Sub-command (e.g., 'datasets' for list, 'train' for ml, 'bootstrap' for docker)" ) # Global flags @@ -166,6 +172,12 @@ def create_parser() -> argparse.ArgumentParser: dest="auto_confirm", help="Skip confirmation prompts for non-interactive command execution", ) + parser.add_argument( + "--runtime", + choices=["native", "docker"], + default="native", + help="Local runtime for train/eval flows. Use 'docker' to run on Docker Desktop instead of the local conda environment." + ) # Doctor-specific flags parser.add_argument( @@ -207,6 +219,24 @@ def create_parser() -> argparse.ArgumentParser: help="Path to flywheel config YAML (flywheel commands only)" ) + # Local Docker runtime flags + parser.add_argument( + "--docker-target", + choices=["unsloth", "vllm", "bucket", "all"], + dest="docker_target", + help="Docker runtime target for 'docker' command (unsloth, vllm, bucket, or all)" + ) + parser.add_argument( + "--docker-image", + dest="docker_image", + help="Explicit Docker image override for local Docker flows ('docker', 'train --runtime docker', 'eval --runtime docker')" + ) + parser.add_argument( + "--docker-profile", + dest="docker_profile", + help="Named Docker image profile from Trainers/cloud/cloud_config.yaml for local Docker flows" + ) + # Surgery-specific flags parser.add_argument( "--surgery-config", diff --git a/tuner/cli/router.py b/tuner/cli/router.py index c45b0a57..ad237166 100644 --- a/tuner/cli/router.py +++ b/tuner/cli/router.py @@ -68,21 +68,56 @@ def route_command(args: Namespace) -> int: """ # Check for JSON mode - affects error output json_mode = getattr(args, 'json', False) + command = getattr(args, 
'command', None) + + # Special-case Docker helper so unrelated cloud import failures do not block it. + if command == 'docker': + try: + from tuner.handlers.docker_handler import DockerHandler + except ImportError as e: + error_msg = f"Handlers not yet implemented: {e}" + if json_mode: + output = { + "success": False, + "error": { + "message": error_msg, + "code": "HANDLER_IMPORT_ERROR", + }, + "timestamp": datetime.now().isoformat() + } + print(json.dumps(output, indent=2)) + else: + print(f"Error: {error_msg}") + print("This is expected during migration. Please use tuner_legacy.py instead.") + return 1 + handler = DockerHandler(args=args) + return handler.handle() + + if command == 'bucket': + try: + from tuner.handlers.bucket_handler import BucketHandler + except ImportError as e: + error_msg = f"Handlers not yet implemented: {e}" + if json_mode: + output = { + "success": False, + "error": { + "message": error_msg, + "code": "HANDLER_IMPORT_ERROR", + }, + "timestamp": datetime.now().isoformat() + } + print(json.dumps(output, indent=2)) + else: + print(f"Error: {error_msg}") + return 1 + handler = BucketHandler(args=args) + return handler.handle() - # Import handlers (deferred to avoid circular imports) + # Import local handlers first so cloud dependency drift does not block local commands. 
try: from tuner.handlers.train_handler import TrainHandler from tuner.handlers.eval_handler import EvalHandler - from tuner.handlers.cloud_pipeline_handler import CloudPipelineHandler - from tuner.handlers.hardware_plan_handler import HardwarePlanHandler - from tuner.handlers.cloud_eval_handler import CloudEvalHandler - from tuner.handlers.cloud_inspect_handler import CloudInspectHandler - from tuner.handlers.cloud_jobs_handler import CloudJobsHandler - from tuner.handlers.cloud_gym_handler import CloudGymHandler - from tuner.handlers.cloud_run_handler import CloudRunHandler - from tuner.handlers.bucket_handler import BucketHandler - from tuner.handlers.experiment_handler import ExperimentHandler - from tuner.handlers.experiment_analysis_handler import ExperimentAnalysisHandler from tuner.handlers.synthchat_handler import SynthChatHandler from tuner.handlers.modelops_handler import ModelOpsHandler from tuner.handlers.ml_handler import MLHandler @@ -93,7 +128,6 @@ def route_command(args: Namespace) -> int: from tuner.handlers.flywheel_handler import FlywheelHandler from tuner.handlers.surgery_handler import SurgeryHandler except ImportError as e: - # Graceful degradation if handlers not yet implemented error_msg = f"Handlers not yet implemented: {e}" if json_mode: output = { @@ -110,16 +144,13 @@ def route_command(args: Namespace) -> int: print("This is expected during migration. 
Please use tuner_legacy.py instead.") return 1 - # Get command from args - command = getattr(args, 'command', None) - # JSON mode without command is an error (interactive menu needs input) # Exception: status, doctor, and list commands work in JSON mode if json_mode and not command: output = { "success": False, "error": { - "message": "JSON mode requires a command (train, cloud, cloud-run, cloud-jobs, plan-hardware, cloud-pipeline, cloud-eval, cloud-gym, cloud-inspect, bucket, run-experiment, analyze-experiment, eval, synthchat, modelops, ml, flywheel, surgery, status, doctor, list)", + "message": "JSON mode requires a command (train, cloud, cloud-run, cloud-jobs, plan-hardware, cloud-pipeline, cloud-eval, cloud-gym, cloud-inspect, bucket, run-experiment, analyze-experiment, eval, synthchat, modelops, ml, flywheel, docker, surgery, status, doctor, list)", "code": "COMMAND_REQUIRED", }, "timestamp": datetime.now().isoformat() @@ -151,27 +182,22 @@ def route_command(args: Namespace) -> int: # Special handling for ml command (has subcommand and --config) if command == 'ml': ml_sub = getattr(args, 'subcommand', None) - # Map the generic subcommand to ml_subcommand for the handler if args is not None: args.ml_subcommand = ml_sub handler = MLHandler(args=args) return handler.handle() - # Special handling for flywheel command (has subcommand) if command == 'flywheel': handler = FlywheelHandler(args=args) return handler.handle() - # Autonomous experiment loop if command == 'experiment-loop': return _handle_experiment_loop(args, json_mode) - # Surgery command if command == 'surgery': handler = SurgeryHandler(args=args) return handler.handle() - # Experiment pipeline if command == 'compare-runs': import subprocess import sys @@ -197,43 +223,102 @@ def route_command(args: Namespace) -> int: print(f"Created experiment: {exp.experiment_id}") return 0 - # Import cloud handler (conditional - may not have deps) - try: - from tuner.handlers.cloud_train_handler import 
CloudTrainHandler - except ImportError: - CloudTrainHandler = None - - # Map commands to handlers - handlers = { + local_handlers = { 'train': TrainHandler, - 'cloud-pipeline': CloudPipelineHandler, - 'cloud-run': CloudRunHandler, - 'cloud-jobs': CloudJobsHandler, - 'plan-hardware': HardwarePlanHandler, 'eval': EvalHandler, - 'cloud-eval': CloudEvalHandler, - 'cloud-gym': CloudGymHandler, - 'cloud-inspect': CloudInspectHandler, - 'bucket': BucketHandler, - 'run-experiment': ExperimentHandler, - 'analyze-experiment': ExperimentAnalysisHandler, 'synthchat': SynthChatHandler, 'modelops': ModelOpsHandler, 'ml': MLHandler, } - if CloudTrainHandler is not None: - handlers['cloud'] = CloudTrainHandler - # Execute handler with args - if command and command in handlers: - handler_class = handlers[command] + if command and command in local_handlers: + handler_class = local_handlers[command] handler = handler_class(args=args) return handler.handle() - else: - # No command = interactive menu - handler = MainMenuHandler(args=args) + + cloud_commands = { + 'cloud', + 'cloud-pipeline', + 'cloud-run', + 'cloud-jobs', + 'plan-hardware', + 'cloud-eval', + 'cloud-gym', + 'cloud-inspect', + 'run-experiment', + 'analyze-experiment', + } + + if command in cloud_commands: + try: + from tuner.handlers.cloud_pipeline_handler import CloudPipelineHandler + from tuner.handlers.hardware_plan_handler import HardwarePlanHandler + from tuner.handlers.cloud_eval_handler import CloudEvalHandler + from tuner.handlers.cloud_inspect_handler import CloudInspectHandler + from tuner.handlers.cloud_jobs_handler import CloudJobsHandler + from tuner.handlers.cloud_gym_handler import CloudGymHandler + from tuner.handlers.cloud_run_handler import CloudRunHandler + from tuner.handlers.experiment_handler import ExperimentHandler + from tuner.handlers.experiment_analysis_handler import ExperimentAnalysisHandler + + try: + from tuner.handlers.cloud_train_handler import CloudTrainHandler + except ImportError: + 
CloudTrainHandler = None + except ImportError as e: + error_msg = f"Cloud handlers unavailable: {e}" + if json_mode: + output = { + "success": False, + "error": { + "message": error_msg, + "code": "HANDLER_IMPORT_ERROR", + }, + "timestamp": datetime.now().isoformat() + } + print(json.dumps(output, indent=2)) + else: + print(f"Error: {error_msg}") + return 1 + + cloud_handlers = { + 'cloud-pipeline': CloudPipelineHandler, + 'cloud-run': CloudRunHandler, + 'cloud-jobs': CloudJobsHandler, + 'plan-hardware': HardwarePlanHandler, + 'cloud-eval': CloudEvalHandler, + 'cloud-gym': CloudGymHandler, + 'cloud-inspect': CloudInspectHandler, + 'run-experiment': ExperimentHandler, + 'analyze-experiment': ExperimentAnalysisHandler, + } + if CloudTrainHandler is not None: + cloud_handlers['cloud'] = CloudTrainHandler + + if command == 'cloud' and CloudTrainHandler is None: + error_msg = "Cloud training handler unavailable in the current environment." + if json_mode: + output = { + "success": False, + "error": { + "message": error_msg, + "code": "HANDLER_IMPORT_ERROR", + }, + "timestamp": datetime.now().isoformat() + } + print(json.dumps(output, indent=2)) + else: + print(f"Error: {error_msg}") + return 1 + + handler_class = cloud_handlers[command] + handler = handler_class(args=args) return handler.handle() + # Execute handler with args + handler = MainMenuHandler(args=args) + return handler.handle() + def _handle_experiment_loop(args: Namespace, json_mode: bool) -> int: """Run the autonomous experiment loop.""" diff --git a/tuner/cloud/hf_jobs.py b/tuner/cloud/hf_jobs.py index b9322287..15e06b0e 100644 --- a/tuner/cloud/hf_jobs.py +++ b/tuner/cloud/hf_jobs.py @@ -8,11 +8,19 @@ from dataclasses import dataclass, field from typing import Any, Dict, Iterable, List, Optional -from shared.cloud_artifacts import normalize_hf_bucket_id from shared.utilities.env import get_env_var, get_hf_token from tuner.core.exceptions import CloudProviderError +def normalize_hf_bucket_id(bucket_id: 
str) -> str: + """Normalize bucket identifiers to the canonical namespace/name form.""" + normalized = str(bucket_id or "").strip() + for prefix in ("hf://buckets/", "buckets/"): + if normalized.startswith(prefix): + normalized = normalized[len(prefix):] + return normalized.strip("/") + + @dataclass(frozen=True) class RepoCheckoutSpec: """Exact repository source needed to reproduce a cloud job.""" diff --git a/tuner/discovery/training_runs.py b/tuner/discovery/training_runs.py index ed11ae02..e4671ab2 100644 --- a/tuner/discovery/training_runs.py +++ b/tuner/discovery/training_runs.py @@ -16,7 +16,7 @@ from pathlib import Path from typing import List -from shared.utilities.paths import iter_training_output_dirs +from shared.utilities.paths import iter_training_run_dirs class TrainingRunDiscovery: @@ -100,19 +100,12 @@ def discover(self, trainer_type: str, limit: int = 10) -> List[Path]: """ runs = [] - for output_dir in iter_training_output_dirs(trainer_type, self.repo_root): - if not output_dir.exists(): - continue + for d in iter_training_run_dirs(trainer_type, self.repo_root): + has_final = (d / "final_model").exists() + has_checkpoints = (d / "checkpoints").exists() and any((d / "checkpoints").iterdir()) - for d in sorted(output_dir.iterdir(), key=lambda p: p.stat().st_mtime, reverse=True): - if not d.is_dir(): - continue - - has_final = (d / "final_model").exists() - has_checkpoints = (d / "checkpoints").exists() and any((d / "checkpoints").iterdir()) - - if has_final or has_checkpoints: - runs.append(d) + if has_final or has_checkpoints: + runs.append(d) runs.sort(key=lambda p: p.stat().st_mtime, reverse=True) if limit is not None: diff --git a/tuner/handlers/__init__.py b/tuner/handlers/__init__.py index d5d809dc..f87570a9 100644 --- a/tuner/handlers/__init__.py +++ b/tuner/handlers/__init__.py @@ -1,38 +1,33 @@ """ -Command handlers for the Synaptic Tuner CLI. +Lazy exports for Synaptic Tuner command handlers. 
-This package contains handler implementations for different CLI commands: -- TrainHandler: Training workflow orchestration (STUB - to be implemented) -- UploadHandler: Model upload workflow (STUB - to be implemented) -- EvalHandler: Evaluation workflow -- PipelineHandler: Full pipeline (train -> upload -> eval) -- MainMenuHandler: Interactive main menu -- SynthChatHandler: Synthetic data generation and improvement -- StatusHandler: System status overview for AI assistants -- DoctorHandler: System diagnostics with recommendations and auto-fix - -Each handler implements the IHandler interface and can be registered -with the router for command dispatching. +Avoid eager imports here. Some handlers pull in heavyweight optional +dependencies, and importing the package itself should not force every +runtime path to load them. """ -from tuner.handlers.train_handler import TrainHandler -from tuner.handlers.upload_handler import UploadHandler -from tuner.handlers.eval_handler import EvalHandler -from tuner.handlers.pipeline_handler import PipelineHandler -from tuner.handlers.main_menu_handler import MainMenuHandler -from tuner.handlers.synthchat_handler import SynthChatHandler -from tuner.handlers.status_handler import StatusHandler -from tuner.handlers.doctor_handler import DoctorHandler -from tuner.handlers.ml_handler import MLHandler +from __future__ import annotations + +import importlib + +_HANDLER_MODULES = { + "TrainHandler": "tuner.handlers.train_handler", + "UploadHandler": "tuner.handlers.upload_handler", + "EvalHandler": "tuner.handlers.eval_handler", + "PipelineHandler": "tuner.handlers.pipeline_handler", + "MainMenuHandler": "tuner.handlers.main_menu_handler", + "SynthChatHandler": "tuner.handlers.synthchat_handler", + "StatusHandler": "tuner.handlers.status_handler", + "DoctorHandler": "tuner.handlers.doctor_handler", + "MLHandler": "tuner.handlers.ml_handler", +} + +__all__ = list(_HANDLER_MODULES) + -__all__ = [ - "TrainHandler", - "UploadHandler", - 
"EvalHandler", - "PipelineHandler", - "MainMenuHandler", - "SynthChatHandler", - "StatusHandler", - "DoctorHandler", - "MLHandler", -] +def __getattr__(name: str): + module_name = _HANDLER_MODULES.get(name) + if module_name is None: + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + module = importlib.import_module(module_name) + return getattr(module, name) diff --git a/tuner/handlers/base.py b/tuner/handlers/base.py index a20d38b7..6f8f1733 100644 --- a/tuner/handlers/base.py +++ b/tuner/handlers/base.py @@ -230,7 +230,7 @@ def output_error( try: from tuner.ui import print_error print_error(message) - except ImportError: + except (ImportError, UnicodeEncodeError): print(f"Error: {message}", file=sys.stderr) def output_info(self, message: str, data: Optional[Dict[str, Any]] = None) -> None: @@ -260,7 +260,7 @@ def output_info(self, message: str, data: Optional[Dict[str, Any]] = None) -> No try: from tuner.ui import print_info print_info(message) - except ImportError: + except (ImportError, UnicodeEncodeError): print(f"Info: {message}") def output_list( diff --git a/tuner/handlers/bucket_handler.py b/tuner/handlers/bucket_handler.py index c467eeb3..df937828 100644 --- a/tuner/handlers/bucket_handler.py +++ b/tuner/handlers/bucket_handler.py @@ -5,6 +5,8 @@ from __future__ import annotations import json +import os +import subprocess from argparse import Namespace from pathlib import Path from typing import Any, Optional @@ -15,6 +17,16 @@ from tuner.cloud import load_huggingface_hub, resolve_hf_bucket_id from tuner.core.exceptions import CloudProviderError from tuner.handlers.base import BaseHandler +from tuner.utils.docker_runtime import ( + BUCKET_HELPER_ENV_MARKER, + BUCKET_HELPER_IMAGE, + CONTAINER_REPO_ROOT, + bucket_helper_image_present, + build_bucket_helper_image_command, + build_bucket_helper_run_command, + container_repo_path, + ensure_docker_cli, +) class BucketHandler(BaseHandler): @@ -30,6 +42,173 @@ def 
can_handle_direct_mode(self) -> bool: def _cloud_config_path(self) -> Path: return self.repo_root / "Trainers" / "cloud" / "cloud_config.yaml" + def _native_bucket_support(self) -> tuple[bool, str]: + """Check whether the current Python has the Buckets APIs we rely on.""" + try: + import huggingface_hub + except ImportError as exc: + return False, str(exc) + + version = getattr(huggingface_hub, "__version__", "unknown") + missing = [ + name for name in ("HfFileSystem", "create_bucket") + if not hasattr(huggingface_hub, name) + ] + if missing: + return False, f"huggingface_hub {version} does not support required APIs: {', '.join(missing)}" + return True, "" + + def _bucket_command_is_remote(self, subcommand: str) -> bool: + """Return True when the selected bucket command needs HF Buckets support.""" + if subcommand in {"push", "pull"}: + return True + if getattr(self.args, "bucket", None): + return True + + path = str(getattr(self.args, "path", "") or "").strip() + if not path: + return True + if path.startswith("hf://"): + return True + + local_path = Path(path) + if local_path.exists(): + return False + if path.startswith("./") or path.startswith("../") or local_path.is_absolute(): + return False + return True + + def _path_for_helper(self, value: Optional[str], *, local_default_to_repo: bool = False) -> Optional[str]: + """Translate a repo-local host path into the helper container mount path.""" + raw = str(value or "").strip() + if not raw: + if local_default_to_repo: + return str(CONTAINER_REPO_ROOT) + return None + if raw.startswith("hf://"): + return raw + + candidate = Path(raw) + treat_as_local = ( + candidate.exists() + or candidate.is_absolute() + or raw.startswith("./") + or raw.startswith("../") + or local_default_to_repo + ) + if not treat_as_local: + return raw + + resolved = (candidate if candidate.is_absolute() else (self.repo_root / candidate)).resolve() + try: + return container_repo_path(resolved, self.repo_root) + except ValueError as exc: + 
raise CloudProviderError( + f"Bucket helper can only access paths inside the repo workspace: {resolved}" + ) from exc + + def _helper_cli_args(self, subcommand: str) -> list[str]: + """Build the CLI argument list to forward into the helper container.""" + args = ["bucket", subcommand] + if self.json_mode: + args.append("--json") + + bucket_id = getattr(self.args, "bucket", None) + if bucket_id: + args.extend(["--bucket", str(bucket_id)]) + + path_value = getattr(self.args, "path", None) + if path_value: + if subcommand == "push": + translated = self._path_for_helper(path_value) + else: + translated = self._path_for_helper(path_value, local_default_to_repo=False) + args.extend(["--path", translated]) + + dest_value = getattr(self.args, "dest", None) + if subcommand == "pull": + translated_dest = self._path_for_helper(dest_value, local_default_to_repo=True) + args.extend(["--dest", translated_dest]) + elif dest_value: + args.extend(["--dest", str(dest_value)]) + + eval_path = getattr(self.args, "eval_path", None) + if eval_path: + args.extend(["--eval-path", str(eval_path)]) + + loss_path = getattr(self.args, "loss_path", None) + if loss_path: + args.extend(["--loss-path", str(loss_path)]) + + for flag_name, cli_flag in (("tail", "--tail"), ("limit", "--limit")): + value = getattr(self.args, flag_name, None) + if value is not None: + args.extend([cli_flag, str(value)]) + + for flag_name, cli_flag in ( + ("jsonl_latest", "--jsonl-latest"), + ("pretty", "--pretty"), + ("recursive", "--recursive"), + ("files_only", "--files-only"), + ("dirs_only", "--dirs-only"), + ): + if bool(getattr(self.args, flag_name, False)): + args.append(cli_flag) + + return args + + def _render_helper_output(self, output: str) -> str: + """Rewrite helper container repo paths back to the host workspace path.""" + if not output: + return output + return output.replace(str(CONTAINER_REPO_ROOT), str(self.repo_root).replace("\\", "/")) + + def _delegate_to_docker_helper(self, subcommand: str) -> 
int: + """Run the bucket command inside the dedicated Docker helper image.""" + docker_ok, docker_error = ensure_docker_cli() + if not docker_ok: + raise CloudProviderError( + f"{docker_error} The current Python also lacks required HF Buckets APIs." + ) + + if not bucket_helper_image_present(self.repo_root): + if not self.json_mode: + print(f"Building Docker bucket helper image: {BUCKET_HELPER_IMAGE}") + build = subprocess.run( + build_bucket_helper_image_command(self.repo_root), + cwd=str(self.repo_root), + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + ) + if build.returncode != 0: + raise CloudProviderError( + build.stderr.strip() or build.stdout.strip() or "Failed to build Docker bucket helper image." + ) + + helper = subprocess.run( + build_bucket_helper_run_command( + self.repo_root, + helper_args=self._helper_cli_args(subcommand), + ), + cwd=str(self.repo_root), + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + ) + rendered_stdout = self._render_helper_output(helper.stdout or "") + rendered_stderr = self._render_helper_output(helper.stderr or "") + + if helper.returncode != 0: + message = rendered_stderr.strip() or rendered_stdout.strip() or "Docker bucket helper failed." 
+ raise CloudProviderError(message) + + if rendered_stdout.strip(): + print(rendered_stdout, end="" if rendered_stdout.endswith("\n") else "\n") + return 0 + def _default_bucket_id(self) -> Optional[str]: settings = load_cloud_config(self._cloud_config_path()).get("hf_jobs", {}) configured = str(settings.get("artifact_identifier", "")).strip() @@ -352,6 +531,14 @@ def _handle_push(self) -> int: def handle(self) -> int: try: subcommand = str(getattr(self.args, "subcommand", "") or "").strip().lower() + if ( + subcommand in {"read", "list", "pull", "push", "analyze"} + and os.getenv(BUCKET_HELPER_ENV_MARKER) != "1" + and self._bucket_command_is_remote(subcommand) + ): + native_ok, _reason = self._native_bucket_support() + if not native_ok: + return self._delegate_to_docker_helper(subcommand) if subcommand == "read": return self._handle_read() if subcommand == "list": @@ -364,5 +551,8 @@ def handle(self) -> int: return self._handle_analyze() raise CloudProviderError("Bucket command requires subcommand 'read', 'list', 'pull', 'push', or 'analyze'.") except Exception as exc: - self.output_error(str(exc), code="BUCKET_ERROR") + try: + self.output_error(str(exc), code="BUCKET_ERROR") + except UnicodeEncodeError: + print(f"Error: {exc}") return 1 diff --git a/tuner/handlers/docker_handler.py b/tuner/handlers/docker_handler.py new file mode 100644 index 00000000..4b4512f9 --- /dev/null +++ b/tuner/handlers/docker_handler.py @@ -0,0 +1,540 @@ +""" +Local Docker runtime helper for Synaptic Tuner. 
+ +Location: tuner/handlers/docker_handler.py +Purpose: Validate and manage local Docker-backed model runtimes +Used by: Router when 'docker' command is invoked +""" + +from __future__ import annotations + +import json +import shutil +import subprocess +from argparse import Namespace +from pathlib import Path +from typing import Optional + +from shared.utilities.env import get_hf_token +from tuner.backends.training.cloud.base_cloud import resolve_cloud_image +from tuner.core.exceptions import CloudProviderError +from tuner.handlers.base import BaseHandler +from tuner.utils.docker_runtime import ( + BUCKET_HELPER_IMAGE, + bucket_helper_image_present, + build_bucket_helper_image_command, +) + + +class DockerHandler(BaseHandler): + """Handler for ``tuner docker`` subcommands.""" + + _SUBCOMMANDS = { + "build": "_handle_build", + "bootstrap": "_handle_bootstrap", + "status": "_handle_status", + "pull": "_handle_pull", + "smoke": "_handle_smoke", + } + + def __init__(self, args: Optional[Namespace] = None): + super().__init__(args=args) + + @property + def name(self) -> str: + return "docker" + + def can_handle_direct_mode(self) -> bool: + return True + + @property + def cloud_config_path(self) -> Path: + return self.repo_root / "Trainers" / "cloud" / "cloud_config.yaml" + + def handle(self) -> int: + action = getattr(self.args, "subcommand", None) if self.args else None + if not action: + action = "status" + + method_name = self._SUBCOMMANDS.get(action) + if not method_name: + self.output_error(f"Unknown docker subcommand: {action}", code="UNKNOWN_SUBCOMMAND") + return 1 + return getattr(self, method_name)() + + def _ensure_docker_available(self) -> bool: + if shutil.which("docker") is None: + self.output_error( + "Docker CLI not found. 
Install Docker Desktop first.", + code="DOCKER_NOT_FOUND", + ) + return False + return True + + def _run(self, cmd: list[str], *, stream: bool = False, log_path: Optional[Path] = None) -> tuple[int, str]: + if not stream: + result = subprocess.run( + cmd, + cwd=str(self.repo_root), + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + ) + output = (result.stdout or "") + (result.stderr or "") + return result.returncode, output.strip() + + log_handle = None + output_lines: list[str] = [] + if log_path is not None: + log_path.parent.mkdir(parents=True, exist_ok=True) + log_handle = log_path.open("w", encoding="utf-8") + + try: + process = subprocess.Popen( + cmd, + cwd=str(self.repo_root), + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + encoding="utf-8", + errors="replace", + bufsize=1, + ) + assert process.stdout is not None + for raw_line in process.stdout: + line = raw_line.rstrip() + output_lines.append(line) + if not self.json_mode: + print(line) + if log_handle is not None: + log_handle.write(raw_line) + process.wait() + return process.returncode, "\n".join(output_lines).strip() + finally: + if log_handle is not None: + log_handle.close() + + def _docker_info(self) -> tuple[int, str]: + return self._run(["docker", "info", "--format", "{{.ServerVersion}}"]) + + def _resolve_target_images(self, *, target_override: Optional[str] = None) -> list[tuple[str, str, Optional[str]]]: + target = target_override or getattr(self.args, "docker_target", None) or "unsloth" + if target == "all": + targets = ["unsloth", "vllm", "bucket"] + else: + targets = [target] + + explicit_image = getattr(self.args, "docker_image", None) + requested_profile = getattr(self.args, "docker_profile", None) + resolved: list[tuple[str, str, Optional[str]]] = [] + + for runtime in targets: + try: + if explicit_image and len(targets) > 1: + image, profile = explicit_image, None + elif runtime == "bucket": + image, profile = BUCKET_HELPER_IMAGE, 
"local_build" + elif runtime == "unsloth": + image, profile = resolve_cloud_image( + self.cloud_config_path, + explicit_image=explicit_image, + requested_profile=requested_profile, + default_profile="latest_unsloth", + fallback_image=None, + profile_section="eval_image_profiles", + ) + else: + image, profile = resolve_cloud_image( + self.cloud_config_path, + explicit_image=explicit_image, + requested_profile=requested_profile, + default_profile="fast_vllm", + fallback_image=None, + profile_section="eval_image_profiles", + ) + except CloudProviderError as exc: + raise RuntimeError(str(exc)) from exc + resolved.append((runtime, image, profile)) + return resolved + + def _inspect_target_image(self, runtime: str, image: str, profile: Optional[str]) -> dict: + if runtime == "bucket": + present = bucket_helper_image_present(self.repo_root, image=image) + image_output = image if present else "" + else: + image_code, image_output = self._run( + ["docker", "images", "--format", "{{.Repository}}:{{.Tag}}|{{.ID}}|{{.Size}}", image] + ) + present = image_code == 0 and bool(image_output) + + return { + "runtime": runtime, + "image": image, + "profile": profile, + "present": present, + "local_images": image_output.splitlines() if image_output else [], + } + + def _build_bucket_helper_image(self) -> tuple[int, str]: + log_path = self.repo_root / "logs" / "bucket-docker-build.log" + if not self.json_mode: + print(f"Building bucket helper image: {BUCKET_HELPER_IMAGE}") + return self._run( + build_bucket_helper_image_command(self.repo_root), + stream=True, + log_path=log_path, + ) + + def _pull_target(self, runtime: str, image: str, profile: Optional[str]) -> tuple[int, dict]: + if runtime == "bucket": + code, output = self._build_bucket_helper_image() + return code, { + "runtime": runtime, + "image": image, + "profile": profile, + "success": code == 0, + "log_path": str(self.repo_root / "logs" / "bucket-docker-build.log"), + "tail": output.splitlines()[-10:] if output else [], + } 
+ + log_path = self.repo_root / "logs" / f"{runtime}-docker-pull.log" + self.output_info(f"Pulling {runtime} image: {image}") + code, output = self._run(["docker", "pull", image], stream=True, log_path=log_path) + return code, { + "runtime": runtime, + "image": image, + "profile": profile, + "success": code == 0, + "log_path": str(log_path), + "tail": output.splitlines()[-10:] if output else [], + } + + def _handle_build(self) -> int: + if not self._ensure_docker_available(): + return 1 + + target = getattr(self.args, "docker_target", None) or "bucket" + if target not in {"bucket", "all"}: + self.output_error("Docker build currently supports only the local bucket helper image.", code="DOCKER_BUILD_UNSUPPORTED") + return 1 + + code, output = self._build_bucket_helper_image() + if code != 0: + self.output_error("docker build failed for bucket helper image", code="DOCKER_BUILD_FAILED") + return 1 + + payload = { + "runtime": "bucket", + "image": BUCKET_HELPER_IMAGE, + "present": bucket_helper_image_present(self.repo_root), + "tail": output.splitlines()[-10:] if output else [], + } + self.output(payload, f"Bucket helper image ready: {BUCKET_HELPER_IMAGE}") + return 0 + + def _handle_status(self) -> int: + if not self._ensure_docker_available(): + return 1 + + docker_version_code, docker_version = self._run(["docker", "--version"]) + info_code, server_version = self._docker_info() + if docker_version_code != 0: + self.output_error(docker_version or "Failed to run docker --version", code="DOCKER_VERSION_ERROR") + return 1 + + entries = [] + try: + for runtime, image, profile in self._resolve_target_images(): + entries.append(self._inspect_target_image(runtime, image, profile)) + except RuntimeError as exc: + self.output_error(str(exc), code="DOCKER_IMAGE_RESOLUTION_ERROR") + return 1 + + payload = { + "docker_cli": docker_version, + "docker_engine": server_version if info_code == 0 else "unavailable", + "targets": entries, + } + human = ( + f"Docker CLI: 
{docker_version}\n" + f"Docker Engine: {server_version if info_code == 0 else 'unavailable'}\n" + + "\n".join( + f"{item['runtime']}: {item['image']} ({'present' if item['present'] else 'missing'})" + for item in entries + ) + ) + self.output(payload, human) + return 0 if info_code == 0 else 1 + + def _handle_pull(self) -> int: + if not self._ensure_docker_available(): + return 1 + + try: + targets = self._resolve_target_images() + except RuntimeError as exc: + self.output_error(str(exc), code="DOCKER_IMAGE_RESOLUTION_ERROR") + return 1 + + results = [] + for runtime, image, profile in targets: + code, result = self._pull_target(runtime, image, profile) + results.append(result) + if code != 0: + error_code = "DOCKER_BUILD_FAILED" if runtime == "bucket" else "DOCKER_PULL_FAILED" + message = ( + "docker build failed for bucket helper image" + if runtime == "bucket" + else f"docker pull failed for {image}" + ) + self.output_error(message, code=error_code) + if self.json_mode: + self.output({"results": results}, success=False) + return 1 + + self.output({"results": results}, "Docker image pull complete.") + return 0 + + def _unsloth_smoke_command(self, image: str) -> list[str]: + repo_mount = str(self.repo_root) + smoke_code = ( + "import os, sys, torch; " + "print('cuda', torch.cuda.is_available()); " + "print('torch', torch.__version__); " + "from unsloth import FastLanguageModel; " + "print('unsloth-ok'); " + "print('repo-mounted', os.path.exists('/workspace/repo')); " + "sys.path.insert(0, '/workspace/repo'); " + "import tuner; " + "print('tuner-ok')" + ) + return [ + "docker", "run", "--rm", "--gpus", "all", + "-v", f"{repo_mount}:/workspace/repo", + "--entrypoint", "python", + image, + "-c", smoke_code, + ] + + def _vllm_smoke_command(self, image: str) -> list[str]: + smoke_code = ( + "import torch, vllm; " + "print('cuda', torch.cuda.is_available()); " + "print('torch', torch.__version__); " + "print('vllm', vllm.__version__)" + ) + return [ + "docker", "run", 
"--rm", "--gpus", "all", + "--entrypoint", "python3", + image, + "-c", smoke_code, + ] + + def _bucket_smoke_command(self, image: str) -> list[str]: + smoke_code = ( + "import huggingface_hub, dotenv, yaml; " + "print('hf_hub', huggingface_hub.__version__); " + "print('has_create_bucket', hasattr(huggingface_hub, 'create_bucket')); " + "print('has_hffs', hasattr(huggingface_hub, 'HfFileSystem'))" + ) + return [ + "docker", "run", "--rm", + "--entrypoint", "python", + image, + "-c", smoke_code, + ] + + def _smoke_target(self, runtime: str, image: str, profile: Optional[str]) -> tuple[int, dict]: + if runtime == "bucket" and not bucket_helper_image_present(self.repo_root, image=image): + code, _ = self._build_bucket_helper_image() + if code != 0: + return 1, { + "runtime": runtime, + "image": image, + "profile": profile, + "success": False, + "output": ["Bucket helper image is missing and could not be built."], + } + + if runtime == "unsloth": + cmd = self._unsloth_smoke_command(image) + elif runtime == "vllm": + cmd = self._vllm_smoke_command(image) + else: + cmd = self._bucket_smoke_command(image) + + code, output = self._run(cmd) + return code, { + "runtime": runtime, + "image": image, + "profile": profile, + "success": code == 0, + "output": output.splitlines(), + } + + def _bootstrap_guidance(self, *, cli_ok: bool, engine_ok: bool) -> list[str]: + guidance: list[str] = [] + if not cli_ok: + guidance.extend( + [ + "Install Docker Desktop for Windows and leave WSL 2 integration enabled.", + "Start Docker Desktop and wait for the engine status to show Running.", + "Re-run `python tuner.py docker bootstrap --docker-target all`.", + ] + ) + return guidance + + if not engine_ok: + guidance.extend( + [ + "Start Docker Desktop and wait for the engine to finish initializing.", + "If GPU containers are required, confirm the NVIDIA driver is installed on the host.", + "Re-run `python tuner.py docker status` to confirm the engine is reachable.", + ] + ) + return guidance + 
+ guidance.extend( + [ + "Use `python tuner.py train --runtime docker` for local Docker-backed training.", + "Use `python tuner.py eval --runtime docker` for local Docker-backed evaluation.", + "Use `python tuner.py bucket pull ...` to bring cloud adapters local; pulled runs under `toolset-training-artifacts/runs/...` are now discoverable in local eval flows.", + ] + ) + return guidance + + def _handle_bootstrap(self) -> int: + cli_ok = shutil.which("docker") is not None + docker_version = None + server_version = None + info_code = 1 + + if cli_ok: + version_code, docker_version_output = self._run(["docker", "--version"]) + if version_code == 0: + docker_version = docker_version_output + info_code, server_version_output = self._docker_info() + if info_code == 0: + server_version = server_version_output + + engine_ok = cli_ok and info_code == 0 + guidance = self._bootstrap_guidance(cli_ok=cli_ok, engine_ok=engine_ok) + + if not cli_ok or not engine_ok: + payload = { + "docker_cli_found": cli_ok, + "docker_cli": docker_version, + "docker_engine": server_version, + "ready": False, + "guidance": guidance, + } + human_lines = [ + f"Docker CLI: {docker_version or 'missing'}", + f"Docker Engine: {server_version or 'unavailable'}", + "", + "Next steps:", + *[f" - {line}" for line in guidance], + ] + self.output(payload, "\n".join(human_lines), success=False) + return 1 + + try: + targets = self._resolve_target_images( + target_override=getattr(self.args, "docker_target", None) or "all" + ) + except RuntimeError as exc: + self.output_error(str(exc), code="DOCKER_IMAGE_RESOLUTION_ERROR") + return 1 + + status_entries = [self._inspect_target_image(runtime, image, profile) for runtime, image, profile in targets] + pull_results = [] + smoke_results = [] + + for runtime, image, profile in targets: + inspected = next((entry for entry in status_entries if entry["runtime"] == runtime), None) + present = bool(inspected and inspected["present"]) + if present: + pull_results.append( + 
{ + "runtime": runtime, + "image": image, + "profile": profile, + "success": True, + "skipped": True, + "reason": "already_present", + } + ) + else: + code, result = self._pull_target(runtime, image, profile) + pull_results.append(result) + if code != 0: + self.output_error( + "Docker bootstrap failed while preparing local images.", + code="DOCKER_BOOTSTRAP_PULL_FAILED", + details={"runtime": runtime, "image": image}, + ) + return 1 + + code, smoke_result = self._smoke_target(runtime, image, profile) + smoke_results.append(smoke_result) + if code != 0: + self.output_error( + f"{runtime} smoke test failed", + code="DOCKER_BOOTSTRAP_SMOKE_FAILED", + details={"image": image, "output": smoke_result.get("output", [])}, + ) + return 1 + + hf_token_available = bool(get_hf_token()) + payload = { + "docker_cli_found": True, + "docker_cli": docker_version, + "docker_engine": server_version, + "ready": True, + "targets": [self._inspect_target_image(runtime, image, profile) for runtime, image, profile in targets], + "pull_results": pull_results, + "smoke_results": smoke_results, + "hf_token_available": hf_token_available, + "guidance": guidance, + } + human_lines = [ + f"Docker CLI: {docker_version}", + f"Docker Engine: {server_version}", + "Local Docker runtime is ready:", + ] + for result in smoke_results: + human_lines.append(f" - {result['runtime']}: {result['image']}") + human_lines.append(f"HF_TOKEN available via env/.env: {'yes' if hf_token_available else 'no'}") + human_lines.append("Next steps:") + human_lines.extend(f" - {line}" for line in guidance) + self.output(payload, "\n".join(human_lines)) + return 0 + + def _handle_smoke(self) -> int: + if not self._ensure_docker_available(): + return 1 + + try: + targets = self._resolve_target_images() + except RuntimeError as exc: + self.output_error(str(exc), code="DOCKER_IMAGE_RESOLUTION_ERROR") + return 1 + + results = [] + for runtime, image, profile in targets: + code, result = self._smoke_target(runtime, image, 
profile) + results.append(result) + if code != 0: + self.output_error( + f"{runtime} smoke test failed", + code="DOCKER_SMOKE_FAILED", + details={"image": image, "output": result.get("output", [])}, + ) + return 1 + + human_lines = ["Docker smoke tests passed:"] + for result in results: + human_lines.append(f" {result['runtime']}: {result['image']}") + self.output({"results": results}, "\n".join(human_lines)) + return 0 diff --git a/tuner/handlers/eval_handler.py b/tuner/handlers/eval_handler.py index e817cd19..ed2b5ca7 100644 --- a/tuner/handlers/eval_handler.py +++ b/tuner/handlers/eval_handler.py @@ -19,15 +19,29 @@ - All output is JSON formatted for programmatic parsing """ +import json +import socket +import subprocess +import time +from http.client import RemoteDisconnected from argparse import Namespace from dataclasses import dataclass, field from datetime import datetime from pathlib import Path from typing import Any, Callable, List, Optional, Tuple +from urllib.error import URLError +from urllib.request import urlopen from tuner.handlers.base import BaseHandler from tuner.backends.registry import EvaluationBackendRegistry from tuner.discovery import TrainingRunDiscovery, CheckpointDiscovery +from tuner.utils.docker_runtime import ( + CONTAINER_REPO_ROOT, + build_docker_run_command, + container_repo_path, + ensure_docker_cli, + resolve_eval_image, +) # Import shared UI components (delegates to Trainers/shared/ui/) from shared.ui import ( @@ -105,17 +119,24 @@ def _get_eval_status(self) -> dict: Returns dict with available backends, models, and scenarios. 
""" - # List available backends + runtime = getattr(self.args, "runtime", "native") if self.args else "native" backends = [] - # Check each backend - backend_configs = [ - ("unsloth", "Unsloth (LoRA - direct)"), - ("llamacpp", "llama.cpp (GGUF)"), - ("mlc", "MLC/WebLLM (WebGPU)"), - ("ollama", "Ollama (local server)"), - ("lmstudio", "LM Studio (local server)"), - ] + if runtime == "docker": + docker_ok, docker_error = ensure_docker_cli() + backend_configs = [ + ("unsloth", "Unsloth (Docker - direct LoRA)"), + ("vllm", "vLLM (Docker OpenAI server)"), + ] + else: + docker_ok, docker_error = True, "" + backend_configs = [ + ("unsloth", "Unsloth (LoRA - direct)"), + ("llamacpp", "llama.cpp (GGUF)"), + ("mlc", "MLC/WebLLM (WebGPU)"), + ("ollama", "Ollama (local server)"), + ("lmstudio", "LM Studio (local server)"), + ] for backend_id, backend_name in backend_configs: backend_info = { @@ -126,18 +147,30 @@ def _get_eval_status(self) -> dict: } try: - if backend_id in ("llamacpp", "mlc", "unsloth"): - backend = EvaluationBackendRegistry.get(backend_id, repo_root=self.repo_root) + if runtime == "docker": + backend_info["available"] = docker_ok + if backend_id == "unsloth" and docker_ok: + backend = EvaluationBackendRegistry.get("unsloth", repo_root=self.repo_root) + models = backend.list_models() + backend_info["models"] = models[:20] if models else [] + backend_info["model_count"] = len(models) if models else 0 + elif backend_id == "vllm" and docker_ok: + runs = self._discover_vllm_runs() + backend_info["models"] = [r.display_name for r in runs[:20]] + backend_info["model_count"] = len(runs) else: - backend = EvaluationBackendRegistry.get(backend_id) + if backend_id in ("llamacpp", "mlc", "unsloth"): + backend = EvaluationBackendRegistry.get(backend_id, repo_root=self.repo_root) + else: + backend = EvaluationBackendRegistry.get(backend_id) - is_connected, _ = backend.validate_connection() - backend_info["available"] = is_connected + is_connected, _ = 
backend.validate_connection() + backend_info["available"] = is_connected - if is_connected: - models = backend.list_models() - backend_info["models"] = models[:20] if models else [] # Limit for brevity - backend_info["model_count"] = len(models) if models else 0 + if is_connected: + models = backend.list_models() + backend_info["models"] = models[:20] if models else [] + backend_info["model_count"] = len(models) if models else 0 except (ValueError, Exception): pass @@ -161,8 +194,11 @@ def _get_eval_status(self) -> dict: return { "command": "eval", "status": "ready", + "runtime": runtime, "backends": backends, "scenarios": scenarios, + "docker_available": docker_ok, + "docker_error": docker_error or None, } def _list_scenarios(self): @@ -176,6 +212,16 @@ def _list_scenarios(self): discovery = PromptSetDiscovery(repo_root=self.repo_root) return discovery.discover_all() + def _discover_vllm_runs(self): + """Discover training runs that can be evaluated through vLLM.""" + try: + from Evaluator.vllm_setup import discover_training_runs + except ImportError: + return [] + + runs = discover_training_runs(self.repo_root / "Trainers") + return [run for run in runs if run.best_model_path or run.lora_path] + # -- Generic table display infrastructure ---------------------------------- @dataclass @@ -285,6 +331,7 @@ def _display_lora_models_table(self, backend, models: List[str]) -> None: columns=[ C("Run"), C("Base Model", style=COLORS["aqua"]), C("Type", style=COLORS["purple"]), + C("Source", style="dim"), C("Size", style="dim", justify="right"), ], row_extractor=lambda i, mp: self._lora_row(backend, mp), @@ -299,6 +346,7 @@ def _lora_row(backend, model_path: str) -> List[str]: info.get("timestamp", "unknown"), info.get("base_model_short", "unknown"), info.get("trainer_type", "-").upper(), + info.get("source", "-"), f"{info.get('size_mb', 0):.0f}MB" if info.get("size_mb") else "-", ] @@ -308,7 +356,8 @@ def _lora_plain(backend, model_path: str) -> str: return ( 
f"{info.get('timestamp', 'unknown')} " f"({info.get('base_model_short', 'unknown')}) " - f"[{info.get('trainer_type', '-').upper()}]" + f"[{info.get('trainer_type', '-').upper()}] " + f"{info.get('source', '-')}" ) def _display_mlc_models_table(self, backend, models: List[str]) -> None: @@ -350,6 +399,7 @@ def _display_training_runs_table(self, runs: List[Path], trainer_type: str) -> N title=f"Available {trainer_type.upper()} Training Runs", columns=[ C("Run"), C("Has Final", style=COLORS["aqua"], justify="center"), + C("Source", style="dim"), C("Checkpoints", style=COLORS["purple"], justify="right"), ], row_extractor=lambda i, rp: self._training_run_row(rp), @@ -364,12 +414,18 @@ def _training_run_row(run_path: Path) -> List[str]: cp_count = 0 if checkpoints_dir.exists(): cp_count = len(list(checkpoints_dir.glob("checkpoint-*"))) - return [run_path.name, has_final, str(cp_count)] + source = "bucket_pull" if "toolset-training-artifacts" in {part.lower() for part in run_path.parts} else ( + "cloud_artifact" if "runs" in {part.lower() for part in run_path.parts} and "trainers" not in {part.lower() for part in run_path.parts} + else "local_training" + ) + return [run_path.name, has_final, source, str(cp_count)] @staticmethod def _training_run_plain(run_path: Path) -> str: has_final = "(final)" if (run_path / "final_model").exists() else "" - return f"{run_path.name} {has_final}" + parts = {part.lower() for part in run_path.parts} + source = "bucket_pull" if "toolset-training-artifacts" in parts else ("cloud_artifact" if "runs" in parts and "trainers" not in parts else "local_training") + return f"{run_path.name} {has_final} [{source}]" def _display_checkpoints_table(self, checkpoints: List, trainer_type: str) -> None: """Display available checkpoints with metrics in a table.""" @@ -523,6 +579,305 @@ def _display_scenarios_table(self, scenarios) -> None: ) self._display_table(scenarios, spec) + def _display_vllm_runs_table(self, runs) -> None: + """Display 
vLLM-compatible training runs.""" + C = self._ColumnSpec + spec = self._TableSpec( + title="Available vLLM Model Candidates", + columns=[ + C("Run"), + C("Trainer", style=COLORS["aqua"]), + C("Source", style=COLORS["purple"]), + C("Model", style="dim"), + ], + row_extractor=lambda i, run: [ + run.timestamp, + run.trainer_type.upper(), + run.source, + run.best_model_path.name if run.best_model_path else "LoRA", + ], + plain_formatter=lambda i, run: ( + f"{run.timestamp} [{run.trainer_type.upper()}] " + f"{run.source} " + f"{run.best_model_path.name if run.best_model_path else 'LoRA'}" + ), + ) + self._display_table(runs, spec) + + def _select_vllm_run(self): + """Select a local training run for Dockerized vLLM evaluation.""" + runs = self._discover_vllm_runs() + if not runs: + print_error("No vLLM-compatible training runs found.") + print_info("A merged-16bit export or a final_model LoRA adapter is required.") + return None + + self._display_vllm_runs_table(runs) + + while True: + try: + sel = prompt(f"Select model run (1-{len(runs)})", "1") + idx = int(sel) - 1 + if 0 <= idx < len(runs): + return runs[idx] + except ValueError: + pass + print_error("Invalid selection.") + + def _find_available_port(self, preferred: int = 8000) -> int: + """Reserve a local TCP port for the Dockerized vLLM server.""" + for candidate in (preferred, 0): + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.bind(("127.0.0.1", candidate)) + return sock.getsockname()[1] + except OSError: + continue + return preferred + + def _wait_for_vllm_models(self, host: str, port: int, timeout_seconds: int = 300) -> list[str]: + """Poll the local vLLM endpoint until models are available.""" + deadline = time.time() + timeout_seconds + last_error = "vLLM server did not return any models." 
+ while time.time() < deadline: + try: + with urlopen(f"http://{host}:{port}/v1/models", timeout=5) as response: + payload = json.loads(response.read().decode("utf-8")) + model_ids = [item.get("id") for item in payload.get("data", []) if item.get("id")] + if model_ids: + return model_ids + last_error = "vLLM server is up but returned no models." + except (URLError, TimeoutError, json.JSONDecodeError, RemoteDisconnected, ConnectionResetError) as exc: + last_error = str(exc) + time.sleep(2) + raise RuntimeError(last_error) + + def _run_docker_unsloth_evaluation(self, model: str, scenario) -> int: + """Run direct Unsloth evaluation inside the Docker runtime.""" + try: + image, profile = resolve_eval_image( + self.repo_root, + runtime="unsloth", + explicit_image=getattr(self.args, "docker_image", None), + requested_profile=getattr(self.args, "docker_profile", None), + ) + except Exception as exc: + print_error(f"Failed to resolve Docker evaluation image: {exc}") + return 1 + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + results_dir = self.repo_root / "Evaluator" / "results" + results_dir.mkdir(parents=True, exist_ok=True) + output_json = results_dir / f"run_{timestamp}.json" + output_md = results_dir / f"run_{timestamp}.md" + + cmd = build_docker_run_command( + image=image, + repo_root=self.repo_root, + workdir=str(CONTAINER_REPO_ROOT), + entrypoint="python", + env={"PYTHONPATH": str(CONTAINER_REPO_ROOT)}, + command=[ + "-m", + "Evaluator.cli", + "--backend", + "unsloth", + "--model", + container_repo_path(Path(model), self.repo_root), + "--scenario", + scenario.path.name, + "--output", + container_repo_path(output_json, self.repo_root), + "--markdown", + container_repo_path(output_md, self.repo_root), + ], + ) + + profile_suffix = f" ({profile})" if profile else "" + print_info(f"Running Docker evaluation with: {image}{profile_suffix}") + print() + result = subprocess.run(cmd, cwd=str(self.repo_root)) + + if result.returncode == 0: + print() + 
print_info(f"Results saved to: {output_json.relative_to(self.repo_root)}") + print_info(f"Markdown report: {output_md.relative_to(self.repo_root)}") + return result.returncode + + def _run_docker_vllm_evaluation(self, run, scenario) -> int: + """Start a Dockerized vLLM server and evaluate against it.""" + try: + image, profile = resolve_eval_image( + self.repo_root, + runtime="vllm", + explicit_image=getattr(self.args, "docker_image", None), + requested_profile=getattr(self.args, "docker_profile", None), + ) + except Exception as exc: + print_error(f"Failed to resolve Docker vLLM image: {exc}") + return 1 + + model_path = run.best_model_path + if model_path is None: + print_error("Selected run does not have a usable model path for vLLM.") + return 1 + + command = [ + "--host", + "0.0.0.0", + "--port", + "8000", + "--gpu-memory-utilization", + "0.9", + ] + preferred_model_id = None + + adapter_config = model_path / "adapter_config.json" + if adapter_config.exists(): + try: + adapter_data = json.loads(adapter_config.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + print_error(f"Failed to parse adapter_config.json: {exc}") + return 1 + + base_model = adapter_data.get("base_model_name_or_path") + if not base_model: + print_error("Adapter config is missing base_model_name_or_path.") + return 1 + + preferred_model_id = f"{run.trainer_type}-{run.timestamp}" + command = [ + "--model", + base_model, + *command, + "--enable-lora", + "--max-lora-rank", + "64", + "--lora-modules", + f"{preferred_model_id}={container_repo_path(model_path, self.repo_root)}", + ] + else: + command = [ + "--model", + container_repo_path(model_path, self.repo_root), + *command, + ] + + host_port = self._find_available_port(8000) + container_name = f"tuner-vllm-eval-{datetime.now().strftime('%Y%m%d%H%M%S')}" + run_cmd = build_docker_run_command( + image=image, + repo_root=self.repo_root, + publish_ports=[(host_port, 8000)], + command=command, + name=container_name, + detach=True, 
+ ) + + profile_suffix = f" ({profile})" if profile else "" + print_info(f"Starting Docker vLLM server with: {image}{profile_suffix}") + print_info(f"Container: {container_name}") + print_info(f"Endpoint: http://127.0.0.1:{host_port}/v1") + print() + + log_process = None + try: + start = subprocess.run(run_cmd, cwd=str(self.repo_root), capture_output=True, text=True) + if start.returncode != 0: + print_error(start.stderr.strip() or start.stdout.strip() or "Failed to start Docker vLLM server.") + return 1 + + log_process = subprocess.Popen( + ["docker", "logs", "-f", container_name], + cwd=str(self.repo_root), + ) + + model_ids = self._wait_for_vllm_models("127.0.0.1", host_port) + model_id = preferred_model_id if preferred_model_id in model_ids else model_ids[0] + print() + print_info(f"vLLM server ready. Using model id: {model_id}") + print() + return self._run_subprocess_evaluation( + "vllm", + model_id, + scenario, + host="127.0.0.1", + port=host_port, + ) + except RuntimeError as exc: + print_error(f"Docker vLLM server failed to become ready: {exc}") + return 1 + finally: + if log_process is not None and log_process.poll() is None: + log_process.terminate() + subprocess.run( + ["docker", "rm", "-f", container_name], + cwd=str(self.repo_root), + capture_output=True, + text=True, + ) + + def _handle_docker_eval(self) -> int: + """Run the local evaluation workflow through Docker.""" + print_header("EVALUATION", "Test your model's performance (Docker runtime)") + + docker_ok, docker_error = ensure_docker_cli() + if not docker_ok: + print_error(docker_error) + return 1 + + backend_choice = print_menu([ + ("unsloth", f"{BOX['star']} Unsloth (Docker - direct LoRA)"), + ("vllm", f"{BOX['bullet']} vLLM (Docker - OpenAI server)"), + ], "Select backend:") + + if not backend_choice: + return 0 + + if backend_choice == "unsloth": + model, _trainer_type = self._select_unsloth_model() + if not model: + return 0 + model_label = model + else: + selected_run = 
self._select_vllm_run() + if selected_run is None: + return 1 + model = selected_run + model_label = selected_run.display_name + + scenarios = self._list_scenarios() + if not scenarios: + print_error("No test scenarios found in Evaluator/config/scenarios/") + return 1 + + self._display_scenarios_table(scenarios) + while True: + try: + sel = prompt(f"Select test scenario (1-{len(scenarios)})", "1") + idx = int(sel) - 1 + if 0 <= idx < len(scenarios): + selected = scenarios[idx] + break + except ValueError: + pass + print_error("Invalid selection.") + + print_config({ + "Runtime": "docker", + "Backend": backend_choice, + "Model": model_label, + "Scenario": f"{selected.name} ({selected.count} tests)", + }, "Evaluation Configuration") + + if not confirm("Start evaluation?"): + print_info("Evaluation cancelled.") + return 0 + + if backend_choice == "unsloth": + return self._run_docker_unsloth_evaluation(model, selected) + return self._run_docker_vllm_evaluation(model, selected) + def handle(self) -> int: """ Execute evaluation workflow. @@ -538,6 +893,10 @@ def handle(self) -> int: self.output(status) return 0 + runtime = getattr(self.args, "runtime", "native") if self.args else "native" + if runtime == "docker": + return self._handle_docker_eval() + print_header("EVALUATION", "Test your model's performance") # Step 1: Select backend @@ -802,7 +1161,15 @@ def on_record_dashboard(record): passed = sum(1 for r in records if r.passed) return 0 if passed == len(records) else 1 - def _run_subprocess_evaluation(self, backend: str, model: str, scenario) -> int: + def _run_subprocess_evaluation( + self, + backend: str, + model: str, + scenario, + *, + host: str | None = None, + port: int | None = None, + ) -> int: """ Fallback: Run evaluation via subprocess. 
@@ -834,6 +1201,10 @@ def _run_subprocess_evaluation(self, backend: str, model: str, scenario) -> int: "--output", str(output_json), "--markdown", str(output_md) ] + if host: + cmd.extend(["--host", host]) + if port: + cmd.extend(["--port", str(port)]) print_info(f"Running: {' '.join(cmd)}") print() diff --git a/tuner/handlers/train_handler.py b/tuner/handlers/train_handler.py index 82f26c69..202e06ef 100644 --- a/tuner/handlers/train_handler.py +++ b/tuner/handlers/train_handler.py @@ -10,6 +10,7 @@ - All output is JSON formatted for programmatic parsing """ +import shutil import subprocess from argparse import Namespace from pathlib import Path @@ -17,6 +18,13 @@ from tuner.handlers.base import BaseHandler from tuner.backends.registry import TrainingBackendRegistry +from tuner.utils.docker_runtime import ( + CONTAINER_REPO_ROOT, + build_docker_run_command, + container_repo_path, + ensure_docker_cli, + resolve_training_image, +) from tuner.ui import ( print_menu, print_header, @@ -75,6 +83,26 @@ def detect_platform() -> str | None: return None +def detect_docker_platform() -> str | None: + """Detect Docker-capable NVIDIA hardware without importing host torch.""" + nvidia_smi = shutil.which("nvidia-smi") + if not nvidia_smi: + return None + + try: + result = subprocess.run( + [nvidia_smi, "--query-gpu=name", "--format=csv,noheader"], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode == 0 and result.stdout.strip(): + return "rtx" + except Exception: + pass + return None + + class TrainHandler(BaseHandler): """ Handler for training workflow. 
@@ -122,11 +150,15 @@ def _get_training_status(self) -> dict: has_cuda = False has_mlx = False + runtime = getattr(self.args, "runtime", "native") if self.args else "native" + try: import torch has_cuda = torch.cuda.is_available() except ImportError: pass + if runtime == "docker" and not has_cuda: + has_cuda = detect_docker_platform() == "rtx" try: import mlx.core as mx @@ -148,13 +180,66 @@ def _get_training_status(self) -> dict: "methods": ["mlx"] }) + docker_ok, docker_error = ensure_docker_cli() if runtime == "docker" else (True, "") + return { "command": "train", "status": "ready" if platforms else "no_platforms", "platforms": platforms, - "detected_platform": detect_platform(), + "detected_platform": detect_platform() if runtime != "docker" else (detect_platform() or detect_docker_platform()), + "runtime": runtime, + "docker_available": docker_ok, + "docker_error": docker_error or None, } + @staticmethod + def _script_name_for_config(config) -> str: + if config.method == "grpo" and config.config_path.name == "env_config.yaml": + return "train_env_grpo.py" + return f"train_{config.method}.py" + + def _execute_docker_training(self, config) -> int: + try: + image, profile = resolve_training_image( + self.repo_root, + explicit_image=getattr(self.args, "docker_image", None), + requested_profile=getattr(self.args, "docker_profile", None), + ) + except Exception as exc: + print_error(f"Failed to resolve Docker training image: {exc}") + return 1 + + script_name = self._script_name_for_config(config) + trainer_dir = container_repo_path(config.trainer_dir, self.repo_root) + command = [script_name] + if script_name == "train_env_grpo.py": + command.extend(["--config", container_repo_path(config.config_path, self.repo_root)]) + + cmd = build_docker_run_command( + image=image, + repo_root=self.repo_root, + workdir=trainer_dir, + entrypoint="python", + env={"PYTHONPATH": str(CONTAINER_REPO_ROOT)}, + command=command, + ) + + profile_suffix = f" ({profile})" if profile else 
"" + print_info(f"Executing training in Docker with: {image}{profile_suffix}") + print() + + try: + process = subprocess.Popen(cmd, cwd=str(self.repo_root)) + return process.wait() + except KeyboardInterrupt: + print("\nTraining interrupted by user.") + if "process" in locals(): + process.terminate() + return 130 + except Exception as exc: + print_error(f"Docker training execution error: {exc}") + return 1 + def handle(self) -> int: """ Execute training workflow. @@ -170,10 +255,15 @@ def handle(self) -> int: self.output(status) return 0 + runtime = getattr(self.args, "runtime", "native") if self.args else "native" print_header("TRAINING", "Select your platform and training method") + if runtime == "docker": + print_info("Using Docker runtime for local GPU execution.") # Step 1: Auto-detect or select platform platform_choice = detect_platform() + if runtime == "docker" and not platform_choice: + platform_choice = detect_docker_platform() if platform_choice: platform_name = "NVIDIA GPU (CUDA)" if platform_choice == "rtx" else "Apple Silicon (MLX)" @@ -195,10 +285,19 @@ def handle(self) -> int: return 1 # Step 3: Validate environment - is_valid, error = backend.validate_environment() - if not is_valid: - print_error(f"Environment validation failed: {error}") - return 1 + if runtime == "docker": + if platform_choice != "rtx": + print_error("Docker runtime currently supports NVIDIA/CUDA local training only.") + return 1 + docker_ok, docker_error = ensure_docker_cli() + if not docker_ok: + print_error(docker_error) + return 1 + else: + is_valid, error = backend.validate_environment() + if not is_valid: + print_error(f"Environment validation failed: {error}") + return 1 # Step 4: Select method (if multiple available) methods = backend.get_available_methods() @@ -238,21 +337,20 @@ def handle(self) -> int: print_info("Training cancelled.") return 0 - # Step 8: Execute training - # Mac uses system python3 (no conda needed), NVIDIA uses conda python - if platform_choice == 
"mac": - import shutil - python = shutil.which("python3") or "python3" - else: - python = self.get_conda_python() - print_info(f"Executing training with: {python}") - print() - # Play training start animation (if available) if ASCIIMATICS_AVAILABLE: play_training_start(duration_frames=40) - exit_code = backend.execute(config, python) + if runtime == "docker": + exit_code = self._execute_docker_training(config) + else: + if platform_choice == "mac": + python = shutil.which("python3") or "python3" + else: + python = self.get_conda_python() + print_info(f"Executing training with: {python}") + print() + exit_code = backend.execute(config, python) if exit_code == 0: # Play celebration animation on success diff --git a/tuner/utils/docker_runtime.py b/tuner/utils/docker_runtime.py new file mode 100644 index 00000000..c20bcba6 --- /dev/null +++ b/tuner/utils/docker_runtime.py @@ -0,0 +1,206 @@ +""" +Shared helpers for local Docker-backed runtimes. + +Location: tuner/utils/docker_runtime.py +Purpose: Resolve local runtime images and build Docker commands +Used by: docker_handler, train_handler, eval_handler +""" + +from __future__ import annotations + +import subprocess +import shutil +from pathlib import Path +from typing import Mapping, Optional, Sequence + +from tuner.backends.training.cloud.base_cloud import resolve_cloud_image +from tuner.core.exceptions import CloudProviderError + +CONTAINER_REPO_ROOT = Path("/workspace/repo") +BUCKET_HELPER_IMAGE = "toolset-training-bucket-helper:latest" +BUCKET_HELPER_ENV_MARKER = "TUNER_BUCKET_HELPER_ACTIVE" + + +def get_cloud_config_path(repo_root: Path) -> Path: + """Return the canonical cloud config path.""" + return repo_root / "Trainers" / "cloud" / "cloud_config.yaml" + + +def get_bucket_helper_dir(repo_root: Path) -> Path: + """Return the checked-in Docker helper directory for Buckets support.""" + return repo_root / "docker" / "bucket-helper" + + +def get_bucket_helper_dockerfile(repo_root: Path) -> Path: + """Return the 
Buckets helper Dockerfile path.""" + return get_bucket_helper_dir(repo_root) / "Dockerfile" + + +def ensure_docker_cli() -> tuple[bool, str]: + """Check whether Docker is available on the host.""" + if shutil.which("docker") is None: + return False, "Docker CLI not found. Install Docker Desktop first." + return True, "" + + +def bucket_helper_image_present(repo_root: Path, *, image: str = BUCKET_HELPER_IMAGE) -> bool: + """Return True when the local Buckets helper image already exists.""" + result = subprocess.run( + ["docker", "images", "--format", "{{.Repository}}:{{.Tag}}", image], + cwd=str(repo_root), + capture_output=True, + text=True, + ) + return result.returncode == 0 and bool((result.stdout or "").strip()) + + +def build_bucket_helper_image_command( + repo_root: Path, + *, + image: str = BUCKET_HELPER_IMAGE, +) -> list[str]: + """Build the checked-in Buckets helper image.""" + helper_dir = get_bucket_helper_dir(repo_root) + dockerfile = get_bucket_helper_dockerfile(repo_root) + return [ + "docker", + "build", + "-t", + image, + "-f", + str(dockerfile), + str(helper_dir), + ] + + +def build_bucket_helper_run_command( + repo_root: Path, + *, + helper_args: Sequence[str], + image: str = BUCKET_HELPER_IMAGE, + remove: bool = True, +) -> list[str]: + """Run the Buckets helper image against the mounted repo checkout.""" + cmd = ["docker", "run"] + if remove: + cmd.append("--rm") + cmd.extend(["-v", f"{repo_root}:/workspace/repo"]) + cmd.extend(["-e", f"{BUCKET_HELPER_ENV_MARKER}=1"]) + cmd.extend(["-e", f"PYTHONPATH={CONTAINER_REPO_ROOT}"]) + cmd.extend(["--entrypoint", "python"]) + cmd.append(image) + cmd.append(str((CONTAINER_REPO_ROOT / "tuner.py").as_posix())) + cmd.extend(helper_args) + return cmd + + +def resolve_training_image( + repo_root: Path, + *, + explicit_image: Optional[str] = None, + requested_profile: Optional[str] = None, +) -> tuple[str, Optional[str]]: + """Resolve the Docker image for local training.""" + return resolve_cloud_image( + 
get_cloud_config_path(repo_root), + explicit_image=explicit_image, + requested_profile=requested_profile, + default_profile="stable", + fallback_image=None, + profile_section="docker_image_profiles", + ) + + +def resolve_eval_image( + repo_root: Path, + *, + runtime: str, + explicit_image: Optional[str] = None, + requested_profile: Optional[str] = None, +) -> tuple[str, Optional[str]]: + """Resolve the Docker image for local evaluation.""" + default_profile = "fast_vllm" if runtime == "vllm" else "stable_unsloth" + return resolve_cloud_image( + get_cloud_config_path(repo_root), + explicit_image=explicit_image, + requested_profile=requested_profile, + default_profile=default_profile, + fallback_image=None, + profile_section="eval_image_profiles", + ) + + +def container_repo_path(host_path: Path, repo_root: Path) -> str: + """Map a host repo-relative path into the mounted container path.""" + resolved_host = host_path.resolve() + resolved_root = repo_root.resolve() + relative = resolved_host.relative_to(resolved_root) + return str((CONTAINER_REPO_ROOT / relative).as_posix()) + + +def build_docker_run_command( + *, + image: str, + repo_root: Path, + command: Sequence[str], + workdir: Optional[str] = None, + entrypoint: Optional[str] = None, + env: Optional[Mapping[str, str]] = None, + publish_ports: Optional[Sequence[tuple[int, int]]] = None, + gpus: bool = True, + name: Optional[str] = None, + detach: bool = False, + remove: bool = True, +) -> list[str]: + """Build a `docker run` command with the repo mounted into the container.""" + cmd = ["docker", "run"] + if detach: + cmd.append("-d") + if remove: + cmd.append("--rm") + if name: + cmd.extend(["--name", name]) + if gpus: + cmd.extend(["--gpus", "all"]) + + cmd.extend(["-v", f"{repo_root}:/workspace/repo"]) + + if workdir: + cmd.extend(["-w", workdir]) + if publish_ports: + for host_port, container_port in publish_ports: + cmd.extend(["-p", f"{host_port}:{container_port}"]) + if env: + for key, value in 
env.items(): + cmd.extend(["-e", f"{key}={value}"]) + if entrypoint: + cmd.extend(["--entrypoint", entrypoint]) + + cmd.append(image) + cmd.extend(command) + return cmd + + +def resolve_runtime_image( + repo_root: Path, + *, + command_name: str, + runtime: str, + explicit_image: Optional[str] = None, + requested_profile: Optional[str] = None, +) -> tuple[str, Optional[str]]: + """Resolve the correct Docker image for a local command/runtime pair.""" + if command_name == "train": + return resolve_training_image( + repo_root, + explicit_image=explicit_image, + requested_profile=requested_profile, + ) + if command_name == "eval": + return resolve_eval_image( + repo_root, + runtime=runtime, + explicit_image=explicit_image, + requested_profile=requested_profile, + ) + raise CloudProviderError(f"Unsupported local Docker runtime command: {command_name}") From a84246afe02ef21ff4766e1dcb249f4c2ea4611f Mon Sep 17 00:00:00 2001 From: Professor Synapse <131487882+ProfSynapse@users.noreply.github.com> Date: Fri, 10 Apr 2026 12:06:51 -0400 Subject: [PATCH 2/2] Document Docker-first local workflow --- README.md | 60 +++++++++++++++++++++++++++++---------- docs/project-reference.md | 27 ++++++++++++------ 2 files changed, 64 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 3b301bd5..97df343c 100644 --- a/README.md +++ b/README.md @@ -62,24 +62,54 @@ That path can run train -> evaluation -> exact loss -> analysis -> recommendatio - `plan-hardware` for blind hardware planning using live HF Jobs flavors and pricing - `cloud-gym` to run the vault gym against a trained cloud run -## Recent Updates - -- HF Jobs is now the canonical cloud path for train + evaluate, with `cloud-pipeline` handling the common workflow end-to-end. -- Cloud evaluation writes structured artifacts back into the source run, including `evaluation_results.json`, `evaluation_results.md`, and `evaluation_lineage.json`. 
-- Bucket-backed progress is a first-class UX: training and evaluation stream JSONL progress that the local dashboard can replay. -- `run-experiment` now supports fuller cloud orchestration, including post-training evaluation and exact-loss stages as separate sibling jobs by default. -- `plan-hardware` and `scripts/hf_jobs_hardware.py` make hardware selection less guessy by using the live HF Jobs hardware surface. -- Evolutionary SFT is now supported in the cloud experiment path through checked-in specs and `cloud-pipeline --train-*` overrides. +## Recent Updates + +- HF Jobs is now the canonical cloud path for train + evaluate, with `cloud-pipeline` handling the common workflow end-to-end. +- Cloud evaluation writes structured artifacts back into the source run, including `evaluation_results.json`, `evaluation_results.md`, and `evaluation_lineage.json`. +- Bucket-backed progress is a first-class UX: training and evaluation stream JSONL progress that the local dashboard can replay. +- Local NVIDIA workflows now have a first-class Docker path with `python tuner.py docker bootstrap --docker-target all`, `train --runtime docker`, and `eval --runtime docker`. +- Pulled HF bucket adapters under `toolset-training-artifacts/runs/...` are now discoverable in local Docker eval flows without manual `docker run` commands. +- `run-experiment` now supports fuller cloud orchestration, including post-training evaluation and exact-loss stages as separate sibling jobs by default. +- `plan-hardware` and `scripts/hf_jobs_hardware.py` make hardware selection less guessy by using the live HF Jobs hardware surface. +- Evolutionary SFT is now supported in the cloud experiment path through checked-in specs and `cloud-pipeline --train-*` overrides. 
## Quick Start -| Path | How | -|------|-----| -| **Claude Code (recommended)** | Open repo in [Claude Code](https://docs.anthropic.com/en/docs/claude-code) and tell it what you want | -| **HF Jobs cloud train + eval** | `python tuner.py cloud-pipeline --method sft --preset full` | -| **Full cloud experiment bundle** | `python tuner.py run-experiment --experiment-spec Trainers/cloud/experiments/.yaml --yes` | -| **Interactive CLI** | `./run.sh` (Linux/WSL) or `.\run.ps1` (PowerShell) | -| **Beginner (no GPU)** | `Trainers/notebooks/sft_colab_beginner.ipynb` in Google Colab | +| Path | How | +|------|-----| +| **Claude Code (recommended)** | Open repo in [Claude Code](https://docs.anthropic.com/en/docs/claude-code) and tell it what you want | +| **Local Docker setup (Windows/NVIDIA)** | `python tuner.py docker bootstrap --docker-target all` | +| **Local Docker train** | `python tuner.py train --runtime docker` | +| **Local Docker eval** | `python tuner.py eval --runtime docker` | +| **HF Jobs cloud train + eval** | `python tuner.py cloud-pipeline --method sft --preset full` | +| **Full cloud experiment bundle** | `python tuner.py run-experiment --experiment-spec Trainers/cloud/experiments/.yaml --yes` | +| **Interactive CLI** | `./run.sh` (Linux/WSL) or `.\run.ps1` (PowerShell) | +| **Beginner (no GPU)** | `Trainers/notebooks/sft_colab_beginner.ipynb` in Google Colab | + +## Local Docker Workflow + +For Windows users with NVIDIA GPUs, the recommended local path is now Docker Desktop, not manual dependency wrangling inside the host training environment. 
+ +```bash +python tuner.py docker bootstrap --docker-target all +python tuner.py train --runtime docker +python tuner.py eval --runtime docker +``` + +What `docker bootstrap` does: +- checks whether Docker Desktop is installed and the engine is reachable +- prepares the local `unsloth`, `vllm`, and bucket-helper images +- runs smoke tests so you can verify GPU containers work before debugging model code + +If you pull a cloud adapter locally, keep it inside the repo under `toolset-training-artifacts/runs/...` and it will show up in local eval discovery: + +```bash +python tuner.py bucket pull \ + --path runs/hf_jobs/sft//final_model \ + --dest toolset-training-artifacts + +python tuner.py eval --runtime docker +``` ## Using with Claude Code diff --git a/docs/project-reference.md b/docs/project-reference.md index 9ab6b584..138536e3 100644 --- a/docs/project-reference.md +++ b/docs/project-reference.md @@ -7,8 +7,10 @@ Scripts, configuration files, environment variables, data patterns, and platform ## Key Bash Scripts **Root Level:** -- `run.sh` / `run.ps1` - Main CLI wrappers (auto-activate conda) -- `setup_env.sh` / `setup_env.ps1` - Environment setup +- `run.sh` / `run.ps1` - Main CLI wrappers +- `setup_env.sh` / `setup_env.ps1` - Legacy host-environment setup / fallback path +- `python tuner.py docker bootstrap --docker-target all` - Preferred local Docker bootstrap for Windows + NVIDIA +- `python tuner.py docker status|pull|smoke|build` - Local Docker runtime management **Trainers:** - `Trainers/rtx3090_sft/setup.sh` - Full SFT environment setup @@ -64,6 +66,8 @@ OLLAMA_HOST=http://localhost:11434 WANDB_API_KEY=your_wandb_key ``` +The CLI now auto-loads repo-root `.env`, including for Docker bootstrap and bucket helper flows. 
+ --- ## Data Patterns @@ -116,8 +120,14 @@ tail -f sft_output_rtx3090/YYYYMMDD_HHMMSS/logs/training_latest.jsonl **Windows PowerShell:** - Use `.ps1` scripts -- Some multiprocessing limitations -- Prefer WSL2 if possible +- Docker Desktop is now the preferred local GPU path +- Start with `python tuner.py docker bootstrap --docker-target all` +- Use `python tuner.py train --runtime docker` and `python tuner.py eval --runtime docker` +- Keep the host conda path as a fallback, not the default recommendation + +**Local Docker Artifacts:** +- Pulled cloud runs under `toolset-training-artifacts/runs/...` are treated as first-class local eval candidates +- Use `python tuner.py bucket pull --path runs/hf_jobs///final_model --dest toolset-training-artifacts` --- @@ -125,13 +135,14 @@ tail -f sft_output_rtx3090/YYYYMMDD_HHMMSS/logs/training_latest.jsonl | Task | Fully Auto | Needs User Input | Notes | |------|:----------:|:----------------:|-------| -| Environment setup | X | | `./setup_env.sh` | +| Docker bootstrap | X | | `python tuner.py docker bootstrap --docker-target all` | +| Environment setup (legacy host path) | X | | `./setup_env.sh` | | Dependency install | X | | `./run.sh doctor --fix` | | List resources | X | | `./run.sh list *` | | Dataset validation | X | | `python3 .skills/synethetic-data-generation/scripts/validate_syngen.py` | | System diagnostics | X | | `./run.sh doctor` | -| Training (SFT/KTO) | | X | Needs dataset choice, model size | -| Evaluation | | X | Needs model path, scenario set | +| Training (SFT/KTO/GRPO) | | X | Prefer `python tuner.py train --runtime docker` for local NVIDIA | +| Evaluation | | X | Prefer `python tuner.py eval --runtime docker` for local NVIDIA | | Upload to HuggingFace | | X | Needs repo name, HF_TOKEN | | Dataset improvement | | X | Needs rubrics, line range | | Synthetic data gen | | X | Needs config, teacher model | @@ -157,4 +168,4 @@ tail -f sft_output_rtx3090/YYYYMMDD_HHMMSS/logs/training_latest.jsonl - Run dry runs: 
`python train_sft.py --dry-run` - Validate first: `python3 .skills/synethetic-data-generation/scripts/validate_syngen.py dataset.jsonl` -**Key Principle:** Use the bash scripts (`./run.sh`, `setup.sh`, etc.) rather than direct Python when possible - they handle environment setup, dependency checks, and provide better UX. +**Key Principle:** For local NVIDIA GPU work, prefer the repo CLI plus Docker Desktop over hand-managed host dependencies. Start with `python tuner.py docker bootstrap --docker-target all`.