diff --git a/notebooks/train-hippofloop.ipynb b/notebooks/train-hippofloop.ipynb new file mode 100644 index 0000000..d7f70fc --- /dev/null +++ b/notebooks/train-hippofloop.ipynb @@ -0,0 +1,393 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# hippofloop: Distill floop's consolidator into a local model\n", + "\n", + "Fine-tunes **Qwen 2.5 3B Instruct** (4-bit QLoRA) on floop's decision logs.\n", + "Runs end-to-end on a Kaggle T4 GPU.\n", + "\n", + "**Pipeline:** Load JSONL → Clean → Format SFT pairs → Train → Evaluate → Export GGUF" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "# Install unsloth (optimized for Kaggle/Colab T4)\n", + "!pip install unsloth[colab-new]\n", + "!pip install --no-deps trl peft accelerate bitsandbytes xformers\n", + "\n", + "# Install hippofloop from repo\n", + "!pip install git+https://github.com/nvandessel/hippofloop.git" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import logging\n", + "from pathlib import Path\n", + "\n", + "logging.basicConfig(level=logging.INFO, format=\"%(asctime)s %(levelname)s %(name)s: %(message)s\")\n", + "logger = logging.getLogger(\"hippofloop.notebook\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Load and explore data\n", + "\n", + "Data is expected as a Kaggle Dataset mounted at `/kaggle/input/floop-decisions/`.\n", + "Upload your `decisions.jsonl` files there." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_DIR = Path(\"/kaggle/input/floop-decisions\")\n", + "\n", + "# Find all JSONL files in the dataset\n", + "jsonl_files = sorted(DATA_DIR.glob(\"*.jsonl\"))\n", + "print(f\"Found {len(jsonl_files)} JSONL files:\")\n", + "for f in jsonl_files:\n", + " size_mb = f.stat().st_size / (1024 * 1024)\n", + " print(f\" {f.name} ({size_mb:.1f} MB)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from hippofloop.data.loader import JsonlLoader\n", + "from hippofloop.data.cleaner import DecisionCleaner\n", + "from hippofloop.data.formatter import SftFormatter\n", + "\n", + "loader = JsonlLoader()\n", + "cleaner = DecisionCleaner()\n", + "formatter = SftFormatter()\n", + "\n", + "# Load\n", + "entries = loader.load([str(f) for f in jsonl_files])\n", + "print(f\"Loaded: {len(entries)} entries\")\n", + "\n", + "# Clean\n", + "cleaned, stats = cleaner.clean_with_stats(entries)\n", + "print(f\"\\nCleaning stats:\")\n", + "for k, v in stats.items():\n", + " print(f\" {k}: {v}\")\n", + "\n", + "# Format\n", + "pairs = formatter.format(cleaned)\n", + "print(f\"\\nSFT pairs: {len(pairs)}\")\n", + "\n", + "# Per-task breakdown\n", + "from collections import Counter\n", + "task_counts = Counter(p.task for p in pairs)\n", + "print(\"\\nBy task:\")\n", + "for task, count in sorted(task_counts.items()):\n", + " print(f\" {task}: {count}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Inspect a sample SFT pair\n", + "sample = pairs[0]\n", + "print(f\"Task: {sample.task}\")\n", + "print(f\"Source stage: {sample.source_stage}\")\n", + "for msg in sample.messages:\n", + " content_preview = msg['content'][:200] + '...' if len(msg['content']) > 200 else msg['content']\n", + " print(f\"\\n[{msg['role']}]\\n{content_preview}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Split data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "SEED = 42\n", + "\n", + "train_pairs, val_pairs, test_pairs = formatter.split(\n", + " pairs, train_ratio=0.8, val_ratio=0.1, seed=SEED\n", + ")\n", + "print(f\"Train: {len(train_pairs)}, Val: {len(val_pairs)}, Test: {len(test_pairs)}\")\n", + "\n", + "# Per-task distribution in train split\n", + "train_tasks = Counter(p.task for p in train_pairs)\n", + "print(\"\\nTrain split by task:\")\n", + "for task, count in sorted(train_tasks.items()):\n", + " print(f\" {task}: {count}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Load model and apply LoRA" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from unsloth import FastLanguageModel\n", + "\n", + "BASE_MODEL = \"unsloth/Qwen2.5-3B-Instruct-bnb-4bit\"\n", + "MAX_SEQ_LENGTH = 8192\n", + "\n", + "model, tokenizer = FastLanguageModel.from_pretrained(\n", + " model_name=BASE_MODEL,\n", + " max_seq_length=MAX_SEQ_LENGTH,\n", + " load_in_4bit=True,\n", + ")\n", + "\n", + "model = FastLanguageModel.get_peft_model(\n", + " model,\n", + " r=32,\n", + " lora_alpha=64,\n", + " lora_dropout=0.05,\n", + " target_modules=[\n", + " \"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n", + " \"gate_proj\", \"up_proj\", \"down_proj\",\n", + " ],\n", + ")\n", + "\n", + "# Print trainable params\n", + "model.print_trainable_parameters()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Prepare datasets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import Dataset\n", + "from hippofloop.training.trainer import UnslothTrainer\n", + "from hippofloop.training.config import TrainingConfig\n", + "\n", + "# Use UnslothTrainer.prepare_dataset for consistent formatting\n", + "config = TrainingConfig(\n", + " base_model=BASE_MODEL,\n", + " lora_rank=32, lora_alpha=64, lora_dropout=0.05,\n", + " lora_target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n", + " learning_rate=2e-4, lr_scheduler=\"cosine\", warmup_ratio=0.03,\n", + " epochs=3, batch_size=1, gradient_accumulation_steps=16,\n", + " max_seq_length=MAX_SEQ_LENGTH, weight_decay=0.01,\n", + " bf16=False, fp16=True,\n", + " train_split=0.8, val_split=0.1, test_split=0.1, seed=SEED,\n", + " quantization=\"Q4_K_M\", output_path=\"hippofloop.gguf\",\n", + ")\n", + "\n", + "trainer_helper = UnslothTrainer(config)\n", + "train_dataset = Dataset.from_list(trainer_helper.prepare_dataset(train_pairs))\n", + "val_dataset = Dataset.from_list(trainer_helper.prepare_dataset(val_pairs))\n", + "\n", + "print(f\"Train dataset: {len(train_dataset)} examples\")\n", + "print(f\"Val dataset: {len(val_dataset)} examples\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import TrainingArguments\n", + "from trl import SFTTrainer\n", + "\n", + "OUTPUT_DIR = \"checkpoints/qwen25-3b-hippofloop\"\n", + "\n", + "training_args = TrainingArguments(\n", + " output_dir=OUTPUT_DIR,\n", + " num_train_epochs=3,\n", + " per_device_train_batch_size=1,\n", + " gradient_accumulation_steps=16,\n", + " learning_rate=2e-4,\n", + " lr_scheduler_type=\"cosine\",\n", + " warmup_ratio=0.03,\n", + " weight_decay=0.01,\n", + " bf16=False,\n", + " fp16=True,\n", + " eval_strategy=\"epoch\",\n", + " save_strategy=\"epoch\",\n", + " load_best_model_at_end=True,\n", + " metric_for_best_model=\"eval_loss\",\n", + " logging_steps=10,\n", + " seed=SEED,\n", + ")\n", + "\n", + "sft_trainer = SFTTrainer(\n", + " model=model,\n", + " tokenizer=tokenizer,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=val_dataset,\n", + " args=training_args,\n", + ")\n", + "\n", + "print(f\"Training {len(train_dataset)} examples for {training_args.num_train_epochs} epochs...\")\n", + "print(f\"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}\")\n", + "sft_trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "# Training loss curve\nimport matplotlib.pyplot as plt\n\ntrain_losses = [(e[\"step\"], e[\"loss\"]) for e in sft_trainer.state.log_history if \"loss\" in e]\neval_losses = [(e[\"step\"], e[\"eval_loss\"]) for e in sft_trainer.state.log_history if \"eval_loss\" in e]\n\nfig, ax = plt.subplots(figsize=(10, 5))\nif train_losses:\n steps, losses = zip(*train_losses)\n ax.plot(steps, losses, label=\"Train loss\", alpha=0.7)\nif eval_losses:\n steps, losses = zip(*eval_losses)\n ax.plot(steps, losses, \"o-\", label=\"Eval loss\", markersize=8)\nax.set_xlabel(\"Step\")\nax.set_ylabel(\"Loss\")\nax.set_title(\"Training Loss\")\nax.legend()\nax.grid(True, alpha=0.3)\nplt.tight_layout()\nplt.show()\n\n# Best eval loss\nbest_loss = None\nif eval_losses:\n best_step, best_loss = min(eval_losses, key=lambda x: x[1])\n print(f\"Best eval loss: {best_loss:.4f} at step {best_step}\")" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Save best model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "BEST_MODEL_PATH = f\"{OUTPUT_DIR}/best\"\n", + "sft_trainer.save_model(BEST_MODEL_PATH)\n", + "tokenizer.save_pretrained(BEST_MODEL_PATH)\n", + "print(f\"Saved best model to {BEST_MODEL_PATH}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. Evaluate on test set" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "import random\n\nfrom hippofloop.eval.evaluator import ModelEvaluator\n\n# Set up inference mode\nFastLanguageModel.for_inference(model)\n\ndef model_fn(system_msg: str, user_msg: str) -> str:\n \"\"\"Run inference on the fine-tuned model.\"\"\"\n messages = [\n {\"role\": \"system\", \"content\": system_msg},\n {\"role\": \"user\", \"content\": user_msg},\n ]\n inputs = tokenizer.apply_chat_template(\n messages, tokenize=True, add_generation_prompt=True, return_tensors=\"pt\"\n ).to(model.device)\n outputs = model.generate(\n input_ids=inputs, max_new_tokens=2048, temperature=0.1, do_sample=True\n )\n # Decode only the generated tokens\n generated = outputs[0][inputs.shape[-1]:]\n return tokenizer.decode(generated, skip_special_tokens=True)\n\n# Run eval on a representative random subset (full eval can be slow)\nrng = random.Random(SEED)\neval_subset = rng.sample(test_pairs, min(50, len(test_pairs)))\nprint(f\"Evaluating on {len(eval_subset)} test examples...\")\n\nevaluator = ModelEvaluator(model_fn)\nresults = evaluator.evaluate(eval_subset)\nreport = evaluator.summary_report(results)\n\nprint(f\"\\nOverall:\")\nprint(f\" JSON valid rate: {report['json_valid_rate']:.1%}\")\nprint(f\" Schema valid rate: {report['schema_valid_rate']:.1%}\")\nprint(f\"\\nBy task:\")\nfor task, task_report in report.get('by_task', {}).items():\n print(f\" {task} (n={task_report['count']}):\")\n print(f\" JSON valid: {task_report['json_valid_rate']:.1%}\")\n print(f\" Schema valid: {task_report['schema_valid_rate']:.1%}\")" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 9. Export to GGUF" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "GGUF_OUTPUT = \"hippofloop-qwen25-3b-Q4_K_M.gguf\"\n", + "\n", + "model.save_pretrained_gguf(\n", + " \"gguf_export\",\n", + " tokenizer,\n", + " quantization_method=\"q4_k_m\",\n", + ")\n", + "\n", + "# Rename to target filename\n", + "gguf_dir = Path(\"gguf_export\")\n", + "gguf_files = list(gguf_dir.glob(\"*.gguf\"))\n", + "if gguf_files:\n", + " actual = max(gguf_files, key=lambda p: p.stat().st_mtime)\n", + " actual.rename(GGUF_OUTPUT)\n", + " size_mb = Path(GGUF_OUTPUT).stat().st_size / (1024 * 1024)\n", + " print(f\"Exported: {GGUF_OUTPUT} ({size_mb:.0f} MB)\")\n", + "else:\n", + " print(\"Warning: no GGUF file found after export\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 10. Download\n", + "\n", + "The GGUF file is saved in the notebook's working directory.\n", + "On Kaggle, go to **Output** tab to download it, or save as a Kaggle Dataset for reuse." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "# Summary\nprint(\"=\" * 60)\nprint(\"hippofloop training complete\")\nprint(\"=\" * 60)\nprint(f\"Base model: {BASE_MODEL}\")\nprint(f\"Training examples: {len(train_pairs)}\")\nprint(f\"Validation examples: {len(val_pairs)}\")\nprint(f\"Test examples: {len(test_pairs)}\")\nif best_loss is not None:\n print(f\"Best eval loss: {best_loss:.4f}\")\nprint(f\"JSON valid rate: {report['json_valid_rate']:.1%}\")\nprint(f\"Schema valid rate: {report['schema_valid_rate']:.1%}\")\nprint(f\"GGUF output: {GGUF_OUTPUT}\")" + } + ], + "metadata": { + "kaggle": { + "accelerator": "gpu", + "dataSources": [], + "isGpuEnabled": true, + "isInternetEnabled": true, + "language": "python" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/src/hippofloop/data/loader.py b/src/hippofloop/data/loader.py index 91a6a6d..b66e429 100644 --- a/src/hippofloop/data/loader.py +++ b/src/hippofloop/data/loader.py @@ -49,15 +49,18 @@ def _load_file(self, path: str) -> list[DecisionEntry]: return entries def _parse_entry(self, raw: dict) -> DecisionEntry: + resp = raw.get("response") + parsed = raw.get("parsed") + time_ = raw.get("time") return DecisionEntry( stage=raw.get("stage", ""), pass_=raw.get("pass", ""), prompt=raw.get("prompt", []), - response=raw.get("response", ""), - parsed=raw.get("parsed"), + response=resp if resp is not None else raw.get("sonnet_response", ""), + parsed=parsed if parsed is not None else raw.get("haiku_parsed"), run_id=raw.get("run_id", ""), model=raw.get("model", ""), - time=raw.get("time", raw.get("timestamp", "")), + time=time_ if time_ is not None else raw.get("timestamp", ""), chunk=raw.get("chunk"), error=raw.get("error"), fallback=bool(raw.get("fallback", False)) or raw.get("event") == "llm_fallback", diff --git a/tests/data/test_loader.py b/tests/data/test_loader.py index f4279b0..c7710c6 100644 --- a/tests/data/test_loader.py +++ b/tests/data/test_loader.py @@ -189,3 +189,80 @@ def test_load_llm_fallback_event_detected(tmp_path): loader = JsonlLoader() entries = loader.load([path]) assert entries[0].fallback is True + + +# -- Sonnet comparison schema fallback -- + + +def test_load_sonnet_comparison_uses_sonnet_response(tmp_path): + """When 'response' is missing, fall back to 'sonnet_response'.""" + path = str(tmp_path / "sonnet.jsonl") + with open(path, "w") as f: + f.write(json.dumps({ + "run_id": "run-sonnet-001", "stage": "extract", "pass": "arc", + "model": "sonnet", "time": "2026-03-28T00:00:00Z", + "prompt": [{"role": "system", "content": "sys"}, {"role": "user", "content": "usr"}], + "sonnet_response": '{"arc":"test arc"}', + "haiku_response": '{"arc":"haiku arc"}', + "haiku_parsed": {"arc": "haiku arc"}, + }) + "\n") + loader = JsonlLoader() + entries = loader.load([path]) + assert len(entries) == 1 + assert entries[0].response == '{"arc":"test arc"}' + + +def test_load_sonnet_comparison_uses_haiku_parsed(tmp_path): + """When 'parsed' is missing, fall back to 'haiku_parsed'.""" + path = str(tmp_path / "sonnet.jsonl") + with open(path, "w") as f: + f.write(json.dumps({ + "run_id": "run-sonnet-001", "stage": "extract", "pass": "summarize", + "model": "sonnet", "time": "2026-03-28T00:00:00Z", + "prompt": [{"role": "system", "content": "sys"}, {"role": "user", "content": "usr"}], + "sonnet_response": '{"summary":"test"}', + "haiku_parsed": {"summary": "test from haiku"}, + }) + "\n") + loader = JsonlLoader() + entries = loader.load([path]) + assert entries[0].parsed == {"summary": "test from haiku"} + + +def test_load_prefers_response_over_sonnet_response(tmp_path): + """When both 'response' and 'sonnet_response' exist, prefer 'response'.""" + path = str(tmp_path / "both.jsonl") + with open(path, "w") as f: + f.write(json.dumps({ + "run_id": "run-001", "stage": "extract", "pass": "summarize", + "model": "test", "time": "T", + "prompt": [{"role": "system", "content": "sys"}, {"role": "user", "content": "usr"}], + "response": "original response", + "parsed": {"original": True}, + "sonnet_response": "sonnet response", + "haiku_parsed": {"haiku": True}, + }) + "\n") + loader = JsonlLoader() + entries = loader.load([path]) + assert entries[0].response == "original response" + assert entries[0].parsed == {"original": True} + + +def test_load_sonnet_comparison_end_to_end(tmp_path): + """Sonnet comparison entries should survive the full loader pipeline.""" + path = str(tmp_path / "sonnet.jsonl") + with open(path, "w") as f: + f.write(json.dumps({ + "run_id": "run-sonnet-001", "stage": "extract", "pass": "summarize", + "model": "sonnet", "time": "2026-03-28T00:00:00Z", "chunk": 0, + "prompt": [{"role": "system", "content": "sys"}, {"role": "user", "content": "usr"}], + "sonnet_response": '{"summary":"from sonnet"}', + "haiku_response": '{"summary":"from haiku"}', + "haiku_parsed": {"summary": "from haiku"}, + }) + "\n") + loader = JsonlLoader() + entries = loader.load([path]) + assert len(entries) == 1 + assert entries[0].response == '{"summary":"from sonnet"}' + assert entries[0].parsed == {"summary": "from haiku"} + assert entries[0].model == "sonnet" + assert entries[0].chunk == 0