From 14ec1b71223c9077d74dea710445f5e41d63a946 Mon Sep 17 00:00:00 2001 From: Nic van Dessel <51134175+nvandessel@users.noreply.github.com> Date: Wed, 1 Apr 2026 04:17:36 +0000 Subject: [PATCH 1/6] fix(loader): add fallback field mapping for sonnet comparison schema The sonnet comparison data uses `sonnet_response` and `haiku_parsed` instead of `response` and `parsed`. Add fallback mapping in _parse_entry() so these records are loaded correctly, yielding 682 additional SFT training pairs (2,076 total). Co-Authored-By: Claude Opus 4.6 --- src/hippofloop/data/loader.py | 4 +- tests/data/test_loader.py | 77 +++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 2 deletions(-) diff --git a/src/hippofloop/data/loader.py b/src/hippofloop/data/loader.py index 91a6a6d..18e6948 100644 --- a/src/hippofloop/data/loader.py +++ b/src/hippofloop/data/loader.py @@ -53,8 +53,8 @@ def _parse_entry(self, raw: dict) -> DecisionEntry: stage=raw.get("stage", ""), pass_=raw.get("pass", ""), prompt=raw.get("prompt", []), - response=raw.get("response", ""), - parsed=raw.get("parsed"), + response=raw.get("response") or raw.get("sonnet_response", ""), + parsed=raw.get("parsed") if raw.get("parsed") is not None else raw.get("haiku_parsed"), run_id=raw.get("run_id", ""), model=raw.get("model", ""), time=raw.get("time", raw.get("timestamp", "")), diff --git a/tests/data/test_loader.py b/tests/data/test_loader.py index f4279b0..c7710c6 100644 --- a/tests/data/test_loader.py +++ b/tests/data/test_loader.py @@ -189,3 +189,80 @@ def test_load_llm_fallback_event_detected(tmp_path): loader = JsonlLoader() entries = loader.load([path]) assert entries[0].fallback is True + + +# -- Sonnet comparison schema fallback -- + + +def test_load_sonnet_comparison_uses_sonnet_response(tmp_path): + """When 'response' is missing, fall back to 'sonnet_response'.""" + path = str(tmp_path / "sonnet.jsonl") + with open(path, "w") as f: + f.write(json.dumps({ + "run_id": "run-sonnet-001", "stage": "extract", "pass": "arc", + "model": "sonnet", "time": "2026-03-28T00:00:00Z", + "prompt": [{"role": "system", "content": "sys"}, {"role": "user", "content": "usr"}], + "sonnet_response": '{"arc":"test arc"}', + "haiku_response": '{"arc":"haiku arc"}', + "haiku_parsed": {"arc": "haiku arc"}, + }) + "\n") + loader = JsonlLoader() + entries = loader.load([path]) + assert len(entries) == 1 + assert entries[0].response == '{"arc":"test arc"}' + + +def test_load_sonnet_comparison_uses_haiku_parsed(tmp_path): + """When 'parsed' is missing, fall back to 'haiku_parsed'.""" + path = str(tmp_path / "sonnet.jsonl") + with open(path, "w") as f: + f.write(json.dumps({ + "run_id": "run-sonnet-001", "stage": "extract", "pass": "summarize", + "model": "sonnet", "time": "2026-03-28T00:00:00Z", + "prompt": [{"role": "system", "content": "sys"}, {"role": "user", "content": "usr"}], + "sonnet_response": '{"summary":"test"}', + "haiku_parsed": {"summary": "test from haiku"}, + }) + "\n") + loader = JsonlLoader() + entries = loader.load([path]) + assert entries[0].parsed == {"summary": "test from haiku"} + + +def test_load_prefers_response_over_sonnet_response(tmp_path): + """When both 'response' and 'sonnet_response' exist, prefer 'response'.""" + path = str(tmp_path / "both.jsonl") + with open(path, "w") as f: + f.write(json.dumps({ + "run_id": "run-001", "stage": "extract", "pass": "summarize", + "model": "test", "time": "T", + "prompt": [{"role": "system", "content": "sys"}, {"role": "user", "content": "usr"}], + "response": "original response", + "parsed": {"original": True}, + "sonnet_response": "sonnet response", + "haiku_parsed": {"haiku": True}, + }) + "\n") + loader = JsonlLoader() + entries = loader.load([path]) + assert entries[0].response == "original response" + assert entries[0].parsed == {"original": True} + + +def test_load_sonnet_comparison_end_to_end(tmp_path): + """Sonnet comparison entries should survive the full loader pipeline.""" + path = str(tmp_path / "sonnet.jsonl") + with open(path, "w") as f: + f.write(json.dumps({ + "run_id": "run-sonnet-001", "stage": "extract", "pass": "summarize", + "model": "sonnet", "time": "2026-03-28T00:00:00Z", "chunk": 0, + "prompt": [{"role": "system", "content": "sys"}, {"role": "user", "content": "usr"}], + "sonnet_response": '{"summary":"from sonnet"}', + "haiku_response": '{"summary":"from haiku"}', + "haiku_parsed": {"summary": "from haiku"}, + }) + "\n") + loader = JsonlLoader() + entries = loader.load([path]) + assert len(entries) == 1 + assert entries[0].response == '{"summary":"from sonnet"}' + assert entries[0].parsed == {"summary": "from haiku"} + assert entries[0].model == "sonnet" + assert entries[0].chunk == 0 From 49a3e9eb0cc57f3cc2f00980142b1e682b995cbe Mon Sep 17 00:00:00 2001 From: Nic van Dessel <51134175+nvandessel@users.noreply.github.com> Date: Wed, 1 Apr 2026 04:20:37 +0000 Subject: [PATCH 2/6] feat(notebook): add Kaggle training notebook for Qwen 2.5 3B MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Self-contained notebook that runs the full pipeline on a Kaggle T4: load JSONL → clean → format → train (QLoRA) → evaluate → export GGUF. Uses hippofloop's data pipeline and Unsloth for efficient 4-bit training with batch_size=1, grad_accum=16, max_seq_length=8192. Co-Authored-By: Claude Opus 4.6 --- notebooks/train-hippofloop.ipynb | 471 +++++++++++++++++++++++++++++++ 1 file changed, 471 insertions(+) create mode 100644 notebooks/train-hippofloop.ipynb diff --git a/notebooks/train-hippofloop.ipynb b/notebooks/train-hippofloop.ipynb new file mode 100644 index 0000000..77f2792 --- /dev/null +++ b/notebooks/train-hippofloop.ipynb @@ -0,0 +1,471 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# hippofloop: Distill floop's consolidator into a local model\n", + "\n", + "Fine-tunes **Qwen 2.5 3B Instruct** (4-bit QLoRA) on floop's decision logs.\n", + "Runs end-to-end on a Kaggle T4 GPU.\n", + "\n", + "**Pipeline:** Load JSONL → Clean → Format SFT pairs → Train → Evaluate → Export GGUF" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "# Install unsloth (optimized for Kaggle/Colab T4)\n", + "!pip install unsloth[colab-new]\n", + "!pip install --no-deps trl peft accelerate bitsandbytes xformers\n", + "\n", + "# Install hippofloop from repo\n", + "!pip install git+https://github.com/nvandessel/hippofloop.git" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import logging\n", + "from pathlib import Path\n", + "\n", + "logging.basicConfig(level=logging.INFO, format=\"%(asctime)s %(levelname)s %(name)s: %(message)s\")\n", + "logger = logging.getLogger(\"hippofloop.notebook\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Load and explore data\n", + "\n", + "Data is expected as a Kaggle Dataset mounted at `/kaggle/input/floop-decisions/`.\n", + "Upload your `decisions.jsonl` files there." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_DIR = Path(\"/kaggle/input/floop-decisions\")\n", + "\n", + "# Find all JSONL files in the dataset\n", + "jsonl_files = sorted(DATA_DIR.glob(\"*.jsonl\"))\n", + "print(f\"Found {len(jsonl_files)} JSONL files:\")\n", + "for f in jsonl_files:\n", + " size_mb = f.stat().st_size / (1024 * 1024)\n", + " print(f\" {f.name} ({size_mb:.1f} MB)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from hippofloop.data.loader import JsonlLoader\n", + "from hippofloop.data.cleaner import DecisionCleaner\n", + "from hippofloop.data.formatter import SftFormatter\n", + "\n", + "loader = JsonlLoader()\n", + "cleaner = DecisionCleaner()\n", + "formatter = SftFormatter()\n", + "\n", + "# Load\n", + "entries = loader.load([str(f) for f in jsonl_files])\n", + "print(f\"Loaded: {len(entries)} entries\")\n", + "\n", + "# Clean\n", + "cleaned, stats = cleaner.clean_with_stats(entries)\n", + "print(f\"\\nCleaning stats:\")\n", + "for k, v in stats.items():\n", + " print(f\" {k}: {v}\")\n", + "\n", + "# Format\n", + "pairs = formatter.format(cleaned)\n", + "print(f\"\\nSFT pairs: {len(pairs)}\")\n", + "\n", + "# Per-task breakdown\n", + "from collections import Counter\n", + "task_counts = Counter(p.task for p in pairs)\n", + "print(\"\\nBy task:\")\n", + "for task, count in sorted(task_counts.items()):\n", + " print(f\" {task}: {count}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Inspect a sample SFT pair\n", + "sample = pairs[0]\n", + "print(f\"Task: {sample.task}\")\n", + "print(f\"Source stage: {sample.source_stage}\")\n", + "for msg in sample.messages:\n", + " content_preview = msg['content'][:200] + '...' if len(msg['content']) > 200 else msg['content']\n", + " print(f\"\\n[{msg['role']}]\\n{content_preview}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Split data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "SEED = 42\n", + "\n", + "train_pairs, val_pairs, test_pairs = formatter.split(\n", + " pairs, train_ratio=0.8, val_ratio=0.1, seed=SEED\n", + ")\n", + "print(f\"Train: {len(train_pairs)}, Val: {len(val_pairs)}, Test: {len(test_pairs)}\")\n", + "\n", + "# Per-task distribution in train split\n", + "train_tasks = Counter(p.task for p in train_pairs)\n", + "print(\"\\nTrain split by task:\")\n", + "for task, count in sorted(train_tasks.items()):\n", + " print(f\" {task}: {count}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Load model and apply LoRA" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from unsloth import FastLanguageModel\n", + "\n", + "BASE_MODEL = \"unsloth/Qwen2.5-3B-Instruct-bnb-4bit\"\n", + "MAX_SEQ_LENGTH = 8192\n", + "\n", + "model, tokenizer = FastLanguageModel.from_pretrained(\n", + " model_name=BASE_MODEL,\n", + " max_seq_length=MAX_SEQ_LENGTH,\n", + " load_in_4bit=True,\n", + ")\n", + "\n", + "model = FastLanguageModel.get_peft_model(\n", + " model,\n", + " r=32,\n", + " lora_alpha=64,\n", + " lora_dropout=0.05,\n", + " target_modules=[\n", + " \"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n", + " \"gate_proj\", \"up_proj\", \"down_proj\",\n", + " ],\n", + ")\n", + "\n", + "# Print trainable params\n", + "model.print_trainable_parameters()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Prepare datasets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import Dataset\n", + "from hippofloop.training.trainer import UnslothTrainer\n", + "from hippofloop.training.config import TrainingConfig\n", + "\n", + "# Use UnslothTrainer.prepare_dataset for consistent formatting\n", + "config = TrainingConfig(\n", + " base_model=BASE_MODEL,\n", + " lora_rank=32, lora_alpha=64, lora_dropout=0.05,\n", + " lora_target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n", + " learning_rate=2e-4, lr_scheduler=\"cosine\", warmup_ratio=0.03,\n", + " epochs=3, batch_size=1, gradient_accumulation_steps=16,\n", + " max_seq_length=MAX_SEQ_LENGTH, weight_decay=0.01,\n", + " bf16=False, fp16=True,\n", + " train_split=0.8, val_split=0.1, test_split=0.1, seed=SEED,\n", + " quantization=\"Q4_K_M\", output_path=\"hippofloop.gguf\",\n", + ")\n", + "\n", + "trainer_helper = UnslothTrainer(config)\n", + "train_dataset = Dataset.from_list(trainer_helper.prepare_dataset(train_pairs))\n", + "val_dataset = Dataset.from_list(trainer_helper.prepare_dataset(val_pairs))\n", + "\n", + "print(f\"Train dataset: {len(train_dataset)} examples\")\n", + "print(f\"Val dataset: {len(val_dataset)} examples\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import TrainingArguments\n", + "from trl import SFTTrainer\n", + "\n", + "OUTPUT_DIR = \"checkpoints/qwen25-3b-hippofloop\"\n", + "\n", + "training_args = TrainingArguments(\n", + " output_dir=OUTPUT_DIR,\n", + " num_train_epochs=3,\n", + " per_device_train_batch_size=1,\n", + " gradient_accumulation_steps=16,\n", + " learning_rate=2e-4,\n", + " lr_scheduler_type=\"cosine\",\n", + " warmup_ratio=0.03,\n", + " weight_decay=0.01,\n", + " bf16=False,\n", + " fp16=True,\n", + " eval_strategy=\"epoch\",\n", + " save_strategy=\"epoch\",\n", + " load_best_model_at_end=True,\n", + " metric_for_best_model=\"eval_loss\",\n", + " logging_steps=10,\n", + " seed=SEED,\n", + ")\n", + "\n", + "sft_trainer = SFTTrainer(\n", + " model=model,\n", + " tokenizer=tokenizer,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=val_dataset,\n", + " args=training_args,\n", + ")\n", + "\n", + "print(f\"Training {len(train_dataset)} examples for {training_args.num_train_epochs} epochs...\")\n", + "print(f\"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}\")\n", + "sft_trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Training loss curve\n", + "import matplotlib.pyplot as plt\n", + "\n", + "train_losses = [(e[\"step\"], e[\"loss\"]) for e in sft_trainer.state.log_history if \"loss\" in e]\n", + "eval_losses = [(e[\"step\"], e[\"eval_loss\"]) for e in sft_trainer.state.log_history if \"eval_loss\" in e]\n", + "\n", + "fig, ax = plt.subplots(figsize=(10, 5))\n", + "if train_losses:\n", + " steps, losses = zip(*train_losses)\n", + " ax.plot(steps, losses, label=\"Train loss\", alpha=0.7)\n", + "if eval_losses:\n", + " steps, losses = zip(*eval_losses)\n", + " ax.plot(steps, losses, \"o-\", label=\"Eval loss\", markersize=8)\n", + "ax.set_xlabel(\"Step\")\n", + "ax.set_ylabel(\"Loss\")\n", + "ax.set_title(\"Training Loss\")\n", + "ax.legend()\n", + "ax.grid(True, alpha=0.3)\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "# Best eval loss\n", + "if eval_losses:\n", + " best_step, best_loss = min(eval_losses, key=lambda x: x[1])\n", + " print(f\"Best eval loss: {best_loss:.4f} at step {best_step}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Save best model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "BEST_MODEL_PATH = f\"{OUTPUT_DIR}/best\"\n", + "sft_trainer.save_model(BEST_MODEL_PATH)\n", + "tokenizer.save_pretrained(BEST_MODEL_PATH)\n", + "print(f\"Saved best model to {BEST_MODEL_PATH}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. Evaluate on test set" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from hippofloop.eval.evaluator import ModelEvaluator\n", + "\n", + "# Set up inference mode\n", + "FastLanguageModel.for_inference(model)\n", + "\n", + "def model_fn(system_msg: str, user_msg: str) -> str:\n", + " \"\"\"Run inference on the fine-tuned model.\"\"\"\n", + " messages = [\n", + " {\"role\": \"system\", \"content\": system_msg},\n", + " {\"role\": \"user\", \"content\": user_msg},\n", + " ]\n", + " inputs = tokenizer.apply_chat_template(\n", + " messages, tokenize=True, add_generation_prompt=True, return_tensors=\"pt\"\n", + " ).to(model.device)\n", + " outputs = model.generate(\n", + " input_ids=inputs, max_new_tokens=2048, temperature=0.1, do_sample=True\n", + " )\n", + " # Decode only the generated tokens\n", + " generated = outputs[0][inputs.shape[-1]:]\n", + " return tokenizer.decode(generated, skip_special_tokens=True)\n", + "\n", + "# Run eval on a subset (full eval can be slow)\n", + "eval_subset = test_pairs[:50]\n", + "print(f\"Evaluating on {len(eval_subset)} test examples...\")\n", + "\n", + "evaluator = ModelEvaluator(model_fn)\n", + "results = evaluator.evaluate(eval_subset)\n", + "report = evaluator.summary_report(results)\n", + "\n", + "print(f\"\\nOverall:\")\n", + "print(f\" JSON valid rate: {report['json_valid_rate']:.1%}\")\n", + "print(f\" Schema valid rate: {report['schema_valid_rate']:.1%}\")\n", + "print(f\"\\nBy task:\")\n", + "for task, task_report in report.get('by_task', {}).items():\n", + " print(f\" {task} (n={task_report['count']}):\")\n", + " print(f\" JSON valid: {task_report['json_valid_rate']:.1%}\")\n", + " print(f\" Schema valid: {task_report['schema_valid_rate']:.1%}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 9. Export to GGUF" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "GGUF_OUTPUT = \"hippofloop-qwen25-3b-Q4_K_M.gguf\"\n", + "\n", + "model.save_pretrained_gguf(\n", + " \"gguf_export\",\n", + " tokenizer,\n", + " quantization_method=\"q4_k_m\",\n", + ")\n", + "\n", + "# Rename to target filename\n", + "gguf_dir = Path(\"gguf_export\")\n", + "gguf_files = list(gguf_dir.glob(\"*.gguf\"))\n", + "if gguf_files:\n", + " actual = max(gguf_files, key=lambda p: p.stat().st_mtime)\n", + " actual.rename(GGUF_OUTPUT)\n", + " size_mb = Path(GGUF_OUTPUT).stat().st_size / (1024 * 1024)\n", + " print(f\"Exported: {GGUF_OUTPUT} ({size_mb:.0f} MB)\")\n", + "else:\n", + " print(\"Warning: no GGUF file found after export\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 10. Download\n", + "\n", + "The GGUF file is saved in the notebook's working directory.\n", + "On Kaggle, go to **Output** tab to download it, or save as a Kaggle Dataset for reuse." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Summary\n", + "print(\"=\" * 60)\n", + "print(\"hippofloop training complete\")\n", + "print(\"=\" * 60)\n", + "print(f\"Base model: {BASE_MODEL}\")\n", + "print(f\"Training examples: {len(train_pairs)}\")\n", + "print(f\"Validation examples: {len(val_pairs)}\")\n", + "print(f\"Test examples: {len(test_pairs)}\")\n", + "if eval_losses:\n", + " print(f\"Best eval loss: {best_loss:.4f}\")\n", + "print(f\"JSON valid rate: {report['json_valid_rate']:.1%}\")\n", + "print(f\"Schema valid rate: {report['schema_valid_rate']:.1%}\")\n", + "print(f\"GGUF output: {GGUF_OUTPUT}\")" + ] + } + ], + "metadata": { + "kaggle": { + "accelerator": "gpu", + "dataSources": [], + "isGpuEnabled": true, + "isInternetEnabled": true, + "language": "python" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From ebe13c361d1cc961aeed53359414e7bd9b641277 Mon Sep 17 00:00:00 2001 From: nic van dessel <51134175+nvandessel@users.noreply.github.com> Date: Tue, 31 Mar 2026 21:21:59 -0700 Subject: [PATCH 3/6] Update src/hippofloop/data/loader.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- src/hippofloop/data/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hippofloop/data/loader.py b/src/hippofloop/data/loader.py index 18e6948..0358ff0 100644 --- a/src/hippofloop/data/loader.py +++ b/src/hippofloop/data/loader.py @@ -53,7 +53,7 @@ def _parse_entry(self, raw: dict) -> DecisionEntry: stage=raw.get("stage", ""), pass_=raw.get("pass", ""), prompt=raw.get("prompt", []), - response=raw.get("response") or raw.get("sonnet_response", ""), + response=raw.get("response") if raw.get("response") is not None else raw.get("sonnet_response", ""), parsed=raw.get("parsed") if raw.get("parsed") is not None else raw.get("haiku_parsed"), run_id=raw.get("run_id", ""), model=raw.get("model", ""), From aef72d7c8b29f9ae0d72f42fa5a9d085a8f79710 Mon Sep 17 00:00:00 2001 From: Nic van Dessel <51134175+nvandessel@users.noreply.github.com> Date: Wed, 1 Apr 2026 04:23:41 +0000 Subject: [PATCH 4/6] fix(notebook): representative eval sampling and best_loss guard - Use rng.sample() instead of ordered slice for eval subset - Initialize best_loss = None before training, guard with is not None Co-Authored-By: Claude Opus 4.6 --- notebooks/train-hippofloop.ipynb | 86 ++------------------------------ 1 file changed, 4 insertions(+), 82 deletions(-) diff --git a/notebooks/train-hippofloop.ipynb b/notebooks/train-hippofloop.ipynb index 77f2792..d7f70fc 100644 --- a/notebooks/train-hippofloop.ipynb +++ b/notebooks/train-hippofloop.ipynb @@ -284,33 +284,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Training loss curve\n", - "import matplotlib.pyplot as plt\n", - "\n", - "train_losses = [(e[\"step\"], e[\"loss\"]) for e in sft_trainer.state.log_history if \"loss\" in e]\n", - "eval_losses = [(e[\"step\"], e[\"eval_loss\"]) for e in sft_trainer.state.log_history if \"eval_loss\" in e]\n", - "\n", - "fig, ax = plt.subplots(figsize=(10, 5))\n", - "if train_losses:\n", - " steps, losses = zip(*train_losses)\n", - " ax.plot(steps, losses, label=\"Train loss\", alpha=0.7)\n", - "if eval_losses:\n", - " steps, losses = zip(*eval_losses)\n", - " ax.plot(steps, losses, \"o-\", label=\"Eval loss\", markersize=8)\n", - "ax.set_xlabel(\"Step\")\n", - "ax.set_ylabel(\"Loss\")\n", - "ax.set_title(\"Training Loss\")\n", - "ax.legend()\n", - "ax.grid(True, alpha=0.3)\n", - "plt.tight_layout()\n", - "plt.show()\n", - "\n", - "# Best eval loss\n", - "if eval_losses:\n", - " best_step, best_loss = min(eval_losses, key=lambda x: x[1])\n", - " print(f\"Best eval loss: {best_loss:.4f} at step {best_step}\")" - ] + "source": "# Training loss curve\nimport matplotlib.pyplot as plt\n\ntrain_losses = [(e[\"step\"], e[\"loss\"]) for e in sft_trainer.state.log_history if \"loss\" in e]\neval_losses = [(e[\"step\"], e[\"eval_loss\"]) for e in sft_trainer.state.log_history if \"eval_loss\" in e]\n\nfig, ax = plt.subplots(figsize=(10, 5))\nif train_losses:\n steps, losses = zip(*train_losses)\n ax.plot(steps, losses, label=\"Train loss\", alpha=0.7)\nif eval_losses:\n steps, losses = zip(*eval_losses)\n ax.plot(steps, losses, \"o-\", label=\"Eval loss\", markersize=8)\nax.set_xlabel(\"Step\")\nax.set_ylabel(\"Loss\")\nax.set_title(\"Training Loss\")\nax.legend()\nax.grid(True, alpha=0.3)\nplt.tight_layout()\nplt.show()\n\n# Best eval loss\nbest_loss = None\nif eval_losses:\n best_step, best_loss = min(eval_losses, key=lambda x: x[1])\n print(f\"Best eval loss: {best_loss:.4f} at step {best_step}\")" }, { "cell_type": "markdown", @@ -343,45 +317,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "from hippofloop.eval.evaluator import ModelEvaluator\n", - "\n", - "# Set up inference mode\n", - "FastLanguageModel.for_inference(model)\n", - "\n", - "def model_fn(system_msg: str, user_msg: str) -> str:\n", - " \"\"\"Run inference on the fine-tuned model.\"\"\"\n", - " messages = [\n", - " {\"role\": \"system\", \"content\": system_msg},\n", - " {\"role\": \"user\", \"content\": user_msg},\n", - " ]\n", - " inputs = tokenizer.apply_chat_template(\n", - " messages, tokenize=True, add_generation_prompt=True, return_tensors=\"pt\"\n", - " ).to(model.device)\n", - " outputs = model.generate(\n", - " input_ids=inputs, max_new_tokens=2048, temperature=0.1, do_sample=True\n", - " )\n", - " # Decode only the generated tokens\n", - " generated = outputs[0][inputs.shape[-1]:]\n", - " return tokenizer.decode(generated, skip_special_tokens=True)\n", - "\n", - "# Run eval on a subset (full eval can be slow)\n", - "eval_subset = test_pairs[:50]\n", - "print(f\"Evaluating on {len(eval_subset)} test examples...\")\n", - "\n", - "evaluator = ModelEvaluator(model_fn)\n", - "results = evaluator.evaluate(eval_subset)\n", - "report = evaluator.summary_report(results)\n", - "\n", - "print(f\"\\nOverall:\")\n", - "print(f\" JSON valid rate: {report['json_valid_rate']:.1%}\")\n", - "print(f\" Schema valid rate: {report['schema_valid_rate']:.1%}\")\n", - "print(f\"\\nBy task:\")\n", - "for task, task_report in report.get('by_task', {}).items():\n", - " print(f\" {task} (n={task_report['count']}):\")\n", - " print(f\" JSON valid: {task_report['json_valid_rate']:.1%}\")\n", - " print(f\" Schema valid: {task_report['schema_valid_rate']:.1%}\")" - ] + "source": "import random\n\nfrom hippofloop.eval.evaluator import ModelEvaluator\n\n# Set up inference mode\nFastLanguageModel.for_inference(model)\n\ndef model_fn(system_msg: str, user_msg: str) -> str:\n \"\"\"Run inference on the fine-tuned model.\"\"\"\n messages = [\n {\"role\": \"system\", \"content\": system_msg},\n {\"role\": \"user\", \"content\": user_msg},\n ]\n inputs = tokenizer.apply_chat_template(\n messages, tokenize=True, add_generation_prompt=True, return_tensors=\"pt\"\n ).to(model.device)\n outputs = model.generate(\n input_ids=inputs, max_new_tokens=2048, temperature=0.1, do_sample=True\n )\n # Decode only the generated tokens\n generated = outputs[0][inputs.shape[-1]:]\n return tokenizer.decode(generated, skip_special_tokens=True)\n\n# Run eval on a representative random subset (full eval can be slow)\nrng = random.Random(SEED)\neval_subset = rng.sample(test_pairs, min(50, len(test_pairs)))\nprint(f\"Evaluating on {len(eval_subset)} test examples...\")\n\nevaluator = ModelEvaluator(model_fn)\nresults = evaluator.evaluate(eval_subset)\nreport = evaluator.summary_report(results)\n\nprint(f\"\\nOverall:\")\nprint(f\" JSON valid rate: {report['json_valid_rate']:.1%}\")\nprint(f\" Schema valid rate: {report['schema_valid_rate']:.1%}\")\nprint(f\"\\nBy task:\")\nfor task, task_report in report.get('by_task', {}).items():\n print(f\" {task} (n={task_report['count']}):\")\n print(f\" JSON valid: {task_report['json_valid_rate']:.1%}\")\n print(f\" Schema valid: {task_report['schema_valid_rate']:.1%}\")" }, { "cell_type": "markdown", @@ -431,21 +367,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Summary\n", - "print(\"=\" * 60)\n", - "print(\"hippofloop training complete\")\n", - "print(\"=\" * 60)\n", - "print(f\"Base model: {BASE_MODEL}\")\n", - "print(f\"Training examples: {len(train_pairs)}\")\n", - "print(f\"Validation examples: {len(val_pairs)}\")\n", - "print(f\"Test examples: {len(test_pairs)}\")\n", - "if eval_losses:\n", - " print(f\"Best eval loss: {best_loss:.4f}\")\n", - "print(f\"JSON valid rate: {report['json_valid_rate']:.1%}\")\n", - "print(f\"Schema valid rate: {report['schema_valid_rate']:.1%}\")\n", - "print(f\"GGUF output: {GGUF_OUTPUT}\")" - ] + "source": "# Summary\nprint(\"=\" * 60)\nprint(\"hippofloop training complete\")\nprint(\"=\" * 60)\nprint(f\"Base model: {BASE_MODEL}\")\nprint(f\"Training examples: {len(train_pairs)}\")\nprint(f\"Validation examples: {len(val_pairs)}\")\nprint(f\"Test examples: {len(test_pairs)}\")\nif best_loss is not None:\n print(f\"Best eval loss: {best_loss:.4f}\")\nprint(f\"JSON valid rate: {report['json_valid_rate']:.1%}\")\nprint(f\"Schema valid rate: {report['schema_valid_rate']:.1%}\")\nprint(f\"GGUF output: {GGUF_OUTPUT}\")" } ], "metadata": { @@ -468,4 +390,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file From 5ca2098c281471d1bb1af850091cc9951f614b3b Mon Sep 17 00:00:00 2001 From: Nic van Dessel <51134175+nvandessel@users.noreply.github.com> Date: Wed, 1 Apr 2026 04:32:06 +0000 Subject: [PATCH 5/6] fix: wrap long lines in loader.py for ruff E501 Co-Authored-By: Claude Opus 4.6 (1M context) --- src/hippofloop/data/loader.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/hippofloop/data/loader.py b/src/hippofloop/data/loader.py index 0358ff0..6ed1ba0 100644 --- a/src/hippofloop/data/loader.py +++ b/src/hippofloop/data/loader.py @@ -53,8 +53,16 @@ def _parse_entry(self, raw: dict) -> DecisionEntry: stage=raw.get("stage", ""), pass_=raw.get("pass", ""), prompt=raw.get("prompt", []), - response=raw.get("response") if raw.get("response") is not None else raw.get("sonnet_response", ""), - parsed=raw.get("parsed") if raw.get("parsed") is not None else raw.get("haiku_parsed"), + response=( + raw.get("response") + if raw.get("response") is not None + else raw.get("sonnet_response", "") + ), + parsed=( + raw.get("parsed") + if raw.get("parsed") is not None + else raw.get("haiku_parsed") + ), run_id=raw.get("run_id", ""), model=raw.get("model", ""), time=raw.get("time", raw.get("timestamp", "")), From 594cbefcd4deee604b8664d91d7675a6da42c2da Mon Sep 17 00:00:00 2001 From: Nic van Dessel <51134175+nvandessel@users.noreply.github.com> Date: Wed, 1 Apr 2026 14:57:02 +0000 Subject: [PATCH 6/6] refactor: eliminate double dict lookups in _parse_entry Extract response, parsed, and time into locals before the constructor call. Fixes redundant hash+lookup per record and aligns all three fallback fields to the same is-not-None idiom. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/hippofloop/data/loader.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/hippofloop/data/loader.py b/src/hippofloop/data/loader.py index 6ed1ba0..b66e429 100644 --- a/src/hippofloop/data/loader.py +++ b/src/hippofloop/data/loader.py @@ -49,23 +49,18 @@ def _load_file(self, path: str) -> list[DecisionEntry]: return entries def _parse_entry(self, raw: dict) -> DecisionEntry: + resp = raw.get("response") + parsed = raw.get("parsed") + time_ = raw.get("time") return DecisionEntry( stage=raw.get("stage", ""), pass_=raw.get("pass", ""), prompt=raw.get("prompt", []), - response=( - raw.get("response") - if raw.get("response") is not None - else raw.get("sonnet_response", "") - ), - parsed=( - raw.get("parsed") - if raw.get("parsed") is not None - else raw.get("haiku_parsed") - ), + response=resp if resp is not None else raw.get("sonnet_response", ""), + parsed=parsed if parsed is not None else raw.get("haiku_parsed"), run_id=raw.get("run_id", ""), model=raw.get("model", ""), - time=raw.get("time", raw.get("timestamp", "")), + time=time_ if time_ is not None else raw.get("timestamp", ""), chunk=raw.get("chunk"), error=raw.get("error"), fallback=bool(raw.get("fallback", False)) or raw.get("event") == "llm_fallback",