nvandessel · nvandessel · Apr 2, 2026 · Apr 1, 2026 · Apr 1, 2026 · Apr 1, 2026
diff --git a/notebooks/train-hippofloop.ipynb b/notebooks/train-hippofloop.ipynb
@@ -0,0 +1,393 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# hippofloop: Distill floop's consolidator into a local model\n",
+    "\n",
+    "Fine-tunes **Qwen 2.5 3B Instruct** (4-bit QLoRA) on floop's decision logs.\n",
+    "Runs end-to-end on a Kaggle T4 GPU.\n",
+    "\n",
+    "**Pipeline:** Load JSONL → Clean → Format SFT pairs → Train → Evaluate → Export GGUF"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture\n",
+    "# Install unsloth (optimized for Kaggle/Colab T4).\n",
+    "# %pip (rather than !pip) installs into the environment of the running kernel, and\n",
+    "# quoting the extras spec keeps the shell from treating [colab-new] as a glob pattern.\n",
+    "%pip install \"unsloth[colab-new]\"\n",
+    "%pip install --no-deps trl peft accelerate bitsandbytes xformers\n",
+    "\n",
+    "# Install hippofloop from repo\n",
+    "%pip install git+https://github.com/nvandessel/hippofloop.git"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import logging\n",
+    "from pathlib import Path\n",
+    "\n",
+    "# Root logging at INFO level with timestamped records, plus a notebook-specific logger\n",
+    "logging.basicConfig(\n",
+    "    format=\"%(asctime)s %(levelname)s %(name)s: %(message)s\",\n",
+    "    level=logging.INFO,\n",
+    ")\n",
+    "logger = logging.getLogger(\"hippofloop.notebook\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Load and explore data\n",
+    "\n",
+    "Data is expected as a Kaggle Dataset mounted at `/kaggle/input/floop-decisions/`.\n",
+    "Upload your `decisions.jsonl` files as a Kaggle Dataset and attach it to this notebook so they appear there."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "DATA_DIR = Path(\"/kaggle/input/floop-decisions\")\n",
+    "\n",
+    "# Find all JSONL files at the top level of the dataset (sorted for a stable order)\n",
+    "jsonl_files = sorted(DATA_DIR.glob(\"*.jsonl\"))\n",
+    "if not jsonl_files:\n",
+    "    # Fail here with an actionable message rather than letting later cells\n",
+    "    # break on empty data (for example when indexing the first SFT pair).\n",
+    "    raise FileNotFoundError(\n",
+    "        f\"No .jsonl files found in {DATA_DIR}; attach the dataset to this notebook first.\"\n",
+    "    )\n",
+    "\n",
+    "print(f\"Found {len(jsonl_files)} JSONL files:\")\n",
+    "for f in jsonl_files:\n",
+    "    size_mb = f.stat().st_size / (1024 * 1024)\n",
+    "    print(f\"  {f.name} ({size_mb:.1f} MB)\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from collections import Counter\n",
+    "\n",
+    "from hippofloop.data.loader import JsonlLoader\n",
+    "from hippofloop.data.cleaner import DecisionCleaner\n",
+    "from hippofloop.data.formatter import SftFormatter\n",
+    "\n",
+    "loader = JsonlLoader()\n",
+    "cleaner = DecisionCleaner()\n",
+    "formatter = SftFormatter()\n",
+    "\n",
+    "# Load: read every JSONL file found above (paths are passed as plain strings)\n",
+    "entries = loader.load(list(map(str, jsonl_files)))\n",
+    "print(f\"Loaded: {len(entries)} entries\")\n",
+    "\n",
+    "# Clean: keep the cleaned entries and print whatever stats the cleaner reports\n",
+    "cleaned, stats = cleaner.clean_with_stats(entries)\n",
+    "print(f\"\\nCleaning stats:\")\n",
+    "for stat_name, stat_value in stats.items():\n",
+    "    print(f\"  {stat_name}: {stat_value}\")\n",
+    "\n",
+    "# Format: turn the cleaned entries into SFT pairs\n",
+    "pairs = formatter.format(cleaned)\n",
+    "print(f\"\\nSFT pairs: {len(pairs)}\")\n",
+    "\n",
+    "# Per-task breakdown, listed in sorted task order\n",
+    "task_counts = Counter(pair.task for pair in pairs)\n",
+    "print(\"\\nBy task:\")\n",
+    "for task in sorted(task_counts):\n",
+    "    print(f\"  {task}: {task_counts[task]}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Show the first SFT pair; message contents longer than 200 characters are\n",
+    "# cut to their first 200 characters followed by '...'\n",
+    "sample = pairs[0]\n",
+    "print(f\"Task: {sample.task}\")\n",
+    "print(f\"Source stage: {sample.source_stage}\")\n",
+    "for msg in sample.messages:\n",
+    "    content_preview = msg['content']\n",
+    "    if len(content_preview) > 200:\n",
+    "        content_preview = content_preview[:200] + '...'\n",
+    "    print(f\"\\n[{msg['role']}]\\n{content_preview}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Split data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "SEED = 42\n",
+    "\n",
+    "# Seeded split of the SFT pairs into train / validation / test subsets\n",
+    "splits = formatter.split(pairs, train_ratio=0.8, val_ratio=0.1, seed=SEED)\n",
+    "train_pairs, val_pairs, test_pairs = splits\n",
+    "print(f\"Train: {len(train_pairs)}, Val: {len(val_pairs)}, Test: {len(test_pairs)}\")\n",
+    "\n",
+    "# How many training pairs each task contributes, in sorted task order\n",
+    "train_tasks = Counter(pair.task for pair in train_pairs)\n",
+    "print(\"\\nTrain split by task:\")\n",
+    "for task in sorted(train_tasks):\n",
+    "    count = train_tasks[task]\n",
+    "    print(f\"  {task}: {count}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Load model and apply LoRA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from unsloth import FastLanguageModel\n",
+    "\n",
+    "BASE_MODEL = \"unsloth/Qwen2.5-3B-Instruct-bnb-4bit\"\n",
+    "MAX_SEQ_LENGTH = 8192\n",
+    "\n",
+    "# Load the 4-bit base model together with its tokenizer\n",
+    "model, tokenizer = FastLanguageModel.from_pretrained(\n",
+    "    model_name=BASE_MODEL,\n",
+    "    max_seq_length=MAX_SEQ_LENGTH,\n",
+    "    load_in_4bit=True,\n",
+    ")\n",
+    "\n",
+    "# LoRA adapter settings: rank 32, alpha 64, dropout 0.05 on the attention\n",
+    "# (q/k/v/o) and MLP (gate/up/down) projection modules\n",
+    "lora_settings = dict(\n",
+    "    r=32,\n",
+    "    lora_alpha=64,\n",
+    "    lora_dropout=0.05,\n",
+    "    target_modules=[\n",
+    "        \"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
+    "        \"gate_proj\", \"up_proj\", \"down_proj\",\n",
+    "    ],\n",
+    ")\n",
+    "model = FastLanguageModel.get_peft_model(model, **lora_settings)\n",
+    "\n",
+    "# Report how many parameters are trainable\n",
+    "model.print_trainable_parameters()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Prepare datasets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import Dataset\n",
+    "from hippofloop.training.trainer import UnslothTrainer\n",
+    "from hippofloop.training.config import TrainingConfig\n",
+    "\n",
+    "# Training configuration handed to UnslothTrainer below\n",
+    "config = TrainingConfig(\n",
+    "    # Base model and LoRA adapter\n",
+    "    base_model=BASE_MODEL,\n",
+    "    lora_rank=32,\n",
+    "    lora_alpha=64,\n",
+    "    lora_dropout=0.05,\n",
+    "    lora_target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
+    "    # Optimization schedule\n",
+    "    learning_rate=2e-4,\n",
+    "    lr_scheduler=\"cosine\",\n",
+    "    warmup_ratio=0.03,\n",
+    "    epochs=3,\n",
+    "    batch_size=1,\n",
+    "    gradient_accumulation_steps=16,\n",
+    "    max_seq_length=MAX_SEQ_LENGTH,\n",
+    "    weight_decay=0.01,\n",
+    "    # Mixed precision: fp16 rather than bf16\n",
+    "    bf16=False,\n",
+    "    fp16=True,\n",
+    "    # Data split and seed\n",
+    "    train_split=0.8,\n",
+    "    val_split=0.1,\n",
+    "    test_split=0.1,\n",
+    "    seed=SEED,\n",
+    "    # Export settings\n",
+    "    quantization=\"Q4_K_M\",\n",
+    "    output_path=\"hippofloop.gguf\",\n",
+    ")\n",
+    "\n",
+    "# Use UnslothTrainer.prepare_dataset for consistent formatting of both splits\n",
+    "trainer_helper = UnslothTrainer(config)\n",
+    "train_dataset, val_dataset = (\n",
+    "    Dataset.from_list(trainer_helper.prepare_dataset(split_pairs))\n",
+    "    for split_pairs in (train_pairs, val_pairs)\n",
+    ")\n",
+    "\n",
+    "print(f\"Train dataset: {len(train_dataset)} examples\")\n",
+    "print(f\"Val dataset: {len(val_dataset)} examples\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6. Train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import TrainingArguments\n",
+    "from trl import SFTTrainer\n",
+    "\n",
+    "OUTPUT_DIR = \"checkpoints/qwen25-3b-hippofloop\"\n",
+    "\n",
+    "# Evaluate and checkpoint once per epoch, then reload the checkpoint with the\n",
+    "# best eval_loss when training finishes.\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir=OUTPUT_DIR,\n",
+    "    num_train_epochs=3,\n",
+    "    per_device_train_batch_size=1,\n",
+    "    # Trainer's default eval batch size is 8; match the training batch size so the\n",
+    "    # per-epoch evaluation of long sequences does not run the T4 out of memory.\n",
+    "    per_device_eval_batch_size=1,\n",
+    "    gradient_accumulation_steps=16,\n",
+    "    learning_rate=2e-4,\n",
+    "    lr_scheduler_type=\"cosine\",\n",
+    "    warmup_ratio=0.03,\n",
+    "    weight_decay=0.01,\n",
+    "    bf16=False,\n",
+    "    fp16=True,\n",
+    "    eval_strategy=\"epoch\",\n",
+    "    save_strategy=\"epoch\",\n",
+    "    load_best_model_at_end=True,\n",
+    "    metric_for_best_model=\"eval_loss\",\n",
+    "    logging_steps=10,\n",
+    "    seed=SEED,\n",
+    ")\n",
+    "\n",
+    "sft_trainer = SFTTrainer(\n",
+    "    model=model,\n",
+    "    tokenizer=tokenizer,\n",
+    "    train_dataset=train_dataset,\n",
+    "    eval_dataset=val_dataset,\n",
+    "    args=training_args,\n",
+    ")\n",
+    "\n",
+    "print(f\"Training {len(train_dataset)} examples for {training_args.num_train_epochs} epochs...\")\n",
+    "# Effective batch size = per-device batch size x gradient accumulation steps\n",
+    "print(f\"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}\")\n",
+    "sft_trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "# Training loss curve\nimport matplotlib.pyplot as plt\n\ntrain_losses = [(e[\"step\"], e[\"loss\"]) for e in sft_trainer.state.log_history if \"loss\" in e]\neval_losses = [(e[\"step\"], e[\"eval_loss\"]) for e in sft_trainer.state.log_history if \"eval_loss\" in e]\n\nfig, ax = plt.subplots(figsize=(10, 5))\nif train_losses:\n    steps, losses = zip(*train_losses)\n    ax.plot(steps, losses, label=\"Train loss\", alpha=0.7)\nif eval_losses:\n    steps, losses = zip(*eval_losses)\n    ax.plot(steps, losses, \"o-\", label=\"Eval loss\", markersize=8)\nax.set_xlabel(\"Step\")\nax.set_ylabel(\"Loss\")\nax.set_title(\"Training Loss\")\nax.legend()\nax.grid(True, alpha=0.3)\nplt.tight_layout()\nplt.show()\n\n# Best eval loss\nbest_loss = None\nif eval_losses:\n    best_step, best_loss = min(eval_losses, key=lambda x: x[1])\n    print(f\"Best eval loss: {best_loss:.4f} at step {best_step}\")"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 7. Save best model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write the model and then the tokenizer into the same directory\n",
+    "BEST_MODEL_PATH = f\"{OUTPUT_DIR}/best\"\n",
+    "for save_to in (sft_trainer.save_model, tokenizer.save_pretrained):\n",
+    "    save_to(BEST_MODEL_PATH)\n",
+    "print(f\"Saved best model to {BEST_MODEL_PATH}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 8. Evaluate on test set"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "import random\n\nfrom hippofloop.eval.evaluator import ModelEvaluator\n\n# Set up inference mode\nFastLanguageModel.for_inference(model)\n\ndef model_fn(system_msg: str, user_msg: str) -> str:\n    \"\"\"Run inference on the fine-tuned model.\"\"\"\n    messages = [\n        {\"role\": \"system\", \"content\": system_msg},\n        {\"role\": \"user\", \"content\": user_msg},\n    ]\n    inputs = tokenizer.apply_chat_template(\n        messages, tokenize=True, add_generation_prompt=True, return_tensors=\"pt\"\n    ).to(model.device)\n    outputs = model.generate(\n        input_ids=inputs, max_new_tokens=2048, temperature=0.1, do_sample=True\n    )\n    # Decode only the generated tokens\n    generated = outputs[0][inputs.shape[-1]:]\n    return tokenizer.decode(generated, skip_special_tokens=True)\n\n# Run eval on a representative random subset (full eval can be slow)\nrng = random.Random(SEED)\neval_subset = rng.sample(test_pairs, min(50, len(test_pairs)))\nprint(f\"Evaluating on {len(eval_subset)} test examples...\")\n\nevaluator = ModelEvaluator(model_fn)\nresults = evaluator.evaluate(eval_subset)\nreport = evaluator.summary_report(results)\n\nprint(f\"\\nOverall:\")\nprint(f\"  JSON valid rate: {report['json_valid_rate']:.1%}\")\nprint(f\"  Schema valid rate: {report['schema_valid_rate']:.1%}\")\nprint(f\"\\nBy task:\")\nfor task, task_report in report.get('by_task', {}).items():\n    print(f\"  {task} (n={task_report['count']}):\")\n    print(f\"    JSON valid: {task_report['json_valid_rate']:.1%}\")\n    print(f\"    Schema valid: {task_report['schema_valid_rate']:.1%}\")"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 9. Export to GGUF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "GGUF_OUTPUT = \"hippofloop-qwen25-3b-Q4_K_M.gguf\"\n",
+    "\n",
+    "# Export the model with q4_k_m quantization under the gguf_export path\n",
+    "model.save_pretrained_gguf(\"gguf_export\", tokenizer, quantization_method=\"q4_k_m\")\n",
+    "\n",
+    "# Move the most recently modified .gguf file to the target filename\n",
+    "gguf_dir = Path(\"gguf_export\")\n",
+    "gguf_files = list(gguf_dir.glob(\"*.gguf\"))\n",
+    "if not gguf_files:\n",
+    "    print(\"Warning: no GGUF file found after export\")\n",
+    "else:\n",
+    "    actual = max(gguf_files, key=lambda candidate: candidate.stat().st_mtime)\n",
+    "    actual.rename(GGUF_OUTPUT)\n",
+    "    size_mb = Path(GGUF_OUTPUT).stat().st_size / (1024 * 1024)\n",
+    "    print(f\"Exported: {GGUF_OUTPUT} ({size_mb:.0f} MB)\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 10. Download\n",
+    "\n",
+    "The GGUF file is saved in the notebook's working directory.\n",
+    "On Kaggle, go to the **Output** tab to download it, or save it as a Kaggle Dataset for reuse."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "# Summary\nprint(\"=\" * 60)\nprint(\"hippofloop training complete\")\nprint(\"=\" * 60)\nprint(f\"Base model: {BASE_MODEL}\")\nprint(f\"Training examples: {len(train_pairs)}\")\nprint(f\"Validation examples: {len(val_pairs)}\")\nprint(f\"Test examples: {len(test_pairs)}\")\nif best_loss is not None:\n    print(f\"Best eval loss: {best_loss:.4f}\")\nprint(f\"JSON valid rate: {report['json_valid_rate']:.1%}\")\nprint(f\"Schema valid rate: {report['schema_valid_rate']:.1%}\")\nprint(f\"GGUF output: {GGUF_OUTPUT}\")"
+  }
+ ],
+ "metadata": {
+  "kaggle": {
+   "accelerator": "gpu",
+   "dataSources": [],
+   "isGpuEnabled": true,
+   "isInternetEnabled": true,
+   "language": "python"
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}