Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
393 changes: 393 additions & 0 deletions notebooks/train-hippofloop.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,393 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# hippofloop: Distill floop's consolidator into a local model\n",
"\n",
"Fine-tunes **Qwen 2.5 3B Instruct** (4-bit QLoRA) on floop's decision logs.\n",
"Runs end-to-end on a Kaggle T4 GPU.\n",
"\n",
"**Pipeline:** Load JSONL → Clean → Format SFT pairs → Train → Evaluate → Export GGUF"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"# Install unsloth (optimized for Kaggle/Colab T4)\n",
"!pip install unsloth[colab-new]\n",
"!pip install --no-deps trl peft accelerate bitsandbytes xformers\n",
"\n",
"# Install hippofloop from repo\n",
"!pip install git+https://github.com/nvandessel/hippofloop.git"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import logging\n",
"from pathlib import Path\n",
"\n",
"logging.basicConfig(level=logging.INFO, format=\"%(asctime)s %(levelname)s %(name)s: %(message)s\")\n",
"logger = logging.getLogger(\"hippofloop.notebook\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Load and explore data\n",
"\n",
"Data is expected as a Kaggle Dataset mounted at `/kaggle/input/floop-decisions/`.\n",
"Upload your `decisions.jsonl` files there."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"DATA_DIR = Path(\"/kaggle/input/floop-decisions\")\n",
"\n",
"# Find all JSONL files in the dataset\n",
"jsonl_files = sorted(DATA_DIR.glob(\"*.jsonl\"))\n",
"print(f\"Found {len(jsonl_files)} JSONL files:\")\n",
"for f in jsonl_files:\n",
" size_mb = f.stat().st_size / (1024 * 1024)\n",
" print(f\" {f.name} ({size_mb:.1f} MB)\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from hippofloop.data.loader import JsonlLoader\n",
"from hippofloop.data.cleaner import DecisionCleaner\n",
"from hippofloop.data.formatter import SftFormatter\n",
"\n",
"loader = JsonlLoader()\n",
"cleaner = DecisionCleaner()\n",
"formatter = SftFormatter()\n",
"\n",
"# Load\n",
"entries = loader.load([str(f) for f in jsonl_files])\n",
"print(f\"Loaded: {len(entries)} entries\")\n",
"\n",
"# Clean\n",
"cleaned, stats = cleaner.clean_with_stats(entries)\n",
"print(f\"\\nCleaning stats:\")\n",
"for k, v in stats.items():\n",
" print(f\" {k}: {v}\")\n",
"\n",
"# Format\n",
"pairs = formatter.format(cleaned)\n",
"print(f\"\\nSFT pairs: {len(pairs)}\")\n",
"\n",
"# Per-task breakdown\n",
"from collections import Counter\n",
"task_counts = Counter(p.task for p in pairs)\n",
"print(\"\\nBy task:\")\n",
"for task, count in sorted(task_counts.items()):\n",
" print(f\" {task}: {count}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect a sample SFT pair\n",
"sample = pairs[0]\n",
"print(f\"Task: {sample.task}\")\n",
"print(f\"Source stage: {sample.source_stage}\")\n",
"for msg in sample.messages:\n",
" content_preview = msg['content'][:200] + '...' if len(msg['content']) > 200 else msg['content']\n",
" print(f\"\\n[{msg['role']}]\\n{content_preview}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Split data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"SEED = 42\n",
"\n",
"train_pairs, val_pairs, test_pairs = formatter.split(\n",
" pairs, train_ratio=0.8, val_ratio=0.1, seed=SEED\n",
")\n",
"print(f\"Train: {len(train_pairs)}, Val: {len(val_pairs)}, Test: {len(test_pairs)}\")\n",
"\n",
"# Per-task distribution in train split\n",
"train_tasks = Counter(p.task for p in train_pairs)\n",
"print(\"\\nTrain split by task:\")\n",
"for task, count in sorted(train_tasks.items()):\n",
" print(f\" {task}: {count}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Load model and apply LoRA"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from unsloth import FastLanguageModel\n",
"\n",
"BASE_MODEL = \"unsloth/Qwen2.5-3B-Instruct-bnb-4bit\"\n",
"MAX_SEQ_LENGTH = 8192\n",
"\n",
"model, tokenizer = FastLanguageModel.from_pretrained(\n",
" model_name=BASE_MODEL,\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
" load_in_4bit=True,\n",
")\n",
"\n",
"model = FastLanguageModel.get_peft_model(\n",
" model,\n",
" r=32,\n",
" lora_alpha=64,\n",
" lora_dropout=0.05,\n",
" target_modules=[\n",
" \"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
" \"gate_proj\", \"up_proj\", \"down_proj\",\n",
" ],\n",
")\n",
"\n",
"# Print trainable params\n",
"model.print_trainable_parameters()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Prepare datasets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datasets import Dataset\n",
"from hippofloop.training.trainer import UnslothTrainer\n",
"from hippofloop.training.config import TrainingConfig\n",
"\n",
"# Use UnslothTrainer.prepare_dataset for consistent formatting\n",
"config = TrainingConfig(\n",
" base_model=BASE_MODEL,\n",
" lora_rank=32, lora_alpha=64, lora_dropout=0.05,\n",
" lora_target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
" learning_rate=2e-4, lr_scheduler=\"cosine\", warmup_ratio=0.03,\n",
" epochs=3, batch_size=1, gradient_accumulation_steps=16,\n",
" max_seq_length=MAX_SEQ_LENGTH, weight_decay=0.01,\n",
" bf16=False, fp16=True,\n",
" train_split=0.8, val_split=0.1, test_split=0.1, seed=SEED,\n",
" quantization=\"Q4_K_M\", output_path=\"hippofloop.gguf\",\n",
")\n",
"\n",
"trainer_helper = UnslothTrainer(config)\n",
"train_dataset = Dataset.from_list(trainer_helper.prepare_dataset(train_pairs))\n",
"val_dataset = Dataset.from_list(trainer_helper.prepare_dataset(val_pairs))\n",
"\n",
"print(f\"Train dataset: {len(train_dataset)} examples\")\n",
"print(f\"Val dataset: {len(val_dataset)} examples\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. Train"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import TrainingArguments\n",
"from trl import SFTTrainer\n",
"\n",
"OUTPUT_DIR = \"checkpoints/qwen25-3b-hippofloop\"\n",
"\n",
"training_args = TrainingArguments(\n",
" output_dir=OUTPUT_DIR,\n",
" num_train_epochs=3,\n",
" per_device_train_batch_size=1,\n",
" gradient_accumulation_steps=16,\n",
" learning_rate=2e-4,\n",
" lr_scheduler_type=\"cosine\",\n",
" warmup_ratio=0.03,\n",
" weight_decay=0.01,\n",
" bf16=False,\n",
" fp16=True,\n",
" eval_strategy=\"epoch\",\n",
" save_strategy=\"epoch\",\n",
" load_best_model_at_end=True,\n",
" metric_for_best_model=\"eval_loss\",\n",
" logging_steps=10,\n",
" seed=SEED,\n",
")\n",
"\n",
"sft_trainer = SFTTrainer(\n",
" model=model,\n",
" tokenizer=tokenizer,\n",
" train_dataset=train_dataset,\n",
" eval_dataset=val_dataset,\n",
" args=training_args,\n",
")\n",
"\n",
"print(f\"Training {len(train_dataset)} examples for {training_args.num_train_epochs} epochs...\")\n",
"print(f\"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}\")\n",
"sft_trainer.train()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "# Training loss curve\nimport matplotlib.pyplot as plt\n\ntrain_losses = [(e[\"step\"], e[\"loss\"]) for e in sft_trainer.state.log_history if \"loss\" in e]\neval_losses = [(e[\"step\"], e[\"eval_loss\"]) for e in sft_trainer.state.log_history if \"eval_loss\" in e]\n\nfig, ax = plt.subplots(figsize=(10, 5))\nif train_losses:\n steps, losses = zip(*train_losses)\n ax.plot(steps, losses, label=\"Train loss\", alpha=0.7)\nif eval_losses:\n steps, losses = zip(*eval_losses)\n ax.plot(steps, losses, \"o-\", label=\"Eval loss\", markersize=8)\nax.set_xlabel(\"Step\")\nax.set_ylabel(\"Loss\")\nax.set_title(\"Training Loss\")\nax.legend()\nax.grid(True, alpha=0.3)\nplt.tight_layout()\nplt.show()\n\n# Best eval loss\nbest_loss = None\nif eval_losses:\n best_step, best_loss = min(eval_losses, key=lambda x: x[1])\n print(f\"Best eval loss: {best_loss:.4f} at step {best_step}\")"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7. Save best model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"BEST_MODEL_PATH = f\"{OUTPUT_DIR}/best\"\n",
"sft_trainer.save_model(BEST_MODEL_PATH)\n",
"tokenizer.save_pretrained(BEST_MODEL_PATH)\n",
"print(f\"Saved best model to {BEST_MODEL_PATH}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 8. Evaluate on test set"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "import random\n\nfrom hippofloop.eval.evaluator import ModelEvaluator\n\n# Set up inference mode\nFastLanguageModel.for_inference(model)\n\ndef model_fn(system_msg: str, user_msg: str) -> str:\n \"\"\"Run inference on the fine-tuned model.\"\"\"\n messages = [\n {\"role\": \"system\", \"content\": system_msg},\n {\"role\": \"user\", \"content\": user_msg},\n ]\n inputs = tokenizer.apply_chat_template(\n messages, tokenize=True, add_generation_prompt=True, return_tensors=\"pt\"\n ).to(model.device)\n outputs = model.generate(\n input_ids=inputs, max_new_tokens=2048, temperature=0.1, do_sample=True\n )\n # Decode only the generated tokens\n generated = outputs[0][inputs.shape[-1]:]\n return tokenizer.decode(generated, skip_special_tokens=True)\n\n# Run eval on a representative random subset (full eval can be slow)\nrng = random.Random(SEED)\neval_subset = rng.sample(test_pairs, min(50, len(test_pairs)))\nprint(f\"Evaluating on {len(eval_subset)} test examples...\")\n\nevaluator = ModelEvaluator(model_fn)\nresults = evaluator.evaluate(eval_subset)\nreport = evaluator.summary_report(results)\n\nprint(f\"\\nOverall:\")\nprint(f\" JSON valid rate: {report['json_valid_rate']:.1%}\")\nprint(f\" Schema valid rate: {report['schema_valid_rate']:.1%}\")\nprint(f\"\\nBy task:\")\nfor task, task_report in report.get('by_task', {}).items():\n print(f\" {task} (n={task_report['count']}):\")\n print(f\" JSON valid: {task_report['json_valid_rate']:.1%}\")\n print(f\" Schema valid: {task_report['schema_valid_rate']:.1%}\")"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 9. Export to GGUF"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"GGUF_OUTPUT = \"hippofloop-qwen25-3b-Q4_K_M.gguf\"\n",
"\n",
"model.save_pretrained_gguf(\n",
" \"gguf_export\",\n",
" tokenizer,\n",
" quantization_method=\"q4_k_m\",\n",
")\n",
"\n",
"# Rename to target filename\n",
"gguf_dir = Path(\"gguf_export\")\n",
"gguf_files = list(gguf_dir.glob(\"*.gguf\"))\n",
"if gguf_files:\n",
" actual = max(gguf_files, key=lambda p: p.stat().st_mtime)\n",
" actual.rename(GGUF_OUTPUT)\n",
" size_mb = Path(GGUF_OUTPUT).stat().st_size / (1024 * 1024)\n",
" print(f\"Exported: {GGUF_OUTPUT} ({size_mb:.0f} MB)\")\n",
"else:\n",
" print(\"Warning: no GGUF file found after export\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 10. Download\n",
"\n",
"The GGUF file is saved in the notebook's working directory.\n",
"On Kaggle, go to **Output** tab to download it, or save as a Kaggle Dataset for reuse."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "# Summary\nprint(\"=\" * 60)\nprint(\"hippofloop training complete\")\nprint(\"=\" * 60)\nprint(f\"Base model: {BASE_MODEL}\")\nprint(f\"Training examples: {len(train_pairs)}\")\nprint(f\"Validation examples: {len(val_pairs)}\")\nprint(f\"Test examples: {len(test_pairs)}\")\nif best_loss is not None:\n print(f\"Best eval loss: {best_loss:.4f}\")\nprint(f\"JSON valid rate: {report['json_valid_rate']:.1%}\")\nprint(f\"Schema valid rate: {report['schema_valid_rate']:.1%}\")\nprint(f\"GGUF output: {GGUF_OUTPUT}\")"
}
],
"metadata": {
"kaggle": {
"accelerator": "gpu",
"dataSources": [],
"isGpuEnabled": true,
"isInternetEnabled": true,
"language": "python"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Loading
Loading