diff --git a/notebooks/kernel-metadata.json b/notebooks/kernel-metadata.json new file mode 100644 index 0000000..3f07f9c --- /dev/null +++ b/notebooks/kernel-metadata.json @@ -0,0 +1,13 @@ +{ + "id": "nvandessel/hippofloop", + "title": "hippofloop", + "code_file": "train-hippofloop.ipynb", + "language": "python", + "kernel_type": "notebook", + "is_private": true, + "enable_gpu": true, + "enable_internet": true, + "dataset_sources": ["nvandessel/floop-decisions"], + "competition_sources": [], + "kernel_sources": [] +} diff --git a/notebooks/train-hippofloop.ipynb b/notebooks/train-hippofloop.ipynb index d7f70fc..0c56293 100644 --- a/notebooks/train-hippofloop.ipynb +++ b/notebooks/train-hippofloop.ipynb @@ -51,28 +51,14 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "## 2. Load and explore data\n", - "\n", - "Data is expected as a Kaggle Dataset mounted at `/kaggle/input/floop-decisions/`.\n", - "Upload your `decisions.jsonl` files there." - ] + "source": "## 2. Load and explore data\n\nData is expected as the `floop-decisions` Kaggle Dataset (add it from the notebook sidebar), containing one or more `.jsonl` files. Its mount path is auto-detected." }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "DATA_DIR = Path(\"/kaggle/input/floop-decisions\")\n", - "\n", - "# Find all JSONL files in the dataset\n", - "jsonl_files = sorted(DATA_DIR.glob(\"*.jsonl\"))\n", - "print(f\"Found {len(jsonl_files)} JSONL files:\")\n", - "for f in jsonl_files:\n", - " size_mb = f.stat().st_size / (1024 * 1024)\n", - " print(f\" {f.name} ({size_mb:.1f} MB)\")" - ] + "source": "# Auto-detect Kaggle dataset path\n_candidates = [\n Path(\"/kaggle/input/datasets/nvandessel/floop-decisions\"),\n Path(\"/kaggle/input/floop-decisions\"),\n]\nDATA_DIR = next((p for p in _candidates if p.exists()), None)\nif DATA_DIR is None:\n raise FileNotFoundError(\n f\"Kaggle dataset not found. Checked: {[str(p) for p in _candidates]}. 
\"\n \"Add the 'floop-decisions' dataset in the notebook sidebar.\"\n )\nprint(f\"Data dir: {DATA_DIR}\")\n\n# Find all JSONL files at the top level of the dataset directory (this glob is not recursive)\njsonl_files = sorted(DATA_DIR.glob(\"*.jsonl\"))\nif not jsonl_files:\n raise FileNotFoundError(f\"No .jsonl files found in {DATA_DIR}\")\nprint(f\"Found {len(jsonl_files)} JSONL files:\")\nfor f in jsonl_files:\n size_mb = f.stat().st_size / (1024 * 1024)\n print(f\" {f.name} ({size_mb:.1f} MB)\")" }, { "cell_type": "code",