From 7f5354189a9bffbe4378149fddd8e7fd63007e23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20L=C3=B3pez?= Date: Fri, 6 Feb 2026 17:27:17 +0100 Subject: [PATCH 1/8] add flake --- flake.nix | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 flake.nix diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..0977d24 --- /dev/null +++ b/flake.nix @@ -0,0 +1,36 @@ +{ + description = "LakeBench dev environment"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = { self, nixpkgs, flake-utils }: + flake-utils.lib.eachDefaultSystem (system: + let + pkgs = import nixpkgs { inherit system; }; + python = pkgs.python313; + in + { + devShells.default = pkgs.mkShell { + packages = [ + python + pkgs.uv + pkgs.ruff + ]; + + shellHook = '' + if [ ! -d .venv ]; then + echo "Creating venv..." + uv venv --python ${python}/bin/python .venv + echo "Installing lakebench with all extras..." + uv pip install --python .venv/bin/python -e ".[duckdb,polars,daft,tpcds_datagen,tpch_datagen,sparkmeasure,sail]" + uv pip install --python .venv/bin/python pytest + fi + source .venv/bin/activate + ''; + }; + } + ); +} From 443689b175544afff1d54e8012932e15ca9162e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20L=C3=B3pez?= Date: Fri, 6 Feb 2026 17:55:59 +0100 Subject: [PATCH 2/8] feat: add Nix dev environment and local quickstart notebook --- .gitignore | 3 + examples/local_quickstart.ipynb | 192 ++++++++++++++++++++++++++++++++ flake.lock | 61 ++++++++++ flake.nix | 16 ++- 4 files changed, 267 insertions(+), 5 deletions(-) create mode 100644 examples/local_quickstart.ipynb create mode 100644 flake.lock diff --git a/.gitignore b/.gitignore index 5f0f621..be323fe 100644 --- a/.gitignore +++ b/.gitignore @@ -60,6 +60,9 @@ dmypy.json # Benchmark outputs (optional) results/ benchmark_outputs/ +local_data/ +local_working_dir/ +local_results/ # Local dev tools *.sqlite3 diff --git a/examples/local_quickstart.ipynb b/examples/local_quickstart.ipynb new file mode 100644 index 0000000..908a5e4 --- /dev/null +++ b/examples/local_quickstart.ipynb @@ -0,0 +1,192 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# LakeBench Local Quickstart\n", + "\n", + "Run LakeBench benchmarks locally using DuckDB with SF=1 (~1 GB).\n", + "\n", + "Steps:\n", + "1. Generate TPC-DS and TPC-H datasets\n", + "2. Run benchmarks (ELTBench, TPC-DS, TPC-H)\n", + "3. View results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "DATA_DIR = os.path.join(os.getcwd(), \"local_data\")\n", + "WORKING_DIR = os.path.join(os.getcwd(), \"local_working_dir\")\n", + "RESULTS_DIR = os.path.join(os.getcwd(), \"local_results\")\n", + "\n", + "os.makedirs(DATA_DIR, exist_ok=True)\n", + "os.makedirs(WORKING_DIR, exist_ok=True)\n", + "os.makedirs(RESULTS_DIR, exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Generate TPC-DS data (SF=1, ~1 GB)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from lakebench.datagen import TPCDSDataGenerator\n", + "\n", + "TPCDSDataGenerator(\n", + " scale_factor=1,\n", + " target_folder_uri=os.path.join(DATA_DIR, \"tpcds_sf1\")\n", + ").run()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Generate TPC-H data (SF=1, ~1 GB)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from lakebench.datagen import TPCHDataGenerator\n", + "\n", + "TPCHDataGenerator(\n", + " scale_factor=1,\n", + " target_folder_uri=os.path.join(DATA_DIR, \"tpch_sf1\")\n", + ").run()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Run ELTBench (light mode)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from lakebench.engines import DuckDB\n", + "from lakebench.benchmarks import ELTBench\n", + "\n", + "benchmark = ELTBench(\n", + " engine=DuckDB(schema_or_working_directory_uri=os.path.join(WORKING_DIR, \"duckdb_eltbench\")),\n", + " scenario_name=\"SF1 - Local\",\n", + " input_parquet_folder_uri=os.path.join(DATA_DIR, \"tpcds_sf1\"),\n", + " save_results=True,\n", + " result_table_uri=RESULTS_DIR\n", + ")\n", + "benchmark.run(mode=\"light\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Run TPC-DS power test (load + 99 queries)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from lakebench.benchmarks import TPCDS\n", + "\n", + "benchmark = TPCDS(\n", + " engine=DuckDB(schema_or_working_directory_uri=os.path.join(WORKING_DIR, \"duckdb_tpcds\")),\n", + " scenario_name=\"SF1 - Local\",\n", + " input_parquet_folder_uri=os.path.join(DATA_DIR, \"tpcds_sf1\"),\n", + " save_results=True,\n", + " result_table_uri=RESULTS_DIR\n", + ")\n", + "benchmark.run(mode=\"power_test\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Run TPC-H power test (load + 22 queries)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from lakebench.benchmarks import TPCH\n", + "\n", + "benchmark = TPCH(\n", + " engine=DuckDB(schema_or_working_directory_uri=os.path.join(WORKING_DIR, \"duckdb_tpch\")),\n", + " scenario_name=\"SF1 - Local\",\n", + " input_parquet_folder_uri=os.path.join(DATA_DIR, \"tpch_sf1\"),\n", + " save_results=True,\n", + " result_table_uri=RESULTS_DIR\n", + ")\n", + "benchmark.run(mode=\"power_test\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. View results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for r in benchmark.results:\n", + " status = \"OK\" if r[\"success\"] else \"FAIL\"\n", + " print(f\"{r['phase']:>8} | {r['test_item']:<20} | {r['duration_ms']:>8}ms | {status}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (LakeBench)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.11" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..c759433 --- /dev/null +++ b/flake.lock @@ -0,0 +1,61 @@ +{ + "nodes": { + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1731533236, + "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1770197578, + "narHash": "sha256-AYqlWrX09+HvGs8zM6ebZ1pwUqjkfpnv8mewYwAo+iM=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "00c21e4c93d963c50d4c0c89bfa84ed6e0694df2", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix index 0977d24..92c8c88 100644 --- a/flake.nix +++ b/flake.nix @@ -22,13 +22,19 @@ shellHook = '' if [ ! -d .venv ]; then - echo "Creating venv..." uv venv --python ${python}/bin/python .venv - echo "Installing lakebench with all extras..." - uv pip install --python .venv/bin/python -e ".[duckdb,polars,daft,tpcds_datagen,tpch_datagen,sparkmeasure,sail]" - uv pip install --python .venv/bin/python pytest + source .venv/bin/activate + + uv pip install -e . + for extra in duckdb polars daft tpcds_datagen tpch_datagen sparkmeasure sail; do + uv pip install "lakebench[$extra]" 2>&1 || echo "warning: $extra failed to install, skipping" + done + uv pip install pytest jupyter ipykernel + + python -m ipykernel install --user --name python3 --display-name "Python 3 (LakeBench)" + else + source .venv/bin/activate fi - source .venv/bin/activate ''; }; } From ae218ea093951b71beb5acf0854bd3f82757200a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20L=C3=B3pez?= Date: Fri, 6 Feb 2026 21:27:27 +0100 Subject: [PATCH 3/8] chore: remove local quickstart notebook and revert .gitignore --- .gitignore | 3 - examples/local_quickstart.ipynb | 192 -------------------------------- 2 files changed, 195 deletions(-) delete mode 100644 examples/local_quickstart.ipynb diff --git a/.gitignore b/.gitignore index be323fe..5f0f621 100644 --- a/.gitignore +++ b/.gitignore @@ -60,9 +60,6 @@ dmypy.json # Benchmark outputs (optional) results/ benchmark_outputs/ -local_data/ -local_working_dir/ -local_results/ # Local dev tools *.sqlite3 diff --git a/examples/local_quickstart.ipynb b/examples/local_quickstart.ipynb deleted file mode 100644 index 908a5e4..0000000 --- a/examples/local_quickstart.ipynb +++ /dev/null @@ -1,192 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# LakeBench Local Quickstart\n", - "\n", - "Run LakeBench benchmarks locally using DuckDB with SF=1 (~1 GB).\n", - "\n", - "Steps:\n", - "1. Generate TPC-DS and TPC-H datasets\n", - "2. Run benchmarks (ELTBench, TPC-DS, TPC-H)\n", - "3. View results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "DATA_DIR = os.path.join(os.getcwd(), \"local_data\")\n", - "WORKING_DIR = os.path.join(os.getcwd(), \"local_working_dir\")\n", - "RESULTS_DIR = os.path.join(os.getcwd(), \"local_results\")\n", - "\n", - "os.makedirs(DATA_DIR, exist_ok=True)\n", - "os.makedirs(WORKING_DIR, exist_ok=True)\n", - "os.makedirs(RESULTS_DIR, exist_ok=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Generate TPC-DS data (SF=1, ~1 GB)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from lakebench.datagen import TPCDSDataGenerator\n", - "\n", - "TPCDSDataGenerator(\n", - " scale_factor=1,\n", - " target_folder_uri=os.path.join(DATA_DIR, \"tpcds_sf1\")\n", - ").run()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Generate TPC-H data (SF=1, ~1 GB)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from lakebench.datagen import TPCHDataGenerator\n", - "\n", - "TPCHDataGenerator(\n", - " scale_factor=1,\n", - " target_folder_uri=os.path.join(DATA_DIR, \"tpch_sf1\")\n", - ").run()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Run ELTBench (light mode)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from lakebench.engines import DuckDB\n", - "from lakebench.benchmarks import ELTBench\n", - "\n", - "benchmark = ELTBench(\n", - " engine=DuckDB(schema_or_working_directory_uri=os.path.join(WORKING_DIR, \"duckdb_eltbench\")),\n", - " scenario_name=\"SF1 - Local\",\n", - " input_parquet_folder_uri=os.path.join(DATA_DIR, \"tpcds_sf1\"),\n", - " save_results=True,\n", - " result_table_uri=RESULTS_DIR\n", - ")\n", - "benchmark.run(mode=\"light\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Run TPC-DS power test (load + 99 queries)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from lakebench.benchmarks import TPCDS\n", - "\n", - "benchmark = TPCDS(\n", - " engine=DuckDB(schema_or_working_directory_uri=os.path.join(WORKING_DIR, \"duckdb_tpcds\")),\n", - " scenario_name=\"SF1 - Local\",\n", - " input_parquet_folder_uri=os.path.join(DATA_DIR, \"tpcds_sf1\"),\n", - " save_results=True,\n", - " result_table_uri=RESULTS_DIR\n", - ")\n", - "benchmark.run(mode=\"power_test\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Run TPC-H power test (load + 22 queries)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from lakebench.benchmarks import TPCH\n", - "\n", - "benchmark = TPCH(\n", - " engine=DuckDB(schema_or_working_directory_uri=os.path.join(WORKING_DIR, \"duckdb_tpch\")),\n", - " scenario_name=\"SF1 - Local\",\n", - " input_parquet_folder_uri=os.path.join(DATA_DIR, \"tpch_sf1\"),\n", - " save_results=True,\n", - " result_table_uri=RESULTS_DIR\n", - ")\n", - "benchmark.run(mode=\"power_test\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. View results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for r in benchmark.results:\n", - " status = \"OK\" if r[\"success\"] else \"FAIL\"\n", - " print(f\"{r['phase']:>8} | {r['test_item']:<20} | {r['duration_ms']:>8}ms | {status}\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (LakeBench)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.11" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} From 2b3bc49a7b1359b89bb8776b842db068a08f0e4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20L=C3=B3pez?= Date: Sat, 14 Feb 2026 13:23:41 +0100 Subject: [PATCH 4/8] update-env --- flake.nix | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/flake.nix b/flake.nix index 92c8c88..66479f3 100644 --- a/flake.nix +++ b/flake.nix @@ -21,7 +21,8 @@ ]; shellHook = '' - if [ ! -d .venv ]; then + update-venv() { + rm -rf .venv uv venv --python ${python}/bin/python .venv source .venv/bin/activate @@ -32,6 +33,10 @@ uv pip install pytest jupyter ipykernel python -m ipykernel install --user --name python3 --display-name "Python 3 (LakeBench)" + } + + if [ ! -d .venv ]; then + update-venv else source .venv/bin/activate fi From 4805c6b6c120a56c7053f3bce8186d24d3fd56be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20L=C3=B3pez?= Date: Fri, 27 Feb 2026 07:40:56 +0100 Subject: [PATCH 5/8] merge --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2a65a47..e65d3c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ tpcds_datagen = ["duckdb==1.4.4; python_version >= '3.9'", "pyarrow>=15.0.0; pyt tpch_datagen = ["tpchgen-cli>=2.0.1"] sparkmeasure = ["sparkmeasure==0.24.0"] spark = ["pyspark>=3.5.0,<4.0.0; python_version >= '3.9'", "delta-spark>=3.2.0,<4.0.0; python_version >= '3.9'", "pyarrow>=15.0.0; python_version >= '3.9'"] -sail = ["pysail>=0.4.2; python_version >= '3.9'", "pyspark[connect]>=4.0.0; python_version >= '3.9'", "deltalake>=1.2.1; python_version >= '3.9'", "pyarrow>=15.0.0; python_version >= '3.9'"] +sail = ["pysail>=0.5.0", "pyspark[connect]>=4.0.0", "deltalake>=1.2.1", "pyarrow>=15.0.0"] [project.urls] github = "https://github.com/mwc360/LakeBench" From 532898fb5e9c7773f5e497714a30c6f1dd1c1374 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20L=C3=B3pez?= Date: Sat, 28 Feb 2026 15:03:44 +0100 Subject: [PATCH 6/8] clean --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e65d3c0..2a65a47 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ tpcds_datagen = ["duckdb==1.4.4; python_version >= '3.9'", "pyarrow>=15.0.0; pyt tpch_datagen = ["tpchgen-cli>=2.0.1"] sparkmeasure = ["sparkmeasure==0.24.0"] spark = ["pyspark>=3.5.0,<4.0.0; python_version >= '3.9'", "delta-spark>=3.2.0,<4.0.0; python_version >= '3.9'", "pyarrow>=15.0.0; python_version >= '3.9'"] -sail = ["pysail>=0.5.0", "pyspark[connect]>=4.0.0", "deltalake>=1.2.1", "pyarrow>=15.0.0"] +sail = ["pysail>=0.4.2; python_version >= '3.9'", "pyspark[connect]>=4.0.0; python_version >= '3.9'", "deltalake>=1.2.1; python_version >= '3.9'", "pyarrow>=15.0.0; python_version >= '3.9'"] [project.urls] github = "https://github.com/mwc360/LakeBench" From ed398bac2e1b55cb9b93d58a553cde72658098aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20L=C3=B3pez?= Date: Sat, 28 Feb 2026 15:15:49 +0100 Subject: [PATCH 7/8] auto-update --- flake.nix | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/flake.nix b/flake.nix index 66479f3..fa49789 100644 --- a/flake.nix +++ b/flake.nix @@ -21,22 +21,27 @@ ]; shellHook = '' - update-venv() { + create-venv() { rm -rf .venv uv venv --python ${python}/bin/python .venv source .venv/bin/activate uv pip install -e . - for extra in duckdb polars daft tpcds_datagen tpch_datagen sparkmeasure sail; do + # spark and sail conflict with each other, install separately + for extra in duckdb polars daft tpcds_datagen tpch_datagen sparkmeasure spark sail; do uv pip install "lakebench[$extra]" 2>&1 || echo "warning: $extra failed to install, skipping" done - uv pip install pytest jupyter ipykernel + uv pip install --group dev - python -m ipykernel install --user --name python3 --display-name "Python 3 (LakeBench)" + # store hash to detect pyproject.toml changes + md5sum pyproject.toml > .venv/.pyproject.hash } if [ ! -d .venv ]; then - update-venv + create-venv + elif ! md5sum --check .venv/.pyproject.hash &>/dev/null; then + echo "pyproject.toml changed, recreating venv..." + create-venv else source .venv/bin/activate fi From 1d0220bc4c279b117b0eaa65f2e52566e3f75092 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20L=C3=B3pez?= Date: Sat, 28 Feb 2026 15:36:49 +0100 Subject: [PATCH 8/8] jupyter --- flake.nix | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flake.nix b/flake.nix index fa49789..e12b827 100644 --- a/flake.nix +++ b/flake.nix @@ -32,6 +32,8 @@ uv pip install "lakebench[$extra]" 2>&1 || echo "warning: $extra failed to install, skipping" done uv pip install --group dev + uv pip install jupyter ipykernel + python -m ipykernel install --user --name python3 --display-name "Python 3 (LakeBench)" # store hash to detect pyproject.toml changes md5sum pyproject.toml > .venv/.pyproject.hash