From 101f80e7a22468dd13a18878558fb61595a1ed29 Mon Sep 17 00:00:00 2001 From: Jathavaan Shankarr Date: Tue, 26 May 2026 16:04:37 +0200 Subject: [PATCH 1/9] #357 Add broadcast 12-node entrypoint --- ...tial_join_databricks_broadcast_12_nodes.py | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 src/presentation/entrypoints/national_scale_spatial_join_databricks_broadcast_12_nodes.py diff --git a/src/presentation/entrypoints/national_scale_spatial_join_databricks_broadcast_12_nodes.py b/src/presentation/entrypoints/national_scale_spatial_join_databricks_broadcast_12_nodes.py new file mode 100644 index 00000000..9d360612 --- /dev/null +++ b/src/presentation/entrypoints/national_scale_spatial_join_databricks_broadcast_12_nodes.py @@ -0,0 +1,27 @@ +from dependency_injector.wiring import Provide, inject + +from src.application.contracts import IDatabricksService +from src.infra.infrastructure import Containers +from src.presentation.entrypoints._databricks_benchmark_runner import ( + run_databricks_national_scale_spatial_join, +) + + +@inject +def national_scale_spatial_join_databricks_broadcast_12_nodes( + databricks_service: IDatabricksService = Provide[Containers.databricks_service], +) -> None: + """ + Benchmark: national-scale spatial join between Norwegian municipalities and the + configured buildings dataset size executed on Azure Databricks with a 12-worker + cluster, using the explicit broadcast join strategy (small-side broadcast hint + via ``broadcast(municipalities_df)``). The dataset size is pulled from DI inside + ``run_databricks_national_scale_spatial_join``. The cluster is provisioned once, + every warmup and timed iteration runs against it, and the cluster is terminated + after the benchmark completes. + """ + run_databricks_national_scale_spatial_join( + databricks_service=databricks_service, + num_workers=12, + notebook_variant="broadcast", + ) From 3fe4c3592aea4a8b388aae86ffc293a22bb1ed3a Mon Sep 17 00:00:00 2001 From: Jathavaan Shankarr Date: Tue, 26 May 2026 16:04:49 +0200 Subject: [PATCH 2/9] #357 Add partitioned 12-node entrypoint --- ...al_join_databricks_partitioned_12_nodes.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 src/presentation/entrypoints/national_scale_spatial_join_databricks_partitioned_12_nodes.py diff --git a/src/presentation/entrypoints/national_scale_spatial_join_databricks_partitioned_12_nodes.py b/src/presentation/entrypoints/national_scale_spatial_join_databricks_partitioned_12_nodes.py new file mode 100644 index 00000000..d84a68bf --- /dev/null +++ b/src/presentation/entrypoints/national_scale_spatial_join_databricks_partitioned_12_nodes.py @@ -0,0 +1,28 @@ +from dependency_injector.wiring import Provide, inject + +from src.application.contracts import IDatabricksService +from src.infra.infrastructure import Containers +from src.presentation.entrypoints._databricks_benchmark_runner import ( + run_databricks_national_scale_spatial_join, +) + + +@inject +def national_scale_spatial_join_databricks_partitioned_12_nodes( + databricks_service: IDatabricksService = Provide[Containers.databricks_service], +) -> None: + """ + Benchmark: national-scale spatial join between Norwegian municipalities and the + configured buildings dataset size executed on Azure Databricks with a 12-worker + cluster, using the explicit Sedona spatial partitioner join strategy + (``sedona.global.index=true``, ``sedona.join.gridtype=kdbtree``, + ``sedona.join.indexbuildside=right``). The dataset size is pulled from DI + inside ``run_databricks_national_scale_spatial_join``. The cluster is + provisioned once, every warmup and timed iteration runs against it, and the + cluster is terminated after the benchmark completes. + """ + run_databricks_national_scale_spatial_join( + databricks_service=databricks_service, + num_workers=12, + notebook_variant="partitioned", + ) From 7495e24de9736f901f06011a694718b5a710cfd0 Mon Sep 17 00:00:00 2001 From: Jathavaan Shankarr Date: Tue, 26 May 2026 16:04:53 +0200 Subject: [PATCH 3/9] #357 Re-export 12-node entrypoints from __init__ --- src/presentation/entrypoints/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/presentation/entrypoints/__init__.py b/src/presentation/entrypoints/__init__.py index 7508c055..d0b7af41 100644 --- a/src/presentation/entrypoints/__init__.py +++ b/src/presentation/entrypoints/__init__.py @@ -13,8 +13,10 @@ from .national_scale_spatial_join_databricks_broadcast_2_nodes import national_scale_spatial_join_databricks_broadcast_2_nodes from .national_scale_spatial_join_databricks_broadcast_4_nodes import national_scale_spatial_join_databricks_broadcast_4_nodes from .national_scale_spatial_join_databricks_broadcast_8_nodes import national_scale_spatial_join_databricks_broadcast_8_nodes +from .national_scale_spatial_join_databricks_broadcast_12_nodes import national_scale_spatial_join_databricks_broadcast_12_nodes from .national_scale_spatial_join_databricks_broadcast_16_nodes import national_scale_spatial_join_databricks_broadcast_16_nodes from .national_scale_spatial_join_databricks_partitioned_2_nodes import national_scale_spatial_join_databricks_partitioned_2_nodes from .national_scale_spatial_join_databricks_partitioned_4_nodes import national_scale_spatial_join_databricks_partitioned_4_nodes from .national_scale_spatial_join_databricks_partitioned_8_nodes import national_scale_spatial_join_databricks_partitioned_8_nodes +from .national_scale_spatial_join_databricks_partitioned_12_nodes import national_scale_spatial_join_databricks_partitioned_12_nodes from .national_scale_spatial_join_databricks_partitioned_16_nodes import national_scale_spatial_join_databricks_partitioned_16_nodes From 0fee15da3286dff6781b08e3fa204725cb0f3221 Mon Sep 17 00:00:00 2001 From: Jathavaan Shankarr Date: Tue, 26 May 2026 16:04:58 +0200 Subject: [PATCH 4/9] #357 Add 12-node case arms to benchmark runner --- benchmark_runner.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/benchmark_runner.py b/benchmark_runner.py index 7d36c450..c4b77fd2 100644 --- a/benchmark_runner.py +++ b/benchmark_runner.py @@ -24,10 +24,12 @@ national_scale_spatial_join_databricks_broadcast_2_nodes, national_scale_spatial_join_databricks_broadcast_4_nodes, national_scale_spatial_join_databricks_broadcast_8_nodes, + national_scale_spatial_join_databricks_broadcast_12_nodes, national_scale_spatial_join_databricks_broadcast_16_nodes, national_scale_spatial_join_databricks_partitioned_2_nodes, national_scale_spatial_join_databricks_partitioned_4_nodes, national_scale_spatial_join_databricks_partitioned_8_nodes, + national_scale_spatial_join_databricks_partitioned_12_nodes, national_scale_spatial_join_databricks_partitioned_16_nodes, ) @@ -104,6 +106,9 @@ def benchmark_runner() -> None: case "national-scale-spatial-join-databricks-broadcast-8-nodes": national_scale_spatial_join_databricks_broadcast_8_nodes() return + case "national-scale-spatial-join-databricks-broadcast-12-nodes": + national_scale_spatial_join_databricks_broadcast_12_nodes() + return case "national-scale-spatial-join-databricks-broadcast-16-nodes": national_scale_spatial_join_databricks_broadcast_16_nodes() return @@ -116,6 +121,9 @@ def benchmark_runner() -> None: case "national-scale-spatial-join-databricks-partitioned-8-nodes": national_scale_spatial_join_databricks_partitioned_8_nodes() return + case "national-scale-spatial-join-databricks-partitioned-12-nodes": + national_scale_spatial_join_databricks_partitioned_12_nodes() + return case "national-scale-spatial-join-databricks-partitioned-16-nodes": national_scale_spatial_join_databricks_partitioned_16_nodes() return From 5de6845e5a67df6e1fc9c4e361349ceb5c9fc188 Mon Sep 17 00:00:00 2001 From: Jathavaan Shankarr Date: Tue, 26 May 2026 16:05:03 +0200 Subject: [PATCH 5/9] #357 Add 12 new Sedona experiments and optimize batch packing --- benchmarks.yml | 195 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 190 insertions(+), 5 deletions(-) diff --git a/benchmarks.yml b/benchmarks.yml index d2a9878b..c90b1e09 100644 --- a/benchmarks.yml +++ b/benchmarks.yml @@ -158,10 +158,10 @@ experiments: - bbox-filtering-duckdb-large # ================================================================== - # RQ2 — National-scale spatial join (25 experiments) + # RQ2 — National-scale spatial join (36 experiments) # Single-node: DuckDB + PostGIS at small/medium/large (paired). - # Sedona: broadcast + partitioned at {2, 4, 8, 16} workers × 3 - # sizes, unpaired (each provisions its own cluster). + # Sedona: broadcast + partitioned at {2, 4, 8, 12, 16} workers × 3 + # sizes (each provisions its own cluster). # ================================================================== # ---- single-node ---- @@ -174,6 +174,10 @@ experiments: related_script_ids: - national-scale-spatial-join-databricks-broadcast-8-nodes-small - national-scale-spatial-join-databricks-partitioned-8-nodes-small + - national-scale-spatial-join-databricks-broadcast-2-nodes-small + - national-scale-spatial-join-databricks-partitioned-2-nodes-small + - national-scale-spatial-join-databricks-broadcast-12-nodes-small + - national-scale-spatial-join-databricks-partitioned-12-nodes-small - national-scale-spatial-join-postgis-small - id: national-scale-spatial-join-postgis-small @@ -185,6 +189,10 @@ experiments: related_script_ids: - national-scale-spatial-join-databricks-broadcast-8-nodes-small - national-scale-spatial-join-databricks-partitioned-8-nodes-small + - national-scale-spatial-join-databricks-broadcast-2-nodes-small + - national-scale-spatial-join-databricks-partitioned-2-nodes-small + - national-scale-spatial-join-databricks-broadcast-12-nodes-small + - national-scale-spatial-join-databricks-partitioned-12-nodes-small - national-scale-spatial-join-duckdb-small - id: national-scale-spatial-join-duckdb-medium @@ -196,6 +204,8 @@ experiments: related_script_ids: - national-scale-spatial-join-databricks-broadcast-8-nodes-medium - national-scale-spatial-join-databricks-partitioned-8-nodes-medium + - national-scale-spatial-join-databricks-broadcast-12-nodes-medium + - national-scale-spatial-join-databricks-partitioned-12-nodes-medium - national-scale-spatial-join-postgis-medium - id: national-scale-spatial-join-postgis-medium @@ -207,6 +217,8 @@ experiments: related_script_ids: - national-scale-spatial-join-databricks-broadcast-8-nodes-medium - national-scale-spatial-join-databricks-partitioned-8-nodes-medium + - national-scale-spatial-join-databricks-broadcast-12-nodes-medium + - national-scale-spatial-join-databricks-partitioned-12-nodes-medium - national-scale-spatial-join-duckdb-medium - id: national-scale-spatial-join-duckdb-large @@ -230,6 +242,21 @@ experiments: - national-scale-spatial-join-duckdb-large # ---- Sedona broadcast strategy ---- + - id: national-scale-spatial-join-databricks-broadcast-2-nodes-small + image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-broadcast-2-nodes:latest + cpu: 4 + memory_gb: 16 + dataset_size: small + runs: 3 + related_script_ids: + - national-scale-spatial-join-databricks-broadcast-8-nodes-small + - national-scale-spatial-join-databricks-partitioned-8-nodes-small + - national-scale-spatial-join-databricks-partitioned-2-nodes-small + - national-scale-spatial-join-databricks-broadcast-12-nodes-small + - national-scale-spatial-join-databricks-partitioned-12-nodes-small + - national-scale-spatial-join-duckdb-small + - national-scale-spatial-join-postgis-small + - id: national-scale-spatial-join-databricks-broadcast-2-nodes-medium image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-broadcast-2-nodes:latest cpu: 4 @@ -256,6 +283,8 @@ experiments: runs: 3 related_script_ids: - national-scale-spatial-join-databricks-partitioned-4-nodes-small + - national-scale-spatial-join-databricks-broadcast-16-nodes-small + - national-scale-spatial-join-databricks-partitioned-16-nodes-small - id: national-scale-spatial-join-databricks-broadcast-4-nodes-medium image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-broadcast-4-nodes:latest @@ -265,6 +294,8 @@ experiments: runs: 3 related_script_ids: - national-scale-spatial-join-databricks-partitioned-4-nodes-medium + - national-scale-spatial-join-databricks-broadcast-16-nodes-medium + - national-scale-spatial-join-databricks-partitioned-16-nodes-medium - id: national-scale-spatial-join-databricks-broadcast-4-nodes-large image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-broadcast-4-nodes:latest @@ -272,7 +303,8 @@ experiments: memory_gb: 16 dataset_size: large runs: 3 - related_script_ids: [] + related_script_ids: + - national-scale-spatial-join-databricks-broadcast-12-nodes-large - id: national-scale-spatial-join-databricks-broadcast-8-nodes-small image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-broadcast-8-nodes:latest @@ -282,6 +314,10 @@ experiments: runs: 3 related_script_ids: - national-scale-spatial-join-databricks-partitioned-8-nodes-small + - national-scale-spatial-join-databricks-broadcast-2-nodes-small + - national-scale-spatial-join-databricks-partitioned-2-nodes-small + - national-scale-spatial-join-databricks-broadcast-12-nodes-small + - national-scale-spatial-join-databricks-partitioned-12-nodes-small - national-scale-spatial-join-duckdb-small - national-scale-spatial-join-postgis-small @@ -293,6 +329,8 @@ experiments: runs: 3 related_script_ids: - national-scale-spatial-join-databricks-partitioned-8-nodes-medium + - national-scale-spatial-join-databricks-broadcast-12-nodes-medium + - national-scale-spatial-join-databricks-partitioned-12-nodes-medium - national-scale-spatial-join-duckdb-medium - national-scale-spatial-join-postgis-medium @@ -305,6 +343,65 @@ experiments: related_script_ids: - national-scale-spatial-join-databricks-partitioned-8-nodes-large + - id: national-scale-spatial-join-databricks-broadcast-12-nodes-small + image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-broadcast-12-nodes:latest + cpu: 4 + memory_gb: 16 + dataset_size: small + runs: 3 + related_script_ids: + - national-scale-spatial-join-databricks-broadcast-8-nodes-small + - national-scale-spatial-join-databricks-partitioned-8-nodes-small + - national-scale-spatial-join-databricks-broadcast-2-nodes-small + - national-scale-spatial-join-databricks-partitioned-2-nodes-small + - national-scale-spatial-join-databricks-partitioned-12-nodes-small + - national-scale-spatial-join-duckdb-small + - national-scale-spatial-join-postgis-small + + - id: national-scale-spatial-join-databricks-broadcast-12-nodes-medium + image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-broadcast-12-nodes:latest + cpu: 4 + memory_gb: 16 + dataset_size: medium + runs: 3 + related_script_ids: + - national-scale-spatial-join-databricks-broadcast-8-nodes-medium + - national-scale-spatial-join-databricks-partitioned-8-nodes-medium + - national-scale-spatial-join-databricks-partitioned-12-nodes-medium + - national-scale-spatial-join-duckdb-medium + - national-scale-spatial-join-postgis-medium + + - id: national-scale-spatial-join-databricks-broadcast-12-nodes-large + image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-broadcast-12-nodes:latest + cpu: 4 + memory_gb: 16 + dataset_size: large + runs: 3 + related_script_ids: + - national-scale-spatial-join-databricks-broadcast-4-nodes-large + + - id: national-scale-spatial-join-databricks-broadcast-16-nodes-small + image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-broadcast-16-nodes:latest + cpu: 4 + memory_gb: 16 + dataset_size: small + runs: 3 + related_script_ids: + - national-scale-spatial-join-databricks-broadcast-4-nodes-small + - national-scale-spatial-join-databricks-partitioned-4-nodes-small + - national-scale-spatial-join-databricks-partitioned-16-nodes-small + + - id: national-scale-spatial-join-databricks-broadcast-16-nodes-medium + image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-broadcast-16-nodes:latest + cpu: 4 + memory_gb: 16 + dataset_size: medium + runs: 3 + related_script_ids: + - national-scale-spatial-join-databricks-broadcast-4-nodes-medium + - national-scale-spatial-join-databricks-partitioned-4-nodes-medium + - national-scale-spatial-join-databricks-partitioned-16-nodes-medium + - id: national-scale-spatial-join-databricks-broadcast-16-nodes-large image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-broadcast-16-nodes:latest cpu: 4 @@ -315,6 +412,21 @@ experiments: - national-scale-spatial-join-databricks-broadcast-2-nodes-large # ---- Sedona partitioned strategy ---- + - id: national-scale-spatial-join-databricks-partitioned-2-nodes-small + image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-partitioned-2-nodes:latest + cpu: 4 + memory_gb: 16 + dataset_size: small + runs: 3 + related_script_ids: + - national-scale-spatial-join-databricks-broadcast-8-nodes-small + - national-scale-spatial-join-databricks-partitioned-8-nodes-small + - national-scale-spatial-join-databricks-broadcast-2-nodes-small + - national-scale-spatial-join-databricks-broadcast-12-nodes-small + - national-scale-spatial-join-databricks-partitioned-12-nodes-small + - national-scale-spatial-join-duckdb-small + - national-scale-spatial-join-postgis-small + - id: national-scale-spatial-join-databricks-partitioned-2-nodes-medium image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-partitioned-2-nodes:latest cpu: 4 @@ -343,6 +455,8 @@ experiments: runs: 3 related_script_ids: - national-scale-spatial-join-databricks-broadcast-4-nodes-small + - national-scale-spatial-join-databricks-broadcast-16-nodes-small + - national-scale-spatial-join-databricks-partitioned-16-nodes-small - id: national-scale-spatial-join-databricks-partitioned-4-nodes-medium image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-partitioned-4-nodes:latest @@ -353,6 +467,8 @@ experiments: skip: failed # executor OOM: RangeJoin spatial index exceeds Standard_D4s_v3 memory at medium scale related_script_ids: - national-scale-spatial-join-databricks-broadcast-4-nodes-medium + - national-scale-spatial-join-databricks-broadcast-16-nodes-medium + - national-scale-spatial-join-databricks-partitioned-16-nodes-medium - id: national-scale-spatial-join-databricks-partitioned-4-nodes-large image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-partitioned-4-nodes:latest @@ -361,7 +477,8 @@ experiments: dataset_size: large runs: 3 skip: failed # executor OOM: RangeJoin spatial index exceeds Standard_D4s_v3 memory at large scale - related_script_ids: [] + related_script_ids: + - national-scale-spatial-join-databricks-partitioned-12-nodes-large - id: national-scale-spatial-join-databricks-partitioned-8-nodes-small image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-partitioned-8-nodes:latest @@ -371,6 +488,10 @@ experiments: runs: 3 related_script_ids: - national-scale-spatial-join-databricks-broadcast-8-nodes-small + - national-scale-spatial-join-databricks-broadcast-2-nodes-small + - national-scale-spatial-join-databricks-partitioned-2-nodes-small + - national-scale-spatial-join-databricks-broadcast-12-nodes-small + - national-scale-spatial-join-databricks-partitioned-12-nodes-small - national-scale-spatial-join-duckdb-small - national-scale-spatial-join-postgis-small @@ -383,6 +504,8 @@ experiments: skip: failed # executor OOM: RangeJoin spatial index exceeds Standard_D4s_v3 memory at medium scale related_script_ids: - national-scale-spatial-join-databricks-broadcast-8-nodes-medium + - national-scale-spatial-join-databricks-broadcast-12-nodes-medium + - national-scale-spatial-join-databricks-partitioned-12-nodes-medium - national-scale-spatial-join-duckdb-medium - national-scale-spatial-join-postgis-medium @@ -396,6 +519,68 @@ experiments: related_script_ids: - national-scale-spatial-join-databricks-broadcast-8-nodes-large + - id: national-scale-spatial-join-databricks-partitioned-12-nodes-small + image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-partitioned-12-nodes:latest + cpu: 4 + memory_gb: 16 + dataset_size: small + runs: 3 + related_script_ids: + - national-scale-spatial-join-databricks-broadcast-8-nodes-small + - national-scale-spatial-join-databricks-partitioned-8-nodes-small + - national-scale-spatial-join-databricks-broadcast-2-nodes-small + - national-scale-spatial-join-databricks-partitioned-2-nodes-small + - national-scale-spatial-join-databricks-broadcast-12-nodes-small + - national-scale-spatial-join-duckdb-small + - national-scale-spatial-join-postgis-small + + - id: national-scale-spatial-join-databricks-partitioned-12-nodes-medium + image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-partitioned-12-nodes:latest + cpu: 4 + memory_gb: 16 + dataset_size: medium + runs: 3 + skip: failed # executor OOM: RangeJoin spatial index exceeds Standard_D4s_v3 memory at medium scale + related_script_ids: + - national-scale-spatial-join-databricks-broadcast-8-nodes-medium + - national-scale-spatial-join-databricks-partitioned-8-nodes-medium + - national-scale-spatial-join-databricks-broadcast-12-nodes-medium + - national-scale-spatial-join-duckdb-medium + - national-scale-spatial-join-postgis-medium + + - id: national-scale-spatial-join-databricks-partitioned-12-nodes-large + image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-partitioned-12-nodes:latest + cpu: 4 + memory_gb: 16 + dataset_size: large + runs: 3 + skip: failed # executor OOM: RangeJoin spatial index exceeds Standard_D4s_v3 memory at large scale + related_script_ids: + - national-scale-spatial-join-databricks-partitioned-4-nodes-large + + - id: national-scale-spatial-join-databricks-partitioned-16-nodes-small + image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-partitioned-16-nodes:latest + cpu: 4 + memory_gb: 16 + dataset_size: small + runs: 3 + related_script_ids: + - national-scale-spatial-join-databricks-broadcast-4-nodes-small + - national-scale-spatial-join-databricks-partitioned-4-nodes-small + - national-scale-spatial-join-databricks-broadcast-16-nodes-small + + - id: national-scale-spatial-join-databricks-partitioned-16-nodes-medium + image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-partitioned-16-nodes:latest + cpu: 4 + memory_gb: 16 + dataset_size: medium + runs: 3 + skip: failed # executor OOM: RangeJoin spatial index exceeds Standard_D4s_v3 memory at medium scale + related_script_ids: + - national-scale-spatial-join-databricks-broadcast-4-nodes-medium + - national-scale-spatial-join-databricks-partitioned-4-nodes-medium + - national-scale-spatial-join-databricks-broadcast-16-nodes-medium + - id: national-scale-spatial-join-databricks-partitioned-16-nodes-large image: doppaacr.azurecr.io/national-scale-spatial-join-databricks-partitioned-16-nodes:latest cpu: 4 From 56bc3d214a1fc2c1ce422245a9696c28fe890932 Mon Sep 17 00:00:00 2001 From: Jathavaan Shankarr Date: Tue, 26 May 2026 16:05:08 +0200 Subject: [PATCH 6/9] #357 Add 12-node docker-compose services --- docker-compose.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index 57219d23..2fce66e5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -182,6 +182,15 @@ services: image: national-scale-spatial-join-databricks-partitioned-8-nodes:latest command: python benchmark_runner.py --script-id national-scale-spatial-join-databricks-partitioned-8-nodes --benchmark-run 1 --run-id ABCDEF + national-scale-spatial-join-databricks-broadcast-12-nodes: + env_file: + - .env + build: + context: . + dockerfile: .docker/Query.Dockerfile + image: national-scale-spatial-join-databricks-broadcast-12-nodes:latest + command: python benchmark_runner.py --script-id national-scale-spatial-join-databricks-broadcast-12-nodes --benchmark-run 1 --run-id ABCDEF + national-scale-spatial-join-databricks-broadcast-16-nodes: env_file: - .env @@ -191,6 +200,15 @@ services: image: national-scale-spatial-join-databricks-broadcast-16-nodes:latest command: python benchmark_runner.py --script-id national-scale-spatial-join-databricks-broadcast-16-nodes --benchmark-run 1 --run-id ABCDEF + national-scale-spatial-join-databricks-partitioned-12-nodes: + env_file: + - .env + build: + context: . + dockerfile: .docker/Query.Dockerfile + image: national-scale-spatial-join-databricks-partitioned-12-nodes:latest + command: python benchmark_runner.py --script-id national-scale-spatial-join-databricks-partitioned-12-nodes --benchmark-run 1 --run-id ABCDEF + national-scale-spatial-join-databricks-partitioned-16-nodes: env_file: - .env From 01bdb41db98bc63d8f7cc2214e404bdf8a876a81 Mon Sep 17 00:00:00 2001 From: Jathavaan Shankarr Date: Tue, 26 May 2026 16:05:13 +0200 Subject: [PATCH 7/9] #357 Update test matrix and batch listing for 51 experiments --- README.md | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 18cc19c0..39eee7d8 100644 --- a/README.md +++ b/README.md @@ -209,7 +209,7 @@ Concretely, the outer orchestrator loop is serial: it picks the next experiment its peer batch in parallel, waits for the whole batch to finish, marks every member completed, and moves on. At any moment one batch is in flight; within that batch every member runs on its own ACI in parallel. -The 41 experiments are packed into 17 batches under four constraints that the `related_script_ids` graph encodes: +The 51 experiments are packed into 17 batches under four constraints that the `related_script_ids` graph encodes: 1. **Same query type** per batch — `point-in-polygon-lookup`, `knn-search`, `bbox-filtering`, or `national-scale-spatial-join` never mix. @@ -228,7 +228,7 @@ by constraint 4. ### Test matrix -The matrix below is the active set of 44 experiments grouped into 18 parallel batches. Each cell lists the engines +The matrix below is the active set of 51 experiments grouped into 17 parallel batches. Each cell lists the engines or Sedona configurations that launch together in the same wall-clock window; size suffixes (`-small`, `-medium`, `-large`) are appended to the experiment ids in `benchmarks.yml` and forwarded to each container as `--dataset-size`. Shapefile (`local`) only participates at the `small` tier per the thesis methodology — it represents the @@ -245,20 +245,18 @@ laptop-workflow reference, not a scalable engine. The medium tier was dropped from the surviving RQ1 queries and `attribute-spatial-compound-filter` was removed across the board (issue #281); the 13 freed cells are reinvested in RQ2. -**RQ2 — National-scale spatial join** (26 experiments, 11 batches) +**RQ2 — National-scale spatial join** (36 experiments, 11 batches) -| Engine / strategy | `small` | `medium` | `large` | -|---------------------|---------------------|---------------------|---------------------------| -| Single-node | duckdb · postgis | duckdb · postgis | duckdb · postgis | -| Sedona `broadcast` | 4 / 8 nodes | 2 / 4 / 8 nodes | 2 / 4 / 8 / 12 / 16 nodes | -| Sedona `partitioned`| 4 / 8 nodes | 2 / 4 / 8 nodes | 2 / 4 / 8 / 12 / 16 nodes | +| Engine / strategy | `small` | `medium` | `large` | +|---------------------|------------------------------|------------------------------|------------------------------| +| Single-node | duckdb · postgis | duckdb · postgis | duckdb · postgis | +| Sedona `broadcast` | 2 / 4 / 8 / 12 / 16 nodes | 2 / 4 / 8 / 12 / 16 nodes | 2 / 4 / 8 / 12 / 16 nodes | +| Sedona `partitioned`| 2 / 4 / 8 / 12 / 16 nodes | 2 / 4 / 8 / 12 / 16 nodes | 2 / 4 / 8 / 12 / 16 nodes | Within each size column, single-node and Sedona experiments are packed into the same batches up to the 200 vCPU Databricks budget — the table groups by strategy for readability, not by batch membership. Concrete batch membership is whatever `related_script_ids` in `benchmarks.yml` declares; see the batch listing below. -The 2-node row is omitted at `small` for `broadcast` and `partitioned`: at ~5M polygons those configurations were -weakly differentiated; the freed cells fund the 12-/16-node extension of the scaling curve at `large`. A `default` strategy (no hint; Spark's CBO picks the plan) was originally planned as an untuned baseline but was dropped entirely because iterations consistently timed out or failed at this workload scale, making reliable measurement infeasible (issue #254). @@ -275,10 +273,10 @@ the seeded shuffle. | K2 | knn-search | large | 0 | duckdb · postgis | | B1 | bbox-filtering | small | 0 | duckdb · postgis · local | | B2 | bbox-filtering | large | 0 | duckdb · postgis | -| A_S1 | national-scale-spatial-join | small | 72 | broadcast-8 · partitioned-8 · duckdb · postgis | -| A_S2 | national-scale-spatial-join | small | 40 | broadcast-4 · partitioned-4 | -| A_M1 | national-scale-spatial-join | medium | 72 | broadcast-8 · partitioned-8 · duckdb · postgis | -| A_M2 | national-scale-spatial-join | medium | 40 | broadcast-4 · partitioned-4 | +| A_S1 | national-scale-spatial-join | small | 200 | broadcast-2 · broadcast-8 · broadcast-12 · partitioned-2 · partitioned-8 · partitioned-12 · duckdb · postgis | +| A_S2 | national-scale-spatial-join | small | 176 | broadcast-4 · broadcast-16 · partitioned-4 · partitioned-16 | +| A_M1 | national-scale-spatial-join | medium | 176 | broadcast-8 · broadcast-12 · partitioned-8 · partitioned-12 · duckdb · postgis | +| A_M2 | national-scale-spatial-join | medium | 176 | broadcast-4 · broadcast-16 · partitioned-4 · partitioned-16 | | A_M3 | national-scale-spatial-join | medium | 24 | broadcast-2 · partitioned-2 | | A_L1 | national-scale-spatial-join | large | 80 | broadcast-16 · broadcast-2 | | A_L2 | national-scale-spatial-join | large | 80 | partitioned-16 · partitioned-2 | From 5b92f257b411e79068328d7a0c0df77577e2ec11 Mon Sep 17 00:00:00 2001 From: Jathavaan Shankarr Date: Tue, 26 May 2026 16:19:34 +0200 Subject: [PATCH 8/9] #357 Add 12-node Sedona services to PR build matrix --- .github/workflows/pull-request-tests.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/pull-request-tests.yml b/.github/workflows/pull-request-tests.yml index 1e57b222..b6f61e6c 100644 --- a/.github/workflows/pull-request-tests.yml +++ b/.github/workflows/pull-request-tests.yml @@ -184,6 +184,9 @@ jobs: - service: national-scale-spatial-join-databricks-broadcast-8-nodes display_name: Sedona National Scale Spatial Join - Broadcast - 8 Nodes + - service: national-scale-spatial-join-databricks-broadcast-12-nodes + display_name: Sedona National Scale Spatial Join - Broadcast - 12 Nodes + - service: national-scale-spatial-join-databricks-partitioned-2-nodes display_name: Sedona National Scale Spatial Join - Partitioned - 2 Nodes @@ -193,6 +196,9 @@ jobs: - service: national-scale-spatial-join-databricks-partitioned-8-nodes display_name: Sedona National Scale Spatial Join - Partitioned - 8 Nodes + - service: national-scale-spatial-join-databricks-partitioned-12-nodes + display_name: Sedona National Scale Spatial Join - Partitioned - 12 Nodes + - service: national-scale-spatial-join-databricks-broadcast-16-nodes display_name: Sedona National Scale Spatial Join - Broadcast - 16 Nodes From c6ae2728c477c8e419c3b335ab575b05fb23efb5 Mon Sep 17 00:00:00 2001 From: Jathavaan Shankarr Date: Tue, 26 May 2026 16:19:39 +0200 Subject: [PATCH 9/9] #357 Add 12-node Sedona services to ACR push matrix --- .github/workflows/push-containers-to-acr.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/push-containers-to-acr.yml b/.github/workflows/push-containers-to-acr.yml index 620bfc61..6d956d47 100644 --- a/.github/workflows/push-containers-to-acr.yml +++ b/.github/workflows/push-containers-to-acr.yml @@ -117,6 +117,10 @@ jobs: image: national-scale-spatial-join-databricks-broadcast-8-nodes display_name: Sedona National Scale Spatial Join - Broadcast - 8 Nodes + - service: national-scale-spatial-join-databricks-broadcast-12-nodes + image: national-scale-spatial-join-databricks-broadcast-12-nodes + display_name: Sedona National Scale Spatial Join - Broadcast - 12 Nodes + - service: national-scale-spatial-join-databricks-partitioned-2-nodes image: national-scale-spatial-join-databricks-partitioned-2-nodes display_name: Sedona National Scale Spatial Join - Partitioned - 2 Nodes @@ -129,6 +133,10 @@ jobs: image: national-scale-spatial-join-databricks-partitioned-8-nodes display_name: Sedona National Scale Spatial Join - Partitioned - 8 Nodes + - service: national-scale-spatial-join-databricks-partitioned-12-nodes + image: national-scale-spatial-join-databricks-partitioned-12-nodes + display_name: Sedona National Scale Spatial Join - Partitioned - 12 Nodes + - service: national-scale-spatial-join-databricks-broadcast-16-nodes image: national-scale-spatial-join-databricks-broadcast-16-nodes display_name: Sedona National Scale Spatial Join - Broadcast - 16 Nodes