From 295df8d8d920cc57505a4622ec3338a406ba6cd9 Mon Sep 17 00:00:00 2001 From: Jathavaan Shankarr Date: Sun, 24 May 2026 15:42:33 +0200 Subject: [PATCH] #346 Disable AQE in broadcast notebook to prevent BroadcastNestedLoopJoin fallback --- .../databricks/national_scale_spatial_join_broadcast.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/presentation/databricks/national_scale_spatial_join_broadcast.py b/src/presentation/databricks/national_scale_spatial_join_broadcast.py index 4cc8896..ccf75c5 100644 --- a/src/presentation/databricks/national_scale_spatial_join_broadcast.py +++ b/src/presentation/databricks/national_scale_spatial_join_broadcast.py @@ -14,6 +14,10 @@ # `broadcast()` so Sedona picks `BroadcastIndexJoin` instead of the default # `SortMergeJoin` Spark would pick for ST_Intersects. # +# AQE (Adaptive Query Execution) is disabled before the timed section to +# prevent it from rewriting Sedona's BroadcastIndexJoin plan back into +# Spark's native BroadcastNestedLoopJoin (a brute-force cross product). +# # Notes: # - stage_durations_ms is capped at the first 100 stages (dbutils.notebook.exit # has a payload cap around 1 MB); a warning is logged if truncation happens. @@ -139,6 +143,9 @@ # COMMAND ---------- +_original_aqe = spark.conf.get("spark.sql.adaptive.enabled") +spark.conf.set("spark.sql.adaptive.enabled", "false") + start_time = time.perf_counter() result = ( @@ -155,6 +162,8 @@ cardinality = result.count() elapsed_seconds = time.perf_counter() - start_time +spark.conf.set("spark.sql.adaptive.enabled", _original_aqe) + print(f"Spatial join complete. Regions with matched buildings: {cardinality}") print(f"Elapsed seconds: {elapsed_seconds:.3f}")