From 99a54304861c374bcc4d1e2131110a8e5f1d0ef7 Mon Sep 17 00:00:00 2001
From: Jathavaan Shankarr <jathavaan12@gmail.com>
Date: Sun, 24 May 2026 10:16:44 +0200
Subject: [PATCH] #254 Split per-iteration and cumulative hard timeouts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per-iteration timeout: 75 min (BENCHMARK_MAX_ITERATION_SECONDS).
Cumulative timeout: 6 × per-iteration = 450 min (BENCHMARK_MAX_TIMED_WINDOW_SECONDS).
Previously both checks shared one constant at 90 min.
---
 CLAUDE.md                         |  2 +-
 src/application/common/monitor.py | 15 ++++++++-------
 src/config.py                     |  3 ++-
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 62802ac..9aa8ad6 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -25,7 +25,7 @@ Python, DuckDB (spatial), PostGIS on Azure Database for PostgreSQL, Apache Sedon
 - Secrets live in `.env` (gitignored) and are forwarded to ACI as `--secure-environment-variables` from `main.py`.
 - Benchmark batching: members of a batch list each other bidirectionally in `related_script_ids`. The orchestrator dedupes via `completed_experiments`, so each batch runs once as one parallel `ThreadPoolExecutor` fan-out. A batch must satisfy four constraints simultaneously: (a) same query type, (b) same `dataset_size`, (c) at most one PostGIS member (shared Azure Postgres server), (d) Databricks cluster vCPU sum ≤ 200, computed as `(workers + 1) × 4` per Sedona member on `Standard_D4s_v3`. See `README.md#pairing-and-randomization` for the full batch listing. *Why: peers must execute under the same wall-clock window for fair comparison, without contending on shared infrastructure or breaching regional quota.*
 - Adding a benchmark requires three edits in lockstep: file in `src/presentation/entrypoints/`, `case` arm in `benchmark_runner.py`, and an entry in `benchmarks.yml`. Missing any one silently breaks dispatch or orchestration.
-- Stopping rule on `@monitor`: high-frequency single-machine queries use sequential stopping (bootstrapped CI on the mean elapsed time, floors at `BENCHMARK_MIN_ITERATIONS` and `BENCHMARK_MIN_TIMED_WINDOW_SECONDS`, ceiling at the `BenchmarkIteration` value, hard timeout at `BENCHMARK_MAX_TIMED_WINDOW_SECONDS`). Long-running low-variance benchmarks (`national_scale_spatial_join_*` for Databricks, DuckDB, and PostGIS) opt out via `use_sequential_stopping=False` and run a small fixed iteration count; they also override `warmup_iterations=1` since one warmup is enough on long-running queries and additional warmups dominate the wall-clock budget. Transient per-iteration failures do not abort the run: failed iterations are recorded as `status="failed"` samples and the loop continues until `BENCHMARK_MAX_CONSECUTIVE_FAILURES` in a row triggers `stop_reason="failed"`; a run that finishes normally but had any failed iterations is tagged `stop_reason="partial"`. *Why: bootstrap CI on <10 samples is uninformative, and the per-iteration cost of those benchmarks (cluster time, shared Postgres) outweighs precision gains. Spark/Azure flakiness is intermittent rather than deterministic — losing 2 of 3 timed iters to one transient executor loss erases data we could have kept.* See `README.md#stopping-rule` for the full rule.
+- Stopping rule on `@monitor`: high-frequency single-machine queries use sequential stopping (bootstrapped CI on the mean elapsed time, floors at `BENCHMARK_MIN_ITERATIONS` and `BENCHMARK_MIN_TIMED_WINDOW_SECONDS`, ceiling at the `BenchmarkIteration` value, per-iteration hard timeout at `BENCHMARK_MAX_ITERATION_SECONDS`, cumulative hard timeout at `BENCHMARK_MAX_TIMED_WINDOW_SECONDS`). Long-running low-variance benchmarks (`national_scale_spatial_join_*` for Databricks, DuckDB, and PostGIS) opt out via `use_sequential_stopping=False` and run a small fixed iteration count; they also override `warmup_iterations=1` since one warmup is enough on long-running queries and additional warmups dominate the wall-clock budget. Transient per-iteration failures do not abort the run: failed iterations are recorded as `status="failed"` samples and the loop continues until `BENCHMARK_MAX_CONSECUTIVE_FAILURES` in a row triggers `stop_reason="failed"`; a run that finishes normally but had any failed iterations is tagged `stop_reason="partial"`. *Why: bootstrap CI on <10 samples is uninformative, and the per-iteration cost of those benchmarks (cluster time, shared Postgres) outweighs precision gains. Spark/Azure flakiness is intermittent rather than deterministic — losing 2 of 3 timed iters to one transient executor loss erases data we could have kept.* See `README.md#stopping-rule` for the full rule.
 
 ## Commands
 
diff --git a/src/application/common/monitor.py b/src/application/common/monitor.py
index 6ed0e37..fc3b9fe 100644
--- a/src/application/common/monitor.py
+++ b/src/application/common/monitor.py
@@ -47,7 +47,8 @@ def monitor(
         bounded below by ``Config.BENCHMARK_MIN_ITERATIONS`` and
         ``Config.BENCHMARK_MIN_TIMED_WINDOW_SECONDS``, and above by ``benchmark_iteration``
         (soft ceiling: kept open until the 60-second floor is met to keep the cost-metric
-        window valid) and ``Config.BENCHMARK_MAX_TIMED_WINDOW_SECONDS`` (hard timeout).
+        window valid), ``Config.BENCHMARK_MAX_ITERATION_SECONDS`` (per-iteration hard timeout),
+        and ``Config.BENCHMARK_MAX_TIMED_WINDOW_SECONDS`` (cumulative hard timeout).
         Set to False for Databricks national-scale runs, which use a fixed iteration count.
         Default is True.
     :param warmup_iterations: Override the number of warmup iterations. ``None`` falls back
@@ -124,13 +125,13 @@ def wrapper(*args, **kwargs):
                             f"Skipping timed iterations."
                         )
                         break
-                    if w_elapsed >= Config.BENCHMARK_MAX_TIMED_WINDOW_SECONDS:
+                    if w_elapsed >= Config.BENCHMARK_MAX_ITERATION_SECONDS:
                         stop_reason = StopReason.TIMEOUT
                         logger.warning(
                             f"Warmup iteration for '{query_id}' took "
                             f"{w_elapsed:.1f}s, exceeding "
-                            f"BENCHMARK_MAX_TIMED_WINDOW_SECONDS="
-                            f"{Config.BENCHMARK_MAX_TIMED_WINDOW_SECONDS}s; stopping."
+                            f"BENCHMARK_MAX_ITERATION_SECONDS="
+                            f"{Config.BENCHMARK_MAX_ITERATION_SECONDS}s; stopping."
                         )
                         break
                 if failure is None and stop_reason is None:
@@ -281,13 +282,13 @@ def wrapper(*args, **kwargs):
                             ],
                         )
 
-                        if elapsed_time >= Config.BENCHMARK_MAX_TIMED_WINDOW_SECONDS:
+                        if elapsed_time >= Config.BENCHMARK_MAX_ITERATION_SECONDS:
                             stop_reason = StopReason.TIMEOUT
                             logger.warning(
                                 f"Single iteration of '{query_id}' took "
                                 f"{elapsed_time:.1f}s, exceeding "
-                                f"BENCHMARK_MAX_TIMED_WINDOW_SECONDS="
-                                f"{Config.BENCHMARK_MAX_TIMED_WINDOW_SECONDS}s; "
+                                f"BENCHMARK_MAX_ITERATION_SECONDS="
+                                f"{Config.BENCHMARK_MAX_ITERATION_SECONDS}s; "
                                 f"stopping."
                             )
                             break
diff --git a/src/config.py b/src/config.py
index 1789695..de3d22c 100644
--- a/src/config.py
+++ b/src/config.py
@@ -118,7 +118,8 @@ class Config:
     # Sequential stopping rule (bootstrapped CI on mean elapsed time)
     BENCHMARK_MIN_ITERATIONS: int = 10
     BENCHMARK_MIN_TIMED_WINDOW_SECONDS: int = 60
-    BENCHMARK_MAX_TIMED_WINDOW_SECONDS: int = 5400
+    BENCHMARK_MAX_ITERATION_SECONDS: int = 4500
+    BENCHMARK_MAX_TIMED_WINDOW_SECONDS: int = 6 * BENCHMARK_MAX_ITERATION_SECONDS
     BENCHMARK_TARGET_CI_HALF_WIDTH_RELATIVE: float = 0.05
     BENCHMARK_BOOTSTRAP_RESAMPLES: int = 1000
     BENCHMARK_CI_CONFIDENCE: float = 0.95