From 99a54304861c374bcc4d1e2131110a8e5f1d0ef7 Mon Sep 17 00:00:00 2001 From: Jathavaan Shankarr Date: Sun, 24 May 2026 10:16:44 +0200 Subject: [PATCH] #254 Split per-iteration and cumulative hard timeouts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per-iteration timeout: 75 min (BENCHMARK_MAX_ITERATION_SECONDS). Cumulative timeout: 6 × per-iteration = 450 min (BENCHMARK_MAX_TIMED_WINDOW_SECONDS). Previously both checks shared one constant at 90 min. --- CLAUDE.md | 2 +- src/application/common/monitor.py | 15 ++++++++------- src/config.py | 3 ++- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 62802ac..9aa8ad6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -25,7 +25,7 @@ Python, DuckDB (spatial), PostGIS on Azure Database for PostgreSQL, Apache Sedon - Secrets live in `.env` (gitignored) and are forwarded to ACI as `--secure-environment-variables` from `main.py`. - Benchmark batching: members of a batch list each other bidirectionally in `related_script_ids`. The orchestrator dedupes via `completed_experiments`, so each batch runs once as one parallel `ThreadPoolExecutor` fan-out. A batch must satisfy four constraints simultaneously: (a) same query type, (b) same `dataset_size`, (c) at most one PostGIS member (shared Azure Postgres server), (d) Databricks cluster vCPU sum ≤ 200, computed as `(workers + 1) × 4` per Sedona member on `Standard_D4s_v3`. See `README.md#pairing-and-randomization` for the full batch listing. *Why: peers must execute under the same wall-clock window for fair comparison, without contending on shared infrastructure or breaching regional quota.* - Adding a benchmark requires three edits in lockstep: file in `src/presentation/entrypoints/`, `case` arm in `benchmark_runner.py`, and an entry in `benchmarks.yml`. Missing any one silently breaks dispatch or orchestration. -- Stopping rule on `@monitor`: high-frequency single-machine queries use sequential stopping (bootstrapped CI on the mean elapsed time, floors at `BENCHMARK_MIN_ITERATIONS` and `BENCHMARK_MIN_TIMED_WINDOW_SECONDS`, ceiling at the `BenchmarkIteration` value, hard timeout at `BENCHMARK_MAX_TIMED_WINDOW_SECONDS`). Long-running low-variance benchmarks (`national_scale_spatial_join_*` for Databricks, DuckDB, and PostGIS) opt out via `use_sequential_stopping=False` and run a small fixed iteration count; they also override `warmup_iterations=1` since one warmup is enough on long-running queries and additional warmups dominate the wall-clock budget. Transient per-iteration failures do not abort the run: failed iterations are recorded as `status="failed"` samples and the loop continues until `BENCHMARK_MAX_CONSECUTIVE_FAILURES` in a row triggers `stop_reason="failed"`; a run that finishes normally but had any failed iterations is tagged `stop_reason="partial"`. *Why: bootstrap CI on <10 samples is uninformative, and the per-iteration cost of those benchmarks (cluster time, shared Postgres) outweighs precision gains. Spark/Azure flakiness is intermittent rather than deterministic — losing 2 of 3 timed iters to one transient executor loss erases data we could have kept.* See `README.md#stopping-rule` for the full rule. +- Stopping rule on `@monitor`: high-frequency single-machine queries use sequential stopping (bootstrapped CI on the mean elapsed time, floors at `BENCHMARK_MIN_ITERATIONS` and `BENCHMARK_MIN_TIMED_WINDOW_SECONDS`, ceiling at the `BenchmarkIteration` value, per-iteration hard timeout at `BENCHMARK_MAX_ITERATION_SECONDS`, cumulative hard timeout at `BENCHMARK_MAX_TIMED_WINDOW_SECONDS`). Long-running low-variance benchmarks (`national_scale_spatial_join_*` for Databricks, DuckDB, and PostGIS) opt out via `use_sequential_stopping=False` and run a small fixed iteration count; they also override `warmup_iterations=1` since one warmup is enough on long-running queries and additional warmups dominate the wall-clock budget. Transient per-iteration failures do not abort the run: failed iterations are recorded as `status="failed"` samples and the loop continues until `BENCHMARK_MAX_CONSECUTIVE_FAILURES` in a row triggers `stop_reason="failed"`; a run that finishes normally but had any failed iterations is tagged `stop_reason="partial"`. *Why: bootstrap CI on <10 samples is uninformative, and the per-iteration cost of those benchmarks (cluster time, shared Postgres) outweighs precision gains. Spark/Azure flakiness is intermittent rather than deterministic — losing 2 of 3 timed iters to one transient executor loss erases data we could have kept.* See `README.md#stopping-rule` for the full rule. ## Commands diff --git a/src/application/common/monitor.py b/src/application/common/monitor.py index 6ed0e37..fc3b9fe 100644 --- a/src/application/common/monitor.py +++ b/src/application/common/monitor.py @@ -47,7 +47,8 @@ def monitor( bounded below by ``Config.BENCHMARK_MIN_ITERATIONS`` and ``Config.BENCHMARK_MIN_TIMED_WINDOW_SECONDS``, and above by ``benchmark_iteration`` (soft ceiling: kept open until the 60-second floor is met to keep the cost-metric - window valid) and ``Config.BENCHMARK_MAX_TIMED_WINDOW_SECONDS`` (hard timeout). + window valid), ``Config.BENCHMARK_MAX_ITERATION_SECONDS`` (per-iteration hard timeout), + and ``Config.BENCHMARK_MAX_TIMED_WINDOW_SECONDS`` (cumulative hard timeout). Set to False for Databricks national-scale runs, which use a fixed iteration count. Default is True. :param warmup_iterations: Override the number of warmup iterations. ``None`` falls back @@ -124,13 +125,13 @@ def wrapper(*args, **kwargs): f"Skipping timed iterations." ) break - if w_elapsed >= Config.BENCHMARK_MAX_TIMED_WINDOW_SECONDS: + if w_elapsed >= Config.BENCHMARK_MAX_ITERATION_SECONDS: stop_reason = StopReason.TIMEOUT logger.warning( f"Warmup iteration for '{query_id}' took " f"{w_elapsed:.1f}s, exceeding " - f"BENCHMARK_MAX_TIMED_WINDOW_SECONDS=" - f"{Config.BENCHMARK_MAX_TIMED_WINDOW_SECONDS}s; stopping." + f"BENCHMARK_MAX_ITERATION_SECONDS=" + f"{Config.BENCHMARK_MAX_ITERATION_SECONDS}s; stopping." ) break if failure is None and stop_reason is None: @@ -281,13 +282,13 @@ def wrapper(*args, **kwargs): ], ) - if elapsed_time >= Config.BENCHMARK_MAX_TIMED_WINDOW_SECONDS: + if elapsed_time >= Config.BENCHMARK_MAX_ITERATION_SECONDS: stop_reason = StopReason.TIMEOUT logger.warning( f"Single iteration of '{query_id}' took " f"{elapsed_time:.1f}s, exceeding " - f"BENCHMARK_MAX_TIMED_WINDOW_SECONDS=" - f"{Config.BENCHMARK_MAX_TIMED_WINDOW_SECONDS}s; " + f"BENCHMARK_MAX_ITERATION_SECONDS=" + f"{Config.BENCHMARK_MAX_ITERATION_SECONDS}s; " f"stopping." ) break diff --git a/src/config.py b/src/config.py index 1789695..de3d22c 100644 --- a/src/config.py +++ b/src/config.py @@ -118,7 +118,8 @@ class Config: # Sequential stopping rule (bootstrapped CI on mean elapsed time) BENCHMARK_MIN_ITERATIONS: int = 10 BENCHMARK_MIN_TIMED_WINDOW_SECONDS: int = 60 - BENCHMARK_MAX_TIMED_WINDOW_SECONDS: int = 5400 + BENCHMARK_MAX_ITERATION_SECONDS: int = 4500 + BENCHMARK_MAX_TIMED_WINDOW_SECONDS: int = 6 * BENCHMARK_MAX_ITERATION_SECONDS BENCHMARK_TARGET_CI_HALF_WIDTH_RELATIVE: float = 0.05 BENCHMARK_BOOTSTRAP_RESAMPLES: int = 1000 BENCHMARK_CI_CONFIDENCE: float = 0.95