diff --git a/doc/measuring-minimum-block-time.md b/doc/measuring-minimum-block-time.md index 269b21d2..e6171dc0 100644 --- a/doc/measuring-minimum-block-time.md +++ b/doc/measuring-minimum-block-time.md @@ -13,10 +13,10 @@ The missions perform the binary search to find the minimum sustainable block tim A candidate close time `T` is considered a **pass** if and only if, **on every node in the network**, the stellar-core `ledger.age.closed-histogram` metric satisfies **both** of the following: -* **P50** (median) is in the range `[0.80·T, 1.20·T)` ms *(temporary; see note below)* +* **P75** is in the range `[0.80·T, 1.20·T)` ms *(temporary; see note below)* * **P99** is `≤ 2·T` ms -> **FIXME (P50 tolerance):** the intended P50 band is `[0.95·T, 1.05·T)` (±5%), but stellar-core currently has performance regressions that prevent the stricter band from being achievable under load. The tolerance has been temporarily widened to ±20% so the test can exercise the rest of the pipeline; tighten it back to ±5% (or narrower) once those regressions are fixed. +> **FIXME (P75 tolerance):** the intended P75 band is `[0.95·T, 1.05·T)` (±5%), but stellar-core currently has performance regressions that prevent the stricter band from being achievable under load. The tolerance has been temporarily widened to ±20% so the test can exercise the rest of the pipeline; tighten it back to ±5% (or narrower) once those regressions are fixed. If any node violates any of these bounds, `T` is considered a **fail** and the binary search raises its lower bound. The same is true if the load run itself errors (e.g., stellar-core's internal `loadgen-run-failed` counter increments, nodes fall out of sync, or peers report inconsistent ledger hashes) — in that case the mission treats the iteration as a fail and the search continues upward. diff --git a/doc/missions.md b/doc/missions.md index 4db06e03..f417d45d 100644 --- a/doc/missions.md +++ b/doc/missions.md @@ -153,7 +153,7 @@ Stress test a network of simulated Tier1 topology with a mix of classic and soro ## MissionMinBlockTimeClassic -Find the minimum ledger target close time a simulated Tier1 network can sustain at a fixed TPS while meeting a per-node `ledger.age.closed` latency SLA (P50 within ±20% of target, P99 ≤ 2×), driving classic-payment load. See [Running minimum block time test](measuring-minimum-block-time.md) for details. +Find the minimum ledger target close time a simulated Tier1 network can sustain at a fixed TPS while meeting a per-node `ledger.age.closed` latency SLA (P75 within ±20% of target, P99 ≤ 2×), driving classic-payment load. See [Running minimum block time test](measuring-minimum-block-time.md) for details. ## MissionMinBlockTimeMixed diff --git a/src/FSLibrary/MinBlockTimeTest.fs b/src/FSLibrary/MinBlockTimeTest.fs index 037ca224..877f717e 100644 --- a/src/FSLibrary/MinBlockTimeTest.fs +++ b/src/FSLibrary/MinBlockTimeTest.fs @@ -28,13 +28,13 @@ let private timeoutsFor (targetMs: int) : int = max 500 (targetMs / 5) let private readLedgerAgePercentiles (peer: Peer) : float * float = let h = peer.GetMetrics().LedgerAgeClosedHistogram - float h.Median, float h.``99`` + float h.``75``, float h.``99`` // Returns true iff every peer's ledger.age.closed-histogram satisfies: -// P50 in [0.80*T, 1.20*T) +// P75 in [0.80*T, 1.20*T) // P99 <= 2*T // -// FIXME: the P50 tolerance is temporarily widened to +/-20% because +// FIXME: the P75 tolerance is temporarily widened to +/-20% because // stellar-core currently has perf regressions that prevent the intended // +/-5% band from being achievable. Tighten this back to 0.95/1.05 (or // lower) once those regressions are fixed. @@ -48,14 +48,14 @@ let private checkLedgerAgeSLA (formation: StellarFormation) (coreSets: CoreSet l formation.NetworkCfg.EachPeerInSets (List.toArray coreSets) (fun peer -> - let p50, p99 = readLedgerAgePercentiles peer - let peerOk = p50 >= tLo && p50 < tHi && p99 <= p99Max + let p75, p99 = readLedgerAgePercentiles peer + let peerOk = p75 >= tLo && p75 < tHi && p99 <= p99Max LogInfo - "peer=%s T=%dms p50=%.0f p99=%.0f -> %s" + "peer=%s T=%dms p75=%.0f p99=%.0f -> %s" peer.ShortName.StringName targetMs - p50 + p75 p99 (if peerOk then "PASS" else "FAIL") diff --git a/src/FSLibrary/MissionMinBlockTimeClassic.fs b/src/FSLibrary/MissionMinBlockTimeClassic.fs index ff99d09b..ba750ebb 100644 --- a/src/FSLibrary/MissionMinBlockTimeClassic.fs +++ b/src/FSLibrary/MissionMinBlockTimeClassic.fs @@ -6,9 +6,9 @@ module MissionMinBlockTimeClassic // Mirror of MissionMaxTPSClassic for the minimum-block-time search: fixes the // TPS at --tx-rate and binary-searches for the smallest ledger target close -// time (in [--min-block-time-ms, 6000] ms) that still keeps ledger.age.closed -// within the currently enforced SLA checks (P50 within a widened band around -// T, and P99 <= 2T; no P99.9 bound is currently enforced here). +// time (in [--min-block-time-ms, --max-block-time-ms] ms) that still keeps +// ledger.age.closed-histogram within the currently enforced SLA checks (P75 +// within a widened band around T, and P99 <= 2T). open MinBlockTimeTest open StellarMissionContext diff --git a/src/FSLibrary/MissionMinBlockTimeMixed.fs b/src/FSLibrary/MissionMinBlockTimeMixed.fs index 743d71f6..deceb377 100644 --- a/src/FSLibrary/MissionMinBlockTimeMixed.fs +++ b/src/FSLibrary/MissionMinBlockTimeMixed.fs @@ -6,7 +6,7 @@ module MissionMinBlockTimeMixed // Mirror of MissionMaxTPSMixed for the minimum-block-time search: mixed // classic + Soroban workload at a fixed TPS, binary-searches for the smallest -// ledger target close time that still keeps ledger.age.closed within SLA. +// ledger target close time that still keeps ledger.age.closed-histogram within SLA. open MinBlockTimeTest open PubnetData