From 1a432be9e487bbfcfa3c523a8727ce17b7d8af8c Mon Sep 17 00:00:00 2001 From: Richard Date: Thu, 9 Apr 2026 16:34:08 -0400 Subject: [PATCH] branch to check for chain progression during quiet period (fault injection off, system allowed time to recover --- .github/workflows/run_antithesis_test.yml | 6 + docker-compose.yaml | 3 + workload/cmd/stress-engine/main.go | 2 + .../stress-engine/quiet_recovery_vectors.go | 199 ++++++++++++++++++ 4 files changed, 210 insertions(+) create mode 100644 workload/cmd/stress-engine/quiet_recovery_vectors.go diff --git a/.github/workflows/run_antithesis_test.yml b/.github/workflows/run_antithesis_test.yml index 6222f01e..d76dd460 100644 --- a/.github/workflows/run_antithesis_test.yml +++ b/.github/workflows/run_antithesis_test.yml @@ -100,6 +100,11 @@ on: required: false default: false description: Enable to run an smoke test. + quiet_recovery: + type: boolean + required: false + default: false + description: Enable quiet recovery checks (pauses fault injection to verify chain self-healing) jobs: manual_run: @@ -153,6 +158,7 @@ jobs: custom.lotus_miner_1_tag=${{ inputs.lotus_miner_1_tag }} custom.forest_tag=${{ inputs.forest }} custom.forest_0_tag=${{ inputs.forest_0_tag }} + custom.quiet_recovery=${{ inputs.quiet_recovery }} scheduled_run_a: diff --git a/docker-compose.yaml b/docker-compose.yaml index 99dc8a18..c4f9f151 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -156,6 +156,9 @@ services: # Power / reorg - STRESS_WEIGHT_REORG=${STRESS_WEIGHT_REORG:-0} - STRESS_WEIGHT_POWER_SLASH=${STRESS_WEIGHT_POWER_SLASH:-2} + # Quiet recovery: pauses all fault injection to verify chain self-healing (off by default) + - QUIET_RECOVERY_ENABLED=${QUIET_RECOVERY_ENABLED:-0} + - STRESS_WEIGHT_QUIET_RECOVERY=${STRESS_WEIGHT_QUIET_RECOVERY:-0} # Protocol fuzzer: 0=off, 1=on (fuzzer uses its own Go-code defaults for weights) - FUZZER_ENABLED=${FUZZER_ENABLED:-0} # diff --git a/workload/cmd/stress-engine/main.go b/workload/cmd/stress-engine/main.go index e88d0b96..62652958 100644 --- a/workload/cmd/stress-engine/main.go +++ b/workload/cmd/stress-engine/main.go @@ -255,6 +255,8 @@ func buildDeck() { {"DoF3FinalityMonitor", "STRESS_WEIGHT_F3_MONITOR", DoF3FinalityMonitor, 2}, {"DoF3FinalityAgreement", "STRESS_WEIGHT_F3_AGREEMENT", DoF3FinalityAgreement, 3}, {"DoDrandBeaconAudit", "STRESS_WEIGHT_DRAND_BEACON_AUDIT", DoDrandBeaconAudit, 3}, + // Quiet recovery: pauses all faults, checks self-healing (off by default, enable via QUIET_RECOVERY_ENABLED=1) + {"DoQuietRecovery", "STRESS_WEIGHT_QUIET_RECOVERY", DoQuietRecovery, 0}, } // Non-FOC stress vectors — skipped when FOC profile is active. diff --git a/workload/cmd/stress-engine/quiet_recovery_vectors.go b/workload/cmd/stress-engine/quiet_recovery_vectors.go new file mode 100644 index 00000000..df99870c --- /dev/null +++ b/workload/cmd/stress-engine/quiet_recovery_vectors.go @@ -0,0 +1,199 @@ +package main + +import ( + "log" + "os" + "os/exec" + "time" + + "github.com/antithesishq/antithesis-sdk-go/assert" + + "github.com/filecoin-project/go-state-types/abi" + "github.com/filecoin-project/lotus/chain/types" +) + +// =========================================================================== +// Quiet Recovery Vector +// +// Pauses Antithesis fault injection for a configurable duration, then verifies +// that the Filecoin network self-heals: chain advances, nodes converge, and +// all nodes agree on the tipset at finalized height. +// +// Gated by QUIET_RECOVERY_ENABLED=1 (off by default — pausing faults is +// disruptive to the entire devnet). Enable via the notebook or GH Action toggle. +// +// Requires the ANTITHESIS_STOP_FAULTS binary injected by the Antithesis runtime. +// =========================================================================== + +const ( + quietDuration = "45" // seconds to pause faults (string for exec arg) + quietStabilizeSecs = 15 // seconds to wait for gossip/reconnection after faults pause + quietDriftThreshold = 3 // max block drift to consider nodes "converged" +) + +// DoQuietRecovery requests a fault-free window and verifies chain self-healing. +func DoQuietRecovery() { + if os.Getenv("QUIET_RECOVERY_ENABLED") != "1" { + return + } + + stopBin := os.Getenv("ANTITHESIS_STOP_FAULTS") + if stopBin == "" { + debugLog("[quiet-recovery] ANTITHESIS_STOP_FAULTS not set, skipping") + return + } + + if len(nodeKeys) < 2 { + return + } + + // ── Step 1: Record pre-recovery heights ────────────────────────────────── + preHeights := queryNodeHeights() + preMax := maxEpoch(preHeights) + if preMax == 0 { + log.Println("[quiet-recovery] no responsive nodes, skipping") + return + } + log.Printf("[quiet-recovery] pre-recovery max height: %d", preMax) + + // ── Step 2: Pause fault injection ──────────────────────────────────────── + log.Printf("[quiet-recovery] requesting %ss quiet period", quietDuration) + cmd := exec.CommandContext(ctx, stopBin, quietDuration) + if err := cmd.Run(); err != nil { + log.Printf("[quiet-recovery] ANTITHESIS_STOP_FAULTS failed: %v", err) + return + } + + // ── Step 3: Wait for stabilization ─────────────────────────────────────── + time.Sleep(time.Duration(quietStabilizeSecs) * time.Second) + + // ── Step 4: Record post-recovery heights ───────────────────────────────── + postHeights := queryNodeHeights() + postMax := maxEpoch(postHeights) + postMin := minEpoch(postHeights) + log.Printf("[quiet-recovery] post-recovery max height: %d, min height: %d", postMax, postMin) + + // ── Step 5: Assert chain advanced ──────────────────────────────────────── + advanced := postMax > preMax + assert.Sometimes(advanced, "Chain advanced during quiet period", map[string]any{ + "pre_max_height": preMax, + "post_max_height": postMax, + }) + if advanced { + log.Printf("[quiet-recovery] chain advanced from %d to %d", preMax, postMax) + } else { + log.Printf("[quiet-recovery] chain did NOT advance (pre=%d post=%d)", preMax, postMax) + } + + // ── Step 6: Assert consensus recovery (drift check) ────────────────────── + if len(postHeights) < 2 { + log.Println("[quiet-recovery] fewer than 2 responsive nodes post-recovery, skipping convergence check") + return + } + + drift := int(postMax - postMin) + converged := drift <= quietDriftThreshold + assert.Sometimes(converged, "Consensus recovered during quiet period", map[string]any{ + "drift": drift, + "threshold": quietDriftThreshold, + "nodes": len(postHeights), + }) + + if converged { + log.Printf("[quiet-recovery] consensus recovered (drift=%d <= %d)", drift, quietDriftThreshold) + } else { + log.Printf("[quiet-recovery] consensus NOT recovered (drift=%d > %d)", drift, quietDriftThreshold) + return // don't check tipset agreement when nodes are diverged + } + + // ── Step 7: Assert tipset agreement at finalized height ────────────────── + // Use the minimum post-recovery height minus a small finality buffer as the + // comparison point. All converged nodes should agree on this tipset. + const finalityBuffer = 5 + checkHeight := postMin - abi.ChainEpoch(finalityBuffer) + if checkHeight <= 0 { + log.Println("[quiet-recovery] chain too short for finalized tipset check") + return + } + + var cidStrings []string + var respondents int + for _, name := range nodeKeys { + h, ok := postHeights[name] + if !ok || h == 0 { + continue + } + ts, err := nodes[name].ChainGetTipSetByHeight(ctx, checkHeight, types.EmptyTSK) + if err != nil { + debugLog("[quiet-recovery] ChainGetTipSetByHeight(%d) failed on %s: %v", checkHeight, name, err) + continue + } + cids := "" + for _, c := range ts.Cids() { + cids += c.String() + "," + } + cidStrings = append(cidStrings, cids) + respondents++ + } + + if respondents < 2 { + log.Printf("[quiet-recovery] only %d nodes returned tipsets at height %d, skipping agreement check", respondents, checkHeight) + return + } + + allAgree := true + for i := 1; i < len(cidStrings); i++ { + if cidStrings[i] != cidStrings[0] { + allAgree = false + break + } + } + + assert.Always(allAgree, "State consistent after quiet period recovery", map[string]any{ + "check_height": checkHeight, + "respondents": respondents, + "drift": drift, + }) + + if allAgree { + log.Printf("[quiet-recovery] all %d nodes agree on tipset at height %d", respondents, checkHeight) + } else { + log.Printf("[quiet-recovery] TIPSET DISAGREEMENT at height %d among %d nodes", checkHeight, respondents) + } +} + +// queryNodeHeights returns the chain head height for each connected node. +func queryNodeHeights() map[string]abi.ChainEpoch { + heights := make(map[string]abi.ChainEpoch, len(nodeKeys)) + for _, name := range nodeKeys { + head, err := nodes[name].ChainHead(ctx) + if err != nil { + debugLog("[quiet-recovery] ChainHead failed on %s: %v", name, err) + continue + } + heights[name] = head.Height() + } + return heights +} + +// maxEpoch returns the maximum height from a height map. +func maxEpoch(heights map[string]abi.ChainEpoch) abi.ChainEpoch { + var max abi.ChainEpoch + for _, h := range heights { + if h > max { + max = h + } + } + return max +} + +// minEpoch returns the minimum height from a height map (ignoring zeros). +func minEpoch(heights map[string]abi.ChainEpoch) abi.ChainEpoch { + var min abi.ChainEpoch + for _, h := range heights { + if h > 0 && (min == 0 || h < min) { + min = h + } + } + return min +}