simplyblock · schmidt-scaled · May 26, 2026 · May 26, 2026 · May 28, 2026 · May 28, 2026
diff --git a/scripts/aws_dual_node_outage_soak_mixed_churn.py b/scripts/aws_dual_node_outage_soak_mixed_churn.py
@@ -122,7 +122,42 @@ def parse_args():
         "--shutdown-gap",
         type=int,
         default=0,
-        help="Optional delay between shutting down the two selected nodes.",
+        help=(
+            "Legacy fixed delay between the two outages within an iteration. "
+            "When > 0 it overrides --outage-gap-min/--outage-gap-max with a "
+            "constant. Default 0 = use the random range below."
+        ),
+    )
+    parser.add_argument(
+        "--outage-gap-min",
+        type=int,
+        default=15,
+        help=(
+            "Minimum seconds between applying outage 1 and outage 2 of an "
+            "iteration. The actual gap is drawn uniformly from "
+            "[--outage-gap-min, --outage-gap-max] and then capped per "
+            "method 1 so the requested --min-outage-overlap is guaranteed."
+        ),
+    )
+    parser.add_argument(
+        "--outage-gap-max",
+        type=int,
+        default=180,
+        help=(
+            "Maximum seconds between applying outage 1 and outage 2 of an "
+            "iteration. Default 3 min."
+        ),
+    )
+    parser.add_argument(
+        "--min-outage-overlap",
+        type=int,
+        default=10,
+        help=(
+            "Minimum seconds both outage targets must be simultaneously "
+            "not-online inside an iteration. Used to cap the inter-outage "
+            "gap when method 1's recovery window is short (e.g. "
+            "network_outage_20)."
+        ),
     )
     parser.add_argument(
         "--log-file",
@@ -844,32 +879,28 @@ def sbctl_allow_failure(self, args, timeout=600):
         )
         return rc, stdout_text, stderr_text
 
-    def shutdown_with_migration_retry(self, node_id):
-        while True:
-            rc, stdout_text, stderr_text = self.sbctl_allow_failure(
-                f"sn shutdown {node_id}",
-                timeout=300,
-            )
-            if rc == 0:
-                return
-            output = f"{stdout_text}\n{stderr_text}".lower()
-            retry_markers = (
-                "migration",
-                "migrat",
-                "rebalanc",
-                "active task",
-                "running task",
-                "in_progress",
-                "in progress",
-            )
-            if any(marker in output for marker in retry_markers):
-                self.logger.log(
-                    f"Shutdown of {node_id} blocked by migration/rebalance/task; retrying in 15s"
-                )
-                time.sleep(15)
-                continue
+    def _graceful_shutdown(self, node_id):
+        """Single-shot `sbctl sn shutdown`. Raises on failure.
+
+        Previously this method looped on output markers like 'migration'
+        / 'rebalanc' / 'active task' and slept 15s between retries — the
+        soak would sit and wait for CP-side rebalancing to drain before
+        the outage could proceed. That retry was needed because
+        shutdown_storage_node used to call _check_ftt_allows_node_removal
+        (added in commit fbdffea3, 2026-03-28) which refused the
+        shutdown during rebalance with a stdout containing 'rebalanc'.
+        That call has been removed from the CP — shutdown is supposed to
+        proceed under the cluster's FTT contract, regardless of in-flight
+        rebalance, so we no longer need to wait here either.
+        """
+        rc, stdout_text, stderr_text = self.sbctl_allow_failure(
+            f"sn shutdown {node_id}",
+            timeout=300,
+        )
+        if rc != 0:
             raise RemoteCommandError(
                 f"mgmt: command failed with rc={rc}: sbctl sn shutdown {node_id}"
+                f" | stdout={stdout_text.strip()} | stderr={stderr_text.strip()}"
             )
 
     def prepare_client(self):
@@ -1520,28 +1551,21 @@ def reraise_churn_error(self):
     # ----- outage methods ---------------------------------------------------
 
     def _forced_shutdown(self, node_id):
-        """Shutdown with --force; still retry if blocked by migration."""
-        while True:
-            rc, stdout_text, stderr_text = self.sbctl_allow_failure(
-                f"sn shutdown {node_id} --force",
-                timeout=300,
-            )
-            if rc == 0:
-                return
-            output = f"{stdout_text}\n{stderr_text}".lower()
-            retry_markers = (
-                "migration", "migrat", "rebalanc",
-                "active task", "running task",
-                "in_progress", "in progress",
-            )
-            if any(m in output for m in retry_markers):
-                self.logger.log(
-                    f"Forced shutdown of {node_id} blocked by migration/task; retrying in 15s"
-                )
-                time.sleep(15)
-                continue
+        """Single-shot `sbctl sn shutdown --force`. Raises on failure.
+
+        --force bypasses every shutdown guard inside the CP, so the
+        retry-on-migration-markers loop that used to live here never
+        actually fired — sbctl --force does not return 'migration' /
+        'rebalanc' / 'active task' strings. Removed for clarity.
+        """
+        rc, stdout_text, stderr_text = self.sbctl_allow_failure(
+            f"sn shutdown {node_id} --force",
+            timeout=300,
+        )
+        if rc != 0:
             raise RemoteCommandError(
                 f"mgmt: command failed with rc={rc}: sbctl sn shutdown {node_id} --force"
+                f" | stdout={stdout_text.strip()} | stderr={stderr_text.strip()}"
             )
 
     def _container_kill(self, node_id):
@@ -1672,35 +1696,57 @@ def _inter_iteration_nic_chaos(self):
         self.wait_for_cluster_stable(require_no_rebalance=self.args.wait_for_rebalance)
 
     def _network_outage(self, node_id, duration):
-        """Take all data NICs down on one storage node for *duration* seconds,
-        then bring them back up. Simulates a transient network partition of
-        a single node. Node is expected to auto-recover once the NICs return
-        — no sbctl restart is issued."""
+        """Drop data NICs on one storage node; schedule the NIC bring-up
+        ``duration`` seconds later on a background daemon thread, then
+        return.
+
+        Previously this method blocked for the full ``duration`` (the
+        sleep ran inline before bringing NICs back up). That made it
+        impossible to overlap a network_outage_N outage with a second
+        outage applied within the same iteration — by the time
+        run_outage_pair called _apply_outage for node 2, node 1's NICs
+        were already up and the CP was already healing it. Decoupling
+        the bring-up from the call site lets the second outage land
+        while the first node is still partitioned.
+
+        The bring-up thread is a daemon so the soak's exit (atexit /
+        unhandled exception) never blocks on it, and it is never joined:
+        only the next iteration's wait_for_all_online depends on the NICs
+        being back up, and that polls anyway. NOTE(review): a daemon is
+        killed at exit, so NICs stay down if the soak exits first.
+        """
         host = self._node_host(node_id)
         nics = self._get_data_nics() or ["eth1"]
         self.logger.log(
-            f"network_outage on {node_id}: dropping {nics} for {duration}s"
+            f"network_outage on {node_id}: dropping {nics} for {duration}s "
+            "(async bring-up)"
         )
         for nic in nics:
             try:
                 host.run(f"sudo ip link set {nic} down", timeout=10, check=False,
                          label=f"netout down {nic} on {node_id}")
             except Exception as e:
                 self.logger.log(f"WARNING: failed to down {nic} on {node_id}: {e}")
-        try:
-            time.sleep(duration)
-        finally:
-            for nic in nics:
-                try:
-                    host.run(f"sudo ip link set {nic} up", timeout=10, check=False,
-                             label=f"netout up {nic} on {node_id}")
-                except Exception as e:
-                    self.logger.log(f"WARNING: failed to up {nic} on {node_id}: {e}")
+
+        def _bring_up_later():
+            try:
+                time.sleep(duration)
+            finally:
+                for nic in nics:
+                    try:
+                        host.run(f"sudo ip link set {nic} up", timeout=10, check=False,
+                                 label=f"netout up {nic} on {node_id}")
+                    except Exception as e:
+                        self.logger.log(f"WARNING: failed to up {nic} on {node_id}: {e}")
+
+        t = threading.Thread(target=_bring_up_later, daemon=True,
+                             name=f"netout-bringup-{node_id[:8]}")
+        t.start()
 
     def _apply_outage(self, node_id, method):
         self.logger.log(f"Applying outage '{method}' on {node_id}")
         if method == "graceful":
-            self.shutdown_with_migration_retry(node_id)
+            self._graceful_shutdown(node_id)
         elif method == "forced":
             self._forced_shutdown(node_id)
         elif method == "container_kill":
@@ -1751,15 +1797,89 @@ def wait_node_leaves_online(self, node_id, timeout=90, poll=2):
             time.sleep(poll)
         return False
 
+    # Conservative lower bound on how long a node stays not-online for each
+    # outage method. Used to cap the inter-outage gap so that the
+    # configured --min-outage-overlap is guaranteed (the gap can never
+    # eat the entire recovery window of outage 1). Real recovery is
+    # usually longer; underestimating keeps the overlap invariant safe.
+    #
+    # graceful / forced: the node stays in OFFLINE until run_outage_pair
+    # issues `sn restart` later in the iteration — so the unavailability
+    # window is effectively unbounded from the gap's perspective. Use a
+    # very large sentinel.
+    _METHOD_MIN_UNAVAIL_S = {
+        "graceful": 10_000,
+        "forced": 10_000,
+        # CP detection + auto-restart takes at least this long in practice.
+        "container_kill": 30,
+        # Reboot itself, BIOS, boot, SPDK start. Floor is generous.
+        "host_reboot": 90,
+        # network_outage_N handled by name parsing below.
+    }
+
+    def _expected_min_unavail_seconds(self, method):
+        if method.startswith("network_outage_"):
+            try:
+                return int(method.rsplit("_", 1)[-1])
+            except ValueError:
+                return 30
+        return self._METHOD_MIN_UNAVAIL_S.get(method, 30)
+
+    def _pick_outage_gap(self, method1):
+        """Return a random gap in [outage_gap_min, outage_gap_max], capped
+        per method1 so --min-outage-overlap holds whenever it is achievable.
+
+        --shutdown-gap > 0 overrides everything with a fixed constant
+        (legacy behaviour). Either path logs a WARNING when the overlap
+        cannot be guaranteed for method1.
+        """
+        overlap = max(0, self.args.min_outage_overlap)
+        unavail = self._expected_min_unavail_seconds(method1)
+        # Hard upper bound: gap + overlap <= unavail  =>  gap <= unavail - overlap
+        cap = max(1, unavail - overlap)
+
+        if self.args.shutdown_gap and self.args.shutdown_gap > 0:
+            gap = self.args.shutdown_gap
+            if gap > cap:
+                self.logger.log(
+                    f"WARNING: --shutdown-gap={gap}s exceeds method1={method1}'s "
+                    f"safe cap {cap}s; overlap of {overlap}s is NOT guaranteed"
+                )
+            return gap
+
+        if unavail - overlap < 1:  # cap was floored to 1s: invariant unreachable
+            self.logger.log(f"WARNING: method1={method1} is down only ~{unavail}s; "
+                            f"overlap of {overlap}s is NOT guaranteed")
+        lo = max(1, self.args.outage_gap_min)
+        # Clamp both bounds to the cap so random.randint never sees lo > hi.
+        hi = min(max(lo, self.args.outage_gap_max), cap)
+        lo = min(lo, hi)
+        gap = random.randint(lo, hi)
+        self.logger.log(
+            f"Outage gap chosen: {gap}s (range=[{lo},{hi}], cap={cap}s "
+            f"for method1={method1}, min-overlap={overlap}s)"
+        )
+        return gap
+
     def run_outage_pair(self, node1, node2, method1, method2):
         self.logger.log(
             f"Outage pair: {node1}={method1} and {node2}={method2}"
         )
-        # Apply first outage, then optional gap, then second outage.
+        # Apply first outage, then a method1-aware gap, then second outage.
+        # The gap is bounded so node 1's expected recovery window extends
+        # at least --min-outage-overlap seconds past the moment outage 2
+        # is issued. NOTE(review): time spent inside _apply_outage(node2)
+        # is not budgeted, so a slow second outage can shrink the overlap.
+        gap = self._pick_outage_gap(method1)
+        t_outage1 = time.time()
         self._apply_outage(node1, method1)
-        if self.args.shutdown_gap:
-            time.sleep(self.args.shutdown_gap)
+        time.sleep(gap)
+        t_outage2 = time.time()
         self._apply_outage(node2, method2)
+        self.logger.log(
+            f"Outage pair applied: outage1 at t=0, outage2 at "
+            f"t={t_outage2 - t_outage1:.1f}s (gap={gap}s)"
+        )
 
         # Issue sbctl restart only for methods that leave the node in a
         # "shutdown" state that the CP won't recover on its own.