diff --git a/agent/flow-trace/05_FAILURE_REFUND_SLASHING.md b/agent/flow-trace/05_FAILURE_REFUND_SLASHING.md index 96ce8d050..a629618b5 100644 --- a/agent/flow-trace/05_FAILURE_REFUND_SLASHING.md +++ b/agent/flow-trace/05_FAILURE_REFUND_SLASHING.md @@ -1059,11 +1059,17 @@ When CommitteeMemberExpelled event arrives from EVM: │ ├─ Only processes raw events (party_id: None) │ └─ Removes expelled node from committee filter set │ -└─ When E3Failed / E3StageChanged(Complete|Failed) arrives: +└─ When E3Failed(timeout) / E3StageChanged(Complete) arrives: │ ├─ E3Router (central cleanup orchestrator): - │ └─ Converts E3Failed / E3StageChanged(Complete|Failed) → E3RequestComplete - │ → Single cleanup signal for all per-E3 actors + │ ├─ E3Failed with a timeout reason (CommitteeFormationTimeout, DKGTimeout, + │ │ ComputeTimeout, DecryptionTimeout) → publishes E3RequestComplete + │ │ → Single cleanup signal for all per-E3 actors + │ │ NOTE: E3Failed with a misbehaviour reason (DKGInvalidShares, etc.) does + │ │ NOT trigger E3RequestComplete — the accusation/slashing lifecycle must + │ │ complete first. + │ └─ E3StageChanged(Complete|Failed) and E3Failed(timeout) arriving after context teardown + │ are silently ignored (expected on-chain lag) │ ├─ CommitteeFinalizer (direct handler — semantic work): │ └─ Cancels any pending committee-finalization timer for this e3_id diff --git a/crates/events/src/interfold_event/e3_failed.rs b/crates/events/src/interfold_event/e3_failed.rs index ebce4b531..a9f05977f 100644 --- a/crates/events/src/interfold_event/e3_failed.rs +++ b/crates/events/src/interfold_event/e3_failed.rs @@ -27,6 +27,21 @@ pub enum FailureReason { VerificationFailed, } +impl FailureReason { + /// Returns true when the failure was caused purely by a deadline expiring rather + /// than by a node acting maliciously. Timeout failures have no associated + /// accusation/slashing lifecycle, so their E3 context can be torn down immediately. 
+ pub fn is_timeout(&self) -> bool { + matches!( + self, + Self::CommitteeFormationTimeout + | Self::DKGTimeout + | Self::ComputeTimeout + | Self::DecryptionTimeout + ) + } +} + /// E3 lifecycle stage #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] pub enum E3Stage { diff --git a/crates/keyshare/src/actors/threshold_keyshare.rs b/crates/keyshare/src/actors/threshold_keyshare.rs index dd6735cd3..9df9b0fd3 100644 --- a/crates/keyshare/src/actors/threshold_keyshare.rs +++ b/crates/keyshare/src/actors/threshold_keyshare.rs @@ -2047,7 +2047,7 @@ impl Handler for ThresholdKeyshare { self.bus.publish_without_context(E3Failed { e3_id: msg.e3_id, failed_at_stage: E3Stage::CommitteeFinalized, - reason: FailureReason::InsufficientCommitteeMembers, + reason: FailureReason::DKGTimeout, })?; // Stop this actor since we can't proceed without all encryption keys @@ -2081,7 +2081,7 @@ impl Handler for ThresholdKeyshare { self.bus.publish_without_context(E3Failed { e3_id: msg.e3_id, failed_at_stage: E3Stage::CommitteeFinalized, - reason: FailureReason::InsufficientCommitteeMembers, + reason: FailureReason::DKGTimeout, })?; ctx.stop(); @@ -2129,7 +2129,7 @@ impl Handler for ThresholdKeyshare { self.bus.publish_without_context(E3Failed { e3_id: msg.e3_id.clone(), failed_at_stage: E3Stage::CommitteeFinalized, - reason: FailureReason::InsufficientCommitteeMembers, + reason: FailureReason::DecryptionTimeout, })?; ctx.stop(); @@ -2274,7 +2274,7 @@ mod tests { InterfoldEventData::E3Failed(data) if data.e3_id == failure.e3_id && data.failed_at_stage == E3Stage::CommitteeFinalized - && data.reason == FailureReason::InsufficientCommitteeMembers + && data.reason == FailureReason::DKGTimeout )); Ok(()) @@ -2305,7 +2305,7 @@ mod tests { InterfoldEventData::E3Failed(data) if data.e3_id == failure.e3_id && data.failed_at_stage == E3Stage::CommitteeFinalized - && data.reason == FailureReason::InsufficientCommitteeMembers + && data.reason == FailureReason::DKGTimeout )); Ok(()) 
@@ -2328,7 +2328,7 @@ mod tests { InterfoldEventData::E3Failed(data) if data.e3_id == failure.e3_id && data.failed_at_stage == E3Stage::CommitteeFinalized - && data.reason == FailureReason::InsufficientCommitteeMembers + && data.reason == FailureReason::DecryptionTimeout )); Ok(()) diff --git a/crates/request/src/domain/routing.rs b/crates/request/src/domain/routing.rs index b91fdce6f..5f63cf653 100644 --- a/crates/request/src/domain/routing.rs +++ b/crates/request/src/domain/routing.rs @@ -66,14 +66,21 @@ impl RequestRouter { // If this e3 round has already been completed then this event is unexpected. if completed.contains(&e3_id) { - // Plaintext Aggregated Triggers E3RequestComplete which tears down the per-E3 context - // and mark it as completed, but the E3StageChanged(Complete) that arrives from the EVM - // after local teardown is expected and should be ignored rather than treated as an error. - if matches!( - msg.get_data(), + // On-chain confirmation events that lag behind local teardown are expected and + // should be silently ignored rather than treated as an error. + let is_late_terminal = match msg.get_data() { + // E3StageChanged(Complete|Failed) lags local teardown (completion or timeout). InterfoldEventData::E3StageChanged(data) - if matches!(data.new_stage, E3Stage::Complete) - ) { + if matches!(data.new_stage, E3Stage::Complete | E3Stage::Failed) => + { + true + } + // E3Failed from on-chain markE3Failed may arrive after a local timeout already + // cleaned up the context. + InterfoldEventData::E3Failed(data) if data.reason.is_timeout() => true, + _ => false, + }; + if is_late_terminal { return RoutingDecision::Ignore; } return RoutingDecision::AlreadyCompleted(e3_id); @@ -88,8 +95,12 @@ impl RequestRouter { { PostForward::PublishComplete } - // NOTE: E3Stage::Failed does NOT trigger E3RequestComplete. Failed rounds need the - // accusation/slashing lifecycle to complete before the context is torn down. 
+ // Timeout failures have no accusation/slashing lifecycle, so the context can be + // torn down immediately. Misbehaviour failures (DKGInvalidShares, etc.) still need + // the accusation/slashing lifecycle to complete before teardown. + InterfoldEventData::E3Failed(data) if data.reason.is_timeout() => { + PostForward::PublishComplete + } InterfoldEventData::E3RequestComplete(_) => PostForward::Teardown, _ => PostForward::None, }; @@ -105,8 +116,8 @@ impl RequestRouter { mod tests { use super::*; use e3_events::{ - E3RequestComplete, E3Stage, E3StageChanged, InterfoldEvent, PlaintextAggregated, Sequenced, - Shutdown, + E3Failed, E3RequestComplete, E3Stage, E3StageChanged, FailureReason, InterfoldEvent, + PlaintextAggregated, Sequenced, Shutdown, }; fn e3id() -> E3id { @@ -190,9 +201,9 @@ mod tests { } #[test] - fn stage_changed_to_failed_still_errors_when_completed() { - // E3StageChanged(Failed) after completion IS unexpected and should still error, - // because the failed path goes through accusation/slashing, not simple completion. + fn stage_changed_to_failed_ignored_when_completed() { + // E3StageChanged(Failed) from the EVM can arrive after a local timeout already cleaned up + // the context. Treat it as a silent no-op, the same way we handle E3StageChanged(Complete). 
let id = e3id(); let mut completed = HashSet::new(); completed.insert(id.clone()); @@ -203,7 +214,7 @@ mod tests { }); assert_eq!( RequestRouter::route(&msg, &completed), - RoutingDecision::AlreadyCompleted(id) + RoutingDecision::Ignore ); } @@ -285,4 +296,109 @@ mod tests { } ); } + + // --- timeout-triggered E3Failed tests --- + + fn e3_failed(id: E3id, reason: FailureReason) -> InterfoldEvent { + from_data(E3Failed { + e3_id: id, + failed_at_stage: E3Stage::CommitteeFinalized, + reason, + }) + } + + #[test] + fn e3_failed_dkg_timeout_publishes_complete() { + let id = e3id(); + let msg = e3_failed(id.clone(), FailureReason::DKGTimeout); + assert_eq!( + RequestRouter::route(&msg, &HashSet::new()), + RoutingDecision::Process { + e3_id: id, + post_forward: PostForward::PublishComplete, + } + ); + } + + #[test] + fn e3_failed_committee_formation_timeout_publishes_complete() { + let id = e3id(); + let msg = e3_failed(id.clone(), FailureReason::CommitteeFormationTimeout); + assert_eq!( + RequestRouter::route(&msg, &HashSet::new()), + RoutingDecision::Process { + e3_id: id, + post_forward: PostForward::PublishComplete, + } + ); + } + + #[test] + fn e3_failed_compute_timeout_publishes_complete() { + let id = e3id(); + let msg = e3_failed(id.clone(), FailureReason::ComputeTimeout); + assert_eq!( + RequestRouter::route(&msg, &HashSet::new()), + RoutingDecision::Process { + e3_id: id, + post_forward: PostForward::PublishComplete, + } + ); + } + + #[test] + fn e3_failed_decryption_timeout_publishes_complete() { + let id = e3id(); + let msg = e3_failed(id.clone(), FailureReason::DecryptionTimeout); + assert_eq!( + RequestRouter::route(&msg, &HashSet::new()), + RoutingDecision::Process { + e3_id: id, + post_forward: PostForward::PublishComplete, + } + ); + } + + #[test] + fn e3_failed_invalid_shares_does_not_complete() { + // Slashable failures must NOT trigger E3RequestComplete — the accusation/slashing + // lifecycle must be allowed to finish first. 
+ let id = e3id(); + let msg = e3_failed(id.clone(), FailureReason::DKGInvalidShares); + assert_eq!( + RequestRouter::route(&msg, &HashSet::new()), + RoutingDecision::Process { + e3_id: id, + post_forward: PostForward::None, + } + ); + } + + #[test] + fn e3_failed_timeout_ignored_when_already_completed() { + let id = e3id(); + let mut completed = HashSet::new(); + completed.insert(id.clone()); + let msg = e3_failed(id.clone(), FailureReason::DKGTimeout); + assert_eq!( + RequestRouter::route(&msg, &completed), + RoutingDecision::Ignore + ); + } + + #[test] + fn stage_changed_to_failed_ignored_when_already_completed() { + let id = e3id(); + let mut completed = HashSet::new(); + completed.insert(id.clone()); + let msg = from_data(E3StageChanged { + e3_id: id.clone(), + previous_stage: E3Stage::CommitteeFinalized, + new_stage: E3Stage::Failed, + }); + assert_eq!( + RequestRouter::route(&msg, &completed), + RoutingDecision::Ignore + ); + } }