Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions agent/flow-trace/05_FAILURE_REFUND_SLASHING.md
Original file line number Diff line number Diff line change
Expand Up @@ -1059,11 +1059,17 @@ When CommitteeMemberExpelled event arrives from EVM:
│ ├─ Only processes raw events (party_id: None)
│ └─ Removes expelled node from committee filter set
└─ When E3Failed / E3StageChanged(Complete|Failed) arrives:
└─ When E3Failed(timeout) / E3StageChanged(Complete) arrives:
├─ E3Router (central cleanup orchestrator):
│ └─ Converts E3Failed / E3StageChanged(Complete|Failed) → E3RequestComplete
│ → Single cleanup signal for all per-E3 actors
│ ├─ E3Failed with a timeout reason (CommitteeFormationTimeout, DKGTimeout,
│ │ ComputeTimeout, DecryptionTimeout) → publishes E3RequestComplete
│ │ → Single cleanup signal for all per-E3 actors
│ │ NOTE: E3Failed with a misbehaviour reason (DKGInvalidShares, etc.) does
│ │ NOT trigger E3RequestComplete — the accusation/slashing lifecycle must
│ │ complete first.
│ └─ E3StageChanged(Failed) and E3Failed(timeout) arriving after context teardown
│ are silently ignored (expected on-chain lag)
├─ CommitteeFinalizer (direct handler — semantic work):
│ └─ Cancels any pending committee-finalization timer for this e3_id
Expand Down
15 changes: 15 additions & 0 deletions crates/events/src/interfold_event/e3_failed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,21 @@ pub enum FailureReason {
VerificationFailed,
}

impl FailureReason {
    /// Reports whether this reason is one of the deadline-expiry variants.
    ///
    /// Yields `true` for `CommitteeFormationTimeout`, `DKGTimeout`, `ComputeTimeout`
    /// and `DecryptionTimeout`, and `false` for every other reason. A timeout is not
    /// attributed to a misbehaving node, so there is no accusation/slashing lifecycle
    /// to wait for and the E3 context may be torn down right away.
    pub fn is_timeout(&self) -> bool {
        matches!(self, Self::CommitteeFormationTimeout | Self::DKGTimeout)
            || matches!(self, Self::ComputeTimeout | Self::DecryptionTimeout)
    }
}

/// E3 lifecycle stage
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum E3Stage {
Expand Down
12 changes: 6 additions & 6 deletions crates/keyshare/src/actors/threshold_keyshare.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2047,7 +2047,7 @@ impl Handler<EncryptionKeyCollectionFailed> for ThresholdKeyshare {
self.bus.publish_without_context(E3Failed {
e3_id: msg.e3_id,
failed_at_stage: E3Stage::CommitteeFinalized,
reason: FailureReason::InsufficientCommitteeMembers,
reason: FailureReason::DKGTimeout,
})?;
Comment thread
coderabbitai[bot] marked this conversation as resolved.

// Stop this actor since we can't proceed without all encryption keys
Expand Down Expand Up @@ -2081,7 +2081,7 @@ impl Handler<ThresholdShareCollectionFailed> for ThresholdKeyshare {
self.bus.publish_without_context(E3Failed {
e3_id: msg.e3_id,
failed_at_stage: E3Stage::CommitteeFinalized,
reason: FailureReason::InsufficientCommitteeMembers,
reason: FailureReason::DKGTimeout,
})?;

ctx.stop();
Expand Down Expand Up @@ -2129,7 +2129,7 @@ impl Handler<DecryptionKeySharedCollectionFailed> for ThresholdKeyshare {
self.bus.publish_without_context(E3Failed {
e3_id: msg.e3_id.clone(),
failed_at_stage: E3Stage::CommitteeFinalized,
reason: FailureReason::InsufficientCommitteeMembers,
reason: FailureReason::DecryptionTimeout,
})?;

ctx.stop();
Expand Down Expand Up @@ -2274,7 +2274,7 @@ mod tests {
InterfoldEventData::E3Failed(data)
if data.e3_id == failure.e3_id
&& data.failed_at_stage == E3Stage::CommitteeFinalized
&& data.reason == FailureReason::InsufficientCommitteeMembers
&& data.reason == FailureReason::DKGTimeout
));

Ok(())
Expand Down Expand Up @@ -2305,7 +2305,7 @@ mod tests {
InterfoldEventData::E3Failed(data)
if data.e3_id == failure.e3_id
&& data.failed_at_stage == E3Stage::CommitteeFinalized
&& data.reason == FailureReason::InsufficientCommitteeMembers
&& data.reason == FailureReason::DKGTimeout
));

Ok(())
Expand All @@ -2328,7 +2328,7 @@ mod tests {
InterfoldEventData::E3Failed(data)
if data.e3_id == failure.e3_id
&& data.failed_at_stage == E3Stage::CommitteeFinalized
&& data.reason == FailureReason::InsufficientCommitteeMembers
&& data.reason == FailureReason::DecryptionTimeout
));

Ok(())
Expand Down
146 changes: 131 additions & 15 deletions crates/request/src/domain/routing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,21 @@ impl RequestRouter {

// If this e3 round has already been completed then this event is unexpected.
if completed.contains(&e3_id) {
// PlaintextAggregated triggers E3RequestComplete, which tears down the per-E3 context
// and marks the round as completed. The E3StageChanged(Complete) that arrives from the EVM
// after local teardown is therefore expected and should be ignored rather than treated as an error.
if matches!(
msg.get_data(),
// On-chain confirmation events that lag behind local teardown are expected and
// should be silently ignored rather than treated as an error.
let is_late_terminal = match msg.get_data() {
// E3StageChanged(Complete) always lags local PlaintextAggregated completion.
InterfoldEventData::E3StageChanged(data)
if matches!(data.new_stage, E3Stage::Complete)
) {
if matches!(data.new_stage, E3Stage::Complete | E3Stage::Failed) =>
{
true
}
// E3Failed from on-chain markE3Failed may arrive after a local timeout already
// cleaned up the context.
InterfoldEventData::E3Failed(data) if data.reason.is_timeout() => true,
_ => false,
};
if is_late_terminal {
return RoutingDecision::Ignore;
}
return RoutingDecision::AlreadyCompleted(e3_id);
Expand All @@ -88,8 +95,12 @@ impl RequestRouter {
{
PostForward::PublishComplete
}
// NOTE: E3Stage::Failed does NOT trigger E3RequestComplete. Failed rounds need the
// accusation/slashing lifecycle to complete before the context is torn down.
// Timeout failures have no accusation/slashing lifecycle, so the context can be
// torn down immediately. Misbehaviour failures (DKGInvalidShares, etc.) still need
// the accusation/slashing lifecycle to complete before teardown.
InterfoldEventData::E3Failed(data) if data.reason.is_timeout() => {
PostForward::PublishComplete
}
InterfoldEventData::E3RequestComplete(_) => PostForward::Teardown,
_ => PostForward::None,
};
Expand All @@ -105,8 +116,8 @@ impl RequestRouter {
mod tests {
use super::*;
use e3_events::{
E3RequestComplete, E3Stage, E3StageChanged, InterfoldEvent, PlaintextAggregated, Sequenced,
Shutdown,
E3Failed, E3RequestComplete, E3Stage, E3StageChanged, FailureReason, InterfoldEvent,
PlaintextAggregated, Sequenced, Shutdown,
};

fn e3id() -> E3id {
Expand Down Expand Up @@ -190,9 +201,9 @@ mod tests {
}

#[test]
fn stage_changed_to_failed_still_errors_when_completed() {
// E3StageChanged(Failed) after completion IS unexpected and should still error,
// because the failed path goes through accusation/slashing, not simple completion.
fn stage_changed_to_failed_ignored_when_completed() {
// E3StageChanged(Failed) from the EVM can arrive after a local timeout already cleaned up
// the context. Treat it as a silent no-op, the same way we handle E3StageChanged(Complete).
let id = e3id();
let mut completed = HashSet::new();
completed.insert(id.clone());
Expand All @@ -203,7 +214,7 @@ mod tests {
});
assert_eq!(
RequestRouter::route(&msg, &completed),
RoutingDecision::AlreadyCompleted(id)
RoutingDecision::Ignore
);
}

Expand Down Expand Up @@ -285,4 +296,109 @@ mod tests {
}
);
}

// --- timeout-triggered E3Failed tests ---

/// Test helper: wraps an `E3Failed` for `id` with the given `reason` into an
/// `InterfoldEvent` via `from_data`. The failing stage is always
/// `E3Stage::CommitteeFinalized`.
fn e3_failed(id: E3id, reason: FailureReason) -> InterfoldEvent {
    let failure = E3Failed {
        failed_at_stage: E3Stage::CommitteeFinalized,
        e3_id: id,
        reason,
    };
    from_data(failure)
}

#[test]
fn e3_failed_dkg_timeout_publishes_complete() {
    // With no rounds marked completed, a DKG timeout failure must be processed and
    // followed by a completion publish.
    let id = e3id();
    let no_completed_rounds = HashSet::new();
    let event = e3_failed(id.clone(), FailureReason::DKGTimeout);

    let expected = RoutingDecision::Process {
        e3_id: id,
        post_forward: PostForward::PublishComplete,
    };
    assert_eq!(RequestRouter::route(&event, &no_completed_rounds), expected);
}

#[test]
fn e3_failed_committee_formation_timeout_publishes_complete() {
    // With no rounds marked completed, a committee-formation timeout failure must be
    // processed and followed by a completion publish.
    let id = e3id();
    let no_completed_rounds = HashSet::new();
    let event = e3_failed(id.clone(), FailureReason::CommitteeFormationTimeout);

    let expected = RoutingDecision::Process {
        e3_id: id,
        post_forward: PostForward::PublishComplete,
    };
    assert_eq!(RequestRouter::route(&event, &no_completed_rounds), expected);
}

#[test]
fn e3_failed_compute_timeout_publishes_complete() {
    // With no rounds marked completed, a compute timeout failure must be processed and
    // followed by a completion publish.
    let id = e3id();
    let no_completed_rounds = HashSet::new();
    let event = e3_failed(id.clone(), FailureReason::ComputeTimeout);

    let expected = RoutingDecision::Process {
        e3_id: id,
        post_forward: PostForward::PublishComplete,
    };
    assert_eq!(RequestRouter::route(&event, &no_completed_rounds), expected);
}

#[test]
fn e3_failed_decryption_timeout_publishes_complete() {
    // With no rounds marked completed, a decryption timeout failure must be processed
    // and followed by a completion publish.
    let id = e3id();
    let no_completed_rounds = HashSet::new();
    let event = e3_failed(id.clone(), FailureReason::DecryptionTimeout);

    let expected = RoutingDecision::Process {
        e3_id: id,
        post_forward: PostForward::PublishComplete,
    };
    assert_eq!(RequestRouter::route(&event, &no_completed_rounds), expected);
}

#[test]
fn e3_failed_invalid_shares_does_not_complete() {
    // A slashable failure is still processed, but it must not be followed by
    // E3RequestComplete: the accusation/slashing lifecycle has to finish first.
    let id = e3id();
    let no_completed_rounds = HashSet::new();
    let event = e3_failed(id.clone(), FailureReason::DKGInvalidShares);

    let expected = RoutingDecision::Process {
        e3_id: id,
        post_forward: PostForward::None,
    };
    assert_eq!(RequestRouter::route(&event, &no_completed_rounds), expected);
}

#[test]
fn e3_failed_timeout_ignored_when_already_completed() {
    // A timeout failure for a round that is already in the completed set is dropped
    // with `Ignore` instead of being reported as `AlreadyCompleted`.
    let id = e3id();
    let completed = HashSet::from([id.clone()]);
    let event = e3_failed(id.clone(), FailureReason::DKGTimeout);

    let decision = RequestRouter::route(&event, &completed);
    assert_eq!(decision, RoutingDecision::Ignore);
}

#[test]
fn stage_changed_to_failed_ignored_when_already_completed() {
    // An E3StageChanged(Failed) for a round that is already in the completed set is
    // dropped with `Ignore`.
    // NOTE(review): this looks like the same scenario as
    // `stage_changed_to_failed_ignored_when_completed` — confirm whether both are needed.
    let id = e3id();
    let completed = HashSet::from([id.clone()]);
    let stage_change = E3StageChanged {
        e3_id: id.clone(),
        previous_stage: E3Stage::CommitteeFinalized,
        new_stage: E3Stage::Failed,
    };
    let event = from_data(stage_change);

    let decision = RequestRouter::route(&event, &completed);
    assert_eq!(decision, RoutingDecision::Ignore);
}
}
Loading