From 58b62f6f3820d0026e0abd599930886e69e7a727 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Wed, 9 Apr 2025 12:39:51 +1000 Subject: [PATCH 01/11] Update function doc only --- beacon_node/network/src/sync/network_context/custody.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody.rs index 718ce74018c..f4d010b881e 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody.rs @@ -252,8 +252,8 @@ impl ActiveCustodyRequest { // De-prioritize peers that have failed to successfully respond to // requests recently self.failed_peers.contains(peer), - // Prefer peers with less requests to load balance across peers. We - // We batch requests to the same peer, so count existance in the + // Prefer peers with fewer requests to load balance across peers. + // We batch requests to the same peer, so count existence in the // `columns_to_request_by_peer` as a single 1 request. active_request_count_by_peer.get(peer).copied().unwrap_or(0) + columns_to_request_by_peer.get(peer).map(|_| 1).unwrap_or(0), From 70d702048c626b6d04a85e4010dd14e6d1840f9b Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Fri, 2 May 2025 18:40:04 -0300 Subject: [PATCH 02/11] Request columns by range from all synced peers --- .../src/peer_manager/peerdb.rs | 19 +++++++++++++++++-- .../network/src/sync/range_sync/chain.rs | 14 +++++++++++++- beacon_node/network/src/sync/tests/lookups.rs | 7 +++++-- 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs index 083887046af..95a4e82fa21 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs @@ -1,6 +1,8 @@ use crate::discovery::enr::PEERDAS_CUSTODY_GROUP_COUNT_ENR_KEY; use crate::discovery::{peer_id_to_node_id, CombinedKey}; -use crate::{metrics, multiaddr::Multiaddr, types::Subnet, Enr, EnrExt, Gossipsub, PeerId}; +use crate::{ + metrics, multiaddr::Multiaddr, types::Subnet, Enr, EnrExt, Gossipsub, PeerId, SyncInfo, +}; use itertools::Itertools; use logging::crit; use peer_info::{ConnectionDirection, PeerConnectionStatus, PeerInfo}; @@ -15,7 +17,7 @@ use std::{ use sync_status::SyncStatus; use tracing::{debug, error, trace, warn}; use types::data_column_custody_group::compute_subnets_for_node; -use types::{ChainSpec, DataColumnSubnetId, EthSpec}; +use types::{ChainSpec, DataColumnSubnetId, Epoch, EthSpec, Hash256, Slot}; pub mod client; pub mod peer_info; @@ -735,6 +737,19 @@ impl PeerDB { }, ); + self.update_sync_status( + &peer_id, + SyncStatus::Synced { + // Fill in mock SyncInfo, only for the peer to return `is_synced() == true`. + info: SyncInfo { + head_slot: Slot::new(0), + head_root: Hash256::ZERO, + finalized_epoch: Epoch::new(0), + finalized_root: Hash256::ZERO, + }, + }, + ); + if supernode { let peer_info = self.peers.get_mut(&peer_id).expect("peer exists"); let all_subnets = (0..spec.data_column_sidecar_subnet_count) diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 5eead51b763..f576d30bfc5 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -920,13 +920,25 @@ impl SyncingChain { if let Some(batch) = self.batches.get_mut(&batch_id) { let request = batch.to_blocks_by_range_request(); let failed_peers = batch.failed_peers(); + + // TODO(das): we should request only from peers that are part of this SyncingChain. + // However, then we hit the NoPeer error frequently which causes the batch to fail and + // the SyncingChain to be dropped. We need to handle this case more gracefully. + let synced_peers = network + .network_globals() + .peers + .read() + .synced_peers() + .cloned() + .collect::>(); + match network.block_components_by_range_request( request, RangeRequestId::RangeSync { chain_id: self.id, batch_id, }, - &self.peers, + &synced_peers, &failed_peers, ) { Ok(request_id) => { diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index 84c95b2a4c8..565d7bc9f81 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -357,10 +357,13 @@ impl TestRig { pub fn new_connected_peer(&mut self) -> PeerId { let key = self.determinstic_key(); - self.network_globals + let peer_id = self + .network_globals .peers .write() - .__add_connected_peer_testing_only(false, &self.harness.spec, key) + .__add_connected_peer_testing_only(false, &self.harness.spec, key); + self.log(&format!("Added new peer for testing {peer_id:?}")); + peer_id } pub fn new_connected_supernode_peer(&mut self) -> PeerId { From 29c153a63d90c3986049c432aaa592ecb52279b6 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sat, 3 May 2025 17:25:40 -0300 Subject: [PATCH 03/11] Prioritize requests to peers with less overall requests --- .../network/src/sync/backfill_sync/mod.rs | 4 ++- .../network/src/sync/network_context.rs | 6 ++++- .../network/src/sync/range_sync/batch.rs | 7 ++++++ .../network/src/sync/range_sync/chain.rs | 25 +++++++++++++------ 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 25bc0190c2d..bc1f291fc9c 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -24,7 +24,7 @@ use lighthouse_network::{PeerAction, PeerId}; use logging::crit; use std::collections::{ btree_map::{BTreeMap, Entry}, - HashSet, + HashMap, HashSet, }; use std::sync::Arc; use tracing::{debug, error, info, instrument, warn}; @@ -920,6 +920,8 @@ impl BackFillSync { RangeRequestId::BackfillSync { batch_id }, &synced_peers, &failed_peers, + // Does not track total requests per peers for now + &HashMap::new(), ) { Ok(request_id) => { // inform the batch about the new request diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 8ea9caa9eca..084e6395bf6 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -412,6 +412,7 @@ impl SyncNetworkContext { requester: RangeRequestId, peers: &HashSet, peers_to_deprioritize: &HashSet, + total_requests_per_peer: &HashMap, ) -> Result { let batch_epoch = Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()); let batch_type = self.batch_type(batch_epoch); @@ -426,13 +427,16 @@ impl SyncNetworkContext { peers_to_deprioritize.contains(peer), // Prefer peers with less overall requests active_request_count_by_peer.get(peer).copied().unwrap_or(0), + // Prefer peers with less total cummulative requests, so we fetch data from a + // diverse set of peers + total_requests_per_peer.get(peer).copied().unwrap_or(0), // Random factor to break ties, otherwise the PeerID breaks ties rand::random::(), peer, ) }) .min() - .map(|(_, _, _, peer)| *peer) + .map(|(_, _, _, _, peer)| *peer) else { // Backfill and forward sync handle this condition gracefully. // - Backfill sync: will pause waiting for more peers to join diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 3fed7153fce..fe31e4dc2d0 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -1,4 +1,5 @@ use beacon_chain::block_verification_types::RpcBlock; +use itertools::Itertools; use lighthouse_network::rpc::methods::BlocksByRangeRequest; use lighthouse_network::service::api_types::Id; use lighthouse_network::PeerId; @@ -53,6 +54,12 @@ impl BatchPeerGroup { pub fn column(&self, index: &ColumnIndex) -> Option<&PeerId> { self.column_peers.get(index) } + + pub fn iter_unique_peers(&self) -> impl Iterator { + std::iter::once(&self.block_peer) + .chain(self.column_peers.values()) + .unique() + } } /// Allows customisation of the above constants used in other sync methods such as BackFillSync. diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index f576d30bfc5..855bd18d945 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -9,7 +9,7 @@ use beacon_chain::BeaconChainTypes; use lighthouse_network::service::api_types::Id; use lighthouse_network::{PeerAction, PeerId}; use logging::crit; -use std::collections::{btree_map::Entry, BTreeMap, HashSet}; +use std::collections::{btree_map::Entry, BTreeMap, HashMap, HashSet}; use std::fmt; use strum::IntoStaticStr; use tracing::{debug, instrument, warn}; @@ -87,9 +87,11 @@ pub struct SyncingChain { batches: BTreeMap>, /// The peers that agree on the `target_head_slot` and `target_head_root` as a canonical chain - /// and thus available to download this chain from, as well as the batches we are currently - /// requesting. - peers: HashSet, + /// and thus available to download this chain from. + /// + /// Also, For each peer tracks the total requests done per peer as part of this SyncingChain + /// `HashMap` + peers: HashMap, /// Starting epoch of the next batch that needs to be downloaded. to_be_downloaded: BatchId, @@ -148,7 +150,7 @@ impl SyncingChain { target_head_slot, target_head_root, batches: BTreeMap::new(), - peers: HashSet::from_iter([peer_id]), + peers: HashMap::from_iter([(peer_id, <_>::default())]), to_be_downloaded: start_epoch, processing_target: start_epoch, optimistic_start: None, @@ -178,7 +180,7 @@ impl SyncingChain { /// Peers currently syncing this chain. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] pub fn peers(&self) -> impl Iterator + '_ { - self.peers.iter().cloned() + self.peers.keys().cloned() } /// Progress in epochs made by the chain @@ -231,6 +233,12 @@ impl SyncingChain { request_id: Id, blocks: Vec>, ) -> ProcessingResult { + // Account for one more requests to this peer + // TODO(das): this code assumes that we do a single request per peer per RpcBlock + for peer in peer.iter_unique_peers() { + *self.peers.entry(*peer).or_default() += 1; + } + // check if we have this batch let batch = match self.batches.get_mut(&batch_id) { None => { @@ -580,7 +588,7 @@ impl SyncingChain { "Batch failed to download. Dropping chain scoring peers" ); - for peer in self.peers.drain() { + for (peer, _) in self.peers.drain() { network.report_peer(peer, penalty, "faulty_chain"); } Err(RemoveChain::ChainFailed { @@ -845,7 +853,7 @@ impl SyncingChain { network: &mut SyncNetworkContext, peer_id: PeerId, ) -> ProcessingResult { - self.peers.insert(peer_id); + self.peers.insert(peer_id, <_>::default()); self.request_batches(network) } @@ -940,6 +948,7 @@ impl SyncingChain { }, &synced_peers, &failed_peers, + &self.peers, ) { Ok(request_id) => { // inform the batch about the new request From 48c8b4f8a4376d9ba40908f6ab6d9178aa67dc9d Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 7 May 2025 02:33:05 -0300 Subject: [PATCH 04/11] Implement strong custodial attributability and individual column retries --- .../src/service/api_types.rs | 9 + .../network/src/sync/network_context.rs | 151 +++++- .../src/sync/network_context/custody.rs | 6 +- .../sync/network_context/custody_by_range.rs | 468 ++++++++++++++++++ beacon_node/network/src/sync/peer_sampling.rs | 2 +- 5 files changed, 619 insertions(+), 17 deletions(-) create mode 100644 beacon_node/network/src/sync/network_context/custody_by_range.rs diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index b36f8cc2154..ebc4b8f3643 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -63,6 +63,14 @@ pub struct DataColumnsByRangeRequestId { pub parent_request_id: ComponentsByRangeRequestId, } +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub struct CustodyByRangeRequestId { + /// Id to identify this attempt at a meta custody by range request for `parent_request_id` + pub id: Id, + /// The Id of the overall By Range request for block components. + pub parent_request_id: ComponentsByRangeRequestId, +} + /// Block components by range request for range sync. Includes an ID for downstream consumers to /// handle retries and tie all their sub requests together. #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] @@ -221,6 +229,7 @@ macro_rules! impl_display { impl_display!(BlocksByRangeRequestId, "{}/{}", id, parent_request_id); impl_display!(BlobsByRangeRequestId, "{}/{}", id, parent_request_id); impl_display!(DataColumnsByRangeRequestId, "{}/{}", id, parent_request_id); +impl_display!(CustodyByRangeRequestId, "{}/{}", id, parent_request_id); impl_display!(ComponentsByRangeRequestId, "{}/{}", id, requester); impl_display!(DataColumnsByRootRequestId, "{}/{}", id, requester); impl_display!(SingleLookupReqId, "{}/Lookup/{}", req_id, lookup_id); diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 084e6395bf6..a5bb15c4c66 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -1,7 +1,8 @@ //! Provides network functionality for the Syncing thread. This fundamentally wraps a network //! channel and stores a global RPC ID to perform requests. -use self::custody::{ActiveCustodyRequest, Error as CustodyRequestError}; +use self::custody::{ActiveCustodyByRootRequest, Error as CustodyRequestError}; +use self::custody_by_range::{ActiveCustodyByRangeRequest, Error as CustodyByRangeError}; pub use self::requests::{BlocksByRootSingleRequest, DataColumnsByRootSingleBlockRequest}; use super::block_sidecar_coupling::RangeBlockComponentsRequest; use super::manager::BlockProcessType; @@ -22,8 +23,8 @@ use lighthouse_network::rpc::{BlocksByRangeRequest, GoodbyeReason, RPCError, Req pub use lighthouse_network::service::api_types::RangeRequestId; use lighthouse_network::service::api_types::{ AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, - CustodyId, CustodyRequester, DataColumnsByRangeRequestId, DataColumnsByRootRequestId, - DataColumnsByRootRequester, Id, SingleLookupReqId, SyncRequestId, + CustodyByRangeRequestId, CustodyId, CustodyRequester, DataColumnsByRangeRequestId, + DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, SingleLookupReqId, SyncRequestId, }; use lighthouse_network::{Client, NetworkGlobals, PeerAction, PeerId, ReportSource}; use parking_lot::RwLock; @@ -41,11 +42,12 @@ use tokio::sync::mpsc; use tracing::{debug, error, span, warn, Level}; use types::blob_sidecar::FixedBlobSidecarList; use types::{ - BlobSidecar, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, ForkContext, - Hash256, SignedBeaconBlock, Slot, + BlobSidecar, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, + ForkContext, Hash256, SignedBeaconBlock, Slot, }; pub mod custody; +pub mod custody_by_range; mod requests; #[derive(Debug)] @@ -189,8 +191,11 @@ pub struct SyncNetworkContext { data_columns_by_range_requests: ActiveRequests>, - /// Mapping of active custody column requests for a block root - custody_by_root_requests: FnvHashMap>, + /// Mapping of active custody column by root requests for a block root + custody_by_root_requests: FnvHashMap>, + + /// Mapping of active custody column by range requests + custody_by_range_requests: FnvHashMap>, /// BlocksByRange requests paired with other ByRange requests for data components components_by_range_requests: @@ -248,6 +253,7 @@ impl SyncNetworkContext { blobs_by_range_requests: ActiveRequests::new("blobs_by_range"), data_columns_by_range_requests: ActiveRequests::new("data_columns_by_range"), custody_by_root_requests: <_>::default(), + custody_by_range_requests: <_>::default(), components_by_range_requests: FnvHashMap::default(), network_beacon_processor, chain, @@ -276,6 +282,7 @@ impl SyncNetworkContext { data_columns_by_range_requests, // custody_by_root_requests is a meta request of data_columns_by_root_requests custody_by_root_requests: _, + custody_by_range_requests: _, // components_by_range_requests is a meta request of various _by_range requests components_by_range_requests: _, execution_engine_state: _, @@ -379,6 +386,7 @@ impl SyncNetworkContext { data_columns_by_range_requests, // custody_by_root_requests is a meta request of data_columns_by_root_requests custody_by_root_requests: _, + custody_by_range_requests: _, // components_by_range_requests is a meta request of various _by_range requests components_by_range_requests: _, execution_engine_state: _, @@ -499,7 +507,8 @@ impl SyncNetworkContext { id, ) }) - .collect::, _>>()?; + .collect::, _>>() + .expect("remove this code"); Ok((requests, column_to_peer_map)) }) @@ -821,7 +830,7 @@ impl SyncNetworkContext { } /// Request to send a single `data_columns_by_root` request to the network. - pub fn data_column_lookup_request( + pub fn data_columns_by_root_request( &mut self, requester: DataColumnsByRootRequester, peer_id: PeerId, @@ -915,7 +924,7 @@ impl SyncNetworkContext { ); let requester = CustodyRequester(id); - let mut request = ActiveCustodyRequest::new( + let mut request = ActiveCustodyByRootRequest::new( block_root, CustodyId { requester }, &custody_indexes_to_fetch, @@ -1038,7 +1047,7 @@ impl SyncNetworkContext { peer_id: PeerId, request: DataColumnsByRangeRequest, parent_request_id: ComponentsByRangeRequestId, - ) -> Result { + ) -> Result { let id = DataColumnsByRangeRequestId { id: self.next_id(), parent_request_id, @@ -1049,7 +1058,7 @@ impl SyncNetworkContext { request: RequestType::DataColumnsByRange(request.clone()), app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), }) - .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; + .map_err(|_| "network send error")?; debug!( method = "DataColumnsByRange", @@ -1072,6 +1081,68 @@ impl SyncNetworkContext { Ok(id) } + /// Request to fetch all needed custody columns of a range of slot. This function may not send + /// any request to the network if no columns have to be fetched based on the import state of the + /// node. A custody request is a "super request" that may trigger 0 or more `data_columns_by_range` + /// requests. + pub fn send_custody_by_range_request( + &mut self, + parent_id: ComponentsByRangeRequestId, + block_slots_with_data: Vec, + epoch: Epoch, + column_indices: Vec, + lookup_peers: Arc>>, + ) -> Result { + let id = CustodyByRangeRequestId { + id: self.next_id(), + parent_request_id: parent_id, + }; + + debug!( + indices = ?column_indices, + %id, + "Starting custody columns by range request" + ); + + let mut request = ActiveCustodyByRangeRequest::new( + epoch, + block_slots_with_data, + parent_id, + &column_indices, + lookup_peers, + ); + + // Note that you can only send, but not handle a response here + match request.continue_requests(self) { + Ok(_) => { + // Ignoring the result of `continue_requests` is okay. A request that has just been + // created cannot return data immediately, it must send some request to the network + // first. And there must exist some request, `custody_indexes_to_fetch` is not empty. + self.custody_by_range_requests.insert(id, request); + Ok(id) + } + Err(e) => Err(match e { + CustodyByRangeError::NoPeer(column_index) => { + RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer(column_index)) + } + // - TooManyFailures: Should never happen, `request` has just been created, it's + // count of download_failures is 0 here + // - BadState: Should never happen, a bad state can only happen when handling a + // network response + // - UnexpectedRequestId: Never happens: this Err is only constructed handling a + // download or processing response + // - SendFailed: Should never happen unless in a bad drop sequence when shutting + // down the node + e @ (CustodyByRangeError::TooManyFailures + | CustodyByRangeError::BadState { .. } + | CustodyByRangeError::UnexpectedRequestId { .. } + | CustodyByRangeError::SendFailed { .. }) => { + RpcRequestSendError::InternalError(format!("{e:?}")) + } + }), + } + } + pub fn is_execution_engine_online(&self) -> bool { self.execution_engine_state == EngineState::Online } @@ -1402,7 +1473,7 @@ impl SyncNetworkContext { fn handle_custody_by_root_result( &mut self, id: CustodyRequester, - request: ActiveCustodyRequest, + request: ActiveCustodyByRootRequest, result: CustodyRequestResult, ) -> Option> { let span = span!( @@ -1432,6 +1503,60 @@ impl SyncNetworkContext { result } + /// Insert a downloaded column into an active custody request. Then make progress on the + /// entire request. + /// + /// ### Returns + /// + /// - `Some`: Request completed, won't make more progress. Expect requester to act on the result. + /// - `None`: Request still active, requester should do no action + #[allow(clippy::type_complexity)] + pub fn on_custody_by_range_response( + &mut self, + id: CustodyByRangeRequestId, + req_id: DataColumnsByRangeRequestId, + peer_id: PeerId, + resp: RpcResponseResult>>>, + ) -> Option> { + // Note: need to remove the request to borrow self again below. Otherwise we can't + // do nested requests + let Some(mut request) = self.custody_by_range_requests.remove(&id) else { + // TOOD(das): This log can happen if the request is error'ed early and dropped + debug!(?id, "Custody by range downloaded event for unknown request"); + return None; + }; + + let result = request.on_data_column_downloaded(peer_id, req_id, resp, self); + + self.handle_custody_by_range_result(id, request, result) + } + + fn handle_custody_by_range_result( + &mut self, + id: CustodyByRangeRequestId, + request: ActiveCustodyByRangeRequest, + result: CustodyRequestResult, + ) -> Option> { + let result = result + .map_err(RpcResponseError::CustodyRequestError) + .transpose(); + + // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to + // an Option first to use in an `if let Some() { act on result }` block. + match result.as_ref() { + Some(Ok((columns, peer_group, _))) => { + debug!(?id, count = columns.len(), peers = ?peer_group, "Custody request success, removing") + } + Some(Err(e)) => { + debug!(?id, error = ?e, "Custody request failure, removing" ) + } + None => { + self.custody_by_range_requests.insert(id, request); + } + } + result + } + pub fn send_block_for_processing( &self, id: Id, diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody.rs index f4d010b881e..44e80e3766d 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody.rs @@ -23,7 +23,7 @@ const MAX_STALE_NO_PEERS_DURATION: Duration = Duration::from_secs(30); type DataColumnSidecarList = Vec>>; -pub struct ActiveCustodyRequest { +pub struct ActiveCustodyByRootRequest { block_root: Hash256, custody_id: CustodyId, /// List of column indices this request needs to download to complete successfully @@ -62,7 +62,7 @@ struct ActiveBatchColumnsRequest { pub type CustodyRequestResult = Result, PeerGroup, Duration)>, Error>; -impl ActiveCustodyRequest { +impl ActiveCustodyByRootRequest { pub(crate) fn new( block_root: Hash256, custody_id: CustodyId, @@ -284,7 +284,7 @@ impl ActiveCustodyRequest { for (peer_id, indices) in columns_to_request_by_peer.into_iter() { let request_result = cx - .data_column_lookup_request( + .data_columns_by_root_request( DataColumnsByRootRequester::Custody(self.custody_id), peer_id, DataColumnsByRootSingleBlockRequest { diff --git a/beacon_node/network/src/sync/network_context/custody_by_range.rs b/beacon_node/network/src/sync/network_context/custody_by_range.rs new file mode 100644 index 00000000000..aee22361444 --- /dev/null +++ b/beacon_node/network/src/sync/network_context/custody_by_range.rs @@ -0,0 +1,468 @@ +use beacon_chain::validator_monitor::timestamp_now; +use beacon_chain::BeaconChainTypes; +use fnv::FnvHashMap; +use lighthouse_network::rpc::methods::DataColumnsByRangeRequest; +use lighthouse_network::service::api_types::{ + ComponentsByRangeRequestId, DataColumnsByRangeRequestId, +}; +use lighthouse_network::PeerId; +use lru_cache::LRUTimeCache; +use parking_lot::RwLock; +use rand::Rng; +use std::collections::HashSet; +use std::time::{Duration, Instant}; +use std::{collections::HashMap, marker::PhantomData, sync::Arc}; +use tracing::{debug, warn}; +use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, Epoch, EthSpec, Slot}; + +use super::{PeerGroup, RpcResponseResult, SyncNetworkContext}; + +const FAILED_PEERS_CACHE_EXPIRY_SECONDS: u64 = 5; +const MAX_STALE_NO_PEERS_DURATION: Duration = Duration::from_secs(30); + +type DataColumnSidecarList = Vec>>; + +pub struct ActiveCustodyByRangeRequest { + // TODO(das): Pass a better type for the by_range request + epoch: Epoch, + block_slots_with_data: Vec, + parent_id: ComponentsByRangeRequestId, + /// List of column indices this request needs to download to complete successfully + column_requests: FnvHashMap>, + /// Active requests for 1 or more columns each + active_batch_columns_requests: + FnvHashMap, + /// Peers that have recently failed to successfully respond to a columns by root request. + /// Having a LRUTimeCache allows this request to not have to track disconnecting peers. + failed_peers: LRUTimeCache, + /// Set of peers that claim to have imported this block and their custody columns + lookup_peers: Arc>>, + + _phantom: PhantomData, +} + +#[derive(Debug, Eq, PartialEq)] +pub enum Error { + SendFailed(&'static str), + TooManyFailures, + BadState(String), + NoPeer(ColumnIndex), + /// Received a download result for a different request id than the in-flight request. + /// There should only exist a single request at a time. Having multiple requests is a bug and + /// can result in undefined state, so it's treated as a hard error and the batch is dropped. + UnexpectedRequestId { + expected_req_id: DataColumnsByRangeRequestId, + req_id: DataColumnsByRangeRequestId, + }, +} + +struct ActiveBatchColumnsRequest { + indices: Vec, +} + +pub type CustodyByRangeRequestResult = + Result, PeerGroup, Duration)>, Error>; + +impl ActiveCustodyByRangeRequest { + pub(crate) fn new( + epoch: Epoch, + block_slots_with_data: Vec, + parent_id: ComponentsByRangeRequestId, + column_indices: &[ColumnIndex], + lookup_peers: Arc>>, + ) -> Self { + Self { + epoch, + block_slots_with_data, + parent_id, + column_requests: HashMap::from_iter( + column_indices + .iter() + .map(|index| (*index, ColumnRequest::new())), + ), + active_batch_columns_requests: <_>::default(), + failed_peers: LRUTimeCache::new(Duration::from_secs(FAILED_PEERS_CACHE_EXPIRY_SECONDS)), + lookup_peers, + _phantom: PhantomData, + } + } + + /// Insert a downloaded column into an active custody request. Then make progress on the + /// entire request. + /// + /// ### Returns + /// + /// - `Err`: Custody request has failed and will be dropped + /// - `Ok(Some)`: Custody request has successfully completed and will be dropped + /// - `Ok(None)`: Custody request still active + pub(crate) fn on_data_column_downloaded( + &mut self, + peer_id: PeerId, + req_id: DataColumnsByRangeRequestId, + resp: RpcResponseResult>, + cx: &mut SyncNetworkContext, + ) -> CustodyRequestResult { + let Some(batch_request) = self.active_batch_columns_requests.get_mut(&req_id) else { + warn!( + id = ?self.parent_id, + %req_id, + "Received custody by range response for unrequested index" + ); + return Ok(None); + }; + + match resp { + Ok((data_columns, seen_timestamp)) => { + debug!( + id = ?self.parent_id, + %req_id, + %peer_id, + count = data_columns.len(), + "Custody by range request download success" + ); + + // Map columns by index as an optimization to not loop the returned list on each + // requested index. The worse case is 128 loops over a 128 item vec + mutation to + // drop the consumed columns. + let mut data_columns_by_index = + HashMap::<(ColumnIndex, Slot), Arc>>::new(); + for data_column in data_columns { + data_columns_by_index + .insert((data_column.index, data_column.slot()), data_column); + } + + // Accumulate columns that the peer does not have to issue a single log per request + let mut missing_column_indexes = vec![]; + + for index in &batch_request.indices { + let column_request = self + .column_requests + .get_mut(index) + .ok_or(Error::BadState(format!("unknown column_index {index}")))?; + + let columns_at_index = self + .block_slots_with_data + .iter() + .map(|slot| { + if let Some(data_column) = + data_columns_by_index.remove(&(*index, *slot)) + { + Ok(data_column) + } else { + // The following three statements are true: + // - block at `slot` is not missed, and has data + // - peer custodies this column `index` + // - peer claims to be synced to at least `slot` + // + // Therefore not returning this column is an protocol violation that we + // penalize and mark the peer as failed to retry with another peer. + // + // TODO(das) do not consider this case a success. We know for sure the block has + // data. However we allow the peer to return empty as we can't attribute fault. + // TODO(das): Should track which columns are missing and eventually give up + // TODO(das): If the peer is in the lookup peer set it claims to have imported + // the block AND its custody columns. So in this case we can downscore + Err(format!("Missing column at slot {slot} index {index}")) + } + }) + .collect::, _>>(); + + match columns_at_index { + Ok(columns_at_index) => { + column_request.on_download_success( + req_id, + peer_id, + columns_at_index, + seen_timestamp, + )?; + } + Err(reason) => { + column_request.on_download_error(req_id)?; + missing_column_indexes.push(index); + } + } + } + + if !missing_column_indexes.is_empty() { + // Note: Batch logging that columns are missing to not spam logger + debug!( + id = ?self.parent_id, + %req_id, + %peer_id, + // TODO(das): this property can become very noisy, being the full range 0..128 + ?missing_column_indexes, + "Custody by range peer claims to not have some data" + ); + + self.failed_peers.insert(peer_id); + } + } + Err(err) => { + debug!( + id = ?self.parent_id, + %req_id, + %peer_id, + error = ?err, + "Custody by range download error" + ); + + // TODO(das): Should mark peer as failed and try from another peer + for column_index in &batch_request.indices { + self.column_requests + .get_mut(column_index) + .ok_or(Error::BadState("unknown column_index".to_owned()))? + .on_download_error_and_mark_failure(req_id)?; + } + + self.failed_peers.insert(peer_id); + } + }; + + self.continue_requests(cx) + } + + pub(crate) fn continue_requests( + &mut self, + cx: &mut SyncNetworkContext, + ) -> CustodyRequestResult { + if self.column_requests.values().all(|r| r.is_downloaded()) { + // All requests have completed successfully. + let mut peers = HashMap::>::new(); + let mut seen_timestamps = vec![]; + let columns = std::mem::take(&mut self.column_requests) + .into_values() + .map(|request| { + let (peer, data_columns, seen_timestamp) = request.complete()?; + + for data_column in &data_columns { + let columns_by_peer = peers.entry(peer).or_default(); + if !columns_by_peer.contains(&(data_column.index as usize)) { + columns_by_peer.push(data_column.index as usize); + } + } + + seen_timestamps.push(seen_timestamp); + + Ok(data_columns) + }) + .collect::, _>>()? + // Flatten Vec> to Vec + // TODO(das): maybe not optimal for the coupling logic later + .into_iter() + .flatten() + .collect(); + + let peer_group = PeerGroup::from_set(peers); + let max_seen_timestamp = seen_timestamps.into_iter().max().unwrap_or(timestamp_now()); + return Ok(Some((columns, peer_group, max_seen_timestamp))); + } + + let active_request_count_by_peer = cx.active_request_count_by_peer(); + let mut columns_to_request_by_peer = HashMap::>::new(); + let lookup_peers = self.lookup_peers.read(); + + // Need to: + // - track how many active requests a peer has for load balancing + // - which peers have failures to attempt others + // - which peer returned what to have PeerGroup attributability + + for (column_index, request) in self.column_requests.iter_mut() { + if let Some(wait_duration) = request.is_awaiting_download() { + if request.download_failures > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { + return Err(Error::TooManyFailures); + } + + // TODO(das): When is a fork and only a subset of your peers know about a block, we should + // only query the peers on that fork. Should this case be handled? How to handle it? + let custodial_peers = cx.get_custodial_peers(*column_index); + + // We draw from the total set of peers, but prioritize those peers who we have + // received an attestation / status / block message claiming to have imported the + // lookup. The frequency of those messages is low, so drawing only from lookup_peers + // could cause many lookups to take much longer or fail as they don't have enough + // custody peers on a given column + let mut priorized_peers = custodial_peers + .iter() + .map(|peer| { + ( + // Prioritize peers that claim to know have imported this block + if lookup_peers.contains(peer) { 0 } else { 1 }, + // De-prioritize peers that have failed to successfully respond to + // requests recently + self.failed_peers.contains(peer), + // Prefer peers with fewer requests to load balance across peers. + // We batch requests to the same peer, so count existence in the + // `columns_to_request_by_peer` as a single 1 request. + active_request_count_by_peer.get(peer).copied().unwrap_or(0) + + columns_to_request_by_peer.get(peer).map(|_| 1).unwrap_or(0), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::thread_rng().gen::(), + *peer, + ) + }) + .collect::>(); + priorized_peers.sort_unstable(); + + if let Some((_, _, _, _, peer_id)) = priorized_peers.first() { + columns_to_request_by_peer + .entry(*peer_id) + .or_default() + .push(*column_index); + } else if wait_duration > MAX_STALE_NO_PEERS_DURATION { + // Allow to request to sit stale in `NotStarted` state for at most + // `MAX_STALE_NO_PEERS_DURATION`, else error and drop the request. Note that + // lookup will naturally retry when other peers send us attestations for + // descendants of this un-available lookup. + return Err(Error::NoPeer(*column_index)); + } else { + // Do not issue requests if there is no custody peer on this column + } + } + } + + for (peer_id, indices) in columns_to_request_by_peer.into_iter() { + let req_id = cx + .send_data_columns_by_range_request( + peer_id, + DataColumnsByRangeRequest { + // TODO(das): generalize with constants from batch + start_slot: self + .epoch + .start_slot(T::EthSpec::slots_per_epoch()) + .as_u64(), + count: T::EthSpec::slots_per_epoch(), + columns: indices.clone(), + }, + self.parent_id, + ) + .map_err(Error::SendFailed)?; + + for column_index in &indices { + let column_request = self + .column_requests + .get_mut(column_index) + // Should never happen: column_index is iterated from column_requests + .ok_or(Error::BadState("unknown column_index".to_owned()))?; + + column_request.on_download_start(req_id)?; + } + + self.active_batch_columns_requests + .insert(req_id, ActiveBatchColumnsRequest { indices }); + } + + Ok(None) + } +} + +/// TODO(das): this attempt count is nested into the existing lookup request count. +const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; + +struct ColumnRequest { + status: Status, + download_failures: usize, +} + +#[derive(Debug, Clone)] +enum Status { + NotStarted(Instant), + Downloading(DataColumnsByRangeRequestId), + Downloaded(PeerId, DataColumnSidecarList, Duration), +} + +impl ColumnRequest { + fn new() -> Self { + Self { + status: Status::NotStarted(Instant::now()), + download_failures: 0, + } + } + + fn is_awaiting_download(&self) -> Option { + match self.status { + Status::NotStarted(start_time) => Some(start_time.elapsed()), + Status::Downloading { .. } | Status::Downloaded { .. } => None, + } + } + + fn is_downloaded(&self) -> bool { + match self.status { + Status::NotStarted { .. } | Status::Downloading { .. } => false, + Status::Downloaded { .. } => true, + } + } + + fn on_download_start(&mut self, req_id: DataColumnsByRangeRequestId) -> Result<(), Error> { + match &self.status { + Status::NotStarted { .. } => { + self.status = Status::Downloading(req_id); + Ok(()) + } + other => Err(Error::BadState(format!( + "bad state on_download_start expected NotStarted got {other:?}" + ))), + } + } + + fn on_download_error(&mut self, req_id: DataColumnsByRangeRequestId) -> Result<(), Error> { + match &self.status { + Status::Downloading(expected_req_id) => { + if req_id != *expected_req_id { + return Err(Error::UnexpectedRequestId { + expected_req_id: *expected_req_id, + req_id, + }); + } + self.status = Status::NotStarted(Instant::now()); + Ok(()) + } + other => Err(Error::BadState(format!( + "bad state on_download_error expected Downloading got {other:?}" + ))), + } + } + + fn on_download_error_and_mark_failure( + &mut self, + req_id: DataColumnsByRangeRequestId, + ) -> Result<(), Error> { + // TODO(das): Should track which peers don't have data + self.download_failures += 1; + self.on_download_error(req_id) + } + + fn on_download_success( + &mut self, + req_id: DataColumnsByRangeRequestId, + peer_id: PeerId, + data_column: DataColumnSidecarList, + seen_timestamp: Duration, + ) -> Result<(), Error> { + match &self.status { + Status::Downloading(expected_req_id) => { + if req_id != *expected_req_id { + return Err(Error::UnexpectedRequestId { + expected_req_id: *expected_req_id, + req_id, + }); + } + self.status = Status::Downloaded(peer_id, data_column, seen_timestamp); + Ok(()) + } + other => Err(Error::BadState(format!( + "bad state on_download_success expected Downloading got {other:?}" + ))), + } + } + + fn complete(self) -> Result<(PeerId, DataColumnSidecarList, Duration), Error> { + match self.status { + Status::Downloaded(peer_id, data_column, seen_timestamp) => { + Ok((peer_id, data_column, seen_timestamp)) + } + other => Err(Error::BadState(format!( + "bad state complete expected Downloaded got {other:?}" + ))), + } + } +} diff --git a/beacon_node/network/src/sync/peer_sampling.rs b/beacon_node/network/src/sync/peer_sampling.rs index 59b751787e3..71113e8bcc9 100644 --- a/beacon_node/network/src/sync/peer_sampling.rs +++ b/beacon_node/network/src/sync/peer_sampling.rs @@ -570,7 +570,7 @@ impl ActiveSamplingRequest { // Send requests. let mut sent_request = false; for (peer_id, column_indexes) in column_indexes_to_request { - cx.data_column_lookup_request( + cx.data_columns_by_root_request( DataColumnsByRootRequester::Sampling(SamplingId { id: self.requester_id, sampling_request_id: self.current_sampling_request_id, From c8c8329e65788b459e62985fb23fb412c685a360 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 8 May 2025 10:57:38 -0300 Subject: [PATCH 05/11] More fixes and tests --- .../src/block_verification_types.rs | 6 +- beacon_node/beacon_chain/src/test_utils.rs | 3 +- .../src/service/api_types.rs | 17 +- .../network/src/sync/backfill_sync/mod.rs | 7 +- .../network/src/sync/block_lookups/mod.rs | 6 +- .../src/sync/block_sidecar_coupling.rs | 587 ------------------ beacon_node/network/src/sync/manager.rs | 53 +- beacon_node/network/src/sync/mod.rs | 1 - .../network/src/sync/network_context.rs | 347 ++++------- .../block_components_by_range.rs | 452 ++++++++++++++ .../sync/network_context/custody_by_range.rs | 128 +++- beacon_node/network/src/sync/peer_sampling.rs | 10 +- .../network/src/sync/range_sync/batch.rs | 10 +- .../network/src/sync/range_sync/chain.rs | 8 +- .../network/src/sync/range_sync/mod.rs | 2 +- .../network/src/sync/range_sync/range.rs | 3 +- beacon_node/network/src/sync/tests/lookups.rs | 33 +- beacon_node/network/src/sync/tests/mod.rs | 7 +- beacon_node/network/src/sync/tests/range.rs | 425 ++++++++++--- consensus/types/src/signed_beacon_block.rs | 4 + 20 files changed, 1114 insertions(+), 995 deletions(-) delete mode 100644 beacon_node/network/src/sync/block_sidecar_coupling.rs create mode 100644 beacon_node/network/src/sync/network_context/block_components_by_range.rs diff --git a/beacon_node/beacon_chain/src/block_verification_types.rs b/beacon_node/beacon_chain/src/block_verification_types.rs index dbc2494e8b7..55cfd1c32eb 100644 --- a/beacon_node/beacon_chain/src/block_verification_types.rs +++ b/beacon_node/beacon_chain/src/block_verification_types.rs @@ -199,13 +199,15 @@ impl RpcBlock { custody_columns: Vec>, expected_custody_indices: Vec, spec: &ChainSpec, - ) -> Result { + ) -> Result { let block_root = block_root.unwrap_or_else(|| get_block_root(&block)); let custody_columns_count = expected_custody_indices.len(); let inner = RpcBlockInner::BlockAndCustodyColumns( block, - RuntimeVariableList::new(custody_columns, spec.number_of_columns as usize)?, + RuntimeVariableList::new(custody_columns, spec.number_of_columns as usize) + // This is an internal error that should never happen + .map_err(|e| format!("custody_columns variable list error: {e:?}"))?, expected_custody_indices, ); Ok(Self { diff --git a/beacon_node/beacon_chain/src/test_utils.rs b/beacon_node/beacon_chain/src/test_utils.rs index 858aaafcf07..8f5a119fb5d 100644 --- a/beacon_node/beacon_chain/src/test_utils.rs +++ b/beacon_node/beacon_chain/src/test_utils.rs @@ -2418,7 +2418,8 @@ where columns, expected_custody_indices, &self.spec, - )? + ) + .map_err(BlockError::InternalError)? } else { RpcBlock::new_without_blobs(Some(block_root), block, sampling_column_count) } diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index ebc4b8f3643..8300ad4bb89 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -59,8 +59,8 @@ pub struct BlobsByRangeRequestId { pub struct DataColumnsByRangeRequestId { /// Id to identify this attempt at a data_columns_by_range request for `parent_request_id` pub id: Id, - /// The Id of the overall By Range request for block components. - pub parent_request_id: ComponentsByRangeRequestId, + /// The Id of the parent custody by range request that issued this data_columns_by_range request + pub parent_request_id: CustodyByRangeRequestId, } #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] @@ -308,14 +308,17 @@ mod tests { fn display_id_data_columns_by_range() { let id = DataColumnsByRangeRequestId { id: 123, - parent_request_id: ComponentsByRangeRequestId { + parent_request_id: CustodyByRangeRequestId { id: 122, - requester: RangeRequestId::RangeSync { - chain_id: 54, - batch_id: Epoch::new(0), + parent_request_id: ComponentsByRangeRequestId { + id: 121, + requester: RangeRequestId::RangeSync { + chain_id: 54, + batch_id: Epoch::new(0), + }, }, }, }; - assert_eq!(format!("{id}"), "123/122/RangeSync/0/54"); + assert_eq!(format!("{id}"), "123/122/121/RangeSync/0/54"); } } diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index bc1f291fc9c..7eed5416c16 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -20,7 +20,7 @@ use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; use lighthouse_network::service::api_types::Id; use lighthouse_network::types::{BackFillState, NetworkGlobals}; -use lighthouse_network::{PeerAction, PeerId}; +use lighthouse_network::PeerAction; use logging::crit; use std::collections::{ btree_map::{BTreeMap, Entry}, @@ -311,7 +311,6 @@ impl BackFillSync { &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, - peer_id: &PeerId, request_id: Id, err: RpcResponseError, ) -> Result<(), BackFillError> { @@ -325,7 +324,9 @@ impl BackFillSync { return Ok(()); } debug!(batch_epoch = %batch_id, error = ?err, "Batch download failed"); - match batch.download_failed(Some(*peer_id)) { + // TODO(das): Is it necessary for the batch to track failed peers? Can we make this + // mechanism compatible with PeerDAS and before PeerDAS? + match batch.download_failed(None) { Err(e) => self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)), Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { self.fail_sync(BackFillError::BatchDownloadFailed(batch_id)) diff --git a/beacon_node/network/src/sync/block_lookups/mod.rs b/beacon_node/network/src/sync/block_lookups/mod.rs index 8c884f644e1..2c59f710d04 100644 --- a/beacon_node/network/src/sync/block_lookups/mod.rs +++ b/beacon_node/network/src/sync/block_lookups/mod.rs @@ -494,7 +494,7 @@ impl BlockLookups { let Some(lookup) = self.single_block_lookups.get_mut(&id.lookup_id) else { // We don't have the ability to cancel in-flight RPC requests. So this can happen // if we started this RPC request, and later saw the block/blobs via gossip. - debug!(?id, "Block returned for single block lookup not present"); + debug!(%id, "Block returned for single block lookup not present"); return Err(LookupRequestError::UnknownLookup); }; @@ -507,7 +507,7 @@ impl BlockLookups { Ok((response, peer_group, seen_timestamp)) => { debug!( ?block_root, - ?id, + %id, ?peer_group, ?response_type, "Received lookup download success" @@ -540,7 +540,7 @@ impl BlockLookups { // the peer and the request ID which is linked to this `id` value here. debug!( ?block_root, - ?id, + %id, ?response_type, error = ?e, "Received lookup download failure" diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs deleted file mode 100644 index 90397649cc5..00000000000 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ /dev/null @@ -1,587 +0,0 @@ -use beacon_chain::{ - block_verification_types::RpcBlock, data_column_verification::CustodyDataColumn, get_block_root, -}; -use lighthouse_network::{ - service::api_types::{ - BlobsByRangeRequestId, BlocksByRangeRequestId, DataColumnsByRangeRequestId, - }, - PeerId, -}; -use std::{ - collections::{HashMap, HashSet}, - sync::Arc, -}; -use types::{ - BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, - Hash256, RuntimeVariableList, SignedBeaconBlock, Slot, -}; - -use super::range_sync::BatchPeerGroup; - -pub struct RangeBlockComponentsRequest { - /// Blocks we have received awaiting for their corresponding sidecar. - blocks_request: ByRangeRequest>>>, - /// Sidecars we have received awaiting for their corresponding block. - block_data_request: RangeBlockDataRequest, -} - -enum ByRangeRequest { - Active(I), - Complete(T, PeerId), -} - -enum RangeBlockDataRequest { - /// All pre-deneb blocks - NoData, - /// All post-Deneb blocks, regardless of if they have data or not - Blobs(ByRangeRequest>>>), - /// All post-Fulu blocks, regardless of if they have data or not - DataColumns { - requests: HashMap< - DataColumnsByRangeRequestId, - ByRangeRequest>, - >, - expected_column_to_peer: HashMap, - }, -} - -impl RangeBlockComponentsRequest { - pub fn new( - blocks_req_id: BlocksByRangeRequestId, - blobs_req_id: Option, - data_columns: Option<( - Vec, - HashMap, - )>, - ) -> Self { - let block_data_request = if let Some(blobs_req_id) = blobs_req_id { - RangeBlockDataRequest::Blobs(ByRangeRequest::Active(blobs_req_id)) - } else if let Some((requests, expected_column_to_peer)) = data_columns { - RangeBlockDataRequest::DataColumns { - requests: requests - .into_iter() - .map(|id| (id, ByRangeRequest::Active(id))) - .collect(), - expected_column_to_peer, - } - } else { - RangeBlockDataRequest::NoData - }; - - Self { - blocks_request: ByRangeRequest::Active(blocks_req_id), - block_data_request, - } - } - - pub fn add_blocks( - &mut self, - req_id: BlocksByRangeRequestId, - blocks: Vec>>, - peer_id: PeerId, - ) -> Result<(), String> { - self.blocks_request.finish(req_id, blocks, peer_id) - } - - pub fn add_blobs( - &mut self, - req_id: BlobsByRangeRequestId, - blobs: Vec>>, - peer_id: PeerId, - ) -> Result<(), String> { - match &mut self.block_data_request { - RangeBlockDataRequest::NoData => Err("received blobs but expected no data".to_owned()), - RangeBlockDataRequest::Blobs(ref mut req) => req.finish(req_id, blobs, peer_id), - RangeBlockDataRequest::DataColumns { .. } => { - Err("received blobs but expected data columns".to_owned()) - } - } - } - - pub fn add_custody_columns( - &mut self, - req_id: DataColumnsByRangeRequestId, - columns: Vec>>, - peer_id: PeerId, - ) -> Result<(), String> { - match &mut self.block_data_request { - RangeBlockDataRequest::NoData => { - Err("received data columns but expected no data".to_owned()) - } - RangeBlockDataRequest::Blobs(_) => { - Err("received data columns but expected blobs".to_owned()) - } - RangeBlockDataRequest::DataColumns { - ref mut requests, .. - } => { - let req = requests - .get_mut(&req_id) - .ok_or(format!("unknown data columns by range req_id {req_id}"))?; - req.finish(req_id, columns, peer_id) - } - } - } - - #[allow(clippy::type_complexity)] - pub fn responses( - &self, - spec: &ChainSpec, - ) -> Option>, BatchPeerGroup), String>> { - let Some((blocks, &block_peer)) = self.blocks_request.to_finished() else { - return None; - }; - - match &self.block_data_request { - RangeBlockDataRequest::NoData => Some( - Self::responses_with_blobs(blocks.to_vec(), vec![], spec) - .map(|blocks| (blocks, BatchPeerGroup::new_from_block_peer(block_peer))), - ), - RangeBlockDataRequest::Blobs(request) => { - let Some((blobs, _blob_peer)) = request.to_finished() else { - return None; - }; - Some( - Self::responses_with_blobs(blocks.to_vec(), blobs.to_vec(), spec) - .map(|blocks| (blocks, BatchPeerGroup::new_from_block_peer(block_peer))), - ) - } - RangeBlockDataRequest::DataColumns { - requests, - expected_column_to_peer, - } => { - let mut data_columns = vec![]; - let mut column_peers = HashMap::new(); - for req in requests.values() { - let Some((resp_columns, column_peer)) = req.to_finished() else { - return None; - }; - data_columns.extend(resp_columns.clone()); - for column in resp_columns { - column_peers.insert(column.index, *column_peer); - } - } - - Some( - Self::responses_with_custody_columns( - blocks.to_vec(), - data_columns, - expected_column_to_peer.clone(), - spec, - ) - .map(|blocks| (blocks, BatchPeerGroup::new(block_peer, column_peers))), - ) - } - } - } - - fn responses_with_blobs( - blocks: Vec>>, - blobs: Vec>>, - spec: &ChainSpec, - ) -> Result>, String> { - // There can't be more more blobs than blocks. i.e. sending any blob (empty - // included) for a skipped slot is not permitted. - let mut responses = Vec::with_capacity(blocks.len()); - let mut blob_iter = blobs.into_iter().peekable(); - for block in blocks.into_iter() { - let max_blobs_per_block = spec.max_blobs_per_block(block.epoch()) as usize; - let mut blob_list = Vec::with_capacity(max_blobs_per_block); - while { - let pair_next_blob = blob_iter - .peek() - .map(|sidecar| sidecar.slot() == block.slot()) - .unwrap_or(false); - pair_next_blob - } { - blob_list.push(blob_iter.next().ok_or("Missing next blob".to_string())?); - } - - let mut blobs_buffer = vec![None; max_blobs_per_block]; - for blob in blob_list { - let blob_index = blob.index as usize; - let Some(blob_opt) = blobs_buffer.get_mut(blob_index) else { - return Err("Invalid blob index".to_string()); - }; - if blob_opt.is_some() { - return Err("Repeat blob index".to_string()); - } else { - *blob_opt = Some(blob); - } - } - let blobs = RuntimeVariableList::new( - blobs_buffer.into_iter().flatten().collect::>(), - max_blobs_per_block, - ) - .map_err(|_| "Blobs returned exceeds max length".to_string())?; - responses.push(RpcBlock::new(None, block, Some(blobs)).map_err(|e| format!("{e:?}"))?) - } - - // if accumulated sidecars is not empty, throw an error. - if blob_iter.next().is_some() { - return Err("Received sidecars that don't pair well".to_string()); - } - - Ok(responses) - } - - fn responses_with_custody_columns( - blocks: Vec>>, - data_columns: DataColumnSidecarList, - expected_custody_columns: HashMap, - spec: &ChainSpec, - ) -> Result>, String> { - // Group data columns by block_root and index - let mut custody_columns_by_block = HashMap::>>::new(); - let mut block_roots_by_slot = HashMap::>::new(); - let expected_custody_indices = expected_custody_columns.keys().cloned().collect::>(); - - for column in data_columns { - let block_root = column.block_root(); - let index = column.index; - - block_roots_by_slot - .entry(column.slot()) - .or_default() - .insert(block_root); - - // Sanity check before casting to `CustodyDataColumn`. But this should never happen - if !expected_custody_columns.contains_key(&index) { - return Err(format!( - "Received column not in expected custody indices {index}" - )); - } - - custody_columns_by_block - .entry(block_root) - .or_default() - .push(CustodyDataColumn::from_asserted_custody(column)); - } - - // Now iterate all blocks ensuring that the block roots of each block and data column match, - // plus we have columns for our custody requirements - let rpc_blocks = blocks - .into_iter() - .map(|block| { - let block_root = get_block_root(&block); - block_roots_by_slot - .entry(block.slot()) - .or_default() - .insert(block_root); - - let custody_columns = custody_columns_by_block - .remove(&block_root) - .unwrap_or_default(); - - RpcBlock::new_with_custody_columns( - Some(block_root), - block, - custody_columns, - expected_custody_indices.clone(), - spec, - ) - .map_err(|e| format!("{e:?}")) - }) - .collect::, _>>()?; - - // Assert that there are no columns left for other blocks - if !custody_columns_by_block.is_empty() { - let remaining_roots = custody_columns_by_block.keys().collect::>(); - return Err(format!("Not all columns consumed: {remaining_roots:?}")); - } - - for (_slot, block_roots) in block_roots_by_slot { - if block_roots.len() > 1 { - // TODO: Some peer(s) are faulty or malicious. This batch will fail processing but - // we want to send it to the process to better attribute fault. Maybe warn log for - // now and track it in a metric? - } - } - - Ok(rpc_blocks) - } -} - -impl ByRangeRequest { - fn finish(&mut self, id: I, data: T, peer_id: PeerId) -> Result<(), String> { - match self { - Self::Active(expected_id) => { - if expected_id != &id { - return Err(format!("unexpected req_id expected {expected_id} got {id}")); - } - *self = Self::Complete(data, peer_id); - Ok(()) - } - Self::Complete(_, _) => Err("request already complete".to_owned()), - } - } - - fn to_finished(&self) -> Option<(&T, &PeerId)> { - match self { - Self::Active(_) => None, - Self::Complete(data, peer_id) => Some((data, peer_id)), - } - } -} - -#[cfg(test)] -mod tests { - use super::RangeBlockComponentsRequest; - use beacon_chain::test_utils::{ - generate_rand_block_and_blobs, generate_rand_block_and_data_columns, test_spec, NumBlobs, - }; - use lighthouse_network::{ - service::api_types::{ - BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, - DataColumnsByRangeRequestId, Id, RangeRequestId, - }, - PeerId, - }; - use rand::SeedableRng; - use std::{collections::HashMap, sync::Arc}; - use types::{test_utils::XorShiftRng, Epoch, ForkName, MinimalEthSpec as E, SignedBeaconBlock}; - - fn components_id() -> ComponentsByRangeRequestId { - ComponentsByRangeRequestId { - id: 0, - requester: RangeRequestId::RangeSync { - chain_id: 1, - batch_id: Epoch::new(0), - }, - } - } - - fn blocks_id(parent_request_id: ComponentsByRangeRequestId) -> BlocksByRangeRequestId { - BlocksByRangeRequestId { - id: 1, - parent_request_id, - } - } - - fn blobs_id(parent_request_id: ComponentsByRangeRequestId) -> BlobsByRangeRequestId { - BlobsByRangeRequestId { - id: 1, - parent_request_id, - } - } - - fn columns_id( - id: Id, - parent_request_id: ComponentsByRangeRequestId, - ) -> DataColumnsByRangeRequestId { - DataColumnsByRangeRequestId { - id, - parent_request_id, - } - } - - fn is_finished(info: &RangeBlockComponentsRequest) -> bool { - let spec = test_spec::(); - info.responses(&spec).is_some() - } - - #[test] - fn no_blobs_into_responses() { - let spec = test_spec::(); - let peer = PeerId::random(); - let mut rng = XorShiftRng::from_seed([42; 16]); - let blocks = (0..4) - .map(|_| { - generate_rand_block_and_blobs::(ForkName::Base, NumBlobs::None, &mut rng, &spec) - .0 - .into() - }) - .collect::>>>(); - - let blocks_req_id = blocks_id(components_id()); - let mut info = RangeBlockComponentsRequest::::new(blocks_req_id, None, None); - - // Send blocks and complete terminate response - info.add_blocks(blocks_req_id, blocks, peer).unwrap(); - - // Assert response is finished and RpcBlocks can be constructed - info.responses(&test_spec::()).unwrap().unwrap(); - } - - #[test] - fn empty_blobs_into_responses() { - let spec = test_spec::(); - let peer = PeerId::random(); - let mut rng = XorShiftRng::from_seed([42; 16]); - let blocks = (0..4) - .map(|_| { - // Always generate some blobs. - generate_rand_block_and_blobs::( - ForkName::Deneb, - NumBlobs::Number(3), - &mut rng, - &spec, - ) - .0 - .into() - }) - .collect::>>>(); - - let components_id = components_id(); - let blocks_req_id = blocks_id(components_id); - let blobs_req_id = blobs_id(components_id); - let mut info = - RangeBlockComponentsRequest::::new(blocks_req_id, Some(blobs_req_id), None); - - // Send blocks and complete terminate response - info.add_blocks(blocks_req_id, blocks, peer).unwrap(); - // Expect no blobs returned - info.add_blobs(blobs_req_id, vec![], peer).unwrap(); - - // Assert response is finished and RpcBlocks can be constructed, even if blobs weren't returned. - // This makes sure we don't expect blobs here when they have expired. Checking this logic should - // be hendled elsewhere. - info.responses(&test_spec::()).unwrap().unwrap(); - } - - #[test] - fn rpc_block_with_custody_columns() { - let spec = test_spec::(); - let peer = PeerId::random(); - let expects_custody_columns = [1, 2, 3, 4]; - let mut rng = XorShiftRng::from_seed([42; 16]); - let blocks = (0..4) - .map(|_| { - generate_rand_block_and_data_columns::( - ForkName::Fulu, - NumBlobs::Number(1), - &mut rng, - &spec, - ) - }) - .collect::>(); - - let components_id = components_id(); - let blocks_req_id = blocks_id(components_id); - let columns_req_id = expects_custody_columns - .iter() - .enumerate() - .map(|(i, _)| columns_id(i as Id, components_id)) - .collect::>(); - - let column_to_peer = expects_custody_columns - .iter() - .map(|index| (*index, peer)) - .collect::>(); - - let mut info = RangeBlockComponentsRequest::::new( - blocks_req_id, - None, - Some((columns_req_id.clone(), column_to_peer)), - ); - // Send blocks and complete terminate response - info.add_blocks( - blocks_req_id, - blocks.iter().map(|b| b.0.clone().into()).collect(), - peer, - ) - .unwrap(); - // Assert response is not finished - assert!(!is_finished(&info)); - - // Send data columns - for (i, &column_index) in expects_custody_columns.iter().enumerate() { - info.add_custody_columns( - columns_req_id.get(i).copied().unwrap(), - blocks - .iter() - .flat_map(|b| b.1.iter().filter(|d| d.index == column_index).cloned()) - .collect(), - peer, - ) - .unwrap(); - - if i < expects_custody_columns.len() - 1 { - assert!( - !is_finished(&info), - "requested should not be finished at loop {i}" - ); - } - } - - // All completed construct response - info.responses(&spec).unwrap().unwrap(); - } - - #[test] - fn rpc_block_with_custody_columns_batched() { - let spec = test_spec::(); - let peer = PeerId::random(); - let batched_column_requests = [vec![1_u64, 2], vec![3, 4]]; - let expects_custody_columns = batched_column_requests - .iter() - .flatten() - .map(|index| (*index, peer)) - .collect::>(); - let custody_column_request_ids = - (0..batched_column_requests.len() as u32).collect::>(); - let num_of_data_column_requests = custody_column_request_ids.len(); - - let components_id = components_id(); - let blocks_req_id = blocks_id(components_id); - let columns_req_id = batched_column_requests - .iter() - .enumerate() - .map(|(i, _)| columns_id(i as Id, components_id)) - .collect::>(); - - let mut info = RangeBlockComponentsRequest::::new( - blocks_req_id, - None, - Some((columns_req_id.clone(), expects_custody_columns.clone())), - ); - - let mut rng = XorShiftRng::from_seed([42; 16]); - let blocks = (0..4) - .map(|_| { - generate_rand_block_and_data_columns::( - ForkName::Fulu, - NumBlobs::Number(1), - &mut rng, - &spec, - ) - }) - .collect::>(); - - // Send blocks and complete terminate response - info.add_blocks( - blocks_req_id, - blocks.iter().map(|b| b.0.clone().into()).collect(), - peer, - ) - .unwrap(); - // Assert response is not finished - assert!(!is_finished(&info)); - - for (i, column_indices) in batched_column_requests.iter().enumerate() { - // Send the set of columns in the same batch request - info.add_custody_columns( - columns_req_id.get(i).copied().unwrap(), - blocks - .iter() - .flat_map(|b| { - b.1.iter() - .filter(|d| column_indices.contains(&d.index)) - .cloned() - }) - .collect::>(), - peer, - ) - .unwrap(); - - if i < num_of_data_column_requests - 1 { - assert!( - !is_finished(&info), - "requested should not be finished at loop {i}" - ); - } - } - - // All completed construct response - info.responses(&spec).unwrap().unwrap(); - } -} diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index a490053e316..f8a6c748d18 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -1166,10 +1166,9 @@ impl SyncManager { block: RpcEvent>>, ) { if let Some(resp) = self.network.on_blocks_by_range_response(id, peer_id, block) { - self.on_range_components_response( + self.on_block_components_by_range_response( id.parent_request_id, - peer_id, - RangeBlockComponent::Block(id, resp), + RangeBlockComponent::Block(id, resp, peer_id), ); } } @@ -1181,10 +1180,9 @@ impl SyncManager { blob: RpcEvent>>, ) { if let Some(resp) = self.network.on_blobs_by_range_response(id, peer_id, blob) { - self.on_range_components_response( + self.on_block_components_by_range_response( id.parent_request_id, - peer_id, - RangeBlockComponent::Blob(id, resp), + RangeBlockComponent::Blob(id, resp, peer_id), ); } } @@ -1195,15 +1193,34 @@ impl SyncManager { peer_id: PeerId, data_column: RpcEvent>>, ) { + // data_columns_by_range returns either an Ok list of data columns, or an RpcResponseError if let Some(resp) = self .network .on_data_columns_by_range_response(id, peer_id, data_column) { - self.on_range_components_response( - id.parent_request_id, - peer_id, - RangeBlockComponent::CustodyColumns(id, resp), - ); + // custody_by_range accumulates the results of multiple data_columns_by_range requests + // returning a bigger list of data columns across all the column indices this node has + // to custody + if let Some(result) = + self.network + .on_custody_by_range_response(id.parent_request_id, id, peer_id, resp) + { + // TODO(das): Improve the type of RangeBlockComponent::CustodyColumns, not + // not have to pass a PeerGroup in case of error + let peers = match &result { + Ok((_, peers, _)) => peers.clone(), + Err(_) => todo!(), + }; + + self.on_block_components_by_range_response( + id.parent_request_id.parent_request_id, + RangeBlockComponent::CustodyColumns( + id.parent_request_id, + result.map(|(data, _peers, timestamp)| (data, timestamp)), + peers, + ), + ); + } } } @@ -1247,17 +1264,15 @@ impl SyncManager { /// Handles receiving a response for a range sync request that should have both blocks and /// blobs. - fn on_range_components_response( + fn on_block_components_by_range_response( &mut self, range_request_id: ComponentsByRangeRequestId, - peer_id: PeerId, range_block_component: RangeBlockComponent, ) { - if let Some(resp) = self.network.range_block_component_response( - range_request_id, - peer_id, - range_block_component, - ) { + if let Some(resp) = self + .network + .on_block_components_by_range_response(range_request_id, range_block_component) + { match resp { Ok((blocks, batch_peers)) => { match range_request_id.requester { @@ -1295,7 +1310,6 @@ impl SyncManager { RangeRequestId::RangeSync { chain_id, batch_id } => { self.range_sync.inject_error( &mut self.network, - peer_id, batch_id, chain_id, range_request_id.id, @@ -1307,7 +1321,6 @@ impl SyncManager { match self.backfill_sync.inject_error( &mut self.network, batch_id, - &peer_id, range_request_id.id, e, ) { diff --git a/beacon_node/network/src/sync/mod.rs b/beacon_node/network/src/sync/mod.rs index 0f5fd6fb9f1..97302df04e8 100644 --- a/beacon_node/network/src/sync/mod.rs +++ b/beacon_node/network/src/sync/mod.rs @@ -3,7 +3,6 @@ //! Stores the various syncing methods for the beacon chain. mod backfill_sync; mod block_lookups; -mod block_sidecar_coupling; pub mod manager; mod network_context; mod peer_sampling; diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index a5bb15c4c66..359db74566b 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -2,11 +2,12 @@ //! channel and stores a global RPC ID to perform requests. use self::custody::{ActiveCustodyByRootRequest, Error as CustodyRequestError}; -use self::custody_by_range::{ActiveCustodyByRangeRequest, Error as CustodyByRangeError}; +use self::custody_by_range::{ + ActiveCustodyByRangeRequest, CustodyByRangeRequestResult, Error as CustodyByRangeError, +}; pub use self::requests::{BlocksByRootSingleRequest, DataColumnsByRootSingleBlockRequest}; -use super::block_sidecar_coupling::RangeBlockComponentsRequest; use super::manager::BlockProcessType; -use super::range_sync::{BatchPeerGroup, ByRangeRequestType}; +use super::range_sync::BatchPeerGroup; use super::SyncMessage; use crate::metrics; use crate::network_beacon_processor::NetworkBeaconProcessor; @@ -16,6 +17,7 @@ use crate::sync::block_lookups::SingleLookupId; use crate::sync::network_context::requests::BlobsByRootSingleBlockRequest; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessStatus, EngineState}; +pub use block_components_by_range::BlockComponentsByRangeRequest; use custody::CustodyRequestResult; use fnv::FnvHashMap; use lighthouse_network::rpc::methods::{BlobsByRangeRequest, DataColumnsByRangeRequest}; @@ -33,7 +35,6 @@ use requests::{ ActiveRequests, BlobsByRangeRequestItems, BlobsByRootRequestItems, BlocksByRangeRequestItems, BlocksByRootRequestItems, DataColumnsByRangeRequestItems, DataColumnsByRootRequestItems, }; -use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::fmt::Debug; use std::sync::Arc; @@ -42,10 +43,11 @@ use tokio::sync::mpsc; use tracing::{debug, error, span, warn, Level}; use types::blob_sidecar::FixedBlobSidecarList; use types::{ - BlobSidecar, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, - ForkContext, Hash256, SignedBeaconBlock, Slot, + BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, + ForkContext, Hash256, SignedBeaconBlock, SignedBeaconBlockHeader, Slot, }; +pub mod block_components_by_range; pub mod custody; pub mod custody_by_range; mod requests; @@ -76,8 +78,10 @@ pub type CustodyByRootResult = pub enum RpcResponseError { RpcError(#[allow(dead_code)] RPCError), VerifyError(LookupVerifyError), + // TODO(das): find a way to collapse this errors into a more managable enum CustodyRequestError(#[allow(dead_code)] CustodyRequestError), - BlockComponentCouplingError(#[allow(dead_code)] String), + CustodyByRangeError(#[allow(dead_code)] CustodyByRangeError), + BlockComponentsByRangeError(#[allow(dead_code)] String), } #[derive(Debug, PartialEq, Eq)] @@ -146,6 +150,17 @@ impl PeerGroup { } }) } + + pub fn as_reversed_map(&self) -> HashMap { + // TODO(das): should we change PeerGroup to hold this map? + let mut index_to_peer = HashMap::::new(); + for (peer, indices) in self.peers.iter() { + for &index in indices { + index_to_peer.insert(index as u64, *peer); + } + } + index_to_peer + } } /// Sequential ID that uniquely identifies ReqResp outgoing requests @@ -198,8 +213,8 @@ pub struct SyncNetworkContext { custody_by_range_requests: FnvHashMap>, /// BlocksByRange requests paired with other ByRange requests for data components - components_by_range_requests: - FnvHashMap>, + block_components_by_range_requests: + FnvHashMap>, /// Whether the ee is online. If it's not, we don't allow access to the /// `beacon_processor_send`. @@ -218,14 +233,17 @@ pub enum RangeBlockComponent { Block( BlocksByRangeRequestId, RpcResponseResult>>>, + PeerId, ), Blob( BlobsByRangeRequestId, RpcResponseResult>>>, + PeerId, ), CustodyColumns( - DataColumnsByRangeRequestId, + CustodyByRangeRequestId, RpcResponseResult>>>, + PeerGroup, ), } @@ -254,7 +272,7 @@ impl SyncNetworkContext { data_columns_by_range_requests: ActiveRequests::new("data_columns_by_range"), custody_by_root_requests: <_>::default(), custody_by_range_requests: <_>::default(), - components_by_range_requests: FnvHashMap::default(), + block_components_by_range_requests: <_>::default(), network_beacon_processor, chain, fork_context, @@ -284,7 +302,7 @@ impl SyncNetworkContext { custody_by_root_requests: _, custody_by_range_requests: _, // components_by_range_requests is a meta request of various _by_range requests - components_by_range_requests: _, + block_components_by_range_requests: _, execution_engine_state: _, network_beacon_processor: _, chain: _, @@ -334,6 +352,10 @@ impl SyncNetworkContext { &self.network_beacon_processor.network_globals } + pub fn spec(&self) -> &ChainSpec { + &self.chain.spec + } + /// Returns the Client type of the peer if known pub fn client_type(&self, peer_id: &PeerId) -> Client { self.network_globals() @@ -388,7 +410,7 @@ impl SyncNetworkContext { custody_by_root_requests: _, custody_by_range_requests: _, // components_by_range_requests is a meta request of various _by_range requests - components_by_range_requests: _, + block_components_by_range_requests: _, execution_engine_state: _, network_beacon_processor: _, chain: _, @@ -422,207 +444,90 @@ impl SyncNetworkContext { peers_to_deprioritize: &HashSet, total_requests_per_peer: &HashMap, ) -> Result { - let batch_epoch = Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()); - let batch_type = self.batch_type(batch_epoch); - - let active_request_count_by_peer = self.active_request_count_by_peer(); - - let Some(block_peer) = peers - .iter() - .map(|peer| { - ( - // If contains -> 1 (order after), not contains -> 0 (order first) - peers_to_deprioritize.contains(peer), - // Prefer peers with less overall requests - active_request_count_by_peer.get(peer).copied().unwrap_or(0), - // Prefer peers with less total cummulative requests, so we fetch data from a - // diverse set of peers - total_requests_per_peer.get(peer).copied().unwrap_or(0), - // Random factor to break ties, otherwise the PeerID breaks ties - rand::random::(), - peer, - ) - }) - .min() - .map(|(_, _, _, _, peer)| *peer) - else { - // Backfill and forward sync handle this condition gracefully. - // - Backfill sync: will pause waiting for more peers to join - // - Forward sync: can never happen as the chain is dropped when removing the last peer. - return Err(RpcRequestSendError::NoPeer(NoPeerError::BlockPeer)); - }; - - // Attempt to find all required custody peers before sending any request or creating an ID - let columns_by_range_peers_to_request = - if matches!(batch_type, ByRangeRequestType::BlocksAndColumns) { - let column_indexes = self.network_globals().sampling_columns.clone(); - Some(self.select_columns_by_range_peers_to_request( - &column_indexes, - peers, - active_request_count_by_peer, - peers_to_deprioritize, - )?) - } else { - None - }; - - // Create the overall components_by_range request ID before its individual components let id = ComponentsByRangeRequestId { id: self.next_id(), requester, }; - let blocks_req_id = self.send_blocks_by_range_request(block_peer, request.clone(), id)?; - - let blobs_req_id = if matches!(batch_type, ByRangeRequestType::BlocksAndBlobs) { - Some(self.send_blobs_by_range_request( - block_peer, - BlobsByRangeRequest { - start_slot: *request.start_slot(), - count: *request.count(), - }, - id, - )?) - } else { - None - }; - - let data_column_requests = columns_by_range_peers_to_request - .map(|columns_by_range_peers_to_request| { - let column_to_peer_map = columns_by_range_peers_to_request - .iter() - .flat_map(|(peer_id, columns)| columns.iter().map(|column| (*column, *peer_id))) - .collect::>(); - - let requests = columns_by_range_peers_to_request - .into_iter() - .map(|(peer_id, columns)| { - self.send_data_columns_by_range_request( - peer_id, - DataColumnsByRangeRequest { - start_slot: *request.start_slot(), - count: *request.count(), - columns, - }, - id, - ) - }) - .collect::, _>>() - .expect("remove this code"); - - Ok((requests, column_to_peer_map)) - }) - .transpose()?; + let req = BlockComponentsByRangeRequest::new( + id, + request, + peers, + peers_to_deprioritize, + total_requests_per_peer, + self, + )?; - let info = - RangeBlockComponentsRequest::new(blocks_req_id, blobs_req_id, data_column_requests); - self.components_by_range_requests.insert(id, info); + self.block_components_by_range_requests.insert(id, req); + // TODO: use ID Ok(id.id) } - fn select_columns_by_range_peers_to_request( - &self, - custody_indexes: &HashSet, - peers: &HashSet, - active_request_count_by_peer: HashMap, - peers_to_deprioritize: &HashSet, - ) -> Result>, RpcRequestSendError> { - let mut columns_to_request_by_peer = HashMap::>::new(); - - for column_index in custody_indexes { - // Strictly consider peers that are custodials of this column AND are part of this - // syncing chain. If the forward range sync chain has few peers, it's likely that this - // function will not be able to find peers on our custody columns. - let Some(custody_peer) = peers - .iter() - .filter(|peer| { - self.network_globals() - .is_custody_peer_of(*column_index, peer) - }) - .map(|peer| { - ( - // If contains -> 1 (order after), not contains -> 0 (order first) - peers_to_deprioritize.contains(peer), - // Prefer peers with less overall requests - // Also account for requests that are not yet issued tracked in peer_id_to_request_map - // We batch requests to the same peer, so count existance in the - // `columns_to_request_by_peer` as a single 1 request. - active_request_count_by_peer.get(peer).copied().unwrap_or(0) - + columns_to_request_by_peer.get(peer).map(|_| 1).unwrap_or(0), - // Random factor to break ties, otherwise the PeerID breaks ties - rand::random::(), - peer, - ) - }) - .min() - .map(|(_, _, _, peer)| *peer) - else { - // TODO(das): this will be pretty bad UX. To improve we should: - // - Handle the no peers case gracefully, maybe add some timeout and give a few - // minutes / seconds to the peer manager to locate peers on this subnet before - // abandoing progress on the chain completely. - return Err(RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer( - *column_index, - ))); - }; - - columns_to_request_by_peer - .entry(custody_peer) - .or_default() - .push(*column_index); - } - - Ok(columns_to_request_by_peer) - } - /// Received a blocks by range or blobs by range response for a request that couples blocks ' /// and blobs. #[allow(clippy::type_complexity)] - pub fn range_block_component_response( + pub fn on_block_components_by_range_response( &mut self, id: ComponentsByRangeRequestId, - peer_id: PeerId, range_block_component: RangeBlockComponent, ) -> Option>, BatchPeerGroup), RpcResponseError>> { - let Entry::Occupied(mut entry) = self.components_by_range_requests.entry(id) else { - metrics::inc_counter_vec(&metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, &["range_blocks"]); + // Note: need to remove the request to borrow self again below. Otherwise we can't + // do nested requests + let Some(mut request) = self.block_components_by_range_requests.remove(&id) else { + metrics::inc_counter_vec( + &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, + &["block_components_by_range"], + ); return None; }; - if let Err(e) = { - let request = entry.get_mut(); - match range_block_component { - RangeBlockComponent::Block(req_id, resp) => resp.and_then(|(blocks, _)| { - request - .add_blocks(req_id, blocks, peer_id) - .map_err(RpcResponseError::BlockComponentCouplingError) - }), - RangeBlockComponent::Blob(req_id, resp) => resp.and_then(|(blobs, _)| { + let result = match range_block_component { + RangeBlockComponent::Block(req_id, resp, peer_id) => resp.and_then(|(blocks, _)| { + request + .on_blocks_by_range_result(req_id, blocks, peer_id, self) + .map_err(RpcResponseError::BlockComponentsByRangeError) + }), + RangeBlockComponent::Blob(req_id, resp, peer_id) => resp.and_then(|(blobs, _)| { + request + .on_blobs_by_range_result(req_id, blobs, peer_id, self) + .map_err(RpcResponseError::BlockComponentsByRangeError) + }), + RangeBlockComponent::CustodyColumns(req_id, resp, peers) => { + resp.and_then(|(custody_columns, _)| { request - .add_blobs(req_id, blobs, peer_id) - .map_err(RpcResponseError::BlockComponentCouplingError) - }), - RangeBlockComponent::CustodyColumns(req_id, resp) => { - resp.and_then(|(custody_columns, _)| { - request - .add_custody_columns(req_id, custody_columns, peer_id) - .map_err(RpcResponseError::BlockComponentCouplingError) - }) - } + .on_custody_by_range_result(req_id, custody_columns, peers, self) + .map_err(RpcResponseError::BlockComponentsByRangeError) + }) } - } { - entry.remove(); - return Some(Err(e)); - } + }; + + let result = result.transpose(); - if let Some(blocks_result) = entry.get().responses(&self.chain.spec) { - entry.remove(); - // If the request is finished, dequeue everything - Some(blocks_result.map_err(RpcResponseError::BlockComponentCouplingError)) - } else { - None + // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to + // an Option first to use in an `if let Some() { act on result }` block. + match result.as_ref() { + Some(Ok((blocks, peer_group))) => { + let blocks_with_data = blocks + .iter() + .filter(|block| block.as_block().has_data()) + .count(); + debug!( + %id, + blocks = blocks.len(), + blocks_with_data, + peers = ?peer_group, + "Block components by range request success, removing" + ) + } + Some(Err(e)) => { + debug!(%id, error = ?e, "Block components by range request failure, removing" ) + } + None => { + self.block_components_by_range_requests.insert(id, request); + } } + result } /// Request block of `block_root` if necessary by checking: @@ -1046,7 +951,7 @@ impl SyncNetworkContext { &mut self, peer_id: PeerId, request: DataColumnsByRangeRequest, - parent_request_id: ComponentsByRangeRequestId, + parent_request_id: CustodyByRangeRequestId, ) -> Result { let id = DataColumnsByRangeRequestId { id: self.next_id(), @@ -1088,7 +993,7 @@ impl SyncNetworkContext { pub fn send_custody_by_range_request( &mut self, parent_id: ComponentsByRangeRequestId, - block_slots_with_data: Vec, + blocks_with_data: Vec, epoch: Epoch, column_indices: Vec, lookup_peers: Arc>>, @@ -1105,9 +1010,9 @@ impl SyncNetworkContext { ); let mut request = ActiveCustodyByRangeRequest::new( + id, epoch, - block_slots_with_data, - parent_id, + blocks_with_data, &column_indices, lookup_peers, ); @@ -1247,34 +1152,6 @@ impl SyncNetworkContext { id } - /// Check whether a batch for this epoch (and only this epoch) should request just blocks or - /// blocks and blobs. - fn batch_type(&self, epoch: types::Epoch) -> ByRangeRequestType { - // Induces a compile time panic if this doesn't hold true. - #[allow(clippy::assertions_on_constants)] - const _: () = assert!( - super::backfill_sync::BACKFILL_EPOCHS_PER_BATCH == 1 - && super::range_sync::EPOCHS_PER_BATCH == 1, - "To deal with alignment with deneb boundaries, batches need to be of just one epoch" - ); - - if self - .chain - .data_availability_checker - .data_columns_required_for_epoch(epoch) - { - ByRangeRequestType::BlocksAndColumns - } else if self - .chain - .data_availability_checker - .blobs_required_for_epoch(epoch) - { - ByRangeRequestType::BlocksAndBlobs - } else { - ByRangeRequestType::Blocks - } - } - /// Attempt to make progress on all custody_by_root requests. Some request may be stale waiting /// for custody peers. Returns a Vec of results as zero or more requests may fail in this /// attempt. @@ -1460,8 +1337,10 @@ impl SyncNetworkContext { // Note: need to remove the request to borrow self again below. Otherwise we can't // do nested requests let Some(mut request) = self.custody_by_root_requests.remove(&id.requester) else { - // TOOD(das): This log can happen if the request is error'ed early and dropped - debug!(?id, "Custody column downloaded event for unknown request"); + metrics::inc_counter_vec( + &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, + &["custody_by_root"], + ); return None; }; @@ -1491,10 +1370,10 @@ impl SyncNetworkContext { // an Option first to use in an `if let Some() { act on result }` block. match result.as_ref() { Some(Ok((columns, peer_group, _))) => { - debug!(?id, count = columns.len(), peers = ?peer_group, "Custody request success, removing") + debug!(%id, count = columns.len(), peers = ?peer_group, "Custody by root request success, removing") } Some(Err(e)) => { - debug!(?id, error = ?e, "Custody request failure, removing" ) + debug!(%id, error = ?e, "Custody by root request failure, removing" ) } None => { self.custody_by_root_requests.insert(id, request); @@ -1522,7 +1401,7 @@ impl SyncNetworkContext { // do nested requests let Some(mut request) = self.custody_by_range_requests.remove(&id) else { // TOOD(das): This log can happen if the request is error'ed early and dropped - debug!(?id, "Custody by range downloaded event for unknown request"); + debug!(%id, "Custody by range downloaded event for unknown request"); return None; }; @@ -1535,20 +1414,20 @@ impl SyncNetworkContext { &mut self, id: CustodyByRangeRequestId, request: ActiveCustodyByRangeRequest, - result: CustodyRequestResult, + result: CustodyByRangeRequestResult, ) -> Option> { let result = result - .map_err(RpcResponseError::CustodyRequestError) + .map_err(RpcResponseError::CustodyByRangeError) .transpose(); // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to // an Option first to use in an `if let Some() { act on result }` block. match result.as_ref() { Some(Ok((columns, peer_group, _))) => { - debug!(?id, count = columns.len(), peers = ?peer_group, "Custody request success, removing") + debug!(%id, count = columns.len(), peers = ?peer_group, "Custody by range request success, removing") } Some(Err(e)) => { - debug!(?id, error = ?e, "Custody request failure, removing" ) + debug!(%id, error = ?e, "Custody by range request failure, removing" ) } None => { self.custody_by_range_requests.insert(id, request); @@ -1618,7 +1497,7 @@ impl SyncNetworkContext { .beacon_processor_if_enabled() .ok_or(SendErrorProcessor::ProcessorNotAvailable)?; - debug!(?block_root, ?id, "Sending blobs for processing"); + debug!(?block_root, %id, "Sending blobs for processing"); // Lookup sync event safety: If `beacon_processor.send_rpc_blobs` returns Ok() sync // must receive a single `SyncMessage::BlockComponentProcessed` event with this process type beacon_processor @@ -1689,8 +1568,8 @@ impl SyncNetworkContext { ), ("custody_by_root", self.custody_by_root_requests.len()), ( - "components_by_range", - self.components_by_range_requests.len(), + "block_components_by_range", + self.block_components_by_range_requests.len(), ), ] { metrics::set_gauge_vec(&metrics::SYNC_ACTIVE_NETWORK_REQUESTS, &[id], count as i64); diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs new file mode 100644 index 00000000000..47cc6ee0f73 --- /dev/null +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -0,0 +1,452 @@ +use crate::sync::network_context::{ + NoPeerError, PeerGroup, RpcRequestSendError, SyncNetworkContext, +}; +use crate::sync::range_sync::BatchPeerGroup; +use beacon_chain::block_verification_types::RpcBlock; +use beacon_chain::data_column_verification::CustodyDataColumn; +use beacon_chain::{get_block_root, BeaconChainTypes}; +use lighthouse_network::rpc::methods::{BlobsByRangeRequest, BlocksByRangeRequest}; +use lighthouse_network::service::api_types::{ + BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, + CustodyByRangeRequestId, +}; +use lighthouse_network::PeerId; +use parking_lot::RwLock; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use types::{ + BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, EthSpec, Hash256, SignedBeaconBlock, + Slot, +}; + +pub struct BlockComponentsByRangeRequest { + id: ComponentsByRangeRequestId, + peers: Arc>>, + request: BlocksByRangeRequest, + state: State, +} + +enum State { + Base { + blocks_by_range_request: + ByRangeRequest>>>, + }, + // Two single concurrent requests for block + blobs + DenebEnabled { + blocks_by_range_request: + ByRangeRequest>>>, + blobs_by_range_request: ByRangeRequest>>>, + }, + // Request blocks first, then columns + FuluEnabled(FuluEnabledState), +} + +enum FuluEnabledState { + BlockRequest { + blocks_by_range_request: + ByRangeRequest>>>, + }, + CustodyRequest { + blocks: Vec>>, + block_peer: PeerId, + custody_by_range_request: + ByRangeRequest>>, PeerGroup>, + }, +} + +enum ByRangeRequest { + /// Active(RequestIndex) + Active(I), + /// Complete(DownloadedData, Peers) + Complete(T, P), +} + +pub type BlockComponentsByRangeRequestResult = + Result>, BatchPeerGroup)>, String>; + +impl BlockComponentsByRangeRequest { + pub fn new( + id: ComponentsByRangeRequestId, + request: BlocksByRangeRequest, + peers: &HashSet, + peers_to_deprioritize: &HashSet, + total_requests_per_peer: &HashMap, + cx: &mut SyncNetworkContext, + ) -> Result { + // Induces a compile time panic if this doesn't hold true. + #[allow(clippy::assertions_on_constants)] + const _: () = assert!( + super::super::backfill_sync::BACKFILL_EPOCHS_PER_BATCH == 1 + && super::super::range_sync::EPOCHS_PER_BATCH == 1, + "To deal with alignment with deneb boundaries, batches need to be of just one epoch" + ); + // The assertion above ensures each batch is in one single epoch + let batch_epoch = Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()); + let batch_fork = cx.spec().fork_name_at_epoch(batch_epoch); + + // TODO(das): a change of behaviour here is that if the SyncingChain has a single peer we + // will request all blocks for the first 5 epochs to that same single peer. Before we would + // query only idle peers in the batch. + let Some(block_peer) = peers + .iter() + .map(|peer| { + ( + // If contains -> 1 (order after), not contains -> 0 (order first) + peers_to_deprioritize.contains(peer), + // TODO(das): Should we use active_request_count_by_peer? + // Prefer peers with less overall requests + // active_request_count_by_peer.get(peer).copied().unwrap_or(0), + // Prefer peers with less total cummulative requests, so we fetch data from a + // diverse set of peers + total_requests_per_peer.get(peer).copied().unwrap_or(0), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::random::(), + peer, + ) + }) + .min() + .map(|(_, _, _, peer)| *peer) + else { + // Backfill and forward sync handle this condition gracefully. + // - Backfill sync: will pause waiting for more peers to join + // - Forward sync: can never happen as the chain is dropped when removing the last peer. + return Err(RpcRequestSendError::NoPeer(NoPeerError::BlockPeer)); + }; + + let blocks_req_id = cx.send_blocks_by_range_request(block_peer, request.clone(), id)?; + + let state = if batch_fork.fulu_enabled() { + State::FuluEnabled(FuluEnabledState::BlockRequest { + blocks_by_range_request: ByRangeRequest::Active(blocks_req_id), + }) + } else if batch_fork.deneb_enabled() { + // TODO(deneb): is it okay to send blobs_by_range requests outside the DA window? I + // would like the beacon processor / da_checker to be the one that decides if an + // RpcBlock is valid or not with respect to containing blobs. Having sync not even + // attempt a requests seems like an added limitation. + let blobs_req_id = cx.send_blobs_by_range_request( + block_peer, + BlobsByRangeRequest { + start_slot: *request.start_slot(), + count: *request.count(), + }, + id, + )?; + State::DenebEnabled { + blocks_by_range_request: ByRangeRequest::Active(blocks_req_id), + blobs_by_range_request: ByRangeRequest::Active(blobs_req_id), + } + } else { + State::Base { + blocks_by_range_request: ByRangeRequest::Active(blocks_req_id), + } + }; + + Ok(Self { + id, + // TODO(das): share the rwlock with the range sync batch. Are peers added to the batch + // after being created? + peers: Arc::new(RwLock::new(peers.clone())), + request, + state, + }) + } + + pub fn continue_requests( + &mut self, + cx: &mut SyncNetworkContext, + ) -> BlockComponentsByRangeRequestResult { + match &mut self.state { + State::Base { + blocks_by_range_request, + } => { + if let Some((blocks, block_peer)) = blocks_by_range_request.to_finished() { + // TODO(das): use the peer group + let peer_group = BatchPeerGroup::new_from_block_peer(*block_peer); + let rpc_blocks = couple_blocks_base(blocks.to_vec()); + Ok(Some((rpc_blocks, peer_group))) + } else { + // Wait for blocks_by_range requests to complete + Ok(None) + } + } + State::DenebEnabled { + blocks_by_range_request, + blobs_by_range_request, + } => { + if let (Some((blocks, block_peer)), Some((blobs, _))) = ( + blocks_by_range_request.to_finished(), + blobs_by_range_request.to_finished(), + ) { + // We use the same block_peer for the blobs request + let peer_group = BatchPeerGroup::new_from_block_peer(*block_peer); + let rpc_blocks = couple_blocks_deneb(blocks.to_vec(), blobs.to_vec()); + Ok(Some((rpc_blocks, peer_group))) + } else { + // Wait for blocks_by_range and blobs_by_range requests to complete + Ok(None) + } + } + State::FuluEnabled(state) => match state { + FuluEnabledState::BlockRequest { + blocks_by_range_request, + } => { + if let Some((blocks, block_peer)) = blocks_by_range_request.to_finished() { + // TODO(das): use the peer group + let blocks_with_data = blocks + .iter() + .filter(|block| block.has_data()) + .map(|block| block.signed_block_header()) + .collect::>(); + + if blocks_with_data.is_empty() { + let custody_column_indices = cx + .network_globals() + .sampling_columns + .clone() + .iter() + .copied() + .collect(); + + // Done, we got blocks and no columns needed + let peer_group = BatchPeerGroup::new_from_block_peer(*block_peer); + let rpc_blocks = couple_blocks_fulu( + blocks.to_vec(), + vec![], + custody_column_indices, + cx.spec(), + )?; + Ok(Some((rpc_blocks, peer_group))) + } else { + let mut column_indices = cx + .network_globals() + .sampling_columns + .clone() + .iter() + .copied() + .collect::>(); + column_indices.sort_unstable(); + + let req_id = cx + .send_custody_by_range_request( + self.id, + blocks_with_data, + Slot::new(*self.request.start_slot()) + .epoch(T::EthSpec::slots_per_epoch()), + column_indices, + self.peers.clone(), + ) + .map_err(|e| match e { + RpcRequestSendError::NoPeer(e) => todo!("what to do? {e:?}"), + RpcRequestSendError::InternalError(e) => format!( + "InternalError sending custody_by_range request: {e}" + ), + })?; + + *state = FuluEnabledState::CustodyRequest { + blocks: blocks.to_vec(), + block_peer: *block_peer, + custody_by_range_request: ByRangeRequest::Active(req_id), + }; + + // Wait for the new custody_by_range request to complete + Ok(None) + } + } else { + // Wait for the block request to complete + Ok(None) + } + } + FuluEnabledState::CustodyRequest { + blocks, + block_peer, + custody_by_range_request, + } => { + if let Some((columns, column_peers)) = custody_by_range_request.to_finished() { + let custody_column_indices = cx + .network_globals() + .sampling_columns + .clone() + .iter() + .copied() + .collect(); + + let peer_group = + BatchPeerGroup::new(*block_peer, column_peers.as_reversed_map()); + let rpc_blocks = couple_blocks_fulu( + blocks.to_vec(), + columns.to_vec(), + custody_column_indices, + cx.spec(), + )?; + Ok(Some((rpc_blocks, peer_group))) + } else { + // Wait for the custody_by_range request to complete + Ok(None) + } + } + }, + } + } + + pub fn on_blocks_by_range_result( + &mut self, + id: BlocksByRangeRequestId, + data: Vec>>, + peer_id: PeerId, + cx: &mut SyncNetworkContext, + ) -> BlockComponentsByRangeRequestResult { + match &mut self.state { + State::Base { + blocks_by_range_request, + } + | State::DenebEnabled { + blocks_by_range_request, + .. + } + | State::FuluEnabled(FuluEnabledState::BlockRequest { + blocks_by_range_request, + }) => { + blocks_by_range_request.finish(id, data, peer_id)?; + } + State::FuluEnabled(FuluEnabledState::CustodyRequest { .. }) => { + return Err( + "Received blocks_by_range response expecting custody_by_range".to_string(), + ) + } + } + + self.continue_requests(cx) + } + + pub fn on_blobs_by_range_result( + &mut self, + id: BlobsByRangeRequestId, + data: Vec>>, + peer_id: PeerId, + cx: &mut SyncNetworkContext, + ) -> BlockComponentsByRangeRequestResult { + match &mut self.state { + State::Base { .. } => { + return Err("Received blobs_by_range response before Deneb".to_string()) + } + State::DenebEnabled { + blobs_by_range_request, + .. + } => { + blobs_by_range_request.finish(id, data, peer_id)?; + } + State::FuluEnabled(_) => { + return Err("Received blobs_by_range response after PeerDAS".to_string()) + } + } + + self.continue_requests(cx) + } + + pub fn on_custody_by_range_result( + &mut self, + id: CustodyByRangeRequestId, + data: Vec>>, + peers: PeerGroup, + cx: &mut SyncNetworkContext, + ) -> BlockComponentsByRangeRequestResult { + match &mut self.state { + State::Base { .. } | State::DenebEnabled { .. } => { + return Err("Received custody_by_range response before PeerDAS".to_string()) + } + State::FuluEnabled(state) => match state { + FuluEnabledState::BlockRequest { .. } => { + return Err("Received custody_by_range expecting blocks_by_range".to_string()); + } + FuluEnabledState::CustodyRequest { + custody_by_range_request, + .. + } => { + custody_by_range_request.finish(id, data, peers)?; + } + }, + } + + self.continue_requests(cx) + } +} + +fn couple_blocks_base(_blocks: Vec>>) -> Vec> { + todo!(); +} + +fn couple_blocks_deneb( + _blocks: Vec>>, + _blobs: Vec>>, +) -> Vec> { + todo!(); +} + +fn couple_blocks_fulu( + blocks: Vec>>, + data_columns: Vec>>, + custody_column_indices: Vec, + spec: &ChainSpec, +) -> Result>, String> { + // Group data columns by block_root and index + let mut custody_columns_by_block = HashMap::>>::new(); + + for column in data_columns { + let block_root = column.block_root(); + + if custody_column_indices.contains(&column.index) { + custody_columns_by_block + .entry(block_root) + .or_default() + // Safe to convert to `CustodyDataColumn`: we have asserted that the index of + // this column is in the set of `expects_custody_columns` and with the expected + // block root, so for the expected epoch of this batch. + .push(CustodyDataColumn::from_asserted_custody(column)); + } + } + + // Now iterate all blocks ensuring that the block roots of each block and data column match, + blocks + .into_iter() + .map(|block| { + let block_root = get_block_root(&block); + let data_columns_with_block_root = custody_columns_by_block + // Remove to only use columns once + .remove(&block_root) + .unwrap_or_default(); + + // TODO(das): Change RpcBlock to holding a Vec of DataColumnSidecars so we don't need + // the spec here. + RpcBlock::new_with_custody_columns( + Some(block_root), + block, + data_columns_with_block_root, + custody_column_indices.clone(), + spec, + ) + }) + .collect::, _>>() +} + +impl ByRangeRequest { + fn finish(&mut self, id: I, data: T, peer_id: P) -> Result<(), String> { + match self { + Self::Active(expected_id) => { + if expected_id != &id { + return Err(format!("unexpected req_id expected {expected_id} got {id}")); + } + *self = Self::Complete(data, peer_id); + Ok(()) + } + Self::Complete(_, _) => Err("request already complete".to_owned()), + } + } + + fn to_finished(&self) -> Option<(&T, &P)> { + match self { + Self::Active(_) => None, + Self::Complete(data, peer_id) => Some((data, peer_id)), + } + } +} diff --git a/beacon_node/network/src/sync/network_context/custody_by_range.rs b/beacon_node/network/src/sync/network_context/custody_by_range.rs index aee22361444..cc0699ce7bf 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_range.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_range.rs @@ -3,7 +3,7 @@ use beacon_chain::BeaconChainTypes; use fnv::FnvHashMap; use lighthouse_network::rpc::methods::DataColumnsByRangeRequest; use lighthouse_network::service::api_types::{ - ComponentsByRangeRequestId, DataColumnsByRangeRequestId, + CustodyByRangeRequestId, DataColumnsByRangeRequestId, }; use lighthouse_network::PeerId; use lru_cache::LRUTimeCache; @@ -13,7 +13,10 @@ use std::collections::HashSet; use std::time::{Duration, Instant}; use std::{collections::HashMap, marker::PhantomData, sync::Arc}; use tracing::{debug, warn}; -use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, Epoch, EthSpec, Slot}; +use types::{ + data_column_sidecar::ColumnIndex, DataColumnSidecar, Epoch, EthSpec, Hash256, + SignedBeaconBlockHeader, Slot, +}; use super::{PeerGroup, RpcResponseResult, SyncNetworkContext}; @@ -23,10 +26,11 @@ const MAX_STALE_NO_PEERS_DURATION: Duration = Duration::from_secs(30); type DataColumnSidecarList = Vec>>; pub struct ActiveCustodyByRangeRequest { + id: CustodyByRangeRequestId, // TODO(das): Pass a better type for the by_range request epoch: Epoch, - block_slots_with_data: Vec, - parent_id: ComponentsByRangeRequestId, + /// Blocks that we expect peers to serve data columns for + blocks_with_data: Vec, /// List of column indices this request needs to download to complete successfully column_requests: FnvHashMap>, /// Active requests for 1 or more columns each @@ -63,18 +67,27 @@ struct ActiveBatchColumnsRequest { pub type CustodyByRangeRequestResult = Result, PeerGroup, Duration)>, Error>; +enum ColumnResponseError { + NonMatchingColumn { + slot: Slot, + actual_block_root: Hash256, + expected_block_root: Hash256, + }, + MissingColumn(Slot), +} + impl ActiveCustodyByRangeRequest { pub(crate) fn new( + id: CustodyByRangeRequestId, epoch: Epoch, - block_slots_with_data: Vec, - parent_id: ComponentsByRangeRequestId, + blocks_with_data: Vec, column_indices: &[ColumnIndex], lookup_peers: Arc>>, ) -> Self { Self { + id, epoch, - block_slots_with_data, - parent_id, + blocks_with_data, column_requests: HashMap::from_iter( column_indices .iter() @@ -101,10 +114,10 @@ impl ActiveCustodyByRangeRequest { req_id: DataColumnsByRangeRequestId, resp: RpcResponseResult>, cx: &mut SyncNetworkContext, - ) -> CustodyRequestResult { + ) -> CustodyByRangeRequestResult { let Some(batch_request) = self.active_batch_columns_requests.get_mut(&req_id) else { warn!( - id = ?self.parent_id, + id = %self.id, %req_id, "Received custody by range response for unrequested index" ); @@ -113,14 +126,6 @@ impl ActiveCustodyByRangeRequest { match resp { Ok((data_columns, seen_timestamp)) => { - debug!( - id = ?self.parent_id, - %req_id, - %peer_id, - count = data_columns.len(), - "Custody by range request download success" - ); - // Map columns by index as an optimization to not loop the returned list on each // requested index. The worse case is 128 loops over a 128 item vec + mutation to // drop the consumed columns. @@ -133,6 +138,8 @@ impl ActiveCustodyByRangeRequest { // Accumulate columns that the peer does not have to issue a single log per request let mut missing_column_indexes = vec![]; + let mut incorrect_column_indices = vec![]; + let mut imported_column_indices = vec![]; for index in &batch_request.indices { let column_request = self @@ -141,13 +148,27 @@ impl ActiveCustodyByRangeRequest { .ok_or(Error::BadState(format!("unknown column_index {index}")))?; let columns_at_index = self - .block_slots_with_data + .blocks_with_data .iter() - .map(|slot| { - if let Some(data_column) = - data_columns_by_index.remove(&(*index, *slot)) + .map(|block| { + let slot = block.message.slot; + if let Some(data_column) = data_columns_by_index.remove(&(*index, slot)) { - Ok(data_column) + let actual_block_root = + data_column.signed_block_header.message.canonical_root(); + let expected_block_root = block.message.canonical_root(); + if actual_block_root != expected_block_root { + Err(ColumnResponseError::NonMatchingColumn { + slot, + actual_block_root: data_column + .signed_block_header + .message + .canonical_root(), + expected_block_root: block.message.canonical_root(), + }) + } else { + Ok(data_column) + } } else { // The following three statements are true: // - block at `slot` is not missed, and has data @@ -162,7 +183,7 @@ impl ActiveCustodyByRangeRequest { // TODO(das): Should track which columns are missing and eventually give up // TODO(das): If the peer is in the lookup peer set it claims to have imported // the block AND its custody columns. So in this case we can downscore - Err(format!("Missing column at slot {slot} index {index}")) + Err(ColumnResponseError::MissingColumn(slot)) } }) .collect::, _>>(); @@ -175,19 +196,64 @@ impl ActiveCustodyByRangeRequest { columns_at_index, seen_timestamp, )?; + + imported_column_indices.push(index); } - Err(reason) => { + Err(e) => { column_request.on_download_error(req_id)?; - missing_column_indexes.push(index); + + match e { + ColumnResponseError::NonMatchingColumn { + slot, + actual_block_root, + expected_block_root, + } => { + incorrect_column_indices.push(( + index, + slot, + actual_block_root, + expected_block_root, + )); + } + ColumnResponseError::MissingColumn(slot) => { + missing_column_indexes.push((index, slot)); + } + } } } } + // Log missing_column_indexes and incorrect_column_indices here in batch per request + // to make this logs more compact and less noisy. + if !imported_column_indices.is_empty() { + debug!( + id = %self.id, + data_columns_by_range_req_id = %req_id, + %peer_id, + count = imported_column_indices.len(), + "Custody by range request download imported columns" + ); + } + + if !incorrect_column_indices.is_empty() { + // Note: Batch logging that columns are missing to not spam logger + debug!( + id = %self.id, + data_columns_by_range_req_id = %req_id, + %peer_id, + // TODO(das): this property can become very noisy, being the full range 0..128 + incorrect_columns = ?incorrect_column_indices, + "Custody by range peer returned non-matching columns" + ); + + self.failed_peers.insert(peer_id); + } + if !missing_column_indexes.is_empty() { // Note: Batch logging that columns are missing to not spam logger debug!( - id = ?self.parent_id, - %req_id, + id = %self.id, + data_columns_by_range_req_id = %req_id, %peer_id, // TODO(das): this property can become very noisy, being the full range 0..128 ?missing_column_indexes, @@ -199,7 +265,7 @@ impl ActiveCustodyByRangeRequest { } Err(err) => { debug!( - id = ?self.parent_id, + id = %self.id, %req_id, %peer_id, error = ?err, @@ -224,7 +290,7 @@ impl ActiveCustodyByRangeRequest { pub(crate) fn continue_requests( &mut self, cx: &mut SyncNetworkContext, - ) -> CustodyRequestResult { + ) -> CustodyByRangeRequestResult { if self.column_requests.values().all(|r| r.is_downloaded()) { // All requests have completed successfully. let mut peers = HashMap::>::new(); @@ -333,7 +399,7 @@ impl ActiveCustodyByRangeRequest { count: T::EthSpec::slots_per_epoch(), columns: indices.clone(), }, - self.parent_id, + self.id, ) .map_err(Error::SendFailed)?; diff --git a/beacon_node/network/src/sync/peer_sampling.rs b/beacon_node/network/src/sync/peer_sampling.rs index 71113e8bcc9..d76c7d2bbc2 100644 --- a/beacon_node/network/src/sync/peer_sampling.rs +++ b/beacon_node/network/src/sync/peer_sampling.rs @@ -98,13 +98,13 @@ impl Sampling { // TODO(das): Should track failed sampling request for some time? Otherwise there's // a risk of a loop with multiple triggers creating the request, then failing, // and repeat. - debug!(?id, "Ignoring duplicate sampling request"); + debug!(%id, "Ignoring duplicate sampling request"); return None; } }; debug!( - ?id, + %id, column_selection = ?request.column_selection(), "Created new sample request" ); @@ -138,7 +138,7 @@ impl Sampling { ) -> Option<(SamplingRequester, SamplingResult)> { let Some(request) = self.requests.get_mut(&id.id) else { // TOOD(das): This log can happen if the request is error'ed early and dropped - debug!(?id, "Sample downloaded event for unknown request"); + debug!(%id, "Sample downloaded event for unknown request"); return None; }; @@ -167,7 +167,7 @@ impl Sampling { ) -> Option<(SamplingRequester, SamplingResult)> { let Some(request) = self.requests.get_mut(&id.id) else { // TOOD(das): This log can happen if the request is error'ed early and dropped - debug!(?id, "Sample verified event for unknown request"); + debug!(%id, "Sample verified event for unknown request"); return None; }; @@ -191,7 +191,7 @@ impl Sampling { ) -> Option<(SamplingRequester, SamplingResult)> { let result = result.transpose(); if let Some(result) = result { - debug!(?id, ?result, "Sampling request completed, removing"); + debug!(%id, ?result, "Sampling request completed, removing"); metrics::inc_counter_vec( &metrics::SAMPLING_REQUEST_RESULT, &[metrics::from_result(&result)], diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index fe31e4dc2d0..18529603efe 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -18,15 +18,7 @@ const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5; /// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty. const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3; -/// Type of expected batch. -#[derive(Debug, Copy, Clone, Display)] -#[strum(serialize_all = "snake_case")] -pub enum ByRangeRequestType { - BlocksAndColumns, - BlocksAndBlobs, - Blocks, -} - +// TODO(das): Consider merging with PeerGroup #[derive(Clone, Debug)] pub struct BatchPeerGroup { block_peer: PeerId, diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 855bd18d945..00647f9848f 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -865,7 +865,6 @@ impl SyncingChain { &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, - peer_id: &PeerId, request_id: Id, err: RpcResponseError, ) -> ProcessingResult { @@ -880,7 +879,6 @@ impl SyncingChain { debug!( batch_epoch = %batch_id, batch_state = ?batch.state(), - %peer_id, %request_id, ?batch_state, "Batch not expecting block" @@ -891,12 +889,13 @@ impl SyncingChain { batch_epoch = %batch_id, batch_state = ?batch.state(), error = ?err, - %peer_id, %request_id, "Batch download error" ); if let BatchOperationOutcome::Failed { blacklist } = - batch.download_failed(Some(*peer_id))? + // TODO(das): Is it necessary for the batch to track failed peers? Can we make this + // mechanism compatible with PeerDAS and before PeerDAS? + batch.download_failed(None)? { return Err(RemoveChain::ChainFailed { blacklist, @@ -907,7 +906,6 @@ impl SyncingChain { } else { debug!( batch_epoch = %batch_id, - %peer_id, %request_id, batch_state, "Batch not found" diff --git a/beacon_node/network/src/sync/range_sync/mod.rs b/beacon_node/network/src/sync/range_sync/mod.rs index f57c1497180..66f07833ea0 100644 --- a/beacon_node/network/src/sync/range_sync/mod.rs +++ b/beacon_node/network/src/sync/range_sync/mod.rs @@ -9,7 +9,7 @@ mod sync_type; pub use batch::{ BatchConfig, BatchInfo, BatchOperationOutcome, BatchPeerGroup, BatchProcessingResult, - BatchState, ByRangeRequestType, + BatchState, }; pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; #[cfg(test)] diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index 919a321f380..929752e3e26 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs @@ -344,7 +344,6 @@ where pub fn inject_error( &mut self, network: &mut SyncNetworkContext, - peer_id: PeerId, batch_id: BatchId, chain_id: ChainId, request_id: Id, @@ -352,7 +351,7 @@ where ) { // check that this request is pending match self.chains.call_by_id(chain_id, |chain| { - chain.inject_error(network, batch_id, &peer_id, request_id, err) + chain.inject_error(network, batch_id, request_id, err) }) { Ok((removed_chain, sync_type)) => { if let Some((removed_chain, remove_reason)) = removed_chain { diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index 565d7bc9f81..114b6d624aa 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -113,6 +113,7 @@ impl TestRig { network_rx, network_rx_queue: vec![], sync_rx, + sent_blocks_by_range: <_>::default(), rng, network_globals: beacon_processor.network_globals.clone(), sync_manager: SyncManager::new( @@ -356,22 +357,34 @@ impl TestRig { } pub fn new_connected_peer(&mut self) -> PeerId { - let key = self.determinstic_key(); - let peer_id = self - .network_globals - .peers - .write() - .__add_connected_peer_testing_only(false, &self.harness.spec, key); - self.log(&format!("Added new peer for testing {peer_id:?}")); - peer_id + self.add_connected_peer_testing_only(false) } pub fn new_connected_supernode_peer(&mut self) -> PeerId { + self.add_connected_peer_testing_only(true) + } + + fn add_connected_peer_testing_only(&mut self, supernode: bool) -> PeerId { let key = self.determinstic_key(); - self.network_globals + let peer_id = self + .network_globals .peers .write() - .__add_connected_peer_testing_only(true, &self.harness.spec, key) + .__add_connected_peer_testing_only(supernode, &self.harness.spec, key); + let mut peer_custody_subnets = self + .network_globals + .peers + .read() + .peer_info(&peer_id) + .expect("peer was just added") + .custody_subnets_iter() + .map(|subnet| **subnet) + .collect::>(); + peer_custody_subnets.sort_unstable(); + self.log(&format!( + "Added new peer for testing {peer_id:?} custody subnets {peer_custody_subnets:?}" + )); + peer_id } fn determinstic_key(&mut self) -> CombinedKey { diff --git a/beacon_node/network/src/sync/tests/mod.rs b/beacon_node/network/src/sync/tests/mod.rs index ec24ddb036a..40a58bc325e 100644 --- a/beacon_node/network/src/sync/tests/mod.rs +++ b/beacon_node/network/src/sync/tests/mod.rs @@ -6,13 +6,15 @@ use beacon_chain::builder::Witness; use beacon_chain::eth1_chain::CachingEth1Backend; use beacon_chain::test_utils::{BeaconChainHarness, EphemeralHarnessType}; use beacon_processor::WorkEvent; +use lighthouse_network::service::api_types::ComponentsByRangeRequestId; use lighthouse_network::NetworkGlobals; use rand_chacha::ChaCha20Rng; use slot_clock::ManualSlotClock; +use std::collections::HashMap; use std::sync::Arc; use store::MemoryStore; use tokio::sync::mpsc; -use types::{ChainSpec, ForkName, MinimalEthSpec as E}; +use types::{ChainSpec, ForkName, MinimalEthSpec as E, SignedBeaconBlock}; mod lookups; mod range; @@ -64,4 +66,7 @@ struct TestRig { rng: ChaCha20Rng, fork_name: ForkName, spec: Arc, + + // Cache of sent blocks for PeerDAS responses + sent_blocks_by_range: HashMap>>>, } diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 06dca355e53..85fe34cd724 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -15,14 +15,15 @@ use lighthouse_network::rpc::methods::{ }; use lighthouse_network::rpc::{RequestType, StatusMessage}; use lighthouse_network::service::api_types::{ - AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, DataColumnsByRangeRequestId, - SyncRequestId, + AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, + DataColumnsByRangeRequestId, SyncRequestId, }; use lighthouse_network::{PeerId, SyncInfo}; use std::time::Duration; use types::{ - BlobSidecarList, BlockImportSource, Epoch, EthSpec, Hash256, MinimalEthSpec as E, - SignedBeaconBlock, SignedBeaconBlockHash, Slot, + BeaconBlock, BlobSidecarList, BlockImportSource, ColumnIndex, DataColumnSidecar, Epoch, + EthSpec, Hash256, KzgCommitment, MinimalEthSpec as E, Signature, SignedBeaconBlock, + SignedBeaconBlockHash, Slot, VariableList, }; const D: Duration = Duration::new(0, 0); @@ -35,7 +36,13 @@ pub(crate) enum DataSidecars { enum ByRangeDataRequestIds { PreDeneb, PrePeerDAS(BlobsByRangeRequestId, PeerId), - PostPeerDAS(Vec<(DataColumnsByRangeRequestId, PeerId)>), + PostPeerDAS( + Vec<( + DataColumnsByRangeRequestId, + PeerId, + DataColumnsByRangeRequest, + )>, + ), } /// Sync tests are usually written in the form: @@ -46,10 +53,11 @@ enum ByRangeDataRequestIds { /// To make writting tests succint, the machinery in this testing rig automatically identifies /// _which_ request to complete. Picking the right request is critical for tests to pass, so this /// filter allows better expressivity on the criteria to identify the right request. -#[derive(Default, Debug, Clone)] +#[derive(Default, Debug, Clone, Copy)] struct RequestFilter { peer: Option, epoch: Option, + column_index: Option, } impl RequestFilter { @@ -62,13 +70,64 @@ impl RequestFilter { self.epoch = Some(epoch); self } + + fn column_index(mut self, index: u64) -> Self { + self.column_index = Some(index); + self + } + + fn matches(&self, peer: PeerId, start_slot: u64) -> bool { + if let Some(expected_epoch) = self.epoch { + let epoch = Slot::new(start_slot).epoch(E::slots_per_epoch()).as_u64(); + if epoch != expected_epoch { + return false; + } + } + if let Some(expected_peer) = self.peer { + if peer != expected_peer { + return false; + } + } + true + } } fn filter() -> RequestFilter { RequestFilter::default() } +/// Instruct the testing rig how to complete requests for _by_range requests +#[derive(Debug, Clone, Copy)] +struct CompleteConfig { + with_data: bool, + custody_failure_at_index: Option, +} + +impl CompleteConfig { + // TODO(das): add tests where blocks don't have data + + fn custody_failure_at_index(mut self, index: u64) -> Self { + self.custody_failure_at_index = Some(index); + self + } +} + +fn complete() -> CompleteConfig { + CompleteConfig { + with_data: true, + custody_failure_at_index: None, + } +} + impl TestRig { + fn our_custody_indices(&self) -> Vec { + self.network_globals + .sampling_columns + .iter() + .copied() + .collect() + } + /// Produce a head peer with an advanced head fn add_head_peer(&mut self) -> PeerId { self.add_head_peer_with_root(Hash256::random()) @@ -191,31 +250,120 @@ impl TestRig { } } + fn expect_no_columns_by_range_requests(&mut self, request_filter: RequestFilter) { + let requests = self.find_data_columns_by_range_requests(request_filter); + if !requests.is_empty() { + panic!("Expected no DataColumnsByRange requests with filter {request_filter:?} but found {requests:?}"); + } + } + fn update_execution_engine_state(&mut self, state: EngineState) { self.log(&format!("execution engine state updated: {state:?}")); self.sync_manager.update_execution_engine_state(state); } - fn find_blocks_by_range_request( - &mut self, - request_filter: RequestFilter, - ) -> ((BlocksByRangeRequestId, PeerId), ByRangeDataRequestIds) { - let filter_f = |peer: PeerId, start_slot: u64| { - if let Some(expected_epoch) = request_filter.epoch { - let epoch = Slot::new(start_slot).epoch(E::slots_per_epoch()).as_u64(); - if epoch != expected_epoch { - return false; - } - } - if let Some(expected_peer) = request_filter.peer { - if peer != expected_peer { - return false; + fn zero_block_at_slot(&mut self, slot: Slot, with_data: bool) -> Arc> { + let fork_name = self.spec.fork_name_at_slot::(slot); + let mut block = match fork_name { + ForkName::Fulu => { + let mut block = BeaconBlock::empty(&self.spec); + if with_data { + block + .body_mut() + .blob_kzg_commitments_mut() + .expect("fulu block") + .push(KzgCommitment([0; 48])) + .expect("pushed to empty kzg commitments"); } + block } - true + _ => todo!("extend for other forks"), }; + *block.slot_mut() = slot; + Arc::new(SignedBeaconBlock::from_block(block, Signature::empty())) + } + + fn last_sent_blocks_by_range( + &mut self, + id: ComponentsByRangeRequestId, + ) -> Vec>> { + self.sent_blocks_by_range + .get(&id) + .cloned() + .unwrap_or_else(|| panic!("No blocks for ComponentsByRangeRequestId {id}")) + } - let block_req = self + fn send_blocks_by_range_response( + &mut self, + req_id: BlocksByRangeRequestId, + peer_id: PeerId, + blocks: &[Arc>], + ) { + let slots = blocks.iter().map(|block| block.slot()).collect::>(); + self.log(&format!( + "Completing BlocksByRange request {req_id} to {peer_id} with blocks {slots:?}" + )); + + for block in blocks { + self.send_sync_message(SyncMessage::RpcBlock { + sync_request_id: SyncRequestId::BlocksByRange(req_id), + peer_id, + beacon_block: Some(block.clone()), + seen_timestamp: D, + }); + } + self.send_sync_message(SyncMessage::RpcBlock { + sync_request_id: SyncRequestId::BlocksByRange(req_id), + peer_id, + beacon_block: None, + seen_timestamp: D, + }); + + if self + .sent_blocks_by_range + .insert(req_id.parent_request_id, blocks.to_vec()) + .is_some() + { + panic!("Sent two blocks_by_range requests in the same epoch. We need better tracking"); + } + } + + fn send_data_columns_by_range_response( + &mut self, + id: DataColumnsByRangeRequestId, + peer_id: PeerId, + data_columns: &[Arc>], + ) { + let mut ids = data_columns + .iter() + .map(|d| (d.slot().as_u64(), d.index)) + .collect::>(); + ids.sort_unstable(); + self.log(&format!( + "Completing DataColumnsByRange request {id} to {peer_id} with data_columns {ids:?}" + )); + + for data_column in data_columns { + self.send_sync_message(SyncMessage::RpcDataColumn { + sync_request_id: SyncRequestId::DataColumnsByRange(id), + peer_id, + data_column: Some(data_column.clone()), + seen_timestamp: D, + }); + } + self.send_sync_message(SyncMessage::RpcDataColumn { + sync_request_id: SyncRequestId::DataColumnsByRange(id), + peer_id, + data_column: None, + seen_timestamp: D, + }); + } + + fn find_blocks_by_range_request( + &mut self, + request_filter: RequestFilter, + ) -> (BlocksByRangeRequestId, PeerId) { + self .pop_received_network_event(|ev| match ev { NetworkMessage::SendRequest { peer_id, @@ -224,28 +372,49 @@ impl TestRig { OldBlocksByRangeRequestV2 { start_slot, .. }, )), app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRange(id)), - } if filter_f(*peer_id, *start_slot) => Some((*id, *peer_id)), + } if request_filter.matches(*peer_id, *start_slot) => Some((*id, *peer_id)), _ => None, }) .unwrap_or_else(|e| { panic!("Should have a BlocksByRange request, filter {request_filter:?}: {e:?}") - }); + }) + } - let by_range_data_requests = if self.after_fulu() { - let mut data_columns_requests = vec![]; - while let Ok(data_columns_request) = self.pop_received_network_event(|ev| match ev { + fn find_data_columns_by_range_requests( + &mut self, + request_filter: RequestFilter, + ) -> Vec<( + DataColumnsByRangeRequestId, + PeerId, + DataColumnsByRangeRequest, + )> { + let mut data_columns_requests = vec![]; + while let Ok(data_columns_request) = + self.pop_received_network_event(|ev| match ev { NetworkMessage::SendRequest { peer_id, request: - RequestType::DataColumnsByRange(DataColumnsByRangeRequest { - start_slot, .. - }), + RequestType::DataColumnsByRange( + req @ DataColumnsByRangeRequest { start_slot, .. }, + ), app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), - } if filter_f(*peer_id, *start_slot) => Some((*id, *peer_id)), + } if request_filter.matches(*peer_id, *start_slot) => { + Some((*id, *peer_id, req.clone())) + } _ => None, - }) { - data_columns_requests.push(data_columns_request); - } + }) + { + data_columns_requests.push(data_columns_request); + } + data_columns_requests + } + + fn find_data_by_range_request( + &mut self, + request_filter: RequestFilter, + ) -> ByRangeDataRequestIds { + if self.after_fulu() { + let data_columns_requests = self.find_data_columns_by_range_requests(request_filter); if data_columns_requests.is_empty() { panic!("Found zero DataColumnsByRange requests, filter {request_filter:?}"); } @@ -257,7 +426,7 @@ impl TestRig { peer_id, request: RequestType::BlobsByRange(BlobsByRangeRequest { start_slot, .. }), app_request_id: AppRequestId::Sync(SyncRequestId::BlobsByRange(id)), - } if filter_f(*peer_id, *start_slot) => Some((*id, *peer_id)), + } if request_filter.matches(*peer_id, *start_slot) => Some((*id, *peer_id)), _ => None, }) .unwrap_or_else(|e| { @@ -266,35 +435,59 @@ impl TestRig { ByRangeDataRequestIds::PrePeerDAS(id, peer) } else { ByRangeDataRequestIds::PreDeneb - }; + } + } - (block_req, by_range_data_requests) + fn find_and_complete_block_components_by_range_request( + &mut self, + epoch: Epoch, + request_filter: RequestFilter, + complete_config: CompleteConfig, + ) -> RangeRequestId { + let id = + self.find_and_complete_blocks_by_range_request(epoch, request_filter, complete_config); + self.find_and_complete_data_by_range_request(request_filter, complete_config); + id } fn find_and_complete_blocks_by_range_request( &mut self, + epoch: Epoch, request_filter: RequestFilter, + complete_config: CompleteConfig, ) -> RangeRequestId { - let ((blocks_req_id, block_peer), by_range_data_request_ids) = - self.find_blocks_by_range_request(request_filter); + let (blocks_req_id, block_peer) = self.find_blocks_by_range_request(request_filter); - // Complete the request with a single stream termination - self.log(&format!( - "Completing BlocksByRange request {blocks_req_id:?} with empty stream" - )); - self.send_sync_message(SyncMessage::RpcBlock { - sync_request_id: SyncRequestId::BlocksByRange(blocks_req_id), - peer_id: block_peer, - beacon_block: None, - seen_timestamp: D, - }); + let block = self.zero_block_at_slot( + epoch.start_slot(E::slots_per_epoch()), + complete_config.with_data, + ); + let blocks = [block]; + self.send_blocks_by_range_response(blocks_req_id, block_peer, &blocks); + + blocks_req_id.parent_request_id.requester + } + + fn find_and_complete_data_by_range_request( + &mut self, + request_filter: RequestFilter, + complete_config: CompleteConfig, + ) { + let by_range_data_request_ids = self.find_data_by_range_request(request_filter); + self.complete_data_by_range_request(by_range_data_request_ids, complete_config); + } + fn complete_data_by_range_request( + &mut self, + by_range_data_request_ids: ByRangeDataRequestIds, + complete_config: CompleteConfig, + ) { match by_range_data_request_ids { ByRangeDataRequestIds::PreDeneb => {} ByRangeDataRequestIds::PrePeerDAS(id, peer_id) => { // Complete the request with a single stream termination self.log(&format!( - "Completing BlobsByRange request {id:?} with empty stream" + "Completing BlobsByRange request {id} with empty stream" )); self.send_sync_message(SyncMessage::RpcBlob { sync_request_id: SyncRequestId::BlobsByRange(id), @@ -305,21 +498,62 @@ impl TestRig { } ByRangeDataRequestIds::PostPeerDAS(data_column_req_ids) => { // Complete the request with a single stream termination - for (id, peer_id) in data_column_req_ids { - self.log(&format!( - "Completing DataColumnsByRange request {id:?} with empty stream" - )); - self.send_sync_message(SyncMessage::RpcDataColumn { - sync_request_id: SyncRequestId::DataColumnsByRange(id), - peer_id, - data_column: None, - seen_timestamp: D, - }); + for (id, peer_id, req) in data_column_req_ids { + // To reply with a valid DataColumnsByRange we need to construct + // DataColumnsByRange for the block root that we requested the block peer, plus + // figure out which exact columns we requested this peer + + let components_by_range_req_id = id.parent_request_id.parent_request_id; + let blocks = self.last_sent_blocks_by_range(components_by_range_req_id); + + let data_columns = blocks + .iter() + .flat_map(|block| { + let kzg_commitments_inclusion_proof = block + .message() + .body() + .kzg_commitments_merkle_proof() + .unwrap(); + let kzg_commitments = block + .message() + .body() + .blob_kzg_commitments() + .unwrap() + .clone(); + let signed_block_header = block.signed_block_header(); + + req.columns.iter().filter_map(move |index| { + // Skip column generation if index is marked as failure + if complete_config.custody_failure_at_index == Some(*index) { + return None; + } + + // We need to produce a DataColumn with valid inclusion proof, but can + // be with random KZG proof and data as we won't send it for processing + Some(Arc::new(DataColumnSidecar { + index: *index, + column: VariableList::empty(), + kzg_commitments: kzg_commitments.clone(), + kzg_proofs: VariableList::from(vec![]), + signed_block_header: signed_block_header.clone(), + kzg_commitments_inclusion_proof: + kzg_commitments_inclusion_proof.clone(), + })) + }) + }) + .collect::>(); + + // Need to log here because I can't capture &mut self inside the columns iter + if !blocks.is_empty() { + if let Some(index) = complete_config.custody_failure_at_index { + self.log(&format!("Forced custody failure at request {id} for peer {peer_id} index {index:?}")); + } + } + + self.send_data_columns_by_range_response(id, peer_id, &data_columns); } } } - - blocks_req_id.parent_request_id.requester } fn find_and_complete_processing_chain_segment(&mut self, id: ChainSegmentProcessId) { @@ -344,15 +578,19 @@ impl TestRig { &mut self, last_epoch: u64, request_filter: RequestFilter, + complete_config: CompleteConfig, ) { for epoch in 0..last_epoch { // Note: In this test we can't predict the block peer - let id = - self.find_and_complete_blocks_by_range_request(request_filter.clone().epoch(epoch)); + let id = self.find_and_complete_block_components_by_range_request( + Epoch::new(epoch), + request_filter.epoch(epoch), + complete_config, + ); if let RangeRequestId::RangeSync { batch_id, .. } = id { assert_eq!(batch_id.as_u64(), epoch, "Unexpected batch_id"); } else { - panic!("unexpected RangeRequestId {id:?}"); + panic!("unexpected RangeRequestId {id}"); } let id = match id { @@ -536,7 +774,12 @@ fn pause_and_resume_on_ee_offline() { // make the ee offline rig.update_execution_engine_state(EngineState::Offline); // send the response to the request - rig.find_and_complete_blocks_by_range_request(filter().peer(peer1).epoch(0)); + let epoch = Epoch::new(0); + rig.find_and_complete_block_components_by_range_request( + epoch, + filter().peer(peer1).epoch(0), + complete(), + ); // the beacon processor shouldn't have received any work rig.expect_empty_processor(); @@ -547,7 +790,7 @@ fn pause_and_resume_on_ee_offline() { // Don't filter requests and the columns requests may be sent to peer1 or peer2 // We need to filter by epoch, because the previous batch eagerly sent requests for the next // epoch for the other batch. So we can either filter by epoch of by sync type. - rig.find_and_complete_blocks_by_range_request(filter().epoch(0)); + rig.find_and_complete_block_components_by_range_request(epoch, filter().epoch(0), complete()); // the beacon processor shouldn't have received any work rig.expect_empty_processor(); // make the beacon processor available again. @@ -576,14 +819,11 @@ fn finalized_sync_enough_global_custody_peers_few_chain_peers() { // Current priorization only sends batches to idle peers, so we need enough peers for each batch // TODO: Test this with a single peer in the chain, it should still work - r.add_random_peers( - remote_info, - (advanced_epochs + EXTRA_SYNCED_EPOCHS) as usize, - ); + r.add_random_peers(remote_info, 1); r.assert_state(RangeSyncType::Finalized); let last_epoch = advanced_epochs + EXTRA_SYNCED_EPOCHS; - r.complete_and_process_range_sync_until(last_epoch, filter()); + r.complete_and_process_range_sync_until(last_epoch, filter(), complete()); } #[test] @@ -618,5 +858,44 @@ fn finalized_sync_not_enough_custody_peers_on_start() { ); let last_epoch = advanced_epochs + EXTRA_SYNCED_EPOCHS; - r.complete_and_process_range_sync_until(last_epoch, filter()); + r.complete_and_process_range_sync_until(last_epoch, filter(), complete()); +} + +#[test] +fn finalized_sync_single_custody_peer_failure() { + let mut r = TestRig::test_setup(); + // Only run post-PeerDAS + if !r.fork_name.fulu_enabled() { + return; + } + + let advanced_epochs: u64 = 2; + let remote_info = r.finalized_remote_info_advanced_by(advanced_epochs.into()); + + // Unikely that the single peer we added has enough columns for us. Tests are determinstic and + // this error should never be hit + r.add_random_peer(remote_info.clone()); + r.assert_state(RangeSyncType::Finalized); + + // send the response to the request + // Don't filter requests and the columns requests may be sent to peer1 or peer2 + // We need to filter by epoch, because the previous batch eagerly sent requests for the next + // epoch for the other batch. So we can either filter by epoch of by sync type. + let column_index_to_fail = r.our_custody_indices().first().copied().unwrap(); + r.find_and_complete_block_components_by_range_request( + Epoch::new(0), + filter().epoch(0), + complete().custody_failure_at_index(column_index_to_fail), + ); + + // Some peer had a costudy failure at `column_index` so sync should do a single extra request + // for that index and epoch. + let reqs = r.find_data_by_range_request(filter().epoch(0).column_index(column_index_to_fail)); + // Find the requests first to assert that this is the only request that exists + r.expect_no_columns_by_range_requests(filter().epoch(0)); + // complete this one request without the custody failure now + r.complete_data_by_range_request(reqs, complete()); + + // TODO(das): send batch 1 for completing processing and check that SyncingChain processed batch + // 1 successfully } diff --git a/consensus/types/src/signed_beacon_block.rs b/consensus/types/src/signed_beacon_block.rs index eb5925a29b5..5670c789ab0 100644 --- a/consensus/types/src/signed_beacon_block.rs +++ b/consensus/types/src/signed_beacon_block.rs @@ -318,6 +318,10 @@ impl> SignedBeaconBlock .unwrap_or(0) } + pub fn has_data(&self) -> bool { + self.num_expected_blobs() > 0 + } + /// Used for displaying commitments in logs. pub fn commitments_formatted(&self) -> String { let Ok(commitments) = self.message().body().blob_kzg_commitments() else { From 0e43d3bbd05b6347d5e7bf0725318707643888d1 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 8 May 2025 21:29:05 -0300 Subject: [PATCH 06/11] Improve error managment of nested requests --- beacon_node/network/src/sync/manager.rs | 3 +- .../network/src/sync/network_context.rs | 67 +----- .../block_components_by_range.rs | 57 +++-- .../src/sync/network_context/custody.rs | 91 +++++--- .../sync/network_context/custody_by_range.rs | 162 ++++++++++---- .../src/sync/network_context/requests.rs | 2 +- beacon_node/network/src/sync/tests/range.rs | 202 ++++++++++++------ 7 files changed, 369 insertions(+), 215 deletions(-) diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index f8a6c748d18..397bffa2ad9 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -1209,7 +1209,8 @@ impl SyncManager { // not have to pass a PeerGroup in case of error let peers = match &result { Ok((_, peers, _)) => peers.clone(), - Err(_) => todo!(), + // TODO(das): this PeerGroup with a single peer is incorrect + Err(_) => PeerGroup::from_single(peer_id), }; self.on_block_components_by_range_response( diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 359db74566b..28d5de2135c 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -1,10 +1,8 @@ //! Provides network functionality for the Syncing thread. This fundamentally wraps a network //! channel and stores a global RPC ID to perform requests. -use self::custody::{ActiveCustodyByRootRequest, Error as CustodyRequestError}; -use self::custody_by_range::{ - ActiveCustodyByRangeRequest, CustodyByRangeRequestResult, Error as CustodyByRangeError, -}; +use self::custody::ActiveCustodyByRootRequest; +use self::custody_by_range::{ActiveCustodyByRangeRequest, CustodyByRangeRequestResult}; pub use self::requests::{BlocksByRootSingleRequest, DataColumnsByRootSingleBlockRequest}; use super::manager::BlockProcessType; use super::range_sync::BatchPeerGroup; @@ -74,14 +72,11 @@ pub type RpcResponseResult = Result<(T, Duration), RpcResponseError>; pub type CustodyByRootResult = Result<(DataColumnSidecarList, PeerGroup, Duration), RpcResponseError>; -#[derive(Debug)] +#[derive(Debug, Clone)] pub enum RpcResponseError { RpcError(#[allow(dead_code)] RPCError), VerifyError(LookupVerifyError), - // TODO(das): find a way to collapse this errors into a more managable enum - CustodyRequestError(#[allow(dead_code)] CustodyRequestError), - CustodyByRangeError(#[allow(dead_code)] CustodyByRangeError), - BlockComponentsByRangeError(#[allow(dead_code)] String), + InternalError(#[allow(dead_code)] String), } #[derive(Debug, PartialEq, Eq)] @@ -486,18 +481,18 @@ impl SyncNetworkContext { RangeBlockComponent::Block(req_id, resp, peer_id) => resp.and_then(|(blocks, _)| { request .on_blocks_by_range_result(req_id, blocks, peer_id, self) - .map_err(RpcResponseError::BlockComponentsByRangeError) + .map_err(Into::::into) }), RangeBlockComponent::Blob(req_id, resp, peer_id) => resp.and_then(|(blobs, _)| { request .on_blobs_by_range_result(req_id, blobs, peer_id, self) - .map_err(RpcResponseError::BlockComponentsByRangeError) + .map_err(Into::::into) }), RangeBlockComponent::CustodyColumns(req_id, resp, peers) => { resp.and_then(|(custody_columns, _)| { request .on_custody_by_range_result(req_id, custody_columns, peers, self) - .map_err(RpcResponseError::BlockComponentsByRangeError) + .map_err(Into::::into) }) } }; @@ -845,25 +840,7 @@ impl SyncNetworkContext { self.custody_by_root_requests.insert(requester, request); Ok(LookupRequestResult::RequestSent(id.req_id)) } - Err(e) => Err(match e { - CustodyRequestError::NoPeer(column_index) => { - RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer(column_index)) - } - // - TooManyFailures: Should never happen, `request` has just been created, it's - // count of download_failures is 0 here - // - BadState: Should never happen, a bad state can only happen when handling a - // network response - // - UnexpectedRequestId: Never happens: this Err is only constructed handling a - // download or processing response - // - SendFailed: Should never happen unless in a bad drop sequence when shutting - // down the node - e @ (CustodyRequestError::TooManyFailures - | CustodyRequestError::BadState { .. } - | CustodyRequestError::UnexpectedRequestId { .. } - | CustodyRequestError::SendFailed { .. }) => { - RpcRequestSendError::InternalError(format!("{e:?}")) - } - }), + Err(e) => Err(e.into()), } } @@ -1026,25 +1003,7 @@ impl SyncNetworkContext { self.custody_by_range_requests.insert(id, request); Ok(id) } - Err(e) => Err(match e { - CustodyByRangeError::NoPeer(column_index) => { - RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer(column_index)) - } - // - TooManyFailures: Should never happen, `request` has just been created, it's - // count of download_failures is 0 here - // - BadState: Should never happen, a bad state can only happen when handling a - // network response - // - UnexpectedRequestId: Never happens: this Err is only constructed handling a - // download or processing response - // - SendFailed: Should never happen unless in a bad drop sequence when shutting - // down the node - e @ (CustodyByRangeError::TooManyFailures - | CustodyByRangeError::BadState { .. } - | CustodyByRangeError::UnexpectedRequestId { .. } - | CustodyByRangeError::SendFailed { .. }) => { - RpcRequestSendError::InternalError(format!("{e:?}")) - } - }), + Err(e) => Err(e.into()), } } @@ -1362,9 +1321,7 @@ impl SyncNetworkContext { ); let _enter = span.enter(); - let result = result - .map_err(RpcResponseError::CustodyRequestError) - .transpose(); + let result = result.map_err(Into::::into).transpose(); // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to // an Option first to use in an `if let Some() { act on result }` block. @@ -1416,9 +1373,7 @@ impl SyncNetworkContext { request: ActiveCustodyByRangeRequest, result: CustodyByRangeRequestResult, ) -> Option> { - let result = result - .map_err(RpcResponseError::CustodyByRangeError) - .transpose(); + let result = result.map_err(Into::::into).transpose(); // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to // an Option first to use in an `if let Some() { act on result }` block. diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index 47cc6ee0f73..80849b6f029 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -1,5 +1,5 @@ use crate::sync::network_context::{ - NoPeerError, PeerGroup, RpcRequestSendError, SyncNetworkContext, + NoPeerError, PeerGroup, RpcRequestSendError, RpcResponseError, SyncNetworkContext, }; use crate::sync::range_sync::BatchPeerGroup; use beacon_chain::block_verification_types::RpcBlock; @@ -62,7 +62,19 @@ enum ByRangeRequest { } pub type BlockComponentsByRangeRequestResult = - Result>, BatchPeerGroup)>, String>; + Result>, BatchPeerGroup)>, Error>; + +pub enum Error { + InternalError(String), +} + +impl From for RpcResponseError { + fn from(e: Error) -> Self { + match e { + Error::InternalError(e) => RpcResponseError::InternalError(e), + } + } +} impl BlockComponentsByRangeRequest { pub fn new( @@ -238,9 +250,11 @@ impl BlockComponentsByRangeRequest { ) .map_err(|e| match e { RpcRequestSendError::NoPeer(e) => todo!("what to do? {e:?}"), - RpcRequestSendError::InternalError(e) => format!( - "InternalError sending custody_by_range request: {e}" - ), + RpcRequestSendError::InternalError(e) => { + Error::InternalError(format!( + "InternalError sending custody_by_range request: {e}", + )) + } })?; *state = FuluEnabledState::CustodyRequest { @@ -310,9 +324,9 @@ impl BlockComponentsByRangeRequest { blocks_by_range_request.finish(id, data, peer_id)?; } State::FuluEnabled(FuluEnabledState::CustodyRequest { .. }) => { - return Err( + return Err(Error::InternalError( "Received blocks_by_range response expecting custody_by_range".to_string(), - ) + )) } } @@ -328,7 +342,9 @@ impl BlockComponentsByRangeRequest { ) -> BlockComponentsByRangeRequestResult { match &mut self.state { State::Base { .. } => { - return Err("Received blobs_by_range response before Deneb".to_string()) + return Err(Error::InternalError( + "Received blobs_by_range response before Deneb".to_string(), + )) } State::DenebEnabled { blobs_by_range_request, @@ -337,7 +353,9 @@ impl BlockComponentsByRangeRequest { blobs_by_range_request.finish(id, data, peer_id)?; } State::FuluEnabled(_) => { - return Err("Received blobs_by_range response after PeerDAS".to_string()) + return Err(Error::InternalError( + "Received blobs_by_range response after PeerDAS".to_string(), + )) } } @@ -353,11 +371,15 @@ impl BlockComponentsByRangeRequest { ) -> BlockComponentsByRangeRequestResult { match &mut self.state { State::Base { .. } | State::DenebEnabled { .. } => { - return Err("Received custody_by_range response before PeerDAS".to_string()) + return Err(Error::InternalError( + "Received custody_by_range response before PeerDAS".to_string(), + )) } State::FuluEnabled(state) => match state { FuluEnabledState::BlockRequest { .. } => { - return Err("Received custody_by_range expecting blocks_by_range".to_string()); + return Err(Error::InternalError( + "Received custody_by_range expecting blocks_by_range".to_string(), + )); } FuluEnabledState::CustodyRequest { custody_by_range_request, @@ -388,7 +410,7 @@ fn couple_blocks_fulu( data_columns: Vec>>, custody_column_indices: Vec, spec: &ChainSpec, -) -> Result>, String> { +) -> Result>, Error> { // Group data columns by block_root and index let mut custody_columns_by_block = HashMap::>>::new(); @@ -425,21 +447,26 @@ fn couple_blocks_fulu( custody_column_indices.clone(), spec, ) + .map_err(Error::InternalError) }) .collect::, _>>() } impl ByRangeRequest { - fn finish(&mut self, id: I, data: T, peer_id: P) -> Result<(), String> { + fn finish(&mut self, id: I, data: T, peer_id: P) -> Result<(), Error> { match self { Self::Active(expected_id) => { if expected_id != &id { - return Err(format!("unexpected req_id expected {expected_id} got {id}")); + return Err(Error::InternalError(format!( + "unexpected req_id expected {expected_id} got {id}" + ))); } *self = Self::Complete(data, peer_id); Ok(()) } - Self::Complete(_, _) => Err("request already complete".to_owned()), + Self::Complete(_, _) => Err(Error::InternalError(format!( + "request already complete {id}" + ))), } } diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody.rs index 44e80e3766d..f46b354313f 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody.rs @@ -1,5 +1,6 @@ use crate::sync::network_context::{ - DataColumnsByRootRequestId, DataColumnsByRootSingleBlockRequest, + DataColumnsByRootRequestId, DataColumnsByRootSingleBlockRequest, NoPeerError, + RpcRequestSendError, RpcResponseError, }; use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::BeaconChainTypes; @@ -40,19 +41,35 @@ pub struct ActiveCustodyByRootRequest { _phantom: PhantomData, } -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug)] pub enum Error { - SendFailed(&'static str), - TooManyFailures, - BadState(String), NoPeer(ColumnIndex), - /// Received a download result for a different request id than the in-flight request. - /// There should only exist a single request at a time. Having multiple requests is a bug and - /// can result in undefined state, so it's treated as a hard error and the lookup is dropped. - UnexpectedRequestId { - expected_req_id: DataColumnsByRootRequestId, - req_id: DataColumnsByRootRequestId, - }, + TooManyDownloadErrors(RpcResponseError), + InternalError(String), +} + +impl From for RpcResponseError { + fn from(e: Error) -> Self { + match e { + Error::InternalError(e) => RpcResponseError::InternalError(e), + Error::TooManyDownloadErrors(e) => e, + Error::NoPeer(_index) => todo!(), + } + } +} + +impl From for RpcRequestSendError { + fn from(e: Error) -> Self { + match e { + Error::NoPeer(column_index) => { + RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer(column_index)) + } + Error::TooManyDownloadErrors(_) => { + RpcRequestSendError::InternalError("Download error in request send".to_string()) + } + Error::InternalError(e) => RpcRequestSendError::InternalError(e), + } + } } struct ActiveBatchColumnsRequest { @@ -131,7 +148,7 @@ impl ActiveCustodyByRootRequest { let column_request = self .column_requests .get_mut(column_index) - .ok_or(Error::BadState("unknown column_index".to_owned()))?; + .ok_or(Error::InternalError("unknown column_index".to_owned()))?; if let Some(data_column) = data_columns.remove(column_index) { column_request.on_download_success( @@ -182,8 +199,8 @@ impl ActiveCustodyByRootRequest { for column_index in &batch_request.indices { self.column_requests .get_mut(column_index) - .ok_or(Error::BadState("unknown column_index".to_owned()))? - .on_download_error_and_mark_failure(req_id)?; + .ok_or(Error::InternalError("unknown column_index".to_owned()))? + .on_download_error_and_mark_failure(req_id, err.clone())?; } self.failed_peers.insert(peer_id); @@ -230,8 +247,12 @@ impl ActiveCustodyByRootRequest { for (column_index, request) in self.column_requests.iter_mut() { if let Some(wait_duration) = request.is_awaiting_download() { - if request.download_failures > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { - return Err(Error::TooManyFailures); + if request.download_failures.len() > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { + let last_error = request + .download_failures + .last() + .expect("download_failures is not empty"); + return Err(Error::TooManyDownloadErrors(last_error.clone())); } // TODO(das): When is a fork and only a subset of your peers know about a block, we should @@ -297,7 +318,9 @@ impl ActiveCustodyByRootRequest { // columns. For the rest of peers, don't downscore if columns are missing. lookup_peers.contains(&peer_id), ) - .map_err(Error::SendFailed)?; + .map_err(|e| { + Error::InternalError(format!("Send failed data_columns_by_root {e:?}")) + })?; match request_result { LookupRequestResult::RequestSent(req_id) => { @@ -306,7 +329,7 @@ impl ActiveCustodyByRootRequest { .column_requests .get_mut(column_index) // Should never happen: column_index is iterated from column_requests - .ok_or(Error::BadState("unknown column_index".to_owned()))?; + .ok_or(Error::InternalError("unknown column_index".to_owned()))?; column_request.on_download_start(req_id)?; } @@ -328,7 +351,7 @@ const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; struct ColumnRequest { status: Status, - download_failures: usize, + download_failures: Vec, } #[derive(Debug, Clone)] @@ -342,7 +365,7 @@ impl ColumnRequest { fn new() -> Self { Self { status: Status::NotStarted(Instant::now()), - download_failures: 0, + download_failures: vec![], } } @@ -366,7 +389,7 @@ impl ColumnRequest { self.status = Status::Downloading(req_id); Ok(()) } - other => Err(Error::BadState(format!( + other => Err(Error::InternalError(format!( "bad state on_download_start expected NotStarted got {other:?}" ))), } @@ -376,15 +399,14 @@ impl ColumnRequest { match &self.status { Status::Downloading(expected_req_id) => { if req_id != *expected_req_id { - return Err(Error::UnexpectedRequestId { - expected_req_id: *expected_req_id, - req_id, - }); + return Err(Error::InternalError(format!( + "Received download result for req_id {req_id} expecting {expected_req_id}" + ))); } self.status = Status::NotStarted(Instant::now()); Ok(()) } - other => Err(Error::BadState(format!( + other => Err(Error::InternalError(format!( "bad state on_download_error expected Downloading got {other:?}" ))), } @@ -393,9 +415,9 @@ impl ColumnRequest { fn on_download_error_and_mark_failure( &mut self, req_id: DataColumnsByRootRequestId, + e: RpcResponseError, ) -> Result<(), Error> { - // TODO(das): Should track which peers don't have data - self.download_failures += 1; + self.download_failures.push(e); self.on_download_error(req_id) } @@ -409,15 +431,14 @@ impl ColumnRequest { match &self.status { Status::Downloading(expected_req_id) => { if req_id != *expected_req_id { - return Err(Error::UnexpectedRequestId { - expected_req_id: *expected_req_id, - req_id, - }); + return Err(Error::InternalError(format!( + "Received download result for req_id {req_id} expecting {expected_req_id}" + ))); } self.status = Status::Downloaded(peer_id, data_column, seen_timestamp); Ok(()) } - other => Err(Error::BadState(format!( + other => Err(Error::InternalError(format!( "bad state on_download_success expected Downloading got {other:?}" ))), } @@ -428,7 +449,7 @@ impl ColumnRequest { Status::Downloaded(peer_id, data_column, seen_timestamp) => { Ok((peer_id, data_column, seen_timestamp)) } - other => Err(Error::BadState(format!( + other => Err(Error::InternalError(format!( "bad state complete expected Downloaded got {other:?}" ))), } diff --git a/beacon_node/network/src/sync/network_context/custody_by_range.rs b/beacon_node/network/src/sync/network_context/custody_by_range.rs index cc0699ce7bf..2d67b350b27 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_range.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_range.rs @@ -1,3 +1,4 @@ +use crate::sync::network_context::{NoPeerError, RpcRequestSendError, RpcResponseError}; use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::BeaconChainTypes; use fnv::FnvHashMap; @@ -5,7 +6,7 @@ use lighthouse_network::rpc::methods::DataColumnsByRangeRequest; use lighthouse_network::service::api_types::{ CustodyByRangeRequestId, DataColumnsByRangeRequestId, }; -use lighthouse_network::PeerId; +use lighthouse_network::{PeerAction, PeerId}; use lru_cache::LRUTimeCache; use parking_lot::RwLock; use rand::Rng; @@ -20,7 +21,7 @@ use types::{ use super::{PeerGroup, RpcResponseResult, SyncNetworkContext}; -const FAILED_PEERS_CACHE_EXPIRY_SECONDS: u64 = 5; +const TEMPORARY_FAULT_EXPIRY_SECONDS: u64 = 15; const MAX_STALE_NO_PEERS_DURATION: Duration = Duration::from_secs(30); type DataColumnSidecarList = Vec>>; @@ -38,26 +39,48 @@ pub struct ActiveCustodyByRangeRequest { FnvHashMap, /// Peers that have recently failed to successfully respond to a columns by root request. /// Having a LRUTimeCache allows this request to not have to track disconnecting peers. - failed_peers: LRUTimeCache, + peers_with_custody_failures: LRUTimeCache, + peers_with_temporary_faults: LRUTimeCache, + // TODO(das): does this HashSet has an OOM risk? We should either: make sure that this request + // structs are dropped after some time, that disconnected peers are pruned (but we may want to + // retain faulty information if they just disconnect and reconnect) or make this an LRUTimeCache + // with a long time (like 5 minutes). + peers_with_permanent_faults: HashSet, /// Set of peers that claim to have imported this block and their custody columns lookup_peers: Arc>>, _phantom: PhantomData, } -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug)] pub enum Error { - SendFailed(&'static str), - TooManyFailures, - BadState(String), + InternalError(String), + TooManyDownloadErrors(RpcResponseError), NoPeer(ColumnIndex), - /// Received a download result for a different request id than the in-flight request. - /// There should only exist a single request at a time. Having multiple requests is a bug and - /// can result in undefined state, so it's treated as a hard error and the batch is dropped. - UnexpectedRequestId { - expected_req_id: DataColumnsByRangeRequestId, - req_id: DataColumnsByRangeRequestId, - }, +} + +impl From for RpcResponseError { + fn from(e: Error) -> Self { + match e { + Error::InternalError(e) => RpcResponseError::InternalError(e), + Error::TooManyDownloadErrors(e) => e, + Error::NoPeer(_index) => todo!(), + } + } +} + +impl From for RpcRequestSendError { + fn from(e: Error) -> Self { + match e { + Error::NoPeer(column_index) => { + RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer(column_index)) + } + Error::TooManyDownloadErrors(_) => { + RpcRequestSendError::InternalError("Download error in request send".to_string()) + } + Error::InternalError(e) => RpcRequestSendError::InternalError(e), + } + } } struct ActiveBatchColumnsRequest { @@ -94,7 +117,13 @@ impl ActiveCustodyByRangeRequest { .map(|index| (*index, ColumnRequest::new())), ), active_batch_columns_requests: <_>::default(), - failed_peers: LRUTimeCache::new(Duration::from_secs(FAILED_PEERS_CACHE_EXPIRY_SECONDS)), + peers_with_custody_failures: LRUTimeCache::new(Duration::from_secs( + TEMPORARY_FAULT_EXPIRY_SECONDS, + )), + peers_with_temporary_faults: LRUTimeCache::new(Duration::from_secs( + TEMPORARY_FAULT_EXPIRY_SECONDS, + )), + peers_with_permanent_faults: HashSet::new(), lookup_peers, _phantom: PhantomData, } @@ -142,10 +171,12 @@ impl ActiveCustodyByRangeRequest { let mut imported_column_indices = vec![]; for index in &batch_request.indices { - let column_request = self - .column_requests - .get_mut(index) - .ok_or(Error::BadState(format!("unknown column_index {index}")))?; + let column_request = + self.column_requests + .get_mut(index) + .ok_or(Error::InternalError(format!( + "unknown column_index {index}" + )))?; let columns_at_index = self .blocks_with_data @@ -246,7 +277,16 @@ impl ActiveCustodyByRangeRequest { "Custody by range peer returned non-matching columns" ); - self.failed_peers.insert(peer_id); + // Returning a non-canonical column is not a permanent fault. We should not + // retry the peer for some time but the peer may return a canonical column in + // the future. + // TODO(das): if this finalized sync the fault is permanent + self.peers_with_temporary_faults.insert(peer_id); + cx.report_peer( + peer_id, + PeerAction::MidToleranceError, + "non-matching data column", + ); } if !missing_column_indexes.is_empty() { @@ -260,7 +300,13 @@ impl ActiveCustodyByRangeRequest { "Custody by range peer claims to not have some data" ); - self.failed_peers.insert(peer_id); + // Not having columns is not a permanent fault. The peer may be backfilling. + self.peers_with_custody_failures.insert(peer_id); + cx.report_peer( + peer_id, + PeerAction::MidToleranceError, + "data column custody failure", + ); } } Err(err) => { @@ -276,11 +322,24 @@ impl ActiveCustodyByRangeRequest { for column_index in &batch_request.indices { self.column_requests .get_mut(column_index) - .ok_or(Error::BadState("unknown column_index".to_owned()))? - .on_download_error_and_mark_failure(req_id)?; + .ok_or(Error::InternalError("unknown column_index".to_owned()))? + .on_download_error_and_mark_failure(req_id, err.clone())?; } - self.failed_peers.insert(peer_id); + match err { + // Verify errors are correctness errors against our request or about the + // returned data itself. This peer is faulty or malicious, should not be + // retried. + RpcResponseError::VerifyError(_) => { + self.peers_with_permanent_faults.insert(peer_id); + } + // Network errors are not permanent faults and worth retrying + RpcResponseError::RpcError(_) => { + self.peers_with_temporary_faults.insert(peer_id); + } + // Do nothing for internal errors + RpcResponseError::InternalError(_) => {} + } } }; @@ -334,8 +393,12 @@ impl ActiveCustodyByRangeRequest { for (column_index, request) in self.column_requests.iter_mut() { if let Some(wait_duration) = request.is_awaiting_download() { - if request.download_failures > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { - return Err(Error::TooManyFailures); + if request.download_failures.len() > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { + let last_error = request + .download_failures + .last() + .expect("download_failures is not empty"); + return Err(Error::TooManyDownloadErrors(last_error.clone())); } // TODO(das): When is a fork and only a subset of your peers know about a block, we should @@ -349,13 +412,19 @@ impl ActiveCustodyByRangeRequest { // custody peers on a given column let mut priorized_peers = custodial_peers .iter() + .filter(|peer| { + // Never request again peers with permanent faults + // Do not request peers with custody failures for some time + !self.peers_with_permanent_faults.contains(peer) + && !self.peers_with_custody_failures.contains(peer) + }) .map(|peer| { ( // Prioritize peers that claim to know have imported this block if lookup_peers.contains(peer) { 0 } else { 1 }, // De-prioritize peers that have failed to successfully respond to - // requests recently - self.failed_peers.contains(peer), + // requests recently, but allow to immediatelly request them again + self.peers_with_temporary_faults.contains(peer), // Prefer peers with fewer requests to load balance across peers. // We batch requests to the same peer, so count existence in the // `columns_to_request_by_peer` as a single 1 request. @@ -381,6 +450,7 @@ impl ActiveCustodyByRangeRequest { // descendants of this un-available lookup. return Err(Error::NoPeer(*column_index)); } else { + return Err(Error::NoPeer(*column_index)); // Do not issue requests if there is no custody peer on this column } } @@ -401,14 +471,16 @@ impl ActiveCustodyByRangeRequest { }, self.id, ) - .map_err(Error::SendFailed)?; + .map_err(|e| Error::InternalError(format!("send failed {e}")))?; for column_index in &indices { let column_request = self .column_requests .get_mut(column_index) // Should never happen: column_index is iterated from column_requests - .ok_or(Error::BadState("unknown column_index".to_owned()))?; + .ok_or(Error::InternalError(format!( + "Unknown column_request {column_index}" + )))?; column_request.on_download_start(req_id)?; } @@ -426,7 +498,7 @@ const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; struct ColumnRequest { status: Status, - download_failures: usize, + download_failures: Vec, } #[derive(Debug, Clone)] @@ -440,7 +512,7 @@ impl ColumnRequest { fn new() -> Self { Self { status: Status::NotStarted(Instant::now()), - download_failures: 0, + download_failures: vec![], } } @@ -464,7 +536,7 @@ impl ColumnRequest { self.status = Status::Downloading(req_id); Ok(()) } - other => Err(Error::BadState(format!( + other => Err(Error::InternalError(format!( "bad state on_download_start expected NotStarted got {other:?}" ))), } @@ -474,15 +546,14 @@ impl ColumnRequest { match &self.status { Status::Downloading(expected_req_id) => { if req_id != *expected_req_id { - return Err(Error::UnexpectedRequestId { - expected_req_id: *expected_req_id, - req_id, - }); + return Err(Error::InternalError(format!( + "Received download result for req_id {req_id} expecting {expected_req_id}" + ))); } self.status = Status::NotStarted(Instant::now()); Ok(()) } - other => Err(Error::BadState(format!( + other => Err(Error::InternalError(format!( "bad state on_download_error expected Downloading got {other:?}" ))), } @@ -491,9 +562,9 @@ impl ColumnRequest { fn on_download_error_and_mark_failure( &mut self, req_id: DataColumnsByRangeRequestId, + e: RpcResponseError, ) -> Result<(), Error> { - // TODO(das): Should track which peers don't have data - self.download_failures += 1; + self.download_failures.push(e); self.on_download_error(req_id) } @@ -507,15 +578,14 @@ impl ColumnRequest { match &self.status { Status::Downloading(expected_req_id) => { if req_id != *expected_req_id { - return Err(Error::UnexpectedRequestId { - expected_req_id: *expected_req_id, - req_id, - }); + return Err(Error::InternalError(format!( + "Received download result for req_id {req_id} expecting {expected_req_id}" + ))); } self.status = Status::Downloaded(peer_id, data_column, seen_timestamp); Ok(()) } - other => Err(Error::BadState(format!( + other => Err(Error::InternalError(format!( "bad state on_download_success expected Downloading got {other:?}" ))), } @@ -526,7 +596,7 @@ impl ColumnRequest { Status::Downloaded(peer_id, data_column, seen_timestamp) => { Ok((peer_id, data_column, seen_timestamp)) } - other => Err(Error::BadState(format!( + other => Err(Error::InternalError(format!( "bad state complete expected Downloaded got {other:?}" ))), } diff --git a/beacon_node/network/src/sync/network_context/requests.rs b/beacon_node/network/src/sync/network_context/requests.rs index 963b633ed6d..7635717adce 100644 --- a/beacon_node/network/src/sync/network_context/requests.rs +++ b/beacon_node/network/src/sync/network_context/requests.rs @@ -26,7 +26,7 @@ mod blocks_by_root; mod data_columns_by_range; mod data_columns_by_root; -#[derive(Debug, PartialEq, Eq, IntoStaticStr)] +#[derive(Debug, Clone, PartialEq, Eq, IntoStaticStr)] pub enum LookupVerifyError { NotEnoughResponsesReturned { actual: usize }, TooManyResponses, diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 85fe34cd724..31c0d30a6ad 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -11,7 +11,6 @@ use beacon_chain::{block_verification_types::RpcBlock, EngineState, NotifyExecut use beacon_processor::WorkType; use lighthouse_network::rpc::methods::{ BlobsByRangeRequest, DataColumnsByRangeRequest, OldBlocksByRangeRequest, - OldBlocksByRangeRequestV2, }; use lighthouse_network::rpc::{RequestType, StatusMessage}; use lighthouse_network::service::api_types::{ @@ -19,6 +18,7 @@ use lighthouse_network::service::api_types::{ DataColumnsByRangeRequestId, SyncRequestId, }; use lighthouse_network::{PeerId, SyncInfo}; +use std::collections::HashSet; use std::time::Duration; use types::{ BeaconBlock, BlobSidecarList, BlockImportSource, ColumnIndex, DataColumnSidecar, Epoch, @@ -35,7 +35,7 @@ pub(crate) enum DataSidecars { enum ByRangeDataRequestIds { PreDeneb, - PrePeerDAS(BlobsByRangeRequestId, PeerId), + PrePeerDAS(BlobsByRangeRequestId, PeerId, BlobsByRangeRequest), PostPeerDAS( Vec<( DataColumnsByRangeRequestId, @@ -45,6 +45,21 @@ enum ByRangeDataRequestIds { ), } +impl ByRangeDataRequestIds { + fn peer(&self) -> PeerId { + match self { + Self::PreDeneb => panic!("no requests PreDeneb"), + Self::PrePeerDAS(_, peer, _) => *peer, + Self::PostPeerDAS(reqs) => { + if reqs.len() != 1 { + panic!("Should have 1 PostPeerDAS request"); + } + reqs.first().expect("no PostPeerDAS requests").1 + } + } + } +} + /// Sync tests are usually written in the form: /// - Do some action /// - Expect a request to be sent @@ -76,7 +91,28 @@ impl RequestFilter { self } - fn matches(&self, peer: PeerId, start_slot: u64) -> bool { + fn matches_blocks_by_range(&self, peer: &PeerId, req: &OldBlocksByRangeRequest) -> bool { + self.matches_common(peer, *req.start_slot()) + } + + fn matches_blobs_by_range(&self, peer: &PeerId, req: &BlobsByRangeRequest) -> bool { + self.matches_common(peer, req.start_slot) + } + + fn matches_data_columns_by_range( + &self, + peer: &PeerId, + req: &DataColumnsByRangeRequest, + ) -> bool { + if let Some(index) = self.column_index { + if !req.columns.contains(&index) { + return false; + } + } + self.matches_common(peer, req.start_slot) + } + + fn matches_common(&self, peer: &PeerId, start_slot: u64) -> bool { if let Some(expected_epoch) = self.epoch { let epoch = Slot::new(start_slot).epoch(E::slots_per_epoch()).as_u64(); if epoch != expected_epoch { @@ -84,7 +120,7 @@ impl RequestFilter { } } if let Some(expected_peer) = self.peer { - if peer != expected_peer { + if *peer != expected_peer { return false; } } @@ -99,6 +135,7 @@ fn filter() -> RequestFilter { /// Instruct the testing rig how to complete requests for _by_range requests #[derive(Debug, Clone, Copy)] struct CompleteConfig { + block_count: usize, with_data: bool, custody_failure_at_index: Option, } @@ -114,6 +151,7 @@ impl CompleteConfig { fn complete() -> CompleteConfig { CompleteConfig { + block_count: 1, with_data: true, custody_failure_at_index: None, } @@ -362,22 +400,20 @@ impl TestRig { fn find_blocks_by_range_request( &mut self, request_filter: RequestFilter, - ) -> (BlocksByRangeRequestId, PeerId) { - self - .pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id, - request: - RequestType::BlocksByRange(OldBlocksByRangeRequest::V2( - OldBlocksByRangeRequestV2 { start_slot, .. }, - )), - app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRange(id)), - } if request_filter.matches(*peer_id, *start_slot) => Some((*id, *peer_id)), - _ => None, - }) - .unwrap_or_else(|e| { - panic!("Should have a BlocksByRange request, filter {request_filter:?}: {e:?}") - }) + ) -> (BlocksByRangeRequestId, PeerId, OldBlocksByRangeRequest) { + self.pop_received_network_event(|ev| match ev { + NetworkMessage::SendRequest { + peer_id, + request: RequestType::BlocksByRange(req), + app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRange(id)), + } if request_filter.matches_blocks_by_range(peer_id, req) => { + Some((*id, *peer_id, req.clone())) + } + _ => None, + }) + .unwrap_or_else(|e| { + panic!("Should have a BlocksByRange request, filter {request_filter:?}: {e:?}") + }) } fn find_data_columns_by_range_requests( @@ -389,21 +425,16 @@ impl TestRig { DataColumnsByRangeRequest, )> { let mut data_columns_requests = vec![]; - while let Ok(data_columns_request) = - self.pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id, - request: - RequestType::DataColumnsByRange( - req @ DataColumnsByRangeRequest { start_slot, .. }, - ), - app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), - } if request_filter.matches(*peer_id, *start_slot) => { - Some((*id, *peer_id, req.clone())) - } - _ => None, - }) - { + while let Ok(data_columns_request) = self.pop_received_network_event(|ev| match ev { + NetworkMessage::SendRequest { + peer_id, + request: RequestType::DataColumnsByRange(req), + app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), + } if request_filter.matches_data_columns_by_range(peer_id, req) => { + Some((*id, *peer_id, req.clone())) + } + _ => None, + }) { data_columns_requests.push(data_columns_request); } data_columns_requests @@ -420,19 +451,21 @@ impl TestRig { } ByRangeDataRequestIds::PostPeerDAS(data_columns_requests) } else if self.after_deneb() { - let (id, peer) = self + let (id, peer, req) = self .pop_received_network_event(|ev| match ev { NetworkMessage::SendRequest { peer_id, - request: RequestType::BlobsByRange(BlobsByRangeRequest { start_slot, .. }), + request: RequestType::BlobsByRange(req), app_request_id: AppRequestId::Sync(SyncRequestId::BlobsByRange(id)), - } if request_filter.matches(*peer_id, *start_slot) => Some((*id, *peer_id)), + } if request_filter.matches_blobs_by_range(peer_id, req) => { + Some((*id, *peer_id, req.clone())) + } _ => None, }) .unwrap_or_else(|e| { panic!("Should have a blobs by range request, filter {request_filter:?}: {e:?}") }); - ByRangeDataRequestIds::PrePeerDAS(id, peer) + ByRangeDataRequestIds::PrePeerDAS(id, peer, req) } else { ByRangeDataRequestIds::PreDeneb } @@ -440,29 +473,28 @@ impl TestRig { fn find_and_complete_block_components_by_range_request( &mut self, - epoch: Epoch, request_filter: RequestFilter, complete_config: CompleteConfig, ) -> RangeRequestId { - let id = - self.find_and_complete_blocks_by_range_request(epoch, request_filter, complete_config); + let id = self.find_and_complete_blocks_by_range_request(request_filter, complete_config); self.find_and_complete_data_by_range_request(request_filter, complete_config); id } fn find_and_complete_blocks_by_range_request( &mut self, - epoch: Epoch, request_filter: RequestFilter, complete_config: CompleteConfig, ) -> RangeRequestId { - let (blocks_req_id, block_peer) = self.find_blocks_by_range_request(request_filter); + let (blocks_req_id, block_peer, blocks_req) = + self.find_blocks_by_range_request(request_filter); - let block = self.zero_block_at_slot( - epoch.start_slot(E::slots_per_epoch()), - complete_config.with_data, - ); - let blocks = [block]; + let start_slot = Slot::new(*blocks_req.start_slot()); + let blocks = (0..complete_config.block_count) + .map(|i| { + self.zero_block_at_slot(start_slot + Slot::new(i as u64), complete_config.with_data) + }) + .collect::>(); self.send_blocks_by_range_response(blocks_req_id, block_peer, &blocks); blocks_req_id.parent_request_id.requester @@ -484,10 +516,10 @@ impl TestRig { ) { match by_range_data_request_ids { ByRangeDataRequestIds::PreDeneb => {} - ByRangeDataRequestIds::PrePeerDAS(id, peer_id) => { + ByRangeDataRequestIds::PrePeerDAS(id, peer_id, req) => { // Complete the request with a single stream termination self.log(&format!( - "Completing BlobsByRange request {id} with empty stream" + "Completing BlobsByRange request {id} {req:?} with empty stream" )); self.send_sync_message(SyncMessage::RpcBlob { sync_request_id: SyncRequestId::BlobsByRange(id), @@ -583,7 +615,6 @@ impl TestRig { for epoch in 0..last_epoch { // Note: In this test we can't predict the block peer let id = self.find_and_complete_block_components_by_range_request( - Epoch::new(epoch), request_filter.epoch(epoch), complete_config, ); @@ -774,9 +805,7 @@ fn pause_and_resume_on_ee_offline() { // make the ee offline rig.update_execution_engine_state(EngineState::Offline); // send the response to the request - let epoch = Epoch::new(0); rig.find_and_complete_block_components_by_range_request( - epoch, filter().peer(peer1).epoch(0), complete(), ); @@ -790,7 +819,7 @@ fn pause_and_resume_on_ee_offline() { // Don't filter requests and the columns requests may be sent to peer1 or peer2 // We need to filter by epoch, because the previous batch eagerly sent requests for the next // epoch for the other batch. So we can either filter by epoch of by sync type. - rig.find_and_complete_block_components_by_range_request(epoch, filter().epoch(0), complete()); + rig.find_and_complete_block_components_by_range_request(filter().epoch(0), complete()); // the beacon processor shouldn't have received any work rig.expect_empty_processor(); // make the beacon processor available again. @@ -871,19 +900,16 @@ fn finalized_sync_single_custody_peer_failure() { let advanced_epochs: u64 = 2; let remote_info = r.finalized_remote_info_advanced_by(advanced_epochs.into()); + let column_index_to_fail = r.our_custody_indices().first().copied().unwrap(); // Unikely that the single peer we added has enough columns for us. Tests are determinstic and // this error should never be hit r.add_random_peer(remote_info.clone()); r.assert_state(RangeSyncType::Finalized); - // send the response to the request - // Don't filter requests and the columns requests may be sent to peer1 or peer2 - // We need to filter by epoch, because the previous batch eagerly sent requests for the next - // epoch for the other batch. So we can either filter by epoch of by sync type. - let column_index_to_fail = r.our_custody_indices().first().copied().unwrap(); + // Progress all blocks_by_range and columns_by_range requests but respond empty for a single + // column index r.find_and_complete_block_components_by_range_request( - Epoch::new(0), filter().epoch(0), complete().custody_failure_at_index(column_index_to_fail), ); @@ -899,3 +925,57 @@ fn finalized_sync_single_custody_peer_failure() { // TODO(das): send batch 1 for completing processing and check that SyncingChain processed batch // 1 successfully } + +#[test] +fn finalized_sync_permanent_custody_peer_failure() { + let mut r = TestRig::test_setup(); + // Only run post-PeerDAS + if !r.fork_name.fulu_enabled() { + return; + } + + let advanced_epochs: u64 = 2; + let remote_info = r.finalized_remote_info_advanced_by(advanced_epochs.into()); + let column_index_to_fail = r.our_custody_indices().first().copied().unwrap(); + const PEERS_IN_BATCH: usize = 4; + + for _ in 0..PEERS_IN_BATCH { + r.add_random_peer(remote_info.clone()); + } + r.assert_state(RangeSyncType::Finalized); + + // Some peer had a costudy failure at `column_index` so sync should do a single extra request + // for that index and epoch. + r.find_and_complete_block_components_by_range_request( + filter().epoch(0), + complete().custody_failure_at_index(column_index_to_fail), + ); + + let mut requested_peers = HashSet::new(); + + for i in 0..PEERS_IN_BATCH { + r.log(&format!("Loop {i} of custody failure round")); + + // Some peer had a costudy failure at `column_index` so sync should do a single extra request + // for that index and epoch. We want to make sure that the request goes to different peer + // than the attempts before. + let reqs = + r.find_data_by_range_request(filter().epoch(0).column_index(column_index_to_fail)); + let req_peer = reqs.peer(); + if requested_peers.contains(&req_peer) { + panic!("Re-requested the same peer {req_peer} again after a custody failure"); + } + requested_peers.insert(req_peer); + + // Find the requests first to assert that this is the only request that exists + r.expect_no_columns_by_range_requests(filter().epoch(0)); + // complete this one request without the custody failure now + r.complete_data_by_range_request( + reqs, + complete().custody_failure_at_index(column_index_to_fail), + ); + } + + // TODO(das): send batch 1 for completing processing and check that SyncingChain processed batch + // 1 successfully +} From 0ca21a70cc6f338401f1beb321c572c22d24aa9f Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Fri, 9 May 2025 01:16:36 -0300 Subject: [PATCH 07/11] Remove NoPeer error --- .../block_components_by_range.rs | 29 ++++++++++--------- .../src/sync/network_context/custody.rs | 26 +++++++---------- .../sync/network_context/custody_by_range.rs | 25 ++++++---------- 3 files changed, 35 insertions(+), 45 deletions(-) diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index 80849b6f029..3ed3f71308d 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -1,5 +1,5 @@ use crate::sync::network_context::{ - NoPeerError, PeerGroup, RpcRequestSendError, RpcResponseError, SyncNetworkContext, + PeerGroup, RpcRequestSendError, RpcResponseError, SyncNetworkContext, }; use crate::sync::range_sync::BatchPeerGroup; use beacon_chain::block_verification_types::RpcBlock; @@ -76,6 +76,14 @@ impl From for RpcResponseError { } } +impl From for RpcRequestSendError { + fn from(e: Error) -> Self { + match e { + Error::InternalError(e) => RpcRequestSendError::InternalError(e), + } + } +} + impl BlockComponentsByRangeRequest { pub fn new( id: ComponentsByRangeRequestId, @@ -119,10 +127,12 @@ impl BlockComponentsByRangeRequest { .min() .map(|(_, _, _, peer)| *peer) else { - // Backfill and forward sync handle this condition gracefully. - // - Backfill sync: will pause waiting for more peers to join - // - Forward sync: can never happen as the chain is dropped when removing the last peer. - return Err(RpcRequestSendError::NoPeer(NoPeerError::BlockPeer)); + // When a peer disconnects and is removed from the SyncingChain peer set, if the set + // reaches zero the SyncingChain is removed. + // TODO(das): add test for this. + return Err(RpcRequestSendError::InternalError( + "A batch peer set should never be empty".to_string(), + )); }; let blocks_req_id = cx.send_blocks_by_range_request(block_peer, request.clone(), id)?; @@ -248,14 +258,7 @@ impl BlockComponentsByRangeRequest { column_indices, self.peers.clone(), ) - .map_err(|e| match e { - RpcRequestSendError::NoPeer(e) => todo!("what to do? {e:?}"), - RpcRequestSendError::InternalError(e) => { - Error::InternalError(format!( - "InternalError sending custody_by_range request: {e}", - )) - } - })?; + .map_err(|e| e.into())?; *state = FuluEnabledState::CustodyRequest { blocks: blocks.to_vec(), diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody.rs index f46b354313f..9b048974d95 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody.rs @@ -1,6 +1,6 @@ use crate::sync::network_context::{ - DataColumnsByRootRequestId, DataColumnsByRootSingleBlockRequest, NoPeerError, - RpcRequestSendError, RpcResponseError, + DataColumnsByRootRequestId, DataColumnsByRootSingleBlockRequest, RpcRequestSendError, + RpcResponseError, }; use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::BeaconChainTypes; @@ -20,7 +20,6 @@ use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, Hash256}; use super::{LookupRequestResult, PeerGroup, RpcResponseResult, SyncNetworkContext}; const FAILED_PEERS_CACHE_EXPIRY_SECONDS: u64 = 5; -const MAX_STALE_NO_PEERS_DURATION: Duration = Duration::from_secs(30); type DataColumnSidecarList = Vec>>; @@ -43,7 +42,6 @@ pub struct ActiveCustodyByRootRequest { #[derive(Debug)] pub enum Error { - NoPeer(ColumnIndex), TooManyDownloadErrors(RpcResponseError), InternalError(String), } @@ -53,7 +51,6 @@ impl From for RpcResponseError { match e { Error::InternalError(e) => RpcResponseError::InternalError(e), Error::TooManyDownloadErrors(e) => e, - Error::NoPeer(_index) => todo!(), } } } @@ -61,9 +58,6 @@ impl From for RpcResponseError { impl From for RpcRequestSendError { fn from(e: Error) -> Self { match e { - Error::NoPeer(column_index) => { - RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer(column_index)) - } Error::TooManyDownloadErrors(_) => { RpcRequestSendError::InternalError("Download error in request send".to_string()) } @@ -246,7 +240,7 @@ impl ActiveCustodyByRootRequest { // - which peer returned what to have PeerGroup attributability for (column_index, request) in self.column_requests.iter_mut() { - if let Some(wait_duration) = request.is_awaiting_download() { + if let Some(_) = request.is_awaiting_download() { if request.download_failures.len() > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { let last_error = request .download_failures @@ -291,14 +285,14 @@ impl ActiveCustodyByRootRequest { .entry(*peer_id) .or_default() .push(*column_index); - } else if wait_duration > MAX_STALE_NO_PEERS_DURATION { - // Allow to request to sit stale in `NotStarted` state for at most - // `MAX_STALE_NO_PEERS_DURATION`, else error and drop the request. Note that - // lookup will naturally retry when other peers send us attestations for - // descendants of this un-available lookup. - return Err(Error::NoPeer(*column_index)); } else { - // Do not issue requests if there is no custody peer on this column + // Do not issue requests if there is no custody peer on this column. The request + // will sit idle without making progress. The only way to make to progress is: + // - Add a new peer that custodies the missing columns + // - Call `continue_requests` + // + // Otherwise this request should be dropped and failed after some time. + // TODO(das): implement the above } } } diff --git a/beacon_node/network/src/sync/network_context/custody_by_range.rs b/beacon_node/network/src/sync/network_context/custody_by_range.rs index 2d67b350b27..f2af8618c65 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_range.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_range.rs @@ -1,4 +1,4 @@ -use crate::sync::network_context::{NoPeerError, RpcRequestSendError, RpcResponseError}; +use crate::sync::network_context::{RpcRequestSendError, RpcResponseError}; use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::BeaconChainTypes; use fnv::FnvHashMap; @@ -22,7 +22,6 @@ use types::{ use super::{PeerGroup, RpcResponseResult, SyncNetworkContext}; const TEMPORARY_FAULT_EXPIRY_SECONDS: u64 = 15; -const MAX_STALE_NO_PEERS_DURATION: Duration = Duration::from_secs(30); type DataColumnSidecarList = Vec>>; @@ -56,7 +55,6 @@ pub struct ActiveCustodyByRangeRequest { pub enum Error { InternalError(String), TooManyDownloadErrors(RpcResponseError), - NoPeer(ColumnIndex), } impl From for RpcResponseError { @@ -64,7 +62,6 @@ impl From for RpcResponseError { match e { Error::InternalError(e) => RpcResponseError::InternalError(e), Error::TooManyDownloadErrors(e) => e, - Error::NoPeer(_index) => todo!(), } } } @@ -72,9 +69,6 @@ impl From for RpcResponseError { impl From for RpcRequestSendError { fn from(e: Error) -> Self { match e { - Error::NoPeer(column_index) => { - RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer(column_index)) - } Error::TooManyDownloadErrors(_) => { RpcRequestSendError::InternalError("Download error in request send".to_string()) } @@ -392,7 +386,7 @@ impl ActiveCustodyByRangeRequest { // - which peer returned what to have PeerGroup attributability for (column_index, request) in self.column_requests.iter_mut() { - if let Some(wait_duration) = request.is_awaiting_download() { + if let Some(_) = request.is_awaiting_download() { if request.download_failures.len() > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { let last_error = request .download_failures @@ -443,15 +437,14 @@ impl ActiveCustodyByRangeRequest { .entry(*peer_id) .or_default() .push(*column_index); - } else if wait_duration > MAX_STALE_NO_PEERS_DURATION { - // Allow to request to sit stale in `NotStarted` state for at most - // `MAX_STALE_NO_PEERS_DURATION`, else error and drop the request. Note that - // lookup will naturally retry when other peers send us attestations for - // descendants of this un-available lookup. - return Err(Error::NoPeer(*column_index)); } else { - return Err(Error::NoPeer(*column_index)); - // Do not issue requests if there is no custody peer on this column + // Do not issue requests if there is no custody peer on this column. The request + // will sit idle without making progress. The only way to make to progress is: + // - Add a new peer that custodies the missing columns + // - Call `continue_requests` + // + // Otherwise this request should be dropped and failed after some time. + // TODO(das): implement the above } } } From 15469e7ac93dc15804587559dd5b2bb79e8f0dde Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sat, 10 May 2025 12:02:39 -0300 Subject: [PATCH 08/11] Introduce the RequestExpired error --- .../network/src/sync/backfill_sync/mod.rs | 23 ++- beacon_node/network/src/sync/manager.rs | 56 ++++-- .../network/src/sync/network_context.rs | 55 ++++-- .../block_components_by_range.rs | 6 +- .../sync/network_context/custody_by_range.rs | 128 +++++++++---- .../{custody.rs => custody_by_root.rs} | 178 +++--------------- .../network/src/sync/range_sync/chain.rs | 4 +- 7 files changed, 210 insertions(+), 240 deletions(-) rename beacon_node/network/src/sync/network_context/{custody.rs => custody_by_root.rs} (73%) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 7eed5416c16..15975e4ef5c 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -328,9 +328,14 @@ impl BackFillSync { // mechanism compatible with PeerDAS and before PeerDAS? match batch.download_failed(None) { Err(e) => self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)), - Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { - self.fail_sync(BackFillError::BatchDownloadFailed(batch_id)) - } + Ok(BatchOperationOutcome::Failed { blacklist: _ }) => self.fail_sync(match err { + RpcResponseError::RpcError(_) + | RpcResponseError::VerifyError(_) + | RpcResponseError::InternalError(_) => { + BackFillError::BatchDownloadFailed(batch_id) + } + RpcResponseError::RequestExpired(_) => BackFillError::Paused, + }), Ok(BatchOperationOutcome::Continue) => self.send_batch(network, batch_id), } } else { @@ -934,15 +939,9 @@ impl BackFillSync { return Ok(()); } Err(e) => match e { - RpcRequestSendError::NoPeer(no_peer) => { - // If we are here the chain has no more synced peers - info!( - "reason" = format!("insufficient_synced_peers({no_peer:?})"), - "Backfill sync paused" - ); - self.set_state(BackFillState::Paused); - return Err(BackFillError::Paused); - } + // TODO(das): block_components_by_range requests can now hang out indefinitely. + // Is that fine? Maybe we should fail the requests from the network_context + // level without involving the BackfillSync itself. RpcRequestSendError::InternalError(e) => { // NOTE: under normal conditions this shouldn't happen but we handle it anyway warn!(%batch_id, error = ?e, %batch,"Could not send batch request"); diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 397bffa2ad9..de3b333317d 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -36,7 +36,8 @@ use super::backfill_sync::{BackFillSync, ProcessResult, SyncStart}; use super::block_lookups::BlockLookups; use super::network_context::{ - CustodyByRootResult, RangeBlockComponent, RangeRequestId, RpcEvent, SyncNetworkContext, + CustodyByRangeResult, CustodyByRootResult, RangeBlockComponent, RangeRequestId, RpcEvent, + SyncNetworkContext, }; use super::peer_sampling::{Sampling, SamplingConfig, SamplingResult}; use super::peer_sync_info::{remote_sync_type, PeerSyncType}; @@ -58,9 +59,10 @@ use beacon_chain::{ use futures::StreamExt; use lighthouse_network::rpc::RPCError; use lighthouse_network::service::api_types::{ - BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, CustodyRequester, - DataColumnsByRangeRequestId, DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, - SamplingId, SamplingRequester, SingleLookupReqId, SyncRequestId, + BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, + CustodyByRangeRequestId, CustodyRequester, DataColumnsByRangeRequestId, + DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, SamplingId, SamplingRequester, + SingleLookupReqId, SyncRequestId, }; use lighthouse_network::types::{NetworkGlobals, SyncState}; use lighthouse_network::PeerId; @@ -439,6 +441,9 @@ impl SyncManager { for (id, result) in self.network.continue_custody_by_root_requests() { self.on_custody_by_root_result(id, result); } + for (id, result) in self.network.continue_custody_by_range_requests() { + self.on_custody_by_range_result(id, result); + } } /// Trigger range sync for a set of peers that claim to have imported a head unknown to us. @@ -532,6 +537,9 @@ impl SyncManager { for (id, result) in self.network.continue_custody_by_root_requests() { self.on_custody_by_root_result(id, result); } + for (id, result) in self.network.continue_custody_by_range_requests() { + self.on_custody_by_range_result(id, result); + } } /// Updates the syncing state of a peer. @@ -1205,26 +1213,34 @@ impl SyncManager { self.network .on_custody_by_range_response(id.parent_request_id, id, peer_id, resp) { - // TODO(das): Improve the type of RangeBlockComponent::CustodyColumns, not - // not have to pass a PeerGroup in case of error - let peers = match &result { - Ok((_, peers, _)) => peers.clone(), - // TODO(das): this PeerGroup with a single peer is incorrect - Err(_) => PeerGroup::from_single(peer_id), - }; - - self.on_block_components_by_range_response( - id.parent_request_id.parent_request_id, - RangeBlockComponent::CustodyColumns( - id.parent_request_id, - result.map(|(data, _peers, timestamp)| (data, timestamp)), - peers, - ), - ); + self.on_custody_by_range_result(id.parent_request_id, result); } } } + fn on_custody_by_range_result( + &mut self, + id: CustodyByRangeRequestId, + result: CustodyByRangeResult, + ) { + // TODO(das): Improve the type of RangeBlockComponent::CustodyColumns, not + // not have to pass a PeerGroup in case of error + let peers = match &result { + Ok((_, peers, _)) => peers.clone(), + // TODO(das): this PeerGroup with no peers incorrect + Err(_) => PeerGroup::from_set(<_>::default()), + }; + + self.on_block_components_by_range_response( + id.parent_request_id, + RangeBlockComponent::CustodyColumns( + id, + result.map(|(data, _peers, timestamp)| (data, timestamp)), + peers, + ), + ); + } + fn on_custody_by_root_result( &mut self, requester: CustodyRequester, diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 28d5de2135c..0d05c584684 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -1,8 +1,8 @@ //! Provides network functionality for the Syncing thread. This fundamentally wraps a network //! channel and stores a global RPC ID to perform requests. -use self::custody::ActiveCustodyByRootRequest; use self::custody_by_range::{ActiveCustodyByRangeRequest, CustodyByRangeRequestResult}; +use self::custody_by_root::{ActiveCustodyByRootRequest, CustodyByRootRequestResult}; pub use self::requests::{BlocksByRootSingleRequest, DataColumnsByRootSingleBlockRequest}; use super::manager::BlockProcessType; use super::range_sync::BatchPeerGroup; @@ -16,7 +16,6 @@ use crate::sync::network_context::requests::BlobsByRootSingleBlockRequest; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessStatus, EngineState}; pub use block_components_by_range::BlockComponentsByRangeRequest; -use custody::CustodyRequestResult; use fnv::FnvHashMap; use lighthouse_network::rpc::methods::{BlobsByRangeRequest, DataColumnsByRangeRequest}; use lighthouse_network::rpc::{BlocksByRangeRequest, GoodbyeReason, RPCError, RequestType}; @@ -46,8 +45,8 @@ use types::{ }; pub mod block_components_by_range; -pub mod custody; pub mod custody_by_range; +pub mod custody_by_root; mod requests; #[derive(Debug)] @@ -68,31 +67,29 @@ impl RpcEvent { pub type RpcResponseResult = Result<(T, Duration), RpcResponseError>; +pub type RpcResponseBatchResult = Result<(T, PeerGroup, Duration), RpcResponseError>; + /// Duration = latest seen timestamp of all received data columns -pub type CustodyByRootResult = - Result<(DataColumnSidecarList, PeerGroup, Duration), RpcResponseError>; +pub type CustodyByRootResult = RpcResponseBatchResult>; + +pub type CustodyByRangeResult = RpcResponseBatchResult>; #[derive(Debug, Clone)] pub enum RpcResponseError { RpcError(#[allow(dead_code)] RPCError), VerifyError(LookupVerifyError), + RequestExpired(String), InternalError(#[allow(dead_code)] String), } #[derive(Debug, PartialEq, Eq)] pub enum RpcRequestSendError { - /// No peer available matching the required criteria - NoPeer(NoPeerError), /// These errors should never happen, including unreachable custody errors or network send /// errors. InternalError(String), -} - -/// Type of peer missing that caused a `RpcRequestSendError::NoPeers` -#[derive(Debug, PartialEq, Eq)] -pub enum NoPeerError { - BlockPeer, - CustodyPeer(ColumnIndex), + // If RpcRequestSendError has a single variant `InternalError` it's to signal to downstream + // consumers that sends are expected to be infallible. If this assumption changes in the future, + // add a new variant. } #[derive(Debug, PartialEq, Eq)] @@ -1137,6 +1134,32 @@ impl SyncNetworkContext { .collect() } + /// Attempt to make progress on all custody_by_range requests. Some request may be stale waiting + /// for custody peers. Returns a Vec of results as zero or more requests may fail in this + /// attempt. + pub fn continue_custody_by_range_requests( + &mut self, + ) -> Vec<(CustodyByRangeRequestId, CustodyByRangeResult)> { + let ids = self + .custody_by_range_requests + .keys() + .copied() + .collect::>(); + + // Need to collect ids and results in separate steps to re-borrow self. + ids.into_iter() + .filter_map(|id| { + let mut request = self + .custody_by_range_requests + .remove(&id) + .expect("key of hashmap"); + let result = request.continue_requests(self); + self.handle_custody_by_range_result(id, request, result) + .map(|result| (id, result)) + }) + .collect() + } + // Request handlers pub(crate) fn on_single_block_response( @@ -1312,7 +1335,7 @@ impl SyncNetworkContext { &mut self, id: CustodyRequester, request: ActiveCustodyByRootRequest, - result: CustodyRequestResult, + result: CustodyByRootRequestResult, ) -> Option> { let span = span!( Level::INFO, @@ -1372,7 +1395,7 @@ impl SyncNetworkContext { id: CustodyByRangeRequestId, request: ActiveCustodyByRangeRequest, result: CustodyByRangeRequestResult, - ) -> Option> { + ) -> Option> { let result = result.map_err(Into::::into).transpose(); // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index 3ed3f71308d..332c755e506 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -258,7 +258,11 @@ impl BlockComponentsByRangeRequest { column_indices, self.peers.clone(), ) - .map_err(|e| e.into())?; + .map_err(|e| match e { + RpcRequestSendError::InternalError(e) => { + Error::InternalError(e) + } + })?; *state = FuluEnabledState::CustodyRequest { blocks: blocks.to_vec(), diff --git a/beacon_node/network/src/sync/network_context/custody_by_range.rs b/beacon_node/network/src/sync/network_context/custody_by_range.rs index f2af8618c65..d5e974f7043 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_range.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_range.rs @@ -13,6 +13,7 @@ use rand::Rng; use std::collections::HashSet; use std::time::{Duration, Instant}; use std::{collections::HashMap, marker::PhantomData, sync::Arc}; +use strum::IntoStaticStr; use tracing::{debug, warn}; use types::{ data_column_sidecar::ColumnIndex, DataColumnSidecar, Epoch, EthSpec, Hash256, @@ -22,17 +23,24 @@ use types::{ use super::{PeerGroup, RpcResponseResult, SyncNetworkContext}; const TEMPORARY_FAULT_EXPIRY_SECONDS: u64 = 15; +/// TODO(das): this attempt count is nested into the existing lookup request count. +const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; +const REQUEST_EXPIRY_SECONDS: u64 = 300; type DataColumnSidecarList = Vec>>; pub struct ActiveCustodyByRangeRequest { + start_time: Instant, id: CustodyByRangeRequestId, // TODO(das): Pass a better type for the by_range request epoch: Epoch, /// Blocks that we expect peers to serve data columns for blocks_with_data: Vec, /// List of column indices this request needs to download to complete successfully - column_requests: FnvHashMap>, + column_requests: FnvHashMap< + ColumnIndex, + ColumnRequest>, + >, /// Active requests for 1 or more columns each active_batch_columns_requests: FnvHashMap, @@ -55,6 +63,7 @@ pub struct ActiveCustodyByRangeRequest { pub enum Error { InternalError(String), TooManyDownloadErrors(RpcResponseError), + ExpiredNoCustodyPeers(Vec), } impl From for RpcResponseError { @@ -62,6 +71,9 @@ impl From for RpcResponseError { match e { Error::InternalError(e) => RpcResponseError::InternalError(e), Error::TooManyDownloadErrors(e) => e, + Error::ExpiredNoCustodyPeers(indices) => RpcResponseError::RequestExpired(format!( + "Expired waiting for custody peers {indices:?}" + )), } } } @@ -73,6 +85,9 @@ impl From for RpcRequestSendError { RpcRequestSendError::InternalError("Download error in request send".to_string()) } Error::InternalError(e) => RpcRequestSendError::InternalError(e), + Error::ExpiredNoCustodyPeers(_) => RpcRequestSendError::InternalError( + "Request can not expire when requesting it".to_string(), + ), } } } @@ -102,6 +117,7 @@ impl ActiveCustodyByRangeRequest { lookup_peers: Arc>>, ) -> Self { Self { + start_time: Instant::now(), id, epoch, blocks_with_data, @@ -333,6 +349,8 @@ impl ActiveCustodyByRangeRequest { } // Do nothing for internal errors RpcResponseError::InternalError(_) => {} + // unreachable + RpcResponseError::RequestExpired(_) => {} } } }; @@ -386,13 +404,9 @@ impl ActiveCustodyByRangeRequest { // - which peer returned what to have PeerGroup attributability for (column_index, request) in self.column_requests.iter_mut() { - if let Some(_) = request.is_awaiting_download() { - if request.download_failures.len() > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { - let last_error = request - .download_failures - .last() - .expect("download_failures is not empty"); - return Err(Error::TooManyDownloadErrors(last_error.clone())); + if request.is_awaiting_download() { + if let Some(last_error) = request.too_many_failures() { + return Err(Error::TooManyDownloadErrors(last_error)); } // TODO(das): When is a fork and only a subset of your peers know about a block, we should @@ -482,60 +496,91 @@ impl ActiveCustodyByRangeRequest { .insert(req_id, ActiveBatchColumnsRequest { indices }); } + if self.start_time.elapsed() > Duration::from_secs(REQUEST_EXPIRY_SECONDS) + && !self.column_requests.values().any(|r| r.is_downloading()) + { + let awaiting_peers_indicies = self + .column_requests + .iter() + .filter(|(_, r)| r.is_awaiting_download()) + .map(|(id, _)| *id) + .collect::>(); + return Err(Error::ExpiredNoCustodyPeers(awaiting_peers_indicies)); + } + Ok(None) } } -/// TODO(das): this attempt count is nested into the existing lookup request count. -const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; - -struct ColumnRequest { - status: Status, +pub struct ColumnRequest { + status: Status, download_failures: Vec, } -#[derive(Debug, Clone)] -enum Status { - NotStarted(Instant), - Downloading(DataColumnsByRangeRequestId), - Downloaded(PeerId, DataColumnSidecarList, Duration), +#[derive(Debug, Clone, IntoStaticStr)] +pub enum Status { + NotStarted, + Downloading(I), + Downloaded(PeerId, T, Duration), } -impl ColumnRequest { - fn new() -> Self { +impl ColumnRequest { + pub fn new() -> Self { Self { - status: Status::NotStarted(Instant::now()), + status: Status::NotStarted, download_failures: vec![], } } - fn is_awaiting_download(&self) -> Option { + pub fn is_awaiting_download(&self) -> bool { match self.status { - Status::NotStarted(start_time) => Some(start_time.elapsed()), - Status::Downloading { .. } | Status::Downloaded { .. } => None, + Status::NotStarted => true, + Status::Downloading { .. } | Status::Downloaded { .. } => false, } } - fn is_downloaded(&self) -> bool { + pub fn is_downloading(&self) -> bool { match self.status { - Status::NotStarted { .. } | Status::Downloading { .. } => false, + Status::NotStarted => false, + Status::Downloading { .. } => true, + Status::Downloaded { .. } => false, + } + } + + pub fn is_downloaded(&self) -> bool { + match self.status { + Status::NotStarted | Status::Downloading { .. } => false, Status::Downloaded { .. } => true, } } - fn on_download_start(&mut self, req_id: DataColumnsByRangeRequestId) -> Result<(), Error> { + pub fn too_many_failures(&self) -> Option { + if self.download_failures.len() > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { + Some( + self.download_failures + .last() + .cloned() + .expect("download_failures is not empty"), + ) + } else { + None + } + } + + pub fn on_download_start(&mut self, req_id: I) -> Result<(), Error> { match &self.status { - Status::NotStarted { .. } => { + Status::NotStarted => { self.status = Status::Downloading(req_id); Ok(()) } other => Err(Error::InternalError(format!( - "bad state on_download_start expected NotStarted got {other:?}" + "bad state on_download_start expected NotStarted got {}", + Into::<&'static str>::into(other), ))), } } - fn on_download_error(&mut self, req_id: DataColumnsByRangeRequestId) -> Result<(), Error> { + pub fn on_download_error(&mut self, req_id: I) -> Result<(), Error> { match &self.status { Status::Downloading(expected_req_id) => { if req_id != *expected_req_id { @@ -543,29 +588,30 @@ impl ColumnRequest { "Received download result for req_id {req_id} expecting {expected_req_id}" ))); } - self.status = Status::NotStarted(Instant::now()); + self.status = Status::NotStarted; Ok(()) } other => Err(Error::InternalError(format!( - "bad state on_download_error expected Downloading got {other:?}" + "bad state on_download_error expected Downloading got {}", + Into::<&'static str>::into(other), ))), } } - fn on_download_error_and_mark_failure( + pub fn on_download_error_and_mark_failure( &mut self, - req_id: DataColumnsByRangeRequestId, + req_id: I, e: RpcResponseError, ) -> Result<(), Error> { self.download_failures.push(e); self.on_download_error(req_id) } - fn on_download_success( + pub fn on_download_success( &mut self, - req_id: DataColumnsByRangeRequestId, + req_id: I, peer_id: PeerId, - data_column: DataColumnSidecarList, + data_column: T, seen_timestamp: Duration, ) -> Result<(), Error> { match &self.status { @@ -579,18 +625,20 @@ impl ColumnRequest { Ok(()) } other => Err(Error::InternalError(format!( - "bad state on_download_success expected Downloading got {other:?}" + "bad state on_download_success expected Downloading got {}", + Into::<&'static str>::into(other), ))), } } - fn complete(self) -> Result<(PeerId, DataColumnSidecarList, Duration), Error> { + pub fn complete(self) -> Result<(PeerId, T, Duration), Error> { match self.status { Status::Downloaded(peer_id, data_column, seen_timestamp) => { Ok((peer_id, data_column, seen_timestamp)) } other => Err(Error::InternalError(format!( - "bad state complete expected Downloaded got {other:?}" + "bad state complete expected Downloaded got {}", + Into::<&'static str>::into(other), ))), } } diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody_by_root.rs similarity index 73% rename from beacon_node/network/src/sync/network_context/custody.rs rename to beacon_node/network/src/sync/network_context/custody_by_root.rs index 9b048974d95..a7a83727dd8 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_root.rs @@ -1,6 +1,6 @@ +use super::custody_by_range::{ColumnRequest, Error}; use crate::sync::network_context::{ - DataColumnsByRootRequestId, DataColumnsByRootSingleBlockRequest, RpcRequestSendError, - RpcResponseError, + DataColumnsByRootRequestId, DataColumnsByRootSingleBlockRequest, }; use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::BeaconChainTypes; @@ -14,20 +14,25 @@ use std::collections::HashSet; use std::time::{Duration, Instant}; use std::{collections::HashMap, marker::PhantomData, sync::Arc}; use tracing::{debug, warn}; -use types::EthSpec; use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, Hash256}; use super::{LookupRequestResult, PeerGroup, RpcResponseResult, SyncNetworkContext}; const FAILED_PEERS_CACHE_EXPIRY_SECONDS: u64 = 5; +const REQUEST_EXPIRY_SECONDS: u64 = 300; type DataColumnSidecarList = Vec>>; pub struct ActiveCustodyByRootRequest { + start_time: Instant, block_root: Hash256, custody_id: CustodyId, /// List of column indices this request needs to download to complete successfully - column_requests: FnvHashMap>, + #[allow(clippy::type_complexity)] + column_requests: FnvHashMap< + ColumnIndex, + ColumnRequest>>, + >, /// Active requests for 1 or more columns each active_batch_columns_requests: FnvHashMap, @@ -40,37 +45,11 @@ pub struct ActiveCustodyByRootRequest { _phantom: PhantomData, } -#[derive(Debug)] -pub enum Error { - TooManyDownloadErrors(RpcResponseError), - InternalError(String), -} - -impl From for RpcResponseError { - fn from(e: Error) -> Self { - match e { - Error::InternalError(e) => RpcResponseError::InternalError(e), - Error::TooManyDownloadErrors(e) => e, - } - } -} - -impl From for RpcRequestSendError { - fn from(e: Error) -> Self { - match e { - Error::TooManyDownloadErrors(_) => { - RpcRequestSendError::InternalError("Download error in request send".to_string()) - } - Error::InternalError(e) => RpcRequestSendError::InternalError(e), - } - } -} - struct ActiveBatchColumnsRequest { indices: Vec, } -pub type CustodyRequestResult = +pub type CustodyByRootRequestResult = Result, PeerGroup, Duration)>, Error>; impl ActiveCustodyByRootRequest { @@ -81,6 +60,7 @@ impl ActiveCustodyByRootRequest { lookup_peers: Arc>>, ) -> Self { Self { + start_time: Instant::now(), block_root, custody_id, column_requests: HashMap::from_iter( @@ -109,7 +89,7 @@ impl ActiveCustodyByRootRequest { req_id: DataColumnsByRootRequestId, resp: RpcResponseResult>, cx: &mut SyncNetworkContext, - ) -> CustodyRequestResult { + ) -> CustodyByRootRequestResult { let Some(batch_request) = self.active_batch_columns_requests.get_mut(&req_id) else { warn!( block_root = ?self.block_root, @@ -207,7 +187,7 @@ impl ActiveCustodyByRootRequest { pub(crate) fn continue_requests( &mut self, cx: &mut SyncNetworkContext, - ) -> CustodyRequestResult { + ) -> CustodyByRootRequestResult { if self.column_requests.values().all(|r| r.is_downloaded()) { // All requests have completed successfully. let mut peers = HashMap::>::new(); @@ -233,6 +213,7 @@ impl ActiveCustodyByRootRequest { let active_request_count_by_peer = cx.active_request_count_by_peer(); let mut columns_to_request_by_peer = HashMap::>::new(); let lookup_peers = self.lookup_peers.read(); + let mut indices_without_peers = vec![]; // Need to: // - track how many active requests a peer has for load balancing @@ -240,13 +221,9 @@ impl ActiveCustodyByRootRequest { // - which peer returned what to have PeerGroup attributability for (column_index, request) in self.column_requests.iter_mut() { - if let Some(_) = request.is_awaiting_download() { - if request.download_failures.len() > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { - let last_error = request - .download_failures - .last() - .expect("download_failures is not empty"); - return Err(Error::TooManyDownloadErrors(last_error.clone())); + if request.is_awaiting_download() { + if let Some(last_error) = request.too_many_failures() { + return Err(Error::TooManyDownloadErrors(last_error)); } // TODO(das): When is a fork and only a subset of your peers know about a block, we should @@ -293,6 +270,7 @@ impl ActiveCustodyByRootRequest { // // Otherwise this request should be dropped and failed after some time. // TODO(das): implement the above + indices_without_peers.push(column_index); } } } @@ -336,116 +314,18 @@ impl ActiveCustodyByRootRequest { } } - Ok(None) - } -} - -/// TODO(das): this attempt count is nested into the existing lookup request count. -const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; - -struct ColumnRequest { - status: Status, - download_failures: Vec, -} - -#[derive(Debug, Clone)] -enum Status { - NotStarted(Instant), - Downloading(DataColumnsByRootRequestId), - Downloaded(PeerId, Arc>, Duration), -} - -impl ColumnRequest { - fn new() -> Self { - Self { - status: Status::NotStarted(Instant::now()), - download_failures: vec![], - } - } - - fn is_awaiting_download(&self) -> Option { - match self.status { - Status::NotStarted(start_time) => Some(start_time.elapsed()), - Status::Downloading { .. } | Status::Downloaded { .. } => None, - } - } - - fn is_downloaded(&self) -> bool { - match self.status { - Status::NotStarted { .. } | Status::Downloading { .. } => false, - Status::Downloaded { .. } => true, - } - } - - fn on_download_start(&mut self, req_id: DataColumnsByRootRequestId) -> Result<(), Error> { - match &self.status { - Status::NotStarted { .. } => { - self.status = Status::Downloading(req_id); - Ok(()) - } - other => Err(Error::InternalError(format!( - "bad state on_download_start expected NotStarted got {other:?}" - ))), - } - } - - fn on_download_error(&mut self, req_id: DataColumnsByRootRequestId) -> Result<(), Error> { - match &self.status { - Status::Downloading(expected_req_id) => { - if req_id != *expected_req_id { - return Err(Error::InternalError(format!( - "Received download result for req_id {req_id} expecting {expected_req_id}" - ))); - } - self.status = Status::NotStarted(Instant::now()); - Ok(()) - } - other => Err(Error::InternalError(format!( - "bad state on_download_error expected Downloading got {other:?}" - ))), - } - } - - fn on_download_error_and_mark_failure( - &mut self, - req_id: DataColumnsByRootRequestId, - e: RpcResponseError, - ) -> Result<(), Error> { - self.download_failures.push(e); - self.on_download_error(req_id) - } - - fn on_download_success( - &mut self, - req_id: DataColumnsByRootRequestId, - peer_id: PeerId, - data_column: Arc>, - seen_timestamp: Duration, - ) -> Result<(), Error> { - match &self.status { - Status::Downloading(expected_req_id) => { - if req_id != *expected_req_id { - return Err(Error::InternalError(format!( - "Received download result for req_id {req_id} expecting {expected_req_id}" - ))); - } - self.status = Status::Downloaded(peer_id, data_column, seen_timestamp); - Ok(()) - } - other => Err(Error::InternalError(format!( - "bad state on_download_success expected Downloading got {other:?}" - ))), + if self.start_time.elapsed() > Duration::from_secs(REQUEST_EXPIRY_SECONDS) + && !self.column_requests.values().any(|r| r.is_downloading()) + { + let awaiting_peers_indicies = self + .column_requests + .iter() + .filter(|(_, r)| r.is_awaiting_download()) + .map(|(id, _)| *id) + .collect::>(); + return Err(Error::ExpiredNoCustodyPeers(awaiting_peers_indicies)); } - } - fn complete(self) -> Result<(PeerId, Arc>, Duration), Error> { - match self.status { - Status::Downloaded(peer_id, data_column, seen_timestamp) => { - Ok((peer_id, data_column, seen_timestamp)) - } - other => Err(Error::InternalError(format!( - "bad state complete expected Downloaded got {other:?}" - ))), - } + Ok(None) } } diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 00647f9848f..60cf72957ba 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -892,6 +892,7 @@ impl SyncingChain { %request_id, "Batch download error" ); + // TODO(das): Should handle here the error RequestExpired explicitly? if let BatchOperationOutcome::Failed { blacklist } = // TODO(das): Is it necessary for the batch to track failed peers? Can we make this // mechanism compatible with PeerDAS and before PeerDAS? @@ -969,8 +970,7 @@ impl SyncingChain { // that happens. Sync manager must periodicatlly prune stalled batches like // we do for lookup sync. Then we can deprecate the redundant // `good_peers_on_sampling_subnets` checks. - e - @ (RpcRequestSendError::NoPeer(_) | RpcRequestSendError::InternalError(_)) => { + e @ RpcRequestSendError::InternalError(_) => { // NOTE: under normal conditions this shouldn't happen but we handle it anyway warn!(%batch_id, error = ?e, "batch_id" = %batch_id, %batch, "Could not send batch request"); // register the failed download and check if the batch can be retried From 550fc5f17f28ed4ecd9e3aeaad04d355093dce29 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 12 May 2025 15:27:16 -0300 Subject: [PATCH 09/11] Implement todos --- .../lighthouse_network/src/types/globals.rs | 19 ++++++ .../block_components_by_range.rs | 60 +++++++++++++++---- beacon_node/network/src/sync/tests/lookups.rs | 11 +++- beacon_node/network/src/sync/tests/range.rs | 24 +++----- 4 files changed, 87 insertions(+), 27 deletions(-) diff --git a/beacon_node/lighthouse_network/src/types/globals.rs b/beacon_node/lighthouse_network/src/types/globals.rs index 7a447745cc3..46412d269b2 100644 --- a/beacon_node/lighthouse_network/src/types/globals.rs +++ b/beacon_node/lighthouse_network/src/types/globals.rs @@ -245,6 +245,25 @@ impl NetworkGlobals { Self::new_test_globals_with_metadata(trusted_peers, metadata, config, spec) } + pub fn new_test_globals_as_supernode( + trusted_peers: Vec, + config: Arc, + spec: Arc, + is_supernode: bool, + ) -> NetworkGlobals { + let metadata = MetaData::V3(MetaDataV3 { + seq_number: 0, + attnets: Default::default(), + syncnets: Default::default(), + custody_group_count: if is_supernode { + spec.number_of_custody_groups + } else { + spec.custody_requirement + }, + }); + Self::new_test_globals_with_metadata(trusted_peers, metadata, config, spec) + } + pub(crate) fn new_test_globals_with_metadata( trusted_peers: Vec, metadata: MetaData, diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index 332c755e506..a2f9e677052 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -15,8 +15,8 @@ use parking_lot::RwLock; use std::collections::{HashMap, HashSet}; use std::sync::Arc; use types::{ - BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, EthSpec, Hash256, SignedBeaconBlock, - Slot, + BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, EthSpec, Hash256, RuntimeVariableList, + SignedBeaconBlock, Slot, }; pub struct BlockComponentsByRangeRequest { @@ -185,7 +185,10 @@ impl BlockComponentsByRangeRequest { if let Some((blocks, block_peer)) = blocks_by_range_request.to_finished() { // TODO(das): use the peer group let peer_group = BatchPeerGroup::new_from_block_peer(*block_peer); - let rpc_blocks = couple_blocks_base(blocks.to_vec()); + let rpc_blocks = couple_blocks_base( + blocks.to_vec(), + cx.network_globals().sampling_columns.len(), + ); Ok(Some((rpc_blocks, peer_group))) } else { // Wait for blocks_by_range requests to complete @@ -202,7 +205,8 @@ impl BlockComponentsByRangeRequest { ) { // We use the same block_peer for the blobs request let peer_group = BatchPeerGroup::new_from_block_peer(*block_peer); - let rpc_blocks = couple_blocks_deneb(blocks.to_vec(), blobs.to_vec()); + let rpc_blocks = + couple_blocks_deneb(blocks.to_vec(), blobs.to_vec(), cx.spec())?; Ok(Some((rpc_blocks, peer_group))) } else { // Wait for blocks_by_range and blobs_by_range requests to complete @@ -401,15 +405,51 @@ impl BlockComponentsByRangeRequest { } } -fn couple_blocks_base(_blocks: Vec>>) -> Vec> { - todo!(); +fn couple_blocks_base( + blocks: Vec>>, + custody_columns_count: usize, +) -> Vec> { + blocks + .into_iter() + .map(|block| RpcBlock::new_without_blobs(None, block, custody_columns_count)) + .collect() } fn couple_blocks_deneb( - _blocks: Vec>>, - _blobs: Vec>>, -) -> Vec> { - todo!(); + blocks: Vec>>, + blobs: Vec>>, + spec: &ChainSpec, +) -> Result>, Error> { + let mut blobs_by_block = HashMap::>>>::new(); + for blob in blobs { + let block_root = blob.block_root(); + blobs_by_block.entry(block_root).or_default().push(blob); + } + + // Now collect all blobs that match to the block by block root. BlobsByRange request checks + // the inclusion proof so we know that the commitment is the expected. + // + // BlobsByRange request handler ensures that we don't receive more blobs than possible. + // If the peer serving the request sends us blobs that don't pair well we'll send to the + // processor blocks without expected blobs, resulting in a downscoring event. A serving peer + // could serve fake blobs for blocks that don't have data, but it would gain nothing by it + // wasting theirs and our bandwidth 1:1. Therefore blobs that don't pair well are just ignored. + // + // RpcBlock::new ensures that the count of blobs is consistent with the block + blocks + .into_iter() + .map(|block| { + let block_root = get_block_root(&block); + let max_blobs_per_block = spec.max_blobs_per_block(block.epoch()) as usize; + let blobs = blobs_by_block.remove(&block_root).unwrap_or_default(); + // BlobsByRange request handler enforces that blobs are sorted by index + let blobs = RuntimeVariableList::new(blobs, max_blobs_per_block).map_err(|_| { + Error::InternalError("Blobs returned exceeds max length".to_string()) + })?; + Ok(RpcBlock::new(Some(block_root), block, Some(blobs)) + .expect("TODO: don't do matching here")) + }) + .collect::>, Error>>() } fn couple_blocks_fulu( diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index 114b6d624aa..b2ed800040e 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -55,6 +55,14 @@ type DCByRootId = (SyncRequestId, Vec); impl TestRig { pub fn test_setup() -> Self { + Self::test_setup_with_options(false) + } + + pub fn test_setup_as_supernode() -> Self { + Self::test_setup_with_options(true) + } + + fn test_setup_with_options(is_supernode: bool) -> Self { // Use `fork_from_env` logic to set correct fork epochs let spec = test_spec::(); @@ -83,10 +91,11 @@ impl TestRig { // TODO(das): make the generation of the ENR use the deterministic rng to have consistent // column assignments let network_config = Arc::new(NetworkConfig::default()); - let globals = Arc::new(NetworkGlobals::new_test_globals( + let globals = Arc::new(NetworkGlobals::new_test_globals_as_supernode( Vec::new(), network_config, chain.spec.clone(), + is_supernode, )); let (beacon_processor, beacon_processor_rx) = NetworkBeaconProcessor::null_for_testing( globals, diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 31c0d30a6ad..2e9c608286d 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -301,22 +301,14 @@ impl TestRig { } fn zero_block_at_slot(&mut self, slot: Slot, with_data: bool) -> Arc> { - let fork_name = self.spec.fork_name_at_slot::(slot); - let mut block = match fork_name { - ForkName::Fulu => { - let mut block = BeaconBlock::empty(&self.spec); - if with_data { - block - .body_mut() - .blob_kzg_commitments_mut() - .expect("fulu block") - .push(KzgCommitment([0; 48])) - .expect("pushed to empty kzg commitments"); - } - block + let mut block = BeaconBlock::empty(&self.spec); + if with_data { + if let Ok(blob_kzg_commitments) = block.body_mut().blob_kzg_commitments_mut() { + blob_kzg_commitments + .push(KzgCommitment([0; 48])) + .expect("pushed to empty kzg commitments"); } - _ => todo!("extend for other forks"), - }; + } *block.slot_mut() = slot; Arc::new(SignedBeaconBlock::from_block(block, Signature::empty())) } @@ -953,7 +945,7 @@ fn finalized_sync_permanent_custody_peer_failure() { let mut requested_peers = HashSet::new(); - for i in 0..PEERS_IN_BATCH { + for i in 0..PEERS_IN_BATCH - 1 { r.log(&format!("Loop {i} of custody failure round")); // Some peer had a costudy failure at `column_index` so sync should do a single extra request From 95d0c0835eb4a6909cbc09ff2c88fddd4c9cf915 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 12 May 2025 16:54:20 -0300 Subject: [PATCH 10/11] Test for supernode peer only --- .../network/src/sync/network_context.rs | 12 +++- .../sync/network_context/custody_by_range.rs | 7 ++ beacon_node/network/src/sync/tests/lookups.rs | 19 +++++ beacon_node/network/src/sync/tests/mod.rs | 2 + beacon_node/network/src/sync/tests/range.rs | 71 +++++++++++++++---- 5 files changed, 96 insertions(+), 15 deletions(-) diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 0d05c584684..3c74e77dba6 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -504,11 +504,14 @@ impl SyncNetworkContext { .iter() .filter(|block| block.as_block().has_data()) .count(); + // Don't log the peer_group here, it's very long (could be up to 128 peers). If you + // want to trace which peer sent the column at index X, search for the log: + // `Sync RPC request sent method="DataColumnsByRange" ...` debug!( %id, blocks = blocks.len(), blocks_with_data, - peers = ?peer_group, + block_peer = ?peer_group.block(), "Block components by range request success, removing" ) } @@ -1401,8 +1404,11 @@ impl SyncNetworkContext { // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to // an Option first to use in an `if let Some() { act on result }` block. match result.as_ref() { - Some(Ok((columns, peer_group, _))) => { - debug!(%id, count = columns.len(), peers = ?peer_group, "Custody by range request success, removing") + Some(Ok((columns, _peer_group, _))) => { + // Don't log the peer_group here, it's very long (could be up to 128 peers). If you + // want to trace which peer sent the column at index X, search for the log: + // `Sync RPC request sent method="DataColumnsByRange" ...` + debug!(%id, count = columns.len(), "Custody by range request success, removing") } Some(Err(e)) => { debug!(%id, error = ?e, "Custody by range request failure, removing" ) diff --git a/beacon_node/network/src/sync/network_context/custody_by_range.rs b/beacon_node/network/src/sync/network_context/custody_by_range.rs index d5e974f7043..0463eb07185 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_range.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_range.rs @@ -267,6 +267,13 @@ impl ActiveCustodyByRangeRequest { // Log missing_column_indexes and incorrect_column_indices here in batch per request // to make this logs more compact and less noisy. if !imported_column_indices.is_empty() { + // TODO(das): this log may be redundant. We already log on DataColumnsByRange + // completed, and on DataColumnsByRange sent we log the column indices + // ``` + // Sync RPC request sent method="DataColumnsByRange" slots=8 epoch=4 columns=[52] peer=16Uiu2HAmEooeoHzHDYS35TSHrJDSfmREecPyFskrLPYm9Gm1EURj id=493/399/10/RangeSync/4/1 + // Sync RPC request completed id=493/399/10/RangeSync/4/1 method="DataColumnsByRange" count=1 + // ``` + // Which can be traced to this custody by range request, and the initial log debug!( id = %self.id, data_columns_by_range_req_id = %req_id, diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index b2ed800040e..f3224f4f8e6 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -53,6 +53,11 @@ const SAMPLING_REQUIRED_SUCCESSES: usize = 2; type DCByRootIds = Vec; type DCByRootId = (SyncRequestId, Vec); +pub enum PeersConfig { + SupernodeAndRandom, + SupernodeOnly, +} + impl TestRig { pub fn test_setup() -> Self { Self::test_setup_with_options(false) @@ -400,6 +405,20 @@ impl TestRig { k256::ecdsa::SigningKey::random(&mut self.rng).into() } + pub fn new_connected_peers(&mut self, config: PeersConfig) { + match config { + PeersConfig::SupernodeAndRandom => { + for _ in 0..100 { + self.new_connected_peer(); + } + self.new_connected_supernode_peer(); + } + PeersConfig::SupernodeOnly => { + self.new_connected_supernode_peer(); + } + } + } + pub fn new_connected_peers_for_peerdas(&mut self) { // Enough sampling peers with few columns for _ in 0..100 { diff --git a/beacon_node/network/src/sync/tests/mod.rs b/beacon_node/network/src/sync/tests/mod.rs index 40a58bc325e..a09313c5021 100644 --- a/beacon_node/network/src/sync/tests/mod.rs +++ b/beacon_node/network/src/sync/tests/mod.rs @@ -16,6 +16,8 @@ use store::MemoryStore; use tokio::sync::mpsc; use types::{ChainSpec, ForkName, MinimalEthSpec as E, SignedBeaconBlock}; +pub use lookups::PeersConfig; + mod lookups; mod range; diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 2e9c608286d..323c913cace 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -6,9 +6,10 @@ use crate::sync::network_context::RangeRequestId; use crate::sync::range_sync::RangeSyncType; use crate::sync::SyncMessage; use beacon_chain::data_column_verification::CustodyDataColumn; -use beacon_chain::test_utils::{AttestationStrategy, BlockStrategy}; +use beacon_chain::test_utils::{test_spec, AttestationStrategy, BlockStrategy}; use beacon_chain::{block_verification_types::RpcBlock, EngineState, NotifyExecutionLayer}; use beacon_processor::WorkType; +use lighthouse_network::discovery::{peer_id_to_node_id, CombinedKey}; use lighthouse_network::rpc::methods::{ BlobsByRangeRequest, DataColumnsByRangeRequest, OldBlocksByRangeRequest, }; @@ -17,13 +18,16 @@ use lighthouse_network::service::api_types::{ AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, DataColumnsByRangeRequestId, SyncRequestId, }; -use lighthouse_network::{PeerId, SyncInfo}; +use lighthouse_network::{Enr, EnrExt, PeerId, SyncInfo}; +use rand::SeedableRng; +use rand_chacha::ChaCha20Rng; use std::collections::HashSet; use std::time::Duration; +use types::data_column_custody_group::compute_subnets_for_node; use types::{ - BeaconBlock, BlobSidecarList, BlockImportSource, ColumnIndex, DataColumnSidecar, Epoch, - EthSpec, Hash256, KzgCommitment, MinimalEthSpec as E, Signature, SignedBeaconBlock, - SignedBeaconBlockHash, Slot, VariableList, + BeaconBlock, BlobSidecarList, BlockImportSource, ColumnIndex, DataColumnSidecar, + DataColumnSubnetId, Epoch, EthSpec, Hash256, KzgCommitment, MinimalEthSpec as E, Signature, + SignedBeaconBlock, SignedBeaconBlockHash, Slot, VariableList, }; const D: Duration = Duration::new(0, 0); @@ -60,6 +64,10 @@ impl ByRangeDataRequestIds { } } +struct Config { + peers: PeersConfig, +} + /// Sync tests are usually written in the form: /// - Do some action /// - Expect a request to be sent @@ -847,9 +855,20 @@ fn finalized_sync_enough_global_custody_peers_few_chain_peers() { r.complete_and_process_range_sync_until(last_epoch, filter(), complete()); } +// Same test with different types of peers: +// - 100 peers +// - 1 supernode +// - perfectly distributed peer ids + #[test] -fn finalized_sync_not_enough_custody_peers_on_start() { - let mut r = TestRig::test_setup(); +fn finalized_sync_not_enough_custody_peers_on_start_supernode_only() { + finalized_sync_not_enough_custody_peers_on_start(Config { + peers: PeersConfig::SupernodeOnly, + }); +} + +fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { + let mut r = TestRig::test_setup_as_supernode(); // Only run post-PeerDAS if !r.fork_name.fulu_enabled() { return; @@ -869,14 +888,10 @@ fn finalized_sync_not_enough_custody_peers_on_start() { r.expect_empty_network(); // Generate enough peers and supernodes to cover all custody columns - r.new_connected_peers_for_peerdas(); + r.new_connected_peers(config.peers); // Note: not necessary to add this peers to the chain, as we draw from the global pool // We still need to add enough peers to trigger batch downloads with idle peers. Same issue as // the test above. - r.add_random_peers( - remote_info, - (advanced_epochs + EXTRA_SYNCED_EPOCHS - 1) as usize, - ); let last_epoch = advanced_epochs + EXTRA_SYNCED_EPOCHS; r.complete_and_process_range_sync_until(last_epoch, filter(), complete()); @@ -971,3 +986,35 @@ fn finalized_sync_permanent_custody_peer_failure() { // TODO(das): send batch 1 for completing processing and check that SyncingChain processed batch // 1 successfully } + +#[test] +#[ignore] +fn mine_peerids() { + let spec = test_spec::(); + let mut rng = ChaCha20Rng::from_seed([0u8; 32]); + + let expected_subnets = (0..3) + .map(|i| DataColumnSubnetId::new(i as u64)) + .collect::>(); + + for i in 0..usize::MAX { + let key: CombinedKey = k256::ecdsa::SigningKey::random(&mut rng).into(); + let enr = Enr::builder().build(&key).unwrap(); + let peer_id = enr.peer_id(); + // Use default custody groups count + let node_id = peer_id_to_node_id(&peer_id).expect("convert peer_id to node_id"); + let subnets = compute_subnets_for_node(node_id.raw(), spec.custody_requirement, &spec) + .expect("should compute custody subnets"); + if expected_subnets == subnets { + panic!("{:?}", subnets); + } else { + let matches = expected_subnets + .iter() + .filter(|index| subnets.contains(index)) + .count(); + if matches > 0 { + println!("{i} {:?}", matches); + } + } + } +} From a37089462921625c65ef30fa7fd5dc2ab9012a72 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 19 May 2025 16:46:56 -0500 Subject: [PATCH 11/11] Improve sync tests --- beacon_node/network/src/sync/manager.rs | 29 +- .../network/src/sync/network_context.rs | 53 ++- .../block_components_by_range.rs | 22 + .../sync/network_context/custody_by_range.rs | 6 +- .../src/sync/network_context/requests.rs | 6 +- .../network/src/sync/range_sync/chain.rs | 89 ++-- .../src/sync/range_sync/chain_collection.rs | 7 + .../network/src/sync/range_sync/mod.rs | 4 +- .../network/src/sync/range_sync/range.rs | 17 +- beacon_node/network/src/sync/tests/lookups.rs | 75 +++- beacon_node/network/src/sync/tests/range.rs | 397 ++++++++++++++---- 11 files changed, 501 insertions(+), 204 deletions(-) diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index de3b333317d..6a236ed6479 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -335,23 +335,6 @@ impl SyncManager { .collect() } - #[cfg(test)] - pub(crate) fn get_range_sync_chains( - &self, - ) -> Result, &'static str> { - self.range_sync.state() - } - - #[cfg(test)] - pub(crate) fn range_sync_state(&self) -> super::range_sync::SyncChainStatus { - self.range_sync.state() - } - - #[cfg(test)] - pub(crate) fn __range_failed_chains(&mut self) -> Vec { - self.range_sync.__failed_chains() - } - #[cfg(test)] pub(crate) fn get_failed_chains(&mut self) -> Vec { self.block_lookups.get_failed_chains() @@ -376,6 +359,18 @@ impl SyncManager { self.sampling.get_request_status(block_root, index) } + // Leak the full network context to prevent having to add many cfg(test) methods here + #[cfg(test)] + pub(crate) fn network(&mut self) -> &mut SyncNetworkContext { + &mut self.network + } + + // Leak the full range_sync to prevent having to add many cfg(test) methods here + #[cfg(test)] + pub(crate) fn range_sync(&mut self) -> &mut RangeSync { + &mut self.range_sync + } + #[cfg(test)] pub(crate) fn update_execution_engine_state(&mut self, state: EngineState) { self.handle_new_execution_engine_state(state); diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 3c74e77dba6..ad674697ca2 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -15,7 +15,9 @@ use crate::sync::block_lookups::SingleLookupId; use crate::sync::network_context::requests::BlobsByRootSingleBlockRequest; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessStatus, EngineState}; -pub use block_components_by_range::BlockComponentsByRangeRequest; +pub use block_components_by_range::{ + BlockComponentsByRangeRequest, BlockComponentsByRangeRequestStep, +}; use fnv::FnvHashMap; use lighthouse_network::rpc::methods::{BlobsByRangeRequest, DataColumnsByRangeRequest}; use lighthouse_network::rpc::{BlocksByRangeRequest, GoodbyeReason, RPCError, RequestType}; @@ -278,6 +280,14 @@ impl SyncNetworkContext { /// Returns the ids of all the requests made to the given peer_id. pub fn peer_disconnected(&mut self, peer_id: &PeerId) -> Vec { + self.active_requests() + .filter(|(_, request_peer)| *request_peer == peer_id) + .map(|(id, _)| id) + .collect() + } + + /// Returns the ids of all active requests + pub fn active_requests(&mut self) -> impl Iterator { // Note: using destructuring pattern without a default case to make sure we don't forget to // add new request types to this function. Otherwise, lookup sync can break and lookups // will get stuck if a peer disconnects during an active requests. @@ -302,29 +312,23 @@ impl SyncNetworkContext { } = self; let blocks_by_root_ids = blocks_by_root_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|id| SyncRequestId::SingleBlock { id: *id }); + .active_requests() + .map(|(id, peer)| (SyncRequestId::SingleBlock { id: *id }, peer)); let blobs_by_root_ids = blobs_by_root_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|id| SyncRequestId::SingleBlob { id: *id }); + .active_requests() + .map(|(id, peer)| (SyncRequestId::SingleBlob { id: *id }, peer)); let data_column_by_root_ids = data_columns_by_root_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::DataColumnsByRoot(*req_id)); + .active_requests() + .map(|(id, peer)| (SyncRequestId::DataColumnsByRoot(*id), peer)); let blocks_by_range_ids = blocks_by_range_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::BlocksByRange(*req_id)); + .active_requests() + .map(|(id, peer)| (SyncRequestId::BlocksByRange(*id), peer)); let blobs_by_range_ids = blobs_by_range_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::BlobsByRange(*req_id)); + .active_requests() + .map(|(id, peer)| (SyncRequestId::BlobsByRange(*id), peer)); let data_column_by_range_ids = data_columns_by_range_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::DataColumnsByRange(*req_id)); + .active_requests() + .map(|(id, peer)| (SyncRequestId::DataColumnsByRange(*id), peer)); blocks_by_root_ids .chain(blobs_by_root_ids) @@ -332,6 +336,17 @@ impl SyncNetworkContext { .chain(blocks_by_range_ids) .chain(blobs_by_range_ids) .chain(data_column_by_range_ids) + } + + pub fn active_block_components_by_range_requests( + &self, + ) -> Vec<( + ComponentsByRangeRequestId, + BlockComponentsByRangeRequestStep, + )> { + self.block_components_by_range_requests + .iter() + .map(|(id, req)| (*id, req.state_step())) .collect() } diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index a2f9e677052..0a15ebe92a2 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -84,6 +84,13 @@ impl From for RpcRequestSendError { } } +/// FOR TESTING ONLY +#[derive(Debug)] +pub enum BlockComponentsByRangeRequestStep { + BlocksRequest, + CustodyRequest, +} + impl BlockComponentsByRangeRequest { pub fn new( id: ComponentsByRangeRequestId, @@ -403,6 +410,21 @@ impl BlockComponentsByRangeRequest { self.continue_requests(cx) } + + pub fn state_step(&self) -> BlockComponentsByRangeRequestStep { + match &self.state { + State::Base { .. } => BlockComponentsByRangeRequestStep::BlocksRequest, + State::DenebEnabled { .. } => BlockComponentsByRangeRequestStep::BlocksRequest, + State::FuluEnabled(state) => match state { + FuluEnabledState::BlockRequest { .. } => { + BlockComponentsByRangeRequestStep::BlocksRequest + } + FuluEnabledState::CustodyRequest { .. } => { + BlockComponentsByRangeRequestStep::CustodyRequest + } + }, + } + } } fn couple_blocks_base( diff --git a/beacon_node/network/src/sync/network_context/custody_by_range.rs b/beacon_node/network/src/sync/network_context/custody_by_range.rs index 0463eb07185..c10b284c736 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_range.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_range.rs @@ -319,11 +319,7 @@ impl ActiveCustodyByRangeRequest { // Not having columns is not a permanent fault. The peer may be backfilling. self.peers_with_custody_failures.insert(peer_id); - cx.report_peer( - peer_id, - PeerAction::MidToleranceError, - "data column custody failure", - ); + cx.report_peer(peer_id, PeerAction::MidToleranceError, "custody_failure"); } } Err(err) => { diff --git a/beacon_node/network/src/sync/network_context/requests.rs b/beacon_node/network/src/sync/network_context/requests.rs index 7635717adce..75fa78710e7 100644 --- a/beacon_node/network/src/sync/network_context/requests.rs +++ b/beacon_node/network/src/sync/network_context/requests.rs @@ -171,12 +171,10 @@ impl ActiveRequests { } } - pub fn active_requests_of_peer(&self, peer_id: &PeerId) -> Vec<&K> { + pub fn active_requests(&self) -> impl Iterator { self.requests .iter() - .filter(|(_, request)| &request.peer_id == peer_id) - .map(|(id, _)| id) - .collect() + .map(|(id, request)| (id, &request.peer_id)) } pub fn iter_request_peers(&self) -> impl Iterator + '_ { diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 60cf72957ba..66c898ccb08 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -133,7 +133,40 @@ pub enum ChainSyncingState { Syncing, } +#[cfg(test)] +#[derive(Debug, Eq, PartialEq)] +pub enum BatchStateSummary { + Downloading, + Processing, + AwaitingProcessing, + AwaitingValidation, + Unexpected(&'static str), +} + impl SyncingChain { + /// Returns a summary of batch states for assertions in tests. + #[cfg(test)] + pub fn batches_state(&self) -> Vec<(BatchId, BatchStateSummary)> { + self.batches + .iter() + .map(|(id, batch)| { + let state = match batch.state() { + // A batch is never left in this state, it's only the initial value + BatchState::AwaitingDownload => { + BatchStateSummary::Unexpected("AwaitingDownload") + } + BatchState::Downloading { .. } => BatchStateSummary::Downloading, + BatchState::AwaitingProcessing { .. } => BatchStateSummary::AwaitingProcessing, + BatchState::Poisoned { .. } => BatchStateSummary::Unexpected("Poisoned"), + BatchState::Processing { .. } => BatchStateSummary::Processing, + BatchState::Failed => BatchStateSummary::Unexpected("Failed"), + BatchState::AwaitingValidation { .. } => BatchStateSummary::AwaitingValidation, + }; + (*id, state) + }) + .collect() + } + #[allow(clippy::too_many_arguments)] pub fn new( id: Id, @@ -419,11 +452,6 @@ impl SyncingChain { self.request_batches(network)?; } } - } else if !self.good_peers_on_sampling_subnets(self.processing_target, network) { - // This is to handle the case where no batch was sent for the current processing - // target when there is no sampling peers available. This is a valid state and should not - // return an error. - return Ok(KeepChain); } else { return Err(RemoveChain::WrongChainState(format!( "Batch not found for current processing target {}", @@ -964,13 +992,7 @@ impl SyncingChain { return Ok(KeepChain); } Err(e) => match e { - // TODO(das): Handle the NoPeer case explicitly and don't drop the batch. For - // sync to work properly it must be okay to have "stalled" batches in - // AwaitingDownload state. Currently it will error with invalid state if - // that happens. Sync manager must periodicatlly prune stalled batches like - // we do for lookup sync. Then we can deprecate the redundant - // `good_peers_on_sampling_subnets` checks. - e @ RpcRequestSendError::InternalError(_) => { + RpcRequestSendError::InternalError(e) => { // NOTE: under normal conditions this shouldn't happen but we handle it anyway warn!(%batch_id, error = ?e, "batch_id" = %batch_id, %batch, "Could not send batch request"); // register the failed download and check if the batch can be retried @@ -1029,11 +1051,6 @@ impl SyncingChain { // check if we have the batch for our optimistic start. If not, request it first. // We wait for this batch before requesting any other batches. if let Some(epoch) = self.optimistic_start { - if !self.good_peers_on_sampling_subnets(epoch, network) { - debug!("Waiting for peers to be available on sampling column subnets"); - return Ok(KeepChain); - } - if let Entry::Vacant(entry) = self.batches.entry(epoch) { let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH); entry.insert(optimistic_batch); @@ -1056,35 +1073,6 @@ impl SyncingChain { Ok(KeepChain) } - /// Checks all sampling column subnets for peers. Returns `true` if there is at least one peer in - /// every sampling column subnet. - fn good_peers_on_sampling_subnets( - &self, - epoch: Epoch, - network: &SyncNetworkContext, - ) -> bool { - if network.chain.spec.is_peer_das_enabled_for_epoch(epoch) { - // Require peers on all sampling column subnets before sending batches - let peers_on_all_custody_subnets = network - .network_globals() - .sampling_subnets - .iter() - .all(|subnet_id| { - let peer_count = network - .network_globals() - .peers - .read() - .good_custody_subnet_peer(*subnet_id) - .count(); - - peer_count > 0 - }); - peers_on_all_custody_subnets - } else { - true - } - } - /// Creates the next required batch from the chain. If there are no more batches required, /// `false` is returned. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] @@ -1117,15 +1105,6 @@ impl SyncingChain { return None; } - // don't send batch requests until we have peers on sampling subnets - // TODO(das): this is a workaround to avoid sending out excessive block requests because - // block and data column requests are currently coupled. This can be removed once we find a - // way to decouple the requests and do retries individually, see issue #6258. - if !self.good_peers_on_sampling_subnets(self.to_be_downloaded, network) { - debug!("Waiting for peers to be available on custody column subnets"); - return None; - } - // If no batch needs a retry, attempt to send the batch of the next epoch to download let next_batch_id = self.to_be_downloaded; // this batch could have been included already being an optimistic batch diff --git a/beacon_node/network/src/sync/range_sync/chain_collection.rs b/beacon_node/network/src/sync/range_sync/chain_collection.rs index c6be3de5767..290713a2453 100644 --- a/beacon_node/network/src/sync/range_sync/chain_collection.rs +++ b/beacon_node/network/src/sync/range_sync/chain_collection.rs @@ -54,6 +54,13 @@ pub struct ChainCollection { } impl ChainCollection { + #[cfg(test)] + pub(crate) fn iter(&self) -> impl Iterator> { + self.finalized_chains + .values() + .chain(self.head_chains.values()) + } + pub fn new(beacon_chain: Arc>) -> Self { ChainCollection { beacon_chain, diff --git a/beacon_node/network/src/sync/range_sync/mod.rs b/beacon_node/network/src/sync/range_sync/mod.rs index 66f07833ea0..c58e1579fd5 100644 --- a/beacon_node/network/src/sync/range_sync/mod.rs +++ b/beacon_node/network/src/sync/range_sync/mod.rs @@ -11,8 +11,8 @@ pub use batch::{ BatchConfig, BatchInfo, BatchOperationOutcome, BatchPeerGroup, BatchProcessingResult, BatchState, }; -pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; #[cfg(test)] -pub use chain_collection::SyncChainStatus; +pub use chain::BatchStateSummary; +pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; pub use range::RangeSync; pub use sync_type::RangeSyncType; diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index 929752e3e26..6f3f8408622 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs @@ -39,7 +39,7 @@ //! Each chain is downloaded in batches of blocks. The batched blocks are processed sequentially //! and further batches are requested as current blocks are being processed. -use super::chain::{BatchId, ChainId, RemoveChain, SyncingChain}; +use super::chain::{BatchId, BatchStateSummary, ChainId, RemoveChain, SyncingChain}; use super::chain_collection::{ChainCollection, SyncChainStatus}; use super::sync_type::RangeSyncType; use super::BatchPeerGroup; @@ -100,10 +100,23 @@ where } #[cfg(test)] - pub(crate) fn __failed_chains(&mut self) -> Vec { + pub(crate) fn failed_chains(&mut self) -> Vec { self.failed_chains.keys().copied().collect() } + #[cfg(test)] + pub(crate) fn batches_state(&self) -> Vec<(ChainId, BatchId, BatchStateSummary)> { + self.chains + .iter() + .flat_map(|chain| { + chain + .batches_state() + .into_iter() + .map(|(batch_id, state)| (chain.get_id(), batch_id, state)) + }) + .collect() + } + #[instrument(parent = None, level = "info", fields(component = "range_sync"), diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index f3224f4f8e6..74b530b66e8 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -35,7 +35,7 @@ use lighthouse_network::{ SamplingRequester, SingleLookupReqId, SyncRequestId, }, types::SyncState, - NetworkConfig, NetworkGlobals, PeerId, + NetworkConfig, NetworkGlobals, PeerId, SyncInfo, }; use slot_clock::{SlotClock, TestingSlotClock}; use tokio::sync::mpsc; @@ -259,8 +259,8 @@ impl TestRig { self.sync_manager.active_parent_lookups().len() } - fn active_range_sync_chain(&self) -> (RangeSyncType, Slot, Slot) { - self.sync_manager.get_range_sync_chains().unwrap().unwrap() + fn active_range_sync_chain(&mut self) -> (RangeSyncType, Slot, Slot) { + self.sync_manager.range_sync().state().unwrap().unwrap() } fn assert_single_lookups_count(&self, count: usize) { @@ -370,15 +370,17 @@ impl TestRig { self.expect_empty_network(); } - pub fn new_connected_peer(&mut self) -> PeerId { + // Don't make pub, use `add_connected_peer_testing_only` + fn new_connected_peer(&mut self) -> PeerId { self.add_connected_peer_testing_only(false) } - pub fn new_connected_supernode_peer(&mut self) -> PeerId { + // Don't make pub, use `add_connected_peer_testing_only` + fn new_connected_supernode_peer(&mut self) -> PeerId { self.add_connected_peer_testing_only(true) } - fn add_connected_peer_testing_only(&mut self, supernode: bool) -> PeerId { + pub fn add_connected_peer_testing_only(&mut self, supernode: bool) -> PeerId { let key = self.determinstic_key(); let peer_id = self .network_globals @@ -401,20 +403,26 @@ impl TestRig { peer_id } + pub fn add_sync_peer(&mut self, supernode: bool, remote_info: SyncInfo) -> PeerId { + let peer_id = self.add_connected_peer_testing_only(supernode); + self.send_sync_message(SyncMessage::AddPeer(peer_id, remote_info)); + peer_id + } + fn determinstic_key(&mut self) -> CombinedKey { k256::ecdsa::SigningKey::random(&mut self.rng).into() } - pub fn new_connected_peers(&mut self, config: PeersConfig) { + pub fn add_sync_peers(&mut self, config: PeersConfig, remote_info: SyncInfo) { match config { PeersConfig::SupernodeAndRandom => { for _ in 0..100 { - self.new_connected_peer(); + self.add_sync_peer(false, remote_info.clone()); } - self.new_connected_supernode_peer(); + self.add_sync_peer(true, remote_info); } PeersConfig::SupernodeOnly => { - self.new_connected_supernode_peer(); + self.add_sync_peer(true, remote_info); } } } @@ -881,6 +889,19 @@ impl TestRig { } } + // Find, not pop + pub fn filter_received_network_events) -> Option>( + &mut self, + predicate_transform: F, + ) -> Vec { + self.drain_network_rx(); + + self.network_rx_queue + .iter() + .filter_map(predicate_transform) + .collect() + } + pub fn pop_received_processor_event) -> Option>( &mut self, predicate_transform: F, @@ -1134,6 +1155,21 @@ impl TestRig { } } + pub fn expect_no_penalty_for_anyone(&mut self) { + self.drain_network_rx(); + let downscore_events = self + .network_rx_queue + .iter() + .filter_map(|ev| match ev { + NetworkMessage::ReportPeer { peer_id, msg, .. } => Some((peer_id, msg)), + _ => None, + }) + .collect::>(); + if !downscore_events.is_empty() { + panic!("Expected no downscoring events but found: {downscore_events:?}"); + } + } + #[track_caller] fn expect_parent_chain_process(&mut self) { match self.beacon_processor_rx.try_recv() { @@ -1169,6 +1205,25 @@ impl TestRig { } } + #[track_caller] + pub fn expect_penalties(&mut self, expected_penalty_msg: &'static str) { + let all_penalties = self.filter_received_network_events(|ev| match ev { + NetworkMessage::ReportPeer { peer_id, msg, .. } => Some((*peer_id, *msg)), + _ => None, + }); + if all_penalties + .iter() + .any(|(_, msg)| *msg != expected_penalty_msg) + { + panic!( + "Expected penalties only of {expected_penalty_msg}, but found {all_penalties:?}" + ); + } + self.log(&format!( + "Found expected penalties {expected_penalty_msg}: {all_penalties:?}" + )); + } + #[track_caller] pub fn expect_penalty(&mut self, peer_id: PeerId, expect_penalty_msg: &'static str) { let penalty_msg = self diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 323c913cace..c82e4f97769 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -2,9 +2,9 @@ use super::*; use crate::network_beacon_processor::ChainSegmentProcessId; use crate::status::ToStatusMessage; use crate::sync::manager::SLOT_IMPORT_TOLERANCE; -use crate::sync::network_context::RangeRequestId; -use crate::sync::range_sync::RangeSyncType; -use crate::sync::SyncMessage; +use crate::sync::network_context::{BlockComponentsByRangeRequestStep, RangeRequestId}; +use crate::sync::range_sync::{BatchId, BatchStateSummary, RangeSyncType}; +use crate::sync::{ChainId, SyncMessage}; use beacon_chain::data_column_verification::CustodyDataColumn; use beacon_chain::test_utils::{test_spec, AttestationStrategy, BlockStrategy}; use beacon_chain::{block_verification_types::RpcBlock, EngineState, NotifyExecutionLayer}; @@ -18,6 +18,7 @@ use lighthouse_network::service::api_types::{ AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, DataColumnsByRangeRequestId, SyncRequestId, }; +use lighthouse_network::types::SyncState; use lighthouse_network::{Enr, EnrExt, PeerId, SyncInfo}; use rand::SeedableRng; use rand_chacha::ChaCha20Rng; @@ -68,6 +69,14 @@ struct Config { peers: PeersConfig, } +type BlocksByRangeRequestData = (BlocksByRangeRequestId, PeerId, OldBlocksByRangeRequest); + +type DataColumnsByRangeRequestData = ( + DataColumnsByRangeRequestId, + PeerId, + DataColumnsByRangeRequest, +); + /// Sync tests are usually written in the form: /// - Do some action /// - Expect a request to be sent @@ -99,6 +108,36 @@ impl RequestFilter { self } + fn blocks_by_range_requests( + &self, + ev: &NetworkMessage, + ) -> Option { + match ev { + NetworkMessage::SendRequest { + peer_id, + request: RequestType::BlocksByRange(req), + app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRange(id)), + } if self.matches_blocks_by_range(peer_id, req) => Some((*id, *peer_id, req.clone())), + _ => None, + } + } + + fn data_columns_by_range_requests( + &self, + ev: &NetworkMessage, + ) -> Option { + match ev { + NetworkMessage::SendRequest { + peer_id, + request: RequestType::DataColumnsByRange(req), + app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), + } if self.matches_data_columns_by_range(peer_id, req) => { + Some((*id, *peer_id, req.clone())) + } + _ => None, + } + } + fn matches_blocks_by_range(&self, peer: &PeerId, req: &OldBlocksByRangeRequest) -> bool { self.matches_common(peer, *req.start_slot()) } @@ -182,7 +221,7 @@ impl TestRig { /// Produce a head peer with an advanced head fn add_head_peer_with_root(&mut self, head_root: Hash256) -> PeerId { let local_info = self.local_info(); - self.add_random_peer(SyncInfo { + self.add_connected_sync_random_peer(SyncInfo { head_root, head_slot: local_info.head_slot + 1 + Slot::new(SLOT_IMPORT_TOLERANCE as u64), ..local_info @@ -198,7 +237,7 @@ impl TestRig { fn add_finalized_peer_with_root(&mut self, finalized_root: Hash256) -> PeerId { let local_info = self.local_info(); let finalized_epoch = local_info.finalized_epoch + 2; - self.add_random_peer(SyncInfo { + self.add_connected_sync_random_peer(SyncInfo { finalized_epoch, finalized_root, head_slot: finalized_epoch.start_slot(E::slots_per_epoch()), @@ -233,37 +272,22 @@ impl TestRig { } } - fn add_random_peer_not_supernode(&mut self, remote_info: SyncInfo) -> PeerId { - let peer_id = self.new_connected_peer(); - self.send_sync_message(SyncMessage::AddPeer(peer_id, remote_info)); - peer_id + fn add_connected_sync_peer_not_supernode(&mut self, remote_info: SyncInfo) -> PeerId { + self.add_sync_peer(false, remote_info) } - fn add_random_peer(&mut self, remote_info: SyncInfo) -> PeerId { + fn add_connected_sync_random_peer(&mut self, remote_info: SyncInfo) -> PeerId { // Create valid peer known to network globals // TODO(fulu): Using supernode peers to ensure we have peer across all column // subnets for syncing. Should add tests connecting to full node peers. - let peer_id = self.new_connected_supernode_peer(); - // Send peer to sync - self.send_sync_message(SyncMessage::AddPeer(peer_id, remote_info)); - peer_id - } - - fn add_random_peers(&mut self, remote_info: SyncInfo, count: usize) { - for _ in 0..count { - let peer = self.new_connected_peer(); - self.add_peer(peer, remote_info.clone()); - } - } - - fn add_peer(&mut self, peer: PeerId, remote_info: SyncInfo) { - self.send_sync_message(SyncMessage::AddPeer(peer, remote_info)); + self.add_sync_peer(true, remote_info) } - fn assert_state(&self, state: RangeSyncType) { + fn assert_state(&mut self, state: RangeSyncType) { assert_eq!( self.sync_manager - .range_sync_state() + .range_sync() + .state() .expect("State is ok") .expect("Range should be syncing, there are no chains") .0, @@ -272,15 +296,28 @@ impl TestRig { ); } - fn assert_no_chains_exist(&self) { - if let Some(chain) = self.sync_manager.get_range_sync_chains().unwrap() { + fn get_sync_state(&mut self) -> SyncState { + self.sync_manager.network().network_globals().sync_state() + } + + fn get_batch_states(&mut self) -> Vec<(ChainId, BatchId, BatchStateSummary)> { + self.sync_manager.range_sync().batches_state() + } + + fn assert_sync_state(&mut self) { + let current_state = self.sync_manager.network().network_globals().sync_state(); + panic!("{:?}", current_state); + } + + fn assert_no_chains_exist(&mut self) { + if let Some(chain) = self.sync_manager.range_sync().state().unwrap() { panic!("There still exists a chain {chain:?}"); } } fn assert_no_failed_chains(&mut self) { assert_eq!( - self.sync_manager.__range_failed_chains(), + self.sync_manager.range_sync().failed_chains(), Vec::::new(), "Expected no failed chains" ) @@ -296,13 +333,81 @@ impl TestRig { } } - fn expect_no_columns_by_range_requests(&mut self, request_filter: RequestFilter) { - let requests = self.find_data_columns_by_range_requests(request_filter); + fn expect_blocks_by_range_requests(&mut self, request_filter: RequestFilter) { + let events = + self.filter_received_network_events(|ev| request_filter.blocks_by_range_requests(ev)); + if events.is_empty() { + panic!("Expected to find blocks_by_range requests {request_filter:?}") + } + } + + fn expect_no_data_columns_by_range_requests(&mut self, request_filter: RequestFilter) { + let events = self + .filter_received_network_events(|ev| request_filter.data_columns_by_range_requests(ev)); + if !events.is_empty() { + panic!("Expected to not find data_columns_by_range requests {request_filter:?} by found {events:?}") + } + } + + fn expect_active_block_components_by_range_request_on_custody_step(&mut self) { + let requests = self + .sync_manager + .network() + .active_block_components_by_range_requests(); + if requests.is_empty() { + panic!("No active block_components_by_range requests"); + } + for (id, step) in requests { + if !matches!(step, BlockComponentsByRangeRequestStep::CustodyRequest) { + panic!("block_components_by_range request {id} is not on CustodyRequest step: {step:?}"); + } + } + } + + fn expect_no_active_block_components_by_range_requests(&mut self) { + let requests = self + .sync_manager + .network() + .active_block_components_by_range_requests(); + if !requests.is_empty() { + panic!("Still active block_components_by_range requests {requests:?}"); + } + } + + fn expect_no_active_rpc_requests(&mut self) { + let requests = self + .sync_manager + .network() + .active_requests() + .collect::>(); if !requests.is_empty() { - panic!("Expected no DataColumnsByRange requests with filter {request_filter:?} but found {requests:?}"); + panic!("There are still active RPC requests {requests:?}"); + } + } + + fn expect_all_batches_in_state(&mut self, states: &[BatchStateSummary]) { + let batches = self.get_batch_states(); + if batches.is_empty() { + panic!("no batches"); + } + for batch in &batches { + if !states.contains(&batch.2) { + panic!("batch {batch:?} not in state {states:?}. Batches: {batches:?}"); + } } } + fn expect_all_batches_downloading(&mut self) { + self.expect_all_batches_in_state(&[BatchStateSummary::Downloading]); + } + + fn expect_all_batches_processing_or_awaiting(&mut self) { + self.expect_all_batches_in_state(&[ + BatchStateSummary::Processing, + BatchStateSummary::AwaitingProcessing, + ]); + } + fn update_execution_engine_state(&mut self, state: EngineState) { self.log(&format!("execution engine state updated: {state:?}")); self.sync_manager.update_execution_engine_state(state); @@ -397,26 +502,17 @@ impl TestRig { }); } - fn find_blocks_by_range_request( + fn pop_blocks_by_range_request( &mut self, request_filter: RequestFilter, ) -> (BlocksByRangeRequestId, PeerId, OldBlocksByRangeRequest) { - self.pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id, - request: RequestType::BlocksByRange(req), - app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRange(id)), - } if request_filter.matches_blocks_by_range(peer_id, req) => { - Some((*id, *peer_id, req.clone())) - } - _ => None, - }) - .unwrap_or_else(|e| { - panic!("Should have a BlocksByRange request, filter {request_filter:?}: {e:?}") - }) + self.pop_received_network_event(|ev| request_filter.blocks_by_range_requests(ev)) + .unwrap_or_else(|e| { + panic!("Should have a BlocksByRange request, filter {request_filter:?}: {e:?}") + }) } - fn find_data_columns_by_range_requests( + fn pop_data_columns_by_range_requests( &mut self, request_filter: RequestFilter, ) -> Vec<( @@ -425,16 +521,9 @@ impl TestRig { DataColumnsByRangeRequest, )> { let mut data_columns_requests = vec![]; - while let Ok(data_columns_request) = self.pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id, - request: RequestType::DataColumnsByRange(req), - app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), - } if request_filter.matches_data_columns_by_range(peer_id, req) => { - Some((*id, *peer_id, req.clone())) - } - _ => None, - }) { + while let Ok(data_columns_request) = + self.pop_received_network_event(|ev| request_filter.data_columns_by_range_requests(ev)) + { data_columns_requests.push(data_columns_request); } data_columns_requests @@ -445,7 +534,7 @@ impl TestRig { request_filter: RequestFilter, ) -> ByRangeDataRequestIds { if self.after_fulu() { - let data_columns_requests = self.find_data_columns_by_range_requests(request_filter); + let data_columns_requests = self.pop_data_columns_by_range_requests(request_filter); if data_columns_requests.is_empty() { panic!("Found zero DataColumnsByRange requests, filter {request_filter:?}"); } @@ -487,8 +576,25 @@ impl TestRig { complete_config: CompleteConfig, ) -> RangeRequestId { let (blocks_req_id, block_peer, blocks_req) = - self.find_blocks_by_range_request(request_filter); + self.pop_blocks_by_range_request(request_filter); + + let start_slot = Slot::new(*blocks_req.start_slot()); + let blocks = (0..complete_config.block_count) + .map(|i| { + self.zero_block_at_slot(start_slot + Slot::new(i as u64), complete_config.with_data) + }) + .collect::>(); + self.send_blocks_by_range_response(blocks_req_id, block_peer, &blocks); + blocks_req_id.parent_request_id.requester + } + + fn complete_blocks_by_range_request( + &mut self, + request: BlocksByRangeRequestData, + complete_config: CompleteConfig, + ) -> RangeRequestId { + let (blocks_req_id, block_peer, blocks_req) = request; let start_slot = Slot::new(*blocks_req.start_slot()); let blocks = (0..complete_config.block_count) .map(|i| { @@ -500,6 +606,66 @@ impl TestRig { blocks_req_id.parent_request_id.requester } + fn complete_data_columns_by_range_request( + &mut self, + (id, peer_id, req): DataColumnsByRangeRequestData, + complete_config: CompleteConfig, + ) { + // To reply with a valid DataColumnsByRange we need to construct + // DataColumnsByRange for the block root that we requested the block peer, plus + // figure out which exact columns we requested this peer + + let components_by_range_req_id = id.parent_request_id.parent_request_id; + let blocks = self.last_sent_blocks_by_range(components_by_range_req_id); + + let data_columns = blocks + .iter() + .flat_map(|block| { + let kzg_commitments_inclusion_proof = block + .message() + .body() + .kzg_commitments_merkle_proof() + .unwrap(); + let kzg_commitments = block + .message() + .body() + .blob_kzg_commitments() + .unwrap() + .clone(); + let signed_block_header = block.signed_block_header(); + + req.columns.iter().filter_map(move |index| { + // Skip column generation if index is marked as failure + if complete_config.custody_failure_at_index == Some(*index) { + return None; + } + + // We need to produce a DataColumn with valid inclusion proof, but can + // be with random KZG proof and data as we won't send it for processing + Some(Arc::new(DataColumnSidecar { + index: *index, + column: VariableList::empty(), + kzg_commitments: kzg_commitments.clone(), + kzg_proofs: VariableList::from(vec![]), + signed_block_header: signed_block_header.clone(), + kzg_commitments_inclusion_proof: kzg_commitments_inclusion_proof.clone(), + })) + }) + }) + .collect::>(); + + // Need to log here because I can't capture &mut self inside the columns iter + if !blocks.is_empty() { + if let Some(index) = complete_config.custody_failure_at_index { + self.log(&format!( + "Forced custody failure at request {id} for peer {peer_id} index {index:?}" + )); + } + } + + self.send_data_columns_by_range_response(id, peer_id, &data_columns); + } + fn find_and_complete_data_by_range_request( &mut self, request_filter: RequestFilter, @@ -588,6 +754,33 @@ impl TestRig { } } + fn progress_until_no_events( + &mut self, + request_filter: RequestFilter, + complete_config: CompleteConfig, + ) { + loop { + if let Ok(request) = + self.pop_received_network_event(|ev| request_filter.blocks_by_range_requests(ev)) + { + self.complete_blocks_by_range_request(request, complete_config); + continue; + } + + if let Ok(request) = self + .pop_received_network_event(|ev| request_filter.data_columns_by_range_requests(ev)) + { + self.complete_data_columns_by_range_request(request, complete_config); + continue; + } + + let sync_state = self.get_sync_state(); + self.log(&format!("Progressed sync, current state: {:?}", sync_state,)); + + return; + } + } + fn find_and_complete_processing_chain_segment(&mut self, id: ChainSegmentProcessId) { self.pop_received_processor_event(|ev| { (ev.work_type() == WorkType::ChainSegment).then_some(()) @@ -745,14 +938,14 @@ fn head_chain_removed_while_finalized_syncing() { rig.assert_state(RangeSyncType::Head); // Sync should have requested a batch, grab the request. - let _ = rig.find_blocks_by_range_request(filter().peer(head_peer)); + let _ = rig.pop_blocks_by_range_request(filter().peer(head_peer)); // Now get a peer with an advanced finalized epoch. let finalized_peer = rig.add_finalized_peer(); rig.assert_state(RangeSyncType::Finalized); // Sync should have requested a batch, grab the request - let _ = rig.find_blocks_by_range_request(filter().peer(finalized_peer)); + let _ = rig.pop_blocks_by_range_request(filter().peer(finalized_peer)); // Fail the head chain by disconnecting the peer. rig.peer_disconnected(head_peer); @@ -779,14 +972,14 @@ async fn state_update_while_purging() { rig.assert_state(RangeSyncType::Head); // Sync should have requested a batch, grab the request. - let _ = rig.find_blocks_by_range_request(filter().peer(head_peer)); + let _ = rig.pop_blocks_by_range_request(filter().peer(head_peer)); // Now get a peer with an advanced finalized epoch. let finalized_peer = rig.add_finalized_peer_with_root(finalized_peer_root); rig.assert_state(RangeSyncType::Finalized); // Sync should have requested a batch, grab the request - let _ = rig.find_blocks_by_range_request(filter().peer(finalized_peer)); + let _ = rig.pop_blocks_by_range_request(filter().peer(finalized_peer)); // Now the chain knows both chains target roots. rig.remember_block(head_peer_block).await; @@ -848,7 +1041,7 @@ fn finalized_sync_enough_global_custody_peers_few_chain_peers() { // Current priorization only sends batches to idle peers, so we need enough peers for each batch // TODO: Test this with a single peer in the chain, it should still work - r.add_random_peers(remote_info, 1); + r.add_sync_peer(false, remote_info); r.assert_state(RangeSyncType::Finalized); let last_epoch = advanced_epochs + EXTRA_SYNCED_EPOCHS; @@ -867,6 +1060,13 @@ fn finalized_sync_not_enough_custody_peers_on_start_supernode_only() { }); } +#[test] +fn finalized_sync_not_enough_custody_peers_on_start_supernode_and_random() { + finalized_sync_not_enough_custody_peers_on_start(Config { + peers: PeersConfig::SupernodeAndRandom, + }); +} + fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { let mut r = TestRig::test_setup_as_supernode(); // Only run post-PeerDAS @@ -879,22 +1079,31 @@ fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { // Unikely that the single peer we added has enough columns for us. Tests are determinstic and // this error should never be hit - r.add_random_peer_not_supernode(remote_info.clone()); + r.add_connected_sync_peer_not_supernode(remote_info.clone()); r.assert_state(RangeSyncType::Finalized); - // Because we don't have enough peers on all columns we haven't sent any request. - // NOTE: There's a small chance that this single peer happens to custody exactly the set we - // expect, in that case the test will fail. Find a way to make the test deterministic. - r.expect_empty_network(); + // The SyncingChain has a single peer, so it can issue blocks_by_range requests. However, it + // doesn't have enough peers to cover all columns + r.progress_until_no_events(filter(), complete()); + r.expect_no_active_rpc_requests(); + + // Here we have a batch with partially completed block_components_by_range requests. The batch + // should not have failed, we are still syncing, and there are no downscoring events. + r.expect_no_penalty_for_anyone(); + r.expect_active_block_components_by_range_request_on_custody_step(); // Generate enough peers and supernodes to cover all custody columns - r.new_connected_peers(config.peers); + r.add_sync_peers(config.peers, remote_info.clone()); // Note: not necessary to add this peers to the chain, as we draw from the global pool // We still need to add enough peers to trigger batch downloads with idle peers. Same issue as // the test above. - let last_epoch = advanced_epochs + EXTRA_SYNCED_EPOCHS; - r.complete_and_process_range_sync_until(last_epoch, filter(), complete()); + r.progress_until_no_events(filter(), complete()); + r.expect_no_active_rpc_requests(); + r.expect_no_active_block_components_by_range_requests(); + // TOOD(das): For now this tests don't complete sync. We can't track beacon processor Work + // events from here easily. What we pop from the beacon processor queue is an opaque closure + // wihtout any information. We don't know what batch it is for. } #[test] @@ -909,28 +1118,36 @@ fn finalized_sync_single_custody_peer_failure() { let remote_info = r.finalized_remote_info_advanced_by(advanced_epochs.into()); let column_index_to_fail = r.our_custody_indices().first().copied().unwrap(); - // Unikely that the single peer we added has enough columns for us. Tests are determinstic and - // this error should never be hit - r.add_random_peer(remote_info.clone()); + r.add_sync_peer(true, remote_info.clone()); r.assert_state(RangeSyncType::Finalized); // Progress all blocks_by_range and columns_by_range requests but respond empty for a single // column index - r.find_and_complete_block_components_by_range_request( - filter().epoch(0), + r.progress_until_no_events( + filter(), complete().custody_failure_at_index(column_index_to_fail), ); + r.expect_penalties("custody_failure"); + + // Some peer had a custody failure, but since there's a single peer in the batch we won't issue + // another request yet. + r.expect_no_active_rpc_requests(); + // Ensure that the block components by range request have not failed + r.expect_active_block_components_by_range_request_on_custody_step(); + r.expect_all_batches_downloading(); + + // After adding a new peer we will try to fetch from it + r.add_sync_peer(true, remote_info.clone()); + r.progress_until_no_events( + // Find the requests first to assert that this is the only request that exists + filter().column_index(column_index_to_fail), + // complete this one request without the custody failure now + complete(), + ); - // Some peer had a costudy failure at `column_index` so sync should do a single extra request - // for that index and epoch. - let reqs = r.find_data_by_range_request(filter().epoch(0).column_index(column_index_to_fail)); - // Find the requests first to assert that this is the only request that exists - r.expect_no_columns_by_range_requests(filter().epoch(0)); - // complete this one request without the custody failure now - r.complete_data_by_range_request(reqs, complete()); - - // TODO(das): send batch 1 for completing processing and check that SyncingChain processed batch - // 1 successfully + r.expect_no_active_rpc_requests(); + r.expect_no_active_block_components_by_range_requests(); + r.expect_all_batches_processing_or_awaiting(); } #[test] @@ -947,7 +1164,7 @@ fn finalized_sync_permanent_custody_peer_failure() { const PEERS_IN_BATCH: usize = 4; for _ in 0..PEERS_IN_BATCH { - r.add_random_peer(remote_info.clone()); + r.add_connected_sync_random_peer(remote_info.clone()); } r.assert_state(RangeSyncType::Finalized); @@ -975,7 +1192,7 @@ fn finalized_sync_permanent_custody_peer_failure() { requested_peers.insert(req_peer); // Find the requests first to assert that this is the only request that exists - r.expect_no_columns_by_range_requests(filter().epoch(0)); + r.expect_no_data_columns_by_range_requests(filter().epoch(0)); // complete this one request without the custody failure now r.complete_data_by_range_request( reqs,