From 4fb2ae658a8402e63b2163c1a9591bf656a5574f Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 21 May 2025 23:34:28 -0500 Subject: [PATCH 01/66] Implement reliable range sync for PeerDAS --- .../src/block_verification_types.rs | 8 +- beacon_node/beacon_chain/src/test_utils.rs | 3 +- .../src/service/api_types.rs | 22 +- .../lighthouse_network/src/types/globals.rs | 19 + .../network/src/sync/backfill_sync/mod.rs | 34 +- .../network/src/sync/block_lookups/mod.rs | 6 +- beacon_node/network/src/sync/manager.rs | 107 ++- beacon_node/network/src/sync/mod.rs | 1 - .../network/src/sync/network_context.rs | 587 ++++++------ .../block_components_by_range.rs | 550 +++++++++++ .../sync/network_context/custody_by_range.rs | 481 ++++++++++ .../{custody.rs => custody_by_root.rs} | 233 +++-- .../src/sync/network_context/requests.rs | 8 +- beacon_node/network/src/sync/peer_sampling.rs | 12 +- .../network/src/sync/range_sync/batch.rs | 17 +- .../network/src/sync/range_sync/chain.rs | 123 ++- .../src/sync/range_sync/chain_collection.rs | 7 + .../network/src/sync/range_sync/mod.rs | 5 +- .../network/src/sync/range_sync/range.rs | 20 +- beacon_node/network/src/sync/tests/lookups.rs | 122 ++- beacon_node/network/src/sync/tests/mod.rs | 9 +- beacon_node/network/src/sync/tests/range.rs | 901 +++++++++++++++--- consensus/types/src/signed_beacon_block.rs | 4 + 23 files changed, 2579 insertions(+), 700 deletions(-) create mode 100644 beacon_node/network/src/sync/network_context/block_components_by_range.rs create mode 100644 beacon_node/network/src/sync/network_context/custody_by_range.rs rename beacon_node/network/src/sync/network_context/{custody.rs => custody_by_root.rs} (70%) diff --git a/beacon_node/beacon_chain/src/block_verification_types.rs b/beacon_node/beacon_chain/src/block_verification_types.rs index 7abaf09e5e0..84011e23ff9 100644 --- a/beacon_node/beacon_chain/src/block_verification_types.rs +++ 
b/beacon_node/beacon_chain/src/block_verification_types.rs @@ -199,7 +199,7 @@ impl RpcBlock { custody_columns: Vec>, expected_custody_indices: Vec, spec: &ChainSpec, - ) -> Result { + ) -> Result { let block_root = block_root.unwrap_or_else(|| get_block_root(&block)); let custody_columns_count = expected_custody_indices.len(); @@ -209,11 +209,7 @@ impl RpcBlock { custody_columns, spec.number_of_columns as usize, ) - .map_err(|e| { - AvailabilityCheckError::Unexpected(format!( - "custody_columns len exceeds number_of_columns: {e:?}" - )) - })?, + .map_err(|e| format!("custody_columns len exceeds number_of_columns: {e:?}"))?, expected_custody_indices, }; Ok(Self { diff --git a/beacon_node/beacon_chain/src/test_utils.rs b/beacon_node/beacon_chain/src/test_utils.rs index 858aaafcf07..8f5a119fb5d 100644 --- a/beacon_node/beacon_chain/src/test_utils.rs +++ b/beacon_node/beacon_chain/src/test_utils.rs @@ -2418,7 +2418,8 @@ where columns, expected_custody_indices, &self.spec, - )? + ) + .map_err(BlockError::InternalError)? 
} else { RpcBlock::new_without_blobs(Some(block_root), block, sampling_column_count) } diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index b36f8cc2154..8300ad4bb89 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -59,6 +59,14 @@ pub struct BlobsByRangeRequestId { pub struct DataColumnsByRangeRequestId { /// Id to identify this attempt at a data_columns_by_range request for `parent_request_id` pub id: Id, + /// The Id of the parent custody by range request that issued this data_columns_by_range request + pub parent_request_id: CustodyByRangeRequestId, +} + +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub struct CustodyByRangeRequestId { + /// Id to identify this attempt at a meta custody by range request for `parent_request_id` + pub id: Id, /// The Id of the overall By Range request for block components. pub parent_request_id: ComponentsByRangeRequestId, } @@ -221,6 +229,7 @@ macro_rules! 
impl_display { impl_display!(BlocksByRangeRequestId, "{}/{}", id, parent_request_id); impl_display!(BlobsByRangeRequestId, "{}/{}", id, parent_request_id); impl_display!(DataColumnsByRangeRequestId, "{}/{}", id, parent_request_id); +impl_display!(CustodyByRangeRequestId, "{}/{}", id, parent_request_id); impl_display!(ComponentsByRangeRequestId, "{}/{}", id, requester); impl_display!(DataColumnsByRootRequestId, "{}/{}", id, requester); impl_display!(SingleLookupReqId, "{}/Lookup/{}", req_id, lookup_id); @@ -299,14 +308,17 @@ mod tests { fn display_id_data_columns_by_range() { let id = DataColumnsByRangeRequestId { id: 123, - parent_request_id: ComponentsByRangeRequestId { + parent_request_id: CustodyByRangeRequestId { id: 122, - requester: RangeRequestId::RangeSync { - chain_id: 54, - batch_id: Epoch::new(0), + parent_request_id: ComponentsByRangeRequestId { + id: 121, + requester: RangeRequestId::RangeSync { + chain_id: 54, + batch_id: Epoch::new(0), + }, }, }, }; - assert_eq!(format!("{id}"), "123/122/RangeSync/0/54"); + assert_eq!(format!("{id}"), "123/122/121/RangeSync/0/54"); } } diff --git a/beacon_node/lighthouse_network/src/types/globals.rs b/beacon_node/lighthouse_network/src/types/globals.rs index fd99d935890..7fa751a9057 100644 --- a/beacon_node/lighthouse_network/src/types/globals.rs +++ b/beacon_node/lighthouse_network/src/types/globals.rs @@ -245,6 +245,25 @@ impl NetworkGlobals { Self::new_test_globals_with_metadata(trusted_peers, metadata, config, spec) } + pub fn new_test_globals_as_supernode( + trusted_peers: Vec, + config: Arc, + spec: Arc, + is_supernode: bool, + ) -> NetworkGlobals { + let metadata = MetaData::V3(MetaDataV3 { + seq_number: 0, + attnets: Default::default(), + syncnets: Default::default(), + custody_group_count: if is_supernode { + spec.number_of_custody_groups + } else { + spec.custody_requirement + }, + }); + Self::new_test_globals_with_metadata(trusted_peers, metadata, config, spec) + } + pub(crate) fn 
new_test_globals_with_metadata( trusted_peers: Vec, metadata: MetaData, diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 7b5701cc8d2..45b9c61641b 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -21,11 +21,11 @@ use beacon_chain::{BeaconChain, BeaconChainTypes}; use itertools::Itertools; use lighthouse_network::service::api_types::Id; use lighthouse_network::types::{BackFillState, NetworkGlobals}; -use lighthouse_network::{PeerAction, PeerId}; +use lighthouse_network::PeerAction; use logging::crit; use std::collections::{ btree_map::{BTreeMap, Entry}, - HashSet, + HashMap, HashSet, }; use std::sync::Arc; use tracing::{debug, error, info, instrument, warn}; @@ -312,7 +312,6 @@ impl BackFillSync { &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, - peer_id: &PeerId, request_id: Id, err: RpcResponseError, ) -> Result<(), BackFillError> { @@ -326,11 +325,18 @@ impl BackFillSync { return Ok(()); } debug!(batch_epoch = %batch_id, error = ?err, "Batch download failed"); - match batch.download_failed(Some(*peer_id)) { + // TODO(das): Is it necessary for the batch to track failed peers? Can we make this + // mechanism compatible with PeerDAS and before PeerDAS? 
+ match batch.download_failed(None) { Err(e) => self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)), - Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { - self.fail_sync(BackFillError::BatchDownloadFailed(batch_id)) - } + Ok(BatchOperationOutcome::Failed { blacklist: _ }) => self.fail_sync(match err { + RpcResponseError::RpcError(_) + | RpcResponseError::VerifyError(_) + | RpcResponseError::InternalError(_) => { + BackFillError::BatchDownloadFailed(batch_id) + } + RpcResponseError::RequestExpired(_) => BackFillError::Paused, + }), Ok(BatchOperationOutcome::Continue) => self.send_batch(network, batch_id), } } else { @@ -929,6 +935,8 @@ impl BackFillSync { RangeRequestId::BackfillSync { batch_id }, &synced_peers, &failed_peers, + // Does not track total requests per peers for now + &HashMap::new(), ) { Ok(request_id) => { // inform the batch about the new request @@ -940,15 +948,9 @@ impl BackFillSync { return Ok(()); } Err(e) => match e { - RpcRequestSendError::NoPeer(no_peer) => { - // If we are here the chain has no more synced peers - info!( - "reason" = format!("insufficient_synced_peers({no_peer:?})"), - "Backfill sync paused" - ); - self.set_state(BackFillState::Paused); - return Err(BackFillError::Paused); - } + // TODO(das): block_components_by_range requests can now hang out indefinitely. + // Is that fine? Maybe we should fail the requests from the network_context + // level without involving the BackfillSync itself. 
RpcRequestSendError::InternalError(e) => { // NOTE: under normal conditions this shouldn't happen but we handle it anyway warn!(%batch_id, error = ?e, %batch,"Could not send batch request"); diff --git a/beacon_node/network/src/sync/block_lookups/mod.rs b/beacon_node/network/src/sync/block_lookups/mod.rs index 8c884f644e1..2c59f710d04 100644 --- a/beacon_node/network/src/sync/block_lookups/mod.rs +++ b/beacon_node/network/src/sync/block_lookups/mod.rs @@ -494,7 +494,7 @@ impl BlockLookups { let Some(lookup) = self.single_block_lookups.get_mut(&id.lookup_id) else { // We don't have the ability to cancel in-flight RPC requests. So this can happen // if we started this RPC request, and later saw the block/blobs via gossip. - debug!(?id, "Block returned for single block lookup not present"); + debug!(%id, "Block returned for single block lookup not present"); return Err(LookupRequestError::UnknownLookup); }; @@ -507,7 +507,7 @@ impl BlockLookups { Ok((response, peer_group, seen_timestamp)) => { debug!( ?block_root, - ?id, + %id, ?peer_group, ?response_type, "Received lookup download success" @@ -540,7 +540,7 @@ impl BlockLookups { // the peer and the request ID which is linked to this `id` value here. 
debug!( ?block_root, - ?id, + %id, ?response_type, error = ?e, "Received lookup download failure" diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 3c94793941c..0cf17c7b899 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -36,7 +36,8 @@ use super::backfill_sync::{BackFillSync, ProcessResult, SyncStart}; use super::block_lookups::BlockLookups; use super::network_context::{ - CustodyByRootResult, RangeBlockComponent, RangeRequestId, RpcEvent, SyncNetworkContext, + CustodyByRangeResult, CustodyByRootResult, RangeBlockComponent, RangeRequestId, RpcEvent, + SyncNetworkContext, }; use super::peer_sampling::{Sampling, SamplingConfig, SamplingResult}; use super::peer_sync_info::{remote_sync_type, PeerSyncType}; @@ -58,9 +59,10 @@ use beacon_chain::{ use futures::StreamExt; use lighthouse_network::rpc::RPCError; use lighthouse_network::service::api_types::{ - BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, CustodyRequester, - DataColumnsByRangeRequestId, DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, - SamplingId, SamplingRequester, SingleLookupReqId, SyncRequestId, + BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, + CustodyByRangeRequestId, CustodyRequester, DataColumnsByRangeRequestId, + DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, SamplingId, SamplingRequester, + SingleLookupReqId, SyncRequestId, }; use lighthouse_network::types::{NetworkGlobals, SyncState}; use lighthouse_network::PeerId; @@ -336,23 +338,6 @@ impl SyncManager { .collect() } - #[cfg(test)] - pub(crate) fn get_range_sync_chains( - &self, - ) -> Result, &'static str> { - self.range_sync.state() - } - - #[cfg(test)] - pub(crate) fn range_sync_state(&self) -> super::range_sync::SyncChainStatus { - self.range_sync.state() - } - - #[cfg(test)] - pub(crate) fn __range_failed_chains(&mut self) -> Vec { - 
self.range_sync.__failed_chains() - } - #[cfg(test)] pub(crate) fn get_failed_chains(&mut self) -> Vec { self.block_lookups.get_failed_chains() @@ -377,6 +362,18 @@ impl SyncManager { self.sampling.get_request_status(block_root, index) } + // Leak the full network context to prevent having to add many cfg(test) methods here + #[cfg(test)] + pub(crate) fn network(&mut self) -> &mut SyncNetworkContext { + &mut self.network + } + + // Leak the full range_sync to prevent having to add many cfg(test) methods here + #[cfg(test)] + pub(crate) fn range_sync(&mut self) -> &mut RangeSync { + &mut self.range_sync + } + #[cfg(test)] pub(crate) fn update_execution_engine_state(&mut self, state: EngineState) { self.handle_new_execution_engine_state(state); @@ -442,6 +439,9 @@ impl SyncManager { for (id, result) in self.network.continue_custody_by_root_requests() { self.on_custody_by_root_result(id, result); } + for (id, result) in self.network.continue_custody_by_range_requests() { + self.on_custody_by_range_result(id, result); + } } /// Trigger range sync for a set of peers that claim to have imported a head unknown to us. @@ -545,6 +545,9 @@ impl SyncManager { for (id, result) in self.network.continue_custody_by_root_requests() { self.on_custody_by_root_result(id, result); } + for (id, result) in self.network.continue_custody_by_range_requests() { + self.on_custody_by_range_result(id, result); + } } /// Updates the syncing state of a peer. 
@@ -1186,10 +1189,9 @@ impl SyncManager { block: RpcEvent>>, ) { if let Some(resp) = self.network.on_blocks_by_range_response(id, peer_id, block) { - self.on_range_components_response( + self.on_block_components_by_range_response( id.parent_request_id, - peer_id, - RangeBlockComponent::Block(id, resp), + RangeBlockComponent::Block(id, resp, peer_id), ); } } @@ -1201,10 +1203,9 @@ impl SyncManager { blob: RpcEvent>>, ) { if let Some(resp) = self.network.on_blobs_by_range_response(id, peer_id, blob) { - self.on_range_components_response( + self.on_block_components_by_range_response( id.parent_request_id, - peer_id, - RangeBlockComponent::Blob(id, resp), + RangeBlockComponent::Blob(id, resp, peer_id), ); } } @@ -1215,18 +1216,46 @@ impl SyncManager { peer_id: PeerId, data_column: RpcEvent>>, ) { + // data_columns_by_range returns either an Ok list of data columns, or an RpcResponseError if let Some(resp) = self .network .on_data_columns_by_range_response(id, peer_id, data_column) { - self.on_range_components_response( - id.parent_request_id, - peer_id, - RangeBlockComponent::CustodyColumns(id, resp), - ); + // custody_by_range accumulates the results of multiple data_columns_by_range requests + // returning a bigger list of data columns across all the column indices this node has + // to custody + if let Some(result) = + self.network + .on_custody_by_range_response(id.parent_request_id, id, peer_id, resp) + { + self.on_custody_by_range_result(id.parent_request_id, result); + } } } + fn on_custody_by_range_result( + &mut self, + id: CustodyByRangeRequestId, + result: CustodyByRangeResult, + ) { + // TODO(das): Improve the type of RangeBlockComponent::CustodyColumns, to + // not have to pass a PeerGroup in case of error + let peers = match &result { + Ok((_, peers, _)) => peers.clone(), + // TODO(das): this PeerGroup with no peers is incorrect + Err(_) => PeerGroup::from_set(<_>::default()), + }; + + self.on_block_components_by_range_response( + id.parent_request_id, + 
RangeBlockComponent::CustodyColumns( + id, + result.map(|(data, _peers, timestamp)| (data, timestamp)), + peers, + ), + ); + } + fn on_custody_by_root_result( &mut self, requester: CustodyRequester, @@ -1267,17 +1296,15 @@ impl SyncManager { /// Handles receiving a response for a range sync request that should have both blocks and /// blobs. - fn on_range_components_response( + fn on_block_components_by_range_response( &mut self, range_request_id: ComponentsByRangeRequestId, - peer_id: PeerId, range_block_component: RangeBlockComponent, ) { - if let Some(resp) = self.network.range_block_component_response( - range_request_id, - peer_id, - range_block_component, - ) { + if let Some(resp) = self + .network + .on_block_components_by_range_response(range_request_id, range_block_component) + { match resp { Ok((blocks, batch_peers)) => { match range_request_id.requester { @@ -1315,7 +1342,6 @@ impl SyncManager { RangeRequestId::RangeSync { chain_id, batch_id } => { self.range_sync.inject_error( &mut self.network, - peer_id, batch_id, chain_id, range_request_id.id, @@ -1327,7 +1353,6 @@ impl SyncManager { match self.backfill_sync.inject_error( &mut self.network, batch_id, - &peer_id, range_request_id.id, e, ) { diff --git a/beacon_node/network/src/sync/mod.rs b/beacon_node/network/src/sync/mod.rs index 0f5fd6fb9f1..97302df04e8 100644 --- a/beacon_node/network/src/sync/mod.rs +++ b/beacon_node/network/src/sync/mod.rs @@ -3,7 +3,6 @@ //! Stores the various syncing methods for the beacon chain. mod backfill_sync; mod block_lookups; -mod block_sidecar_coupling; pub mod manager; mod network_context; mod peer_sampling; diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 50b39fe72ef..d7ad9d3eb7a 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -1,11 +1,11 @@ //! Provides network functionality for the Syncing thread. 
This fundamentally wraps a network //! channel and stores a global RPC ID to perform requests. -use self::custody::{ActiveCustodyRequest, Error as CustodyRequestError}; +use self::custody_by_range::{ActiveCustodyByRangeRequest, CustodyByRangeRequestResult}; +use self::custody_by_root::{ActiveCustodyByRootRequest, CustodyByRootRequestResult}; pub use self::requests::{BlocksByRootSingleRequest, DataColumnsByRootSingleBlockRequest}; -use super::block_sidecar_coupling::RangeBlockComponentsRequest; use super::manager::BlockProcessType; -use super::range_sync::{BatchPeers, ByRangeRequestType}; +use super::range_sync::BatchPeers; use super::SyncMessage; use crate::metrics; use crate::network_beacon_processor::NetworkBeaconProcessor; @@ -17,15 +17,17 @@ use crate::sync::block_lookups::SingleLookupId; use crate::sync::network_context::requests::BlobsByRootSingleBlockRequest; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessStatus, EngineState}; -use custody::CustodyRequestResult; +pub use block_components_by_range::BlockComponentsByRangeRequest; +#[cfg(test)] +pub use block_components_by_range::BlockComponentsByRangeRequestStep; use fnv::FnvHashMap; use lighthouse_network::rpc::methods::{BlobsByRangeRequest, DataColumnsByRangeRequest}; use lighthouse_network::rpc::{BlocksByRangeRequest, GoodbyeReason, RPCError, RequestType}; pub use lighthouse_network::service::api_types::RangeRequestId; use lighthouse_network::service::api_types::{ AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, - CustodyId, CustodyRequester, DataColumnsByRangeRequestId, DataColumnsByRootRequestId, - DataColumnsByRootRequester, Id, SingleLookupReqId, SyncRequestId, + CustodyByRangeRequestId, CustodyId, CustodyRequester, DataColumnsByRangeRequestId, + DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, SingleLookupReqId, SyncRequestId, }; use lighthouse_network::{Client, NetworkGlobals, 
PeerAction, PeerId, ReportSource}; use parking_lot::RwLock; @@ -36,7 +38,6 @@ use requests::{ }; #[cfg(test)] use slot_clock::SlotClock; -use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::fmt::Debug; use std::sync::Arc; @@ -47,11 +48,13 @@ use tokio::sync::mpsc; use tracing::{debug, error, span, warn, Level}; use types::blob_sidecar::FixedBlobSidecarList; use types::{ - BlobSidecar, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, ForkContext, - Hash256, SignedBeaconBlock, Slot, + BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, + ForkContext, Hash256, SignedBeaconBlock, SignedBeaconBlockHeader, Slot, }; -pub mod custody; +pub mod block_components_by_range; +pub mod custody_by_range; +pub mod custody_by_root; mod requests; #[derive(Debug)] @@ -72,32 +75,29 @@ impl RpcEvent { pub type RpcResponseResult = Result<(T, Duration), RpcResponseError>; +pub type RpcResponseBatchResult = Result<(T, PeerGroup, Duration), RpcResponseError>; + /// Duration = latest seen timestamp of all received data columns -pub type CustodyByRootResult = - Result<(DataColumnSidecarList, PeerGroup, Duration), RpcResponseError>; +pub type CustodyByRootResult = RpcResponseBatchResult>; -#[derive(Debug)] +pub type CustodyByRangeResult = RpcResponseBatchResult>; + +#[derive(Debug, Clone)] pub enum RpcResponseError { RpcError(#[allow(dead_code)] RPCError), VerifyError(LookupVerifyError), - CustodyRequestError(#[allow(dead_code)] CustodyRequestError), - BlockComponentCouplingError(#[allow(dead_code)] String), + RequestExpired(String), + InternalError(#[allow(dead_code)] String), } #[derive(Debug, PartialEq, Eq)] pub enum RpcRequestSendError { - /// No peer available matching the required criteria - NoPeer(NoPeerError), /// These errors should never happen, including unreachable custody errors or network send /// errors. 
InternalError(String), -} - -/// Type of peer missing that caused a `RpcRequestSendError::NoPeers` -#[derive(Debug, PartialEq, Eq)] -pub enum NoPeerError { - BlockPeer, - CustodyPeer(ColumnIndex), + // If RpcRequestSendError has a single variant `InternalError` it's to signal to downstream + // consumers that sends are expected to be infallible. If this assumption changes in the future, + // add a new variant. } #[derive(Debug, PartialEq, Eq)] @@ -150,6 +150,17 @@ impl PeerGroup { } }) } + + pub fn as_reversed_map(&self) -> HashMap { + // TODO(das): should we change PeerGroup to hold this map? + let mut index_to_peer = HashMap::::new(); + for (peer, indices) in self.peers.iter() { + for &index in indices { + index_to_peer.insert(index as u64, *peer); + } + } + index_to_peer + } } /// Sequential ID that uniquely identifies ReqResp outgoing requests @@ -195,12 +206,15 @@ pub struct SyncNetworkContext { data_columns_by_range_requests: ActiveRequests>, - /// Mapping of active custody column requests for a block root - custody_by_root_requests: FnvHashMap>, + /// Mapping of active custody column by root requests for a block root + custody_by_root_requests: FnvHashMap>, + + /// Mapping of active custody column by range requests + custody_by_range_requests: FnvHashMap>, /// BlocksByRange requests paired with other ByRange requests for data components - components_by_range_requests: - FnvHashMap>, + block_components_by_range_requests: + FnvHashMap>, /// Whether the ee is online. If it's not, we don't allow access to the /// `beacon_processor_send`. 
@@ -219,14 +233,17 @@ pub enum RangeBlockComponent { Block( BlocksByRangeRequestId, RpcResponseResult>>>, + PeerId, ), Blob( BlobsByRangeRequestId, RpcResponseResult>>>, + PeerId, ), CustodyColumns( - DataColumnsByRangeRequestId, + CustodyByRangeRequestId, RpcResponseResult>>>, + PeerGroup, ), } @@ -283,7 +300,8 @@ impl SyncNetworkContext { blobs_by_range_requests: ActiveRequests::new("blobs_by_range"), data_columns_by_range_requests: ActiveRequests::new("data_columns_by_range"), custody_by_root_requests: <_>::default(), - components_by_range_requests: FnvHashMap::default(), + custody_by_range_requests: <_>::default(), + block_components_by_range_requests: <_>::default(), network_beacon_processor, chain, fork_context, @@ -297,6 +315,14 @@ impl SyncNetworkContext { /// Returns the ids of all the requests made to the given peer_id. pub fn peer_disconnected(&mut self, peer_id: &PeerId) -> Vec { + self.active_requests() + .filter(|(_, request_peer)| *request_peer == peer_id) + .map(|(id, _)| id) + .collect() + } + + /// Returns the ids of all active requests + pub fn active_requests(&mut self) -> impl Iterator { // Note: using destructuring pattern without a default case to make sure we don't forget to // add new request types to this function. Otherwise, lookup sync can break and lookups // will get stuck if a peer disconnects during an active requests. 
@@ -311,8 +337,9 @@ impl SyncNetworkContext { data_columns_by_range_requests, // custody_by_root_requests is a meta request of data_columns_by_root_requests custody_by_root_requests: _, + custody_by_range_requests: _, // components_by_range_requests is a meta request of various _by_range requests - components_by_range_requests: _, + block_components_by_range_requests: _, execution_engine_state: _, network_beacon_processor: _, chain: _, @@ -320,29 +347,23 @@ impl SyncNetworkContext { } = self; let blocks_by_root_ids = blocks_by_root_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|id| SyncRequestId::SingleBlock { id: *id }); + .active_requests() + .map(|(id, peer)| (SyncRequestId::SingleBlock { id: *id }, peer)); let blobs_by_root_ids = blobs_by_root_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|id| SyncRequestId::SingleBlob { id: *id }); + .active_requests() + .map(|(id, peer)| (SyncRequestId::SingleBlob { id: *id }, peer)); let data_column_by_root_ids = data_columns_by_root_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::DataColumnsByRoot(*req_id)); + .active_requests() + .map(|(id, peer)| (SyncRequestId::DataColumnsByRoot(*id), peer)); let blocks_by_range_ids = blocks_by_range_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::BlocksByRange(*req_id)); + .active_requests() + .map(|(id, peer)| (SyncRequestId::BlocksByRange(*id), peer)); let blobs_by_range_ids = blobs_by_range_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::BlobsByRange(*req_id)); + .active_requests() + .map(|(id, peer)| (SyncRequestId::BlobsByRange(*id), peer)); let data_column_by_range_ids = data_columns_by_range_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::DataColumnsByRange(*req_id)); + .active_requests() + .map(|(id, peer)| (SyncRequestId::DataColumnsByRange(*id), peer)); 
blocks_by_root_ids .chain(blobs_by_root_ids) @@ -350,6 +371,18 @@ impl SyncNetworkContext { .chain(blocks_by_range_ids) .chain(blobs_by_range_ids) .chain(data_column_by_range_ids) + } + + #[cfg(test)] + pub fn active_block_components_by_range_requests( + &self, + ) -> Vec<( + ComponentsByRangeRequestId, + BlockComponentsByRangeRequestStep, + )> { + self.block_components_by_range_requests + .iter() + .map(|(id, req)| (*id, req.state_step())) .collect() } @@ -362,6 +395,10 @@ impl SyncNetworkContext { &self.network_beacon_processor.network_globals } + pub fn spec(&self) -> &ChainSpec { + &self.chain.spec + } + /// Returns the Client type of the peer if known pub fn client_type(&self, peer_id: &PeerId) -> Client { self.network_globals() @@ -414,8 +451,9 @@ impl SyncNetworkContext { data_columns_by_range_requests, // custody_by_root_requests is a meta request of data_columns_by_root_requests custody_by_root_requests: _, + custody_by_range_requests: _, // components_by_range_requests is a meta request of various _by_range requests - components_by_range_requests: _, + block_components_by_range_requests: _, execution_engine_state: _, network_beacon_processor: _, chain: _, @@ -447,205 +485,95 @@ impl SyncNetworkContext { requester: RangeRequestId, peers: &HashSet, peers_to_deprioritize: &HashSet, + total_requests_per_peer: &HashMap, ) -> Result { - let batch_epoch = Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()); - let batch_type = self.batch_type(batch_epoch); - - let active_request_count_by_peer = self.active_request_count_by_peer(); - - let Some(block_peer) = peers - .iter() - .map(|peer| { - ( - // If contains -> 1 (order after), not contains -> 0 (order first) - peers_to_deprioritize.contains(peer), - // Prefer peers with less overall requests - active_request_count_by_peer.get(peer).copied().unwrap_or(0), - // Random factor to break ties, otherwise the PeerID breaks ties - rand::random::(), - peer, - ) - }) - .min() - .map(|(_, _, _, peer)| 
*peer) - else { - // Backfill and forward sync handle this condition gracefully. - // - Backfill sync: will pause waiting for more peers to join - // - Forward sync: can never happen as the chain is dropped when removing the last peer. - return Err(RpcRequestSendError::NoPeer(NoPeerError::BlockPeer)); - }; - - // Attempt to find all required custody peers before sending any request or creating an ID - let columns_by_range_peers_to_request = - if matches!(batch_type, ByRangeRequestType::BlocksAndColumns) { - let column_indexes = self.network_globals().sampling_columns.clone(); - Some(self.select_columns_by_range_peers_to_request( - &column_indexes, - peers, - active_request_count_by_peer, - peers_to_deprioritize, - )?) - } else { - None - }; - - // Create the overall components_by_range request ID before its individual components let id = ComponentsByRangeRequestId { id: self.next_id(), requester, }; - let blocks_req_id = self.send_blocks_by_range_request(block_peer, request.clone(), id)?; - - let blobs_req_id = if matches!(batch_type, ByRangeRequestType::BlocksAndBlobs) { - Some(self.send_blobs_by_range_request( - block_peer, - BlobsByRangeRequest { - start_slot: *request.start_slot(), - count: *request.count(), - }, - id, - )?) 
- } else { - None - }; - - let data_column_requests = columns_by_range_peers_to_request - .map(|columns_by_range_peers_to_request| { - let column_to_peer_map = columns_by_range_peers_to_request - .iter() - .flat_map(|(peer_id, columns)| columns.iter().map(|column| (*column, *peer_id))) - .collect::>(); - - let requests = columns_by_range_peers_to_request - .into_iter() - .map(|(peer_id, columns)| { - self.send_data_columns_by_range_request( - peer_id, - DataColumnsByRangeRequest { - start_slot: *request.start_slot(), - count: *request.count(), - columns, - }, - id, - ) - }) - .collect::, _>>()?; - - Ok((requests, column_to_peer_map)) - }) - .transpose()?; + let req = BlockComponentsByRangeRequest::new( + id, + request, + peers, + peers_to_deprioritize, + total_requests_per_peer, + self, + )?; - let info = - RangeBlockComponentsRequest::new(blocks_req_id, blobs_req_id, data_column_requests); - self.components_by_range_requests.insert(id, info); + self.block_components_by_range_requests.insert(id, req); + // TODO: use ID Ok(id.id) } - fn select_columns_by_range_peers_to_request( - &self, - custody_indexes: &HashSet, - peers: &HashSet, - active_request_count_by_peer: HashMap, - peers_to_deprioritize: &HashSet, - ) -> Result>, RpcRequestSendError> { - let mut columns_to_request_by_peer = HashMap::>::new(); - - for column_index in custody_indexes { - // Strictly consider peers that are custodials of this column AND are part of this - // syncing chain. If the forward range sync chain has few peers, it's likely that this - // function will not be able to find peers on our custody columns. 
- let Some(custody_peer) = peers - .iter() - .filter(|peer| { - self.network_globals() - .is_custody_peer_of(*column_index, peer) - }) - .map(|peer| { - ( - // If contains -> 1 (order after), not contains -> 0 (order first) - peers_to_deprioritize.contains(peer), - // Prefer peers with less overall requests - // Also account for requests that are not yet issued tracked in peer_id_to_request_map - // We batch requests to the same peer, so count existance in the - // `columns_to_request_by_peer` as a single 1 request. - active_request_count_by_peer.get(peer).copied().unwrap_or(0) - + columns_to_request_by_peer.get(peer).map(|_| 1).unwrap_or(0), - // Random factor to break ties, otherwise the PeerID breaks ties - rand::random::(), - peer, - ) - }) - .min() - .map(|(_, _, _, peer)| *peer) - else { - // TODO(das): this will be pretty bad UX. To improve we should: - // - Handle the no peers case gracefully, maybe add some timeout and give a few - // minutes / seconds to the peer manager to locate peers on this subnet before - // abandoing progress on the chain completely. - return Err(RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer( - *column_index, - ))); - }; - - columns_to_request_by_peer - .entry(custody_peer) - .or_default() - .push(*column_index); - } - - Ok(columns_to_request_by_peer) - } - - /// Received a _by_range response for a request that couples blocks and its data - /// - /// `peer_id` is the peer that served this individual RPC _by_range response. + /// Received a blocks by range or blobs by range response for a request that couples blocks + /// and blobs. 
#[allow(clippy::type_complexity)] - pub fn range_block_component_response( + pub fn on_block_components_by_range_response( &mut self, id: ComponentsByRangeRequestId, - peer_id: PeerId, range_block_component: RangeBlockComponent, ) -> Option>, BatchPeers), RpcResponseError>> { - let Entry::Occupied(mut entry) = self.components_by_range_requests.entry(id) else { - metrics::inc_counter_vec(&metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, &["range_blocks"]); + // Note: need to remove the request to borrow self again below. Otherwise we can't + // do nested requests + let Some(mut request) = self.block_components_by_range_requests.remove(&id) else { + metrics::inc_counter_vec( + &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, + &["block_components_by_range"], + ); return None; }; - if let Err(e) = { - let request = entry.get_mut(); - match range_block_component { - RangeBlockComponent::Block(req_id, resp) => resp.and_then(|(blocks, _)| { - request - .add_blocks(req_id, blocks, peer_id) - .map_err(RpcResponseError::BlockComponentCouplingError) - }), - RangeBlockComponent::Blob(req_id, resp) => resp.and_then(|(blobs, _)| { + let result = match range_block_component { + RangeBlockComponent::Block(req_id, resp, peer_id) => resp.and_then(|(blocks, _)| { + request + .on_blocks_by_range_result(req_id, blocks, peer_id, self) + .map_err(Into::::into) + }), + RangeBlockComponent::Blob(req_id, resp, peer_id) => resp.and_then(|(blobs, _)| { + request + .on_blobs_by_range_result(req_id, blobs, peer_id, self) + .map_err(Into::::into) + }), + RangeBlockComponent::CustodyColumns(req_id, resp, peers) => { + resp.and_then(|(custody_columns, _)| { request - .add_blobs(req_id, blobs, peer_id) - .map_err(RpcResponseError::BlockComponentCouplingError) - }), - RangeBlockComponent::CustodyColumns(req_id, resp) => { - resp.and_then(|(custody_columns, _)| { - request - .add_custody_columns(req_id, custody_columns, peer_id) - .map_err(RpcResponseError::BlockComponentCouplingError) - }) - } + 
.on_custody_by_range_result(req_id, custody_columns, peers, self) + .map_err(Into::::into) + }) } - } { - entry.remove(); - return Some(Err(e)); - } + }; + + let result = result.transpose(); - if let Some(blocks_result) = entry.get().responses(&self.chain.spec) { - entry.remove(); - // If the request is finished, dequeue everything - Some(blocks_result.map_err(RpcResponseError::BlockComponentCouplingError)) - } else { - None + // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to + // an Option first to use in an `if let Some() { act on result }` block. + match result.as_ref() { + Some(Ok((blocks, peer_group))) => { + let blocks_with_data = blocks + .iter() + .filter(|block| block.as_block().has_data()) + .count(); + // Don't log the peer_group here, it's very long (could be up to 128 peers). If you + // want to trace which peer sent the column at index X, search for the log: + // `Sync RPC request sent method="DataColumnsByRange" ...` + debug!( + %id, + blocks = blocks.len(), + blocks_with_data, + block_peer = ?peer_group.block(), + "Block components by range request success, removing" + ) + } + Some(Err(e)) => { + debug!(%id, error = ?e, "Block components by range request failure, removing" ) + } + None => { + self.block_components_by_range_requests.insert(id, request); + } } + result } /// Request block of `block_root` if necessary by checking: @@ -853,7 +781,7 @@ impl SyncNetworkContext { } /// Request to send a single `data_columns_by_root` request to the network. 
- pub fn data_column_lookup_request( + pub fn data_columns_by_root_request( &mut self, requester: DataColumnsByRootRequester, peer_id: PeerId, @@ -951,7 +879,7 @@ impl SyncNetworkContext { ); let requester = CustodyRequester(id); - let mut request = ActiveCustodyRequest::new( + let mut request = ActiveCustodyByRootRequest::new( block_root, CustodyId { requester }, &custody_indexes_to_fetch, @@ -967,25 +895,7 @@ impl SyncNetworkContext { self.custody_by_root_requests.insert(requester, request); Ok(LookupRequestResult::RequestSent(id.req_id)) } - Err(e) => Err(match e { - CustodyRequestError::NoPeer(column_index) => { - RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer(column_index)) - } - // - TooManyFailures: Should never happen, `request` has just been created, it's - // count of download_failures is 0 here - // - BadState: Should never happen, a bad state can only happen when handling a - // network response - // - UnexpectedRequestId: Never happens: this Err is only constructed handling a - // download or processing response - // - SendFailed: Should never happen unless in a bad drop sequence when shutting - // down the node - e @ (CustodyRequestError::TooManyFailures - | CustodyRequestError::BadState { .. } - | CustodyRequestError::UnexpectedRequestId { .. } - | CustodyRequestError::SendFailed { .. 
}) => { - RpcRequestSendError::InternalError(format!("{e:?}")) - } - }), + Err(e) => Err(e.into()), } } @@ -1073,8 +983,8 @@ impl SyncNetworkContext { &mut self, peer_id: PeerId, request: DataColumnsByRangeRequest, - parent_request_id: ComponentsByRangeRequestId, - ) -> Result { + parent_request_id: CustodyByRangeRequestId, + ) -> Result { let id = DataColumnsByRangeRequestId { id: self.next_id(), parent_request_id, @@ -1085,7 +995,7 @@ impl SyncNetworkContext { request: RequestType::DataColumnsByRange(request.clone()), app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), }) - .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; + .map_err(|_| "network send error")?; debug!( method = "DataColumnsByRange", @@ -1108,6 +1018,50 @@ impl SyncNetworkContext { Ok(id) } + /// Request to fetch all needed custody columns of a range of slot. This function may not send + /// any request to the network if no columns have to be fetched based on the import state of the + /// node. A custody request is a "super request" that may trigger 0 or more `data_columns_by_range` + /// requests. + pub fn send_custody_by_range_request( + &mut self, + parent_id: ComponentsByRangeRequestId, + blocks_with_data: Vec, + epoch: Epoch, + column_indices: Vec, + lookup_peers: Arc>>, + ) -> Result { + let id = CustodyByRangeRequestId { + id: self.next_id(), + parent_request_id: parent_id, + }; + + debug!( + indices = ?column_indices, + %id, + "Starting custody columns by range request" + ); + + let mut request = ActiveCustodyByRangeRequest::new( + id, + epoch, + blocks_with_data, + &column_indices, + lookup_peers, + ); + + // Note that you can only send, but not handle a response here + match request.continue_requests(self) { + Ok(_) => { + // Ignoring the result of `continue_requests` is okay. A request that has just been + // created cannot return data immediately, it must send some request to the network + // first. 
And there must exist some request, `custody_indexes_to_fetch` is not empty. + self.custody_by_range_requests.insert(id, request); + Ok(id) + } + Err(e) => Err(e.into()), + } + } + pub fn is_execution_engine_online(&self) -> bool { self.execution_engine_state == EngineState::Online } @@ -1212,34 +1166,6 @@ impl SyncNetworkContext { id } - /// Check whether a batch for this epoch (and only this epoch) should request just blocks or - /// blocks and blobs. - fn batch_type(&self, epoch: types::Epoch) -> ByRangeRequestType { - // Induces a compile time panic if this doesn't hold true. - #[allow(clippy::assertions_on_constants)] - const _: () = assert!( - super::backfill_sync::BACKFILL_EPOCHS_PER_BATCH == 1 - && super::range_sync::EPOCHS_PER_BATCH == 1, - "To deal with alignment with deneb boundaries, batches need to be of just one epoch" - ); - - if self - .chain - .data_availability_checker - .data_columns_required_for_epoch(epoch) - { - ByRangeRequestType::BlocksAndColumns - } else if self - .chain - .data_availability_checker - .blobs_required_for_epoch(epoch) - { - ByRangeRequestType::BlocksAndBlobs - } else { - ByRangeRequestType::Blocks - } - } - /// Attempt to make progress on all custody_by_root requests. Some request may be stale waiting /// for custody peers. Returns a Vec of results as zero or more requests may fail in this /// attempt. @@ -1266,6 +1192,32 @@ impl SyncNetworkContext { .collect() } + /// Attempt to make progress on all custody_by_range requests. Some request may be stale waiting + /// for custody peers. Returns a Vec of results as zero or more requests may fail in this + /// attempt. + pub fn continue_custody_by_range_requests( + &mut self, + ) -> Vec<(CustodyByRangeRequestId, CustodyByRangeResult)> { + let ids = self + .custody_by_range_requests + .keys() + .copied() + .collect::>(); + + // Need to collect ids and results in separate steps to re-borrow self. 
+ ids.into_iter() + .filter_map(|id| { + let mut request = self + .custody_by_range_requests + .remove(&id) + .expect("key of hashmap"); + let result = request.continue_requests(self); + self.handle_custody_by_range_result(id, request, result) + .map(|result| (id, result)) + }) + .collect() + } + // Request handlers pub(crate) fn on_single_block_response( @@ -1425,8 +1377,10 @@ impl SyncNetworkContext { // Note: need to remove the request to borrow self again below. Otherwise we can't // do nested requests let Some(mut request) = self.custody_by_root_requests.remove(&id.requester) else { - // TOOD(das): This log can happen if the request is error'ed early and dropped - debug!(?id, "Custody column downloaded event for unknown request"); + metrics::inc_counter_vec( + &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, + &["custody_by_root"], + ); return None; }; @@ -1438,8 +1392,8 @@ impl SyncNetworkContext { fn handle_custody_by_root_result( &mut self, id: CustodyRequester, - request: ActiveCustodyRequest, - result: CustodyRequestResult, + request: ActiveCustodyByRootRequest, + result: CustodyByRootRequestResult, ) -> Option> { let span = span!( Level::INFO, @@ -1448,18 +1402,16 @@ impl SyncNetworkContext { ); let _enter = span.enter(); - let result = result - .map_err(RpcResponseError::CustodyRequestError) - .transpose(); + let result = result.map_err(Into::::into).transpose(); // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to // an Option first to use in an `if let Some() { act on result }` block. 
match result.as_ref() { Some(Ok((columns, peer_group, _))) => { - debug!(?id, count = columns.len(), peers = ?peer_group, "Custody request success, removing") + debug!(%id, count = columns.len(), peers = ?peer_group, "Custody by root request success, removing") } Some(Err(e)) => { - debug!(?id, error = ?e, "Custody request failure, removing" ) + debug!(%id, error = ?e, "Custody by root request failure, removing" ) } None => { self.custody_by_root_requests.insert(id, request); @@ -1468,6 +1420,61 @@ impl SyncNetworkContext { result } + /// Insert a downloaded column into an active custody request. Then make progress on the + /// entire request. + /// + /// ### Returns + /// + /// - `Some`: Request completed, won't make more progress. Expect requester to act on the result. + /// - `None`: Request still active, requester should do no action + #[allow(clippy::type_complexity)] + pub fn on_custody_by_range_response( + &mut self, + id: CustodyByRangeRequestId, + req_id: DataColumnsByRangeRequestId, + peer_id: PeerId, + resp: RpcResponseResult>>>, + ) -> Option> { + // Note: need to remove the request to borrow self again below. Otherwise we can't + // do nested requests + let Some(mut request) = self.custody_by_range_requests.remove(&id) else { + // TOOD(das): This log can happen if the request is error'ed early and dropped + debug!(%id, "Custody by range downloaded event for unknown request"); + return None; + }; + + let result = request.on_data_column_downloaded(peer_id, req_id, resp, self); + + self.handle_custody_by_range_result(id, request, result) + } + + fn handle_custody_by_range_result( + &mut self, + id: CustodyByRangeRequestId, + request: ActiveCustodyByRangeRequest, + result: CustodyByRangeRequestResult, + ) -> Option> { + let result = result.map_err(Into::::into).transpose(); + + // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to + // an Option first to use in an `if let Some() { act on result }` block. 
+ match result.as_ref() { + Some(Ok((columns, _peer_group, _))) => { + // Don't log the peer_group here, it's very long (could be up to 128 peers). If you + // want to trace which peer sent the column at index X, search for the log: + // `Sync RPC request sent method="DataColumnsByRange" ...` + debug!(%id, count = columns.len(), "Custody by range request success, removing") + } + Some(Err(e)) => { + debug!(%id, error = ?e, "Custody by range request failure, removing" ) + } + None => { + self.custody_by_range_requests.insert(id, request); + } + } + result + } + pub fn send_block_for_processing( &self, id: Id, @@ -1529,7 +1536,7 @@ impl SyncNetworkContext { .beacon_processor_if_enabled() .ok_or(SendErrorProcessor::ProcessorNotAvailable)?; - debug!(?block_root, ?id, "Sending blobs for processing"); + debug!(?block_root, %id, "Sending blobs for processing"); // Lookup sync event safety: If `beacon_processor.send_rpc_blobs` returns Ok() sync // must receive a single `SyncMessage::BlockComponentProcessed` event with this process type beacon_processor @@ -1600,8 +1607,8 @@ impl SyncNetworkContext { ), ("custody_by_root", self.custody_by_root_requests.len()), ( - "components_by_range", - self.components_by_range_requests.len(), + "block_components_by_range", + self.block_components_by_range_requests.len(), ), ] { metrics::set_gauge_vec(&metrics::SYNC_ACTIVE_NETWORK_REQUESTS, &[id], count as i64); diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs new file mode 100644 index 00000000000..4545806a05e --- /dev/null +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -0,0 +1,550 @@ +use crate::sync::network_context::{ + PeerGroup, RpcRequestSendError, RpcResponseError, SyncNetworkContext, +}; +use crate::sync::range_sync::BatchPeers; +use beacon_chain::block_verification_types::RpcBlock; +use 
beacon_chain::data_column_verification::CustodyDataColumn; +use beacon_chain::{get_block_root, BeaconChainTypes}; +use lighthouse_network::rpc::methods::{BlobsByRangeRequest, BlocksByRangeRequest}; +use lighthouse_network::service::api_types::{ + BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, + CustodyByRangeRequestId, +}; +use lighthouse_network::PeerId; +use parking_lot::RwLock; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use types::{ + BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, EthSpec, Hash256, RuntimeVariableList, + SignedBeaconBlock, Slot, +}; + +pub struct BlockComponentsByRangeRequest { + id: ComponentsByRangeRequestId, + peers: Arc>>, + request: BlocksByRangeRequest, + state: State, +} + +enum State { + Base { + blocks_by_range_request: + ByRangeRequest>>>, + }, + // Two single concurrent requests for block + blobs + DenebEnabled { + blocks_by_range_request: + ByRangeRequest>>>, + blobs_by_range_request: ByRangeRequest>>>, + }, + // Request blocks first, then columns + FuluEnabled(FuluEnabledState), +} + +enum FuluEnabledState { + BlockRequest { + blocks_by_range_request: + ByRangeRequest>>>, + }, + CustodyRequest { + blocks: Vec>>, + block_peer: PeerId, + custody_by_range_request: + ByRangeRequest>>, PeerGroup>, + }, +} + +enum ByRangeRequest { + /// Active(RequestIndex) + Active(I), + /// Complete(DownloadedData, Peers) + Complete(T, P), +} + +pub type BlockComponentsByRangeRequestResult = + Result>, BatchPeers)>, Error>; + +pub enum Error { + InternalError(String), +} + +impl From for RpcResponseError { + fn from(e: Error) -> Self { + match e { + Error::InternalError(e) => RpcResponseError::InternalError(e), + } + } +} + +impl From for RpcRequestSendError { + fn from(e: Error) -> Self { + match e { + Error::InternalError(e) => RpcRequestSendError::InternalError(e), + } + } +} + +/// FOR TESTING ONLY +#[cfg(test)] +#[derive(Debug)] +pub enum BlockComponentsByRangeRequestStep { + BlocksRequest, 
+ CustodyRequest, +} + +impl BlockComponentsByRangeRequest { + pub fn new( + id: ComponentsByRangeRequestId, + request: BlocksByRangeRequest, + peers: &HashSet, + peers_to_deprioritize: &HashSet, + total_requests_per_peer: &HashMap, + cx: &mut SyncNetworkContext, + ) -> Result { + // Induces a compile time panic if this doesn't hold true. + #[allow(clippy::assertions_on_constants)] + const _: () = assert!( + super::super::backfill_sync::BACKFILL_EPOCHS_PER_BATCH == 1 + && super::super::range_sync::EPOCHS_PER_BATCH == 1, + "To deal with alignment with deneb boundaries, batches need to be of just one epoch" + ); + // The assertion above ensures each batch is in one single epoch + let batch_epoch = Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()); + let batch_fork = cx.spec().fork_name_at_epoch(batch_epoch); + + // TODO(das): a change of behaviour here is that if the SyncingChain has a single peer we + // will request all blocks for the first 5 epochs to that same single peer. Before we would + // query only idle peers in the syncing chain. + let Some(block_peer) = peers + .iter() + .map(|peer| { + ( + // If contains -> 1 (order after), not contains -> 0 (order first) + peers_to_deprioritize.contains(peer), + // TODO(das): Should we use active_request_count_by_peer? + // Prefer peers with less overall requests + // active_request_count_by_peer.get(peer).copied().unwrap_or(0), + // Prefer peers with less total cummulative requests, so we fetch data from a + // diverse set of peers + total_requests_per_peer.get(peer).copied().unwrap_or(0), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::random::(), + peer, + ) + }) + .min() + .map(|(_, _, _, peer)| *peer) + else { + // When a peer disconnects and is removed from the SyncingChain peer set, if the set + // reaches zero the SyncingChain is removed. + // TODO(das): add test for this. 
+ return Err(RpcRequestSendError::InternalError( + "A batch peer set should never be empty".to_string(), + )); + }; + + let blocks_req_id = cx.send_blocks_by_range_request(block_peer, request.clone(), id)?; + + let state = if batch_fork.fulu_enabled() { + State::FuluEnabled(FuluEnabledState::BlockRequest { + blocks_by_range_request: ByRangeRequest::Active(blocks_req_id), + }) + } else if batch_fork.deneb_enabled() { + // TODO(deneb): is it okay to send blobs_by_range requests outside the DA window? I + // would like the beacon processor / da_checker to be the one that decides if an + // RpcBlock is valid or not with respect to containing blobs. Having sync not even + // attempt a requests seems like an added limitation. + let blobs_req_id = cx.send_blobs_by_range_request( + block_peer, + BlobsByRangeRequest { + start_slot: *request.start_slot(), + count: *request.count(), + }, + id, + )?; + State::DenebEnabled { + blocks_by_range_request: ByRangeRequest::Active(blocks_req_id), + blobs_by_range_request: ByRangeRequest::Active(blobs_req_id), + } + } else { + State::Base { + blocks_by_range_request: ByRangeRequest::Active(blocks_req_id), + } + }; + + Ok(Self { + id, + // TODO(das): share the rwlock with the range sync batch. Are peers added to the batch + // after being created? 
+ peers: Arc::new(RwLock::new(peers.clone())), + request, + state, + }) + } + + pub fn continue_requests( + &mut self, + cx: &mut SyncNetworkContext, + ) -> BlockComponentsByRangeRequestResult { + match &mut self.state { + State::Base { + blocks_by_range_request, + } => { + if let Some((blocks, block_peer)) = blocks_by_range_request.to_finished() { + // TODO(das): use the peer group + let peer_group = BatchPeers::new_from_block_peer(*block_peer); + let rpc_blocks = couple_blocks_base( + blocks.to_vec(), + cx.network_globals().sampling_columns.len(), + ); + Ok(Some((rpc_blocks, peer_group))) + } else { + // Wait for blocks_by_range requests to complete + Ok(None) + } + } + State::DenebEnabled { + blocks_by_range_request, + blobs_by_range_request, + } => { + if let (Some((blocks, block_peer)), Some((blobs, _))) = ( + blocks_by_range_request.to_finished(), + blobs_by_range_request.to_finished(), + ) { + // We use the same block_peer for the blobs request + let peer_group = BatchPeers::new_from_block_peer(*block_peer); + let rpc_blocks = + couple_blocks_deneb(blocks.to_vec(), blobs.to_vec(), cx.spec())?; + Ok(Some((rpc_blocks, peer_group))) + } else { + // Wait for blocks_by_range and blobs_by_range requests to complete + Ok(None) + } + } + State::FuluEnabled(state) => match state { + FuluEnabledState::BlockRequest { + blocks_by_range_request, + } => { + if let Some((blocks, block_peer)) = blocks_by_range_request.to_finished() { + // TODO(das): use the peer group + let blocks_with_data = blocks + .iter() + .filter(|block| block.has_data()) + .map(|block| block.signed_block_header()) + .collect::>(); + + if blocks_with_data.is_empty() { + let custody_column_indices = cx + .network_globals() + .sampling_columns + .clone() + .iter() + .copied() + .collect(); + + // Done, we got blocks and no columns needed + let peer_group = BatchPeers::new_from_block_peer(*block_peer); + let rpc_blocks = couple_blocks_fulu( + blocks.to_vec(), + vec![], + custody_column_indices, + 
cx.spec(), + )?; + Ok(Some((rpc_blocks, peer_group))) + } else { + let mut column_indices = cx + .network_globals() + .sampling_columns + .clone() + .iter() + .copied() + .collect::>(); + column_indices.sort_unstable(); + + let req_id = cx + .send_custody_by_range_request( + self.id, + blocks_with_data, + Slot::new(*self.request.start_slot()) + .epoch(T::EthSpec::slots_per_epoch()), + column_indices, + self.peers.clone(), + ) + .map_err(|e| match e { + RpcRequestSendError::InternalError(e) => { + Error::InternalError(e) + } + })?; + + *state = FuluEnabledState::CustodyRequest { + blocks: blocks.to_vec(), + block_peer: *block_peer, + custody_by_range_request: ByRangeRequest::Active(req_id), + }; + + // Wait for the new custody_by_range request to complete + Ok(None) + } + } else { + // Wait for the block request to complete + Ok(None) + } + } + FuluEnabledState::CustodyRequest { + blocks, + block_peer, + custody_by_range_request, + } => { + if let Some((columns, column_peers)) = custody_by_range_request.to_finished() { + let custody_column_indices = cx + .network_globals() + .sampling_columns + .clone() + .iter() + .copied() + .collect(); + + let peer_group = + BatchPeers::new(*block_peer, column_peers.as_reversed_map()); + let rpc_blocks = couple_blocks_fulu( + blocks.to_vec(), + columns.to_vec(), + custody_column_indices, + cx.spec(), + )?; + Ok(Some((rpc_blocks, peer_group))) + } else { + // Wait for the custody_by_range request to complete + Ok(None) + } + } + }, + } + } + + pub fn on_blocks_by_range_result( + &mut self, + id: BlocksByRangeRequestId, + data: Vec>>, + peer_id: PeerId, + cx: &mut SyncNetworkContext, + ) -> BlockComponentsByRangeRequestResult { + match &mut self.state { + State::Base { + blocks_by_range_request, + } + | State::DenebEnabled { + blocks_by_range_request, + .. 
+ } + | State::FuluEnabled(FuluEnabledState::BlockRequest { + blocks_by_range_request, + }) => { + blocks_by_range_request.finish(id, data, peer_id)?; + } + State::FuluEnabled(FuluEnabledState::CustodyRequest { .. }) => { + return Err(Error::InternalError( + "Received blocks_by_range response expecting custody_by_range".to_string(), + )) + } + } + + self.continue_requests(cx) + } + + pub fn on_blobs_by_range_result( + &mut self, + id: BlobsByRangeRequestId, + data: Vec>>, + peer_id: PeerId, + cx: &mut SyncNetworkContext, + ) -> BlockComponentsByRangeRequestResult { + match &mut self.state { + State::Base { .. } => { + return Err(Error::InternalError( + "Received blobs_by_range response before Deneb".to_string(), + )) + } + State::DenebEnabled { + blobs_by_range_request, + .. + } => { + blobs_by_range_request.finish(id, data, peer_id)?; + } + State::FuluEnabled(_) => { + return Err(Error::InternalError( + "Received blobs_by_range response after PeerDAS".to_string(), + )) + } + } + + self.continue_requests(cx) + } + + pub fn on_custody_by_range_result( + &mut self, + id: CustodyByRangeRequestId, + data: Vec>>, + peers: PeerGroup, + cx: &mut SyncNetworkContext, + ) -> BlockComponentsByRangeRequestResult { + match &mut self.state { + State::Base { .. } | State::DenebEnabled { .. } => { + return Err(Error::InternalError( + "Received custody_by_range response before PeerDAS".to_string(), + )) + } + State::FuluEnabled(state) => match state { + FuluEnabledState::BlockRequest { .. } => { + return Err(Error::InternalError( + "Received custody_by_range expecting blocks_by_range".to_string(), + )); + } + FuluEnabledState::CustodyRequest { + custody_by_range_request, + .. + } => { + custody_by_range_request.finish(id, data, peers)?; + } + }, + } + + self.continue_requests(cx) + } + + #[cfg(test)] + pub fn state_step(&self) -> BlockComponentsByRangeRequestStep { + match &self.state { + State::Base { .. 
} => BlockComponentsByRangeRequestStep::BlocksRequest, + State::DenebEnabled { .. } => BlockComponentsByRangeRequestStep::BlocksRequest, + State::FuluEnabled(state) => match state { + FuluEnabledState::BlockRequest { .. } => { + BlockComponentsByRangeRequestStep::BlocksRequest + } + FuluEnabledState::CustodyRequest { .. } => { + BlockComponentsByRangeRequestStep::CustodyRequest + } + }, + } + } +} + +fn couple_blocks_base( + blocks: Vec>>, + custody_columns_count: usize, +) -> Vec> { + blocks + .into_iter() + .map(|block| RpcBlock::new_without_blobs(None, block, custody_columns_count)) + .collect() +} + +fn couple_blocks_deneb( + blocks: Vec>>, + blobs: Vec>>, + spec: &ChainSpec, +) -> Result>, Error> { + let mut blobs_by_block = HashMap::>>>::new(); + for blob in blobs { + let block_root = blob.block_root(); + blobs_by_block.entry(block_root).or_default().push(blob); + } + + // Now collect all blobs that match to the block by block root. BlobsByRange request checks + // the inclusion proof so we know that the commitment is the expected. + // + // BlobsByRange request handler ensures that we don't receive more blobs than possible. + // If the peer serving the request sends us blobs that don't pair well we'll send to the + // processor blocks without expected blobs, resulting in a downscoring event. A serving peer + // could serve fake blobs for blocks that don't have data, but it would gain nothing by it + // wasting theirs and our bandwidth 1:1. Therefore blobs that don't pair well are just ignored. 
+ // + // RpcBlock::new ensures that the count of blobs is consistent with the block + blocks + .into_iter() + .map(|block| { + let block_root = get_block_root(&block); + let max_blobs_per_block = spec.max_blobs_per_block(block.epoch()) as usize; + let blobs = blobs_by_block.remove(&block_root).unwrap_or_default(); + // BlobsByRange request handler enforces that blobs are sorted by index + let blobs = RuntimeVariableList::new(blobs, max_blobs_per_block).map_err(|_| { + Error::InternalError("Blobs returned exceeds max length".to_string()) + })?; + Ok(RpcBlock::new(Some(block_root), block, Some(blobs)) + .expect("TODO: don't do matching here")) + }) + .collect::>, Error>>() +} + +fn couple_blocks_fulu( + blocks: Vec>>, + data_columns: Vec>>, + custody_column_indices: Vec, + spec: &ChainSpec, +) -> Result>, Error> { + // Group data columns by block_root and index + let mut custody_columns_by_block = HashMap::>>::new(); + + for column in data_columns { + let block_root = column.block_root(); + + if custody_column_indices.contains(&column.index) { + custody_columns_by_block + .entry(block_root) + .or_default() + // Safe to convert to `CustodyDataColumn`: we have asserted that the index of + // this column is in the set of `expects_custody_columns` and with the expected + // block root, so for the expected epoch of this batch. + .push(CustodyDataColumn::from_asserted_custody(column)); + } + } + + // Now iterate all blocks ensuring that the block roots of each block and data column match, + blocks + .into_iter() + .map(|block| { + let block_root = get_block_root(&block); + let data_columns_with_block_root = custody_columns_by_block + // Remove to only use columns once + .remove(&block_root) + .unwrap_or_default(); + + // TODO(das): Change RpcBlock to holding a Vec of DataColumnSidecars so we don't need + // the spec here. 
+ RpcBlock::new_with_custody_columns( + Some(block_root), + block, + data_columns_with_block_root, + custody_column_indices.clone(), + spec, + ) + .map_err(Error::InternalError) + }) + .collect::, _>>() +} + +impl ByRangeRequest { + fn finish(&mut self, id: I, data: T, peer_id: P) -> Result<(), Error> { + match self { + Self::Active(expected_id) => { + if expected_id != &id { + return Err(Error::InternalError(format!( + "unexpected req_id expected {expected_id} got {id}" + ))); + } + *self = Self::Complete(data, peer_id); + Ok(()) + } + Self::Complete(_, _) => Err(Error::InternalError(format!( + "request already complete {id}" + ))), + } + } + + fn to_finished(&self) -> Option<(&T, &P)> { + match self { + Self::Active(_) => None, + Self::Complete(data, peer_id) => Some((data, peer_id)), + } + } +} diff --git a/beacon_node/network/src/sync/network_context/custody_by_range.rs b/beacon_node/network/src/sync/network_context/custody_by_range.rs new file mode 100644 index 00000000000..9f8e163ba47 --- /dev/null +++ b/beacon_node/network/src/sync/network_context/custody_by_range.rs @@ -0,0 +1,481 @@ +use super::custody_by_root::{ColumnRequest, Error}; +use crate::sync::network_context::RpcResponseError; +use beacon_chain::validator_monitor::timestamp_now; +use beacon_chain::BeaconChainTypes; +use fnv::FnvHashMap; +use lighthouse_network::rpc::methods::DataColumnsByRangeRequest; +use lighthouse_network::service::api_types::{ + CustodyByRangeRequestId, DataColumnsByRangeRequestId, +}; +use lighthouse_network::{PeerAction, PeerId}; +use lru_cache::LRUTimeCache; +use parking_lot::RwLock; +use rand::Rng; +use std::collections::HashSet; +use std::time::{Duration, Instant}; +use std::{collections::HashMap, marker::PhantomData, sync::Arc}; +use tracing::{debug, warn}; +use types::{ + data_column_sidecar::ColumnIndex, DataColumnSidecar, Epoch, EthSpec, Hash256, + SignedBeaconBlockHeader, Slot, +}; + +use super::{PeerGroup, RpcResponseResult, SyncNetworkContext}; + +const 
TEMPORARY_FAULT_EXPIRY_SECONDS: u64 = 15; +const REQUEST_EXPIRY_SECONDS: u64 = 300; + +type DataColumnSidecarList = Vec>>; + +pub struct ActiveCustodyByRangeRequest { + start_time: Instant, + id: CustodyByRangeRequestId, + // TODO(das): Pass a better type for the by_range request + epoch: Epoch, + /// Blocks that we expect peers to serve data columns for + blocks_with_data: Vec, + /// List of column indices this request needs to download to complete successfully + column_requests: FnvHashMap< + ColumnIndex, + ColumnRequest>, + >, + /// Active requests for 1 or more columns each + active_batch_columns_requests: + FnvHashMap, + /// Peers that have recently failed to successfully respond to a columns by root request. + /// Having a LRUTimeCache allows this request to not have to track disconnecting peers. + peers_with_custody_failures: LRUTimeCache, + peers_with_temporary_faults: LRUTimeCache, + // TODO(das): does this HashSet has an OOM risk? We should either: make sure that this request + // structs are dropped after some time, that disconnected peers are pruned (but we may want to + // retain faulty information if they just disconnect and reconnect) or make this an LRUTimeCache + // with a long time (like 5 minutes). 
+ peers_with_permanent_faults: HashSet, + /// Set of peers that claim to have imported this block and their custody columns + lookup_peers: Arc>>, + + _phantom: PhantomData, +} + +struct ActiveBatchColumnsRequest { + indices: Vec, +} + +pub type CustodyByRangeRequestResult = + Result, PeerGroup, Duration)>, Error>; + +enum ColumnResponseError { + NonMatchingColumn { + slot: Slot, + actual_block_root: Hash256, + expected_block_root: Hash256, + }, + MissingColumn(Slot), +} + +impl ActiveCustodyByRangeRequest { + pub(crate) fn new( + id: CustodyByRangeRequestId, + epoch: Epoch, + blocks_with_data: Vec, + column_indices: &[ColumnIndex], + lookup_peers: Arc>>, + ) -> Self { + Self { + start_time: Instant::now(), + id, + epoch, + blocks_with_data, + column_requests: HashMap::from_iter( + column_indices + .iter() + .map(|index| (*index, ColumnRequest::new())), + ), + active_batch_columns_requests: <_>::default(), + peers_with_custody_failures: LRUTimeCache::new(Duration::from_secs( + TEMPORARY_FAULT_EXPIRY_SECONDS, + )), + peers_with_temporary_faults: LRUTimeCache::new(Duration::from_secs( + TEMPORARY_FAULT_EXPIRY_SECONDS, + )), + peers_with_permanent_faults: HashSet::new(), + lookup_peers, + _phantom: PhantomData, + } + } + + /// Insert a downloaded column into an active custody request. Then make progress on the + /// entire request. 
+ /// + /// ### Returns + /// + /// - `Err`: Custody request has failed and will be dropped + /// - `Ok(Some)`: Custody request has successfully completed and will be dropped + /// - `Ok(None)`: Custody request still active + pub(crate) fn on_data_column_downloaded( + &mut self, + peer_id: PeerId, + req_id: DataColumnsByRangeRequestId, + resp: RpcResponseResult>, + cx: &mut SyncNetworkContext, + ) -> CustodyByRangeRequestResult { + let Some(batch_request) = self.active_batch_columns_requests.get_mut(&req_id) else { + warn!( + id = %self.id, + %req_id, + "Received custody by range response for unrequested index" + ); + return Ok(None); + }; + + match resp { + Ok((data_columns, seen_timestamp)) => { + // Map columns by index as an optimization to not loop the returned list on each + // requested index. The worse case is 128 loops over a 128 item vec + mutation to + // drop the consumed columns. + let mut data_columns_by_index = + HashMap::<(ColumnIndex, Slot), Arc>>::new(); + for data_column in data_columns { + data_columns_by_index + .insert((data_column.index, data_column.slot()), data_column); + } + + // Accumulate columns that the peer does not have to issue a single log per request + let mut missing_column_indexes = vec![]; + let mut incorrect_column_indices = vec![]; + let mut imported_column_indices = vec![]; + + for index in &batch_request.indices { + let column_request = + self.column_requests + .get_mut(index) + .ok_or(Error::InternalError(format!( + "unknown column_index {index}" + )))?; + + let columns_at_index = self + .blocks_with_data + .iter() + .map(|block| { + let slot = block.message.slot; + if let Some(data_column) = data_columns_by_index.remove(&(*index, slot)) + { + let actual_block_root = + data_column.signed_block_header.message.canonical_root(); + let expected_block_root = block.message.canonical_root(); + if actual_block_root != expected_block_root { + Err(ColumnResponseError::NonMatchingColumn { + slot, + actual_block_root: data_column + 
.signed_block_header + .message + .canonical_root(), + expected_block_root: block.message.canonical_root(), + }) + } else { + Ok(data_column) + } + } else { + // The following three statements are true: + // - block at `slot` is not missed, and has data + // - peer custodies this column `index` + // - peer claims to be synced to at least `slot` + // + // Therefore not returning this column is a protocol violation that we + // penalize and mark the peer as failed to retry with another peer. + // + // TODO(das) do not consider this case a success. We know for sure the block has + // data. However we allow the peer to return empty as we can't attribute fault. + // TODO(das): Should track which columns are missing and eventually give up + // TODO(das): If the peer is in the lookup peer set it claims to have imported + // the block AND its custody columns. So in this case we can downscore + Err(ColumnResponseError::MissingColumn(slot)) + } + }) + .collect::, _>>(); + + match columns_at_index { + Ok(columns_at_index) => { + column_request.on_download_success( + req_id, + peer_id, + columns_at_index, + seen_timestamp, + )?; + + imported_column_indices.push(index); + } + Err(e) => { + column_request.on_download_error(req_id)?; + + match e { + ColumnResponseError::NonMatchingColumn { + slot, + actual_block_root, + expected_block_root, + } => { + incorrect_column_indices.push(( + index, + slot, + actual_block_root, + expected_block_root, + )); + } + ColumnResponseError::MissingColumn(slot) => { + missing_column_indexes.push((index, slot)); + } + } + } + } + } + + // Log missing_column_indexes and incorrect_column_indices here in batch per request + // to make these logs more compact and less noisy. + if !imported_column_indices.is_empty() { + // TODO(das): this log may be redundant. 
We already log on DataColumnsByRange + // completed, and on DataColumnsByRange sent we log the column indices + // ``` + // Sync RPC request sent method="DataColumnsByRange" slots=8 epoch=4 columns=[52] peer=16Uiu2HAmEooeoHzHDYS35TSHrJDSfmREecPyFskrLPYm9Gm1EURj id=493/399/10/RangeSync/4/1 + // Sync RPC request completed id=493/399/10/RangeSync/4/1 method="DataColumnsByRange" count=1 + // ``` + // Which can be traced to this custody by range request, and the initial log + debug!( + id = %self.id, + data_columns_by_range_req_id = %req_id, + %peer_id, + count = imported_column_indices.len(), + "Custody by range request download imported columns" + ); + } + + if !incorrect_column_indices.is_empty() { + // Note: Batch logging that columns are missing to not spam logger + debug!( + id = %self.id, + data_columns_by_range_req_id = %req_id, + %peer_id, + // TODO(das): this property can become very noisy, being the full range 0..128 + incorrect_columns = ?incorrect_column_indices, + "Custody by range peer returned non-matching columns" + ); + + // Returning a non-canonical column is not a permanent fault. We should not + // retry the peer for some time but the peer may return a canonical column in + // the future. + // TODO(das): if this finalized sync the fault is permanent + self.peers_with_temporary_faults.insert(peer_id); + cx.report_peer( + peer_id, + PeerAction::MidToleranceError, + "non-matching data column", + ); + } + + if !missing_column_indexes.is_empty() { + // Note: Batch logging that columns are missing to not spam logger + debug!( + id = %self.id, + data_columns_by_range_req_id = %req_id, + %peer_id, + // TODO(das): this property can become very noisy, being the full range 0..128 + ?missing_column_indexes, + "Custody by range peer claims to not have some data" + ); + + // Not having columns is not a permanent fault. The peer may be backfilling. 
+ self.peers_with_custody_failures.insert(peer_id); + cx.report_peer(peer_id, PeerAction::MidToleranceError, "custody_failure"); + } + } + Err(err) => { + debug!( + id = %self.id, + %req_id, + %peer_id, + error = ?err, + "Custody by range download error" + ); + + // TODO(das): Should mark peer as failed and try from another peer + for column_index in &batch_request.indices { + self.column_requests + .get_mut(column_index) + .ok_or(Error::InternalError("unknown column_index".to_owned()))? + .on_download_error_and_mark_failure(req_id, err.clone())?; + } + + match err { + // Verify errors are correctness errors against our request or about the + // returned data itself. This peer is faulty or malicious, should not be + // retried. + RpcResponseError::VerifyError(_) => { + self.peers_with_permanent_faults.insert(peer_id); + } + // Network errors are not permanent faults and worth retrying + RpcResponseError::RpcError(_) => { + self.peers_with_temporary_faults.insert(peer_id); + } + // Do nothing for internal errors + RpcResponseError::InternalError(_) => {} + // unreachable + RpcResponseError::RequestExpired(_) => {} + } + } + }; + + self.continue_requests(cx) + } + + pub(crate) fn continue_requests( + &mut self, + cx: &mut SyncNetworkContext, + ) -> CustodyByRangeRequestResult { + if self.column_requests.values().all(|r| r.is_downloaded()) { + // All requests have completed successfully. + let mut peers = HashMap::>::new(); + let mut seen_timestamps = vec![]; + let columns = std::mem::take(&mut self.column_requests) + .into_values() + .map(|request| { + let (peer, data_columns, seen_timestamp) = request.complete()?; + + for data_column in &data_columns { + let columns_by_peer = peers.entry(peer).or_default(); + if !columns_by_peer.contains(&(data_column.index as usize)) { + columns_by_peer.push(data_column.index as usize); + } + } + + seen_timestamps.push(seen_timestamp); + + Ok(data_columns) + }) + .collect::, _>>()? 
+ // Flatten Vec> to Vec + // TODO(das): maybe not optimal for the coupling logic later + .into_iter() + .flatten() + .collect(); + + let peer_group = PeerGroup::from_set(peers); + let max_seen_timestamp = seen_timestamps.into_iter().max().unwrap_or(timestamp_now()); + return Ok(Some((columns, peer_group, max_seen_timestamp))); + } + + let active_request_count_by_peer = cx.active_request_count_by_peer(); + let mut columns_to_request_by_peer = HashMap::>::new(); + let lookup_peers = self.lookup_peers.read(); + + // Need to: + // - track how many active requests a peer has for load balancing + // - which peers have failures to attempt others + // - which peer returned what to have PeerGroup attributability + + for (column_index, request) in self.column_requests.iter_mut() { + if request.is_awaiting_download() { + if let Some(last_error) = request.too_many_failures() { + return Err(Error::TooManyDownloadErrors(last_error)); + } + + // TODO(das): When is a fork and only a subset of your peers know about a block, we should + // only query the peers on that fork. Should this case be handled? How to handle it? + let custodial_peers = cx.get_custodial_peers(*column_index); + + // We draw from the total set of peers, but prioritize those peers who we have + // received an attestation / status / block message claiming to have imported the + // lookup. 
The frequency of those messages is low, so drawing only from lookup_peers + // could cause many lookups to take much longer or fail as they don't have enough + // custody peers on a given column + let mut priorized_peers = custodial_peers + .iter() + .filter(|peer| { + // Never request again peers with permanent faults + // Do not request peers with custody failures for some time + !self.peers_with_permanent_faults.contains(peer) + && !self.peers_with_custody_failures.contains(peer) + }) + .map(|peer| { + ( + // Prioritize peers that claim to have imported this block + if lookup_peers.contains(peer) { 0 } else { 1 }, + // De-prioritize peers that have failed to successfully respond to + // requests recently, but allow to immediately request them again + self.peers_with_temporary_faults.contains(peer), + // Prefer peers with fewer requests to load balance across peers. + // We batch requests to the same peer, so count existence in the + // `columns_to_request_by_peer` as a single 1 request. + active_request_count_by_peer.get(peer).copied().unwrap_or(0) + + columns_to_request_by_peer.get(peer).map(|_| 1).unwrap_or(0), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::thread_rng().gen::(), + *peer, + ) + }) + .collect::>(); + priorized_peers.sort_unstable(); + + if let Some((_, _, _, _, peer_id)) = priorized_peers.first() { + columns_to_request_by_peer + .entry(*peer_id) + .or_default() + .push(*column_index); + } else { + // Do not issue requests if there is no custody peer on this column. The request + // will sit idle without making progress. The only way to make progress is: + // - Add a new peer that custodies the missing columns + // - Call `continue_requests` + // + // Otherwise this request should be dropped and failed after some time. 
+ // TODO(das): implement the above + } + } + } + + for (peer_id, indices) in columns_to_request_by_peer.into_iter() { + let req_id = cx + .send_data_columns_by_range_request( + peer_id, + DataColumnsByRangeRequest { + // TODO(das): generalize with constants from batch + start_slot: self + .epoch + .start_slot(T::EthSpec::slots_per_epoch()) + .as_u64(), + count: T::EthSpec::slots_per_epoch(), + columns: indices.clone(), + }, + self.id, + ) + .map_err(|e| Error::InternalError(format!("send failed {e}")))?; + + for column_index in &indices { + let column_request = self + .column_requests + .get_mut(column_index) + // Should never happen: column_index is iterated from column_requests + .ok_or(Error::InternalError(format!( + "Unknown column_request {column_index}" + )))?; + + column_request.on_download_start(req_id)?; + } + + self.active_batch_columns_requests + .insert(req_id, ActiveBatchColumnsRequest { indices }); + } + + if self.start_time.elapsed() > Duration::from_secs(REQUEST_EXPIRY_SECONDS) + && !self.column_requests.values().any(|r| r.is_downloading()) + { + let awaiting_peers_indicies = self + .column_requests + .iter() + .filter(|(_, r)| r.is_awaiting_download()) + .map(|(id, _)| *id) + .collect::>(); + return Err(Error::ExpiredNoCustodyPeers(awaiting_peers_indicies)); + } + + Ok(None) + } +} diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody_by_root.rs similarity index 70% rename from beacon_node/network/src/sync/network_context/custody.rs rename to beacon_node/network/src/sync/network_context/custody_by_root.rs index f4d010b881e..c547837fc7f 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_root.rs @@ -1,5 +1,6 @@ use crate::sync::network_context::{ - DataColumnsByRootRequestId, DataColumnsByRootSingleBlockRequest, + DataColumnsByRootRequestId, DataColumnsByRootSingleBlockRequest, RpcRequestSendError, + 
RpcResponseError, }; use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::BeaconChainTypes; @@ -12,22 +13,29 @@ use rand::Rng; use std::collections::HashSet; use std::time::{Duration, Instant}; use std::{collections::HashMap, marker::PhantomData, sync::Arc}; +use strum::IntoStaticStr; use tracing::{debug, warn}; -use types::EthSpec; use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, Hash256}; use super::{LookupRequestResult, PeerGroup, RpcResponseResult, SyncNetworkContext}; const FAILED_PEERS_CACHE_EXPIRY_SECONDS: u64 = 5; -const MAX_STALE_NO_PEERS_DURATION: Duration = Duration::from_secs(30); +const REQUEST_EXPIRY_SECONDS: u64 = 300; +/// TODO(das): this attempt count is nested into the existing lookup request count. +const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; type DataColumnSidecarList = Vec>>; -pub struct ActiveCustodyRequest { +pub struct ActiveCustodyByRootRequest { + start_time: Instant, block_root: Hash256, custody_id: CustodyId, /// List of column indices this request needs to download to complete successfully - column_requests: FnvHashMap>, + #[allow(clippy::type_complexity)] + column_requests: FnvHashMap< + ColumnIndex, + ColumnRequest>>, + >, /// Active requests for 1 or more columns each active_batch_columns_requests: FnvHashMap, @@ -40,29 +48,47 @@ pub struct ActiveCustodyRequest { _phantom: PhantomData, } -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug)] pub enum Error { - SendFailed(&'static str), - TooManyFailures, - BadState(String), - NoPeer(ColumnIndex), - /// Received a download result for a different request id than the in-flight request. - /// There should only exist a single request at a time. Having multiple requests is a bug and - /// can result in undefined state, so it's treated as a hard error and the lookup is dropped. 
- UnexpectedRequestId { - expected_req_id: DataColumnsByRootRequestId, - req_id: DataColumnsByRootRequestId, - }, + InternalError(String), + TooManyDownloadErrors(RpcResponseError), + ExpiredNoCustodyPeers(Vec), +} + +impl From for RpcResponseError { + fn from(e: Error) -> Self { + match e { + Error::InternalError(e) => RpcResponseError::InternalError(e), + Error::TooManyDownloadErrors(e) => e, + Error::ExpiredNoCustodyPeers(indices) => RpcResponseError::RequestExpired(format!( + "Expired waiting for custody peers {indices:?}" + )), + } + } +} + +impl From for RpcRequestSendError { + fn from(e: Error) -> Self { + match e { + Error::TooManyDownloadErrors(_) => { + RpcRequestSendError::InternalError("Download error in request send".to_string()) + } + Error::InternalError(e) => RpcRequestSendError::InternalError(e), + Error::ExpiredNoCustodyPeers(_) => RpcRequestSendError::InternalError( + "Request can not expire when requesting it".to_string(), + ), + } + } } struct ActiveBatchColumnsRequest { indices: Vec, } -pub type CustodyRequestResult = +pub type CustodyByRootRequestResult = Result, PeerGroup, Duration)>, Error>; -impl ActiveCustodyRequest { +impl ActiveCustodyByRootRequest { pub(crate) fn new( block_root: Hash256, custody_id: CustodyId, @@ -70,6 +96,7 @@ impl ActiveCustodyRequest { lookup_peers: Arc>>, ) -> Self { Self { + start_time: Instant::now(), block_root, custody_id, column_requests: HashMap::from_iter( @@ -98,7 +125,7 @@ impl ActiveCustodyRequest { req_id: DataColumnsByRootRequestId, resp: RpcResponseResult>, cx: &mut SyncNetworkContext, - ) -> CustodyRequestResult { + ) -> CustodyByRootRequestResult { let Some(batch_request) = self.active_batch_columns_requests.get_mut(&req_id) else { warn!( block_root = ?self.block_root, @@ -131,7 +158,7 @@ impl ActiveCustodyRequest { let column_request = self .column_requests .get_mut(column_index) - .ok_or(Error::BadState("unknown column_index".to_owned()))?; + .ok_or(Error::InternalError("unknown 
column_index".to_owned()))?; if let Some(data_column) = data_columns.remove(column_index) { column_request.on_download_success( @@ -182,8 +209,8 @@ impl ActiveCustodyRequest { for column_index in &batch_request.indices { self.column_requests .get_mut(column_index) - .ok_or(Error::BadState("unknown column_index".to_owned()))? - .on_download_error_and_mark_failure(req_id)?; + .ok_or(Error::InternalError("unknown column_index".to_owned()))? + .on_download_error_and_mark_failure(req_id, err.clone())?; } self.failed_peers.insert(peer_id); @@ -196,7 +223,7 @@ impl ActiveCustodyRequest { pub(crate) fn continue_requests( &mut self, cx: &mut SyncNetworkContext, - ) -> CustodyRequestResult { + ) -> CustodyByRootRequestResult { if self.column_requests.values().all(|r| r.is_downloaded()) { // All requests have completed successfully. let mut peers = HashMap::>::new(); @@ -222,6 +249,7 @@ impl ActiveCustodyRequest { let active_request_count_by_peer = cx.active_request_count_by_peer(); let mut columns_to_request_by_peer = HashMap::>::new(); let lookup_peers = self.lookup_peers.read(); + let mut indices_without_peers = vec![]; // Need to: // - track how many active requests a peer has for load balancing @@ -229,9 +257,9 @@ impl ActiveCustodyRequest { // - which peer returned what to have PeerGroup attributability for (column_index, request) in self.column_requests.iter_mut() { - if let Some(wait_duration) = request.is_awaiting_download() { - if request.download_failures > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { - return Err(Error::TooManyFailures); + if request.is_awaiting_download() { + if let Some(last_error) = request.too_many_failures() { + return Err(Error::TooManyDownloadErrors(last_error)); } // TODO(das): When is a fork and only a subset of your peers know about a block, we should @@ -270,21 +298,22 @@ impl ActiveCustodyRequest { .entry(*peer_id) .or_default() .push(*column_index); - } else if wait_duration > MAX_STALE_NO_PEERS_DURATION { - // Allow to request to sit stale 
in `NotStarted` state for at most - // `MAX_STALE_NO_PEERS_DURATION`, else error and drop the request. Note that - // lookup will naturally retry when other peers send us attestations for - // descendants of this un-available lookup. - return Err(Error::NoPeer(*column_index)); } else { - // Do not issue requests if there is no custody peer on this column + // Do not issue requests if there is no custody peer on this column. The request + // will sit idle without making progress. The only way to make progress is: + // - Add a new peer that custodies the missing columns + // - Call `continue_requests` + // + // Otherwise this request should be dropped and failed after some time. + // TODO(das): implement the above + indices_without_peers.push(column_index); } } } for (peer_id, indices) in columns_to_request_by_peer.into_iter() { let request_result = cx - .data_column_lookup_request( + .data_columns_by_root_request( DataColumnsByRootRequester::Custody(self.custody_id), peer_id, DataColumnsByRootSingleBlockRequest { @@ -297,7 +326,9 @@ impl ActiveCustodyRequest { // columns. For the rest of peers, don't downscore if columns are missing. 
lookup_peers.contains(&peer_id), ) - .map_err(Error::SendFailed)?; + .map_err(|e| { + Error::InternalError(format!("Send failed data_columns_by_root {e:?}")) + })?; match request_result { LookupRequestResult::RequestSent(req_id) => { @@ -306,7 +337,7 @@ impl ActiveCustodyRequest { .column_requests .get_mut(column_index) // Should never happen: column_index is iterated from column_requests - .ok_or(Error::BadState("unknown column_index".to_owned()))?; + .ok_or(Error::InternalError("unknown column_index".to_owned()))?; column_request.on_download_start(req_id)?; } @@ -319,117 +350,149 @@ impl ActiveCustodyRequest { } } + if self.start_time.elapsed() > Duration::from_secs(REQUEST_EXPIRY_SECONDS) + && !self.column_requests.values().any(|r| r.is_downloading()) + { + let awaiting_peers_indicies = self + .column_requests + .iter() + .filter(|(_, r)| r.is_awaiting_download()) + .map(|(id, _)| *id) + .collect::>(); + return Err(Error::ExpiredNoCustodyPeers(awaiting_peers_indicies)); + } + Ok(None) } } -/// TODO(das): this attempt count is nested into the existing lookup request count. -const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; - -struct ColumnRequest { - status: Status, - download_failures: usize, +pub struct ColumnRequest { + status: Status, + download_failures: Vec, } -#[derive(Debug, Clone)] -enum Status { - NotStarted(Instant), - Downloading(DataColumnsByRootRequestId), - Downloaded(PeerId, Arc>, Duration), +#[derive(Debug, Clone, IntoStaticStr)] +pub enum Status { + NotStarted, + Downloading(I), + Downloaded(PeerId, T, Duration), } -impl ColumnRequest { - fn new() -> Self { +impl ColumnRequest { + pub fn new() -> Self { Self { - status: Status::NotStarted(Instant::now()), - download_failures: 0, + status: Status::NotStarted, + download_failures: vec![], } } - fn is_awaiting_download(&self) -> Option { + pub fn is_awaiting_download(&self) -> bool { match self.status { - Status::NotStarted(start_time) => Some(start_time.elapsed()), - Status::Downloading { .. 
} | Status::Downloaded { .. } => None, + Status::NotStarted => true, + Status::Downloading { .. } | Status::Downloaded { .. } => false, } } - fn is_downloaded(&self) -> bool { + pub fn is_downloading(&self) -> bool { match self.status { - Status::NotStarted { .. } | Status::Downloading { .. } => false, + Status::NotStarted => false, + Status::Downloading { .. } => true, + Status::Downloaded { .. } => false, + } + } + + pub fn is_downloaded(&self) -> bool { + match self.status { + Status::NotStarted | Status::Downloading { .. } => false, Status::Downloaded { .. } => true, } } - fn on_download_start(&mut self, req_id: DataColumnsByRootRequestId) -> Result<(), Error> { + pub fn too_many_failures(&self) -> Option { + if self.download_failures.len() > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { + Some( + self.download_failures + .last() + .cloned() + .expect("download_failures is not empty"), + ) + } else { + None + } + } + + pub fn on_download_start(&mut self, req_id: I) -> Result<(), Error> { match &self.status { - Status::NotStarted { .. 
} => { + Status::NotStarted => { self.status = Status::Downloading(req_id); Ok(()) } - other => Err(Error::BadState(format!( - "bad state on_download_start expected NotStarted got {other:?}" + other => Err(Error::InternalError(format!( + "bad state on_download_start expected NotStarted got {}", + Into::<&'static str>::into(other), ))), } } - fn on_download_error(&mut self, req_id: DataColumnsByRootRequestId) -> Result<(), Error> { + pub fn on_download_error(&mut self, req_id: I) -> Result<(), Error> { match &self.status { Status::Downloading(expected_req_id) => { if req_id != *expected_req_id { - return Err(Error::UnexpectedRequestId { - expected_req_id: *expected_req_id, - req_id, - }); + return Err(Error::InternalError(format!( + "Received download result for req_id {req_id} expecting {expected_req_id}" + ))); } - self.status = Status::NotStarted(Instant::now()); + self.status = Status::NotStarted; Ok(()) } - other => Err(Error::BadState(format!( - "bad state on_download_error expected Downloading got {other:?}" + other => Err(Error::InternalError(format!( + "bad state on_download_error expected Downloading got {}", + Into::<&'static str>::into(other), ))), } } - fn on_download_error_and_mark_failure( + pub fn on_download_error_and_mark_failure( &mut self, - req_id: DataColumnsByRootRequestId, + req_id: I, + e: RpcResponseError, ) -> Result<(), Error> { - // TODO(das): Should track which peers don't have data - self.download_failures += 1; + self.download_failures.push(e); self.on_download_error(req_id) } - fn on_download_success( + pub fn on_download_success( &mut self, - req_id: DataColumnsByRootRequestId, + req_id: I, peer_id: PeerId, - data_column: Arc>, + data_column: T, seen_timestamp: Duration, ) -> Result<(), Error> { match &self.status { Status::Downloading(expected_req_id) => { if req_id != *expected_req_id { - return Err(Error::UnexpectedRequestId { - expected_req_id: *expected_req_id, - req_id, - }); + return Err(Error::InternalError(format!( + 
"Received download result for req_id {req_id} expecting {expected_req_id}" + ))); } self.status = Status::Downloaded(peer_id, data_column, seen_timestamp); Ok(()) } - other => Err(Error::BadState(format!( - "bad state on_download_success expected Downloading got {other:?}" + other => Err(Error::InternalError(format!( + "bad state on_download_success expected Downloading got {}", + Into::<&'static str>::into(other), ))), } } - fn complete(self) -> Result<(PeerId, Arc>, Duration), Error> { + pub fn complete(self) -> Result<(PeerId, T, Duration), Error> { match self.status { Status::Downloaded(peer_id, data_column, seen_timestamp) => { Ok((peer_id, data_column, seen_timestamp)) } - other => Err(Error::BadState(format!( - "bad state complete expected Downloaded got {other:?}" + other => Err(Error::InternalError(format!( + "bad state complete expected Downloaded got {}", + Into::<&'static str>::into(other), ))), } } diff --git a/beacon_node/network/src/sync/network_context/requests.rs b/beacon_node/network/src/sync/network_context/requests.rs index cd70a2e7ebc..8228ea5d9d5 100644 --- a/beacon_node/network/src/sync/network_context/requests.rs +++ b/beacon_node/network/src/sync/network_context/requests.rs @@ -26,7 +26,7 @@ mod blocks_by_root; mod data_columns_by_range; mod data_columns_by_root; -#[derive(Debug, PartialEq, Eq, IntoStaticStr)] +#[derive(Debug, Clone, PartialEq, Eq, IntoStaticStr)] pub enum LookupVerifyError { NotEnoughResponsesReturned { actual: usize, @@ -177,12 +177,10 @@ impl ActiveRequests { } } - pub fn active_requests_of_peer(&self, peer_id: &PeerId) -> Vec<&K> { + pub fn active_requests(&self) -> impl Iterator { self.requests .iter() - .filter(|(_, request)| &request.peer_id == peer_id) - .map(|(id, _)| id) - .collect() + .map(|(id, request)| (id, &request.peer_id)) } pub fn iter_request_peers(&self) -> impl Iterator + '_ { diff --git a/beacon_node/network/src/sync/peer_sampling.rs b/beacon_node/network/src/sync/peer_sampling.rs index 
59b751787e3..d76c7d2bbc2 100644 --- a/beacon_node/network/src/sync/peer_sampling.rs +++ b/beacon_node/network/src/sync/peer_sampling.rs @@ -98,13 +98,13 @@ impl Sampling { // TODO(das): Should track failed sampling request for some time? Otherwise there's // a risk of a loop with multiple triggers creating the request, then failing, // and repeat. - debug!(?id, "Ignoring duplicate sampling request"); + debug!(%id, "Ignoring duplicate sampling request"); return None; } }; debug!( - ?id, + %id, column_selection = ?request.column_selection(), "Created new sample request" ); @@ -138,7 +138,7 @@ impl Sampling { ) -> Option<(SamplingRequester, SamplingResult)> { let Some(request) = self.requests.get_mut(&id.id) else { // TOOD(das): This log can happen if the request is error'ed early and dropped - debug!(?id, "Sample downloaded event for unknown request"); + debug!(%id, "Sample downloaded event for unknown request"); return None; }; @@ -167,7 +167,7 @@ impl Sampling { ) -> Option<(SamplingRequester, SamplingResult)> { let Some(request) = self.requests.get_mut(&id.id) else { // TOOD(das): This log can happen if the request is error'ed early and dropped - debug!(?id, "Sample verified event for unknown request"); + debug!(%id, "Sample verified event for unknown request"); return None; }; @@ -191,7 +191,7 @@ impl Sampling { ) -> Option<(SamplingRequester, SamplingResult)> { let result = result.transpose(); if let Some(result) = result { - debug!(?id, ?result, "Sampling request completed, removing"); + debug!(%id, ?result, "Sampling request completed, removing"); metrics::inc_counter_vec( &metrics::SAMPLING_REQUEST_RESULT, &[metrics::from_result(&result)], @@ -570,7 +570,7 @@ impl ActiveSamplingRequest { // Send requests. 
let mut sent_request = false; for (peer_id, column_indexes) in column_indexes_to_request { - cx.data_column_lookup_request( + cx.data_columns_by_root_request( DataColumnsByRootRequester::Sampling(SamplingId { id: self.requester_id, sampling_request_id: self.current_sampling_request_id, diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 72598a25405..81f33352f50 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -1,4 +1,5 @@ use beacon_chain::block_verification_types::RpcBlock; +use itertools::Itertools; use lighthouse_network::rpc::methods::BlocksByRangeRequest; use lighthouse_network::service::api_types::Id; use lighthouse_network::PeerId; @@ -17,15 +18,7 @@ const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5; /// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty. const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3; -/// Type of expected batch. -#[derive(Debug, Copy, Clone, Display)] -#[strum(serialize_all = "snake_case")] -pub enum ByRangeRequestType { - BlocksAndColumns, - BlocksAndBlobs, - Blocks, -} - +// TODO(das): Consider merging with PeerGroup #[derive(Clone, Debug)] pub struct BatchPeers { block_peer: PeerId, @@ -53,6 +46,12 @@ impl BatchPeers { pub fn column(&self, index: &ColumnIndex) -> Option<&PeerId> { self.column_peers.get(index) } + + pub fn iter_unique_peers(&self) -> impl Iterator { + std::iter::once(&self.block_peer) + .chain(self.column_peers.values()) + .unique() + } } /// Allows customisation of the above constants used in other sync methods such as BackFillSync. 
diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index ba809a14ba1..abea407b0ed 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -10,7 +10,7 @@ use itertools::Itertools; use lighthouse_network::service::api_types::Id; use lighthouse_network::{PeerAction, PeerId}; use logging::crit; -use std::collections::{btree_map::Entry, BTreeMap, HashSet}; +use std::collections::{btree_map::Entry, BTreeMap, HashMap, HashSet}; use strum::IntoStaticStr; use tracing::{debug, instrument, warn}; use types::{Epoch, EthSpec, Hash256, Slot}; @@ -87,9 +87,11 @@ pub struct SyncingChain { batches: BTreeMap>, /// The peers that agree on the `target_head_slot` and `target_head_root` as a canonical chain - /// and thus available to download this chain from, as well as the batches we are currently - /// requesting. - peers: HashSet, + /// and thus available to download this chain from. + /// + /// Also tracks, per peer, the total requests made to it as part of this SyncingChain + /// `HashMap` + peers: HashMap, /// Starting epoch of the next batch that needs to be downloaded. to_be_downloaded: BatchId, @@ -121,7 +123,40 @@ pub enum ChainSyncingState { Syncing, } +#[cfg(test)] +#[derive(Debug, Eq, PartialEq)] +pub enum BatchStateSummary { + Downloading, + Processing, + AwaitingProcessing, + AwaitingValidation, + Unexpected(&'static str), +} + impl SyncingChain { + /// Returns a summary of batch states for assertions in tests. + #[cfg(test)] + pub fn batches_state(&self) -> Vec<(BatchId, BatchStateSummary)> { + self.batches + .iter() + .map(|(id, batch)| { + let state = match batch.state() { + // A batch is never left in this state, it's only the initial value + BatchState::AwaitingDownload => { + BatchStateSummary::Unexpected("AwaitingDownload") + } + BatchState::Downloading { .. } => BatchStateSummary::Downloading, + BatchState::AwaitingProcessing { ..
} => BatchStateSummary::AwaitingProcessing, + BatchState::Poisoned => BatchStateSummary::Unexpected("Poisoned"), + BatchState::Processing { .. } => BatchStateSummary::Processing, + BatchState::Failed => BatchStateSummary::Unexpected("Failed"), + BatchState::AwaitingValidation { .. } => BatchStateSummary::AwaitingValidation, + }; + (*id, state) + }) + .collect() + } + #[allow(clippy::too_many_arguments)] pub fn new( id: Id, @@ -138,7 +173,7 @@ impl SyncingChain { target_head_slot, target_head_root, batches: BTreeMap::new(), - peers: HashSet::from_iter([peer_id]), + peers: HashMap::from_iter([(peer_id, <_>::default())]), to_be_downloaded: start_epoch, processing_target: start_epoch, optimistic_start: None, @@ -168,7 +203,7 @@ impl SyncingChain { /// Peers currently syncing this chain. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] pub fn peers(&self) -> impl Iterator + '_ { - self.peers.iter().cloned() + self.peers.keys().cloned() } /// Progress in epochs made by the chain @@ -221,6 +256,12 @@ impl SyncingChain { request_id: Id, blocks: Vec>, ) -> ProcessingResult { + // Account for one more request to each of the batch's peers + // TODO(das): this code assumes that we do a single request per peer per RpcBlock + for peer in batch_peers.iter_unique_peers() { + *self.peers.entry(*peer).or_default() += 1; + } + // check if we have this batch let batch = match self.batches.get_mut(&batch_id) { None => { @@ -400,11 +441,6 @@ impl SyncingChain { self.request_batches(network)?; } } - } else if !self.good_peers_on_sampling_subnets(self.processing_target, network) { - // This is to handle the case where no batch was sent for the current processing - // target when there is no sampling peers available. This is a valid state and should not - // return an error.
- return Ok(KeepChain); } else { return Err(RemoveChain::WrongChainState(format!( "Batch not found for current processing target {}", @@ -577,7 +613,7 @@ impl SyncingChain { "Batch failed to download. Dropping chain scoring peers" ); - for peer in self.peers.drain() { + for (peer, _) in self.peers.drain() { network.report_peer(peer, penalty, "faulty_chain"); } Err(RemoveChain::ChainFailed { @@ -842,7 +878,7 @@ impl SyncingChain { network: &mut SyncNetworkContext, peer_id: PeerId, ) -> ProcessingResult { - self.peers.insert(peer_id); + self.peers.insert(peer_id, <_>::default()); self.request_batches(network) } @@ -854,7 +890,6 @@ impl SyncingChain { &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, - peer_id: &PeerId, request_id: Id, err: RpcResponseError, ) -> ProcessingResult { @@ -869,7 +904,6 @@ impl SyncingChain { debug!( batch_epoch = %batch_id, batch_state = ?batch.state(), - %peer_id, %request_id, ?batch_state, "Batch not expecting block" @@ -880,12 +914,13 @@ impl SyncingChain { batch_epoch = %batch_id, batch_state = ?batch.state(), error = ?err, - %peer_id, %request_id, "Batch download error" ); if let BatchOperationOutcome::Failed { blacklist } = - batch.download_failed(Some(*peer_id))? + // TODO(das): Is it necessary for the batch to track failed peers? Can we make this + // mechanism compatible with PeerDAS and before PeerDAS? + batch.download_failed(None)? { return Err(RemoveChain::ChainFailed { blacklist, @@ -896,7 +931,6 @@ impl SyncingChain { } else { debug!( batch_epoch = %batch_id, - %peer_id, %request_id, batch_state, "Batch not found" @@ -937,6 +971,7 @@ impl SyncingChain { }, &synced_peers, &failed_peers, + &self.peers, ) { Ok(request_id) => { // inform the batch about the new request @@ -953,14 +988,7 @@ impl SyncingChain { return Ok(KeepChain); } Err(e) => match e { - // TODO(das): Handle the NoPeer case explicitly and don't drop the batch. 
For - // sync to work properly it must be okay to have "stalled" batches in - // AwaitingDownload state. Currently it will error with invalid state if - // that happens. Sync manager must periodicatlly prune stalled batches like - // we do for lookup sync. Then we can deprecate the redundant - // `good_peers_on_sampling_subnets` checks. - e - @ (RpcRequestSendError::NoPeer(_) | RpcRequestSendError::InternalError(_)) => { + RpcRequestSendError::InternalError(e) => { // NOTE: under normal conditions this shouldn't happen but we handle it anyway warn!(%batch_id, error = ?e, "batch_id" = %batch_id, %batch, "Could not send batch request"); // register the failed download and check if the batch can be retried @@ -1019,11 +1047,6 @@ impl SyncingChain { // check if we have the batch for our optimistic start. If not, request it first. // We wait for this batch before requesting any other batches. if let Some(epoch) = self.optimistic_start { - if !self.good_peers_on_sampling_subnets(epoch, network) { - debug!("Waiting for peers to be available on sampling column subnets"); - return Ok(KeepChain); - } - if let Entry::Vacant(entry) = self.batches.entry(epoch) { let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH); entry.insert(optimistic_batch); @@ -1046,35 +1069,6 @@ impl SyncingChain { Ok(KeepChain) } - /// Checks all sampling column subnets for peers. Returns `true` if there is at least one peer in - /// every sampling column subnet. 
- fn good_peers_on_sampling_subnets( - &self, - epoch: Epoch, - network: &SyncNetworkContext, - ) -> bool { - if network.chain.spec.is_peer_das_enabled_for_epoch(epoch) { - // Require peers on all sampling column subnets before sending batches - let peers_on_all_custody_subnets = network - .network_globals() - .sampling_subnets - .iter() - .all(|subnet_id| { - let peer_count = network - .network_globals() - .peers - .read() - .good_custody_subnet_peer(*subnet_id) - .count(); - - peer_count > 0 - }); - peers_on_all_custody_subnets - } else { - true - } - } - /// Creates the next required batch from the chain. If there are no more batches required, /// `false` is returned. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] @@ -1107,15 +1101,6 @@ impl SyncingChain { return None; } - // don't send batch requests until we have peers on sampling subnets - // TODO(das): this is a workaround to avoid sending out excessive block requests because - // block and data column requests are currently coupled. This can be removed once we find a - // way to decouple the requests and do retries individually, see issue #6258. 
- if !self.good_peers_on_sampling_subnets(self.to_be_downloaded, network) { - debug!("Waiting for peers to be available on custody column subnets"); - return None; - } - // If no batch needs a retry, attempt to send the batch of the next epoch to download let next_batch_id = self.to_be_downloaded; // this batch could have been included already being an optimistic batch diff --git a/beacon_node/network/src/sync/range_sync/chain_collection.rs b/beacon_node/network/src/sync/range_sync/chain_collection.rs index 9f500c61e0b..454f7c02d15 100644 --- a/beacon_node/network/src/sync/range_sync/chain_collection.rs +++ b/beacon_node/network/src/sync/range_sync/chain_collection.rs @@ -54,6 +54,13 @@ pub struct ChainCollection { } impl ChainCollection { + #[cfg(test)] + pub(crate) fn iter(&self) -> impl Iterator> { + self.finalized_chains + .values() + .chain(self.head_chains.values()) + } + pub fn new(beacon_chain: Arc>) -> Self { ChainCollection { beacon_chain, diff --git a/beacon_node/network/src/sync/range_sync/mod.rs b/beacon_node/network/src/sync/range_sync/mod.rs index 1218e0cd09c..e9fb0219c45 100644 --- a/beacon_node/network/src/sync/range_sync/mod.rs +++ b/beacon_node/network/src/sync/range_sync/mod.rs @@ -9,10 +9,9 @@ mod sync_type; pub use batch::{ BatchConfig, BatchInfo, BatchOperationOutcome, BatchPeers, BatchProcessingResult, BatchState, - ByRangeRequestType, }; -pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; #[cfg(test)] -pub use chain_collection::SyncChainStatus; +pub use chain::BatchStateSummary; +pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; pub use range::RangeSync; pub use sync_type::RangeSyncType; diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index e2c076484a5..473e2066cee 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs @@ -39,6 +39,8 @@ //! Each chain is downloaded in batches of blocks. 
The batched blocks are processed sequentially //! and further batches are requested as current blocks are being processed. +#[cfg(test)] +use super::chain::BatchStateSummary; use super::chain::{BatchId, ChainId, RemoveChain, SyncingChain}; use super::chain_collection::{ChainCollection, SyncChainStatus}; use super::sync_type::RangeSyncType; @@ -100,10 +102,23 @@ where } #[cfg(test)] - pub(crate) fn __failed_chains(&mut self) -> Vec { + pub(crate) fn failed_chains(&mut self) -> Vec { self.failed_chains.keys().copied().collect() } + #[cfg(test)] + pub(crate) fn batches_state(&self) -> Vec<(ChainId, BatchId, BatchStateSummary)> { + self.chains + .iter() + .flat_map(|chain| { + chain + .batches_state() + .into_iter() + .map(|(batch_id, state)| (chain.id(), batch_id, state)) + }) + .collect() + } + #[instrument(parent = None, level = "info", fields(component = "range_sync"), @@ -344,7 +359,6 @@ where pub fn inject_error( &mut self, network: &mut SyncNetworkContext, - peer_id: PeerId, batch_id: BatchId, chain_id: ChainId, request_id: Id, @@ -352,7 +366,7 @@ where ) { // check that this request is pending match self.chains.call_by_id(chain_id, |chain| { - chain.inject_error(network, batch_id, &peer_id, request_id, err) + chain.inject_error(network, batch_id, request_id, err) }) { Ok((removed_chain, sync_type)) => { if let Some((removed_chain, remove_reason)) = removed_chain { diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index 5863091cf0e..3e83605a276 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -35,7 +35,7 @@ use lighthouse_network::{ SamplingRequester, SingleLookupReqId, SyncRequestId, }, types::SyncState, - NetworkConfig, NetworkGlobals, PeerId, + NetworkConfig, NetworkGlobals, PeerId, SyncInfo, }; use slot_clock::{SlotClock, TestingSlotClock}; use tokio::sync::mpsc; @@ -53,8 +53,21 @@ const SAMPLING_REQUIRED_SUCCESSES: usize = 2; type 
DCByRootIds = Vec; type DCByRootId = (SyncRequestId, Vec); +pub enum PeersConfig { + SupernodeAndRandom, + SupernodeOnly, +} + impl TestRig { pub fn test_setup() -> Self { + Self::test_setup_with_options(false) + } + + pub fn test_setup_as_supernode() -> Self { + Self::test_setup_with_options(true) + } + + fn test_setup_with_options(is_supernode: bool) -> Self { // Use `fork_from_env` logic to set correct fork epochs let spec = test_spec::(); @@ -83,10 +96,11 @@ impl TestRig { // TODO(das): make the generation of the ENR use the deterministic rng to have consistent // column assignments let network_config = Arc::new(NetworkConfig::default()); - let globals = Arc::new(NetworkGlobals::new_test_globals( + let globals = Arc::new(NetworkGlobals::new_test_globals_as_supernode( Vec::new(), network_config, chain.spec.clone(), + is_supernode, )); let (beacon_processor, beacon_processor_rx) = NetworkBeaconProcessor::null_for_testing( globals, @@ -113,6 +127,7 @@ impl TestRig { network_rx, network_rx_queue: vec![], sync_rx, + sent_blocks_by_range: <_>::default(), rng, network_globals: beacon_processor.network_globals.clone(), sync_manager: SyncManager::new( @@ -244,8 +259,8 @@ impl TestRig { self.sync_manager.active_parent_lookups().len() } - fn active_range_sync_chain(&self) -> (RangeSyncType, Slot, Slot) { - self.sync_manager.get_range_sync_chains().unwrap().unwrap() + fn active_range_sync_chain(&mut self) -> (RangeSyncType, Slot, Slot) { + self.sync_manager.range_sync().state().unwrap().unwrap() } fn assert_single_lookups_count(&self, count: usize) { @@ -355,29 +370,63 @@ impl TestRig { self.expect_empty_network(); } - pub fn new_connected_peer(&mut self) -> PeerId { + // Don't make pub, use `add_connected_peer_testing_only` + fn new_connected_peer(&mut self) -> PeerId { + self.add_connected_peer_testing_only(false) + } + + // Don't make pub, use `add_connected_peer_testing_only` + fn new_connected_supernode_peer(&mut self) -> PeerId { + 
self.add_connected_peer_testing_only(true) + } + + pub fn add_connected_peer_testing_only(&mut self, supernode: bool) -> PeerId { let key = self.determinstic_key(); let peer_id = self .network_globals .peers .write() - .__add_connected_peer_testing_only(false, &self.harness.spec, key); - self.log(&format!("Added new peer for testing {peer_id:?}")); + .__add_connected_peer_testing_only(supernode, &self.harness.spec, key); + let mut peer_custody_subnets = self + .network_globals + .peers + .read() + .peer_info(&peer_id) + .expect("peer was just added") + .custody_subnets_iter() + .map(|subnet| **subnet) + .collect::>(); + peer_custody_subnets.sort_unstable(); + self.log(&format!( + "Added new peer for testing {peer_id:?} custody subnets {peer_custody_subnets:?}" + )); peer_id } - pub fn new_connected_supernode_peer(&mut self) -> PeerId { - let key = self.determinstic_key(); - self.network_globals - .peers - .write() - .__add_connected_peer_testing_only(true, &self.harness.spec, key) + pub fn add_sync_peer(&mut self, supernode: bool, remote_info: SyncInfo) -> PeerId { + let peer_id = self.add_connected_peer_testing_only(supernode); + self.send_sync_message(SyncMessage::AddPeer(peer_id, remote_info)); + peer_id } fn determinstic_key(&mut self) -> CombinedKey { k256::ecdsa::SigningKey::random(&mut self.rng).into() } + pub fn add_sync_peers(&mut self, config: PeersConfig, remote_info: SyncInfo) { + match config { + PeersConfig::SupernodeAndRandom => { + for _ in 0..100 { + self.add_sync_peer(false, remote_info.clone()); + } + self.add_sync_peer(true, remote_info); + } + PeersConfig::SupernodeOnly => { + self.add_sync_peer(true, remote_info); + } + } + } + pub fn new_connected_peers_for_peerdas(&mut self) { // Enough sampling peers with few columns for _ in 0..100 { @@ -840,6 +889,19 @@ impl TestRig { } } + // Find, not pop + pub fn filter_received_network_events) -> Option>( + &mut self, + predicate_transform: F, + ) -> Vec { + self.drain_network_rx(); + + 
self.network_rx_queue + .iter() + .filter_map(predicate_transform) + .collect() + } + pub fn pop_received_processor_event) -> Option>( &mut self, predicate_transform: F, @@ -1088,6 +1150,21 @@ impl TestRig { } } + pub fn expect_no_penalty_for_anyone(&mut self) { + self.drain_network_rx(); + let downscore_events = self + .network_rx_queue + .iter() + .filter_map(|ev| match ev { + NetworkMessage::ReportPeer { peer_id, msg, .. } => Some((peer_id, msg)), + _ => None, + }) + .collect::>(); + if !downscore_events.is_empty() { + panic!("Expected no downscoring events but found: {downscore_events:?}"); + } + } + #[track_caller] fn expect_parent_chain_process(&mut self) { match self.beacon_processor_rx.try_recv() { @@ -1123,6 +1200,25 @@ impl TestRig { } } + #[track_caller] + pub fn expect_penalties(&mut self, expected_penalty_msg: &'static str) { + let all_penalties = self.filter_received_network_events(|ev| match ev { + NetworkMessage::ReportPeer { peer_id, msg, .. } => Some((*peer_id, *msg)), + _ => None, + }); + if all_penalties + .iter() + .any(|(_, msg)| *msg != expected_penalty_msg) + { + panic!( + "Expected penalties only of {expected_penalty_msg}, but found {all_penalties:?}" + ); + } + self.log(&format!( + "Found expected penalties {expected_penalty_msg}: {all_penalties:?}" + )); + } + #[track_caller] pub fn expect_penalty(&mut self, peer_id: PeerId, expect_penalty_msg: &'static str) { let penalty_msg = self diff --git a/beacon_node/network/src/sync/tests/mod.rs b/beacon_node/network/src/sync/tests/mod.rs index ec24ddb036a..a09313c5021 100644 --- a/beacon_node/network/src/sync/tests/mod.rs +++ b/beacon_node/network/src/sync/tests/mod.rs @@ -6,13 +6,17 @@ use beacon_chain::builder::Witness; use beacon_chain::eth1_chain::CachingEth1Backend; use beacon_chain::test_utils::{BeaconChainHarness, EphemeralHarnessType}; use beacon_processor::WorkEvent; +use lighthouse_network::service::api_types::ComponentsByRangeRequestId; use lighthouse_network::NetworkGlobals; use 
rand_chacha::ChaCha20Rng; use slot_clock::ManualSlotClock; +use std::collections::HashMap; use std::sync::Arc; use store::MemoryStore; use tokio::sync::mpsc; -use types::{ChainSpec, ForkName, MinimalEthSpec as E}; +use types::{ChainSpec, ForkName, MinimalEthSpec as E, SignedBeaconBlock}; + +pub use lookups::PeersConfig; mod lookups; mod range; @@ -64,4 +68,7 @@ struct TestRig { rng: ChaCha20Rng, fork_name: ForkName, spec: Arc, + + // Cache of sent blocks for PeerDAS responses + sent_blocks_by_range: HashMap>>>, } diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 06dca355e53..c82e4f97769 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -2,27 +2,33 @@ use super::*; use crate::network_beacon_processor::ChainSegmentProcessId; use crate::status::ToStatusMessage; use crate::sync::manager::SLOT_IMPORT_TOLERANCE; -use crate::sync::network_context::RangeRequestId; -use crate::sync::range_sync::RangeSyncType; -use crate::sync::SyncMessage; +use crate::sync::network_context::{BlockComponentsByRangeRequestStep, RangeRequestId}; +use crate::sync::range_sync::{BatchId, BatchStateSummary, RangeSyncType}; +use crate::sync::{ChainId, SyncMessage}; use beacon_chain::data_column_verification::CustodyDataColumn; -use beacon_chain::test_utils::{AttestationStrategy, BlockStrategy}; +use beacon_chain::test_utils::{test_spec, AttestationStrategy, BlockStrategy}; use beacon_chain::{block_verification_types::RpcBlock, EngineState, NotifyExecutionLayer}; use beacon_processor::WorkType; +use lighthouse_network::discovery::{peer_id_to_node_id, CombinedKey}; use lighthouse_network::rpc::methods::{ BlobsByRangeRequest, DataColumnsByRangeRequest, OldBlocksByRangeRequest, - OldBlocksByRangeRequestV2, }; use lighthouse_network::rpc::{RequestType, StatusMessage}; use lighthouse_network::service::api_types::{ - AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, 
DataColumnsByRangeRequestId, - SyncRequestId, + AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, + DataColumnsByRangeRequestId, SyncRequestId, }; -use lighthouse_network::{PeerId, SyncInfo}; +use lighthouse_network::types::SyncState; +use lighthouse_network::{Enr, EnrExt, PeerId, SyncInfo}; +use rand::SeedableRng; +use rand_chacha::ChaCha20Rng; +use std::collections::HashSet; use std::time::Duration; +use types::data_column_custody_group::compute_subnets_for_node; use types::{ - BlobSidecarList, BlockImportSource, Epoch, EthSpec, Hash256, MinimalEthSpec as E, - SignedBeaconBlock, SignedBeaconBlockHash, Slot, + BeaconBlock, BlobSidecarList, BlockImportSource, ColumnIndex, DataColumnSidecar, + DataColumnSubnetId, Epoch, EthSpec, Hash256, KzgCommitment, MinimalEthSpec as E, Signature, + SignedBeaconBlock, SignedBeaconBlockHash, Slot, VariableList, }; const D: Duration = Duration::new(0, 0); @@ -34,10 +40,43 @@ pub(crate) enum DataSidecars { enum ByRangeDataRequestIds { PreDeneb, - PrePeerDAS(BlobsByRangeRequestId, PeerId), - PostPeerDAS(Vec<(DataColumnsByRangeRequestId, PeerId)>), + PrePeerDAS(BlobsByRangeRequestId, PeerId, BlobsByRangeRequest), + PostPeerDAS( + Vec<( + DataColumnsByRangeRequestId, + PeerId, + DataColumnsByRangeRequest, + )>, + ), } +impl ByRangeDataRequestIds { + fn peer(&self) -> PeerId { + match self { + Self::PreDeneb => panic!("no requests PreDeneb"), + Self::PrePeerDAS(_, peer, _) => *peer, + Self::PostPeerDAS(reqs) => { + if reqs.len() != 1 { + panic!("Should have 1 PostPeerDAS request"); + } + reqs.first().expect("no PostPeerDAS requests").1 + } + } + } +} + +struct Config { + peers: PeersConfig, +} + +type BlocksByRangeRequestData = (BlocksByRangeRequestId, PeerId, OldBlocksByRangeRequest); + +type DataColumnsByRangeRequestData = ( + DataColumnsByRangeRequestId, + PeerId, + DataColumnsByRangeRequest, +); + /// Sync tests are usually written in the form: /// - Do some action /// - Expect a request to be 
sent @@ -46,10 +85,11 @@ enum ByRangeDataRequestIds { /// To make writting tests succint, the machinery in this testing rig automatically identifies /// _which_ request to complete. Picking the right request is critical for tests to pass, so this /// filter allows better expressivity on the criteria to identify the right request. -#[derive(Default, Debug, Clone)] +#[derive(Default, Debug, Clone, Copy)] struct RequestFilter { peer: Option, epoch: Option, + column_index: Option, } impl RequestFilter { @@ -62,13 +102,117 @@ impl RequestFilter { self.epoch = Some(epoch); self } + + fn column_index(mut self, index: u64) -> Self { + self.column_index = Some(index); + self + } + + fn blocks_by_range_requests( + &self, + ev: &NetworkMessage, + ) -> Option { + match ev { + NetworkMessage::SendRequest { + peer_id, + request: RequestType::BlocksByRange(req), + app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRange(id)), + } if self.matches_blocks_by_range(peer_id, req) => Some((*id, *peer_id, req.clone())), + _ => None, + } + } + + fn data_columns_by_range_requests( + &self, + ev: &NetworkMessage, + ) -> Option { + match ev { + NetworkMessage::SendRequest { + peer_id, + request: RequestType::DataColumnsByRange(req), + app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), + } if self.matches_data_columns_by_range(peer_id, req) => { + Some((*id, *peer_id, req.clone())) + } + _ => None, + } + } + + fn matches_blocks_by_range(&self, peer: &PeerId, req: &OldBlocksByRangeRequest) -> bool { + self.matches_common(peer, *req.start_slot()) + } + + fn matches_blobs_by_range(&self, peer: &PeerId, req: &BlobsByRangeRequest) -> bool { + self.matches_common(peer, req.start_slot) + } + + fn matches_data_columns_by_range( + &self, + peer: &PeerId, + req: &DataColumnsByRangeRequest, + ) -> bool { + if let Some(index) = self.column_index { + if !req.columns.contains(&index) { + return false; + } + } + self.matches_common(peer, req.start_slot) + } + + fn 
matches_common(&self, peer: &PeerId, start_slot: u64) -> bool { + if let Some(expected_epoch) = self.epoch { + let epoch = Slot::new(start_slot).epoch(E::slots_per_epoch()).as_u64(); + if epoch != expected_epoch { + return false; + } + } + if let Some(expected_peer) = self.peer { + if *peer != expected_peer { + return false; + } + } + true + } } fn filter() -> RequestFilter { RequestFilter::default() } +/// Instruct the testing rig how to complete requests for _by_range requests +#[derive(Debug, Clone, Copy)] +struct CompleteConfig { + block_count: usize, + with_data: bool, + custody_failure_at_index: Option, +} + +impl CompleteConfig { + // TODO(das): add tests where blocks don't have data + + fn custody_failure_at_index(mut self, index: u64) -> Self { + self.custody_failure_at_index = Some(index); + self + } +} + +fn complete() -> CompleteConfig { + CompleteConfig { + block_count: 1, + with_data: true, + custody_failure_at_index: None, + } +} + impl TestRig { + fn our_custody_indices(&self) -> Vec { + self.network_globals + .sampling_columns + .iter() + .copied() + .collect() + } + /// Produce a head peer with an advanced head fn add_head_peer(&mut self) -> PeerId { self.add_head_peer_with_root(Hash256::random()) @@ -77,7 +221,7 @@ impl TestRig { /// Produce a head peer with an advanced head fn add_head_peer_with_root(&mut self, head_root: Hash256) -> PeerId { let local_info = self.local_info(); - self.add_random_peer(SyncInfo { + self.add_connected_sync_random_peer(SyncInfo { head_root, head_slot: local_info.head_slot + 1 + Slot::new(SLOT_IMPORT_TOLERANCE as u64), ..local_info @@ -93,7 +237,7 @@ impl TestRig { fn add_finalized_peer_with_root(&mut self, finalized_root: Hash256) -> PeerId { let local_info = self.local_info(); let finalized_epoch = local_info.finalized_epoch + 2; - self.add_random_peer(SyncInfo { + self.add_connected_sync_random_peer(SyncInfo { finalized_epoch, finalized_root, head_slot: finalized_epoch.start_slot(E::slots_per_epoch()), @@ -128,37 
+272,22 @@ impl TestRig { } } - fn add_random_peer_not_supernode(&mut self, remote_info: SyncInfo) -> PeerId { - let peer_id = self.new_connected_peer(); - self.send_sync_message(SyncMessage::AddPeer(peer_id, remote_info)); - peer_id + fn add_connected_sync_peer_not_supernode(&mut self, remote_info: SyncInfo) -> PeerId { + self.add_sync_peer(false, remote_info) } - fn add_random_peer(&mut self, remote_info: SyncInfo) -> PeerId { + fn add_connected_sync_random_peer(&mut self, remote_info: SyncInfo) -> PeerId { // Create valid peer known to network globals // TODO(fulu): Using supernode peers to ensure we have peer across all column // subnets for syncing. Should add tests connecting to full node peers. - let peer_id = self.new_connected_supernode_peer(); - // Send peer to sync - self.send_sync_message(SyncMessage::AddPeer(peer_id, remote_info)); - peer_id + self.add_sync_peer(true, remote_info) } - fn add_random_peers(&mut self, remote_info: SyncInfo, count: usize) { - for _ in 0..count { - let peer = self.new_connected_peer(); - self.add_peer(peer, remote_info.clone()); - } - } - - fn add_peer(&mut self, peer: PeerId, remote_info: SyncInfo) { - self.send_sync_message(SyncMessage::AddPeer(peer, remote_info)); - } - - fn assert_state(&self, state: RangeSyncType) { + fn assert_state(&mut self, state: RangeSyncType) { assert_eq!( self.sync_manager - .range_sync_state() + .range_sync() + .state() .expect("State is ok") .expect("Range should be syncing, there are no chains") .0, @@ -167,15 +296,28 @@ impl TestRig { ); } - fn assert_no_chains_exist(&self) { - if let Some(chain) = self.sync_manager.get_range_sync_chains().unwrap() { + fn get_sync_state(&mut self) -> SyncState { + self.sync_manager.network().network_globals().sync_state() + } + + fn get_batch_states(&mut self) -> Vec<(ChainId, BatchId, BatchStateSummary)> { + self.sync_manager.range_sync().batches_state() + } + + fn assert_sync_state(&mut self) { + let current_state = 
self.sync_manager.network().network_globals().sync_state(); + panic!("{:?}", current_state); + } + + fn assert_no_chains_exist(&mut self) { + if let Some(chain) = self.sync_manager.range_sync().state().unwrap() { panic!("There still exists a chain {chain:?}"); } } fn assert_no_failed_chains(&mut self) { assert_eq!( - self.sync_manager.__range_failed_chains(), + self.sync_manager.range_sync().failed_chains(), Vec::::new(), "Expected no failed chains" ) @@ -191,110 +333,359 @@ impl TestRig { } } + fn expect_blocks_by_range_requests(&mut self, request_filter: RequestFilter) { + let events = + self.filter_received_network_events(|ev| request_filter.blocks_by_range_requests(ev)); + if events.is_empty() { + panic!("Expected to find blocks_by_range requests {request_filter:?}") + } + } + + fn expect_no_data_columns_by_range_requests(&mut self, request_filter: RequestFilter) { + let events = self + .filter_received_network_events(|ev| request_filter.data_columns_by_range_requests(ev)); + if !events.is_empty() { + panic!("Expected to not find data_columns_by_range requests {request_filter:?} by found {events:?}") + } + } + + fn expect_active_block_components_by_range_request_on_custody_step(&mut self) { + let requests = self + .sync_manager + .network() + .active_block_components_by_range_requests(); + if requests.is_empty() { + panic!("No active block_components_by_range requests"); + } + for (id, step) in requests { + if !matches!(step, BlockComponentsByRangeRequestStep::CustodyRequest) { + panic!("block_components_by_range request {id} is not on CustodyRequest step: {step:?}"); + } + } + } + + fn expect_no_active_block_components_by_range_requests(&mut self) { + let requests = self + .sync_manager + .network() + .active_block_components_by_range_requests(); + if !requests.is_empty() { + panic!("Still active block_components_by_range requests {requests:?}"); + } + } + + fn expect_no_active_rpc_requests(&mut self) { + let requests = self + .sync_manager + .network() + 
.active_requests() + .collect::>(); + if !requests.is_empty() { + panic!("There are still active RPC requests {requests:?}"); + } + } + + fn expect_all_batches_in_state(&mut self, states: &[BatchStateSummary]) { + let batches = self.get_batch_states(); + if batches.is_empty() { + panic!("no batches"); + } + for batch in &batches { + if !states.contains(&batch.2) { + panic!("batch {batch:?} not in state {states:?}. Batches: {batches:?}"); + } + } + } + + fn expect_all_batches_downloading(&mut self) { + self.expect_all_batches_in_state(&[BatchStateSummary::Downloading]); + } + + fn expect_all_batches_processing_or_awaiting(&mut self) { + self.expect_all_batches_in_state(&[ + BatchStateSummary::Processing, + BatchStateSummary::AwaitingProcessing, + ]); + } + fn update_execution_engine_state(&mut self, state: EngineState) { self.log(&format!("execution engine state updated: {state:?}")); self.sync_manager.update_execution_engine_state(state); } - fn find_blocks_by_range_request( - &mut self, - request_filter: RequestFilter, - ) -> ((BlocksByRangeRequestId, PeerId), ByRangeDataRequestIds) { - let filter_f = |peer: PeerId, start_slot: u64| { - if let Some(expected_epoch) = request_filter.epoch { - let epoch = Slot::new(start_slot).epoch(E::slots_per_epoch()).as_u64(); - if epoch != expected_epoch { - return false; - } + fn zero_block_at_slot(&mut self, slot: Slot, with_data: bool) -> Arc> { + let mut block = BeaconBlock::empty(&self.spec); + if with_data { + if let Ok(blob_kzg_commitments) = block.body_mut().blob_kzg_commitments_mut() { + blob_kzg_commitments + .push(KzgCommitment([0; 48])) + .expect("pushed to empty kzg commitments"); } - if let Some(expected_peer) = request_filter.peer { - if peer != expected_peer { - return false; - } - } - true - }; + } + *block.slot_mut() = slot; + Arc::new(SignedBeaconBlock::from_block(block, Signature::empty())) + } - let block_req = self - .pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id, - 
request: - RequestType::BlocksByRange(OldBlocksByRangeRequest::V2( - OldBlocksByRangeRequestV2 { start_slot, .. }, - )), - app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRange(id)), - } if filter_f(*peer_id, *start_slot) => Some((*id, *peer_id)), - _ => None, - }) + fn last_sent_blocks_by_range( + &mut self, + id: ComponentsByRangeRequestId, + ) -> Vec>> { + self.sent_blocks_by_range + .get(&id) + .cloned() + .unwrap_or_else(|| panic!("No blocks for ComponentsByRangeRequestId {id}")) + } + + fn send_blocks_by_range_response( + &mut self, + req_id: BlocksByRangeRequestId, + peer_id: PeerId, + blocks: &[Arc>], + ) { + let slots = blocks.iter().map(|block| block.slot()).collect::>(); + self.log(&format!( + "Completing BlocksByRange request {req_id} to {peer_id} with blocks {slots:?}" + )); + + for block in blocks { + self.send_sync_message(SyncMessage::RpcBlock { + sync_request_id: SyncRequestId::BlocksByRange(req_id), + peer_id, + beacon_block: Some(block.clone()), + seen_timestamp: D, + }); + } + self.send_sync_message(SyncMessage::RpcBlock { + sync_request_id: SyncRequestId::BlocksByRange(req_id), + peer_id, + beacon_block: None, + seen_timestamp: D, + }); + + if self + .sent_blocks_by_range + .insert(req_id.parent_request_id, blocks.to_vec()) + .is_some() + { + panic!("Sent two blocks_by_range requests in the same epoch. 
We need better tracking"); + } + } + + fn send_data_columns_by_range_response( + &mut self, + id: DataColumnsByRangeRequestId, + peer_id: PeerId, + data_columns: &[Arc>], + ) { + let mut ids = data_columns + .iter() + .map(|d| (d.slot().as_u64(), d.index)) + .collect::>(); + ids.sort_unstable(); + self.log(&format!( + "Completing DataColumnsByRange request {id} to {peer_id} with data_columns {ids:?}" + )); + + for data_column in data_columns { + self.send_sync_message(SyncMessage::RpcDataColumn { + sync_request_id: SyncRequestId::DataColumnsByRange(id), + peer_id, + data_column: Some(data_column.clone()), + seen_timestamp: D, + }); + } + self.send_sync_message(SyncMessage::RpcDataColumn { + sync_request_id: SyncRequestId::DataColumnsByRange(id), + peer_id, + data_column: None, + seen_timestamp: D, + }); + } + + fn pop_blocks_by_range_request( + &mut self, + request_filter: RequestFilter, + ) -> (BlocksByRangeRequestId, PeerId, OldBlocksByRangeRequest) { + self.pop_received_network_event(|ev| request_filter.blocks_by_range_requests(ev)) .unwrap_or_else(|e| { panic!("Should have a BlocksByRange request, filter {request_filter:?}: {e:?}") - }); + }) + } - let by_range_data_requests = if self.after_fulu() { - let mut data_columns_requests = vec![]; - while let Ok(data_columns_request) = self.pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id, - request: - RequestType::DataColumnsByRange(DataColumnsByRangeRequest { - start_slot, .. 
- }), - app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), - } if filter_f(*peer_id, *start_slot) => Some((*id, *peer_id)), - _ => None, - }) { - data_columns_requests.push(data_columns_request); - } + fn pop_data_columns_by_range_requests( + &mut self, + request_filter: RequestFilter, + ) -> Vec<( + DataColumnsByRangeRequestId, + PeerId, + DataColumnsByRangeRequest, + )> { + let mut data_columns_requests = vec![]; + while let Ok(data_columns_request) = + self.pop_received_network_event(|ev| request_filter.data_columns_by_range_requests(ev)) + { + data_columns_requests.push(data_columns_request); + } + data_columns_requests + } + + fn find_data_by_range_request( + &mut self, + request_filter: RequestFilter, + ) -> ByRangeDataRequestIds { + if self.after_fulu() { + let data_columns_requests = self.pop_data_columns_by_range_requests(request_filter); if data_columns_requests.is_empty() { panic!("Found zero DataColumnsByRange requests, filter {request_filter:?}"); } ByRangeDataRequestIds::PostPeerDAS(data_columns_requests) } else if self.after_deneb() { - let (id, peer) = self + let (id, peer, req) = self .pop_received_network_event(|ev| match ev { NetworkMessage::SendRequest { peer_id, - request: RequestType::BlobsByRange(BlobsByRangeRequest { start_slot, .. 
}), + request: RequestType::BlobsByRange(req), app_request_id: AppRequestId::Sync(SyncRequestId::BlobsByRange(id)), - } if filter_f(*peer_id, *start_slot) => Some((*id, *peer_id)), + } if request_filter.matches_blobs_by_range(peer_id, req) => { + Some((*id, *peer_id, req.clone())) + } _ => None, }) .unwrap_or_else(|e| { panic!("Should have a blobs by range request, filter {request_filter:?}: {e:?}") }); - ByRangeDataRequestIds::PrePeerDAS(id, peer) + ByRangeDataRequestIds::PrePeerDAS(id, peer, req) } else { ByRangeDataRequestIds::PreDeneb - }; + } + } - (block_req, by_range_data_requests) + fn find_and_complete_block_components_by_range_request( + &mut self, + request_filter: RequestFilter, + complete_config: CompleteConfig, + ) -> RangeRequestId { + let id = self.find_and_complete_blocks_by_range_request(request_filter, complete_config); + self.find_and_complete_data_by_range_request(request_filter, complete_config); + id } fn find_and_complete_blocks_by_range_request( &mut self, request_filter: RequestFilter, + complete_config: CompleteConfig, ) -> RangeRequestId { - let ((blocks_req_id, block_peer), by_range_data_request_ids) = - self.find_blocks_by_range_request(request_filter); + let (blocks_req_id, block_peer, blocks_req) = + self.pop_blocks_by_range_request(request_filter); - // Complete the request with a single stream termination - self.log(&format!( - "Completing BlocksByRange request {blocks_req_id:?} with empty stream" - )); - self.send_sync_message(SyncMessage::RpcBlock { - sync_request_id: SyncRequestId::BlocksByRange(blocks_req_id), - peer_id: block_peer, - beacon_block: None, - seen_timestamp: D, - }); + let start_slot = Slot::new(*blocks_req.start_slot()); + let blocks = (0..complete_config.block_count) + .map(|i| { + self.zero_block_at_slot(start_slot + Slot::new(i as u64), complete_config.with_data) + }) + .collect::>(); + self.send_blocks_by_range_response(blocks_req_id, block_peer, &blocks); + + blocks_req_id.parent_request_id.requester + } + + 
fn complete_blocks_by_range_request( + &mut self, + request: BlocksByRangeRequestData, + complete_config: CompleteConfig, + ) -> RangeRequestId { + let (blocks_req_id, block_peer, blocks_req) = request; + let start_slot = Slot::new(*blocks_req.start_slot()); + let blocks = (0..complete_config.block_count) + .map(|i| { + self.zero_block_at_slot(start_slot + Slot::new(i as u64), complete_config.with_data) + }) + .collect::>(); + self.send_blocks_by_range_response(blocks_req_id, block_peer, &blocks); + + blocks_req_id.parent_request_id.requester + } + + fn complete_data_columns_by_range_request( + &mut self, + (id, peer_id, req): DataColumnsByRangeRequestData, + complete_config: CompleteConfig, + ) { + // To reply with a valid DataColumnsByRange we need to construct + // DataColumnsByRange for the block root that we requested the block peer, plus + // figure out which exact columns we requested this peer + + let components_by_range_req_id = id.parent_request_id.parent_request_id; + let blocks = self.last_sent_blocks_by_range(components_by_range_req_id); + + let data_columns = blocks + .iter() + .flat_map(|block| { + let kzg_commitments_inclusion_proof = block + .message() + .body() + .kzg_commitments_merkle_proof() + .unwrap(); + let kzg_commitments = block + .message() + .body() + .blob_kzg_commitments() + .unwrap() + .clone(); + let signed_block_header = block.signed_block_header(); + + req.columns.iter().filter_map(move |index| { + // Skip column generation if index is marked as failure + if complete_config.custody_failure_at_index == Some(*index) { + return None; + } + + // We need to produce a DataColumn with valid inclusion proof, but can + // be with random KZG proof and data as we won't send it for processing + Some(Arc::new(DataColumnSidecar { + index: *index, + column: VariableList::empty(), + kzg_commitments: kzg_commitments.clone(), + kzg_proofs: VariableList::from(vec![]), + signed_block_header: signed_block_header.clone(), + 
kzg_commitments_inclusion_proof: kzg_commitments_inclusion_proof.clone(), + })) + }) + }) + .collect::>(); + + // Need to log here because I can't capture &mut self inside the columns iter + if !blocks.is_empty() { + if let Some(index) = complete_config.custody_failure_at_index { + self.log(&format!( + "Forced custody failure at request {id} for peer {peer_id} index {index:?}" + )); + } + } + + self.send_data_columns_by_range_response(id, peer_id, &data_columns); + } + fn find_and_complete_data_by_range_request( + &mut self, + request_filter: RequestFilter, + complete_config: CompleteConfig, + ) { + let by_range_data_request_ids = self.find_data_by_range_request(request_filter); + self.complete_data_by_range_request(by_range_data_request_ids, complete_config); + } + + fn complete_data_by_range_request( + &mut self, + by_range_data_request_ids: ByRangeDataRequestIds, + complete_config: CompleteConfig, + ) { match by_range_data_request_ids { ByRangeDataRequestIds::PreDeneb => {} - ByRangeDataRequestIds::PrePeerDAS(id, peer_id) => { + ByRangeDataRequestIds::PrePeerDAS(id, peer_id, req) => { // Complete the request with a single stream termination self.log(&format!( - "Completing BlobsByRange request {id:?} with empty stream" + "Completing BlobsByRange request {id} {req:?} with empty stream" )); self.send_sync_message(SyncMessage::RpcBlob { sync_request_id: SyncRequestId::BlobsByRange(id), @@ -305,21 +696,89 @@ impl TestRig { } ByRangeDataRequestIds::PostPeerDAS(data_column_req_ids) => { // Complete the request with a single stream termination - for (id, peer_id) in data_column_req_ids { - self.log(&format!( - "Completing DataColumnsByRange request {id:?} with empty stream" - )); - self.send_sync_message(SyncMessage::RpcDataColumn { - sync_request_id: SyncRequestId::DataColumnsByRange(id), - peer_id, - data_column: None, - seen_timestamp: D, - }); + for (id, peer_id, req) in data_column_req_ids { + // To reply with a valid DataColumnsByRange we need to construct + // 
DataColumnsByRange for the block root that we requested the block peer, plus + // figure out which exact columns we requested this peer + + let components_by_range_req_id = id.parent_request_id.parent_request_id; + let blocks = self.last_sent_blocks_by_range(components_by_range_req_id); + + let data_columns = blocks + .iter() + .flat_map(|block| { + let kzg_commitments_inclusion_proof = block + .message() + .body() + .kzg_commitments_merkle_proof() + .unwrap(); + let kzg_commitments = block + .message() + .body() + .blob_kzg_commitments() + .unwrap() + .clone(); + let signed_block_header = block.signed_block_header(); + + req.columns.iter().filter_map(move |index| { + // Skip column generation if index is marked as failure + if complete_config.custody_failure_at_index == Some(*index) { + return None; + } + + // We need to produce a DataColumn with valid inclusion proof, but can + // be with random KZG proof and data as we won't send it for processing + Some(Arc::new(DataColumnSidecar { + index: *index, + column: VariableList::empty(), + kzg_commitments: kzg_commitments.clone(), + kzg_proofs: VariableList::from(vec![]), + signed_block_header: signed_block_header.clone(), + kzg_commitments_inclusion_proof: + kzg_commitments_inclusion_proof.clone(), + })) + }) + }) + .collect::>(); + + // Need to log here because I can't capture &mut self inside the columns iter + if !blocks.is_empty() { + if let Some(index) = complete_config.custody_failure_at_index { + self.log(&format!("Forced custody failure at request {id} for peer {peer_id} index {index:?}")); + } + } + + self.send_data_columns_by_range_response(id, peer_id, &data_columns); } } } + } - blocks_req_id.parent_request_id.requester + fn progress_until_no_events( + &mut self, + request_filter: RequestFilter, + complete_config: CompleteConfig, + ) { + loop { + if let Ok(request) = + self.pop_received_network_event(|ev| request_filter.blocks_by_range_requests(ev)) + { + self.complete_blocks_by_range_request(request, 
complete_config); + continue; + } + + if let Ok(request) = self + .pop_received_network_event(|ev| request_filter.data_columns_by_range_requests(ev)) + { + self.complete_data_columns_by_range_request(request, complete_config); + continue; + } + + let sync_state = self.get_sync_state(); + self.log(&format!("Progressed sync, current state: {:?}", sync_state,)); + + return; + } } fn find_and_complete_processing_chain_segment(&mut self, id: ChainSegmentProcessId) { @@ -344,15 +803,18 @@ impl TestRig { &mut self, last_epoch: u64, request_filter: RequestFilter, + complete_config: CompleteConfig, ) { for epoch in 0..last_epoch { // Note: In this test we can't predict the block peer - let id = - self.find_and_complete_blocks_by_range_request(request_filter.clone().epoch(epoch)); + let id = self.find_and_complete_block_components_by_range_request( + request_filter.epoch(epoch), + complete_config, + ); if let RangeRequestId::RangeSync { batch_id, .. } = id { assert_eq!(batch_id.as_u64(), epoch, "Unexpected batch_id"); } else { - panic!("unexpected RangeRequestId {id:?}"); + panic!("unexpected RangeRequestId {id}"); } let id = match id { @@ -476,14 +938,14 @@ fn head_chain_removed_while_finalized_syncing() { rig.assert_state(RangeSyncType::Head); // Sync should have requested a batch, grab the request. - let _ = rig.find_blocks_by_range_request(filter().peer(head_peer)); + let _ = rig.pop_blocks_by_range_request(filter().peer(head_peer)); // Now get a peer with an advanced finalized epoch. let finalized_peer = rig.add_finalized_peer(); rig.assert_state(RangeSyncType::Finalized); // Sync should have requested a batch, grab the request - let _ = rig.find_blocks_by_range_request(filter().peer(finalized_peer)); + let _ = rig.pop_blocks_by_range_request(filter().peer(finalized_peer)); // Fail the head chain by disconnecting the peer. 
rig.peer_disconnected(head_peer); @@ -510,14 +972,14 @@ async fn state_update_while_purging() { rig.assert_state(RangeSyncType::Head); // Sync should have requested a batch, grab the request. - let _ = rig.find_blocks_by_range_request(filter().peer(head_peer)); + let _ = rig.pop_blocks_by_range_request(filter().peer(head_peer)); // Now get a peer with an advanced finalized epoch. let finalized_peer = rig.add_finalized_peer_with_root(finalized_peer_root); rig.assert_state(RangeSyncType::Finalized); // Sync should have requested a batch, grab the request - let _ = rig.find_blocks_by_range_request(filter().peer(finalized_peer)); + let _ = rig.pop_blocks_by_range_request(filter().peer(finalized_peer)); // Now the chain knows both chains target roots. rig.remember_block(head_peer_block).await; @@ -536,7 +998,10 @@ fn pause_and_resume_on_ee_offline() { // make the ee offline rig.update_execution_engine_state(EngineState::Offline); // send the response to the request - rig.find_and_complete_blocks_by_range_request(filter().peer(peer1).epoch(0)); + rig.find_and_complete_block_components_by_range_request( + filter().peer(peer1).epoch(0), + complete(), + ); // the beacon processor shouldn't have received any work rig.expect_empty_processor(); @@ -547,7 +1012,7 @@ fn pause_and_resume_on_ee_offline() { // Don't filter requests and the columns requests may be sent to peer1 or peer2 // We need to filter by epoch, because the previous batch eagerly sent requests for the next // epoch for the other batch. So we can either filter by epoch of by sync type. - rig.find_and_complete_blocks_by_range_request(filter().epoch(0)); + rig.find_and_complete_block_components_by_range_request(filter().epoch(0), complete()); // the beacon processor shouldn't have received any work rig.expect_empty_processor(); // make the beacon processor available again. 
@@ -576,19 +1041,34 @@ fn finalized_sync_enough_global_custody_peers_few_chain_peers() { // Current priorization only sends batches to idle peers, so we need enough peers for each batch // TODO: Test this with a single peer in the chain, it should still work - r.add_random_peers( - remote_info, - (advanced_epochs + EXTRA_SYNCED_EPOCHS) as usize, - ); + r.add_sync_peer(false, remote_info); r.assert_state(RangeSyncType::Finalized); let last_epoch = advanced_epochs + EXTRA_SYNCED_EPOCHS; - r.complete_and_process_range_sync_until(last_epoch, filter()); + r.complete_and_process_range_sync_until(last_epoch, filter(), complete()); } +// Same test with different types of peers: +// - 100 peers +// - 1 supernode +// - perfectly distributed peer ids + #[test] -fn finalized_sync_not_enough_custody_peers_on_start() { - let mut r = TestRig::test_setup(); +fn finalized_sync_not_enough_custody_peers_on_start_supernode_only() { + finalized_sync_not_enough_custody_peers_on_start(Config { + peers: PeersConfig::SupernodeOnly, + }); +} + +#[test] +fn finalized_sync_not_enough_custody_peers_on_start_supernode_and_random() { + finalized_sync_not_enough_custody_peers_on_start(Config { + peers: PeersConfig::SupernodeAndRandom, + }); +} + +fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { + let mut r = TestRig::test_setup_as_supernode(); // Only run post-PeerDAS if !r.fork_name.fulu_enabled() { return; @@ -599,24 +1079,159 @@ fn finalized_sync_not_enough_custody_peers_on_start() { // Unikely that the single peer we added has enough columns for us. Tests are determinstic and // this error should never be hit - r.add_random_peer_not_supernode(remote_info.clone()); + r.add_connected_sync_peer_not_supernode(remote_info.clone()); r.assert_state(RangeSyncType::Finalized); - // Because we don't have enough peers on all columns we haven't sent any request. 
- // NOTE: There's a small chance that this single peer happens to custody exactly the set we - // expect, in that case the test will fail. Find a way to make the test deterministic. - r.expect_empty_network(); + // The SyncingChain has a single peer, so it can issue blocks_by_range requests. However, it + // doesn't have enough peers to cover all columns + r.progress_until_no_events(filter(), complete()); + r.expect_no_active_rpc_requests(); + + // Here we have a batch with partially completed block_components_by_range requests. The batch + // should not have failed, we are still syncing, and there are no downscoring events. + r.expect_no_penalty_for_anyone(); + r.expect_active_block_components_by_range_request_on_custody_step(); // Generate enough peers and supernodes to cover all custody columns - r.new_connected_peers_for_peerdas(); + r.add_sync_peers(config.peers, remote_info.clone()); // Note: not necessary to add this peers to the chain, as we draw from the global pool // We still need to add enough peers to trigger batch downloads with idle peers. Same issue as // the test above. - r.add_random_peers( - remote_info, - (advanced_epochs + EXTRA_SYNCED_EPOCHS - 1) as usize, + + r.progress_until_no_events(filter(), complete()); + r.expect_no_active_rpc_requests(); + r.expect_no_active_block_components_by_range_requests(); + // TODO(das): For now these tests don't complete sync. We can't track beacon processor Work + // events from here easily. What we pop from the beacon processor queue is an opaque closure + // without any information. We don't know what batch it is for.
+} + +#[test] +fn finalized_sync_single_custody_peer_failure() { + let mut r = TestRig::test_setup(); + // Only run post-PeerDAS + if !r.fork_name.fulu_enabled() { + return; + } + + let advanced_epochs: u64 = 2; + let remote_info = r.finalized_remote_info_advanced_by(advanced_epochs.into()); + let column_index_to_fail = r.our_custody_indices().first().copied().unwrap(); + + r.add_sync_peer(true, remote_info.clone()); + r.assert_state(RangeSyncType::Finalized); + + // Progress all blocks_by_range and columns_by_range requests but respond empty for a single + // column index + r.progress_until_no_events( + filter(), + complete().custody_failure_at_index(column_index_to_fail), + ); + r.expect_penalties("custody_failure"); + + // Some peer had a custody failure, but since there's a single peer in the batch we won't issue + // another request yet. + r.expect_no_active_rpc_requests(); + // Ensure that the block components by range request have not failed + r.expect_active_block_components_by_range_request_on_custody_step(); + r.expect_all_batches_downloading(); + + // After adding a new peer we will try to fetch from it + r.add_sync_peer(true, remote_info.clone()); + r.progress_until_no_events( + // Find the requests first to assert that this is the only request that exists + filter().column_index(column_index_to_fail), + // complete this one request without the custody failure now + complete(), ); - let last_epoch = advanced_epochs + EXTRA_SYNCED_EPOCHS; - r.complete_and_process_range_sync_until(last_epoch, filter()); + r.expect_no_active_rpc_requests(); + r.expect_no_active_block_components_by_range_requests(); + r.expect_all_batches_processing_or_awaiting(); +} + +#[test] +fn finalized_sync_permanent_custody_peer_failure() { + let mut r = TestRig::test_setup(); + // Only run post-PeerDAS + if !r.fork_name.fulu_enabled() { + return; + } + + let advanced_epochs: u64 = 2; + let remote_info = r.finalized_remote_info_advanced_by(advanced_epochs.into()); + let 
column_index_to_fail = r.our_custody_indices().first().copied().unwrap(); + const PEERS_IN_BATCH: usize = 4; + + for _ in 0..PEERS_IN_BATCH { + r.add_connected_sync_random_peer(remote_info.clone()); + } + r.assert_state(RangeSyncType::Finalized); + + // Some peer had a custody failure at `column_index` so sync should do a single extra request + // for that index and epoch. + r.find_and_complete_block_components_by_range_request( + filter().epoch(0), + complete().custody_failure_at_index(column_index_to_fail), + ); + + let mut requested_peers = HashSet::new(); + + for i in 0..PEERS_IN_BATCH - 1 { + r.log(&format!("Loop {i} of custody failure round")); + + // Some peer had a custody failure at `column_index` so sync should do a single extra request + // for that index and epoch. We want to make sure that the request goes to a different peer + // than the attempts before. + let reqs = + r.find_data_by_range_request(filter().epoch(0).column_index(column_index_to_fail)); + let req_peer = reqs.peer(); + if requested_peers.contains(&req_peer) { + panic!("Re-requested the same peer {req_peer} again after a custody failure"); + } + requested_peers.insert(req_peer); + + // Find the requests first to assert that this is the only request that exists + r.expect_no_data_columns_by_range_requests(filter().epoch(0)); + // complete this one request without the custody failure now + r.complete_data_by_range_request( + reqs, + complete().custody_failure_at_index(column_index_to_fail), + ); + } + + // TODO(das): send batch 1 for completing processing and check that SyncingChain processed batch + // 1 successfully +} + +#[test] +#[ignore] +fn mine_peerids() { + let spec = test_spec::(); + let mut rng = ChaCha20Rng::from_seed([0u8; 32]); + + let expected_subnets = (0..3) + .map(|i| DataColumnSubnetId::new(i as u64)) + .collect::>(); + + for i in 0..usize::MAX { + let key: CombinedKey = k256::ecdsa::SigningKey::random(&mut rng).into(); + let enr = Enr::builder().build(&key).unwrap(); + let
peer_id = enr.peer_id(); + // Use default custody groups count + let node_id = peer_id_to_node_id(&peer_id).expect("convert peer_id to node_id"); + let subnets = compute_subnets_for_node(node_id.raw(), spec.custody_requirement, &spec) + .expect("should compute custody subnets"); + if expected_subnets == subnets { + panic!("{:?}", subnets); + } else { + let matches = expected_subnets + .iter() + .filter(|index| subnets.contains(index)) + .count(); + if matches > 0 { + println!("{i} {:?}", matches); + } + } + } } diff --git a/consensus/types/src/signed_beacon_block.rs b/consensus/types/src/signed_beacon_block.rs index 85bed35a19c..de572014edc 100644 --- a/consensus/types/src/signed_beacon_block.rs +++ b/consensus/types/src/signed_beacon_block.rs @@ -321,6 +321,10 @@ impl> SignedBeaconBlock .unwrap_or(0) } + pub fn has_data(&self) -> bool { + self.num_expected_blobs() > 0 + } + /// Used for displaying commitments in logs. pub fn commitments_formatted(&self) -> String { let Ok(commitments) = self.message().body().blob_kzg_commitments() else { From 801659d4ae200600305787e0538b6ba0559ac98e Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 22 May 2025 01:06:57 -0500 Subject: [PATCH 02/66] Resolve some TODOs --- .../src/sync/network_context/block_components_by_range.rs | 2 -- .../network/src/sync/network_context/custody_by_range.rs | 3 +-- .../network/src/sync/network_context/custody_by_root.rs | 5 +---- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index 4545806a05e..45e5091665b 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -191,7 +191,6 @@ impl BlockComponentsByRangeRequest { blocks_by_range_request, } => { if let Some((blocks, block_peer)) 
= blocks_by_range_request.to_finished() { - // TODO(das): use the peer group let peer_group = BatchPeers::new_from_block_peer(*block_peer); let rpc_blocks = couple_blocks_base( blocks.to_vec(), @@ -226,7 +225,6 @@ impl BlockComponentsByRangeRequest { blocks_by_range_request, } => { if let Some((blocks, block_peer)) = blocks_by_range_request.to_finished() { - // TODO(das): use the peer group let blocks_with_data = blocks .iter() .filter(|block| block.has_data()) diff --git a/beacon_node/network/src/sync/network_context/custody_by_range.rs b/beacon_node/network/src/sync/network_context/custody_by_range.rs index 9f8e163ba47..22d0d02d984 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_range.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_range.rs @@ -425,8 +425,7 @@ impl ActiveCustodyByRangeRequest { // - Add a new peer that custodies the missing columns // - Call `continue_requests` // - // Otherwise this request should be dropped and failed after some time. - // TODO(das): implement the above + // Otherwise this request will be dropped and failed after some time. 
} } } diff --git a/beacon_node/network/src/sync/network_context/custody_by_root.rs b/beacon_node/network/src/sync/network_context/custody_by_root.rs index c547837fc7f..3b7b373790f 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_root.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_root.rs @@ -249,7 +249,6 @@ impl ActiveCustodyByRootRequest { let active_request_count_by_peer = cx.active_request_count_by_peer(); let mut columns_to_request_by_peer = HashMap::>::new(); let lookup_peers = self.lookup_peers.read(); - let mut indices_without_peers = vec![]; // Need to: // - track how many active requests a peer has for load balancing @@ -304,9 +303,7 @@ impl ActiveCustodyByRootRequest { // - Add a new peer that custodies the missing columns // - Call `continue_requests` // - // Otherwise this request should be dropped and failed after some time. - // TODO(das): implement the above - indices_without_peers.push(column_index); + // Otherwise this request will be dropped and failed after some time. 
} } } From b383f7af536329ef99989fe3390b83659d47339c Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 26 May 2025 18:37:20 -0500 Subject: [PATCH 03/66] More comments --- beacon_node/network/src/sync/manager.rs | 4 +- .../network/src/sync/network_context.rs | 158 ++++++++++-------- .../block_components_by_range.rs | 11 +- 3 files changed, 100 insertions(+), 73 deletions(-) diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 0cf17c7b899..adcc177b8b2 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -1301,11 +1301,11 @@ impl SyncManager { range_request_id: ComponentsByRangeRequestId, range_block_component: RangeBlockComponent, ) { - if let Some(resp) = self + if let Some(result) = self .network .on_block_components_by_range_response(range_request_id, range_block_component) { - match resp { + match result { Ok((blocks, batch_peers)) => { match range_request_id.requester { RangeRequestId::RangeSync { chain_id, batch_id } => { diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index d7ad9d3eb7a..3bc81924378 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -507,75 +507,6 @@ impl SyncNetworkContext { Ok(id.id) } - /// Received a blocks by range or blobs by range response for a request that couples blocks ' - /// and blobs. - #[allow(clippy::type_complexity)] - pub fn on_block_components_by_range_response( - &mut self, - id: ComponentsByRangeRequestId, - range_block_component: RangeBlockComponent, - ) -> Option>, BatchPeers), RpcResponseError>> { - // Note: need to remove the request to borrow self again below. 
Otherwise we can't - // do nested requests - let Some(mut request) = self.block_components_by_range_requests.remove(&id) else { - metrics::inc_counter_vec( - &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, - &["block_components_by_range"], - ); - return None; - }; - - let result = match range_block_component { - RangeBlockComponent::Block(req_id, resp, peer_id) => resp.and_then(|(blocks, _)| { - request - .on_blocks_by_range_result(req_id, blocks, peer_id, self) - .map_err(Into::::into) - }), - RangeBlockComponent::Blob(req_id, resp, peer_id) => resp.and_then(|(blobs, _)| { - request - .on_blobs_by_range_result(req_id, blobs, peer_id, self) - .map_err(Into::::into) - }), - RangeBlockComponent::CustodyColumns(req_id, resp, peers) => { - resp.and_then(|(custody_columns, _)| { - request - .on_custody_by_range_result(req_id, custody_columns, peers, self) - .map_err(Into::::into) - }) - } - }; - - let result = result.transpose(); - - // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to - // an Option first to use in an `if let Some() { act on result }` block. - match result.as_ref() { - Some(Ok((blocks, peer_group))) => { - let blocks_with_data = blocks - .iter() - .filter(|block| block.as_block().has_data()) - .count(); - // Don't log the peer_group here, it's very long (could be up to 128 peers). 
If you - // want to trace which peer sent the column at index X, search for the log: - // `Sync RPC request sent method="DataColumnsByRange" ...` - debug!( - %id, - blocks = blocks.len(), - blocks_with_data, - block_peer = ?peer_group.block(), - "Block components by range request success, removing" - ) - } - Some(Err(e)) => { - debug!(%id, error = ?e, "Block components by range request failure, removing" ) - } - None => { - self.block_components_by_range_requests.insert(id, request); - } - } - result - } - /// Request block of `block_root` if necessary by checking: /// - If the da_checker has a pending block from gossip or a previous request /// @@ -1220,6 +1151,8 @@ impl SyncNetworkContext { // Request handlers + /// Processes a single `RpcEvent` blocks_by_root RPC request. + /// Same logic as [`on_blocks_by_range_response`] but it converts a `Vec` into a `Block` pub(crate) fn on_single_block_response( &mut self, id: SingleLookupReqId, @@ -1242,6 +1175,8 @@ impl SyncNetworkContext { self.on_rpc_response_result(id, "BlocksByRoot", resp, peer_id, |_| 1) } + /// Processes a single `RpcEvent` blobs_by_root RPC request. + /// Same logic as [`on_blocks_by_range_response`] pub(crate) fn on_single_blob_response( &mut self, id: SingleLookupReqId, @@ -1271,6 +1206,8 @@ impl SyncNetworkContext { self.on_rpc_response_result(id, "BlobsByRoot", resp, peer_id, |_| 1) } + /// Processes a single `RpcEvent` for a data_columns_by_root RPC request. + /// Same logic as [`on_blocks_by_range_response`] #[allow(clippy::type_complexity)] pub(crate) fn on_data_columns_by_root_response( &mut self, @@ -1284,6 +1221,10 @@ impl SyncNetworkContext { self.on_rpc_response_result(id, "DataColumnsByRoot", resp, peer_id, |_| 1) } + /// Processes a single `RpcEvent` for a blocks_by_range RPC request. 
+ /// - If the event completes the request, it returns `Some(Ok)` with a vec of blocks + /// - If the event is an error it fails the request and returns `Some(Err)` + /// - else it appends the response chunk to the active request state and returns `None` #[allow(clippy::type_complexity)] pub(crate) fn on_blocks_by_range_response( &mut self, @@ -1295,6 +1236,8 @@ impl SyncNetworkContext { self.on_rpc_response_result(id, "BlocksByRange", resp, peer_id, |b| b.len()) } + /// Processes a single `RpcEvent` for a blobs_by_range RPC request. + /// Same logic as [`on_blocks_by_range_response`] #[allow(clippy::type_complexity)] pub(crate) fn on_blobs_by_range_response( &mut self, @@ -1306,6 +1249,8 @@ impl SyncNetworkContext { self.on_rpc_response_result(id, "BlobsByRangeRequest", resp, peer_id, |b| b.len()) } + /// Processes a single `RpcEvent` for a data_columns_by_range RPC request. + /// Same logic as [`on_blocks_by_range_response`] #[allow(clippy::type_complexity)] pub(crate) fn on_data_columns_by_range_response( &mut self, @@ -1319,6 +1264,8 @@ impl SyncNetworkContext { self.on_rpc_response_result(id, "DataColumnsByRange", resp, peer_id, |d| d.len()) } + /// Common logic for `on_*_response` handlers. Ensures we have consistent logging and metrics + /// and peer reporting for all request types. fn on_rpc_response_result usize>( &mut self, id: I, @@ -1475,6 +1422,79 @@ impl SyncNetworkContext { result } + /// Processes the result of an `*_by_range` RPC request issued by a + /// block_components_by_range_request. + /// + /// - If the result completes the request, it returns `Some(Ok)` with a vec of coupled RpcBlocks + /// - If the result fails the request, it returns `Some(Err)`. Note that a failed request may + /// not fail the block_components_by_range_request as it implements retries. 
+ /// - else it appends the result to the active request state and returns `None` + #[allow(clippy::type_complexity)] + pub fn on_block_components_by_range_response( + &mut self, + id: ComponentsByRangeRequestId, + range_block_component: RangeBlockComponent, + ) -> Option>, BatchPeers), RpcResponseError>> { + // Note: need to remove the request to borrow self again below. Otherwise we can't + // do nested requests + let Some(mut request) = self.block_components_by_range_requests.remove(&id) else { + metrics::inc_counter_vec( + &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, + &["block_components_by_range"], + ); + return None; + }; + + let result = match range_block_component { + RangeBlockComponent::Block(req_id, resp, peer_id) => resp.and_then(|(blocks, _)| { + request + .on_blocks_by_range_result(req_id, blocks, peer_id, self) + .map_err(Into::::into) + }), + RangeBlockComponent::Blob(req_id, resp, peer_id) => resp.and_then(|(blobs, _)| { + request + .on_blobs_by_range_result(req_id, blobs, peer_id, self) + .map_err(Into::::into) + }), + RangeBlockComponent::CustodyColumns(req_id, resp, peers) => { + resp.and_then(|(custody_columns, _)| { + request + .on_custody_by_range_result(req_id, custody_columns, peers, self) + .map_err(Into::::into) + }) + } + } + // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to + // an Option first to use in an `if let Some() { act on result }` block. + .transpose(); + + match result.as_ref() { + Some(Ok((blocks, peer_group))) => { + let blocks_with_data = blocks + .iter() + .filter(|block| block.as_block().has_data()) + .count(); + // Don't log the peer_group here, it's very long (could be up to 128 peers). 
If you + // want to trace which peer sent the column at index X, search for the log: + // `Sync RPC request sent method="DataColumnsByRange" ...` + debug!( + %id, + blocks = blocks.len(), + blocks_with_data, + block_peer = ?peer_group.block(), + "Block components by range request success, removing" + ) + } + Some(Err(e)) => { + debug!(%id, error = ?e, "Block components by range request failure, removing" ) + } + None => { + self.block_components_by_range_requests.insert(id, request); + } + } + result + } + pub fn send_block_for_processing( &self, id: Id, diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index 45e5091665b..7c8e59eb970 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -19,6 +19,10 @@ use types::{ SignedBeaconBlock, Slot, }; +/// Given a `BlocksByRangeRequest` (a range of slots) fetches all necessary data to return +/// potentially available RpcBlocks. +/// +/// See [`State`] for the set of `*_by_range` it may issue depending on the fork. pub struct BlockComponentsByRangeRequest { id: ComponentsByRangeRequestId, peers: Arc>>, @@ -31,13 +35,16 @@ enum State { blocks_by_range_request: ByRangeRequest>>>, }, - // Two single concurrent requests for block + blobs + // Two single concurrent requests for block + blobs. As of now we request blocks and blobs to + // the same peer, so we can attribute coupling errors to the same unique peer. DenebEnabled { blocks_by_range_request: ByRangeRequest>>>, blobs_by_range_request: ByRangeRequest>>>, }, - // Request blocks first, then columns + // Request blocks first, then columns. Assuming the block peer is honest we can attribute + // custody failures to the peers serving us columns. 
We want to get rid of the honest block + // peer assumption in the future, see https://github.com/sigp/lighthouse/issues/6258 FuluEnabled(FuluEnabledState), } From 7d0fb93274cf5bf47e5fd662743ce1813106e497 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 26 May 2025 18:49:45 -0500 Subject: [PATCH 04/66] Reduce conversions --- .../network/src/sync/network_context.rs | 42 +++++++++---------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 3bc81924378..458ff755d2c 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -1349,22 +1349,20 @@ impl SyncNetworkContext { ); let _enter = span.enter(); - let result = result.map_err(Into::::into).transpose(); - - // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to - // an Option first to use in an `if let Some() { act on result }` block. - match result.as_ref() { - Some(Ok((columns, peer_group, _))) => { + match &result { + Ok(Some((columns, peer_group, _))) => { debug!(%id, count = columns.len(), peers = ?peer_group, "Custody by root request success, removing") } - Some(Err(e)) => { - debug!(%id, error = ?e, "Custody by root request failure, removing" ) + Err(e) => { + debug!(%id, error = ?e, "Custody by root request failure, removing") } - None => { + Ok(None) => { self.custody_by_root_requests.insert(id, request); } } - result + // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to + // an Option first to use in an `if let Some() { act on result }` block. + result.map_err(Into::::into).transpose() } /// Insert a downloaded column into an active custody request. Then make progress on the @@ -1385,8 +1383,10 @@ impl SyncNetworkContext { // Note: need to remove the request to borrow self again below. 
Otherwise we can't // do nested requests let Some(mut request) = self.custody_by_range_requests.remove(&id) else { - // TOOD(das): This log can happen if the request is error'ed early and dropped - debug!(%id, "Custody by range downloaded event for unknown request"); + metrics::inc_counter_vec( + &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, + &["custody_by_range"], + ); return None; }; @@ -1401,25 +1401,23 @@ impl SyncNetworkContext { request: ActiveCustodyByRangeRequest, result: CustodyByRangeRequestResult, ) -> Option> { - let result = result.map_err(Into::::into).transpose(); - - // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to - // an Option first to use in an `if let Some() { act on result }` block. - match result.as_ref() { - Some(Ok((columns, _peer_group, _))) => { + match &result { + Ok(Some((columns, _peer_group, _))) => { // Don't log the peer_group here, it's very long (could be up to 128 peers). If you // want to trace which peer sent the column at index X, search for the log: // `Sync RPC request sent method="DataColumnsByRange" ...` debug!(%id, count = columns.len(), "Custody by range request success, removing") } - Some(Err(e)) => { - debug!(%id, error = ?e, "Custody by range request failure, removing" ) + Err(e) => { + debug!(%id, error = ?e, "Custody by range request failure, removing") } - None => { + Ok(None) => { self.custody_by_range_requests.insert(id, request); } } - result + // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to + // an Option first to use in an `if let Some() { act on result }` block. 
+ result.map_err(Into::::into).transpose() } /// Processes the result of an `*_by_range` RPC request issued by a From c8a0c9e37932d67ee74da255a796d367725ea93b Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 26 May 2025 19:04:50 -0500 Subject: [PATCH 05/66] Remove CustodyByRoot and CustodyByRange types --- beacon_node/network/src/sync/manager.rs | 7 +- .../network/src/sync/network_context.rs | 68 +++++++++++-------- 2 files changed, 41 insertions(+), 34 deletions(-) diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index adcc177b8b2..b5e936d7e80 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -36,8 +36,7 @@ use super::backfill_sync::{BackFillSync, ProcessResult, SyncStart}; use super::block_lookups::BlockLookups; use super::network_context::{ - CustodyByRangeResult, CustodyByRootResult, RangeBlockComponent, RangeRequestId, RpcEvent, - SyncNetworkContext, + CustodyRequestResult, RangeBlockComponent, RangeRequestId, RpcEvent, SyncNetworkContext, }; use super::peer_sampling::{Sampling, SamplingConfig, SamplingResult}; use super::peer_sync_info::{remote_sync_type, PeerSyncType}; @@ -1236,7 +1235,7 @@ impl SyncManager { fn on_custody_by_range_result( &mut self, id: CustodyByRangeRequestId, - result: CustodyByRangeResult, + result: CustodyRequestResult, ) { // TODO(das): Improve the type of RangeBlockComponent::CustodyColumns, not // not have to pass a PeerGroup in case of error @@ -1259,7 +1258,7 @@ impl SyncManager { fn on_custody_by_root_result( &mut self, requester: CustodyRequester, - response: CustodyByRootResult, + response: CustodyRequestResult, ) { self.block_lookups .on_download_response::>( diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 458ff755d2c..ce2f91a391d 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ 
b/beacon_node/network/src/sync/network_context.rs @@ -1,8 +1,8 @@ //! Provides network functionality for the Syncing thread. This fundamentally wraps a network //! channel and stores a global RPC ID to perform requests. -use self::custody_by_range::{ActiveCustodyByRangeRequest, CustodyByRangeRequestResult}; -use self::custody_by_root::{ActiveCustodyByRootRequest, CustodyByRootRequestResult}; +use self::custody_by_range::ActiveCustodyByRangeRequest; +use self::custody_by_root::ActiveCustodyByRootRequest; pub use self::requests::{BlocksByRootSingleRequest, DataColumnsByRootSingleBlockRequest}; use super::manager::BlockProcessType; use super::range_sync::BatchPeers; @@ -75,12 +75,12 @@ impl RpcEvent { pub type RpcResponseResult = Result<(T, Duration), RpcResponseError>; -pub type RpcResponseBatchResult = Result<(T, PeerGroup, Duration), RpcResponseError>; - /// Duration = latest seen timestamp of all received data columns -pub type CustodyByRootResult = RpcResponseBatchResult>; +pub type RpcResponseBatchResult = Result<(T, PeerGroup, Duration), RpcResponseError>; -pub type CustodyByRangeResult = RpcResponseBatchResult>; +/// Common result type for `custody_by_root` and `custody_by_range` requests. The peers are part of +/// the `Ok` response since they are not known until the entire request succeeds. +pub type CustodyRequestResult = RpcResponseBatchResult>; #[derive(Debug, Clone)] pub enum RpcResponseError { @@ -1102,7 +1102,7 @@ impl SyncNetworkContext { /// attempt. 
pub fn continue_custody_by_root_requests( &mut self, - ) -> Vec<(CustodyRequester, CustodyByRootResult)> { + ) -> Vec<(CustodyRequester, CustodyRequestResult)> { let ids = self .custody_by_root_requests .keys() @@ -1116,7 +1116,10 @@ impl SyncNetworkContext { .custody_by_root_requests .remove(&id) .expect("key of hashmap"); - let result = request.continue_requests(self); + let result = request + .continue_requests(self) + .map_err(Into::::into) + .transpose(); self.handle_custody_by_root_result(id, request, result) .map(|result| (id, result)) }) @@ -1128,7 +1131,7 @@ impl SyncNetworkContext { /// attempt. pub fn continue_custody_by_range_requests( &mut self, - ) -> Vec<(CustodyByRangeRequestId, CustodyByRangeResult)> { + ) -> Vec<(CustodyByRangeRequestId, CustodyRequestResult)> { let ids = self .custody_by_range_requests .keys() @@ -1142,7 +1145,10 @@ impl SyncNetworkContext { .custody_by_range_requests .remove(&id) .expect("key of hashmap"); - let result = request.continue_requests(self); + let result = request + .continue_requests(self) + .map_err(Into::::into) + .transpose(); self.handle_custody_by_range_result(id, request, result) .map(|result| (id, result)) }) @@ -1313,7 +1319,7 @@ impl SyncNetworkContext { req_id: DataColumnsByRootRequestId, peer_id: PeerId, resp: RpcResponseResult>>>, - ) -> Option> { + ) -> Option> { let span = span!( Level::INFO, "SyncNetworkContext", @@ -1331,7 +1337,10 @@ impl SyncNetworkContext { return None; }; - let result = request.on_data_column_downloaded(peer_id, req_id, resp, self); + let result = request + .on_data_column_downloaded(peer_id, req_id, resp, self) + .map_err(Into::::into) + .transpose(); self.handle_custody_by_root_result(id.requester, request, result) } @@ -1340,8 +1349,8 @@ impl SyncNetworkContext { &mut self, id: CustodyRequester, request: ActiveCustodyByRootRequest, - result: CustodyByRootRequestResult, - ) -> Option> { + result: Option>, + ) -> Option> { let span = span!( Level::INFO, "SyncNetworkContext", @@ 
-1350,19 +1359,17 @@ impl SyncNetworkContext { let _enter = span.enter(); match &result { - Ok(Some((columns, peer_group, _))) => { + Some(Ok((columns, peer_group, _))) => { debug!(%id, count = columns.len(), peers = ?peer_group, "Custody by root request success, removing") } - Err(e) => { + Some(Err(e)) => { debug!(%id, error = ?e, "Custody by root request failure, removing") } - Ok(None) => { + None => { self.custody_by_root_requests.insert(id, request); } } - // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to - // an Option first to use in an `if let Some() { act on result }` block. - result.map_err(Into::::into).transpose() + result } /// Insert a downloaded column into an active custody request. Then make progress on the @@ -1379,7 +1386,7 @@ impl SyncNetworkContext { req_id: DataColumnsByRangeRequestId, peer_id: PeerId, resp: RpcResponseResult>>>, - ) -> Option> { + ) -> Option> { // Note: need to remove the request to borrow self again below. Otherwise we can't // do nested requests let Some(mut request) = self.custody_by_range_requests.remove(&id) else { @@ -1390,7 +1397,10 @@ impl SyncNetworkContext { return None; }; - let result = request.on_data_column_downloaded(peer_id, req_id, resp, self); + let result = request + .on_data_column_downloaded(peer_id, req_id, resp, self) + .map_err(Into::::into) + .transpose(); self.handle_custody_by_range_result(id, request, result) } @@ -1399,25 +1409,23 @@ impl SyncNetworkContext { &mut self, id: CustodyByRangeRequestId, request: ActiveCustodyByRangeRequest, - result: CustodyByRangeRequestResult, - ) -> Option> { + result: Option>, + ) -> Option> { match &result { - Ok(Some((columns, _peer_group, _))) => { + Some(Ok((columns, _peer_group, _))) => { // Don't log the peer_group here, it's very long (could be up to 128 peers). 
If you // want to trace which peer sent the column at index X, search for the log: // `Sync RPC request sent method="DataColumnsByRange" ...` debug!(%id, count = columns.len(), "Custody by range request success, removing") } - Err(e) => { + Some(Err(e)) => { debug!(%id, error = ?e, "Custody by range request failure, removing") } - Ok(None) => { + None => { self.custody_by_range_requests.insert(id, request); } } - // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to - // an Option first to use in an `if let Some() { act on result }` block. - result.map_err(Into::::into).transpose() + result } /// Processes the result of an `*_by_range` RPC request issued by a From 01329ab2303fee85ef115fd0f745e47e7662ba47 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 26 May 2025 19:07:15 -0500 Subject: [PATCH 06/66] Improve RangeBlockComponent type --- beacon_node/network/src/sync/manager.rs | 14 +------------- beacon_node/network/src/sync/network_context.rs | 10 +++------- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index b5e936d7e80..5c72ac6d124 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -1237,21 +1237,9 @@ impl SyncManager { id: CustodyByRangeRequestId, result: CustodyRequestResult, ) { - // TODO(das): Improve the type of RangeBlockComponent::CustodyColumns, not - // not have to pass a PeerGroup in case of error - let peers = match &result { - Ok((_, peers, _)) => peers.clone(), - // TODO(das): this PeerGroup with no peers incorrect - Err(_) => PeerGroup::from_set(<_>::default()), - }; - self.on_block_components_by_range_response( id.parent_request_id, - RangeBlockComponent::CustodyColumns( - id, - result.map(|(data, _peers, timestamp)| (data, timestamp)), - peers, - ), + RangeBlockComponent::CustodyColumns(id, result), ); } diff --git 
a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index ce2f91a391d..3197fcf13e9 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -240,11 +240,7 @@ pub enum RangeBlockComponent { RpcResponseResult>>>, PeerId, ), - CustodyColumns( - CustodyByRangeRequestId, - RpcResponseResult>>>, - PeerGroup, - ), + CustodyColumns(CustodyByRangeRequestId, CustodyRequestResult), } #[cfg(test)] @@ -1462,8 +1458,8 @@ impl SyncNetworkContext { .on_blobs_by_range_result(req_id, blobs, peer_id, self) .map_err(Into::::into) }), - RangeBlockComponent::CustodyColumns(req_id, resp, peers) => { - resp.and_then(|(custody_columns, _)| { + RangeBlockComponent::CustodyColumns(req_id, resp) => { + resp.and_then(|(custody_columns, peers, _)| { request .on_custody_by_range_result(req_id, custody_columns, peers, self) .map_err(Into::::into) From 34b37b97ed5c0b5aa0665780e216ec3d3a169ad0 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 27 May 2025 00:37:12 -0500 Subject: [PATCH 07/66] Remove unused module --- .../src/sync/block_sidecar_coupling.rs | 588 ------------------ 1 file changed, 588 deletions(-) delete mode 100644 beacon_node/network/src/sync/block_sidecar_coupling.rs diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs deleted file mode 100644 index 68f15491256..00000000000 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ /dev/null @@ -1,588 +0,0 @@ -use beacon_chain::{ - block_verification_types::RpcBlock, data_column_verification::CustodyDataColumn, get_block_root, -}; -use lighthouse_network::{ - service::api_types::{ - BlobsByRangeRequestId, BlocksByRangeRequestId, DataColumnsByRangeRequestId, - }, - PeerId, -}; -use std::{ - collections::{HashMap, HashSet}, - sync::Arc, -}; -use types::{ - BlobSidecar, ChainSpec, ColumnIndex, 
DataColumnSidecar, DataColumnSidecarList, EthSpec, - Hash256, RuntimeVariableList, SignedBeaconBlock, Slot, -}; - -use super::range_sync::BatchPeers; - -pub struct RangeBlockComponentsRequest { - /// Blocks we have received awaiting for their corresponding sidecar. - blocks_request: ByRangeRequest>>>, - /// Sidecars we have received awaiting for their corresponding block. - block_data_request: RangeBlockDataRequest, -} - -enum ByRangeRequest { - Active(I), - Complete(T, PeerId), -} - -enum RangeBlockDataRequest { - /// All pre-deneb blocks - NoData, - /// All post-Deneb blocks, regardless of if they have data or not - Blobs(ByRangeRequest>>>), - /// All post-Fulu blocks, regardless of if they have data or not - DataColumns { - requests: HashMap< - DataColumnsByRangeRequestId, - ByRangeRequest>, - >, - expected_column_to_peer: HashMap, - }, -} - -impl RangeBlockComponentsRequest { - pub fn new( - blocks_req_id: BlocksByRangeRequestId, - blobs_req_id: Option, - data_columns: Option<( - Vec, - HashMap, - )>, - ) -> Self { - let block_data_request = if let Some(blobs_req_id) = blobs_req_id { - RangeBlockDataRequest::Blobs(ByRangeRequest::Active(blobs_req_id)) - } else if let Some((requests, expected_column_to_peer)) = data_columns { - RangeBlockDataRequest::DataColumns { - requests: requests - .into_iter() - .map(|id| (id, ByRangeRequest::Active(id))) - .collect(), - expected_column_to_peer, - } - } else { - RangeBlockDataRequest::NoData - }; - - Self { - blocks_request: ByRangeRequest::Active(blocks_req_id), - block_data_request, - } - } - - pub fn add_blocks( - &mut self, - req_id: BlocksByRangeRequestId, - blocks: Vec>>, - peer_id: PeerId, - ) -> Result<(), String> { - self.blocks_request.finish(req_id, blocks, peer_id) - } - - pub fn add_blobs( - &mut self, - req_id: BlobsByRangeRequestId, - blobs: Vec>>, - peer_id: PeerId, - ) -> Result<(), String> { - match &mut self.block_data_request { - RangeBlockDataRequest::NoData => Err("received blobs but expected no 
data".to_owned()), - RangeBlockDataRequest::Blobs(ref mut req) => req.finish(req_id, blobs, peer_id), - RangeBlockDataRequest::DataColumns { .. } => { - Err("received blobs but expected data columns".to_owned()) - } - } - } - - pub fn add_custody_columns( - &mut self, - req_id: DataColumnsByRangeRequestId, - columns: Vec>>, - peer_id: PeerId, - ) -> Result<(), String> { - match &mut self.block_data_request { - RangeBlockDataRequest::NoData => { - Err("received data columns but expected no data".to_owned()) - } - RangeBlockDataRequest::Blobs(_) => { - Err("received data columns but expected blobs".to_owned()) - } - RangeBlockDataRequest::DataColumns { - ref mut requests, .. - } => { - let req = requests - .get_mut(&req_id) - .ok_or(format!("unknown data columns by range req_id {req_id}"))?; - req.finish(req_id, columns, peer_id) - } - } - } - - /// If all internal requests are complete returns a Vec of coupled RpcBlocks - #[allow(clippy::type_complexity)] - pub fn responses( - &self, - spec: &ChainSpec, - ) -> Option>, BatchPeers), String>> { - let Some((blocks, &block_peer)) = self.blocks_request.to_finished() else { - return None; - }; - - match &self.block_data_request { - RangeBlockDataRequest::NoData => Some( - Self::responses_with_blobs(blocks.to_vec(), vec![], spec) - .map(|blocks| (blocks, BatchPeers::new_from_block_peer(block_peer))), - ), - RangeBlockDataRequest::Blobs(request) => { - let Some((blobs, _blob_peer)) = request.to_finished() else { - return None; - }; - Some( - Self::responses_with_blobs(blocks.to_vec(), blobs.to_vec(), spec) - .map(|blocks| (blocks, BatchPeers::new_from_block_peer(block_peer))), - ) - } - RangeBlockDataRequest::DataColumns { - requests, - expected_column_to_peer, - } => { - let mut data_columns = vec![]; - let mut column_peers = HashMap::new(); - for req in requests.values() { - let Some((resp_columns, column_peer)) = req.to_finished() else { - return None; - }; - data_columns.extend(resp_columns.clone()); - for column in 
resp_columns { - column_peers.insert(column.index, *column_peer); - } - } - - Some( - Self::responses_with_custody_columns( - blocks.to_vec(), - data_columns, - expected_column_to_peer.clone(), - spec, - ) - .map(|blocks| (blocks, BatchPeers::new(block_peer, column_peers))), - ) - } - } - } - - fn responses_with_blobs( - blocks: Vec>>, - blobs: Vec>>, - spec: &ChainSpec, - ) -> Result>, String> { - // There can't be more more blobs than blocks. i.e. sending any blob (empty - // included) for a skipped slot is not permitted. - let mut responses = Vec::with_capacity(blocks.len()); - let mut blob_iter = blobs.into_iter().peekable(); - for block in blocks.into_iter() { - let max_blobs_per_block = spec.max_blobs_per_block(block.epoch()) as usize; - let mut blob_list = Vec::with_capacity(max_blobs_per_block); - while { - let pair_next_blob = blob_iter - .peek() - .map(|sidecar| sidecar.slot() == block.slot()) - .unwrap_or(false); - pair_next_blob - } { - blob_list.push(blob_iter.next().ok_or("Missing next blob".to_string())?); - } - - let mut blobs_buffer = vec![None; max_blobs_per_block]; - for blob in blob_list { - let blob_index = blob.index as usize; - let Some(blob_opt) = blobs_buffer.get_mut(blob_index) else { - return Err("Invalid blob index".to_string()); - }; - if blob_opt.is_some() { - return Err("Repeat blob index".to_string()); - } else { - *blob_opt = Some(blob); - } - } - let blobs = RuntimeVariableList::new( - blobs_buffer.into_iter().flatten().collect::>(), - max_blobs_per_block, - ) - .map_err(|_| "Blobs returned exceeds max length".to_string())?; - responses.push(RpcBlock::new(None, block, Some(blobs)).map_err(|e| format!("{e:?}"))?) - } - - // if accumulated sidecars is not empty, throw an error. 
- if blob_iter.next().is_some() { - return Err("Received sidecars that don't pair well".to_string()); - } - - Ok(responses) - } - - fn responses_with_custody_columns( - blocks: Vec>>, - data_columns: DataColumnSidecarList, - expected_custody_columns: HashMap, - spec: &ChainSpec, - ) -> Result>, String> { - // Group data columns by block_root and index - let mut custody_columns_by_block = HashMap::>>::new(); - let mut block_roots_by_slot = HashMap::>::new(); - let expected_custody_indices = expected_custody_columns.keys().cloned().collect::>(); - - for column in data_columns { - let block_root = column.block_root(); - let index = column.index; - - block_roots_by_slot - .entry(column.slot()) - .or_default() - .insert(block_root); - - // Sanity check before casting to `CustodyDataColumn`. But this should never happen - if !expected_custody_columns.contains_key(&index) { - return Err(format!( - "Received column not in expected custody indices {index}" - )); - } - - custody_columns_by_block - .entry(block_root) - .or_default() - .push(CustodyDataColumn::from_asserted_custody(column)); - } - - // Now iterate all blocks ensuring that the block roots of each block and data column match, - // plus we have columns for our custody requirements - let rpc_blocks = blocks - .into_iter() - .map(|block| { - let block_root = get_block_root(&block); - block_roots_by_slot - .entry(block.slot()) - .or_default() - .insert(block_root); - - let custody_columns = custody_columns_by_block - .remove(&block_root) - .unwrap_or_default(); - - RpcBlock::new_with_custody_columns( - Some(block_root), - block, - custody_columns, - expected_custody_indices.clone(), - spec, - ) - .map_err(|e| format!("{e:?}")) - }) - .collect::, _>>()?; - - // Assert that there are no columns left for other blocks - if !custody_columns_by_block.is_empty() { - let remaining_roots = custody_columns_by_block.keys().collect::>(); - return Err(format!("Not all columns consumed: {remaining_roots:?}")); - } - - for (_slot, 
block_roots) in block_roots_by_slot { - if block_roots.len() > 1 { - // TODO: Some peer(s) are faulty or malicious. This batch will fail processing but - // we want to send it to the process to better attribute fault. Maybe warn log for - // now and track it in a metric? - } - } - - Ok(rpc_blocks) - } -} - -impl ByRangeRequest { - fn finish(&mut self, id: I, data: T, peer_id: PeerId) -> Result<(), String> { - match self { - Self::Active(expected_id) => { - if expected_id != &id { - return Err(format!("unexpected req_id expected {expected_id} got {id}")); - } - *self = Self::Complete(data, peer_id); - Ok(()) - } - Self::Complete(_, _) => Err("request already complete".to_owned()), - } - } - - fn to_finished(&self) -> Option<(&T, &PeerId)> { - match self { - Self::Active(_) => None, - Self::Complete(data, peer_id) => Some((data, peer_id)), - } - } -} - -#[cfg(test)] -mod tests { - use super::RangeBlockComponentsRequest; - use beacon_chain::test_utils::{ - generate_rand_block_and_blobs, generate_rand_block_and_data_columns, test_spec, NumBlobs, - }; - use lighthouse_network::{ - service::api_types::{ - BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, - DataColumnsByRangeRequestId, Id, RangeRequestId, - }, - PeerId, - }; - use rand::SeedableRng; - use std::{collections::HashMap, sync::Arc}; - use types::{test_utils::XorShiftRng, Epoch, ForkName, MinimalEthSpec as E, SignedBeaconBlock}; - - fn components_id() -> ComponentsByRangeRequestId { - ComponentsByRangeRequestId { - id: 0, - requester: RangeRequestId::RangeSync { - chain_id: 1, - batch_id: Epoch::new(0), - }, - } - } - - fn blocks_id(parent_request_id: ComponentsByRangeRequestId) -> BlocksByRangeRequestId { - BlocksByRangeRequestId { - id: 1, - parent_request_id, - } - } - - fn blobs_id(parent_request_id: ComponentsByRangeRequestId) -> BlobsByRangeRequestId { - BlobsByRangeRequestId { - id: 1, - parent_request_id, - } - } - - fn columns_id( - id: Id, - parent_request_id: 
ComponentsByRangeRequestId, - ) -> DataColumnsByRangeRequestId { - DataColumnsByRangeRequestId { - id, - parent_request_id, - } - } - - fn is_finished(info: &RangeBlockComponentsRequest) -> bool { - let spec = test_spec::(); - info.responses(&spec).is_some() - } - - #[test] - fn no_blobs_into_responses() { - let spec = test_spec::(); - let peer = PeerId::random(); - let mut rng = XorShiftRng::from_seed([42; 16]); - let blocks = (0..4) - .map(|_| { - generate_rand_block_and_blobs::(ForkName::Base, NumBlobs::None, &mut rng, &spec) - .0 - .into() - }) - .collect::>>>(); - - let blocks_req_id = blocks_id(components_id()); - let mut info = RangeBlockComponentsRequest::::new(blocks_req_id, None, None); - - // Send blocks and complete terminate response - info.add_blocks(blocks_req_id, blocks, peer).unwrap(); - - // Assert response is finished and RpcBlocks can be constructed - info.responses(&test_spec::()).unwrap().unwrap(); - } - - #[test] - fn empty_blobs_into_responses() { - let spec = test_spec::(); - let peer = PeerId::random(); - let mut rng = XorShiftRng::from_seed([42; 16]); - let blocks = (0..4) - .map(|_| { - // Always generate some blobs. - generate_rand_block_and_blobs::( - ForkName::Deneb, - NumBlobs::Number(3), - &mut rng, - &spec, - ) - .0 - .into() - }) - .collect::>>>(); - - let components_id = components_id(); - let blocks_req_id = blocks_id(components_id); - let blobs_req_id = blobs_id(components_id); - let mut info = - RangeBlockComponentsRequest::::new(blocks_req_id, Some(blobs_req_id), None); - - // Send blocks and complete terminate response - info.add_blocks(blocks_req_id, blocks, peer).unwrap(); - // Expect no blobs returned - info.add_blobs(blobs_req_id, vec![], peer).unwrap(); - - // Assert response is finished and RpcBlocks can be constructed, even if blobs weren't returned. - // This makes sure we don't expect blobs here when they have expired. Checking this logic should - // be hendled elsewhere. 
- info.responses(&test_spec::()).unwrap().unwrap(); - } - - #[test] - fn rpc_block_with_custody_columns() { - let spec = test_spec::(); - let peer = PeerId::random(); - let expects_custody_columns = [1, 2, 3, 4]; - let mut rng = XorShiftRng::from_seed([42; 16]); - let blocks = (0..4) - .map(|_| { - generate_rand_block_and_data_columns::( - ForkName::Fulu, - NumBlobs::Number(1), - &mut rng, - &spec, - ) - }) - .collect::>(); - - let components_id = components_id(); - let blocks_req_id = blocks_id(components_id); - let columns_req_id = expects_custody_columns - .iter() - .enumerate() - .map(|(i, _)| columns_id(i as Id, components_id)) - .collect::>(); - - let column_to_peer = expects_custody_columns - .iter() - .map(|index| (*index, peer)) - .collect::>(); - - let mut info = RangeBlockComponentsRequest::::new( - blocks_req_id, - None, - Some((columns_req_id.clone(), column_to_peer)), - ); - // Send blocks and complete terminate response - info.add_blocks( - blocks_req_id, - blocks.iter().map(|b| b.0.clone().into()).collect(), - peer, - ) - .unwrap(); - // Assert response is not finished - assert!(!is_finished(&info)); - - // Send data columns - for (i, &column_index) in expects_custody_columns.iter().enumerate() { - info.add_custody_columns( - columns_req_id.get(i).copied().unwrap(), - blocks - .iter() - .flat_map(|b| b.1.iter().filter(|d| d.index == column_index).cloned()) - .collect(), - peer, - ) - .unwrap(); - - if i < expects_custody_columns.len() - 1 { - assert!( - !is_finished(&info), - "requested should not be finished at loop {i}" - ); - } - } - - // All completed construct response - info.responses(&spec).unwrap().unwrap(); - } - - #[test] - fn rpc_block_with_custody_columns_batched() { - let spec = test_spec::(); - let peer = PeerId::random(); - let batched_column_requests = [vec![1_u64, 2], vec![3, 4]]; - let expects_custody_columns = batched_column_requests - .iter() - .flatten() - .map(|index| (*index, peer)) - .collect::>(); - let 
custody_column_request_ids = - (0..batched_column_requests.len() as u32).collect::>(); - let num_of_data_column_requests = custody_column_request_ids.len(); - - let components_id = components_id(); - let blocks_req_id = blocks_id(components_id); - let columns_req_id = batched_column_requests - .iter() - .enumerate() - .map(|(i, _)| columns_id(i as Id, components_id)) - .collect::>(); - - let mut info = RangeBlockComponentsRequest::::new( - blocks_req_id, - None, - Some((columns_req_id.clone(), expects_custody_columns.clone())), - ); - - let mut rng = XorShiftRng::from_seed([42; 16]); - let blocks = (0..4) - .map(|_| { - generate_rand_block_and_data_columns::( - ForkName::Fulu, - NumBlobs::Number(1), - &mut rng, - &spec, - ) - }) - .collect::>(); - - // Send blocks and complete terminate response - info.add_blocks( - blocks_req_id, - blocks.iter().map(|b| b.0.clone().into()).collect(), - peer, - ) - .unwrap(); - // Assert response is not finished - assert!(!is_finished(&info)); - - for (i, column_indices) in batched_column_requests.iter().enumerate() { - // Send the set of columns in the same batch request - info.add_custody_columns( - columns_req_id.get(i).copied().unwrap(), - blocks - .iter() - .flat_map(|b| { - b.1.iter() - .filter(|d| column_indices.contains(&d.index)) - .cloned() - }) - .collect::>(), - peer, - ) - .unwrap(); - - if i < num_of_data_column_requests - 1 { - assert!( - !is_finished(&info), - "requested should not be finished at loop {i}" - ); - } - } - - // All completed construct response - info.responses(&spec).unwrap().unwrap(); - } -} From 8f74adc66f711790bd41ca423c2a454197ef72b6 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 27 May 2025 00:43:38 -0500 Subject: [PATCH 08/66] Use DataColumnSidecarList --- .../network/src/network_beacon_processor/mod.rs | 2 +- beacon_node/network/src/sync/network_context.rs | 6 +++--- .../network_context/block_components_by_range.rs | 10 +++++----- 
.../src/sync/network_context/custody_by_range.rs | 6 ++---- .../src/sync/network_context/custody_by_root.rs | 4 +--- .../requests/data_columns_by_range.rs | 4 ++-- beacon_node/network/src/sync/tests/lookups.rs | 12 +++++------- 7 files changed, 19 insertions(+), 25 deletions(-) diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index 7a4d6978800..e026d04776b 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -554,7 +554,7 @@ impl NetworkBeaconProcessor { pub fn send_rpc_validate_data_columns( self: &Arc, block_root: Hash256, - data_columns: Vec>>, + data_columns: DataColumnSidecarList, seen_timestamp: Duration, id: SamplingId, ) -> Result<(), Error> { diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 3197fcf13e9..f4db7e22566 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -1216,7 +1216,7 @@ impl SyncNetworkContext { id: DataColumnsByRootRequestId, peer_id: PeerId, rpc_event: RpcEvent>>, - ) -> Option>>>> { + ) -> Option>> { let resp = self .data_columns_by_root_requests .on_response(id, rpc_event); @@ -1314,7 +1314,7 @@ impl SyncNetworkContext { id: CustodyId, req_id: DataColumnsByRootRequestId, peer_id: PeerId, - resp: RpcResponseResult>>>, + resp: RpcResponseResult>, ) -> Option> { let span = span!( Level::INFO, @@ -1381,7 +1381,7 @@ impl SyncNetworkContext { id: CustodyByRangeRequestId, req_id: DataColumnsByRangeRequestId, peer_id: PeerId, - resp: RpcResponseResult>>>, + resp: RpcResponseResult>, ) -> Option> { // Note: need to remove the request to borrow self again below. 
Otherwise we can't // do nested requests diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index 7c8e59eb970..00f64f2e39c 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -15,8 +15,8 @@ use parking_lot::RwLock; use std::collections::{HashMap, HashSet}; use std::sync::Arc; use types::{ - BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, EthSpec, Hash256, RuntimeVariableList, - SignedBeaconBlock, Slot, + BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecarList, EthSpec, Hash256, + RuntimeVariableList, SignedBeaconBlock, Slot, }; /// Given a `BlocksByRangeRequest` (a range of slots) fetches all necessary data to return @@ -57,7 +57,7 @@ enum FuluEnabledState { blocks: Vec>>, block_peer: PeerId, custody_by_range_request: - ByRangeRequest>>, PeerGroup>, + ByRangeRequest, PeerGroup>, }, } @@ -389,7 +389,7 @@ impl BlockComponentsByRangeRequest { pub fn on_custody_by_range_result( &mut self, id: CustodyByRangeRequestId, - data: Vec>>, + data: DataColumnSidecarList, peers: PeerGroup, cx: &mut SyncNetworkContext, ) -> BlockComponentsByRangeRequestResult { @@ -483,7 +483,7 @@ fn couple_blocks_deneb( fn couple_blocks_fulu( blocks: Vec>>, - data_columns: Vec>>, + data_columns: DataColumnSidecarList, custody_column_indices: Vec, spec: &ChainSpec, ) -> Result>, Error> { diff --git a/beacon_node/network/src/sync/network_context/custody_by_range.rs b/beacon_node/network/src/sync/network_context/custody_by_range.rs index 22d0d02d984..6b4d2331889 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_range.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_range.rs @@ -16,8 +16,8 @@ use std::time::{Duration, Instant}; use std::{collections::HashMap, marker::PhantomData, sync::Arc}; use tracing::{debug, warn}; use 
types::{ - data_column_sidecar::ColumnIndex, DataColumnSidecar, Epoch, EthSpec, Hash256, - SignedBeaconBlockHeader, Slot, + data_column_sidecar::ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, + Hash256, SignedBeaconBlockHeader, Slot, }; use super::{PeerGroup, RpcResponseResult, SyncNetworkContext}; @@ -25,8 +25,6 @@ use super::{PeerGroup, RpcResponseResult, SyncNetworkContext}; const TEMPORARY_FAULT_EXPIRY_SECONDS: u64 = 15; const REQUEST_EXPIRY_SECONDS: u64 = 300; -type DataColumnSidecarList = Vec>>; - pub struct ActiveCustodyByRangeRequest { start_time: Instant, id: CustodyByRangeRequestId, diff --git a/beacon_node/network/src/sync/network_context/custody_by_root.rs b/beacon_node/network/src/sync/network_context/custody_by_root.rs index 3b7b373790f..489b9c3b11b 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_root.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_root.rs @@ -15,7 +15,7 @@ use std::time::{Duration, Instant}; use std::{collections::HashMap, marker::PhantomData, sync::Arc}; use strum::IntoStaticStr; use tracing::{debug, warn}; -use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, Hash256}; +use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Hash256}; use super::{LookupRequestResult, PeerGroup, RpcResponseResult, SyncNetworkContext}; @@ -24,8 +24,6 @@ const REQUEST_EXPIRY_SECONDS: u64 = 300; /// TODO(das): this attempt count is nested into the existing lookup request count. 
const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; -type DataColumnSidecarList = Vec>>; - pub struct ActiveCustodyByRootRequest { start_time: Instant, block_root: Hash256, diff --git a/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs b/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs index 276ede93c12..54ff0c1c735 100644 --- a/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs +++ b/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs @@ -1,13 +1,13 @@ use super::{ActiveRequestItems, LookupVerifyError}; use lighthouse_network::rpc::methods::DataColumnsByRangeRequest; use std::sync::Arc; -use types::{DataColumnSidecar, EthSpec, Slot}; +use types::{DataColumnSidecar, DataColumnSidecarList, EthSpec, Slot}; /// Accumulates results of a data_columns_by_range request. Only returns items after receiving the /// stream termination. pub struct DataColumnsByRangeRequestItems { request: DataColumnsByRangeRequest, - items: Vec>>, + items: DataColumnSidecarList, } impl DataColumnsByRangeRequestItems { diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index 3e83605a276..d85504d4654 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -43,8 +43,8 @@ use tracing::info; use types::{ data_column_sidecar::ColumnIndex, test_utils::{SeedableRng, TestRandom, XorShiftRng}, - BeaconState, BeaconStateBase, BlobSidecar, DataColumnSidecar, EthSpec, ForkContext, ForkName, - Hash256, MinimalEthSpec as E, SignedBeaconBlock, Slot, + BeaconState, BeaconStateBase, BlobSidecar, DataColumnSidecar, DataColumnSidecarList, EthSpec, + ForkContext, ForkName, Hash256, MinimalEthSpec as E, SignedBeaconBlock, Slot, }; const D: Duration = Duration::new(0, 0); @@ -216,9 +216,7 @@ impl TestRig { generate_rand_block_and_blobs::(fork_name, num_blobs, rng, &self.spec) } - fn 
rand_block_and_data_columns( - &mut self, - ) -> (SignedBeaconBlock, Vec>>) { + fn rand_block_and_data_columns(&mut self) -> (SignedBeaconBlock, DataColumnSidecarList) { let num_blobs = NumBlobs::Number(1); generate_rand_block_and_data_columns::( self.fork_name, @@ -721,7 +719,7 @@ impl TestRig { fn complete_valid_sampling_column_requests( &mut self, ids: DCByRootIds, - data_columns: Vec>>, + data_columns: DataColumnSidecarList, ) { for id in ids { self.log(&format!("return valid data column for {id:?}")); @@ -766,7 +764,7 @@ impl TestRig { fn complete_valid_custody_request( &mut self, ids: DCByRootIds, - data_columns: Vec>>, + data_columns: DataColumnSidecarList, missing_components: bool, ) { let lookup_id = if let SyncRequestId::DataColumnsByRoot(DataColumnsByRootRequestId { From 86ad87eced677bd1e045d7f4eb6be78441291fbd Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 27 May 2025 12:21:42 -0500 Subject: [PATCH 09/66] Lint tests --- beacon_node/network/src/sync/tests/range.rs | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index c82e4f97769..1fb19e15ef1 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -304,9 +304,16 @@ impl TestRig { self.sync_manager.range_sync().batches_state() } - fn assert_sync_state(&mut self) { + fn assert_sync_state(&mut self, expected_state: SyncState) { let current_state = self.sync_manager.network().network_globals().sync_state(); - panic!("{:?}", current_state); + assert_eq!(current_state, expected_state); + } + + fn assert_syncing_finalized(&mut self) { + self.assert_sync_state(SyncState::SyncingFinalized { + start_slot: Slot::new(0), + target_slot: Slot::new(0), + }); } fn assert_no_chains_exist(&mut self) { @@ -333,14 +340,6 @@ impl TestRig { } } - fn expect_blocks_by_range_requests(&mut self, 
request_filter: RequestFilter) { - let events = - self.filter_received_network_events(|ev| request_filter.blocks_by_range_requests(ev)); - if events.is_empty() { - panic!("Expected to find blocks_by_range requests {request_filter:?}") - } - } - fn expect_no_data_columns_by_range_requests(&mut self, request_filter: RequestFilter) { let events = self .filter_received_network_events(|ev| request_filter.data_columns_by_range_requests(ev)); @@ -1080,7 +1079,7 @@ fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { // Unikely that the single peer we added has enough columns for us. Tests are determinstic and // this error should never be hit r.add_connected_sync_peer_not_supernode(remote_info.clone()); - r.assert_state(RangeSyncType::Finalized); + r.assert_syncing_finalized(); // The SyncingChain has a single peer, so it can issue blocks_by_range requests. However, it // doesn't have enough peers to cover all columns From 52722b7b2ee627b76af82ee7357437b01e6ea2c0 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 27 May 2025 14:13:31 -0500 Subject: [PATCH 10/66] Resolve TODO(das) --- .../network/src/sync/backfill_sync/mod.rs | 3 -- .../network/src/sync/block_lookups/mod.rs | 2 +- .../network/src/sync/network_context.rs | 47 +++++++++---------- .../block_components_by_range.rs | 7 +-- .../sync/network_context/custody_by_range.rs | 27 +++++------ .../sync/network_context/custody_by_root.rs | 3 +- .../network/src/sync/range_sync/batch.rs | 13 ++--- beacon_node/network/src/sync/tests/range.rs | 7 +-- 8 files changed, 48 insertions(+), 61 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 45b9c61641b..e4bf1d93ef7 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -948,9 +948,6 @@ impl BackFillSync { return Ok(()); } Err(e) => match e { - // TODO(das): 
block_components_by_range requests can now hang out indefinitely. - // Is that fine? Maybe we should fail the requests from the network_context - // level without involving the BackfillSync itself. RpcRequestSendError::InternalError(e) => { // NOTE: under normal conditions this shouldn't happen but we handle it anyway warn!(%batch_id, error = ?e, %batch,"Could not send batch request"); diff --git a/beacon_node/network/src/sync/block_lookups/mod.rs b/beacon_node/network/src/sync/block_lookups/mod.rs index 2c59f710d04..f676068326b 100644 --- a/beacon_node/network/src/sync/block_lookups/mod.rs +++ b/beacon_node/network/src/sync/block_lookups/mod.rs @@ -724,7 +724,7 @@ impl BlockLookups { // Collect all peers that sent a column that was invalid. Must // run .unique as a single peer can send multiple invalid // columns. Penalize once to avoid insta-bans - .flat_map(|(index, _)| peer_group.of_index((*index) as usize)) + .flat_map(|(index, _)| peer_group.of_index(&(*index as usize))) .unique() .collect(), _ => peer_group.all().collect(), diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index f4db7e22566..61f223d938c 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -48,7 +48,7 @@ use tokio::sync::mpsc; use tracing::{debug, error, span, warn, Level}; use types::blob_sidecar::FixedBlobSidecarList; use types::{ - BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, + BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, ForkContext, Hash256, SignedBeaconBlock, SignedBeaconBlockHeader, Slot, }; @@ -124,42 +124,41 @@ pub struct PeerGroup { /// Peers group by which indexed section of the block component they served. 
For example: /// - PeerA served = [blob index 0, blob index 2] /// - PeerA served = [blob index 1] - peers: HashMap>, + peers: HashMap, } impl PeerGroup { + pub fn empty() -> Self { + Self { + peers: HashMap::new(), + } + } + /// Return a peer group where a single peer returned all parts of a block component. For /// example, a block has a single component (the block = index 0/1). pub fn from_single(peer: PeerId) -> Self { Self { - peers: HashMap::from_iter([(peer, vec![0])]), + peers: HashMap::from_iter([(0, peer)]), } } - pub fn from_set(peers: HashMap>) -> Self { + pub fn from_set(peer_to_indices: HashMap>) -> Self { + let mut peers = HashMap::new(); + for (peer, indices) in peer_to_indices { + for index in indices { + peers.insert(index, peer); + } + } Self { peers } } pub fn all(&self) -> impl Iterator + '_ { - self.peers.keys() + self.peers.values() } - pub fn of_index(&self, index: usize) -> impl Iterator + '_ { - self.peers.iter().filter_map(move |(peer, indices)| { - if indices.contains(&index) { - Some(peer) - } else { - None - } - }) + pub fn of_index(&self, index: &usize) -> Option<&PeerId> { + self.peers.get(index) } - pub fn as_reversed_map(&self) -> HashMap { - // TODO(das): should we change PeerGroup to hold this map? 
- let mut index_to_peer = HashMap::::new(); - for (peer, indices) in self.peers.iter() { - for &index in indices { - index_to_peer.insert(index as u64, *peer); - } - } - index_to_peer + pub fn as_map(&self) -> &HashMap { + &self.peers } } @@ -953,7 +952,7 @@ impl SyncNetworkContext { &mut self, parent_id: ComponentsByRangeRequestId, blocks_with_data: Vec, - epoch: Epoch, + request: BlocksByRangeRequest, column_indices: Vec, lookup_peers: Arc>>, ) -> Result { @@ -970,7 +969,7 @@ impl SyncNetworkContext { let mut request = ActiveCustodyByRangeRequest::new( id, - epoch, + request, blocks_with_data, &column_indices, lookup_peers, diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index 00f64f2e39c..fc08bcdb9c5 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -144,7 +144,6 @@ impl BlockComponentsByRangeRequest { else { // When a peer disconnects and is removed from the SyncingChain peer set, if the set // reaches zero the SyncingChain is removed. - // TODO(das): add test for this. 
return Err(RpcRequestSendError::InternalError( "A batch peer set should never be empty".to_string(), )); @@ -270,8 +269,7 @@ impl BlockComponentsByRangeRequest { .send_custody_by_range_request( self.id, blocks_with_data, - Slot::new(*self.request.start_slot()) - .epoch(T::EthSpec::slots_per_epoch()), + self.request.clone(), column_indices, self.peers.clone(), ) @@ -309,8 +307,7 @@ impl BlockComponentsByRangeRequest { .copied() .collect(); - let peer_group = - BatchPeers::new(*block_peer, column_peers.as_reversed_map()); + let peer_group = BatchPeers::new(*block_peer, column_peers.clone()); let rpc_blocks = couple_blocks_fulu( blocks.to_vec(), columns.to_vec(), diff --git a/beacon_node/network/src/sync/network_context/custody_by_range.rs b/beacon_node/network/src/sync/network_context/custody_by_range.rs index 6b4d2331889..18dea2070f2 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_range.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_range.rs @@ -3,7 +3,7 @@ use crate::sync::network_context::RpcResponseError; use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::BeaconChainTypes; use fnv::FnvHashMap; -use lighthouse_network::rpc::methods::DataColumnsByRangeRequest; +use lighthouse_network::rpc::{methods::DataColumnsByRangeRequest, BlocksByRangeRequest}; use lighthouse_network::service::api_types::{ CustodyByRangeRequestId, DataColumnsByRangeRequestId, }; @@ -16,8 +16,8 @@ use std::time::{Duration, Instant}; use std::{collections::HashMap, marker::PhantomData, sync::Arc}; use tracing::{debug, warn}; use types::{ - data_column_sidecar::ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, - Hash256, SignedBeaconBlockHeader, Slot, + data_column_sidecar::ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Hash256, + SignedBeaconBlockHeader, Slot, }; use super::{PeerGroup, RpcResponseResult, SyncNetworkContext}; @@ -28,8 +28,7 @@ const REQUEST_EXPIRY_SECONDS: u64 = 300; pub struct 
ActiveCustodyByRangeRequest { start_time: Instant, id: CustodyByRangeRequestId, - // TODO(das): Pass a better type for the by_range request - epoch: Epoch, + request: BlocksByRangeRequest, /// Blocks that we expect peers to serve data columns for blocks_with_data: Vec, /// List of column indices this request needs to download to complete successfully @@ -74,7 +73,7 @@ enum ColumnResponseError { impl ActiveCustodyByRangeRequest { pub(crate) fn new( id: CustodyByRangeRequestId, - epoch: Epoch, + request: BlocksByRangeRequest, blocks_with_data: Vec, column_indices: &[ColumnIndex], lookup_peers: Arc>>, @@ -82,7 +81,7 @@ impl ActiveCustodyByRangeRequest { Self { start_time: Instant::now(), id, - epoch, + request, blocks_with_data, column_requests: HashMap::from_iter( column_indices @@ -350,7 +349,6 @@ impl ActiveCustodyByRangeRequest { }) .collect::, _>>()? // Flatten Vec> to Vec - // TODO(das): maybe not optimal for the coupling logic later .into_iter() .flatten() .collect(); @@ -375,8 +373,9 @@ impl ActiveCustodyByRangeRequest { return Err(Error::TooManyDownloadErrors(last_error)); } - // TODO(das): When is a fork and only a subset of your peers know about a block, we should - // only query the peers on that fork. Should this case be handled? How to handle it? + // TODO(das): We should only query peers that are likely to know about this block. + // For by_range requests, only peers in the SyncingChain peer set. Else consider a + // fallback to the peers that are synced up to the epoch we want to query. 
let custodial_peers = cx.get_custodial_peers(*column_index); // We draw from the total set of peers, but prioritize those peers who we have @@ -433,12 +432,8 @@ impl ActiveCustodyByRangeRequest { .send_data_columns_by_range_request( peer_id, DataColumnsByRangeRequest { - // TODO(das): generalize with constants from batch - start_slot: self - .epoch - .start_slot(T::EthSpec::slots_per_epoch()) - .as_u64(), - count: T::EthSpec::slots_per_epoch(), + start_slot: *self.request.start_slot(), + count: *self.request.count(), columns: indices.clone(), }, self.id, diff --git a/beacon_node/network/src/sync/network_context/custody_by_root.rs b/beacon_node/network/src/sync/network_context/custody_by_root.rs index 489b9c3b11b..1ca2a55a13a 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_root.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_root.rs @@ -21,7 +21,8 @@ use super::{LookupRequestResult, PeerGroup, RpcResponseResult, SyncNetworkContex const FAILED_PEERS_CACHE_EXPIRY_SECONDS: u64 = 5; const REQUEST_EXPIRY_SECONDS: u64 = 300; -/// TODO(das): this attempt count is nested into the existing lookup request count. +/// TODO(das): Reconsider this retry count, it was choosen as a placeholder value. 
Each +/// `custody_by_*` request is already retried multiple inside of a lookup or batch const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; pub struct ActiveCustodyByRootRequest { diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 81f33352f50..8ee9748ebcc 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -1,9 +1,10 @@ +use crate::sync::network_context::PeerGroup; use beacon_chain::block_verification_types::RpcBlock; use itertools::Itertools; use lighthouse_network::rpc::methods::BlocksByRangeRequest; use lighthouse_network::service::api_types::Id; use lighthouse_network::PeerId; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::fmt; use std::hash::{Hash, Hasher}; use std::ops::Sub; @@ -22,17 +23,17 @@ const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3; #[derive(Clone, Debug)] pub struct BatchPeers { block_peer: PeerId, - column_peers: HashMap, + column_peers: PeerGroup, } impl BatchPeers { pub fn new_from_block_peer(block_peer: PeerId) -> Self { Self { block_peer, - column_peers: <_>::default(), + column_peers: PeerGroup::empty(), } } - pub fn new(block_peer: PeerId, column_peers: HashMap) -> Self { + pub fn new(block_peer: PeerId, column_peers: PeerGroup) -> Self { Self { block_peer, column_peers, @@ -44,12 +45,12 @@ impl BatchPeers { } pub fn column(&self, index: &ColumnIndex) -> Option<&PeerId> { - self.column_peers.get(index) + self.column_peers.of_index(&((*index) as usize)) } pub fn iter_unique_peers(&self) -> impl Iterator { std::iter::once(&self.block_peer) - .chain(self.column_peers.values()) + .chain(self.column_peers.all()) .unique() } } diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 1fb19e15ef1..09c99d07d8c 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -188,8 
+188,6 @@ struct CompleteConfig { } impl CompleteConfig { - // TODO(das): add tests where blocks don't have data - fn custody_failure_at_index(mut self, index: u64) -> Self { self.custody_failure_at_index = Some(index); self @@ -1192,15 +1190,14 @@ fn finalized_sync_permanent_custody_peer_failure() { // Find the requests first to assert that this is the only request that exists r.expect_no_data_columns_by_range_requests(filter().epoch(0)); - // complete this one request without the custody failure now r.complete_data_by_range_request( reqs, complete().custody_failure_at_index(column_index_to_fail), ); } - // TODO(das): send batch 1 for completing processing and check that SyncingChain processed batch - // 1 successfully + // custody_by_range request is still active waiting for a new peer to connect + r.expect_active_block_components_by_range_request_on_custody_step(); } #[test] From fc3922f854113bdb6a4c449224229cdf10d39ec9 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 27 May 2025 15:28:03 -0500 Subject: [PATCH 11/66] Resolve more TODOs --- .../network/src/sync/backfill_sync/mod.rs | 33 ++++---- beacon_node/network/src/sync/manager.rs | 2 + .../network/src/sync/network_context.rs | 3 +- .../block_components_by_range.rs | 11 +-- .../sync/network_context/custody_by_range.rs | 82 +++++-------------- .../network/src/sync/range_sync/chain.rs | 47 ++++++----- 6 files changed, 66 insertions(+), 112 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index e4bf1d93ef7..47810d536e5 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -21,8 +21,9 @@ use beacon_chain::{BeaconChain, BeaconChainTypes}; use itertools::Itertools; use lighthouse_network::service::api_types::Id; use lighthouse_network::types::{BackFillState, NetworkGlobals}; -use lighthouse_network::PeerAction; +use 
lighthouse_network::{PeerAction, PeerId}; use logging::crit; +use parking_lot::RwLock; use std::collections::{ btree_map::{BTreeMap, Entry}, HashMap, HashSet, @@ -135,6 +136,8 @@ pub struct BackFillSync { /// This signifies that we are able to attempt to restart a failed chain. restart_failed_sync: bool, + peers: Arc>>, + /// Reference to the beacon chain to obtain initial starting points for the backfill sync. beacon_chain: Arc>, @@ -179,6 +182,7 @@ impl BackFillSync { current_processing_batch: None, validated_batches: 0, restart_failed_sync: false, + peers: <_>::default(), beacon_chain, }; @@ -218,14 +222,7 @@ impl BackFillSync { match self.state() { BackFillState::Syncing => {} // already syncing ignore. BackFillState::Paused => { - if self - .network_globals - .peers - .read() - .synced_peers() - .next() - .is_some() - { + if !self.peers.read().is_empty() { // If there are peers to resume with, begin the resume. debug!(start_epoch = ?self.current_start, awaiting_batches = self.batches.len(), processing_target = ?self.processing_target, "Resuming backfill sync"); self.set_state(BackFillState::Syncing); @@ -298,6 +295,14 @@ impl BackFillSync { } } + pub fn add_peer(&mut self, peer_id: PeerId) { + self.peers.write().insert(peer_id); + } + + pub fn peer_disconnected(&mut self, peer_id: &PeerId) { + self.peers.write().remove(peer_id); + } + /// An RPC error has occurred. /// /// If the batch exists it is re-requested. 
@@ -920,20 +925,12 @@ impl BackFillSync { batch_id: BatchId, ) -> Result<(), BackFillError> { if let Some(batch) = self.batches.get_mut(&batch_id) { - let synced_peers = self - .network_globals - .peers - .read() - .synced_peers() - .cloned() - .collect::>(); - let request = batch.to_blocks_by_range_request(); let failed_peers = batch.failed_block_peers(); match network.block_components_by_range_request( request, RangeRequestId::BackfillSync { batch_id }, - &synced_peers, + self.peers.clone(), &failed_peers, // Does not track total requests per peers for now &HashMap::new(), diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 5c72ac6d124..1fc46b95762 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -413,6 +413,7 @@ impl SyncManager { PeerSyncType::Advanced => { self.range_sync .add_peer(&mut self.network, local, peer_id, remote); + self.backfill_sync.add_peer(peer_id); } PeerSyncType::FullySynced => { // Sync considers this peer close enough to the head to not trigger range sync. @@ -530,6 +531,7 @@ impl SyncManager { // Remove peer from all data structures self.range_sync.peer_disconnect(&mut self.network, peer_id); + self.backfill_sync.peer_disconnected(peer_id); self.block_lookups.peer_disconnected(peer_id); // Regardless of the outcome, we update the sync status. 
diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 61f223d938c..7a4175f2708 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -478,7 +478,7 @@ impl SyncNetworkContext { &mut self, request: BlocksByRangeRequest, requester: RangeRequestId, - peers: &HashSet, + peers: Arc>>, peers_to_deprioritize: &HashSet, total_requests_per_peer: &HashMap, ) -> Result { @@ -498,7 +498,6 @@ impl SyncNetworkContext { self.block_components_by_range_requests.insert(id, req); - // TODO: use ID Ok(id.id) } diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index fc08bcdb9c5..bb981e31543 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -91,7 +91,7 @@ impl From for RpcRequestSendError { } } -/// FOR TESTING ONLY +/// Used to typesafe assertions of state in range sync tests #[cfg(test)] #[derive(Debug)] pub enum BlockComponentsByRangeRequestStep { @@ -103,7 +103,7 @@ impl BlockComponentsByRangeRequest { pub fn new( id: ComponentsByRangeRequestId, request: BlocksByRangeRequest, - peers: &HashSet, + peers: Arc>>, peers_to_deprioritize: &HashSet, total_requests_per_peer: &HashMap, cx: &mut SyncNetworkContext, @@ -123,6 +123,7 @@ impl BlockComponentsByRangeRequest { // will request all blocks for the first 5 epochs to that same single peer. Before we would // query only idle peers in the syncing chain. let Some(block_peer) = peers + .read() .iter() .map(|peer| { ( @@ -180,9 +181,7 @@ impl BlockComponentsByRangeRequest { Ok(Self { id, - // TODO(das): share the rwlock with the range sync batch. Are peers added to the batch - // after being created? 
- peers: Arc::new(RwLock::new(peers.clone())), + peers, request, state, }) @@ -511,8 +510,6 @@ fn couple_blocks_fulu( .remove(&block_root) .unwrap_or_default(); - // TODO(das): Change RpcBlock to holding a Vec of DataColumnSidecars so we don't need - // the spec here. RpcBlock::new_with_custody_columns( Some(block_root), block, diff --git a/beacon_node/network/src/sync/network_context/custody_by_range.rs b/beacon_node/network/src/sync/network_context/custody_by_range.rs index 18dea2070f2..ed796155e26 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_range.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_range.rs @@ -1,5 +1,4 @@ use super::custody_by_root::{ColumnRequest, Error}; -use crate::sync::network_context::RpcResponseError; use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::BeaconChainTypes; use fnv::FnvHashMap; @@ -22,7 +21,7 @@ use types::{ use super::{PeerGroup, RpcResponseResult, SyncNetworkContext}; -const TEMPORARY_FAULT_EXPIRY_SECONDS: u64 = 15; +const FAILED_PEERS_EXPIRY_SECONDS: u64 = 15; const REQUEST_EXPIRY_SECONDS: u64 = 300; pub struct ActiveCustodyByRangeRequest { @@ -41,13 +40,7 @@ pub struct ActiveCustodyByRangeRequest { FnvHashMap, /// Peers that have recently failed to successfully respond to a columns by root request. /// Having a LRUTimeCache allows this request to not have to track disconnecting peers. - peers_with_custody_failures: LRUTimeCache, - peers_with_temporary_faults: LRUTimeCache, - // TODO(das): does this HashSet has an OOM risk? We should either: make sure that this request - // structs are dropped after some time, that disconnected peers are pruned (but we may want to - // retain faulty information if they just disconnect and reconnect) or make this an LRUTimeCache - // with a long time (like 5 minutes). 
- peers_with_permanent_faults: HashSet, + failed_peers: LRUTimeCache, /// Set of peers that claim to have imported this block and their custody columns lookup_peers: Arc>>, @@ -89,13 +82,7 @@ impl ActiveCustodyByRangeRequest { .map(|index| (*index, ColumnRequest::new())), ), active_batch_columns_requests: <_>::default(), - peers_with_custody_failures: LRUTimeCache::new(Duration::from_secs( - TEMPORARY_FAULT_EXPIRY_SECONDS, - )), - peers_with_temporary_faults: LRUTimeCache::new(Duration::from_secs( - TEMPORARY_FAULT_EXPIRY_SECONDS, - )), - peers_with_permanent_faults: HashSet::new(), + failed_peers: LRUTimeCache::new(Duration::from_secs(FAILED_PEERS_EXPIRY_SECONDS)), lookup_peers, _phantom: PhantomData, } @@ -138,7 +125,7 @@ impl ActiveCustodyByRangeRequest { } // Accumulate columns that the peer does not have to issue a single log per request - let mut missing_column_indexes = vec![]; + let mut missing_column_indices = vec![]; let mut incorrect_column_indices = vec![]; let mut imported_column_indices = vec![]; @@ -178,14 +165,8 @@ impl ActiveCustodyByRangeRequest { // - peer custodies this column `index` // - peer claims to be synced to at least `slot` // - // Therefore not returning this column is an protocol violation that we - // penalize and mark the peer as failed to retry with another peer. - // - // TODO(das) do not consider this case a success. We know for sure the block has - // data. However we allow the peer to return empty as we can't attribute fault. - // TODO(das): Should track which columns are missing and eventually give up - // TODO(das): If the peer is in the lookup peer set it claims to have imported - // the block AND its custody columns. So in this case we can downscore + // Then we penalize the faulty peer, mark it as failed and try with + // another. 
Err(ColumnResponseError::MissingColumn(slot)) } }) @@ -219,15 +200,15 @@ impl ActiveCustodyByRangeRequest { )); } ColumnResponseError::MissingColumn(slot) => { - missing_column_indexes.push((index, slot)); + missing_column_indices.push((index, slot)); } } } } } - // Log missing_column_indexes and incorrect_column_indices here in batch per request - // to make this logs more compact and less noisy. + // Log `imported_column_indices`, `missing_column_indexes` and + // `incorrect_column_indices` once per request to make the logs less noisy. if !imported_column_indices.is_empty() { // TODO(das): this log may be redundant. We already log on DataColumnsByRange // completed, and on DataColumnsByRange sent we log the column indices @@ -246,21 +227,18 @@ impl ActiveCustodyByRangeRequest { } if !incorrect_column_indices.is_empty() { - // Note: Batch logging that columns are missing to not spam logger debug!( id = %self.id, data_columns_by_range_req_id = %req_id, %peer_id, - // TODO(das): this property can become very noisy, being the full range 0..128 - incorrect_columns = ?incorrect_column_indices, + ?incorrect_column_indices, "Custody by range peer returned non-matching columns" ); // Returning a non-canonical column is not a permanent fault. We should not // retry the peer for some time but the peer may return a canonical column in // the future. 
- // TODO(das): if this finalized sync the fault is permanent - self.peers_with_temporary_faults.insert(peer_id); + self.failed_peers.insert(peer_id); cx.report_peer( peer_id, PeerAction::MidToleranceError, @@ -268,19 +246,17 @@ impl ActiveCustodyByRangeRequest { ); } - if !missing_column_indexes.is_empty() { - // Note: Batch logging that columns are missing to not spam logger + if !missing_column_indices.is_empty() { debug!( id = %self.id, data_columns_by_range_req_id = %req_id, %peer_id, - // TODO(das): this property can become very noisy, being the full range 0..128 - ?missing_column_indexes, + ?missing_column_indices, "Custody by range peer claims to not have some data" ); // Not having columns is not a permanent fault. The peer may be backfilling. - self.peers_with_custody_failures.insert(peer_id); + self.failed_peers.insert(peer_id); cx.report_peer(peer_id, PeerAction::MidToleranceError, "custody_failure"); } } @@ -293,7 +269,6 @@ impl ActiveCustodyByRangeRequest { "Custody by range download error" ); - // TODO(das): Should mark peer as failed and try from another peer for column_index in &batch_request.indices { self.column_requests .get_mut(column_index) @@ -301,22 +276,8 @@ impl ActiveCustodyByRangeRequest { .on_download_error_and_mark_failure(req_id, err.clone())?; } - match err { - // Verify errors are correctness errors against our request or about the - // returned data itself. This peer is faulty or malicious, should not be - // retried. 
- RpcResponseError::VerifyError(_) => { - self.peers_with_permanent_faults.insert(peer_id); - } - // Network errors are not permanent faults and worth retrying - RpcResponseError::RpcError(_) => { - self.peers_with_temporary_faults.insert(peer_id); - } - // Do nothing for internal errors - RpcResponseError::InternalError(_) => {} - // unreachable - RpcResponseError::RequestExpired(_) => {} - } + // An RpcResponseError is already downscored in network_context + self.failed_peers.insert(peer_id); } }; @@ -386,18 +347,13 @@ impl ActiveCustodyByRangeRequest { let mut priorized_peers = custodial_peers .iter() .filter(|peer| { - // Never request again peers with permanent faults - // Do not request peers with custody failures for some time - !self.peers_with_permanent_faults.contains(peer) - && !self.peers_with_custody_failures.contains(peer) + // Do not request faulty peers for some time + !self.failed_peers.contains(peer) }) .map(|peer| { ( // Prioritize peers that claim to know have imported this block if lookup_peers.contains(peer) { 0 } else { 1 }, - // De-prioritize peers that have failed to successfully respond to - // requests recently, but allow to immediatelly request them again - self.peers_with_temporary_faults.contains(peer), // Prefer peers with fewer requests to load balance across peers. // We batch requests to the same peer, so count existence in the // `columns_to_request_by_peer` as a single 1 request. 
@@ -411,7 +367,7 @@ impl ActiveCustodyByRangeRequest { .collect::>(); priorized_peers.sort_unstable(); - if let Some((_, _, _, _, peer_id)) = priorized_peers.first() { + if let Some((_, _, _, peer_id)) = priorized_peers.first() { columns_to_request_by_peer .entry(*peer_id) .or_default() diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index abea407b0ed..b62ed2c9dd2 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -10,7 +10,9 @@ use itertools::Itertools; use lighthouse_network::service::api_types::Id; use lighthouse_network::{PeerAction, PeerId}; use logging::crit; +use parking_lot::RwLock; use std::collections::{btree_map::Entry, BTreeMap, HashMap, HashSet}; +use std::sync::Arc; use strum::IntoStaticStr; use tracing::{debug, instrument, warn}; use types::{Epoch, EthSpec, Hash256, Slot}; @@ -91,7 +93,11 @@ pub struct SyncingChain { /// /// Also, For each peer tracks the total requests done per peer as part of this SyncingChain /// `HashMap` - peers: HashMap, + peers: Arc>>, + + /// Tracks the total requests done to each peer for this SyncingChain. Forces us to fetch data + /// from all peers to prevent eclipse attacks + requests_per_peer: HashMap, /// Starting epoch of the next batch that needs to be downloaded. to_be_downloaded: BatchId, @@ -173,7 +179,8 @@ impl SyncingChain { target_head_slot, target_head_root, batches: BTreeMap::new(), - peers: HashMap::from_iter([(peer_id, <_>::default())]), + peers: Arc::new(RwLock::new(HashSet::from_iter([peer_id]))), + requests_per_peer: HashMap::from_iter([(peer_id, <_>::default())]), to_be_downloaded: start_epoch, processing_target: start_epoch, optimistic_start: None, @@ -191,7 +198,7 @@ impl SyncingChain { /// Check if the chain has peers from which to process batches. 
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] pub fn available_peers(&self) -> usize { - self.peers.len() + self.peers.read().len() } /// Get the chain's id. @@ -203,7 +210,12 @@ impl SyncingChain { /// Peers currently syncing this chain. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] pub fn peers(&self) -> impl Iterator + '_ { - self.peers.keys().cloned() + self.peers + .read() + .iter() + .copied() + .collect::>() + .into_iter() } /// Progress in epochs made by the chain @@ -227,9 +239,10 @@ impl SyncingChain { /// If the peer has active batches, those are considered failed and re-requested. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] pub fn remove_peer(&mut self, peer_id: &PeerId) -> ProcessingResult { - self.peers.remove(peer_id); + self.peers.write().remove(peer_id); + self.requests_per_peer.remove(peer_id); - if self.peers.is_empty() { + if self.peers.read().is_empty() { Err(RemoveChain::EmptyPeerPool) } else { Ok(KeepChain) @@ -259,7 +272,7 @@ impl SyncingChain { // Account for one more requests to this peer // TODO(das): this code assumes that we do a single request per peer per RpcBlock for peer in batch_peers.iter_unique_peers() { - *self.peers.entry(*peer).or_default() += 1; + *self.requests_per_peer.entry(*peer).or_default() += 1; } // check if we have this batch @@ -613,7 +626,7 @@ impl SyncingChain { "Batch failed to download. 
Dropping chain scoring peers" ); - for (peer, _) in self.peers.drain() { + for peer in self.peers.write().drain() { network.report_peer(peer, penalty, "faulty_chain"); } Err(RemoveChain::ChainFailed { @@ -878,7 +891,8 @@ impl SyncingChain { network: &mut SyncNetworkContext, peer_id: PeerId, ) -> ProcessingResult { - self.peers.insert(peer_id, <_>::default()); + self.peers.write().insert(peer_id); + self.requests_per_peer.insert(peer_id, <_>::default()); self.request_batches(network) } @@ -952,26 +966,15 @@ impl SyncingChain { let request = batch.to_blocks_by_range_request(); let failed_peers = batch.failed_block_peers(); - // TODO(das): we should request only from peers that are part of this SyncingChain. - // However, then we hit the NoPeer error frequently which causes the batch to fail and - // the SyncingChain to be dropped. We need to handle this case more gracefully. - let synced_peers = network - .network_globals() - .peers - .read() - .synced_peers() - .cloned() - .collect::>(); - match network.block_components_by_range_request( request, RangeRequestId::RangeSync { chain_id: self.id, batch_id, }, - &synced_peers, + self.peers.clone(), &failed_peers, - &self.peers, + &self.requests_per_peer, ) { Ok(request_id) => { // inform the batch about the new request From 0ef95dd7f834f67198ae8886ebf6f0754f0e3c37 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 27 May 2025 15:33:39 -0500 Subject: [PATCH 12/66] Remove stale TODO --- beacon_node/network/src/sync/range_sync/batch.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 8ee9748ebcc..99ee4fb6be2 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -19,7 +19,6 @@ const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5; /// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty. 
const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3; -// TODO(das): Consider merging with PeerGroup #[derive(Clone, Debug)] pub struct BatchPeers { block_peer: PeerId, From 144b83e6257918d3b66f9a078946cd43112d0b12 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 27 May 2025 15:52:14 -0500 Subject: [PATCH 13/66] Remove BatchStateSummary --- .../network/src/sync/range_sync/chain.rs | 30 ++-------------- .../network/src/sync/range_sync/mod.rs | 2 -- .../network/src/sync/range_sync/range.rs | 6 ++-- beacon_node/network/src/sync/tests/range.rs | 34 +++++++++++++------ 4 files changed, 29 insertions(+), 43 deletions(-) diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index b62ed2c9dd2..76721ec5aa3 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -129,37 +129,13 @@ pub enum ChainSyncingState { Syncing, } -#[cfg(test)] -#[derive(Debug, Eq, PartialEq)] -pub enum BatchStateSummary { - Downloading, - Processing, - AwaitingProcessing, - AwaitingValidation, - Unexpected(&'static str), -} - impl SyncingChain { - /// Returns a summary of batch states for assertions in tests. + /// Leaks the state of all active batches for assertions in tests. #[cfg(test)] - pub fn batches_state(&self) -> Vec<(BatchId, BatchStateSummary)> { + pub fn batches_state(&self) -> Vec<(BatchId, &BatchState)> { self.batches .iter() - .map(|(id, batch)| { - let state = match batch.state() { - // A batch is never left in this state, it's only the initial value - BatchState::AwaitingDownload => { - BatchStateSummary::Unexpected("AwaitingDownload") - } - BatchState::Downloading { .. } => BatchStateSummary::Downloading, - BatchState::AwaitingProcessing { .. } => BatchStateSummary::AwaitingProcessing, - BatchState::Poisoned => BatchStateSummary::Unexpected("Poisoned"), - BatchState::Processing { .. 
} => BatchStateSummary::Processing, - BatchState::Failed => BatchStateSummary::Unexpected("Failed"), - BatchState::AwaitingValidation { .. } => BatchStateSummary::AwaitingValidation, - }; - (*id, state) - }) + .map(|(id, batch)| (*id, batch.state())) .collect() } diff --git a/beacon_node/network/src/sync/range_sync/mod.rs b/beacon_node/network/src/sync/range_sync/mod.rs index e9fb0219c45..225b536d1de 100644 --- a/beacon_node/network/src/sync/range_sync/mod.rs +++ b/beacon_node/network/src/sync/range_sync/mod.rs @@ -10,8 +10,6 @@ mod sync_type; pub use batch::{ BatchConfig, BatchInfo, BatchOperationOutcome, BatchPeers, BatchProcessingResult, BatchState, }; -#[cfg(test)] -pub use chain::BatchStateSummary; pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; pub use range::RangeSync; pub use sync_type::RangeSyncType; diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index 473e2066cee..62d18252683 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs @@ -39,8 +39,6 @@ //! Each chain is downloaded in batches of blocks. The batched blocks are processed sequentially //! and further batches are requested as current blocks are being processed. 
-#[cfg(test)] -use super::chain::BatchStateSummary; use super::chain::{BatchId, ChainId, RemoveChain, SyncingChain}; use super::chain_collection::{ChainCollection, SyncChainStatus}; use super::sync_type::RangeSyncType; @@ -48,6 +46,8 @@ use super::BatchPeers; use crate::metrics; use crate::status::ToStatusMessage; use crate::sync::network_context::{RpcResponseError, SyncNetworkContext}; +#[cfg(test)] +use crate::sync::range_sync::BatchState; use crate::sync::BatchProcessResult; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; @@ -107,7 +107,7 @@ where } #[cfg(test)] - pub(crate) fn batches_state(&self) -> Vec<(ChainId, BatchId, BatchStateSummary)> { + pub(crate) fn batches_state(&self) -> Vec<(ChainId, BatchId, &BatchState)> { self.chains .iter() .flat_map(|chain| { diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 09c99d07d8c..75ad7d2767e 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -3,7 +3,7 @@ use crate::network_beacon_processor::ChainSegmentProcessId; use crate::status::ToStatusMessage; use crate::sync::manager::SLOT_IMPORT_TOLERANCE; use crate::sync::network_context::{BlockComponentsByRangeRequestStep, RangeRequestId}; -use crate::sync::range_sync::{BatchId, BatchStateSummary, RangeSyncType}; +use crate::sync::range_sync::{BatchId, BatchState, RangeSyncType}; use crate::sync::{ChainId, SyncMessage}; use beacon_chain::data_column_verification::CustodyDataColumn; use beacon_chain::test_utils::{test_spec, AttestationStrategy, BlockStrategy}; @@ -298,7 +298,7 @@ impl TestRig { self.sync_manager.network().network_globals().sync_state() } - fn get_batch_states(&mut self) -> Vec<(ChainId, BatchId, BatchStateSummary)> { + fn get_batch_states(&mut self) -> Vec<(ChainId, BatchId, &BatchState)> { self.sync_manager.range_sync().batches_state() } @@ -382,27 +382,39 @@ impl TestRig { } } - fn 
expect_all_batches_in_state(&mut self, states: &[BatchStateSummary]) { + fn expect_all_batches_in_state) -> bool>( + &mut self, + predicate: F, + expected_state: &'static str, + ) { let batches = self.get_batch_states(); if batches.is_empty() { panic!("no batches"); } - for batch in &batches { - if !states.contains(&batch.2) { - panic!("batch {batch:?} not in state {states:?}. Batches: {batches:?}"); + for (chain_id, batch_id, state) in &batches { + if !predicate(state) { + panic!("batch {chain_id} {batch_id} not in state {expected_state}, {state}"); } } } fn expect_all_batches_downloading(&mut self) { - self.expect_all_batches_in_state(&[BatchStateSummary::Downloading]); + self.expect_all_batches_in_state( + |state| matches!(state, BatchState::Downloading { .. }), + "Downloading", + ); } fn expect_all_batches_processing_or_awaiting(&mut self) { - self.expect_all_batches_in_state(&[ - BatchStateSummary::Processing, - BatchStateSummary::AwaitingProcessing, - ]); + self.expect_all_batches_in_state( + |state| { + matches!( + state, + BatchState::Processing { .. } | BatchState::AwaitingProcessing { .. 
} + ) + }, + "Processing or AwaitingProcessing", + ); } fn update_execution_engine_state(&mut self, state: EngineState) { From 02d97377a5510bd6659abe6c4836f994e62eec49 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 27 May 2025 16:07:45 -0500 Subject: [PATCH 14/66] Address review comments --- beacon_node/network/src/sync/manager.rs | 5 +--- .../network/src/sync/network_context.rs | 5 ++-- beacon_node/network/src/sync/tests/lookups.rs | 23 +++++++++---------- beacon_node/network/src/sync/tests/range.rs | 1 + 4 files changed, 15 insertions(+), 19 deletions(-) diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 1fc46b95762..dfafc884050 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -1225,10 +1225,7 @@ impl SyncManager { // custody_by_range accumulates the results of multiple data_columns_by_range requests // returning a bigger list of data columns across all the column indices this node has // to custody - if let Some(result) = - self.network - .on_custody_by_range_response(id.parent_request_id, id, peer_id, resp) - { + if let Some(result) = self.network.on_custody_by_range_response(id, peer_id, resp) { self.on_custody_by_range_result(id.parent_request_id, result); } } diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 7a4175f2708..58eb3053034 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -1376,14 +1376,13 @@ impl SyncNetworkContext { #[allow(clippy::type_complexity)] pub fn on_custody_by_range_response( &mut self, - id: CustodyByRangeRequestId, req_id: DataColumnsByRangeRequestId, peer_id: PeerId, resp: RpcResponseResult>, ) -> Option> { // Note: need to remove the request to borrow self again below. 
Otherwise we can't // do nested requests - let Some(mut request) = self.custody_by_range_requests.remove(&id) else { + let Some(mut request) = self.custody_by_range_requests.remove(&id.parent_request_id) else { metrics::inc_counter_vec( &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, &["custody_by_range"], @@ -1396,7 +1395,7 @@ impl SyncNetworkContext { .map_err(Into::::into) .transpose(); - self.handle_custody_by_range_result(id, request, result) + self.handle_custody_by_range_result(id.parent_request_id, request, result) } fn handle_custody_by_range_result( diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index d85504d4654..1a37c231861 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -368,16 +368,19 @@ impl TestRig { self.expect_empty_network(); } - // Don't make pub, use `add_connected_peer_testing_only` + // Note: prefer to use `add_connected_peer_testing_only`. This is currently extensively used in + // lookup tests. We should consolidate this "add peer" methods in a future refactor fn new_connected_peer(&mut self) -> PeerId { self.add_connected_peer_testing_only(false) } - // Don't make pub, use `add_connected_peer_testing_only` + // Note: prefer to use `add_connected_peer_testing_only`. This is currently extensively used in + // lookup tests. 
We should consolidate this "add peer" methods in a future refactor fn new_connected_supernode_peer(&mut self) -> PeerId { self.add_connected_peer_testing_only(true) } + /// Add a random connected peer that is not known by the sync module pub fn add_connected_peer_testing_only(&mut self, supernode: bool) -> PeerId { let key = self.determinstic_key(); let peer_id = self @@ -401,6 +404,7 @@ impl TestRig { peer_id } + /// Add a random connected peer + add it to sync with a specific remote Status pub fn add_sync_peer(&mut self, supernode: bool, remote_info: SyncInfo) -> PeerId { let peer_id = self.add_connected_peer_testing_only(supernode); self.send_sync_message(SyncMessage::AddPeer(peer_id, remote_info)); @@ -887,7 +891,7 @@ impl TestRig { } } - // Find, not pop + /// Similar to `pop_received_network_events` but finds matching events without removing them. pub fn filter_received_network_events) -> Option>( &mut self, predicate_transform: F, @@ -1149,15 +1153,10 @@ impl TestRig { } pub fn expect_no_penalty_for_anyone(&mut self) { - self.drain_network_rx(); - let downscore_events = self - .network_rx_queue - .iter() - .filter_map(|ev| match ev { - NetworkMessage::ReportPeer { peer_id, msg, .. } => Some((peer_id, msg)), - _ => None, - }) - .collect::>(); + let downscore_events = self.filter_received_network_events(|ev| match ev { + NetworkMessage::ReportPeer { peer_id, msg, .. 
} => Some((peer_id, msg)), + _ => None, + }); if !downscore_events.is_empty() { panic!("Expected no downscoring events but found: {downscore_events:?}"); } diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 75ad7d2767e..642f92ee664 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -51,6 +51,7 @@ enum ByRangeDataRequestIds { } impl ByRangeDataRequestIds { + /// If there's a single active request, returns its peer, else panics fn peer(&self) -> PeerId { match self { Self::PreDeneb => panic!("no requests PreDeneb"), From ae0ef8f92926e9c99189a271e23736f7cfa148d2 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 4 Jun 2025 22:20:45 -0600 Subject: [PATCH 15/66] Fix finalized_sync_permanent_custody_peer_failure --- beacon_node/network/src/sync/manager.rs | 5 +- .../network/src/sync/range_sync/chain.rs | 19 +++-- .../src/sync/range_sync/chain_collection.rs | 6 +- .../network/src/sync/range_sync/mod.rs | 2 +- .../network/src/sync/range_sync/range.rs | 4 +- beacon_node/network/src/sync/tests/lookups.rs | 23 +++++-- beacon_node/network/src/sync/tests/range.rs | 69 +++++++------------ 7 files changed, 66 insertions(+), 62 deletions(-) diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index dfafc884050..f21576372d4 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -50,6 +50,7 @@ use crate::sync::block_lookups::{ BlobRequestState, BlockComponent, BlockRequestState, CustodyRequestState, DownloadResult, }; use crate::sync::network_context::PeerGroup; +use crate::sync::range_sync::BATCH_BUFFER_SIZE; use beacon_chain::block_verification_types::AsBlock; use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::{ @@ -280,6 +281,7 @@ pub fn spawn( sync_recv, SamplingConfig::Default, fork_context, + BATCH_BUFFER_SIZE, 
); // spawn the sync manager thread @@ -302,6 +304,7 @@ impl SyncManager { sync_recv: mpsc::UnboundedReceiver>, sampling_config: SamplingConfig, fork_context: Arc, + batch_buffer_size: usize, ) -> Self { let network_globals = beacon_processor.network_globals.clone(); Self { @@ -313,7 +316,7 @@ impl SyncManager { beacon_chain.clone(), fork_context.clone(), ), - range_sync: RangeSync::new(beacon_chain.clone()), + range_sync: RangeSync::new(beacon_chain.clone(), batch_buffer_size), backfill_sync: BackFillSync::new(beacon_chain.clone(), network_globals), block_lookups: BlockLookups::new(), notified_unknown_roots: LRUTimeCache::new(Duration::from_secs( diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 76721ec5aa3..44b2b1937d6 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -26,7 +26,7 @@ use types::{Epoch, EthSpec, Hash256, Slot}; pub const EPOCHS_PER_BATCH: u64 = 1; /// The maximum number of batches to queue before requesting more. -const BATCH_BUFFER_SIZE: u8 = 5; +pub const BATCH_BUFFER_SIZE: usize = 5; /// A return type for functions that act on a `Chain` which informs the caller whether the chain /// has been completed and should be removed or to be kept if further processing is @@ -119,6 +119,9 @@ pub struct SyncingChain { /// The current processing batch, if any. current_processing_batch: Option, + + /// The maximum number of batches to queue before requesting more. 
+ batch_buffer_size: usize, } #[derive(PartialEq, Debug)] @@ -147,6 +150,7 @@ impl SyncingChain { target_head_root: Hash256, peer_id: PeerId, chain_type: SyncingChainType, + batch_buffer_size: usize, ) -> Self { SyncingChain { id, @@ -163,6 +167,7 @@ impl SyncingChain { attempted_optimistic_starts: HashSet::default(), state: ChainSyncingState::Stopped, current_processing_batch: None, + batch_buffer_size, } } @@ -1075,7 +1080,7 @@ impl SyncingChain { .iter() .filter(|&(_epoch, batch)| in_buffer(batch)) .count() - > BATCH_BUFFER_SIZE as usize + >= self.batch_buffer_size as usize { return None; } @@ -1105,28 +1110,28 @@ impl SyncingChain { /// batch states. See [BatchState::visualize] for symbol definitions. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] fn visualize_batch_state(&self) -> String { - let mut visualization_string = String::with_capacity((BATCH_BUFFER_SIZE * 3) as usize); + let mut visualization_string = String::with_capacity((self.batch_buffer_size * 3) as usize); // Start of the block visualization_string.push('['); - for mut batch_index in 0..BATCH_BUFFER_SIZE { + for mut batch_index in 0..self.batch_buffer_size { if let Some(batch) = self .batches .get(&(self.processing_target + batch_index as u64 * EPOCHS_PER_BATCH)) { visualization_string.push(batch.visualize()); - if batch_index != BATCH_BUFFER_SIZE { + if batch_index != self.batch_buffer_size { // Add a comma in between elements visualization_string.push(','); } } else { // No batch exists, it is on our list to be downloaded // Fill in the rest of the gaps - while batch_index < BATCH_BUFFER_SIZE { + while batch_index < self.batch_buffer_size { visualization_string.push('E'); // Add a comma between the empty batches - if batch_index < BATCH_BUFFER_SIZE.saturating_sub(1) { + if batch_index < self.batch_buffer_size.saturating_sub(1) { visualization_string.push(',') } batch_index += 1; diff --git 
a/beacon_node/network/src/sync/range_sync/chain_collection.rs b/beacon_node/network/src/sync/range_sync/chain_collection.rs index 454f7c02d15..44ce43d56aa 100644 --- a/beacon_node/network/src/sync/range_sync/chain_collection.rs +++ b/beacon_node/network/src/sync/range_sync/chain_collection.rs @@ -51,6 +51,8 @@ pub struct ChainCollection { head_chains: FnvHashMap>, /// The current sync state of the process. state: RangeSyncState, + /// The maximum number of batches to queue before requesting more. + batch_buffer_size: usize, } impl ChainCollection { @@ -61,12 +63,13 @@ impl ChainCollection { .chain(self.head_chains.values()) } - pub fn new(beacon_chain: Arc>) -> Self { + pub fn new(beacon_chain: Arc>, batch_buffer_size: usize) -> Self { ChainCollection { beacon_chain, finalized_chains: FnvHashMap::default(), head_chains: FnvHashMap::default(), state: RangeSyncState::Idle, + batch_buffer_size, } } @@ -504,6 +507,7 @@ impl ChainCollection { target_head_root, peer, sync_type.into(), + self.batch_buffer_size, ); debug!( diff --git a/beacon_node/network/src/sync/range_sync/mod.rs b/beacon_node/network/src/sync/range_sync/mod.rs index 225b536d1de..67479f9a1e0 100644 --- a/beacon_node/network/src/sync/range_sync/mod.rs +++ b/beacon_node/network/src/sync/range_sync/mod.rs @@ -10,6 +10,6 @@ mod sync_type; pub use batch::{ BatchConfig, BatchInfo, BatchOperationOutcome, BatchPeers, BatchProcessingResult, BatchState, }; -pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; +pub use chain::{BatchId, ChainId, BATCH_BUFFER_SIZE, EPOCHS_PER_BATCH}; pub use range::RangeSync; pub use sync_type::RangeSyncType; diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index 62d18252683..8f52fa7a496 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs @@ -90,10 +90,10 @@ where name = "range_sync", skip_all )] - pub fn new(beacon_chain: Arc>) -> Self { + pub fn 
new(beacon_chain: Arc>, batch_buffer_size: usize) -> Self { RangeSync { beacon_chain: beacon_chain.clone(), - chains: ChainCollection::new(beacon_chain), + chains: ChainCollection::new(beacon_chain, batch_buffer_size), failed_chains: LRUTimeCache::new(std::time::Duration::from_secs( FAILED_CHAINS_EXPIRY_SECONDS, )), diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index f26a467f273..8477b46958f 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -2,6 +2,7 @@ use crate::network_beacon_processor::NetworkBeaconProcessor; use crate::sync::block_lookups::{ BlockLookupSummary, PARENT_DEPTH_TOLERANCE, SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS, }; +use crate::sync::range_sync::BATCH_BUFFER_SIZE; use crate::sync::{ manager::{BlockProcessType, BlockProcessingResult, SyncManager}, peer_sampling::SamplingConfig, @@ -59,16 +60,29 @@ pub enum PeersConfig { SupernodeOnly, } +pub struct TestOptions { + /// If the node created by this test harness is a supernode + pub is_supernode: bool, + /// The maximum number of batches to queue before requesting more. 
+ pub batch_buffer_size: usize, +} + impl TestRig { pub fn test_setup() -> Self { - Self::test_setup_with_options(false) + Self::test_setup_with_options(TestOptions { + is_supernode: false, + batch_buffer_size: BATCH_BUFFER_SIZE, + }) } pub fn test_setup_as_supernode() -> Self { - Self::test_setup_with_options(true) + Self::test_setup_with_options(TestOptions { + is_supernode: true, + batch_buffer_size: BATCH_BUFFER_SIZE, + }) } - fn test_setup_with_options(is_supernode: bool) -> Self { + pub fn test_setup_with_options(options: TestOptions) -> Self { // Use `fork_from_env` logic to set correct fork epochs let spec = test_spec::(); @@ -101,7 +115,7 @@ impl TestRig { Vec::new(), network_config, chain.spec.clone(), - is_supernode, + options.is_supernode, )); let (beacon_processor, beacon_processor_rx) = NetworkBeaconProcessor::null_for_testing( globals, @@ -143,6 +157,7 @@ impl TestRig { required_successes: vec![SAMPLING_REQUIRED_SUCCESSES], }, fork_context, + options.batch_buffer_size, ), harness, fork_name, diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 642f92ee664..382965ec97b 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -4,12 +4,12 @@ use crate::status::ToStatusMessage; use crate::sync::manager::SLOT_IMPORT_TOLERANCE; use crate::sync::network_context::{BlockComponentsByRangeRequestStep, RangeRequestId}; use crate::sync::range_sync::{BatchId, BatchState, RangeSyncType}; +use crate::sync::tests::lookups::TestOptions; use crate::sync::{ChainId, SyncMessage}; use beacon_chain::data_column_verification::CustodyDataColumn; -use beacon_chain::test_utils::{test_spec, AttestationStrategy, BlockStrategy}; +use beacon_chain::test_utils::{AttestationStrategy, BlockStrategy}; use beacon_chain::{block_verification_types::RpcBlock, EngineState, NotifyExecutionLayer}; use beacon_processor::WorkType; -use lighthouse_network::discovery::{peer_id_to_node_id, 
CombinedKey}; use lighthouse_network::rpc::methods::{ BlobsByRangeRequest, DataColumnsByRangeRequest, OldBlocksByRangeRequest, }; @@ -19,16 +19,13 @@ use lighthouse_network::service::api_types::{ DataColumnsByRangeRequestId, SyncRequestId, }; use lighthouse_network::types::SyncState; -use lighthouse_network::{Enr, EnrExt, PeerId, SyncInfo}; -use rand::SeedableRng; -use rand_chacha::ChaCha20Rng; +use lighthouse_network::{PeerId, SyncInfo}; use std::collections::HashSet; use std::time::Duration; -use types::data_column_custody_group::compute_subnets_for_node; use types::{ - BeaconBlock, BlobSidecarList, BlockImportSource, ColumnIndex, DataColumnSidecar, - DataColumnSubnetId, Epoch, EthSpec, Hash256, KzgCommitment, MinimalEthSpec as E, Signature, - SignedBeaconBlock, SignedBeaconBlockHash, Slot, VariableList, + BeaconBlock, BlobSidecarList, BlockImportSource, ColumnIndex, DataColumnSidecar, Epoch, + EthSpec, Hash256, KzgCommitment, MinimalEthSpec as E, Signature, SignedBeaconBlock, + SignedBeaconBlockHash, Slot, VariableList, }; const D: Duration = Duration::new(0, 0); @@ -93,6 +90,12 @@ struct RequestFilter { column_index: Option, } +const NO_FILTER: RequestFilter = RequestFilter { + peer: None, + epoch: None, + column_index: None, +}; + impl RequestFilter { fn peer(mut self, peer: PeerId) -> Self { self.peer = Some(peer); @@ -1094,7 +1097,7 @@ fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { // The SyncingChain has a single peer, so it can issue blocks_by_range requests. However, it // doesn't have enough peers to cover all columns - r.progress_until_no_events(filter(), complete()); + r.progress_until_no_events(NO_FILTER, complete()); r.expect_no_active_rpc_requests(); // Here we have a batch with partially completed block_components_by_range requests. The batch @@ -1108,7 +1111,7 @@ fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { // We still need to add enough peers to trigger batch downloads with idle peers. 
Same issue as // the test above. - r.progress_until_no_events(filter(), complete()); + r.progress_until_no_events(NO_FILTER, complete()); r.expect_no_active_rpc_requests(); r.expect_no_active_block_components_by_range_requests(); // TOOD(das): For now this tests don't complete sync. We can't track beacon processor Work @@ -1134,7 +1137,7 @@ fn finalized_sync_single_custody_peer_failure() { // Progress all blocks_by_range and columns_by_range requests but respond empty for a single // column index r.progress_until_no_events( - filter(), + NO_FILTER, complete().custody_failure_at_index(column_index_to_fail), ); r.expect_penalties("custody_failure"); @@ -1162,7 +1165,13 @@ fn finalized_sync_single_custody_peer_failure() { #[test] fn finalized_sync_permanent_custody_peer_failure() { - let mut r = TestRig::test_setup(); + let mut r = TestRig::test_setup_with_options(TestOptions { + is_supernode: false, + // The default buffer size is 5, but we want to manually complete only the batch for epoch + // 0. By setting this buffer to 1 sync will create a single batch until it completes. We can + // do better assertions of state assuming there's only one batch and logs are cleaner. + batch_buffer_size: 1, + }); // Only run post-PeerDAS if !r.fork_name.fulu_enabled() { return; @@ -1192,7 +1201,7 @@ fn finalized_sync_permanent_custody_peer_failure() { // Some peer had a costudy failure at `column_index` so sync should do a single extra request // for that index and epoch. We want to make sure that the request goes to different peer - // than the attempts before. + // than the attempted before. 
let reqs = r.find_data_by_range_request(filter().epoch(0).column_index(column_index_to_fail)); let req_peer = reqs.peer(); @@ -1212,35 +1221,3 @@ fn finalized_sync_permanent_custody_peer_failure() { // custody_by_range request is still active waiting for a new peer to connect r.expect_active_block_components_by_range_request_on_custody_step(); } - -#[test] -#[ignore] -fn mine_peerids() { - let spec = test_spec::(); - let mut rng = ChaCha20Rng::from_seed([0u8; 32]); - - let expected_subnets = (0..3) - .map(|i| DataColumnSubnetId::new(i as u64)) - .collect::>(); - - for i in 0..usize::MAX { - let key: CombinedKey = k256::ecdsa::SigningKey::random(&mut rng).into(); - let enr = Enr::builder().build(&key).unwrap(); - let peer_id = enr.peer_id(); - // Use default custody groups count - let node_id = peer_id_to_node_id(&peer_id).expect("convert peer_id to node_id"); - let subnets = compute_subnets_for_node(node_id.raw(), spec.custody_requirement, &spec) - .expect("should compute custody subnets"); - if expected_subnets == subnets { - panic!("{:?}", subnets); - } else { - let matches = expected_subnets - .iter() - .filter(|index| subnets.contains(index)) - .count(); - if matches > 0 { - println!("{i} {:?}", matches); - } - } - } -} From 28d9d8b8e2993b27f7cac279613b97858092afc8 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 11 Jun 2025 11:02:37 +0200 Subject: [PATCH 16/66] lint --- beacon_node/network/src/sync/range_sync/chain.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 44b2b1937d6..9e0363c379f 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -1080,7 +1080,7 @@ impl SyncingChain { .iter() .filter(|&(_epoch, batch)| in_buffer(batch)) .count() - >= self.batch_buffer_size as usize + >= self.batch_buffer_size { return None; } 
@@ -1110,7 +1110,7 @@ impl SyncingChain { /// batch states. See [BatchState::visualize] for symbol definitions. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] fn visualize_batch_state(&self) -> String { - let mut visualization_string = String::with_capacity((self.batch_buffer_size * 3) as usize); + let mut visualization_string = String::with_capacity(self.batch_buffer_size * 3); // Start of the block visualization_string.push('['); From 7a035787954e2406a4e1040c46f346783704fa4c Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 11 Jun 2025 11:21:12 +0200 Subject: [PATCH 17/66] Remove total_requests_per_peer --- .../network/src/sync/backfill_sync/mod.rs | 4 +--- .../network/src/sync/network_context.rs | 11 ++--------- .../block_components_by_range.rs | 9 +-------- .../network/src/sync/range_sync/batch.rs | 7 ------- .../network/src/sync/range_sync/chain.rs | 19 +------------------ 5 files changed, 5 insertions(+), 45 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 47810d536e5..0a68dc2ce8a 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -26,7 +26,7 @@ use logging::crit; use parking_lot::RwLock; use std::collections::{ btree_map::{BTreeMap, Entry}, - HashMap, HashSet, + HashSet, }; use std::sync::Arc; use tracing::{debug, error, info, instrument, warn}; @@ -932,8 +932,6 @@ impl BackFillSync { RangeRequestId::BackfillSync { batch_id }, self.peers.clone(), &failed_peers, - // Does not track total requests per peers for now - &HashMap::new(), ) { Ok(request_id) => { // inform the batch about the new request diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 5bb277d996c..f66f6668427 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ 
b/beacon_node/network/src/sync/network_context.rs @@ -480,21 +480,14 @@ impl SyncNetworkContext { requester: RangeRequestId, peers: Arc>>, peers_to_deprioritize: &HashSet, - total_requests_per_peer: &HashMap, ) -> Result { let id = ComponentsByRangeRequestId { id: self.next_id(), requester, }; - let req = BlockComponentsByRangeRequest::new( - id, - request, - peers, - peers_to_deprioritize, - total_requests_per_peer, - self, - )?; + let req = + BlockComponentsByRangeRequest::new(id, request, peers, peers_to_deprioritize, self)?; self.block_components_by_range_requests.insert(id, req); diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index bb981e31543..07132f5ac1c 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -105,7 +105,6 @@ impl BlockComponentsByRangeRequest { request: BlocksByRangeRequest, peers: Arc>>, peers_to_deprioritize: &HashSet, - total_requests_per_peer: &HashMap, cx: &mut SyncNetworkContext, ) -> Result { // Induces a compile time panic if this doesn't hold true. @@ -129,19 +128,13 @@ impl BlockComponentsByRangeRequest { ( // If contains -> 1 (order after), not contains -> 0 (order first) peers_to_deprioritize.contains(peer), - // TODO(das): Should we use active_request_count_by_peer? 
- // Prefer peers with less overall requests - // active_request_count_by_peer.get(peer).copied().unwrap_or(0), - // Prefer peers with less total cummulative requests, so we fetch data from a - // diverse set of peers - total_requests_per_peer.get(peer).copied().unwrap_or(0), // Random factor to break ties, otherwise the PeerID breaks ties rand::random::(), peer, ) }) .min() - .map(|(_, _, _, peer)| *peer) + .map(|(_, _, peer)| *peer) else { // When a peer disconnects and is removed from the SyncingChain peer set, if the set // reaches zero the SyncingChain is removed. diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 99ee4fb6be2..ab9fd40babd 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -1,6 +1,5 @@ use crate::sync::network_context::PeerGroup; use beacon_chain::block_verification_types::RpcBlock; -use itertools::Itertools; use lighthouse_network::rpc::methods::BlocksByRangeRequest; use lighthouse_network::service::api_types::Id; use lighthouse_network::PeerId; @@ -46,12 +45,6 @@ impl BatchPeers { pub fn column(&self, index: &ColumnIndex) -> Option<&PeerId> { self.column_peers.of_index(&((*index) as usize)) } - - pub fn iter_unique_peers(&self) -> impl Iterator { - std::iter::once(&self.block_peer) - .chain(self.column_peers.all()) - .unique() - } } /// Allows customisation of the above constants used in other sync methods such as BackFillSync. 
diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 9e0363c379f..87e00bc91a2 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -11,7 +11,7 @@ use lighthouse_network::service::api_types::Id; use lighthouse_network::{PeerAction, PeerId}; use logging::crit; use parking_lot::RwLock; -use std::collections::{btree_map::Entry, BTreeMap, HashMap, HashSet}; +use std::collections::{btree_map::Entry, BTreeMap, HashSet}; use std::sync::Arc; use strum::IntoStaticStr; use tracing::{debug, instrument, warn}; @@ -90,15 +90,8 @@ pub struct SyncingChain { /// The peers that agree on the `target_head_slot` and `target_head_root` as a canonical chain /// and thus available to download this chain from. - /// - /// Also, For each peer tracks the total requests done per peer as part of this SyncingChain - /// `HashMap` peers: Arc>>, - /// Tracks the total requests done to each peer for this SyncingChain. Forces us to fetch data - /// from all peers to prevent eclipse attacks - requests_per_peer: HashMap, - /// Starting epoch of the next batch that needs to be downloaded. 
to_be_downloaded: BatchId, @@ -160,7 +153,6 @@ impl SyncingChain { target_head_root, batches: BTreeMap::new(), peers: Arc::new(RwLock::new(HashSet::from_iter([peer_id]))), - requests_per_peer: HashMap::from_iter([(peer_id, <_>::default())]), to_be_downloaded: start_epoch, processing_target: start_epoch, optimistic_start: None, @@ -221,7 +213,6 @@ impl SyncingChain { #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] pub fn remove_peer(&mut self, peer_id: &PeerId) -> ProcessingResult { self.peers.write().remove(peer_id); - self.requests_per_peer.remove(peer_id); if self.peers.read().is_empty() { Err(RemoveChain::EmptyPeerPool) @@ -250,12 +241,6 @@ impl SyncingChain { request_id: Id, blocks: Vec>, ) -> ProcessingResult { - // Account for one more requests to this peer - // TODO(das): this code assumes that we do a single request per peer per RpcBlock - for peer in batch_peers.iter_unique_peers() { - *self.requests_per_peer.entry(*peer).or_default() += 1; - } - // check if we have this batch let batch = match self.batches.get_mut(&batch_id) { None => { @@ -873,7 +858,6 @@ impl SyncingChain { peer_id: PeerId, ) -> ProcessingResult { self.peers.write().insert(peer_id); - self.requests_per_peer.insert(peer_id, <_>::default()); self.request_batches(network) } @@ -955,7 +939,6 @@ impl SyncingChain { }, self.peers.clone(), &failed_peers, - &self.requests_per_peer, ) { Ok(request_id) => { // inform the batch about the new request From 4e13b3be0f77d50eb5db04d4af335b4ee440f62e Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 11 Jun 2025 11:49:25 +0200 Subject: [PATCH 18/66] Fix failed_peers post fulu --- .../network/src/sync/backfill_sync/mod.rs | 11 +++- .../network/src/sync/range_sync/batch.rs | 58 +++++++++---------- .../network/src/sync/range_sync/chain.rs | 12 +++- 3 files changed, 43 insertions(+), 38 deletions(-) diff --git 
a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 0a68dc2ce8a..5037cf48605 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -617,9 +617,12 @@ impl BackFillSync { error, } => { // TODO(sync): De-dup between back and forwards sync + let mut failed_peers = vec![]; + if let Some(penalty) = peer_action.block_peer { // Penalize the peer appropiately. network.report_peer(batch_peers.block(), penalty, "faulty_batch"); + failed_peers.push(batch_peers.block()); } // Penalize each peer only once. Currently a peer_action does not mix different @@ -635,9 +638,11 @@ impl BackFillSync { .unique() { network.report_peer(peer, penalty, "faulty_batch_column"); + failed_peers.push(peer); } - match batch.processing_completed(BatchProcessingResult::FaultyFailure) { + match batch.processing_completed(BatchProcessingResult::FaultyFailure(failed_peers)) + { Err(e) => { // Batch was in the wrong state self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)) @@ -926,12 +931,12 @@ impl BackFillSync { ) -> Result<(), BackFillError> { if let Some(batch) = self.batches.get_mut(&batch_id) { let request = batch.to_blocks_by_range_request(); - let failed_peers = batch.failed_block_peers(); + let failed_peers = batch.failed_peers(); match network.block_components_by_range_request( request, RangeRequestId::BackfillSync { batch_id }, self.peers.clone(), - &failed_peers, + failed_peers, ) { Ok(request_id) => { // inform the batch about the new request diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index ab9fd40babd..5267ba56ba5 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -112,7 +112,7 @@ pub enum BatchOperationOutcome { pub enum BatchProcessingResult { Success, - FaultyFailure, + FaultyFailure(Vec), NonFaultyFailure, } @@ 
-128,7 +128,9 @@ pub struct BatchInfo { /// Number of processing attempts that have failed but we do not count. non_faulty_processing_attempts: u8, /// The number of download retries this batch has undergone due to a failed request. - failed_download_attempts: Vec>, + failed_download_attempts: usize, + /// Peers that returned bad data, and we want to de-prioritize + failed_peers: HashSet, /// State of the batch. state: BatchState, /// Pin the generic @@ -197,7 +199,8 @@ impl BatchInfo { start_slot, end_slot, failed_processing_attempts: Vec::new(), - failed_download_attempts: Vec::new(), + failed_download_attempts: 0, + failed_peers: <_>::default(), non_faulty_processing_attempts: 0, state: BatchState::AwaitingDownload, marker: std::marker::PhantomData, @@ -206,23 +209,8 @@ impl BatchInfo { /// Gives a list of peers from which this batch has had a failed download or processing /// attempt. - /// - /// TODO(das): Returns only block peers to keep the mainnet path equivalent. The failed peers - /// mechanism is broken for PeerDAS and will be fixed with https://github.com/sigp/lighthouse/issues/6258 - pub fn failed_block_peers(&self) -> HashSet { - let mut peers = HashSet::with_capacity( - self.failed_processing_attempts.len() + self.failed_download_attempts.len(), - ); - - for attempt in &self.failed_processing_attempts { - peers.insert(attempt.peers.block()); - } - - for peer in self.failed_download_attempts.iter().flatten() { - peers.insert(*peer); - } - - peers + pub fn failed_peers(&self) -> &HashSet { + &self.failed_peers } /// Verifies if an incoming block belongs to this batch. 
@@ -272,8 +260,7 @@ impl BatchInfo { match self.state { BatchState::Poisoned => unreachable!("Poisoned batch"), BatchState::Failed => BatchOperationOutcome::Failed { - blacklist: self.failed_processing_attempts.len() - > self.failed_download_attempts.len(), + blacklist: self.failed_processing_attempts.len() > self.failed_download_attempts, }, _ => BatchOperationOutcome::Continue, } @@ -325,15 +312,19 @@ impl BatchInfo { match self.state.poison() { BatchState::Downloading(_request_id) => { // register the attempt and check if the batch can be tried again - self.failed_download_attempts.push(peer); - self.state = if self.failed_download_attempts.len() - >= B::max_batch_download_attempts() as usize - { - BatchState::Failed - } else { - // drop the blocks - BatchState::AwaitingDownload - }; + if let Some(peer) = peer { + self.failed_peers.insert(peer); + } + + self.failed_download_attempts += 1; + + self.state = + if self.failed_download_attempts >= B::max_batch_download_attempts() as usize { + BatchState::Failed + } else { + // drop the blocks + BatchState::AwaitingDownload + }; Ok(self.outcome()) } BatchState::Poisoned => unreachable!("Poisoned batch"), @@ -390,9 +381,12 @@ impl BatchInfo { BatchState::Processing(attempt) => { self.state = match procesing_result { BatchProcessingResult::Success => BatchState::AwaitingValidation(attempt), - BatchProcessingResult::FaultyFailure => { + BatchProcessingResult::FaultyFailure(failed_peers) => { // register the failed attempt self.failed_processing_attempts.push(attempt); + for peer in failed_peers { + self.failed_peers.insert(peer); + } // check if the batch can be downloaded again if self.failed_processing_attempts.len() diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 87e00bc91a2..17bce62a7c7 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -539,10 +539,13 @@ impl SyncingChain { // 
TODO(sync): propagate error in logs error: _, } => { + let mut failed_peers = vec![]; + // TODO(sync): De-dup between back and forwards sync if let Some(penalty) = peer_action.block_peer { // Penalize the peer appropiately. network.report_peer(batch_peers.block(), penalty, "faulty_batch"); + failed_peers.push(batch_peers.block()); } // Penalize each peer only once. Currently a peer_action does not mix different @@ -558,10 +561,13 @@ impl SyncingChain { .unique() { network.report_peer(peer, penalty, "faulty_batch_column"); + failed_peers.push(peer); } // Check if this batch is allowed to continue - match batch.processing_completed(BatchProcessingResult::FaultyFailure)? { + match batch + .processing_completed(BatchProcessingResult::FaultyFailure(failed_peers))? + { BatchOperationOutcome::Continue => { // Chain can continue. Check if it can be moved forward. if *imported_blocks > 0 { @@ -929,7 +935,7 @@ impl SyncingChain { let batch_state = self.visualize_batch_state(); if let Some(batch) = self.batches.get_mut(&batch_id) { let request = batch.to_blocks_by_range_request(); - let failed_peers = batch.failed_block_peers(); + let failed_peers = batch.failed_peers(); match network.block_components_by_range_request( request, @@ -938,7 +944,7 @@ impl SyncingChain { batch_id, }, self.peers.clone(), - &failed_peers, + failed_peers, ) { Ok(request_id) => { // inform the batch about the new request From e426e45455bd457a5735e722b08f11dd42630fc0 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 11 Jun 2025 12:38:55 +0200 Subject: [PATCH 19/66] Don't use failed_peers for download errors, rely on randomness to skip potentially faulty peers --- beacon_node/network/src/sync/backfill_sync/mod.rs | 6 ++---- beacon_node/network/src/sync/range_sync/batch.rs | 10 +--------- beacon_node/network/src/sync/range_sync/chain.rs | 8 ++------ 3 files changed, 5 insertions(+), 19 deletions(-) diff --git 
a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 5037cf48605..70d6573264b 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -330,9 +330,7 @@ impl BackFillSync { return Ok(()); } debug!(batch_epoch = %batch_id, error = ?err, "Batch download failed"); - // TODO(das): Is it necessary for the batch to track failed peers? Can we make this - // mechanism compatible with PeerDAS and before PeerDAS? - match batch.download_failed(None) { + match batch.download_failed() { Err(e) => self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)), Ok(BatchOperationOutcome::Failed { blacklist: _ }) => self.fail_sync(match err { RpcResponseError::RpcError(_) @@ -956,7 +954,7 @@ impl BackFillSync { return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)); } - match batch.download_failed(None) { + match batch.download_failed() { Err(e) => { self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))? } diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 5267ba56ba5..8834c74c08b 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -305,17 +305,9 @@ impl BatchInfo { /// The `peer` parameter, when set to None, does not increment the failed attempts of /// this batch and register the peer, rather attempts a re-download. 
#[must_use = "Batch may have failed"] - pub fn download_failed( - &mut self, - peer: Option, - ) -> Result { + pub fn download_failed(&mut self) -> Result { match self.state.poison() { BatchState::Downloading(_request_id) => { - // register the attempt and check if the batch can be tried again - if let Some(peer) = peer { - self.failed_peers.insert(peer); - } - self.failed_download_attempts += 1; self.state = diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 17bce62a7c7..921d134c681 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -902,11 +902,7 @@ impl SyncingChain { %request_id, "Batch download error" ); - if let BatchOperationOutcome::Failed { blacklist } = - // TODO(das): Is it necessary for the batch to track failed peers? Can we make this - // mechanism compatible with PeerDAS and before PeerDAS? - batch.download_failed(None)? - { + if let BatchOperationOutcome::Failed { blacklist } = batch.download_failed()? { return Err(RemoveChain::ChainFailed { blacklist, failing_batch: batch_id, @@ -966,7 +962,7 @@ impl SyncingChain { warn!(%batch_id, error = ?e, "batch_id" = %batch_id, %batch, "Could not send batch request"); // register the failed download and check if the batch can be retried batch.start_downloading(1)?; // fake request_id = 1 is not relevant - match batch.download_failed(None)? { + match batch.download_failed()? 
{ BatchOperationOutcome::Failed { blacklist } => { return Err(RemoveChain::ChainFailed { blacklist, From 82c8e82fe1a65eddcbb24734b2f1903e872fdc45 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 11 Jun 2025 16:46:18 +0200 Subject: [PATCH 20/66] Re-add NoPeers error --- .../network/src/sync/backfill_sync/mod.rs | 17 +++++++++++++++++ beacon_node/network/src/sync/network_context.rs | 1 + .../block_components_by_range.rs | 8 +++++--- .../network/src/sync/range_sync/chain.rs | 2 +- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 70d6573264b..0aaea4d65fd 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -301,6 +301,14 @@ impl BackFillSync { pub fn peer_disconnected(&mut self, peer_id: &PeerId) { self.peers.write().remove(peer_id); + + if self.peers.read().is_empty() { + info!( + "reason" = "insufficient_synced_peers", + "Backfill sync paused" + ); + self.set_state(BackFillState::Paused); + } } /// An RPC error has occurred. 
@@ -946,6 +954,15 @@ impl BackFillSync { return Ok(()); } Err(e) => match e { + RpcRequestSendError::NoPeers => { + // If we are here the chain has no more synced peers + info!( + "reason" = "insufficient_synced_peers", + "Backfill sync paused" + ); + self.set_state(BackFillState::Paused); + return Err(BackFillError::Paused); + } RpcRequestSendError::InternalError(e) => { // NOTE: under normal conditions this shouldn't happen but we handle it anyway warn!(%batch_id, error = ?e, %batch,"Could not send batch request"); diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index f66f6668427..a81591e58f0 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -98,6 +98,7 @@ pub enum RpcRequestSendError { // If RpcRequestSendError has a single variant `InternalError` it's to signal to downstream // consumers that sends are expected to be infallible. If this assumption changes in the future, // add a new variant. + NoPeers, } #[derive(Debug, PartialEq, Eq)] diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index 07132f5ac1c..913b798d8ed 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -138,9 +138,7 @@ impl BlockComponentsByRangeRequest { else { // When a peer disconnects and is removed from the SyncingChain peer set, if the set // reaches zero the SyncingChain is removed. 
- return Err(RpcRequestSendError::InternalError( - "A batch peer set should never be empty".to_string(), - )); + return Err(RpcRequestSendError::NoPeers); }; let blocks_req_id = cx.send_blocks_by_range_request(block_peer, request.clone(), id)?; @@ -269,6 +267,10 @@ impl BlockComponentsByRangeRequest { RpcRequestSendError::InternalError(e) => { Error::InternalError(e) } + RpcRequestSendError::NoPeers => Error::InternalError( + "send_custody_by_range_request does not error with NoPeers" + .to_owned(), + ), })?; *state = FuluEnabledState::CustodyRequest { diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 921d134c681..83a9dc07b71 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -957,7 +957,7 @@ impl SyncingChain { return Ok(KeepChain); } Err(e) => match e { - RpcRequestSendError::InternalError(e) => { + e @ (RpcRequestSendError::NoPeers | RpcRequestSendError::InternalError(_)) => { // NOTE: under normal conditions this shouldn't happen but we handle it anyway warn!(%batch_id, error = ?e, "batch_id" = %batch_id, %batch, "Could not send batch request"); // register the failed download and check if the batch can be retried From 56fcf289ec7bca2d9a77d776675108fad41a2900 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 12 Jun 2025 15:45:36 +0200 Subject: [PATCH 21/66] lint --- .../lighthouse_network/src/types/globals.rs | 4 ++++ .../block_components_by_range.rs | 22 +++++-------------- beacon_node/network/src/sync/tests/range.rs | 2 +- 3 files changed, 11 insertions(+), 17 deletions(-) diff --git a/beacon_node/lighthouse_network/src/types/globals.rs b/beacon_node/lighthouse_network/src/types/globals.rs index 0c023442087..1c11e7aa1f0 100644 --- a/beacon_node/lighthouse_network/src/types/globals.rs +++ b/beacon_node/lighthouse_network/src/types/globals.rs @@ -248,6 +248,10 @@ impl 
NetworkGlobals { } } + pub fn sampling_columns_count(&self) -> usize { + self.sampling_columns.read().len() + } + pub fn sampling_columns(&self) -> HashSet { self.sampling_columns.read().clone() } diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index 913b798d8ed..f896589f85a 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -188,10 +188,7 @@ impl BlockComponentsByRangeRequest { } => { if let Some((blocks, block_peer)) = blocks_by_range_request.to_finished() { let peer_group = BatchPeers::new_from_block_peer(*block_peer); - let rpc_blocks = couple_blocks_base( - blocks.to_vec(), - cx.network_globals().sampling_columns.len(), - ); + let rpc_blocks = couple_blocks_base(blocks.to_vec()); Ok(Some((rpc_blocks, peer_group))) } else { // Wait for blocks_by_range requests to complete @@ -230,8 +227,7 @@ impl BlockComponentsByRangeRequest { if blocks_with_data.is_empty() { let custody_column_indices = cx .network_globals() - .sampling_columns - .clone() + .sampling_columns() .iter() .copied() .collect(); @@ -248,8 +244,7 @@ impl BlockComponentsByRangeRequest { } else { let mut column_indices = cx .network_globals() - .sampling_columns - .clone() + .sampling_columns() .iter() .copied() .collect::>(); @@ -295,8 +290,7 @@ impl BlockComponentsByRangeRequest { if let Some((columns, column_peers)) = custody_by_range_request.to_finished() { let custody_column_indices = cx .network_globals() - .sampling_columns - .clone() + .sampling_columns() .iter() .copied() .collect(); @@ -425,13 +419,10 @@ impl BlockComponentsByRangeRequest { } } -fn couple_blocks_base( - blocks: Vec>>, - custody_columns_count: usize, -) -> Vec> { +fn couple_blocks_base(blocks: Vec>>) -> Vec> { blocks .into_iter() - .map(|block| RpcBlock::new_without_blobs(None, block, 
custody_columns_count)) + .map(|block| RpcBlock::new_without_blobs(None, block)) .collect() } @@ -509,7 +500,6 @@ fn couple_blocks_fulu( Some(block_root), block, data_columns_with_block_root, - custody_column_indices.clone(), spec, ) .map_err(Error::InternalError) diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 6de5902bb16..599c808befa 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -209,7 +209,7 @@ fn complete() -> CompleteConfig { impl TestRig { fn our_custody_indices(&self) -> Vec { self.network_globals - .sampling_columns + .sampling_columns() .iter() .copied() .collect() From aa726cc72cce7703073da8754abda794664d72a3 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 12 Jun 2025 19:29:14 +0200 Subject: [PATCH 22/66] lint --- beacon_node/beacon_chain/tests/store_tests.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index 9f8c14f3398..73e2a9025c7 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -2603,7 +2603,7 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { .deconstruct(); if wss_fork.fulu_enabled() { info!(block_slot = %block.slot(), ?block_root, "Corrupting data column KZG proof"); - let (mut data_columns, expected_column_indices) = cols.unwrap(); + let mut data_columns = cols.unwrap(); assert!( !data_columns.is_empty(), "data column sidecars shouldn't be empty" @@ -2618,7 +2618,6 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { Some(block_root), block, data_columns.to_vec(), - expected_column_indices, &harness.spec, ) .unwrap() @@ -3819,7 +3818,6 @@ fn available_to_rpc_block(block: AvailableBlock, spec: &ChainSpec .into_iter() .map(|d| 
CustodyDataColumn::from_asserted_custody(d)) .collect(), - vec![], spec, ) .unwrap(), From cb5f76f13700970ac9f87554369870236fada881 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 12 Jun 2025 19:41:20 +0200 Subject: [PATCH 23/66] Add peers to backfill if FullySynced --- beacon_node/network/src/sync/manager.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index c7d727c63fc..94599a072ee 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -416,7 +416,6 @@ impl SyncManager { PeerSyncType::Advanced => { self.range_sync .add_peer(&mut self.network, local, peer_id, remote); - self.backfill_sync.add_peer(peer_id); } PeerSyncType::FullySynced => { // Sync considers this peer close enough to the head to not trigger range sync. @@ -434,6 +433,13 @@ impl SyncManager { } } } + + match sync_type { + PeerSyncType::Behind => {} + PeerSyncType::Advanced | PeerSyncType::FullySynced => { + self.backfill_sync.add_peer(peer_id); + } + } } self.update_sync_state(); From 6a4dde03fae32a6d47c3faee9cd7a303fdd94338 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 18 Jun 2025 18:41:46 +0200 Subject: [PATCH 24/66] Design with roots and tips --- .../lighthouse_network/src/rpc/methods.rs | 12 +- .../src/service/api_types.rs | 61 +- .../src/network_beacon_processor/mod.rs | 72 --- .../network_beacon_processor/sync_methods.rs | 392 +------------ beacon_node/network/src/router.rs | 4 +- .../network/src/sync/backfill_sync/mod.rs | 15 +- .../network/src/sync/block_lookups/common.rs | 81 +-- .../network/src/sync/block_lookups/mod.rs | 19 +- beacon_node/network/src/sync/block_tree.rs | 453 +++++++++++++++ beacon_node/network/src/sync/manager.rs | 295 ++++------ beacon_node/network/src/sync/mod.rs | 1 + .../network/src/sync/network_context.rs | 537 
++++++------------ .../block_components_by_range.rs | 370 ++++++------ .../sync/network_context/custody_by_root.rs | 42 +- .../src/sync/network_context/requests.rs | 8 +- .../network_context/requests/blobs_by_root.rs | 37 +- .../requests/blocks_by_root.rs | 21 +- .../requests/data_columns_by_root.rs | 45 +- beacon_node/network/src/sync/peer_sampling.rs | 16 +- .../network/src/sync/range_sync/batch.rs | 35 +- .../network/src/sync/range_sync/chain.rs | 20 +- beacon_node/network/src/sync/tests/lookups.rs | 119 ++-- beacon_node/network/src/sync/tests/mod.rs | 3 +- beacon_node/network/src/sync/tests/range.rs | 264 ++++++++- 24 files changed, 1374 insertions(+), 1548 deletions(-) create mode 100644 beacon_node/network/src/sync/block_tree.rs diff --git a/beacon_node/lighthouse_network/src/rpc/methods.rs b/beacon_node/lighthouse_network/src/rpc/methods.rs index 8a11a6f29d6..1f9ad0868b4 100644 --- a/beacon_node/lighthouse_network/src/rpc/methods.rs +++ b/beacon_node/lighthouse_network/src/rpc/methods.rs @@ -16,9 +16,9 @@ use types::blob_sidecar::BlobIdentifier; use types::light_client_update::MAX_REQUEST_LIGHT_CLIENT_UPDATES; use types::{ blob_sidecar::BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, - DataColumnsByRootIdentifier, Epoch, EthSpec, ForkContext, Hash256, LightClientBootstrap, - LightClientFinalityUpdate, LightClientOptimisticUpdate, LightClientUpdate, RuntimeVariableList, - SignedBeaconBlock, Slot, + DataColumnsByRootIdentifier, Epoch, EthSpec, ForkContext, ForkName, Hash256, + LightClientBootstrap, LightClientFinalityUpdate, LightClientOptimisticUpdate, + LightClientUpdate, RuntimeVariableList, SignedBeaconBlock, Slot, }; /// Maximum length of error message. 
@@ -440,10 +440,8 @@ pub struct BlocksByRootRequest { } impl BlocksByRootRequest { - pub fn new(block_roots: Vec, fork_context: &ForkContext) -> Self { - let max_request_blocks = fork_context - .spec - .max_request_blocks(fork_context.current_fork()); + pub fn new(block_roots: Vec, spec: &ChainSpec, current_fork: ForkName) -> Self { + let max_request_blocks = spec.max_request_blocks(current_fork); let block_roots = RuntimeVariableList::from_vec(block_roots, max_request_blocks); Self::V2(BlocksByRootRequestV2 { block_roots }) } diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index 8300ad4bb89..edc61dfe777 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -18,9 +18,9 @@ pub struct SingleLookupReqId { #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub enum SyncRequestId { /// Request searching for a block given a hash. - SingleBlock { id: SingleLookupReqId }, + BlocksByRoot(BlocksByRootRequestId), /// Request searching for a set of blobs given a hash. - SingleBlob { id: SingleLookupReqId }, + BlobsByRoot(BlobsByRootRequestId), /// Request searching for a set of data columns given a hash and list of column indices. DataColumnsByRoot(DataColumnsByRootRequestId), /// Blocks by range request @@ -31,6 +31,18 @@ pub enum SyncRequestId { DataColumnsByRange(DataColumnsByRangeRequestId), } +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub struct BlocksByRootRequestId { + pub id: Id, + pub parent_request_id: BlocksByRootRequester, +} + +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub struct HeaderLookupId(pub Hash256); + +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub struct BatchId(pub Id); + /// Request ID for data_columns_by_root requests. Block lookups do not issue this request directly. /// Wrapping this particular req_id, ensures not mixing this request with a custody req_id. 
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] @@ -47,6 +59,14 @@ pub struct BlocksByRangeRequestId { pub parent_request_id: ComponentsByRangeRequestId, } +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub struct BlobsByRootRequestId { + /// Id to identify this attempt at a blobs_by_range request for `parent_request_id` + pub id: Id, + /// The Id of the overall By Range request for block components. + pub parent_request_id: ComponentsByRangeRequestId, +} + #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub struct BlobsByRangeRequestId { /// Id to identify this attempt at a blobs_by_range request for `parent_request_id` @@ -89,10 +109,16 @@ pub enum RangeRequestId { BackfillSync { batch_id: Epoch }, } +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub enum BlocksByRootRequester { + Header(HeaderLookupId), + RangeSync(ComponentsByRangeRequestId), +} + #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub enum DataColumnsByRootRequester { Sampling(SamplingId), - Custody(CustodyId), + Custody(CustodyByRootRequestId), } #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] @@ -117,8 +143,8 @@ pub enum SamplingRequester { pub struct SamplingRequestId(pub usize); #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] -pub struct CustodyId { - pub requester: CustodyRequester, +pub struct CustodyByRootRequestId { + pub parent_request_id: ComponentsByRangeRequestId, } /// Downstream components that perform custody by root requests. 
@@ -231,9 +257,11 @@ impl_display!(BlobsByRangeRequestId, "{}/{}", id, parent_request_id); impl_display!(DataColumnsByRangeRequestId, "{}/{}", id, parent_request_id); impl_display!(CustodyByRangeRequestId, "{}/{}", id, parent_request_id); impl_display!(ComponentsByRangeRequestId, "{}/{}", id, requester); +impl_display!(BlocksByRootRequestId, "{}/{}", id, parent_request_id); +impl_display!(BlobsByRootRequestId, "{}/{}", id, parent_request_id); impl_display!(DataColumnsByRootRequestId, "{}/{}", id, requester); impl_display!(SingleLookupReqId, "{}/Lookup/{}", req_id, lookup_id); -impl_display!(CustodyId, "{}", requester); +impl_display!(CustodyByRootRequestId, "{}", parent_request_id); impl_display!(SamplingId, "{}/{}", sampling_request_id, id); impl Display for DataColumnsByRootRequester { @@ -245,6 +273,18 @@ impl Display for DataColumnsByRootRequester { } } +impl Display for HeaderLookupId { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl Display for BatchId { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + impl Display for CustodyRequester { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) @@ -260,6 +300,15 @@ impl Display for RangeRequestId { } } +impl Display for BlocksByRootRequester { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Header(id) => write!(f, "Header/{id}"), + Self::RangeSync(id) => write!(f, "RangeSync/{id}"), + } + } +} + impl Display for SamplingRequestId { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index fd119824cfc..b7e01b84a4a 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -1,4 +1,3 @@ -use 
crate::sync::manager::BlockProcessType; use crate::sync::SamplingId; use crate::{service::NetworkMessage, sync::manager::SyncMessage}; use beacon_chain::blob_verification::{GossipBlobError, GossipVerifiedBlob}; @@ -34,7 +33,6 @@ use tracing::{debug, error, trace, warn, Instrument}; use types::*; pub use sync_methods::{ChainSegmentProcessId, PeerGroupAction}; -use types::blob_sidecar::FixedBlobSidecarList; pub type Error = TrySendError>; @@ -479,76 +477,6 @@ impl NetworkBeaconProcessor { }) } - /// Create a new `Work` event for some block, where the result from computation (if any) is - /// sent to the other side of `result_tx`. - pub fn send_rpc_beacon_block( - self: &Arc, - block_root: Hash256, - block: RpcBlock, - seen_timestamp: Duration, - process_type: BlockProcessType, - ) -> Result<(), Error> { - let process_fn = self.clone().generate_rpc_beacon_block_process_fn( - block_root, - block, - seen_timestamp, - process_type, - ); - self.try_send(BeaconWorkEvent { - drop_during_sync: false, - work: Work::RpcBlock { process_fn }, - }) - } - - /// Create a new `Work` event for some blobs, where the result from computation (if any) is - /// sent to the other side of `result_tx`. - pub fn send_rpc_blobs( - self: &Arc, - block_root: Hash256, - blobs: FixedBlobSidecarList, - seen_timestamp: Duration, - process_type: BlockProcessType, - ) -> Result<(), Error> { - let blob_count = blobs.iter().filter(|b| b.is_some()).count(); - if blob_count == 0 { - return Ok(()); - } - let process_fn = self.clone().generate_rpc_blobs_process_fn( - block_root, - blobs, - seen_timestamp, - process_type, - ); - self.try_send(BeaconWorkEvent { - drop_during_sync: false, - work: Work::RpcBlobs { process_fn }, - }) - } - - /// Create a new `Work` event for some custody columns. `process_rpc_custody_columns` reports - /// the result back to sync. 
- pub fn send_rpc_custody_columns( - self: &Arc, - block_root: Hash256, - custody_columns: DataColumnSidecarList, - seen_timestamp: Duration, - process_type: BlockProcessType, - ) -> Result<(), Error> { - let s = self.clone(); - self.try_send(BeaconWorkEvent { - drop_during_sync: false, - work: Work::RpcCustodyColumn(Box::pin(async move { - s.process_rpc_custody_columns( - block_root, - custody_columns, - seen_timestamp, - process_type, - ) - .await; - })), - }) - } - /// Create a new `Work` event for some sampling columns, and reports the verification result /// back to sync. pub fn send_rpc_validate_data_columns( diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index b1777cef792..d4285c41cb5 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -1,33 +1,19 @@ use crate::metrics::{self, register_process_result_metrics}; use crate::network_beacon_processor::{NetworkBeaconProcessor, FUTURE_SLOT_TOLERANCE}; use crate::sync::BatchProcessResult; -use crate::sync::{ - manager::{BlockProcessType, SyncMessage}, - ChainId, -}; +use crate::sync::{manager::SyncMessage, ChainId}; use beacon_chain::block_verification_types::{AsBlock, RpcBlock}; use beacon_chain::data_availability_checker::AvailabilityCheckError; use beacon_chain::data_column_verification::verify_kzg_for_data_column_list; use beacon_chain::{ - validator_monitor::get_slot_delay_ms, AvailabilityProcessingStatus, BeaconChainTypes, - BlockError, ChainSegmentResult, HistoricalBlockError, NotifyExecutionLayer, -}; -use beacon_processor::{ - work_reprocessing_queue::{QueuedRpcBlock, ReprocessQueueMessage}, - AsyncFn, BlockingFn, DuplicateCache, + BeaconChainTypes, BlockError, ChainSegmentResult, HistoricalBlockError, NotifyExecutionLayer, }; use lighthouse_network::PeerAction; use std::collections::HashMap; use std::sync::Arc; 
use std::time::Duration; -use store::KzgCommitment; -use tokio::sync::mpsc; -use tracing::{debug, error, info, warn}; -use types::beacon_block_body::format_kzg_commitments; -use types::blob_sidecar::FixedBlobSidecarList; -use types::{ - BlockImportSource, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, Hash256, -}; +use tracing::{debug, warn}; +use types::{ColumnIndex, DataColumnSidecar, Epoch, Hash256}; /// Id associated to a batch processing request, either a sync batch or a parent lookup. #[derive(Clone, Debug, PartialEq)] @@ -101,376 +87,6 @@ impl PeerGroupAction { } impl NetworkBeaconProcessor { - /// Returns an async closure which processes a beacon block received via RPC. - /// - /// This separate function was required to prevent a cycle during compiler - /// type checking. - pub fn generate_rpc_beacon_block_process_fn( - self: Arc, - block_root: Hash256, - block: RpcBlock, - seen_timestamp: Duration, - process_type: BlockProcessType, - ) -> AsyncFn { - let process_fn = async move { - let reprocess_tx = self.reprocess_tx.clone(); - let duplicate_cache = self.duplicate_cache.clone(); - self.process_rpc_block( - block_root, - block, - seen_timestamp, - process_type, - reprocess_tx, - duplicate_cache, - ) - .await; - }; - Box::pin(process_fn) - } - - /// Returns the `process_fn` and `ignore_fn` required when requeuing an RPC block. - pub fn generate_rpc_beacon_block_fns( - self: Arc, - block_root: Hash256, - block: RpcBlock, - seen_timestamp: Duration, - process_type: BlockProcessType, - ) -> (AsyncFn, BlockingFn) { - // An async closure which will import the block. - let process_fn = self.clone().generate_rpc_beacon_block_process_fn( - block_root, - block, - seen_timestamp, - process_type.clone(), - ); - // A closure which will ignore the block. 
- let ignore_fn = move || { - // Sync handles these results - self.send_sync_message(SyncMessage::BlockComponentProcessed { - process_type, - result: crate::sync::manager::BlockProcessingResult::Ignored, - }); - }; - (process_fn, Box::new(ignore_fn)) - } - - /// Attempt to process a block received from a direct RPC request. - #[allow(clippy::too_many_arguments)] - pub async fn process_rpc_block( - self: Arc>, - block_root: Hash256, - block: RpcBlock, - seen_timestamp: Duration, - process_type: BlockProcessType, - reprocess_tx: mpsc::Sender, - duplicate_cache: DuplicateCache, - ) { - // Check if the block is already being imported through another source - let Some(handle) = duplicate_cache.check_and_insert(block_root) else { - debug!( - action = "sending rpc block to reprocessing queue", - %block_root, - ?process_type, - "Gossip block is being processed" - ); - - // Send message to work reprocess queue to retry the block - let (process_fn, ignore_fn) = self.clone().generate_rpc_beacon_block_fns( - block_root, - block, - seen_timestamp, - process_type, - ); - let reprocess_msg = ReprocessQueueMessage::RpcBlock(QueuedRpcBlock { - beacon_block_root: block_root, - process_fn, - ignore_fn, - }); - - if reprocess_tx.try_send(reprocess_msg).is_err() { - error!(source = "rpc", %block_root,"Failed to inform block import") - }; - return; - }; - - let slot = block.slot(); - let block_has_data = block.as_block().num_expected_blobs() > 0; - let parent_root = block.message().parent_root(); - let commitments_formatted = block.as_block().commitments_formatted(); - - debug!( - ?block_root, - proposer = block.message().proposer_index(), - slot = %block.slot(), - commitments_formatted, - ?process_type, - "Processing RPC block" - ); - - let signed_beacon_block = block.block_cloned(); - let result = self - .chain - .process_block_with_early_caching( - block_root, - block, - BlockImportSource::Lookup, - NotifyExecutionLayer::Yes, - ) - .await; - register_process_result_metrics(&result, 
metrics::BlockSource::Rpc, "block"); - - // RPC block imported, regardless of process type - match result.as_ref() { - Ok(AvailabilityProcessingStatus::Imported(hash)) => { - info!( - %slot, - %hash, - "New RPC block received", - ); - // Trigger processing for work referencing this block. - let reprocess_msg = ReprocessQueueMessage::BlockImported { - block_root: *hash, - parent_root, - }; - if reprocess_tx.try_send(reprocess_msg).is_err() { - error!( - source = "rpc", - block_root = %hash, - "Failed to inform block import" - ); - }; - self.chain.block_times_cache.write().set_time_observed( - *hash, - slot, - seen_timestamp, - None, - None, - ); - - self.chain.recompute_head_at_current_slot().await; - } - Ok(AvailabilityProcessingStatus::MissingComponents(..)) => { - // Block is valid, we can now attempt fetching blobs from EL using version hashes - // derived from kzg commitments from the block, without having to wait for all blobs - // to be sent from the peers if we already have them. - let publish_blobs = false; - self.fetch_engine_blobs_and_publish(signed_beacon_block, block_root, publish_blobs) - .await - } - _ => {} - } - - // RPC block imported or execution validated. If the block was already imported by gossip we - // receive Err(BlockError::AlreadyKnown). - if result.is_ok() && - // Block has at least one blob, so it produced columns - block_has_data && - // Block slot is within the DA boundary (should always be the case) and PeerDAS is activated - self.chain.should_sample_slot(slot) - { - self.send_sync_message(SyncMessage::SampleBlock(block_root, slot)); - } - - // Sync handles these results - self.send_sync_message(SyncMessage::BlockComponentProcessed { - process_type, - result: result.into(), - }); - - // Drop the handle to remove the entry from the cache - drop(handle); - } - - /// Returns an async closure which processes a list of blobs received via RPC. 
- /// - /// This separate function was required to prevent a cycle during compiler - /// type checking. - pub fn generate_rpc_blobs_process_fn( - self: Arc, - block_root: Hash256, - blobs: FixedBlobSidecarList, - seen_timestamp: Duration, - process_type: BlockProcessType, - ) -> AsyncFn { - let process_fn = async move { - self.clone() - .process_rpc_blobs(block_root, blobs, seen_timestamp, process_type) - .await; - }; - Box::pin(process_fn) - } - - /// Attempt to process a list of blobs received from a direct RPC request. - pub async fn process_rpc_blobs( - self: Arc>, - block_root: Hash256, - blobs: FixedBlobSidecarList, - seen_timestamp: Duration, - process_type: BlockProcessType, - ) { - let Some(slot) = blobs - .iter() - .find_map(|blob| blob.as_ref().map(|blob| blob.slot())) - else { - return; - }; - - let (indices, commitments): (Vec, Vec) = blobs - .iter() - .filter_map(|blob_opt| { - blob_opt - .as_ref() - .map(|blob| (blob.index, blob.kzg_commitment)) - }) - .unzip(); - let commitments = format_kzg_commitments(&commitments); - - debug!( - ?indices, - %block_root, - %slot, - commitments, - "RPC blobs received" - ); - - if let Ok(current_slot) = self.chain.slot() { - if current_slot == slot { - // Note: this metric is useful to gauge how long it takes to receive blobs requested - // over rpc. Since we always send the request for block components at `slot_clock.single_lookup_delay()` - // we can use that as a baseline to measure against. 
- let delay = get_slot_delay_ms(seen_timestamp, slot, &self.chain.slot_clock); - - metrics::observe_duration(&metrics::BEACON_BLOB_RPC_SLOT_START_DELAY_TIME, delay); - } - } - - let result = self.chain.process_rpc_blobs(slot, block_root, blobs).await; - register_process_result_metrics(&result, metrics::BlockSource::Rpc, "blobs"); - - match &result { - Ok(AvailabilityProcessingStatus::Imported(hash)) => { - debug!( - result = "imported block and blobs", - %slot, - block_hash = %hash, - "Block components retrieved" - ); - self.chain.recompute_head_at_current_slot().await; - } - Ok(AvailabilityProcessingStatus::MissingComponents(_, _)) => { - debug!( - block_hash = %block_root, - %slot, - "Missing components over rpc" - ); - } - Err(BlockError::DuplicateFullyImported(_)) => { - debug!( - block_hash = %block_root, - %slot, - "Blobs have already been imported" - ); - } - Err(e) => { - warn!( - error = ?e, - block_hash = %block_root, - %slot, - "Error when importing rpc blobs" - ); - } - } - - // Sync handles these results - self.send_sync_message(SyncMessage::BlockComponentProcessed { - process_type, - result: result.into(), - }); - } - - pub async fn process_rpc_custody_columns( - self: Arc>, - block_root: Hash256, - custody_columns: DataColumnSidecarList, - seen_timestamp: Duration, - process_type: BlockProcessType, - ) { - // custody_columns must always have at least one element - let Some(slot) = custody_columns.first().map(|d| d.slot()) else { - return; - }; - - if let Ok(current_slot) = self.chain.slot() { - if current_slot == slot { - let delay = get_slot_delay_ms(seen_timestamp, slot, &self.chain.slot_clock); - metrics::observe_duration(&metrics::BEACON_BLOB_RPC_SLOT_START_DELAY_TIME, delay); - } - } - - let mut indices = custody_columns.iter().map(|d| d.index).collect::>(); - indices.sort_unstable(); - debug!( - ?indices, - %block_root, - %slot, - "RPC custody data columns received" - ); - - let mut result = self - .chain - 
.process_rpc_custody_columns(custody_columns) - .await; - register_process_result_metrics(&result, metrics::BlockSource::Rpc, "custody_columns"); - - match &result { - Ok(availability) => match availability { - AvailabilityProcessingStatus::Imported(hash) => { - debug!( - result = "imported block and custody columns", - block_hash = %hash, - "Block components retrieved" - ); - self.chain.recompute_head_at_current_slot().await; - } - AvailabilityProcessingStatus::MissingComponents(_, _) => { - debug!( - block_hash = %block_root, - "Missing components over rpc" - ); - // Attempt reconstruction here before notifying sync, to avoid sending out more requests - // that we may no longer need. - // We don't publish columns reconstructed from rpc columns to the gossip network, - // as these are likely historic columns. - let publish_columns = false; - if let Some(availability) = self - .attempt_data_column_reconstruction(block_root, publish_columns) - .await - { - result = Ok(availability) - } - } - }, - Err(BlockError::DuplicateFullyImported(_)) => { - debug!( - block_hash = %block_root, - "Custody columns have already been imported" - ); - } - Err(e) => { - warn!( - error = ?e, - block_hash = %block_root, - "Error when importing rpc custody columns" - ); - } - } - - self.send_sync_message(SyncMessage::BlockComponentProcessed { - process_type, - result: result.into(), - }); - } - /// Validate a list of data columns received from RPC requests pub async fn validate_rpc_data_columns( self: Arc>, diff --git a/beacon_node/network/src/router.rs b/beacon_node/network/src/router.rs index 2a7bc597c26..c02f11cbee3 100644 --- a/beacon_node/network/src/router.rs +++ b/beacon_node/network/src/router.rs @@ -628,7 +628,7 @@ impl Router { ) { let sync_request_id = match app_request_id { AppRequestId::Sync(sync_id) => match sync_id { - id @ SyncRequestId::SingleBlock { .. } => id, + id @ SyncRequestId::BlocksByRoot { .. 
} => id, other => { crit!(request = ?other, "BlocksByRoot response on incorrect request"); return; @@ -662,7 +662,7 @@ impl Router { ) { let sync_request_id = match app_request_id { AppRequestId::Sync(sync_id) => match sync_id { - id @ SyncRequestId::SingleBlob { .. } => id, + id @ SyncRequestId::BlobsByRoot { .. } => id, other => { crit!(request = ?other, "BlobsByRoot response on incorrect request"); return; diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 0aaea4d65fd..de853fd0d03 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -576,8 +576,7 @@ impl BackFillSync { debug!( ?result, - %batch, - batch_epoch = %batch_id, + %batch_id, "Backfill batch processed" ); @@ -841,7 +840,7 @@ impl BackFillSync { } BatchState::AwaitingProcessing(..) => {} BatchState::Processing(_) => { - debug!(batch = %id, %batch, "Advancing chain while processing a batch"); + debug!(batch = %id, "Advancing chain while processing a batch"); if let Some(processing_id) = self.current_processing_batch { if id >= processing_id { self.current_processing_batch = None; @@ -936,7 +935,7 @@ impl BackFillSync { batch_id: BatchId, ) -> Result<(), BackFillError> { if let Some(batch) = self.batches.get_mut(&batch_id) { - let request = batch.to_blocks_by_range_request(); + let request = todo!(); let failed_peers = batch.failed_peers(); match network.block_components_by_range_request( request, @@ -949,7 +948,7 @@ impl BackFillSync { if let Err(e) = batch.start_downloading(request_id) { return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)); } - debug!(epoch = %batch_id, %batch, "Requesting batch"); + debug!(%batch_id, "Requesting batch"); return Ok(()); } @@ -965,7 +964,7 @@ impl BackFillSync { } RpcRequestSendError::InternalError(e) => { // NOTE: under normal conditions this shouldn't happen but we handle it anyway - warn!(%batch_id, error = ?e, 
%batch,"Could not send batch request"); + warn!(%batch_id, error = ?e, "Could not send batch request"); // register the failed download and check if the batch can be retried if let Err(e) = batch.start_downloading(1) { return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)); @@ -1097,8 +1096,8 @@ impl BackFillSync { .saturating_sub(BACKFILL_EPOCHS_PER_BATCH); self.include_next_batch(network) } - Entry::Vacant(entry) => { - entry.insert(BatchInfo::new(&batch_id, BACKFILL_EPOCHS_PER_BATCH)); + Entry::Vacant(_entry) => { + // TODO if self.would_complete(batch_id) { self.last_batch_downloaded = true; } diff --git a/beacon_node/network/src/sync/block_lookups/common.rs b/beacon_node/network/src/sync/block_lookups/common.rs index 86b6894bac4..8304e01bf03 100644 --- a/beacon_node/network/src/sync/block_lookups/common.rs +++ b/beacon_node/network/src/sync/block_lookups/common.rs @@ -4,7 +4,6 @@ use crate::sync::block_lookups::single_block_lookup::{ use crate::sync::block_lookups::{ BlobRequestState, BlockRequestState, CustodyRequestState, PeerId, }; -use crate::sync::manager::BlockProcessType; use crate::sync::network_context::{LookupRequestResult, SyncNetworkContext}; use beacon_chain::BeaconChainTypes; use lighthouse_network::service::api_types::Id; @@ -76,28 +75,20 @@ impl RequestState for BlockRequestState { fn make_request( &self, - id: SingleLookupId, - lookup_peers: Arc>>, + _id: SingleLookupId, + _lookup_peers: Arc>>, _: usize, - cx: &mut SyncNetworkContext, + _cx: &mut SyncNetworkContext, ) -> Result { - cx.block_lookup_request(id, lookup_peers, self.requested_block_root) - .map_err(LookupRequestError::SendFailedNetwork) + todo!(); } fn send_for_processing( - id: SingleLookupId, - download_result: DownloadResult, - cx: &SyncNetworkContext, + _id: SingleLookupId, + _download_result: DownloadResult, + _cx: &SyncNetworkContext, ) -> Result<(), LookupRequestError> { - let DownloadResult { - value, - block_root, - seen_timestamp, - .. 
- } = download_result; - cx.send_block_for_processing(id, block_root, value, seen_timestamp) - .map_err(LookupRequestError::SendFailedProcessor) + todo!(); } fn response_type() -> ResponseType { @@ -119,28 +110,20 @@ impl RequestState for BlobRequestState { fn make_request( &self, - id: Id, - lookup_peers: Arc>>, - expected_blobs: usize, - cx: &mut SyncNetworkContext, + _id: Id, + _lookup_peers: Arc>>, + _expected_blobs: usize, + _cx: &mut SyncNetworkContext, ) -> Result { - cx.blob_lookup_request(id, lookup_peers, self.block_root, expected_blobs) - .map_err(LookupRequestError::SendFailedNetwork) + todo!(); } fn send_for_processing( - id: Id, - download_result: DownloadResult, - cx: &SyncNetworkContext, + _id: Id, + _download_result: DownloadResult, + _cx: &SyncNetworkContext, ) -> Result<(), LookupRequestError> { - let DownloadResult { - value, - block_root, - seen_timestamp, - .. - } = download_result; - cx.send_blobs_for_processing(id, block_root, value, seen_timestamp) - .map_err(LookupRequestError::SendFailedProcessor) + todo!(); } fn response_type() -> ResponseType { @@ -167,34 +150,20 @@ impl RequestState for CustodyRequestState { fn make_request( &self, - id: Id, - lookup_peers: Arc>>, + _id: Id, + _lookup_peers: Arc>>, _: usize, - cx: &mut SyncNetworkContext, + _cx: &mut SyncNetworkContext, ) -> Result { - cx.custody_lookup_request(id, self.block_root, lookup_peers) - .map_err(LookupRequestError::SendFailedNetwork) + todo!(); } fn send_for_processing( - id: Id, - download_result: DownloadResult, - cx: &SyncNetworkContext, + _id: Id, + _download_result: DownloadResult, + _cx: &SyncNetworkContext, ) -> Result<(), LookupRequestError> { - let DownloadResult { - value, - block_root, - seen_timestamp, - .. 
- } = download_result; - cx.send_custody_columns_for_processing( - id, - block_root, - value, - seen_timestamp, - BlockProcessType::SingleCustodyColumn(id), - ) - .map_err(LookupRequestError::SendFailedProcessor) + todo!(); } fn response_type() -> ResponseType { diff --git a/beacon_node/network/src/sync/block_lookups/mod.rs b/beacon_node/network/src/sync/block_lookups/mod.rs index 652122688e3..c545facdd9c 100644 --- a/beacon_node/network/src/sync/block_lookups/mod.rs +++ b/beacon_node/network/src/sync/block_lookups/mod.rs @@ -587,22 +587,11 @@ impl BlockLookups { )] pub fn on_processing_result( &mut self, - process_type: BlockProcessType, - result: BlockProcessingResult, - cx: &mut SyncNetworkContext, + _process_type: BlockProcessType, + _result: BlockProcessingResult, + _cx: &mut SyncNetworkContext, ) { - let lookup_result = match process_type { - BlockProcessType::SingleBlock { id } => { - self.on_processing_result_inner::>(id, result, cx) - } - BlockProcessType::SingleBlob { id } => { - self.on_processing_result_inner::>(id, result, cx) - } - BlockProcessType::SingleCustodyColumn(id) => { - self.on_processing_result_inner::>(id, result, cx) - } - }; - self.on_lookup_result(process_type.id(), lookup_result, "processing_result", cx); + todo!(); } #[instrument(parent = None, diff --git a/beacon_node/network/src/sync/block_tree.rs b/beacon_node/network/src/sync/block_tree.rs new file mode 100644 index 00000000000..6bb071b13e4 --- /dev/null +++ b/beacon_node/network/src/sync/block_tree.rs @@ -0,0 +1,453 @@ +use super::network_context::{LookupRequestResult, RpcResponseError, SyncNetworkContext}; +use crate::sync::network_context::custody_by_root::ColumnRequest; +use crate::sync::network_context::{BlocksByRootSameForkRequest, RpcResponseResult}; +use crate::sync::range_sync::{BatchInfo, BatchPeers}; +use beacon_chain::block_verification_types::RpcBlock; +use beacon_chain::{BeaconChain, BeaconChainTypes}; +use lighthouse_network::rpc::BlocksByRootRequest; +use 
lighthouse_network::service::api_types::{ + BlocksByRootRequestId, BlocksByRootRequester, HeaderLookupId, Id, RangeRequestId, +}; +use lighthouse_network::PeerId; +use parking_lot::RwLock; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use tracing::debug; +use types::{BeaconBlockHeader, Epoch, ForkName, Hash256, SignedBeaconBlock, Slot}; + +pub struct BlockTree { + blocks: HashMap, + batches: HashMap>, + roots: HashMap, + tips: HashSet, + chain: Arc>, +} + +struct TreeRoot { + peers: HashSet, + request: ColumnRequest, +} + +struct Block { + id: HeaderLookupId, + block: BeaconBlockHeader, + is_syncing: bool, +} + +// TODO(tree-sync): Re-add the reprocessing cache, so we don't process twice a block that we got +// through gossip and sync. + +impl Block { + fn new(block_root: Hash256, block: BeaconBlockHeader) -> Self { + Self { + id: HeaderLookupId(block_root), + block, + is_syncing: false, + } + } + + fn start(&mut self, cx: &mut SyncNetworkContext) { + cx.block_lookup_request(self.id, &self.peers, self.id.0); + } + + fn on_error(&mut self, _e: RpcResponseError) { + todo!(); + } + + fn slot(&self) -> Option { + if let Some(block) = self.request.peek_downloaded_data() { + Some(block.slot) + } else { + None + } + } + + fn root(&self) -> Hash256 { + todo!(); + } + + fn is_syncing(&self) -> bool { + self.is_syncing + } + + fn parent_root(&self) -> Option { + if let Some(block) = self.request.peek_downloaded_data() { + Some(block.parent_root) + } else { + None + } + } + + fn parent_root_and_slot(&self) -> Option<(Hash256, Slot)> { + if let Some(block) = self.request.peek_downloaded_data() { + Some((block.parent_root, block.slot)) + } else { + None + } + } + + fn is_rooted(&self) -> bool { + todo!(); + } +} + +enum Error { + A, +} + +impl BlockTree { + pub fn new(chain: Arc>) -> Self { + Self { + blocks: <_>::default(), + batches: <_>::default(), + roots: <_>::default(), + tips: <_>::default(), + chain, + } + } + + pub fn pause(&mut self) { + todo!() + } 
+ + pub fn remove_peer(&mut self, _peer: PeerId) { + todo!(); + } + + pub fn search( + &mut self, + block_root: Hash256, + peers: &[PeerId], + cx: &mut SyncNetworkContext, + ) -> bool { + if self.blocks.contains_key(&block_root) { + // `block_root` points to a known block item in the header DAG + // Target root is the oldest known ancestor of `block_root` in the header tree + let oldest_ancestor = self.oldest_known_ancestor(block_root); + let Some(root) = self.roots.get_mut(&oldest_ancestor) else { + panic!("root node should exist"); + }; + // Add peer to the root's peer set + for peer in peers { + if root.peers.insert(peer) { + debug!(block_root = ?oldest_ancestor, ?peer, "Adding peer to existing header lookup"); + } + } + true + } else { + debug!(?block_root, ?peers, "Creating new header lookup"); + + let new_lookup_peers = HashSet::from_iter(peers); + + // If any root has a parent that points to `block_root` remove them from roots and don't + // make `block_root` node a tip + let roots_that_descend_from_new_block = self + .roots + .keys() + .filter(|root| { + if let Some(parent_root) = self + .blocks + .get(root) + .expect("node must exist") + .parent_root() + { + parent_root == block_root + } else { + false + } + }) + .copied() + .collect::>(); + + // We only remove roots that have have a known parent, so they have completed download + for block_root in roots_that_descend_from_new_block { + let root = self.roots.remove(&block_root).expect("node must exist"); + new_lookup_peers.extend(root.peers.values()); + } + + // New nodes always become roots since we don't know their parent + self.roots.insert( + block_root, + TreeRoot { + peers: new_lookup_peers, + request: ColumnRequest::new(), + }, + ); + + // If no one descends from this new node, add it to tips + if roots_that_descend_from_new_block.is_empty() { + self.tips.insert(block_root); + } + + // TODO(tree-sync): have good peer selection + let Some(peer) = lookup.peers.iter().next() else { + todo!("no peer"); + 
}; + + let req_id = cx + .send_blocks_by_root_request( + *peer, + BlocksByRootRequest::new(vec![block_root], cx.spec(), ForkName::Fulu), + BlocksByRootRequester::Header(lookup.id), + ) + .unwrap(); + + lookup.request.on_download_start(req_id).unwrap(); + + self.blocks.insert(block_root, lookup); + true + } + } + + fn oldest_known_ancestor(&self, mut block_root: Hash256) -> Hash256 { + let Some(mut parent_root) = self + .blocks + .get(&block_root) + .and_then(|lookup| lookup.parent_root()) + else { + return block_root; + }; + + loop { + if let Some(lookup) = self.blocks.get(&parent_root) { + if let Some(next_parent_root) = lookup.parent_root() { + // Continue iterating the parent chain + block_root = parent_root; + parent_root = next_parent_root; + } else { + // There's an entry for parent_root but it's not downloaded yet + return parent_root; + } + } else { + // There's no entry in the DAG for parent_root, thus block_root is the root node + return block_root; + } + } + } + + pub fn on_block( + &mut self, + req_id: BlocksByRootRequestId, + lookup_id: HeaderLookupId, + response: RpcResponseResult>>>, + peer_id: PeerId, + cx: &mut SyncNetworkContext, + ) -> Result<(), String> { + let block_root = lookup_id.0; + let Some(lookup) = self.roots.get_mut(&block_root) else { + return Err(format!("No header lookup for root {block_root}")); + }; + + match response { + Ok((blocks, received)) => { + if blocks.len() != 1 { + return Err(format!( + "Lookup {block_root} returned {} blocks expecting 1", + blocks.len() + )); + } + let block = blocks.first().expect("blocks len == 1").clone(); + + let block_header = block.message().block_header(); + let parent_root = block_header.parent_root; + + lookup + .request + .on_download_success(req_id, peer_id, block_header, received) + .unwrap(); + + // TODO(tree-sync): Should check if node already exist to not override state + self.blocks + .insert(block_root, Block::new(block_root, block_header)); + + // Once we discover the parent_root of 
this block three things can happen + // 1. The parent root is a known block -> stop + // 2. We conflicts with finality -> reject + // 3. The parent root is unknown -> continue search + + // TODO(tree-sync): should check if the block is descendant of finalized + // TODO(tree-sync): on finalization or every interval we should drop branches that + // conflict with finality + let parent_imported = self.chain.block_is_known_to_fork_choice(&parent_root); + let parent_known = self.blocks.contains_key(&parent_root); + + if parent_known { + self.tips.remove(&parent_root); + } + + let finalized_slot = Slot::new(0); + + if block_header.slot <= finalized_slot { + panic!("Block conflicts with finality"); + } + if parent_imported || parent_known { + // Stop search we reached a known block + self.mark_descendants_as_rooted(parent_root); + self.trigger_forward_sync(cx); + } else { + let lookup = self.blocks.get_mut(&block_root).expect("lookup exists"); + let peers = lookup.peers(); + self.search(parent_root, &peers, cx); + } + } + Err(e) => { + lookup.request.on_download_error(req_id).unwrap(); + lookup.start(cx); + todo!("error {e:?}"); + } + } + Ok(()) + } + + pub fn prune(&mut self) { + // Prune blocks once imported, and once finality advances + } + + pub fn prune_root(&mut self, _block_root: Hash256, _imported: bool) { + todo!(); + } + + fn mark_descendants_as_rooted(&mut self, _block_root: Hash256) { + // TODO: iterate all blocks and mark descendants of `block_root` as rooted + } + + fn mark_as_syncing(&mut self, _blocks: &[Hash256]) { + // TODO: mark all this block entries as syncing + } + + fn collect_ancestors(&self, mut block_root: Hash256) -> Vec { + let mut ancestors = vec![]; + while let Some(block) = self.blocks.get(&block_root) { + ancestors.push(block_root); + if let Some(parent_root) = block.parent_root() { + block_root = parent_root; + } else { + break; + } + } + ancestors + } + + fn trigger_forward_sync(&mut self, cx: &mut SyncNetworkContext) { + // Find the 
block range with most peers and highest slot. This is the block + // to be used as tip of the chain of blocks to fetch. + let Some(block_root) = self + .blocks + .iter() + .filter_map(|(root, block)| { + // Ignore blocks that are already being forward synced + if block.is_syncing() { + return None; + } + // Ignore block roots which header is not downloaded yet + let Some((parent_root, slot)) = block.parent_root_and_slot() else { + return None; + }; + // Check if the parent is known in the header tree + if let Some(slot) = block.slot() { + // Find highest peer count, then slot + Some((block.peer_count(), slot, root)) + } else { + None + } + }) + .max() + .map(|(_, _, root)| *root) + else { + return; + }; + + // Get the chain of ancestors of that block_root. Because they are ancestors + // of block_root all these blocks have the same peer count as `block_root`. + // Consider limiting the length of blocks so some sensible number to not sync + // too much at once. There's no good reason to do a big fetch at once. + let blocks = self.collect_ancestors(block_root); + self.mark_as_syncing(&blocks); + + // TODO: We can sync parallel chains at once here, if we have multiple chains + // rooted in different places + let peers = self + .blocks + .get(&block_root) + .expect("block for block_root should exist") + .peers(); + + self.forward_sync_blocks(&blocks, &peers, cx) + } + + fn forward_sync_blocks( + &mut self, + blocks: &[Hash256], + peers: &[PeerId], + cx: &mut SyncNetworkContext, + ) { + // Create a batch with this blocks + // Trigger batch sync + + let headers = blocks + .iter() + .map(|root| { + self.blocks + .get(root) + .expect("block should exist") + .request + .peek_downloaded_data() + .expect("header should be downloaded") + .clone() + }) + .collect::>(); + + // TODO(tree-sync): only choose ranges of blocks in the same fork + let first_header = headers.first().unwrap(); + let fork = cx.spec().fork_name_at_slot::(first_header.slot); + + // Create batch here? 
+ let mut batch = BatchInfo::new(blocks.to_vec()); + + let request = BlocksByRootSameForkRequest { + block_roots: batch + .to_blocks_by_root_request(cx.spec()) + .block_roots() + .to_vec(), + fork, + }; + let chain_id = cx.next_id(); + let requester = RangeRequestId::RangeSync { + chain_id, + batch_id: Epoch::new(0), + }; + let peers = Arc::new(RwLock::new(HashSet::from_iter(peers.iter().copied()))); + let failed_peers = HashSet::new(); + + let id = + match cx.block_components_by_range_request(request, requester, peers, &failed_peers) { + Ok(req_id) => { + // TODO: Update batch state + batch.start_downloading(req_id); + self.batches.insert(chain_id, batch); + } + Err(e) => { + // Log failed chain, mark blocks as not syncing + } + }; + } + + pub fn on_blocks_response( + &mut self, + batch_id: Id, + blocks: Vec>, + batch_peers: BatchPeers, + ) { + let Some(batch) = self.batches.get_mut(&batch_id) else { + panic!("Unknown batch id {batch_id}"); + }; + + let received = batch + .download_completed(blocks, batch_peers) + .map_err(|e| e.0) + .unwrap(); + debug!(%batch_id, blocks = received, "Batch downloaded"); + + // Continue batches + } +} diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 94599a072ee..e36e462c1b8 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -34,7 +34,7 @@ //! search for the block and subsequently search for parents if needed. 
use super::backfill_sync::{BackFillSync, ProcessResult, SyncStart}; -use super::block_lookups::BlockLookups; +use super::block_tree::BlockTree; use super::network_context::{ CustodyRequestResult, RangeBlockComponent, RangeRequestId, RpcEvent, SyncNetworkContext, }; @@ -46,23 +46,19 @@ use crate::network_beacon_processor::{ }; use crate::service::NetworkMessage; use crate::status::ToStatusMessage; -use crate::sync::block_lookups::{ - BlobRequestState, BlockComponent, BlockRequestState, CustodyRequestState, DownloadResult, -}; -use crate::sync::network_context::PeerGroup; use crate::sync::range_sync::BATCH_BUFFER_SIZE; use beacon_chain::block_verification_types::AsBlock; -use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::{ AvailabilityProcessingStatus, BeaconChain, BeaconChainTypes, BlockError, EngineState, }; use futures::StreamExt; use lighthouse_network::rpc::RPCError; use lighthouse_network::service::api_types::{ - BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, - CustodyByRangeRequestId, CustodyRequester, DataColumnsByRangeRequestId, + BlobsByRangeRequestId, BlobsByRootRequestId, BlocksByRangeRequestId, BlocksByRootRequestId, + BlocksByRootRequester, ComponentsByRangeRequestId, CustodyByRangeRequestId, + CustodyByRootRequestId, CustodyRequester, DataColumnsByRangeRequestId, DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, SamplingId, SamplingRequester, - SingleLookupReqId, SyncRequestId, + SyncRequestId, }; use lighthouse_network::types::{NetworkGlobals, SyncState}; use lighthouse_network::PeerId; @@ -170,12 +166,6 @@ pub enum SyncMessage { result: BatchProcessResult, }, - /// Block processed - BlockComponentProcessed { - process_type: BlockProcessType, - result: BlockProcessingResult, - }, - /// Sample data column verified SampleVerified { id: SamplingId, @@ -189,16 +179,16 @@ pub enum SyncMessage { /// The type of processing specified for a received block. 
#[derive(Debug, Clone)] pub enum BlockProcessType { - SingleBlock { id: Id }, - SingleBlob { id: Id }, + BlocksByRoot { id: Id }, + BlobsByRoot { id: Id }, SingleCustodyColumn(Id), } impl BlockProcessType { pub fn id(&self) -> Id { match self { - BlockProcessType::SingleBlock { id } - | BlockProcessType::SingleBlob { id } + BlockProcessType::BlocksByRoot { id } + | BlockProcessType::BlobsByRoot { id } | BlockProcessType::SingleCustodyColumn(id) => *id, } } @@ -248,13 +238,14 @@ pub struct SyncManager { /// Backfill syncing. backfill_sync: BackFillSync, - block_lookups: BlockLookups, /// debounce duplicated `UnknownBlockHashFromAttestation` for the same root peer tuple. A peer /// may forward us thousands of a attestations, each one triggering an individual event. Only /// one event is useful, the rest generating log noise and wasted cycles notified_unknown_roots: LRUTimeCache<(PeerId, Hash256)>, sampling: Sampling, + + block_tree: BlockTree, } /// Spawns a new `SyncManager` thread which has a weak reference to underlying beacon @@ -318,36 +309,32 @@ impl SyncManager { ), range_sync: RangeSync::new(beacon_chain.clone(), batch_buffer_size), backfill_sync: BackFillSync::new(beacon_chain.clone(), network_globals), - block_lookups: BlockLookups::new(), notified_unknown_roots: LRUTimeCache::new(Duration::from_secs( NOTIFIED_UNKNOWN_ROOT_EXPIRY_SECONDS, )), sampling: Sampling::new(sampling_config), + block_tree: BlockTree::new(beacon_chain.clone()), } } #[cfg(test)] pub(crate) fn active_single_lookups(&self) -> Vec { - self.block_lookups.active_single_lookups() + todo!(); } #[cfg(test)] pub(crate) fn active_parent_lookups(&self) -> Vec> { - self.block_lookups - .active_parent_lookups() - .iter() - .map(|c| c.chain.clone()) - .collect() + todo!(); } #[cfg(test)] pub(crate) fn get_failed_chains(&mut self) -> Vec { - self.block_lookups.get_failed_chains() + todo!(); } #[cfg(test)] - pub(crate) fn insert_failed_chain(&mut self, block_root: Hash256) { - 
self.block_lookups.insert_failed_chain(block_root); + pub(crate) fn insert_failed_chain(&mut self, _block_root: Hash256) { + todo!(); } #[cfg(test)] @@ -406,34 +393,22 @@ impl SyncManager { finalized_root: status.finalized_root, }; + debug!(?remote, ?local, "new peer"); + if !self.chain.block_is_known_to_fork_choice(&remote.head_root) + && remote.head_slot + > status + .finalized_epoch + .start_slot(T::EthSpec::slots_per_epoch()) + { + self.block_tree + .search(remote.head_root, &[peer_id], &mut self.network); + } + let sync_type = remote_sync_type(&local, &remote, &self.chain); // update the state of the peer. let is_still_connected = self.update_peer_sync_state(&peer_id, &local, &remote, &sync_type); if is_still_connected { - match sync_type { - PeerSyncType::Behind => {} // Do nothing - PeerSyncType::Advanced => { - self.range_sync - .add_peer(&mut self.network, local, peer_id, remote); - } - PeerSyncType::FullySynced => { - // Sync considers this peer close enough to the head to not trigger range sync. - // Range sync handles well syncing large ranges of blocks, of a least a few blocks. - // However this peer may be in a fork that we should sync but we have not discovered - // yet. If the head of the peer is unknown, attempt block lookup first. If the - // unknown head turns out to be on a longer fork, it will trigger range sync. - // - // A peer should always be considered `Advanced` if its finalized root is - // unknown and ahead of ours, so we don't check for that root here. 
- // - // TODO: This fork-choice check is potentially duplicated, review code - if !self.chain.block_is_known_to_fork_choice(&remote.head_root) { - self.handle_unknown_block_root(peer_id, remote.head_root); - } - } - } - match sync_type { PeerSyncType::Behind => {} PeerSyncType::Advanced | PeerSyncType::FullySynced => { @@ -505,11 +480,11 @@ impl SyncManager { fn inject_error(&mut self, peer_id: PeerId, sync_request_id: SyncRequestId, error: RPCError) { trace!("Sync manager received a failed RPC"); match sync_request_id { - SyncRequestId::SingleBlock { id } => { - self.on_single_block_response(id, peer_id, RpcEvent::RPCError(error)) + SyncRequestId::BlocksByRoot(req_id) => { + self.on_blocks_by_root_response(req_id, peer_id, RpcEvent::RPCError(error)) } - SyncRequestId::SingleBlob { id } => { - self.on_single_blob_response(id, peer_id, RpcEvent::RPCError(error)) + SyncRequestId::BlobsByRoot(req_id) => { + self.on_blobs_by_root_response(req_id, peer_id, RpcEvent::RPCError(error)) } SyncRequestId::DataColumnsByRoot(req_id) => { self.on_data_columns_by_root_response(req_id, peer_id, RpcEvent::RPCError(error)) @@ -541,7 +516,7 @@ impl SyncManager { // Remove peer from all data structures self.range_sync.peer_disconnect(&mut self.network, peer_id); self.backfill_sync.peer_disconnected(peer_id); - self.block_lookups.peer_disconnected(peer_id); + self.block_tree.remove_peer(*peer_id); // Regardless of the outcome, we update the sync status. 
self.update_sync_state(); @@ -752,7 +727,7 @@ impl SyncManager { self.handle_new_execution_engine_state(engine_state); } _ = prune_lookups_interval.tick() => { - self.block_lookups.prune_lookups(); + self.block_tree.prune(); } _ = prune_requests.tick() => { self.prune_requests(); @@ -809,54 +784,24 @@ impl SyncManager { let block_slot = block.slot(); let parent_root = block.parent_root(); debug!(%block_root, %parent_root, "Received unknown parent block message"); - self.handle_unknown_parent( - peer_id, - block_root, - parent_root, - block_slot, - BlockComponent::Block(DownloadResult { - value: block.block_cloned(), - block_root, - seen_timestamp: timestamp_now(), - peer_group: PeerGroup::from_single(peer_id), - }), - ); + self.handle_unknown_parent(peer_id, block_root, parent_root, block_slot); + // TODO(tree-sync): Consider caching this block somewhere for re-processing } SyncMessage::UnknownParentBlob(peer_id, blob) => { let blob_slot = blob.slot(); let block_root = blob.block_root(); let parent_root = blob.block_parent_root(); debug!(%block_root, %parent_root, "Received unknown parent blob message"); - self.handle_unknown_parent( - peer_id, - block_root, - parent_root, - blob_slot, - BlockComponent::Blob(DownloadResult { - value: blob, - block_root, - seen_timestamp: timestamp_now(), - peer_group: PeerGroup::from_single(peer_id), - }), - ); + self.handle_unknown_parent(peer_id, block_root, parent_root, blob_slot); + // TODO(tree-sync): Consider caching this blob somewhere for re-processing } SyncMessage::UnknownParentDataColumn(peer_id, data_column) => { let data_column_slot = data_column.slot(); let block_root = data_column.block_root(); let parent_root = data_column.block_parent_root(); debug!(%block_root, %parent_root, "Received unknown parent data column message"); - self.handle_unknown_parent( - peer_id, - block_root, - parent_root, - data_column_slot, - BlockComponent::DataColumn(DownloadResult { - value: data_column, - block_root, - seen_timestamp: 
timestamp_now(), - peer_group: PeerGroup::from_single(peer_id), - }), - ); + self.handle_unknown_parent(peer_id, block_root, parent_root, data_column_slot); + // TODO(tree-sync): Consider caching this column somewhere for re-processing } SyncMessage::UnknownBlockHashFromAttestation(peer_id, block_root) => { if !self.notified_unknown_roots.contains(&(peer_id, block_root)) { @@ -883,20 +828,10 @@ impl SyncManager { sync_request_id, error, } => self.inject_error(peer_id, sync_request_id, error), - SyncMessage::BlockComponentProcessed { - process_type, - result, - } => self - .block_lookups - .on_processing_result(process_type, result, &mut self.network), SyncMessage::GossipBlockProcessResult { block_root, imported, - } => self.block_lookups.on_external_processing_result( - block_root, - imported, - &mut self.network, - ), + } => self.block_tree.prune_root(block_root, imported), SyncMessage::BatchProcessed { sync_type, result } => match sync_type { ChainSegmentProcessId::RangeBatchId(chain_id, epoch) => { self.range_sync.handle_block_process_result( @@ -940,16 +875,13 @@ impl SyncManager { block_root: Hash256, parent_root: Hash256, slot: Slot, - block_component: BlockComponent, ) { match self.should_search_for_block(Some(slot), &peer_id) { Ok(_) => { - if self.block_lookups.search_child_and_parent( - block_root, - block_component, - peer_id, - &mut self.network, - ) { + if self + .block_tree + .search(block_root, &[peer_id], &mut self.network) + { // Lookup created. No need to log here it's logged in `new_current_lookup` } else { debug!( @@ -968,11 +900,10 @@ impl SyncManager { fn handle_unknown_block_root(&mut self, peer_id: PeerId, block_root: Hash256) { match self.should_search_for_block(None, &peer_id) { Ok(_) => { - if self.block_lookups.search_unknown_block( - block_root, - &[peer_id], - &mut self.network, - ) { + if self + .block_tree + .search(block_root, &[peer_id], &mut self.network) + { // Lookup created. 
No need to log here it's logged in `new_current_lookup` } else { debug!(?block_root, "No lookup created for unknown block"); @@ -1045,8 +976,8 @@ impl SyncManager { // - Block lookups: // Disabled while in this state. We drop current requests and don't search for new // blocks. - let dropped_single_blocks_requests = - self.block_lookups.drop_single_block_requests(); + // TODO(tree-sync): should we pause it instead? + self.block_tree.pause(); // - Range: // We still send found peers to range so that it can keep track of potential chains @@ -1056,12 +987,7 @@ impl SyncManager { // - Backfill: Not affected by ee states, nothing to do. // Some logs. - if dropped_single_blocks_requests > 0 { - debug!( - dropped_single_blocks_requests, - "Execution engine not online. Dropping active requests." - ); - } + debug!("Execution engine not online. Stopping active sync requests."); } } } @@ -1074,7 +1000,7 @@ impl SyncManager { seen_timestamp: Duration, ) { match sync_request_id { - SyncRequestId::SingleBlock { id } => self.on_single_block_response( + SyncRequestId::BlocksByRoot(id) => self.on_blocks_by_root_response( id, peer_id, RpcEvent::from_chunk(block, seen_timestamp), @@ -1090,21 +1016,45 @@ impl SyncManager { } } - fn on_single_block_response( + fn on_blocks_by_root_response( &mut self, - id: SingleLookupReqId, + req_id: BlocksByRootRequestId, peer_id: PeerId, block: RpcEvent>>, ) { - if let Some(resp) = self.network.on_single_block_response(id, peer_id, block) { - self.block_lookups - .on_download_response::>( - id, - resp.map(|(value, seen_timestamp)| { - (value, PeerGroup::from_single(peer_id), seen_timestamp) - }), - &mut self.network, - ) + if let Some(result) = self + .network + .on_blocks_by_root_response(req_id, peer_id, block) + { + match req_id.parent_request_id { + BlocksByRootRequester::Header(lookup_id) => { + self.block_tree + .on_block(req_id, lookup_id, result, peer_id, &mut self.network); + } + BlocksByRootRequester::RangeSync(batch_id) => { + 
self.on_block_components_by_range_response( + batch_id, + RangeBlockComponent::Block(req_id, result, peer_id), + ); + } + } + } + } + + fn on_blobs_by_root_response( + &mut self, + req_id: BlobsByRootRequestId, + peer_id: PeerId, + block: RpcEvent>>, + ) { + if let Some(result) = self + .network + .on_blobs_by_root_response(req_id, peer_id, block) + { + self.on_block_components_by_range_response( + req_id.parent_request_id, + RangeBlockComponent::Blob(req_id, result, peer_id), + ); } } @@ -1116,11 +1066,7 @@ impl SyncManager { seen_timestamp: Duration, ) { match sync_request_id { - SyncRequestId::SingleBlob { id } => self.on_single_blob_response( - id, - peer_id, - RpcEvent::from_chunk(blob, seen_timestamp), - ), + SyncRequestId::BlobsByRoot { .. } => todo!(), SyncRequestId::BlobsByRange(id) => self.on_blobs_by_range_response( id, peer_id, @@ -1158,24 +1104,6 @@ impl SyncManager { } } - fn on_single_blob_response( - &mut self, - id: SingleLookupReqId, - peer_id: PeerId, - blob: RpcEvent>>, - ) { - if let Some(resp) = self.network.on_single_blob_response(id, peer_id, blob) { - self.block_lookups - .on_download_response::>( - id, - resp.map(|(value, seen_timestamp)| { - (value, PeerGroup::from_single(peer_id), seen_timestamp) - }), - &mut self.network, - ) - } - } - fn on_data_columns_by_root_response( &mut self, req_id: DataColumnsByRootRequestId, @@ -1200,7 +1128,7 @@ impl SyncManager { .network .on_custody_by_root_response(custody_id, req_id, peer_id, resp) { - self.on_custody_by_root_result(custody_id.requester, result); + self.on_custody_by_root_result(custody_id, result); } } } @@ -1214,10 +1142,7 @@ impl SyncManager { block: RpcEvent>>, ) { if let Some(resp) = self.network.on_blocks_by_range_response(id, peer_id, block) { - self.on_block_components_by_range_response( - id.parent_request_id, - RangeBlockComponent::Block(id, resp, peer_id), - ); + todo!(); } } @@ -1228,10 +1153,7 @@ impl SyncManager { blob: RpcEvent>>, ) { if let Some(resp) = 
self.network.on_blobs_by_range_response(id, peer_id, blob) { - self.on_block_components_by_range_response( - id.parent_request_id, - RangeBlockComponent::Blob(id, resp, peer_id), - ); + todo!(); } } @@ -1257,26 +1179,21 @@ impl SyncManager { fn on_custody_by_range_result( &mut self, - id: CustodyByRangeRequestId, - result: CustodyRequestResult, + _id: CustodyByRangeRequestId, + _result: CustodyRequestResult, ) { - self.on_block_components_by_range_response( - id.parent_request_id, - RangeBlockComponent::CustodyColumns(id, result), - ); + todo!(); } fn on_custody_by_root_result( &mut self, - requester: CustodyRequester, - response: CustodyRequestResult, + id: CustodyByRootRequestId, + result: CustodyRequestResult, ) { - self.block_lookups - .on_download_response::>( - requester.0, - response, - &mut self.network, - ); + self.on_block_components_by_range_response( + id.parent_request_id, + RangeBlockComponent::CustodyColumns(id, result), + ); } fn on_sampling_result(&mut self, requester: SamplingRequester, result: SamplingResult) { @@ -1319,14 +1236,8 @@ impl SyncManager { Ok((blocks, batch_peers)) => { match range_request_id.requester { RangeRequestId::RangeSync { chain_id, batch_id } => { - self.range_sync.blocks_by_range_response( - &mut self.network, - batch_peers, - chain_id, - batch_id, - range_request_id.id, - blocks, - ); + self.block_tree + .on_blocks_response(chain_id, blocks, batch_peers); self.update_sync_state(); } RangeRequestId::BackfillSync { batch_id } => { diff --git a/beacon_node/network/src/sync/mod.rs b/beacon_node/network/src/sync/mod.rs index 97302df04e8..b81545573bd 100644 --- a/beacon_node/network/src/sync/mod.rs +++ b/beacon_node/network/src/sync/mod.rs @@ -3,6 +3,7 @@ //! Stores the various syncing methods for the beacon chain. 
mod backfill_sync; mod block_lookups; +mod block_tree; pub mod manager; mod network_context; mod peer_sampling; diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index dd141286ff2..85e886c4885 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -3,8 +3,6 @@ use self::custody_by_range::ActiveCustodyByRangeRequest; use self::custody_by_root::ActiveCustodyByRootRequest; -pub use self::requests::{BlocksByRootSingleRequest, DataColumnsByRootSingleBlockRequest}; -use super::manager::BlockProcessType; use super::range_sync::BatchPeers; use super::SyncMessage; use crate::metrics; @@ -14,20 +12,25 @@ use crate::network_beacon_processor::TestBeaconChainType; use crate::service::NetworkMessage; use crate::status::ToStatusMessage; use crate::sync::block_lookups::SingleLookupId; -use crate::sync::network_context::requests::BlobsByRootSingleBlockRequest; +use crate::sync::network_context::requests::BlobCountPerBlock; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessStatus, EngineState}; pub use block_components_by_range::BlockComponentsByRangeRequest; #[cfg(test)] pub use block_components_by_range::BlockComponentsByRangeRequestStep; use fnv::FnvHashMap; -use lighthouse_network::rpc::methods::{BlobsByRangeRequest, DataColumnsByRangeRequest}; +use lighthouse_network::rpc::methods::{ + BlobsByRangeRequest, BlobsByRootRequest, BlocksByRootRequest, DataColumnsByRangeRequest, + DataColumnsByRootRequest, +}; use lighthouse_network::rpc::{BlocksByRangeRequest, GoodbyeReason, RPCError, RequestType}; pub use lighthouse_network::service::api_types::RangeRequestId; use lighthouse_network::service::api_types::{ - AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, - CustodyByRangeRequestId, CustodyId, CustodyRequester, DataColumnsByRangeRequestId, - 
DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, SingleLookupReqId, SyncRequestId, + AppRequestId, BlobsByRangeRequestId, BlobsByRootRequestId, BlocksByRangeRequestId, + BlocksByRootRequestId, BlocksByRootRequester, ComponentsByRangeRequestId, + CustodyByRangeRequestId, CustodyByRootRequestId, CustodyRequester, DataColumnsByRangeRequestId, + DataColumnsByRootRequestId, DataColumnsByRootRequester, HeaderLookupId, Id, SingleLookupReqId, + SyncRequestId, }; use lighthouse_network::{Client, NetworkGlobals, PeerAction, PeerId, ReportSource}; use parking_lot::RwLock; @@ -45,11 +48,12 @@ use std::time::Duration; #[cfg(test)] use task_executor::TaskExecutor; use tokio::sync::mpsc; -use tracing::{debug, error, span, warn, Level}; +use tracing::{debug, span, warn, Level}; use types::blob_sidecar::FixedBlobSidecarList; use types::{ - BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, - ForkContext, Hash256, SignedBeaconBlock, SignedBeaconBlockHeader, Slot, + BlobIdentifier, BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, + DataColumnsByRootIdentifier, EthSpec, ForkContext, ForkName, Hash256, RuntimeVariableList, + SignedBeaconBlock, SignedBeaconBlockHeader, Slot, }; pub mod block_components_by_range; @@ -180,6 +184,12 @@ pub enum LookupRequestResult { Pending(&'static str), } +#[derive(Clone)] +pub struct BlocksByRootSameForkRequest { + pub block_roots: Vec, + pub fork: ForkName, +} + /// Wraps a Network channel to employ various RPC related network functionality for the Sync manager. This includes management of a global RPC request Id. pub struct SyncNetworkContext { /// The network channel to relay messages to the Network service. @@ -190,9 +200,10 @@ pub struct SyncNetworkContext { /// A mapping of active BlocksByRoot requests, including both current slot and parent lookups. 
blocks_by_root_requests: - ActiveRequests>, + ActiveRequests>, /// A mapping of active BlobsByRoot requests, including both current slot and parent lookups. - blobs_by_root_requests: ActiveRequests>, + blobs_by_root_requests: + ActiveRequests>, /// A mapping of active DataColumnsByRoot requests data_columns_by_root_requests: ActiveRequests>, @@ -207,7 +218,7 @@ pub struct SyncNetworkContext { ActiveRequests>, /// Mapping of active custody column by root requests for a block root - custody_by_root_requests: FnvHashMap>, + custody_by_root_requests: FnvHashMap>, /// Mapping of active custody column by range requests custody_by_range_requests: FnvHashMap>, @@ -231,16 +242,16 @@ pub struct SyncNetworkContext { /// Small enumeration to make dealing with block and blob requests easier. pub enum RangeBlockComponent { Block( - BlocksByRangeRequestId, + BlocksByRootRequestId, RpcResponseResult>>>, PeerId, ), Blob( - BlobsByRangeRequestId, + BlobsByRootRequestId, RpcResponseResult>>>, PeerId, ), - CustodyColumns(CustodyByRangeRequestId, CustodyRequestResult), + CustodyColumns(CustodyByRootRequestId, CustodyRequestResult), } #[cfg(test)] @@ -344,10 +355,10 @@ impl SyncNetworkContext { let blocks_by_root_ids = blocks_by_root_requests .active_requests() - .map(|(id, peer)| (SyncRequestId::SingleBlock { id: *id }, peer)); + .map(|(id, peer)| (SyncRequestId::BlocksByRoot(*id), peer)); let blobs_by_root_ids = blobs_by_root_requests .active_requests() - .map(|(id, peer)| (SyncRequestId::SingleBlob { id: *id }, peer)); + .map(|(id, peer)| (SyncRequestId::BlobsByRoot(*id), peer)); let data_column_by_root_ids = data_columns_by_root_requests .active_requests() .map(|(id, peer)| (SyncRequestId::DataColumnsByRoot(*id), peer)); @@ -477,7 +488,7 @@ impl SyncNetworkContext { /// A blocks by range request sent by the range sync algorithm pub fn block_components_by_range_request( &mut self, - request: BlocksByRangeRequest, + request: BlocksByRootSameForkRequest, requester: RangeRequestId, 
peers: Arc>>, peers_to_deprioritize: &HashSet, @@ -501,13 +512,12 @@ impl SyncNetworkContext { /// Returns false if no request was made, because the block is already imported pub fn block_lookup_request( &mut self, - lookup_id: SingleLookupId, - lookup_peers: Arc>>, + parent_request_id: HeaderLookupId, + lookup_peers: &HashSet, block_root: Hash256, - ) -> Result { + ) -> Result, RpcRequestSendError> { let active_request_count_by_peer = self.active_request_count_by_peer(); let Some(peer_id) = lookup_peers - .read() .iter() .map(|peer| { ( @@ -557,146 +567,7 @@ impl SyncNetworkContext { } } - let id = SingleLookupReqId { - lookup_id, - req_id: self.next_id(), - }; - - let request = BlocksByRootSingleRequest(block_root); - - // Lookup sync event safety: If network_send.send() returns Ok(_) we are guaranteed that - // eventually at least one this 3 events will be received: - // - StreamTermination(request_id): handled by `Self::on_single_block_response` - // - RPCError(request_id): handled by `Self::on_single_block_response` - // - Disconnect(peer_id) handled by `Self::peer_disconnected``which converts it to a - // ` RPCError(request_id)`event handled by the above method - self.network_send - .send(NetworkMessage::SendRequest { - peer_id, - request: RequestType::BlocksByRoot(request.into_request(&self.fork_context)), - app_request_id: AppRequestId::Sync(SyncRequestId::SingleBlock { id }), - }) - .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; - - debug!( - method = "BlocksByRoot", - ?block_root, - peer = %peer_id, - %id, - "Sync RPC request sent" - ); - - self.blocks_by_root_requests.insert( - id, - peer_id, - // true = enforce max_requests as returned for blocks_by_root. We always request a single - // block and the peer must have it. - true, - BlocksByRootRequestItems::new(request), - ); - - Ok(LookupRequestResult::RequestSent(id.req_id)) - } - - /// Request necessary blobs for `block_root`. 
Requests only the necessary blobs by checking: - /// - If we have a downloaded but not yet processed block - /// - If the da_checker has a pending block - /// - If the da_checker has pending blobs from gossip - /// - /// Returns false if no request was made, because we don't need to import (more) blobs. - pub fn blob_lookup_request( - &mut self, - lookup_id: SingleLookupId, - lookup_peers: Arc>>, - block_root: Hash256, - expected_blobs: usize, - ) -> Result { - let active_request_count_by_peer = self.active_request_count_by_peer(); - let Some(peer_id) = lookup_peers - .read() - .iter() - .map(|peer| { - ( - // Prefer peers with less overall requests - active_request_count_by_peer.get(peer).copied().unwrap_or(0), - // Random factor to break ties, otherwise the PeerID breaks ties - rand::random::(), - peer, - ) - }) - .min() - .map(|(_, _, peer)| *peer) - else { - // Allow lookup to not have any peers and do nothing. This is an optimization to not - // lose progress of lookups created from a block with unknown parent before we receive - // attestations for said block. - // Lookup sync event safety: If a lookup requires peers to make progress, and does - // not receive any new peers for some time it will be dropped. If it receives a new - // peer it must attempt to make progress. 
- return Ok(LookupRequestResult::Pending("no peers")); - }; - - let span = span!( - Level::INFO, - "SyncNetworkContext", - service = "network_context" - ); - let _enter = span.enter(); - - let imported_blob_indexes = self - .chain - .data_availability_checker - .cached_blob_indexes(&block_root) - .unwrap_or_default(); - // Include only the blob indexes not yet imported (received through gossip) - let indices = (0..expected_blobs as u64) - .filter(|index| !imported_blob_indexes.contains(index)) - .collect::>(); - - if indices.is_empty() { - // No blobs required, do not issue any request - return Ok(LookupRequestResult::NoRequestNeeded("no indices to fetch")); - } - - let id = SingleLookupReqId { - lookup_id, - req_id: self.next_id(), - }; - - let request = BlobsByRootSingleBlockRequest { - block_root, - indices: indices.clone(), - }; - - // Lookup sync event safety: Refer to `Self::block_lookup_request` `network_send.send` call - self.network_send - .send(NetworkMessage::SendRequest { - peer_id, - request: RequestType::BlobsByRoot(request.clone().into_request(&self.fork_context)), - app_request_id: AppRequestId::Sync(SyncRequestId::SingleBlob { id }), - }) - .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; - - debug!( - method = "BlobsByRoot", - ?block_root, - blob_indices = ?indices, - peer = %peer_id, - %id, - "Sync RPC request sent" - ); - - self.blobs_by_root_requests.insert( - id, - peer_id, - // true = enforce max_requests are returned for blobs_by_root. We only issue requests for - // blocks after we know the block has data, and only request peers after they claim to - // have imported the block+blobs. - true, - BlobsByRootRequestItems::new(request), - ); - - Ok(LookupRequestResult::RequestSent(id.req_id)) + todo!(); } /// Request to send a single `data_columns_by_root` request to the network. 
@@ -704,7 +575,8 @@ impl SyncNetworkContext { &mut self, requester: DataColumnsByRootRequester, peer_id: PeerId, - request: DataColumnsByRootSingleBlockRequest, + block_roots: Vec, + indices: Vec, expect_max_responses: bool, ) -> Result, &'static str> { let span = span!( @@ -719,21 +591,28 @@ impl SyncNetworkContext { requester, }; + let request = DataColumnsByRootRequest::new( + block_roots + .iter() + .map(|block_root| DataColumnsByRootIdentifier { + block_root: *block_root, + columns: RuntimeVariableList::from_vec(indices.clone(), usize::MAX), + }) + .collect(), + usize::MAX, + ); + self.send_network_msg(NetworkMessage::SendRequest { peer_id, - request: RequestType::DataColumnsByRoot( - request - .clone() - .try_into_request(self.fork_context.current_fork(), &self.chain.spec)?, - ), + request: RequestType::DataColumnsByRoot(request), app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRoot(id)), })?; debug!( method = "DataColumnsByRoot", - block_root = ?request.block_root, - indices = ?request.indices, peer = %peer_id, + ?block_roots, + ?indices, %id, "Sync RPC request sent" ); @@ -742,7 +621,7 @@ impl SyncNetworkContext { id, peer_id, expect_max_responses, - DataColumnsByRootRequestItems::new(request), + DataColumnsByRootRequestItems::new(block_roots, indices), ); Ok(LookupRequestResult::RequestSent(id)) @@ -752,12 +631,12 @@ impl SyncNetworkContext { /// any request to the network if no columns have to be fetched based on the import state of the /// node. A custody request is a "super request" that may trigger 0 or more `data_columns_by_root` /// requests. 
- pub fn custody_lookup_request( + pub fn send_custody_by_root_request( &mut self, - lookup_id: SingleLookupId, - block_root: Hash256, + parent_request_id: ComponentsByRangeRequestId, + request: BlocksByRootRequest, lookup_peers: Arc>>, - ) -> Result { + ) -> Result { let span = span!( Level::INFO, "SyncNetworkContext", @@ -765,42 +644,22 @@ impl SyncNetworkContext { ); let _enter = span.enter(); - let custody_indexes_imported = self - .chain - .data_availability_checker - .cached_data_column_indexes(&block_root) - .unwrap_or_default(); + let id = CustodyByRootRequestId { parent_request_id }; + debug!( + %id, + "Starting custody columns request" + ); - // Include only the blob indexes not yet imported (received through gossip) - let custody_indexes_to_fetch = self + let custody_indices = self .network_globals() .sampling_columns() .into_iter() - .filter(|index| !custody_indexes_imported.contains(index)) .collect::>(); - if custody_indexes_to_fetch.is_empty() { - // No indexes required, do not issue any request - return Ok(LookupRequestResult::NoRequestNeeded("no indices to fetch")); - } - - let id = SingleLookupReqId { - lookup_id, - req_id: self.next_id(), - }; - - debug!( - ?block_root, - indices = ?custody_indexes_to_fetch, - %id, - "Starting custody columns request" - ); - - let requester = CustodyRequester(id); let mut request = ActiveCustodyByRootRequest::new( - block_root, - CustodyId { requester }, - &custody_indexes_to_fetch, + request.block_roots().to_vec(), + id, + &custody_indices, lookup_peers, ); @@ -810,13 +669,56 @@ impl SyncNetworkContext { // Ignoring the result of `continue_requests` is okay. A request that has just been // created cannot return data immediately, it must send some request to the network // first. And there must exist some request, `custody_indexes_to_fetch` is not empty. 
- self.custody_by_root_requests.insert(requester, request); - Ok(LookupRequestResult::RequestSent(id.req_id)) + self.custody_by_root_requests.insert(id, request); + Ok(id) } Err(e) => Err(e.into()), } } + pub fn send_blocks_by_root_request( + &mut self, + peer_id: PeerId, + request: BlocksByRootRequest, + parent_request_id: BlocksByRootRequester, + ) -> Result { + let id = BlocksByRootRequestId { + id: self.next_id(), + parent_request_id, + }; + + // Lookup sync event safety: If network_send.send() returns Ok(_) we are guaranteed that + // eventually at least one this 3 events will be received: + // - StreamTermination(request_id): handled by `Self::on_single_block_response` + // - RPCError(request_id): handled by `Self::on_single_block_response` + // - Disconnect(peer_id) handled by `Self::peer_disconnected``which converts it to a + // ` RPCError(request_id)`event handled by the above method + self.network_send + .send(NetworkMessage::SendRequest { + peer_id, + request: RequestType::BlocksByRoot(request.clone().into()), + app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRoot(id)), + }) + .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; + + debug!( + method = "BlocksByRoot", + peer = %peer_id, + %id, + "Sync RPC request sent" + ); + + self.blocks_by_root_requests.insert( + id, + peer_id, + // true = enforce max_requests as returned for blocks_by_root. 
We always request from + // peers to claim to have these blocks + true, + BlocksByRootRequestItems::new(request), + ); + Ok(id) + } + fn send_blocks_by_range_request( &mut self, peer_id: PeerId, @@ -855,6 +757,56 @@ impl SyncNetworkContext { Ok(id) } + fn send_blobs_by_root_request( + &mut self, + peer_id: PeerId, + request: BlobCountPerBlock, + parent_request_id: ComponentsByRangeRequestId, + ) -> Result { + let id = BlobsByRootRequestId { + id: self.next_id(), + parent_request_id, + }; + + let blob_identifiers = request + .0 + .iter() + .flat_map(|(block_root, blob_count)| { + (0..(*blob_count as u64)).map(|index| BlobIdentifier { + block_root: *block_root, + index, + }) + }) + .collect::>(); + + // Create the blob request based on the blocks request. + self.network_send + .send(NetworkMessage::SendRequest { + peer_id, + request: RequestType::BlobsByRoot(BlobsByRootRequest { + blob_ids: RuntimeVariableList::new(blob_identifiers, usize::MAX).unwrap(), + }), + app_request_id: AppRequestId::Sync(SyncRequestId::BlobsByRoot(id)), + }) + .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; + + debug!( + method = "BlobsByRange", + peer = %peer_id, + %id, + "Sync RPC request sent" + ); + + self.blobs_by_root_requests.insert( + id, + peer_id, + // true = we know exactly how many blobs total we expect + true, + BlobsByRootRequestItems::new(request), + ); + Ok(id) + } + fn send_blobs_by_range_request( &mut self, peer_id: PeerId, @@ -1089,7 +1041,7 @@ impl SyncNetworkContext { /// attempt. pub fn continue_custody_by_root_requests( &mut self, - ) -> Vec<(CustodyRequester, CustodyRequestResult)> { + ) -> Vec<(CustodyByRootRequestId, CustodyRequestResult)> { let ids = self .custody_by_root_requests .keys() @@ -1146,56 +1098,25 @@ impl SyncNetworkContext { /// Processes a single `RpcEvent` blocks_by_root RPC request. 
/// Same logic as [`on_blocks_by_range_response`] but it converts a `Vec` into a `Block` - pub(crate) fn on_single_block_response( + pub(crate) fn on_blocks_by_root_response( &mut self, - id: SingleLookupReqId, + id: BlocksByRootRequestId, peer_id: PeerId, rpc_event: RpcEvent>>, - ) -> Option>>> { + ) -> Option>>>> { let resp = self.blocks_by_root_requests.on_response(id, rpc_event); - let resp = resp.map(|res| { - res.and_then(|(mut blocks, seen_timestamp)| { - // Enforce that exactly one chunk = one block is returned. ReqResp behavior limits the - // response count to at most 1. - match blocks.pop() { - Some(block) => Ok((block, seen_timestamp)), - // Should never happen, `blocks_by_root_requests` enforces that we receive at least - // 1 chunk. - None => Err(LookupVerifyError::NotEnoughResponsesReturned { actual: 0 }.into()), - } - }) - }); self.on_rpc_response_result(id, "BlocksByRoot", resp, peer_id, |_| 1) } /// Processes a single `RpcEvent` blobs_by_root RPC request. /// Same logic as [`on_blocks_by_range_response`] - pub(crate) fn on_single_blob_response( + pub(crate) fn on_blobs_by_root_response( &mut self, - id: SingleLookupReqId, + id: BlobsByRootRequestId, peer_id: PeerId, rpc_event: RpcEvent>>, - ) -> Option>> { + ) -> Option>>>> { let resp = self.blobs_by_root_requests.on_response(id, rpc_event); - let resp = resp.map(|res| { - res.and_then(|(blobs, seen_timestamp)| { - if let Some(max_len) = blobs - .first() - .map(|blob| self.chain.spec.max_blobs_per_block(blob.epoch()) as usize) - { - match to_fixed_blob_sidecar_list(blobs, max_len) { - Ok(blobs) => Ok((blobs, seen_timestamp)), - Err(e) => Err(e.into()), - } - } else { - Err(RpcResponseError::VerifyError( - LookupVerifyError::InternalError( - "Requested blobs for a block that has no blobs".to_string(), - ), - )) - } - }) - }); self.on_rpc_response_result(id, "BlobsByRoot", resp, peer_id, |_| 1) } @@ -1302,7 +1223,7 @@ impl SyncNetworkContext { #[allow(clippy::type_complexity)] pub fn 
on_custody_by_root_response( &mut self, - id: CustodyId, + id: CustodyByRootRequestId, req_id: DataColumnsByRootRequestId, peer_id: PeerId, resp: RpcResponseResult>, @@ -1316,7 +1237,7 @@ impl SyncNetworkContext { // Note: need to remove the request to borrow self again below. Otherwise we can't // do nested requests - let Some(mut request) = self.custody_by_root_requests.remove(&id.requester) else { + let Some(mut request) = self.custody_by_root_requests.remove(&id) else { metrics::inc_counter_vec( &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, &["custody_by_root"], @@ -1329,12 +1250,12 @@ impl SyncNetworkContext { .map_err(Into::::into) .transpose(); - self.handle_custody_by_root_result(id.requester, request, result) + self.handle_custody_by_root_result(id, request, result) } fn handle_custody_by_root_result( &mut self, - id: CustodyRequester, + id: CustodyByRootRequestId, request: ActiveCustodyByRootRequest, result: Option>, ) -> Option> { @@ -1442,18 +1363,18 @@ impl SyncNetworkContext { let result = match range_block_component { RangeBlockComponent::Block(req_id, resp, peer_id) => resp.and_then(|(blocks, _)| { request - .on_blocks_by_range_result(req_id, blocks, peer_id, self) + .on_blocks_by_root_result(req_id, blocks, peer_id, self) .map_err(Into::::into) }), RangeBlockComponent::Blob(req_id, resp, peer_id) => resp.and_then(|(blobs, _)| { request - .on_blobs_by_range_result(req_id, blobs, peer_id, self) + .on_blobs_by_root_result(req_id, blobs, peer_id, self) .map_err(Into::::into) }), RangeBlockComponent::CustodyColumns(req_id, resp) => { resp.and_then(|(custody_columns, peers, _)| { request - .on_custody_by_range_result(req_id, custody_columns, peers, self) + .on_custody_by_root_result(req_id, custody_columns, peers, self) .map_err(Into::::into) }) } @@ -1489,118 +1410,6 @@ impl SyncNetworkContext { result } - pub fn send_block_for_processing( - &self, - id: Id, - block_root: Hash256, - block: Arc>, - seen_timestamp: Duration, - ) -> Result<(), 
SendErrorProcessor> { - let span = span!( - Level::INFO, - "SyncNetworkContext", - service = "network_context" - ); - let _enter = span.enter(); - - let beacon_processor = self - .beacon_processor_if_enabled() - .ok_or(SendErrorProcessor::ProcessorNotAvailable)?; - - let block = RpcBlock::new_without_blobs(Some(block_root), block); - - debug!(block = ?block_root, id, "Sending block for processing"); - // Lookup sync event safety: If `beacon_processor.send_rpc_beacon_block` returns Ok() sync - // must receive a single `SyncMessage::BlockComponentProcessed` with this process type - beacon_processor - .send_rpc_beacon_block( - block_root, - block, - seen_timestamp, - BlockProcessType::SingleBlock { id }, - ) - .map_err(|e| { - error!( - error = ?e, - "Failed to send sync block to processor" - ); - SendErrorProcessor::SendError - }) - } - - pub fn send_blobs_for_processing( - &self, - id: Id, - block_root: Hash256, - blobs: FixedBlobSidecarList, - seen_timestamp: Duration, - ) -> Result<(), SendErrorProcessor> { - let span = span!( - Level::INFO, - "SyncNetworkContext", - service = "network_context" - ); - let _enter = span.enter(); - - let beacon_processor = self - .beacon_processor_if_enabled() - .ok_or(SendErrorProcessor::ProcessorNotAvailable)?; - - debug!(?block_root, %id, "Sending blobs for processing"); - // Lookup sync event safety: If `beacon_processor.send_rpc_blobs` returns Ok() sync - // must receive a single `SyncMessage::BlockComponentProcessed` event with this process type - beacon_processor - .send_rpc_blobs( - block_root, - blobs, - seen_timestamp, - BlockProcessType::SingleBlob { id }, - ) - .map_err(|e| { - error!( - error = ?e, - "Failed to send sync blobs to processor" - ); - SendErrorProcessor::SendError - }) - } - - pub fn send_custody_columns_for_processing( - &self, - _id: Id, - block_root: Hash256, - custody_columns: DataColumnSidecarList, - seen_timestamp: Duration, - process_type: BlockProcessType, - ) -> Result<(), SendErrorProcessor> { - 
let span = span!( - Level::INFO, - "SyncNetworkContext", - service = "network_context" - ); - let _enter = span.enter(); - - let beacon_processor = self - .beacon_processor_if_enabled() - .ok_or(SendErrorProcessor::ProcessorNotAvailable)?; - - debug!( - ?block_root, - ?process_type, - "Sending custody columns for processing" - ); - - beacon_processor - .send_rpc_custody_columns(block_root, custody_columns, seen_timestamp, process_type) - .map_err(|e| { - error!( - error = ?e, - "Failed to send sync custody columns to processor" - ); - SendErrorProcessor::SendError - }) - } - pub(crate) fn register_metrics(&self) { for (id, count) in [ ("blocks_by_root", self.blocks_by_root_requests.len()), diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index f896589f85a..135e36453f7 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -1,14 +1,18 @@ +use crate::sync::network_context::requests::BlobCountPerBlock; use crate::sync::network_context::{ - PeerGroup, RpcRequestSendError, RpcResponseError, SyncNetworkContext, + BlocksByRootSameForkRequest, PeerGroup, RpcRequestSendError, RpcResponseError, + SyncNetworkContext, }; use crate::sync::range_sync::BatchPeers; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::data_column_verification::CustodyDataColumn; use beacon_chain::{get_block_root, BeaconChainTypes}; -use lighthouse_network::rpc::methods::{BlobsByRangeRequest, BlocksByRangeRequest}; +use lighthouse_network::rpc::methods::{ + BlobsByRangeRequest, BlocksByRangeRequest, BlocksByRootRequest, +}; use lighthouse_network::service::api_types::{ - BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, - CustodyByRangeRequestId, + BlobsByRangeRequestId, BlobsByRootRequestId, BlocksByRootRequestId, 
BlocksByRootRequester, + ComponentsByRangeRequestId, CustodyByRangeRequestId, CustodyByRootRequestId, }; use lighthouse_network::PeerId; use parking_lot::RwLock; @@ -26,42 +30,34 @@ use types::{ pub struct BlockComponentsByRangeRequest { id: ComponentsByRangeRequestId, peers: Arc>>, - request: BlocksByRangeRequest, + request: BlocksByRootSameForkRequest, state: State, } +// Request blocks first, then columns. Assuming the block peer is honest we can attribute +// custody failures to the peers serving us columns. We want to get rid of the honest block +// peer assumption in the future, see https://github.com/sigp/lighthouse/issues/6258 enum State { - Base { - blocks_by_range_request: - ByRangeRequest>>>, + BlocksRequest { + blocks_request: Request>>>, }, - // Two single concurrent requests for block + blobs. As of now we request blocks and blobs to - // the same peer, so we can attribute coupling errors to the same unique peer. - DenebEnabled { - blocks_by_range_request: - ByRangeRequest>>>, - blobs_by_range_request: ByRangeRequest>>>, + DataRequest { + blocks: Vec>>, + block_peer: PeerId, + data_request: DataRequest, }, - // Request blocks first, then columns. Assuming the block peer is honest we can attribute - // custody failures to the peers serving us columns. 
We want to get rid of the honest block - // peer assumption in the future, see https://github.com/sigp/lighthouse/issues/6258 - FuluEnabled(FuluEnabledState), } -enum FuluEnabledState { - BlockRequest { - blocks_by_range_request: - ByRangeRequest>>>, +enum DataRequest { + Deneb { + blobs_request: Request>>>, }, - CustodyRequest { - blocks: Vec>>, - block_peer: PeerId, - custody_by_range_request: - ByRangeRequest, PeerGroup>, + Fulu { + custody_request: Request, PeerGroup>, }, } -enum ByRangeRequest { +enum Request { /// Active(RequestIndex) Active(I), /// Complete(DownloadedData, Peers) @@ -102,21 +98,12 @@ pub enum BlockComponentsByRangeRequestStep { impl BlockComponentsByRangeRequest { pub fn new( id: ComponentsByRangeRequestId, - request: BlocksByRangeRequest, + request: BlocksByRootSameForkRequest, peers: Arc>>, peers_to_deprioritize: &HashSet, cx: &mut SyncNetworkContext, ) -> Result { - // Induces a compile time panic if this doesn't hold true. - #[allow(clippy::assertions_on_constants)] - const _: () = assert!( - super::super::backfill_sync::BACKFILL_EPOCHS_PER_BATCH == 1 - && super::super::range_sync::EPOCHS_PER_BATCH == 1, - "To deal with alignment with deneb boundaries, batches need to be of just one epoch" - ); - // The assertion above ensures each batch is in one single epoch - let batch_epoch = Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()); - let batch_fork = cx.spec().fork_name_at_epoch(batch_epoch); + let batch_fork = request.fork; // TODO(das): a change of behaviour here is that if the SyncingChain has a single peer we // will request all blocks for the first 5 epochs to that same single peer. 
Before we would @@ -141,33 +128,14 @@ impl BlockComponentsByRangeRequest { return Err(RpcRequestSendError::NoPeers); }; - let blocks_req_id = cx.send_blocks_by_range_request(block_peer, request.clone(), id)?; + let blocks_req_id = cx.send_blocks_by_root_request( + block_peer, + BlocksByRootRequest::new(request.block_roots.clone(), cx.spec(), request.fork), + BlocksByRootRequester::RangeSync(id), + )?; - let state = if batch_fork.fulu_enabled() { - State::FuluEnabled(FuluEnabledState::BlockRequest { - blocks_by_range_request: ByRangeRequest::Active(blocks_req_id), - }) - } else if batch_fork.deneb_enabled() { - // TODO(deneb): is it okay to send blobs_by_range requests outside the DA window? I - // would like the beacon processor / da_checker to be the one that decides if an - // RpcBlock is valid or not with respect to containing blobs. Having sync not even - // attempt a requests seems like an added limitation. - let blobs_req_id = cx.send_blobs_by_range_request( - block_peer, - BlobsByRangeRequest { - start_slot: *request.start_slot(), - count: *request.count(), - }, - id, - )?; - State::DenebEnabled { - blocks_by_range_request: ByRangeRequest::Active(blocks_req_id), - blobs_by_range_request: ByRangeRequest::Active(blobs_req_id), - } - } else { - State::Base { - blocks_by_range_request: ByRangeRequest::Active(blocks_req_id), - } + let state = State::BlocksRequest { + blocks_request: Request::Active(blocks_req_id), }; Ok(Self { @@ -183,109 +151,118 @@ impl BlockComponentsByRangeRequest { cx: &mut SyncNetworkContext, ) -> BlockComponentsByRangeRequestResult { match &mut self.state { - State::Base { - blocks_by_range_request, + State::BlocksRequest { + blocks_request: blocks_by_range_request, } => { if let Some((blocks, block_peer)) = blocks_by_range_request.to_finished() { - let peer_group = BatchPeers::new_from_block_peer(*block_peer); - let rpc_blocks = couple_blocks_base(blocks.to_vec()); - Ok(Some((rpc_blocks, peer_group))) + let fork = self.request.fork; + let 
blocks_have_data = blocks.iter().any(|block| block.has_data()); + + if blocks_have_data && fork.fulu_enabled() { + let mut column_indices = cx + .network_globals() + .sampling_columns() + .iter() + .copied() + .collect::>(); + column_indices.sort_unstable(); + + let block_roots_with_data = blocks + .iter() + .filter(|block| block.has_data()) + // TODO(tree-sync): cache block root + .map(|block| get_block_root(block)) + .collect::>(); + + let request = BlocksByRootRequest::new( + block_roots_with_data, + cx.spec(), + self.request.fork, + ); + + let req_id = cx + .send_custody_by_root_request(self.id, request, self.peers.clone()) + .map_err(|e| match e { + RpcRequestSendError::InternalError(e) => Error::InternalError(e), + RpcRequestSendError::NoPeers => Error::InternalError( + "send_custody_by_range_request does not error with NoPeers" + .to_owned(), + ), + })?; + + self.state = State::DataRequest { + blocks: blocks.to_vec(), + block_peer: *block_peer, + data_request: DataRequest::Fulu { + custody_request: Request::Active(req_id), + }, + }; + Ok(None) + } else if blocks_have_data && fork.deneb_enabled() { + let blob_count_per_block = blocks + .iter() + .filter(|block| block.has_data()) + // TODO(tree-sync): cache block root + .map(|block| (get_block_root(block), block.num_expected_blobs())) + .collect::>(); + + // TODO(deneb): is it okay to send blobs_by_range requests outside the DA window? I + // would like the beacon processor / da_checker to be the one that decides if an + // RpcBlock is valid or not with respect to containing blobs. Having sync not even + // attempt a requests seems like an added limitation. 
+ let req_id = cx + .send_blobs_by_root_request( + *block_peer, + BlobCountPerBlock(blob_count_per_block), + self.id, + ) + .map_err(|e| match e { + RpcRequestSendError::InternalError(e) => Error::InternalError(e), + RpcRequestSendError::NoPeers => Error::InternalError( + "send_custody_by_range_request does not error with NoPeers" + .to_owned(), + ), + })?; + + self.state = State::DataRequest { + blocks: blocks.to_vec(), + block_peer: *block_peer, + data_request: DataRequest::Deneb { + blobs_request: Request::Active(req_id), + }, + }; + Ok(None) + } else { + let peer_group = BatchPeers::new_from_block_peer(*block_peer); + let rpc_blocks = couple_blocks_base(blocks.to_vec()); + Ok(Some((rpc_blocks, peer_group))) + } } else { // Wait for blocks_by_range requests to complete Ok(None) } } - State::DenebEnabled { - blocks_by_range_request, - blobs_by_range_request, - } => { - if let (Some((blocks, block_peer)), Some((blobs, _))) = ( - blocks_by_range_request.to_finished(), - blobs_by_range_request.to_finished(), - ) { - // We use the same block_peer for the blobs request - let peer_group = BatchPeers::new_from_block_peer(*block_peer); - let rpc_blocks = - couple_blocks_deneb(blocks.to_vec(), blobs.to_vec(), cx.spec())?; - Ok(Some((rpc_blocks, peer_group))) - } else { - // Wait for blocks_by_range and blobs_by_range requests to complete - Ok(None) - } - } - State::FuluEnabled(state) => match state { - FuluEnabledState::BlockRequest { - blocks_by_range_request, + State::DataRequest { + blocks, + block_peer, + data_request, + } => match data_request { + DataRequest::Deneb { + blobs_request: blobs_by_range_request, } => { - if let Some((blocks, block_peer)) = blocks_by_range_request.to_finished() { - let blocks_with_data = blocks - .iter() - .filter(|block| block.has_data()) - .map(|block| block.signed_block_header()) - .collect::>(); - - if blocks_with_data.is_empty() { - let custody_column_indices = cx - .network_globals() - .sampling_columns() - .iter() - .copied() - 
.collect(); - - // Done, we got blocks and no columns needed - let peer_group = BatchPeers::new_from_block_peer(*block_peer); - let rpc_blocks = couple_blocks_fulu( - blocks.to_vec(), - vec![], - custody_column_indices, - cx.spec(), - )?; - Ok(Some((rpc_blocks, peer_group))) - } else { - let mut column_indices = cx - .network_globals() - .sampling_columns() - .iter() - .copied() - .collect::>(); - column_indices.sort_unstable(); - - let req_id = cx - .send_custody_by_range_request( - self.id, - blocks_with_data, - self.request.clone(), - column_indices, - self.peers.clone(), - ) - .map_err(|e| match e { - RpcRequestSendError::InternalError(e) => { - Error::InternalError(e) - } - RpcRequestSendError::NoPeers => Error::InternalError( - "send_custody_by_range_request does not error with NoPeers" - .to_owned(), - ), - })?; - - *state = FuluEnabledState::CustodyRequest { - blocks: blocks.to_vec(), - block_peer: *block_peer, - custody_by_range_request: ByRangeRequest::Active(req_id), - }; - - // Wait for the new custody_by_range request to complete - Ok(None) - } + if let Some((blobs, _)) = blobs_by_range_request.to_finished() { + // We use the same block_peer for the blobs request + let peer_group = BatchPeers::new_from_block_peer(*block_peer); + let rpc_blocks = + couple_blocks_deneb(blocks.to_vec(), blobs.to_vec(), cx.spec())?; + Ok(Some((rpc_blocks, peer_group))) } else { - // Wait for the block request to complete + // Wait for blocks_by_range and blobs_by_range requests to complete Ok(None) } } - FuluEnabledState::CustodyRequest { - blocks, - block_peer, - custody_by_range_request, + DataRequest::Fulu { + custody_request: custody_by_range_request, } => { if let Some((columns, column_peers)) = custody_by_range_request.to_finished() { let custody_column_indices = cx @@ -312,29 +289,20 @@ impl BlockComponentsByRangeRequest { } } - pub fn on_blocks_by_range_result( + pub fn on_blocks_by_root_result( &mut self, - id: BlocksByRangeRequestId, + id: BlocksByRootRequestId, 
data: Vec>>, peer_id: PeerId, cx: &mut SyncNetworkContext, ) -> BlockComponentsByRangeRequestResult { match &mut self.state { - State::Base { - blocks_by_range_request, - } - | State::DenebEnabled { - blocks_by_range_request, - .. - } - | State::FuluEnabled(FuluEnabledState::BlockRequest { - blocks_by_range_request, - }) => { - blocks_by_range_request.finish(id, data, peer_id)?; + State::BlocksRequest { blocks_request } => { + blocks_request.finish(id, data, peer_id)?; } - State::FuluEnabled(FuluEnabledState::CustodyRequest { .. }) => { + _ => { return Err(Error::InternalError( - "Received blocks_by_range response expecting custody_by_range".to_string(), + "Received unexpected blocks_by_range response".to_string(), )) } } @@ -342,28 +310,23 @@ impl BlockComponentsByRangeRequest { self.continue_requests(cx) } - pub fn on_blobs_by_range_result( + pub fn on_blobs_by_root_result( &mut self, - id: BlobsByRangeRequestId, + id: BlobsByRootRequestId, data: Vec>>, peer_id: PeerId, cx: &mut SyncNetworkContext, ) -> BlockComponentsByRangeRequestResult { match &mut self.state { - State::Base { .. } => { - return Err(Error::InternalError( - "Received blobs_by_range response before Deneb".to_string(), - )) - } - State::DenebEnabled { - blobs_by_range_request, + State::DataRequest { + data_request: DataRequest::Deneb { blobs_request }, .. 
} => { - blobs_by_range_request.finish(id, data, peer_id)?; + blobs_request.finish(id, data, peer_id)?; } - State::FuluEnabled(_) => { + _ => { return Err(Error::InternalError( - "Received blobs_by_range response after PeerDAS".to_string(), + "Received unexpected blobs_by_range response".to_string(), )) } } @@ -371,32 +334,25 @@ impl BlockComponentsByRangeRequest { self.continue_requests(cx) } - pub fn on_custody_by_range_result( + pub fn on_custody_by_root_result( &mut self, - id: CustodyByRangeRequestId, + id: CustodyByRootRequestId, data: DataColumnSidecarList, peers: PeerGroup, cx: &mut SyncNetworkContext, ) -> BlockComponentsByRangeRequestResult { match &mut self.state { - State::Base { .. } | State::DenebEnabled { .. } => { + State::DataRequest { + data_request: DataRequest::Fulu { custody_request }, + .. + } => { + custody_request.finish(id, data, peers)?; + } + _ => { return Err(Error::InternalError( - "Received custody_by_range response before PeerDAS".to_string(), + "Received unexpected custody_by_range response".to_string(), )) } - State::FuluEnabled(state) => match state { - FuluEnabledState::BlockRequest { .. } => { - return Err(Error::InternalError( - "Received custody_by_range expecting blocks_by_range".to_string(), - )); - } - FuluEnabledState::CustodyRequest { - custody_by_range_request, - .. - } => { - custody_by_range_request.finish(id, data, peers)?; - } - }, } self.continue_requests(cx) @@ -405,16 +361,8 @@ impl BlockComponentsByRangeRequest { #[cfg(test)] pub fn state_step(&self) -> BlockComponentsByRangeRequestStep { match &self.state { - State::Base { .. } => BlockComponentsByRangeRequestStep::BlocksRequest, - State::DenebEnabled { .. } => BlockComponentsByRangeRequestStep::BlocksRequest, - State::FuluEnabled(state) => match state { - FuluEnabledState::BlockRequest { .. } => { - BlockComponentsByRangeRequestStep::BlocksRequest - } - FuluEnabledState::CustodyRequest { .. 
} => { - BlockComponentsByRangeRequestStep::CustodyRequest - } - }, + State::BlocksRequest { .. } => BlockComponentsByRangeRequestStep::BlocksRequest, + State::DataRequest { .. } => BlockComponentsByRangeRequestStep::CustodyRequest, } } } @@ -507,7 +455,7 @@ fn couple_blocks_fulu( .collect::, _>>() } -impl ByRangeRequest { +impl Request { fn finish(&mut self, id: I, data: T, peer_id: P) -> Result<(), Error> { match self { Self::Active(expected_id) => { diff --git a/beacon_node/network/src/sync/network_context/custody_by_root.rs b/beacon_node/network/src/sync/network_context/custody_by_root.rs index 1ca2a55a13a..d39fef93f3b 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_root.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_root.rs @@ -1,11 +1,11 @@ use crate::sync::network_context::{ - DataColumnsByRootRequestId, DataColumnsByRootSingleBlockRequest, RpcRequestSendError, - RpcResponseError, + DataColumnsByRootRequestId, RpcRequestSendError, RpcResponseError, }; use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::BeaconChainTypes; use fnv::FnvHashMap; -use lighthouse_network::service::api_types::{CustodyId, DataColumnsByRootRequester}; +use lighthouse_network::rpc::methods::DataColumnsByRootRequest; +use lighthouse_network::service::api_types::{CustodyByRootRequestId, DataColumnsByRootRequester}; use lighthouse_network::PeerId; use lru_cache::LRUTimeCache; use parking_lot::RwLock; @@ -15,7 +15,10 @@ use std::time::{Duration, Instant}; use std::{collections::HashMap, marker::PhantomData, sync::Arc}; use strum::IntoStaticStr; use tracing::{debug, warn}; -use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Hash256}; +use types::{ + data_column_sidecar::ColumnIndex, DataColumnSidecar, DataColumnSidecarList, + DataColumnsByRootIdentifier, Hash256, RuntimeVariableList, +}; use super::{LookupRequestResult, PeerGroup, RpcResponseResult, SyncNetworkContext}; @@ -27,8 +30,8 @@ const 
MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; pub struct ActiveCustodyByRootRequest { start_time: Instant, - block_root: Hash256, - custody_id: CustodyId, + block_roots: Vec, + custody_id: CustodyByRootRequestId, /// List of column indices this request needs to download to complete successfully #[allow(clippy::type_complexity)] column_requests: FnvHashMap< @@ -89,14 +92,14 @@ pub type CustodyByRootRequestResult = impl ActiveCustodyByRootRequest { pub(crate) fn new( - block_root: Hash256, - custody_id: CustodyId, + block_roots: Vec, + custody_id: CustodyByRootRequestId, column_indices: &[ColumnIndex], lookup_peers: Arc>>, ) -> Self { Self { start_time: Instant::now(), - block_root, + block_roots, custody_id, column_requests: HashMap::from_iter( column_indices @@ -127,7 +130,6 @@ impl ActiveCustodyByRootRequest { ) -> CustodyByRootRequestResult { let Some(batch_request) = self.active_batch_columns_requests.get_mut(&req_id) else { warn!( - block_root = ?self.block_root, %req_id, "Received custody column response for unrequested index" ); @@ -137,7 +139,6 @@ impl ActiveCustodyByRootRequest { match resp { Ok((data_columns, seen_timestamp)) => { debug!( - block_root = ?self.block_root, %req_id, %peer_id, count = data_columns.len(), @@ -184,7 +185,6 @@ impl ActiveCustodyByRootRequest { if !missing_column_indexes.is_empty() { // Note: Batch logging that columns are missing to not spam logger debug!( - block_root = ?self.block_root, %req_id, %peer_id, // TODO(das): this property can become very noisy, being the full range 0..128 @@ -197,10 +197,9 @@ impl ActiveCustodyByRootRequest { } Err(err) => { debug!( - block_root = ?self.block_root, %req_id, - %peer_id, - error = ?err, + %peer_id, + error = ?err, "Custody column download error" ); @@ -312,10 +311,8 @@ impl ActiveCustodyByRootRequest { .data_columns_by_root_request( DataColumnsByRootRequester::Custody(self.custody_id), peer_id, - DataColumnsByRootSingleBlockRequest { - block_root: self.block_root, - indices: 
indices.clone(), - }, + self.block_roots.clone(), + indices.clone(), // If peer is in the lookup peer set, it claims to have imported the block and // must have its columns in custody. In that case, set `true = enforce max_requests` // and downscore if data_columns_by_root does not returned the expected custody @@ -481,6 +478,13 @@ impl ColumnRequest { } } + pub fn peek_downloaded_data(&self) -> Option<&T> { + match &self.status { + Status::Downloaded(_, data, _) => Some(data), + _ => None, + } + } + pub fn complete(self) -> Result<(PeerId, T, Duration), Error> { match self.status { Status::Downloaded(peer_id, data_column, seen_timestamp) => { diff --git a/beacon_node/network/src/sync/network_context/requests.rs b/beacon_node/network/src/sync/network_context/requests.rs index 8228ea5d9d5..8b32c4f61b8 100644 --- a/beacon_node/network/src/sync/network_context/requests.rs +++ b/beacon_node/network/src/sync/network_context/requests.rs @@ -7,13 +7,11 @@ use strum::IntoStaticStr; use types::{Hash256, Slot}; pub use blobs_by_range::BlobsByRangeRequestItems; -pub use blobs_by_root::{BlobsByRootRequestItems, BlobsByRootSingleBlockRequest}; +pub use blobs_by_root::{BlobCountPerBlock, BlobsByRootRequestItems}; pub use blocks_by_range::BlocksByRangeRequestItems; -pub use blocks_by_root::{BlocksByRootRequestItems, BlocksByRootSingleRequest}; +pub use blocks_by_root::BlocksByRootRequestItems; pub use data_columns_by_range::DataColumnsByRangeRequestItems; -pub use data_columns_by_root::{ - DataColumnsByRootRequestItems, DataColumnsByRootSingleBlockRequest, -}; +pub use data_columns_by_root::DataColumnsByRootRequestItems; use crate::metrics; diff --git a/beacon_node/network/src/sync/network_context/requests/blobs_by_root.rs b/beacon_node/network/src/sync/network_context/requests/blobs_by_root.rs index 547c51198e4..8f420936bcc 100644 --- a/beacon_node/network/src/sync/network_context/requests/blobs_by_root.rs +++ 
b/beacon_node/network/src/sync/network_context/requests/blobs_by_root.rs @@ -1,39 +1,24 @@ use lighthouse_network::rpc::methods::BlobsByRootRequest; +use std::collections::HashMap; use std::sync::Arc; use types::{blob_sidecar::BlobIdentifier, BlobSidecar, EthSpec, ForkContext, Hash256}; use super::{ActiveRequestItems, LookupVerifyError}; -#[derive(Debug, Clone)] -pub struct BlobsByRootSingleBlockRequest { - pub block_root: Hash256, - pub indices: Vec, -} - -impl BlobsByRootSingleBlockRequest { - pub fn into_request(self, spec: &ForkContext) -> BlobsByRootRequest { - BlobsByRootRequest::new( - self.indices - .into_iter() - .map(|index| BlobIdentifier { - block_root: self.block_root, - index, - }) - .collect(), - spec, - ) - } -} +pub struct BlobCountPerBlock(pub HashMap); pub struct BlobsByRootRequestItems { - request: BlobsByRootSingleBlockRequest, + // TODO(tree-sync): we know ahead of time how many blobs each block has, track it + block_roots: Vec, + indices: Vec, items: Vec>>, } impl BlobsByRootRequestItems { - pub fn new(request: BlobsByRootSingleBlockRequest) -> Self { + pub fn new(request: BlobCountPerBlock) -> Self { Self { - request, + block_roots: todo!(), + indices: todo!(), items: vec![], } } @@ -47,13 +32,13 @@ impl ActiveRequestItems for BlobsByRootRequestItems { /// The active request SHOULD be dropped after `add_response` returns an error fn add(&mut self, blob: Self::Item) -> Result { let block_root = blob.block_root(); - if self.request.block_root != block_root { + if !self.block_roots.contains(&block_root) { return Err(LookupVerifyError::UnrequestedBlockRoot(block_root)); } if !blob.verify_blob_sidecar_inclusion_proof() { return Err(LookupVerifyError::InvalidInclusionProof); } - if !self.request.indices.contains(&blob.index) { + if !self.indices.contains(&blob.index) { return Err(LookupVerifyError::UnrequestedIndex(blob.index)); } if self.items.iter().any(|b| b.index == blob.index) { @@ -62,7 +47,7 @@ impl ActiveRequestItems for 
BlobsByRootRequestItems { self.items.push(blob); - Ok(self.items.len() >= self.request.indices.len()) + Ok(self.items.len() >= self.block_roots.len() * self.indices.len()) } fn consume(&mut self) -> Vec { diff --git a/beacon_node/network/src/sync/network_context/requests/blocks_by_root.rs b/beacon_node/network/src/sync/network_context/requests/blocks_by_root.rs index 6d7eabf909f..2d98310aaaf 100644 --- a/beacon_node/network/src/sync/network_context/requests/blocks_by_root.rs +++ b/beacon_node/network/src/sync/network_context/requests/blocks_by_root.rs @@ -5,22 +5,13 @@ use types::{EthSpec, ForkContext, Hash256, SignedBeaconBlock}; use super::{ActiveRequestItems, LookupVerifyError}; -#[derive(Debug, Copy, Clone)] -pub struct BlocksByRootSingleRequest(pub Hash256); - -impl BlocksByRootSingleRequest { - pub fn into_request(self, fork_context: &ForkContext) -> BlocksByRootRequest { - BlocksByRootRequest::new(vec![self.0], fork_context) - } -} - pub struct BlocksByRootRequestItems { - request: BlocksByRootSingleRequest, + request: BlocksByRootRequest, items: Vec>>, } impl BlocksByRootRequestItems { - pub fn new(request: BlocksByRootSingleRequest) -> Self { + pub fn new(request: BlocksByRootRequest) -> Self { Self { request, items: vec![], @@ -35,8 +26,14 @@ impl ActiveRequestItems for BlocksByRootRequestItems { /// resolved immediately. 
/// The active request SHOULD be dropped after `add_response` returns an error fn add(&mut self, block: Self::Item) -> Result { + // TODO(tree-sync): Cache this block root calculation let block_root = get_block_root(&block); - if self.request.0 != block_root { + if !self + .request + .block_roots() + .iter() + .any(|root| root == &block_root) + { return Err(LookupVerifyError::UnrequestedBlockRoot(block_root)); } diff --git a/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs b/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs index 09d7f4b3b77..3517207a72c 100644 --- a/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs +++ b/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs @@ -1,46 +1,29 @@ use lighthouse_network::rpc::methods::DataColumnsByRootRequest; use std::sync::Arc; use types::{ - ChainSpec, DataColumnSidecar, DataColumnsByRootIdentifier, EthSpec, ForkName, Hash256, - RuntimeVariableList, + ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnsByRootIdentifier, EthSpec, ForkName, + Hash256, RuntimeVariableList, }; use super::{ActiveRequestItems, LookupVerifyError}; -#[derive(Debug, Clone)] -pub struct DataColumnsByRootSingleBlockRequest { - pub block_root: Hash256, - pub indices: Vec, -} - -impl DataColumnsByRootSingleBlockRequest { - pub fn try_into_request( - self, - fork_name: ForkName, - spec: &ChainSpec, - ) -> Result { - let number_of_columns = spec.number_of_columns as usize; - let columns = RuntimeVariableList::new(self.indices, number_of_columns) - .map_err(|_| "Number of indices exceeds total number of columns")?; - Ok(DataColumnsByRootRequest::new( - vec![DataColumnsByRootIdentifier { - block_root: self.block_root, - columns, - }], - spec.max_request_blocks(fork_name), - )) - } +pub struct DataColumnsByRootRequestSameIndices { + block_roots: Vec, + indices: Vec, } pub struct DataColumnsByRootRequestItems { - request: 
DataColumnsByRootSingleBlockRequest, + // Assumes each block root has the same indices + block_roots: Vec, + indices: Vec, items: Vec>>, } impl DataColumnsByRootRequestItems { - pub fn new(request: DataColumnsByRootSingleBlockRequest) -> Self { + pub fn new(block_roots: Vec, indices: Vec) -> Self { Self { - request, + block_roots, + indices, items: vec![], } } @@ -54,13 +37,13 @@ impl ActiveRequestItems for DataColumnsByRootRequestItems { /// The active request SHOULD be dropped after `add_response` returns an error fn add(&mut self, data_column: Self::Item) -> Result { let block_root = data_column.block_root(); - if self.request.block_root != block_root { + if !self.block_roots.contains(&block_root) { return Err(LookupVerifyError::UnrequestedBlockRoot(block_root)); } if !data_column.verify_inclusion_proof() { return Err(LookupVerifyError::InvalidInclusionProof); } - if !self.request.indices.contains(&data_column.index) { + if !self.indices.contains(&data_column.index) { return Err(LookupVerifyError::UnrequestedIndex(data_column.index)); } if self.items.iter().any(|d| d.index == data_column.index) { @@ -72,7 +55,7 @@ impl ActiveRequestItems for DataColumnsByRootRequestItems { self.items.push(data_column); - Ok(self.items.len() >= self.request.indices.len()) + Ok(self.items.len() >= self.block_roots.len() * self.indices.len()) } fn consume(&mut self) -> Vec { diff --git a/beacon_node/network/src/sync/peer_sampling.rs b/beacon_node/network/src/sync/peer_sampling.rs index d76c7d2bbc2..e92af4b3559 100644 --- a/beacon_node/network/src/sync/peer_sampling.rs +++ b/beacon_node/network/src/sync/peer_sampling.rs @@ -1,12 +1,11 @@ use self::request::ActiveColumnSampleRequest; #[cfg(test)] pub(crate) use self::request::Status; -use super::network_context::{ - DataColumnsByRootSingleBlockRequest, RpcResponseError, SyncNetworkContext, -}; +use super::network_context::{RpcResponseError, SyncNetworkContext}; use crate::metrics; use beacon_chain::BeaconChainTypes; use 
fnv::FnvHashMap; +use lighthouse_network::rpc::methods::DataColumnsByRootRequest; use lighthouse_network::service::api_types::{ DataColumnsByRootRequester, SamplingId, SamplingRequestId, SamplingRequester, }; @@ -17,7 +16,10 @@ use std::{ time::Duration, }; use tracing::{debug, error, instrument, warn}; -use types::{data_column_sidecar::ColumnIndex, ChainSpec, DataColumnSidecar, Hash256}; +use types::{ + data_column_sidecar::ColumnIndex, ChainSpec, DataColumnSidecar, DataColumnsByRootIdentifier, + Hash256, RuntimeVariableList, +}; pub type SamplingResult = Result<(), SamplingError>; @@ -576,10 +578,8 @@ impl ActiveSamplingRequest { sampling_request_id: self.current_sampling_request_id, }), peer_id, - DataColumnsByRootSingleBlockRequest { - block_root: self.block_root, - indices: column_indexes.clone(), - }, + vec![self.block_root], + column_indexes.clone(), // false = We issue request to custodians who may or may not have received the // samples yet. We don't any signal (like an attestation or status messages that the // custodian has received data). 
diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 8834c74c08b..280157957c8 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -1,6 +1,6 @@ use crate::sync::network_context::PeerGroup; use beacon_chain::block_verification_types::RpcBlock; -use lighthouse_network::rpc::methods::BlocksByRangeRequest; +use lighthouse_network::rpc::methods::{BlocksByRangeRequest, BlocksByRootRequest}; use lighthouse_network::service::api_types::Id; use lighthouse_network::PeerId; use std::collections::HashSet; @@ -9,7 +9,7 @@ use std::hash::{Hash, Hasher}; use std::ops::Sub; use std::time::{Duration, Instant}; use strum::Display; -use types::{ColumnIndex, Epoch, EthSpec, Slot}; +use types::{ChainSpec, ColumnIndex, Epoch, EthSpec, ForkName, Hash256, Slot}; /// The number of times to retry a batch before it is considered failed. const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5; @@ -120,9 +120,7 @@ pub enum BatchProcessingResult { /// A segment of a chain. pub struct BatchInfo { /// Start slot of the batch. - start_slot: Slot, - /// End slot of the batch. - end_slot: Slot, + block_roots: Vec, /// The `Attempts` that have been made and failed to send us this batch. failed_processing_attempts: Vec, /// Number of processing attempts that have failed but we do not count. @@ -137,16 +135,6 @@ pub struct BatchInfo { marker: std::marker::PhantomData, } -impl fmt::Display for BatchInfo { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "Start Slot: {}, End Slot: {}, State: {}", - self.start_slot, self.end_slot, self.state - ) - } -} - #[derive(Display)] /// Current state of a batch pub enum BatchState { @@ -192,12 +180,9 @@ impl BatchInfo { /// fork boundary will be of mixed type (all blocks and one last blockblob), and I don't want to /// deal with this for now. 
/// This means finalization might be slower in deneb - pub fn new(start_epoch: &Epoch, num_of_epochs: u64) -> Self { - let start_slot = start_epoch.start_slot(E::slots_per_epoch()); - let end_slot = start_slot + num_of_epochs * E::slots_per_epoch(); + pub fn new(block_roots: Vec) -> Self { BatchInfo { - start_slot, - end_slot, + block_roots, failed_processing_attempts: Vec::new(), failed_download_attempts: 0, failed_peers: <_>::default(), @@ -247,10 +232,12 @@ impl BatchInfo { /// Returns a BlocksByRange request associated with the batch. pub fn to_blocks_by_range_request(&self) -> BlocksByRangeRequest { - BlocksByRangeRequest::new( - self.start_slot.into(), - self.end_slot.sub(self.start_slot).into(), - ) + todo!(); + } + + pub fn to_blocks_by_root_request(&self, spec: &ChainSpec) -> BlocksByRootRequest { + // TODO: Is it necessary to pass ForkName to BlocksByRootRequest + BlocksByRootRequest::new(self.block_roots.clone(), spec, ForkName::Fulu) } /// After different operations over a batch, this could be in a state that allows it to diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 83a9dc07b71..b484b7e4ac6 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -721,7 +721,7 @@ impl SyncingChain { } BatchState::AwaitingProcessing(..) 
=> {} BatchState::Processing(_) => { - debug!(batch = %id, %batch, "Advancing chain while processing a batch"); + debug!(batch = %id, "Advancing chain while processing a batch"); if let Some(processing_id) = self.current_processing_batch { if id <= processing_id { self.current_processing_batch = None; @@ -934,7 +934,7 @@ impl SyncingChain { let failed_peers = batch.failed_peers(); match network.block_components_by_range_request( - request, + todo!(), RangeRequestId::RangeSync { chain_id: self.id, batch_id, @@ -950,16 +950,16 @@ impl SyncingChain { .map(|epoch| epoch == batch_id) .unwrap_or(false) { - debug!(epoch = %batch_id, %batch, %batch_state, "Requesting optimistic batch"); + debug!(%batch_id, %batch_state, "Requesting optimistic batch"); } else { - debug!(epoch = %batch_id, %batch, %batch_state, "Requesting batch"); + debug!(%batch_id, %batch_state, "Requesting batch"); } return Ok(KeepChain); } Err(e) => match e { e @ (RpcRequestSendError::NoPeers | RpcRequestSendError::InternalError(_)) => { // NOTE: under normal conditions this shouldn't happen but we handle it anyway - warn!(%batch_id, error = ?e, "batch_id" = %batch_id, %batch, "Could not send batch request"); + warn!(%batch_id, error = ?e, "Could not send batch request"); // register the failed download and check if the batch can be retried batch.start_downloading(1)?; // fake request_id = 1 is not relevant match batch.download_failed()? { @@ -1016,10 +1016,8 @@ impl SyncingChain { // check if we have the batch for our optimistic start. If not, request it first. // We wait for this batch before requesting any other batches. 
if let Some(epoch) = self.optimistic_start { - if let Entry::Vacant(entry) = self.batches.entry(epoch) { - let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH); - entry.insert(optimistic_batch); - self.send_batch(network, epoch)?; + if let Entry::Vacant(_entry) = self.batches.entry(epoch) { + todo!(); } return Ok(KeepChain); } @@ -1079,8 +1077,8 @@ impl SyncingChain { self.to_be_downloaded += EPOCHS_PER_BATCH; self.include_next_batch(network) } - Entry::Vacant(entry) => { - entry.insert(BatchInfo::new(&next_batch_id, EPOCHS_PER_BATCH)); + Entry::Vacant(_entry) => { + todo!(); self.to_be_downloaded += EPOCHS_PER_BATCH; Some(next_batch_id) } diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index 8132269f717..bfc5ad34eeb 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -4,7 +4,7 @@ use crate::sync::block_lookups::{ }; use crate::sync::range_sync::BATCH_BUFFER_SIZE; use crate::sync::{ - manager::{BlockProcessType, BlockProcessingResult, SyncManager}, + manager::{BlockProcessingResult, SyncManager}, peer_sampling::SamplingConfig, SamplingId, SyncMessage, }; @@ -145,6 +145,7 @@ impl TestRig { network_rx_queue: vec![], sync_rx, sent_blocks_by_range: <_>::default(), + blocks_by_root: <_>::default(), rng, network_globals: beacon_processor.network_globals.clone(), sync_manager: SyncManager::new( @@ -502,11 +503,8 @@ impl TestRig { ); } - fn single_block_component_processed(&mut self, id: Id, result: BlockProcessingResult) { - self.send_sync_message(SyncMessage::BlockComponentProcessed { - process_type: BlockProcessType::SingleBlock { id }, - result, - }) + fn single_block_component_processed(&mut self, _id: Id, _result: BlockProcessingResult) { + todo!(); } fn single_block_component_processed_imported(&mut self, block_root: Hash256) { @@ -517,41 +515,26 @@ impl TestRig { ) } - fn single_blob_component_processed(&mut self, id: Id, result: 
BlockProcessingResult) { - self.send_sync_message(SyncMessage::BlockComponentProcessed { - process_type: BlockProcessType::SingleBlob { id }, - result, - }) + fn single_blob_component_processed(&mut self, _id: Id, _result: BlockProcessingResult) { + todo!(); } fn parent_lookup_block_response( &mut self, - id: SingleLookupReqId, - peer_id: PeerId, - beacon_block: Option>>, + _id: SingleLookupReqId, + _peer_id: PeerId, + _beacon_block: Option>>, ) { - self.log("parent_lookup_block_response"); - self.send_sync_message(SyncMessage::RpcBlock { - sync_request_id: SyncRequestId::SingleBlock { id }, - peer_id, - beacon_block, - seen_timestamp: D, - }); + todo!(); } fn single_lookup_block_response( &mut self, - id: SingleLookupReqId, - peer_id: PeerId, - beacon_block: Option>>, + _id: SingleLookupReqId, + _peer_id: PeerId, + _beacon_block: Option>>, ) { - self.log("single_lookup_block_response"); - self.send_sync_message(SyncMessage::RpcBlock { - sync_request_id: SyncRequestId::SingleBlock { id }, - peer_id, - beacon_block, - seen_timestamp: D, - }); + todo!(); } fn parent_lookup_blob_response( @@ -565,7 +548,7 @@ impl TestRig { blob_sidecar.as_ref().map(|b| b.index) )); self.send_sync_message(SyncMessage::RpcBlob { - sync_request_id: SyncRequestId::SingleBlob { id }, + sync_request_id: todo!(), peer_id, blob_sidecar, seen_timestamp: D, @@ -579,7 +562,7 @@ impl TestRig { blob_sidecar: Option>>, ) { self.send_sync_message(SyncMessage::RpcBlob { - sync_request_id: SyncRequestId::SingleBlob { id }, + sync_request_id: todo!(), peer_id, blob_sidecar, seen_timestamp: D, @@ -652,12 +635,8 @@ impl TestRig { self.complete_lookup_block_import_valid(block_root, import) } - fn parent_lookup_failed(&mut self, id: SingleLookupReqId, peer_id: PeerId, error: RPCError) { - self.send_sync_message(SyncMessage::RpcError { - peer_id, - sync_request_id: SyncRequestId::SingleBlock { id }, - error, - }) + fn parent_lookup_failed(&mut self, _id: SingleLookupReqId, _peer_id: PeerId, _error: 
RPCError) { + todo!() } fn parent_lookup_failed_unavailable(&mut self, id: SingleLookupReqId, peer_id: PeerId) { @@ -671,12 +650,8 @@ impl TestRig { ); } - fn single_lookup_failed(&mut self, id: SingleLookupReqId, peer_id: PeerId, error: RPCError) { - self.send_sync_message(SyncMessage::RpcError { - peer_id, - sync_request_id: SyncRequestId::SingleBlock { id }, - error, - }) + fn single_lookup_failed(&mut self, _id: SingleLookupReqId, _peer_id: PeerId, _error: RPCError) { + todo!(); } fn return_empty_sampling_requests(&mut self, ids: DCByRootIds) { @@ -787,19 +762,20 @@ impl TestRig { &mut self, ids: DCByRootIds, data_columns: DataColumnSidecarList, - missing_components: bool, + _missing_components: bool, ) { - let lookup_id = if let SyncRequestId::DataColumnsByRoot(DataColumnsByRootRequestId { + let _lookup_id = if let SyncRequestId::DataColumnsByRoot(DataColumnsByRootRequestId { requester: DataColumnsByRootRequester::Custody(id), .. }) = ids.first().unwrap().0 { - id.requester.0.lookup_id + todo!(); + // id.parent_request_id.0.lookup_id } else { panic!("not a custody requester") }; - let first_column = data_columns.first().cloned().unwrap(); + let _first_column = data_columns.first().cloned().unwrap(); for id in ids { self.log(&format!("return valid data column for {id:?}")); @@ -815,19 +791,7 @@ impl TestRig { self.expect_rpc_custody_column_work_event(); // Respond with valid result - self.send_sync_message(SyncMessage::BlockComponentProcessed { - process_type: BlockProcessType::SingleCustodyColumn(lookup_id), - result: if missing_components { - BlockProcessingResult::Ok(AvailabilityProcessingStatus::MissingComponents( - first_column.slot(), - first_column.block_root(), - )) - } else { - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported( - first_column.block_root(), - )) - }, - }); + todo!(); } fn complete_data_columns_by_root_request( @@ -958,16 +922,9 @@ impl TestRig { fn find_block_lookup_request( &mut self, - for_block: Hash256, + _for_block: 
Hash256, ) -> Result { - self.pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id: _, - request: RequestType::BlocksByRoot(request), - app_request_id: AppRequestId::Sync(SyncRequestId::SingleBlock { id }), - } if request.block_roots().to_vec().contains(&for_block) => Some(*id), - _ => None, - }) + todo!(); } #[track_caller] @@ -984,14 +941,14 @@ impl TestRig { NetworkMessage::SendRequest { peer_id: _, request: RequestType::BlobsByRoot(request), - app_request_id: AppRequestId::Sync(SyncRequestId::SingleBlob { id }), + app_request_id: AppRequestId::Sync(SyncRequestId::BlobsByRoot(id)), } if request .blob_ids .to_vec() .iter() .any(|r| r.block_root == for_block) => { - Some(*id) + todo!(); } _ => None, }) @@ -1004,16 +961,8 @@ impl TestRig { } #[track_caller] - fn expect_block_parent_request(&mut self, for_block: Hash256) -> SingleLookupReqId { - self.pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id: _, - request: RequestType::BlocksByRoot(request), - app_request_id: AppRequestId::Sync(SyncRequestId::SingleBlock { id }), - } if request.block_roots().to_vec().contains(&for_block) => Some(*id), - _ => None, - }) - .unwrap_or_else(|e| panic!("Expected block parent request for {for_block:?}: {e}")) + fn expect_block_parent_request(&mut self, _for_block: Hash256) -> SingleLookupReqId { + todo!(); } fn expect_no_requests_for(&mut self, block_root: Hash256) { @@ -1031,14 +980,14 @@ impl TestRig { NetworkMessage::SendRequest { peer_id: _, request: RequestType::BlobsByRoot(request), - app_request_id: AppRequestId::Sync(SyncRequestId::SingleBlob { id }), + app_request_id: AppRequestId::Sync(SyncRequestId::BlobsByRoot(id)), } if request .blob_ids .to_vec() .iter() .all(|r| r.block_root == for_block) => { - Some(*id) + todo!(); } _ => None, }) diff --git a/beacon_node/network/src/sync/tests/mod.rs b/beacon_node/network/src/sync/tests/mod.rs index cf8af7f5348..804be212ef9 100644 --- 
a/beacon_node/network/src/sync/tests/mod.rs +++ b/beacon_node/network/src/sync/tests/mod.rs @@ -20,7 +20,7 @@ use tokio::sync::mpsc; use tracing_subscriber::fmt::MakeWriter; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::util::SubscriberInitExt; -use types::{ChainSpec, ForkName, MinimalEthSpec as E, SignedBeaconBlock}; +use types::{ChainSpec, ForkName, Hash256, MinimalEthSpec as E, SignedBeaconBlock}; mod lookups; mod range; @@ -75,6 +75,7 @@ struct TestRig { // Cache of sent blocks for PeerDAS responses sent_blocks_by_range: HashMap>>>, + blocks_by_root: HashMap>>, } // Environment variable to read if `fork_from_env` feature is enabled. diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 599c808befa..323ac81144c 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -11,12 +11,14 @@ use beacon_chain::test_utils::{AttestationStrategy, BlockStrategy}; use beacon_chain::{block_verification_types::RpcBlock, EngineState, NotifyExecutionLayer}; use beacon_processor::WorkType; use lighthouse_network::rpc::methods::{ - BlobsByRangeRequest, DataColumnsByRangeRequest, OldBlocksByRangeRequest, + BlobsByRangeRequest, BlocksByRootRequest, DataColumnsByRangeRequest, DataColumnsByRootRequest, + OldBlocksByRangeRequest, }; use lighthouse_network::rpc::{RequestType, StatusMessage}; use lighthouse_network::service::api_types::{ - AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, - DataColumnsByRangeRequestId, SyncRequestId, + AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, BlocksByRootRequestId, + BlocksByRootRequester, ComponentsByRangeRequestId, DataColumnsByRangeRequestId, + DataColumnsByRootRequestId, HeaderLookupId, SyncRequestId, }; use lighthouse_network::types::SyncState; use lighthouse_network::{PeerId, SyncInfo}; @@ -69,12 +71,16 @@ struct Config { type BlocksByRangeRequestData = 
(BlocksByRangeRequestId, PeerId, OldBlocksByRangeRequest); +type BlocksByRootRequestData = (BlocksByRootRequestId, PeerId, BlocksByRootRequest); + type DataColumnsByRangeRequestData = ( DataColumnsByRangeRequestId, PeerId, DataColumnsByRangeRequest, ); +type DataColumnsByRootRequestData = (DataColumnsByRootRequestId, PeerId, DataColumnsByRootRequest); + /// Sync tests are usually written in the form: /// - Do some action /// - Expect a request to be sent @@ -126,6 +132,20 @@ impl RequestFilter { } } + fn blocks_by_root_requests( + &self, + ev: &NetworkMessage, + ) -> Option { + match ev { + NetworkMessage::SendRequest { + peer_id, + request: RequestType::BlocksByRoot(req), + app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRoot(id)), + } if self.matches_blocks_by_root(peer_id, req) => Some((*id, *peer_id, req.clone())), + _ => None, + } + } + fn data_columns_by_range_requests( &self, ev: &NetworkMessage, @@ -142,6 +162,26 @@ impl RequestFilter { } } + fn data_columns_by_root_requests( + &self, + ev: &NetworkMessage, + ) -> Option { + match ev { + NetworkMessage::SendRequest { + peer_id, + request: RequestType::DataColumnsByRoot(req), + app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRoot(id)), + } if self.matches_data_columns_by_root(peer_id, req) => { + Some((*id, *peer_id, req.clone())) + } + _ => None, + } + } + + fn matches_blocks_by_root(&self, peer: &PeerId, _req: &BlocksByRootRequest) -> bool { + self.matches_peer(peer) + } + fn matches_blocks_by_range(&self, peer: &PeerId, req: &OldBlocksByRangeRequest) -> bool { self.matches_common(peer, *req.start_slot()) } @@ -163,6 +203,19 @@ impl RequestFilter { self.matches_common(peer, req.start_slot) } + fn matches_data_columns_by_root(&self, peer: &PeerId, req: &DataColumnsByRootRequest) -> bool { + if let Some(index) = self.column_index { + if !req + .data_column_ids + .iter() + .any(|id| id.columns.iter().any(|i| *i == index)) + { + return false; + } + } + self.matches_peer(peer) + } + fn 
matches_common(&self, peer: &PeerId, start_slot: u64) -> bool { if let Some(expected_epoch) = self.epoch { let epoch = Slot::new(start_slot).epoch(E::slots_per_epoch()).as_u64(); @@ -170,6 +223,10 @@ impl RequestFilter { return false; } } + self.matches_peer(peer) + } + + fn matches_peer(&self, peer: &PeerId) -> bool { if let Some(expected_peer) = self.peer { if *peer != expected_peer { return false; @@ -426,7 +483,7 @@ impl TestRig { self.sync_manager.update_execution_engine_state(state); } - fn zero_block_at_slot(&mut self, slot: Slot, with_data: bool) -> Arc> { + fn zero_block_at_slot(&mut self, slot: Slot, with_data: bool) -> SignedBeaconBlock { let mut block = BeaconBlock::empty(&self.spec); if with_data { if let Ok(blob_kzg_commitments) = block.body_mut().blob_kzg_commitments_mut() { @@ -436,7 +493,24 @@ impl TestRig { } } *block.slot_mut() = slot; - Arc::new(SignedBeaconBlock::from_block(block, Signature::empty())) + SignedBeaconBlock::from_block(block, Signature::empty()) + } + + fn create_parent_chain(&mut self) -> (Hash256, Slot) { + let current_head = self.harness.chain.head(); + let mut parent_root = current_head.head_block_root(); + let mut slot = current_head.head_slot(); + for _ in 0..64 { + let mut block = self.zero_block_at_slot(slot, true); + *block.message_mut().parent_root_mut() = parent_root; + *block.message_mut().slot_mut() = slot; + let block_root = block.canonical_root(); + self.blocks_by_root.insert(block_root, block.into()); + + parent_root = block_root; + slot = slot + Slot::new(1); + } + (parent_root, slot) } fn last_sent_blocks_by_range( @@ -484,6 +558,33 @@ impl TestRig { } } + fn send_blocks_by_root_response( + &mut self, + req_id: BlocksByRootRequestId, + peer_id: PeerId, + blocks: &[Arc>], + ) { + let slots = blocks.iter().map(|block| block.slot()).collect::>(); + self.log(&format!( + "Completing BlocksByRoot request {req_id} to {peer_id} with blocks {slots:?}" + )); + + for block in blocks { + 
self.send_sync_message(SyncMessage::RpcBlock { + sync_request_id: SyncRequestId::BlocksByRoot(req_id), + peer_id, + beacon_block: Some(block.clone()), + seen_timestamp: D, + }); + } + self.send_sync_message(SyncMessage::RpcBlock { + sync_request_id: SyncRequestId::BlocksByRoot(req_id), + peer_id, + beacon_block: None, + seen_timestamp: D, + }); + } + fn send_data_columns_by_range_response( &mut self, id: DataColumnsByRangeRequestId, @@ -515,6 +616,37 @@ impl TestRig { }); } + fn send_data_columns_by_root_response( + &mut self, + id: DataColumnsByRootRequestId, + peer_id: PeerId, + data_columns: &[Arc>], + ) { + let mut ids = data_columns + .iter() + .map(|d| (d.slot().as_u64(), d.index)) + .collect::>(); + ids.sort_unstable(); + self.log(&format!( + "Completing DataColumnsByRange request {id} to {peer_id} with data_columns {ids:?}" + )); + + for data_column in data_columns { + self.send_sync_message(SyncMessage::RpcDataColumn { + sync_request_id: SyncRequestId::DataColumnsByRoot(id), + peer_id, + data_column: Some(data_column.clone()), + seen_timestamp: D, + }); + } + self.send_sync_message(SyncMessage::RpcDataColumn { + sync_request_id: SyncRequestId::DataColumnsByRoot(id), + peer_id, + data_column: None, + seen_timestamp: D, + }); + } + fn pop_blocks_by_range_request( &mut self, request_filter: RequestFilter, @@ -595,6 +727,7 @@ impl TestRig { let blocks = (0..complete_config.block_count) .map(|i| { self.zero_block_at_slot(start_slot + Slot::new(i as u64), complete_config.with_data) + .into() }) .collect::>(); self.send_blocks_by_range_response(blocks_req_id, block_peer, &blocks); @@ -612,6 +745,7 @@ impl TestRig { let blocks = (0..complete_config.block_count) .map(|i| { self.zero_block_at_slot(start_slot + Slot::new(i as u64), complete_config.with_data) + .into() }) .collect::>(); self.send_blocks_by_range_response(blocks_req_id, block_peer, &blocks); @@ -619,6 +753,29 @@ impl TestRig { blocks_req_id.parent_request_id.requester } + fn 
complete_blocks_by_root_request( + &mut self, + request: BlocksByRootRequestData, + complete_config: CompleteConfig, + ) -> BlocksByRootRequester { + let (blocks_req_id, block_peer, blocks_req) = request; + + let blocks = blocks_req + .block_roots() + .iter() + .map(|block_root| { + self.blocks_by_root + .get(block_root) + .expect("Test consumer requested unknown block") + .clone() + }) + .collect::>(); + + self.send_blocks_by_root_response(blocks_req_id, block_peer, &blocks); + + blocks_req_id.parent_request_id + } + fn complete_data_columns_by_range_request( &mut self, (id, peer_id, req): DataColumnsByRangeRequestData, @@ -679,6 +836,72 @@ impl TestRig { self.send_data_columns_by_range_response(id, peer_id, &data_columns); } + fn complete_data_columns_by_root_request_range_sync( + &mut self, + (id, peer_id, req): DataColumnsByRootRequestData, + complete_config: CompleteConfig, + ) { + // To reply with a valid DataColumnsByRange we need to construct + // DataColumnsByRange for the block root that we requested the block peer, plus + // figure out which exact columns we requested this peer + let mut triggered_custody_failure = false; + + let data_columns = req + .data_column_ids + .iter() + .flat_map(|column_id| { + let block = self + .blocks_by_root + .get(&column_id.block_root) + .expect("Test consumer requested unknown block") + .clone(); + + let kzg_commitments_inclusion_proof = block + .message() + .body() + .kzg_commitments_merkle_proof() + .unwrap(); + let kzg_commitments = block + .message() + .body() + .blob_kzg_commitments() + .unwrap() + .clone(); + let signed_block_header = block.signed_block_header(); + + column_id.columns.iter().filter_map(move |index| { + // Skip column generation if index is marked as failure + if complete_config.custody_failure_at_index == Some(*index) { + triggered_custody_failure = true; + return None; + } + + // We need to produce a DataColumn with valid inclusion proof, but can + // be with random KZG proof and data as we won't 
send it for processing + Some(Arc::new(DataColumnSidecar { + index: *index, + column: VariableList::empty(), + kzg_commitments: kzg_commitments.clone(), + kzg_proofs: VariableList::from(vec![]), + signed_block_header: signed_block_header.clone(), + kzg_commitments_inclusion_proof: kzg_commitments_inclusion_proof.clone(), + })) + }) + }) + .collect::>(); + + // Need to log here because I can't capture &mut self inside the columns iter + if triggered_custody_failure { + if let Some(index) = complete_config.custody_failure_at_index { + self.log(&format!( + "Forced custody failure at request {id} for peer {peer_id} index {index:?}" + )); + } + } + + self.send_data_columns_by_root_response(id, peer_id, &data_columns); + } + fn find_and_complete_data_by_range_request( &mut self, request_filter: RequestFilter, @@ -780,6 +1003,13 @@ impl TestRig { continue; } + if let Ok(request) = + self.pop_received_network_event(|ev| request_filter.blocks_by_root_requests(ev)) + { + self.complete_blocks_by_root_request(request, complete_config); + continue; + } + if let Ok(request) = self .pop_received_network_event(|ev| request_filter.data_columns_by_range_requests(ev)) { @@ -787,6 +1017,13 @@ impl TestRig { continue; } + if let Ok(request) = self + .pop_received_network_event(|ev| request_filter.data_columns_by_root_requests(ev)) + { + self.complete_data_columns_by_root_request_range_sync(request, complete_config); + continue; + } + let sync_state = self.get_sync_state(); self.log(&format!("Progressed sync, current state: {:?}", sync_state,)); @@ -1212,3 +1449,20 @@ fn finalized_sync_permanent_custody_peer_failure() { // custody_by_range request is still active waiting for a new peer to connect r.expect_active_block_components_by_range_request_on_custody_step(); } + +#[tokio::test] +async fn tree_sync_happy_path() { + let mut r = TestRig::test_setup(); + let (head_root, head_slot) = r.create_parent_chain(); + let remote_info = SyncInfo { + finalized_epoch: Epoch::new(0), + 
finalized_root: Hash256::ZERO, + head_slot, + head_root, + }; + r.add_sync_peer(false, remote_info.clone()); + r.progress_until_no_events(NO_FILTER, complete()); + r.add_sync_peer(true, remote_info); + r.progress_until_no_events(NO_FILTER, complete()); + r.expect_empty_network(); +} From f4b0e621bc87d5b088e57c6afce20f369bda7698 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 19 Jun 2025 19:48:57 +0200 Subject: [PATCH 25/66] Happy case works in test --- .../src/service/api_types.rs | 8 +- .../src/network_beacon_processor/mod.rs | 2 +- .../network_beacon_processor/sync_methods.rs | 22 +- beacon_node/network/src/sync/block_tree.rs | 553 +++++++++++------- beacon_node/network/src/sync/manager.rs | 62 +- .../network/src/sync/range_sync/chain.rs | 7 +- beacon_node/network/src/sync/tests/range.rs | 115 +++- 7 files changed, 487 insertions(+), 282 deletions(-) diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index edc61dfe777..9fadffb9eb4 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -38,7 +38,7 @@ pub struct BlocksByRootRequestId { } #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] -pub struct HeaderLookupId(pub Hash256); +pub struct HeaderLookupId(pub Hash256, pub Id); #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub struct BatchId(pub Id); @@ -105,7 +105,7 @@ pub struct ComponentsByRangeRequestId { /// Range sync chain or backfill batch #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub enum RangeRequestId { - RangeSync { chain_id: Id, batch_id: Epoch }, + RangeSync(HeaderLookupId), BackfillSync { batch_id: Epoch }, } @@ -275,7 +275,7 @@ impl Display for DataColumnsByRootRequester { impl Display for HeaderLookupId { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.0) + write!(f, "{}/{}", self.0, self.1) } } @@ 
-294,7 +294,7 @@ impl Display for CustodyRequester { impl Display for RangeRequestId { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { - Self::RangeSync { chain_id, batch_id } => write!(f, "RangeSync/{batch_id}/{chain_id}"), + Self::RangeSync(id) => write!(f, "RangeSync/{id}"), Self::BackfillSync { batch_id } => write!(f, "BackfillSync/{batch_id}"), } } diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index b7e01b84a4a..534057fae40 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -521,7 +521,7 @@ impl NetworkBeaconProcessor { blocks: Vec>, ) -> Result<(), Error> { let is_backfill = matches!(&process_id, ChainSegmentProcessId::BackSyncBatchId { .. }); - debug!(blocks = blocks.len(), id = ?process_id, "Batch sending for process"); + debug!(blocks = blocks.len(), id = %process_id, "Batch sending for process"); let processor = self.clone(); let process_fn = async move { diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index d4285c41cb5..1f2c56adaee 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -8,8 +8,10 @@ use beacon_chain::data_column_verification::verify_kzg_for_data_column_list; use beacon_chain::{ BeaconChainTypes, BlockError, ChainSegmentResult, HistoricalBlockError, NotifyExecutionLayer, }; +use lighthouse_network::service::api_types::HeaderLookupId; use lighthouse_network::PeerAction; use std::collections::HashMap; +use std::fmt::{Display, Formatter}; use std::sync::Arc; use std::time::Duration; use tracing::{debug, warn}; @@ -19,13 +21,12 @@ use types::{ColumnIndex, DataColumnSidecar, Epoch, Hash256}; #[derive(Clone, Debug, PartialEq)] pub enum ChainSegmentProcessId { /// 
Processing Id of a range syncing batch. - RangeBatchId(ChainId, Epoch), + RangeBatchId(HeaderLookupId), /// Processing ID for a backfill syncing batch. BackSyncBatchId(Epoch), } /// Returned when a chain segment import fails. -#[derive(Debug)] pub struct ChainSegmentFailed { /// To be displayed in logs. pub message: String, @@ -116,7 +117,7 @@ impl NetworkBeaconProcessor { ) { let result = match sync_type { // this a request from the range sync - ChainSegmentProcessId::RangeBatchId(chain_id, epoch) => { + ChainSegmentProcessId::RangeBatchId(id) => { let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64()); let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64()); let sent_blocks = downloaded_blocks.len(); @@ -127,9 +128,8 @@ impl NetworkBeaconProcessor { { (imported_blocks, Ok(_)) => { debug!( - batch_epoch = %epoch, + %id, first_block_slot = start_slot, - chain = chain_id, last_block_slot = end_slot, processed_blocks = sent_blocks, service= "sync", @@ -141,9 +141,8 @@ impl NetworkBeaconProcessor { } (imported_blocks, Err(e)) => { debug!( - batch_epoch = %epoch, + %id, first_block_slot = start_slot, - chain = chain_id, last_block_slot = end_slot, imported_blocks, error = %e.message, @@ -429,3 +428,12 @@ impl NetworkBeaconProcessor { }) } } + +impl Display for ChainSegmentProcessId { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::RangeBatchId(id) => write!(f, "RangeBatchId/{id}"), + Self::BackSyncBatchId(id) => write!(f, "BackSyncBatchId/{id}"), + } + } +} diff --git a/beacon_node/network/src/sync/block_tree.rs b/beacon_node/network/src/sync/block_tree.rs index 6bb071b13e4..d602265ab05 100644 --- a/beacon_node/network/src/sync/block_tree.rs +++ b/beacon_node/network/src/sync/block_tree.rs @@ -1,7 +1,11 @@ use super::network_context::{LookupRequestResult, RpcResponseError, SyncNetworkContext}; +use crate::network_beacon_processor::ChainSegmentProcessId; use 
crate::sync::network_context::custody_by_root::ColumnRequest; -use crate::sync::network_context::{BlocksByRootSameForkRequest, RpcResponseResult}; +use crate::sync::network_context::{ + BlocksByRootSameForkRequest, RpcResponseBatchResult, RpcResponseResult, +}; use crate::sync::range_sync::{BatchInfo, BatchPeers}; +use crate::sync::BatchProcessResult; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; use lighthouse_network::rpc::BlocksByRootRequest; @@ -13,81 +17,105 @@ use parking_lot::RwLock; use std::collections::{HashMap, HashSet}; use std::sync::Arc; use tracing::debug; -use types::{BeaconBlockHeader, Epoch, ForkName, Hash256, SignedBeaconBlock, Slot}; +use types::{BeaconBlockHeader, Epoch, EthSpec, ForkName, Hash256, SignedBeaconBlock, Slot}; pub struct BlockTree { - blocks: HashMap, + blocks: HashMap>, batches: HashMap>, - roots: HashMap, - tips: HashSet, chain: Arc>, } -struct TreeRoot { +struct Block { + id: HeaderLookupId, peers: HashSet, - request: ColumnRequest, + status: Status, } -struct Block { - id: HeaderLookupId, - block: BeaconBlockHeader, - is_syncing: bool, +enum Status { + DownloadingHeader(ColumnRequest), + Header(BeaconBlockHeader), + Syncing(BeaconBlockHeader, SyncingStatus), +} + +enum SyncingStatus { + AwaitingDownload, + Downloading(Id), + AwaitingProcessing(RpcBlock, BatchPeers), + Processing(BatchPeers), } // TODO(tree-sync): Re-add the reprocessing cache, so we don't process twice a block that we got // through gossip and sync. 
-impl Block { - fn new(block_root: Hash256, block: BeaconBlockHeader) -> Self { +impl Block { + fn new(block_root: Hash256, id: Id, peers: &[PeerId]) -> Self { Self { - id: HeaderLookupId(block_root), - block, - is_syncing: false, + id: HeaderLookupId(block_root, id), + peers: HashSet::from_iter(peers.iter().copied()), + status: Status::DownloadingHeader(ColumnRequest::new()), } } - fn start(&mut self, cx: &mut SyncNetworkContext) { - cx.block_lookup_request(self.id, &self.peers, self.id.0); - } + fn start(&mut self, cx: &mut SyncNetworkContext) {} fn on_error(&mut self, _e: RpcResponseError) { todo!(); } - fn slot(&self) -> Option { - if let Some(block) = self.request.peek_downloaded_data() { - Some(block.slot) - } else { - None - } - } - fn root(&self) -> Hash256 { todo!(); } + fn peer_count(&self) -> usize { + self.peers.len() + } + fn is_syncing(&self) -> bool { - self.is_syncing + match self.status { + Status::DownloadingHeader(..) => false, + Status::Header(..) => false, + Status::Syncing(..) => true, + } } - fn parent_root(&self) -> Option { - if let Some(block) = self.request.peek_downloaded_data() { - Some(block.parent_root) - } else { - None + fn header(&self) -> Option<&BeaconBlockHeader> { + match &self.status { + Status::DownloadingHeader(..) 
=> None, + Status::Header(header) => Some(header), + Status::Syncing(header, _) => Some(header), } } + fn parent_root(&self) -> Option { + self.header().map(|header| header.parent_root) + } + fn parent_root_and_slot(&self) -> Option<(Hash256, Slot)> { - if let Some(block) = self.request.peek_downloaded_data() { - Some((block.parent_root, block.slot)) - } else { - None + self.header() + .map(|header| (header.parent_root, header.slot)) + } + + fn header_request( + &mut self, + ) -> Result<&mut ColumnRequest, String> { + match &mut self.status { + Status::DownloadingHeader(request) => Ok(request), + _ => Err("Expected lookup to be in DownloadingHeader state".to_owned()), } } - fn is_rooted(&self) -> bool { - todo!(); + fn syncing(&mut self) -> Option<(&mut BeaconBlockHeader, &mut SyncingStatus)> { + match &mut self.status { + Status::Syncing(header, request) => Some((header, request)), + _ => None, + } + } + + fn block_request(&mut self) -> Result<&mut SyncingStatus, String> { + match &mut self.status { + Status::Syncing(_, request) => Ok(request), + _ => Err("Expected lookup to be in Syncing state".to_owned()), + } } } @@ -100,12 +128,24 @@ impl BlockTree { Self { blocks: <_>::default(), batches: <_>::default(), - roots: <_>::default(), - tips: <_>::default(), chain, } } + #[cfg(test)] + pub fn get_processing_ids(&self) -> Vec { + self.blocks + .values() + .filter(|block| { + matches!( + block.status, + Status::Syncing(_, SyncingStatus::Processing(_)), + ) + }) + .map(|block| block.id) + .collect() + } + pub fn pause(&mut self) { todo!() } @@ -121,63 +161,27 @@ impl BlockTree { cx: &mut SyncNetworkContext, ) -> bool { if self.blocks.contains_key(&block_root) { - // `block_root` points to a known block item in the header DAG - // Target root is the oldest known ancestor of `block_root` in the header tree - let oldest_ancestor = self.oldest_known_ancestor(block_root); - let Some(root) = self.roots.get_mut(&oldest_ancestor) else { - panic!("root node should exist"); - 
}; - // Add peer to the root's peer set - for peer in peers { - if root.peers.insert(peer) { - debug!(block_root = ?oldest_ancestor, ?peer, "Adding peer to existing header lookup"); + // Add peer to `block`'s entry and all its ancestors + let mut target_block_root = block_root; + while let Some(lookup) = self.blocks.get_mut(&target_block_root) { + for peer in peers { + // TODO(tree-sync): If peer already in set no need to add to its ancestors + lookup.peers.insert(*peer); + // TODO(tree-sync): This log can be very noisy maybe log once per peer + debug!(block_root = ?target_block_root, ?peer, "Adding peer to existing header lookup"); + } + if let Some(parent_root) = lookup.parent_root() { + target_block_root = parent_root; + } else { + break; } } + true } else { debug!(?block_root, ?peers, "Creating new header lookup"); - let new_lookup_peers = HashSet::from_iter(peers); - - // If any root has a parent that points to `block_root` remove them from roots and don't - // make `block_root` node a tip - let roots_that_descend_from_new_block = self - .roots - .keys() - .filter(|root| { - if let Some(parent_root) = self - .blocks - .get(root) - .expect("node must exist") - .parent_root() - { - parent_root == block_root - } else { - false - } - }) - .copied() - .collect::>(); - - // We only remove roots that have have a known parent, so they have completed download - for block_root in roots_that_descend_from_new_block { - let root = self.roots.remove(&block_root).expect("node must exist"); - new_lookup_peers.extend(root.peers.values()); - } - - // New nodes always become roots since we don't know their parent - self.roots.insert( - block_root, - TreeRoot { - peers: new_lookup_peers, - request: ColumnRequest::new(), - }, - ); - - // If no one descends from this new node, add it to tips - if roots_that_descend_from_new_block.is_empty() { - self.tips.insert(block_root); - } + let mut lookup = Block::new(block_root, cx.next_id(), peers); // TODO(tree-sync): have good peer 
selection let Some(peer) = lookup.peers.iter().next() else { @@ -192,7 +196,11 @@ impl BlockTree { ) .unwrap(); - lookup.request.on_download_start(req_id).unwrap(); + lookup + .header_request() + .expect("A new lookup is in DownloadingHeader request state") + .on_download_start(req_id) + .expect("A new request is in AwaitingDownload state"); self.blocks.insert(block_root, lookup); true @@ -225,7 +233,7 @@ impl BlockTree { } } - pub fn on_block( + pub fn on_block_header( &mut self, req_id: BlocksByRootRequestId, lookup_id: HeaderLookupId, @@ -234,31 +242,30 @@ impl BlockTree { cx: &mut SyncNetworkContext, ) -> Result<(), String> { let block_root = lookup_id.0; - let Some(lookup) = self.roots.get_mut(&block_root) else { + let Some(lookup) = self.blocks.get_mut(&block_root) else { return Err(format!("No header lookup for root {block_root}")); }; - match response { - Ok((blocks, received)) => { - if blocks.len() != 1 { - return Err(format!( - "Lookup {block_root} returned {} blocks expecting 1", - blocks.len() - )); - } - let block = blocks.first().expect("blocks len == 1").clone(); + let response = response.and_then(|(blocks, timestamp)| { + let block = blocks + .first() + .cloned() + .ok_or(RpcResponseError::InternalError( + "blocks_by_root response contains zero blocks".to_owned(), + ))?; + Ok((block, timestamp)) + }); + match response { + Ok((block, received)) => { let block_header = block.message().block_header(); let parent_root = block_header.parent_root; lookup - .request - .on_download_success(req_id, peer_id, block_header, received) + .header_request()? + .on_download_success(req_id, peer_id, block_header.clone(), received) .unwrap(); - - // TODO(tree-sync): Should check if node already exist to not override state - self.blocks - .insert(block_root, Block::new(block_root, block_header)); + lookup.status = Status::Header(block_header.clone()); // Once we discover the parent_root of this block three things can happen // 1. 
The parent root is a known block -> stop @@ -269,30 +276,51 @@ impl BlockTree { // TODO(tree-sync): on finalization or every interval we should drop branches that // conflict with finality let parent_imported = self.chain.block_is_known_to_fork_choice(&parent_root); + let finalized_checkpoint = self.chain.head().finalized_checkpoint(); let parent_known = self.blocks.contains_key(&parent_root); - if parent_known { - self.tips.remove(&parent_root); - } - - let finalized_slot = Slot::new(0); - - if block_header.slot <= finalized_slot { - panic!("Block conflicts with finality"); + if block_header.slot + <= finalized_checkpoint + .epoch + .start_slot(T::EthSpec::slots_per_epoch()) + && block_root != finalized_checkpoint.root + { + panic!( + "Block {:?} {} conflicts with finalized checkpoint {:?}", + block_root, block_header.slot, finalized_checkpoint + ); } if parent_imported || parent_known { // Stop search we reached a known block - self.mark_descendants_as_rooted(parent_root); self.trigger_forward_sync(cx); } else { let lookup = self.blocks.get_mut(&block_root).expect("lookup exists"); - let peers = lookup.peers(); + let peers = lookup.peers.iter().copied().collect::>(); self.search(parent_root, &peers, cx); } } Err(e) => { - lookup.request.on_download_error(req_id).unwrap(); - lookup.start(cx); + lookup.header_request()?.on_download_error(req_id).unwrap(); + + // TODO(tree-sync): have good peer selection + let Some(peer) = lookup.peers.iter().next() else { + todo!("no peer"); + }; + + let req_id = cx + .send_blocks_by_root_request( + *peer, + BlocksByRootRequest::new(vec![block_root], cx.spec(), ForkName::Fulu), + BlocksByRootRequester::Header(lookup.id), + ) + .unwrap(); + + lookup + .header_request() + .expect("A new lookup is in DownloadingHeader request state") + .on_download_start(req_id) + .expect("A new request is in AwaitingDownload state"); + todo!("error {e:?}"); } } @@ -328,126 +356,207 @@ impl BlockTree { ancestors } + /// Marks blocks ready for download 
as syncing + /// Should be called anytime: + /// - A new block is imported to fork-choice + /// - A block in the header tree is advanced to Syncing + /// - A new header is downloaded with a parent that is imported or syncing fn trigger_forward_sync(&mut self, cx: &mut SyncNetworkContext) { - // Find the block range with most peers and highest slot. This is the block - // to be used as tip of the chain of blocks to fetch. - let Some(block_root) = self + // We want to download and import blocks whose parent is imported in our fork-choice. Also + // to buffer we want to download children of blocks that are awaiting import. + // + // We may want to avoid 1M calls into fork-choice to check if a block is imported. We only + // need to work of roots. Once a root is processed we have re-compute roots, or track + // children. + + // TODO(tree-sync): don't build on demand, cache roots somewhere + + let blocks_syncing = self .blocks - .iter() - .filter_map(|(root, block)| { - // Ignore blocks that are already being forward synced - if block.is_syncing() { - return None; - } - // Ignore block roots which header is not downloaded yet - let Some((parent_root, slot)) = block.parent_root_and_slot() else { - return None; - }; - // Check if the parent is known in the header tree - if let Some(slot) = block.slot() { - // Find highest peer count, then slot - Some((block.peer_count(), slot, root)) - } else { - None + .values() + .filter(|block| block.is_syncing()) + .count(); + let mut new_syncing_blocks = false; + + // Have up to 2 blocks syncing + for _ in blocks_syncing..2 { + // Find the block range with most peers and highest slot. This is the block + // to be used as tip of the chain of blocks to fetch. 
+ let Some(block_root) = self + .blocks + .iter() + .filter_map(|(root, block)| { + // Ignore blocks that are already being forward synced + if block.is_syncing() { + return None; + } + // Ignore block roots which header is not downloaded yet + let Some((parent_root, slot)) = block.parent_root_and_slot() else { + return None; + }; + // Check if the parent is known in the header tree + let is_candidate = if let Some(parent) = self.blocks.get(&parent_root) { + parent.is_syncing() + } else { + // TODO(tree-sync): cache this calls in the struct + cx.chain.block_is_known_to_fork_choice(&parent_root) + }; + + if is_candidate { + // Find highest peer count, then min slot + Some((block.peer_count(), Slot::new(u64::MAX) - slot, root)) + } else { + None + } + }) + .max() + .map(|(_, _, root)| *root) + else { + break; + }; + + // Start syncing `block_root` + let block_to_sync = self + .blocks + .get_mut(&block_root) + .expect("Block should exist"); + + match &mut block_to_sync.status { + Status::Header(header) => { + block_to_sync.status = + Status::Syncing(header.clone(), SyncingStatus::AwaitingDownload); } - }) - .max() - .map(|(_, _, root)| *root) - else { - return; - }; + _ => panic!("Unpected state"), + } + debug!(id = %block_to_sync.id, "Starting forwards sync of block"); - // Get the chain of ancestors of that block_root. Because they are ancestors - // of block_root all these blocks have the same peer count as `block_root`. - // Consider limiting the length of blocks so some sensible number to not sync - // too much at once. There's no good reason to do a big fetch at once. 
- let blocks = self.collect_ancestors(block_root); - self.mark_as_syncing(&blocks); + new_syncing_blocks = true; + } - // TODO: We can sync parallel chains at once here, if we have multiple chains - // rooted in different places - let peers = self - .blocks - .get(&block_root) - .expect("block for block_root should exist") - .peers(); + if new_syncing_blocks { + self.continue_syncing_blocks(cx); + } + } - self.forward_sync_blocks(&blocks, &peers, cx) + fn continue_syncing_blocks(&mut self, cx: &mut SyncNetworkContext) { + for lookup in self.blocks.values_mut().filter(|block| block.is_syncing()) { + match &mut lookup.status { + Status::Syncing(header, syncing_status) => match syncing_status { + SyncingStatus::AwaitingDownload => { + let request = BlocksByRootSameForkRequest { + // TODO(tree-sync): cache block root + block_roots: vec![header.canonical_root()], + fork: cx.spec().fork_name_at_slot::(header.slot), + }; + + // TODO + let chain_id = cx.next_id(); + let requester = RangeRequestId::RangeSync(lookup.id); + let peers = Arc::new(RwLock::new(HashSet::from_iter( + lookup.peers.iter().copied(), + ))); + let failed_peers = HashSet::new(); + + match cx.block_components_by_range_request( + request, + requester, + peers, + &failed_peers, + ) { + Ok(req_id) => { + *syncing_status = SyncingStatus::Downloading(req_id); + } + Err(e) => { + // Log failed chain, mark blocks as not syncing + } + }; + } + SyncingStatus::Downloading(_) => {} // wait for event + SyncingStatus::AwaitingProcessing(block, peers) => { + let Some(beacon_processor) = cx.beacon_processor_if_enabled() else { + todo!("processor disabled"); + }; + if let Err(e) = beacon_processor.send_chain_segment( + ChainSegmentProcessId::RangeBatchId(lookup.id), + vec![block.clone()], + ) { + todo!("error sending"); + } + *syncing_status = SyncingStatus::Processing(peers.clone()); + } + SyncingStatus::Processing(_) => {} // wait for event + }, + _ => panic!("bad state"), + } + } } - fn forward_sync_blocks( + pub fn 
on_blocks_response( &mut self, - blocks: &[Hash256], - peers: &[PeerId], + id: HeaderLookupId, + result: Result<(Vec>, BatchPeers), RpcResponseError>, cx: &mut SyncNetworkContext, ) { - // Create a batch with this blocks - // Trigger batch sync - - let headers = blocks - .iter() - .map(|root| { - self.blocks - .get(root) - .expect("block should exist") - .request - .peek_downloaded_data() - .expect("header should be downloaded") - .clone() - }) - .collect::>(); - - // TODO(tree-sync): only choose ranges of blocks in the same fork - let first_header = headers.first().unwrap(); - let fork = cx.spec().fork_name_at_slot::(first_header.slot); - - // Create batch here? - let mut batch = BatchInfo::new(blocks.to_vec()); - - let request = BlocksByRootSameForkRequest { - block_roots: batch - .to_blocks_by_root_request(cx.spec()) - .block_roots() - .to_vec(), - fork, - }; - let chain_id = cx.next_id(); - let requester = RangeRequestId::RangeSync { - chain_id, - batch_id: Epoch::new(0), + // TODO(tree-sync): attach an ID to the block entry to make sure we are querying the right + // one, while still indexing by block_root only + let Some(lookup) = self.blocks.get_mut(&id.0) else { + panic!("Unknown batch id {id}"); }; - let peers = Arc::new(RwLock::new(HashSet::from_iter(peers.iter().copied()))); - let failed_peers = HashSet::new(); - - let id = - match cx.block_components_by_range_request(request, requester, peers, &failed_peers) { - Ok(req_id) => { - // TODO: Update batch state - batch.start_downloading(req_id); - self.batches.insert(chain_id, batch); + + let result = result.and_then(|(blocks, peers)| { + let block = blocks + .first() + .cloned() + .ok_or(RpcResponseError::InternalError( + "blocks_by_root response contains zero blocks".to_owned(), + ))?; + Ok((block, peers)) + }); + + let request = lookup.block_request().unwrap(); + match request { + SyncingStatus::Downloading(_) => match result { + Ok((block, peers)) => { + debug!(%id, "Sync block downloaded"); + *request 
= SyncingStatus::AwaitingProcessing(block, peers); } Err(e) => { - // Log failed chain, mark blocks as not syncing + debug!(%id, "Sync block download error"); + *request = SyncingStatus::AwaitingDownload; } - }; + }, + _ => panic!("Bad state"), + } + + // Continue batches + self.continue_syncing_blocks(cx); } - pub fn on_blocks_response( + pub fn handle_block_process_result( &mut self, - batch_id: Id, - blocks: Vec>, - batch_peers: BatchPeers, + id: HeaderLookupId, + result: BatchProcessResult, + cx: &mut SyncNetworkContext, ) { - let Some(batch) = self.batches.get_mut(&batch_id) else { - panic!("Unknown batch id {batch_id}"); + let Some(lookup) = self.blocks.get_mut(&id.0) else { + panic!("Unknown batch id {id}"); }; - let received = batch - .download_completed(blocks, batch_peers) - .map_err(|e| e.0) - .unwrap(); - debug!(%batch_id, blocks = received, "Batch downloaded"); - - // Continue batches + let request = lookup.block_request().unwrap(); + match request { + SyncingStatus::Processing(peers) => match result { + BatchProcessResult::Success { .. } => { + debug!(%id, "Sync block process success"); + self.blocks.remove(&id.0); + self.trigger_forward_sync(cx); + } + BatchProcessResult::FaultyFailure { .. 
} | BatchProcessResult::NonFaultyFailure => { + debug!(%id, "Sync block process error"); + *request = SyncingStatus::AwaitingDownload; + // TODO(tree-sync): add peer to failed peers and downscore + } + }, + _ => panic!("Bad state"), + } } } diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index e36e462c1b8..e8bcffd7c16 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -363,6 +363,12 @@ impl SyncManager { &mut self.range_sync } + // Leak the full struct to prevent having to add many cfg(test) methods here + #[cfg(test)] + pub(crate) fn block_tree(&mut self) -> &mut BlockTree { + &mut self.block_tree + } + #[cfg(test)] pub(crate) fn update_execution_engine_state(&mut self, state: EngineState) { self.handle_new_execution_engine_state(state); @@ -833,13 +839,9 @@ impl SyncManager { imported, } => self.block_tree.prune_root(block_root, imported), SyncMessage::BatchProcessed { sync_type, result } => match sync_type { - ChainSegmentProcessId::RangeBatchId(chain_id, epoch) => { - self.range_sync.handle_block_process_result( - &mut self.network, - chain_id, - epoch, - result, - ); + ChainSegmentProcessId::RangeBatchId(id) => { + self.block_tree + .handle_block_process_result(id, result, &mut self.network); self.update_sync_state(); } ChainSegmentProcessId::BackSyncBatchId(epoch) => { @@ -1028,8 +1030,13 @@ impl SyncManager { { match req_id.parent_request_id { BlocksByRootRequester::Header(lookup_id) => { - self.block_tree - .on_block(req_id, lookup_id, result, peer_id, &mut self.network); + self.block_tree.on_block_header( + req_id, + lookup_id, + result, + peer_id, + &mut self.network, + ); } BlocksByRootRequester::RangeSync(batch_id) => { self.on_block_components_by_range_response( @@ -1232,15 +1239,14 @@ impl SyncManager { .network .on_block_components_by_range_response(range_request_id, range_block_component) { - match result { - Ok((blocks, batch_peers)) => { - match 
range_request_id.requester { - RangeRequestId::RangeSync { chain_id, batch_id } => { - self.block_tree - .on_blocks_response(chain_id, blocks, batch_peers); - self.update_sync_state(); - } - RangeRequestId::BackfillSync { batch_id } => { + match range_request_id.requester { + RangeRequestId::RangeSync(id) => { + self.block_tree + .on_blocks_response(id, result, &mut self.network); + } + RangeRequestId::BackfillSync { batch_id } => { + match result { + Ok((blocks, batch_peers)) => { match self.backfill_sync.on_block_response( &mut self.network, batch_id, @@ -1257,21 +1263,7 @@ impl SyncManager { } } } - } - } - Err(e) => match range_request_id.requester { - RangeRequestId::RangeSync { chain_id, batch_id } => { - self.range_sync.inject_error( - &mut self.network, - batch_id, - chain_id, - range_request_id.id, - e, - ); - self.update_sync_state(); - } - RangeRequestId::BackfillSync { batch_id } => { - match self.backfill_sync.inject_error( + Err(e) => match self.backfill_sync.inject_error( &mut self.network, batch_id, range_request_id.id, @@ -1279,9 +1271,9 @@ impl SyncManager { ) { Ok(_) => {} Err(_) => self.update_sync_state(), - } + }, } - }, + } } } } diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index b484b7e4ac6..82c6b24a0c6 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -310,7 +310,7 @@ impl SyncingChain { duration_in_awaiting_processing, ); - let process_id = ChainSegmentProcessId::RangeBatchId(self.id, batch_id); + let process_id = todo!(); self.current_processing_batch = Some(batch_id); if let Err(e) = beacon_processor.send_chain_segment(process_id, blocks) { @@ -935,10 +935,7 @@ impl SyncingChain { match network.block_components_by_range_request( todo!(), - RangeRequestId::RangeSync { - chain_id: self.id, - batch_id, - }, + todo!(), self.peers.clone(), failed_peers, ) { diff --git 
a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 323ac81144c..f945a5fb252 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -5,10 +5,14 @@ use crate::sync::manager::SLOT_IMPORT_TOLERANCE; use crate::sync::network_context::{BlockComponentsByRangeRequestStep, RangeRequestId}; use crate::sync::range_sync::{BatchId, BatchState, RangeSyncType}; use crate::sync::tests::lookups::TestOptions; +use crate::sync::BatchProcessResult; use crate::sync::{ChainId, SyncMessage}; use beacon_chain::data_column_verification::CustodyDataColumn; use beacon_chain::test_utils::{AttestationStrategy, BlockStrategy}; -use beacon_chain::{block_verification_types::RpcBlock, EngineState, NotifyExecutionLayer}; +use beacon_chain::{ + block_verification_types::RpcBlock, EngineState, NotifyExecutionLayer, + PayloadVerificationStatus, +}; use beacon_processor::WorkType; use lighthouse_network::rpc::methods::{ BlobsByRangeRequest, BlocksByRootRequest, DataColumnsByRangeRequest, DataColumnsByRootRequest, @@ -496,7 +500,45 @@ impl TestRig { SignedBeaconBlock::from_block(block, Signature::empty()) } - fn create_parent_chain(&mut self) -> (Hash256, Slot) { + async fn create_unimported_parent_chain(&mut self) -> (Hash256, Slot) { + let block_count = 8; + self.log(&format!( + "Creating unimported chain of {block_count} blocks" + )); + + let mut r = TestRig::test_setup(); + + r.harness.advance_slot(); + let head_root = r + .harness + .extend_chain( + block_count, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + let store = &r.harness.chain.store; + let head_block = store.get_full_block(&head_root).unwrap().unwrap(); + + let mut target_block_root = head_root; + while let Some(block) = store.get_full_block(&target_block_root).unwrap() { + self.log(&format!( + "Adding block {target_block_root:?} slot {} to known blocks", + block.slot() + )); + let parent_root = 
block.parent_root(); + self.blocks_by_root.insert(target_block_root, block.into()); + if parent_root == Hash256::ZERO { + break; + } + target_block_root = parent_root; + } + + (head_root, head_block.slot()) + } + + fn create_not_rooted_parent_chain(&mut self) -> (Hash256, Slot) { let current_head = self.harness.chain.head(); let mut parent_root = current_head.head_block_root(); let mut slot = current_head.head_slot(); @@ -628,7 +670,7 @@ impl TestRig { .collect::>(); ids.sort_unstable(); self.log(&format!( - "Completing DataColumnsByRange request {id} to {peer_id} with data_columns {ids:?}" + "Completing DataColumnsByRoot request {id} to {peer_id} with data_columns {ids:?}" )); for data_column in data_columns { @@ -990,6 +1032,56 @@ impl TestRig { } } + fn complete_block_processing(&mut self, ids: Vec) { + // Sort ids first as we need to process blocks in order of ancestors. This only works if the + // test does not send blocks of two parallel chains at once. + let mut blocks = ids + .into_iter() + .map(|id| { + let block = self + .blocks_by_root + .get(&id.0) + .cloned() + .expect("unknown block"); + (id, block) + }) + .collect::>(); + blocks.sort_by_key(|(_, block)| block.slot()); + + for (id, block) in blocks { + self.log(&format!( + "Completing block processing {id} slot {}", + block.slot() + )); + + { + let mut head_state = self.harness.chain.head().snapshot.beacon_state.clone(); + *head_state.slot_mut() = block.slot(); + + let mut fork_choice = self.harness.chain.canonical_head.fork_choice_write_lock(); + fork_choice + .on_block( + block.slot(), + block.message(), + id.0, + Duration::from_secs(0), + &head_state, + PayloadVerificationStatus::Verified, + &self.spec, + ) + .expect("error importing block to fork-choice"); + } + + self.send_sync_message(SyncMessage::BatchProcessed { + sync_type: ChainSegmentProcessId::RangeBatchId(id), + result: BatchProcessResult::Success { + sent_blocks: 1, + imported_blocks: 1, + }, + }); + } + } + fn progress_until_no_events( 
&mut self, request_filter: RequestFilter, @@ -1024,6 +1116,13 @@ impl TestRig { continue; } + // TODO(tree-sync): find a way to get this info from the beacon processor events + let ids = self.sync_manager.block_tree().get_processing_ids(); + if !ids.is_empty() { + self.complete_block_processing(ids); + continue; + } + let sync_state = self.get_sync_state(); self.log(&format!("Progressed sync, current state: {:?}", sync_state,)); @@ -1061,15 +1160,15 @@ impl TestRig { request_filter.epoch(epoch), complete_config, ); - if let RangeRequestId::RangeSync { batch_id, .. } = id { - assert_eq!(batch_id.as_u64(), epoch, "Unexpected batch_id"); + if let RangeRequestId::RangeSync { .. } = id { + todo!(); } else { panic!("unexpected RangeRequestId {id}"); } let id = match id { - RangeRequestId::RangeSync { chain_id, batch_id } => { - ChainSegmentProcessId::RangeBatchId(chain_id, batch_id) + RangeRequestId::RangeSync(id) => { + todo!(); } RangeRequestId::BackfillSync { batch_id } => { ChainSegmentProcessId::BackSyncBatchId(batch_id) @@ -1453,7 +1552,7 @@ fn finalized_sync_permanent_custody_peer_failure() { #[tokio::test] async fn tree_sync_happy_path() { let mut r = TestRig::test_setup(); - let (head_root, head_slot) = r.create_parent_chain(); + let (head_root, head_slot) = r.create_unimported_parent_chain().await; let remote_info = SyncInfo { finalized_epoch: Epoch::new(0), finalized_root: Hash256::ZERO, From d3d1457fddf57a699d3547b0ce4b2c629daa081c Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Fri, 20 Jun 2025 10:36:42 +0200 Subject: [PATCH 26/66] Cleanup requests --- beacon_node/network/src/sync/block_tree.rs | 26 +-- beacon_node/network/src/sync/manager.rs | 2 +- .../network/src/sync/network_context.rs | 74 ++++----- .../block_components_by_range.rs | 151 +++++++----------- .../sync/network_context/custody_by_root.rs | 8 +- .../src/sync/network_context/requests.rs | 2 +- .../network_context/requests/blobs_by_root.rs | 16 +- 
.../requests/blocks_by_root.rs | 14 +- .../requests/data_columns_by_root.rs | 22 +-- beacon_node/network/src/sync/peer_sampling.rs | 2 +- 10 files changed, 119 insertions(+), 198 deletions(-) diff --git a/beacon_node/network/src/sync/block_tree.rs b/beacon_node/network/src/sync/block_tree.rs index d602265ab05..9a7561df21a 100644 --- a/beacon_node/network/src/sync/block_tree.rs +++ b/beacon_node/network/src/sync/block_tree.rs @@ -191,7 +191,7 @@ impl BlockTree { let req_id = cx .send_blocks_by_root_request( *peer, - BlocksByRootRequest::new(vec![block_root], cx.spec(), ForkName::Fulu), + block_root, BlocksByRootRequester::Header(lookup.id), ) .unwrap(); @@ -310,7 +310,7 @@ impl BlockTree { let req_id = cx .send_blocks_by_root_request( *peer, - BlocksByRootRequest::new(vec![block_root], cx.spec(), ForkName::Fulu), + block_root, BlocksByRootRequester::Header(lookup.id), ) .unwrap(); @@ -443,13 +443,7 @@ impl BlockTree { match &mut lookup.status { Status::Syncing(header, syncing_status) => match syncing_status { SyncingStatus::AwaitingDownload => { - let request = BlocksByRootSameForkRequest { - // TODO(tree-sync): cache block root - block_roots: vec![header.canonical_root()], - fork: cx.spec().fork_name_at_slot::(header.slot), - }; - - // TODO + // TODO(tree-sync): pick the right ID let chain_id = cx.next_id(); let requester = RangeRequestId::RangeSync(lookup.id); let peers = Arc::new(RwLock::new(HashSet::from_iter( @@ -458,7 +452,7 @@ impl BlockTree { let failed_peers = HashSet::new(); match cx.block_components_by_range_request( - request, + header.canonical_root(), requester, peers, &failed_peers, @@ -494,7 +488,7 @@ impl BlockTree { pub fn on_blocks_response( &mut self, id: HeaderLookupId, - result: Result<(Vec>, BatchPeers), RpcResponseError>, + result: Result<(RpcBlock, BatchPeers), RpcResponseError>, cx: &mut SyncNetworkContext, ) { // TODO(tree-sync): attach an ID to the block entry to make sure we are querying the right @@ -503,16 +497,6 @@ impl BlockTree { 
panic!("Unknown batch id {id}"); }; - let result = result.and_then(|(blocks, peers)| { - let block = blocks - .first() - .cloned() - .ok_or(RpcResponseError::InternalError( - "blocks_by_root response contains zero blocks".to_owned(), - ))?; - Ok((block, peers)) - }); - let request = lookup.block_request().unwrap(); match request { SyncingStatus::Downloading(_) => match result { diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index e8bcffd7c16..2fa10ae83de 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -1252,7 +1252,7 @@ impl SyncManager { batch_id, batch_peers, range_request_id.id, - blocks, + vec![blocks], ) { Ok(ProcessResult::SyncCompleted) => self.update_sync_state(), Ok(ProcessResult::Successful) => {} diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 85e886c4885..c93850bedb4 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -12,7 +12,6 @@ use crate::network_beacon_processor::TestBeaconChainType; use crate::service::NetworkMessage; use crate::status::ToStatusMessage; use crate::sync::block_lookups::SingleLookupId; -use crate::sync::network_context::requests::BlobCountPerBlock; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessStatus, EngineState}; pub use block_components_by_range::BlockComponentsByRangeRequest; @@ -488,7 +487,7 @@ impl SyncNetworkContext { /// A blocks by range request sent by the range sync algorithm pub fn block_components_by_range_request( &mut self, - request: BlocksByRootSameForkRequest, + block_root: Hash256, requester: RangeRequestId, peers: Arc>>, peers_to_deprioritize: &HashSet, @@ -499,7 +498,7 @@ impl SyncNetworkContext { }; let req = - BlockComponentsByRangeRequest::new(id, request, peers, peers_to_deprioritize, self)?; + 
BlockComponentsByRangeRequest::new(id, block_root, peers, peers_to_deprioritize, self)?; self.block_components_by_range_requests.insert(id, req); @@ -575,7 +574,7 @@ impl SyncNetworkContext { &mut self, requester: DataColumnsByRootRequester, peer_id: PeerId, - block_roots: Vec, + block_root: Hash256, indices: Vec, expect_max_responses: bool, ) -> Result, &'static str> { @@ -592,13 +591,10 @@ impl SyncNetworkContext { }; let request = DataColumnsByRootRequest::new( - block_roots - .iter() - .map(|block_root| DataColumnsByRootIdentifier { - block_root: *block_root, - columns: RuntimeVariableList::from_vec(indices.clone(), usize::MAX), - }) - .collect(), + vec![DataColumnsByRootIdentifier { + block_root, + columns: RuntimeVariableList::from_vec(indices.clone(), usize::MAX), + }], usize::MAX, ); @@ -611,7 +607,7 @@ impl SyncNetworkContext { debug!( method = "DataColumnsByRoot", peer = %peer_id, - ?block_roots, + ?block_root, ?indices, %id, "Sync RPC request sent" @@ -621,7 +617,7 @@ impl SyncNetworkContext { id, peer_id, expect_max_responses, - DataColumnsByRootRequestItems::new(block_roots, indices), + DataColumnsByRootRequestItems::new(block_root, indices), ); Ok(LookupRequestResult::RequestSent(id)) @@ -634,7 +630,7 @@ impl SyncNetworkContext { pub fn send_custody_by_root_request( &mut self, parent_request_id: ComponentsByRangeRequestId, - request: BlocksByRootRequest, + block_root: Hash256, lookup_peers: Arc>>, ) -> Result { let span = span!( @@ -656,12 +652,8 @@ impl SyncNetworkContext { .into_iter() .collect::>(); - let mut request = ActiveCustodyByRootRequest::new( - request.block_roots().to_vec(), - id, - &custody_indices, - lookup_peers, - ); + let mut request = + ActiveCustodyByRootRequest::new(block_root, id, &custody_indices, lookup_peers); // Note that you can only send, but not handle a response here match request.continue_requests(self) { @@ -679,7 +671,7 @@ impl SyncNetworkContext { pub fn send_blocks_by_root_request( &mut self, peer_id: PeerId, - 
request: BlocksByRootRequest, + block_root: Hash256, parent_request_id: BlocksByRootRequester, ) -> Result { let id = BlocksByRootRequestId { @@ -687,6 +679,8 @@ impl SyncNetworkContext { parent_request_id, }; + let request = BlocksByRootRequest::new(vec![block_root], self.spec(), ForkName::Fulu); + // Lookup sync event safety: If network_send.send() returns Ok(_) we are guaranteed that // eventually at least one this 3 events will be received: // - StreamTermination(request_id): handled by `Self::on_single_block_response` @@ -696,7 +690,7 @@ impl SyncNetworkContext { self.network_send .send(NetworkMessage::SendRequest { peer_id, - request: RequestType::BlocksByRoot(request.clone().into()), + request: RequestType::BlocksByRoot(request), app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRoot(id)), }) .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; @@ -714,7 +708,7 @@ impl SyncNetworkContext { // true = enforce max_requests as returned for blocks_by_root. 
We always request from // peers to claim to have these blocks true, - BlocksByRootRequestItems::new(request), + BlocksByRootRequestItems::new(block_root), ); Ok(id) } @@ -760,7 +754,8 @@ impl SyncNetworkContext { fn send_blobs_by_root_request( &mut self, peer_id: PeerId, - request: BlobCountPerBlock, + block_root: Hash256, + blobs_per_block: usize, parent_request_id: ComponentsByRangeRequestId, ) -> Result { let id = BlobsByRootRequestId { @@ -768,14 +763,12 @@ impl SyncNetworkContext { parent_request_id, }; - let blob_identifiers = request - .0 + let indices = (0..(blobs_per_block as u64)).collect::>(); + let blob_identifiers = indices .iter() - .flat_map(|(block_root, blob_count)| { - (0..(*blob_count as u64)).map(|index| BlobIdentifier { - block_root: *block_root, - index, - }) + .map(|index| BlobIdentifier { + block_root, + index: *index, }) .collect::>(); @@ -802,7 +795,7 @@ impl SyncNetworkContext { peer_id, // true = we know exactly how many blobs total we expect true, - BlobsByRootRequestItems::new(request), + BlobsByRootRequestItems::new(block_root, indices), ); Ok(id) } @@ -1349,7 +1342,7 @@ impl SyncNetworkContext { &mut self, id: ComponentsByRangeRequestId, range_block_component: RangeBlockComponent, - ) -> Option>, BatchPeers), RpcResponseError>> { + ) -> Option, BatchPeers), RpcResponseError>> { // Note: need to remove the request to borrow self again below. 
Otherwise we can't // do nested requests let Some(mut request) = self.block_components_by_range_requests.remove(&id) else { @@ -1362,8 +1355,11 @@ impl SyncNetworkContext { let result = match range_block_component { RangeBlockComponent::Block(req_id, resp, peer_id) => resp.and_then(|(blocks, _)| { + let block = blocks.first().ok_or(RpcResponseError::InternalError( + "blocks_by_root returned zero blocks".to_owned(), + ))?; request - .on_blocks_by_root_result(req_id, blocks, peer_id, self) + .on_blocks_by_root_result(req_id, block.clone(), peer_id, self) .map_err(Into::::into) }), RangeBlockComponent::Blob(req_id, resp, peer_id) => resp.and_then(|(blobs, _)| { @@ -1384,18 +1380,14 @@ impl SyncNetworkContext { .transpose(); match result.as_ref() { - Some(Ok((blocks, peer_group))) => { - let blocks_with_data = blocks - .iter() - .filter(|block| block.as_block().has_data()) - .count(); + Some(Ok((block, peer_group))) => { // Don't log the peer_group here, it's very long (could be up to 128 peers). 
If you // want to trace which peer sent the column at index X, search for the log: // `Sync RPC request sent method="DataColumnsByRange" ...` debug!( %id, - blocks = blocks.len(), - blocks_with_data, + slot = %block.as_block().slot(), + block_has_data = block.as_block().has_data(), block_peer = ?peer_group.block(), "Block components by range request success, removing" ) diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index 135e36453f7..dd4901d82e9 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -1,4 +1,3 @@ -use crate::sync::network_context::requests::BlobCountPerBlock; use crate::sync::network_context::{ BlocksByRootSameForkRequest, PeerGroup, RpcRequestSendError, RpcResponseError, SyncNetworkContext, @@ -19,8 +18,8 @@ use parking_lot::RwLock; use std::collections::{HashMap, HashSet}; use std::sync::Arc; use types::{ - BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecarList, EthSpec, Hash256, - RuntimeVariableList, SignedBeaconBlock, Slot, + BeaconBlockHeader, BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecarList, EthSpec, + Hash256, RuntimeVariableList, SignedBeaconBlock, Slot, }; /// Given a `BlocksByRangeRequest` (a range of slots) fetches all necessary data to return @@ -30,7 +29,7 @@ use types::{ pub struct BlockComponentsByRangeRequest { id: ComponentsByRangeRequestId, peers: Arc>>, - request: BlocksByRootSameForkRequest, + block_root: Hash256, state: State, } @@ -39,10 +38,10 @@ pub struct BlockComponentsByRangeRequest { // peer assumption in the future, see https://github.com/sigp/lighthouse/issues/6258 enum State { BlocksRequest { - blocks_request: Request>>>, + blocks_request: Request>>, }, DataRequest { - blocks: Vec>>, + block: Arc>, block_peer: PeerId, data_request: DataRequest, }, @@ -64,8 +63,7 @@ enum 
Request { Complete(T, P), } -pub type BlockComponentsByRangeRequestResult = - Result>, BatchPeers)>, Error>; +pub type BlockComponentsByRangeRequestResult = Result, BatchPeers)>, Error>; pub enum Error { InternalError(String), @@ -98,13 +96,11 @@ pub enum BlockComponentsByRangeRequestStep { impl BlockComponentsByRangeRequest { pub fn new( id: ComponentsByRangeRequestId, - request: BlocksByRootSameForkRequest, + block_root: Hash256, peers: Arc>>, peers_to_deprioritize: &HashSet, cx: &mut SyncNetworkContext, ) -> Result { - let batch_fork = request.fork; - // TODO(das): a change of behaviour here is that if the SyncingChain has a single peer we // will request all blocks for the first 5 epochs to that same single peer. Before we would // query only idle peers in the syncing chain. @@ -130,7 +126,7 @@ impl BlockComponentsByRangeRequest { let blocks_req_id = cx.send_blocks_by_root_request( block_peer, - BlocksByRootRequest::new(request.block_roots.clone(), cx.spec(), request.fork), + block_root, BlocksByRootRequester::RangeSync(id), )?; @@ -141,7 +137,7 @@ impl BlockComponentsByRangeRequest { Ok(Self { id, peers, - request, + block_root, state, }) } @@ -154,11 +150,11 @@ impl BlockComponentsByRangeRequest { State::BlocksRequest { blocks_request: blocks_by_range_request, } => { - if let Some((blocks, block_peer)) = blocks_by_range_request.to_finished() { - let fork = self.request.fork; - let blocks_have_data = blocks.iter().any(|block| block.has_data()); + if let Some((block, block_peer)) = blocks_by_range_request.to_finished() { + let fork = cx.spec().fork_name_at_slot::(block.slot()); + let block_has_data = block.has_data(); - if blocks_have_data && fork.fulu_enabled() { + if block_has_data && fork.fulu_enabled() { let mut column_indices = cx .network_globals() .sampling_columns() @@ -167,21 +163,12 @@ impl BlockComponentsByRangeRequest { .collect::>(); column_indices.sort_unstable(); - let block_roots_with_data = blocks - .iter() - .filter(|block| block.has_data()) - 
// TODO(tree-sync): cache block root - .map(|block| get_block_root(block)) - .collect::>(); - - let request = BlocksByRootRequest::new( - block_roots_with_data, - cx.spec(), - self.request.fork, - ); - let req_id = cx - .send_custody_by_root_request(self.id, request, self.peers.clone()) + .send_custody_by_root_request( + self.id, + self.block_root, + self.peers.clone(), + ) .map_err(|e| match e { RpcRequestSendError::InternalError(e) => Error::InternalError(e), RpcRequestSendError::NoPeers => Error::InternalError( @@ -191,21 +178,14 @@ impl BlockComponentsByRangeRequest { })?; self.state = State::DataRequest { - blocks: blocks.to_vec(), + block: block.clone(), block_peer: *block_peer, data_request: DataRequest::Fulu { custody_request: Request::Active(req_id), }, }; Ok(None) - } else if blocks_have_data && fork.deneb_enabled() { - let blob_count_per_block = blocks - .iter() - .filter(|block| block.has_data()) - // TODO(tree-sync): cache block root - .map(|block| (get_block_root(block), block.num_expected_blobs())) - .collect::>(); - + } else if block_has_data && fork.deneb_enabled() { // TODO(deneb): is it okay to send blobs_by_range requests outside the DA window? I // would like the beacon processor / da_checker to be the one that decides if an // RpcBlock is valid or not with respect to containing blobs. 
Having sync not even @@ -213,7 +193,8 @@ impl BlockComponentsByRangeRequest { let req_id = cx .send_blobs_by_root_request( *block_peer, - BlobCountPerBlock(blob_count_per_block), + self.block_root, + block.num_expected_blobs(), self.id, ) .map_err(|e| match e { @@ -225,7 +206,7 @@ impl BlockComponentsByRangeRequest { })?; self.state = State::DataRequest { - blocks: blocks.to_vec(), + block: block.clone(), block_peer: *block_peer, data_request: DataRequest::Deneb { blobs_request: Request::Active(req_id), @@ -234,8 +215,8 @@ impl BlockComponentsByRangeRequest { Ok(None) } else { let peer_group = BatchPeers::new_from_block_peer(*block_peer); - let rpc_blocks = couple_blocks_base(blocks.to_vec()); - Ok(Some((rpc_blocks, peer_group))) + let rpc_block = couple_block_base(block.clone()); + Ok(Some((rpc_block, peer_group))) } } else { // Wait for blocks_by_range requests to complete @@ -243,7 +224,7 @@ impl BlockComponentsByRangeRequest { } } State::DataRequest { - blocks, + block, block_peer, data_request, } => match data_request { @@ -253,9 +234,9 @@ impl BlockComponentsByRangeRequest { if let Some((blobs, _)) = blobs_by_range_request.to_finished() { // We use the same block_peer for the blobs request let peer_group = BatchPeers::new_from_block_peer(*block_peer); - let rpc_blocks = - couple_blocks_deneb(blocks.to_vec(), blobs.to_vec(), cx.spec())?; - Ok(Some((rpc_blocks, peer_group))) + let rpc_block = + couple_block_deneb(block.clone(), blobs.to_vec(), cx.spec())?; + Ok(Some((rpc_block, peer_group))) } else { // Wait for blocks_by_range and blobs_by_range requests to complete Ok(None) @@ -273,13 +254,13 @@ impl BlockComponentsByRangeRequest { .collect(); let peer_group = BatchPeers::new(*block_peer, column_peers.clone()); - let rpc_blocks = couple_blocks_fulu( - blocks.to_vec(), + let rpc_block = couple_block_fulu( + block.clone(), columns.to_vec(), custody_column_indices, cx.spec(), )?; - Ok(Some((rpc_blocks, peer_group))) + Ok(Some((rpc_block, peer_group))) } else { 
// Wait for the custody_by_range request to complete Ok(None) @@ -292,7 +273,7 @@ impl BlockComponentsByRangeRequest { pub fn on_blocks_by_root_result( &mut self, id: BlocksByRootRequestId, - data: Vec>>, + data: Arc>, peer_id: PeerId, cx: &mut SyncNetworkContext, ) -> BlockComponentsByRangeRequestResult { @@ -367,18 +348,15 @@ impl BlockComponentsByRangeRequest { } } -fn couple_blocks_base(blocks: Vec>>) -> Vec> { - blocks - .into_iter() - .map(|block| RpcBlock::new_without_blobs(None, block)) - .collect() +fn couple_block_base(block: Arc>) -> RpcBlock { + RpcBlock::new_without_blobs(None, block) } -fn couple_blocks_deneb( - blocks: Vec>>, +fn couple_block_deneb( + block: Arc>, blobs: Vec>>, spec: &ChainSpec, -) -> Result>, Error> { +) -> Result, Error> { let mut blobs_by_block = HashMap::>>>::new(); for blob in blobs { let block_root = blob.block_root(); @@ -395,28 +373,21 @@ fn couple_blocks_deneb( // wasting theirs and our bandwidth 1:1. Therefore blobs that don't pair well are just ignored. 
// // RpcBlock::new ensures that the count of blobs is consistent with the block - blocks - .into_iter() - .map(|block| { - let block_root = get_block_root(&block); - let max_blobs_per_block = spec.max_blobs_per_block(block.epoch()) as usize; - let blobs = blobs_by_block.remove(&block_root).unwrap_or_default(); - // BlobsByRange request handler enforces that blobs are sorted by index - let blobs = RuntimeVariableList::new(blobs, max_blobs_per_block).map_err(|_| { - Error::InternalError("Blobs returned exceeds max length".to_string()) - })?; - Ok(RpcBlock::new(Some(block_root), block, Some(blobs)) - .expect("TODO: don't do matching here")) - }) - .collect::>, Error>>() + let block_root = get_block_root(&block); + let max_blobs_per_block = spec.max_blobs_per_block(block.epoch()) as usize; + let blobs = blobs_by_block.remove(&block_root).unwrap_or_default(); + // BlobsByRange request handler enforces that blobs are sorted by index + let blobs = RuntimeVariableList::new(blobs, max_blobs_per_block) + .map_err(|_| Error::InternalError("Blobs returned exceeds max length".to_string()))?; + Ok(RpcBlock::new(Some(block_root), block, Some(blobs)).expect("TODO: don't do matching here")) } -fn couple_blocks_fulu( - blocks: Vec>>, +fn couple_block_fulu( + block: Arc>, data_columns: DataColumnSidecarList, custody_column_indices: Vec, spec: &ChainSpec, -) -> Result>, Error> { +) -> Result, Error> { // Group data columns by block_root and index let mut custody_columns_by_block = HashMap::>>::new(); @@ -435,24 +406,14 @@ fn couple_blocks_fulu( } // Now iterate all blocks ensuring that the block roots of each block and data column match, - blocks - .into_iter() - .map(|block| { - let block_root = get_block_root(&block); - let data_columns_with_block_root = custody_columns_by_block - // Remove to only use columns once - .remove(&block_root) - .unwrap_or_default(); - - RpcBlock::new_with_custody_columns( - Some(block_root), - block, - data_columns_with_block_root, - spec, - ) - 
.map_err(Error::InternalError) - }) - .collect::, _>>() + let block_root = get_block_root(&block); + let data_columns_with_block_root = custody_columns_by_block + // Remove to only use columns once + .remove(&block_root) + .unwrap_or_default(); + + RpcBlock::new_with_custody_columns(Some(block_root), block, data_columns_with_block_root, spec) + .map_err(Error::InternalError) } impl Request { diff --git a/beacon_node/network/src/sync/network_context/custody_by_root.rs b/beacon_node/network/src/sync/network_context/custody_by_root.rs index d39fef93f3b..79f463d8735 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_root.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_root.rs @@ -30,7 +30,7 @@ const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; pub struct ActiveCustodyByRootRequest { start_time: Instant, - block_roots: Vec, + block_root: Hash256, custody_id: CustodyByRootRequestId, /// List of column indices this request needs to download to complete successfully #[allow(clippy::type_complexity)] @@ -92,14 +92,14 @@ pub type CustodyByRootRequestResult = impl ActiveCustodyByRootRequest { pub(crate) fn new( - block_roots: Vec, + block_root: Hash256, custody_id: CustodyByRootRequestId, column_indices: &[ColumnIndex], lookup_peers: Arc>>, ) -> Self { Self { start_time: Instant::now(), - block_roots, + block_root, custody_id, column_requests: HashMap::from_iter( column_indices @@ -311,7 +311,7 @@ impl ActiveCustodyByRootRequest { .data_columns_by_root_request( DataColumnsByRootRequester::Custody(self.custody_id), peer_id, - self.block_roots.clone(), + self.block_root, indices.clone(), // If peer is in the lookup peer set, it claims to have imported the block and // must have its columns in custody. 
In that case, set `true = enforce max_requests` diff --git a/beacon_node/network/src/sync/network_context/requests.rs b/beacon_node/network/src/sync/network_context/requests.rs index 8b32c4f61b8..574f4e32458 100644 --- a/beacon_node/network/src/sync/network_context/requests.rs +++ b/beacon_node/network/src/sync/network_context/requests.rs @@ -7,7 +7,7 @@ use strum::IntoStaticStr; use types::{Hash256, Slot}; pub use blobs_by_range::BlobsByRangeRequestItems; -pub use blobs_by_root::{BlobCountPerBlock, BlobsByRootRequestItems}; +pub use blobs_by_root::BlobsByRootRequestItems; pub use blocks_by_range::BlocksByRangeRequestItems; pub use blocks_by_root::BlocksByRootRequestItems; pub use data_columns_by_range::DataColumnsByRangeRequestItems; diff --git a/beacon_node/network/src/sync/network_context/requests/blobs_by_root.rs b/beacon_node/network/src/sync/network_context/requests/blobs_by_root.rs index 8f420936bcc..2f1d2ab408a 100644 --- a/beacon_node/network/src/sync/network_context/requests/blobs_by_root.rs +++ b/beacon_node/network/src/sync/network_context/requests/blobs_by_root.rs @@ -1,24 +1,20 @@ use lighthouse_network::rpc::methods::BlobsByRootRequest; -use std::collections::HashMap; use std::sync::Arc; use types::{blob_sidecar::BlobIdentifier, BlobSidecar, EthSpec, ForkContext, Hash256}; use super::{ActiveRequestItems, LookupVerifyError}; -pub struct BlobCountPerBlock(pub HashMap); - pub struct BlobsByRootRequestItems { - // TODO(tree-sync): we know ahead of time how many blobs each block has, track it - block_roots: Vec, + block_root: Hash256, indices: Vec, items: Vec>>, } impl BlobsByRootRequestItems { - pub fn new(request: BlobCountPerBlock) -> Self { + pub fn new(block_root: Hash256, indices: Vec) -> Self { Self { - block_roots: todo!(), - indices: todo!(), + block_root, + indices, items: vec![], } } @@ -32,7 +28,7 @@ impl ActiveRequestItems for BlobsByRootRequestItems { /// The active request SHOULD be dropped after `add_response` returns an error fn add(&mut 
self, blob: Self::Item) -> Result { let block_root = blob.block_root(); - if !self.block_roots.contains(&block_root) { + if self.block_root != block_root { return Err(LookupVerifyError::UnrequestedBlockRoot(block_root)); } if !blob.verify_blob_sidecar_inclusion_proof() { @@ -47,7 +43,7 @@ impl ActiveRequestItems for BlobsByRootRequestItems { self.items.push(blob); - Ok(self.items.len() >= self.block_roots.len() * self.indices.len()) + Ok(self.items.len() >= self.indices.len()) } fn consume(&mut self) -> Vec { diff --git a/beacon_node/network/src/sync/network_context/requests/blocks_by_root.rs b/beacon_node/network/src/sync/network_context/requests/blocks_by_root.rs index 2d98310aaaf..0eb9ce79936 100644 --- a/beacon_node/network/src/sync/network_context/requests/blocks_by_root.rs +++ b/beacon_node/network/src/sync/network_context/requests/blocks_by_root.rs @@ -6,14 +6,14 @@ use types::{EthSpec, ForkContext, Hash256, SignedBeaconBlock}; use super::{ActiveRequestItems, LookupVerifyError}; pub struct BlocksByRootRequestItems { - request: BlocksByRootRequest, + block_root: Hash256, items: Vec>>, } impl BlocksByRootRequestItems { - pub fn new(request: BlocksByRootRequest) -> Self { + pub fn new(block_root: Hash256) -> Self { Self { - request, + block_root, items: vec![], } } @@ -26,14 +26,8 @@ impl ActiveRequestItems for BlocksByRootRequestItems { /// resolved immediately. 
/// The active request SHOULD be dropped after `add_response` returns an error fn add(&mut self, block: Self::Item) -> Result { - // TODO(tree-sync): Cache this block root calculation let block_root = get_block_root(&block); - if !self - .request - .block_roots() - .iter() - .any(|root| root == &block_root) - { + if self.block_root != block_root { return Err(LookupVerifyError::UnrequestedBlockRoot(block_root)); } diff --git a/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs b/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs index 3517207a72c..6c1c1ace2fc 100644 --- a/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs +++ b/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs @@ -1,28 +1,22 @@ use lighthouse_network::rpc::methods::DataColumnsByRootRequest; use std::sync::Arc; use types::{ - ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnsByRootIdentifier, EthSpec, ForkName, - Hash256, RuntimeVariableList, + ChainSpec, DataColumnSidecar, DataColumnsByRootIdentifier, EthSpec, ForkName, Hash256, + RuntimeVariableList, }; use super::{ActiveRequestItems, LookupVerifyError}; -pub struct DataColumnsByRootRequestSameIndices { - block_roots: Vec, - indices: Vec, -} - pub struct DataColumnsByRootRequestItems { - // Assumes each block root has the same indices - block_roots: Vec, - indices: Vec, + block_root: Hash256, + indices: Vec, items: Vec>>, } impl DataColumnsByRootRequestItems { - pub fn new(block_roots: Vec, indices: Vec) -> Self { + pub fn new(block_root: Hash256, indices: Vec) -> Self { Self { - block_roots, + block_root, indices, items: vec![], } @@ -37,7 +31,7 @@ impl ActiveRequestItems for DataColumnsByRootRequestItems { /// The active request SHOULD be dropped after `add_response` returns an error fn add(&mut self, data_column: Self::Item) -> Result { let block_root = data_column.block_root(); - if !self.block_roots.contains(&block_root) { + if 
self.block_root != block_root { return Err(LookupVerifyError::UnrequestedBlockRoot(block_root)); } if !data_column.verify_inclusion_proof() { @@ -55,7 +49,7 @@ impl ActiveRequestItems for DataColumnsByRootRequestItems { self.items.push(data_column); - Ok(self.items.len() >= self.block_roots.len() * self.indices.len()) + Ok(self.items.len() >= self.indices.len()) } fn consume(&mut self) -> Vec { diff --git a/beacon_node/network/src/sync/peer_sampling.rs b/beacon_node/network/src/sync/peer_sampling.rs index e92af4b3559..d0a220fdcd0 100644 --- a/beacon_node/network/src/sync/peer_sampling.rs +++ b/beacon_node/network/src/sync/peer_sampling.rs @@ -578,7 +578,7 @@ impl ActiveSamplingRequest { sampling_request_id: self.current_sampling_request_id, }), peer_id, - vec![self.block_root], + self.block_root, column_indexes.clone(), // false = We issue request to custodians who may or may not have received the // samples yet. We don't any signal (like an attestation or status messages that the From b1b0bf88673978864240322c061553410dc64bc4 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Fri, 20 Jun 2025 15:58:22 +0200 Subject: [PATCH 27/66] Delete by_range code --- .../src/service/api_types.rs | 66 +- beacon_node/network/src/router.rs | 89 +- .../network/src/sync/block_lookups/common.rs | 186 --- .../network/src/sync/block_lookups/mod.rs | 1150 ----------------- .../src/sync/block_lookups/parent_chain.rs | 198 --- .../sync/block_lookups/single_block_lookup.rs | 678 ---------- beacon_node/network/src/sync/manager.rs | 114 +- beacon_node/network/src/sync/mod.rs | 1 - .../network/src/sync/network_context.rs | 478 +------ .../block_components_by_range.rs | 38 +- .../sync/network_context/custody_by_range.rs | 429 ------ .../src/sync/network_context/requests.rs | 3 - .../requests/blobs_by_range.rs | 61 - .../requests/blocks_by_range.rs | 53 - .../requests/data_columns_by_range.rs | 59 - beacon_node/network/src/sync/tests/lookups.rs | 74 +- 
beacon_node/network/src/sync/tests/mod.rs | 4 +- beacon_node/network/src/sync/tests/range.rs | 400 +----- 18 files changed, 119 insertions(+), 3962 deletions(-) delete mode 100644 beacon_node/network/src/sync/block_lookups/common.rs delete mode 100644 beacon_node/network/src/sync/block_lookups/mod.rs delete mode 100644 beacon_node/network/src/sync/block_lookups/parent_chain.rs delete mode 100644 beacon_node/network/src/sync/block_lookups/single_block_lookup.rs delete mode 100644 beacon_node/network/src/sync/network_context/custody_by_range.rs delete mode 100644 beacon_node/network/src/sync/network_context/requests/blobs_by_range.rs delete mode 100644 beacon_node/network/src/sync/network_context/requests/blocks_by_range.rs delete mode 100644 beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index 9fadffb9eb4..e3c10acfd5a 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -23,12 +23,6 @@ pub enum SyncRequestId { BlobsByRoot(BlobsByRootRequestId), /// Request searching for a set of data columns given a hash and list of column indices. 
DataColumnsByRoot(DataColumnsByRootRequestId), - /// Blocks by range request - BlocksByRange(BlocksByRangeRequestId), - /// Blobs by range request - BlobsByRange(BlobsByRangeRequestId), - /// Data columns by range request - DataColumnsByRange(DataColumnsByRangeRequestId), } #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] @@ -48,15 +42,7 @@ pub struct BatchId(pub Id); #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub struct DataColumnsByRootRequestId { pub id: Id, - pub requester: DataColumnsByRootRequester, -} - -#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] -pub struct BlocksByRangeRequestId { - /// Id to identify this attempt at a blocks_by_range request for `parent_request_id` - pub id: Id, - /// The Id of the overall By Range request for block components. - pub parent_request_id: ComponentsByRangeRequestId, + pub parent_request_id: DataColumnsByRootRequester, } #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] @@ -64,37 +50,13 @@ pub struct BlobsByRootRequestId { /// Id to identify this attempt at a blobs_by_range request for `parent_request_id` pub id: Id, /// The Id of the overall By Range request for block components. - pub parent_request_id: ComponentsByRangeRequestId, -} - -#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] -pub struct BlobsByRangeRequestId { - /// Id to identify this attempt at a blobs_by_range request for `parent_request_id` - pub id: Id, - /// The Id of the overall By Range request for block components. 
- pub parent_request_id: ComponentsByRangeRequestId, -} - -#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] -pub struct DataColumnsByRangeRequestId { - /// Id to identify this attempt at a data_columns_by_range request for `parent_request_id` - pub id: Id, - /// The Id of the parent custody by range request that issued this data_columns_by_range request - pub parent_request_id: CustodyByRangeRequestId, -} - -#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] -pub struct CustodyByRangeRequestId { - /// Id to identify this attempt at a meta custody by range request for `parent_request_id` - pub id: Id, - /// The Id of the overall By Range request for block components. - pub parent_request_id: ComponentsByRangeRequestId, + pub parent_request_id: ComponentsByRootRequestId, } /// Block components by range request for range sync. Includes an ID for downstream consumers to /// handle retries and tie all their sub requests together. #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] -pub struct ComponentsByRangeRequestId { +pub struct ComponentsByRootRequestId { /// Each `RangeRequestId` may request the same data in a later retry. This Id identifies the /// current attempt. pub id: Id, @@ -112,7 +74,7 @@ pub enum RangeRequestId { #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub enum BlocksByRootRequester { Header(HeaderLookupId), - RangeSync(ComponentsByRangeRequestId), + RangeSync(ComponentsByRootRequestId), } #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] @@ -144,7 +106,7 @@ pub struct SamplingRequestId(pub usize); #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub struct CustodyByRootRequestId { - pub parent_request_id: ComponentsByRangeRequestId, + pub parent_request_id: ComponentsByRootRequestId, } /// Downstream components that perform custody by root requests. @@ -252,14 +214,10 @@ macro_rules! impl_display { // Since each request Id is deeply nested with various types, if rendered with Debug on logs they // take too much visual space. 
This custom Display implementations make the overall Id short while // not losing information -impl_display!(BlocksByRangeRequestId, "{}/{}", id, parent_request_id); -impl_display!(BlobsByRangeRequestId, "{}/{}", id, parent_request_id); -impl_display!(DataColumnsByRangeRequestId, "{}/{}", id, parent_request_id); -impl_display!(CustodyByRangeRequestId, "{}/{}", id, parent_request_id); -impl_display!(ComponentsByRangeRequestId, "{}/{}", id, requester); +impl_display!(ComponentsByRootRequestId, "{}/{}", id, requester); impl_display!(BlocksByRootRequestId, "{}/{}", id, parent_request_id); impl_display!(BlobsByRootRequestId, "{}/{}", id, parent_request_id); -impl_display!(DataColumnsByRootRequestId, "{}/{}", id, requester); +impl_display!(DataColumnsByRootRequestId, "{}/{}", id, parent_request_id); impl_display!(SingleLookupReqId, "{}/Lookup/{}", req_id, lookup_id); impl_display!(CustodyByRootRequestId, "{}", parent_request_id); impl_display!(SamplingId, "{}/{}", sampling_request_id, id); @@ -331,7 +289,7 @@ mod tests { fn display_id_data_columns_by_root_custody() { let id = DataColumnsByRootRequestId { id: 123, - requester: DataColumnsByRootRequester::Custody(CustodyId { + parent_request_id: DataColumnsByRootRequester::Custody(CustodyId { requester: CustodyRequester(SingleLookupReqId { req_id: 121, lookup_id: 101, @@ -345,7 +303,7 @@ mod tests { fn display_id_data_columns_by_root_sampling() { let id = DataColumnsByRootRequestId { id: 123, - requester: DataColumnsByRootRequester::Sampling(SamplingId { + parent_request_id: DataColumnsByRootRequester::Sampling(SamplingId { id: SamplingRequester::ImportedBlock(Hash256::ZERO), sampling_request_id: SamplingRequestId(101), }), @@ -355,11 +313,11 @@ mod tests { #[test] fn display_id_data_columns_by_range() { - let id = DataColumnsByRangeRequestId { + let id = DataColumnsByRootRequestId { id: 123, - parent_request_id: CustodyByRangeRequestId { + parent_request_id: CustodyByRootRequestId { id: 122, - parent_request_id: 
ComponentsByRangeRequestId { + parent_request_id: ComponentsByRootRequestId { id: 121, requester: RangeRequestId::RangeSync { chain_id: 54, diff --git a/beacon_node/network/src/router.rs b/beacon_node/network/src/router.rs index c02f11cbee3..2426cd2c1d9 100644 --- a/beacon_node/network/src/router.rs +++ b/beacon_node/network/src/router.rs @@ -297,13 +297,13 @@ impl Router { ) } Response::BlocksByRange(beacon_block) => { - self.on_blocks_by_range_response(peer_id, app_request_id, beacon_block); + crit!(id = ?app_request_id, "No BlocksByRange response expected"); } Response::BlocksByRoot(beacon_block) => { self.on_blocks_by_root_response(peer_id, app_request_id, beacon_block); } Response::BlobsByRange(blob) => { - self.on_blobs_by_range_response(peer_id, app_request_id, blob); + crit!(id = ?app_request_id, "No BlobsByRange response expected"); } Response::BlobsByRoot(blob) => { self.on_blobs_by_root_response(peer_id, app_request_id, blob); @@ -312,7 +312,7 @@ impl Router { self.on_data_columns_by_root_response(peer_id, app_request_id, data_column); } Response::DataColumnsByRange(data_column) => { - self.on_data_columns_by_range_response(peer_id, app_request_id, data_column); + crit!(id = ?app_request_id, "No DataColumnsByRange response expected"); } // Light client responses should not be received Response::LightClientBootstrap(_) @@ -559,66 +559,6 @@ impl Router { ) } - /// Handle a `BlocksByRange` response from the peer. - /// A `beacon_block` behaves as a stream which is terminated on a `None` response. - pub fn on_blocks_by_range_response( - &mut self, - peer_id: PeerId, - app_request_id: AppRequestId, - beacon_block: Option>>, - ) { - let sync_request_id = match app_request_id { - AppRequestId::Sync(sync_request_id) => match sync_request_id { - id @ SyncRequestId::BlocksByRange { .. 
} => id, - other => { - crit!(request = ?other, "BlocksByRange response on incorrect request"); - return; - } - }, - AppRequestId::Router => { - crit!(%peer_id, "All BBRange requests belong to sync"); - return; - } - AppRequestId::Internal => unreachable!("Handled internally"), - }; - - trace!( - %peer_id, - "Received BlocksByRange Response" - - ); - - self.send_to_sync(SyncMessage::RpcBlock { - peer_id, - sync_request_id, - beacon_block, - seen_timestamp: timestamp_now(), - }); - } - - pub fn on_blobs_by_range_response( - &mut self, - peer_id: PeerId, - app_request_id: AppRequestId, - blob_sidecar: Option>>, - ) { - trace!( - %peer_id, - "Received BlobsByRange Response" - ); - - if let AppRequestId::Sync(sync_request_id) = app_request_id { - self.send_to_sync(SyncMessage::RpcBlob { - peer_id, - sync_request_id, - blob_sidecar, - seen_timestamp: timestamp_now(), - }); - } else { - crit!("All blobs by range responses should belong to sync"); - } - } - /// Handle a `BlocksByRoot` response from the peer. 
pub fn on_blocks_by_root_response( &mut self, @@ -721,29 +661,6 @@ impl Router { }); } - pub fn on_data_columns_by_range_response( - &mut self, - peer_id: PeerId, - app_request_id: AppRequestId, - data_column: Option>>, - ) { - trace!( - %peer_id, - "Received DataColumnsByRange Response" - ); - - if let AppRequestId::Sync(sync_request_id) = app_request_id { - self.send_to_sync(SyncMessage::RpcDataColumn { - peer_id, - sync_request_id, - data_column, - seen_timestamp: timestamp_now(), - }); - } else { - crit!("All data columns by range responses should belong to sync"); - } - } - fn handle_beacon_processor_send_result( &mut self, result: Result<(), crate::network_beacon_processor::Error>, diff --git a/beacon_node/network/src/sync/block_lookups/common.rs b/beacon_node/network/src/sync/block_lookups/common.rs deleted file mode 100644 index 8304e01bf03..00000000000 --- a/beacon_node/network/src/sync/block_lookups/common.rs +++ /dev/null @@ -1,186 +0,0 @@ -use crate::sync::block_lookups::single_block_lookup::{ - LookupRequestError, SingleBlockLookup, SingleLookupRequestState, -}; -use crate::sync::block_lookups::{ - BlobRequestState, BlockRequestState, CustodyRequestState, PeerId, -}; -use crate::sync::network_context::{LookupRequestResult, SyncNetworkContext}; -use beacon_chain::BeaconChainTypes; -use lighthouse_network::service::api_types::Id; -use parking_lot::RwLock; -use std::collections::HashSet; -use std::sync::Arc; -use types::blob_sidecar::FixedBlobSidecarList; -use types::{DataColumnSidecarList, SignedBeaconBlock}; - -use super::single_block_lookup::{ComponentRequests, DownloadResult}; -use super::SingleLookupId; - -#[derive(Debug, Copy, Clone)] -pub enum ResponseType { - Block, - Blob, - CustodyColumn, -} - -/// This trait unifies common single block lookup functionality across blocks and blobs. This -/// includes making requests, verifying responses, and handling processing results. 
A -/// `SingleBlockLookup` includes both a `BlockRequestState` and a `BlobRequestState`, this trait is -/// implemented for each. -/// -/// The use of the `ResponseType` associated type gives us a degree of type -/// safety when handling a block/blob response ensuring we only mutate the correct corresponding -/// state. -pub trait RequestState { - /// The type created after validation. - type VerifiedResponseType: Clone; - - /// Request the network context to prepare a request of a component of `block_root`. If the - /// request is not necessary because the component is already known / processed, return false. - /// Return true if it sent a request and we can expect an event back from the network. - fn make_request( - &self, - id: Id, - lookup_peers: Arc>>, - expected_blobs: usize, - cx: &mut SyncNetworkContext, - ) -> Result; - - /* Response handling methods */ - - /// Send the response to the beacon processor. - fn send_for_processing( - id: Id, - result: DownloadResult, - cx: &SyncNetworkContext, - ) -> Result<(), LookupRequestError>; - - /* Utility methods */ - - /// Returns the `ResponseType` associated with this trait implementation. Useful in logging. - fn response_type() -> ResponseType; - - /// A getter for the `BlockRequestState` or `BlobRequestState` associated with this trait. - fn request_state_mut(request: &mut SingleBlockLookup) -> Result<&mut Self, &'static str>; - - /// A getter for a reference to the `SingleLookupRequestState` associated with this trait. - fn get_state(&self) -> &SingleLookupRequestState; - - /// A getter for a mutable reference to the SingleLookupRequestState associated with this trait. 
- fn get_state_mut(&mut self) -> &mut SingleLookupRequestState; -} - -impl RequestState for BlockRequestState { - type VerifiedResponseType = Arc>; - - fn make_request( - &self, - _id: SingleLookupId, - _lookup_peers: Arc>>, - _: usize, - _cx: &mut SyncNetworkContext, - ) -> Result { - todo!(); - } - - fn send_for_processing( - _id: SingleLookupId, - _download_result: DownloadResult, - _cx: &SyncNetworkContext, - ) -> Result<(), LookupRequestError> { - todo!(); - } - - fn response_type() -> ResponseType { - ResponseType::Block - } - fn request_state_mut(request: &mut SingleBlockLookup) -> Result<&mut Self, &'static str> { - Ok(&mut request.block_request_state) - } - fn get_state(&self) -> &SingleLookupRequestState { - &self.state - } - fn get_state_mut(&mut self) -> &mut SingleLookupRequestState { - &mut self.state - } -} - -impl RequestState for BlobRequestState { - type VerifiedResponseType = FixedBlobSidecarList; - - fn make_request( - &self, - _id: Id, - _lookup_peers: Arc>>, - _expected_blobs: usize, - _cx: &mut SyncNetworkContext, - ) -> Result { - todo!(); - } - - fn send_for_processing( - _id: Id, - _download_result: DownloadResult, - _cx: &SyncNetworkContext, - ) -> Result<(), LookupRequestError> { - todo!(); - } - - fn response_type() -> ResponseType { - ResponseType::Blob - } - fn request_state_mut(request: &mut SingleBlockLookup) -> Result<&mut Self, &'static str> { - match &mut request.component_requests { - ComponentRequests::WaitingForBlock => Err("waiting for block"), - ComponentRequests::ActiveBlobRequest(request, _) => Ok(request), - ComponentRequests::ActiveCustodyRequest { .. } => Err("expecting custody request"), - ComponentRequests::NotNeeded { .. 
} => Err("not needed"), - } - } - fn get_state(&self) -> &SingleLookupRequestState { - &self.state - } - fn get_state_mut(&mut self) -> &mut SingleLookupRequestState { - &mut self.state - } -} - -impl RequestState for CustodyRequestState { - type VerifiedResponseType = DataColumnSidecarList; - - fn make_request( - &self, - _id: Id, - _lookup_peers: Arc>>, - _: usize, - _cx: &mut SyncNetworkContext, - ) -> Result { - todo!(); - } - - fn send_for_processing( - _id: Id, - _download_result: DownloadResult, - _cx: &SyncNetworkContext, - ) -> Result<(), LookupRequestError> { - todo!(); - } - - fn response_type() -> ResponseType { - ResponseType::CustodyColumn - } - fn request_state_mut(request: &mut SingleBlockLookup) -> Result<&mut Self, &'static str> { - match &mut request.component_requests { - ComponentRequests::WaitingForBlock => Err("waiting for block"), - ComponentRequests::ActiveBlobRequest { .. } => Err("expecting blob request"), - ComponentRequests::ActiveCustodyRequest(request) => Ok(request), - ComponentRequests::NotNeeded { .. } => Err("not needed"), - } - } - fn get_state(&self) -> &SingleLookupRequestState { - &self.state - } - fn get_state_mut(&mut self) -> &mut SingleLookupRequestState { - &mut self.state - } -} diff --git a/beacon_node/network/src/sync/block_lookups/mod.rs b/beacon_node/network/src/sync/block_lookups/mod.rs deleted file mode 100644 index c545facdd9c..00000000000 --- a/beacon_node/network/src/sync/block_lookups/mod.rs +++ /dev/null @@ -1,1150 +0,0 @@ -//! Implements block lookup sync. -//! -//! Block lookup sync is triggered when a peer claims to have imported a block we don't know about. -//! For example, a peer attesting to a head block root that is not in our fork-choice. Lookup sync -//! is recursive in nature, as we may discover that this attested head block root has a parent that -//! is also unknown to us. -//! -//! Block lookup is implemented as an event-driven state machine. It sends events to the network and -//! 
beacon processor, and expects some set of events back. A discrepancy in the expected event API -//! will result in lookups getting "stuck". A lookup becomes stuck when there is no future event -//! that will trigger the lookup to make progress. There's a fallback mechanism that drops lookups -//! that live for too long, logging the line "Notify the devs a sync lookup is stuck". -//! -//! The expected event API is documented in the code paths that are making assumptions with the -//! comment prefix "Lookup sync event safety:" -//! -//! Block lookup sync attempts to not re-download or re-process data that we already have. Block -//! components are cached temporarily in multiple places before they are imported into fork-choice. -//! Therefore, block lookup sync must peek these caches correctly to decide when to skip a download -//! or consider a lookup complete. These caches are read from the `SyncNetworkContext` and its state -//! returned to this module as `LookupRequestResult` variants. 
- -use self::parent_chain::{compute_parent_chains, NodeChain}; -pub use self::single_block_lookup::DownloadResult; -use self::single_block_lookup::{LookupRequestError, LookupResult, SingleBlockLookup}; -use super::manager::{BlockProcessType, BlockProcessingResult, SLOT_IMPORT_TOLERANCE}; -use super::network_context::{PeerGroup, RpcResponseError, SyncNetworkContext}; -use crate::metrics; -use crate::sync::block_lookups::common::ResponseType; -use crate::sync::block_lookups::parent_chain::find_oldest_fork_ancestor; -use crate::sync::SyncMessage; -use beacon_chain::block_verification_types::AsBlock; -use beacon_chain::data_availability_checker::{ - AvailabilityCheckError, AvailabilityCheckErrorCategory, -}; -use beacon_chain::{AvailabilityProcessingStatus, BeaconChainTypes, BlockError}; -pub use common::RequestState; -use fnv::FnvHashMap; -use itertools::Itertools; -use lighthouse_network::service::api_types::SingleLookupReqId; -use lighthouse_network::{PeerAction, PeerId}; -use lru_cache::LRUTimeCache; -pub use single_block_lookup::{BlobRequestState, BlockRequestState, CustodyRequestState}; -use std::collections::hash_map::Entry; -use std::sync::Arc; -use std::time::Duration; -use store::Hash256; -use tracing::{debug, error, instrument, warn}; -use types::{BlobSidecar, DataColumnSidecar, EthSpec, SignedBeaconBlock}; - -pub mod common; -pub mod parent_chain; -mod single_block_lookup; - -/// The maximum depth we will search for a parent block. In principle we should have sync'd any -/// canonical chain to its head once the peer connects. A chain should not appear where it's depth -/// is further back than the most recent head slot. -/// -/// Have the same value as range's sync tolerance to consider a peer synced. Once sync lookup -/// reaches the maximum depth it will force trigger range sync. 
-pub(crate) const PARENT_DEPTH_TOLERANCE: usize = SLOT_IMPORT_TOLERANCE; - -const FAILED_CHAINS_CACHE_EXPIRY_SECONDS: u64 = 60; -pub const SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS: u8 = 4; - -/// Maximum time we allow a lookup to exist before assuming it is stuck and will never make -/// progress. Assume the worse case processing time per block component set * times max depth. -/// 15 * 2 * 32 = 16 minutes. -const LOOKUP_MAX_DURATION_STUCK_SECS: u64 = 15 * PARENT_DEPTH_TOLERANCE as u64; -/// The most common case of child-lookup without peers is receiving block components before the -/// attestation deadline when the node is lagging behind. Once peers start attesting for the child -/// lookup at most after 4 seconds, the lookup should gain peers. -const LOOKUP_MAX_DURATION_NO_PEERS_SECS: u64 = 10; - -/// Lookups contain untrusted data, including blocks that have not yet been validated. In case of -/// bugs or malicious activity we want to bound how much memory these lookups can consume. Aprox the -/// max size of a lookup is ~ 10 MB (current max size of gossip and RPC blocks). 200 lookups can -/// take at most 2 GB. 200 lookups allow 3 parallel chains of depth 64 (current maximum). 
-const MAX_LOOKUPS: usize = 200; - -pub enum BlockComponent { - Block(DownloadResult>>), - Blob(DownloadResult>>), - DataColumn(DownloadResult>>), -} - -impl BlockComponent { - fn parent_root(&self) -> Hash256 { - match self { - BlockComponent::Block(block) => block.value.parent_root(), - BlockComponent::Blob(blob) => blob.value.block_parent_root(), - BlockComponent::DataColumn(column) => column.value.block_parent_root(), - } - } - fn get_type(&self) -> &'static str { - match self { - BlockComponent::Block(_) => "block", - BlockComponent::Blob(_) => "blob", - BlockComponent::DataColumn(_) => "data_column", - } - } -} - -pub type SingleLookupId = u32; - -enum Action { - Retry, - ParentUnknown { parent_root: Hash256 }, - Drop(/* reason: */ String), - Continue, -} - -pub struct BlockLookups { - /// A cache of failed chain lookups to prevent duplicate searches. - failed_chains: LRUTimeCache, - - // TODO: Why not index lookups by block_root? - single_block_lookups: FnvHashMap>, -} - -#[cfg(test)] -use lighthouse_network::service::api_types::Id; - -#[cfg(test)] -/// Tuple of `SingleLookupId`, requested block root, awaiting parent block root (if any), -/// and list of peers that claim to have imported this set of block components. 
-pub(crate) type BlockLookupSummary = (Id, Hash256, Option, Vec); - -impl BlockLookups { - #[instrument(parent = None,level = "info", fields(service = "lookup_sync"), name = "lookup_sync")] - pub fn new() -> Self { - Self { - failed_chains: LRUTimeCache::new(Duration::from_secs( - FAILED_CHAINS_CACHE_EXPIRY_SECONDS, - )), - single_block_lookups: Default::default(), - } - } - - #[cfg(test)] - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub(crate) fn insert_failed_chain(&mut self, block_root: Hash256) { - self.failed_chains.insert(block_root); - } - - #[cfg(test)] - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub(crate) fn get_failed_chains(&mut self) -> Vec { - self.failed_chains.keys().cloned().collect() - } - - #[cfg(test)] - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub(crate) fn active_single_lookups(&self) -> Vec { - self.single_block_lookups - .iter() - .map(|(id, l)| (*id, l.block_root(), l.awaiting_parent(), l.all_peers())) - .collect() - } - - /// Returns a vec of all parent lookup chains by tip, in descending slot order (tip first) - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub(crate) fn active_parent_lookups(&self) -> Vec { - compute_parent_chains( - &self - .single_block_lookups - .values() - .map(|lookup| lookup.into()) - .collect::>(), - ) - } - - /* Lookup requests */ - - /// Creates a parent lookup for the block with the given `block_root` and immediately triggers it. - /// If a parent lookup exists or is triggered, a current lookup will be created. 
- /// - /// Returns true if the lookup is created or already exists - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - #[must_use = "only reference the new lookup if returns true"] - pub fn search_child_and_parent( - &mut self, - block_root: Hash256, - block_component: BlockComponent, - peer_id: PeerId, - cx: &mut SyncNetworkContext, - ) -> bool { - let parent_root = block_component.parent_root(); - - let parent_lookup_exists = - self.search_parent_of_child(parent_root, block_root, &[peer_id], cx); - // Only create the child lookup if the parent exists - if parent_lookup_exists { - // `search_parent_of_child` ensures that parent root is not a failed chain - self.new_current_lookup( - block_root, - Some(block_component), - Some(parent_root), - // On a `UnknownParentBlock` or `UnknownParentBlob` event the peer is not required - // to have the rest of the block components (refer to decoupled blob gossip). Create - // the lookup with zero peers to house the block components. - &[], - cx, - ) - } else { - false - } - } - - /// Seach a block whose parent root is unknown. - /// - /// Returns true if the lookup is created or already exists - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - #[must_use = "only reference the new lookup if returns true"] - pub fn search_unknown_block( - &mut self, - block_root: Hash256, - peer_source: &[PeerId], - cx: &mut SyncNetworkContext, - ) -> bool { - self.new_current_lookup(block_root, None, None, peer_source, cx) - } - - /// A block or blob triggers the search of a parent. 
- /// Check if this new lookup extends a bad chain: - /// - Extending `child_block_root_trigger` would exceed the max depth - /// - `block_root_to_search` is a failed chain - /// - /// Returns true if the lookup is created or already exists - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - #[must_use = "only reference the new lookup if returns true"] - pub fn search_parent_of_child( - &mut self, - block_root_to_search: Hash256, - child_block_root_trigger: Hash256, - peers: &[PeerId], - cx: &mut SyncNetworkContext, - ) -> bool { - let parent_chains = self.active_parent_lookups(); - - for (chain_idx, parent_chain) in parent_chains.iter().enumerate() { - // `block_root_to_search` will trigger a new lookup, and it will extend a parent_chain - // beyond its max length - let block_would_extend_chain = parent_chain.ancestor() == child_block_root_trigger; - // `block_root_to_search` already has a lookup, and with the block trigger it extends - // the parent_chain beyond its length. This can happen because when creating a lookup - // for a new root we don't do any parent chain length checks - let trigger_is_chain_tip = parent_chain.tip == child_block_root_trigger; - - if (block_would_extend_chain || trigger_is_chain_tip) - && parent_chain.len() >= PARENT_DEPTH_TOLERANCE - { - debug!(block_root = ?block_root_to_search, "Parent lookup chain too long"); - - // Searching for this parent would extend a parent chain over the max - // Insert the tip only to failed chains - self.failed_chains.insert(parent_chain.tip); - - // Note: Drop only the chain that's too long until it merges with another chain - // that's not too long. Consider this attack: there's a chain of valid unknown - // blocks A -> B. A malicious peer builds `PARENT_DEPTH_TOLERANCE` garbage - // blocks on top of A forming A -> C. 
The malicious peer forces us to fetch C - // from it, which will result in parent A hitting the chain_too_long error. Then - // the valid chain A -> B is dropped too. - // - // `find_oldest_fork_ancestor` should never return Err, unwrapping to tip for - // complete-ness - let parent_chain_tip = parent_chain.tip; - let block_to_drop = - find_oldest_fork_ancestor(parent_chains, chain_idx).unwrap_or(parent_chain_tip); - // Drop all lookups descending from the child of the too long parent chain - if let Some((lookup_id, lookup)) = self - .single_block_lookups - .iter() - .find(|(_, l)| l.block_root() == block_to_drop) - { - // If a lookup chain is too long, we can't distinguish a valid chain from a - // malicious one. We must attempt to sync this chain to not lose liveness. If - // the chain grows too long, we stop lookup sync and transition this head to - // forward range sync. We need to tell range sync which head to sync to, and - // from which peers. The lookup of the very tip of this chain may contain zero - // peers if it's the parent-child lookup. So we do a bit of a trick here: - // - Tell range sync to sync to the tip's root (if available, else its ancestor) - // - But use all peers in the ancestor lookup, which should have at least one - // peer, and its peer set is a strict superset of the tip's lookup. - if let Some((_, tip_lookup)) = self - .single_block_lookups - .iter() - .find(|(_, l)| l.block_root() == parent_chain_tip) - { - cx.send_sync_message(SyncMessage::AddPeersForceRangeSync { - peers: lookup.all_peers(), - head_slot: tip_lookup.peek_downloaded_block_slot(), - head_root: parent_chain_tip, - }); - } else { - // Should never happen, log error and continue the lookup drop - error!( - error = "Parent chain tip lookup not found", - block_root = ?parent_chain_tip, - "Unable to transition lookup to range sync" - ); - } - - // Do not downscore peers here. 
Because we can't distinguish a valid chain from - // a malicious one we may penalize honest peers for attempting to discover us a - // valid chain. Until blocks_by_range allows to specify a tip, for example with - // https://github.com/ethereum/consensus-specs/pull/3845 we will have poor - // attributability. A peer can send us garbage blocks over blocks_by_root, and - // then correct blocks via blocks_by_range. - - self.drop_lookup_and_children(*lookup_id); - } else { - // Should never happen - error!( - error = "Block to drop lookup not found", - block_root = ?block_to_drop, - "Unable to transition lookup to range sync" - ); - } - - return false; - } - } - - // `block_root_to_search` is a failed chain check happens inside new_current_lookup - self.new_current_lookup(block_root_to_search, None, None, peers, cx) - } - - /// Searches for a single block hash. If the blocks parent is unknown, a chain of blocks is - /// constructed. - /// Returns true if the lookup is created or already exists - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - #[must_use = "only reference the new lookup if returns true"] - fn new_current_lookup( - &mut self, - block_root: Hash256, - block_component: Option>, - awaiting_parent: Option, - peers: &[PeerId], - cx: &mut SyncNetworkContext, - ) -> bool { - // If this block or it's parent is part of a known failed chain, ignore it. - if self.failed_chains.contains(&block_root) { - debug!(?block_root, "Block is from a past failed chain. 
Dropping"); - for peer_id in peers { - cx.report_peer(*peer_id, PeerAction::MidToleranceError, "failed_chain"); - } - return false; - } - - // Do not re-request a block that is already being requested - if let Some((&lookup_id, lookup)) = self - .single_block_lookups - .iter_mut() - .find(|(_id, lookup)| lookup.is_for_block(block_root)) - { - if let Some(block_component) = block_component { - let component_type = block_component.get_type(); - let imported = lookup.add_child_components(block_component); - if !imported { - debug!( - ?block_root, - component_type, "Lookup child component ignored" - ); - } - } - - if let Err(e) = self.add_peers_to_lookup_and_ancestors(lookup_id, peers, cx) { - warn!(error = ?e, "Error adding peers to ancestor lookup"); - } - - return true; - } - - // Ensure that awaiting parent exists, otherwise this lookup won't be able to make progress - if let Some(awaiting_parent) = awaiting_parent { - if !self - .single_block_lookups - .iter() - .any(|(_, lookup)| lookup.is_for_block(awaiting_parent)) - { - warn!(block_root = ?awaiting_parent, "Ignoring child lookup parent lookup not found"); - return false; - } - } - - // Lookups contain untrusted data, bound the total count of lookups hold in memory to reduce - // the risk of OOM in case of bugs of malicious activity. - if self.single_block_lookups.len() > MAX_LOOKUPS { - warn!(?block_root, "Dropping lookup reached max"); - return false; - } - - // If we know that this lookup has unknown parent (is awaiting a parent lookup to resolve), - // signal here to hold processing downloaded data. 
- let mut lookup = SingleBlockLookup::new(block_root, peers, cx.next_id(), awaiting_parent); - - // Add block components to the new request - if let Some(block_component) = block_component { - lookup.add_child_components(block_component); - } - - let id = lookup.id; - let lookup = match self.single_block_lookups.entry(id) { - Entry::Vacant(entry) => entry.insert(lookup), - Entry::Occupied(_) => { - // Should never happen - warn!(id, "Lookup exists with same id"); - return false; - } - }; - - debug!( - ?peers, - ?block_root, - awaiting_parent = awaiting_parent - .map(|root| root.to_string()) - .unwrap_or("none".to_owned()), - id = lookup.id, - "Created block lookup" - ); - metrics::inc_counter(&metrics::SYNC_LOOKUP_CREATED); - - let result = lookup.continue_requests(cx); - if self.on_lookup_result(id, result, "new_current_lookup", cx) { - self.update_metrics(); - true - } else { - false - } - } - - /* Lookup responses */ - - /// Process a block or blob response received from a single lookup request. - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub fn on_download_response>( - &mut self, - id: SingleLookupReqId, - response: Result<(R::VerifiedResponseType, PeerGroup, Duration), RpcResponseError>, - cx: &mut SyncNetworkContext, - ) { - let result = self.on_download_response_inner::(id, response, cx); - self.on_lookup_result(id.lookup_id, result, "download_response", cx); - } - - /// Process a block or blob response received from a single lookup request. 
- pub fn on_download_response_inner>( - &mut self, - id: SingleLookupReqId, - response: Result<(R::VerifiedResponseType, PeerGroup, Duration), RpcResponseError>, - cx: &mut SyncNetworkContext, - ) -> Result { - // Note: no need to downscore peers here, already downscored on network context - - let response_type = R::response_type(); - let Some(lookup) = self.single_block_lookups.get_mut(&id.lookup_id) else { - // We don't have the ability to cancel in-flight RPC requests. So this can happen - // if we started this RPC request, and later saw the block/blobs via gossip. - debug!(%id, "Block returned for single block lookup not present"); - return Err(LookupRequestError::UnknownLookup); - }; - - let block_root = lookup.block_root(); - let request_state = R::request_state_mut(lookup) - .map_err(|e| LookupRequestError::BadState(e.to_owned()))? - .get_state_mut(); - - match response { - Ok((response, peer_group, seen_timestamp)) => { - debug!( - ?block_root, - %id, - ?peer_group, - ?response_type, - "Received lookup download success" - ); - - // Here we could check if response extends a parent chain beyond its max length. - // However we defer that check to the handling of a processing error ParentUnknown. - // - // Here we could check if there's already a lookup for parent_root of `response`. In - // that case we know that sending the response for processing will likely result in - // a `ParentUnknown` error. However, for simplicity we choose to not implement this - // optimization. - - // Register the download peer here. Once we have received some data over the wire we - // attribute it to this peer for scoring latter regardless of how the request was - // done. - request_state.on_download_success( - id.req_id, - DownloadResult { - value: response, - block_root, - seen_timestamp, - peer_group, - }, - )?; - // continue_request will send for processing as the request state is AwaitingProcessing - } - Err(e) => { - // No need to log peer source here. 
When sending a DataColumnsByRoot request we log - // the peer and the request ID which is linked to this `id` value here. - debug!( - ?block_root, - %id, - ?response_type, - error = ?e, - "Received lookup download failure" - ); - - request_state.on_download_failure(id.req_id)?; - // continue_request will retry a download as the request state is AwaitingDownload - } - } - - lookup.continue_requests(cx) - } - - /* Error responses */ - - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub fn peer_disconnected(&mut self, peer_id: &PeerId) { - for (_, lookup) in self.single_block_lookups.iter_mut() { - lookup.remove_peer(peer_id); - } - } - - /* Processing responses */ - - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub fn on_processing_result( - &mut self, - _process_type: BlockProcessType, - _result: BlockProcessingResult, - _cx: &mut SyncNetworkContext, - ) { - todo!(); - } - - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub fn on_processing_result_inner>( - &mut self, - lookup_id: SingleLookupId, - result: BlockProcessingResult, - cx: &mut SyncNetworkContext, - ) -> Result { - let Some(lookup) = self.single_block_lookups.get_mut(&lookup_id) else { - debug!(id = lookup_id, "Unknown single block lookup"); - return Err(LookupRequestError::UnknownLookup); - }; - - let block_root = lookup.block_root(); - let request_state = R::request_state_mut(lookup) - .map_err(|e| LookupRequestError::BadState(e.to_owned()))? 
- .get_state_mut(); - - debug!( - component = ?R::response_type(), - ?block_root, - id = lookup_id, - ?result, - "Received lookup processing result" - ); - - let action = match result { - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported(_)) - | BlockProcessingResult::Err(BlockError::DuplicateFullyImported(..)) => { - // Successfully imported - request_state.on_processing_success()?; - Action::Continue - } - - BlockProcessingResult::Ok(AvailabilityProcessingStatus::MissingComponents { - .. - }) => { - // `on_processing_success` is called here to ensure the request state is updated prior to checking - // if both components have been processed. - request_state.on_processing_success()?; - - if lookup.all_components_processed() { - // We don't request for other block components until being sure that the block has - // data. If we request blobs / columns to a peer we are sure those must exist. - // Therefore if all components are processed and we still receive `MissingComponents` - // it indicates an internal bug. - return Err(LookupRequestError::MissingComponentsAfterAllProcessed); - } else { - // Continue request, potentially request blobs - Action::Retry - } - } - BlockProcessingResult::Err(BlockError::DuplicateImportStatusUnknown(..)) => { - // This is unreachable because RPC blocks do not undergo gossip verification, and - // this error can *only* come from gossip verification. - error!(?block_root, "Single block lookup hit unreachable condition"); - Action::Drop("DuplicateImportStatusUnknown".to_owned()) - } - BlockProcessingResult::Ignored => { - // Beacon processor signalled to ignore the block processing result. - // This implies that the cpu is overloaded. Drop the request. 
- warn!( - component = ?R::response_type(), - "Lookup component processing ignored, cpu might be overloaded" - ); - Action::Drop("Block processing ignored".to_owned()) - } - BlockProcessingResult::Err(e) => { - match e { - BlockError::BeaconChainError(e) => { - // Internal error - error!(%block_root, error = ?e, "Beacon chain error processing lookup component"); - Action::Drop(format!("{e:?}")) - } - BlockError::ParentUnknown { parent_root, .. } => { - // Reverts the status of this request to `AwaitingProcessing` holding the - // downloaded data. A future call to `continue_requests` will re-submit it - // once there are no pending parent requests. - // Note: `BlockError::ParentUnknown` is only returned when processing - // blocks, not blobs. - request_state.revert_to_awaiting_processing()?; - Action::ParentUnknown { parent_root } - } - ref e @ BlockError::ExecutionPayloadError(ref epe) if !epe.penalize_peer() => { - // These errors indicate that the execution layer is offline - // and failed to validate the execution payload. Do not downscore peer. - debug!( - ?block_root, - error = ?e, - "Single block lookup failed. Execution layer is offline / unsynced / misconfigured" - ); - Action::Drop(format!("{e:?}")) - } - BlockError::AvailabilityCheck(e) - if e.category() == AvailabilityCheckErrorCategory::Internal => - { - // There errors indicate internal problems and should not downscore the peer - warn!(?block_root, error = ?e, "Internal availability check failure"); - - // Here we choose *not* to call `on_processing_failure` because this could result in a bad - // lookup state transition. This error invalidates both blob and block requests, and we don't know the - // state of both requests. Blobs may have already successfullly processed for example. - // We opt to drop the lookup instead. 
- Action::Drop(format!("{e:?}")) - } - other => { - debug!( - ?block_root, - component = ?R::response_type(), - error = ?other, - "Invalid lookup component" - ); - let peer_group = request_state.on_processing_failure()?; - let peers_to_penalize: Vec<_> = match other { - // Note: currenlty only InvalidColumn errors have index granularity, - // but future errors may follow the same pattern. Generalize this - // pattern with https://github.com/sigp/lighthouse/pull/6321 - BlockError::AvailabilityCheck( - AvailabilityCheckError::InvalidColumn(errors), - ) => errors - .iter() - // Collect all peers that sent a column that was invalid. Must - // run .unique as a single peer can send multiple invalid - // columns. Penalize once to avoid insta-bans - .flat_map(|(index, _)| peer_group.of_index(&(*index as usize))) - .unique() - .collect(), - _ => peer_group.all().collect(), - }; - for peer in peers_to_penalize { - cx.report_peer( - *peer, - PeerAction::MidToleranceError, - match R::response_type() { - ResponseType::Block => "lookup_block_processing_failure", - ResponseType::Blob => "lookup_blobs_processing_failure", - ResponseType::CustodyColumn => { - "lookup_custody_column_processing_failure" - } - }, - ); - } - - Action::Retry - } - } - } - }; - - match action { - Action::Retry => { - // Trigger download for all components in case `MissingComponents` failed the blob - // request. Also if blobs are `AwaitingProcessing` and need to be progressed - lookup.continue_requests(cx) - } - Action::ParentUnknown { parent_root } => { - let peers = lookup.all_peers(); - // Mark lookup as awaiting **before** creating the parent lookup. At this point the - // lookup maybe inconsistent. - lookup.set_awaiting_parent(parent_root); - let parent_lookup_exists = - self.search_parent_of_child(parent_root, block_root, &peers, cx); - if parent_lookup_exists { - // The parent lookup exist or has been created. It's safe for `lookup` to - // reference the parent as awaiting. 
- debug!( - id = lookup_id, - ?block_root, - ?parent_root, - "Marking lookup as awaiting parent" - ); - Ok(LookupResult::Pending) - } else { - // The parent lookup is faulty and was not created, we must drop the `lookup` as - // it's in an inconsistent state. We must drop all of its children too. - Err(LookupRequestError::Failed(format!( - "Parent lookup is faulty {parent_root:?}" - ))) - } - } - Action::Drop(reason) => { - // Drop with noop - Err(LookupRequestError::Failed(reason)) - } - Action::Continue => { - // Drop this completed lookup only - Ok(LookupResult::Completed) - } - } - } - - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub fn on_external_processing_result( - &mut self, - block_root: Hash256, - imported: bool, - cx: &mut SyncNetworkContext, - ) { - let Some((id, lookup)) = self - .single_block_lookups - .iter_mut() - .find(|(_, lookup)| lookup.is_for_block(block_root)) - else { - // Ok to ignore gossip process events - return; - }; - - let lookup_result = if imported { - Ok(LookupResult::Completed) - } else { - lookup.continue_requests(cx) - }; - let id = *id; - self.on_lookup_result(id, lookup_result, "external_processing_result", cx); - } - - /// Makes progress on the immediate children of `block_root` - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub fn continue_child_lookups(&mut self, block_root: Hash256, cx: &mut SyncNetworkContext) { - let mut lookup_results = vec![]; // < need to buffer lookup results to not re-borrow &mut self - - for (id, lookup) in self.single_block_lookups.iter_mut() { - if lookup.awaiting_parent() == Some(block_root) { - lookup.resolve_awaiting_parent(); - debug!( - parent_root = ?block_root, - id, - block_root = ?lookup.block_root(), - "Continuing child lookup" - ); - let result = lookup.continue_requests(cx); - lookup_results.push((*id, result)); - } - } - - for 
(id, result) in lookup_results { - self.on_lookup_result(id, result, "continue_child_lookups", cx); - } - } - - /// Drops `dropped_id` lookup and all its children recursively. Lookups awaiting a parent need - /// the parent to make progress to resolve, therefore we must drop them if the parent is - /// dropped. - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub fn drop_lookup_and_children(&mut self, dropped_id: SingleLookupId) { - if let Some(dropped_lookup) = self.single_block_lookups.remove(&dropped_id) { - debug!( - id = ?dropped_id, - block_root = ?dropped_lookup.block_root(), - awaiting_parent = ?dropped_lookup.awaiting_parent(), - "Dropping lookup" - ); - - let child_lookups = self - .single_block_lookups - .iter() - .filter(|(_, lookup)| lookup.awaiting_parent() == Some(dropped_lookup.block_root())) - .map(|(id, _)| *id) - .collect::>(); - - for id in child_lookups { - self.drop_lookup_and_children(id); - } - } - } - - /// Common handler a lookup request error, drop it and update metrics - /// Returns true if the lookup is created or already exists - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - fn on_lookup_result( - &mut self, - id: SingleLookupId, - result: Result, - source: &str, - cx: &mut SyncNetworkContext, - ) -> bool { - match result { - Ok(LookupResult::Pending) => true, // no action - Ok(LookupResult::Completed) => { - if let Some(lookup) = self.single_block_lookups.remove(&id) { - debug!(block = ?lookup.block_root(), id, "Dropping completed lookup"); - metrics::inc_counter(&metrics::SYNC_LOOKUP_COMPLETED); - // Block imported, continue the requests of pending child blocks - self.continue_child_lookups(lookup.block_root(), cx); - self.update_metrics(); - } else { - debug!(id, "Attempting to drop non-existent lookup"); - } - false - } - // If UnknownLookup do not log the request error. 
No need to drop child lookups nor - // update metrics because the lookup does not exist. - Err(LookupRequestError::UnknownLookup) => false, - Err(error) => { - debug!(id, source, ?error, "Dropping lookup on request error"); - metrics::inc_counter_vec(&metrics::SYNC_LOOKUP_DROPPED, &[error.into()]); - self.drop_lookup_and_children(id); - self.update_metrics(); - false - } - } - } - - /* Helper functions */ - - /// Drops all the single block requests and returns how many requests were dropped. - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub fn drop_single_block_requests(&mut self) -> usize { - let requests_to_drop = self.single_block_lookups.len(); - self.single_block_lookups.clear(); - requests_to_drop - } - - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub fn update_metrics(&self) { - metrics::set_gauge( - &metrics::SYNC_SINGLE_BLOCK_LOOKUPS, - self.single_block_lookups.len() as i64, - ); - } - - /// Perform some prune operations on lookups on some interval - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub fn prune_lookups(&mut self) { - self.drop_lookups_without_peers(); - self.drop_stuck_lookups(); - } - - /// Lookups without peers are allowed to exist for some time. See this common race condition: - /// - /// 1. Receive unknown block parent event - /// 2. Create child lookup with zero peers - /// 3. Parent is processed, before receiving any attestation for the child block - /// 4. Child lookup is attempted to make progress but has no peers - /// 5. We receive an attestion for child block and add a peer to the child block lookup - /// - /// On step 4 we could drop the lookup because we attempt to issue a request with no peers - /// available. 
This has two issues: - /// - We may drop the lookup while some other block component is processing, triggering an - /// unknown lookup error. This can potentially cause un-related child lookups to also be - /// dropped when calling `drop_lookup_and_children`. - /// - We lose all progress of the lookup, and have to re-download its components that we may - /// already have there cached. - /// - /// Instead there's no negative for keeping lookups with no peers around for some time. If we - /// regularly prune them, it should not be a memory concern (TODO: maybe yes!). - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - fn drop_lookups_without_peers(&mut self) { - for (lookup_id, block_root) in self - .single_block_lookups - .values() - .filter(|lookup| { - // Do not drop lookup that are awaiting events to prevent inconsinstencies. If a - // lookup gets stuck, it will be eventually pruned by `drop_stuck_lookups` - lookup.has_no_peers() - && lookup.elapsed_since_created() - > Duration::from_secs(LOOKUP_MAX_DURATION_NO_PEERS_SECS) - && !lookup.is_awaiting_event() - }) - .map(|lookup| (lookup.id, lookup.block_root())) - .collect::>() - { - debug!( - id = lookup_id, - %block_root, - "Dropping lookup with no peers" - ); - self.drop_lookup_and_children(lookup_id); - } - } - - /// Safety mechanism to unstuck lookup sync. Lookup sync if purely event driven and depends on - /// external components to feed it events to make progress. If there is a bug in network, in - /// beacon processor, or here internally: lookups can get stuck forever. A stuck lookup can - /// stall a node indefinitely as other lookup will be awaiting on a parent lookup to make - /// progress. - /// - /// If a lookup lasts more than LOOKUP_MAX_DURATION_SECS this function will find its oldest - /// ancestor and then drop it and all its children. This action will allow the node to unstuck - /// itself. 
Bugs that cause lookups to get stuck may be triggered consistently. So this strategy - /// is useful for two reasons: - /// - /// - One single clear warn level log per stuck incident - /// - If the original bug is sporadic, it reduces the time a node is stuck from forever to 15 min - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - fn drop_stuck_lookups(&mut self) { - // While loop to find and drop all disjoint trees of potentially stuck lookups. - while let Some(stuck_lookup) = self.single_block_lookups.values().find(|lookup| { - lookup.elapsed_since_created() > Duration::from_secs(LOOKUP_MAX_DURATION_STUCK_SECS) - }) { - let ancestor_stuck_lookup = match self.find_oldest_ancestor_lookup(stuck_lookup) { - Ok(lookup) => lookup, - Err(e) => { - warn!(error = ?e,"Error finding oldest ancestor lookup"); - // Default to dropping the lookup that exceeds the max duration so at least - // eventually sync should be unstuck - stuck_lookup - } - }; - - if stuck_lookup.id == ancestor_stuck_lookup.id { - warn!( - block_root = ?stuck_lookup.block_root(), - lookup = ?stuck_lookup, - "Notify the devs a sync lookup is stuck" - ); - } else { - warn!( - block_root = ?stuck_lookup.block_root(), - lookup = ?stuck_lookup, - ancestor_block_root = ?ancestor_stuck_lookup.block_root(), - ancestor_lookup = ?ancestor_stuck_lookup, - "Notify the devs a sync lookup is stuck" - ); - } - - metrics::inc_counter(&metrics::SYNC_LOOKUPS_STUCK); - self.drop_lookup_and_children(ancestor_stuck_lookup.id); - } - } - - /// Recursively find the oldest ancestor lookup of another lookup - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - fn find_oldest_ancestor_lookup<'a>( - &'a self, - lookup: &'a SingleBlockLookup, - ) -> Result<&'a SingleBlockLookup, String> { - if let Some(awaiting_parent) = lookup.awaiting_parent() { - if let Some(lookup) = self - 
.single_block_lookups - .values() - .find(|l| l.block_root() == awaiting_parent) - { - self.find_oldest_ancestor_lookup(lookup) - } else { - Err(format!( - "Lookup references unknown parent {awaiting_parent:?}" - )) - } - } else { - Ok(lookup) - } - } - - /// Adds peers to a lookup and its ancestors recursively. - /// Note: Takes a `lookup_id` as argument to allow recursion on mutable lookups, without having - /// to duplicate the code to add peers to a lookup - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - fn add_peers_to_lookup_and_ancestors( - &mut self, - lookup_id: SingleLookupId, - peers: &[PeerId], - cx: &mut SyncNetworkContext, - ) -> Result<(), String> { - let lookup = self - .single_block_lookups - .get_mut(&lookup_id) - .ok_or(format!("Unknown lookup for id {lookup_id}"))?; - - let mut added_some_peer = false; - for peer in peers { - if lookup.add_peer(*peer) { - added_some_peer = true; - debug!( - block_root = ?lookup.block_root(), - ?peer, - "Adding peer to existing single block lookup" - ); - } - } - - if let Some(parent_root) = lookup.awaiting_parent() { - if let Some((&child_id, _)) = self - .single_block_lookups - .iter() - .find(|(_, l)| l.block_root() == parent_root) - { - self.add_peers_to_lookup_and_ancestors(child_id, peers, cx) - } else { - Err(format!("Lookup references unknown parent {parent_root:?}")) - } - } else if added_some_peer { - // If this lookup is not awaiting a parent and we added at least one peer, attempt to - // make progress. It is possible that a lookup is created with zero peers, attempted to - // make progress, and then receives peers. After that time the lookup will never be - // pruned with `drop_lookups_without_peers` because it has peers. This is rare corner - // case, but it can result in stuck lookups. 
- let result = lookup.continue_requests(cx); - self.on_lookup_result(lookup_id, result, "add_peers", cx); - Ok(()) - } else { - Ok(()) - } - } -} diff --git a/beacon_node/network/src/sync/block_lookups/parent_chain.rs b/beacon_node/network/src/sync/block_lookups/parent_chain.rs deleted file mode 100644 index 009b5e2ff74..00000000000 --- a/beacon_node/network/src/sync/block_lookups/parent_chain.rs +++ /dev/null @@ -1,198 +0,0 @@ -use super::single_block_lookup::SingleBlockLookup; -use beacon_chain::BeaconChainTypes; -use std::collections::{HashMap, HashSet}; -use types::Hash256; - -/// Summary of a lookup of which we may not know it's parent_root yet -pub(crate) struct Node { - block_root: Hash256, - parent_root: Option, -} - -impl From<&SingleBlockLookup> for Node { - fn from(value: &SingleBlockLookup) -> Self { - Self { - block_root: value.block_root(), - parent_root: value.awaiting_parent(), - } - } -} - -/// Wrapper around a chain of block roots that have a least one element (tip) -pub(crate) struct NodeChain { - // Parent chain blocks in descending slot order - pub(crate) chain: Vec, - pub(crate) tip: Hash256, -} - -impl NodeChain { - /// Returns the block_root of the oldest ancestor (min slot) of this chain - pub(crate) fn ancestor(&self) -> Hash256 { - self.chain.last().copied().unwrap_or(self.tip) - } - pub(crate) fn len(&self) -> usize { - self.chain.len() - } -} - -/// Given a set of nodes that reference each other, returns a list of chains with unique tips that -/// contain at least two elements. In descending slot order (tip first). 
-pub(crate) fn compute_parent_chains(nodes: &[Node]) -> Vec { - let mut child_to_parent = HashMap::new(); - let mut parent_to_child = HashMap::>::new(); - for node in nodes { - child_to_parent.insert(node.block_root, node.parent_root); - if let Some(parent_root) = node.parent_root { - parent_to_child - .entry(parent_root) - .or_default() - .push(node.block_root); - } - } - - let mut parent_chains = vec![]; - - // Iterate blocks with no children - for tip in nodes { - let mut block_root = tip.block_root; - if !parent_to_child.contains_key(&block_root) { - let mut chain = vec![]; - - // Resolve chain of blocks - while let Some(parent_root) = child_to_parent.get(&block_root) { - // block_root is a known block that may or may not have a parent root - chain.push(block_root); - if let Some(parent_root) = parent_root { - block_root = *parent_root; - } else { - break; - } - } - - if chain.len() > 1 { - parent_chains.push(NodeChain { - chain, - tip: tip.block_root, - }); - } - } - } - - parent_chains -} - -/// Given a list of node chains, find the oldest node of a specific chain that is not contained in -/// any other chain. -pub(crate) fn find_oldest_fork_ancestor( - parent_chains: Vec, - chain_idx: usize, -) -> Result { - let mut other_blocks = HashSet::new(); - - // Register blocks from other chains - for (i, parent_chain) in parent_chains.iter().enumerate() { - if i != chain_idx { - for block in &parent_chain.chain { - other_blocks.insert(block); - } - } - } - - // Should never happen - let parent_chain = parent_chains - .get(chain_idx) - .ok_or("chain_idx out of bounds")?; - // Find the first block in the target parent chain that is not in other parent chains - // Iterate in ascending slot order - for block in parent_chain.chain.iter().rev() { - if !other_blocks.contains(block) { - return Ok(*block); - } - } - - // No match means that the chain is fully contained within another chain. 
This should never - // happen, but if that was the case just return the tip - Ok(parent_chain.tip) -} - -#[cfg(test)] -mod tests { - use super::{compute_parent_chains, find_oldest_fork_ancestor, Node}; - use types::{FixedBytesExtended, Hash256}; - - fn h(n: u64) -> Hash256 { - Hash256::from_low_u64_be(n) - } - - fn n(block: u64) -> Node { - Node { - block_root: h(block), - parent_root: None, - } - } - - fn np(parent: u64, block: u64) -> Node { - Node { - block_root: h(block), - parent_root: Some(h(parent)), - } - } - - fn compute_parent_chains_test(nodes: &[Node], expected_chain: Vec>) { - assert_eq!( - compute_parent_chains(nodes) - .iter() - .map(|c| c.chain.clone()) - .collect::>(), - expected_chain - ); - } - - fn find_oldest_fork_ancestor_test(nodes: &[Node], expected: Hash256) { - let chains = compute_parent_chains(nodes); - println!( - "chains {:?}", - chains.iter().map(|c| &c.chain).collect::>() - ); - assert_eq!(find_oldest_fork_ancestor(chains, 0).unwrap(), expected); - } - - #[test] - fn compute_parent_chains_empty_case() { - compute_parent_chains_test(&[], vec![]); - } - - #[test] - fn compute_parent_chains_single_branch() { - compute_parent_chains_test(&[n(0), np(0, 1), np(1, 2)], vec![vec![h(2), h(1), h(0)]]); - } - - #[test] - fn compute_parent_chains_single_branch_with_solo() { - compute_parent_chains_test( - &[n(0), np(0, 1), np(1, 2), np(3, 4)], - vec![vec![h(2), h(1), h(0)]], - ); - } - - #[test] - fn compute_parent_chains_two_forking_branches() { - compute_parent_chains_test( - &[n(0), np(0, 1), np(1, 2), np(1, 3)], - vec![vec![h(2), h(1), h(0)], vec![h(3), h(1), h(0)]], - ); - } - - #[test] - fn compute_parent_chains_two_independent_branches() { - compute_parent_chains_test( - &[n(0), np(0, 1), np(1, 2), n(3), np(3, 4)], - vec![vec![h(2), h(1), h(0)], vec![h(4), h(3)]], - ); - } - - #[test] - fn find_oldest_fork_ancestor_simple_case() { - find_oldest_fork_ancestor_test(&[n(0), np(0, 1), np(1, 2), np(0, 3)], h(1)) - } -} diff --git 
a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs b/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs deleted file mode 100644 index 30947cf1f0a..00000000000 --- a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs +++ /dev/null @@ -1,678 +0,0 @@ -use super::{BlockComponent, PeerId, SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS}; -use crate::sync::block_lookups::common::RequestState; -use crate::sync::network_context::{ - LookupRequestResult, PeerGroup, ReqId, RpcRequestSendError, SendErrorProcessor, - SyncNetworkContext, -}; -use beacon_chain::{BeaconChainTypes, BlockProcessStatus}; -use derivative::Derivative; -use lighthouse_network::service::api_types::Id; -use parking_lot::RwLock; -use std::collections::HashSet; -use std::fmt::Debug; -use std::sync::Arc; -use std::time::{Duration, Instant}; -use store::Hash256; -use strum::IntoStaticStr; -use types::blob_sidecar::FixedBlobSidecarList; -use types::{DataColumnSidecarList, EthSpec, SignedBeaconBlock, Slot}; - -// Dedicated enum for LookupResult to force its usage -#[must_use = "LookupResult must be handled with on_lookup_result"] -pub enum LookupResult { - /// Lookup completed successfully - Completed, - /// Lookup is expecting some future event from the network - Pending, -} - -#[derive(Debug, PartialEq, Eq, IntoStaticStr)] -pub enum LookupRequestError { - /// Too many failed attempts - TooManyAttempts { - /// The failed attempts were primarily due to processing failures. - cannot_process: bool, - }, - /// Error sending event to network - SendFailedNetwork(RpcRequestSendError), - /// Error sending event to processor - SendFailedProcessor(SendErrorProcessor), - /// Inconsistent lookup request state - BadState(String), - /// Lookup failed for some other reason and should be dropped - Failed(/* reason: */ String), - /// Received MissingComponents when all components have been processed. 
This should never - /// happen, and indicates some internal bug - MissingComponentsAfterAllProcessed, - /// Attempted to retrieve a not known lookup id - UnknownLookup, - /// Received a download result for a different request id than the in-flight request. - /// There should only exist a single request at a time. Having multiple requests is a bug and - /// can result in undefined state, so it's treated as a hard error and the lookup is dropped. - UnexpectedRequestId { - expected_req_id: ReqId, - req_id: ReqId, - }, -} - -#[derive(Derivative)] -#[derivative(Debug(bound = "T: BeaconChainTypes"))] -pub struct SingleBlockLookup { - pub id: Id, - pub block_request_state: BlockRequestState, - pub component_requests: ComponentRequests, - /// Peers that claim to have imported this set of block components. This state is shared with - /// the custody request to have an updated view of the peers that claim to have imported the - /// block associated with this lookup. The peer set of a lookup can change rapidly, and faster - /// than the lifetime of a custody request. 
- #[derivative(Debug(format_with = "fmt_peer_set_as_len"))] - peers: Arc>>, - block_root: Hash256, - awaiting_parent: Option, - created: Instant, -} - -#[derive(Debug)] -pub(crate) enum ComponentRequests { - WaitingForBlock, - ActiveBlobRequest(BlobRequestState, usize), - ActiveCustodyRequest(CustodyRequestState), - // When printing in debug this state display the reason why it's not needed - #[allow(dead_code)] - NotNeeded(&'static str), -} - -impl SingleBlockLookup { - pub fn new( - requested_block_root: Hash256, - peers: &[PeerId], - id: Id, - awaiting_parent: Option, - ) -> Self { - Self { - id, - block_request_state: BlockRequestState::new(requested_block_root), - component_requests: ComponentRequests::WaitingForBlock, - peers: Arc::new(RwLock::new(HashSet::from_iter(peers.iter().copied()))), - block_root: requested_block_root, - awaiting_parent, - created: Instant::now(), - } - } - - /// Return the slot of this lookup's block if it's currently cached as `AwaitingProcessing` - pub fn peek_downloaded_block_slot(&self) -> Option { - self.block_request_state - .state - .peek_downloaded_data() - .map(|block| block.slot()) - } - - /// Get the block root that is being requested. - pub fn block_root(&self) -> Hash256 { - self.block_root - } - - pub fn awaiting_parent(&self) -> Option { - self.awaiting_parent - } - - /// Mark this lookup as awaiting a parent lookup from being processed. Meanwhile don't send - /// components for processing. - pub fn set_awaiting_parent(&mut self, parent_root: Hash256) { - self.awaiting_parent = Some(parent_root) - } - - /// Mark this lookup as no longer awaiting a parent lookup. Components can be sent for - /// processing. - pub fn resolve_awaiting_parent(&mut self) { - self.awaiting_parent = None; - } - - /// Returns the time elapsed since this lookup was created - pub fn elapsed_since_created(&self) -> Duration { - self.created.elapsed() - } - - /// Maybe insert a verified response into this lookup. 
Returns true if imported - pub fn add_child_components(&mut self, block_component: BlockComponent) -> bool { - match block_component { - BlockComponent::Block(block) => self - .block_request_state - .state - .insert_verified_response(block), - BlockComponent::Blob(_) | BlockComponent::DataColumn(_) => { - // For now ignore single blobs and columns, as the blob request state assumes all blobs are - // attributed to the same peer = the peer serving the remaining blobs. Ignoring this - // block component has a minor effect, causing the node to re-request this blob - // once the parent chain is successfully resolved - false - } - } - } - - /// Check the block root matches the requested block root. - pub fn is_for_block(&self, block_root: Hash256) -> bool { - self.block_root() == block_root - } - - /// Returns true if the block has already been downloaded. - pub fn all_components_processed(&self) -> bool { - self.block_request_state.state.is_processed() - && match &self.component_requests { - ComponentRequests::WaitingForBlock => false, - ComponentRequests::ActiveBlobRequest(request, _) => request.state.is_processed(), - ComponentRequests::ActiveCustodyRequest(request) => request.state.is_processed(), - ComponentRequests::NotNeeded { .. } => true, - } - } - - /// Returns true if this request is expecting some event to make progress - pub fn is_awaiting_event(&self) -> bool { - self.awaiting_parent.is_some() - || self.block_request_state.state.is_awaiting_event() - || match &self.component_requests { - // If components are waiting for the block request to complete, here we should - // check if the`block_request_state.state.is_awaiting_event(). However we already - // checked that above, so `WaitingForBlock => false` is equivalent. 
- ComponentRequests::WaitingForBlock => false, - ComponentRequests::ActiveBlobRequest(request, _) => { - request.state.is_awaiting_event() - } - ComponentRequests::ActiveCustodyRequest(request) => { - request.state.is_awaiting_event() - } - ComponentRequests::NotNeeded { .. } => false, - } - } - - /// Makes progress on all requests of this lookup. Any error is not recoverable and must result - /// in dropping the lookup. May mark the lookup as completed. - pub fn continue_requests( - &mut self, - cx: &mut SyncNetworkContext, - ) -> Result { - // TODO: Check what's necessary to download, specially for blobs - self.continue_request::>(cx, 0)?; - - if let ComponentRequests::WaitingForBlock = self.component_requests { - let downloaded_block = self - .block_request_state - .state - .peek_downloaded_data() - .cloned(); - - if let Some(block) = downloaded_block.or_else(|| { - // If the block is already being processed or fully validated, retrieve how many blobs - // it expects. Consider any stage of the block. If the block root has been validated, we - // can assert that this is the correct value of `blob_kzg_commitments_count`. 
- match cx.chain.get_block_process_status(&self.block_root) { - BlockProcessStatus::Unknown => None, - BlockProcessStatus::NotValidated(block) - | BlockProcessStatus::ExecutionValidated(block) => Some(block.clone()), - } - }) { - let expected_blobs = block.num_expected_blobs(); - let block_epoch = block.slot().epoch(T::EthSpec::slots_per_epoch()); - if expected_blobs == 0 { - self.component_requests = ComponentRequests::NotNeeded("no data"); - } else if cx.chain.should_fetch_blobs(block_epoch) { - self.component_requests = ComponentRequests::ActiveBlobRequest( - BlobRequestState::new(self.block_root), - expected_blobs, - ); - } else if cx.chain.should_fetch_custody_columns(block_epoch) { - self.component_requests = ComponentRequests::ActiveCustodyRequest( - CustodyRequestState::new(self.block_root), - ); - } else { - self.component_requests = ComponentRequests::NotNeeded("outside da window"); - } - } else { - // Wait to download the block before downloading blobs. Then we can be sure that the - // block has data, so there's no need to do "blind" requests for all possible blobs and - // latter handle the case where if the peer sent no blobs, penalize. - // - // Lookup sync event safety: Reaching this code means that a block is not in any pre-import - // cache nor in the request state of this lookup. Therefore, the block must either: (1) not - // be downloaded yet or (2) the block is already imported into the fork-choice. - // In case (1) the lookup must either successfully download the block or get dropped. - // In case (2) the block will be downloaded, processed, reach `DuplicateFullyImported` - // and get dropped as completed. - } - } - - match &self.component_requests { - ComponentRequests::WaitingForBlock => {} // do nothing - ComponentRequests::ActiveBlobRequest(_, expected_blobs) => { - self.continue_request::>(cx, *expected_blobs)? - } - ComponentRequests::ActiveCustodyRequest(_) => { - self.continue_request::>(cx, 0)? - } - ComponentRequests::NotNeeded { .. 
} => {} // do nothing - } - - // If all components of this lookup are already processed, there will be no future events - // that can make progress so it must be dropped. Consider the lookup completed. - // This case can happen if we receive the components from gossip during a retry. - if self.all_components_processed() { - Ok(LookupResult::Completed) - } else { - Ok(LookupResult::Pending) - } - } - - /// Potentially makes progress on this request if it's in a progress-able state - fn continue_request>( - &mut self, - cx: &mut SyncNetworkContext, - expected_blobs: usize, - ) -> Result<(), LookupRequestError> { - let id = self.id; - let awaiting_parent = self.awaiting_parent.is_some(); - let request = - R::request_state_mut(self).map_err(|e| LookupRequestError::BadState(e.to_owned()))?; - - // Attempt to progress awaiting downloads - if request.get_state().is_awaiting_download() { - // Verify the current request has not exceeded the maximum number of attempts. - let request_state = request.get_state(); - if request_state.failed_attempts() >= SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS { - let cannot_process = request_state.more_failed_processing_attempts(); - return Err(LookupRequestError::TooManyAttempts { cannot_process }); - } - - let peers = self.peers.clone(); - let request = R::request_state_mut(self) - .map_err(|e| LookupRequestError::BadState(e.to_owned()))?; - - match request.make_request(id, peers, expected_blobs, cx)? { - LookupRequestResult::RequestSent(req_id) => { - // Lookup sync event safety: If make_request returns `RequestSent`, we are - // guaranteed that `BlockLookups::on_download_response` will be called exactly - // with this `req_id`. - request.get_state_mut().on_download_start(req_id)? - } - LookupRequestResult::NoRequestNeeded(reason) => { - // Lookup sync event safety: Advances this request to the terminal `Processed` - // state. If all requests reach this state, the request is marked as completed - // in `Self::continue_requests`. 
- request.get_state_mut().on_completed_request(reason)? - } - // Sync will receive a future event to make progress on the request, do nothing now - LookupRequestResult::Pending(reason) => { - // Lookup sync event safety: Refer to the code paths constructing - // `LookupRequestResult::Pending` - request - .get_state_mut() - .update_awaiting_download_status(reason); - return Ok(()); - } - } - - // Otherwise, attempt to progress awaiting processing - // If this request is awaiting a parent lookup to be processed, do not send for processing. - // The request will be rejected with unknown parent error. - } else if !awaiting_parent { - // maybe_start_processing returns Some if state == AwaitingProcess. This pattern is - // useful to conditionally access the result data. - if let Some(result) = request.get_state_mut().maybe_start_processing() { - // Lookup sync event safety: If `send_for_processing` returns Ok() we are guaranteed - // that `BlockLookups::on_processing_result` will be called exactly once with this - // lookup_id - return R::send_for_processing(id, result, cx); - } - // Lookup sync event safety: If the request is not in `AwaitingDownload` or - // `AwaitingProcessing` state it is guaranteed to receive some event to make progress. - } - - // Lookup sync event safety: If a lookup is awaiting a parent we are guaranteed to either: - // (1) attempt to make progress with `BlockLookups::continue_child_lookups` if the parent - // lookup completes, or (2) get dropped if the parent fails and is dropped. - - Ok(()) - } - - /// Get all unique peers that claim to have imported this set of block components - pub fn all_peers(&self) -> Vec { - self.peers.read().iter().copied().collect() - } - - /// Add peer to all request states. The peer must be able to serve this request. - /// Returns true if the peer was newly inserted into some request state. 
- pub fn add_peer(&mut self, peer_id: PeerId) -> bool { - self.peers.write().insert(peer_id) - } - - /// Remove peer from available peers. - pub fn remove_peer(&mut self, peer_id: &PeerId) { - self.peers.write().remove(peer_id); - } - - /// Returns true if this lookup has zero peers - pub fn has_no_peers(&self) -> bool { - self.peers.read().is_empty() - } -} - -/// The state of the blob request component of a `SingleBlockLookup`. -#[derive(Derivative)] -#[derivative(Debug)] -pub struct BlobRequestState { - #[derivative(Debug = "ignore")] - pub block_root: Hash256, - pub state: SingleLookupRequestState>, -} - -impl BlobRequestState { - pub fn new(block_root: Hash256) -> Self { - Self { - block_root, - state: SingleLookupRequestState::new(), - } - } -} - -/// The state of the custody request component of a `SingleBlockLookup`. -#[derive(Derivative)] -#[derivative(Debug)] -pub struct CustodyRequestState { - #[derivative(Debug = "ignore")] - pub block_root: Hash256, - pub state: SingleLookupRequestState>, -} - -impl CustodyRequestState { - pub fn new(block_root: Hash256) -> Self { - Self { - block_root, - state: SingleLookupRequestState::new(), - } - } -} - -/// The state of the block request component of a `SingleBlockLookup`. 
-#[derive(Derivative)] -#[derivative(Debug)] -pub struct BlockRequestState { - #[derivative(Debug = "ignore")] - pub requested_block_root: Hash256, - pub state: SingleLookupRequestState>>, -} - -impl BlockRequestState { - pub fn new(block_root: Hash256) -> Self { - Self { - requested_block_root: block_root, - state: SingleLookupRequestState::new(), - } - } -} - -#[derive(Debug, Clone)] -pub struct DownloadResult { - pub value: T, - pub block_root: Hash256, - pub seen_timestamp: Duration, - pub peer_group: PeerGroup, -} - -#[derive(IntoStaticStr)] -pub enum State { - AwaitingDownload(/* reason */ &'static str), - Downloading(ReqId), - AwaitingProcess(DownloadResult), - /// Request is processing, sent by lookup sync - Processing(DownloadResult), - /// Request is processed - Processed(/* reason */ &'static str), -} - -/// Object representing the state of a single block or blob lookup request. -#[derive(Debug)] -pub struct SingleLookupRequestState { - /// State of this request. - state: State, - /// How many times have we attempted to process this block or blob. - failed_processing: u8, - /// How many times have we attempted to download this block or blob. - failed_downloading: u8, -} - -impl SingleLookupRequestState { - pub fn new() -> Self { - Self { - state: State::AwaitingDownload("not started"), - failed_processing: 0, - failed_downloading: 0, - } - } - - pub fn is_awaiting_download(&self) -> bool { - match self.state { - State::AwaitingDownload { .. } => true, - State::Downloading { .. } - | State::AwaitingProcess { .. } - | State::Processing { .. } - | State::Processed { .. } => false, - } - } - - pub fn is_processed(&self) -> bool { - match self.state { - State::AwaitingDownload { .. } - | State::Downloading { .. } - | State::AwaitingProcess { .. } - | State::Processing { .. } => false, - State::Processed { .. } => true, - } - } - - /// Returns true if we can expect some future event to progress this block component request - /// specifically. 
- pub fn is_awaiting_event(&self) -> bool { - match self.state { - // No event will progress this request specifically, but the request may be put on hold - // due to some external event - State::AwaitingDownload { .. } => false, - // Network will emit a download success / error event - State::Downloading { .. } => true, - // Not awaiting any external event - State::AwaitingProcess { .. } => false, - // Beacon processor will emit a processing result event - State::Processing { .. } => true, - // Request complete, no future event left - State::Processed { .. } => false, - } - } - - pub fn peek_downloaded_data(&self) -> Option<&T> { - match &self.state { - State::AwaitingDownload { .. } => None, - State::Downloading { .. } => None, - State::AwaitingProcess(result) => Some(&result.value), - State::Processing(result) => Some(&result.value), - State::Processed { .. } => None, - } - } - - /// Switch to `AwaitingProcessing` if the request is in `AwaitingDownload` state, otherwise - /// ignore. - pub fn insert_verified_response(&mut self, result: DownloadResult) -> bool { - if let State::AwaitingDownload { .. } = &self.state { - self.state = State::AwaitingProcess(result); - true - } else { - false - } - } - - /// Append metadata on why this request is in AwaitingDownload status. Very helpful to debug - /// stuck lookups. Not fallible as it's purely informational. - pub fn update_awaiting_download_status(&mut self, new_status: &'static str) { - if let State::AwaitingDownload(status) = &mut self.state { - *status = new_status - } - } - - /// Switch to `Downloading` if the request is in `AwaitingDownload` state, otherwise returns None. - pub fn on_download_start(&mut self, req_id: ReqId) -> Result<(), LookupRequestError> { - match &self.state { - State::AwaitingDownload { .. 
} => { - self.state = State::Downloading(req_id); - Ok(()) - } - other => Err(LookupRequestError::BadState(format!( - "Bad state on_download_start expected AwaitingDownload got {other}" - ))), - } - } - - /// Registers a failure in downloading a block. This might be a peer disconnection or a wrong - /// block. - pub fn on_download_failure(&mut self, req_id: ReqId) -> Result<(), LookupRequestError> { - match &self.state { - State::Downloading(expected_req_id) => { - if req_id != *expected_req_id { - return Err(LookupRequestError::UnexpectedRequestId { - expected_req_id: *expected_req_id, - req_id, - }); - } - self.failed_downloading = self.failed_downloading.saturating_add(1); - self.state = State::AwaitingDownload("not started"); - Ok(()) - } - other => Err(LookupRequestError::BadState(format!( - "Bad state on_download_failure expected Downloading got {other}" - ))), - } - } - - pub fn on_download_success( - &mut self, - req_id: ReqId, - result: DownloadResult, - ) -> Result<(), LookupRequestError> { - match &self.state { - State::Downloading(expected_req_id) => { - if req_id != *expected_req_id { - return Err(LookupRequestError::UnexpectedRequestId { - expected_req_id: *expected_req_id, - req_id, - }); - } - self.state = State::AwaitingProcess(result); - Ok(()) - } - other => Err(LookupRequestError::BadState(format!( - "Bad state on_download_success expected Downloading got {other}" - ))), - } - } - - /// Switch to `Processing` if the request is in `AwaitingProcess` state, otherwise returns None. - pub fn maybe_start_processing(&mut self) -> Option> { - // For 2 lines replace state with placeholder to gain ownership of `result` - match &self.state { - State::AwaitingProcess(result) => { - let result = result.clone(); - self.state = State::Processing(result.clone()); - Some(result) - } - _ => None, - } - } - - /// Revert into `AwaitingProcessing`, if the payload if not invalid and can be submitted for - /// processing latter. 
- pub fn revert_to_awaiting_processing(&mut self) -> Result<(), LookupRequestError> { - match &self.state { - State::Processing(result) => { - self.state = State::AwaitingProcess(result.clone()); - Ok(()) - } - other => Err(LookupRequestError::BadState(format!( - "Bad state on revert_to_awaiting_processing expected Processing got {other}" - ))), - } - } - - /// Registers a failure in processing a block. - pub fn on_processing_failure(&mut self) -> Result { - match &self.state { - State::Processing(result) => { - let peers_source = result.peer_group.clone(); - self.failed_processing = self.failed_processing.saturating_add(1); - self.state = State::AwaitingDownload("not started"); - Ok(peers_source) - } - other => Err(LookupRequestError::BadState(format!( - "Bad state on_processing_failure expected Processing got {other}" - ))), - } - } - - pub fn on_processing_success(&mut self) -> Result<(), LookupRequestError> { - match &self.state { - State::Processing(_) => { - self.state = State::Processed("processing success"); - Ok(()) - } - other => Err(LookupRequestError::BadState(format!( - "Bad state on_processing_success expected Processing got {other}" - ))), - } - } - - /// Mark a request as complete without any download or processing - pub fn on_completed_request(&mut self, reason: &'static str) -> Result<(), LookupRequestError> { - match &self.state { - State::AwaitingDownload { .. } => { - self.state = State::Processed(reason); - Ok(()) - } - other => Err(LookupRequestError::BadState(format!( - "Bad state on_completed_request expected AwaitingDownload got {other}" - ))), - } - } - - /// The total number of failures, whether it be processing or downloading. 
- pub fn failed_attempts(&self) -> u8 { - self.failed_processing + self.failed_downloading - } - - pub fn more_failed_processing_attempts(&self) -> bool { - self.failed_processing >= self.failed_downloading - } -} - -// Display is used in the BadState assertions above -impl std::fmt::Display for State { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", Into::<&'static str>::into(self)) - } -} - -// Debug is used in the log_stuck_lookups print to include some more info. Implements custom Debug -// to not dump an entire block or blob to terminal which don't add valuable data. -impl std::fmt::Debug for State { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::AwaitingDownload(reason) => write!(f, "AwaitingDownload({})", reason), - Self::Downloading(req_id) => write!(f, "Downloading({:?})", req_id), - Self::AwaitingProcess(d) => write!(f, "AwaitingProcess({:?})", d.peer_group), - Self::Processing(d) => write!(f, "Processing({:?})", d.peer_group), - Self::Processed(reason) => write!(f, "Processed({})", reason), - } - } -} - -fn fmt_peer_set_as_len( - peer_set: &Arc>>, - f: &mut std::fmt::Formatter, -) -> Result<(), std::fmt::Error> { - write!(f, "{}", peer_set.read().len()) -} diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 2fa10ae83de..c3e613c933d 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -54,11 +54,9 @@ use beacon_chain::{ use futures::StreamExt; use lighthouse_network::rpc::RPCError; use lighthouse_network::service::api_types::{ - BlobsByRangeRequestId, BlobsByRootRequestId, BlocksByRangeRequestId, BlocksByRootRequestId, - BlocksByRootRequester, ComponentsByRangeRequestId, CustodyByRangeRequestId, - CustodyByRootRequestId, CustodyRequester, DataColumnsByRangeRequestId, - DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, SamplingId, SamplingRequester, - 
SyncRequestId, + BlobsByRootRequestId, BlocksByRootRequestId, BlocksByRootRequester, ComponentsByRootRequestId, + CustodyByRootRequestId, CustodyRequester, DataColumnsByRootRequestId, + DataColumnsByRootRequester, Id, SamplingId, SamplingRequester, SyncRequestId, }; use lighthouse_network::types::{NetworkGlobals, SyncState}; use lighthouse_network::PeerId; @@ -317,26 +315,6 @@ impl SyncManager { } } - #[cfg(test)] - pub(crate) fn active_single_lookups(&self) -> Vec { - todo!(); - } - - #[cfg(test)] - pub(crate) fn active_parent_lookups(&self) -> Vec> { - todo!(); - } - - #[cfg(test)] - pub(crate) fn get_failed_chains(&mut self) -> Vec { - todo!(); - } - - #[cfg(test)] - pub(crate) fn insert_failed_chain(&mut self, _block_root: Hash256) { - todo!(); - } - #[cfg(test)] pub(crate) fn active_sampling_requests(&self) -> Vec { self.sampling.active_sampling_requests() @@ -429,9 +407,6 @@ impl SyncManager { for (id, result) in self.network.continue_custody_by_root_requests() { self.on_custody_by_root_result(id, result); } - for (id, result) in self.network.continue_custody_by_range_requests() { - self.on_custody_by_range_result(id, result); - } } /// Trigger range sync for a set of peers that claim to have imported a head unknown to us. 
@@ -495,15 +470,6 @@ impl SyncManager { SyncRequestId::DataColumnsByRoot(req_id) => { self.on_data_columns_by_root_response(req_id, peer_id, RpcEvent::RPCError(error)) } - SyncRequestId::BlocksByRange(req_id) => { - self.on_blocks_by_range_response(req_id, peer_id, RpcEvent::RPCError(error)) - } - SyncRequestId::BlobsByRange(req_id) => { - self.on_blobs_by_range_response(req_id, peer_id, RpcEvent::RPCError(error)) - } - SyncRequestId::DataColumnsByRange(req_id) => { - self.on_data_columns_by_range_response(req_id, peer_id, RpcEvent::RPCError(error)) - } } } @@ -536,9 +502,6 @@ impl SyncManager { for (id, result) in self.network.continue_custody_by_root_requests() { self.on_custody_by_root_result(id, result); } - for (id, result) in self.network.continue_custody_by_range_requests() { - self.on_custody_by_range_result(id, result); - } } /// Updates the syncing state of a peer. @@ -1007,11 +970,6 @@ impl SyncManager { peer_id, RpcEvent::from_chunk(block, seen_timestamp), ), - SyncRequestId::BlocksByRange(id) => self.on_blocks_by_range_response( - id, - peer_id, - RpcEvent::from_chunk(block, seen_timestamp), - ), _ => { crit!(%peer_id, "bad request id for block"); } @@ -1073,8 +1031,7 @@ impl SyncManager { seen_timestamp: Duration, ) { match sync_request_id { - SyncRequestId::BlobsByRoot { .. 
} => todo!(), - SyncRequestId::BlobsByRange(id) => self.on_blobs_by_range_response( + SyncRequestId::BlobsByRoot(id) => self.on_blobs_by_root_response( id, peer_id, RpcEvent::from_chunk(blob, seen_timestamp), @@ -1100,11 +1057,6 @@ impl SyncManager { RpcEvent::from_chunk(data_column, seen_timestamp), ); } - SyncRequestId::DataColumnsByRange(id) => self.on_data_columns_by_range_response( - id, - peer_id, - RpcEvent::from_chunk(data_column, seen_timestamp), - ), _ => { crit!(%peer_id, "bad request id for data_column"); } @@ -1121,7 +1073,7 @@ impl SyncManager { self.network .on_data_columns_by_root_response(req_id, peer_id, data_column) { - match req_id.requester { + match req_id.parent_request_id { DataColumnsByRootRequester::Sampling(id) => { if let Some((requester, result)) = self.sampling @@ -1142,56 +1094,6 @@ impl SyncManager { } } - fn on_blocks_by_range_response( - &mut self, - id: BlocksByRangeRequestId, - peer_id: PeerId, - block: RpcEvent>>, - ) { - if let Some(resp) = self.network.on_blocks_by_range_response(id, peer_id, block) { - todo!(); - } - } - - fn on_blobs_by_range_response( - &mut self, - id: BlobsByRangeRequestId, - peer_id: PeerId, - blob: RpcEvent>>, - ) { - if let Some(resp) = self.network.on_blobs_by_range_response(id, peer_id, blob) { - todo!(); - } - } - - fn on_data_columns_by_range_response( - &mut self, - id: DataColumnsByRangeRequestId, - peer_id: PeerId, - data_column: RpcEvent>>, - ) { - // data_columns_by_range returns either an Ok list of data columns, or an RpcResponseError - if let Some(resp) = self - .network - .on_data_columns_by_range_response(id, peer_id, data_column) - { - // custody_by_range accumulates the results of multiple data_columns_by_range requests - // returning a bigger list of data columns across all the column indices this node has - // to custody - if let Some(result) = self.network.on_custody_by_range_response(id, peer_id, resp) { - self.on_custody_by_range_result(id.parent_request_id, result); - } - } - } - 
- fn on_custody_by_range_result( - &mut self, - _id: CustodyByRangeRequestId, - _result: CustodyRequestResult, - ) { - todo!(); - } - fn on_custody_by_root_result( &mut self, id: CustodyByRootRequestId, @@ -1230,14 +1132,14 @@ impl SyncManager { /// Handles receiving a response for a range sync request that should have both blocks and /// blobs. - fn on_block_components_by_range_response( + fn on_block_components_by_root_response( &mut self, - range_request_id: ComponentsByRangeRequestId, + range_request_id: ComponentsByRootRequestId, range_block_component: RangeBlockComponent, ) { if let Some(result) = self .network - .on_block_components_by_range_response(range_request_id, range_block_component) + .on_block_components_by_root_response(range_request_id, range_block_component) { match range_request_id.requester { RangeRequestId::RangeSync(id) => { diff --git a/beacon_node/network/src/sync/mod.rs b/beacon_node/network/src/sync/mod.rs index b81545573bd..a0460c5beb4 100644 --- a/beacon_node/network/src/sync/mod.rs +++ b/beacon_node/network/src/sync/mod.rs @@ -2,7 +2,6 @@ //! //! Stores the various syncing methods for the beacon chain. mod backfill_sync; -mod block_lookups; mod block_tree; pub mod manager; mod network_context; diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index c93850bedb4..870e98cccee 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -1,7 +1,6 @@ //! Provides network functionality for the Syncing thread. This fundamentally wraps a network //! channel and stores a global RPC ID to perform requests. 
-use self::custody_by_range::ActiveCustodyByRangeRequest; use self::custody_by_root::ActiveCustodyByRootRequest; use super::range_sync::BatchPeers; use super::SyncMessage; @@ -11,12 +10,11 @@ use crate::network_beacon_processor::NetworkBeaconProcessor; use crate::network_beacon_processor::TestBeaconChainType; use crate::service::NetworkMessage; use crate::status::ToStatusMessage; -use crate::sync::block_lookups::SingleLookupId; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessStatus, EngineState}; -pub use block_components_by_range::BlockComponentsByRangeRequest; +pub use block_components_by_range::BlockComponentsByRootRequest; #[cfg(test)] -pub use block_components_by_range::BlockComponentsByRangeRequestStep; +pub use block_components_by_range::BlockComponentsByRootRequestStep; use fnv::FnvHashMap; use lighthouse_network::rpc::methods::{ BlobsByRangeRequest, BlobsByRootRequest, BlocksByRootRequest, DataColumnsByRangeRequest, @@ -25,9 +23,8 @@ use lighthouse_network::rpc::methods::{ use lighthouse_network::rpc::{BlocksByRangeRequest, GoodbyeReason, RPCError, RequestType}; pub use lighthouse_network::service::api_types::RangeRequestId; use lighthouse_network::service::api_types::{ - AppRequestId, BlobsByRangeRequestId, BlobsByRootRequestId, BlocksByRangeRequestId, - BlocksByRootRequestId, BlocksByRootRequester, ComponentsByRangeRequestId, - CustodyByRangeRequestId, CustodyByRootRequestId, CustodyRequester, DataColumnsByRangeRequestId, + AppRequestId, BlobsByRootRequestId, BlocksByRootRequestId, BlocksByRootRequester, + ComponentsByRootRequestId, CustodyByRootRequestId, CustodyRequester, DataColumnsByRootRequestId, DataColumnsByRootRequester, HeaderLookupId, Id, SingleLookupReqId, SyncRequestId, }; @@ -35,8 +32,8 @@ use lighthouse_network::{Client, NetworkGlobals, PeerAction, PeerId, ReportSourc use parking_lot::RwLock; pub use requests::LookupVerifyError; use requests::{ - ActiveRequests, 
BlobsByRangeRequestItems, BlobsByRootRequestItems, BlocksByRangeRequestItems, - BlocksByRootRequestItems, DataColumnsByRangeRequestItems, DataColumnsByRootRequestItems, + ActiveRequests, BlobsByRootRequestItems, BlocksByRootRequestItems, + DataColumnsByRootRequestItems, }; #[cfg(test)] use slot_clock::SlotClock; @@ -56,7 +53,6 @@ use types::{ }; pub mod block_components_by_range; -pub mod custody_by_range; pub mod custody_by_root; mod requests; @@ -206,25 +202,13 @@ pub struct SyncNetworkContext { /// A mapping of active DataColumnsByRoot requests data_columns_by_root_requests: ActiveRequests>, - /// A mapping of active BlocksByRange requests - blocks_by_range_requests: - ActiveRequests>, - /// A mapping of active BlobsByRange requests - blobs_by_range_requests: - ActiveRequests>, - /// A mapping of active DataColumnsByRange requests - data_columns_by_range_requests: - ActiveRequests>, /// Mapping of active custody column by root requests for a block root custody_by_root_requests: FnvHashMap>, - /// Mapping of active custody column by range requests - custody_by_range_requests: FnvHashMap>, - - /// BlocksByRange requests paired with other ByRange requests for data components - block_components_by_range_requests: - FnvHashMap>, + /// BlocksByRoot requests paired with other ByRoot requests for data components + block_components_by_root_requests: + FnvHashMap>, /// Whether the ee is online. If it's not, we don't allow access to the /// `beacon_processor_send`. 
@@ -302,12 +286,8 @@ impl SyncNetworkContext { blocks_by_root_requests: ActiveRequests::new("blocks_by_root"), blobs_by_root_requests: ActiveRequests::new("blobs_by_root"), data_columns_by_root_requests: ActiveRequests::new("data_columns_by_root"), - blocks_by_range_requests: ActiveRequests::new("blocks_by_range"), - blobs_by_range_requests: ActiveRequests::new("blobs_by_range"), - data_columns_by_range_requests: ActiveRequests::new("data_columns_by_range"), custody_by_root_requests: <_>::default(), - custody_by_range_requests: <_>::default(), - block_components_by_range_requests: <_>::default(), + block_components_by_root_requests: <_>::default(), network_beacon_processor, chain, fork_context, @@ -338,14 +318,10 @@ impl SyncNetworkContext { blocks_by_root_requests, blobs_by_root_requests, data_columns_by_root_requests, - blocks_by_range_requests, - blobs_by_range_requests, - data_columns_by_range_requests, // custody_by_root_requests is a meta request of data_columns_by_root_requests custody_by_root_requests: _, - custody_by_range_requests: _, - // components_by_range_requests is a meta request of various _by_range requests - block_components_by_range_requests: _, + // components_by_root_requests is a meta request of various _by_root requests + block_components_by_root_requests: _, execution_engine_state: _, network_beacon_processor: _, chain: _, @@ -361,32 +337,17 @@ impl SyncNetworkContext { let data_column_by_root_ids = data_columns_by_root_requests .active_requests() .map(|(id, peer)| (SyncRequestId::DataColumnsByRoot(*id), peer)); - let blocks_by_range_ids = blocks_by_range_requests - .active_requests() - .map(|(id, peer)| (SyncRequestId::BlocksByRange(*id), peer)); - let blobs_by_range_ids = blobs_by_range_requests - .active_requests() - .map(|(id, peer)| (SyncRequestId::BlobsByRange(*id), peer)); - let data_column_by_range_ids = data_columns_by_range_requests - .active_requests() - .map(|(id, peer)| (SyncRequestId::DataColumnsByRange(*id), peer)); 
blocks_by_root_ids .chain(blobs_by_root_ids) .chain(data_column_by_root_ids) - .chain(blocks_by_range_ids) - .chain(blobs_by_range_ids) - .chain(data_column_by_range_ids) } #[cfg(test)] - pub fn active_block_components_by_range_requests( + pub fn active_block_components_by_root_requests( &self, - ) -> Vec<( - ComponentsByRangeRequestId, - BlockComponentsByRangeRequestStep, - )> { - self.block_components_by_range_requests + ) -> Vec<(ComponentsByRootRequestId, BlockComponentsByRootRequestStep)> { + self.block_components_by_root_requests .iter() .map(|(id, req)| (*id, req.state_step())) .collect() @@ -452,14 +413,10 @@ impl SyncNetworkContext { blocks_by_root_requests, blobs_by_root_requests, data_columns_by_root_requests, - blocks_by_range_requests, - blobs_by_range_requests, - data_columns_by_range_requests, // custody_by_root_requests is a meta request of data_columns_by_root_requests custody_by_root_requests: _, - custody_by_range_requests: _, // components_by_range_requests is a meta request of various _by_range requests - block_components_by_range_requests: _, + block_components_by_root_requests: _, execution_engine_state: _, network_beacon_processor: _, chain: _, @@ -474,9 +431,6 @@ impl SyncNetworkContext { .iter_request_peers() .chain(blobs_by_root_requests.iter_request_peers()) .chain(data_columns_by_root_requests.iter_request_peers()) - .chain(blocks_by_range_requests.iter_request_peers()) - .chain(blobs_by_range_requests.iter_request_peers()) - .chain(data_columns_by_range_requests.iter_request_peers()) { *active_request_count_by_peer.entry(peer_id).or_default() += 1; } @@ -492,83 +446,19 @@ impl SyncNetworkContext { peers: Arc>>, peers_to_deprioritize: &HashSet, ) -> Result { - let id = ComponentsByRangeRequestId { + let id = ComponentsByRootRequestId { id: self.next_id(), requester, }; let req = - BlockComponentsByRangeRequest::new(id, block_root, peers, peers_to_deprioritize, self)?; + BlockComponentsByRootRequest::new(id, block_root, peers, 
peers_to_deprioritize, self)?; - self.block_components_by_range_requests.insert(id, req); + self.block_components_by_root_requests.insert(id, req); Ok(id.id) } - /// Request block of `block_root` if necessary by checking: - /// - If the da_checker has a pending block from gossip or a previous request - /// - /// Returns false if no request was made, because the block is already imported - pub fn block_lookup_request( - &mut self, - parent_request_id: HeaderLookupId, - lookup_peers: &HashSet, - block_root: Hash256, - ) -> Result, RpcRequestSendError> { - let active_request_count_by_peer = self.active_request_count_by_peer(); - let Some(peer_id) = lookup_peers - .iter() - .map(|peer| { - ( - // Prefer peers with less overall requests - active_request_count_by_peer.get(peer).copied().unwrap_or(0), - // Random factor to break ties, otherwise the PeerID breaks ties - rand::random::(), - peer, - ) - }) - .min() - .map(|(_, _, peer)| *peer) - else { - // Allow lookup to not have any peers and do nothing. This is an optimization to not - // lose progress of lookups created from a block with unknown parent before we receive - // attestations for said block. - // Lookup sync event safety: If a lookup requires peers to make progress, and does - // not receive any new peers for some time it will be dropped. If it receives a new - // peer it must attempt to make progress. - return Ok(LookupRequestResult::Pending("no peers")); - }; - - let span = span!( - Level::INFO, - "SyncNetworkContext", - service = "network_context" - ); - let _enter = span.enter(); - - match self.chain.get_block_process_status(&block_root) { - // Unknown block, continue request to download - BlockProcessStatus::Unknown => {} - // Block is known are currently processing, expect a future event with the result of - // processing. - BlockProcessStatus::NotValidated { .. 
} => { - // Lookup sync event safety: If the block is currently in the processing cache, we - // are guaranteed to receive a `SyncMessage::GossipBlockProcessResult` that will - // make progress on this lookup - return Ok(LookupRequestResult::Pending("block in processing cache")); - } - // Block is fully validated. If it's not yet imported it's waiting for missing block - // components. Consider this request completed and do nothing. - BlockProcessStatus::ExecutionValidated { .. } => { - return Ok(LookupRequestResult::NoRequestNeeded( - "block execution validated", - )) - } - } - - todo!(); - } - /// Request to send a single `data_columns_by_root` request to the network. pub fn data_columns_by_root_request( &mut self, @@ -587,7 +477,7 @@ impl SyncNetworkContext { let id = DataColumnsByRootRequestId { id: self.next_id(), - requester, + parent_request_id: requester, }; let request = DataColumnsByRootRequest::new( @@ -629,7 +519,7 @@ impl SyncNetworkContext { /// requests. pub fn send_custody_by_root_request( &mut self, - parent_request_id: ComponentsByRangeRequestId, + parent_request_id: ComponentsByRootRequestId, block_root: Hash256, lookup_peers: Arc>>, ) -> Result { @@ -713,50 +603,12 @@ impl SyncNetworkContext { Ok(id) } - fn send_blocks_by_range_request( - &mut self, - peer_id: PeerId, - request: BlocksByRangeRequest, - parent_request_id: ComponentsByRangeRequestId, - ) -> Result { - let id = BlocksByRangeRequestId { - id: self.next_id(), - parent_request_id, - }; - self.network_send - .send(NetworkMessage::SendRequest { - peer_id, - request: RequestType::BlocksByRange(request.clone().into()), - app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRange(id)), - }) - .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; - - debug!( - method = "BlocksByRange", - slots = request.count(), - epoch = %Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()), - peer = %peer_id, - %id, - "Sync RPC request sent" - ); - - 
self.blocks_by_range_requests.insert( - id, - peer_id, - // false = do not enforce max_requests are returned for *_by_range methods. We don't - // know if there are missed blocks. - false, - BlocksByRangeRequestItems::new(request), - ); - Ok(id) - } - fn send_blobs_by_root_request( &mut self, peer_id: PeerId, block_root: Hash256, blobs_per_block: usize, - parent_request_id: ComponentsByRangeRequestId, + parent_request_id: ComponentsByRootRequestId, ) -> Result { let id = BlobsByRootRequestId { id: self.next_id(), @@ -784,7 +636,7 @@ impl SyncNetworkContext { .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; debug!( - method = "BlobsByRange", + method = "BlobsByRoot", peer = %peer_id, %id, "Sync RPC request sent" @@ -800,131 +652,6 @@ impl SyncNetworkContext { Ok(id) } - fn send_blobs_by_range_request( - &mut self, - peer_id: PeerId, - request: BlobsByRangeRequest, - parent_request_id: ComponentsByRangeRequestId, - ) -> Result { - let id = BlobsByRangeRequestId { - id: self.next_id(), - parent_request_id, - }; - let request_epoch = Slot::new(request.start_slot).epoch(T::EthSpec::slots_per_epoch()); - - // Create the blob request based on the blocks request. - self.network_send - .send(NetworkMessage::SendRequest { - peer_id, - request: RequestType::BlobsByRange(request.clone()), - app_request_id: AppRequestId::Sync(SyncRequestId::BlobsByRange(id)), - }) - .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; - - debug!( - method = "BlobsByRange", - slots = request.count, - epoch = %request_epoch, - peer = %peer_id, - %id, - "Sync RPC request sent" - ); - - let max_blobs_per_block = self.chain.spec.max_blobs_per_block(request_epoch); - self.blobs_by_range_requests.insert( - id, - peer_id, - // false = do not enforce max_requests are returned for *_by_range methods. We don't - // know if there are missed blocks. 
- false, - BlobsByRangeRequestItems::new(request, max_blobs_per_block), - ); - Ok(id) - } - - fn send_data_columns_by_range_request( - &mut self, - peer_id: PeerId, - request: DataColumnsByRangeRequest, - parent_request_id: CustodyByRangeRequestId, - ) -> Result { - let id = DataColumnsByRangeRequestId { - id: self.next_id(), - parent_request_id, - }; - - self.send_network_msg(NetworkMessage::SendRequest { - peer_id, - request: RequestType::DataColumnsByRange(request.clone()), - app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), - }) - .map_err(|_| "network send error")?; - - debug!( - method = "DataColumnsByRange", - slots = request.count, - epoch = %Slot::new(request.start_slot).epoch(T::EthSpec::slots_per_epoch()), - columns = ?request.columns, - peer = %peer_id, - %id, - "Sync RPC request sent" - ); - - self.data_columns_by_range_requests.insert( - id, - peer_id, - // false = do not enforce max_requests are returned for *_by_range methods. We don't - // know if there are missed blocks. - false, - DataColumnsByRangeRequestItems::new(request), - ); - Ok(id) - } - - /// Request to fetch all needed custody columns of a range of slot. This function may not send - /// any request to the network if no columns have to be fetched based on the import state of the - /// node. A custody request is a "super request" that may trigger 0 or more `data_columns_by_range` - /// requests. 
- pub fn send_custody_by_range_request( - &mut self, - parent_id: ComponentsByRangeRequestId, - blocks_with_data: Vec, - request: BlocksByRangeRequest, - column_indices: Vec, - lookup_peers: Arc>>, - ) -> Result { - let id = CustodyByRangeRequestId { - id: self.next_id(), - parent_request_id: parent_id, - }; - - debug!( - indices = ?column_indices, - %id, - "Starting custody columns by range request" - ); - - let mut request = ActiveCustodyByRangeRequest::new( - id, - request, - blocks_with_data, - &column_indices, - lookup_peers, - ); - - // Note that you can only send, but not handle a response here - match request.continue_requests(self) { - Ok(_) => { - // Ignoring the result of `continue_requests` is okay. A request that has just been - // created cannot return data immediately, it must send some request to the network - // first. And there must exist some request, `custody_indexes_to_fetch` is not empty. - self.custody_by_range_requests.insert(id, request); - Ok(id) - } - Err(e) => Err(e.into()), - } - } - pub fn is_execution_engine_online(&self) -> bool { self.execution_engine_state == EngineState::Online } @@ -1058,39 +785,12 @@ impl SyncNetworkContext { .collect() } - /// Attempt to make progress on all custody_by_range requests. Some request may be stale waiting - /// for custody peers. Returns a Vec of results as zero or more requests may fail in this - /// attempt. - pub fn continue_custody_by_range_requests( - &mut self, - ) -> Vec<(CustodyByRangeRequestId, CustodyRequestResult)> { - let ids = self - .custody_by_range_requests - .keys() - .copied() - .collect::>(); - - // Need to collect ids and results in separate steps to re-borrow self. 
- ids.into_iter() - .filter_map(|id| { - let mut request = self - .custody_by_range_requests - .remove(&id) - .expect("key of hashmap"); - let result = request - .continue_requests(self) - .map_err(Into::::into) - .transpose(); - self.handle_custody_by_range_result(id, request, result) - .map(|result| (id, result)) - }) - .collect() - } - // Request handlers - /// Processes a single `RpcEvent` blocks_by_root RPC request. - /// Same logic as [`on_blocks_by_range_response`] but it converts a `Vec` into a `Block` + /// Processes a single `RpcEvent` for a blocks_by_root RPC request. + /// - If the event completes the request, it returns `Some(Ok)` with a vec of blocks + /// - If the event is an error it fails the request and returns `Some(Err)` + /// - else it appends the response chunk to the active request state and returns `None` pub(crate) fn on_blocks_by_root_response( &mut self, id: BlocksByRootRequestId, @@ -1102,7 +802,7 @@ impl SyncNetworkContext { } /// Processes a single `RpcEvent` blobs_by_root RPC request. - /// Same logic as [`on_blocks_by_range_response`] + /// Same logic as [`on_blocks_by_root_response`] pub(crate) fn on_blobs_by_root_response( &mut self, id: BlobsByRootRequestId, @@ -1114,7 +814,7 @@ impl SyncNetworkContext { } /// Processes a single `RpcEvent` for a data_columns_by_root RPC request. - /// Same logic as [`on_blocks_by_range_response`] + /// Same logic as [`on_blocks_by_root_response`] #[allow(clippy::type_complexity)] pub(crate) fn on_data_columns_by_root_response( &mut self, @@ -1128,49 +828,6 @@ impl SyncNetworkContext { self.on_rpc_response_result(id, "DataColumnsByRoot", resp, peer_id, |_| 1) } - /// Processes a single `RpcEvent` for a blocks_by_range RPC request. 
- /// - If the event completes the request, it returns `Some(Ok)` with a vec of blocks - /// - If the event is an error it fails the request and returns `Some(Err)` - /// - else it appends the response chunk to the active request state and returns `None` - #[allow(clippy::type_complexity)] - pub(crate) fn on_blocks_by_range_response( - &mut self, - id: BlocksByRangeRequestId, - peer_id: PeerId, - rpc_event: RpcEvent>>, - ) -> Option>>>> { - let resp = self.blocks_by_range_requests.on_response(id, rpc_event); - self.on_rpc_response_result(id, "BlocksByRange", resp, peer_id, |b| b.len()) - } - - /// Processes a single `RpcEvent` for a blobs_by_range RPC request. - /// Same logic as [`on_blocks_by_range_response`] - #[allow(clippy::type_complexity)] - pub(crate) fn on_blobs_by_range_response( - &mut self, - id: BlobsByRangeRequestId, - peer_id: PeerId, - rpc_event: RpcEvent>>, - ) -> Option>>>> { - let resp = self.blobs_by_range_requests.on_response(id, rpc_event); - self.on_rpc_response_result(id, "BlobsByRangeRequest", resp, peer_id, |b| b.len()) - } - - /// Processes a single `RpcEvent` for a data_columns_by_range RPC request. - /// Same logic as [`on_blocks_by_range_response`] - #[allow(clippy::type_complexity)] - pub(crate) fn on_data_columns_by_range_response( - &mut self, - id: DataColumnsByRangeRequestId, - peer_id: PeerId, - rpc_event: RpcEvent>>, - ) -> Option>> { - let resp = self - .data_columns_by_range_requests - .on_response(id, rpc_event); - self.on_rpc_response_result(id, "DataColumnsByRange", resp, peer_id, |d| d.len()) - } - /// Common logic for `on_*_response` handlers. Ensures we have consistent logging and metrics /// and peer reporting for all request types. fn on_rpc_response_result usize>( @@ -1273,63 +930,6 @@ impl SyncNetworkContext { result } - /// Insert a downloaded column into an active custody request. Then make progress on the - /// entire request. 
- /// - /// ### Returns - /// - /// - `Some`: Request completed, won't make more progress. Expect requester to act on the result. - /// - `None`: Request still active, requester should do no action - #[allow(clippy::type_complexity)] - pub fn on_custody_by_range_response( - &mut self, - req_id: DataColumnsByRangeRequestId, - peer_id: PeerId, - resp: RpcResponseResult>, - ) -> Option> { - let custody_by_range_id = req_id.parent_request_id; - - // Note: need to remove the request to borrow self again below. Otherwise we can't - // do nested requests - let Some(mut request) = self.custody_by_range_requests.remove(&custody_by_range_id) else { - metrics::inc_counter_vec( - &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, - &["custody_by_range"], - ); - return None; - }; - - let result = request - .on_data_column_downloaded(peer_id, req_id, resp, self) - .map_err(Into::::into) - .transpose(); - - self.handle_custody_by_range_result(custody_by_range_id, request, result) - } - - fn handle_custody_by_range_result( - &mut self, - id: CustodyByRangeRequestId, - request: ActiveCustodyByRangeRequest, - result: Option>, - ) -> Option> { - match &result { - Some(Ok((columns, _peer_group, _))) => { - // Don't log the peer_group here, it's very long (could be up to 128 peers). If you - // want to trace which peer sent the column at index X, search for the log: - // `Sync RPC request sent method="DataColumnsByRange" ...` - debug!(%id, count = columns.len(), "Custody by range request success, removing") - } - Some(Err(e)) => { - debug!(%id, error = ?e, "Custody by range request failure, removing") - } - None => { - self.custody_by_range_requests.insert(id, request); - } - } - result - } - /// Processes the result of an `*_by_range` RPC request issued by a /// block_components_by_range_request. /// @@ -1338,14 +938,14 @@ impl SyncNetworkContext { /// not fail the block_components_by_range_request as it implements retries. 
/// - else it appends the result to the active request state and returns `None` #[allow(clippy::type_complexity)] - pub fn on_block_components_by_range_response( + pub fn on_block_components_by_root_response( &mut self, - id: ComponentsByRangeRequestId, + id: ComponentsByRootRequestId, range_block_component: RangeBlockComponent, ) -> Option, BatchPeers), RpcResponseError>> { // Note: need to remove the request to borrow self again below. Otherwise we can't // do nested requests - let Some(mut request) = self.block_components_by_range_requests.remove(&id) else { + let Some(mut request) = self.block_components_by_root_requests.remove(&id) else { metrics::inc_counter_vec( &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, &["block_components_by_range"], @@ -1383,7 +983,7 @@ impl SyncNetworkContext { Some(Ok((block, peer_group))) => { // Don't log the peer_group here, it's very long (could be up to 128 peers). If you // want to trace which peer sent the column at index X, search for the log: - // `Sync RPC request sent method="DataColumnsByRange" ...` + // `Sync RPC request sent method="DataColumnsByRoot" ...` debug!( %id, slot = %block.as_block().slot(), @@ -1396,7 +996,7 @@ impl SyncNetworkContext { debug!(%id, error = ?e, "Block components by range request failure, removing" ) } None => { - self.block_components_by_range_requests.insert(id, request); + self.block_components_by_root_requests.insert(id, request); } } result @@ -1410,16 +1010,10 @@ impl SyncNetworkContext { "data_columns_by_root", self.data_columns_by_root_requests.len(), ), - ("blocks_by_range", self.blocks_by_range_requests.len()), - ("blobs_by_range", self.blobs_by_range_requests.len()), - ( - "data_columns_by_range", - self.data_columns_by_range_requests.len(), - ), ("custody_by_root", self.custody_by_root_requests.len()), ( - "block_components_by_range", - self.block_components_by_range_requests.len(), + "block_components_by_root", + self.block_components_by_root_requests.len(), ), ] { 
metrics::set_gauge_vec(&metrics::SYNC_ACTIVE_NETWORK_REQUESTS, &[id], count as i64); diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index dd4901d82e9..69a09934273 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -6,12 +6,10 @@ use crate::sync::range_sync::BatchPeers; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::data_column_verification::CustodyDataColumn; use beacon_chain::{get_block_root, BeaconChainTypes}; -use lighthouse_network::rpc::methods::{ - BlobsByRangeRequest, BlocksByRangeRequest, BlocksByRootRequest, -}; +use lighthouse_network::rpc::methods::BlocksByRootRequest; use lighthouse_network::service::api_types::{ BlobsByRangeRequestId, BlobsByRootRequestId, BlocksByRootRequestId, BlocksByRootRequester, - ComponentsByRangeRequestId, CustodyByRangeRequestId, CustodyByRootRequestId, + ComponentsByRootRequestId, CustodyByRangeRequestId, CustodyByRootRequestId, }; use lighthouse_network::PeerId; use parking_lot::RwLock; @@ -22,12 +20,12 @@ use types::{ Hash256, RuntimeVariableList, SignedBeaconBlock, Slot, }; -/// Given a `BlocksByRangeRequest` (a range of slots) fetches all necessary data to return -/// potentially available RpcBlocks. +/// Given a `BlocksByRootRequest` (a collection of block roots) fetches all necessary data to +/// return potentially available RpcBlocks. /// -/// See [`State`] for the set of `*_by_range` it may issue depending on the fork. -pub struct BlockComponentsByRangeRequest { - id: ComponentsByRangeRequestId, +/// See [`State`] for the set of `*_by_root` it may issue depending on the fork. 
+pub struct BlockComponentsByRootRequest { + id: ComponentsByRootRequestId, peers: Arc>>, block_root: Hash256, state: State, @@ -63,7 +61,7 @@ enum Request { Complete(T, P), } -pub type BlockComponentsByRangeRequestResult = Result, BatchPeers)>, Error>; +pub type BlockComponentsByRootRequestResult = Result, BatchPeers)>, Error>; pub enum Error { InternalError(String), @@ -88,14 +86,14 @@ impl From for RpcRequestSendError { /// Used to typesafe assertions of state in range sync tests #[cfg(test)] #[derive(Debug)] -pub enum BlockComponentsByRangeRequestStep { +pub enum BlockComponentsByRootRequestStep { BlocksRequest, CustodyRequest, } -impl BlockComponentsByRangeRequest { +impl BlockComponentsByRootRequest { pub fn new( - id: ComponentsByRangeRequestId, + id: ComponentsByRootRequestId, block_root: Hash256, peers: Arc>>, peers_to_deprioritize: &HashSet, @@ -145,7 +143,7 @@ impl BlockComponentsByRangeRequest { pub fn continue_requests( &mut self, cx: &mut SyncNetworkContext, - ) -> BlockComponentsByRangeRequestResult { + ) -> BlockComponentsByRootRequestResult { match &mut self.state { State::BlocksRequest { blocks_request: blocks_by_range_request, @@ -276,7 +274,7 @@ impl BlockComponentsByRangeRequest { data: Arc>, peer_id: PeerId, cx: &mut SyncNetworkContext, - ) -> BlockComponentsByRangeRequestResult { + ) -> BlockComponentsByRootRequestResult { match &mut self.state { State::BlocksRequest { blocks_request } => { blocks_request.finish(id, data, peer_id)?; @@ -297,7 +295,7 @@ impl BlockComponentsByRangeRequest { data: Vec>>, peer_id: PeerId, cx: &mut SyncNetworkContext, - ) -> BlockComponentsByRangeRequestResult { + ) -> BlockComponentsByRootRequestResult { match &mut self.state { State::DataRequest { data_request: DataRequest::Deneb { blobs_request }, @@ -321,7 +319,7 @@ impl BlockComponentsByRangeRequest { data: DataColumnSidecarList, peers: PeerGroup, cx: &mut SyncNetworkContext, - ) -> BlockComponentsByRangeRequestResult { + ) -> 
BlockComponentsByRootRequestResult { match &mut self.state { State::DataRequest { data_request: DataRequest::Fulu { custody_request }, @@ -340,10 +338,10 @@ impl BlockComponentsByRangeRequest { } #[cfg(test)] - pub fn state_step(&self) -> BlockComponentsByRangeRequestStep { + pub fn state_step(&self) -> BlockComponentsByRootRequestStep { match &self.state { - State::BlocksRequest { .. } => BlockComponentsByRangeRequestStep::BlocksRequest, - State::DataRequest { .. } => BlockComponentsByRangeRequestStep::CustodyRequest, + State::BlocksRequest { .. } => BlockComponentsByRootRequestStep::BlocksRequest, + State::DataRequest { .. } => BlockComponentsByRootRequestStep::CustodyRequest, } } } diff --git a/beacon_node/network/src/sync/network_context/custody_by_range.rs b/beacon_node/network/src/sync/network_context/custody_by_range.rs deleted file mode 100644 index ed796155e26..00000000000 --- a/beacon_node/network/src/sync/network_context/custody_by_range.rs +++ /dev/null @@ -1,429 +0,0 @@ -use super::custody_by_root::{ColumnRequest, Error}; -use beacon_chain::validator_monitor::timestamp_now; -use beacon_chain::BeaconChainTypes; -use fnv::FnvHashMap; -use lighthouse_network::rpc::{methods::DataColumnsByRangeRequest, BlocksByRangeRequest}; -use lighthouse_network::service::api_types::{ - CustodyByRangeRequestId, DataColumnsByRangeRequestId, -}; -use lighthouse_network::{PeerAction, PeerId}; -use lru_cache::LRUTimeCache; -use parking_lot::RwLock; -use rand::Rng; -use std::collections::HashSet; -use std::time::{Duration, Instant}; -use std::{collections::HashMap, marker::PhantomData, sync::Arc}; -use tracing::{debug, warn}; -use types::{ - data_column_sidecar::ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Hash256, - SignedBeaconBlockHeader, Slot, -}; - -use super::{PeerGroup, RpcResponseResult, SyncNetworkContext}; - -const FAILED_PEERS_EXPIRY_SECONDS: u64 = 15; -const REQUEST_EXPIRY_SECONDS: u64 = 300; - -pub struct ActiveCustodyByRangeRequest { - start_time: 
Instant, - id: CustodyByRangeRequestId, - request: BlocksByRangeRequest, - /// Blocks that we expect peers to serve data columns for - blocks_with_data: Vec, - /// List of column indices this request needs to download to complete successfully - column_requests: FnvHashMap< - ColumnIndex, - ColumnRequest>, - >, - /// Active requests for 1 or more columns each - active_batch_columns_requests: - FnvHashMap, - /// Peers that have recently failed to successfully respond to a columns by root request. - /// Having a LRUTimeCache allows this request to not have to track disconnecting peers. - failed_peers: LRUTimeCache, - /// Set of peers that claim to have imported this block and their custody columns - lookup_peers: Arc>>, - - _phantom: PhantomData, -} - -struct ActiveBatchColumnsRequest { - indices: Vec, -} - -pub type CustodyByRangeRequestResult = - Result, PeerGroup, Duration)>, Error>; - -enum ColumnResponseError { - NonMatchingColumn { - slot: Slot, - actual_block_root: Hash256, - expected_block_root: Hash256, - }, - MissingColumn(Slot), -} - -impl ActiveCustodyByRangeRequest { - pub(crate) fn new( - id: CustodyByRangeRequestId, - request: BlocksByRangeRequest, - blocks_with_data: Vec, - column_indices: &[ColumnIndex], - lookup_peers: Arc>>, - ) -> Self { - Self { - start_time: Instant::now(), - id, - request, - blocks_with_data, - column_requests: HashMap::from_iter( - column_indices - .iter() - .map(|index| (*index, ColumnRequest::new())), - ), - active_batch_columns_requests: <_>::default(), - failed_peers: LRUTimeCache::new(Duration::from_secs(FAILED_PEERS_EXPIRY_SECONDS)), - lookup_peers, - _phantom: PhantomData, - } - } - - /// Insert a downloaded column into an active custody request. Then make progress on the - /// entire request. 
- /// - /// ### Returns - /// - /// - `Err`: Custody request has failed and will be dropped - /// - `Ok(Some)`: Custody request has successfully completed and will be dropped - /// - `Ok(None)`: Custody request still active - pub(crate) fn on_data_column_downloaded( - &mut self, - peer_id: PeerId, - req_id: DataColumnsByRangeRequestId, - resp: RpcResponseResult>, - cx: &mut SyncNetworkContext, - ) -> CustodyByRangeRequestResult { - let Some(batch_request) = self.active_batch_columns_requests.get_mut(&req_id) else { - warn!( - id = %self.id, - %req_id, - "Received custody by range response for unrequested index" - ); - return Ok(None); - }; - - match resp { - Ok((data_columns, seen_timestamp)) => { - // Map columns by index as an optimization to not loop the returned list on each - // requested index. The worse case is 128 loops over a 128 item vec + mutation to - // drop the consumed columns. - let mut data_columns_by_index = - HashMap::<(ColumnIndex, Slot), Arc>>::new(); - for data_column in data_columns { - data_columns_by_index - .insert((data_column.index, data_column.slot()), data_column); - } - - // Accumulate columns that the peer does not have to issue a single log per request - let mut missing_column_indices = vec![]; - let mut incorrect_column_indices = vec![]; - let mut imported_column_indices = vec![]; - - for index in &batch_request.indices { - let column_request = - self.column_requests - .get_mut(index) - .ok_or(Error::InternalError(format!( - "unknown column_index {index}" - )))?; - - let columns_at_index = self - .blocks_with_data - .iter() - .map(|block| { - let slot = block.message.slot; - if let Some(data_column) = data_columns_by_index.remove(&(*index, slot)) - { - let actual_block_root = - data_column.signed_block_header.message.canonical_root(); - let expected_block_root = block.message.canonical_root(); - if actual_block_root != expected_block_root { - Err(ColumnResponseError::NonMatchingColumn { - slot, - actual_block_root: data_column - 
.signed_block_header - .message - .canonical_root(), - expected_block_root: block.message.canonical_root(), - }) - } else { - Ok(data_column) - } - } else { - // The following three statements are true: - // - block at `slot` is not missed, and has data - // - peer custodies this column `index` - // - peer claims to be synced to at least `slot` - // - // Then we penalize the faulty peer, mark it as failed and try with - // another. - Err(ColumnResponseError::MissingColumn(slot)) - } - }) - .collect::, _>>(); - - match columns_at_index { - Ok(columns_at_index) => { - column_request.on_download_success( - req_id, - peer_id, - columns_at_index, - seen_timestamp, - )?; - - imported_column_indices.push(index); - } - Err(e) => { - column_request.on_download_error(req_id)?; - - match e { - ColumnResponseError::NonMatchingColumn { - slot, - actual_block_root, - expected_block_root, - } => { - incorrect_column_indices.push(( - index, - slot, - actual_block_root, - expected_block_root, - )); - } - ColumnResponseError::MissingColumn(slot) => { - missing_column_indices.push((index, slot)); - } - } - } - } - } - - // Log `imported_column_indices`, `missing_column_indexes` and - // `incorrect_column_indices` once per request to make the logs less noisy. - if !imported_column_indices.is_empty() { - // TODO(das): this log may be redundant. 
We already log on DataColumnsByRange - // completed, and on DataColumnsByRange sent we log the column indices - // ``` - // Sync RPC request sent method="DataColumnsByRange" slots=8 epoch=4 columns=[52] peer=16Uiu2HAmEooeoHzHDYS35TSHrJDSfmREecPyFskrLPYm9Gm1EURj id=493/399/10/RangeSync/4/1 - // Sync RPC request completed id=493/399/10/RangeSync/4/1 method="DataColumnsByRange" count=1 - // ``` - // Which can be traced to this custody by range request, and the initial log - debug!( - id = %self.id, - data_columns_by_range_req_id = %req_id, - %peer_id, - count = imported_column_indices.len(), - "Custody by range request download imported columns" - ); - } - - if !incorrect_column_indices.is_empty() { - debug!( - id = %self.id, - data_columns_by_range_req_id = %req_id, - %peer_id, - ?incorrect_column_indices, - "Custody by range peer returned non-matching columns" - ); - - // Returning a non-canonical column is not a permanent fault. We should not - // retry the peer for some time but the peer may return a canonical column in - // the future. - self.failed_peers.insert(peer_id); - cx.report_peer( - peer_id, - PeerAction::MidToleranceError, - "non-matching data column", - ); - } - - if !missing_column_indices.is_empty() { - debug!( - id = %self.id, - data_columns_by_range_req_id = %req_id, - %peer_id, - ?missing_column_indices, - "Custody by range peer claims to not have some data" - ); - - // Not having columns is not a permanent fault. The peer may be backfilling. - self.failed_peers.insert(peer_id); - cx.report_peer(peer_id, PeerAction::MidToleranceError, "custody_failure"); - } - } - Err(err) => { - debug!( - id = %self.id, - %req_id, - %peer_id, - error = ?err, - "Custody by range download error" - ); - - for column_index in &batch_request.indices { - self.column_requests - .get_mut(column_index) - .ok_or(Error::InternalError("unknown column_index".to_owned()))? 
- .on_download_error_and_mark_failure(req_id, err.clone())?; - } - - // An RpcResponseError is already downscored in network_context - self.failed_peers.insert(peer_id); - } - }; - - self.continue_requests(cx) - } - - pub(crate) fn continue_requests( - &mut self, - cx: &mut SyncNetworkContext, - ) -> CustodyByRangeRequestResult { - if self.column_requests.values().all(|r| r.is_downloaded()) { - // All requests have completed successfully. - let mut peers = HashMap::>::new(); - let mut seen_timestamps = vec![]; - let columns = std::mem::take(&mut self.column_requests) - .into_values() - .map(|request| { - let (peer, data_columns, seen_timestamp) = request.complete()?; - - for data_column in &data_columns { - let columns_by_peer = peers.entry(peer).or_default(); - if !columns_by_peer.contains(&(data_column.index as usize)) { - columns_by_peer.push(data_column.index as usize); - } - } - - seen_timestamps.push(seen_timestamp); - - Ok(data_columns) - }) - .collect::, _>>()? - // Flatten Vec> to Vec - .into_iter() - .flatten() - .collect(); - - let peer_group = PeerGroup::from_set(peers); - let max_seen_timestamp = seen_timestamps.into_iter().max().unwrap_or(timestamp_now()); - return Ok(Some((columns, peer_group, max_seen_timestamp))); - } - - let active_request_count_by_peer = cx.active_request_count_by_peer(); - let mut columns_to_request_by_peer = HashMap::>::new(); - let lookup_peers = self.lookup_peers.read(); - - // Need to: - // - track how many active requests a peer has for load balancing - // - which peers have failures to attempt others - // - which peer returned what to have PeerGroup attributability - - for (column_index, request) in self.column_requests.iter_mut() { - if request.is_awaiting_download() { - if let Some(last_error) = request.too_many_failures() { - return Err(Error::TooManyDownloadErrors(last_error)); - } - - // TODO(das): We should only query peers that are likely to know about this block. 
- // For by_range requests, only peers in the SyncingChain peer set. Else consider a - // fallback to the peers that are synced up to the epoch we want to query. - let custodial_peers = cx.get_custodial_peers(*column_index); - - // We draw from the total set of peers, but prioritize those peers who we have - // received an attestation / status / block message claiming to have imported the - // lookup. The frequency of those messages is low, so drawing only from lookup_peers - // could cause many lookups to take much longer or fail as they don't have enough - // custody peers on a given column - let mut priorized_peers = custodial_peers - .iter() - .filter(|peer| { - // Do not request faulty peers for some time - !self.failed_peers.contains(peer) - }) - .map(|peer| { - ( - // Prioritize peers that claim to know have imported this block - if lookup_peers.contains(peer) { 0 } else { 1 }, - // Prefer peers with fewer requests to load balance across peers. - // We batch requests to the same peer, so count existence in the - // `columns_to_request_by_peer` as a single 1 request. - active_request_count_by_peer.get(peer).copied().unwrap_or(0) - + columns_to_request_by_peer.get(peer).map(|_| 1).unwrap_or(0), - // Random factor to break ties, otherwise the PeerID breaks ties - rand::thread_rng().gen::(), - *peer, - ) - }) - .collect::>(); - priorized_peers.sort_unstable(); - - if let Some((_, _, _, peer_id)) = priorized_peers.first() { - columns_to_request_by_peer - .entry(*peer_id) - .or_default() - .push(*column_index); - } else { - // Do not issue requests if there is no custody peer on this column. The request - // will sit idle without making progress. The only way to make to progress is: - // - Add a new peer that custodies the missing columns - // - Call `continue_requests` - // - // Otherwise this request will be dropped and failed after some time. 
- } - } - } - - for (peer_id, indices) in columns_to_request_by_peer.into_iter() { - let req_id = cx - .send_data_columns_by_range_request( - peer_id, - DataColumnsByRangeRequest { - start_slot: *self.request.start_slot(), - count: *self.request.count(), - columns: indices.clone(), - }, - self.id, - ) - .map_err(|e| Error::InternalError(format!("send failed {e}")))?; - - for column_index in &indices { - let column_request = self - .column_requests - .get_mut(column_index) - // Should never happen: column_index is iterated from column_requests - .ok_or(Error::InternalError(format!( - "Unknown column_request {column_index}" - )))?; - - column_request.on_download_start(req_id)?; - } - - self.active_batch_columns_requests - .insert(req_id, ActiveBatchColumnsRequest { indices }); - } - - if self.start_time.elapsed() > Duration::from_secs(REQUEST_EXPIRY_SECONDS) - && !self.column_requests.values().any(|r| r.is_downloading()) - { - let awaiting_peers_indicies = self - .column_requests - .iter() - .filter(|(_, r)| r.is_awaiting_download()) - .map(|(id, _)| *id) - .collect::>(); - return Err(Error::ExpiredNoCustodyPeers(awaiting_peers_indicies)); - } - - Ok(None) - } -} diff --git a/beacon_node/network/src/sync/network_context/requests.rs b/beacon_node/network/src/sync/network_context/requests.rs index 574f4e32458..505c65c2f92 100644 --- a/beacon_node/network/src/sync/network_context/requests.rs +++ b/beacon_node/network/src/sync/network_context/requests.rs @@ -17,11 +17,8 @@ use crate::metrics; use super::{RpcEvent, RpcResponseResult}; -mod blobs_by_range; mod blobs_by_root; -mod blocks_by_range; mod blocks_by_root; -mod data_columns_by_range; mod data_columns_by_root; #[derive(Debug, Clone, PartialEq, Eq, IntoStaticStr)] diff --git a/beacon_node/network/src/sync/network_context/requests/blobs_by_range.rs b/beacon_node/network/src/sync/network_context/requests/blobs_by_range.rs deleted file mode 100644 index 8a9a8c9813c..00000000000 --- 
a/beacon_node/network/src/sync/network_context/requests/blobs_by_range.rs +++ /dev/null @@ -1,61 +0,0 @@ -use super::{ActiveRequestItems, LookupVerifyError}; -use lighthouse_network::rpc::methods::BlobsByRangeRequest; -use std::sync::Arc; -use types::{BlobSidecar, EthSpec, Slot}; - -/// Accumulates results of a blobs_by_range request. Only returns items after receiving the -/// stream termination. -pub struct BlobsByRangeRequestItems { - request: BlobsByRangeRequest, - items: Vec>>, - max_blobs_per_block: u64, -} - -impl BlobsByRangeRequestItems { - pub fn new(request: BlobsByRangeRequest, max_blobs_per_block: u64) -> Self { - Self { - request, - items: vec![], - max_blobs_per_block, - } - } -} - -impl ActiveRequestItems for BlobsByRangeRequestItems { - type Item = Arc>; - - fn add(&mut self, blob: Self::Item) -> Result { - let start_slot = Slot::new(self.request.start_slot); - let end_slot = start_slot + Slot::new(self.request.count); - - if blob.slot() < start_slot || blob.slot() >= end_slot { - return Err(LookupVerifyError::UnrequestedSlot { - slot: blob.slot(), - start_slot, - end_slot, - }); - } - if blob.index >= self.max_blobs_per_block { - return Err(LookupVerifyError::UnrequestedIndex(blob.index)); - } - if !blob.verify_blob_sidecar_inclusion_proof() { - return Err(LookupVerifyError::InvalidInclusionProof); - } - if self - .items - .iter() - .any(|existing| existing.slot() == blob.slot() && existing.index == blob.index) - { - return Err(LookupVerifyError::DuplicatedData(blob.slot(), blob.index)); - } - - self.items.push(blob); - - // Skip check if blobs are ready as it's rare that all blocks have max blobs - Ok(false) - } - - fn consume(&mut self) -> Vec { - std::mem::take(&mut self.items) - } -} diff --git a/beacon_node/network/src/sync/network_context/requests/blocks_by_range.rs b/beacon_node/network/src/sync/network_context/requests/blocks_by_range.rs deleted file mode 100644 index ae39ac1d766..00000000000 --- 
a/beacon_node/network/src/sync/network_context/requests/blocks_by_range.rs +++ /dev/null @@ -1,53 +0,0 @@ -use super::{ActiveRequestItems, LookupVerifyError}; -use lighthouse_network::rpc::BlocksByRangeRequest; -use std::sync::Arc; -use types::{EthSpec, SignedBeaconBlock, Slot}; - -/// Accumulates results of a blocks_by_range request. Only returns items after receiving the -/// stream termination. -pub struct BlocksByRangeRequestItems { - request: BlocksByRangeRequest, - items: Vec>>, -} - -impl BlocksByRangeRequestItems { - pub fn new(request: BlocksByRangeRequest) -> Self { - Self { - request, - items: vec![], - } - } -} - -impl ActiveRequestItems for BlocksByRangeRequestItems { - type Item = Arc>; - - fn add(&mut self, block: Self::Item) -> Result { - let start_slot = Slot::new(*self.request.start_slot()); - let end_slot = start_slot + Slot::new(*self.request.count()); - - if block.slot() < start_slot || block.slot() >= end_slot { - return Err(LookupVerifyError::UnrequestedSlot { - slot: block.slot(), - start_slot, - end_slot, - }); - } - if self - .items - .iter() - .any(|existing| existing.slot() == block.slot()) - { - // DuplicatedData is a common error for all components, default index to 0 - return Err(LookupVerifyError::DuplicatedData(block.slot(), 0)); - } - - self.items.push(block); - - Ok(self.items.len() >= *self.request.count() as usize) - } - - fn consume(&mut self) -> Vec { - std::mem::take(&mut self.items) - } -} diff --git a/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs b/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs deleted file mode 100644 index 54ff0c1c735..00000000000 --- a/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs +++ /dev/null @@ -1,59 +0,0 @@ -use super::{ActiveRequestItems, LookupVerifyError}; -use lighthouse_network::rpc::methods::DataColumnsByRangeRequest; -use std::sync::Arc; -use types::{DataColumnSidecar, DataColumnSidecarList, 
EthSpec, Slot}; - -/// Accumulates results of a data_columns_by_range request. Only returns items after receiving the -/// stream termination. -pub struct DataColumnsByRangeRequestItems { - request: DataColumnsByRangeRequest, - items: DataColumnSidecarList, -} - -impl DataColumnsByRangeRequestItems { - pub fn new(request: DataColumnsByRangeRequest) -> Self { - Self { - request, - items: vec![], - } - } -} - -impl ActiveRequestItems for DataColumnsByRangeRequestItems { - type Item = Arc>; - - fn add(&mut self, data_column: Self::Item) -> Result { - let start_slot = Slot::new(self.request.start_slot); - let end_slot = start_slot + Slot::new(self.request.count); - - if data_column.slot() < start_slot || data_column.slot() >= end_slot { - return Err(LookupVerifyError::UnrequestedSlot { - slot: data_column.slot(), - start_slot, - end_slot, - }); - } - if !self.request.columns.contains(&data_column.index) { - return Err(LookupVerifyError::UnrequestedIndex(data_column.index)); - } - if !data_column.verify_inclusion_proof() { - return Err(LookupVerifyError::InvalidInclusionProof); - } - if self.items.iter().any(|existing| { - existing.slot() == data_column.slot() && existing.index == data_column.index - }) { - return Err(LookupVerifyError::DuplicatedData( - data_column.slot(), - data_column.index, - )); - } - - self.items.push(data_column); - - Ok(self.items.len() >= self.request.count as usize * self.request.columns.len()) - } - - fn consume(&mut self) -> Vec { - std::mem::take(&mut self.items) - } -} diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index bfc5ad34eeb..8f9801f203c 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -1,7 +1,4 @@ use crate::network_beacon_processor::NetworkBeaconProcessor; -use crate::sync::block_lookups::{ - BlockLookupSummary, PARENT_DEPTH_TOLERANCE, SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS, -}; use 
crate::sync::range_sync::BATCH_BUFFER_SIZE; use crate::sync::{ manager::{BlockProcessingResult, SyncManager}, @@ -14,7 +11,6 @@ use std::time::Duration; use super::*; -use crate::sync::block_lookups::common::ResponseType; use beacon_chain::observed_data_sidecars::Observe; use beacon_chain::{ blob_verification::GossipVerifiedBlob, @@ -50,7 +46,9 @@ use types::{ }; const D: Duration = Duration::new(0, 0); +const SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS: u8 = 5; const PARENT_FAIL_TOLERANCE: u8 = SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS; +const PARENT_DEPTH_TOLERANCE: usize = 32; const SAMPLING_REQUIRED_SUCCESSES: usize = 2; type DCByRootIds = Vec; type DCByRootId = (SyncRequestId, Vec); @@ -60,6 +58,14 @@ pub enum PeersConfig { SupernodeOnly, } +pub enum ResponseType { + Block, + Blob, + CustodyColumn, +} + +struct BlockLookupSummary {} + pub struct TestOptions { /// If the node created by this test harness is a supernode pub is_supernode: bool, @@ -261,19 +267,19 @@ impl TestRig { } fn active_single_lookups(&self) -> Vec { - self.sync_manager.active_single_lookups() + todo!(); } fn active_single_lookups_count(&self) -> usize { - self.sync_manager.active_single_lookups().len() + self.active_single_lookups().len() } fn active_parent_lookups(&self) -> Vec> { - self.sync_manager.active_parent_lookups() + todo!(); } fn active_parent_lookups_count(&self) -> usize { - self.sync_manager.active_parent_lookups().len() + self.active_single_lookups_count() } fn active_range_sync_chain(&mut self) -> (RangeSyncType, Slot, Slot) { @@ -284,8 +290,7 @@ impl TestRig { assert_eq!( self.active_single_lookups_count(), count, - "Unexpected count of single lookups. Current lookups: {:?}", - self.active_single_lookups() + "Unexpected count of single lookups. Current lookups: -", ); } @@ -314,66 +319,49 @@ impl TestRig { assert_eq!( self.active_parent_lookups_count(), count, - "Unexpected count of parent lookups. Parent lookups: {:?}. 
Current lookups: {:?}", - self.active_parent_lookups(), - self.active_single_lookups() + "Unexpected count of parent lookups. Parent lookups: -. Current lookups: -", ); } fn assert_lookup_is_active(&self, block_root: Hash256) { - let lookups = self.sync_manager.active_single_lookups(); - if !lookups.iter().any(|l| l.1 == block_root) { - panic!("Expected lookup {block_root} to be the only active: {lookups:?}"); - } + todo!(); } fn assert_lookup_peers(&self, block_root: Hash256, mut expected_peers: Vec) { - let mut lookup = self - .sync_manager - .active_single_lookups() - .into_iter() - .find(|l| l.1 == block_root) - .unwrap_or_else(|| panic!("no lookup for {block_root}")); - lookup.3.sort(); - expected_peers.sort(); - assert_eq!( - lookup.3, expected_peers, - "unexpected peers on lookup {block_root}" - ); + todo!(); } fn insert_failed_chain(&mut self, block_root: Hash256) { - self.sync_manager.insert_failed_chain(block_root); + todo!(); } fn assert_not_failed_chain(&mut self, chain_hash: Hash256) { - let failed_chains = self.sync_manager.get_failed_chains(); + let failed_chains = self.get_failed_chains(); if failed_chains.contains(&chain_hash) { panic!("failed chains contain {chain_hash:?}: {failed_chains:?}"); } } + fn get_failed_chains(&mut self) -> Vec { + todo!(); + } + fn assert_failed_chain(&mut self, chain_hash: Hash256) { - let failed_chains = self.sync_manager.get_failed_chains(); + let failed_chains = self.get_failed_chains(); if !failed_chains.contains(&chain_hash) { panic!("expected failed chains to contain {chain_hash:?}: {failed_chains:?}"); } } fn find_single_lookup_for(&self, block_root: Hash256) -> Id { - self.active_single_lookups() - .iter() - .find(|l| l.1 == block_root) - .unwrap_or_else(|| panic!("no single block lookup found for {block_root}")) - .0 + todo!(); } #[track_caller] fn expect_no_active_single_lookups(&self) { assert!( self.active_single_lookups().is_empty(), - "expect no single block lookups: {:?}", - self.active_single_lookups() 
+ "expect no single block lookups", ); } @@ -478,9 +466,7 @@ impl TestRig { .find(|chain| chain.first() == Some(&chain_hash)) .unwrap_or_else(|| { panic!( - "No parent chain with chain_hash {chain_hash:?}: Parent lookups {:?} Single lookups {:?}", - self.active_parent_lookups(), - self.active_single_lookups(), + "No parent chain with chain_hash {chain_hash:?}: Parent lookups - Single lookups -", ) }); *parent_chain.last().unwrap() @@ -738,7 +724,7 @@ impl TestRig { let block_root = first_dc.block_root(); let sampling_request_id = match id.0 { SyncRequestId::DataColumnsByRoot(DataColumnsByRootRequestId { - requester: DataColumnsByRootRequester::Sampling(sampling_id), + parent_request_id: DataColumnsByRootRequester::Sampling(sampling_id), .. }) => sampling_id.sampling_request_id, _ => unreachable!(), @@ -765,7 +751,7 @@ impl TestRig { _missing_components: bool, ) { let _lookup_id = if let SyncRequestId::DataColumnsByRoot(DataColumnsByRootRequestId { - requester: DataColumnsByRootRequester::Custody(id), + parent_request_id: DataColumnsByRootRequester::Custody(id), .. 
}) = ids.first().unwrap().0 { diff --git a/beacon_node/network/src/sync/tests/mod.rs b/beacon_node/network/src/sync/tests/mod.rs index 804be212ef9..1113d1e554a 100644 --- a/beacon_node/network/src/sync/tests/mod.rs +++ b/beacon_node/network/src/sync/tests/mod.rs @@ -6,7 +6,7 @@ use beacon_chain::builder::Witness; use beacon_chain::eth1_chain::CachingEth1Backend; use beacon_chain::test_utils::{BeaconChainHarness, EphemeralHarnessType}; use beacon_processor::WorkEvent; -use lighthouse_network::service::api_types::ComponentsByRangeRequestId; +use lighthouse_network::service::api_types::ComponentsByRootRequestId; use lighthouse_network::NetworkGlobals; pub use lookups::PeersConfig; use rand_chacha::ChaCha20Rng; @@ -74,7 +74,7 @@ struct TestRig { spec: Arc, // Cache of sent blocks for PeerDAS responses - sent_blocks_by_range: HashMap>>>, + sent_blocks_by_range: HashMap>>>, blocks_by_root: HashMap>>, } diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index f945a5fb252..72913148952 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -2,7 +2,7 @@ use super::*; use crate::network_beacon_processor::ChainSegmentProcessId; use crate::status::ToStatusMessage; use crate::sync::manager::SLOT_IMPORT_TOLERANCE; -use crate::sync::network_context::{BlockComponentsByRangeRequestStep, RangeRequestId}; +use crate::sync::network_context::{BlockComponentsByRootRequestStep, RangeRequestId}; use crate::sync::range_sync::{BatchId, BatchState, RangeSyncType}; use crate::sync::tests::lookups::TestOptions; use crate::sync::BatchProcessResult; @@ -14,14 +14,10 @@ use beacon_chain::{ PayloadVerificationStatus, }; use beacon_processor::WorkType; -use lighthouse_network::rpc::methods::{ - BlobsByRangeRequest, BlocksByRootRequest, DataColumnsByRangeRequest, DataColumnsByRootRequest, - OldBlocksByRangeRequest, -}; +use lighthouse_network::rpc::methods::{BlocksByRootRequest, 
DataColumnsByRootRequest}; use lighthouse_network::rpc::{RequestType, StatusMessage}; use lighthouse_network::service::api_types::{ - AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, BlocksByRootRequestId, - BlocksByRootRequester, ComponentsByRangeRequestId, DataColumnsByRangeRequestId, + AppRequestId, BlocksByRootRequestId, BlocksByRootRequester, ComponentsByRootRequestId, DataColumnsByRootRequestId, HeaderLookupId, SyncRequestId, }; use lighthouse_network::types::SyncState; @@ -41,19 +37,13 @@ pub(crate) enum DataSidecars { DataColumns(Vec>), } -enum ByRangeDataRequestIds { +enum ByRootDataRequestIds { PreDeneb, - PrePeerDAS(BlobsByRangeRequestId, PeerId, BlobsByRangeRequest), - PostPeerDAS( - Vec<( - DataColumnsByRangeRequestId, - PeerId, - DataColumnsByRangeRequest, - )>, - ), + PrePeerDAS(BlobsByRootRequestId, PeerId, BlobsByRootRequest), + PostPeerDAS(Vec<(DataColumnsByRootRequestId, PeerId, DataColumnsByRootRequest)>), } -impl ByRangeDataRequestIds { +impl ByRootDataRequestIds { /// If there's a single active request, returns its peer, else panics fn peer(&self) -> PeerId { match self { @@ -73,16 +63,8 @@ struct Config { peers: PeersConfig, } -type BlocksByRangeRequestData = (BlocksByRangeRequestId, PeerId, OldBlocksByRangeRequest); - type BlocksByRootRequestData = (BlocksByRootRequestId, PeerId, BlocksByRootRequest); -type DataColumnsByRangeRequestData = ( - DataColumnsByRangeRequestId, - PeerId, - DataColumnsByRangeRequest, -); - type DataColumnsByRootRequestData = (DataColumnsByRootRequestId, PeerId, DataColumnsByRootRequest); /// Sync tests are usually written in the form: @@ -122,20 +104,6 @@ impl RequestFilter { self } - fn blocks_by_range_requests( - &self, - ev: &NetworkMessage, - ) -> Option { - match ev { - NetworkMessage::SendRequest { - peer_id, - request: RequestType::BlocksByRange(req), - app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRange(id)), - } if self.matches_blocks_by_range(peer_id, req) => Some((*id, *peer_id, 
req.clone())), - _ => None, - } - } - fn blocks_by_root_requests( &self, ev: &NetworkMessage, @@ -150,22 +118,6 @@ impl RequestFilter { } } - fn data_columns_by_range_requests( - &self, - ev: &NetworkMessage, - ) -> Option { - match ev { - NetworkMessage::SendRequest { - peer_id, - request: RequestType::DataColumnsByRange(req), - app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), - } if self.matches_data_columns_by_range(peer_id, req) => { - Some((*id, *peer_id, req.clone())) - } - _ => None, - } - } - fn data_columns_by_root_requests( &self, ev: &NetworkMessage, @@ -186,27 +138,6 @@ impl RequestFilter { self.matches_peer(peer) } - fn matches_blocks_by_range(&self, peer: &PeerId, req: &OldBlocksByRangeRequest) -> bool { - self.matches_common(peer, *req.start_slot()) - } - - fn matches_blobs_by_range(&self, peer: &PeerId, req: &BlobsByRangeRequest) -> bool { - self.matches_common(peer, req.start_slot) - } - - fn matches_data_columns_by_range( - &self, - peer: &PeerId, - req: &DataColumnsByRangeRequest, - ) -> bool { - if let Some(index) = self.column_index { - if !req.columns.contains(&index) { - return false; - } - } - self.matches_common(peer, req.start_slot) - } - fn matches_data_columns_by_root(&self, peer: &PeerId, req: &DataColumnsByRootRequest) -> bool { if let Some(index) = self.column_index { if !req @@ -420,7 +351,7 @@ impl TestRig { panic!("No active block_components_by_range requests"); } for (id, step) in requests { - if !matches!(step, BlockComponentsByRangeRequestStep::CustodyRequest) { + if !matches!(step, BlockComponentsByRootRequestStep::CustodyRequest) { panic!("block_components_by_range request {id} is not on CustodyRequest step: {step:?}"); } } @@ -557,47 +488,12 @@ impl TestRig { fn last_sent_blocks_by_range( &mut self, - id: ComponentsByRangeRequestId, + id: ComponentsByRootRequestId, ) -> Vec>> { self.sent_blocks_by_range .get(&id) .cloned() - .unwrap_or_else(|| panic!("No blocks for ComponentsByRangeRequestId {id}")) 
- } - - fn send_blocks_by_range_response( - &mut self, - req_id: BlocksByRangeRequestId, - peer_id: PeerId, - blocks: &[Arc>], - ) { - let slots = blocks.iter().map(|block| block.slot()).collect::>(); - self.log(&format!( - "Completing BlocksByRange request {req_id} to {peer_id} with blocks {slots:?}" - )); - - for block in blocks { - self.send_sync_message(SyncMessage::RpcBlock { - sync_request_id: SyncRequestId::BlocksByRange(req_id), - peer_id, - beacon_block: Some(block.clone()), - seen_timestamp: D, - }); - } - self.send_sync_message(SyncMessage::RpcBlock { - sync_request_id: SyncRequestId::BlocksByRange(req_id), - peer_id, - beacon_block: None, - seen_timestamp: D, - }); - - if self - .sent_blocks_by_range - .insert(req_id.parent_request_id, blocks.to_vec()) - .is_some() - { - panic!("Sent two blocks_by_range requests in the same epoch. We need better tracking"); - } + .unwrap_or_else(|| panic!("No blocks for ComponentsByRootRequestId {id}")) } fn send_blocks_by_root_response( @@ -627,37 +523,6 @@ impl TestRig { }); } - fn send_data_columns_by_range_response( - &mut self, - id: DataColumnsByRangeRequestId, - peer_id: PeerId, - data_columns: &[Arc>], - ) { - let mut ids = data_columns - .iter() - .map(|d| (d.slot().as_u64(), d.index)) - .collect::>(); - ids.sort_unstable(); - self.log(&format!( - "Completing DataColumnsByRange request {id} to {peer_id} with data_columns {ids:?}" - )); - - for data_column in data_columns { - self.send_sync_message(SyncMessage::RpcDataColumn { - sync_request_id: SyncRequestId::DataColumnsByRange(id), - peer_id, - data_column: Some(data_column.clone()), - seen_timestamp: D, - }); - } - self.send_sync_message(SyncMessage::RpcDataColumn { - sync_request_id: SyncRequestId::DataColumnsByRange(id), - peer_id, - data_column: None, - seen_timestamp: D, - }); - } - fn send_data_columns_by_root_response( &mut self, id: DataColumnsByRootRequestId, @@ -689,112 +554,6 @@ impl TestRig { }); } - fn pop_blocks_by_range_request( - &mut self, - 
request_filter: RequestFilter, - ) -> (BlocksByRangeRequestId, PeerId, OldBlocksByRangeRequest) { - self.pop_received_network_event(|ev| request_filter.blocks_by_range_requests(ev)) - .unwrap_or_else(|e| { - panic!("Should have a BlocksByRange request, filter {request_filter:?}: {e:?}") - }) - } - - fn pop_data_columns_by_range_requests( - &mut self, - request_filter: RequestFilter, - ) -> Vec<( - DataColumnsByRangeRequestId, - PeerId, - DataColumnsByRangeRequest, - )> { - let mut data_columns_requests = vec![]; - while let Ok(data_columns_request) = - self.pop_received_network_event(|ev| request_filter.data_columns_by_range_requests(ev)) - { - data_columns_requests.push(data_columns_request); - } - data_columns_requests - } - - fn find_data_by_range_request( - &mut self, - request_filter: RequestFilter, - ) -> ByRangeDataRequestIds { - if self.after_fulu() { - let data_columns_requests = self.pop_data_columns_by_range_requests(request_filter); - if data_columns_requests.is_empty() { - panic!("Found zero DataColumnsByRange requests, filter {request_filter:?}"); - } - ByRangeDataRequestIds::PostPeerDAS(data_columns_requests) - } else if self.after_deneb() { - let (id, peer, req) = self - .pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id, - request: RequestType::BlobsByRange(req), - app_request_id: AppRequestId::Sync(SyncRequestId::BlobsByRange(id)), - } if request_filter.matches_blobs_by_range(peer_id, req) => { - Some((*id, *peer_id, req.clone())) - } - _ => None, - }) - .unwrap_or_else(|e| { - panic!("Should have a blobs by range request, filter {request_filter:?}: {e:?}") - }); - ByRangeDataRequestIds::PrePeerDAS(id, peer, req) - } else { - ByRangeDataRequestIds::PreDeneb - } - } - - fn find_and_complete_block_components_by_range_request( - &mut self, - request_filter: RequestFilter, - complete_config: CompleteConfig, - ) -> RangeRequestId { - let id = self.find_and_complete_blocks_by_range_request(request_filter, 
complete_config); - self.find_and_complete_data_by_range_request(request_filter, complete_config); - id - } - - fn find_and_complete_blocks_by_range_request( - &mut self, - request_filter: RequestFilter, - complete_config: CompleteConfig, - ) -> RangeRequestId { - let (blocks_req_id, block_peer, blocks_req) = - self.pop_blocks_by_range_request(request_filter); - - let start_slot = Slot::new(*blocks_req.start_slot()); - let blocks = (0..complete_config.block_count) - .map(|i| { - self.zero_block_at_slot(start_slot + Slot::new(i as u64), complete_config.with_data) - .into() - }) - .collect::>(); - self.send_blocks_by_range_response(blocks_req_id, block_peer, &blocks); - - blocks_req_id.parent_request_id.requester - } - - fn complete_blocks_by_range_request( - &mut self, - request: BlocksByRangeRequestData, - complete_config: CompleteConfig, - ) -> RangeRequestId { - let (blocks_req_id, block_peer, blocks_req) = request; - let start_slot = Slot::new(*blocks_req.start_slot()); - let blocks = (0..complete_config.block_count) - .map(|i| { - self.zero_block_at_slot(start_slot + Slot::new(i as u64), complete_config.with_data) - .into() - }) - .collect::>(); - self.send_blocks_by_range_response(blocks_req_id, block_peer, &blocks); - - blocks_req_id.parent_request_id.requester - } - fn complete_blocks_by_root_request( &mut self, request: BlocksByRootRequestData, @@ -818,66 +577,6 @@ impl TestRig { blocks_req_id.parent_request_id } - fn complete_data_columns_by_range_request( - &mut self, - (id, peer_id, req): DataColumnsByRangeRequestData, - complete_config: CompleteConfig, - ) { - // To reply with a valid DataColumnsByRange we need to construct - // DataColumnsByRange for the block root that we requested the block peer, plus - // figure out which exact columns we requested this peer - - let components_by_range_req_id = id.parent_request_id.parent_request_id; - let blocks = self.last_sent_blocks_by_range(components_by_range_req_id); - - let data_columns = blocks - .iter() - 
.flat_map(|block| { - let kzg_commitments_inclusion_proof = block - .message() - .body() - .kzg_commitments_merkle_proof() - .unwrap(); - let kzg_commitments = block - .message() - .body() - .blob_kzg_commitments() - .unwrap() - .clone(); - let signed_block_header = block.signed_block_header(); - - req.columns.iter().filter_map(move |index| { - // Skip column generation if index is marked as failure - if complete_config.custody_failure_at_index == Some(*index) { - return None; - } - - // We need to produce a DataColumn with valid inclusion proof, but can - // be with random KZG proof and data as we won't send it for processing - Some(Arc::new(DataColumnSidecar { - index: *index, - column: VariableList::empty(), - kzg_commitments: kzg_commitments.clone(), - kzg_proofs: VariableList::from(vec![]), - signed_block_header: signed_block_header.clone(), - kzg_commitments_inclusion_proof: kzg_commitments_inclusion_proof.clone(), - })) - }) - }) - .collect::>(); - - // Need to log here because I can't capture &mut self inside the columns iter - if !blocks.is_empty() { - if let Some(index) = complete_config.custody_failure_at_index { - self.log(&format!( - "Forced custody failure at request {id} for peer {peer_id} index {index:?}" - )); - } - } - - self.send_data_columns_by_range_response(id, peer_id, &data_columns); - } - fn complete_data_columns_by_root_request_range_sync( &mut self, (id, peer_id, req): DataColumnsByRootRequestData, @@ -953,85 +652,6 @@ impl TestRig { self.complete_data_by_range_request(by_range_data_request_ids, complete_config); } - fn complete_data_by_range_request( - &mut self, - by_range_data_request_ids: ByRangeDataRequestIds, - complete_config: CompleteConfig, - ) { - match by_range_data_request_ids { - ByRangeDataRequestIds::PreDeneb => {} - ByRangeDataRequestIds::PrePeerDAS(id, peer_id, req) => { - // Complete the request with a single stream termination - self.log(&format!( - "Completing BlobsByRange request {id} {req:?} with empty stream" - )); 
- self.send_sync_message(SyncMessage::RpcBlob { - sync_request_id: SyncRequestId::BlobsByRange(id), - peer_id, - blob_sidecar: None, - seen_timestamp: D, - }); - } - ByRangeDataRequestIds::PostPeerDAS(data_column_req_ids) => { - // Complete the request with a single stream termination - for (id, peer_id, req) in data_column_req_ids { - // To reply with a valid DataColumnsByRange we need to construct - // DataColumnsByRange for the block root that we requested the block peer, plus - // figure out which exact columns we requested this peer - - let components_by_range_req_id = id.parent_request_id.parent_request_id; - let blocks = self.last_sent_blocks_by_range(components_by_range_req_id); - - let data_columns = blocks - .iter() - .flat_map(|block| { - let kzg_commitments_inclusion_proof = block - .message() - .body() - .kzg_commitments_merkle_proof() - .unwrap(); - let kzg_commitments = block - .message() - .body() - .blob_kzg_commitments() - .unwrap() - .clone(); - let signed_block_header = block.signed_block_header(); - - req.columns.iter().filter_map(move |index| { - // Skip column generation if index is marked as failure - if complete_config.custody_failure_at_index == Some(*index) { - return None; - } - - // We need to produce a DataColumn with valid inclusion proof, but can - // be with random KZG proof and data as we won't send it for processing - Some(Arc::new(DataColumnSidecar { - index: *index, - column: VariableList::empty(), - kzg_commitments: kzg_commitments.clone(), - kzg_proofs: VariableList::from(vec![]), - signed_block_header: signed_block_header.clone(), - kzg_commitments_inclusion_proof: - kzg_commitments_inclusion_proof.clone(), - })) - }) - }) - .collect::>(); - - // Need to log here because I can't capture &mut self inside the columns iter - if !blocks.is_empty() { - if let Some(index) = complete_config.custody_failure_at_index { - self.log(&format!("Forced custody failure at request {id} for peer {peer_id} index {index:?}")); - } - } - - 
self.send_data_columns_by_range_response(id, peer_id, &data_columns); - } - } - } - } - fn complete_block_processing(&mut self, ids: Vec) { // Sort ids first as we need to process blocks in order of ancestors. This only works if the // test does not send blocks of two parallel chains at once. From 8d227f4e6197ed84e57d5f7a4626319de31f7dfa Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Fri, 20 Jun 2025 23:28:30 +0200 Subject: [PATCH 28/66] Compiles src --- .../src/service/api_types.rs | 4 +- .../network_beacon_processor/sync_methods.rs | 10 +- .../network/src/sync/backfill_sync/mod.rs | 1120 ++-------------- beacon_node/network/src/sync/block_tree.rs | 78 +- beacon_node/network/src/sync/manager.rs | 249 +--- beacon_node/network/src/sync/mod.rs | 2 - .../network/src/sync/network_context.rs | 44 +- .../block_components_by_range.rs | 13 +- .../sync/network_context/custody_by_root.rs | 6 +- .../src/sync/network_context/requests.rs | 3 - .../network_context/requests/blobs_by_root.rs | 3 +- .../requests/blocks_by_root.rs | 3 +- .../requests/data_columns_by_root.rs | 6 +- beacon_node/network/src/sync/peer_sampling.rs | 6 +- .../network/src/sync/range_sync/batch.rs | 489 ------- .../network/src/sync/range_sync/chain.rs | 1150 ----------------- .../src/sync/range_sync/chain_collection.rs | 541 -------- .../network/src/sync/range_sync/mod.rs | 15 - .../network/src/sync/range_sync/range.rs | 460 ------- .../network/src/sync/range_sync/sync_type.rs | 46 - beacon_node/network/src/sync/tests/range.rs | 8 +- 21 files changed, 205 insertions(+), 4051 deletions(-) delete mode 100644 beacon_node/network/src/sync/range_sync/batch.rs delete mode 100644 beacon_node/network/src/sync/range_sync/chain.rs delete mode 100644 beacon_node/network/src/sync/range_sync/chain_collection.rs delete mode 100644 beacon_node/network/src/sync/range_sync/mod.rs delete mode 100644 beacon_node/network/src/sync/range_sync/range.rs delete mode 100644 
beacon_node/network/src/sync/range_sync/sync_type.rs diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index e3c10acfd5a..e739775763c 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -68,7 +68,7 @@ pub struct ComponentsByRootRequestId { #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub enum RangeRequestId { RangeSync(HeaderLookupId), - BackfillSync { batch_id: Epoch }, + BackfillSync(Id), } #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] @@ -253,7 +253,7 @@ impl Display for RangeRequestId { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { Self::RangeSync(id) => write!(f, "RangeSync/{id}"), - Self::BackfillSync { batch_id } => write!(f, "BackfillSync/{batch_id}"), + Self::BackfillSync(id) => write!(f, "BackfillSync/{id}"), } } } diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 1f2c56adaee..c0b33582295 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -1,21 +1,21 @@ -use crate::metrics::{self, register_process_result_metrics}; +use crate::metrics::{self}; use crate::network_beacon_processor::{NetworkBeaconProcessor, FUTURE_SLOT_TOLERANCE}; +use crate::sync::manager::SyncMessage; use crate::sync::BatchProcessResult; -use crate::sync::{manager::SyncMessage, ChainId}; use beacon_chain::block_verification_types::{AsBlock, RpcBlock}; use beacon_chain::data_availability_checker::AvailabilityCheckError; use beacon_chain::data_column_verification::verify_kzg_for_data_column_list; use beacon_chain::{ BeaconChainTypes, BlockError, ChainSegmentResult, HistoricalBlockError, NotifyExecutionLayer, }; -use lighthouse_network::service::api_types::HeaderLookupId; +use 
lighthouse_network::service::api_types::{HeaderLookupId, Id}; use lighthouse_network::PeerAction; use std::collections::HashMap; use std::fmt::{Display, Formatter}; use std::sync::Arc; use std::time::Duration; use tracing::{debug, warn}; -use types::{ColumnIndex, DataColumnSidecar, Epoch, Hash256}; +use types::{ColumnIndex, DataColumnSidecar, Hash256}; /// Id associated to a batch processing request, either a sync batch or a parent lookup. #[derive(Clone, Debug, PartialEq)] @@ -23,7 +23,7 @@ pub enum ChainSegmentProcessId { /// Processing Id of a range syncing batch. RangeBatchId(HeaderLookupId), /// Processing ID for a backfill syncing batch. - BackSyncBatchId(Epoch), + BackSyncBatchId(Id), } /// Returned when a chain segment import fails. diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index de853fd0d03..6b23457ae4d 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -11,39 +11,18 @@ use crate::network_beacon_processor::ChainSegmentProcessId; use crate::sync::manager::BatchProcessResult; use crate::sync::network_context::{ - RangeRequestId, RpcRequestSendError, RpcResponseError, SyncNetworkContext, -}; -use crate::sync::range_sync::{ - BatchConfig, BatchId, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, + BatchPeers, RangeRequestId, RpcResponseError, SyncNetworkContext, }; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; -use itertools::Itertools; use lighthouse_network::service::api_types::Id; use lighthouse_network::types::{BackFillState, NetworkGlobals}; -use lighthouse_network::{PeerAction, PeerId}; -use logging::crit; +use lighthouse_network::PeerId; use parking_lot::RwLock; -use std::collections::{ - btree_map::{BTreeMap, Entry}, - HashSet, -}; +use std::collections::HashSet; use std::sync::Arc; -use tracing::{debug, error, info, instrument, 
warn}; -use types::{Epoch, EthSpec}; - -use super::range_sync::BatchPeers; - -/// Blocks are downloaded in batches from peers. This constant specifies how many epochs worth of -/// blocks per batch are requested _at most_. A batch may request less blocks to account for -/// already requested slots. There is a timeout for each batch request. If this value is too high, -/// we will negatively report peers with poor bandwidth. This can be set arbitrarily high, in which -/// case the responder will fill the response up to the max request size, assuming they have the -/// bandwidth to do so. -pub const BACKFILL_EPOCHS_PER_BATCH: u64 = 1; - -/// The maximum number of batches to queue before requesting more. -const BACKFILL_BATCH_BUFFER_SIZE: u8 = 20; +use tracing::{debug, info, instrument, warn}; +use types::{Epoch, EthSpec, Hash256}; /// The number of times to retry a batch before it is considered failed. const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 10; @@ -52,25 +31,6 @@ const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 10; /// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty. const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 10; -/// Custom configuration for the batch object. -struct BackFillBatchConfig {} - -impl BatchConfig for BackFillBatchConfig { - fn max_batch_download_attempts() -> u8 { - MAX_BATCH_DOWNLOAD_ATTEMPTS - } - fn max_batch_processing_attempts() -> u8 { - MAX_BATCH_PROCESSING_ATTEMPTS - } - fn batch_attempt_hash(blocks: &[RpcBlock]) -> u64 { - use std::collections::hash_map::DefaultHasher; - use std::hash::{Hash, Hasher}; - let mut hasher = DefaultHasher::new(); - blocks.hash(&mut hasher); - hasher.finish() - } -} - /// Return type when attempting to start the backfill sync process. pub enum SyncStart { /// The chain started syncing or is already syncing. @@ -97,40 +57,26 @@ pub enum ProcessResult { #[derive(Debug)] pub enum BackFillError { /// A batch failed to be downloaded. 
- BatchDownloadFailed(#[allow(dead_code)] BatchId), + BatchDownloadFailed(#[allow(dead_code)] Id), /// A batch could not be processed. - BatchProcessingFailed(#[allow(dead_code)] BatchId), + BatchProcessingFailed(#[allow(dead_code)] Id), /// A batch entered an invalid state. - BatchInvalidState(#[allow(dead_code)] BatchId, #[allow(dead_code)] String), + BatchInvalidState(#[allow(dead_code)] Id, #[allow(dead_code)] String), /// The sync algorithm entered an invalid state. InvalidSyncState(#[allow(dead_code)] String), /// The chain became paused. Paused, } -pub struct BackFillSync { - /// Keeps track of the current progress of the backfill. - /// This only gets refreshed from the beacon chain if we enter a failed state. - current_start: BatchId, - - /// Starting epoch of the batch that needs to be processed next. - /// This is incremented as the chain advances. - processing_target: BatchId, - - /// Starting epoch of the next batch that needs to be downloaded. - to_be_downloaded: BatchId, - - /// Keeps track if we have requested the final batch. - last_batch_downloaded: bool, - - /// Sorted map of batches undergoing some kind of processing. - batches: BTreeMap>, - - /// The current processing batch, if any. - current_processing_batch: Option, +enum SyncingStatus { + AwaitingDownload(Hash256), + Downloading(Hash256, Id), + AwaitingProcessing(RpcBlock, BatchPeers), + Processing(RpcBlock, BatchPeers), +} - /// Batches validated by this chain. - validated_batches: u64, +pub struct BackFillSync { + status: SyncingStatus, /// When a backfill sync fails, we keep track of whether a new fully synced peer has joined. /// This signifies that we are able to attempt to restart a failed chain. 
@@ -173,17 +119,11 @@ impl BackFillSync { }; let bfs = BackFillSync { - batches: BTreeMap::new(), - processing_target: current_start, - current_start, - last_batch_downloaded: false, - to_be_downloaded: current_start, - network_globals, - current_processing_batch: None, - validated_batches: 0, + status: SyncingStatus::AwaitingDownload(anchor_info.oldest_block_parent), restart_failed_sync: false, peers: <_>::default(), beacon_chain, + network_globals, }; // Update the global network state with the current backfill state. @@ -200,7 +140,7 @@ impl BackFillSync { )] pub fn pause(&mut self) { if let BackFillState::Syncing = self.state() { - debug!(processed_epochs = %self.validated_batches, to_be_processed = %self.current_start,"Backfill sync paused"); + debug!("Backfill sync paused"); self.set_state(BackFillState::Paused); } } @@ -224,15 +164,9 @@ impl BackFillSync { BackFillState::Paused => { if !self.peers.read().is_empty() { // If there are peers to resume with, begin the resume. - debug!(start_epoch = ?self.current_start, awaiting_batches = self.batches.len(), processing_target = ?self.processing_target, "Resuming backfill sync"); + debug!("Resuming backfill sync"); self.set_state(BackFillState::Syncing); - // Resume any previously failed batches. - self.resume_batches(network)?; - // begin requesting blocks from the peer pool, until all peers are exhausted. - self.request_batches(network)?; - - // start processing batches if needed - self.process_completed_batches(network)?; + self.continue_syncing_blocks(network); } else { return Ok(SyncStart::NotSyncing); } @@ -248,35 +182,19 @@ impl BackFillSync { self.set_state(BackFillState::Syncing); - // Obtain a new start slot, from the beacon chain and handle possible errors. - if let Err(e) = self.reset_start_epoch() { - // This infallible match exists to force us to update this code if a future - // refactor of `ResetEpochError` adds a variant. 
- let ResetEpochError::SyncCompleted = e; - error!("Backfill sync completed whilst in failed status"); - self.set_state(BackFillState::Completed); - return Err(BackFillError::InvalidSyncState(String::from( - "chain completed", - ))); - } - - debug!(start_epoch = %self.current_start, "Resuming a failed backfill sync"); + debug!("Resuming a failed backfill sync"); // begin requesting blocks from the peer pool, until all peers are exhausted. - self.request_batches(network)?; + self.continue_syncing_blocks(network); } BackFillState::Completed => return Ok(SyncStart::NotSyncing), } Ok(SyncStart::Syncing { - completed: (self.validated_batches - * BACKFILL_EPOCHS_PER_BATCH - * T::EthSpec::slots_per_epoch()) as usize, - remaining: self - .current_start - .start_slot(T::EthSpec::slots_per_epoch()) - .saturating_sub(self.beacon_chain.genesis_backfill_slot) - .as_usize(), + // TODO(tree-sync): is this actually used? The remaining does not account for the 6 + // months of data expiration + completed: todo!(), + remaining: todo!(), }) } @@ -311,865 +229,111 @@ impl BackFillSync { } } - /// An RPC error has occurred. - /// - /// If the batch exists it is re-requested. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - #[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"] - pub fn inject_error( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - request_id: Id, - err: RpcResponseError, - ) -> Result<(), BackFillError> { - if let Some(batch) = self.batches.get_mut(&batch_id) { - // A batch could be retried without the peer failing the request (disconnecting/ - // sending an error /timeout) if the peer is removed from the chain for other - // reasons. Check that this block belongs to the expected peer - // TODO(das): removed peer_id matching as the node may request a different peer for data - // columns. 
- if !batch.is_expecting_block(&request_id) { - return Ok(()); - } - debug!(batch_epoch = %batch_id, error = ?err, "Batch download failed"); - match batch.download_failed() { - Err(e) => self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)), - Ok(BatchOperationOutcome::Failed { blacklist: _ }) => self.fail_sync(match err { - RpcResponseError::RpcError(_) - | RpcResponseError::VerifyError(_) - | RpcResponseError::InternalError(_) => { - BackFillError::BatchDownloadFailed(batch_id) - } - RpcResponseError::RequestExpired(_) => BackFillError::Paused, - }), - Ok(BatchOperationOutcome::Continue) => self.send_batch(network, batch_id), - } - } else { - // this could be an error for an old batch, removed when the chain advances - Ok(()) - } - } - - /// A block has been received for a batch relating to this backfilling chain. - /// If the block correctly completes the batch it will be processed if possible. - /// If this returns an error, the backfill sync has failed and will be restarted once new peers - /// join the system. - /// The sync manager should update the global sync state on failure. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - #[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"] pub fn on_block_response( &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - batch_peers: BatchPeers, - request_id: Id, - blocks: Vec>, - ) -> Result { - // check if we have this batch - let Some(batch) = self.batches.get_mut(&batch_id) else { - if !matches!(self.state(), BackFillState::Failed) { - // A batch might get removed when the chain advances, so this is non fatal. 
- debug!(epoch = %batch_id, "Received a block for unknown batch"); - } - return Ok(ProcessResult::Successful); - }; - - // A batch could be retried without the peer failing the request (disconnecting/ - // sending an error /timeout) if the peer is removed from the chain for other - // reasons. Check that this block belongs to the expected peer, and that the - // request_id matches - if !batch.is_expecting_block(&request_id) { - return Ok(ProcessResult::Successful); - } - - match batch.download_completed(blocks, batch_peers) { - Ok(received) => { - let awaiting_batches = - self.processing_target.saturating_sub(batch_id) / BACKFILL_EPOCHS_PER_BATCH; - debug!( - epoch = %batch_id, - blocks = received, - %awaiting_batches, - "Completed batch received" - ); - - // pre-emptively request more blocks from peers whilst we process current blocks, - self.request_batches(network)?; - self.process_completed_batches(network) - } - Err(e) => { - self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))?; - Ok(ProcessResult::Successful) - } - } - } - - /// The syncing process has failed. - /// - /// This resets past variables, to allow for a fresh start when resuming. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn fail_sync(&mut self, error: BackFillError) -> Result<(), BackFillError> { - // Some errors shouldn't fail the chain. - if matches!(error, BackFillError::Paused) { - return Ok(()); - } - - // Set the state - self.set_state(BackFillState::Failed); - // Remove all batches and active requests and participating peers. 
- self.batches.clear(); - self.restart_failed_sync = false; - - // Reset all downloading and processing targets - self.processing_target = self.current_start; - self.to_be_downloaded = self.current_start; - self.last_batch_downloaded = false; - self.current_processing_batch = None; - - // NOTE: Lets keep validated_batches for posterity - - // Emit the log here - error!(?error, "Backfill sync failed"); - - // Return the error, kinda weird pattern, but I want to use - // `self.fail_chain(_)?` in other parts of the code. - Err(error) - } - - /// Processes the batch with the given id. - /// The batch must exist and be ready for processing - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn process_batch( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - ) -> Result { - // Only process batches if this chain is Syncing, and only one at a time - if self.state() != BackFillState::Syncing || self.current_processing_batch.is_some() { - return Ok(ProcessResult::Successful); - } - - let Some(batch) = self.batches.get_mut(&batch_id) else { - return self - .fail_sync(BackFillError::InvalidSyncState(format!( - "Trying to process a batch that does not exist: {}", - batch_id - ))) - .map(|_| ProcessResult::Successful); - }; - - // NOTE: We send empty batches to the processor in order to trigger the block processor - // result callback. This is done, because an empty batch could end a chain and the logic - // for removing chains and checking completion is in the callback. 
- - let (blocks, _) = match batch.start_processing() { - Err(e) => { - return self - .fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)) - .map(|_| ProcessResult::Successful) - } - Ok(v) => v, - }; - - let process_id = ChainSegmentProcessId::BackSyncBatchId(batch_id); - self.current_processing_batch = Some(batch_id); - - if let Err(e) = network - .beacon_processor() - .send_chain_segment(process_id, blocks) - { - crit!( - msg = "process_batch", - error = %e, - batch = ?self.processing_target, - "Failed to send backfill segment to processor." - ); - // This is unlikely to happen but it would stall syncing since the batch now has no - // blocks to continue, and the chain is expecting a processing result that won't - // arrive. To mitigate this, (fake) fail this processing so that the batch is - // re-downloaded. - self.on_batch_process_result(network, batch_id, &BatchProcessResult::NonFaultyFailure) - } else { - Ok(ProcessResult::Successful) - } - } - - /// The block processor has completed processing a batch. This function handles the result - /// of the batch processor. - /// If an error is returned the BackFill sync has failed. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - #[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"] - pub fn on_batch_process_result( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - result: &BatchProcessResult, - ) -> Result { - // The first two cases are possible in regular sync, should not occur in backfill, but we - // keep this logic for handling potential processing race conditions. 
- // result - let batch = match &self.current_processing_batch { - Some(processing_id) if *processing_id != batch_id => { - debug!( - batch_epoch = %batch_id.as_u64(), - expected_batch_epoch = processing_id.as_u64(), - "Unexpected batch result" - ); - return Ok(ProcessResult::Successful); - } - None => { - debug!(%batch_id, "Chain was not expecting a batch result"); - return Ok(ProcessResult::Successful); - } - _ => { - // batch_id matches, continue - self.current_processing_batch = None; - - match self.batches.get_mut(&batch_id) { - Some(batch) => batch, - None => { - // This is an error. Fail the sync algorithm. - return self - .fail_sync(BackFillError::InvalidSyncState(format!( - "Current processing batch not found: {}", - batch_id - ))) - .map(|_| ProcessResult::Successful); - } - } - } - }; - - let Some(batch_peers) = batch.processing_peers() else { - self.fail_sync(BackFillError::BatchInvalidState( - batch_id, - String::from("Peer does not exist"), - ))?; - return Ok(ProcessResult::Successful); - }; - - debug!( - ?result, - %batch_id, - "Backfill batch processed" - ); - - match result { - BatchProcessResult::Success { - imported_blocks, .. - } => { - if let Err(e) = batch.processing_completed(BatchProcessingResult::Success) { - self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))?; - } - // If the processed batch was not empty, we can validate previous unvalidated - // blocks. 
- if *imported_blocks > 0 { - self.advance_chain(network, batch_id); - } - - if batch_id == self.processing_target { - self.processing_target = self - .processing_target - .saturating_sub(BACKFILL_EPOCHS_PER_BATCH); - } - - // check if the chain has completed syncing - if self.check_completed() { - // chain is completed - info!( - blocks_processed = self.validated_batches * T::EthSpec::slots_per_epoch(), - "Backfill sync completed" - ); - self.set_state(BackFillState::Completed); - Ok(ProcessResult::SyncCompleted) - } else { - // chain is not completed - // attempt to request more batches - self.request_batches(network)?; - // attempt to process more batches - self.process_completed_batches(network) - } - } - BatchProcessResult::FaultyFailure { - imported_blocks, - peer_action, - error, - } => { - // TODO(sync): De-dup between back and forwards sync - let mut failed_peers = vec![]; - - if let Some(penalty) = peer_action.block_peer { - // Penalize the peer appropiately. - network.report_peer(batch_peers.block(), penalty, "faulty_batch"); - failed_peers.push(batch_peers.block()); - } - - // Penalize each peer only once. Currently a peer_action does not mix different - // PeerAction levels. - for (peer, penalty) in peer_action - .column_peer - .iter() - .filter_map(|(column_index, penalty)| { - batch_peers - .column(column_index) - .map(|peer| (*peer, *penalty)) - }) - .unique() - { - network.report_peer(peer, penalty, "faulty_batch_column"); - failed_peers.push(peer); - } - - match batch.processing_completed(BatchProcessingResult::FaultyFailure(failed_peers)) - { - Err(e) => { - // Batch was in the wrong state - self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)) - .map(|_| ProcessResult::Successful) - } - Ok(BatchOperationOutcome::Failed { .. }) => { - // When backfill syncing post-PeerDAS we can't attribute fault to previous - // peers if a batch fails to process too many times. 
We have strict peer - // scoring for faulty errors, so participating peers that sent invalid - // data are already downscored. - // - // Because backfill sync deals with historical data that we can assert - // to be correct, once we import a batch that contains at least one - // block we are sure we got the right data. There's no need to penalize - // all participating peers in backfill sync if a batch fails - warn!( - batch_epoch = %batch_id, - error, - "Backfill sync failed after attempting to process batch too many times" - ); - - self.fail_sync(BackFillError::BatchProcessingFailed(batch_id)) - .map(|_| ProcessResult::Successful) - } - - Ok(BatchOperationOutcome::Continue) => { - // chain can continue. Check if it can be progressed - if *imported_blocks > 0 { - // At least one block was successfully verified and imported, then we can be sure all - // previous batches are valid and we only need to download the current failed - // batch. - self.advance_chain(network, batch_id); - } - // Handle this invalid batch, that is within the re-process retries limit. - self.handle_invalid_batch(network, batch_id) - .map(|_| ProcessResult::Successful) - } - } - } - BatchProcessResult::NonFaultyFailure => { - if let Err(e) = batch.processing_completed(BatchProcessingResult::NonFaultyFailure) - { - self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))?; - } - self.send_batch(network, batch_id)?; - Ok(ProcessResult::Successful) - } - } - } - - /// Processes the next ready batch. 
- #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn process_completed_batches( - &mut self, - network: &mut SyncNetworkContext, - ) -> Result { - // Only process batches if backfill is syncing and only process one batch at a time - if self.state() != BackFillState::Syncing || self.current_processing_batch.is_some() { - return Ok(ProcessResult::Successful); - } - - // Find the id of the batch we are going to process. - if let Some(batch) = self.batches.get(&self.processing_target) { - let state = batch.state(); - match state { - BatchState::AwaitingProcessing(..) => { - return self.process_batch(network, self.processing_target); - } - BatchState::Downloading(..) => { - // Batch is not ready, nothing to process - } - BatchState::Poisoned => unreachable!("Poisoned batch"), - BatchState::Failed | BatchState::AwaitingDownload | BatchState::Processing(_) => { - // these are all inconsistent states: - // - Failed -> non recoverable batch. Chain should have been removed - // - AwaitingDownload -> A recoverable failed batch should have been - // re-requested. - // - Processing -> `self.current_processing_batch` is None - self.fail_sync(BackFillError::InvalidSyncState(String::from( - "Invalid expected batch state", - )))?; - return Ok(ProcessResult::Successful); - } - BatchState::AwaitingValidation(_) => { - // TODO: I don't think this state is possible, log a CRIT just in case. - // If this is not observed, add it to the failed state branch above. 
- crit!( - batch = ?self.processing_target, - "Chain encountered a robust batch awaiting validation" - ); - - self.processing_target -= BACKFILL_EPOCHS_PER_BATCH; - if self.to_be_downloaded >= self.processing_target { - self.to_be_downloaded = self.processing_target - BACKFILL_EPOCHS_PER_BATCH; - } - self.request_batches(network)?; + id: Id, + result: Result<(RpcBlock, BatchPeers), RpcResponseError>, + cx: &mut SyncNetworkContext, + ) { + match self.status { + SyncingStatus::Downloading(block_root, expected_id) => { + if id != expected_id { + panic!("unexpected ID"); } - } - } else { - self.fail_sync(BackFillError::InvalidSyncState(format!( - "Batch not found for current processing target {}", - self.processing_target - )))?; - return Ok(ProcessResult::Successful); - } - Ok(ProcessResult::Successful) - } - - /// Removes any batches previous to the given `validating_epoch` and updates the current - /// boundaries of the chain. - /// - /// The `validating_epoch` must align with batch boundaries. - /// - /// If a previous batch has been validated and it had been re-processed, penalize the original - /// peer. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn advance_chain(&mut self, network: &mut SyncNetworkContext, validating_epoch: Epoch) { - // make sure this epoch produces an advancement - if validating_epoch >= self.current_start { - return; - } - - // We can now validate higher batches that the current batch. Here we remove all - // batches that are higher than the current batch. We add on an extra - // `BACKFILL_EPOCHS_PER_BATCH` as `split_off` is inclusive. 
- let removed_batches = self - .batches - .split_off(&(validating_epoch + BACKFILL_EPOCHS_PER_BATCH)); - - for (id, batch) in removed_batches.into_iter() { - self.validated_batches = self.validated_batches.saturating_add(1); - // only for batches awaiting validation can we be sure the last attempt is - // right, and thus, that any different attempt is wrong - match batch.state() { - BatchState::AwaitingValidation(ref processed_attempt) => { - for attempt in batch.attempts() { - // The validated batch has been re-processed - if attempt.hash != processed_attempt.hash { - // The re-downloaded version was different. - // TODO(das): should penalize other peers? - let valid_attempt_peer = processed_attempt.block_peer(); - let bad_attempt_peer = attempt.block_peer(); - if valid_attempt_peer != bad_attempt_peer { - // A different peer sent the correct batch, the previous peer did not - // We negatively score the original peer. - let action = PeerAction::LowToleranceError; - debug!( - batch_epoch = %id, score_adjustment = %action, - original_peer = %bad_attempt_peer, new_peer = %valid_attempt_peer, - "Re-processed batch validated. Scoring original peer" - ); - network.report_peer( - bad_attempt_peer, - action, - "batch_reprocessed_original_peer", - ); - } else { - // The same peer corrected it's previous mistake. There was an error, so we - // negative score the original peer. - let action = PeerAction::MidToleranceError; - debug!( - batch_epoch = %id, - score_adjustment = %action, - original_peer = %bad_attempt_peer, - new_peer = %valid_attempt_peer, - "Re-processed batch validated by the same peer" - ); - network.report_peer( - bad_attempt_peer, - action, - "batch_reprocessed_same_peer", - ); - } - } + match result { + Ok((block, peers)) => { + // TODO(tree-sync): check that id matches + debug!(%id, "Sync block downloaded"); + self.status = SyncingStatus::Processing(block, peers); } - } - BatchState::Downloading(..) 
=> {} - BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => { - crit!("batch indicates inconsistent chain state while advancing chain") - } - BatchState::AwaitingProcessing(..) => {} - BatchState::Processing(_) => { - debug!(batch = %id, "Advancing chain while processing a batch"); - if let Some(processing_id) = self.current_processing_batch { - if id >= processing_id { - self.current_processing_batch = None; - } + Err(e) => { + // TODO(tree-sync): Handle the error explicitly with a match, check unstable + debug!(%id, "Sync block download error"); + self.status = SyncingStatus::AwaitingDownload(block_root); } } } + _ => panic!("Bad state"), } - self.processing_target = self.processing_target.min(validating_epoch); - self.current_start = validating_epoch; - self.to_be_downloaded = self.to_be_downloaded.min(validating_epoch); - if self.batches.contains_key(&self.to_be_downloaded) { - // if a chain is advanced by Range beyond the previous `self.to_be_downloaded`, we - // won't have this batch, so we need to request it. - self.to_be_downloaded -= BACKFILL_EPOCHS_PER_BATCH; - } - debug!(?validating_epoch, processing_target = ?self.processing_target, "Backfill advanced"); + // Continue batches + self.continue_syncing_blocks(cx); } - /// An invalid batch has been received that could not be processed, but that can be retried. - /// - /// These events occur when a peer has successfully responded with blocks, but the blocks we - /// have received are incorrect or invalid. This indicates the peer has not performed as - /// intended and can result in downvoting a peer. 
- #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn handle_invalid_batch( + pub fn handle_block_process_result( &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - ) -> Result<(), BackFillError> { - // The current batch could not be processed, indicating either the current or previous - // batches are invalid. - - // The previous batch could be incomplete due to the block sizes being too large to fit in - // a single RPC request or there could be consecutive empty batches which are not supposed - // to be there - - // The current (sub-optimal) strategy is to simply re-request all batches that could - // potentially be faulty. If a batch returns a different result than the original and - // results in successful processing, we downvote the original peer that sent us the batch. - - // this is our robust `processing_target`. All previous batches must be awaiting - // validation - let mut redownload_queue = Vec::new(); - - for (id, batch) in self - .batches - .iter_mut() - .filter(|(&id, _batch)| id > batch_id) - { - match batch - .validation_failed() - .map_err(|e| BackFillError::BatchInvalidState(batch_id, e.0))? - { - BatchOperationOutcome::Failed { blacklist: _ } => { - // Batch has failed and cannot be redownloaded. - return self.fail_sync(BackFillError::BatchProcessingFailed(batch_id)); + id: Id, + result: BatchProcessResult, + cx: &mut SyncNetworkContext, + ) { + match &mut self.status { + SyncingStatus::Processing(block, _peers) => match result { + BatchProcessResult::Success { .. } => { + debug!(%id, "Sync block process success"); + self.status = SyncingStatus::AwaitingDownload(block.as_block().parent_root()) } - BatchOperationOutcome::Continue => { - redownload_queue.push(*id); + BatchProcessResult::FaultyFailure { .. 
} => { + debug!(%id, "Sync block process error"); + self.status = SyncingStatus::AwaitingDownload(block.block_root()) + // TODO(tree-sync): add peer to failed peers and downscore } - } + BatchProcessResult::NonFaultyFailure => { + debug!(%id, "Sync block process error"); + self.status = SyncingStatus::AwaitingDownload(block.block_root()) + // TODO(tree-sync): add peer to failed peers and downscore + } + }, + _ => panic!("Bad state"), } - // no batch maxed out it process attempts, so now the chain's volatile progress must be - // reset - self.processing_target = self.current_start; - - for id in redownload_queue { - self.send_batch(network, id)?; - } - // finally, re-request the failed batch. - self.send_batch(network, batch_id) + // Continue batches + self.continue_syncing_blocks(cx); } - /// Requests the batch assigned to the given id from a given peer. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn send_batch( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - ) -> Result<(), BackFillError> { - if let Some(batch) = self.batches.get_mut(&batch_id) { - let request = todo!(); - let failed_peers = batch.failed_peers(); - match network.block_components_by_range_request( - request, - RangeRequestId::BackfillSync { batch_id }, - self.peers.clone(), - failed_peers, - ) { - Ok(request_id) => { - // inform the batch about the new request - if let Err(e) = batch.start_downloading(request_id) { - return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)); - } - debug!(%batch_id, "Requesting batch"); - - return Ok(()); - } - Err(e) => match e { - RpcRequestSendError::NoPeers => { - // If we are here the chain has no more synced peers - info!( - "reason" = "insufficient_synced_peers", - "Backfill sync paused" - ); - self.set_state(BackFillState::Paused); - return Err(BackFillError::Paused); + fn continue_syncing_blocks(&mut self, cx: &mut SyncNetworkContext) { 
+ match &mut self.status { + SyncingStatus::AwaitingDownload(block_root) => { + // TODO(tree-sync): pick the right ID + let requester = RangeRequestId::BackfillSync(cx.next_id()); + let failed_peers = HashSet::new(); + + match cx.block_components_by_range_request( + *block_root, + requester, + self.peers.clone(), + &failed_peers, + ) { + Ok(req_id) => { + self.status = SyncingStatus::Downloading(*block_root, req_id); } - RpcRequestSendError::InternalError(e) => { - // NOTE: under normal conditions this shouldn't happen but we handle it anyway - warn!(%batch_id, error = ?e, "Could not send batch request"); - // register the failed download and check if the batch can be retried - if let Err(e) = batch.start_downloading(1) { - return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)); - } - - match batch.download_failed() { - Err(e) => { - self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))? - } - Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { - self.fail_sync(BackFillError::BatchDownloadFailed(batch_id))? - } - Ok(BatchOperationOutcome::Continue) => { - return self.send_batch(network, batch_id) - } - } + Err(e) => { + // TODO(tree-sync): Match error explicitly + // Log failed chain, mark blocks as not syncing + todo!("error sending {e:?}"); } - }, + }; } - } - - Ok(()) - } - - /// When resuming a chain, this function searches for batches that need to be re-downloaded and - /// transitions their state to redownload the batch. 
- #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn resume_batches(&mut self, network: &mut SyncNetworkContext) -> Result<(), BackFillError> { - let batch_ids_to_retry = self - .batches - .iter() - .filter_map(|(batch_id, batch)| { - // In principle there should only ever be on of these, and we could terminate the - // loop early, however the processing is negligible and we continue the search - // for robustness to handle potential future modification - if matches!(batch.state(), BatchState::AwaitingDownload) { - Some(*batch_id) - } else { - None + SyncingStatus::Downloading(..) => {} // wait for event + SyncingStatus::AwaitingProcessing(block, peers) => { + let id = cx.next_id(); + let Some(beacon_processor) = cx.beacon_processor_if_enabled() else { + todo!("processor disabled"); + }; + // TODO(tree-sync): pick the right ID + if let Err(e) = beacon_processor.send_chain_segment( + ChainSegmentProcessId::BackSyncBatchId(id), + vec![block.clone()], + ) { + todo!("error sending {e:?}"); } - }) - .collect::>(); - - for batch_id in batch_ids_to_retry { - self.send_batch(network, batch_id)?; - } - Ok(()) - } - - /// Attempts to request the next required batches from the peer pool if the chain is syncing. It will exhaust the peer - /// pool and left over batches until the batch buffer is reached or all peers are exhausted. 
- #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn request_batches( - &mut self, - network: &mut SyncNetworkContext, - ) -> Result<(), BackFillError> { - if !matches!(self.state(), BackFillState::Syncing) { - return Ok(()); - } - - // find the next pending batch and request it from the peer - // Note: for this function to not infinite loop we must: - // - If `include_next_batch` returns Some we MUST increase the count of batches that are - // accounted in the `BACKFILL_BATCH_BUFFER_SIZE` limit in the `matches!` statement of - // that function. - while let Some(batch_id) = self.include_next_batch(network) { - // send the batch - self.send_batch(network, batch_id)?; - } - - // No more batches, simply stop - Ok(()) - } - - /// Creates the next required batch from the chain. If there are no more batches required, - /// `false` is returned. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn include_next_batch(&mut self, network: &mut SyncNetworkContext) -> Option { - // don't request batches beyond genesis; - if self.last_batch_downloaded { - return None; - } - - // only request batches up to the buffer size limit - // NOTE: we don't count batches in the AwaitingValidation state, to prevent stalling sync - // if the current processing window is contained in a long range of skip slots. - let in_buffer = |batch: &BatchInfo| { - matches!( - batch.state(), - BatchState::Downloading(..) | BatchState::AwaitingProcessing(..) 
- ) - }; - if self - .batches - .iter() - .filter(|&(_epoch, batch)| in_buffer(batch)) - .count() - > BACKFILL_BATCH_BUFFER_SIZE as usize - { - return None; - } - - let batch_id = self.to_be_downloaded; - // this batch could have been included already being an optimistic batch - match self.batches.entry(batch_id) { - Entry::Occupied(_) => { - // this batch doesn't need downloading, let this same function decide the next batch - if self.would_complete(batch_id) { - self.last_batch_downloaded = true; - } - - self.to_be_downloaded = self - .to_be_downloaded - .saturating_sub(BACKFILL_EPOCHS_PER_BATCH); - self.include_next_batch(network) - } - Entry::Vacant(_entry) => { - // TODO - if self.would_complete(batch_id) { - self.last_batch_downloaded = true; - } - self.to_be_downloaded = self - .to_be_downloaded - .saturating_sub(BACKFILL_EPOCHS_PER_BATCH); - Some(batch_id) - } - } - } - - /// Resets the start epoch based on the beacon chain. - /// - /// This errors if the beacon chain indicates that backfill sync has already completed or is - /// not required. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn reset_start_epoch(&mut self) -> Result<(), ResetEpochError> { - let anchor_info = self.beacon_chain.store.get_anchor_info(); - if anchor_info.block_backfill_complete(self.beacon_chain.genesis_backfill_slot) { - Err(ResetEpochError::SyncCompleted) - } else { - self.current_start = anchor_info - .oldest_block_slot - .epoch(T::EthSpec::slots_per_epoch()); - Ok(()) - } - } - - /// Checks with the beacon chain if backfill sync has completed. 
- #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn check_completed(&mut self) -> bool { - if self.would_complete(self.current_start) { - // Check that the beacon chain agrees - let anchor_info = self.beacon_chain.store.get_anchor_info(); - // Conditions that we have completed a backfill sync - if anchor_info.block_backfill_complete(self.beacon_chain.genesis_backfill_slot) { - return true; - } else { - error!("Backfill out of sync with beacon chain"); + self.status = SyncingStatus::Processing(block.clone(), peers.clone()); } + SyncingStatus::Processing(..) => {} // wait for event } - false - } - - /// Checks if backfill would complete by syncing to `start_epoch`. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn would_complete(&self, start_epoch: Epoch) -> bool { - start_epoch - <= self - .beacon_chain - .genesis_backfill_slot - .epoch(T::EthSpec::slots_per_epoch()) } /// Updates the global network state indicating the current state of a backfill sync. #[instrument(parent = None, - level = "info", fields(service = "backfill_sync"), name = "backfill_sync", skip_all @@ -1178,12 +342,6 @@ impl BackFillSync { *self.network_globals.backfill_state.write() = state; } - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] fn state(&self) -> BackFillState { self.network_globals.backfill_state.read().clone() } @@ -1194,73 +352,3 @@ enum ResetEpochError { /// The chain has already completed. 
SyncCompleted, } - -#[cfg(test)] -mod tests { - use super::*; - use beacon_chain::test_utils::BeaconChainHarness; - use bls::Hash256; - use lighthouse_network::{NetworkConfig, SyncInfo, SyncStatus}; - use rand::prelude::StdRng; - use rand::SeedableRng; - use types::MinimalEthSpec; - - #[test] - fn request_batches_should_not_loop_infinitely() { - let harness = BeaconChainHarness::builder(MinimalEthSpec) - .default_spec() - .deterministic_keypairs(4) - .fresh_ephemeral_store() - .build(); - - let beacon_chain = harness.chain.clone(); - let slots_per_epoch = MinimalEthSpec::slots_per_epoch(); - - let network_globals = Arc::new(NetworkGlobals::new_test_globals( - vec![], - Arc::new(NetworkConfig::default()), - beacon_chain.spec.clone(), - )); - - { - let mut rng = StdRng::seed_from_u64(0xDEADBEEF0BAD5EEDu64); - let peer_id = network_globals - .peers - .write() - .__add_connected_peer_testing_only( - true, - &beacon_chain.spec, - k256::ecdsa::SigningKey::random(&mut rng).into(), - ); - - // Simulate finalized epoch and head being 2 epochs ahead - let finalized_epoch = Epoch::new(40); - let head_epoch = finalized_epoch + 2; - let head_slot = head_epoch.start_slot(slots_per_epoch) + 1; - - network_globals.peers.write().update_sync_status( - &peer_id, - SyncStatus::Synced { - info: SyncInfo { - head_slot, - head_root: Hash256::random(), - finalized_epoch, - finalized_root: Hash256::random(), - }, - }, - ); - } - - let mut network = SyncNetworkContext::new_for_testing( - beacon_chain.clone(), - network_globals.clone(), - harness.runtime.task_executor.clone(), - ); - - let mut backfill = BackFillSync::new(beacon_chain, network_globals); - backfill.set_state(BackFillState::Syncing); - - // if this ends up running into an infinite loop, the test will overflow the stack pretty quickly. 
- let _ = backfill.request_batches(&mut network); - } -} diff --git a/beacon_node/network/src/sync/block_tree.rs b/beacon_node/network/src/sync/block_tree.rs index 9a7561df21a..a8352d12398 100644 --- a/beacon_node/network/src/sync/block_tree.rs +++ b/beacon_node/network/src/sync/block_tree.rs @@ -1,14 +1,10 @@ -use super::network_context::{LookupRequestResult, RpcResponseError, SyncNetworkContext}; +use super::network_context::{RpcResponseError, SyncNetworkContext}; use crate::network_beacon_processor::ChainSegmentProcessId; use crate::sync::network_context::custody_by_root::ColumnRequest; -use crate::sync::network_context::{ - BlocksByRootSameForkRequest, RpcResponseBatchResult, RpcResponseResult, -}; -use crate::sync::range_sync::{BatchInfo, BatchPeers}; +use crate::sync::network_context::{BatchPeers, RpcResponseResult}; use crate::sync::BatchProcessResult; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; -use lighthouse_network::rpc::BlocksByRootRequest; use lighthouse_network::service::api_types::{ BlocksByRootRequestId, BlocksByRootRequester, HeaderLookupId, Id, RangeRequestId, }; @@ -17,11 +13,10 @@ use parking_lot::RwLock; use std::collections::{HashMap, HashSet}; use std::sync::Arc; use tracing::debug; -use types::{BeaconBlockHeader, Epoch, EthSpec, ForkName, Hash256, SignedBeaconBlock, Slot}; +use types::{BeaconBlockHeader, EthSpec, Hash256, SignedBeaconBlock, Slot}; pub struct BlockTree { blocks: HashMap>, - batches: HashMap>, chain: Arc>, } @@ -56,16 +51,6 @@ impl Block { } } - fn start(&mut self, cx: &mut SyncNetworkContext) {} - - fn on_error(&mut self, _e: RpcResponseError) { - todo!(); - } - - fn root(&self) -> Hash256 { - todo!(); - } - fn peer_count(&self) -> usize { self.peers.len() } @@ -127,7 +112,6 @@ impl BlockTree { pub fn new(chain: Arc>) -> Self { Self { blocks: <_>::default(), - batches: <_>::default(), chain, } } @@ -147,11 +131,13 @@ impl BlockTree { } pub fn pause(&mut self) { - 
todo!() + todo!(); } - pub fn remove_peer(&mut self, _peer: PeerId) { - todo!(); + pub fn remove_peer(&mut self, peer: PeerId) { + for block in self.blocks.values_mut() { + block.peers.remove(&peer); + } } pub fn search( @@ -207,32 +193,6 @@ impl BlockTree { } } - fn oldest_known_ancestor(&self, mut block_root: Hash256) -> Hash256 { - let Some(mut parent_root) = self - .blocks - .get(&block_root) - .and_then(|lookup| lookup.parent_root()) - else { - return block_root; - }; - - loop { - if let Some(lookup) = self.blocks.get(&parent_root) { - if let Some(next_parent_root) = lookup.parent_root() { - // Continue iterating the parent chain - block_root = parent_root; - parent_root = next_parent_root; - } else { - // There's an entry for parent_root but it's not downloaded yet - return parent_root; - } - } else { - // There's no entry in the DAG for parent_root, thus block_root is the root node - return block_root; - } - } - } - pub fn on_block_header( &mut self, req_id: BlocksByRootRequestId, @@ -443,9 +403,8 @@ impl BlockTree { match &mut lookup.status { Status::Syncing(header, syncing_status) => match syncing_status { SyncingStatus::AwaitingDownload => { - // TODO(tree-sync): pick the right ID - let chain_id = cx.next_id(); let requester = RangeRequestId::RangeSync(lookup.id); + // TODO(tree-sync) use RwLock or manually add to active request let peers = Arc::new(RwLock::new(HashSet::from_iter( lookup.peers.iter().copied(), ))); @@ -461,7 +420,8 @@ impl BlockTree { *syncing_status = SyncingStatus::Downloading(req_id); } Err(e) => { - // Log failed chain, mark blocks as not syncing + // Handle send error + todo!("Error sending {e:?}"); } }; } @@ -485,7 +445,7 @@ impl BlockTree { } } - pub fn on_blocks_response( + pub fn on_block_response( &mut self, id: HeaderLookupId, result: Result<(RpcBlock, BatchPeers), RpcResponseError>, @@ -505,7 +465,7 @@ impl BlockTree { *request = SyncingStatus::AwaitingProcessing(block, peers); } Err(e) => { - debug!(%id, "Sync block download 
error"); + debug!(%id, error = ?e, "Sync block download error"); *request = SyncingStatus::AwaitingDownload; } }, @@ -528,13 +488,18 @@ impl BlockTree { let request = lookup.block_request().unwrap(); match request { - SyncingStatus::Processing(peers) => match result { + SyncingStatus::Processing(_peers) => match result { BatchProcessResult::Success { .. } => { debug!(%id, "Sync block process success"); self.blocks.remove(&id.0); self.trigger_forward_sync(cx); } - BatchProcessResult::FaultyFailure { .. } | BatchProcessResult::NonFaultyFailure => { + BatchProcessResult::FaultyFailure { .. } => { + debug!(%id, "Sync block process error"); + *request = SyncingStatus::AwaitingDownload; + // TODO(tree-sync): add peer to failed peers and downscore + } + BatchProcessResult::NonFaultyFailure => { debug!(%id, "Sync block process error"); *request = SyncingStatus::AwaitingDownload; // TODO(tree-sync): add peer to failed peers and downscore @@ -542,5 +507,8 @@ impl BlockTree { }, _ => panic!("Bad state"), } + + // Continue batches + self.continue_syncing_blocks(cx); } } diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index c3e613c933d..2b209f931fb 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -33,20 +33,18 @@ //! needs to be searched for (i.e if an attestation references an unknown block) this manager can //! search for the block and subsequently search for parents if needed. 
-use super::backfill_sync::{BackFillSync, ProcessResult, SyncStart}; +use super::backfill_sync::BackFillSync; use super::block_tree::BlockTree; use super::network_context::{ CustodyRequestResult, RangeBlockComponent, RangeRequestId, RpcEvent, SyncNetworkContext, }; use super::peer_sampling::{Sampling, SamplingConfig, SamplingResult}; use super::peer_sync_info::{remote_sync_type, PeerSyncType}; -use super::range_sync::{RangeSync, RangeSyncType, EPOCHS_PER_BATCH}; use crate::network_beacon_processor::{ ChainSegmentProcessId, NetworkBeaconProcessor, PeerGroupAction, }; use crate::service::NetworkMessage; use crate::status::ToStatusMessage; -use crate::sync::range_sync::BATCH_BUFFER_SIZE; use beacon_chain::block_verification_types::AsBlock; use beacon_chain::{ AvailabilityProcessingStatus, BeaconChain, BeaconChainTypes, BlockError, EngineState, @@ -55,10 +53,10 @@ use futures::StreamExt; use lighthouse_network::rpc::RPCError; use lighthouse_network::service::api_types::{ BlobsByRootRequestId, BlocksByRootRequestId, BlocksByRootRequester, ComponentsByRootRequestId, - CustodyByRootRequestId, CustodyRequester, DataColumnsByRootRequestId, - DataColumnsByRootRequester, Id, SamplingId, SamplingRequester, SyncRequestId, + CustodyByRootRequestId, DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, SamplingId, + SamplingRequester, SyncRequestId, }; -use lighthouse_network::types::{NetworkGlobals, SyncState}; +use lighthouse_network::types::NetworkGlobals; use lighthouse_network::PeerId; use lighthouse_network::SyncInfo; use logging::crit; @@ -67,7 +65,7 @@ use std::ops::Sub; use std::sync::Arc; use std::time::Duration; use tokio::sync::mpsc; -use tracing::{debug, error, info, info_span, trace, warn, Instrument}; +use tracing::{debug, error, info_span, trace, warn, Instrument}; use types::{ BlobSidecar, DataColumnSidecar, EthSpec, ForkContext, Hash256, SignedBeaconBlock, Slot, }; @@ -95,15 +93,6 @@ pub enum SyncMessage { /// A useful peer has been discovered. 
AddPeer(PeerId, SyncInfo), - /// Force trigger range sync for a set of peers given a head they claim to have imported. Used - /// by block lookup to trigger range sync if a parent chain grows too large. - AddPeersForceRangeSync { - peers: Vec, - head_root: Hash256, - /// Sync lookup may not know the Slot of this head. However this situation is very rare. - head_slot: Option, - }, - /// Peer manager has received a MetaData of a peer with a new or updated CGC value. UpdatedPeerCgc(PeerId), @@ -230,11 +219,9 @@ pub struct SyncManager { /// A network context to contact the network service. network: SyncNetworkContext, - /// The object handling long-range batch load-balanced syncing. - range_sync: RangeSync, - /// Backfill syncing. backfill_sync: BackFillSync, + block_tree: BlockTree, /// debounce duplicated `UnknownBlockHashFromAttestation` for the same root peer tuple. A peer /// may forward us thousands of a attestations, each one triggering an individual event. Only @@ -242,8 +229,6 @@ pub struct SyncManager { notified_unknown_roots: LRUTimeCache<(PeerId, Hash256)>, sampling: Sampling, - - block_tree: BlockTree, } /// Spawns a new `SyncManager` thread which has a weak reference to underlying beacon @@ -257,11 +242,6 @@ pub fn spawn( sync_recv: mpsc::UnboundedReceiver>, fork_context: Arc, ) { - assert!( - beacon_chain.spec.max_request_blocks(fork_context.current_fork()) as u64 >= T::EthSpec::slots_per_epoch() * EPOCHS_PER_BATCH, - "Max blocks that can be requested in a single batch greater than max allowed blocks in a single request" - ); - // create an instance of the SyncManager let mut sync_manager = SyncManager::new( beacon_chain, @@ -270,7 +250,6 @@ pub fn spawn( sync_recv, SamplingConfig::Default, fork_context, - BATCH_BUFFER_SIZE, ); // spawn the sync manager thread @@ -293,7 +272,6 @@ impl SyncManager { sync_recv: mpsc::UnboundedReceiver>, sampling_config: SamplingConfig, fork_context: Arc, - batch_buffer_size: usize, ) -> Self { let network_globals = 
beacon_processor.network_globals.clone(); Self { @@ -305,13 +283,12 @@ impl SyncManager { beacon_chain.clone(), fork_context.clone(), ), - range_sync: RangeSync::new(beacon_chain.clone(), batch_buffer_size), + block_tree: BlockTree::new(beacon_chain.clone()), backfill_sync: BackFillSync::new(beacon_chain.clone(), network_globals), notified_unknown_roots: LRUTimeCache::new(Duration::from_secs( NOTIFIED_UNKNOWN_ROOT_EXPIRY_SECONDS, )), sampling: Sampling::new(sampling_config), - block_tree: BlockTree::new(beacon_chain.clone()), } } @@ -409,52 +386,11 @@ impl SyncManager { } } - /// Trigger range sync for a set of peers that claim to have imported a head unknown to us. - fn add_peers_force_range_sync( - &mut self, - peers: &[PeerId], - head_root: Hash256, - head_slot: Option, - ) { - let status = self.chain.status_message(); - let local = SyncInfo { - head_slot: status.head_slot, - head_root: status.head_root, - finalized_epoch: status.finalized_epoch, - finalized_root: status.finalized_root, - }; - - let head_slot = head_slot.unwrap_or_else(|| { - debug!( - local_head_slot = %local.head_slot, - ?head_root, - "On add peers force range sync assuming local head_slot" - ); - local.head_slot - }); - - let remote = SyncInfo { - head_slot, - head_root, - // Set finalized to same as local to trigger Head sync - finalized_epoch: local.finalized_epoch, - finalized_root: local.finalized_root, - }; - - for peer_id in peers { - self.range_sync - .add_peer(&mut self.network, local.clone(), *peer_id, remote.clone()); - } - } - fn updated_peer_cgc(&mut self, _peer_id: PeerId) { // Try to make progress on custody requests that are waiting for peers for (id, result) in self.network.continue_custody_by_root_requests() { self.on_custody_by_root_result(id, result); } - - // Attempt to resume range sync too - self.range_sync.resume(&mut self.network); } /// Handles RPC errors related to requests that were emitted from the sync manager. 
@@ -486,7 +422,6 @@ impl SyncManager { } // Remove peer from all data structures - self.range_sync.peer_disconnect(&mut self.network, peer_id); self.backfill_sync.peer_disconnected(peer_id); self.block_tree.remove_peer(*peer_id); @@ -563,104 +498,8 @@ impl SyncManager { /// - If there is no range sync and no required backfill and we have synced up to the currently /// known peers, we consider ourselves synced. fn update_sync_state(&mut self) { - let new_state: SyncState = match self.range_sync.state() { - Err(e) => { - crit!(error = %e, "Error getting range sync state"); - return; - } - Ok(state) => match state { - None => { - // No range sync, so we decide if we are stalled or synced. - // For this we check if there is at least one advanced peer. An advanced peer - // with Idle range is possible since a peer's status is updated periodically. - // If we synced a peer between status messages, most likely the peer has - // advanced and will produce a head chain on re-status. Otherwise it will shift - // to being synced - let mut sync_state = { - let head = self.chain.best_slot(); - let current_slot = self.chain.slot().unwrap_or_else(|_| Slot::new(0)); - - let peers = self.network_globals().peers.read(); - if current_slot >= head - && current_slot.sub(head) <= (SLOT_IMPORT_TOLERANCE as u64) - && head > 0 - { - SyncState::Synced - } else if peers.advanced_peers().next().is_some() { - SyncState::SyncTransition - } else if peers.synced_peers().next().is_none() { - SyncState::Stalled - } else { - // There are no peers that require syncing and we have at least one synced - // peer - SyncState::Synced - } - }; - - // If we would otherwise be synced, first check if we need to perform or - // complete a backfill sync. - #[cfg(not(feature = "disable-backfill"))] - if matches!(sync_state, SyncState::Synced) { - // Determine if we need to start/resume/restart a backfill sync. 
- match self.backfill_sync.start(&mut self.network) { - Ok(SyncStart::Syncing { - completed, - remaining, - }) => { - sync_state = SyncState::BackFillSyncing { - completed, - remaining, - }; - } - Ok(SyncStart::NotSyncing) => {} // Ignore updating the state if the backfill sync state didn't start. - Err(e) => { - error!(error = ?e, "Backfill sync failed to start"); - } - } - } - - // Return the sync state if backfilling is not required. - sync_state - } - Some((RangeSyncType::Finalized, start_slot, target_slot)) => { - // If there is a backfill sync in progress pause it. - #[cfg(not(feature = "disable-backfill"))] - self.backfill_sync.pause(); - - SyncState::SyncingFinalized { - start_slot, - target_slot, - } - } - Some((RangeSyncType::Head, start_slot, target_slot)) => { - // If there is a backfill sync in progress pause it. - #[cfg(not(feature = "disable-backfill"))] - self.backfill_sync.pause(); - - SyncState::SyncingHead { - start_slot, - target_slot, - } - } - }, - }; - - let old_state = self.network_globals().set_sync_state(new_state); - let new_state = self.network_globals().sync_state.read().clone(); - if !new_state.eq(&old_state) { - info!(%old_state, %new_state, "Sync state updated"); - // If we have become synced - Subscribe to all the core subnet topics - // We don't need to subscribe if the old state is a state that would have already - // invoked this call. - if new_state.is_synced() - && !matches!( - old_state, - SyncState::Synced | SyncState::BackFillSyncing { .. } - ) - { - self.network.subscribe_core_topics(); - } - } + // TODO(tree-sync): re-think how to set a sync state + todo!(); } /// The main driving future for the sync manager. 
@@ -713,13 +552,6 @@ impl SyncManager { SyncMessage::AddPeer(peer_id, info) => { self.add_peer(peer_id, info); } - SyncMessage::AddPeersForceRangeSync { - peers, - head_root, - head_slot, - } => { - self.add_peers_force_range_sync(&peers, head_root, head_slot); - } SyncMessage::UpdatedPeerCgc(peer_id) => { debug!( peer_id = ?peer_id, @@ -807,20 +639,10 @@ impl SyncManager { .handle_block_process_result(id, result, &mut self.network); self.update_sync_state(); } - ChainSegmentProcessId::BackSyncBatchId(epoch) => { - match self.backfill_sync.on_batch_process_result( - &mut self.network, - epoch, - &result, - ) { - Ok(ProcessResult::Successful) => {} - Ok(ProcessResult::SyncCompleted) => self.update_sync_state(), - Err(error) => { - error!(error = ?error, "Backfill sync failed"); - // Update the global status - self.update_sync_state(); - } - } + ChainSegmentProcessId::BackSyncBatchId(id) => { + // TODO(tree-sync): should update sync state + self.backfill_sync + .handle_block_process_result(id, result, &mut self.network) } }, SyncMessage::SampleVerified { id, result } => { @@ -919,6 +741,7 @@ impl SyncManager { EngineState::Online => { // Resume sync components. + // TODO(tree-sync): review this // - Block lookups: // We start searching for blocks again. This is done by updating the stored ee online // state. No further action required. @@ -927,10 +750,6 @@ impl SyncManager { // We start searching for parents again. This is done by updating the stored ee // online state. No further action required. - // - Range: - // Actively resume. - self.range_sync.resume(&mut self.network); - // - Backfill: // Not affected by ee states, nothing to do. 
} @@ -997,7 +816,7 @@ impl SyncManager { ); } BlocksByRootRequester::RangeSync(batch_id) => { - self.on_block_components_by_range_response( + self.on_block_components_by_root_response( batch_id, RangeBlockComponent::Block(req_id, result, peer_id), ); @@ -1016,7 +835,7 @@ impl SyncManager { .network .on_blobs_by_root_response(req_id, peer_id, block) { - self.on_block_components_by_range_response( + self.on_block_components_by_root_response( req_id.parent_request_id, RangeBlockComponent::Blob(req_id, result, peer_id), ); @@ -1099,7 +918,7 @@ impl SyncManager { id: CustodyByRootRequestId, result: CustodyRequestResult, ) { - self.on_block_components_by_range_response( + self.on_block_components_by_root_response( id.parent_request_id, RangeBlockComponent::CustodyColumns(id, result), ); @@ -1144,37 +963,11 @@ impl SyncManager { match range_request_id.requester { RangeRequestId::RangeSync(id) => { self.block_tree - .on_blocks_response(id, result, &mut self.network); + .on_block_response(id, result, &mut self.network); } - RangeRequestId::BackfillSync { batch_id } => { - match result { - Ok((blocks, batch_peers)) => { - match self.backfill_sync.on_block_response( - &mut self.network, - batch_id, - batch_peers, - range_request_id.id, - vec![blocks], - ) { - Ok(ProcessResult::SyncCompleted) => self.update_sync_state(), - Ok(ProcessResult::Successful) => {} - Err(_error) => { - // The backfill sync has failed, errors are reported - // within. 
- self.update_sync_state(); - } - } - } - Err(e) => match self.backfill_sync.inject_error( - &mut self.network, - batch_id, - range_request_id.id, - e, - ) { - Ok(_) => {} - Err(_) => self.update_sync_state(), - }, - } + RangeRequestId::BackfillSync(id) => { + self.backfill_sync + .on_block_response(id, result, &mut self.network) } } } diff --git a/beacon_node/network/src/sync/mod.rs b/beacon_node/network/src/sync/mod.rs index a0460c5beb4..22a52544e63 100644 --- a/beacon_node/network/src/sync/mod.rs +++ b/beacon_node/network/src/sync/mod.rs @@ -7,10 +7,8 @@ pub mod manager; mod network_context; mod peer_sampling; mod peer_sync_info; -mod range_sync; #[cfg(test)] mod tests; pub use lighthouse_network::service::api_types::SamplingId; pub use manager::{BatchProcessResult, SyncMessage}; -pub use range_sync::{BatchOperationOutcome, ChainId}; diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 870e98cccee..e5595bf65ee 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -2,7 +2,6 @@ //! channel and stores a global RPC ID to perform requests. 
use self::custody_by_root::ActiveCustodyByRootRequest; -use super::range_sync::BatchPeers; use super::SyncMessage; use crate::metrics; use crate::network_beacon_processor::NetworkBeaconProcessor; @@ -11,22 +10,20 @@ use crate::network_beacon_processor::TestBeaconChainType; use crate::service::NetworkMessage; use crate::status::ToStatusMessage; use beacon_chain::block_verification_types::RpcBlock; -use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessStatus, EngineState}; +use beacon_chain::{BeaconChain, BeaconChainTypes, EngineState}; pub use block_components_by_range::BlockComponentsByRootRequest; #[cfg(test)] pub use block_components_by_range::BlockComponentsByRootRequestStep; use fnv::FnvHashMap; use lighthouse_network::rpc::methods::{ - BlobsByRangeRequest, BlobsByRootRequest, BlocksByRootRequest, DataColumnsByRangeRequest, - DataColumnsByRootRequest, + BlobsByRootRequest, BlocksByRootRequest, DataColumnsByRootRequest, }; -use lighthouse_network::rpc::{BlocksByRangeRequest, GoodbyeReason, RPCError, RequestType}; +use lighthouse_network::rpc::{GoodbyeReason, RPCError, RequestType}; pub use lighthouse_network::service::api_types::RangeRequestId; use lighthouse_network::service::api_types::{ AppRequestId, BlobsByRootRequestId, BlocksByRootRequestId, BlocksByRootRequester, - ComponentsByRootRequestId, CustodyByRootRequestId, CustodyRequester, - DataColumnsByRootRequestId, DataColumnsByRootRequester, HeaderLookupId, Id, SingleLookupReqId, - SyncRequestId, + ComponentsByRootRequestId, CustodyByRootRequestId, DataColumnsByRootRequestId, + DataColumnsByRootRequester, Id, SyncRequestId, }; use lighthouse_network::{Client, NetworkGlobals, PeerAction, PeerId, ReportSource}; use parking_lot::RwLock; @@ -49,7 +46,7 @@ use types::blob_sidecar::FixedBlobSidecarList; use types::{ BlobIdentifier, BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, DataColumnsByRootIdentifier, EthSpec, ForkContext, ForkName, Hash256, RuntimeVariableList, - 
SignedBeaconBlock, SignedBeaconBlockHeader, Slot, + SignedBeaconBlock, }; pub mod block_components_by_range; @@ -162,6 +159,35 @@ impl PeerGroup { } } +#[derive(Clone, Debug)] +pub struct BatchPeers { + block_peer: PeerId, + column_peers: PeerGroup, +} + +impl BatchPeers { + pub fn new_from_block_peer(block_peer: PeerId) -> Self { + Self { + block_peer, + column_peers: PeerGroup::empty(), + } + } + pub fn new(block_peer: PeerId, column_peers: PeerGroup) -> Self { + Self { + block_peer, + column_peers, + } + } + + pub fn block(&self) -> PeerId { + self.block_peer + } + + pub fn column(&self, index: &ColumnIndex) -> Option<&PeerId> { + self.column_peers.of_index(&((*index) as usize)) + } +} + /// Sequential ID that uniquely identifies ReqResp outgoing requests pub type ReqId = u32; diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index 69a09934273..e0f61c6f839 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -1,23 +1,20 @@ use crate::sync::network_context::{ - BlocksByRootSameForkRequest, PeerGroup, RpcRequestSendError, RpcResponseError, - SyncNetworkContext, + BatchPeers, PeerGroup, RpcRequestSendError, RpcResponseError, SyncNetworkContext, }; -use crate::sync::range_sync::BatchPeers; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::data_column_verification::CustodyDataColumn; use beacon_chain::{get_block_root, BeaconChainTypes}; -use lighthouse_network::rpc::methods::BlocksByRootRequest; use lighthouse_network::service::api_types::{ - BlobsByRangeRequestId, BlobsByRootRequestId, BlocksByRootRequestId, BlocksByRootRequester, - ComponentsByRootRequestId, CustodyByRangeRequestId, CustodyByRootRequestId, + BlobsByRootRequestId, BlocksByRootRequestId, BlocksByRootRequester, ComponentsByRootRequestId, + 
CustodyByRootRequestId, }; use lighthouse_network::PeerId; use parking_lot::RwLock; use std::collections::{HashMap, HashSet}; use std::sync::Arc; use types::{ - BeaconBlockHeader, BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecarList, EthSpec, - Hash256, RuntimeVariableList, SignedBeaconBlock, Slot, + BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecarList, EthSpec, Hash256, + RuntimeVariableList, SignedBeaconBlock, }; /// Given a `BlocksByRootRequest` (a collection of block roots) fetches all necessary data to diff --git a/beacon_node/network/src/sync/network_context/custody_by_root.rs b/beacon_node/network/src/sync/network_context/custody_by_root.rs index 79f463d8735..ab09e1a8674 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_root.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_root.rs @@ -4,7 +4,6 @@ use crate::sync::network_context::{ use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::BeaconChainTypes; use fnv::FnvHashMap; -use lighthouse_network::rpc::methods::DataColumnsByRootRequest; use lighthouse_network::service::api_types::{CustodyByRootRequestId, DataColumnsByRootRequester}; use lighthouse_network::PeerId; use lru_cache::LRUTimeCache; @@ -15,10 +14,7 @@ use std::time::{Duration, Instant}; use std::{collections::HashMap, marker::PhantomData, sync::Arc}; use strum::IntoStaticStr; use tracing::{debug, warn}; -use types::{ - data_column_sidecar::ColumnIndex, DataColumnSidecar, DataColumnSidecarList, - DataColumnsByRootIdentifier, Hash256, RuntimeVariableList, -}; +use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Hash256}; use super::{LookupRequestResult, PeerGroup, RpcResponseResult, SyncNetworkContext}; diff --git a/beacon_node/network/src/sync/network_context/requests.rs b/beacon_node/network/src/sync/network_context/requests.rs index 505c65c2f92..d648978cc26 100644 --- a/beacon_node/network/src/sync/network_context/requests.rs +++ 
b/beacon_node/network/src/sync/network_context/requests.rs @@ -6,11 +6,8 @@ use lighthouse_network::PeerId; use strum::IntoStaticStr; use types::{Hash256, Slot}; -pub use blobs_by_range::BlobsByRangeRequestItems; pub use blobs_by_root::BlobsByRootRequestItems; -pub use blocks_by_range::BlocksByRangeRequestItems; pub use blocks_by_root::BlocksByRootRequestItems; -pub use data_columns_by_range::DataColumnsByRangeRequestItems; pub use data_columns_by_root::DataColumnsByRootRequestItems; use crate::metrics; diff --git a/beacon_node/network/src/sync/network_context/requests/blobs_by_root.rs b/beacon_node/network/src/sync/network_context/requests/blobs_by_root.rs index 2f1d2ab408a..1fa9763cf0f 100644 --- a/beacon_node/network/src/sync/network_context/requests/blobs_by_root.rs +++ b/beacon_node/network/src/sync/network_context/requests/blobs_by_root.rs @@ -1,6 +1,5 @@ -use lighthouse_network::rpc::methods::BlobsByRootRequest; use std::sync::Arc; -use types::{blob_sidecar::BlobIdentifier, BlobSidecar, EthSpec, ForkContext, Hash256}; +use types::{BlobSidecar, EthSpec, Hash256}; use super::{ActiveRequestItems, LookupVerifyError}; diff --git a/beacon_node/network/src/sync/network_context/requests/blocks_by_root.rs b/beacon_node/network/src/sync/network_context/requests/blocks_by_root.rs index 0eb9ce79936..e80e70d9c3d 100644 --- a/beacon_node/network/src/sync/network_context/requests/blocks_by_root.rs +++ b/beacon_node/network/src/sync/network_context/requests/blocks_by_root.rs @@ -1,7 +1,6 @@ use beacon_chain::get_block_root; -use lighthouse_network::rpc::BlocksByRootRequest; use std::sync::Arc; -use types::{EthSpec, ForkContext, Hash256, SignedBeaconBlock}; +use types::{EthSpec, Hash256, SignedBeaconBlock}; use super::{ActiveRequestItems, LookupVerifyError}; diff --git a/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs b/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs index 6c1c1ace2fc..dba4a71794a 100644 --- 
a/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs +++ b/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs @@ -1,9 +1,5 @@ -use lighthouse_network::rpc::methods::DataColumnsByRootRequest; use std::sync::Arc; -use types::{ - ChainSpec, DataColumnSidecar, DataColumnsByRootIdentifier, EthSpec, ForkName, Hash256, - RuntimeVariableList, -}; +use types::{DataColumnSidecar, EthSpec, Hash256}; use super::{ActiveRequestItems, LookupVerifyError}; diff --git a/beacon_node/network/src/sync/peer_sampling.rs b/beacon_node/network/src/sync/peer_sampling.rs index d0a220fdcd0..e92e5365f9b 100644 --- a/beacon_node/network/src/sync/peer_sampling.rs +++ b/beacon_node/network/src/sync/peer_sampling.rs @@ -5,7 +5,6 @@ use super::network_context::{RpcResponseError, SyncNetworkContext}; use crate::metrics; use beacon_chain::BeaconChainTypes; use fnv::FnvHashMap; -use lighthouse_network::rpc::methods::DataColumnsByRootRequest; use lighthouse_network::service::api_types::{ DataColumnsByRootRequester, SamplingId, SamplingRequestId, SamplingRequester, }; @@ -16,10 +15,7 @@ use std::{ time::Duration, }; use tracing::{debug, error, instrument, warn}; -use types::{ - data_column_sidecar::ColumnIndex, ChainSpec, DataColumnSidecar, DataColumnsByRootIdentifier, - Hash256, RuntimeVariableList, -}; +use types::{data_column_sidecar::ColumnIndex, ChainSpec, DataColumnSidecar, Hash256}; pub type SamplingResult = Result<(), SamplingError>; diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs deleted file mode 100644 index 280157957c8..00000000000 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ /dev/null @@ -1,489 +0,0 @@ -use crate::sync::network_context::PeerGroup; -use beacon_chain::block_verification_types::RpcBlock; -use lighthouse_network::rpc::methods::{BlocksByRangeRequest, BlocksByRootRequest}; -use lighthouse_network::service::api_types::Id; -use 
lighthouse_network::PeerId; -use std::collections::HashSet; -use std::fmt; -use std::hash::{Hash, Hasher}; -use std::ops::Sub; -use std::time::{Duration, Instant}; -use strum::Display; -use types::{ChainSpec, ColumnIndex, Epoch, EthSpec, ForkName, Hash256, Slot}; - -/// The number of times to retry a batch before it is considered failed. -const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5; - -/// Invalid batches are attempted to be re-downloaded from other peers. If a batch cannot be processed -/// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty. -const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3; - -#[derive(Clone, Debug)] -pub struct BatchPeers { - block_peer: PeerId, - column_peers: PeerGroup, -} - -impl BatchPeers { - pub fn new_from_block_peer(block_peer: PeerId) -> Self { - Self { - block_peer, - column_peers: PeerGroup::empty(), - } - } - pub fn new(block_peer: PeerId, column_peers: PeerGroup) -> Self { - Self { - block_peer, - column_peers, - } - } - - pub fn block(&self) -> PeerId { - self.block_peer - } - - pub fn column(&self, index: &ColumnIndex) -> Option<&PeerId> { - self.column_peers.of_index(&((*index) as usize)) - } -} - -/// Allows customisation of the above constants used in other sync methods such as BackFillSync. -pub trait BatchConfig { - /// The maximum batch download attempts. - fn max_batch_download_attempts() -> u8; - /// The max batch processing attempts. - fn max_batch_processing_attempts() -> u8; - /// Hashing function of a batch's attempt. Used for scoring purposes. - /// - /// When a batch fails processing, it is possible that the batch is wrong (faulty or - /// incomplete) or that a previous one is wrong. For this reason we need to re-download and - /// re-process the batches awaiting validation and the current one. Consider this scenario: - /// - /// ```ignore - /// BatchA BatchB BatchC BatchD - /// -----X Empty Empty Y----- - /// ``` - /// - /// BatchA declares that it refers X, but BatchD declares that it's first block is Y. 
There is no - /// way to know if BatchD is faulty/incomplete or if batches B and/or C are missing blocks. It is - /// also possible that BatchA belongs to a different chain to the rest starting in some block - /// midway in the batch's range. For this reason, the four batches would need to be re-downloaded - /// and re-processed. - /// - /// If batchD was actually good, it will still register two processing attempts for the same set of - /// blocks. In this case, we don't want to penalize the peer that provided the first version, since - /// it's equal to the successfully processed one. - /// - /// The function `batch_attempt_hash` provides a way to compare two batch attempts without - /// storing the full set of blocks. - /// - /// Note that simpler hashing functions considered in the past (hash of first block, hash of last - /// block, number of received blocks) are not good enough to differentiate attempts. For this - /// reason, we hash the complete set of blocks both in RangeSync and BackFillSync. - fn batch_attempt_hash(blocks: &[RpcBlock]) -> u64; -} - -#[derive(Debug)] -pub struct RangeSyncBatchConfig {} - -impl BatchConfig for RangeSyncBatchConfig { - fn max_batch_download_attempts() -> u8 { - MAX_BATCH_DOWNLOAD_ATTEMPTS - } - fn max_batch_processing_attempts() -> u8 { - MAX_BATCH_PROCESSING_ATTEMPTS - } - fn batch_attempt_hash(blocks: &[RpcBlock]) -> u64 { - let mut hasher = std::collections::hash_map::DefaultHasher::new(); - blocks.hash(&mut hasher); - hasher.finish() - } -} - -/// Error type of a batch in a wrong state. -// Such errors should never be encountered. -pub struct WrongState(pub(crate) String); - -/// After batch operations, we use this to communicate whether a batch can continue or not -pub enum BatchOperationOutcome { - Continue, - Failed { blacklist: bool }, -} - -pub enum BatchProcessingResult { - Success, - FaultyFailure(Vec), - NonFaultyFailure, -} - -#[derive(Debug)] -/// A segment of a chain. 
-pub struct BatchInfo { - /// Start slot of the batch. - block_roots: Vec, - /// The `Attempts` that have been made and failed to send us this batch. - failed_processing_attempts: Vec, - /// Number of processing attempts that have failed but we do not count. - non_faulty_processing_attempts: u8, - /// The number of download retries this batch has undergone due to a failed request. - failed_download_attempts: usize, - /// Peers that returned bad data, and we want to de-prioritize - failed_peers: HashSet, - /// State of the batch. - state: BatchState, - /// Pin the generic - marker: std::marker::PhantomData, -} - -#[derive(Display)] -/// Current state of a batch -pub enum BatchState { - /// The batch has failed either downloading or processing, but can be requested again. - AwaitingDownload, - /// The batch is being downloaded. - Downloading(Id), - /// The batch has been completely downloaded and is ready for processing. - AwaitingProcessing(BatchPeers, Vec>, Instant), - /// The batch is being processed. - Processing(Attempt), - /// The batch was successfully processed and is waiting to be validated. - /// - /// It is not sufficient to process a batch successfully to consider it correct. This is - /// because batches could be erroneously empty, or incomplete. Therefore, a batch is considered - /// valid, only if the next sequential batch imports at least a block. - AwaitingValidation(Attempt), - /// Intermediate state for inner state handling. - Poisoned, - /// The batch has maxed out the allowed attempts for either downloading or processing. It - /// cannot be recovered. - Failed, -} - -impl BatchState { - /// Helper function for poisoning a state. - pub fn poison(&mut self) -> BatchState { - std::mem::replace(self, BatchState::Poisoned) - } -} - -impl BatchInfo { - /// Batches are downloaded excluding the first block of the epoch assuming it has already been - /// downloaded. - /// - /// For example: - /// - /// Epoch boundary | | - /// ... 
| 30 | 31 | 32 | 33 | 34 | ... | 61 | 62 | 63 | 64 | 65 | - /// Batch 1 | Batch 2 | Batch 3 - /// - /// NOTE: Removed the shift by one for deneb because otherwise the last batch before the blob - /// fork boundary will be of mixed type (all blocks and one last blockblob), and I don't want to - /// deal with this for now. - /// This means finalization might be slower in deneb - pub fn new(block_roots: Vec) -> Self { - BatchInfo { - block_roots, - failed_processing_attempts: Vec::new(), - failed_download_attempts: 0, - failed_peers: <_>::default(), - non_faulty_processing_attempts: 0, - state: BatchState::AwaitingDownload, - marker: std::marker::PhantomData, - } - } - - /// Gives a list of peers from which this batch has had a failed download or processing - /// attempt. - pub fn failed_peers(&self) -> &HashSet { - &self.failed_peers - } - - /// Verifies if an incoming block belongs to this batch. - pub fn is_expecting_block(&self, request_id: &Id) -> bool { - if let BatchState::Downloading(expected_id) = &self.state { - return expected_id == request_id; - } - false - } - - /// Returns the peers that provided this batch's downloaded contents - pub fn processing_peers(&self) -> Option<&BatchPeers> { - match &self.state { - BatchState::AwaitingDownload | BatchState::Failed | BatchState::Downloading(..) => None, - BatchState::AwaitingProcessing(peers, _, _) - | BatchState::Processing(Attempt { peers, .. }) - | BatchState::AwaitingValidation(Attempt { peers, .. }) => Some(peers), - BatchState::Poisoned => unreachable!("Poisoned batch"), - } - } - - /// Returns the count of stored pending blocks if in awaiting processing state - pub fn pending_blocks(&self) -> usize { - match &self.state { - BatchState::AwaitingProcessing(_, blocks, _) => blocks.len(), - BatchState::AwaitingDownload - | BatchState::Downloading { .. } - | BatchState::Processing { .. } - | BatchState::AwaitingValidation { .. 
} - | BatchState::Poisoned - | BatchState::Failed => 0, - } - } - - /// Returns a BlocksByRange request associated with the batch. - pub fn to_blocks_by_range_request(&self) -> BlocksByRangeRequest { - todo!(); - } - - pub fn to_blocks_by_root_request(&self, spec: &ChainSpec) -> BlocksByRootRequest { - // TODO: Is it necessary to pass ForkName to BlocksByRootRequest - BlocksByRootRequest::new(self.block_roots.clone(), spec, ForkName::Fulu) - } - - /// After different operations over a batch, this could be in a state that allows it to - /// continue, or in failed state. When the batch has failed, we check if it did mainly due to - /// processing failures. In this case the batch is considered failed and faulty. - pub fn outcome(&self) -> BatchOperationOutcome { - match self.state { - BatchState::Poisoned => unreachable!("Poisoned batch"), - BatchState::Failed => BatchOperationOutcome::Failed { - blacklist: self.failed_processing_attempts.len() > self.failed_download_attempts, - }, - _ => BatchOperationOutcome::Continue, - } - } - - pub fn state(&self) -> &BatchState { - &self.state - } - - pub fn attempts(&self) -> &[Attempt] { - &self.failed_processing_attempts - } - - /// Marks the batch as ready to be processed if the blocks are in the range. The number of - /// received blocks is returned, or the wrong batch end on failure - #[must_use = "Batch may have failed"] - pub fn download_completed( - &mut self, - blocks: Vec>, - batch_peers: BatchPeers, - ) -> Result { - match self.state.poison() { - BatchState::Downloading(_request_id) => { - let received = blocks.len(); - self.state = BatchState::AwaitingProcessing(batch_peers, blocks, Instant::now()); - Ok(received) - } - BatchState::Poisoned => unreachable!("Poisoned batch"), - other => { - self.state = other; - Err(WrongState(format!( - "Download completed for batch in wrong state {:?}", - self.state - ))) - } - } - } - - /// Mark the batch as failed and return whether we can attempt a re-download. 
- /// - /// This can happen if a peer disconnects or some error occurred that was not the peers fault. - /// The `peer` parameter, when set to None, does not increment the failed attempts of - /// this batch and register the peer, rather attempts a re-download. - #[must_use = "Batch may have failed"] - pub fn download_failed(&mut self) -> Result { - match self.state.poison() { - BatchState::Downloading(_request_id) => { - self.failed_download_attempts += 1; - - self.state = - if self.failed_download_attempts >= B::max_batch_download_attempts() as usize { - BatchState::Failed - } else { - // drop the blocks - BatchState::AwaitingDownload - }; - Ok(self.outcome()) - } - BatchState::Poisoned => unreachable!("Poisoned batch"), - other => { - self.state = other; - Err(WrongState(format!( - "Download failed for batch in wrong state {:?}", - self.state - ))) - } - } - } - - pub fn start_downloading(&mut self, request_id: Id) -> Result<(), WrongState> { - match self.state.poison() { - BatchState::AwaitingDownload => { - self.state = BatchState::Downloading(request_id); - Ok(()) - } - BatchState::Poisoned => unreachable!("Poisoned batch"), - other => { - self.state = other; - Err(WrongState(format!( - "Starting download for batch in wrong state {:?}", - self.state - ))) - } - } - } - - pub fn start_processing(&mut self) -> Result<(Vec>, Duration), WrongState> { - match self.state.poison() { - BatchState::AwaitingProcessing(peers, blocks, start_instant) => { - self.state = BatchState::Processing(Attempt::new::(peers, &blocks)); - Ok((blocks, start_instant.elapsed())) - } - BatchState::Poisoned => unreachable!("Poisoned batch"), - other => { - self.state = other; - Err(WrongState(format!( - "Starting procesing batch in wrong state {:?}", - self.state - ))) - } - } - } - - #[must_use = "Batch may have failed"] - pub fn processing_completed( - &mut self, - procesing_result: BatchProcessingResult, - ) -> Result { - match self.state.poison() { - BatchState::Processing(attempt) => 
{ - self.state = match procesing_result { - BatchProcessingResult::Success => BatchState::AwaitingValidation(attempt), - BatchProcessingResult::FaultyFailure(failed_peers) => { - // register the failed attempt - self.failed_processing_attempts.push(attempt); - for peer in failed_peers { - self.failed_peers.insert(peer); - } - - // check if the batch can be downloaded again - if self.failed_processing_attempts.len() - >= B::max_batch_processing_attempts() as usize - { - BatchState::Failed - } else { - BatchState::AwaitingDownload - } - } - BatchProcessingResult::NonFaultyFailure => { - self.non_faulty_processing_attempts = - self.non_faulty_processing_attempts.saturating_add(1); - BatchState::AwaitingDownload - } - }; - Ok(self.outcome()) - } - BatchState::Poisoned => unreachable!("Poisoned batch"), - other => { - self.state = other; - Err(WrongState(format!( - "Procesing completed for batch in wrong state: {:?}", - self.state - ))) - } - } - } - - #[must_use = "Batch may have failed"] - pub fn validation_failed(&mut self) -> Result { - match self.state.poison() { - BatchState::AwaitingValidation(attempt) => { - self.failed_processing_attempts.push(attempt); - - // check if the batch can be downloaded again - self.state = if self.failed_processing_attempts.len() - >= B::max_batch_processing_attempts() as usize - { - BatchState::Failed - } else { - BatchState::AwaitingDownload - }; - Ok(self.outcome()) - } - BatchState::Poisoned => unreachable!("Poisoned batch"), - other => { - self.state = other; - Err(WrongState(format!( - "Validation failed for batch in wrong state: {:?}", - self.state - ))) - } - } - } - - // Visualizes the state of this batch using state::visualize() - pub fn visualize(&self) -> char { - self.state.visualize() - } -} - -/// Represents a batch attempt awaiting validation -/// -/// Invalid attempts will downscore its peers -#[derive(Debug)] -pub struct Attempt { - /// The peers that served this batch contents - peers: BatchPeers, - /// The hash of 
the blocks of the attempt. - pub hash: u64, -} - -impl Attempt { - fn new(peers: BatchPeers, blocks: &[RpcBlock]) -> Self { - let hash = B::batch_attempt_hash(blocks); - Attempt { peers, hash } - } - - pub fn block_peer(&self) -> PeerId { - self.peers.block() - } -} - -impl std::fmt::Debug for BatchState { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - BatchState::Processing(Attempt { ref peers, hash: _ }) => { - write!(f, "Processing({})", peers.block()) - } - BatchState::AwaitingValidation(Attempt { ref peers, hash: _ }) => { - write!(f, "AwaitingValidation({})", peers.block()) - } - BatchState::AwaitingDownload => f.write_str("AwaitingDownload"), - BatchState::Failed => f.write_str("Failed"), - BatchState::AwaitingProcessing(_, ref blocks, _) => { - write!(f, "AwaitingProcessing({} blocks)", blocks.len()) - } - BatchState::Downloading(request_id) => { - write!(f, "Downloading({})", request_id) - } - BatchState::Poisoned => f.write_str("Poisoned"), - } - } -} - -impl BatchState { - /// Creates a character representation/visualization for the batch state to display in logs for quicker and - /// easier recognition - fn visualize(&self) -> char { - match self { - BatchState::Downloading(..) => 'D', - BatchState::Processing(_) => 'P', - BatchState::AwaitingValidation(_) => 'v', - BatchState::AwaitingDownload => 'd', - BatchState::Failed => 'F', - BatchState::AwaitingProcessing(..) 
=> 'p', - BatchState::Poisoned => 'X', - } - } -} diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs deleted file mode 100644 index 82c6b24a0c6..00000000000 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ /dev/null @@ -1,1150 +0,0 @@ -use super::batch::{BatchInfo, BatchPeers, BatchProcessingResult, BatchState}; -use super::RangeSyncType; -use crate::metrics; -use crate::network_beacon_processor::ChainSegmentProcessId; -use crate::sync::network_context::{RangeRequestId, RpcRequestSendError, RpcResponseError}; -use crate::sync::{network_context::SyncNetworkContext, BatchOperationOutcome, BatchProcessResult}; -use beacon_chain::block_verification_types::RpcBlock; -use beacon_chain::BeaconChainTypes; -use itertools::Itertools; -use lighthouse_network::service::api_types::Id; -use lighthouse_network::{PeerAction, PeerId}; -use logging::crit; -use parking_lot::RwLock; -use std::collections::{btree_map::Entry, BTreeMap, HashSet}; -use std::sync::Arc; -use strum::IntoStaticStr; -use tracing::{debug, instrument, warn}; -use types::{Epoch, EthSpec, Hash256, Slot}; - -/// Blocks are downloaded in batches from peers. This constant specifies how many epochs worth of -/// blocks per batch are requested _at most_. A batch may request less blocks to account for -/// already requested slots. There is a timeout for each batch request. If this value is too high, -/// we will negatively report peers with poor bandwidth. This can be set arbitrarily high, in which -/// case the responder will fill the response up to the max request size, assuming they have the -/// bandwidth to do so. -pub const EPOCHS_PER_BATCH: u64 = 1; - -/// The maximum number of batches to queue before requesting more. 
-pub const BATCH_BUFFER_SIZE: usize = 5; - -/// A return type for functions that act on a `Chain` which informs the caller whether the chain -/// has been completed and should be removed or to be kept if further processing is -/// required. -/// -/// Should be checked, since a failed chain must be removed. A chain that requested being removed -/// and continued is now in an inconsistent state. -pub type ProcessingResult = Result; - -/// Reasons for removing a chain -#[derive(Debug)] -#[allow(dead_code)] -pub enum RemoveChain { - EmptyPeerPool, - ChainCompleted, - /// A chain has failed. This boolean signals whether the chain should be blacklisted. - ChainFailed { - blacklist: bool, - failing_batch: BatchId, - }, - WrongBatchState(String), - WrongChainState(String), -} - -#[derive(Debug)] -pub struct KeepChain; - -/// A chain identifier -pub type ChainId = Id; -pub type BatchId = Epoch; - -#[derive(Debug, Copy, Clone, IntoStaticStr)] -pub enum SyncingChainType { - Head, - Finalized, - Backfill, -} - -/// A chain of blocks that need to be downloaded. Peers who claim to contain the target head -/// root are grouped into the peer pool and queried for batches when downloading the -/// chain. -#[derive(Debug)] -pub struct SyncingChain { - /// A random id used to identify this chain. - id: ChainId, - - /// SyncingChain type - pub chain_type: SyncingChainType, - - /// The start of the chain segment. Any epoch previous to this one has been validated. - pub start_epoch: Epoch, - - /// The target head slot. - pub target_head_slot: Slot, - - /// The target head root. - pub target_head_root: Hash256, - - /// Sorted map of batches undergoing some kind of processing. - batches: BTreeMap>, - - /// The peers that agree on the `target_head_slot` and `target_head_root` as a canonical chain - /// and thus available to download this chain from. - peers: Arc>>, - - /// Starting epoch of the next batch that needs to be downloaded. 
- to_be_downloaded: BatchId, - - /// Starting epoch of the batch that needs to be processed next. - /// This is incremented as the chain advances. - processing_target: BatchId, - - /// Optimistic head to sync. - /// If a block is imported for this batch, the chain advances to this point. - optimistic_start: Option, - - /// When a batch for an optimistic start is tried (either successful or not), it is stored to - /// avoid trying it again due to chain stopping/re-starting on chain switching. - attempted_optimistic_starts: HashSet, - - /// The current state of the chain. - pub state: ChainSyncingState, - - /// The current processing batch, if any. - current_processing_batch: Option, - - /// The maximum number of batches to queue before requesting more. - batch_buffer_size: usize, -} - -#[derive(PartialEq, Debug)] -pub enum ChainSyncingState { - /// The chain is not being synced. - Stopped, - /// The chain is undergoing syncing. - Syncing, -} - -impl SyncingChain { - /// Leaks the state of all active batches for assertions in tests. 
- #[cfg(test)] - pub fn batches_state(&self) -> Vec<(BatchId, &BatchState)> { - self.batches - .iter() - .map(|(id, batch)| (*id, batch.state())) - .collect() - } - - #[allow(clippy::too_many_arguments)] - pub fn new( - id: Id, - start_epoch: Epoch, - target_head_slot: Slot, - target_head_root: Hash256, - peer_id: PeerId, - chain_type: SyncingChainType, - batch_buffer_size: usize, - ) -> Self { - SyncingChain { - id, - chain_type, - start_epoch, - target_head_slot, - target_head_root, - batches: BTreeMap::new(), - peers: Arc::new(RwLock::new(HashSet::from_iter([peer_id]))), - to_be_downloaded: start_epoch, - processing_target: start_epoch, - optimistic_start: None, - attempted_optimistic_starts: HashSet::default(), - state: ChainSyncingState::Stopped, - current_processing_batch: None, - batch_buffer_size, - } - } - - /// Returns true if this chain has the same target - pub fn has_same_target(&self, target_head_slot: Slot, target_head_root: Hash256) -> bool { - self.target_head_slot == target_head_slot && self.target_head_root == target_head_root - } - - /// Check if the chain has peers from which to process batches. - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn available_peers(&self) -> usize { - self.peers.read().len() - } - - /// Get the chain's id. - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn id(&self) -> ChainId { - self.id - } - - /// Peers currently syncing this chain. 
- #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn peers(&self) -> impl Iterator + '_ { - self.peers - .read() - .iter() - .copied() - .collect::>() - .into_iter() - } - - /// Progress in epochs made by the chain - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn processed_epochs(&self) -> u64 { - self.processing_target - .saturating_sub(self.start_epoch) - .into() - } - - /// Returns the total count of pending blocks in all the batches of this chain - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn pending_blocks(&self) -> usize { - self.batches - .values() - .map(|batch| batch.pending_blocks()) - .sum() - } - - /// Removes a peer from the chain. - /// If the peer has active batches, those are considered failed and re-requested. - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn remove_peer(&mut self, peer_id: &PeerId) -> ProcessingResult { - self.peers.write().remove(peer_id); - - if self.peers.read().is_empty() { - Err(RemoveChain::EmptyPeerPool) - } else { - Ok(KeepChain) - } - } - - /// Returns the latest slot number that has been processed. - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - fn current_processed_slot(&self) -> Slot { - // the last slot we processed was included in the previous batch, and corresponds to the - // first slot of the current target epoch - self.processing_target - .start_slot(T::EthSpec::slots_per_epoch()) - } - - /// A block has been received for a batch on this chain. - /// If the block correctly completes the batch it will be processed if possible. 
- #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn on_block_response( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - batch_peers: BatchPeers, - request_id: Id, - blocks: Vec>, - ) -> ProcessingResult { - // check if we have this batch - let batch = match self.batches.get_mut(&batch_id) { - None => { - debug!(epoch = %batch_id, "Received a block for unknown batch"); - // A batch might get removed when the chain advances, so this is non fatal. - return Ok(KeepChain); - } - Some(batch) => { - // A batch could be retried without the peer failing the request (disconnecting/ - // sending an error /timeout) if the peer is removed from the chain for other - // reasons. Check that this block belongs to the expected peer, and that the - // request_id matches - // TODO(das): removed peer_id matching as the node may request a different peer for data - // columns. - if !batch.is_expecting_block(&request_id) { - return Ok(KeepChain); - } - batch - } - }; - - // A stream termination has been sent. This batch has ended. Process a completed batch. - // Remove the request from the peer's active batches - - let received = batch.download_completed(blocks, batch_peers)?; - let awaiting_batches = batch_id - .saturating_sub(self.optimistic_start.unwrap_or(self.processing_target)) - / EPOCHS_PER_BATCH; - debug!(epoch = %batch_id, blocks = received, batch_state = self.visualize_batch_state(), %awaiting_batches,"Batch downloaded"); - - // pre-emptively request more blocks from peers whilst we process current blocks, - self.request_batches(network)?; - self.process_completed_batches(network) - } - - /// Processes the batch with the given id. 
- /// The batch must exist and be ready for processing - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - fn process_batch( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - ) -> ProcessingResult { - // Only process batches if this chain is Syncing, and only one at a time - if self.state != ChainSyncingState::Syncing || self.current_processing_batch.is_some() { - return Ok(KeepChain); - } - - let Some(beacon_processor) = network.beacon_processor_if_enabled() else { - return Ok(KeepChain); - }; - - let Some(batch) = self.batches.get_mut(&batch_id) else { - return Err(RemoveChain::WrongChainState(format!( - "Trying to process a batch that does not exist: {}", - batch_id - ))); - }; - - // NOTE: We send empty batches to the processor in order to trigger the block processor - // result callback. This is done, because an empty batch could end a chain and the logic - // for removing chains and checking completion is in the callback. - - let (blocks, duration_in_awaiting_processing) = batch.start_processing()?; - metrics::observe_duration( - &metrics::SYNCING_CHAIN_BATCH_AWAITING_PROCESSING, - duration_in_awaiting_processing, - ); - - let process_id = todo!(); - self.current_processing_batch = Some(batch_id); - - if let Err(e) = beacon_processor.send_chain_segment(process_id, blocks) { - crit!(msg = "process_batch",error = %e, batch = ?self.processing_target, "Failed to send chain segment to processor."); - // This is unlikely to happen but it would stall syncing since the batch now has no - // blocks to continue, and the chain is expecting a processing result that won't - // arrive. To mitigate this, (fake) fail this processing so that the batch is - // re-downloaded. - self.on_batch_process_result(network, batch_id, &BatchProcessResult::NonFaultyFailure) - } else { - Ok(KeepChain) - } - } - - /// Processes the next ready batch, prioritizing optimistic batches over the processing target. 
- #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - fn process_completed_batches( - &mut self, - network: &mut SyncNetworkContext, - ) -> ProcessingResult { - // Only process batches if this chain is Syncing and only process one batch at a time - if self.state != ChainSyncingState::Syncing || self.current_processing_batch.is_some() { - return Ok(KeepChain); - } - - // Find the id of the batch we are going to process. - // - // First try our optimistic start, if any. If this batch is ready, we process it. If the - // batch has not already been completed, check the current chain target. - if let Some(epoch) = self.optimistic_start { - if let Some(batch) = self.batches.get(&epoch) { - let state = batch.state(); - match state { - BatchState::AwaitingProcessing(..) => { - // this batch is ready - debug!(%epoch, "Processing optimistic start"); - return self.process_batch(network, epoch); - } - BatchState::Downloading(..) => { - // The optimistic batch is being downloaded. We wait for this before - // attempting to process other batches. - return Ok(KeepChain); - } - BatchState::Poisoned => unreachable!("Poisoned batch"), - BatchState::Processing(_) - | BatchState::AwaitingDownload - | BatchState::Failed => { - // these are all inconsistent states: - // - Processing -> `self.current_processing_batch` is None - // - Failed -> non recoverable batch. For an optimistic batch, it should - // have been removed - // - AwaitingDownload -> A recoverable failed batch should have been - // re-requested. - return Err(RemoveChain::WrongChainState(format!( - "Optimistic batch indicates inconsistent chain state: {:?}", - state - ))); - } - BatchState::AwaitingValidation(_) => { - // If an optimistic start is given to the chain after the corresponding - // batch has been requested and processed we can land here. 
We drop the - // optimistic candidate since we can't conclude whether the batch included - // blocks or not at this point - debug!(batch = %epoch, "Dropping optimistic candidate"); - self.optimistic_start = None; - } - } - } - } - - // if the optimistic target can't be processed, check the processing target - if let Some(batch) = self.batches.get(&self.processing_target) { - let state = batch.state(); - match state { - BatchState::AwaitingProcessing(..) => { - return self.process_batch(network, self.processing_target); - } - BatchState::Downloading(..) => { - // Batch is not ready, nothing to process - } - BatchState::Poisoned => unreachable!("Poisoned batch"), - BatchState::Failed | BatchState::AwaitingDownload | BatchState::Processing(_) => { - // these are all inconsistent states: - // - Failed -> non recoverable batch. Chain should have beee removed - // - AwaitingDownload -> A recoverable failed batch should have been - // re-requested. - // - Processing -> `self.current_processing_batch` is None - return Err(RemoveChain::WrongChainState(format!( - "Robust target batch indicates inconsistent chain state: {:?}", - state - ))); - } - BatchState::AwaitingValidation(_) => { - // we can land here if an empty optimistic batch succeeds processing and is - // inside the download buffer (between `self.processing_target` and - // `self.to_be_downloaded`). In this case, eventually the chain advances to the - // batch (`self.processing_target` reaches this point). 
- debug!( - batch = %self.processing_target, - "Chain encountered a robust batch awaiting validation" - ); - - self.processing_target += EPOCHS_PER_BATCH; - if self.to_be_downloaded <= self.processing_target { - self.to_be_downloaded = self.processing_target + EPOCHS_PER_BATCH; - } - self.request_batches(network)?; - } - } - } else { - return Err(RemoveChain::WrongChainState(format!( - "Batch not found for current processing target {}", - self.processing_target - ))); - } - Ok(KeepChain) - } - - /// The block processor has completed processing a batch. This function handles the result - /// of the batch processor. - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn on_batch_process_result( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - result: &BatchProcessResult, - ) -> ProcessingResult { - // the first two cases are possible if the chain advances while waiting for a processing - // result - let batch_state = self.visualize_batch_state(); - let batch = match &self.current_processing_batch { - Some(processing_id) if *processing_id != batch_id => { - debug!(batch_epoch = %batch_id, expected_batch_epoch = %processing_id,"Unexpected batch result"); - return Ok(KeepChain); - } - None => { - debug!(batch_epoch = %batch_id,"Chain was not expecting a batch result"); - return Ok(KeepChain); - } - _ => { - // batch_id matches, continue - self.current_processing_batch = None; - self.batches.get_mut(&batch_id).ok_or_else(|| { - RemoveChain::WrongChainState(format!( - "Current processing batch not found: {}", - batch_id - )) - })? - } - }; - - let batch_peers = batch.processing_peers().ok_or_else(|| { - RemoveChain::WrongBatchState(format!( - "Processing target is in wrong state: {:?}", - batch.state(), - )) - })?; - - // Log the process result and the batch for debugging purposes. 
- debug!( - result = ?result, - batch_epoch = %batch_id, - batch_state = ?batch_state, - ?batch, - "Batch processing result" - ); - - // We consider three cases. Batch was successfully processed, Batch failed processing due - // to a faulty peer, or batch failed processing but the peer can't be deemed faulty. - match result { - BatchProcessResult::Success { - sent_blocks, - imported_blocks, - } => { - if sent_blocks > imported_blocks { - let ignored_blocks = sent_blocks - imported_blocks; - metrics::inc_counter_vec_by( - &metrics::SYNCING_CHAINS_IGNORED_BLOCKS, - &[self.chain_type.into()], - ignored_blocks as u64, - ); - } - metrics::inc_counter_vec( - &metrics::SYNCING_CHAINS_PROCESSED_BATCHES, - &[self.chain_type.into()], - ); - - batch.processing_completed(BatchProcessingResult::Success)?; - - // was not empty = sent_blocks > 0 - if *sent_blocks > 0 { - // If the processed batch was not empty, we can validate previous unvalidated - // blocks. - self.advance_chain(network, batch_id); - // we register so that on chain switching we don't try it again - self.attempted_optimistic_starts.insert(batch_id); - } else if self.optimistic_start == Some(batch_id) { - // check if this batch corresponds to an optimistic batch. 
In this case, we - // reject it as an optimistic candidate since the batch was empty - self.reject_optimistic_batch( - network, - false, /* do not re-request */ - "batch was empty", - )?; - } - - if batch_id == self.processing_target { - self.processing_target += EPOCHS_PER_BATCH; - } - - // check if the chain has completed syncing - if self.current_processed_slot() >= self.target_head_slot { - // chain is completed - Err(RemoveChain::ChainCompleted) - } else { - // chain is not completed - // attempt to request more batches - self.request_batches(network)?; - // attempt to process more batches - self.process_completed_batches(network) - } - } - BatchProcessResult::FaultyFailure { - imported_blocks, - peer_action, - // TODO(sync): propagate error in logs - error: _, - } => { - let mut failed_peers = vec![]; - - // TODO(sync): De-dup between back and forwards sync - if let Some(penalty) = peer_action.block_peer { - // Penalize the peer appropiately. - network.report_peer(batch_peers.block(), penalty, "faulty_batch"); - failed_peers.push(batch_peers.block()); - } - - // Penalize each peer only once. Currently a peer_action does not mix different - // PeerAction levels. - for (peer, penalty) in peer_action - .column_peer - .iter() - .filter_map(|(column_index, penalty)| { - batch_peers - .column(column_index) - .map(|peer| (*peer, *penalty)) - }) - .unique() - { - network.report_peer(peer, penalty, "faulty_batch_column"); - failed_peers.push(peer); - } - - // Check if this batch is allowed to continue - match batch - .processing_completed(BatchProcessingResult::FaultyFailure(failed_peers))? - { - BatchOperationOutcome::Continue => { - // Chain can continue. Check if it can be moved forward. - if *imported_blocks > 0 { - // At least one block was successfully verified and imported, so we can be sure all - // previous batches are valid and we only need to download the current failed - // batch. 
- self.advance_chain(network, batch_id); - } - // Handle this invalid batch, that is within the re-process retries limit. - self.handle_invalid_batch(network, batch_id) - } - BatchOperationOutcome::Failed { blacklist } => { - // TODO(das): what peer action should we apply to the rest of - // peers? Say a batch repeatedly fails because a custody peer is not - // sending us its custody columns - let penalty = PeerAction::LowToleranceError; - - // Check that we have not exceeded the re-process retry counter, - // If a batch has exceeded the invalid batch lookup attempts limit, it means - // that it is likely all peers in this chain are are sending invalid batches - // repeatedly and are either malicious or faulty. We drop the chain and - // report all peers. - // There are some edge cases with forks that could land us in this situation. - // This should be unlikely, so we tolerate these errors, but not often. - warn!( - score_adjustment = %penalty, - batch_epoch = %batch_id, - "Batch failed to download. Dropping chain scoring peers" - ); - - for peer in self.peers.write().drain() { - network.report_peer(peer, penalty, "faulty_chain"); - } - Err(RemoveChain::ChainFailed { - blacklist, - failing_batch: batch_id, - }) - } - } - } - BatchProcessResult::NonFaultyFailure => { - batch.processing_completed(BatchProcessingResult::NonFaultyFailure)?; - // Simply redownload the batch. - self.send_batch(network, batch_id) - } - } - } - - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - fn reject_optimistic_batch( - &mut self, - network: &mut SyncNetworkContext, - redownload: bool, - reason: &str, - ) -> ProcessingResult { - if let Some(epoch) = self.optimistic_start.take() { - self.attempted_optimistic_starts.insert(epoch); - // if this batch is inside the current processing range, keep it, otherwise drop - // it. 
NOTE: this is done to prevent non-sequential batches coming from optimistic - // starts from filling up the buffer size - if epoch < self.to_be_downloaded { - debug!(%epoch, reason, "Rejected optimistic batch left for future use"); - // this batch is now treated as any other batch, and re-requested for future use - if redownload { - return self.send_batch(network, epoch); - } - } else { - debug!(%epoch, reason, "Rejected optimistic batch"); - self.batches.remove(&epoch); - } - } - - Ok(KeepChain) - } - - /// Removes any batches previous to the given `validating_epoch` and updates the current - /// boundaries of the chain. - /// - /// The `validating_epoch` must align with batch boundaries. - /// - /// If a previous batch has been validated and it had been re-processed, penalize the original - /// peer. - #[allow(clippy::modulo_one)] - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - fn advance_chain(&mut self, network: &mut SyncNetworkContext, validating_epoch: Epoch) { - // make sure this epoch produces an advancement - if validating_epoch <= self.start_epoch { - return; - } - - // safety check for batch boundaries - if validating_epoch % EPOCHS_PER_BATCH != self.start_epoch % EPOCHS_PER_BATCH { - crit!("Validating Epoch is not aligned"); - return; - } - - // batches in the range [BatchId, ..) 
(not yet validated) - let remaining_batches = self.batches.split_off(&validating_epoch); - // batches less than `validating_epoch` - let removed_batches = std::mem::replace(&mut self.batches, remaining_batches); - - for (id, batch) in removed_batches.into_iter() { - // only for batches awaiting validation can we be sure the last attempt is - // right, and thus, that any different attempt is wrong - match batch.state() { - BatchState::AwaitingValidation(ref processed_attempt) => { - for attempt in batch.attempts() { - // The validated batch has been re-processed - if attempt.hash != processed_attempt.hash { - // The re-downloaded version was different - // TODO(das): should penalize other peers? - let valid_attempt_peer = processed_attempt.block_peer(); - let bad_attempt_peer = attempt.block_peer(); - if valid_attempt_peer != bad_attempt_peer { - // A different peer sent the correct batch, the previous peer did not - // We negatively score the original peer. - let action = PeerAction::LowToleranceError; - debug!( - batch_epoch = %id, score_adjustment = %action, - original_peer = %bad_attempt_peer, new_peer = %valid_attempt_peer, - "Re-processed batch validated. Scoring original peer" - ); - network.report_peer( - bad_attempt_peer, - action, - "batch_reprocessed_original_peer", - ); - } else { - // The same peer corrected it's previous mistake. There was an error, so we - // negative score the original peer. - let action = PeerAction::MidToleranceError; - debug!( - batch_epoch = %id, - score_adjustment = %action, - original_peer = %bad_attempt_peer, - new_peer = %valid_attempt_peer, - "Re-processed batch validated by the same peer" - ); - network.report_peer( - bad_attempt_peer, - action, - "batch_reprocessed_same_peer", - ); - } - } - } - } - BatchState::Downloading(..) 
=> {} - BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => { - crit!("batch indicates inconsistent chain state while advancing chain") - } - BatchState::AwaitingProcessing(..) => {} - BatchState::Processing(_) => { - debug!(batch = %id, "Advancing chain while processing a batch"); - if let Some(processing_id) = self.current_processing_batch { - if id <= processing_id { - self.current_processing_batch = None; - } - } - } - } - } - - self.processing_target = self.processing_target.max(validating_epoch); - let old_start = self.start_epoch; - self.start_epoch = validating_epoch; - self.to_be_downloaded = self.to_be_downloaded.max(validating_epoch); - if self.batches.contains_key(&self.to_be_downloaded) { - // if a chain is advanced by Range beyond the previous `self.to_be_downloaded`, we - // won't have this batch, so we need to request it. - self.to_be_downloaded += EPOCHS_PER_BATCH; - } - if let Some(epoch) = self.optimistic_start { - if epoch <= validating_epoch { - self.optimistic_start = None; - } - } - debug!( - previous_start = %old_start, - new_start = %self.start_epoch, - processing_target = %self.processing_target, - "Chain advanced" - ); - } - - /// An invalid batch has been received that could not be processed, but that can be retried. - /// - /// These events occur when a peer has successfully responded with blocks, but the blocks we - /// have received are incorrect or invalid. This indicates the peer has not performed as - /// intended and can result in downvoting a peer. - #[instrument(parent = None,level = "info", fields(service = self.id, network), skip_all)] - fn handle_invalid_batch( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - ) -> ProcessingResult { - // The current batch could not be processed, indicating either the current or previous - // batches are invalid. 
- - // The previous batch could be incomplete due to the block sizes being too large to fit in - // a single RPC request or there could be consecutive empty batches which are not supposed - // to be there - - // The current (sub-optimal) strategy is to simply re-request all batches that could - // potentially be faulty. If a batch returns a different result than the original and - // results in successful processing, we downvote the original peer that sent us the batch. - - if let Some(epoch) = self.optimistic_start { - // If this batch is an optimistic batch, we reject this epoch as an optimistic - // candidate and try to re download it - if epoch == batch_id { - return self.reject_optimistic_batch(network, true, "batch was invalid"); - // since this is the optimistic batch, we can't consider previous batches as - // invalid. - } - } - // this is our robust `processing_target`. All previous batches must be awaiting - // validation - let mut redownload_queue = Vec::new(); - - for (id, batch) in self.batches.range_mut(..batch_id) { - if let BatchOperationOutcome::Failed { blacklist } = batch.validation_failed()? { - // remove the chain early - return Err(RemoveChain::ChainFailed { - blacklist, - failing_batch: *id, - }); - } - redownload_queue.push(*id); - } - - // no batch maxed out it process attempts, so now the chain's volatile progress must be - // reset - self.processing_target = self.start_epoch; - - for id in redownload_queue { - self.send_batch(network, id)?; - } - // finally, re-request the failed batch. - self.send_batch(network, batch_id) - } - - pub fn stop_syncing(&mut self) { - self.state = ChainSyncingState::Stopped; - } - - /// Either a new chain, or an old one with a peer list - /// This chain has been requested to start syncing. - /// - /// This could be new chain, or an old chain that is being resumed. 
- #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn start_syncing( - &mut self, - network: &mut SyncNetworkContext, - local_finalized_epoch: Epoch, - optimistic_start_epoch: Epoch, - ) -> ProcessingResult { - // to avoid dropping local progress, we advance the chain wrt its batch boundaries. This - let align = |epoch| { - // start_epoch + (number of batches in between)*length_of_batch - self.start_epoch + ((epoch - self.start_epoch) / EPOCHS_PER_BATCH) * EPOCHS_PER_BATCH - }; - // get the *aligned* epoch that produces a batch containing the `local_finalized_epoch` - let validating_epoch = align(local_finalized_epoch); - // align the optimistic_start too. - let optimistic_epoch = align(optimistic_start_epoch); - - // advance the chain to the new validating epoch - self.advance_chain(network, validating_epoch); - if self.optimistic_start.is_none() - && optimistic_epoch > self.processing_target - && !self.attempted_optimistic_starts.contains(&optimistic_epoch) - { - self.optimistic_start = Some(optimistic_epoch); - } - - // update the state - self.state = ChainSyncingState::Syncing; - - // begin requesting blocks from the peer pool, until all peers are exhausted. - self.request_batches(network)?; - - // start processing batches if needed - self.process_completed_batches(network) - } - - /// Add a peer to the chain. - /// - /// If the chain is active, this starts requesting batches from this peer. - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn add_peer( - &mut self, - network: &mut SyncNetworkContext, - peer_id: PeerId, - ) -> ProcessingResult { - self.peers.write().insert(peer_id); - self.request_batches(network) - } - - /// An RPC error has occurred. - /// - /// If the batch exists it is re-requested. 
- #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn inject_error( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - request_id: Id, - err: RpcResponseError, - ) -> ProcessingResult { - let batch_state = self.visualize_batch_state(); - if let Some(batch) = self.batches.get_mut(&batch_id) { - // A batch could be retried without the peer failing the request (disconnecting/ - // sending an error /timeout) if the peer is removed from the chain for other - // reasons. Check that this block belongs to the expected peer - // TODO(das): removed peer_id matching as the node may request a different peer for data - // columns. - if !batch.is_expecting_block(&request_id) { - debug!( - batch_epoch = %batch_id, - batch_state = ?batch.state(), - %request_id, - ?batch_state, - "Batch not expecting block" - ); - return Ok(KeepChain); - } - debug!( - batch_epoch = %batch_id, - batch_state = ?batch.state(), - error = ?err, - %request_id, - "Batch download error" - ); - if let BatchOperationOutcome::Failed { blacklist } = batch.download_failed()? { - return Err(RemoveChain::ChainFailed { - blacklist, - failing_batch: batch_id, - }); - } - self.send_batch(network, batch_id) - } else { - debug!( - batch_epoch = %batch_id, - %request_id, - batch_state, - "Batch not found" - ); - // this could be an error for an old batch, removed when the chain advances - Ok(KeepChain) - } - } - - /// Requests the batch assigned to the given id from a given peer. 
- #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn send_batch( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - ) -> ProcessingResult { - let batch_state = self.visualize_batch_state(); - if let Some(batch) = self.batches.get_mut(&batch_id) { - let request = batch.to_blocks_by_range_request(); - let failed_peers = batch.failed_peers(); - - match network.block_components_by_range_request( - todo!(), - todo!(), - self.peers.clone(), - failed_peers, - ) { - Ok(request_id) => { - // inform the batch about the new request - batch.start_downloading(request_id)?; - if self - .optimistic_start - .map(|epoch| epoch == batch_id) - .unwrap_or(false) - { - debug!(%batch_id, %batch_state, "Requesting optimistic batch"); - } else { - debug!(%batch_id, %batch_state, "Requesting batch"); - } - return Ok(KeepChain); - } - Err(e) => match e { - e @ (RpcRequestSendError::NoPeers | RpcRequestSendError::InternalError(_)) => { - // NOTE: under normal conditions this shouldn't happen but we handle it anyway - warn!(%batch_id, error = ?e, "Could not send batch request"); - // register the failed download and check if the batch can be retried - batch.start_downloading(1)?; // fake request_id = 1 is not relevant - match batch.download_failed()? { - BatchOperationOutcome::Failed { blacklist } => { - return Err(RemoveChain::ChainFailed { - blacklist, - failing_batch: batch_id, - }) - } - BatchOperationOutcome::Continue => { - return self.send_batch(network, batch_id) - } - } - } - }, - } - } - - Ok(KeepChain) - } - - /// Returns true if this chain is currently syncing. 
- #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn is_syncing(&self) -> bool { - match self.state { - ChainSyncingState::Syncing => true, - ChainSyncingState::Stopped => false, - } - } - - /// Kickstarts the chain by sending for processing batches that are ready and requesting more - /// batches if needed. - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn resume( - &mut self, - network: &mut SyncNetworkContext, - ) -> Result { - // Request more batches if needed. - self.request_batches(network)?; - // If there is any batch ready for processing, send it. - self.process_completed_batches(network) - } - - /// Attempts to request the next required batches from the peer pool if the chain is syncing. It will exhaust the peer - /// pool and left over batches until the batch buffer is reached or all peers are exhausted. - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - fn request_batches(&mut self, network: &mut SyncNetworkContext) -> ProcessingResult { - if !matches!(self.state, ChainSyncingState::Syncing) { - return Ok(KeepChain); - } - - // find the next pending batch and request it from the peer - - // check if we have the batch for our optimistic start. If not, request it first. - // We wait for this batch before requesting any other batches. - if let Some(epoch) = self.optimistic_start { - if let Entry::Vacant(_entry) = self.batches.entry(epoch) { - todo!(); - } - return Ok(KeepChain); - } - - // find the next pending batch and request it from the peer - // Note: for this function to not infinite loop we must: - // - If `include_next_batch` returns Some we MUST increase the count of batches that are - // accounted in the `BACKFILL_BATCH_BUFFER_SIZE` limit in the `matches!` statement of - // that function. 
- while let Some(batch_id) = self.include_next_batch(network) { - // send the batch - self.send_batch(network, batch_id)?; - } - - // No more batches, simply stop - Ok(KeepChain) - } - - /// Creates the next required batch from the chain. If there are no more batches required, - /// `false` is returned. - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - fn include_next_batch(&mut self, network: &mut SyncNetworkContext) -> Option { - // don't request batches beyond the target head slot - if self - .to_be_downloaded - .start_slot(T::EthSpec::slots_per_epoch()) - >= self.target_head_slot - { - return None; - } - - // only request batches up to the buffer size limit - // NOTE: we don't count batches in the AwaitingValidation state, to prevent stalling sync - // if the current processing window is contained in a long range of skip slots. - let in_buffer = |batch: &BatchInfo| { - matches!( - batch.state(), - BatchState::Downloading(..) | BatchState::AwaitingProcessing(..) - ) - }; - if self - .batches - .iter() - .filter(|&(_epoch, batch)| in_buffer(batch)) - .count() - >= self.batch_buffer_size - { - return None; - } - - // If no batch needs a retry, attempt to send the batch of the next epoch to download - let next_batch_id = self.to_be_downloaded; - // this batch could have been included already being an optimistic batch - match self.batches.entry(next_batch_id) { - Entry::Occupied(_) => { - // this batch doesn't need downloading, let this same function decide the next batch - self.to_be_downloaded += EPOCHS_PER_BATCH; - self.include_next_batch(network) - } - Entry::Vacant(_entry) => { - todo!(); - self.to_be_downloaded += EPOCHS_PER_BATCH; - Some(next_batch_id) - } - } - } - - /// Creates a string visualization of the current state of the chain, to make it easier for debugging and understanding - /// where sync is up to from glancing at the logs. 
- /// - /// This produces a string of the form: [D,E,E,E,E] - /// to indicate the current buffer state of the chain. The symbols are defined on each of the - /// batch states. See [BatchState::visualize] for symbol definitions. - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - fn visualize_batch_state(&self) -> String { - let mut visualization_string = String::with_capacity(self.batch_buffer_size * 3); - - // Start of the block - visualization_string.push('['); - - for mut batch_index in 0..self.batch_buffer_size { - if let Some(batch) = self - .batches - .get(&(self.processing_target + batch_index as u64 * EPOCHS_PER_BATCH)) - { - visualization_string.push(batch.visualize()); - if batch_index != self.batch_buffer_size { - // Add a comma in between elements - visualization_string.push(','); - } - } else { - // No batch exists, it is on our list to be downloaded - // Fill in the rest of the gaps - while batch_index < self.batch_buffer_size { - visualization_string.push('E'); - // Add a comma between the empty batches - if batch_index < self.batch_buffer_size.saturating_sub(1) { - visualization_string.push(',') - } - batch_index += 1; - } - break; - } - } - visualization_string.push(']'); - visualization_string - } -} - -use super::batch::WrongState as WrongBatchState; -impl From for RemoveChain { - fn from(err: WrongBatchState) -> Self { - RemoveChain::WrongBatchState(err.0) - } -} - -impl RemoveChain { - pub fn is_critical(&self) -> bool { - matches!( - self, - RemoveChain::WrongBatchState(..) | RemoveChain::WrongChainState(..) 
- ) - } -} - -impl From for SyncingChainType { - fn from(value: RangeSyncType) -> Self { - match value { - RangeSyncType::Head => Self::Head, - RangeSyncType::Finalized => Self::Finalized, - } - } -} diff --git a/beacon_node/network/src/sync/range_sync/chain_collection.rs b/beacon_node/network/src/sync/range_sync/chain_collection.rs deleted file mode 100644 index 44ce43d56aa..00000000000 --- a/beacon_node/network/src/sync/range_sync/chain_collection.rs +++ /dev/null @@ -1,541 +0,0 @@ -//! This provides the logic for the finalized and head chains. -//! -//! Each chain type is stored in it's own map. A variety of helper functions are given along with -//! this struct to simplify the logic of the other layers of sync. - -use super::chain::{ChainId, ProcessingResult, RemoveChain, SyncingChain}; -use super::sync_type::RangeSyncType; -use crate::metrics; -use crate::sync::network_context::SyncNetworkContext; -use beacon_chain::{BeaconChain, BeaconChainTypes}; -use fnv::FnvHashMap; -use lighthouse_network::service::api_types::Id; -use lighthouse_network::PeerId; -use lighthouse_network::SyncInfo; -use logging::crit; -use smallvec::SmallVec; -use std::collections::hash_map::Entry; -use std::collections::HashMap; -use std::sync::Arc; -use tracing::{debug, error}; -use types::EthSpec; -use types::{Epoch, Hash256, Slot}; - -/// The number of head syncing chains to sync at a time. -const PARALLEL_HEAD_CHAINS: usize = 2; - -/// Minimum work we require a finalized chain to do before picking a chain with more peers. -const MIN_FINALIZED_CHAIN_PROCESSED_EPOCHS: u64 = 10; - -/// The state of the long range/batch sync. -#[derive(Clone)] -pub enum RangeSyncState { - /// A finalized chain is being synced. - Finalized(Id), - /// There are no finalized chains and we are syncing one more head chains. - Head(SmallVec<[Id; PARALLEL_HEAD_CHAINS]>), - /// There are no head or finalized chains and no long range sync is in progress. 
- Idle, -} - -pub type SyncChainStatus = - Result, &'static str>; - -/// A collection of finalized and head chains currently being processed. -pub struct ChainCollection { - /// The beacon chain for processing. - beacon_chain: Arc>, - /// The set of finalized chains being synced. - finalized_chains: FnvHashMap>, - /// The set of head chains being synced. - head_chains: FnvHashMap>, - /// The current sync state of the process. - state: RangeSyncState, - /// The maximum number of batches to queue before requesting more. - batch_buffer_size: usize, -} - -impl ChainCollection { - #[cfg(test)] - pub(crate) fn iter(&self) -> impl Iterator> { - self.finalized_chains - .values() - .chain(self.head_chains.values()) - } - - pub fn new(beacon_chain: Arc>, batch_buffer_size: usize) -> Self { - ChainCollection { - beacon_chain, - finalized_chains: FnvHashMap::default(), - head_chains: FnvHashMap::default(), - state: RangeSyncState::Idle, - batch_buffer_size, - } - } - - /// Updates the Syncing state of the collection after a chain is removed. 
- fn on_chain_removed(&mut self, id: &ChainId, was_syncing: bool, sync_type: RangeSyncType) { - metrics::inc_counter_vec(&metrics::SYNCING_CHAINS_REMOVED, &[sync_type.as_str()]); - self.update_metrics(); - - match self.state { - RangeSyncState::Finalized(ref syncing_id) => { - if syncing_id == id { - // the finalized chain that was syncing was removed - debug_assert!(was_syncing && sync_type == RangeSyncType::Finalized); - let syncing_head_ids: SmallVec<[Id; PARALLEL_HEAD_CHAINS]> = self - .head_chains - .iter() - .filter(|(_id, chain)| chain.is_syncing()) - .map(|(id, _)| *id) - .collect(); - self.state = if syncing_head_ids.is_empty() { - RangeSyncState::Idle - } else { - RangeSyncState::Head(syncing_head_ids) - }; - } else { - // we removed a head chain, or a stopped finalized chain - debug_assert!(!was_syncing || sync_type != RangeSyncType::Finalized); - } - } - RangeSyncState::Head(ref mut syncing_head_ids) => { - if let Some(index) = syncing_head_ids - .iter() - .enumerate() - .find(|(_, &chain_id)| &chain_id == id) - .map(|(i, _)| i) - { - // a syncing head chain was removed - debug_assert!(was_syncing); - syncing_head_ids.swap_remove(index); - if syncing_head_ids.is_empty() { - self.state = RangeSyncState::Idle; - } - } else { - debug_assert!(!was_syncing); - } - } - RangeSyncState::Idle => { - // the removed chain should not be syncing - debug_assert!(!was_syncing) - } - } - } - - /// Calls `func` on every chain of the collection. If the result is - /// `ProcessingResult::RemoveChain`, the chain is removed and returned. - /// NOTE: `func` must not change the syncing state of a chain. 
- pub fn call_all(&mut self, mut func: F) -> Vec<(SyncingChain, RangeSyncType, RemoveChain)> - where - F: FnMut(&mut SyncingChain) -> ProcessingResult, - { - let mut to_remove = Vec::new(); - - for (id, chain) in self.finalized_chains.iter_mut() { - if let Err(remove_reason) = func(chain) { - to_remove.push((*id, RangeSyncType::Finalized, remove_reason)); - } - } - - for (id, chain) in self.head_chains.iter_mut() { - if let Err(remove_reason) = func(chain) { - to_remove.push((*id, RangeSyncType::Head, remove_reason)); - } - } - - let mut results = Vec::with_capacity(to_remove.len()); - for (id, sync_type, reason) in to_remove.into_iter() { - let chain = match sync_type { - RangeSyncType::Finalized => self.finalized_chains.remove(&id), - RangeSyncType::Head => self.head_chains.remove(&id), - }; - let chain = chain.expect("Chain exists"); - self.on_chain_removed(&id, chain.is_syncing(), sync_type); - results.push((chain, sync_type, reason)); - } - results - } - - /// Executes a function on the chain with the given id. - /// - /// If the function returns `ProcessingResult::RemoveChain`, the chain is removed and returned. - /// If the chain is found, its syncing type is returned, or an error otherwise. - /// NOTE: `func` should not change the sync state of a chain. 
- #[allow(clippy::type_complexity)] - pub fn call_by_id( - &mut self, - id: ChainId, - func: F, - ) -> Result<(Option<(SyncingChain, RemoveChain)>, RangeSyncType), ()> - where - F: FnOnce(&mut SyncingChain) -> ProcessingResult, - { - if let Entry::Occupied(mut entry) = self.finalized_chains.entry(id) { - // Search in our finalized chains first - if let Err(remove_reason) = func(entry.get_mut()) { - let chain = entry.remove(); - self.on_chain_removed(&id, chain.is_syncing(), RangeSyncType::Finalized); - Ok((Some((chain, remove_reason)), RangeSyncType::Finalized)) - } else { - Ok((None, RangeSyncType::Finalized)) - } - } else if let Entry::Occupied(mut entry) = self.head_chains.entry(id) { - // Search in our head chains next - if let Err(remove_reason) = func(entry.get_mut()) { - let chain = entry.remove(); - self.on_chain_removed(&id, chain.is_syncing(), RangeSyncType::Head); - Ok((Some((chain, remove_reason)), RangeSyncType::Head)) - } else { - Ok((None, RangeSyncType::Head)) - } - } else { - // Chain was not found in the finalized collection, nor the head collection - Err(()) - } - } - - /// Updates the state of the chain collection. - /// - /// This removes any out-dated chains, swaps to any higher priority finalized chains and - /// updates the state of the collection. This starts head chains syncing if any are required to - /// do so. - pub fn update( - &mut self, - network: &mut SyncNetworkContext, - local: &SyncInfo, - awaiting_head_peers: &mut HashMap, - ) { - // Remove any outdated finalized/head chains - self.purge_outdated_chains(local, awaiting_head_peers); - - let local_head_epoch = local.head_slot.epoch(T::EthSpec::slots_per_epoch()); - // Choose the best finalized chain if one needs to be selected. - self.update_finalized_chains(network, local.finalized_epoch, local_head_epoch); - - if !matches!(self.state, RangeSyncState::Finalized(_)) { - // Handle head syncing chains if there are no finalized chains left. 
- self.update_head_chains( - network, - local.finalized_epoch, - local_head_epoch, - awaiting_head_peers, - ); - } - } - - pub fn state(&self) -> SyncChainStatus { - match self.state { - RangeSyncState::Finalized(ref syncing_id) => { - let chain = self - .finalized_chains - .get(syncing_id) - .ok_or("Finalized syncing chain not found")?; - Ok(Some(( - RangeSyncType::Finalized, - chain.start_epoch.start_slot(T::EthSpec::slots_per_epoch()), - chain.target_head_slot, - ))) - } - RangeSyncState::Head(ref syncing_head_ids) => { - let mut range: Option<(Slot, Slot)> = None; - for id in syncing_head_ids { - let chain = self - .head_chains - .get(id) - .ok_or("Head syncing chain not found")?; - let start = chain.start_epoch.start_slot(T::EthSpec::slots_per_epoch()); - let target = chain.target_head_slot; - - range = range - .map(|(min_start, max_slot)| (min_start.min(start), max_slot.max(target))) - .or(Some((start, target))); - } - let (start_slot, target_slot) = range.ok_or("Syncing head with empty head ids")?; - Ok(Some((RangeSyncType::Head, start_slot, target_slot))) - } - RangeSyncState::Idle => Ok(None), - } - } - - /// This looks at all current finalized chains and decides if a new chain should be prioritised - /// or not. 
- fn update_finalized_chains( - &mut self, - network: &mut SyncNetworkContext, - local_epoch: Epoch, - local_head_epoch: Epoch, - ) { - // Find the chain with most peers and check if it is already syncing - if let Some((mut new_id, max_peers)) = self - .finalized_chains - .iter() - .max_by_key(|(_, chain)| chain.available_peers()) - .map(|(id, chain)| (*id, chain.available_peers())) - { - let mut old_id = None; - if let RangeSyncState::Finalized(syncing_id) = self.state { - if syncing_id == new_id { - // best chain is already syncing - old_id = Some(None); - } else { - // chains are different, check that they don't have the same number of peers - if let Some(syncing_chain) = self.finalized_chains.get_mut(&syncing_id) { - if max_peers > syncing_chain.available_peers() - && syncing_chain.processed_epochs() - > MIN_FINALIZED_CHAIN_PROCESSED_EPOCHS - { - syncing_chain.stop_syncing(); - old_id = Some(Some(syncing_id)); - } else { - // chains have the same number of peers, pick the currently syncing - // chain to avoid unnecessary switchings and try to advance it - new_id = syncing_id; - old_id = Some(None); - } - } - } - } - - let chain = self - .finalized_chains - .get_mut(&new_id) - .expect("Chain exists"); - - match old_id { - Some(Some(old_id)) => debug!(old_id, id = chain.id(), "Switching finalized chains"), - None => debug!(id = chain.id(), "Syncing new finalized chain"), - Some(None) => { - // this is the same chain. We try to advance it. 
- } - } - - // update the state to a new finalized state - self.state = RangeSyncState::Finalized(new_id); - - if let Err(remove_reason) = chain.start_syncing(network, local_epoch, local_head_epoch) - { - if remove_reason.is_critical() { - crit!(chain = new_id, reason = ?remove_reason, "Chain removed while switching chains"); - } else { - // this happens only if sending a batch over the `network` fails a lot - error!(chain = new_id, reason = ?remove_reason, "Chain removed while switching chains"); - } - self.finalized_chains.remove(&new_id); - self.on_chain_removed(&new_id, true, RangeSyncType::Finalized); - } - } - } - - /// Start syncing any head chains if required. - fn update_head_chains( - &mut self, - network: &mut SyncNetworkContext, - local_epoch: Epoch, - local_head_epoch: Epoch, - awaiting_head_peers: &mut HashMap, - ) { - // Include the awaiting head peers - for (peer_id, peer_sync_info) in awaiting_head_peers.drain() { - debug!("including head peer"); - self.add_peer_or_create_chain( - local_epoch, - peer_sync_info.head_root, - peer_sync_info.head_slot, - peer_id, - RangeSyncType::Head, - network, - ); - } - - if self.head_chains.is_empty() { - // There are no finalized chains, update the state. 
- self.state = RangeSyncState::Idle; - return; - } - - // Order chains by available peers, if two chains have the same number of peers, prefer one - // that is already syncing - let mut preferred_ids = self - .head_chains - .iter() - .map(|(id, chain)| (chain.available_peers(), !chain.is_syncing(), *id)) - .collect::>(); - preferred_ids.sort_unstable(); - - let mut syncing_chains = SmallVec::<[Id; PARALLEL_HEAD_CHAINS]>::new(); - for (_, _, id) in preferred_ids { - let chain = self.head_chains.get_mut(&id).expect("known chain"); - if syncing_chains.len() < PARALLEL_HEAD_CHAINS { - // start this chain if it's not already syncing - if !chain.is_syncing() { - debug!(id = chain.id(), "New head chain started syncing"); - } - if let Err(remove_reason) = - chain.start_syncing(network, local_epoch, local_head_epoch) - { - self.head_chains.remove(&id); - if remove_reason.is_critical() { - crit!(chain = id, reason = ?remove_reason, "Chain removed while switching head chains"); - } else { - error!(chain = id, reason = ?remove_reason, "Chain removed while switching head chains"); - } - } else { - syncing_chains.push(id); - } - } else { - // stop any other chain - chain.stop_syncing(); - } - } - - self.state = if syncing_chains.is_empty() { - RangeSyncState::Idle - } else { - RangeSyncState::Head(syncing_chains) - }; - } - - /// Returns if `true` if any finalized chains exist, `false` otherwise. - pub fn is_finalizing_sync(&self) -> bool { - !self.finalized_chains.is_empty() - } - - /// Removes any outdated finalized or head chains. - /// This removes chains with no peers, or chains whose start block slot is less than our current - /// finalized block slot. Peers that would create outdated chains are removed too. 
- pub fn purge_outdated_chains( - &mut self, - local_info: &SyncInfo, - awaiting_head_peers: &mut HashMap, - ) { - let local_finalized_slot = local_info - .finalized_epoch - .start_slot(T::EthSpec::slots_per_epoch()); - - let beacon_chain = &self.beacon_chain; - - let is_outdated = |target_slot: &Slot, target_root: &Hash256| { - target_slot <= &local_finalized_slot - || beacon_chain.block_is_known_to_fork_choice(target_root) - }; - - // Retain only head peers that remain relevant - awaiting_head_peers.retain(|_peer_id, peer_sync_info| { - !is_outdated(&peer_sync_info.head_slot, &peer_sync_info.head_root) - }); - - // Remove chains that are out-dated - let mut removed_chains = Vec::new(); - removed_chains.extend(self.finalized_chains.iter().filter_map(|(id, chain)| { - if is_outdated(&chain.target_head_slot, &chain.target_head_root) - || chain.available_peers() == 0 - { - debug!(id, "Purging out of finalized chain"); - Some((*id, chain.is_syncing(), RangeSyncType::Finalized)) - } else { - None - } - })); - - removed_chains.extend(self.head_chains.iter().filter_map(|(id, chain)| { - if is_outdated(&chain.target_head_slot, &chain.target_head_root) - || chain.available_peers() == 0 - { - debug!(id, "Purging out of date head chain"); - Some((*id, chain.is_syncing(), RangeSyncType::Head)) - } else { - None - } - })); - - // update the state of the collection - for (id, was_syncing, sync_type) in removed_chains { - // remove each chain, updating the state for each removal. - match sync_type { - RangeSyncType::Finalized => self.finalized_chains.remove(&id), - RangeSyncType::Head => self.head_chains.remove(&id), - }; - self.on_chain_removed(&id, was_syncing, sync_type); - } - } - - /// Adds a peer to a chain with the given target, or creates a new syncing chain if it doesn't - /// exists. 
- #[allow(clippy::too_many_arguments)] - pub fn add_peer_or_create_chain( - &mut self, - start_epoch: Epoch, - target_head_root: Hash256, - target_head_slot: Slot, - peer: PeerId, - sync_type: RangeSyncType, - network: &mut SyncNetworkContext, - ) { - let collection = if let RangeSyncType::Finalized = sync_type { - &mut self.finalized_chains - } else { - &mut self.head_chains - }; - - match collection - .iter_mut() - .find(|(_, chain)| chain.has_same_target(target_head_slot, target_head_root)) - { - Some((&id, chain)) => { - debug!(peer_id = %peer, ?sync_type, id, "Adding peer to known chain"); - debug_assert_eq!(chain.target_head_root, target_head_root); - debug_assert_eq!(chain.target_head_slot, target_head_slot); - if let Err(remove_reason) = chain.add_peer(network, peer) { - if remove_reason.is_critical() { - crit!(id, reason = ?remove_reason, "Chain removed after adding peer"); - } else { - error!(id, reason = ?remove_reason, "Chain removed after adding peer"); - } - let is_syncing = chain.is_syncing(); - collection.remove(&id); - self.on_chain_removed(&id, is_syncing, sync_type); - } - } - None => { - let peer_rpr = peer.to_string(); - let id = network.next_id(); - let new_chain = SyncingChain::new( - id, - start_epoch, - target_head_slot, - target_head_root, - peer, - sync_type.into(), - self.batch_buffer_size, - ); - - debug!( - peer_id = peer_rpr, - ?sync_type, - id, - %start_epoch, - %target_head_slot, - ?target_head_root, - "New chain added to sync" - ); - collection.insert(id, new_chain); - metrics::inc_counter_vec(&metrics::SYNCING_CHAINS_ADDED, &[sync_type.as_str()]); - self.update_metrics(); - } - } - } - - fn update_metrics(&self) { - metrics::set_gauge_vec( - &metrics::SYNCING_CHAINS_COUNT, - &[RangeSyncType::Finalized.as_str()], - self.finalized_chains.len() as i64, - ); - metrics::set_gauge_vec( - &metrics::SYNCING_CHAINS_COUNT, - &[RangeSyncType::Head.as_str()], - self.head_chains.len() as i64, - ); - } -} diff --git 
a/beacon_node/network/src/sync/range_sync/mod.rs b/beacon_node/network/src/sync/range_sync/mod.rs deleted file mode 100644 index 67479f9a1e0..00000000000 --- a/beacon_node/network/src/sync/range_sync/mod.rs +++ /dev/null @@ -1,15 +0,0 @@ -//! This provides the logic for syncing a chain when the local node is far behind it's current -//! peers. - -mod batch; -mod chain; -mod chain_collection; -mod range; -mod sync_type; - -pub use batch::{ - BatchConfig, BatchInfo, BatchOperationOutcome, BatchPeers, BatchProcessingResult, BatchState, -}; -pub use chain::{BatchId, ChainId, BATCH_BUFFER_SIZE, EPOCHS_PER_BATCH}; -pub use range::RangeSync; -pub use sync_type::RangeSyncType; diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs deleted file mode 100644 index 8f52fa7a496..00000000000 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ /dev/null @@ -1,460 +0,0 @@ -//! This contains the logic for the long range (batch) sync strategy. -//! -//! The general premise is to group peers by their self-proclaimed finalized blocks and head -//! blocks. Once grouped, the peers become sources to download a specific `Chain`. A `Chain` is a -//! collection of blocks that terminates at the specified target head. -//! -//! This sync strategy can be separated into two distinct forms: -//! - Finalized Chain Sync -//! - Head Chain Sync -//! -//! ## Finalized chain sync -//! -//! This occurs when a peer connects that claims to have a finalized head slot that is greater -//! than our own. In this case, we form a chain from our last finalized epoch, to their claimed -//! finalized slot. Any peer that also claims to have this last finalized slot is added to a pool -//! of peers from which batches of blocks may be downloaded. Blocks are downloaded until the -//! finalized slot of the chain is reached. Once reached, all peers within the pool are sent a -//! 
STATUS message to potentially start a head chain sync, or check if further finalized chains -//! need to be downloaded. -//! -//! A few interesting notes about finalized chain syncing: -//! - Only one finalized chain can sync at a time -//! - The finalized chain with the largest peer pool takes priority. -//! - As one finalized chain completes, others are checked to see if we they can be continued, -//! otherwise they are removed. -//! -//! ## Head Chain Sync -//! -//! If a peer joins and there is no active finalized chains being synced, and it's head is beyond -//! our `SLOT_IMPORT_TOLERANCE` a chain is formed starting from this peers finalized epoch (this -//! has been necessarily downloaded by our node, otherwise we would start a finalized chain sync) -//! to this peers head slot. Any other peers that match this head slot and head root, are added to -//! this chain's peer pool, which will be downloaded in parallel. -//! -//! Unlike finalized chains, head chains can be synced in parallel. -//! -//! ## Batch Syncing -//! -//! Each chain is downloaded in batches of blocks. The batched blocks are processed sequentially -//! and further batches are requested as current blocks are being processed. 
- -use super::chain::{BatchId, ChainId, RemoveChain, SyncingChain}; -use super::chain_collection::{ChainCollection, SyncChainStatus}; -use super::sync_type::RangeSyncType; -use super::BatchPeers; -use crate::metrics; -use crate::status::ToStatusMessage; -use crate::sync::network_context::{RpcResponseError, SyncNetworkContext}; -#[cfg(test)] -use crate::sync::range_sync::BatchState; -use crate::sync::BatchProcessResult; -use beacon_chain::block_verification_types::RpcBlock; -use beacon_chain::{BeaconChain, BeaconChainTypes}; -use lighthouse_network::rpc::GoodbyeReason; -use lighthouse_network::service::api_types::Id; -use lighthouse_network::{PeerId, SyncInfo}; -use logging::crit; -use lru_cache::LRUTimeCache; -use std::collections::HashMap; -use std::sync::Arc; -use tracing::{debug, instrument, trace, warn}; -use types::{Epoch, EthSpec, Hash256}; - -/// For how long we store failed finalized chains to prevent retries. -const FAILED_CHAINS_EXPIRY_SECONDS: u64 = 30; - -/// The primary object dealing with long range/batch syncing. This contains all the active and -/// non-active chains that need to be processed before the syncing is considered complete. This -/// holds the current state of the long range sync. -pub struct RangeSync { - /// The beacon chain for processing. - beacon_chain: Arc>, - /// Last known sync info of our useful connected peers. We use this information to create Head - /// chains after all finalized chains have ended. - awaiting_head_peers: HashMap, - /// A collection of chains that need to be downloaded. This stores any head or finalized chains - /// that need to be downloaded. - chains: ChainCollection, - /// Chains that have failed and are stored to prevent being retried. 
- failed_chains: LRUTimeCache, -} - -impl RangeSync -where - T: BeaconChainTypes, -{ - #[instrument(parent = None, - level = "info", - fields(component = "range_sync"), - name = "range_sync", - skip_all - )] - pub fn new(beacon_chain: Arc>, batch_buffer_size: usize) -> Self { - RangeSync { - beacon_chain: beacon_chain.clone(), - chains: ChainCollection::new(beacon_chain, batch_buffer_size), - failed_chains: LRUTimeCache::new(std::time::Duration::from_secs( - FAILED_CHAINS_EXPIRY_SECONDS, - )), - awaiting_head_peers: HashMap::new(), - } - } - - #[cfg(test)] - pub(crate) fn failed_chains(&mut self) -> Vec { - self.failed_chains.keys().copied().collect() - } - - #[cfg(test)] - pub(crate) fn batches_state(&self) -> Vec<(ChainId, BatchId, &BatchState)> { - self.chains - .iter() - .flat_map(|chain| { - chain - .batches_state() - .into_iter() - .map(|(batch_id, state)| (chain.id(), batch_id, state)) - }) - .collect() - } - - #[instrument(parent = None, - level = "info", - fields(component = "range_sync"), - name = "range_sync", - skip_all - )] - pub fn state(&self) -> SyncChainStatus { - self.chains.state() - } - - /// A useful peer has been added. The SyncManager has identified this peer as needing either - /// a finalized or head chain sync. This processes the peer and starts/resumes any chain that - /// may need to be synced as a result. A new peer, may increase the peer pool of a finalized - /// chain, this may result in a different finalized chain from syncing as finalized chains are - /// prioritised by peer-pool size. 
- #[instrument(parent = None, - level = "info", - fields(component = "range_sync"), - name = "range_sync", - skip_all - )] - pub fn add_peer( - &mut self, - network: &mut SyncNetworkContext, - local_info: SyncInfo, - peer_id: PeerId, - remote_info: SyncInfo, - ) { - // evaluate which chain to sync from - - // determine if we need to run a sync to the nearest finalized state or simply sync to - // its current head - - // convenience variable - let remote_finalized_slot = remote_info - .finalized_epoch - .start_slot(T::EthSpec::slots_per_epoch()); - - // NOTE: A peer that has been re-status'd may now exist in multiple finalized chains. This - // is OK since we since only one finalized chain at a time. - - // determine which kind of sync to perform and set up the chains - match RangeSyncType::new(self.beacon_chain.as_ref(), &local_info, &remote_info) { - RangeSyncType::Finalized => { - // Make sure we have not recently tried this chain - if self.failed_chains.contains(&remote_info.finalized_root) { - debug!(failed_root = ?remote_info.finalized_root, %peer_id,"Disconnecting peer that belongs to previously failed chain"); - network.goodbye_peer(peer_id, GoodbyeReason::IrrelevantNetwork); - return; - } - - // Finalized chain search - debug!(%peer_id, "Finalization sync peer joined"); - self.awaiting_head_peers.remove(&peer_id); - - // Because of our change in finalized sync batch size from 2 to 1 and our transition - // to using exact epoch boundaries for batches (rather than one slot past the epoch - // boundary), we need to sync finalized sync to 2 epochs + 1 slot past our peer's - // finalized slot in order to finalize the chain locally. - let target_head_slot = - remote_finalized_slot + (2 * T::EthSpec::slots_per_epoch()) + 1; - - // Note: We keep current head chains. These can continue syncing whilst we complete - // this new finalized chain. 
- - self.chains.add_peer_or_create_chain( - local_info.finalized_epoch, - remote_info.finalized_root, - target_head_slot, - peer_id, - RangeSyncType::Finalized, - network, - ); - - self.chains - .update(network, &local_info, &mut self.awaiting_head_peers); - } - RangeSyncType::Head => { - // This peer requires a head chain sync - - if self.chains.is_finalizing_sync() { - // If there are finalized chains to sync, finish these first, before syncing head - // chains. - trace!(%peer_id, awaiting_head_peers = &self.awaiting_head_peers.len(),"Waiting for finalized sync to complete"); - self.awaiting_head_peers.insert(peer_id, remote_info); - return; - } - - // if the peer existed in any other head chain, remove it. - self.remove_peer(network, &peer_id); - self.awaiting_head_peers.remove(&peer_id); - - // The new peer has the same finalized (earlier filters should prevent a peer with an - // earlier finalized chain from reaching here). - - let start_epoch = std::cmp::min(local_info.head_slot, remote_finalized_slot) - .epoch(T::EthSpec::slots_per_epoch()); - self.chains.add_peer_or_create_chain( - start_epoch, - remote_info.head_root, - remote_info.head_slot, - peer_id, - RangeSyncType::Head, - network, - ); - self.chains - .update(network, &local_info, &mut self.awaiting_head_peers); - } - } - } - - /// A `BlocksByRange` response has been received from the network. - /// - /// This function finds the chain that made this request. Once found, processes the result. - /// This request could complete a chain or simply add to its progress. 
- #[instrument(parent = None, - level = "info", - fields(component = "range_sync"), - name = "range_sync", - skip_all - )] - pub fn blocks_by_range_response( - &mut self, - network: &mut SyncNetworkContext, - batch_peers: BatchPeers, - chain_id: ChainId, - batch_id: BatchId, - request_id: Id, - blocks: Vec>, - ) { - // check if this chunk removes the chain - match self.chains.call_by_id(chain_id, |chain| { - chain.on_block_response(network, batch_id, batch_peers, request_id, blocks) - }) { - Ok((removed_chain, sync_type)) => { - if let Some((removed_chain, remove_reason)) = removed_chain { - self.on_chain_removed( - removed_chain, - sync_type, - remove_reason, - network, - "block response", - ); - } - } - Err(_) => { - trace!(%chain_id, "BlocksByRange response for removed chain") - } - } - } - - #[instrument(parent = None, - level = "info", - fields(component = "range_sync"), - name = "range_sync", - skip_all - )] - pub fn handle_block_process_result( - &mut self, - network: &mut SyncNetworkContext, - chain_id: ChainId, - batch_id: Epoch, - result: BatchProcessResult, - ) { - // check if this response removes the chain - match self.chains.call_by_id(chain_id, |chain| { - chain.on_batch_process_result(network, batch_id, &result) - }) { - Ok((None, _sync_type)) => { - // Chain was found and not removed - } - Ok((Some((removed_chain, remove_reason)), sync_type)) => { - self.on_chain_removed( - removed_chain, - sync_type, - remove_reason, - network, - "batch processing result", - ); - } - - Err(_) => { - trace!(%chain_id, "BlocksByRange response for removed chain") - } - } - } - - /// A peer has disconnected. This removes the peer from any ongoing chains and mappings. 
A - /// disconnected peer could remove a chain - #[instrument(parent = None, - level = "info", - fields(component = "range_sync"), - name = "range_sync", - skip_all - )] - pub fn peer_disconnect(&mut self, network: &mut SyncNetworkContext, peer_id: &PeerId) { - // if the peer is in the awaiting head mapping, remove it - self.awaiting_head_peers.remove(peer_id); - - // remove the peer from any peer pool, failing its batches - self.remove_peer(network, peer_id); - } - - /// When a peer gets removed, both the head and finalized chains need to be searched to check - /// which pool the peer is in. The chain may also have a batch or batches awaiting - /// for this peer. If so we mark the batch as failed. The batch may then hit it's maximum - /// retries. In this case, we need to remove the chain. - #[instrument(parent = None, - level = "info", - fields(component = "range_sync"), - name = "range_sync", - skip_all - )] - fn remove_peer(&mut self, network: &mut SyncNetworkContext, peer_id: &PeerId) { - for (removed_chain, sync_type, remove_reason) in - self.chains.call_all(|chain| chain.remove_peer(peer_id)) - { - self.on_chain_removed( - removed_chain, - sync_type, - remove_reason, - network, - "peer removed", - ); - } - } - - /// An RPC error has occurred. - /// - /// Check to see if the request corresponds to a pending batch. If so, re-request it if possible, if there have - /// been too many failed attempts for the batch, remove the chain. 
- #[instrument(parent = None, - level = "info", - fields(component = "range_sync"), - name = "range_sync", - skip_all - )] - pub fn inject_error( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - chain_id: ChainId, - request_id: Id, - err: RpcResponseError, - ) { - // check that this request is pending - match self.chains.call_by_id(chain_id, |chain| { - chain.inject_error(network, batch_id, request_id, err) - }) { - Ok((removed_chain, sync_type)) => { - if let Some((removed_chain, remove_reason)) = removed_chain { - self.on_chain_removed( - removed_chain, - sync_type, - remove_reason, - network, - "RPC error", - ); - } - } - Err(_) => { - trace!(%chain_id, "BlocksByRange response for removed chain") - } - } - } - - #[instrument(parent = None, - level = "info", - fields(component = "range_sync"), - name = "range_sync", - skip_all - )] - fn on_chain_removed( - &mut self, - chain: SyncingChain, - sync_type: RangeSyncType, - remove_reason: RemoveChain, - network: &mut SyncNetworkContext, - op: &'static str, - ) { - if remove_reason.is_critical() { - crit!(id = chain.id(), ?sync_type, reason = ?remove_reason, op, "Chain removed"); - } else { - debug!(id = chain.id(), ?sync_type, reason = ?remove_reason, op, "Chain removed"); - } - - if let RemoveChain::ChainFailed { blacklist, .. } = remove_reason { - if RangeSyncType::Finalized == sync_type && blacklist { - warn!( - id = chain.id(), - "Chain failed! 
Syncing to its head won't be retried for at least the next {} seconds", - FAILED_CHAINS_EXPIRY_SECONDS - ); - self.failed_chains.insert(chain.target_head_root); - } - } - - metrics::inc_counter_vec_by( - &metrics::SYNCING_CHAINS_DROPPED_BLOCKS, - &[sync_type.as_str()], - chain.pending_blocks() as u64, - ); - - network.status_peers(self.beacon_chain.as_ref(), chain.peers()); - - let status = self.beacon_chain.status_message(); - let local = SyncInfo { - head_slot: status.head_slot, - head_root: status.head_root, - finalized_epoch: status.finalized_epoch, - finalized_root: status.finalized_root, - }; - - // update the state of the collection - self.chains - .update(network, &local, &mut self.awaiting_head_peers); - } - - /// Kickstarts sync. - #[instrument(parent = None, - level = "info", - fields(component = "range_sync"), - name = "range_sync", - skip_all - )] - pub fn resume(&mut self, network: &mut SyncNetworkContext) { - for (removed_chain, sync_type, remove_reason) in - self.chains.call_all(|chain| chain.resume(network)) - { - self.on_chain_removed( - removed_chain, - sync_type, - remove_reason, - network, - "chain resumed", - ); - } - } -} diff --git a/beacon_node/network/src/sync/range_sync/sync_type.rs b/beacon_node/network/src/sync/range_sync/sync_type.rs deleted file mode 100644 index 4ff7e393101..00000000000 --- a/beacon_node/network/src/sync/range_sync/sync_type.rs +++ /dev/null @@ -1,46 +0,0 @@ -//! Contains logic about identifying which Sync to perform given PeerSyncInfo of ourselves and -//! of a remote. - -use beacon_chain::{BeaconChain, BeaconChainTypes}; -use lighthouse_network::SyncInfo; - -/// The type of Range sync that should be done relative to our current state. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum RangeSyncType { - /// A finalized chain sync should be started with this peer. - Finalized, - /// A head chain sync should be started with this peer. 
- Head, -} - -impl RangeSyncType { - /// Determines the type of sync given our local `PeerSyncInfo` and the remote's - /// `PeerSyncInfo`. - pub fn new( - chain: &BeaconChain, - local_info: &SyncInfo, - remote_info: &SyncInfo, - ) -> RangeSyncType { - // Check for finalized chain sync - // - // The condition is: - // - The remotes finalized epoch is greater than our current finalized epoch and we have - // not seen the finalized hash before. - - if remote_info.finalized_epoch > local_info.finalized_epoch - && !chain.block_is_known_to_fork_choice(&remote_info.finalized_root) - { - RangeSyncType::Finalized - } else { - RangeSyncType::Head - } - } - - /// Get a `str` representation of the `RangeSyncType`. - pub fn as_str(&self) -> &'static str { - match self { - RangeSyncType::Finalized => "Finalized", - RangeSyncType::Head => "Head", - } - } -} diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 72913148952..880365d943d 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -14,11 +14,13 @@ use beacon_chain::{ PayloadVerificationStatus, }; use beacon_processor::WorkType; -use lighthouse_network::rpc::methods::{BlocksByRootRequest, DataColumnsByRootRequest}; +use lighthouse_network::rpc::methods::{ + BlobsByRootRequest, BlocksByRootRequest, DataColumnsByRootRequest, +}; use lighthouse_network::rpc::{RequestType, StatusMessage}; use lighthouse_network::service::api_types::{ - AppRequestId, BlocksByRootRequestId, BlocksByRootRequester, ComponentsByRootRequestId, - DataColumnsByRootRequestId, HeaderLookupId, SyncRequestId, + AppRequestId, BlobsByRootRequestId, BlocksByRootRequestId, BlocksByRootRequester, + ComponentsByRootRequestId, DataColumnsByRootRequestId, HeaderLookupId, SyncRequestId, }; use lighthouse_network::types::SyncState; use lighthouse_network::{PeerId, SyncInfo}; From 805d284eb204966e4264521a0457576e95d0e1a3 Mon Sep 17 00:00:00 2001 From: 
dapplion <35266934+dapplion@users.noreply.github.com> Date: Sat, 21 Jun 2025 01:39:31 +0200 Subject: [PATCH 29/66] Add error handling in fowards sync --- .../network_beacon_processor/sync_methods.rs | 35 +- beacon_node/network/src/router.rs | 6 +- .../network/src/sync/backfill_sync/mod.rs | 9 +- beacon_node/network/src/sync/block_tree.rs | 488 ++++++++++-------- beacon_node/network/src/sync/manager.rs | 35 +- .../network/src/sync/network_context.rs | 90 ++-- .../sync/network_context/custody_by_root.rs | 37 +- 7 files changed, 336 insertions(+), 364 deletions(-) diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index c0b33582295..264cde106c4 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -126,7 +126,7 @@ impl NetworkBeaconProcessor { .process_blocks(downloaded_blocks.iter(), notify_execution_layer) .await { - (imported_blocks, Ok(_)) => { + (_imported_blocks, Ok(_)) => { debug!( %id, first_block_slot = start_slot, @@ -134,27 +134,19 @@ impl NetworkBeaconProcessor { processed_blocks = sent_blocks, service= "sync", "Batch processed"); - BatchProcessResult::Success { - sent_blocks, - imported_blocks, - } + BatchProcessResult::Success } - (imported_blocks, Err(e)) => { + (_imported_blocks, Err(e)) => { debug!( %id, first_block_slot = start_slot, last_block_slot = end_slot, - imported_blocks, error = %e.message, service = "sync", "Batch processing failed"); - match e.peer_action { - Some(penalty) => BatchProcessResult::FaultyFailure { - imported_blocks, - peer_action: penalty, - error: e.message, - }, - None => BatchProcessResult::NonFaultyFailure, + BatchProcessResult::Failure { + peer_action: e.peer_action, + error: e.message, } } } @@ -185,10 +177,7 @@ impl NetworkBeaconProcessor { processed_data_columns = n_data_columns, service= "sync", "Backfill batch processed"); - 
BatchProcessResult::Success { - sent_blocks, - imported_blocks, - } + BatchProcessResult::Success } Err(e) => { debug!( @@ -200,13 +189,9 @@ impl NetworkBeaconProcessor { service = "sync", "Backfill batch processing failed" ); - match e.peer_action { - Some(peer_action) => BatchProcessResult::FaultyFailure { - imported_blocks: 0, - peer_action, - error: e.message, - }, - None => BatchProcessResult::NonFaultyFailure, + BatchProcessResult::Failure { + peer_action: e.peer_action, + error: e.message, } } } diff --git a/beacon_node/network/src/router.rs b/beacon_node/network/src/router.rs index 2426cd2c1d9..71c3de95949 100644 --- a/beacon_node/network/src/router.rs +++ b/beacon_node/network/src/router.rs @@ -296,13 +296,13 @@ impl Router { .send_status_message(peer_id, status_message), ) } - Response::BlocksByRange(beacon_block) => { + Response::BlocksByRange(_) => { crit!(id = ?app_request_id, "No BlocksByRange response expected"); } Response::BlocksByRoot(beacon_block) => { self.on_blocks_by_root_response(peer_id, app_request_id, beacon_block); } - Response::BlobsByRange(blob) => { + Response::BlobsByRange(_) => { crit!(id = ?app_request_id, "No BlobsByRange response expected"); } Response::BlobsByRoot(blob) => { @@ -311,7 +311,7 @@ impl Router { Response::DataColumnsByRoot(data_column) => { self.on_data_columns_by_root_response(peer_id, app_request_id, data_column); } - Response::DataColumnsByRange(data_column) => { + Response::DataColumnsByRange(_) => { crit!(id = ?app_request_id, "No DataColumnsByRange response expected"); } // Light client responses should not be received diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 6b23457ae4d..6d213d54dd2 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -268,16 +268,11 @@ impl BackFillSync { ) { match &mut self.status { SyncingStatus::Processing(block, _peers) => match result { - 
BatchProcessResult::Success { .. } => { + BatchProcessResult::Success => { debug!(%id, "Sync block process success"); self.status = SyncingStatus::AwaitingDownload(block.as_block().parent_root()) } - BatchProcessResult::FaultyFailure { .. } => { - debug!(%id, "Sync block process error"); - self.status = SyncingStatus::AwaitingDownload(block.block_root()) - // TODO(tree-sync): add peer to failed peers and downscore - } - BatchProcessResult::NonFaultyFailure => { + BatchProcessResult::Failure { .. } => { debug!(%id, "Sync block process error"); self.status = SyncingStatus::AwaitingDownload(block.block_root()) // TODO(tree-sync): add peer to failed peers and downscore diff --git a/beacon_node/network/src/sync/block_tree.rs b/beacon_node/network/src/sync/block_tree.rs index a8352d12398..7a075bfd5e7 100644 --- a/beacon_node/network/src/sync/block_tree.rs +++ b/beacon_node/network/src/sync/block_tree.rs @@ -1,6 +1,6 @@ -use super::network_context::{RpcResponseError, SyncNetworkContext}; +use super::network_context::{RpcRequestSendError, RpcResponseError, SyncNetworkContext}; use crate::network_beacon_processor::ChainSegmentProcessId; -use crate::sync::network_context::custody_by_root::ColumnRequest; +use crate::sync::network_context::custody_by_root::{ColumnRequest, Error as ColumnRequestError}; use crate::sync::network_context::{BatchPeers, RpcResponseResult}; use crate::sync::BatchProcessResult; use beacon_chain::block_verification_types::RpcBlock; @@ -12,7 +12,7 @@ use lighthouse_network::PeerId; use parking_lot::RwLock; use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use tracing::debug; +use tracing::{debug, warn}; use types::{BeaconBlockHeader, EthSpec, Hash256, SignedBeaconBlock, Slot}; pub struct BlockTree { @@ -29,7 +29,11 @@ struct Block { enum Status { DownloadingHeader(ColumnRequest), Header(BeaconBlockHeader), - Syncing(BeaconBlockHeader, SyncingStatus), + Syncing { + block_root: Hash256, + parent_root: Hash256, + request: SyncingStatus, + }, } 
enum SyncingStatus { @@ -59,53 +63,55 @@ impl Block { match self.status { Status::DownloadingHeader(..) => false, Status::Header(..) => false, - Status::Syncing(..) => true, + Status::Syncing { .. } => true, } } - fn header(&self) -> Option<&BeaconBlockHeader> { + fn parent_root(&self) -> Option { match &self.status { Status::DownloadingHeader(..) => None, - Status::Header(header) => Some(header), - Status::Syncing(header, _) => Some(header), + Status::Header(header) => Some(header.parent_root), + Status::Syncing { parent_root, .. } => Some(*parent_root), } } - fn parent_root(&self) -> Option { - self.header().map(|header| header.parent_root) - } - - fn parent_root_and_slot(&self) -> Option<(Hash256, Slot)> { - self.header() - .map(|header| (header.parent_root, header.slot)) - } - fn header_request( &mut self, - ) -> Result<&mut ColumnRequest, String> { + ) -> Result<&mut ColumnRequest, Error> { match &mut self.status { Status::DownloadingHeader(request) => Ok(request), - _ => Err("Expected lookup to be in DownloadingHeader state".to_owned()), + _ => Err(Error::InternalError( + "Expected lookup to be in DownloadingHeader state".to_owned(), + )), } } - fn syncing(&mut self) -> Option<(&mut BeaconBlockHeader, &mut SyncingStatus)> { + fn block_request(&mut self) -> Result<&mut SyncingStatus, Error> { match &mut self.status { - Status::Syncing(header, request) => Some((header, request)), - _ => None, + Status::Syncing { request, .. 
} => Ok(request), + _ => Err(Error::InternalError( + "Expected lookup to be in Syncing state".to_owned(), + )), } } +} - fn block_request(&mut self) -> Result<&mut SyncingStatus, String> { - match &mut self.status { - Status::Syncing(_, request) => Ok(request), - _ => Err("Expected lookup to be in Syncing state".to_owned()), - } +#[derive(Debug)] +enum Error { + InternalError(String), + BlockConflictsWithFinality(String), +} + +impl From for Error { + fn from(_e: ColumnRequestError) -> Self { + todo!(); } } -enum Error { - A, +impl From for Error { + fn from(_e: RpcRequestSendError) -> Self { + todo!(); + } } impl BlockTree { @@ -145,7 +151,7 @@ impl BlockTree { block_root: Hash256, peers: &[PeerId], cx: &mut SyncNetworkContext, - ) -> bool { + ) { if self.blocks.contains_key(&block_root) { // Add peer to `block`'s entry and all its ancestors let mut target_block_root = block_root; @@ -162,34 +168,18 @@ impl BlockTree { break; } } - - true } else { debug!(?block_root, ?peers, "Creating new header lookup"); let mut lookup = Block::new(block_root, cx.next_id(), peers); - - // TODO(tree-sync): have good peer selection - let Some(peer) = lookup.peers.iter().next() else { - todo!("no peer"); - }; - - let req_id = cx - .send_blocks_by_root_request( - *peer, - block_root, - BlocksByRootRequester::Header(lookup.id), - ) - .unwrap(); - - lookup - .header_request() - .expect("A new lookup is in DownloadingHeader request state") - .on_download_start(req_id) - .expect("A new request is in AwaitingDownload state"); - - self.blocks.insert(block_root, lookup); - true + match Self::send_block_header_request(&mut lookup, block_root, cx) { + Ok(_) => { + self.blocks.insert(block_root, lookup); + } + Err(e) => { + warn!(id = ?lookup.id, error = ?e, "Error sending initial lookup request"); + } + } } } @@ -200,91 +190,86 @@ impl BlockTree { response: RpcResponseResult>>>, peer_id: PeerId, cx: &mut SyncNetworkContext, - ) -> Result<(), String> { + ) { let block_root = lookup_id.0; - let 
Some(lookup) = self.blocks.get_mut(&block_root) else { - return Err(format!("No header lookup for root {block_root}")); - }; - let response = response.and_then(|(blocks, timestamp)| { - let block = blocks - .first() - .cloned() - .ok_or(RpcResponseError::InternalError( - "blocks_by_root response contains zero blocks".to_owned(), - ))?; - Ok((block, timestamp)) - }); - - match response { - Ok((block, received)) => { - let block_header = block.message().block_header(); - let parent_root = block_header.parent_root; - - lookup - .header_request()? - .on_download_success(req_id, peer_id, block_header.clone(), received) - .unwrap(); - lookup.status = Status::Header(block_header.clone()); - - // Once we discover the parent_root of this block three things can happen - // 1. The parent root is a known block -> stop - // 2. We conflicts with finality -> reject - // 3. The parent root is unknown -> continue search - - // TODO(tree-sync): should check if the block is descendant of finalized - // TODO(tree-sync): on finalization or every interval we should drop branches that - // conflict with finality - let parent_imported = self.chain.block_is_known_to_fork_choice(&parent_root); - let finalized_checkpoint = self.chain.head().finalized_checkpoint(); - let parent_known = self.blocks.contains_key(&parent_root); - - if block_header.slot - <= finalized_checkpoint - .epoch - .start_slot(T::EthSpec::slots_per_epoch()) - && block_root != finalized_checkpoint.root - { - panic!( - "Block {:?} {} conflicts with finalized checkpoint {:?}", - block_root, block_header.slot, finalized_checkpoint - ); + let result = (|| { + let Some(mut lookup) = self.blocks.get_mut(&block_root) else { + // TODO(tree-sync): register metric + debug!(id = ?req_id, "Received header request for unknown lookup"); + return Ok(()); + }; + + let response = response.and_then(|(blocks, timestamp)| { + let block = blocks + .first() + .cloned() + .ok_or(RpcResponseError::InternalError( + "blocks_by_root response 
contains zero blocks".to_owned(), + ))?; + Ok((block, timestamp)) + }); + + match response { + Ok((block, received)) => { + debug!(%req_id, "Forward sync block header downloaded success"); + + let block_header = block.message().block_header(); + let parent_root = block_header.parent_root; + + lookup.header_request()?.on_download_success( + req_id, + peer_id, + block_header.clone(), + received, + )?; + lookup.status = Status::Header(block_header.clone()); + + // Once we discover the parent_root of this block three things can happen + // 1. The parent root is a known block -> stop + // 2. We conflicts with finality -> reject + // 3. The parent root is unknown -> continue search + + // TODO(tree-sync): should check if the block is descendant of finalized + // TODO(tree-sync): on finalization or every interval we should drop branches that + // conflict with finality + let parent_imported = self.chain.block_is_known_to_fork_choice(&parent_root); + let finalized_checkpoint = self.chain.head().finalized_checkpoint(); + let parent_known = self.blocks.contains_key(&parent_root); + + if block_header.slot + <= finalized_checkpoint + .epoch + .start_slot(T::EthSpec::slots_per_epoch()) + && block_root != finalized_checkpoint.root + { + return Err(Error::BlockConflictsWithFinality(format!( + "Block {:?} {} conflicts with finalized checkpoint {:?}", + block_root, block_header.slot, finalized_checkpoint + ))); + } + if parent_imported || parent_known { + // Stop search we reached a known block + self.trigger_forward_sync(cx); + } else { + let lookup = self.blocks.get_mut(&block_root).expect("lookup exists"); + let peers = lookup.peers.iter().copied().collect::>(); + self.search(parent_root, &peers, cx); + } } - if parent_imported || parent_known { - // Stop search we reached a known block - self.trigger_forward_sync(cx); - } else { - let lookup = self.blocks.get_mut(&block_root).expect("lookup exists"); - let peers = lookup.peers.iter().copied().collect::>(); - 
self.search(parent_root, &peers, cx); + Err(e) => { + debug!(%req_id, error = ?e, "Forward sync block header downloaded error"); + lookup.header_request()?.on_download_error(req_id)?; + Self::send_block_header_request(lookup, block_root, cx)?; } } - Err(e) => { - lookup.header_request()?.on_download_error(req_id).unwrap(); - - // TODO(tree-sync): have good peer selection - let Some(peer) = lookup.peers.iter().next() else { - todo!("no peer"); - }; - - let req_id = cx - .send_blocks_by_root_request( - *peer, - block_root, - BlocksByRootRequester::Header(lookup.id), - ) - .unwrap(); - - lookup - .header_request() - .expect("A new lookup is in DownloadingHeader request state") - .on_download_start(req_id) - .expect("A new request is in AwaitingDownload state"); - - todo!("error {e:?}"); - } + Ok(()) + })(); + + if let Err(e) = result { + debug!(error = ?e, "Dropping forward sync block header lookup"); + self.drop_lookup_and_children(block_root); } - Ok(()) } pub fn prune(&mut self) { @@ -342,35 +327,39 @@ impl BlockTree { for _ in blocks_syncing..2 { // Find the block range with most peers and highest slot. This is the block // to be used as tip of the chain of blocks to fetch. - let Some(block_root) = self + let Some((block_root, parent_root)) = self .blocks .iter() .filter_map(|(root, block)| { - // Ignore blocks that are already being forward synced - if block.is_syncing() { - return None; - } - // Ignore block roots which header is not downloaded yet - let Some((parent_root, slot)) = block.parent_root_and_slot() else { - return None; + let header = match &block.status { + // Ignore blocks that are still downloading + Status::DownloadingHeader(_) => return None, + Status::Header(header) => header, + // Ignore blocks already syncing + Status::Syncing { .. 
} => return None, }; // Check if the parent is known in the header tree - let is_candidate = if let Some(parent) = self.blocks.get(&parent_root) { + let is_candidate = if let Some(parent) = self.blocks.get(&header.parent_root) { parent.is_syncing() } else { // TODO(tree-sync): cache this calls in the struct - cx.chain.block_is_known_to_fork_choice(&parent_root) + cx.chain.block_is_known_to_fork_choice(&header.parent_root) }; if is_candidate { // Find highest peer count, then min slot - Some((block.peer_count(), Slot::new(u64::MAX) - slot, root)) + Some(( + block.peer_count(), + Slot::new(u64::MAX) - header.slot, + root, + &header.parent_root, + )) } else { None } }) .max() - .map(|(_, _, root)| *root) + .map(|(_, _, root, parent_root)| (*root, *parent_root)) else { break; }; @@ -379,15 +368,15 @@ impl BlockTree { let block_to_sync = self .blocks .get_mut(&block_root) - .expect("Block should exist"); + .expect("block_root is a key of self.blocks"); + + // The code above ensures that `block_to_sync` is in `Status::Header` status + block_to_sync.status = Status::Syncing { + block_root, + parent_root, + request: SyncingStatus::AwaitingDownload, + }; - match &mut block_to_sync.status { - Status::Header(header) => { - block_to_sync.status = - Status::Syncing(header.clone(), SyncingStatus::AwaitingDownload); - } - _ => panic!("Unpected state"), - } debug!(id = %block_to_sync.id, "Starting forwards sync of block"); new_syncing_blocks = true; @@ -399,9 +388,13 @@ impl BlockTree { } fn continue_syncing_blocks(&mut self, cx: &mut SyncNetworkContext) { - for lookup in self.blocks.values_mut().filter(|block| block.is_syncing()) { - match &mut lookup.status { - Status::Syncing(header, syncing_status) => match syncing_status { + let mut lookups_to_drop = vec![]; + + for (block_root, lookup) in self.blocks.iter_mut() { + let result = match &mut lookup.status { + Status::DownloadingHeader(..) => continue, + Status::Header(_) => continue, + Status::Syncing { request, .. 
} => match request { SyncingStatus::AwaitingDownload => { let requester = RangeRequestId::RangeSync(lookup.id); // TODO(tree-sync) use RwLock or manually add to active request @@ -411,38 +404,55 @@ impl BlockTree { let failed_peers = HashSet::new(); match cx.block_components_by_range_request( - header.canonical_root(), + *block_root, requester, peers, &failed_peers, ) { Ok(req_id) => { - *syncing_status = SyncingStatus::Downloading(req_id); - } - Err(e) => { - // Handle send error - todo!("Error sending {e:?}"); + *request = SyncingStatus::Downloading(req_id); + Ok(()) } - }; + Err(e) => match e { + RpcRequestSendError::NoPeers + | RpcRequestSendError::InternalError(_) => { + Err(format!("Error sending block components request: {e:?}")) + } + }, + } } - SyncingStatus::Downloading(_) => {} // wait for event + SyncingStatus::Downloading(_) => Ok(()), // wait for event SyncingStatus::AwaitingProcessing(block, peers) => { - let Some(beacon_processor) = cx.beacon_processor_if_enabled() else { - todo!("processor disabled"); - }; - if let Err(e) = beacon_processor.send_chain_segment( - ChainSegmentProcessId::RangeBatchId(lookup.id), - vec![block.clone()], - ) { - todo!("error sending"); + if let Some(beacon_processor) = cx.beacon_processor_if_enabled() { + if let Err(e) = beacon_processor.send_chain_segment( + ChainSegmentProcessId::RangeBatchId(lookup.id), + vec![block.clone()], + ) { + Err(format!("Error sending block to processor: {e:?}")) + } else { + *request = SyncingStatus::Processing(peers.clone()); + Ok(()) + } + } else { + // TODO(tree-sync): This error will cause the full chain of headers to + // be dropped if the beacon processor goes offline. When can that + // happen? 
+ Err("Beacon processor is disabled".to_owned()) } - *syncing_status = SyncingStatus::Processing(peers.clone()); } - SyncingStatus::Processing(_) => {} // wait for event + SyncingStatus::Processing(_) => Ok(()), // wait for event }, - _ => panic!("bad state"), + }; + + if let Err(_e) = result { + // TODO(tree-sync): should log error? + lookups_to_drop.push(*block_root); } } + + for block_root in lookups_to_drop { + self.drop_lookup_and_children(block_root); + } } pub fn on_block_response( @@ -451,26 +461,35 @@ impl BlockTree { result: Result<(RpcBlock, BatchPeers), RpcResponseError>, cx: &mut SyncNetworkContext, ) { - // TODO(tree-sync): attach an ID to the block entry to make sure we are querying the right - // one, while still indexing by block_root only - let Some(lookup) = self.blocks.get_mut(&id.0) else { - panic!("Unknown batch id {id}"); - }; + let result = (|| { + // TODO(tree-sync): attach an ID to the block entry to make sure we are querying the right + // one, while still indexing by block_root only + let Some(lookup) = self.blocks.get_mut(&id.0) else { + // TODO(tree-sync): register metric + debug!(?id, "Received block request for unknown lookup"); + return Ok(()); + }; - let request = lookup.block_request().unwrap(); - match request { - SyncingStatus::Downloading(_) => match result { - Ok((block, peers)) => { - debug!(%id, "Sync block downloaded"); - *request = SyncingStatus::AwaitingProcessing(block, peers); - } - Err(e) => { - debug!(%id, error = ?e, "Sync block download error"); - *request = SyncingStatus::AwaitingDownload; - } - }, - _ => panic!("Bad state"), - } + let request = lookup.block_request()?; + match request { + SyncingStatus::Downloading(_) => match result { + Ok((block, peers)) => { + debug!(%id, "Sync block downloaded"); + *request = SyncingStatus::AwaitingProcessing(block, peers); + Ok(()) + } + Err(e) => { + // TODO(tree-sync): increase error counter + debug!(%id, error = ?e, "Sync block download error"); + *request = 
SyncingStatus::AwaitingDownload; + Ok(()) + } + }, + _ => Err(Error::InternalError( + "Lookup not in expected state Downloading".to_owned(), + )), + } + })(); // Continue batches self.continue_syncing_blocks(cx); @@ -482,33 +501,66 @@ impl BlockTree { result: BatchProcessResult, cx: &mut SyncNetworkContext, ) { - let Some(lookup) = self.blocks.get_mut(&id.0) else { - panic!("Unknown batch id {id}"); - }; + let result = (|| { + let Some(lookup) = self.blocks.get_mut(&id.0) else { + debug!(?id, "Received block process result for unknown lookup"); + return Ok(()); + }; - let request = lookup.block_request().unwrap(); - match request { - SyncingStatus::Processing(_peers) => match result { - BatchProcessResult::Success { .. } => { - debug!(%id, "Sync block process success"); - self.blocks.remove(&id.0); - self.trigger_forward_sync(cx); - } - BatchProcessResult::FaultyFailure { .. } => { - debug!(%id, "Sync block process error"); - *request = SyncingStatus::AwaitingDownload; - // TODO(tree-sync): add peer to failed peers and downscore - } - BatchProcessResult::NonFaultyFailure => { - debug!(%id, "Sync block process error"); - *request = SyncingStatus::AwaitingDownload; - // TODO(tree-sync): add peer to failed peers and downscore - } - }, - _ => panic!("Bad state"), - } + let request = lookup.block_request()?; + match request { + SyncingStatus::Processing(peers) => match result { + BatchProcessResult::Success => { + debug!(%id, "Sync block process success"); + self.blocks.remove(&id.0); + self.trigger_forward_sync(cx); + Ok(()) + } + BatchProcessResult::Failure { peer_action, error } => { + debug!(%id, "Sync block process error"); + + if let Some(peer_action) = peer_action { + for (peer, penalty) in peers.blame(peer_action) { + cx.report_peer(peer, penalty, "faulty_batch"); + } + } + + *request = SyncingStatus::AwaitingDownload; + + Ok(()) + } + }, + _ => Err(Error::InternalError( + "Lookup not in expected state Processing".to_owned(), + )), + } + })(); // Continue 
batches self.continue_syncing_blocks(cx); } + + fn drop_lookup_and_children(&mut self, _block_root: Hash256) { + todo!(); + } + + fn send_block_header_request( + lookup: &mut Block, + block_root: Hash256, + cx: &mut SyncNetworkContext, + ) -> Result<(), Error> { + // TODO(tree-sync): have good peer selection + let Some(peer) = lookup.peers.iter().next() else { + return Err(Error::InternalError("No peers".to_owned())); + }; + + let req_id = cx.send_blocks_by_root_request( + *peer, + block_root, + BlocksByRootRequester::Header(lookup.id), + )?; + + lookup.header_request()?.on_download_start(req_id)?; + Ok(()) + } } diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 2b209f931fb..fd4830dca2b 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -192,17 +192,12 @@ pub enum BlockProcessingResult { #[derive(Debug)] pub enum BatchProcessResult { /// The batch was completed successfully. It carries whether the sent batch contained blocks. - Success { - sent_blocks: usize, - imported_blocks: usize, - }, + Success, /// The batch processing failed. It carries whether the processing imported any block. - FaultyFailure { - imported_blocks: usize, - peer_action: PeerGroupAction, + Failure { + peer_action: Option, error: String, }, - NonFaultyFailure, } /// The primary object for handling and driving all the current syncing logic. It maintains the @@ -665,18 +660,8 @@ impl SyncManager { ) { match self.should_search_for_block(Some(slot), &peer_id) { Ok(_) => { - if self - .block_tree - .search(block_root, &[peer_id], &mut self.network) - { - // Lookup created. 
No need to log here it's logged in `new_current_lookup` - } else { - debug!( - ?block_root, - ?parent_root, - "No lookup created for child and parent" - ); - } + self.block_tree + .search(block_root, &[peer_id], &mut self.network); } Err(reason) => { debug!(%block_root, %parent_root, reason, "Ignoring unknown parent request"); @@ -687,14 +672,8 @@ impl SyncManager { fn handle_unknown_block_root(&mut self, peer_id: PeerId, block_root: Hash256) { match self.should_search_for_block(None, &peer_id) { Ok(_) => { - if self - .block_tree - .search(block_root, &[peer_id], &mut self.network) - { - // Lookup created. No need to log here it's logged in `new_current_lookup` - } else { - debug!(?block_root, "No lookup created for unknown block"); - } + self.block_tree + .search(block_root, &[peer_id], &mut self.network); } Err(reason) => { debug!(%block_root, reason, "Ignoring unknown block request"); diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index e5595bf65ee..88bc66dd0f9 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -4,9 +4,9 @@ use self::custody_by_root::ActiveCustodyByRootRequest; use super::SyncMessage; use crate::metrics; -use crate::network_beacon_processor::NetworkBeaconProcessor; #[cfg(test)] use crate::network_beacon_processor::TestBeaconChainType; +use crate::network_beacon_processor::{NetworkBeaconProcessor, PeerGroupAction}; use crate::service::NetworkMessage; use crate::status::ToStatusMessage; use beacon_chain::block_verification_types::RpcBlock; @@ -15,6 +15,7 @@ pub use block_components_by_range::BlockComponentsByRootRequest; #[cfg(test)] pub use block_components_by_range::BlockComponentsByRootRequestStep; use fnv::FnvHashMap; +use itertools::Itertools; use lighthouse_network::rpc::methods::{ BlobsByRootRequest, BlocksByRootRequest, DataColumnsByRootRequest, }; @@ -125,20 +126,13 @@ pub struct PeerGroup { } impl PeerGroup 
{ - pub fn empty() -> Self { + pub(crate) fn empty() -> Self { Self { peers: HashMap::new(), } } - /// Return a peer group where a single peer returned all parts of a block component. For - /// example, a block has a single component (the block = index 0/1). - pub fn from_single(peer: PeerId) -> Self { - Self { - peers: HashMap::from_iter([(0, peer)]), - } - } - pub fn from_set(peer_to_indices: HashMap>) -> Self { + pub(crate) fn from_set(peer_to_indices: HashMap>) -> Self { let mut peers = HashMap::new(); for (peer, indices) in peer_to_indices { for index in indices { @@ -147,15 +141,9 @@ impl PeerGroup { } Self { peers } } - pub fn all(&self) -> impl Iterator + '_ { - self.peers.values() - } - pub fn of_index(&self, index: &usize) -> Option<&PeerId> { - self.peers.get(index) - } - pub fn as_map(&self) -> &HashMap { - &self.peers + pub(crate) fn of_index(&self, index: &usize) -> Option<&PeerId> { + self.peers.get(index) } } @@ -166,24 +154,44 @@ pub struct BatchPeers { } impl BatchPeers { - pub fn new_from_block_peer(block_peer: PeerId) -> Self { + pub(crate) fn new_from_block_peer(block_peer: PeerId) -> Self { Self { block_peer, column_peers: PeerGroup::empty(), } } - pub fn new(block_peer: PeerId, column_peers: PeerGroup) -> Self { + pub(crate) fn new(block_peer: PeerId, column_peers: PeerGroup) -> Self { Self { block_peer, column_peers, } } - pub fn block(&self) -> PeerId { + pub(crate) fn blame(&self, peer_action: PeerGroupAction) -> Vec<(PeerId, PeerAction)> { + // Penalize each peer only once. Currently a peer_action does not mix different + // PeerAction levels. + let mut peer_penalties = peer_action + .column_peer + .iter() + .filter_map(|(column_index, penalty)| { + self.column(column_index).map(|peer| (*peer, *penalty)) + }) + .unique() + .collect::>(); + + if let Some(penalty) = peer_action.block_peer { + // Penalize the peer appropiately. 
+ peer_penalties.push((self.block(), penalty)); + } + + peer_penalties + } + + fn block(&self) -> PeerId { self.block_peer } - pub fn column(&self, index: &ColumnIndex) -> Option<&PeerId> { + fn column(&self, index: &ColumnIndex) -> Option<&PeerId> { self.column_peers.of_index(&((*index) as usize)) } } @@ -191,26 +199,6 @@ impl BatchPeers { /// Sequential ID that uniquely identifies ReqResp outgoing requests pub type ReqId = u32; -pub enum LookupRequestResult { - /// A request is sent. Sync MUST receive an event from the network in the future for either: - /// completed response or failed request - RequestSent(I), - /// No request is sent, and no further action is necessary to consider this request completed. - /// Includes a reason why this request is not needed. - NoRequestNeeded(&'static str), - /// No request is sent, but the request is not completed. Sync MUST receive some future event - /// that makes progress on the request. For example: request is processing from a different - /// source (i.e. block received from gossip) and sync MUST receive an event with that processing - /// result. - Pending(&'static str), -} - -#[derive(Clone)] -pub struct BlocksByRootSameForkRequest { - pub block_roots: Vec, - pub fork: ForkName, -} - /// Wraps a Network channel to employ various RPC related network functionality for the Sync manager. This includes management of a global RPC request Id. pub struct SyncNetworkContext { /// The network channel to relay messages to the Network service. @@ -493,7 +481,7 @@ impl SyncNetworkContext { block_root: Hash256, indices: Vec, expect_max_responses: bool, - ) -> Result, &'static str> { + ) -> Result { let span = span!( Level::INFO, "SyncNetworkContext", @@ -536,7 +524,7 @@ impl SyncNetworkContext { DataColumnsByRootRequestItems::new(block_root, indices), ); - Ok(LookupRequestResult::RequestSent(id)) + Ok(id) } /// Request to fetch all needed custody columns of a specific block. 
This function may not send @@ -1046,17 +1034,3 @@ impl SyncNetworkContext { } } } - -fn to_fixed_blob_sidecar_list( - blobs: Vec>>, - max_len: usize, -) -> Result, LookupVerifyError> { - let mut fixed_list = FixedBlobSidecarList::new(vec![None; max_len]); - for blob in blobs.into_iter() { - let index = blob.index as usize; - *fixed_list - .get_mut(index) - .ok_or(LookupVerifyError::UnrequestedIndex(index as u64))? = Some(blob) - } - Ok(fixed_list) -} diff --git a/beacon_node/network/src/sync/network_context/custody_by_root.rs b/beacon_node/network/src/sync/network_context/custody_by_root.rs index ab09e1a8674..15a62072bd0 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_root.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_root.rs @@ -16,7 +16,7 @@ use strum::IntoStaticStr; use tracing::{debug, warn}; use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Hash256}; -use super::{LookupRequestResult, PeerGroup, RpcResponseResult, SyncNetworkContext}; +use super::{PeerGroup, RpcResponseResult, SyncNetworkContext}; const FAILED_PEERS_CACHE_EXPIRY_SECONDS: u64 = 5; const REQUEST_EXPIRY_SECONDS: u64 = 300; @@ -303,7 +303,7 @@ impl ActiveCustodyByRootRequest { } for (peer_id, indices) in columns_to_request_by_peer.into_iter() { - let request_result = cx + let req_id = cx .data_columns_by_root_request( DataColumnsByRootRequester::Custody(self.custody_id), peer_id, @@ -319,24 +319,18 @@ impl ActiveCustodyByRootRequest { Error::InternalError(format!("Send failed data_columns_by_root {e:?}")) })?; - match request_result { - LookupRequestResult::RequestSent(req_id) => { - for column_index in &indices { - let column_request = self - .column_requests - .get_mut(column_index) - // Should never happen: column_index is iterated from column_requests - .ok_or(Error::InternalError("unknown column_index".to_owned()))?; + for column_index in &indices { + let column_request = self + .column_requests + 
.get_mut(column_index) + // Should never happen: column_index is iterated from column_requests + .ok_or(Error::InternalError("unknown column_index".to_owned()))?; - column_request.on_download_start(req_id)?; - } - - self.active_batch_columns_requests - .insert(req_id, ActiveBatchColumnsRequest { indices }); - } - LookupRequestResult::NoRequestNeeded(_) => unreachable!(), - LookupRequestResult::Pending(_) => unreachable!(), + column_request.on_download_start(req_id)?; } + + self.active_batch_columns_requests + .insert(req_id, ActiveBatchColumnsRequest { indices }); } if self.start_time.elapsed() > Duration::from_secs(REQUEST_EXPIRY_SECONDS) @@ -474,13 +468,6 @@ impl ColumnRequest { } } - pub fn peek_downloaded_data(&self) -> Option<&T> { - match &self.status { - Status::Downloaded(_, data, _) => Some(data), - _ => None, - } - } - pub fn complete(self) -> Result<(PeerId, T, Duration), Error> { match self.status { Status::Downloaded(peer_id, data_column, seen_timestamp) => { From 060e5e26b3df9168454f4cb0330adb5f812bc959 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sat, 21 Jun 2025 13:02:32 +0200 Subject: [PATCH 30/66] More lints --- .../network_beacon_processor/sync_methods.rs | 2 +- .../network/src/sync/backfill_sync/mod.rs | 18 +++---- beacon_node/network/src/sync/block_tree.rs | 54 +++++++------------ .../network/src/sync/network_context.rs | 1 - 4 files changed, 27 insertions(+), 48 deletions(-) diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 264cde106c4..058b2ba0ca8 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -166,7 +166,7 @@ impl NetworkBeaconProcessor { .sum::(); match self.process_backfill_blocks(downloaded_blocks) { - Ok(imported_blocks) => { + Ok(_imported_blocks) => { debug!( batch_epoch = %epoch, 
first_block_slot = start_slot, diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 6d213d54dd2..1e4b15a2487 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -106,17 +106,11 @@ impl BackFillSync { // If, for some reason a backfill has already been completed (or we've used a trusted // genesis root) then backfill has been completed. let anchor_info = beacon_chain.store.get_anchor_info(); - let (state, current_start) = - if anchor_info.block_backfill_complete(beacon_chain.genesis_backfill_slot) { - (BackFillState::Completed, Epoch::new(0)) - } else { - ( - BackFillState::Paused, - anchor_info - .oldest_block_slot - .epoch(T::EthSpec::slots_per_epoch()), - ) - }; + let state = if anchor_info.block_backfill_complete(beacon_chain.genesis_backfill_slot) { + BackFillState::Completed + } else { + BackFillState::Paused + }; let bfs = BackFillSync { status: SyncingStatus::AwaitingDownload(anchor_info.oldest_block_parent), @@ -248,7 +242,7 @@ impl BackFillSync { } Err(e) => { // TODO(tree-sync): Handle the error explicitly with a match, check unstable - debug!(%id, "Sync block download error"); + debug!(%id, error = ?e, "Sync block download error"); self.status = SyncingStatus::AwaitingDownload(block_root); } } diff --git a/beacon_node/network/src/sync/block_tree.rs b/beacon_node/network/src/sync/block_tree.rs index 7a075bfd5e7..b9bb1a3bcce 100644 --- a/beacon_node/network/src/sync/block_tree.rs +++ b/beacon_node/network/src/sync/block_tree.rs @@ -194,7 +194,7 @@ impl BlockTree { let block_root = lookup_id.0; let result = (|| { - let Some(mut lookup) = self.blocks.get_mut(&block_root) else { + let Some(lookup) = self.blocks.get_mut(&block_root) else { // TODO(tree-sync): register metric debug!(id = ?req_id, "Received header request for unknown lookup"); return Ok(()); @@ -280,27 +280,6 @@ impl BlockTree { todo!(); } - fn 
mark_descendants_as_rooted(&mut self, _block_root: Hash256) { - // TODO: iterate all blocks and mark descendants of `block_root` as rooted - } - - fn mark_as_syncing(&mut self, _blocks: &[Hash256]) { - // TODO: mark all this block entries as syncing - } - - fn collect_ancestors(&self, mut block_root: Hash256) -> Vec { - let mut ancestors = vec![]; - while let Some(block) = self.blocks.get(&block_root) { - ancestors.push(block_root); - if let Some(parent_root) = block.parent_root() { - block_root = parent_root; - } else { - break; - } - } - ancestors - } - /// Marks blocks ready for download as syncing /// Should be called anytime: /// - A new block is imported to fork-choice @@ -423,21 +402,28 @@ impl BlockTree { } SyncingStatus::Downloading(_) => Ok(()), // wait for event SyncingStatus::AwaitingProcessing(block, peers) => { - if let Some(beacon_processor) = cx.beacon_processor_if_enabled() { - if let Err(e) = beacon_processor.send_chain_segment( - ChainSegmentProcessId::RangeBatchId(lookup.id), - vec![block.clone()], - ) { - Err(format!("Error sending block to processor: {e:?}")) + if cx + .chain + .block_is_known_to_fork_choice(&block.as_block().parent_root()) + { + if let Some(beacon_processor) = cx.beacon_processor_if_enabled() { + if let Err(e) = beacon_processor.send_chain_segment( + ChainSegmentProcessId::RangeBatchId(lookup.id), + vec![block.clone()], + ) { + Err(format!("Error sending block to processor: {e:?}")) + } else { + *request = SyncingStatus::Processing(peers.clone()); + Ok(()) + } } else { - *request = SyncingStatus::Processing(peers.clone()); - Ok(()) + // TODO(tree-sync): This error will cause the full chain of headers to + // be dropped if the beacon processor goes offline. When can that + // happen? + Err("Beacon processor is disabled".to_owned()) } } else { - // TODO(tree-sync): This error will cause the full chain of headers to - // be dropped if the beacon processor goes offline. When can that - // happen? 
- Err("Beacon processor is disabled".to_owned()) + Ok(()) } } SyncingStatus::Processing(_) => Ok(()), // wait for event diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 88bc66dd0f9..da2457857e0 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -43,7 +43,6 @@ use std::time::Duration; use task_executor::TaskExecutor; use tokio::sync::mpsc; use tracing::{debug, span, warn, Level}; -use types::blob_sidecar::FixedBlobSidecarList; use types::{ BlobIdentifier, BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, DataColumnsByRootIdentifier, EthSpec, ForkContext, ForkName, Hash256, RuntimeVariableList, From e62dc910e65091cf7ed8c4f2bbeb846396a6adba Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sat, 21 Jun 2025 14:04:23 +0200 Subject: [PATCH 31/66] Resolve todos --- beacon_node/client/src/notifier.rs | 4 +- beacon_node/http_api/src/lib.rs | 7 +- .../src/peer_manager/network_behaviour.rs | 5 +- .../src/network_beacon_processor/mod.rs | 15 +-- .../network/src/sync/backfill_sync/mod.rs | 14 +-- beacon_node/network/src/sync/block_tree.rs | 36 ++++++ beacon_node/network/src/sync/manager.rs | 109 +++++++++++++++--- common/eth2/src/lighthouse/sync_state.rs | 32 +---- 8 files changed, 147 insertions(+), 75 deletions(-) diff --git a/beacon_node/client/src/notifier.rs b/beacon_node/client/src/notifier.rs index 53c9c85c001..57dd8b0a34a 100644 --- a/beacon_node/client/src/notifier.rs +++ b/beacon_node/client/src/notifier.rs @@ -146,9 +146,7 @@ pub fn spawn_notifier( Instant::now(), ); } - SyncState::SyncingFinalized { .. } - | SyncState::SyncingHead { .. } - | SyncState::SyncTransition => { + SyncState::Syncing { .. 
} | SyncState::SyncTransition => { speedo.observe(head_slot, Instant::now()); } SyncState::Stalled | SyncState::Synced => {} diff --git a/beacon_node/http_api/src/lib.rs b/beacon_node/http_api/src/lib.rs index a4ec41ac06c..5d764464167 100644 --- a/beacon_node/http_api/src/lib.rs +++ b/beacon_node/http_api/src/lib.rs @@ -472,7 +472,8 @@ pub fn serve( move |network_globals: Arc>, chain: Arc>| async move { match *network_globals.sync_state.read() { - SyncState::SyncingFinalized { .. } => { + // TODO(tree-sync): review, we don't have a notion of finalized sync now + SyncState::Syncing { .. } => { let head_slot = chain.canonical_head.cached_head().head_slot(); let current_slot = @@ -494,9 +495,7 @@ pub fn serve( ))) } } - SyncState::SyncingHead { .. } - | SyncState::SyncTransition - | SyncState::BackFillSyncing { .. } => Ok(()), + SyncState::SyncTransition | SyncState::BackFillSyncing { .. } => Ok(()), SyncState::Synced => Ok(()), SyncState::Stalled => Ok(()), } diff --git a/beacon_node/lighthouse_network/src/peer_manager/network_behaviour.rs b/beacon_node/lighthouse_network/src/peer_manager/network_behaviour.rs index 1ad55ce5c4a..ca5dfafa352 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/network_behaviour.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/network_behaviour.rs @@ -79,10 +79,7 @@ impl NetworkBehaviour for PeerManager { } } - if !matches!( - self.network_globals.sync_state(), - SyncState::SyncingFinalized { .. } | SyncState::SyncingHead { .. } - ) { + if !matches!(self.network_globals.sync_state(), SyncState::Syncing { .. 
}) { loop { match self.status_peers.poll_next_unpin(cx) { Poll::Ready(Some(Ok(peer_id))) => { diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index 534057fae40..7ba52a62fd6 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -525,16 +525,11 @@ impl NetworkBeaconProcessor { let processor = self.clone(); let process_fn = async move { - let notify_execution_layer = if processor - .network_globals - .sync_state - .read() - .is_syncing_finalized() - { - NotifyExecutionLayer::No - } else { - NotifyExecutionLayer::Yes - }; + // TODO(tree-sync): Now that we group peers in a header tree they could have diverging + // opinions on what's finalized and what's not. So don't have a clear yes / no to guess + // if this block is finalized or not. Review the optimization of NOT notifying the + // execution layer if we belive this block is finalized. + let notify_execution_layer = NotifyExecutionLayer::Yes; processor .process_chain_segment(process_id, blocks, notify_execution_layer) .await; diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 1e4b15a2487..21c5504bd20 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -34,12 +34,7 @@ const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 10; /// Return type when attempting to start the backfill sync process. pub enum SyncStart { /// The chain started syncing or is already syncing. - Syncing { - /// The number of slots that have been processed so far. - completed: usize, - /// The number of slots still to be processed. - remaining: usize, - }, + Syncing, /// The chain didn't start syncing. 
NotSyncing, } @@ -184,12 +179,7 @@ impl BackFillSync { BackFillState::Completed => return Ok(SyncStart::NotSyncing), } - Ok(SyncStart::Syncing { - // TODO(tree-sync): is this actually used? The remaining does not account for the 6 - // months of data expiration - completed: todo!(), - remaining: todo!(), - }) + Ok(SyncStart::Syncing) } /// A fully synced peer has joined us. diff --git a/beacon_node/network/src/sync/block_tree.rs b/beacon_node/network/src/sync/block_tree.rs index b9bb1a3bcce..f8dc4ac2e40 100644 --- a/beacon_node/network/src/sync/block_tree.rs +++ b/beacon_node/network/src/sync/block_tree.rs @@ -94,6 +94,17 @@ impl Block { )), } } + + fn assert_expected_lookup_id(&self, lookup_id: HeaderLookupId) -> Result<(), Error> { + if self.id == lookup_id { + Ok(()) + } else { + Err(Error::InternalError(format!( + "Unexpected lookup ID {} != {}", + self.id, lookup_id + ))) + } + } } #[derive(Debug)] @@ -114,6 +125,11 @@ impl From for Error { } } +pub(crate) enum SyncState { + Synced, + Syncing { max_slot: Slot }, +} + impl BlockTree { pub fn new(chain: Arc>) -> Self { Self { @@ -122,6 +138,23 @@ impl BlockTree { } } + pub fn block_count(&self) -> usize { + self.blocks.len() + } + + pub fn max_slot_to_sync(&self) -> Option { + // TODO(tree-sync): weak metric, who have a better heuristic for sync? Now that lookups + // count here + self.blocks + .values() + .filter_map(|block| match &block.status { + Status::DownloadingHeader(..) => None, + Status::Header(header) => Some(header.slot), + Status::Syncing { .. 
} => None, + }) + .max() + } + #[cfg(test)] pub fn get_processing_ids(&self) -> Vec { self.blocks @@ -199,6 +232,7 @@ impl BlockTree { debug!(id = ?req_id, "Received header request for unknown lookup"); return Ok(()); }; + lookup.assert_expected_lookup_id(lookup_id)?; let response = response.and_then(|(blocks, timestamp)| { let block = blocks @@ -455,6 +489,7 @@ impl BlockTree { debug!(?id, "Received block request for unknown lookup"); return Ok(()); }; + lookup.assert_expected_lookup_id(id)?; let request = lookup.block_request()?; match request { @@ -492,6 +527,7 @@ impl BlockTree { debug!(?id, "Received block process result for unknown lookup"); return Ok(()); }; + lookup.assert_expected_lookup_id(id)?; let request = lookup.block_request()?; match request { diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index fd4830dca2b..b547a556e36 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -45,6 +45,7 @@ use crate::network_beacon_processor::{ }; use crate::service::NetworkMessage; use crate::status::ToStatusMessage; +use crate::sync::backfill_sync::SyncStart; use beacon_chain::block_verification_types::AsBlock; use beacon_chain::{ AvailabilityProcessingStatus, BeaconChain, BeaconChainTypes, BlockError, EngineState, @@ -56,16 +57,15 @@ use lighthouse_network::service::api_types::{ CustodyByRootRequestId, DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, SamplingId, SamplingRequester, SyncRequestId, }; -use lighthouse_network::types::NetworkGlobals; -use lighthouse_network::PeerId; -use lighthouse_network::SyncInfo; +use lighthouse_network::types::{NetworkGlobals, SyncState}; +use lighthouse_network::{PeerId, SyncInfo}; use logging::crit; use lru_cache::LRUTimeCache; use std::ops::Sub; use std::sync::Arc; use std::time::Duration; use tokio::sync::mpsc; -use tracing::{debug, error, info_span, trace, warn, Instrument}; +use tracing::{debug, error, info, info_span, 
trace, warn, Instrument}; use types::{ BlobSidecar, DataColumnSidecar, EthSpec, ForkContext, Hash256, SignedBeaconBlock, Slot, }; @@ -349,6 +349,7 @@ impl SyncManager { finalized_root: status.finalized_root, }; + // Search for any block that is unknown and more recent than finality debug!(?remote, ?local, "new peer"); if !self.chain.block_is_known_to_fork_choice(&remote.head_root) && remote.head_slot @@ -362,16 +363,9 @@ impl SyncManager { let sync_type = remote_sync_type(&local, &remote, &self.chain); - // update the state of the peer. - let is_still_connected = self.update_peer_sync_state(&peer_id, &local, &remote, &sync_type); - if is_still_connected { - match sync_type { - PeerSyncType::Behind => {} - PeerSyncType::Advanced | PeerSyncType::FullySynced => { - self.backfill_sync.add_peer(peer_id); - } - } - } + // TODO(tree-sync): Okay to add all peers to backfill sync? How can we know which have the + // blocks we need? + self.backfill_sync.add_peer(peer_id); self.update_sync_state(); @@ -493,8 +487,91 @@ impl SyncManager { /// - If there is no range sync and no required backfill and we have synced up to the currently /// known peers, we consider ourselves synced. fn update_sync_state(&mut self) { - // TODO(tree-sync): re-think how to set a sync state - todo!(); + // TODO(tree-sync): We could just iterate the PeerDB and count the most common head as the + // sync target. + + let forward_sync_active = if self.block_tree.block_count() > 32 { + self.block_tree.max_slot_to_sync() + } else { + None + }; + + let new_state: SyncState = match forward_sync_active { + None => { + // No range sync, so we decide if we are stalled or synced. + // For this we check if there is at least one advanced peer. An advanced peer + // with Idle range is possible since a peer's status is updated periodically. + // If we synced a peer between status messages, most likely the peer has + // advanced and will produce a head chain on re-status. 
Otherwise it will shift + // to being synced + let mut sync_state = { + let head = self.chain.best_slot(); + let current_slot = self.chain.slot().unwrap_or_else(|_| Slot::new(0)); + + let peers = self.network_globals().peers.read(); + if current_slot >= head + && current_slot.sub(head) <= (SLOT_IMPORT_TOLERANCE as u64) + && head > 0 + { + SyncState::Synced + } else if peers.advanced_peers().next().is_some() { + SyncState::SyncTransition + } else if peers.synced_peers().next().is_none() { + SyncState::Stalled + } else { + // There are no peers that require syncing and we have at least one synced + // peer + SyncState::Synced + } + }; + + // If we would otherwise be synced, first check if we need to perform or + // complete a backfill sync. + #[cfg(not(feature = "disable-backfill"))] + if matches!(sync_state, SyncState::Synced) { + // Determine if we need to start/resume/restart a backfill sync. + match self.backfill_sync.start(&mut self.network) { + Ok(SyncStart::Syncing) => { + sync_state = SyncState::BackFillSyncing; + } + Ok(SyncStart::NotSyncing) => {} // Ignore updating the state if the backfill sync state didn't start. + Err(e) => { + error!(error = ?e, "Backfill sync failed to start"); + } + } + } + + // Return the sync state if backfilling is not required. + sync_state + } + Some(target_slot) => { + // If there is a backfill sync in progress pause it. + #[cfg(not(feature = "disable-backfill"))] + self.backfill_sync.pause(); + + SyncState::Syncing { + start_slot: self.chain.best_slot(), + target_slot, + } + } + }; + + let old_state = self.network_globals().set_sync_state(new_state); + let new_state = self.network_globals().sync_state.read().clone(); + if !new_state.eq(&old_state) { + info!(%old_state, %new_state, "Sync state updated"); + // If we have become synced - Subscribe to all the core subnet topics + // We don't need to subscribe if the old state is a state that would have already + // invoked this call. 
+ if new_state.is_synced() + && !matches!( + old_state, + SyncState::Synced | SyncState::BackFillSyncing { .. } + ) + { + self.network.subscribe_core_topics(); + } + } } /// The main driving future for the sync manager. diff --git a/common/eth2/src/lighthouse/sync_state.rs b/common/eth2/src/lighthouse/sync_state.rs index 0327f7073fa..793070432a6 100644 --- a/common/eth2/src/lighthouse/sync_state.rs +++ b/common/eth2/src/lighthouse/sync_state.rs @@ -4,17 +4,13 @@ use types::Slot; /// The current state of the node. #[derive(Clone, Debug, Serialize, Deserialize)] pub enum SyncState { - /// The node is performing a long-range (batch) sync over a finalized chain. - /// In this state, parent lookups are disabled. - SyncingFinalized { start_slot: Slot, target_slot: Slot }, - /// The node is performing a long-range (batch) sync over one or many head chains. - /// In this state parent lookups are disabled. - SyncingHead { start_slot: Slot, target_slot: Slot }, + /// The node is syncing one or many chains, either finalized or not + Syncing { start_slot: Slot, target_slot: Slot }, /// The node is undertaking a backfill sync. This occurs when a user has specified a trusted /// state. The node first syncs "forward" by downloading blocks up to the current head as /// specified by its peers. Once completed, the node enters this sync state and attempts to /// download all required historical blocks. - BackFillSyncing { completed: usize, remaining: usize }, + BackFillSyncing, /// The node has completed syncing a finalized chain and is in the process of re-evaluating /// which sync state to progress to. SyncTransition, @@ -43,10 +39,7 @@ impl PartialEq for SyncState { fn eq(&self, other: &Self) -> bool { matches!( (self, other), - ( - SyncState::SyncingFinalized { .. }, - SyncState::SyncingFinalized { .. } - ) | (SyncState::SyncingHead { .. }, SyncState::SyncingHead { .. }) + (SyncState::Syncing { .. }, SyncState::Syncing { .. 
}) | (SyncState::Synced, SyncState::Synced) | (SyncState::Stalled, SyncState::Stalled) | (SyncState::SyncTransition, SyncState::SyncTransition) @@ -62,8 +55,7 @@ impl SyncState { /// Returns a boolean indicating the node is currently performing a long-range sync. pub fn is_syncing(&self) -> bool { match self { - SyncState::SyncingFinalized { .. } => true, - SyncState::SyncingHead { .. } => true, + SyncState::Syncing { .. } => true, SyncState::SyncTransition => true, // Backfill doesn't effect any logic, we consider this state, not syncing. SyncState::BackFillSyncing { .. } => false, @@ -72,17 +64,6 @@ impl SyncState { } } - pub fn is_syncing_finalized(&self) -> bool { - match self { - SyncState::SyncingFinalized { .. } => true, - SyncState::SyncingHead { .. } => false, - SyncState::SyncTransition => false, - SyncState::BackFillSyncing { .. } => false, - SyncState::Synced => false, - SyncState::Stalled => false, - } - } - /// Returns true if the node is synced. /// /// NOTE: We consider the node synced if it is fetching old historical blocks. @@ -102,8 +83,7 @@ impl SyncState { impl std::fmt::Display for SyncState { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - SyncState::SyncingFinalized { .. } => write!(f, "Syncing Finalized Chain"), - SyncState::SyncingHead { .. } => write!(f, "Syncing Head Chain"), + SyncState::Syncing { .. 
} => write!(f, "Syncing"), SyncState::Synced => write!(f, "Synced"), SyncState::Stalled => write!(f, "Stalled"), SyncState::SyncTransition => write!(f, "Evaluating known peers"), From 6b0e16114f5fae1ab7bf476c5cb115ec13b4259c Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sat, 21 Jun 2025 18:37:19 +0200 Subject: [PATCH 32/66] Temp --- .../src/service/api_types.rs | 16 +- .../src/network_beacon_processor/mod.rs | 2 +- .../network_beacon_processor/sync_methods.rs | 12 +- .../network/src/sync/backfill_sync/mod.rs | 123 ++-- beacon_node/network/src/sync/block_tree.rs | 586 ++++++++++++------ beacon_node/network/src/sync/manager.rs | 20 +- .../block_components_by_range.rs | 2 +- .../sync/network_context/custody_by_root.rs | 4 + 8 files changed, 456 insertions(+), 309 deletions(-) diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index e739775763c..349c6127081 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -2,7 +2,7 @@ use crate::rpc::methods::{ResponseTermination, RpcResponse, RpcSuccessResponse, use std::fmt::{Display, Formatter}; use std::sync::Arc; use types::{ - BlobSidecar, DataColumnSidecar, Epoch, EthSpec, Hash256, LightClientBootstrap, + BlobSidecar, DataColumnSidecar, EthSpec, Hash256, LightClientBootstrap, LightClientFinalityUpdate, LightClientOptimisticUpdate, LightClientUpdate, SignedBeaconBlock, }; @@ -67,14 +67,14 @@ pub struct ComponentsByRootRequestId { /// Range sync chain or backfill batch #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub enum RangeRequestId { - RangeSync(HeaderLookupId), + ForwardSync(HeaderLookupId), BackfillSync(Id), } #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub enum BlocksByRootRequester { Header(HeaderLookupId), - RangeSync(ComponentsByRootRequestId), + ForwardSync(ComponentsByRootRequestId), } #[derive(Debug, 
Hash, PartialEq, Eq, Clone, Copy)] @@ -83,12 +83,6 @@ pub enum DataColumnsByRootRequester { Custody(CustodyByRootRequestId), } -#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] -pub enum RangeRequester { - RangeSync { chain_id: u64, batch_id: Epoch }, - BackfillSync { batch_id: Epoch }, -} - #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub struct SamplingId { pub id: SamplingRequester, @@ -252,7 +246,7 @@ impl Display for CustodyRequester { impl Display for RangeRequestId { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { - Self::RangeSync(id) => write!(f, "RangeSync/{id}"), + Self::ForwardSync(id) => write!(f, "ForwardSync/{id}"), Self::BackfillSync(id) => write!(f, "BackfillSync/{id}"), } } @@ -262,7 +256,7 @@ impl Display for BlocksByRootRequester { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { Self::Header(id) => write!(f, "Header/{id}"), - Self::RangeSync(id) => write!(f, "RangeSync/{id}"), + Self::ForwardSync(id) => write!(f, "ForwardSync/{id}"), } } } diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index 7ba52a62fd6..c8eb5c5571d 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -520,7 +520,7 @@ impl NetworkBeaconProcessor { process_id: ChainSegmentProcessId, blocks: Vec>, ) -> Result<(), Error> { - let is_backfill = matches!(&process_id, ChainSegmentProcessId::BackSyncBatchId { .. }); + let is_backfill = matches!(&process_id, ChainSegmentProcessId::BackfillSync { .. 
}); debug!(blocks = blocks.len(), id = %process_id, "Batch sending for process"); let processor = self.clone(); diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 058b2ba0ca8..3dd38d4ce2b 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -21,9 +21,9 @@ use types::{ColumnIndex, DataColumnSidecar, Hash256}; #[derive(Clone, Debug, PartialEq)] pub enum ChainSegmentProcessId { /// Processing Id of a range syncing batch. - RangeBatchId(HeaderLookupId), + ForwardSync(HeaderLookupId), /// Processing ID for a backfill syncing batch. - BackSyncBatchId(Id), + BackfillSync(Id), } /// Returned when a chain segment import fails. @@ -117,7 +117,7 @@ impl NetworkBeaconProcessor { ) { let result = match sync_type { // this a request from the range sync - ChainSegmentProcessId::RangeBatchId(id) => { + ChainSegmentProcessId::ForwardSync(id) => { let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64()); let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64()); let sent_blocks = downloaded_blocks.len(); @@ -152,7 +152,7 @@ impl NetworkBeaconProcessor { } } // this a request from the Backfill sync - ChainSegmentProcessId::BackSyncBatchId(epoch) => { + ChainSegmentProcessId::BackfillSync(epoch) => { let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64()); let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64()); let sent_blocks = downloaded_blocks.len(); @@ -417,8 +417,8 @@ impl NetworkBeaconProcessor { impl Display for ChainSegmentProcessId { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { - Self::RangeBatchId(id) => write!(f, "RangeBatchId/{id}"), - Self::BackSyncBatchId(id) => write!(f, "BackSyncBatchId/{id}"), + Self::ForwardSync(id) => write!(f, "ForwardSync/{id}"), + Self::BackfillSync(id) => write!(f, 
"BackfillSync/{id}"), } } } diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 21c5504bd20..cd4388411f5 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -8,7 +8,7 @@ //! If a batch fails, the backfill sync cannot progress. In this scenario, we mark the backfill //! sync as failed, log an error and attempt to retry once a new peer joins the node. -use crate::network_beacon_processor::ChainSegmentProcessId; +use crate::sync::block_tree::{Error as TempError, SyncBlock, SyncBlockResult}; use crate::sync::manager::BatchProcessResult; use crate::sync::network_context::{ BatchPeers, RangeRequestId, RpcResponseError, SyncNetworkContext, @@ -22,7 +22,7 @@ use parking_lot::RwLock; use std::collections::HashSet; use std::sync::Arc; use tracing::{debug, info, instrument, warn}; -use types::{Epoch, EthSpec, Hash256}; +use types::{EthSpec, Hash256}; /// The number of times to retry a batch before it is considered failed. const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 10; @@ -71,14 +71,12 @@ enum SyncingStatus { } pub struct BackFillSync { - status: SyncingStatus, + status: SyncBlock, /// When a backfill sync fails, we keep track of whether a new fully synced peer has joined. /// This signifies that we are able to attempt to restart a failed chain. restart_failed_sync: bool, - peers: Arc>>, - /// Reference to the beacon chain to obtain initial starting points for the backfill sync. 
beacon_chain: Arc>, @@ -108,9 +106,12 @@ impl BackFillSync { }; let bfs = BackFillSync { - status: SyncingStatus::AwaitingDownload(anchor_info.oldest_block_parent), + status: SyncBlock::new( + RangeRequestId::BackfillSync(0), + anchor_info.oldest_block_parent, + &[], + ), restart_failed_sync: false, - peers: <_>::default(), beacon_chain, network_globals, }; @@ -198,7 +199,7 @@ impl BackFillSync { } pub fn add_peer(&mut self, peer_id: PeerId) { - self.peers.write().insert(peer_id); + self.status.peers.write().insert(peer_id); } pub fn peer_disconnected(&mut self, peer_id: &PeerId) { @@ -213,102 +214,54 @@ impl BackFillSync { } } - pub fn on_block_response( + pub fn on_block_download_result( &mut self, id: Id, result: Result<(RpcBlock, BatchPeers), RpcResponseError>, cx: &mut SyncNetworkContext, ) { - match self.status { - SyncingStatus::Downloading(block_root, expected_id) => { - if id != expected_id { - panic!("unexpected ID"); - } - match result { - Ok((block, peers)) => { - // TODO(tree-sync): check that id matches - debug!(%id, "Sync block downloaded"); - self.status = SyncingStatus::Processing(block, peers); - } - Err(e) => { - // TODO(tree-sync): Handle the error explicitly with a match, check unstable - debug!(%id, error = ?e, "Sync block download error"); - self.status = SyncingStatus::AwaitingDownload(block_root); - } - } - } - _ => panic!("Bad state"), - } - - // Continue batches - self.continue_syncing_blocks(cx); + let outcome = self.status.on_download_result(result, cx); + self.handle_outcome(outcome, cx); } - pub fn handle_block_process_result( + pub fn on_block_process_result( &mut self, id: Id, result: BatchProcessResult, cx: &mut SyncNetworkContext, ) { - match &mut self.status { - SyncingStatus::Processing(block, _peers) => match result { - BatchProcessResult::Success => { - debug!(%id, "Sync block process success"); - self.status = SyncingStatus::AwaitingDownload(block.as_block().parent_root()) - } - BatchProcessResult::Failure { .. 
} => { - debug!(%id, "Sync block process error"); - self.status = SyncingStatus::AwaitingDownload(block.block_root()) - // TODO(tree-sync): add peer to failed peers and downscore - } - }, - _ => panic!("Bad state"), - } - - // Continue batches - self.continue_syncing_blocks(cx); + let outcome = self.status.on_process_result(result, cx); + self.handle_outcome(outcome, cx); } fn continue_syncing_blocks(&mut self, cx: &mut SyncNetworkContext) { - match &mut self.status { - SyncingStatus::AwaitingDownload(block_root) => { - // TODO(tree-sync): pick the right ID - let requester = RangeRequestId::BackfillSync(cx.next_id()); - let failed_peers = HashSet::new(); + let outcome = self.status.continue_request(&self.peers, cx); + self.handle_outcome(outcome, cx); + } - match cx.block_components_by_range_request( - *block_root, - requester, - self.peers.clone(), - &failed_peers, - ) { - Ok(req_id) => { - self.status = SyncingStatus::Downloading(*block_root, req_id); - } - Err(e) => { - // TODO(tree-sync): Match error explicitly - // Log failed chain, mark blocks as not syncing - todo!("error sending {e:?}"); - } - }; - } - SyncingStatus::Downloading(..) => {} // wait for event - SyncingStatus::AwaitingProcessing(block, peers) => { - let id = cx.next_id(); - let Some(beacon_processor) = cx.beacon_processor_if_enabled() else { - todo!("processor disabled"); - }; - // TODO(tree-sync): pick the right ID - if let Err(e) = beacon_processor.send_chain_segment( - ChainSegmentProcessId::BackSyncBatchId(id), - vec![block.clone()], - ) { - todo!("error sending {e:?}"); + fn handle_outcome( + &mut self, + result: Result, + cx: &mut SyncNetworkContext, + ) { + match result { + Ok(SyncBlockResult::Done { parent_root, slot }) => { + if is_done(slot) { + todo!("done"); + } else { + self.status = + SyncBlock::new(RangeRequestId::BackfillSync(cx.next_id()), parent_root) } - self.status = SyncingStatus::Processing(block.clone(), peers.clone()); } - SyncingStatus::Processing(..) 
=> {} // wait for event + Ok(SyncBlockResult::Wait) => { + // Do nothing wait for future event + } + Err(e) => match e { + TempError::InternalError(_) => {} + TempError::BlockConflictsWithFinality(_) => {} + }, } + self.continue_syncing_blocks(cx); } /// Updates the global network state indicating the current state of a backfill sync. diff --git a/beacon_node/network/src/sync/block_tree.rs b/beacon_node/network/src/sync/block_tree.rs index f8dc4ac2e40..a83325bab5a 100644 --- a/beacon_node/network/src/sync/block_tree.rs +++ b/beacon_node/network/src/sync/block_tree.rs @@ -10,30 +10,187 @@ use lighthouse_network::service::api_types::{ }; use lighthouse_network::PeerId; use parking_lot::RwLock; -use std::collections::{HashMap, HashSet}; +use std::collections::{HashMap, HashSet, VecDeque}; use std::sync::Arc; use tracing::{debug, warn}; use types::{BeaconBlockHeader, EthSpec, Hash256, SignedBeaconBlock, Slot}; +const MAX_LOOKUP_COUNT: usize = 1_000_000; +const PRUNE_COUNT: usize = 100_000; + pub struct BlockTree { - blocks: HashMap>, + blocks: HashMap>, chain: Arc>, } -struct Block { +struct ForwardSyncBlock { id: HeaderLookupId, - peers: HashSet, - status: Status, + status: Status, } -enum Status { - DownloadingHeader(ColumnRequest), - Header(BeaconBlockHeader), - Syncing { - block_root: Hash256, - parent_root: Hash256, - request: SyncingStatus, +enum Status { + BackfillHeader { + peers: HashSet, + request: ColumnRequest, }, + ForwardSyncBlock { + header: BeaconBlockHeader, + request: SyncBlock, + }, +} + +// TODO(tree-sync): have the peer set inside here when syncing add dedup logic +// TODO(tree-sync): for backfill sync use the sync state to check the peers have this block or not +pub struct SyncBlock { + id: RangeRequestId, + block_root: Hash256, + failed_peers: HashSet, + peers: Arc>>, + request: SyncingStatus, +} + +pub enum SyncBlockResult { + Done { parent_root: Hash256, slot: Slot }, + Wait, +} + +impl SyncBlock { + pub fn new(id: RangeRequestId, block_root: 
Hash256, initial_peers: &[PeerId]) -> Self { + Self { + id, + block_root, + failed_peers: <_>::default(), + peers: Arc::new(RwLock::new(HashSet::from_iter(initial_peers))), + request: SyncingStatus::AwaitingDownload, + } + } + + pub fn on_download_result( + &mut self, + result: Result<(RpcBlock, BatchPeers), RpcResponseError>, + cx: &mut SyncNetworkContext, + ) -> Result { + match &mut self.request { + SyncingStatus::Downloading(_) => match result { + // TODO(tree-sync): check that the request ID matches + Ok((block, peers)) => { + debug!(id = %self.id, "Sync block downloaded"); + self.request = SyncingStatus::AwaitingProcessing(block, peers); + self.continue_request(cx) + } + Err(e) => { + // TODO(tree-sync): increase error counter + debug!(id = %self.id, error = ?e, "Sync block download error"); + self.request = SyncingStatus::AwaitingDownload; + self.continue_request(cx) + } + }, + _ => Err(Error::InternalError( + "Lookup not in expected state Downloading".to_owned(), + )), + } + } + + pub fn on_process_result( + &mut self, + result: BatchProcessResult, + cx: &mut SyncNetworkContext, + ) -> Result { + match &mut self.request { + SyncingStatus::Processing(peers) => match result { + BatchProcessResult::Success => { + debug!(id = %self.id, "Sync block process success"); + Ok(SyncBlockResult::Done) + } + BatchProcessResult::Failure { peer_action, error } => { + debug!(id = %self.id, "Sync block process error"); + + if let Some(peer_action) = peer_action { + for (peer, penalty) in peers.blame(peer_action) { + cx.report_peer(peer, penalty, "faulty_batch"); + } + } + + self.request = SyncingStatus::AwaitingDownload; + self.continue_request(cx) + } + }, + _ => Err(Error::InternalError( + "Lookup not in expected state Processing".to_owned(), + )), + } + } + + pub fn continue_request( + &mut self, + cx: &mut SyncNetworkContext, + ) -> Result { + match &mut self.request { + SyncingStatus::AwaitingDownload => { + match cx.block_components_by_range_request( + 
self.block_root, + self.id, + &self.peers, + &self.failed_peers, + ) { + Ok(req_id) => { + self.request = SyncingStatus::Downloading(req_id); + Ok(SyncBlockResult::Wait) + } + Err(e) => match e { + RpcRequestSendError::NoPeers | RpcRequestSendError::InternalError(_) => { + Err(Error::InternalError(format!( + "Error sending block components request: {e:?}" + ))) + } + }, + } + } + SyncingStatus::Downloading(_) => Ok(SyncBlockResult::Wait), + SyncingStatus::AwaitingProcessing(block, peers) => { + // No need to check if block is already imported here, we'll get an error + // from the beacon processor anyway. No need to add more code to handle this + // edge case faster. + + let expect_parent_to_be_imported = false; + if expect_parent_to_be_imported + && !cx + .chain + .block_is_known_to_fork_choice(&block.as_block().parent_root()) + { + return Ok(SyncBlockResult::Wait); + } + + if let Some(beacon_processor) = cx.beacon_processor_if_enabled() { + let id = match self.id { + RangeRequestId::ForwardSync(id) => ChainSegmentProcessId::ForwardSync(id), + RangeRequestId::BackfillSync(id) => ChainSegmentProcessId::BackfillSync(id), + }; + + if let Err(e) = beacon_processor.send_chain_segment(id, vec![block.clone()]) { + Err(Error::InternalError(format!( + "Error sending block to processor: {e:?}" + ))) + } else { + self.request = SyncingStatus::Processing(peers.clone()); + Ok(SyncBlockResult::Wait) + } + } else { + // TODO(tree-sync): This error will cause the full chain of headers to + // be dropped if the beacon processor goes offline. When can that + // happen? 
+ Err(Error::InternalError( + "Beacon processor is disabled".to_owned(), + )) + } + } + SyncingStatus::Processing(_) => Ok(SyncBlockResult::Wait), + } + } + + pub fn is_processing(&self) -> bool { + matches!(self.request, SyncingStatus::Processing(_)) + } } enum SyncingStatus { @@ -46,12 +203,36 @@ enum SyncingStatus { // TODO(tree-sync): Re-add the reprocessing cache, so we don't process twice a block that we got // through gossip and sync. -impl Block { +impl ForwardSyncBlock { fn new(block_root: Hash256, id: Id, peers: &[PeerId]) -> Self { Self { id: HeaderLookupId(block_root, id), - peers: HashSet::from_iter(peers.iter().copied()), - status: Status::DownloadingHeader(ColumnRequest::new()), + status: Status::BackfillHeader { + peers: HashSet::from_iter(peers.iter().copied()), + request: ColumnRequest::new(), + }, + } + } + + fn add_peer(&mut self, peer: PeerId) { + match &mut self.status { + Status::BackfillHeader { peers, .. } => { + peers.insert(peer); + } + Status::ForwardSyncBlock { request, .. } => { + request.peers.write().insert(peer); + } + } + } + + fn remove_peer(&mut self, peer: &PeerId) { + match &mut self.status { + Status::BackfillHeader { peers, .. } => { + peers.remove(peer); + } + Status::ForwardSyncBlock { request, .. } => { + request.peers.write().remove(peer); + } } } @@ -61,17 +242,17 @@ impl Block { fn is_syncing(&self) -> bool { match self.status { - Status::DownloadingHeader(..) => false, - Status::Header(..) => false, - Status::Syncing { .. } => true, + Status::BackfillHeader { .. } => false, + Status::ForwardSyncBlock { .. } => true, } } fn parent_root(&self) -> Option { match &self.status { - Status::DownloadingHeader(..) => None, - Status::Header(header) => Some(header.parent_root), - Status::Syncing { parent_root, .. } => Some(*parent_root), + Status::BackfillHeader { request, .. } => { + request.is_complete().map(|header| header.parent_root) + } + Status::ForwardSyncBlock { header, .. 
} => Some(header.parent_root), } } @@ -79,16 +260,16 @@ impl Block { &mut self, ) -> Result<&mut ColumnRequest, Error> { match &mut self.status { - Status::DownloadingHeader(request) => Ok(request), + Status::BackfillHeader { request, .. } => Ok(request), _ => Err(Error::InternalError( "Expected lookup to be in DownloadingHeader state".to_owned(), )), } } - fn block_request(&mut self) -> Result<&mut SyncingStatus, Error> { + fn block_request(&mut self) -> Result<&mut SyncBlock, Error> { match &mut self.status { - Status::Syncing { request, .. } => Ok(request), + Status::ForwardSyncBlock { request, .. } => Ok(request), _ => Err(Error::InternalError( "Expected lookup to be in Syncing state".to_owned(), )), @@ -105,10 +286,30 @@ impl Block { ))) } } + + fn send_block_header_request( + lookup: &mut ForwardSyncBlock, + block_root: Hash256, + cx: &mut SyncNetworkContext, + ) -> Result<(), Error> { + // TODO(tree-sync): have good peer selection + let Some(peer) = lookup.peers.iter().next() else { + return Err(Error::InternalError("No peers".to_owned())); + }; + + let req_id = cx.send_blocks_by_root_request( + *peer, + block_root, + BlocksByRootRequester::Header(lookup.id), + )?; + + lookup.header_request()?.on_download_start(req_id)?; + Ok(()) + } } #[derive(Debug)] -enum Error { +pub enum Error { InternalError(String), BlockConflictsWithFinality(String), } @@ -148,9 +349,10 @@ impl BlockTree { self.blocks .values() .filter_map(|block| match &block.status { - Status::DownloadingHeader(..) => None, - Status::Header(header) => Some(header.slot), - Status::Syncing { .. } => None, + Status::BackfillHeader { request, .. } => { + request.is_complete().map(|header| header.slot) + } + Status::ForwardSyncBlock { .. 
} => None, }) .max() } @@ -160,10 +362,11 @@ impl BlockTree { self.blocks .values() .filter(|block| { - matches!( - block.status, - Status::Syncing(_, SyncingStatus::Processing(_)), - ) + block + .header_request() + .ok() + .map(|request| request.is_processing()) + .unwrap_or(false) }) .map(|block| block.id) .collect() @@ -175,7 +378,7 @@ impl BlockTree { pub fn remove_peer(&mut self, peer: PeerId) { for block in self.blocks.values_mut() { - block.peers.remove(&peer); + block.remove_peer(&peer); } } @@ -191,7 +394,7 @@ impl BlockTree { while let Some(lookup) = self.blocks.get_mut(&target_block_root) { for peer in peers { // TODO(tree-sync): If peer already in set no need to add to its ancestors - lookup.peers.insert(*peer); + lookup.add_peer(*peer); // TODO(tree-sync): This log can be very noisy maybe log once per peer debug!(block_root = ?target_block_root, ?peer, "Adding peer to existing header lookup"); } @@ -202,9 +405,13 @@ impl BlockTree { } } } else { + if self.blocks.len() > MAX_LOOKUP_COUNT { + self.prune_least_popular_lookups(); + } + debug!(?block_root, ?peers, "Creating new header lookup"); - let mut lookup = Block::new(block_root, cx.next_id(), peers); + let mut lookup = ForwardSyncBlock::new(block_root, cx.next_id(), peers); match Self::send_block_header_request(&mut lookup, block_root, cx) { Ok(_) => { self.blocks.insert(block_root, lookup); @@ -216,15 +423,15 @@ impl BlockTree { } } - pub fn on_block_header( + pub fn on_header_download_result( &mut self, req_id: BlocksByRootRequestId, - lookup_id: HeaderLookupId, + id: HeaderLookupId, response: RpcResponseResult>>>, peer_id: PeerId, cx: &mut SyncNetworkContext, ) { - let block_root = lookup_id.0; + let block_root = id.0; let result = (|| { let Some(lookup) = self.blocks.get_mut(&block_root) else { @@ -232,7 +439,7 @@ impl BlockTree { debug!(id = ?req_id, "Received header request for unknown lookup"); return Ok(()); }; - lookup.assert_expected_lookup_id(lookup_id)?; + 
lookup.assert_expected_lookup_id(id)?; let response = response.and_then(|(blocks, timestamp)| { let block = blocks @@ -299,11 +506,65 @@ impl BlockTree { } Ok(()) })(); + self.handle_result(id.0, result); + } - if let Err(e) = result { - debug!(error = ?e, "Dropping forward sync block header lookup"); - self.drop_lookup_and_children(block_root); - } + pub fn on_block_download_result( + &mut self, + id: HeaderLookupId, + result: Result<(RpcBlock, BatchPeers), RpcResponseError>, + cx: &mut SyncNetworkContext, + ) { + let result = (|| { + // TODO(tree-sync): attach an ID to the block entry to make sure we are querying the right + // one, while still indexing by block_root only + let Some(lookup) = self.blocks.get_mut(&id.0) else { + // TODO(tree-sync): register metric + debug!(?id, "Received block request for unknown lookup"); + return Ok(()); + }; + lookup.assert_expected_lookup_id(id)?; + + let request = lookup.block_request()?; + request.on_download_result(result, cx)?; + Ok(()) + })(); + self.handle_result(id.0, result); + + // Continue batches + self.continue_syncing_blocks(cx); + } + + pub fn on_block_process_result( + &mut self, + id: HeaderLookupId, + result: BatchProcessResult, + cx: &mut SyncNetworkContext, + ) { + let result = (|| { + let Some(lookup) = self.blocks.get_mut(&id.0) else { + debug!(?id, "Received block process result for unknown lookup"); + return Ok(()); + }; + lookup.assert_expected_lookup_id(id)?; + + let request = lookup.block_request()?; + match request.on_process_result(result, cx) { + Ok(SyncBlockResult::Done { .. 
}) => { + self.blocks.remove(&id.0); + self.trigger_forward_sync(cx); + } + Ok(SyncBlockResult::Continue) => { + // continue same block + } + _ => {} + } + todo!(); + })(); + self.handle_result(id.0, result); + + // Continue batches + self.continue_syncing_blocks(cx); } pub fn prune(&mut self) { @@ -314,6 +575,27 @@ impl BlockTree { todo!(); } + fn handle_result(&mut self, block_root: Hash256, result: Result<(), Error>) { + match result { + Ok(_) => {} + Err(e) => { + debug!(error = ?e, "Dropping forward sync block header lookup"); + match e { + Error::InternalError(_e) => { + let block_to_children = self.compute_children(); + self.drop_lookup_and_children(block_root, &block_to_children); + } + Error::BlockConflictsWithFinality(_e) => { + let block_to_children = self.compute_children(); + self.drop_lookup_and_children(block_root, &block_to_children); + // TODO(tree-sync): penalize peers of this lookups + // TODO(tree-sync): add blocks to a failed cache to prevent re-sync + } + } + } + } + } + /// Marks blocks ready for download as syncing /// Should be called anytime: /// - A new block is imported to fork-choice @@ -340,7 +622,7 @@ impl BlockTree { for _ in blocks_syncing..2 { // Find the block range with most peers and highest slot. This is the block // to be used as tip of the chain of blocks to fetch. 
- let Some((block_root, parent_root)) = self + let Some((block_root, header)) = self .blocks .iter() .filter_map(|(root, block)| { @@ -360,19 +642,16 @@ impl BlockTree { }; if is_candidate { - // Find highest peer count, then min slot - Some(( - block.peer_count(), - Slot::new(u64::MAX) - header.slot, - root, - &header.parent_root, - )) + Some((block.peer_count(), root, header)) } else { None } }) - .max() - .map(|(_, _, root, parent_root)| (*root, *parent_root)) + .max_by_key(|(peer_count, _, header)| { + // Find highest peer count, then min slot + (*peer_count, Slot::new(u64::MAX) - header.slot) + }) + .map(|(_, root, header)| (*root, header.clone())) else { break; }; @@ -384,11 +663,10 @@ impl BlockTree { .expect("block_root is a key of self.blocks"); // The code above ensures that `block_to_sync` is in `Status::Header` status - block_to_sync.status = Status::Syncing { - block_root, - parent_root, - request: SyncingStatus::AwaitingDownload, - }; + block_to_sync.status = Status::Syncing( + header, + SyncBlock::new(RangeRequestId::ForwardSync(block_to_sync.id), block_root), + ); debug!(id = %block_to_sync.id, "Starting forwards sync of block"); @@ -407,61 +685,9 @@ impl BlockTree { let result = match &mut lookup.status { Status::DownloadingHeader(..) => continue, Status::Header(_) => continue, - Status::Syncing { request, .. 
} => match request { - SyncingStatus::AwaitingDownload => { - let requester = RangeRequestId::RangeSync(lookup.id); - // TODO(tree-sync) use RwLock or manually add to active request - let peers = Arc::new(RwLock::new(HashSet::from_iter( - lookup.peers.iter().copied(), - ))); - let failed_peers = HashSet::new(); - - match cx.block_components_by_range_request( - *block_root, - requester, - peers, - &failed_peers, - ) { - Ok(req_id) => { - *request = SyncingStatus::Downloading(req_id); - Ok(()) - } - Err(e) => match e { - RpcRequestSendError::NoPeers - | RpcRequestSendError::InternalError(_) => { - Err(format!("Error sending block components request: {e:?}")) - } - }, - } - } - SyncingStatus::Downloading(_) => Ok(()), // wait for event - SyncingStatus::AwaitingProcessing(block, peers) => { - if cx - .chain - .block_is_known_to_fork_choice(&block.as_block().parent_root()) - { - if let Some(beacon_processor) = cx.beacon_processor_if_enabled() { - if let Err(e) = beacon_processor.send_chain_segment( - ChainSegmentProcessId::RangeBatchId(lookup.id), - vec![block.clone()], - ) { - Err(format!("Error sending block to processor: {e:?}")) - } else { - *request = SyncingStatus::Processing(peers.clone()); - Ok(()) - } - } else { - // TODO(tree-sync): This error will cause the full chain of headers to - // be dropped if the beacon processor goes offline. When can that - // happen? 
- Err("Beacon processor is disabled".to_owned()) - } - } else { - Ok(()) - } - } - SyncingStatus::Processing(_) => Ok(()), // wait for event - }, + Status::Syncing(_, syncing_block) => { + syncing_block.continue_request(&lookup.peers, cx) + } }; if let Err(_e) = result { @@ -470,104 +696,72 @@ impl BlockTree { } } + let block_to_children = self.compute_children(); for block_root in lookups_to_drop { - self.drop_lookup_and_children(block_root); + self.drop_lookup_and_children(block_root, &block_to_children); } } - pub fn on_block_response( + /// Drop lookup `block_root` if it exists and all its children + fn drop_lookup_and_children( &mut self, - id: HeaderLookupId, - result: Result<(RpcBlock, BatchPeers), RpcResponseError>, - cx: &mut SyncNetworkContext, + block_root: Hash256, + block_to_children: &HashMap>, ) { - let result = (|| { - // TODO(tree-sync): attach an ID to the block entry to make sure we are querying the right - // one, while still indexing by block_root only - let Some(lookup) = self.blocks.get_mut(&id.0) else { - // TODO(tree-sync): register metric - debug!(?id, "Received block request for unknown lookup"); - return Ok(()); - }; - lookup.assert_expected_lookup_id(id)?; - - let request = lookup.block_request()?; - match request { - SyncingStatus::Downloading(_) => match result { - Ok((block, peers)) => { - debug!(%id, "Sync block downloaded"); - *request = SyncingStatus::AwaitingProcessing(block, peers); - Ok(()) - } - Err(e) => { - // TODO(tree-sync): increase error counter - debug!(%id, error = ?e, "Sync block download error"); - *request = SyncingStatus::AwaitingDownload; - Ok(()) - } - }, - _ => Err(Error::InternalError( - "Lookup not in expected state Downloading".to_owned(), - )), + // Change to `Vec::new()` if you want depth-first order. + let mut queue: VecDeque = VecDeque::from([block_root]); + + while let Some(node) = queue.pop_front() { + // Remove the node itself. 
+ if self.blocks.remove(&node).is_some() { + // Only remove children if the node still existed + // Push its children—if any—onto the work list. + if let Some(children) = block_to_children.get(&node) { + queue.extend(children.iter().cloned()); + } } - })(); - - // Continue batches - self.continue_syncing_blocks(cx); + } } - pub fn handle_block_process_result( - &mut self, - id: HeaderLookupId, - result: BatchProcessResult, - cx: &mut SyncNetworkContext, - ) { - let result = (|| { - let Some(lookup) = self.blocks.get_mut(&id.0) else { - debug!(?id, "Received block process result for unknown lookup"); - return Ok(()); - }; - lookup.assert_expected_lookup_id(id)?; - - let request = lookup.block_request()?; - match request { - SyncingStatus::Processing(peers) => match result { - BatchProcessResult::Success => { - debug!(%id, "Sync block process success"); - self.blocks.remove(&id.0); - self.trigger_forward_sync(cx); - Ok(()) - } - BatchProcessResult::Failure { peer_action, error } => { - debug!(%id, "Sync block process error"); - - if let Some(peer_action) = peer_action { - for (peer, penalty) in peers.blame(peer_action) { - cx.report_peer(peer, penalty, "faulty_batch"); - } - } - - *request = SyncingStatus::AwaitingDownload; - - Ok(()) - } - }, - _ => Err(Error::InternalError( - "Lookup not in expected state Processing".to_owned(), - )), + /// Drop lookup `block_root` if it exists and all its children + fn compute_children(&mut self) -> HashMap> { + let mut block_to_children = HashMap::>::new(); + for (block_root, block) in self.blocks.iter() { + if let Some(parent_root) = block.parent_root() { + block_to_children + .entry(parent_root) + .or_default() + .push(*block_root); } - })(); - - // Continue batches - self.continue_syncing_blocks(cx); + } + block_to_children } - fn drop_lookup_and_children(&mut self, _block_root: Hash256) { - todo!(); + /// Drop lookups with least amount of peers and slot until we pruned PRUNE_COUNT lookups + fn prune_least_popular_lookups(&mut 
self) { + let mut blocks = self + .blocks + .iter() + .filter_map(|(block_root, block)| match &block.status { + // Prune only lookups that are not syncing and we know the header + Status::DownloadingHeader(..) => None, + Status::Header(header) => Some((block.peer_count(), header.slot, *block_root)), + Status::Syncing { .. } => None, + }) + .collect::>(); + blocks.sort_unstable(); + + let block_to_children = self.compute_children(); + for (_, _, block_root) in blocks { + self.drop_lookup_and_children(block_root, &block_to_children); + if self.blocks.len() < MAX_LOOKUP_COUNT - PRUNE_COUNT { + break; + } + } } fn send_block_header_request( - lookup: &mut Block, + lookup: &mut ForwardSyncBlock, block_root: Hash256, cx: &mut SyncNetworkContext, ) -> Result<(), Error> { diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index b547a556e36..c063a1ad31d 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -350,6 +350,8 @@ impl SyncManager { }; // Search for any block that is unknown and more recent than finality + // TODO(tree-sync): we could prioritize the finalized_root if it's unknown as a way to + // detect finalized sync debug!(?remote, ?local, "new peer"); if !self.chain.block_is_known_to_fork_choice(&remote.head_root) && remote.head_slot @@ -706,15 +708,15 @@ impl SyncManager { imported, } => self.block_tree.prune_root(block_root, imported), SyncMessage::BatchProcessed { sync_type, result } => match sync_type { - ChainSegmentProcessId::RangeBatchId(id) => { + ChainSegmentProcessId::ForwardSync(id) => { self.block_tree - .handle_block_process_result(id, result, &mut self.network); + .on_block_process_result(id, result, &mut self.network); self.update_sync_state(); } - ChainSegmentProcessId::BackSyncBatchId(id) => { + ChainSegmentProcessId::BackfillSync(id) => { // TODO(tree-sync): should update sync state self.backfill_sync - .handle_block_process_result(id, result, &mut 
self.network) + .on_block_process_result(id, result, &mut self.network) } }, SyncMessage::SampleVerified { id, result } => { @@ -863,7 +865,7 @@ impl SyncManager { { match req_id.parent_request_id { BlocksByRootRequester::Header(lookup_id) => { - self.block_tree.on_block_header( + self.block_tree.on_header_download_result( req_id, lookup_id, result, @@ -871,7 +873,7 @@ impl SyncManager { &mut self.network, ); } - BlocksByRootRequester::RangeSync(batch_id) => { + BlocksByRootRequester::ForwardSync(batch_id) => { self.on_block_components_by_root_response( batch_id, RangeBlockComponent::Block(req_id, result, peer_id), @@ -1017,13 +1019,13 @@ impl SyncManager { .on_block_components_by_root_response(range_request_id, range_block_component) { match range_request_id.requester { - RangeRequestId::RangeSync(id) => { + RangeRequestId::ForwardSync(id) => { self.block_tree - .on_block_response(id, result, &mut self.network); + .on_block_download_result(id, result, &mut self.network); } RangeRequestId::BackfillSync(id) => { self.backfill_sync - .on_block_response(id, result, &mut self.network) + .on_block_download_result(id, result, &mut self.network) } } } diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index e0f61c6f839..a2a2387e2b2 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -122,7 +122,7 @@ impl BlockComponentsByRootRequest { let blocks_req_id = cx.send_blocks_by_root_request( block_peer, block_root, - BlocksByRootRequester::RangeSync(id), + BlocksByRootRequester::ForwardSync(id), )?; let state = State::BlocksRequest { diff --git a/beacon_node/network/src/sync/network_context/custody_by_root.rs b/beacon_node/network/src/sync/network_context/custody_by_root.rs index 15a62072bd0..682ee9f0358 100644 --- 
a/beacon_node/network/src/sync/network_context/custody_by_root.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_root.rs @@ -468,6 +468,10 @@ impl ColumnRequest { } } + pub fn is_complete(&self) -> Option { + todo!() + } + pub fn complete(self) -> Result<(PeerId, T, Duration), Error> { match self.status { Status::Downloaded(peer_id, data_column, seen_timestamp) => { From 43ac04d6754d1639aa2af5f7b78d539f41d6dcf7 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sat, 21 Jun 2025 19:46:01 +0200 Subject: [PATCH 33/66] more todos --- .../network/src/sync/backfill_sync/mod.rs | 35 +- beacon_node/network/src/sync/block_tree.rs | 327 ++++++------------ beacon_node/network/src/sync/mod.rs | 1 + beacon_node/network/src/sync/sync_block.rs | 199 +++++++++++ 4 files changed, 323 insertions(+), 239 deletions(-) create mode 100644 beacon_node/network/src/sync/sync_block.rs diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index cd4388411f5..0b74ac9eaff 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -8,21 +8,19 @@ //! If a batch fails, the backfill sync cannot progress. In this scenario, we mark the backfill //! sync as failed, log an error and attempt to retry once a new peer joins the node. 
-use crate::sync::block_tree::{Error as TempError, SyncBlock, SyncBlockResult}; use crate::sync::manager::BatchProcessResult; use crate::sync::network_context::{ BatchPeers, RangeRequestId, RpcResponseError, SyncNetworkContext, }; +use crate::sync::sync_block::{Error as SyncBlockError, SyncBlock, SyncBlockResult}; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; use lighthouse_network::service::api_types::Id; use lighthouse_network::types::{BackFillState, NetworkGlobals}; use lighthouse_network::PeerId; -use parking_lot::RwLock; -use std::collections::HashSet; use std::sync::Arc; use tracing::{debug, info, instrument, warn}; -use types::{EthSpec, Hash256}; +use types::{EthSpec, Hash256, Slot}; /// The number of times to retry a batch before it is considered failed. const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 10; @@ -152,7 +150,7 @@ impl BackFillSync { match self.state() { BackFillState::Syncing => {} // already syncing ignore. BackFillState::Paused => { - if !self.peers.read().is_empty() { + if self.status.peer_count() == 0 { // If there are peers to resume with, begin the resume. 
debug!("Resuming backfill sync"); self.set_state(BackFillState::Syncing); @@ -199,13 +197,13 @@ impl BackFillSync { } pub fn add_peer(&mut self, peer_id: PeerId) { - self.status.peers.write().insert(peer_id); + self.status.add_peer(peer_id); } pub fn peer_disconnected(&mut self, peer_id: &PeerId) { - self.peers.write().remove(peer_id); + self.status.remove_peer(peer_id); - if self.peers.read().is_empty() { + if self.status.peer_count() == 0 { info!( "reason" = "insufficient_synced_peers", "Backfill sync paused" @@ -235,30 +233,33 @@ impl BackFillSync { } fn continue_syncing_blocks(&mut self, cx: &mut SyncNetworkContext) { - let outcome = self.status.continue_request(&self.peers, cx); + let outcome = self.status.continue_request(cx); self.handle_outcome(outcome, cx); } fn handle_outcome( &mut self, - result: Result, + result: Result, cx: &mut SyncNetworkContext, ) { match result { Ok(SyncBlockResult::Done { parent_root, slot }) => { - if is_done(slot) { + if self.is_done(slot) { todo!("done"); } else { - self.status = - SyncBlock::new(RangeRequestId::BackfillSync(cx.next_id()), parent_root) + let peers = self.status.clone_peers(); + self.status = SyncBlock::new( + RangeRequestId::BackfillSync(cx.next_id()), + parent_root, + &peers.into_iter().collect::>(), + ) } } Ok(SyncBlockResult::Wait) => { // Do nothing wait for future event } Err(e) => match e { - TempError::InternalError(_) => {} - TempError::BlockConflictsWithFinality(_) => {} + SyncBlockError::InternalError(_) => {} }, } self.continue_syncing_blocks(cx); @@ -277,6 +278,10 @@ impl BackFillSync { fn state(&self) -> BackFillState { self.network_globals.backfill_state.read().clone() } + + fn is_done(&self, slot: Slot) -> bool { + todo!(); + } } /// Error kind for attempting to restart the sync from beacon chain parameters. 
diff --git a/beacon_node/network/src/sync/block_tree.rs b/beacon_node/network/src/sync/block_tree.rs index a83325bab5a..743450f170c 100644 --- a/beacon_node/network/src/sync/block_tree.rs +++ b/beacon_node/network/src/sync/block_tree.rs @@ -1,7 +1,7 @@ use super::network_context::{RpcRequestSendError, RpcResponseError, SyncNetworkContext}; -use crate::network_beacon_processor::ChainSegmentProcessId; use crate::sync::network_context::custody_by_root::{ColumnRequest, Error as ColumnRequestError}; use crate::sync::network_context::{BatchPeers, RpcResponseResult}; +use crate::sync::sync_block::{Error as SyncBlockError, SyncBlock, SyncBlockResult}; use crate::sync::BatchProcessResult; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; @@ -9,10 +9,9 @@ use lighthouse_network::service::api_types::{ BlocksByRootRequestId, BlocksByRootRequester, HeaderLookupId, Id, RangeRequestId, }; use lighthouse_network::PeerId; -use parking_lot::RwLock; use std::collections::{HashMap, HashSet, VecDeque}; use std::sync::Arc; -use tracing::{debug, warn}; +use tracing::{debug, error, warn}; use types::{BeaconBlockHeader, EthSpec, Hash256, SignedBeaconBlock, Slot}; const MAX_LOOKUP_COUNT: usize = 1_000_000; @@ -29,6 +28,7 @@ struct ForwardSyncBlock { } enum Status { + // TODO(tree-sync): Make the "waiting" completed header requests as memory cheap as possible BackfillHeader { peers: HashSet, request: ColumnRequest, @@ -39,167 +39,6 @@ enum Status { }, } -// TODO(tree-sync): have the peer set inside here when syncing add dedup logic -// TODO(tree-sync): for backfill sync use the sync state to check the peers have this block or not -pub struct SyncBlock { - id: RangeRequestId, - block_root: Hash256, - failed_peers: HashSet, - peers: Arc>>, - request: SyncingStatus, -} - -pub enum SyncBlockResult { - Done { parent_root: Hash256, slot: Slot }, - Wait, -} - -impl SyncBlock { - pub fn new(id: RangeRequestId, block_root: Hash256, initial_peers: 
&[PeerId]) -> Self { - Self { - id, - block_root, - failed_peers: <_>::default(), - peers: Arc::new(RwLock::new(HashSet::from_iter(initial_peers))), - request: SyncingStatus::AwaitingDownload, - } - } - - pub fn on_download_result( - &mut self, - result: Result<(RpcBlock, BatchPeers), RpcResponseError>, - cx: &mut SyncNetworkContext, - ) -> Result { - match &mut self.request { - SyncingStatus::Downloading(_) => match result { - // TODO(tree-sync): check that the request ID matches - Ok((block, peers)) => { - debug!(id = %self.id, "Sync block downloaded"); - self.request = SyncingStatus::AwaitingProcessing(block, peers); - self.continue_request(cx) - } - Err(e) => { - // TODO(tree-sync): increase error counter - debug!(id = %self.id, error = ?e, "Sync block download error"); - self.request = SyncingStatus::AwaitingDownload; - self.continue_request(cx) - } - }, - _ => Err(Error::InternalError( - "Lookup not in expected state Downloading".to_owned(), - )), - } - } - - pub fn on_process_result( - &mut self, - result: BatchProcessResult, - cx: &mut SyncNetworkContext, - ) -> Result { - match &mut self.request { - SyncingStatus::Processing(peers) => match result { - BatchProcessResult::Success => { - debug!(id = %self.id, "Sync block process success"); - Ok(SyncBlockResult::Done) - } - BatchProcessResult::Failure { peer_action, error } => { - debug!(id = %self.id, "Sync block process error"); - - if let Some(peer_action) = peer_action { - for (peer, penalty) in peers.blame(peer_action) { - cx.report_peer(peer, penalty, "faulty_batch"); - } - } - - self.request = SyncingStatus::AwaitingDownload; - self.continue_request(cx) - } - }, - _ => Err(Error::InternalError( - "Lookup not in expected state Processing".to_owned(), - )), - } - } - - pub fn continue_request( - &mut self, - cx: &mut SyncNetworkContext, - ) -> Result { - match &mut self.request { - SyncingStatus::AwaitingDownload => { - match cx.block_components_by_range_request( - self.block_root, - self.id, - 
&self.peers, - &self.failed_peers, - ) { - Ok(req_id) => { - self.request = SyncingStatus::Downloading(req_id); - Ok(SyncBlockResult::Wait) - } - Err(e) => match e { - RpcRequestSendError::NoPeers | RpcRequestSendError::InternalError(_) => { - Err(Error::InternalError(format!( - "Error sending block components request: {e:?}" - ))) - } - }, - } - } - SyncingStatus::Downloading(_) => Ok(SyncBlockResult::Wait), - SyncingStatus::AwaitingProcessing(block, peers) => { - // No need to check if block is already imported here, we'll get an error - // from the beacon processor anyway. No need to add more code to handle this - // edge case faster. - - let expect_parent_to_be_imported = false; - if expect_parent_to_be_imported - && !cx - .chain - .block_is_known_to_fork_choice(&block.as_block().parent_root()) - { - return Ok(SyncBlockResult::Wait); - } - - if let Some(beacon_processor) = cx.beacon_processor_if_enabled() { - let id = match self.id { - RangeRequestId::ForwardSync(id) => ChainSegmentProcessId::ForwardSync(id), - RangeRequestId::BackfillSync(id) => ChainSegmentProcessId::BackfillSync(id), - }; - - if let Err(e) = beacon_processor.send_chain_segment(id, vec![block.clone()]) { - Err(Error::InternalError(format!( - "Error sending block to processor: {e:?}" - ))) - } else { - self.request = SyncingStatus::Processing(peers.clone()); - Ok(SyncBlockResult::Wait) - } - } else { - // TODO(tree-sync): This error will cause the full chain of headers to - // be dropped if the beacon processor goes offline. When can that - // happen? 
- Err(Error::InternalError( - "Beacon processor is disabled".to_owned(), - )) - } - } - SyncingStatus::Processing(_) => Ok(SyncBlockResult::Wait), - } - } - - pub fn is_processing(&self) -> bool { - matches!(self.request, SyncingStatus::Processing(_)) - } -} - -enum SyncingStatus { - AwaitingDownload, - Downloading(Id), - AwaitingProcessing(RpcBlock, BatchPeers), - Processing(BatchPeers), -} - // TODO(tree-sync): Re-add the reprocessing cache, so we don't process twice a block that we got // through gossip and sync. @@ -220,7 +59,7 @@ impl ForwardSyncBlock { peers.insert(peer); } Status::ForwardSyncBlock { request, .. } => { - request.peers.write().insert(peer); + request.add_peer(peer); } } } @@ -231,13 +70,25 @@ impl ForwardSyncBlock { peers.remove(peer); } Status::ForwardSyncBlock { request, .. } => { - request.peers.write().remove(peer); + request.remove_peer(peer); } } } fn peer_count(&self) -> usize { - self.peers.len() + match &self.status { + Status::BackfillHeader { peers, .. } => peers.len(), + Status::ForwardSyncBlock { request, .. } => request.peer_count(), + } + } + + fn get_peers(&self) -> Vec { + match &self.status { + Status::BackfillHeader { peers, .. } => peers.iter().copied().collect(), + Status::ForwardSyncBlock { request, .. 
} => { + request.clone_peers().iter().copied().collect() + } + } } fn is_syncing(&self) -> bool { @@ -287,23 +138,65 @@ impl ForwardSyncBlock { } } + fn to_foward_sync_block(&mut self, block_root: Hash256) -> Result<(), Error> { + let (peers, request) = match &mut self.status { + Status::BackfillHeader { peers, request } => (peers, request), + _ => { + return Err(Error::InternalError( + "Expected lookup to be in DownloadingHeader state".to_owned(), + )) + } + }; + + let header = match request.is_complete() { + Some(header) => header.clone(), + None => { + return Err(Error::InternalError( + "Expected request to be complete".to_owned(), + )) + } + }; + + // We are replacing the `status` field below, so peers will never be read again + let initial_peers = std::mem::take(peers).into_iter().collect::>(); + + self.status = Status::ForwardSyncBlock { + header, + request: SyncBlock::new( + RangeRequestId::ForwardSync(self.id), + block_root, + &initial_peers, + ), + }; + Ok(()) + } + fn send_block_header_request( - lookup: &mut ForwardSyncBlock, + &mut self, block_root: Hash256, cx: &mut SyncNetworkContext, ) -> Result<(), Error> { + let peers = match &self.status { + Status::BackfillHeader { peers, .. } => peers, + Status::ForwardSyncBlock { request, .. 
} => { + return Err(Error::InternalError( + "Lookup not in forward sync block status".to_owned(), + )) + } + }; + // TODO(tree-sync): have good peer selection - let Some(peer) = lookup.peers.iter().next() else { + let Some(peer) = peers.iter().next() else { return Err(Error::InternalError("No peers".to_owned())); }; let req_id = cx.send_blocks_by_root_request( *peer, block_root, - BlocksByRootRequester::Header(lookup.id), + BlocksByRootRequester::Header(self.id), )?; - lookup.header_request()?.on_download_start(req_id)?; + self.header_request()?.on_download_start(req_id)?; Ok(()) } } @@ -326,6 +219,14 @@ impl From for Error { } } +impl From for Error { + fn from(e: SyncBlockError) -> Self { + match e { + SyncBlockError::InternalError(e) => Self::InternalError(e), + } + } +} + pub(crate) enum SyncState { Synced, Syncing { max_slot: Slot }, @@ -412,7 +313,7 @@ impl BlockTree { debug!(?block_root, ?peers, "Creating new header lookup"); let mut lookup = ForwardSyncBlock::new(block_root, cx.next_id(), peers); - match Self::send_block_header_request(&mut lookup, block_root, cx) { + match lookup.send_block_header_request(block_root, cx) { Ok(_) => { self.blocks.insert(block_root, lookup); } @@ -464,7 +365,6 @@ impl BlockTree { block_header.clone(), received, )?; - lookup.status = Status::Header(block_header.clone()); // Once we discover the parent_root of this block three things can happen // 1. 
The parent root is a known block -> stop @@ -494,14 +394,14 @@ impl BlockTree { self.trigger_forward_sync(cx); } else { let lookup = self.blocks.get_mut(&block_root).expect("lookup exists"); - let peers = lookup.peers.iter().copied().collect::>(); + let peers = lookup.get_peers(); self.search(parent_root, &peers, cx); } } Err(e) => { debug!(%req_id, error = ?e, "Forward sync block header downloaded error"); lookup.header_request()?.on_download_error(req_id)?; - Self::send_block_header_request(lookup, block_root, cx)?; + lookup.send_block_header_request(block_root, cx)?; } } Ok(()) @@ -554,7 +454,7 @@ impl BlockTree { self.blocks.remove(&id.0); self.trigger_forward_sync(cx); } - Ok(SyncBlockResult::Continue) => { + Ok(SyncBlockResult::Wait) => { // continue same block } _ => {} @@ -622,16 +522,18 @@ impl BlockTree { for _ in blocks_syncing..2 { // Find the block range with most peers and highest slot. This is the block // to be used as tip of the chain of blocks to fetch. - let Some((block_root, header)) = self + let Some(block_root) = self .blocks .iter() .filter_map(|(root, block)| { let header = match &block.status { // Ignore blocks that are still downloading - Status::DownloadingHeader(_) => return None, - Status::Header(header) => header, + Status::BackfillHeader { request, .. } => match request.is_complete() { + Some(header) => header, + None => return None, + }, // Ignore blocks already syncing - Status::Syncing { .. } => return None, + Status::ForwardSyncBlock { .. 
} => return None, }; // Check if the parent is known in the header tree let is_candidate = if let Some(parent) = self.blocks.get(&header.parent_root) { @@ -642,33 +544,32 @@ impl BlockTree { }; if is_candidate { - Some((block.peer_count(), root, header)) + Some((block.peer_count(), Slot::new(u64::MAX) - header.slot, root)) } else { None } }) - .max_by_key(|(peer_count, _, header)| { - // Find highest peer count, then min slot - (*peer_count, Slot::new(u64::MAX) - header.slot) - }) - .map(|(_, root, header)| (*root, header.clone())) + .max() + .map(|(_, _, root)| *root) else { break; }; // Start syncing `block_root` - let block_to_sync = self + match self .blocks .get_mut(&block_root) - .expect("block_root is a key of self.blocks"); - - // The code above ensures that `block_to_sync` is in `Status::Header` status - block_to_sync.status = Status::Syncing( - header, - SyncBlock::new(RangeRequestId::ForwardSync(block_to_sync.id), block_root), - ); - - debug!(id = %block_to_sync.id, "Starting forwards sync of block"); + .ok_or(Error::InternalError(format!( + "self.blocks must contain an entry with {block_root}" + ))) + .and_then(|block| { + block.to_foward_sync_block(block_root)?; + Ok(block.id) + }) { + Ok(id) => debug!(?id, "Starting forward sync of block"), + // Should never error + Err(e) => error!("Unable to transition header to forward sync block: {e:?}"), + } new_syncing_blocks = true; } @@ -683,11 +584,8 @@ impl BlockTree { for (block_root, lookup) in self.blocks.iter_mut() { let result = match &mut lookup.status { - Status::DownloadingHeader(..) => continue, - Status::Header(_) => continue, - Status::Syncing(_, syncing_block) => { - syncing_block.continue_request(&lookup.peers, cx) - } + Status::BackfillHeader { .. } => continue, + Status::ForwardSyncBlock { request, .. 
} => request.continue_request(cx), }; if let Err(_e) = result { @@ -744,9 +642,10 @@ impl BlockTree { .iter() .filter_map(|(block_root, block)| match &block.status { // Prune only lookups that are not syncing and we know the header - Status::DownloadingHeader(..) => None, - Status::Header(header) => Some((block.peer_count(), header.slot, *block_root)), - Status::Syncing { .. } => None, + Status::BackfillHeader { peers, request } => request + .is_complete() + .map(|header| (block.peer_count(), header.slot, *block_root)), + Status::ForwardSyncBlock { .. } => None, }) .collect::>(); blocks.sort_unstable(); @@ -759,24 +658,4 @@ impl BlockTree { } } } - - fn send_block_header_request( - lookup: &mut ForwardSyncBlock, - block_root: Hash256, - cx: &mut SyncNetworkContext, - ) -> Result<(), Error> { - // TODO(tree-sync): have good peer selection - let Some(peer) = lookup.peers.iter().next() else { - return Err(Error::InternalError("No peers".to_owned())); - }; - - let req_id = cx.send_blocks_by_root_request( - *peer, - block_root, - BlocksByRootRequester::Header(lookup.id), - )?; - - lookup.header_request()?.on_download_start(req_id)?; - Ok(()) - } } diff --git a/beacon_node/network/src/sync/mod.rs b/beacon_node/network/src/sync/mod.rs index 22a52544e63..25e1cafe8be 100644 --- a/beacon_node/network/src/sync/mod.rs +++ b/beacon_node/network/src/sync/mod.rs @@ -7,6 +7,7 @@ pub mod manager; mod network_context; mod peer_sampling; mod peer_sync_info; +mod sync_block; #[cfg(test)] mod tests; diff --git a/beacon_node/network/src/sync/sync_block.rs b/beacon_node/network/src/sync/sync_block.rs new file mode 100644 index 00000000000..83996aa503d --- /dev/null +++ b/beacon_node/network/src/sync/sync_block.rs @@ -0,0 +1,199 @@ +use super::network_context::{RpcRequestSendError, RpcResponseError, SyncNetworkContext}; +use crate::network_beacon_processor::ChainSegmentProcessId; +use crate::sync::network_context::BatchPeers; +use crate::sync::BatchProcessResult; +use 
beacon_chain::block_verification_types::RpcBlock; +use beacon_chain::BeaconChainTypes; +use lighthouse_network::service::api_types::{Id, RangeRequestId}; +use lighthouse_network::PeerId; +use parking_lot::RwLock; +use std::collections::HashSet; +use std::sync::Arc; +use tracing::debug; +use types::{EthSpec, Hash256, Slot}; + +// TODO(tree-sync): have the peer set inside here when syncing add dedup logic +// TODO(tree-sync): for backfill sync use the sync state to check the peers have this block or not +pub struct SyncBlock { + id: RangeRequestId, + block_root: Hash256, + failed_peers: HashSet, + peers: Arc>>, + request: SyncingStatus, +} + +pub enum SyncBlockResult { + Done { parent_root: Hash256, slot: Slot }, + Wait, +} + +pub enum Error { + InternalError(String), +} + +impl SyncBlock { + pub fn new(id: RangeRequestId, block_root: Hash256, initial_peers: &[PeerId]) -> Self { + Self { + id, + block_root, + failed_peers: <_>::default(), + peers: Arc::new(RwLock::new(HashSet::from_iter( + initial_peers.iter().copied(), + ))), + request: SyncingStatus::AwaitingDownload, + } + } + + pub fn peer_count(&self) -> usize { + self.peers.read().len() + } + + pub fn clone_peers(&self) -> HashSet { + self.peers.read().clone() + } + + pub fn add_peer(&self, peer: PeerId) -> bool { + self.peers.write().insert(peer) + } + + pub fn remove_peer(&self, peer: &PeerId) -> bool { + self.peers.write().remove(peer) + } + + pub fn on_download_result( + &mut self, + result: Result<(RpcBlock, BatchPeers), RpcResponseError>, + cx: &mut SyncNetworkContext, + ) -> Result { + match &mut self.request { + SyncingStatus::Downloading(_) => match result { + // TODO(tree-sync): check that the request ID matches + Ok((block, peers)) => { + debug!(id = %self.id, "Sync block downloaded"); + self.request = SyncingStatus::AwaitingProcessing(block, peers); + self.continue_request(cx) + } + Err(e) => { + // TODO(tree-sync): increase error counter + debug!(id = %self.id, error = ?e, "Sync block download 
error"); + self.request = SyncingStatus::AwaitingDownload; + self.continue_request(cx) + } + }, + _ => Err(Error::InternalError( + "Lookup not in expected state Downloading".to_owned(), + )), + } + } + + pub fn on_process_result( + &mut self, + result: BatchProcessResult, + cx: &mut SyncNetworkContext, + ) -> Result { + match &mut self.request { + SyncingStatus::Processing(block, peers) => match result { + BatchProcessResult::Success => { + debug!(id = %self.id, "Sync block process success"); + Ok(SyncBlockResult::Done { + parent_root: block.as_block().parent_root(), + slot: block.as_block().slot(), + }) + } + BatchProcessResult::Failure { peer_action, error } => { + debug!(id = %self.id, "Sync block process error"); + + if let Some(peer_action) = peer_action { + for (peer, penalty) in peers.blame(peer_action) { + cx.report_peer(peer, penalty, "faulty_batch"); + } + } + + self.request = SyncingStatus::AwaitingDownload; + self.continue_request(cx) + } + }, + _ => Err(Error::InternalError( + "Lookup not in expected state Processing".to_owned(), + )), + } + } + + pub fn continue_request( + &mut self, + cx: &mut SyncNetworkContext, + ) -> Result { + match &mut self.request { + SyncingStatus::AwaitingDownload => { + match cx.block_components_by_range_request( + self.block_root, + self.id, + self.peers.clone(), + &self.failed_peers, + ) { + Ok(req_id) => { + self.request = SyncingStatus::Downloading(req_id); + Ok(SyncBlockResult::Wait) + } + Err(e) => match e { + RpcRequestSendError::NoPeers | RpcRequestSendError::InternalError(_) => { + Err(Error::InternalError(format!( + "Error sending block components request: {e:?}" + ))) + } + }, + } + } + SyncingStatus::Downloading(_) => Ok(SyncBlockResult::Wait), + SyncingStatus::AwaitingProcessing(block, peers) => { + // No need to check if block is already imported here, we'll get an error + // from the beacon processor anyway. No need to add more code to handle this + // edge case faster. 
+ + let expect_parent_to_be_imported = false; + if expect_parent_to_be_imported + && !cx + .chain + .block_is_known_to_fork_choice(&block.as_block().parent_root()) + { + return Ok(SyncBlockResult::Wait); + } + + if let Some(beacon_processor) = cx.beacon_processor_if_enabled() { + let id = match self.id { + RangeRequestId::ForwardSync(id) => ChainSegmentProcessId::ForwardSync(id), + RangeRequestId::BackfillSync(id) => ChainSegmentProcessId::BackfillSync(id), + }; + + if let Err(e) = beacon_processor.send_chain_segment(id, vec![block.clone()]) { + Err(Error::InternalError(format!( + "Error sending block to processor: {e:?}" + ))) + } else { + self.request = SyncingStatus::Processing(block.clone(), peers.clone()); + Ok(SyncBlockResult::Wait) + } + } else { + // TODO(tree-sync): This error will cause the full chain of headers to + // be dropped if the beacon processor goes offline. When can that + // happen? + Err(Error::InternalError( + "Beacon processor is disabled".to_owned(), + )) + } + } + SyncingStatus::Processing(..) 
=> Ok(SyncBlockResult::Wait), + } + } + + pub fn is_processing(&self) -> bool { + matches!(self.request, SyncingStatus::Processing(..)) + } +} + +enum SyncingStatus { + AwaitingDownload, + Downloading(Id), + AwaitingProcessing(RpcBlock, BatchPeers), + Processing(RpcBlock, BatchPeers), +} From 623a517aa64cd57dad63bed9099042b151d3eb2d Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sat, 21 Jun 2025 20:05:22 +0200 Subject: [PATCH 34/66] Rename to forward sync --- .../network/src/sync/backfill_sync/mod.rs | 16 +++++--- .../sync/{block_tree.rs => forward_sync.rs} | 4 +- beacon_node/network/src/sync/manager.rs | 40 ++++++++----------- beacon_node/network/src/sync/mod.rs | 2 +- .../sync/network_context/custody_by_root.rs | 7 +++- beacon_node/network/src/sync/sync_block.rs | 1 + 6 files changed, 37 insertions(+), 33 deletions(-) rename beacon_node/network/src/sync/{block_tree.rs => forward_sync.rs} (99%) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 0b74ac9eaff..7ebb115e793 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -244,8 +244,9 @@ impl BackFillSync { ) { match result { Ok(SyncBlockResult::Done { parent_root, slot }) => { - if self.is_done(slot) { - todo!("done"); + if self.is_complete(slot) { + info!("Backfill sync completed"); + self.set_state(BackFillState::Completed); } else { let peers = self.status.clone_peers(); self.status = SyncBlock::new( @@ -259,7 +260,10 @@ impl BackFillSync { // Do nothing wait for future event } Err(e) => match e { - SyncBlockError::InternalError(_) => {} + SyncBlockError::InternalError(_) => { + debug!(error = ?e, "Backfill synced failed"); + self.set_state(BackFillState::Failed); + } }, } self.continue_syncing_blocks(cx); @@ -279,8 +283,10 @@ impl BackFillSync { self.network_globals.backfill_state.read().clone() } - fn is_done(&self, slot: Slot) 
-> bool { - todo!(); + fn is_complete(&self, slot: Slot) -> bool { + let anchor_info = self.beacon_chain.store.get_anchor_info(); + // Conditions that we have completed a backfill sync + anchor_info.block_backfill_complete(self.beacon_chain.genesis_backfill_slot) } } diff --git a/beacon_node/network/src/sync/block_tree.rs b/beacon_node/network/src/sync/forward_sync.rs similarity index 99% rename from beacon_node/network/src/sync/block_tree.rs rename to beacon_node/network/src/sync/forward_sync.rs index 743450f170c..06d2609bdbb 100644 --- a/beacon_node/network/src/sync/block_tree.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -17,7 +17,7 @@ use types::{BeaconBlockHeader, EthSpec, Hash256, SignedBeaconBlock, Slot}; const MAX_LOOKUP_COUNT: usize = 1_000_000; const PRUNE_COUNT: usize = 100_000; -pub struct BlockTree { +pub struct ForwardSync { blocks: HashMap>, chain: Arc>, } @@ -232,7 +232,7 @@ pub(crate) enum SyncState { Syncing { max_slot: Slot }, } -impl BlockTree { +impl ForwardSync { pub fn new(chain: Arc>) -> Self { Self { blocks: <_>::default(), diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index c063a1ad31d..a9ae2146509 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -34,7 +34,7 @@ //! search for the block and subsequently search for parents if needed. use super::backfill_sync::BackFillSync; -use super::block_tree::BlockTree; +use super::forward_sync::ForwardSync; use super::network_context::{ CustodyRequestResult, RangeBlockComponent, RangeRequestId, RpcEvent, SyncNetworkContext, }; @@ -216,7 +216,7 @@ pub struct SyncManager { /// Backfill syncing. backfill_sync: BackFillSync, - block_tree: BlockTree, + forward_sync: ForwardSync, /// debounce duplicated `UnknownBlockHashFromAttestation` for the same root peer tuple. A peer /// may forward us thousands of a attestations, each one triggering an individual event. 
Only @@ -278,7 +278,7 @@ impl SyncManager { beacon_chain.clone(), fork_context.clone(), ), - block_tree: BlockTree::new(beacon_chain.clone()), + forward_sync: ForwardSync::new(beacon_chain.clone()), backfill_sync: BackFillSync::new(beacon_chain.clone(), network_globals), notified_unknown_roots: LRUTimeCache::new(Duration::from_secs( NOTIFIED_UNKNOWN_ROOT_EXPIRY_SECONDS, @@ -307,16 +307,10 @@ impl SyncManager { &mut self.network } - // Leak the full range_sync to prevent having to add many cfg(test) methods here - #[cfg(test)] - pub(crate) fn range_sync(&mut self) -> &mut RangeSync { - &mut self.range_sync - } - // Leak the full struct to prevent having to add many cfg(test) methods here #[cfg(test)] - pub(crate) fn block_tree(&mut self) -> &mut BlockTree { - &mut self.block_tree + pub(crate) fn forward_sync(&mut self) -> &mut ForwardSync { + &mut self.forward_sync } #[cfg(test)] @@ -359,7 +353,7 @@ impl SyncManager { .finalized_epoch .start_slot(T::EthSpec::slots_per_epoch()) { - self.block_tree + self.forward_sync .search(remote.head_root, &[peer_id], &mut self.network); } @@ -414,7 +408,7 @@ impl SyncManager { // Remove peer from all data structures self.backfill_sync.peer_disconnected(peer_id); - self.block_tree.remove_peer(*peer_id); + self.forward_sync.remove_peer(*peer_id); // Regardless of the outcome, we update the sync status. self.update_sync_state(); @@ -492,8 +486,8 @@ impl SyncManager { // TODO(tree-sync): We could just iterate the PeerDB and count the most common head as the // sync target. 
- let forward_sync_active = if self.block_tree.block_count() > 32 { - self.block_tree.max_slot_to_sync() + let forward_sync_active = if self.forward_sync.block_count() > 32 { + self.forward_sync.max_slot_to_sync() } else { None }; @@ -609,7 +603,7 @@ impl SyncManager { self.handle_new_execution_engine_state(engine_state); } _ = prune_lookups_interval.tick() => { - self.block_tree.prune(); + self.forward_sync.prune(); } _ = prune_requests.tick() => { self.prune_requests(); @@ -706,10 +700,10 @@ impl SyncManager { SyncMessage::GossipBlockProcessResult { block_root, imported, - } => self.block_tree.prune_root(block_root, imported), + } => self.forward_sync.prune_root(block_root, imported), SyncMessage::BatchProcessed { sync_type, result } => match sync_type { ChainSegmentProcessId::ForwardSync(id) => { - self.block_tree + self.forward_sync .on_block_process_result(id, result, &mut self.network); self.update_sync_state(); } @@ -739,7 +733,7 @@ impl SyncManager { ) { match self.should_search_for_block(Some(slot), &peer_id) { Ok(_) => { - self.block_tree + self.forward_sync .search(block_root, &[peer_id], &mut self.network); } Err(reason) => { @@ -751,7 +745,7 @@ impl SyncManager { fn handle_unknown_block_root(&mut self, peer_id: PeerId, block_root: Hash256) { match self.should_search_for_block(None, &peer_id) { Ok(_) => { - self.block_tree + self.forward_sync .search(block_root, &[peer_id], &mut self.network); } Err(reason) => { @@ -819,7 +813,7 @@ impl SyncManager { // Disabled while in this state. We drop current requests and don't search for new // blocks. // TODO(tree-sync): should we pause it instead? 
- self.block_tree.pause(); + self.forward_sync.pause(); // - Range: // We still send found peers to range so that it can keep track of potential chains @@ -865,7 +859,7 @@ impl SyncManager { { match req_id.parent_request_id { BlocksByRootRequester::Header(lookup_id) => { - self.block_tree.on_header_download_result( + self.forward_sync.on_header_download_result( req_id, lookup_id, result, @@ -1020,7 +1014,7 @@ impl SyncManager { { match range_request_id.requester { RangeRequestId::ForwardSync(id) => { - self.block_tree + self.forward_sync .on_block_download_result(id, result, &mut self.network); } RangeRequestId::BackfillSync(id) => { diff --git a/beacon_node/network/src/sync/mod.rs b/beacon_node/network/src/sync/mod.rs index 25e1cafe8be..23f4700baaf 100644 --- a/beacon_node/network/src/sync/mod.rs +++ b/beacon_node/network/src/sync/mod.rs @@ -2,7 +2,7 @@ //! //! Stores the various syncing methods for the beacon chain. mod backfill_sync; -mod block_tree; +mod forward_sync; pub mod manager; mod network_context; mod peer_sampling; diff --git a/beacon_node/network/src/sync/network_context/custody_by_root.rs b/beacon_node/network/src/sync/network_context/custody_by_root.rs index 682ee9f0358..cf669c56cb0 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_root.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_root.rs @@ -468,8 +468,11 @@ impl ColumnRequest { } } - pub fn is_complete(&self) -> Option { - todo!() + pub fn is_complete(&self) -> Option<&T> { + match &self.status { + Status::Downloaded(_, data, _) => Some(data), + other => None, + } } pub fn complete(self) -> Result<(PeerId, T, Duration), Error> { diff --git a/beacon_node/network/src/sync/sync_block.rs b/beacon_node/network/src/sync/sync_block.rs index 83996aa503d..ac03bdd495a 100644 --- a/beacon_node/network/src/sync/sync_block.rs +++ b/beacon_node/network/src/sync/sync_block.rs @@ -27,6 +27,7 @@ pub enum SyncBlockResult { Wait, } +#[derive(Debug)] pub enum Error { 
InternalError(String), } From b152735ec45f3a8c9cee8f9f2f0ac8623df3c823 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sat, 21 Jun 2025 21:50:16 +0200 Subject: [PATCH 35/66] Resolve TODOs --- beacon_node/network/src/sync/forward_sync.rs | 130 ++++++++------- beacon_node/network/src/sync/manager.rs | 6 +- .../network/src/sync/network_context.rs | 2 + .../sync/network_context/custody_by_root.rs | 157 ++---------------- .../sync/network_context/download_request.rs | 152 +++++++++++++++++ beacon_node/network/src/sync/sync_block.rs | 63 ++++--- 6 files changed, 287 insertions(+), 223 deletions(-) create mode 100644 beacon_node/network/src/sync/network_context/download_request.rs diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index 06d2609bdbb..57ca1b6d589 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -1,5 +1,7 @@ -use super::network_context::{RpcRequestSendError, RpcResponseError, SyncNetworkContext}; -use crate::sync::network_context::custody_by_root::{ColumnRequest, Error as ColumnRequestError}; +use super::network_context::{ + DownloadRequest, DownloadRequestError, RpcRequestSendError, RpcResponseError, + SyncNetworkContext, +}; use crate::sync::network_context::{BatchPeers, RpcResponseResult}; use crate::sync::sync_block::{Error as SyncBlockError, SyncBlock, SyncBlockResult}; use crate::sync::BatchProcessResult; @@ -31,7 +33,7 @@ enum Status { // TODO(tree-sync): Make the "waiting" completed header requests as memory cheap as possible BackfillHeader { peers: HashSet, - request: ColumnRequest, + request: DownloadRequest, }, ForwardSyncBlock { header: BeaconBlockHeader, @@ -48,7 +50,7 @@ impl ForwardSyncBlock { id: HeaderLookupId(block_root, id), status: Status::BackfillHeader { peers: HashSet::from_iter(peers.iter().copied()), - request: ColumnRequest::new(), + request: DownloadRequest::new(), }, } } @@ 
-109,7 +111,7 @@ impl ForwardSyncBlock { fn header_request( &mut self, - ) -> Result<&mut ColumnRequest, Error> { + ) -> Result<&mut DownloadRequest, Error> { match &mut self.status { Status::BackfillHeader { request, .. } => Ok(request), _ => Err(Error::InternalError( @@ -207,15 +209,21 @@ pub enum Error { BlockConflictsWithFinality(String), } -impl From for Error { - fn from(_e: ColumnRequestError) -> Self { - todo!(); +impl From for Error { + fn from(e: DownloadRequestError) -> Self { + match e { + DownloadRequestError::InternalError(e) => Self::InternalError(e), + } } } impl From for Error { - fn from(_e: RpcRequestSendError) -> Self { - todo!(); + fn from(e: RpcRequestSendError) -> Self { + match e { + RpcRequestSendError::InternalError(e) => Self::InternalError(e), + // TODO(tree-sync): Should we allow lookups to have zero peers + RpcRequestSendError::NoPeers => Self::InternalError(format!("No peers")), + } } } @@ -334,11 +342,11 @@ impl ForwardSync { ) { let block_root = id.0; - let result = (|| { + let result: Result = (|| { let Some(lookup) = self.blocks.get_mut(&block_root) else { // TODO(tree-sync): register metric debug!(id = ?req_id, "Received header request for unknown lookup"); - return Ok(()); + return Ok(SyncBlockResult::Wait); }; lookup.assert_expected_lookup_id(id)?; @@ -404,9 +412,12 @@ impl ForwardSync { lookup.send_block_header_request(block_root, cx)?; } } - Ok(()) + Ok(SyncBlockResult::Wait) })(); - self.handle_result(id.0, result); + + // Map result Ok to Wait as completing the header request does not complete the overall + // ForwardSyncBlock request. 
+ self.handle_result(id.0, result.map(|_| SyncBlockResult::Wait), cx); } pub fn on_block_download_result( @@ -415,24 +426,20 @@ impl ForwardSync { result: Result<(RpcBlock, BatchPeers), RpcResponseError>, cx: &mut SyncNetworkContext, ) { - let result = (|| { - // TODO(tree-sync): attach an ID to the block entry to make sure we are querying the right - // one, while still indexing by block_root only - let Some(lookup) = self.blocks.get_mut(&id.0) else { - // TODO(tree-sync): register metric - debug!(?id, "Received block request for unknown lookup"); - return Ok(()); - }; - lookup.assert_expected_lookup_id(id)?; - - let request = lookup.block_request()?; - request.on_download_result(result, cx)?; - Ok(()) - })(); - self.handle_result(id.0, result); + let Some(lookup) = self.blocks.get_mut(&id.0) else { + // TODO(tree-sync): register metric + debug!(?id, "Received block request for unknown lookup"); + return; + }; + if let Err(e) = lookup.assert_expected_lookup_id(id) { + debug!(?id, "Unexpected lookup ID"); + return; + } - // Continue batches - self.continue_syncing_blocks(cx); + let outcome = lookup + .block_request() + .and_then(|block| Ok(block.on_download_result(result, cx)?)); + self.handle_result(id.0, outcome, cx); } pub fn on_block_process_result( @@ -441,43 +448,50 @@ impl ForwardSync { result: BatchProcessResult, cx: &mut SyncNetworkContext, ) { - let result = (|| { - let Some(lookup) = self.blocks.get_mut(&id.0) else { - debug!(?id, "Received block process result for unknown lookup"); - return Ok(()); - }; - lookup.assert_expected_lookup_id(id)?; - - let request = lookup.block_request()?; - match request.on_process_result(result, cx) { - Ok(SyncBlockResult::Done { .. 
}) => { - self.blocks.remove(&id.0); - self.trigger_forward_sync(cx); - } - Ok(SyncBlockResult::Wait) => { - // continue same block - } - _ => {} - } - todo!(); - })(); - self.handle_result(id.0, result); + let Some(lookup) = self.blocks.get_mut(&id.0) else { + debug!(?id, "Received block process result for unknown lookup"); + return; + }; + if let Err(e) = lookup.assert_expected_lookup_id(id) { + debug!(?id, "Unexpected lookup ID"); + return; + } - // Continue batches - self.continue_syncing_blocks(cx); + let outcome = lookup + .block_request() + .and_then(|block| Ok(block.on_process_result(result, cx)?)); + self.handle_result(id.0, outcome, cx); } pub fn prune(&mut self) { // Prune blocks once imported, and once finality advances } - pub fn prune_root(&mut self, _block_root: Hash256, _imported: bool) { - todo!(); + pub fn prune_imported_block(&mut self, block_root: Hash256, _imported: bool) { + let mut block_to_delete = block_root; + while let Some(block) = self.blocks.remove(&block_root) { + debug!(?block_root, "Deleted imported block lookup"); + if let Some(parent_root) = block.parent_root() { + block_to_delete = parent_root; + } else { + break; + } + } } - fn handle_result(&mut self, block_root: Hash256, result: Result<(), Error>) { + fn handle_result( + &mut self, + block_root: Hash256, + result: Result, + cx: &mut SyncNetworkContext, + ) { match result { - Ok(_) => {} + Ok(SyncBlockResult::Done { .. 
}) => { + self.blocks.remove(&block_root); + self.trigger_forward_sync(cx); + } + // Wait for next event + Ok(SyncBlockResult::Wait) => {} Err(e) => { debug!(error = ?e, "Dropping forward sync block header lookup"); match e { diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index a9ae2146509..387f5ca6786 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -700,7 +700,7 @@ impl SyncManager { SyncMessage::GossipBlockProcessResult { block_root, imported, - } => self.forward_sync.prune_root(block_root, imported), + } => self.forward_sync.prune_imported_block(block_root, imported), SyncMessage::BatchProcessed { sync_type, result } => match sync_type { ChainSegmentProcessId::ForwardSync(id) => { self.forward_sync @@ -708,9 +708,9 @@ impl SyncManager { self.update_sync_state(); } ChainSegmentProcessId::BackfillSync(id) => { - // TODO(tree-sync): should update sync state self.backfill_sync - .on_block_process_result(id, result, &mut self.network) + .on_block_process_result(id, result, &mut self.network); + self.update_sync_state(); } }, SyncMessage::SampleVerified { id, result } => { diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index da2457857e0..19079b31e38 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -14,6 +14,7 @@ use beacon_chain::{BeaconChain, BeaconChainTypes, EngineState}; pub use block_components_by_range::BlockComponentsByRootRequest; #[cfg(test)] pub use block_components_by_range::BlockComponentsByRootRequestStep; +pub use download_request::{DownloadRequest, Error as DownloadRequestError}; use fnv::FnvHashMap; use itertools::Itertools; use lighthouse_network::rpc::methods::{ @@ -51,6 +52,7 @@ use types::{ pub mod block_components_by_range; pub mod custody_by_root; +mod download_request; mod requests; #[derive(Debug)] diff --git 
a/beacon_node/network/src/sync/network_context/custody_by_root.rs b/beacon_node/network/src/sync/network_context/custody_by_root.rs index cf669c56cb0..70a9107224c 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_root.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_root.rs @@ -1,3 +1,6 @@ +use crate::sync::network_context::download_request::{ + DownloadRequest, Error as DownloadRequestError, +}; use crate::sync::network_context::{ DataColumnsByRootRequestId, RpcRequestSendError, RpcResponseError, }; @@ -12,7 +15,6 @@ use rand::Rng; use std::collections::HashSet; use std::time::{Duration, Instant}; use std::{collections::HashMap, marker::PhantomData, sync::Arc}; -use strum::IntoStaticStr; use tracing::{debug, warn}; use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Hash256}; @@ -32,7 +34,7 @@ pub struct ActiveCustodyByRootRequest { #[allow(clippy::type_complexity)] column_requests: FnvHashMap< ColumnIndex, - ColumnRequest>>, + DownloadRequest>>, >, /// Active requests for 1 or more columns each active_batch_columns_requests: @@ -79,6 +81,14 @@ impl From for RpcRequestSendError { } } +impl From for Error { + fn from(e: DownloadRequestError) -> Self { + match e { + DownloadRequestError::InternalError(e) => Self::InternalError(e), + } + } +} + struct ActiveBatchColumnsRequest { indices: Vec, } @@ -100,7 +110,7 @@ impl ActiveCustodyByRootRequest { column_requests: HashMap::from_iter( column_indices .iter() - .map(|index| (*index, ColumnRequest::new())), + .map(|index| (*index, DownloadRequest::new())), ), active_batch_columns_requests: <_>::default(), failed_peers: LRUTimeCache::new(Duration::from_secs(FAILED_PEERS_CACHE_EXPIRY_SECONDS)), @@ -233,7 +243,7 @@ impl ActiveCustodyByRootRequest { seen_timestamps.push(seen_timestamp); Ok(data_column) }) - .collect::, _>>()?; + .collect::, Error>>()?; let peer_group = PeerGroup::from_set(peers); let max_seen_timestamp = 
seen_timestamps.into_iter().max().unwrap_or(timestamp_now()); @@ -348,142 +358,3 @@ impl ActiveCustodyByRootRequest { Ok(None) } } - -pub struct ColumnRequest { - status: Status, - download_failures: Vec, -} - -#[derive(Debug, Clone, IntoStaticStr)] -pub enum Status { - NotStarted, - Downloading(I), - Downloaded(PeerId, T, Duration), -} - -impl ColumnRequest { - pub fn new() -> Self { - Self { - status: Status::NotStarted, - download_failures: vec![], - } - } - - pub fn is_awaiting_download(&self) -> bool { - match self.status { - Status::NotStarted => true, - Status::Downloading { .. } | Status::Downloaded { .. } => false, - } - } - - pub fn is_downloading(&self) -> bool { - match self.status { - Status::NotStarted => false, - Status::Downloading { .. } => true, - Status::Downloaded { .. } => false, - } - } - - pub fn is_downloaded(&self) -> bool { - match self.status { - Status::NotStarted | Status::Downloading { .. } => false, - Status::Downloaded { .. } => true, - } - } - - pub fn too_many_failures(&self) -> Option { - if self.download_failures.len() > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { - Some( - self.download_failures - .last() - .cloned() - .expect("download_failures is not empty"), - ) - } else { - None - } - } - - pub fn on_download_start(&mut self, req_id: I) -> Result<(), Error> { - match &self.status { - Status::NotStarted => { - self.status = Status::Downloading(req_id); - Ok(()) - } - other => Err(Error::InternalError(format!( - "bad state on_download_start expected NotStarted got {}", - Into::<&'static str>::into(other), - ))), - } - } - - pub fn on_download_error(&mut self, req_id: I) -> Result<(), Error> { - match &self.status { - Status::Downloading(expected_req_id) => { - if req_id != *expected_req_id { - return Err(Error::InternalError(format!( - "Received download result for req_id {req_id} expecting {expected_req_id}" - ))); - } - self.status = Status::NotStarted; - Ok(()) - } - other => Err(Error::InternalError(format!( - "bad state 
on_download_error expected Downloading got {}", - Into::<&'static str>::into(other), - ))), - } - } - - pub fn on_download_error_and_mark_failure( - &mut self, - req_id: I, - e: RpcResponseError, - ) -> Result<(), Error> { - self.download_failures.push(e); - self.on_download_error(req_id) - } - - pub fn on_download_success( - &mut self, - req_id: I, - peer_id: PeerId, - data_column: T, - seen_timestamp: Duration, - ) -> Result<(), Error> { - match &self.status { - Status::Downloading(expected_req_id) => { - if req_id != *expected_req_id { - return Err(Error::InternalError(format!( - "Received download result for req_id {req_id} expecting {expected_req_id}" - ))); - } - self.status = Status::Downloaded(peer_id, data_column, seen_timestamp); - Ok(()) - } - other => Err(Error::InternalError(format!( - "bad state on_download_success expected Downloading got {}", - Into::<&'static str>::into(other), - ))), - } - } - - pub fn is_complete(&self) -> Option<&T> { - match &self.status { - Status::Downloaded(_, data, _) => Some(data), - other => None, - } - } - - pub fn complete(self) -> Result<(PeerId, T, Duration), Error> { - match self.status { - Status::Downloaded(peer_id, data_column, seen_timestamp) => { - Ok((peer_id, data_column, seen_timestamp)) - } - other => Err(Error::InternalError(format!( - "bad state complete expected Downloaded got {}", - Into::<&'static str>::into(other), - ))), - } - } -} diff --git a/beacon_node/network/src/sync/network_context/download_request.rs b/beacon_node/network/src/sync/network_context/download_request.rs new file mode 100644 index 00000000000..60427bc679c --- /dev/null +++ b/beacon_node/network/src/sync/network_context/download_request.rs @@ -0,0 +1,152 @@ +use crate::sync::network_context::RpcResponseError; +use lighthouse_network::PeerId; +use std::time::Duration; +use strum::IntoStaticStr; + +/// TODO(das): Reconsider this retry count, it was choosen as a placeholder value. 
Each +/// `custody_by_*` request is already retried multiple inside of a lookup or batch +const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; + +pub struct DownloadRequest { + status: Status, + download_failures: Vec, +} + +#[derive(Debug, Clone, IntoStaticStr)] +pub enum Status { + NotStarted, + Downloading(I), + Downloaded(PeerId, T, Duration), +} + +#[derive(Debug)] +pub enum Error { + InternalError(String), +} + +impl DownloadRequest { + pub fn new() -> Self { + Self { + status: Status::NotStarted, + download_failures: vec![], + } + } + + pub fn is_awaiting_download(&self) -> bool { + match self.status { + Status::NotStarted => true, + Status::Downloading { .. } | Status::Downloaded { .. } => false, + } + } + + pub fn is_downloading(&self) -> bool { + match self.status { + Status::NotStarted => false, + Status::Downloading { .. } => true, + Status::Downloaded { .. } => false, + } + } + + pub fn is_downloaded(&self) -> bool { + match self.status { + Status::NotStarted | Status::Downloading { .. } => false, + Status::Downloaded { .. 
} => true, + } + } + + pub fn too_many_failures(&self) -> Option { + if self.download_failures.len() > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { + Some( + self.download_failures + .last() + .cloned() + .expect("download_failures is not empty"), + ) + } else { + None + } + } + + pub fn on_download_start(&mut self, req_id: I) -> Result<(), Error> { + match &self.status { + Status::NotStarted => { + self.status = Status::Downloading(req_id); + Ok(()) + } + other => Err(Error::InternalError(format!( + "bad state on_download_start expected NotStarted got {}", + Into::<&'static str>::into(other), + ))), + } + } + + pub fn on_download_error(&mut self, req_id: I) -> Result<(), Error> { + match &self.status { + Status::Downloading(expected_req_id) => { + if req_id != *expected_req_id { + return Err(Error::InternalError(format!( + "Received download result for req_id {req_id} expecting {expected_req_id}" + ))); + } + self.status = Status::NotStarted; + Ok(()) + } + other => Err(Error::InternalError(format!( + "bad state on_download_error expected Downloading got {}", + Into::<&'static str>::into(other), + ))), + } + } + + pub fn on_download_error_and_mark_failure( + &mut self, + req_id: I, + e: RpcResponseError, + ) -> Result<(), Error> { + self.download_failures.push(e); + self.on_download_error(req_id) + } + + pub fn on_download_success( + &mut self, + req_id: I, + peer_id: PeerId, + data: T, + seen_timestamp: Duration, + ) -> Result<(), Error> { + match &self.status { + Status::Downloading(expected_req_id) => { + if req_id != *expected_req_id { + return Err(Error::InternalError(format!( + "Received download result for req_id {req_id} expecting {expected_req_id}" + ))); + } + self.status = Status::Downloaded(peer_id, data, seen_timestamp); + Ok(()) + } + other => Err(Error::InternalError(format!( + "bad state on_download_success expected Downloading got {}", + Into::<&'static str>::into(other), + ))), + } + } + + pub fn is_complete(&self) -> Option<&T> { + match &self.status 
{ + Status::Downloaded(_, data, _) => Some(data), + other => None, + } + } + + pub fn complete(self) -> Result<(PeerId, T, Duration), Error> { + match self.status { + Status::Downloaded(peer_id, data, seen_timestamp) => { + Ok((peer_id, data, seen_timestamp)) + } + other => Err(Error::InternalError(format!( + "bad state complete expected Downloaded got {}", + Into::<&'static str>::into(other), + ))), + } + } +} diff --git a/beacon_node/network/src/sync/sync_block.rs b/beacon_node/network/src/sync/sync_block.rs index ac03bdd495a..8e20576e9d6 100644 --- a/beacon_node/network/src/sync/sync_block.rs +++ b/beacon_node/network/src/sync/sync_block.rs @@ -12,6 +12,9 @@ use std::sync::Arc; use tracing::debug; use types::{EthSpec, Hash256, Slot}; +const MAX_DOWNLOAD_ATTEMPTS: usize = 5; +const MAX_PROCESS_ATTEMPTS: usize = 5; + // TODO(tree-sync): have the peer set inside here when syncing add dedup logic // TODO(tree-sync): for backfill sync use the sync state to check the peers have this block or not pub struct SyncBlock { @@ -20,6 +23,15 @@ pub struct SyncBlock { failed_peers: HashSet, peers: Arc>>, request: SyncingStatus, + download_errors: usize, + process_errors: usize, +} + +enum SyncingStatus { + AwaitingDownload, + Downloading(Id), + AwaitingProcessing(RpcBlock, BatchPeers), + Processing(RpcBlock, BatchPeers), } pub enum SyncBlockResult { @@ -30,6 +42,7 @@ pub enum SyncBlockResult { #[derive(Debug)] pub enum Error { InternalError(String), + TooManyErrors, } impl SyncBlock { @@ -42,6 +55,8 @@ impl SyncBlock { initial_peers.iter().copied(), ))), request: SyncingStatus::AwaitingDownload, + download_errors: 0, + process_errors: 0, } } @@ -67,20 +82,31 @@ impl SyncBlock { cx: &mut SyncNetworkContext, ) -> Result { match &mut self.request { - SyncingStatus::Downloading(_) => match result { - // TODO(tree-sync): check that the request ID matches - Ok((block, peers)) => { - debug!(id = %self.id, "Sync block downloaded"); - self.request = 
SyncingStatus::AwaitingProcessing(block, peers); - self.continue_request(cx) + SyncingStatus::Downloading(expected_id) => { + if id != expected_id { + return Err(Error::InternalError(format!( + "Unexpected request ID {id} !{expected_id}" + ))); } - Err(e) => { - // TODO(tree-sync): increase error counter - debug!(id = %self.id, error = ?e, "Sync block download error"); - self.request = SyncingStatus::AwaitingDownload; - self.continue_request(cx) + match result { + Ok((block, peers)) => { + debug!(id = %self.id, "Sync block downloaded"); + self.request = SyncingStatus::AwaitingProcessing(block, peers); + self.continue_request(cx) + } + Err(e) => { + debug!(id = %self.id, error = ?e, "Sync block download error"); + self.request = SyncingStatus::AwaitingDownload; + + self.download_errors += 1; + if self.download_errors > MAX_DOWNLOAD_ATTEMPTS { + return Err(Error::TooManyErrors); + } + + self.continue_request(cx) + } } - }, + } _ => Err(Error::InternalError( "Lookup not in expected state Downloading".to_owned(), )), @@ -107,9 +133,15 @@ impl SyncBlock { if let Some(peer_action) = peer_action { for (peer, penalty) in peers.blame(peer_action) { cx.report_peer(peer, penalty, "faulty_batch"); + self.failed_peers.insert(peers); } } + self.process_errors += 1; + if self.process_errors > MAX_PROCESS_ATTEMPTS { + return Err(Error::TooManyErrors); + } + self.request = SyncingStatus::AwaitingDownload; self.continue_request(cx) } @@ -191,10 +223,3 @@ impl SyncBlock { matches!(self.request, SyncingStatus::Processing(..)) } } - -enum SyncingStatus { - AwaitingDownload, - Downloading(Id), - AwaitingProcessing(RpcBlock, BatchPeers), - Processing(RpcBlock, BatchPeers), -} From c07d6d399a463897651807130290e01a4058332c Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sun, 22 Jun 2025 15:05:32 +0200 Subject: [PATCH 36/66] Basic tests compile --- .../lighthouse_network/src/rpc/codec.rs | 7 +- .../src/rpc/self_limiter.rs | 58 +- 
.../src/service/api_types.rs | 39 +- .../network/src/sync/backfill_sync/mod.rs | 16 +- beacon_node/network/src/sync/forward_sync.rs | 38 +- beacon_node/network/src/sync/manager.rs | 16 +- .../network/src/sync/network_context.rs | 8 +- beacon_node/network/src/sync/sync_block.rs | 21 +- beacon_node/network/src/sync/tests/lookups.rs | 2157 ++--------------- beacon_node/network/src/sync/tests/mod.rs | 4 +- beacon_node/network/src/sync/tests/range.rs | 552 ++--- common/eth2/src/lighthouse/sync_state.rs | 11 +- 12 files changed, 432 insertions(+), 2495 deletions(-) diff --git a/beacon_node/lighthouse_network/src/rpc/codec.rs b/beacon_node/lighthouse_network/src/rpc/codec.rs index f24074118eb..e37f1ad01c6 100644 --- a/beacon_node/lighthouse_network/src/rpc/codec.rs +++ b/beacon_node/lighthouse_network/src/rpc/codec.rs @@ -1088,7 +1088,12 @@ mod tests { } fn bbroot_request_v2(fork_name: ForkName) -> BlocksByRootRequest { - BlocksByRootRequest::new(vec![Hash256::zero()], &fork_context(fork_name)) + let fork_context = fork_context(fork_name); + BlocksByRootRequest::new( + vec![Hash256::zero()], + &fork_context.spec, + fork_context.current_fork(), + ) } fn blbroot_request(fork_name: ForkName) -> BlobsByRootRequest { diff --git a/beacon_node/lighthouse_network/src/rpc/self_limiter.rs b/beacon_node/lighthouse_network/src/rpc/self_limiter.rs index f26dc4c7a84..30c26e2c7af 100644 --- a/beacon_node/lighthouse_network/src/rpc/self_limiter.rs +++ b/beacon_node/lighthouse_network/src/rpc/self_limiter.rs @@ -313,13 +313,19 @@ mod tests { use crate::rpc::rate_limiter::Quota; use crate::rpc::self_limiter::SelfRateLimiter; use crate::rpc::{Ping, Protocol, RPCSend, RequestType}; - use crate::service::api_types::{AppRequestId, SingleLookupReqId, SyncRequestId}; + use crate::service::api_types::{ + AppRequestId, BlocksByRootRequestId, BlocksByRootRequester, HeaderLookupId, SyncRequestId, + }; use libp2p::PeerId; use logging::create_test_tracing_subscriber; use std::num::NonZeroU64; use 
std::time::Duration; use types::{EthSpec, ForkContext, Hash256, MainnetEthSpec, Slot}; + fn get_parent_request_id() -> BlocksByRootRequester { + BlocksByRootRequester::Header(HeaderLookupId(Hash256::ZERO, 0)) + } + /// Test that `next_peer_request_ready` correctly maintains the queue. #[tokio::test] async fn test_next_peer_request_ready() { @@ -336,17 +342,15 @@ mod tests { let mut limiter: SelfRateLimiter = SelfRateLimiter::new(Some(config), fork_context).unwrap(); let peer_id = PeerId::random(); - let lookup_id = 0; + let parent_request_id = get_parent_request_id(); for i in 1..=5u32 { let _ = limiter.allows( peer_id, - AppRequestId::Sync(SyncRequestId::SingleBlock { - id: SingleLookupReqId { - lookup_id, - req_id: i, - }, - }), + AppRequestId::Sync(SyncRequestId::BlocksByRoot(BlocksByRootRequestId { + id: i, + parent_request_id, + })), RequestType::Ping(Ping { data: i as u64 }), ); } @@ -363,9 +367,7 @@ mod tests { for i in 2..=5u32 { assert!(matches!( iter.next().unwrap().request_id, - AppRequestId::Sync(SyncRequestId::SingleBlock { - id: SingleLookupReqId { req_id, .. }, - }) if req_id == i, + AppRequestId::Sync(SyncRequestId::BlocksByRoot(BlocksByRootRequestId{id,..})) if id == i, )); } @@ -388,9 +390,7 @@ mod tests { for i in 3..=5 { assert!(matches!( iter.next().unwrap().request_id, - AppRequestId::Sync(SyncRequestId::SingleBlock { - id: SingleLookupReqId { req_id, .. 
}, - }) if req_id == i, + AppRequestId::Sync(SyncRequestId::BlocksByRoot(BlocksByRootRequestId{id,..})) if id == i, )); } @@ -409,16 +409,15 @@ mod tests { let mut limiter: SelfRateLimiter = SelfRateLimiter::new(None, fork_context).unwrap(); let peer_id = PeerId::random(); + let parent_request_id = get_parent_request_id(); for i in 1..=5u32 { let result = limiter.allows( peer_id, - AppRequestId::Sync(SyncRequestId::SingleBlock { - id: SingleLookupReqId { - lookup_id: i, - req_id: i, - }, - }), + AppRequestId::Sync(SyncRequestId::BlocksByRoot(BlocksByRootRequestId { + id: i, + parent_request_id, + })), RequestType::Ping(Ping { data: i as u64 }), ); @@ -469,9 +468,7 @@ mod tests { assert!(matches!( request_id, - AppRequestId::Sync(SyncRequestId::SingleBlock { - id: SingleLookupReqId { req_id, .. }, - }) if *req_id == i + AppRequestId::Sync(SyncRequestId::BlocksByRoot(BlocksByRootRequestId {id,..})) if *id == i )); } } @@ -487,17 +484,16 @@ mod tests { SelfRateLimiter::new(None, fork_context).unwrap(); let peer1 = PeerId::random(); let peer2 = PeerId::random(); + let parent_request_id = get_parent_request_id(); for peer in [peer1, peer2] { for i in 1..=5u32 { let result = limiter.allows( peer, - AppRequestId::Sync(SyncRequestId::SingleBlock { - id: SingleLookupReqId { - lookup_id: i, - req_id: i, - }, - }), + AppRequestId::Sync(SyncRequestId::BlocksByRoot(BlocksByRootRequestId { + id: i, + parent_request_id, + })), RequestType::Ping(Ping { data: i as u64 }), ); @@ -525,9 +521,7 @@ mod tests { let (request_id, _) = failed_requests.remove(0); assert!(matches!( request_id, - AppRequestId::Sync(SyncRequestId::SingleBlock { - id: SingleLookupReqId { req_id, .. 
}, - }) if req_id == i + AppRequestId::Sync(SyncRequestId::BlocksByRoot(BlocksByRootRequestId{id,..})) if id == i )); } diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index 349c6127081..bed846f1f07 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -103,11 +103,6 @@ pub struct CustodyByRootRequestId { pub parent_request_id: ComponentsByRootRequestId, } -/// Downstream components that perform custody by root requests. -/// Currently, it's only single block lookups, so not using an enum -#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] -pub struct CustodyRequester(pub SingleLookupReqId); - /// Application level requests sent to the network. #[derive(Debug, Clone, Copy)] pub enum AppRequestId { @@ -237,12 +232,6 @@ impl Display for BatchId { } } -impl Display for CustodyRequester { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.0) - } -} - impl Display for RangeRequestId { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { @@ -283,11 +272,11 @@ mod tests { fn display_id_data_columns_by_root_custody() { let id = DataColumnsByRootRequestId { id: 123, - parent_request_id: DataColumnsByRootRequester::Custody(CustodyId { - requester: CustodyRequester(SingleLookupReqId { - req_id: 121, - lookup_id: 101, - }), + parent_request_id: DataColumnsByRootRequester::Custody(CustodyByRootRequestId { + parent_request_id: ComponentsByRootRequestId { + id: 121, + requester: RangeRequestId::ForwardSync(HeaderLookupId(Hash256::ZERO, 1)), + }, }), }; assert_eq!(format!("{id}"), "123/Custody/121/Lookup/101"); @@ -304,22 +293,4 @@ mod tests { }; assert_eq!(format!("{id}"), "123/Sampling/101/ImportedBlock/0x0000000000000000000000000000000000000000000000000000000000000000"); } - - #[test] - fn display_id_data_columns_by_range() { - let id = DataColumnsByRootRequestId { - id: 
123, - parent_request_id: CustodyByRootRequestId { - id: 122, - parent_request_id: ComponentsByRootRequestId { - id: 121, - requester: RangeRequestId::RangeSync { - chain_id: 54, - batch_id: Epoch::new(0), - }, - }, - }, - }; - assert_eq!(format!("{id}"), "123/122/121/RangeSync/0/54"); - } } diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 7ebb115e793..80436a64dee 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -15,7 +15,7 @@ use crate::sync::network_context::{ use crate::sync::sync_block::{Error as SyncBlockError, SyncBlock, SyncBlockResult}; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; -use lighthouse_network::service::api_types::Id; +use lighthouse_network::service::api_types::{ComponentsByRootRequestId, Id}; use lighthouse_network::types::{BackFillState, NetworkGlobals}; use lighthouse_network::PeerId; use std::sync::Arc; @@ -214,11 +214,11 @@ impl BackFillSync { pub fn on_block_download_result( &mut self, - id: Id, + req_id: ComponentsByRootRequestId, result: Result<(RpcBlock, BatchPeers), RpcResponseError>, cx: &mut SyncNetworkContext, ) { - let outcome = self.status.on_download_result(result, cx); + let outcome = self.status.on_download_result(req_id, result, cx); self.handle_outcome(outcome, cx); } @@ -260,7 +260,7 @@ impl BackFillSync { // Do nothing wait for future event } Err(e) => match e { - SyncBlockError::InternalError(_) => { + SyncBlockError::InternalError(_) | SyncBlockError::TooManyErrors => { debug!(error = ?e, "Backfill synced failed"); self.set_state(BackFillState::Failed); } @@ -285,6 +285,14 @@ impl BackFillSync { fn is_complete(&self, slot: Slot) -> bool { let anchor_info = self.beacon_chain.store.get_anchor_info(); + + if anchor_info.oldest_block_slot != slot { + warn!( + "oldest_block_slot not at expected value {} != {}", + 
anchor_info.oldest_block_slot, slot + ); + } + // Conditions that we have completed a backfill sync anchor_info.block_backfill_complete(self.beacon_chain.genesis_backfill_slot) } diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index 57ca1b6d589..85853737335 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -8,7 +8,8 @@ use crate::sync::BatchProcessResult; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; use lighthouse_network::service::api_types::{ - BlocksByRootRequestId, BlocksByRootRequester, HeaderLookupId, Id, RangeRequestId, + BlocksByRootRequestId, BlocksByRootRequester, ComponentsByRootRequestId, HeaderLookupId, Id, + RangeRequestId, }; use lighthouse_network::PeerId; use std::collections::{HashMap, HashSet, VecDeque}; @@ -18,6 +19,7 @@ use types::{BeaconBlockHeader, EthSpec, Hash256, SignedBeaconBlock, Slot}; const MAX_LOOKUP_COUNT: usize = 1_000_000; const PRUNE_COUNT: usize = 100_000; +const BLOCK_BUFFER_SIZE: usize = 2; pub struct ForwardSync { blocks: HashMap>, @@ -206,6 +208,7 @@ impl ForwardSyncBlock { #[derive(Debug)] pub enum Error { InternalError(String), + TooManyErrors, BlockConflictsWithFinality(String), } @@ -231,6 +234,7 @@ impl From for Error { fn from(e: SyncBlockError) -> Self { match e { SyncBlockError::InternalError(e) => Self::InternalError(e), + SyncBlockError::TooManyErrors => Self::TooManyErrors, } } } @@ -267,18 +271,19 @@ impl ForwardSync { } #[cfg(test)] - pub fn get_processing_ids(&self) -> Vec { - self.blocks - .values() - .filter(|block| { - block - .header_request() - .ok() - .map(|request| request.is_processing()) - .unwrap_or(false) - }) - .map(|block| block.id) - .collect() + pub fn get_processing_ids(&mut self) -> Vec { + let mut ids = vec![]; + for block in self.blocks.values_mut() { + if block + .block_request() + .ok() + .map(|request| 
request.is_processing()) + .unwrap_or(false) + { + ids.push(block.id); + } + } + ids } pub fn pause(&mut self) { @@ -422,6 +427,7 @@ impl ForwardSync { pub fn on_block_download_result( &mut self, + req_id: ComponentsByRootRequestId, id: HeaderLookupId, result: Result<(RpcBlock, BatchPeers), RpcResponseError>, cx: &mut SyncNetworkContext, @@ -438,7 +444,7 @@ impl ForwardSync { let outcome = lookup .block_request() - .and_then(|block| Ok(block.on_download_result(result, cx)?)); + .and_then(|block| Ok(block.on_download_result(req_id, result, cx)?)); self.handle_result(id.0, outcome, cx); } @@ -495,7 +501,7 @@ impl ForwardSync { Err(e) => { debug!(error = ?e, "Dropping forward sync block header lookup"); match e { - Error::InternalError(_e) => { + Error::InternalError(_) | Error::TooManyErrors => { let block_to_children = self.compute_children(); self.drop_lookup_and_children(block_root, &block_to_children); } @@ -533,7 +539,7 @@ impl ForwardSync { let mut new_syncing_blocks = false; // Have up to 2 blocks syncing - for _ in blocks_syncing..2 { + for _ in blocks_syncing..BLOCK_BUFFER_SIZE { // Find the block range with most peers and highest slot. This is the block // to be used as tip of the chain of blocks to fetch. let Some(block_root) = self diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 387f5ca6786..7f152f99f0c 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -1005,21 +1005,25 @@ impl SyncManager { /// blobs. 
fn on_block_components_by_root_response( &mut self, - range_request_id: ComponentsByRootRequestId, + req_id: ComponentsByRootRequestId, range_block_component: RangeBlockComponent, ) { if let Some(result) = self .network - .on_block_components_by_root_response(range_request_id, range_block_component) + .on_block_components_by_root_response(req_id, range_block_component) { - match range_request_id.requester { + match req_id.requester { RangeRequestId::ForwardSync(id) => { - self.forward_sync - .on_block_download_result(id, result, &mut self.network); + self.forward_sync.on_block_download_result( + req_id, + id, + result, + &mut self.network, + ); } RangeRequestId::BackfillSync(id) => { self.backfill_sync - .on_block_download_result(id, result, &mut self.network) + .on_block_download_result(req_id, result, &mut self.network) } } } diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 19079b31e38..73494191d8d 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -260,7 +260,7 @@ impl SyncNetworkContext> { task_executor: TaskExecutor, ) -> Self { let fork_context = Arc::new(ForkContext::new::( - beacon_chain.slot_clock.now().unwrap_or(Slot::new(0)), + beacon_chain.slot_clock.now().unwrap_or(types::Slot::new(0)), beacon_chain.genesis_validators_root, &beacon_chain.spec, )); @@ -359,7 +359,7 @@ impl SyncNetworkContext { } #[cfg(test)] - pub fn active_block_components_by_root_requests( + pub fn active_block_components_requests( &self, ) -> Vec<(ComponentsByRootRequestId, BlockComponentsByRootRequestStep)> { self.block_components_by_root_requests @@ -460,7 +460,7 @@ impl SyncNetworkContext { requester: RangeRequestId, peers: Arc>>, peers_to_deprioritize: &HashSet, - ) -> Result { + ) -> Result { let id = ComponentsByRootRequestId { id: self.next_id(), requester, @@ -471,7 +471,7 @@ impl SyncNetworkContext { 
self.block_components_by_root_requests.insert(id, req); - Ok(id.id) + Ok(id) } /// Request to send a single `data_columns_by_root` request to the network. diff --git a/beacon_node/network/src/sync/sync_block.rs b/beacon_node/network/src/sync/sync_block.rs index 8e20576e9d6..7bd2928adc4 100644 --- a/beacon_node/network/src/sync/sync_block.rs +++ b/beacon_node/network/src/sync/sync_block.rs @@ -4,7 +4,7 @@ use crate::sync::network_context::BatchPeers; use crate::sync::BatchProcessResult; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::BeaconChainTypes; -use lighthouse_network::service::api_types::{Id, RangeRequestId}; +use lighthouse_network::service::api_types::{ComponentsByRootRequestId, RangeRequestId}; use lighthouse_network::PeerId; use parking_lot::RwLock; use std::collections::HashSet; @@ -29,7 +29,7 @@ pub struct SyncBlock { enum SyncingStatus { AwaitingDownload, - Downloading(Id), + Downloading(ComponentsByRootRequestId), AwaitingProcessing(RpcBlock, BatchPeers), Processing(RpcBlock, BatchPeers), } @@ -76,16 +76,23 @@ impl SyncBlock { self.peers.write().remove(peer) } + #[cfg(test)] + pub fn is_processing(&self) -> bool { + matches!(self.request, SyncingStatus::Processing(..)) + } + pub fn on_download_result( &mut self, + req_id: ComponentsByRootRequestId, result: Result<(RpcBlock, BatchPeers), RpcResponseError>, cx: &mut SyncNetworkContext, ) -> Result { match &mut self.request { SyncingStatus::Downloading(expected_id) => { - if id != expected_id { + if req_id != *expected_id { return Err(Error::InternalError(format!( - "Unexpected request ID {id} !{expected_id}" + "Unexpected request ID {} != {}", + req_id, expected_id, ))); } match result { @@ -133,7 +140,7 @@ impl SyncBlock { if let Some(peer_action) = peer_action { for (peer, penalty) in peers.blame(peer_action) { cx.report_peer(peer, penalty, "faulty_batch"); - self.failed_peers.insert(peers); + self.failed_peers.insert(peer); } } @@ -218,8 +225,4 @@ impl SyncBlock { 
SyncingStatus::Processing(..) => Ok(SyncBlockResult::Wait), } } - - pub fn is_processing(&self) -> bool { - matches!(self.request, SyncingStatus::Processing(..)) - } } diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index 8f9801f203c..318309cc195 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -1,5 +1,5 @@ +use super::range::{complete, filter, NO_FILTER}; use crate::network_beacon_processor::NetworkBeaconProcessor; -use crate::sync::range_sync::BATCH_BUFFER_SIZE; use crate::sync::{ manager::{BlockProcessingResult, SyncManager}, peer_sampling::SamplingConfig, @@ -69,23 +69,17 @@ struct BlockLookupSummary {} pub struct TestOptions { /// If the node created by this test harness is a supernode pub is_supernode: bool, - /// The maximum number of batches to queue before requesting more. - pub batch_buffer_size: usize, } impl TestRig { pub fn test_setup() -> Self { Self::test_setup_with_options(TestOptions { is_supernode: false, - batch_buffer_size: BATCH_BUFFER_SIZE, }) } pub fn test_setup_as_supernode() -> Self { - Self::test_setup_with_options(TestOptions { - is_supernode: true, - batch_buffer_size: BATCH_BUFFER_SIZE, - }) + Self::test_setup_with_options(TestOptions { is_supernode: true }) } pub fn test_setup_with_options(options: TestOptions) -> Self { @@ -150,7 +144,6 @@ impl TestRig { network_rx, network_rx_queue: vec![], sync_rx, - sent_blocks_by_range: <_>::default(), blocks_by_root: <_>::default(), rng, network_globals: beacon_processor.network_globals.clone(), @@ -164,7 +157,6 @@ impl TestRig { required_successes: vec![SAMPLING_REQUIRED_SUCCESSES], }, fork_context, - options.batch_buffer_size, ), harness, fork_name, @@ -228,7 +220,7 @@ impl TestRig { } } - fn rand_block(&mut self) -> SignedBeaconBlock { + pub fn rand_block(&mut self) -> SignedBeaconBlock { self.rand_block_and_blobs(NumBlobs::None).0 } @@ -266,34 +258,10 @@ impl TestRig { 
self.sync_manager.handle_message(sync_message); } - fn active_single_lookups(&self) -> Vec { - todo!(); - } - - fn active_single_lookups_count(&self) -> usize { - self.active_single_lookups().len() - } - - fn active_parent_lookups(&self) -> Vec> { + fn assert_active_lookup(&self, block_root: Hash256) { todo!(); } - fn active_parent_lookups_count(&self) -> usize { - self.active_single_lookups_count() - } - - fn active_range_sync_chain(&mut self) -> (RangeSyncType, Slot, Slot) { - self.sync_manager.range_sync().state().unwrap().unwrap() - } - - fn assert_single_lookups_count(&self, count: usize) { - assert_eq!( - self.active_single_lookups_count(), - count, - "Unexpected count of single lookups. Current lookups: -", - ); - } - fn expect_no_active_sampling(&mut self) { assert_eq!( self.sync_manager.active_sampling_requests(), @@ -315,19 +283,7 @@ impl TestRig { self.expect_no_active_sampling(); } - fn assert_parent_lookups_count(&self, count: usize) { - assert_eq!( - self.active_parent_lookups_count(), - count, - "Unexpected count of parent lookups. Parent lookups: -. 
Current lookups: -", - ); - } - - fn assert_lookup_is_active(&self, block_root: Hash256) { - todo!(); - } - - fn assert_lookup_peers(&self, block_root: Hash256, mut expected_peers: Vec) { + fn assert_lookup_peers(&self, block_root: Hash256, expected_peers: &[PeerId]) { todo!(); } @@ -357,17 +313,9 @@ impl TestRig { todo!(); } - #[track_caller] - fn expect_no_active_single_lookups(&self) { - assert!( - self.active_single_lookups().is_empty(), - "expect no single block lookups", - ); - } - #[track_caller] fn expect_no_active_lookups(&self) { - self.expect_no_active_single_lookups(); + todo!(); } fn expect_no_active_lookups_empty_network(&mut self) { @@ -445,201 +393,6 @@ impl TestRig { self.new_connected_supernode_peer(); } - fn parent_chain_processed_success( - &mut self, - chain_hash: Hash256, - blocks: &[Arc>], - ) { - // Send import events for all pending parent blocks - for _ in blocks { - self.parent_block_processed_imported(chain_hash); - } - // Send final import event for the block that triggered the lookup - self.single_block_component_processed_imported(chain_hash); - } - - /// Locate a parent lookup chain with tip hash `chain_hash` - fn find_oldest_parent_lookup(&self, chain_hash: Hash256) -> Hash256 { - let parent_chain = self - .active_parent_lookups() - .into_iter() - .find(|chain| chain.first() == Some(&chain_hash)) - .unwrap_or_else(|| { - panic!( - "No parent chain with chain_hash {chain_hash:?}: Parent lookups - Single lookups -", - ) - }); - *parent_chain.last().unwrap() - } - - fn parent_block_processed(&mut self, chain_hash: Hash256, result: BlockProcessingResult) { - let id = self.find_single_lookup_for(self.find_oldest_parent_lookup(chain_hash)); - self.single_block_component_processed(id, result); - } - - fn parent_blob_processed(&mut self, chain_hash: Hash256, result: BlockProcessingResult) { - let id = self.find_single_lookup_for(self.find_oldest_parent_lookup(chain_hash)); - self.single_blob_component_processed(id, result); - } - - fn 
parent_block_processed_imported(&mut self, chain_hash: Hash256) { - self.parent_block_processed( - chain_hash, - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported(chain_hash)), - ); - } - - fn single_block_component_processed(&mut self, _id: Id, _result: BlockProcessingResult) { - todo!(); - } - - fn single_block_component_processed_imported(&mut self, block_root: Hash256) { - let id = self.find_single_lookup_for(block_root); - self.single_block_component_processed( - id, - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported(block_root)), - ) - } - - fn single_blob_component_processed(&mut self, _id: Id, _result: BlockProcessingResult) { - todo!(); - } - - fn parent_lookup_block_response( - &mut self, - _id: SingleLookupReqId, - _peer_id: PeerId, - _beacon_block: Option>>, - ) { - todo!(); - } - - fn single_lookup_block_response( - &mut self, - _id: SingleLookupReqId, - _peer_id: PeerId, - _beacon_block: Option>>, - ) { - todo!(); - } - - fn parent_lookup_blob_response( - &mut self, - id: SingleLookupReqId, - peer_id: PeerId, - blob_sidecar: Option>>, - ) { - self.log(&format!( - "parent_lookup_blob_response {:?}", - blob_sidecar.as_ref().map(|b| b.index) - )); - self.send_sync_message(SyncMessage::RpcBlob { - sync_request_id: todo!(), - peer_id, - blob_sidecar, - seen_timestamp: D, - }); - } - - fn single_lookup_blob_response( - &mut self, - id: SingleLookupReqId, - peer_id: PeerId, - blob_sidecar: Option>>, - ) { - self.send_sync_message(SyncMessage::RpcBlob { - sync_request_id: todo!(), - peer_id, - blob_sidecar, - seen_timestamp: D, - }); - } - - fn complete_single_lookup_blob_download( - &mut self, - id: SingleLookupReqId, - peer_id: PeerId, - blobs: Vec>, - ) { - for blob in blobs { - self.single_lookup_blob_response(id, peer_id, Some(blob.into())); - } - self.single_lookup_blob_response(id, peer_id, None); - } - - fn complete_single_lookup_blob_lookup_valid( - &mut self, - id: SingleLookupReqId, - peer_id: PeerId, - blobs: Vec>, - 
import: bool, - ) { - let block_root = blobs.first().unwrap().block_root(); - let block_slot = blobs.first().unwrap().slot(); - self.complete_single_lookup_blob_download(id, peer_id, blobs); - self.expect_block_process(ResponseType::Blob); - self.single_blob_component_processed( - id.lookup_id, - if import { - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported(block_root)) - } else { - BlockProcessingResult::Ok(AvailabilityProcessingStatus::MissingComponents( - block_slot, block_root, - )) - }, - ); - } - - fn complete_lookup_block_download(&mut self, block: SignedBeaconBlock) { - let block_root = block.canonical_root(); - let id = self.expect_block_lookup_request(block_root); - self.expect_empty_network(); - let peer_id = self.new_connected_peer(); - self.single_lookup_block_response(id, peer_id, Some(block.into())); - self.single_lookup_block_response(id, peer_id, None); - } - - fn complete_lookup_block_import_valid(&mut self, block_root: Hash256, import: bool) { - self.expect_block_process(ResponseType::Block); - let id = self.find_single_lookup_for(block_root); - self.single_block_component_processed( - id, - if import { - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported(block_root)) - } else { - BlockProcessingResult::Ok(AvailabilityProcessingStatus::MissingComponents( - Slot::new(0), - block_root, - )) - }, - ) - } - - fn complete_single_lookup_block_valid(&mut self, block: SignedBeaconBlock, import: bool) { - let block_root = block.canonical_root(); - self.complete_lookup_block_download(block); - self.complete_lookup_block_import_valid(block_root, import) - } - - fn parent_lookup_failed(&mut self, _id: SingleLookupReqId, _peer_id: PeerId, _error: RPCError) { - todo!() - } - - fn parent_lookup_failed_unavailable(&mut self, id: SingleLookupReqId, peer_id: PeerId) { - self.parent_lookup_failed( - id, - peer_id, - RPCError::ErrorResponse( - RpcErrorResponse::ResourceUnavailable, - "older than deneb".into(), - ), - ); - } - - fn 
single_lookup_failed(&mut self, _id: SingleLookupReqId, _peer_id: PeerId, _error: RPCError) { - todo!(); - } - fn return_empty_sampling_requests(&mut self, ids: DCByRootIds) { for id in ids { self.log(&format!("return empty data column for {id:?}")); @@ -673,32 +426,6 @@ impl TestRig { } } - fn complete_valid_block_request( - &mut self, - id: SingleLookupReqId, - block: Arc>, - missing_components: bool, - ) { - // Complete download - let peer_id = PeerId::random(); - let slot = block.slot(); - let block_root = block.canonical_root(); - self.single_lookup_block_response(id, peer_id, Some(block)); - self.single_lookup_block_response(id, peer_id, None); - // Expect processing and resolve with import - self.expect_block_process(ResponseType::Block); - self.single_block_component_processed( - id.lookup_id, - if missing_components { - BlockProcessingResult::Ok(AvailabilityProcessingStatus::MissingComponents( - slot, block_root, - )) - } else { - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported(block_root)) - }, - ) - } - fn complete_valid_sampling_column_requests( &mut self, ids: DCByRootIds, @@ -744,42 +471,6 @@ impl TestRig { }) } - fn complete_valid_custody_request( - &mut self, - ids: DCByRootIds, - data_columns: DataColumnSidecarList, - _missing_components: bool, - ) { - let _lookup_id = if let SyncRequestId::DataColumnsByRoot(DataColumnsByRootRequestId { - parent_request_id: DataColumnsByRootRequester::Custody(id), - .. 
- }) = ids.first().unwrap().0 - { - todo!(); - // id.parent_request_id.0.lookup_id - } else { - panic!("not a custody requester") - }; - - let _first_column = data_columns.first().cloned().unwrap(); - - for id in ids { - self.log(&format!("return valid data column for {id:?}")); - let indices = &id.1; - let columns_to_send = indices - .iter() - .map(|&i| data_columns[i as usize].clone()) - .collect::>(); - self.complete_data_columns_by_root_request(id, &columns_to_send); - } - - // Expect work event - self.expect_rpc_custody_column_work_event(); - - // Respond with valid result - todo!(); - } - fn complete_data_columns_by_root_request( &mut self, (sync_request_id, _): DCByRootId, @@ -960,26 +651,6 @@ impl TestRig { } } - #[track_caller] - fn expect_blob_parent_request(&mut self, for_block: Hash256) -> SingleLookupReqId { - self.pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id: _, - request: RequestType::BlobsByRoot(request), - app_request_id: AppRequestId::Sync(SyncRequestId::BlobsByRoot(id)), - } if request - .blob_ids - .to_vec() - .iter() - .all(|r| r.block_root == for_block) => - { - todo!(); - } - _ => None, - }) - .unwrap_or_else(|e| panic!("Expected blob parent request for {for_block:?}: {e}")) - } - /// Retrieves an unknown number of requests for data columns of `block_root`. Because peer ENRs /// are random, and peer selection is random, the total number of batched requests is unknown. 
fn expect_data_columns_by_root_requests( @@ -1150,6 +821,12 @@ impl TestRig { } } + #[track_caller] + fn expect_empty_network_fully_synced(&mut self) { + self.expect_empty_network(); + self.expect_no_active_lookups(); + } + #[track_caller] pub fn expect_penalties(&mut self, expected_penalty_msg: &'static str) { let all_penalties = self.filter_received_network_events(|ev| match ev { @@ -1169,6 +846,16 @@ impl TestRig { )); } + pub fn expect_no_penalties(&mut self) { + let penalties = self.filter_received_network_events(|ev| match ev { + NetworkMessage::ReportPeer { peer_id, msg, .. } => Some((*peer_id, *msg)), + _ => None, + }); + if !penalties.is_empty() { + panic!("Expected no penalties but found {penalties:?}"); + } + } + #[track_caller] pub fn expect_penalty(&mut self, peer_id: PeerId, expect_penalty_msg: &'static str) { let penalty_msg = self @@ -1236,94 +923,6 @@ impl TestRig { blocks } - fn insert_block_to_da_checker(&mut self, block: Arc>) { - let state = BeaconState::Base(BeaconStateBase::random_for_test(&mut self.rng)); - let parent_block = self.rand_block(); - let import_data = BlockImportData::::__new_for_test( - block.canonical_root(), - state, - parent_block.into(), - ); - let payload_verification_outcome = PayloadVerificationOutcome { - payload_verification_status: PayloadVerificationStatus::Verified, - is_valid_merge_transition_block: false, - }; - let executed_block = - AvailabilityPendingExecutedBlock::new(block, import_data, payload_verification_outcome); - match self - .harness - .chain - .data_availability_checker - .put_pending_executed_block(executed_block) - .unwrap() - { - Availability::Available(_) => panic!("block removed from da_checker, available"), - Availability::MissingComponents(block_root) => { - self.log(&format!("inserted block to da_checker {block_root:?}")) - } - }; - } - - fn insert_blob_to_da_checker(&mut self, blob: BlobSidecar) { - match self - .harness - .chain - .data_availability_checker - .put_gossip_verified_blobs( - 
blob.block_root(), - std::iter::once(GossipVerifiedBlob::<_, Observe>::__assumed_valid( - blob.into(), - )), - ) - .unwrap() - { - Availability::Available(_) => panic!("blob removed from da_checker, available"), - Availability::MissingComponents(block_root) => { - self.log(&format!("inserted blob to da_checker {block_root:?}")) - } - }; - } - - fn insert_block_to_processing_cache(&mut self, block: Arc>) { - self.harness - .chain - .reqresp_pre_import_cache - .write() - .insert(block.canonical_root(), block); - } - - fn simulate_block_gossip_processing_becomes_invalid(&mut self, block_root: Hash256) { - self.harness - .chain - .reqresp_pre_import_cache - .write() - .remove(&block_root); - - self.send_sync_message(SyncMessage::GossipBlockProcessResult { - block_root, - imported: false, - }); - } - - fn simulate_block_gossip_processing_becomes_valid_missing_components( - &mut self, - block: Arc>, - ) { - let block_root = block.canonical_root(); - self.harness - .chain - .reqresp_pre_import_cache - .write() - .remove(&block_root); - - self.insert_block_to_da_checker(block); - - self.send_sync_message(SyncMessage::GossipBlockProcessResult { - block_root, - imported: false, - }); - } - fn assert_sampling_request_ongoing(&self, block_root: Hash256, indices: &[ColumnIndex]) { for index in indices { let status = self @@ -1363,6 +962,50 @@ impl TestRig { "Sampling request status for {block_root}: {statuses:?}" )); } + + async fn single_lookup_from_attestation_setup(&mut self) -> (Hash256, PeerId) { + let (head_root, head_slot) = self.create_unimported_parent_chain(1).await; + let peer_id = self.new_connected_peer(); + // Trigger the request + self.trigger_unknown_block_from_attestation(head_root, peer_id); + self.expect_block_lookup_request(head_root); + (head_root, peer_id) + } + + async fn parent_lookup_from_unknown_block_parent_setup(&mut self) -> (Hash256, PeerId) { + let (head_root, head_slot) = self.create_unimported_parent_chain(2).await; + let peer_id = 
self.new_connected_peer(); + let head_block = self + .blocks_by_root + .get(&head_root) + .expect("block should exist"); + self.trigger_unknown_parent_block(peer_id, head_block.clone()); + (head_root, peer_id) + } + + fn expect_fully_complete_sync(&mut self, expected_head_root: Hash256) { + self.progress_until_no_events(NO_FILTER, complete()); + self.assert_head(expected_head_root); + self.expect_empty_network_fully_synced(); + } + + fn assert_head(&self, expected_head: Hash256) { + let head = self.harness.chain.head(); + assert_eq!( + head.head_block_root(), + expected_head, + "Not expected head root" + ); + } + + fn fetch_ancestor_chain(&self, mut block_root: Hash256) -> Vec { + let mut chain = vec![]; + while let Some(block) = self.blocks_by_root.get(&block_root) { + chain.push(block_root); + block_root = block.parent_root(); + } + chain + } } #[test] @@ -1381,740 +1024,141 @@ fn stable_rng() { ); } -#[test] -fn test_single_block_lookup_happy_path() { - let mut rig = TestRig::test_setup(); - let block = rig.rand_block(); - let peer_id = rig.new_connected_peer(); - let block_root = block.canonical_root(); - // Trigger the request - rig.trigger_unknown_block_from_attestation(block_root, peer_id); - let id = rig.expect_block_lookup_request(block_root); - - // The peer provides the correct block, should not be penalized. Now the block should be sent - // for processing. - rig.single_lookup_block_response(id, peer_id, Some(block.into())); - rig.expect_empty_network(); - rig.expect_block_process(ResponseType::Block); - - // The request should still be active. - assert_eq!(rig.active_single_lookups_count(), 1); - - // Send the stream termination. Peer should have not been penalized, and the request removed - // after processing. 
- rig.single_lookup_block_response(id, peer_id, None); - rig.single_block_component_processed_imported(block_root); - rig.expect_empty_network(); - rig.expect_no_active_lookups(); +#[tokio::test] +async fn test_single_block_lookup_happy_path() { + let mut r = TestRig::test_setup(); + let (new_head_root, _) = r.single_lookup_from_attestation_setup().await; + r.expect_fully_complete_sync(new_head_root); } // Tests that if a peer does not respond with a block, we downscore and retry the block only -#[test] -fn test_single_block_lookup_empty_response() { +#[tokio::test] +async fn test_single_block_lookup_empty_response() { let mut r = TestRig::test_setup(); - - let block = r.rand_block(); - let block_root = block.canonical_root(); - let peer_id = r.new_connected_peer(); - - // Trigger the request - r.trigger_unknown_block_from_attestation(block_root, peer_id); - let id = r.expect_block_lookup_request(block_root); - - // The peer does not have the block. It should be penalized. - r.single_lookup_block_response(id, peer_id, None); - r.expect_penalty(peer_id, "NotEnoughResponsesReturned"); - // it should be retried - let id = r.expect_block_lookup_request(block_root); - // Send the right block this time. 
- r.single_lookup_block_response(id, peer_id, Some(block.into())); - r.expect_block_process(ResponseType::Block); - r.single_block_component_processed_imported(block_root); - r.expect_no_active_lookups(); + let (new_head_root, _) = r.single_lookup_from_attestation_setup().await; + r.progress_until_no_events(NO_FILTER, complete().return_no_blocks()); + r.expect_penalties("NotEnoughResponsesReturned"); + r.expect_fully_complete_sync(new_head_root); } -#[test] -fn test_single_block_lookup_wrong_response() { - let mut rig = TestRig::test_setup(); - - let block_hash = Hash256::random(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_block_from_attestation(block_hash, peer_id); - let id = rig.expect_block_lookup_request(block_hash); - - // Peer sends something else. It should be penalized. - let bad_block = rig.rand_block(); - rig.single_lookup_block_response(id, peer_id, Some(bad_block.into())); - rig.expect_penalty(peer_id, "UnrequestedBlockRoot"); - rig.expect_block_lookup_request(block_hash); // should be retried - - // Send the stream termination. This should not produce an additional penalty. - rig.single_lookup_block_response(id, peer_id, None); - rig.expect_empty_network(); +#[tokio::test] +async fn test_single_block_lookup_wrong_response() { + let mut r = TestRig::test_setup(); + let (new_head_root, _) = r.single_lookup_from_attestation_setup().await; + r.progress_until_no_events(NO_FILTER, complete().return_wrong_blocks()); + r.expect_penalties("UnrequestedBlockRoot"); + r.expect_fully_complete_sync(new_head_root); } -#[test] -fn test_single_block_lookup_failure() { - let mut rig = TestRig::test_setup(); - - let block_hash = Hash256::random(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_block_from_attestation(block_hash, peer_id); - let id = rig.expect_block_lookup_request(block_hash); - - // The request fails. 
RPC failures are handled elsewhere so we should not penalize the peer. - rig.single_lookup_failed(id, peer_id, RPCError::UnsupportedProtocol); - rig.expect_block_lookup_request(block_hash); - rig.expect_empty_network(); +#[tokio::test] +async fn test_single_block_lookup_failure() { + let mut r = TestRig::test_setup(); + let (new_head_root, _) = r.single_lookup_from_attestation_setup().await; + r.progress_until_no_events( + NO_FILTER, + complete().rpc_error(RPCError::UnsupportedProtocol), + ); + r.expect_no_penalties(); + r.expect_fully_complete_sync(new_head_root); } -#[test] -fn test_single_block_lookup_peer_disconnected_then_rpc_error() { - let mut rig = TestRig::test_setup(); - - let block_hash = Hash256::random(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request. - rig.trigger_unknown_block_from_attestation(block_hash, peer_id); - let id = rig.expect_block_lookup_request(block_hash); - +#[tokio::test] +async fn test_single_block_lookup_peer_disconnected_then_rpc_error() { + let mut r = TestRig::test_setup(); + let (new_head_root, peer_id) = r.single_lookup_from_attestation_setup().await; // The peer disconnect event reaches sync before the rpc error. - rig.peer_disconnected(peer_id); + r.peer_disconnected(peer_id); // The lookup is not removed as it can still potentially make progress. - rig.assert_single_lookups_count(1); + r.assert_active_lookup(new_head_root); // The request fails. - rig.single_lookup_failed(id, peer_id, RPCError::Disconnected); - rig.expect_block_lookup_request(block_hash); - // The request should be removed from the network context on disconnection. 
- rig.expect_empty_network(); + r.progress_until_no_events(NO_FILTER, complete().rpc_error(RPCError::Disconnected)); + r.expect_fully_complete_sync(new_head_root); } -#[test] -fn test_single_block_lookup_becomes_parent_request() { - let mut rig = TestRig::test_setup(); +#[tokio::test] +async fn test_parent_lookup_happy_path() { + let mut r = TestRig::test_setup(); + let (new_head_root, _) = r.parent_lookup_from_unknown_block_parent_setup().await; + r.expect_fully_complete_sync(new_head_root); +} - let block = Arc::new(rig.rand_block()); - let block_root = block.canonical_root(); - let parent_root = block.parent_root(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_block_from_attestation(block.canonical_root(), peer_id); - let id = rig.expect_block_parent_request(block_root); - - // The peer provides the correct block, should not be penalized. Now the block should be sent - // for processing. - rig.single_lookup_block_response(id, peer_id, Some(block.clone())); - rig.expect_empty_network(); - rig.expect_block_process(ResponseType::Block); - - // The request should still be active. - assert_eq!(rig.active_single_lookups_count(), 1); - - // Send the stream termination. Peer should have not been penalized, and the request moved to a - // parent request after processing. 
- rig.single_block_component_processed( - id.lookup_id, - BlockProcessingResult::Err(BlockError::ParentUnknown { - parent_root: block.parent_root(), - }), - ); - assert_eq!(rig.active_single_lookups_count(), 2); // 2 = current + parent - rig.expect_block_parent_request(parent_root); - rig.expect_empty_network(); - assert_eq!(rig.active_parent_lookups_count(), 1); +#[tokio::test] +async fn test_parent_lookup_wrong_response() { + let mut r = TestRig::test_setup(); + let (new_head_root, _) = r.parent_lookup_from_unknown_block_parent_setup().await; + r.progress_until_no_events(NO_FILTER, complete().return_wrong_blocks()); + r.expect_penalties("UnrequestedBlockRoot"); + r.expect_fully_complete_sync(new_head_root); } -#[test] -fn test_parent_lookup_happy_path() { - let mut rig = TestRig::test_setup(); - - let (parent, block, parent_root, block_root) = rig.rand_block_and_parent(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_parent_block(peer_id, block.into()); - let id = rig.expect_block_parent_request(parent_root); - - // Peer sends the right block, it should be sent for processing. Peer should not be penalized. - rig.parent_lookup_block_response(id, peer_id, Some(parent.into())); - // No request of blobs because the block has not data - rig.expect_empty_network(); - rig.expect_block_process(ResponseType::Block); - rig.expect_empty_network(); - - // Add peer to child lookup to prevent it being dropped - rig.trigger_unknown_block_from_attestation(block_root, peer_id); - // Processing succeeds, now the rest of the chain should be sent for processing. 
- rig.parent_block_processed( - block_root, - BlockError::DuplicateFullyImported(block_root).into(), +#[tokio::test] +async fn test_parent_lookup_rpc_failure() { + let mut r = TestRig::test_setup(); + let (new_head_root, _) = r.parent_lookup_from_unknown_block_parent_setup().await; + r.progress_until_no_events( + NO_FILTER, + complete().rpc_error_response(RpcErrorResponse::ResourceUnavailable), ); - rig.expect_parent_chain_process(); - rig.parent_chain_processed_success(block_root, &[]); - rig.expect_no_active_lookups_empty_network(); -} - -#[test] -fn test_parent_lookup_wrong_response() { - let mut rig = TestRig::test_setup(); - - let (parent, block, parent_root, block_root) = rig.rand_block_and_parent(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_parent_block(peer_id, block.into()); - let id1 = rig.expect_block_parent_request(parent_root); - - // Peer sends the wrong block, peer should be penalized and the block re-requested. - let bad_block = rig.rand_block(); - rig.parent_lookup_block_response(id1, peer_id, Some(bad_block.into())); - rig.expect_penalty(peer_id, "UnrequestedBlockRoot"); - let id2 = rig.expect_block_parent_request(parent_root); - - // Send the stream termination for the first request. This should not produce extra penalties. - rig.parent_lookup_block_response(id1, peer_id, None); - rig.expect_empty_network(); - - // Send the right block this time. - rig.parent_lookup_block_response(id2, peer_id, Some(parent.into())); - rig.expect_block_process(ResponseType::Block); - - // Add peer to child lookup to prevent it being dropped - rig.trigger_unknown_block_from_attestation(block_root, peer_id); - // Processing succeeds, now the rest of the chain should be sent for processing. 
- rig.parent_block_processed_imported(block_root); - rig.expect_parent_chain_process(); - rig.parent_chain_processed_success(block_root, &[]); - rig.expect_no_active_lookups_empty_network(); -} - -#[test] -fn test_parent_lookup_rpc_failure() { - let mut rig = TestRig::test_setup(); - - let (parent, block, parent_root, block_root) = rig.rand_block_and_parent(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_parent_block(peer_id, block.into()); - let id = rig.expect_block_parent_request(parent_root); - - // The request fails. It should be tried again. - rig.parent_lookup_failed_unavailable(id, peer_id); - let id = rig.expect_block_parent_request(parent_root); - - // Send the right block this time. - rig.parent_lookup_block_response(id, peer_id, Some(parent.into())); - rig.expect_block_process(ResponseType::Block); - - // Add peer to child lookup to prevent it being dropped - rig.trigger_unknown_block_from_attestation(block_root, peer_id); - // Processing succeeds, now the rest of the chain should be sent for processing. - rig.parent_block_processed_imported(block_root); - rig.expect_parent_chain_process(); - rig.parent_chain_processed_success(block_root, &[]); - rig.expect_no_active_lookups_empty_network(); -} - -#[test] -fn test_parent_lookup_too_many_attempts() { - let mut rig = TestRig::test_setup(); - - let block = rig.rand_block(); - let parent_root = block.parent_root(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_parent_block(peer_id, block.into()); - for i in 1..=PARENT_FAIL_TOLERANCE { - let id = rig.expect_block_parent_request(parent_root); - // Blobs are only requested in the first iteration as this test only retries blocks - - if i % 2 == 0 { - // make sure every error is accounted for - // The request fails. It should be tried again. - rig.parent_lookup_failed_unavailable(id, peer_id); - } else { - // Send a bad block this time. It should be tried again. 
- let bad_block = rig.rand_block(); - rig.parent_lookup_block_response(id, peer_id, Some(bad_block.into())); - // Send the stream termination - - // Note, previously we would send the same lookup id with a stream terminator, - // we'd ignore it because we'd intrepret it as an unrequested response, since - // we already got one response for the block. I'm not sure what the intent is - // for having this stream terminator line in this test at all. Receiving an invalid - // block and a stream terminator with the same Id now results in two failed attempts, - // I'm unsure if this is how it should behave? - // - rig.parent_lookup_block_response(id, peer_id, None); - rig.expect_penalty(peer_id, "UnrequestedBlockRoot"); - } - } - - rig.expect_no_active_lookups_empty_network(); + r.expect_no_penalties(); + r.expect_fully_complete_sync(new_head_root); } -#[test] -fn test_parent_lookup_too_many_download_attempts_no_blacklist() { - let mut rig = TestRig::test_setup(); - - let (parent, block, parent_root, block_root) = rig.rand_block_and_parent(); - let peer_id = rig.new_connected_peer(); +// TODO(tree-sync): test blacklist feature +#[tokio::test] +async fn test_parent_lookup_too_many_attempts() { + let mut r = TestRig::test_setup(); + let (new_head_root, _) = r.parent_lookup_from_unknown_block_parent_setup().await; - // Trigger the request - rig.trigger_unknown_parent_block(peer_id, block.into()); for i in 1..=PARENT_FAIL_TOLERANCE { - rig.assert_not_failed_chain(block_root); - let id = rig.expect_block_parent_request(parent_root); - if i % 2 != 0 { - // The request fails. It should be tried again. - rig.parent_lookup_failed_unavailable(id, peer_id); - } else { - // Send a bad block this time. It should be tried again. 
- let bad_block = rig.rand_block(); - rig.parent_lookup_block_response(id, peer_id, Some(bad_block.into())); - rig.expect_penalty(peer_id, "UnrequestedBlockRoot"); - } - } - - rig.assert_not_failed_chain(block_root); - rig.assert_not_failed_chain(parent.canonical_root()); - rig.expect_no_active_lookups_empty_network(); -} - -#[test] -fn test_parent_lookup_too_many_processing_attempts_must_blacklist() { - const PROCESSING_FAILURES: u8 = PARENT_FAIL_TOLERANCE / 2 + 1; - let mut rig = TestRig::test_setup(); - let (parent, block, parent_root, block_root) = rig.rand_block_and_parent(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_parent_block(peer_id, block.into()); - - rig.log("Fail downloading the block"); - for _ in 0..(PARENT_FAIL_TOLERANCE - PROCESSING_FAILURES) { - let id = rig.expect_block_parent_request(parent_root); - // The request fails. It should be tried again. - rig.parent_lookup_failed_unavailable(id, peer_id); - } - - rig.log("Now fail processing a block in the parent request"); - for _ in 0..PROCESSING_FAILURES { - let id = rig.expect_block_parent_request(parent_root); - // Blobs are only requested in the previous first iteration as this test only retries blocks - rig.assert_not_failed_chain(block_root); - // send the right parent but fail processing - rig.parent_lookup_block_response(id, peer_id, Some(parent.clone().into())); - rig.parent_block_processed(block_root, BlockError::BlockSlotLimitReached.into()); - rig.parent_lookup_block_response(id, peer_id, None); - rig.expect_penalty(peer_id, "lookup_block_processing_failure"); - } - - rig.assert_not_failed_chain(block_root); - rig.expect_no_active_lookups_empty_network(); -} - -#[test] -fn test_parent_lookup_too_deep_grow_ancestor() { - let mut rig = TestRig::test_setup(); - let mut blocks = rig.rand_blockchain(PARENT_DEPTH_TOLERANCE); - - let peer_id = rig.new_connected_peer(); - let trigger_block = blocks.pop().unwrap(); - let chain_hash = 
trigger_block.canonical_root(); - rig.trigger_unknown_parent_block(peer_id, trigger_block); - - for block in blocks.into_iter().rev() { - let id = rig.expect_block_parent_request(block.canonical_root()); - // the block - rig.parent_lookup_block_response(id, peer_id, Some(block.clone())); - // the stream termination - rig.parent_lookup_block_response(id, peer_id, None); - // the processing request - rig.expect_block_process(ResponseType::Block); - // the processing result - rig.parent_block_processed( - chain_hash, - BlockProcessingResult::Err(BlockError::ParentUnknown { - parent_root: block.parent_root(), - }), - ) - } - - // Should create a new syncing chain - rig.drain_sync_rx(); - assert_eq!( - rig.active_range_sync_chain(), - ( - RangeSyncType::Head, - Slot::new(0), - Slot::new(PARENT_DEPTH_TOLERANCE as u64 - 1) - ) - ); - // Should not penalize peer, but network is not clear because of the blocks_by_range requests - rig.expect_no_penalty_for(peer_id); - rig.assert_failed_chain(chain_hash); -} - -// Regression test for https://github.com/sigp/lighthouse/pull/7118 -#[test] -fn test_child_lookup_not_created_for_failed_chain_parent_after_processing() { - // GIVEN: A parent chain longer than PARENT_DEPTH_TOLERANCE. - let mut rig = TestRig::test_setup(); - let mut blocks = rig.rand_blockchain(PARENT_DEPTH_TOLERANCE + 1); - let peer_id = rig.new_connected_peer(); - - // The child of the trigger block to be used to extend the chain. - let trigger_block_child = blocks.pop().unwrap(); - // The trigger block that starts the lookup. - let trigger_block = blocks.pop().unwrap(); - let tip_root = trigger_block.canonical_root(); - - // Trigger the initial unknown parent block for the tip. - rig.trigger_unknown_parent_block(peer_id, trigger_block.clone()); - - // Simulate the lookup chain building up via `ParentUnknown` errors. 
- for block in blocks.into_iter().rev() { - let id = rig.expect_block_parent_request(block.canonical_root()); - rig.parent_lookup_block_response(id, peer_id, Some(block.clone())); - rig.parent_lookup_block_response(id, peer_id, None); - rig.expect_block_process(ResponseType::Block); - rig.parent_block_processed( - tip_root, - BlockProcessingResult::Err(BlockError::ParentUnknown { - parent_root: block.parent_root(), - }), + r.progress_until_no_events( + NO_FILTER, + complete().rpc_error_response(RpcErrorResponse::ResourceUnavailable), ); } - // At this point, the chain should have been deemed too deep and pruned. - // The tip root should have been inserted into failed chains. - rig.assert_failed_chain(tip_root); - rig.expect_no_penalty_for(peer_id); - - // WHEN: Trigger the extending block that points to the tip. - let trigger_block_child_root = trigger_block_child.canonical_root(); - rig.trigger_unknown_block_from_attestation(trigger_block_child_root, peer_id); - let id = rig.expect_block_lookup_request(trigger_block_child_root); - rig.single_lookup_block_response(id, peer_id, Some(trigger_block_child.clone())); - rig.single_lookup_block_response(id, peer_id, None); - rig.expect_block_process(ResponseType::Block); - rig.single_block_component_processed( - id.lookup_id, - BlockProcessingResult::Err(BlockError::ParentUnknown { - parent_root: tip_root, - }), - ); - - // THEN: The extending block should not create a lookup because the tip was inserted into failed chains. - rig.expect_no_active_lookups(); - // AND: The peer should be penalized for extending a failed chain. 
- rig.expect_single_penalty(peer_id, "failed_chain"); - rig.expect_empty_network(); + r.expect_no_active_lookups_empty_network(); } -#[test] -fn test_parent_lookup_too_deep_grow_tip() { - let mut rig = TestRig::test_setup(); - let blocks = rig.rand_blockchain(PARENT_DEPTH_TOLERANCE - 1); - let peer_id = rig.new_connected_peer(); - let tip = blocks.last().unwrap().clone(); - - for block in blocks.into_iter() { - let block_root = block.canonical_root(); - rig.trigger_unknown_block_from_attestation(block_root, peer_id); - let id = rig.expect_block_parent_request(block_root); - rig.single_lookup_block_response(id, peer_id, Some(block.clone())); - rig.single_lookup_block_response(id, peer_id, None); - rig.expect_block_process(ResponseType::Block); - rig.single_block_component_processed( - id.lookup_id, - BlockError::ParentUnknown { - parent_root: block.parent_root(), - } - .into(), - ); - } - - // Should create a new syncing chain - rig.drain_sync_rx(); - assert_eq!( - rig.active_range_sync_chain(), - ( - RangeSyncType::Head, - Slot::new(0), - Slot::new(PARENT_DEPTH_TOLERANCE as u64 - 2) - ) - ); - // Should not penalize peer, but network is not clear because of the blocks_by_range requests - rig.expect_no_penalty_for(peer_id); - rig.assert_failed_chain(tip.canonical_root()); -} - -#[test] -fn test_lookup_peer_disconnected_no_peers_left_while_request() { - let mut rig = TestRig::test_setup(); - let peer_id = rig.new_connected_peer(); - let trigger_block = rig.rand_block(); - rig.trigger_unknown_parent_block(peer_id, trigger_block.into()); - rig.peer_disconnected(peer_id); - rig.rpc_error_all_active_requests(peer_id); +#[tokio::test] +async fn test_lookup_peer_disconnected_no_peers_left_while_request() { + let mut r = TestRig::test_setup(); + let (head_root, peer_id) = r.single_lookup_from_attestation_setup().await; + r.peer_disconnected(peer_id); + r.rpc_error_all_active_requests(peer_id); // Erroring all rpc requests and disconnecting the peer shouldn't remove the 
requests // from the lookups map as they can still progress. - rig.assert_single_lookups_count(2); + r.assert_active_lookup(head_root); } -#[test] -fn test_lookup_disconnection_peer_left() { - let mut rig = TestRig::test_setup(); - let peer_ids = (0..2).map(|_| rig.new_connected_peer()).collect::>(); - let disconnecting_peer = *peer_ids.first().unwrap(); - let block_root = Hash256::random(); - // lookup should have two peers associated with the same block - for peer_id in peer_ids.iter() { - rig.trigger_unknown_block_from_attestation(block_root, *peer_id); - } +#[tokio::test] +async fn test_lookup_disconnection_peer_left() { + let mut r = TestRig::test_setup(); + let (head_root, peer_1) = r.single_lookup_from_attestation_setup().await; + let peer_2 = r.new_connected_peer(); + r.trigger_unknown_block_from_attestation(head_root, peer_2); // Disconnect the first peer only, which is the one handling the request - rig.peer_disconnected(disconnecting_peer); - rig.rpc_error_all_active_requests(disconnecting_peer); - rig.assert_single_lookups_count(1); + r.peer_disconnected(peer_1); + r.rpc_error_all_active_requests(peer_1); + r.assert_active_lookup(head_root); } -#[test] -fn test_lookup_add_peers_to_parent() { +#[tokio::test] +async fn test_lookup_add_peers_to_parent() { let mut r = TestRig::test_setup(); - let peer_id_1 = r.new_connected_peer(); - let peer_id_2 = r.new_connected_peer(); - let blocks = r.rand_blockchain(5); - let last_block_root = blocks.last().unwrap().canonical_root(); - // Create a chain of lookups - for block in &blocks { - r.trigger_unknown_parent_block(peer_id_1, block.clone()); - } - r.trigger_unknown_block_from_attestation(last_block_root, peer_id_2); - for block in blocks.iter().take(blocks.len() - 1) { - // Parent has the original unknown parent event peer + new peer - r.assert_lookup_peers(block.canonical_root(), vec![peer_id_1, peer_id_2]); - } - // Child lookup only has the unknown attestation peer - r.assert_lookup_peers(last_block_root, 
vec![peer_id_2]); -} - -#[test] -fn test_skip_creating_failed_parent_lookup() { - let mut rig = TestRig::test_setup(); - let (_, block, parent_root, _) = rig.rand_block_and_parent(); - let peer_id = rig.new_connected_peer(); - rig.insert_failed_chain(parent_root); - rig.trigger_unknown_parent_block(peer_id, block.into()); - // Expect single penalty for peer, despite dropping two lookups - rig.expect_single_penalty(peer_id, "failed_chain"); - // Both current and parent lookup should be rejected - rig.expect_no_active_lookups(); -} - -#[test] -fn test_single_block_lookup_ignored_response() { - let mut rig = TestRig::test_setup(); - - let block = rig.rand_block(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_block_from_attestation(block.canonical_root(), peer_id); - let id = rig.expect_block_lookup_request(block.canonical_root()); - - // The peer provides the correct block, should not be penalized. Now the block should be sent - // for processing. - rig.single_lookup_block_response(id, peer_id, Some(block.into())); - rig.expect_empty_network(); - rig.expect_block_process(ResponseType::Block); - - // The request should still be active. - assert_eq!(rig.active_single_lookups_count(), 1); - - // Send the stream termination. Peer should have not been penalized, and the request removed - // after processing. 
- rig.single_lookup_block_response(id, peer_id, None); - // Send an Ignored response, the request should be dropped - rig.single_block_component_processed(id.lookup_id, BlockProcessingResult::Ignored); - rig.expect_no_active_lookups_empty_network(); -} - -#[test] -fn test_parent_lookup_ignored_response() { - let mut rig = TestRig::test_setup(); - - let (parent, block, parent_root, block_root) = rig.rand_block_and_parent(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_parent_block(peer_id, block.clone().into()); - let id = rig.expect_block_parent_request(parent_root); - // Note: single block lookup for current `block` does not trigger any request because it does - // not have blobs, and the block is already cached - - // Peer sends the right block, it should be sent for processing. Peer should not be penalized. - rig.parent_lookup_block_response(id, peer_id, Some(parent.into())); - rig.expect_block_process(ResponseType::Block); - rig.expect_empty_network(); - - // Return an Ignored result. The request should be dropped - rig.parent_block_processed(block_root, BlockProcessingResult::Ignored); - rig.expect_empty_network(); - rig.expect_no_active_lookups(); -} - -/// This is a regression test. -#[test] -fn test_same_chain_race_condition() { - let mut rig = TestRig::test_setup(); - - // if we use one or two blocks it will match on the hash or the parent hash, so make a longer - // chain. 
- let depth = 4; - let mut blocks = rig.rand_blockchain(depth); - let peer_id = rig.new_connected_peer(); - let trigger_block = blocks.pop().unwrap(); - let chain_hash = trigger_block.canonical_root(); - rig.trigger_unknown_parent_block(peer_id, trigger_block.clone()); - - for (i, block) in blocks.clone().into_iter().rev().enumerate() { - let id = rig.expect_block_parent_request(block.canonical_root()); - // the block - rig.parent_lookup_block_response(id, peer_id, Some(block.clone())); - // the stream termination - rig.parent_lookup_block_response(id, peer_id, None); - // the processing request - rig.expect_block_process(ResponseType::Block); - // the processing result - if i + 2 == depth { - rig.log(&format!("Block {i} was removed and is already known")); - rig.parent_block_processed( - chain_hash, - BlockError::DuplicateFullyImported(block.canonical_root()).into(), - ) - } else { - rig.log(&format!("Block {i} ParentUnknown")); - rig.parent_block_processed( - chain_hash, - BlockProcessingResult::Err(BlockError::ParentUnknown { - parent_root: block.parent_root(), - }), - ) - } - } - - // Try to get this block again while the chain is being processed. We should not request it again. 
- let peer_id = rig.new_connected_peer(); - rig.trigger_unknown_parent_block(peer_id, trigger_block.clone()); - rig.expect_empty_network(); - - // Add a peer to the tip child lookup which has zero peers - rig.trigger_unknown_block_from_attestation(trigger_block.canonical_root(), peer_id); - - rig.log("Processing succeeds, now the rest of the chain should be sent for processing."); - for block in blocks.iter().skip(1).chain(&[trigger_block]) { - rig.expect_parent_chain_process(); - rig.single_block_component_processed_imported(block.canonical_root()); - } - rig.expect_no_active_lookups_empty_network(); -} - -#[test] -fn block_in_da_checker_skips_download() { - let Some(mut r) = TestRig::test_setup_after_deneb_before_fulu() else { - return; - }; - let (block, blobs) = r.rand_block_and_blobs(NumBlobs::Number(1)); - let block_root = block.canonical_root(); - let peer_id = r.new_connected_peer(); - r.insert_block_to_da_checker(block.into()); - r.trigger_unknown_block_from_attestation(block_root, peer_id); - // Should not trigger block request - let id = r.expect_blob_lookup_request(block_root); - r.expect_empty_network(); - // Resolve blob and expect lookup completed - r.complete_single_lookup_blob_lookup_valid(id, peer_id, blobs, true); - r.expect_no_active_lookups(); -} - -#[test] -fn block_in_processing_cache_becomes_invalid() { - let Some(mut r) = TestRig::test_setup_after_deneb_before_fulu() else { - return; - }; - let (block, blobs) = r.rand_block_and_blobs(NumBlobs::Number(1)); - let block_root = block.canonical_root(); - let peer_id = r.new_connected_peer(); - r.insert_block_to_processing_cache(block.clone().into()); - r.trigger_unknown_block_from_attestation(block_root, peer_id); - // Should trigger blob request - let id = r.expect_blob_lookup_request(block_root); - // Should not trigger block request - r.expect_empty_network(); - // Simulate invalid block, removing it from processing cache - r.simulate_block_gossip_processing_becomes_invalid(block_root); - // 
Should download block, then issue blobs request - r.complete_lookup_block_download(block); - // Should not trigger block or blob request - r.expect_empty_network(); - r.complete_lookup_block_import_valid(block_root, false); - // Resolve blob and expect lookup completed - r.complete_single_lookup_blob_lookup_valid(id, peer_id, blobs, true); - r.expect_no_active_lookups(); -} - -#[test] -fn block_in_processing_cache_becomes_valid_imported() { - let Some(mut r) = TestRig::test_setup_after_deneb_before_fulu() else { - return; - }; - let (block, blobs) = r.rand_block_and_blobs(NumBlobs::Number(1)); - let block_root = block.canonical_root(); + let (head_root, _) = r.create_unimported_parent_chain(4).await; + let chain = r.fetch_ancestor_chain(head_root); let peer_id = r.new_connected_peer(); - r.insert_block_to_processing_cache(block.clone().into()); - r.trigger_unknown_block_from_attestation(block_root, peer_id); - // Should trigger blob request - let id = r.expect_blob_lookup_request(block_root); - // Should not trigger block request - r.expect_empty_network(); - // Resolve the block from processing step - r.simulate_block_gossip_processing_becomes_valid_missing_components(block.into()); - // Should not trigger block or blob request - r.expect_empty_network(); - // Resolve blob and expect lookup completed - r.complete_single_lookup_blob_lookup_valid(id, peer_id, blobs, true); - r.expect_no_active_lookups(); -} + r.trigger_unknown_block_from_attestation(head_root, peer_id); + r.progress_until_no_events(filter().header_requests_only(), complete()); -// IGNORE: wait for change that delays blob fetching to knowing the block -#[ignore] -#[test] -fn blobs_in_da_checker_skip_download() { - let Some(mut r) = TestRig::test_setup_after_deneb_before_fulu() else { - return; - }; - let (block, blobs) = r.rand_block_and_blobs(NumBlobs::Number(1)); - let block_root = block.canonical_root(); - let peer_id = r.new_connected_peer(); - for blob in blobs { - 
r.insert_blob_to_da_checker(blob); + let new_peers = (0..2).map(|_| r.new_connected_peer()).collect::>(); + for peer in &new_peers { + r.trigger_unknown_block_from_attestation(head_root, *peer); + } + for block in chain { + // Parent has the original unknown parent event peer + new peer + r.assert_lookup_peers(block, &new_peers); } - r.trigger_unknown_block_from_attestation(block_root, peer_id); - // Should download and process the block - r.complete_single_lookup_block_valid(block, true); - // Should not trigger blob request - r.expect_empty_network(); - r.expect_no_active_lookups(); } #[test] @@ -2127,10 +1171,7 @@ fn sampling_happy_path() { let block_root = block.canonical_root(); r.trigger_sample_block(block_root, block.slot()); // Retrieve all outgoing sample requests for random column indexes - let sampling_ids = - r.expect_only_data_columns_by_root_requests(block_root, SAMPLING_REQUIRED_SUCCESSES); - // Resolve all of them one by one - r.complete_valid_sampling_column_requests(sampling_ids, data_columns); + r.progress_until_no_events(NO_FILTER, complete()); r.expect_clean_finished_sampling(); } @@ -2145,14 +1186,7 @@ fn sampling_with_retries() { let (block, data_columns) = r.rand_block_and_data_columns(); let block_root = block.canonical_root(); r.trigger_sample_block(block_root, block.slot()); - // Retrieve all outgoing sample requests for random column indexes, and return empty responses - let sampling_ids = - r.expect_only_data_columns_by_root_requests(block_root, SAMPLING_REQUIRED_SUCCESSES); - r.return_empty_sampling_requests(sampling_ids); - // Expect retries for all of them, and resolve them - let sampling_ids = - r.expect_only_data_columns_by_root_requests(block_root, SAMPLING_REQUIRED_SUCCESSES); - r.complete_valid_sampling_column_requests(sampling_ids, data_columns); + r.progress_until_no_events(NO_FILTER, complete().empty_sampling_response_once()); r.expect_clean_finished_sampling(); } @@ -2161,18 +1195,20 @@ fn sampling_avoid_retrying_same_peer() 
{ let Some(mut r) = TestRig::test_setup_after_fulu() else { return; }; - let peer_id_1 = r.new_connected_supernode_peer(); - let peer_id_2 = r.new_connected_supernode_peer(); + let peer_1 = r.new_connected_supernode_peer(); let block_root = Hash256::random(); r.trigger_sample_block(block_root, Slot::new(0)); // Retrieve all outgoing sample requests for random column indexes, and return empty responses - let sampling_ids = - r.expect_only_data_columns_by_root_requests(block_root, SAMPLING_REQUIRED_SUCCESSES); - r.sampling_requests_failed(sampling_ids, peer_id_1, RPCError::Disconnected); + r.progress_until_no_events( + filter().peer(peer_1), + complete().rpc_error(RPCError::Disconnected), + ); // Should retry the other peer - let sampling_ids = - r.expect_only_data_columns_by_root_requests(block_root, SAMPLING_REQUIRED_SUCCESSES); - r.sampling_requests_failed(sampling_ids, peer_id_2, RPCError::Disconnected); + let peer_2 = r.new_connected_supernode_peer(); + r.progress_until_no_events( + filter().peer(peer_2), + complete().rpc_error(RPCError::Disconnected), + ); // Expect no more retries r.expect_empty_network(); } @@ -2188,18 +1224,14 @@ fn sampling_batch_requests() { r.trigger_sample_block(block_root, block.slot()); // Retrieve the sample request, which should be batched. - let (sync_request_id, column_indexes) = r + let (_, column_indexes) = r .expect_only_data_columns_by_root_requests(block_root, 1) .pop() .unwrap(); assert_eq!(column_indexes.len(), SAMPLING_REQUIRED_SUCCESSES); r.assert_sampling_request_ongoing(block_root, &column_indexes); - // Resolve the request. 
- r.complete_valid_sampling_column_requests( - vec![(sync_request_id, column_indexes.clone())], - data_columns, - ); + r.progress_until_no_events(NO_FILTER, complete()); r.expect_clean_finished_sampling(); } @@ -2224,18 +1256,14 @@ fn sampling_batch_requests_not_enough_responses_returned() { r.assert_sampling_request_ongoing(block_root, &column_indexes); // Split the indexes to simulate the case where the supernode doesn't have the requested column. + let missing_custody_index = *column_indexes.first().unwrap(); let (column_indexes_supernode_does_not_have, column_indexes_to_complete) = column_indexes.split_at(1); // Complete the requests but only partially, so a NotEnoughResponsesReturned error occurs. - let data_columns_to_complete = data_columns - .iter() - .filter(|d| column_indexes_to_complete.contains(&d.index)) - .cloned() - .collect::>(); - r.complete_data_columns_by_root_request( - (sync_request_id, column_indexes.clone()), - &data_columns_to_complete, + r.progress_until_no_events( + NO_FILTER, + complete().custody_failure_at_index(missing_custody_index), ); // The request status should be set to NoPeers since the supernode, the only peer, returned not enough responses. 
@@ -2247,828 +1275,3 @@ fn sampling_batch_requests_not_enough_responses_returned() { r.expect_no_work_event(); r.expect_active_sampling(&block_root); } - -#[test] -fn custody_lookup_happy_path() { - let Some(mut r) = TestRig::test_setup_after_fulu() else { - return; - }; - let spec = E::default_spec(); - r.new_connected_peers_for_peerdas(); - let (block, data_columns) = r.rand_block_and_data_columns(); - let block_root = block.canonical_root(); - let peer_id = r.new_connected_peer(); - r.trigger_unknown_block_from_attestation(block_root, peer_id); - // Should not request blobs - let id = r.expect_block_lookup_request(block.canonical_root()); - r.complete_valid_block_request(id, block.into(), true); - // for each slot we download `samples_per_slot` columns - let sample_column_count = spec.samples_per_slot * spec.data_columns_per_group(); - let custody_ids = - r.expect_only_data_columns_by_root_requests(block_root, sample_column_count as usize); - r.complete_valid_custody_request(custody_ids, data_columns, false); - r.expect_no_active_lookups(); -} - -// TODO(das): Test retries of DataColumnByRoot: -// - Expect request for column_index -// - Respond with bad data -// - Respond with stream terminator -// ^ The stream terminator should be ignored and not close the next retry - -// TODO(das): Test error early a sampling request and it getting drop + then receiving responses -// from pending requests. 
- -mod deneb_only { - use super::*; - use beacon_chain::{ - block_verification_types::{AsBlock, RpcBlock}, - data_availability_checker::AvailabilityCheckError, - }; - use std::collections::VecDeque; - use types::RuntimeVariableList; - - struct DenebTester { - rig: TestRig, - block: Arc>, - blobs: Vec>>, - parent_block_roots: Vec, - parent_block: VecDeque>>, - parent_blobs: VecDeque>>>, - unknown_parent_block: Option>>, - unknown_parent_blobs: Option>>>, - peer_id: PeerId, - block_req_id: Option, - parent_block_req_id: Option, - blob_req_id: Option, - parent_blob_req_id: Option, - slot: Slot, - block_root: Hash256, - } - - enum RequestTrigger { - AttestationUnknownBlock, - GossipUnknownParentBlock(usize), - GossipUnknownParentBlob(usize), - } - - impl RequestTrigger { - fn num_parents(&self) -> usize { - match self { - RequestTrigger::AttestationUnknownBlock => 0, - RequestTrigger::GossipUnknownParentBlock(num_parents) => *num_parents, - RequestTrigger::GossipUnknownParentBlob(num_parents) => *num_parents, - } - } - } - - impl DenebTester { - fn new(request_trigger: RequestTrigger) -> Option { - let Some(mut rig) = TestRig::test_setup_after_deneb_before_fulu() else { - return None; - }; - let (block, blobs) = rig.rand_block_and_blobs(NumBlobs::Random); - let mut block = Arc::new(block); - let mut blobs = blobs.into_iter().map(Arc::new).collect::>(); - let slot = block.slot(); - - let num_parents = request_trigger.num_parents(); - let mut parent_block_chain = VecDeque::with_capacity(num_parents); - let mut parent_blobs_chain = VecDeque::with_capacity(num_parents); - let mut parent_block_roots = vec![]; - for _ in 0..num_parents { - // Set the current block as the parent. - let parent_root = block.canonical_root(); - let parent_block = block.clone(); - let parent_blobs = blobs.clone(); - parent_block_chain.push_front(parent_block); - parent_blobs_chain.push_front(parent_blobs); - parent_block_roots.push(parent_root); - - // Create the next block. 
- let (child_block, child_blobs) = - rig.block_with_parent_and_blobs(parent_root, NumBlobs::Random); - let mut child_block = Arc::new(child_block); - let mut child_blobs = child_blobs.into_iter().map(Arc::new).collect::>(); - - // Update the new block to the current block. - std::mem::swap(&mut child_block, &mut block); - std::mem::swap(&mut child_blobs, &mut blobs); - } - let block_root = block.canonical_root(); - - let peer_id = rig.new_connected_peer(); - - // Trigger the request - let (block_req_id, blob_req_id, parent_block_req_id, parent_blob_req_id) = - match request_trigger { - RequestTrigger::AttestationUnknownBlock => { - rig.send_sync_message(SyncMessage::UnknownBlockHashFromAttestation( - peer_id, block_root, - )); - let block_req_id = rig.expect_block_lookup_request(block_root); - (Some(block_req_id), None, None, None) - } - RequestTrigger::GossipUnknownParentBlock { .. } => { - rig.send_sync_message(SyncMessage::UnknownParentBlock( - peer_id, - block.clone(), - block_root, - )); - - let parent_root = block.parent_root(); - let parent_block_req_id = rig.expect_block_parent_request(parent_root); - rig.expect_empty_network(); // expect no more requests - (None, None, Some(parent_block_req_id), None) - } - RequestTrigger::GossipUnknownParentBlob { .. 
} => { - let single_blob = blobs.first().cloned().unwrap(); - let parent_root = single_blob.block_parent_root(); - rig.send_sync_message(SyncMessage::UnknownParentBlob(peer_id, single_blob)); - - let parent_block_req_id = rig.expect_block_parent_request(parent_root); - rig.expect_empty_network(); // expect no more requests - (None, None, Some(parent_block_req_id), None) - } - }; - - Some(Self { - rig, - block, - blobs, - parent_block: parent_block_chain, - parent_blobs: parent_blobs_chain, - parent_block_roots, - unknown_parent_block: None, - unknown_parent_blobs: None, - peer_id, - block_req_id, - parent_block_req_id, - blob_req_id, - parent_blob_req_id, - slot, - block_root, - }) - } - - fn trigger_unknown_block_from_attestation(mut self) -> Self { - let block_root = self.block.canonical_root(); - self.rig - .trigger_unknown_block_from_attestation(block_root, self.peer_id); - self - } - - fn parent_block_response(mut self) -> Self { - self.rig.expect_empty_network(); - let block = self.parent_block.pop_front().unwrap().clone(); - let _ = self.unknown_parent_block.insert(block.clone()); - self.rig.parent_lookup_block_response( - self.parent_block_req_id.expect("parent request id"), - self.peer_id, - Some(block), - ); - - self.rig.assert_parent_lookups_count(1); - self - } - - fn parent_block_response_expect_blobs(mut self) -> Self { - self.rig.expect_empty_network(); - let block = self.parent_block.pop_front().unwrap().clone(); - let _ = self.unknown_parent_block.insert(block.clone()); - self.rig.parent_lookup_block_response( - self.parent_block_req_id.expect("parent request id"), - self.peer_id, - Some(block), - ); - - // Expect blobs request after sending block - let s = self.expect_parent_blobs_request(); - - s.rig.assert_parent_lookups_count(1); - s - } - - fn parent_blob_response(mut self) -> Self { - let blobs = self.parent_blobs.pop_front().unwrap(); - let _ = self.unknown_parent_blobs.insert(blobs.clone()); - for blob in &blobs { - 
self.rig.parent_lookup_blob_response( - self.parent_blob_req_id.expect("parent blob request id"), - self.peer_id, - Some(blob.clone()), - ); - assert_eq!(self.rig.active_parent_lookups_count(), 1); - } - self.rig.parent_lookup_blob_response( - self.parent_blob_req_id.expect("parent blob request id"), - self.peer_id, - None, - ); - - self - } - - fn block_response_triggering_process(self) -> Self { - let mut me = self.block_response_and_expect_blob_request(); - me.rig.expect_block_process(ResponseType::Block); - - // The request should still be active. - assert_eq!(me.rig.active_single_lookups_count(), 1); - me - } - - fn block_response_and_expect_blob_request(mut self) -> Self { - // The peer provides the correct block, should not be penalized. Now the block should be sent - // for processing. - self.rig.single_lookup_block_response( - self.block_req_id.expect("block request id"), - self.peer_id, - Some(self.block.clone()), - ); - // After responding with block the node will issue a blob request - let mut s = self.expect_blobs_request(); - - s.rig.expect_empty_network(); - - // The request should still be active. 
- s.rig.assert_lookup_is_active(s.block.canonical_root()); - s - } - - fn blobs_response(mut self) -> Self { - self.rig - .log(&format!("blobs response {}", self.blobs.len())); - for blob in &self.blobs { - self.rig.single_lookup_blob_response( - self.blob_req_id.expect("blob request id"), - self.peer_id, - Some(blob.clone()), - ); - self.rig - .assert_lookup_is_active(self.block.canonical_root()); - } - self.rig.single_lookup_blob_response( - self.blob_req_id.expect("blob request id"), - self.peer_id, - None, - ); - self - } - - fn blobs_response_was_valid(mut self) -> Self { - self.rig.expect_empty_network(); - if !self.blobs.is_empty() { - self.rig.expect_block_process(ResponseType::Blob); - } - self - } - - fn expect_empty_beacon_processor(mut self) -> Self { - self.rig.expect_empty_beacon_processor(); - self - } - - fn empty_block_response(mut self) -> Self { - self.rig.single_lookup_block_response( - self.block_req_id.expect("block request id"), - self.peer_id, - None, - ); - self - } - - fn empty_blobs_response(mut self) -> Self { - self.rig.single_lookup_blob_response( - self.blob_req_id.expect("blob request id"), - self.peer_id, - None, - ); - self - } - - fn empty_parent_blobs_response(mut self) -> Self { - self.rig.parent_lookup_blob_response( - self.parent_blob_req_id.expect("blob request id"), - self.peer_id, - None, - ); - self - } - - fn block_missing_components(mut self) -> Self { - self.rig.single_block_component_processed( - self.block_req_id.expect("block request id").lookup_id, - BlockProcessingResult::Ok(AvailabilityProcessingStatus::MissingComponents( - self.block.slot(), - self.block_root, - )), - ); - self.rig.expect_empty_network(); - self.rig.assert_single_lookups_count(1); - self - } - - fn blob_imported(mut self) -> Self { - self.rig.single_blob_component_processed( - self.blob_req_id.expect("blob request id").lookup_id, - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported(self.block_root)), - ); - 
self.rig.expect_empty_network(); - self.rig.assert_single_lookups_count(0); - self - } - - fn block_imported(mut self) -> Self { - // Missing blobs should be the request is not removed, the outstanding blobs request should - // mean we do not send a new request. - self.rig.single_block_component_processed( - self.block_req_id - .or(self.blob_req_id) - .expect("block request id") - .lookup_id, - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported(self.block_root)), - ); - self.rig.expect_empty_network(); - self.rig.assert_single_lookups_count(0); - self - } - - fn parent_block_imported(mut self) -> Self { - let parent_root = *self.parent_block_roots.first().unwrap(); - self.rig - .log(&format!("parent_block_imported {parent_root:?}")); - self.rig.parent_block_processed( - self.block_root, - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported(parent_root)), - ); - self.rig.expect_no_requests_for(parent_root); - self.rig.assert_parent_lookups_count(0); - self - } - - fn parent_block_missing_components(mut self) -> Self { - let parent_root = *self.parent_block_roots.first().unwrap(); - self.rig - .log(&format!("parent_block_missing_components {parent_root:?}")); - self.rig.parent_block_processed( - self.block_root, - BlockProcessingResult::Ok(AvailabilityProcessingStatus::MissingComponents( - Slot::new(0), - parent_root, - )), - ); - self.rig.expect_no_requests_for(parent_root); - self - } - - fn parent_blob_imported(mut self) -> Self { - let parent_root = *self.parent_block_roots.first().unwrap(); - self.rig - .log(&format!("parent_blob_imported {parent_root:?}")); - self.rig.parent_blob_processed( - self.block_root, - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported(parent_root)), - ); - - self.rig.expect_no_requests_for(parent_root); - self.rig.assert_parent_lookups_count(0); - self - } - - fn parent_block_unknown_parent(mut self) -> Self { - self.rig.log("parent_block_unknown_parent"); - let block = 
self.unknown_parent_block.take().unwrap(); - let max_len = self.rig.spec.max_blobs_per_block(block.epoch()) as usize; - // Now this block is the one we expect requests from - self.block = block.clone(); - let block = RpcBlock::new( - Some(block.canonical_root()), - block, - self.unknown_parent_blobs - .take() - .map(|vec| RuntimeVariableList::from_vec(vec, max_len)), - ) - .unwrap(); - self.rig.parent_block_processed( - self.block_root, - BlockProcessingResult::Err(BlockError::ParentUnknown { - parent_root: block.parent_root(), - }), - ); - assert_eq!(self.rig.active_parent_lookups_count(), 1); - self - } - - fn invalid_parent_processed(mut self) -> Self { - self.rig.parent_block_processed( - self.block_root, - BlockProcessingResult::Err(BlockError::BlockSlotLimitReached), - ); - assert_eq!(self.rig.active_parent_lookups_count(), 1); - self - } - - fn invalid_block_processed(mut self) -> Self { - self.rig.single_block_component_processed( - self.block_req_id.expect("block request id").lookup_id, - BlockProcessingResult::Err(BlockError::BlockSlotLimitReached), - ); - self.rig.assert_single_lookups_count(1); - self - } - - fn invalid_blob_processed(mut self) -> Self { - self.rig.log("invalid_blob_processed"); - self.rig.single_blob_component_processed( - self.blob_req_id.expect("blob request id").lookup_id, - BlockProcessingResult::Err(BlockError::AvailabilityCheck( - AvailabilityCheckError::InvalidBlobs(kzg::Error::KzgVerificationFailed), - )), - ); - self.rig.assert_single_lookups_count(1); - self - } - - fn missing_components_from_block_request(mut self) -> Self { - self.rig.single_block_component_processed( - self.block_req_id.expect("block request id").lookup_id, - BlockProcessingResult::Ok(AvailabilityProcessingStatus::MissingComponents( - self.slot, - self.block_root, - )), - ); - // Add block to da_checker so blobs request can continue - self.rig.insert_block_to_da_checker(self.block.clone()); - - self.rig.assert_single_lookups_count(1); - self - } - - fn 
complete_current_block_and_blobs_lookup(self) -> Self { - self.expect_block_request() - .block_response_and_expect_blob_request() - .blobs_response() - // TODO: Should send blobs for processing - .expect_block_process() - .block_imported() - } - - fn log(self, msg: &str) -> Self { - self.rig.log(msg); - self - } - - fn parent_block_then_empty_parent_blobs(self) -> Self { - self.log( - " Return empty blobs for parent, block errors with missing components, downscore", - ) - .parent_block_response() - .expect_parent_blobs_request() - .empty_parent_blobs_response() - .expect_penalty("NotEnoughResponsesReturned") - .log("Re-request parent blobs, succeed and import parent") - .expect_parent_blobs_request() - .parent_blob_response() - .expect_block_process() - .parent_block_missing_components() - // Insert new peer into child request before completing parent - .trigger_unknown_block_from_attestation() - .parent_blob_imported() - } - - fn expect_penalty(mut self, expect_penalty_msg: &'static str) -> Self { - self.rig.expect_penalty(self.peer_id, expect_penalty_msg); - self - } - fn expect_no_penalty(mut self) -> Self { - self.rig.expect_empty_network(); - self - } - fn expect_no_penalty_and_no_requests(mut self) -> Self { - self.rig.expect_empty_network(); - self - } - fn expect_block_request(mut self) -> Self { - let id = self - .rig - .expect_block_lookup_request(self.block.canonical_root()); - self.block_req_id = Some(id); - self - } - fn expect_blobs_request(mut self) -> Self { - let id = self - .rig - .expect_blob_lookup_request(self.block.canonical_root()); - self.blob_req_id = Some(id); - self - } - fn expect_parent_block_request(mut self) -> Self { - let id = self - .rig - .expect_block_parent_request(self.block.parent_root()); - self.parent_block_req_id = Some(id); - self - } - fn expect_parent_blobs_request(mut self) -> Self { - let id = self - .rig - .expect_blob_parent_request(self.block.parent_root()); - self.parent_blob_req_id = Some(id); - self - } - fn 
expect_no_blobs_request(mut self) -> Self { - self.rig.expect_empty_network(); - self - } - fn expect_no_block_request(mut self) -> Self { - self.rig.expect_empty_network(); - self - } - fn invalidate_blobs_too_few(mut self) -> Self { - self.blobs.pop().expect("blobs"); - self - } - fn expect_block_process(mut self) -> Self { - self.rig.expect_block_process(ResponseType::Block); - self - } - fn expect_no_active_lookups(self) -> Self { - self.rig.expect_no_active_lookups(); - self - } - fn search_parent_dup(mut self) -> Self { - self.rig - .trigger_unknown_parent_block(self.peer_id, self.block.clone()); - self - } - } - - #[test] - fn single_block_and_blob_lookup_block_returned_first_attestation() { - let Some(tester) = DenebTester::new(RequestTrigger::AttestationUnknownBlock) else { - return; - }; - tester - .block_response_and_expect_blob_request() - .blobs_response() - .block_missing_components() // blobs not yet imported - .blobs_response_was_valid() - .blob_imported(); // now blobs resolve as imported - } - - #[test] - fn single_block_response_then_empty_blob_response_attestation() { - let Some(tester) = DenebTester::new(RequestTrigger::AttestationUnknownBlock) else { - return; - }; - tester - .block_response_and_expect_blob_request() - .missing_components_from_block_request() - .empty_blobs_response() - .expect_penalty("NotEnoughResponsesReturned") - .expect_blobs_request() - .expect_no_block_request(); - } - - #[test] - fn single_invalid_block_response_then_blob_response_attestation() { - let Some(tester) = DenebTester::new(RequestTrigger::AttestationUnknownBlock) else { - return; - }; - tester - .block_response_triggering_process() - .invalid_block_processed() - .expect_penalty("lookup_block_processing_failure") - .expect_block_request() - .expect_no_blobs_request() - .blobs_response() - // blobs not sent for processing until the block is processed - .expect_no_penalty_and_no_requests(); - } - - #[test] - fn 
single_block_response_then_invalid_blob_response_attestation() { - let Some(tester) = DenebTester::new(RequestTrigger::AttestationUnknownBlock) else { - return; - }; - tester - .block_response_triggering_process() - .missing_components_from_block_request() - .blobs_response() - .invalid_blob_processed() - .expect_penalty("lookup_blobs_processing_failure") - .expect_blobs_request() - .expect_no_block_request(); - } - - #[test] - fn single_block_response_then_too_few_blobs_response_attestation() { - let Some(tester) = DenebTester::new(RequestTrigger::AttestationUnknownBlock) else { - return; - }; - tester - .block_response_triggering_process() - .missing_components_from_block_request() - .invalidate_blobs_too_few() - .blobs_response() - .expect_penalty("NotEnoughResponsesReturned") - .expect_blobs_request() - .expect_no_block_request(); - } - - // Test peer returning block that has unknown parent, and a new lookup is created - #[test] - fn parent_block_unknown_parent() { - let Some(tester) = DenebTester::new(RequestTrigger::GossipUnknownParentBlock(1)) else { - return; - }; - tester - .expect_empty_beacon_processor() - .parent_block_response_expect_blobs() - .parent_blob_response() - .expect_block_process() - .parent_block_unknown_parent() - .expect_parent_block_request() - .expect_empty_beacon_processor(); - } - - // Test peer returning invalid (processing) block, expect retry - #[test] - fn parent_block_invalid_parent() { - let Some(tester) = DenebTester::new(RequestTrigger::GossipUnknownParentBlock(1)) else { - return; - }; - tester - .parent_block_response_expect_blobs() - .parent_blob_response() - .expect_block_process() - .invalid_parent_processed() - .expect_penalty("lookup_block_processing_failure") - .expect_parent_block_request() - .expect_empty_beacon_processor(); - } - - // Tests that if a peer does not respond with a block, we downscore and retry the block only - #[test] - fn empty_block_is_retried() { - let Some(tester) = 
DenebTester::new(RequestTrigger::AttestationUnknownBlock) else { - return; - }; - tester - .empty_block_response() - .expect_penalty("NotEnoughResponsesReturned") - .expect_block_request() - .expect_no_blobs_request() - .block_response_and_expect_blob_request() - .blobs_response() - .block_imported() - .expect_no_active_lookups(); - } - - #[test] - fn parent_block_then_empty_parent_blobs() { - let Some(tester) = DenebTester::new(RequestTrigger::GossipUnknownParentBlock(1)) else { - return; - }; - tester - .parent_block_then_empty_parent_blobs() - .log("resolve original block trigger blobs request and import") - // Should not have block request, it is cached - .expect_blobs_request() - // TODO: Should send blobs for processing - .block_imported() - .expect_no_active_lookups(); - } - - #[test] - fn parent_blob_unknown_parent() { - let Some(tester) = DenebTester::new(RequestTrigger::GossipUnknownParentBlob(1)) else { - return; - }; - tester - .expect_empty_beacon_processor() - .parent_block_response_expect_blobs() - .parent_blob_response() - .expect_block_process() - .parent_block_unknown_parent() - .expect_parent_block_request() - .expect_empty_beacon_processor(); - } - - #[test] - fn parent_blob_invalid_parent() { - let Some(tester) = DenebTester::new(RequestTrigger::GossipUnknownParentBlob(1)) else { - return; - }; - tester - .expect_empty_beacon_processor() - .parent_block_response_expect_blobs() - .parent_blob_response() - .expect_block_process() - .invalid_parent_processed() - .expect_penalty("lookup_block_processing_failure") - .expect_parent_block_request() - // blobs are not sent until block is processed - .expect_empty_beacon_processor(); - } - - #[test] - fn parent_block_and_blob_lookup_parent_returned_first_blob_trigger() { - let Some(tester) = DenebTester::new(RequestTrigger::GossipUnknownParentBlob(1)) else { - return; - }; - tester - .parent_block_response() - .expect_parent_blobs_request() - .parent_blob_response() - .expect_block_process() - 
.trigger_unknown_block_from_attestation() - .parent_block_imported() - .complete_current_block_and_blobs_lookup() - .expect_no_active_lookups(); - } - - #[test] - fn parent_block_then_empty_parent_blobs_blob_trigger() { - let Some(tester) = DenebTester::new(RequestTrigger::GossipUnknownParentBlob(1)) else { - return; - }; - tester - .parent_block_then_empty_parent_blobs() - .log("resolve original block trigger blobs request and import") - .complete_current_block_and_blobs_lookup() - .expect_no_active_lookups(); - } - - #[test] - fn parent_blob_unknown_parent_chain() { - let Some(tester) = DenebTester::new(RequestTrigger::GossipUnknownParentBlob(2)) else { - return; - }; - tester - .expect_empty_beacon_processor() - .parent_block_response_expect_blobs() - .parent_blob_response() - .expect_no_penalty() - .expect_block_process() - .parent_block_unknown_parent() - .expect_parent_block_request() - .expect_empty_beacon_processor() - .parent_block_response() - .expect_parent_blobs_request() - .parent_blob_response() - .expect_no_penalty() - .expect_block_process(); - } - - #[test] - fn unknown_parent_block_dup() { - let Some(tester) = DenebTester::new(RequestTrigger::GossipUnknownParentBlock(1)) else { - return; - }; - tester - .search_parent_dup() - .expect_no_blobs_request() - .expect_no_block_request(); - } - - #[test] - fn unknown_parent_blob_dup() { - let Some(tester) = DenebTester::new(RequestTrigger::GossipUnknownParentBlob(1)) else { - return; - }; - tester - .search_parent_dup() - .expect_no_blobs_request() - .expect_no_block_request(); - } - - // This test no longer applies, we don't issue requests for child lookups - // Keep for after updating rules on fetching blocks only first - #[ignore] - #[test] - fn no_peer_penalty_when_rpc_response_already_known_from_gossip() { - let Some(mut r) = TestRig::test_setup_after_deneb_before_fulu() else { - return; - }; - let (block, blobs) = r.rand_block_and_blobs(NumBlobs::Number(2)); - let block_root = 
block.canonical_root(); - let blob_0 = blobs[0].clone(); - let blob_1 = blobs[1].clone(); - let peer_a = r.new_connected_peer(); - let peer_b = r.new_connected_peer(); - // Send unknown parent block lookup - r.trigger_unknown_parent_block(peer_a, block.into()); - // Expect network request for blobs - let id = r.expect_blob_lookup_request(block_root); - // Peer responses with blob 0 - r.single_lookup_blob_response(id, peer_a, Some(blob_0.into())); - // Blob 1 is received via gossip unknown parent blob from a different peer - r.trigger_unknown_parent_blob(peer_b, blob_1.clone()); - // Original peer sends blob 1 via RPC - r.single_lookup_blob_response(id, peer_a, Some(blob_1.into())); - // Assert no downscore event for original peer - r.expect_no_penalty_for(peer_a); - } -} diff --git a/beacon_node/network/src/sync/tests/mod.rs b/beacon_node/network/src/sync/tests/mod.rs index 1113d1e554a..e9c2e84e4cc 100644 --- a/beacon_node/network/src/sync/tests/mod.rs +++ b/beacon_node/network/src/sync/tests/mod.rs @@ -1,5 +1,4 @@ use crate::sync::manager::SyncManager; -use crate::sync::range_sync::RangeSyncType; use crate::sync::SyncMessage; use crate::NetworkMessage; use beacon_chain::builder::Witness; @@ -73,8 +72,7 @@ struct TestRig { fork_name: ForkName, spec: Arc, - // Cache of sent blocks for PeerDAS responses - sent_blocks_by_range: HashMap>>>, + // Cache for produced blocks to serve blocks_by_root: HashMap>>, } diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 880365d943d..1211036f227 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -1,12 +1,11 @@ use super::*; use crate::network_beacon_processor::ChainSegmentProcessId; use crate::status::ToStatusMessage; -use crate::sync::manager::SLOT_IMPORT_TOLERANCE; +use crate::sync::manager::{BlockProcessingResult, SLOT_IMPORT_TOLERANCE}; use crate::sync::network_context::{BlockComponentsByRootRequestStep, 
RangeRequestId}; -use crate::sync::range_sync::{BatchId, BatchState, RangeSyncType}; use crate::sync::tests::lookups::TestOptions; use crate::sync::BatchProcessResult; -use crate::sync::{ChainId, SyncMessage}; +use crate::sync::SyncMessage; use beacon_chain::data_column_verification::CustodyDataColumn; use beacon_chain::test_utils::{AttestationStrategy, BlockStrategy}; use beacon_chain::{ @@ -17,7 +16,7 @@ use beacon_processor::WorkType; use lighthouse_network::rpc::methods::{ BlobsByRootRequest, BlocksByRootRequest, DataColumnsByRootRequest, }; -use lighthouse_network::rpc::{RequestType, StatusMessage}; +use lighthouse_network::rpc::{RPCError, RequestType, RpcErrorResponse, StatusMessage}; use lighthouse_network::service::api_types::{ AppRequestId, BlobsByRootRequestId, BlocksByRootRequestId, BlocksByRootRequester, ComponentsByRootRequestId, DataColumnsByRootRequestId, HeaderLookupId, SyncRequestId, @@ -78,34 +77,41 @@ type DataColumnsByRootRequestData = (DataColumnsByRootRequestId, PeerId, DataCol /// _which_ request to complete. Picking the right request is critical for tests to pass, so this /// filter allows better expressivity on the criteria to identify the right request. 
#[derive(Default, Debug, Clone, Copy)] -struct RequestFilter { +pub struct RequestFilter { peer: Option, epoch: Option, column_index: Option, + header_requests_only: bool, } -const NO_FILTER: RequestFilter = RequestFilter { +pub const NO_FILTER: RequestFilter = RequestFilter { peer: None, epoch: None, column_index: None, + header_requests_only: false, }; impl RequestFilter { - fn peer(mut self, peer: PeerId) -> Self { + pub fn peer(mut self, peer: PeerId) -> Self { self.peer = Some(peer); self } - fn epoch(mut self, epoch: u64) -> Self { + pub fn epoch(mut self, epoch: u64) -> Self { self.epoch = Some(epoch); self } - fn column_index(mut self, index: u64) -> Self { + pub fn column_index(mut self, index: u64) -> Self { self.column_index = Some(index); self } + pub fn header_requests_only(mut self) -> Self { + self.header_requests_only = true; + self + } + fn blocks_by_root_requests( &self, ev: &NetworkMessage, @@ -115,7 +121,9 @@ impl RequestFilter { peer_id, request: RequestType::BlocksByRoot(req), app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRoot(id)), - } if self.matches_blocks_by_root(peer_id, req) => Some((*id, *peer_id, req.clone())), + } if self.matches_blocks_by_root(peer_id, req, id) => { + Some((*id, *peer_id, req.clone())) + } _ => None, } } @@ -136,11 +144,26 @@ impl RequestFilter { } } - fn matches_blocks_by_root(&self, peer: &PeerId, _req: &BlocksByRootRequest) -> bool { + fn matches_blocks_by_root( + &self, + peer: &PeerId, + _req: &BlocksByRootRequest, + id: &BlocksByRootRequestId, + ) -> bool { + if self.header_requests_only { + if !matches!(id.parent_request_id, BlocksByRootRequester::Header(_)) { + return false; + } + } + self.matches_peer(peer) } fn matches_data_columns_by_root(&self, peer: &PeerId, req: &DataColumnsByRootRequest) -> bool { + if self.header_requests_only { + return false; + } + if let Some(index) = self.column_index { if !req .data_column_ids @@ -173,30 +196,70 @@ impl RequestFilter { } } -fn filter() -> 
RequestFilter { +pub fn filter() -> RequestFilter { RequestFilter::default() } /// Instruct the testing rig how to complete requests for _by_range requests -#[derive(Debug, Clone, Copy)] -struct CompleteConfig { +pub struct CompleteConfig { block_count: usize, with_data: bool, custody_failure_at_index: Option, + rpc_error: Option, + empty_sampling_response_once: bool, + stop_at_block: Option, + return_wrong_blocks: bool, + return_no_blocks: bool, + process_error: bool, } impl CompleteConfig { - fn custody_failure_at_index(mut self, index: u64) -> Self { + pub fn custody_failure_at_index(mut self, index: u64) -> Self { self.custody_failure_at_index = Some(index); self } + + pub fn rpc_error(mut self, error: RPCError) -> Self { + self.rpc_error = Some(error); + self + } + + pub fn rpc_error_response(mut self, error: RpcErrorResponse) -> Self { + self.rpc_error(RPCError::ErrorResponse(error, "".to_owned())) + } + + pub fn empty_sampling_response_once(mut self) -> Self { + self.empty_sampling_response_once = true; + self + } + + pub fn stop_at_block(mut self, block: Hash256) -> Self { + self.stop_at_block = Some(block); + self + } + + pub fn return_wrong_blocks(mut self) -> Self { + self.return_wrong_blocks = true; + self + } + + pub fn return_no_blocks(mut self) -> Self { + self.return_no_blocks = true; + self + } } -fn complete() -> CompleteConfig { +pub fn complete() -> CompleteConfig { CompleteConfig { block_count: 1, with_data: true, custody_failure_at_index: None, + rpc_error: None, + empty_sampling_response_once: false, + stop_at_block: None, + return_wrong_blocks: false, + return_no_blocks: false, + process_error: false, } } @@ -279,53 +342,15 @@ impl TestRig { self.add_sync_peer(true, remote_info) } - fn assert_state(&mut self, state: RangeSyncType) { - assert_eq!( - self.sync_manager - .range_sync() - .state() - .expect("State is ok") - .expect("Range should be syncing, there are no chains") - .0, - state, - "not expected range sync state" - ); - } - fn 
get_sync_state(&mut self) -> SyncState { self.sync_manager.network().network_globals().sync_state() } - fn get_batch_states(&mut self) -> Vec<(ChainId, BatchId, &BatchState)> { - self.sync_manager.range_sync().batches_state() - } - fn assert_sync_state(&mut self, expected_state: SyncState) { let current_state = self.sync_manager.network().network_globals().sync_state(); assert_eq!(current_state, expected_state); } - fn assert_syncing_finalized(&mut self) { - self.assert_sync_state(SyncState::SyncingFinalized { - start_slot: Slot::new(0), - target_slot: Slot::new(0), - }); - } - - fn assert_no_chains_exist(&mut self) { - if let Some(chain) = self.sync_manager.range_sync().state().unwrap() { - panic!("There still exists a chain {chain:?}"); - } - } - - fn assert_no_failed_chains(&mut self) { - assert_eq!( - self.sync_manager.range_sync().failed_chains(), - Vec::::new(), - "Expected no failed chains" - ) - } - #[track_caller] fn expect_chain_segments(&mut self, count: usize) { for i in 0..count { @@ -336,36 +361,28 @@ impl TestRig { } } - fn expect_no_data_columns_by_range_requests(&mut self, request_filter: RequestFilter) { - let events = self - .filter_received_network_events(|ev| request_filter.data_columns_by_range_requests(ev)); - if !events.is_empty() { - panic!("Expected to not find data_columns_by_range requests {request_filter:?} by found {events:?}") - } - } - - fn expect_active_block_components_by_range_request_on_custody_step(&mut self) { + fn expect_active_block_components_requests_on_custody_step(&mut self) { let requests = self .sync_manager .network() - .active_block_components_by_range_requests(); + .active_block_components_requests(); if requests.is_empty() { - panic!("No active block_components_by_range requests"); + panic!("No active block components requests"); } for (id, step) in requests { if !matches!(step, BlockComponentsByRootRequestStep::CustodyRequest) { - panic!("block_components_by_range request {id} is not on CustodyRequest step: 
{step:?}"); + panic!("block components request {id} is not on CustodyRequest step: {step:?}"); } } } - fn expect_no_active_block_components_by_range_requests(&mut self) { + fn expect_no_active_block_components_requests(&mut self) { let requests = self .sync_manager .network() - .active_block_components_by_range_requests(); + .active_block_components_requests(); if !requests.is_empty() { - panic!("Still active block_components_by_range requests {requests:?}"); + panic!("Still active block components requests {requests:?}"); } } @@ -380,41 +397,6 @@ impl TestRig { } } - fn expect_all_batches_in_state) -> bool>( - &mut self, - predicate: F, - expected_state: &'static str, - ) { - let batches = self.get_batch_states(); - if batches.is_empty() { - panic!("no batches"); - } - for (chain_id, batch_id, state) in &batches { - if !predicate(state) { - panic!("batch {chain_id} {batch_id} not in state {expected_state}, {state}"); - } - } - } - - fn expect_all_batches_downloading(&mut self) { - self.expect_all_batches_in_state( - |state| matches!(state, BatchState::Downloading { .. }), - "Downloading", - ); - } - - fn expect_all_batches_processing_or_awaiting(&mut self) { - self.expect_all_batches_in_state( - |state| { - matches!( - state, - BatchState::Processing { .. } | BatchState::AwaitingProcessing { .. 
} - ) - }, - "Processing or AwaitingProcessing", - ); - } - fn update_execution_engine_state(&mut self, state: EngineState) { self.log(&format!("execution engine state updated: {state:?}")); self.sync_manager.update_execution_engine_state(state); @@ -433,8 +415,7 @@ impl TestRig { SignedBeaconBlock::from_block(block, Signature::empty()) } - async fn create_unimported_parent_chain(&mut self) -> (Hash256, Slot) { - let block_count = 8; + pub async fn create_unimported_parent_chain(&mut self, block_count: usize) -> (Hash256, Slot) { self.log(&format!( "Creating unimported chain of {block_count} blocks" )); @@ -488,14 +469,15 @@ impl TestRig { (parent_root, slot) } - fn last_sent_blocks_by_range( - &mut self, - id: ComponentsByRootRequestId, - ) -> Vec>> { - self.sent_blocks_by_range - .get(&id) - .cloned() - .unwrap_or_else(|| panic!("No blocks for ComponentsByRootRequestId {id}")) + fn send_rpc_error(&mut self, id: SyncRequestId, peer_id: PeerId, error: RPCError) { + self.log(&format!( + "Completing request {id:?} to {peer_id} with RPCError {error:?}" + )); + self.send_sync_message(SyncMessage::RpcError { + sync_request_id: id, + peer_id, + error, + }); } fn send_blocks_by_root_response( @@ -559,30 +541,41 @@ impl TestRig { fn complete_blocks_by_root_request( &mut self, request: BlocksByRootRequestData, - complete_config: CompleteConfig, - ) -> BlocksByRootRequester { - let (blocks_req_id, block_peer, blocks_req) = request; + config: &CompleteConfig, + ) { + let (req_id, peer, req) = request; + if let Some(error) = &config.rpc_error { + self.send_rpc_error(SyncRequestId::BlocksByRoot(req_id), peer, error.clone()); + return; + } + + if config.return_no_blocks { + self.send_blocks_by_root_response(req_id, peer, &[]); + return; + } - let blocks = blocks_req + let blocks = req .block_roots() .iter() .map(|block_root| { - self.blocks_by_root - .get(block_root) - .expect("Test consumer requested unknown block") - .clone() + if config.return_wrong_blocks { + 
Arc::new(self.rand_block()) + } else { + self.blocks_by_root + .get(block_root) + .expect("Test consumer requested unknown block") + .clone() + } }) .collect::>(); - self.send_blocks_by_root_response(blocks_req_id, block_peer, &blocks); - - blocks_req_id.parent_request_id + self.send_blocks_by_root_response(req_id, peer, &blocks); } fn complete_data_columns_by_root_request_range_sync( &mut self, (id, peer_id, req): DataColumnsByRootRequestData, - complete_config: CompleteConfig, + complete_config: &CompleteConfig, ) { // To reply with a valid DataColumnsByRange we need to construct // DataColumnsByRange for the block root that we requested the block peer, plus @@ -634,10 +627,14 @@ impl TestRig { .collect::>(); // Need to log here because I can't capture &mut self inside the columns iter - if triggered_custody_failure { - if let Some(index) = complete_config.custody_failure_at_index { + if let Some(target_index) = complete_config.custody_failure_at_index { + if req + .data_column_ids + .iter() + .any(|id| id.columns.iter().any(|index| *index == target_index)) + { self.log(&format!( - "Forced custody failure at request {id} for peer {peer_id} index {index:?}" + "Forced custody failure at request {id} for peer {peer_id} index {target_index:?}" )); } } @@ -645,16 +642,19 @@ impl TestRig { self.send_data_columns_by_root_response(id, peer_id, &data_columns); } - fn find_and_complete_data_by_range_request( - &mut self, - request_filter: RequestFilter, - complete_config: CompleteConfig, - ) { - let by_range_data_request_ids = self.find_data_by_range_request(request_filter); - self.complete_data_by_range_request(by_range_data_request_ids, complete_config); - } + fn complete_block_processing(&mut self, ids: Vec, config: &CompleteConfig) { + if config.process_error { + for id in &ids { + self.send_sync_message(SyncMessage::BatchProcessed { + sync_type: ChainSegmentProcessId::ForwardSync(*id), + result: BatchProcessResult::Failure { + peer_action: None, + error: "test 
error".to_owned(), + }, + }); + } + } - fn complete_block_processing(&mut self, ids: Vec) { // Sort ids first as we need to process blocks in order of ancestors. This only works if the // test does not send blocks of two parallel chains at once. let mut blocks = ids @@ -695,53 +695,36 @@ impl TestRig { } self.send_sync_message(SyncMessage::BatchProcessed { - sync_type: ChainSegmentProcessId::RangeBatchId(id), - result: BatchProcessResult::Success { - sent_blocks: 1, - imported_blocks: 1, - }, + sync_type: ChainSegmentProcessId::ForwardSync(id), + result: BatchProcessResult::Success, }); } } - fn progress_until_no_events( + pub fn progress_until_no_events( &mut self, request_filter: RequestFilter, complete_config: CompleteConfig, ) { loop { - if let Ok(request) = - self.pop_received_network_event(|ev| request_filter.blocks_by_range_requests(ev)) - { - self.complete_blocks_by_range_request(request, complete_config); - continue; - } - if let Ok(request) = self.pop_received_network_event(|ev| request_filter.blocks_by_root_requests(ev)) { - self.complete_blocks_by_root_request(request, complete_config); - continue; - } - - if let Ok(request) = self - .pop_received_network_event(|ev| request_filter.data_columns_by_range_requests(ev)) - { - self.complete_data_columns_by_range_request(request, complete_config); + self.complete_blocks_by_root_request(request, &complete_config); continue; } if let Ok(request) = self .pop_received_network_event(|ev| request_filter.data_columns_by_root_requests(ev)) { - self.complete_data_columns_by_root_request_range_sync(request, complete_config); + self.complete_data_columns_by_root_request_range_sync(request, &complete_config); continue; } // TODO(tree-sync): find a way to get this info from the beacon processor events - let ids = self.sync_manager.block_tree().get_processing_ids(); + let ids = self.sync_manager.forward_sync().get_processing_ids(); if !ids.is_empty() { - self.complete_block_processing(ids); + 
self.complete_block_processing(ids, &complete_config); continue; } @@ -752,61 +735,6 @@ impl TestRig { } } - fn find_and_complete_processing_chain_segment(&mut self, id: ChainSegmentProcessId) { - self.pop_received_processor_event(|ev| { - (ev.work_type() == WorkType::ChainSegment).then_some(()) - }) - .unwrap_or_else(|e| panic!("Expected chain segment work event: {e}")); - - self.log(&format!( - "Completing ChainSegment processing work {id:?} with success" - )); - self.send_sync_message(SyncMessage::BatchProcessed { - sync_type: id, - result: crate::sync::BatchProcessResult::Success { - sent_blocks: 8, - imported_blocks: 8, - }, - }); - } - - fn complete_and_process_range_sync_until( - &mut self, - last_epoch: u64, - request_filter: RequestFilter, - complete_config: CompleteConfig, - ) { - for epoch in 0..last_epoch { - // Note: In this test we can't predict the block peer - let id = self.find_and_complete_block_components_by_range_request( - request_filter.epoch(epoch), - complete_config, - ); - if let RangeRequestId::RangeSync { .. } = id { - todo!(); - } else { - panic!("unexpected RangeRequestId {id}"); - } - - let id = match id { - RangeRequestId::RangeSync(id) => { - todo!(); - } - RangeRequestId::BackfillSync { batch_id } => { - ChainSegmentProcessId::BackSyncBatchId(batch_id) - } - }; - - self.find_and_complete_processing_chain_segment(id); - if epoch < last_epoch - 1 { - self.assert_state(RangeSyncType::Finalized); - } else { - self.assert_no_chains_exist(); - self.assert_no_failed_chains(); - } - } - } - async fn create_canonical_block(&mut self) -> (SignedBeaconBlock, Option>) { self.harness.advance_slot(); @@ -889,127 +817,10 @@ fn build_rpc_block( } } -#[test] -fn head_chain_removed_while_finalized_syncing() { - // NOTE: this is a regression test. 
- // Added in PR https://github.com/sigp/lighthouse/pull/2821 - let mut rig = TestRig::test_setup(); - - // Get a peer with an advanced head - let head_peer = rig.add_head_peer(); - rig.assert_state(RangeSyncType::Head); - - // Sync should have requested a batch, grab the request. - let _ = rig.pop_blocks_by_range_request(filter().peer(head_peer)); - - // Now get a peer with an advanced finalized epoch. - let finalized_peer = rig.add_finalized_peer(); - rig.assert_state(RangeSyncType::Finalized); - - // Sync should have requested a batch, grab the request - let _ = rig.pop_blocks_by_range_request(filter().peer(finalized_peer)); - - // Fail the head chain by disconnecting the peer. - rig.peer_disconnected(head_peer); - rig.assert_state(RangeSyncType::Finalized); -} - -#[tokio::test] -async fn state_update_while_purging() { - // NOTE: this is a regression test. - // Added in PR https://github.com/sigp/lighthouse/pull/2827 - let mut rig = TestRig::test_setup(); - - // Create blocks on a separate harness - let mut rig_2 = TestRig::test_setup(); - // Need to create blocks that can be inserted into the fork-choice and fit the "known - // conditions" below. - let head_peer_block = rig_2.create_canonical_block().await; - let head_peer_root = head_peer_block.0.canonical_root(); - let finalized_peer_block = rig_2.create_canonical_block().await; - let finalized_peer_root = finalized_peer_block.0.canonical_root(); - - // Get a peer with an advanced head - let head_peer = rig.add_head_peer_with_root(head_peer_root); - rig.assert_state(RangeSyncType::Head); - - // Sync should have requested a batch, grab the request. - let _ = rig.pop_blocks_by_range_request(filter().peer(head_peer)); - - // Now get a peer with an advanced finalized epoch. 
- let finalized_peer = rig.add_finalized_peer_with_root(finalized_peer_root); - rig.assert_state(RangeSyncType::Finalized); - - // Sync should have requested a batch, grab the request - let _ = rig.pop_blocks_by_range_request(filter().peer(finalized_peer)); - - // Now the chain knows both chains target roots. - rig.remember_block(head_peer_block).await; - rig.remember_block(finalized_peer_block).await; - - // Add an additional peer to the second chain to make range update it's status - rig.add_finalized_peer(); -} - -#[test] -fn pause_and_resume_on_ee_offline() { - let mut rig = TestRig::test_setup(); - - // add some peers - let peer1 = rig.add_head_peer(); - // make the ee offline - rig.update_execution_engine_state(EngineState::Offline); - // send the response to the request - rig.find_and_complete_block_components_by_range_request( - filter().peer(peer1).epoch(0), - complete(), - ); - // the beacon processor shouldn't have received any work - rig.expect_empty_processor(); - - // while the ee is offline, more peers might arrive. Add a new finalized peer. - let _peer2 = rig.add_finalized_peer(); - - // send the response to the request - // Don't filter requests and the columns requests may be sent to peer1 or peer2 - // We need to filter by epoch, because the previous batch eagerly sent requests for the next - // epoch for the other batch. So we can either filter by epoch of by sync type. - rig.find_and_complete_block_components_by_range_request(filter().epoch(0), complete()); - // the beacon processor shouldn't have received any work - rig.expect_empty_processor(); - // make the beacon processor available again. - // update_execution_engine_state implicitly calls resume - // now resume range, we should have two processing requests in the beacon processor. 
- rig.update_execution_engine_state(EngineState::Online); - - // The head chain and finalized chain (2) should be in the processing queue - rig.expect_chain_segments(2); -} - /// To attempt to finalize the peer's status finalized checkpoint we synced to its finalized epoch + /// 2 epochs + 1 slot. const EXTRA_SYNCED_EPOCHS: u64 = 2 + 1; -#[test] -fn finalized_sync_enough_global_custody_peers_few_chain_peers() { - // Run for all forks - let mut r = TestRig::test_setup(); - // This test creates enough global custody peers to satisfy column queries but only adds few - // peers to the chain - r.new_connected_peers_for_peerdas(); - - let advanced_epochs: u64 = 2; - let remote_info = r.finalized_remote_info_advanced_by(advanced_epochs.into()); - - // Current priorization only sends batches to idle peers, so we need enough peers for each batch - // TODO: Test this with a single peer in the chain, it should still work - r.add_sync_peer(false, remote_info); - r.assert_state(RangeSyncType::Finalized); - - let last_epoch = advanced_epochs + EXTRA_SYNCED_EPOCHS; - r.complete_and_process_range_sync_until(last_epoch, filter(), complete()); -} - // Same test with different types of peers: // - 100 peers // - 1 supernode @@ -1042,7 +853,6 @@ fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { // Unikely that the single peer we added has enough columns for us. Tests are determinstic and // this error should never be hit r.add_connected_sync_peer_not_supernode(remote_info.clone()); - r.assert_syncing_finalized(); // The SyncingChain has a single peer, so it can issue blocks_by_range requests. However, it // doesn't have enough peers to cover all columns @@ -1052,7 +862,7 @@ fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { // Here we have a batch with partially completed block_components_by_range requests. The batch // should not have failed, we are still syncing, and there are no downscoring events. 
r.expect_no_penalty_for_anyone(); - r.expect_active_block_components_by_range_request_on_custody_step(); + r.expect_active_block_components_requests_on_custody_step(); // Generate enough peers and supernodes to cover all custody columns r.add_sync_peers(config.peers, remote_info.clone()); @@ -1062,7 +872,7 @@ fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { r.progress_until_no_events(NO_FILTER, complete()); r.expect_no_active_rpc_requests(); - r.expect_no_active_block_components_by_range_requests(); + r.expect_no_active_block_components_requests(); // TOOD(das): For now this tests don't complete sync. We can't track beacon processor Work // events from here easily. What we pop from the beacon processor queue is an opaque closure // wihtout any information. We don't know what batch it is for. @@ -1081,7 +891,6 @@ fn finalized_sync_single_custody_peer_failure() { let column_index_to_fail = r.our_custody_indices().first().copied().unwrap(); r.add_sync_peer(true, remote_info.clone()); - r.assert_state(RangeSyncType::Finalized); // Progress all blocks_by_range and columns_by_range requests but respond empty for a single // column index @@ -1095,8 +904,7 @@ fn finalized_sync_single_custody_peer_failure() { // another request yet. 
r.expect_no_active_rpc_requests(); // Ensure that the block components by range request have not failed - r.expect_active_block_components_by_range_request_on_custody_step(); - r.expect_all_batches_downloading(); + r.expect_active_block_components_requests_on_custody_step(); // After adding a new peer we will try to fetch from it r.add_sync_peer(true, remote_info.clone()); @@ -1108,73 +916,13 @@ fn finalized_sync_single_custody_peer_failure() { ); r.expect_no_active_rpc_requests(); - r.expect_no_active_block_components_by_range_requests(); - r.expect_all_batches_processing_or_awaiting(); -} - -#[test] -fn finalized_sync_permanent_custody_peer_failure() { - let mut r = TestRig::test_setup_with_options(TestOptions { - is_supernode: false, - // The default buffer size is 5, but we want to manually complete only the batch for epoch - // 0. By setting this buffer to 1 sync will create a single batch until it completes. We can - // do better assertions of state assuming there's only one batch and logs are cleaner. - batch_buffer_size: 1, - }); - // Only run post-PeerDAS - if !r.fork_name.fulu_enabled() { - return; - } - - let advanced_epochs: u64 = 2; - let remote_info = r.finalized_remote_info_advanced_by(advanced_epochs.into()); - let column_index_to_fail = r.our_custody_indices().first().copied().unwrap(); - const PEERS_IN_BATCH: usize = 4; - - for _ in 0..PEERS_IN_BATCH { - r.add_connected_sync_random_peer(remote_info.clone()); - } - r.assert_state(RangeSyncType::Finalized); - - // Some peer had a costudy failure at `column_index` so sync should do a single extra request - // for that index and epoch. 
- r.find_and_complete_block_components_by_range_request( - filter().epoch(0), - complete().custody_failure_at_index(column_index_to_fail), - ); - - let mut requested_peers = HashSet::new(); - - for i in 0..PEERS_IN_BATCH - 1 { - r.log(&format!("Loop {i} of custody failure round")); - - // Some peer had a costudy failure at `column_index` so sync should do a single extra request - // for that index and epoch. We want to make sure that the request goes to different peer - // than the attempted before. - let reqs = - r.find_data_by_range_request(filter().epoch(0).column_index(column_index_to_fail)); - let req_peer = reqs.peer(); - if requested_peers.contains(&req_peer) { - panic!("Re-requested the same peer {req_peer} again after a custody failure"); - } - requested_peers.insert(req_peer); - - // Find the requests first to assert that this is the only request that exists - r.expect_no_data_columns_by_range_requests(filter().epoch(0)); - r.complete_data_by_range_request( - reqs, - complete().custody_failure_at_index(column_index_to_fail), - ); - } - - // custody_by_range request is still active waiting for a new peer to connect - r.expect_active_block_components_by_range_request_on_custody_step(); + r.expect_no_active_block_components_requests(); } #[tokio::test] async fn tree_sync_happy_path() { let mut r = TestRig::test_setup(); - let (head_root, head_slot) = r.create_unimported_parent_chain().await; + let (head_root, head_slot) = r.create_unimported_parent_chain(8).await; let remote_info = SyncInfo { finalized_epoch: Epoch::new(0), finalized_root: Hash256::ZERO, diff --git a/common/eth2/src/lighthouse/sync_state.rs b/common/eth2/src/lighthouse/sync_state.rs index 793070432a6..b6677e5f636 100644 --- a/common/eth2/src/lighthouse/sync_state.rs +++ b/common/eth2/src/lighthouse/sync_state.rs @@ -43,10 +43,7 @@ impl PartialEq for SyncState { | (SyncState::Synced, SyncState::Synced) | (SyncState::Stalled, SyncState::Stalled) | (SyncState::SyncTransition, 
SyncState::SyncTransition) - | ( - SyncState::BackFillSyncing { .. }, - SyncState::BackFillSyncing { .. } - ) + | (SyncState::BackFillSyncing, SyncState::BackFillSyncing) ) } } @@ -58,7 +55,7 @@ impl SyncState { SyncState::Syncing { .. } => true, SyncState::SyncTransition => true, // Backfill doesn't effect any logic, we consider this state, not syncing. - SyncState::BackFillSyncing { .. } => false, + SyncState::BackFillSyncing => false, SyncState::Synced => false, SyncState::Stalled => false, } @@ -68,7 +65,7 @@ impl SyncState { /// /// NOTE: We consider the node synced if it is fetching old historical blocks. pub fn is_synced(&self) -> bool { - matches!(self, SyncState::Synced | SyncState::BackFillSyncing { .. }) + matches!(self, SyncState::Synced | SyncState::BackFillSyncing) } /// Returns true if the node is *stalled*, i.e. has no synced peers. @@ -87,7 +84,7 @@ impl std::fmt::Display for SyncState { SyncState::Synced => write!(f, "Synced"), SyncState::Stalled => write!(f, "Stalled"), SyncState::SyncTransition => write!(f, "Evaluating known peers"), - SyncState::BackFillSyncing { .. 
} => write!(f, "Syncing Historical Blocks"), + SyncState::BackFillSyncing => write!(f, "Syncing Historical Blocks"), } } } From 3af217a5639532eb300d6ed3b3ce94605cb7c3f1 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sun, 22 Jun 2025 22:03:12 +0200 Subject: [PATCH 37/66] base and fulu tests pass --- 0w | 956 ++++++++++++++++++ Cargo.lock | 1 + beacon_node/beacon_chain/src/test_utils.rs | 10 +- .../src/service/api_types.rs | 14 +- beacon_node/network/Cargo.toml | 1 + .../network/src/sync/backfill_sync/mod.rs | 15 +- beacon_node/network/src/sync/forward_sync.rs | 66 +- beacon_node/network/src/sync/manager.rs | 21 +- .../network/src/sync/network_context.rs | 2 + .../sync/network_context/custody_by_root.rs | 51 +- .../sync/network_context/download_request.rs | 41 +- beacon_node/network/src/sync/sync_block.rs | 7 +- beacon_node/network/src/sync/tests/lookups.rs | 372 ++++--- beacon_node/network/src/sync/tests/range.rs | 106 +- 14 files changed, 1339 insertions(+), 324 deletions(-) create mode 100644 0w diff --git a/0w b/0w new file mode 100644 index 00000000000..abc21c03d37 --- /dev/null +++ b/0w @@ -0,0 +1,956 @@ +use super::*; +use crate::network_beacon_processor::ChainSegmentProcessId; +use crate::status::ToStatusMessage; +use crate::sync::manager::{BlockProcessingResult, SLOT_IMPORT_TOLERANCE}; +use crate::sync::network_context::{BlockComponentsByRootRequestStep, RangeRequestId}; +use crate::sync::tests::lookups::TestOptions; +use crate::sync::BatchProcessResult; +use crate::sync::SyncMessage; +use beacon_chain::data_column_verification::CustodyDataColumn; +use beacon_chain::test_utils::{AttestationStrategy, BlockStrategy}; +use beacon_chain::{ + block_verification_types::RpcBlock, EngineState, NotifyExecutionLayer, + PayloadVerificationStatus, +}; +use beacon_processor::WorkType; +use lighthouse_network::rpc::methods::{ + BlobsByRootRequest, BlocksByRootRequest, DataColumnsByRootRequest, +}; +use 
lighthouse_network::rpc::{RPCError, RequestType, RpcErrorResponse, StatusMessage}; +use lighthouse_network::service::api_types::{ + AppRequestId, BlobsByRootRequestId, BlocksByRootRequestId, BlocksByRootRequester, + ComponentsByRootRequestId, DataColumnsByRootRequestId, HeaderLookupId, SyncRequestId, +}; +use lighthouse_network::types::SyncState; +use lighthouse_network::{PeerId, SyncInfo}; +use std::collections::HashSet; +use std::time::Duration; +use types::{ + BeaconBlock, BlobSidecarList, BlockImportSource, ColumnIndex, DataColumnSidecar, Epoch, + EthSpec, Hash256, KzgCommitment, MinimalEthSpec as E, Signature, SignedBeaconBlock, + SignedBeaconBlockHash, Slot, VariableList, +}; + +const D: Duration = Duration::new(0, 0); + +pub(crate) enum DataSidecars { + Blobs(BlobSidecarList), + DataColumns(Vec>), +} + +enum ByRootDataRequestIds { + PreDeneb, + PrePeerDAS(BlobsByRootRequestId, PeerId, BlobsByRootRequest), + PostPeerDAS(Vec<(DataColumnsByRootRequestId, PeerId, DataColumnsByRootRequest)>), +} + +impl ByRootDataRequestIds { + /// If there's a single active request, returns its peer, else panics + fn peer(&self) -> PeerId { + match self { + Self::PreDeneb => panic!("no requests PreDeneb"), + Self::PrePeerDAS(_, peer, _) => *peer, + Self::PostPeerDAS(reqs) => { + if reqs.len() != 1 { + panic!("Should have 1 PostPeerDAS request"); + } + reqs.first().expect("no PostPeerDAS requests").1 + } + } + } +} + +struct Config { + peers: PeersConfig, +} + +type BlocksByRootRequestData = (BlocksByRootRequestId, PeerId, BlocksByRootRequest); + +type DataColumnsByRootRequestData = (DataColumnsByRootRequestId, PeerId, DataColumnsByRootRequest); + +/// Sync tests are usually written in the form: +/// - Do some action +/// - Expect a request to be sent +/// - Complete the above request +/// +/// To make writting tests succint, the machinery in this testing rig automatically identifies +/// _which_ request to complete. 
Picking the right request is critical for tests to pass, so this +/// filter allows better expressivity on the criteria to identify the right request. +#[derive(Default, Debug, Clone, Copy)] +pub struct RequestFilter { + peer: Option, + epoch: Option, + block_root: Option, + column_index: Option, + header_requests_only: bool, +} + +pub const NO_FILTER: RequestFilter = RequestFilter { + peer: None, + epoch: None, + block_root: None, + column_index: None, + header_requests_only: false, +}; + +impl RequestFilter { + pub fn peer(mut self, peer: PeerId) -> Self { + self.peer = Some(peer); + self + } + + pub fn epoch(mut self, epoch: u64) -> Self { + self.epoch = Some(epoch); + self + } + + pub fn block_root(mut self, block_root: Hash256) -> Self { + self.block_root = Some(block_root); + self + } + + pub fn column_index(mut self, index: u64) -> Self { + self.column_index = Some(index); + self + } + + pub fn header_requests_only(mut self) -> Self { + self.header_requests_only = true; + self + } + + fn blocks_by_root_requests( + &self, + ev: &NetworkMessage, + ) -> Option { + match ev { + NetworkMessage::SendRequest { + peer_id, + request: RequestType::BlocksByRoot(req), + app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRoot(id)), + } if self.matches_blocks_by_root(peer_id, req, id) => { + Some((*id, *peer_id, req.clone())) + } + _ => None, + } + } + + fn data_columns_by_root_requests( + &self, + ev: &NetworkMessage, + ) -> Option { + match ev { + NetworkMessage::SendRequest { + peer_id, + request: RequestType::DataColumnsByRoot(req), + app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRoot(id)), + } if self.matches_data_columns_by_root(peer_id, req) => { + Some((*id, *peer_id, req.clone())) + } + _ => None, + } + } + + fn matches_blocks_by_root( + &self, + peer: &PeerId, + req: &BlocksByRootRequest, + id: &BlocksByRootRequestId, + ) -> bool { + if self.header_requests_only { + if !matches!(id.parent_request_id, BlocksByRootRequester::Header(_)) { + 
return false; + } + } + + if let Some(block_root) = self.block_root { + if !req.block_roots().iter().any(|b| *b == block_root) { + return false; + } + } + + self.matches_peer(peer) + } + + fn matches_data_columns_by_root(&self, peer: &PeerId, req: &DataColumnsByRootRequest) -> bool { + if self.header_requests_only { + return false; + } + + if let Some(index) = self.column_index { + if !req + .data_column_ids + .iter() + .any(|id| id.columns.iter().any(|i| *i == index)) + { + return false; + } + } + self.matches_peer(peer) + } + + fn matches_common(&self, peer: &PeerId, start_slot: u64) -> bool { + if let Some(expected_epoch) = self.epoch { + let epoch = Slot::new(start_slot).epoch(E::slots_per_epoch()).as_u64(); + if epoch != expected_epoch { + return false; + } + } + self.matches_peer(peer) + } + + fn matches_peer(&self, peer: &PeerId) -> bool { + if let Some(expected_peer) = self.peer { + if *peer != expected_peer { + return false; + } + } + true + } +} + +pub fn filter() -> RequestFilter { + RequestFilter::default() +} + +/// Instruct the testing rig how to complete requests for _by_range requests +pub struct CompleteConfig { + block_count: usize, + with_data: bool, + custody_failure_at_index: Option, + rpc_error: Option, + empty_sampling_response_once: bool, + stop_at_block: Option, + return_wrong_blocks: bool, + return_no_blocks_n_times: usize, + process_error: bool, +} + +impl CompleteConfig { + pub fn custody_failure_at_index(mut self, index: u64) -> Self { + self.custody_failure_at_index = Some(index); + self + } + + pub fn rpc_error(mut self, error: RPCError) -> Self { + self.rpc_error = Some(error); + self + } + + pub fn rpc_error_response(mut self, error: RpcErrorResponse) -> Self { + self.rpc_error(RPCError::ErrorResponse(error, "".to_owned())) + } + + pub fn empty_sampling_response_once(mut self) -> Self { + self.empty_sampling_response_once = true; + self + } + + pub fn stop_at_block(mut self, block: Hash256) -> Self { + self.stop_at_block = 
Some(block); + self + } + + pub fn return_wrong_blocks(mut self) -> Self { + self.return_wrong_blocks = true; + self + } + + pub fn return_no_blocks(mut self) -> Self { + self.return_no_blocks_n_times(usize::MAX) + } + + pub fn return_no_blocks_n_times(mut self, n_times: usize) -> Self { + self.return_no_blocks_n_times = n_times; + self + } +} + +pub fn complete() -> CompleteConfig { + CompleteConfig { + block_count: 1, + with_data: true, + custody_failure_at_index: None, + rpc_error: None, + empty_sampling_response_once: false, + stop_at_block: None, + return_wrong_blocks: false, + return_no_blocks_n_times: 0, + process_error: false, + } +} + +impl TestRig { + fn our_custody_indices(&self) -> Vec { + self.network_globals + .sampling_columns() + .iter() + .copied() + .collect() + } + + /// Produce a head peer with an advanced head + fn add_head_peer(&mut self) -> PeerId { + self.add_head_peer_with_root(Hash256::random()) + } + + /// Produce a head peer with an advanced head + fn add_head_peer_with_root(&mut self, head_root: Hash256) -> PeerId { + let local_info = self.local_info(); + self.add_connected_sync_random_peer(SyncInfo { + head_root, + head_slot: local_info.head_slot + 1 + Slot::new(SLOT_IMPORT_TOLERANCE as u64), + ..local_info + }) + } + + // Produce a finalized peer with an advanced finalized epoch + fn add_finalized_peer(&mut self) -> PeerId { + self.add_finalized_peer_with_root(Hash256::random()) + } + + // Produce a finalized peer with an advanced finalized epoch + fn add_finalized_peer_with_root(&mut self, finalized_root: Hash256) -> PeerId { + let local_info = self.local_info(); + let finalized_epoch = local_info.finalized_epoch + 2; + self.add_connected_sync_random_peer(SyncInfo { + finalized_epoch, + finalized_root, + head_slot: finalized_epoch.start_slot(E::slots_per_epoch()), + head_root: Hash256::random(), + }) + } + + fn finalized_remote_info_advanced_by(&self, advanced_epochs: Epoch) -> SyncInfo { + let local_info = self.local_info(); + let 
finalized_epoch = local_info.finalized_epoch + advanced_epochs; + SyncInfo { + finalized_epoch, + finalized_root: Hash256::random(), + head_slot: finalized_epoch.start_slot(E::slots_per_epoch()), + head_root: Hash256::random(), + } + } + + fn local_info(&self) -> SyncInfo { + let StatusMessage { + fork_digest: _, + finalized_root, + finalized_epoch, + head_root, + head_slot, + } = self.harness.chain.status_message(); + SyncInfo { + head_slot, + head_root, + finalized_epoch, + finalized_root, + } + } + + fn add_connected_sync_peer_not_supernode(&mut self, remote_info: SyncInfo) -> PeerId { + self.add_sync_peer(false, remote_info) + } + + fn add_connected_sync_random_peer(&mut self, remote_info: SyncInfo) -> PeerId { + // Create valid peer known to network globals + // TODO(fulu): Using supernode peers to ensure we have peer across all column + // subnets for syncing. Should add tests connecting to full node peers. + self.add_sync_peer(true, remote_info) + } + + fn get_sync_state(&mut self) -> SyncState { + self.sync_manager.network().network_globals().sync_state() + } + + fn assert_sync_state(&mut self, expected_state: SyncState) { + let current_state = self.sync_manager.network().network_globals().sync_state(); + assert_eq!(current_state, expected_state); + } + + #[track_caller] + fn expect_chain_segments(&mut self, count: usize) { + for i in 0..count { + self.pop_received_processor_event(|ev| { + (ev.work_type() == beacon_processor::WorkType::ChainSegment).then_some(()) + }) + .unwrap_or_else(|e| panic!("Expect ChainSegment work event count {i}: {e:?}")); + } + } + + fn expect_active_block_components_requests_on_custody_step(&mut self) { + let requests = self + .sync_manager + .network() + .active_block_components_requests(); + if requests.is_empty() { + panic!("No active block components requests"); + } + for (id, step) in requests { + if !matches!(step, BlockComponentsByRootRequestStep::CustodyRequest) { + panic!("block components request {id} is not on 
CustodyRequest step: {step:?}"); + } + } + } + + fn expect_no_active_block_components_requests(&mut self) { + let requests = self + .sync_manager + .network() + .active_block_components_requests(); + if !requests.is_empty() { + panic!("Still active block components requests {requests:?}"); + } + } + + fn expect_no_active_rpc_requests(&mut self) { + let requests = self + .sync_manager + .network() + .active_requests() + .collect::>(); + if !requests.is_empty() { + panic!("There are still active RPC requests {requests:?}"); + } + } + + fn update_execution_engine_state(&mut self, state: EngineState) { + self.log(&format!("execution engine state updated: {state:?}")); + self.sync_manager.update_execution_engine_state(state); + } + + fn zero_block_at_slot(&mut self, slot: Slot, with_data: bool) -> SignedBeaconBlock { + let mut block = BeaconBlock::empty(&self.spec); + if with_data { + if let Ok(blob_kzg_commitments) = block.body_mut().blob_kzg_commitments_mut() { + blob_kzg_commitments + .push(KzgCommitment([0; 48])) + .expect("pushed to empty kzg commitments"); + } + } + *block.slot_mut() = slot; + SignedBeaconBlock::from_block(block, Signature::empty()) + } + + pub async fn create_unimported_parent_chain(&mut self, block_count: usize) -> (Hash256, Slot) { + self.log(&format!( + "Creating unimported chain of {block_count} blocks" + )); + + let mut r = TestRig::test_setup(); + + r.harness.advance_slot(); + let head_root = r + .harness + .extend_chain( + block_count, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + let store = &r.harness.chain.store; + let head_block = store.get_full_block(&head_root).unwrap().unwrap(); + + let mut target_block_root = head_root; + while let Some(block) = store.get_full_block(&target_block_root).unwrap() { + self.log(&format!( + "Adding block {target_block_root:?} slot {} to known blocks", + block.slot() + )); + let parent_root = block.parent_root(); + self.blocks_by_root.insert(target_block_root, 
block.into()); + if parent_root == Hash256::ZERO { + break; + } + target_block_root = parent_root; + } + + (head_root, head_block.slot()) + } + + fn create_not_rooted_parent_chain(&mut self) -> (Hash256, Slot) { + let current_head = self.harness.chain.head(); + let mut parent_root = current_head.head_block_root(); + let mut slot = current_head.head_slot(); + for _ in 0..64 { + let mut block = self.zero_block_at_slot(slot, true); + *block.message_mut().parent_root_mut() = parent_root; + *block.message_mut().slot_mut() = slot; + let block_root = block.canonical_root(); + self.blocks_by_root.insert(block_root, block.into()); + + parent_root = block_root; + slot = slot + Slot::new(1); + } + (parent_root, slot) + } + + fn send_rpc_error(&mut self, id: SyncRequestId, peer_id: PeerId, error: RPCError) { + self.log(&format!( + "Completing request {id:?} to {peer_id} with RPCError {error:?}" + )); + self.send_sync_message(SyncMessage::RpcError { + sync_request_id: id, + peer_id, + error, + }); + } + + fn send_blocks_by_root_response( + &mut self, + req_id: BlocksByRootRequestId, + peer_id: PeerId, + blocks: &[Arc>], + ) { + let slots = blocks.iter().map(|block| block.slot()).collect::>(); + self.log(&format!( + "Completing BlocksByRoot request {req_id} to {peer_id} with blocks {slots:?}" + )); + + for block in blocks { + self.send_sync_message(SyncMessage::RpcBlock { + sync_request_id: SyncRequestId::BlocksByRoot(req_id), + peer_id, + beacon_block: Some(block.clone()), + seen_timestamp: D, + }); + } + self.send_sync_message(SyncMessage::RpcBlock { + sync_request_id: SyncRequestId::BlocksByRoot(req_id), + peer_id, + beacon_block: None, + seen_timestamp: D, + }); + } + + fn send_data_columns_by_root_response( + &mut self, + id: DataColumnsByRootRequestId, + peer_id: PeerId, + data_columns: &[Arc>], + ) { + let mut ids = data_columns + .iter() + .map(|d| (d.slot().as_u64(), d.index)) + .collect::>(); + ids.sort_unstable(); + self.log(&format!( + "Completing DataColumnsByRoot 
request {id} to {peer_id} with data_columns {ids:?}" + )); + + for data_column in data_columns { + self.send_sync_message(SyncMessage::RpcDataColumn { + sync_request_id: SyncRequestId::DataColumnsByRoot(id), + peer_id, + data_column: Some(data_column.clone()), + seen_timestamp: D, + }); + } + self.send_sync_message(SyncMessage::RpcDataColumn { + sync_request_id: SyncRequestId::DataColumnsByRoot(id), + peer_id, + data_column: None, + seen_timestamp: D, + }); + } + + fn complete_blocks_by_root_request( + &mut self, + request: BlocksByRootRequestData, + config: &mut CompleteConfig, + ) { + let (req_id, peer, req) = request; + if let Some(error) = &config.rpc_error { + self.send_rpc_error(SyncRequestId::BlocksByRoot(req_id), peer, error.clone()); + return; + } + + if config.return_no_blocks_n_times > 0 { + config.return_no_blocks_n_times -= 1; + self.send_blocks_by_root_response(req_id, peer, &[]); + return; + } + + let blocks = req + .block_roots() + .iter() + .map(|block_root| { + if config.return_wrong_blocks { + Arc::new(self.rand_block()) + } else { + self.blocks_by_root + .get(block_root) + .expect("Test consumer requested unknown block") + .clone() + } + }) + .collect::>(); + + self.send_blocks_by_root_response(req_id, peer, &blocks); + } + + fn complete_data_columns_by_root_request_range_sync( + &mut self, + (id, peer_id, req): DataColumnsByRootRequestData, + complete_config: &CompleteConfig, + ) { + // To reply with a valid DataColumnsByRange we need to construct + // DataColumnsByRange for the block root that we requested the block peer, plus + // figure out which exact columns we requested this peer + let mut triggered_custody_failure = false; + + let data_columns = req + .data_column_ids + .iter() + .flat_map(|column_id| { + let block = self + .blocks_by_root + .get(&column_id.block_root) + .expect("Test consumer requested unknown block") + .clone(); + + let kzg_commitments_inclusion_proof = block + .message() + .body() + .kzg_commitments_merkle_proof() + 
.unwrap(); + let kzg_commitments = block + .message() + .body() + .blob_kzg_commitments() + .unwrap() + .clone(); + let signed_block_header = block.signed_block_header(); + + column_id.columns.iter().filter_map(move |index| { + // Skip column generation if index is marked as failure + if complete_config.custody_failure_at_index == Some(*index) { + triggered_custody_failure = true; + return None; + } + + // We need to produce a DataColumn with valid inclusion proof, but can + // be with random KZG proof and data as we won't send it for processing + Some(Arc::new(DataColumnSidecar { + index: *index, + column: VariableList::empty(), + kzg_commitments: kzg_commitments.clone(), + kzg_proofs: VariableList::from(vec![]), + signed_block_header: signed_block_header.clone(), + kzg_commitments_inclusion_proof: kzg_commitments_inclusion_proof.clone(), + })) + }) + }) + .collect::>(); + + // Need to log here because I can't capture &mut self inside the columns iter + if let Some(target_index) = complete_config.custody_failure_at_index { + if req + .data_column_ids + .iter() + .any(|id| id.columns.iter().any(|index| *index == target_index)) + { + self.log(&format!( + "Forced custody failure at request {id} for peer {peer_id} index {target_index:?}" + )); + } + } + + self.send_data_columns_by_root_response(id, peer_id, &data_columns); + } + + fn complete_block_processing(&mut self, ids: Vec, config: &CompleteConfig) { + if config.process_error { + for id in &ids { + self.send_sync_message(SyncMessage::BatchProcessed { + sync_type: ChainSegmentProcessId::ForwardSync(*id), + result: BatchProcessResult::Failure { + peer_action: None, + error: "test error".to_owned(), + }, + }); + } + } + + // Sort ids first as we need to process blocks in order of ancestors. This only works if the + // test does not send blocks of two parallel chains at once. 
+ let mut blocks = ids + .into_iter() + .map(|id| { + let block = self + .blocks_by_root + .get(&id.block_root) + .cloned() + .expect("unknown block"); + (id, block) + }) + .collect::>(); + blocks.sort_by_key(|(_, block)| block.slot()); + + for (id, block) in blocks { + self.log(&format!( + "Completing block processing {id} slot {}", + block.slot() + )); + + { + let mut head_state = self.harness.chain.head().snapshot.beacon_state.clone(); + *head_state.slot_mut() = block.slot(); + + let mut fork_choice = self.harness.chain.canonical_head.fork_choice_write_lock(); + fork_choice + .on_block( + block.slot(), + block.message(), + id.block_root, + Duration::from_secs(0), + &head_state, + PayloadVerificationStatus::Verified, + &self.spec, + ) + .expect("error importing block to fork-choice"); + } + + self.send_sync_message(SyncMessage::BatchProcessed { + sync_type: ChainSegmentProcessId::ForwardSync(id), + result: BatchProcessResult::Success, + }); + } + } + + pub fn progress_until_no_events( + &mut self, + request_filter: RequestFilter, + mut complete_config: CompleteConfig, + ) { + self.log(format!("progress until no events {request_filter:?}")); + loop { + if let Ok(request) = self + .pop_received_network_event(&mut |ev| request_filter.blocks_by_root_requests(ev)) + { + self.complete_blocks_by_root_request(request, &mut complete_config); + continue; + } + + if let Ok(request) = self.pop_received_network_event(&mut |ev| { + request_filter.data_columns_by_root_requests(ev) + }) { + self.complete_data_columns_by_root_request_range_sync(request, &complete_config); + continue; + } + + // TODO(tree-sync): find a way to get this info from the beacon processor events + let ids = self.sync_manager.forward_sync().get_processing_ids(); + if !ids.is_empty() { + self.complete_block_processing(ids, &complete_config); + continue; + } + + let sync_state = self.get_sync_state(); + self.log(&format!("Progressed sync, current state: {:?}", sync_state,)); + + return; + } + } + + async fn 
create_canonical_block(&mut self) -> (SignedBeaconBlock, Option>) { + self.harness.advance_slot(); + + let block_root = self + .harness + .extend_chain( + 1, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + let store = &self.harness.chain.store; + let block = store.get_full_block(&block_root).unwrap().unwrap(); + let fork = block.fork_name_unchecked(); + + let data_sidecars = if fork.fulu_enabled() { + store + .get_data_columns(&block_root) + .unwrap() + .map(|columns| { + columns + .into_iter() + .map(CustodyDataColumn::from_asserted_custody) + .collect() + }) + .map(DataSidecars::DataColumns) + } else if fork.deneb_enabled() { + store + .get_blobs(&block_root) + .unwrap() + .blobs() + .map(DataSidecars::Blobs) + } else { + None + }; + + (block, data_sidecars) + } + + async fn remember_block( + &mut self, + (block, data_sidecars): (SignedBeaconBlock, Option>), + ) { + // This code is kind of duplicated from Harness::process_block, but takes sidecars directly. 
+ let block_root = block.canonical_root(); + self.harness.set_current_slot(block.slot()); + let _: SignedBeaconBlockHash = self + .harness + .chain + .process_block( + block_root, + build_rpc_block(block.into(), &data_sidecars, &self.spec), + NotifyExecutionLayer::Yes, + BlockImportSource::RangeSync, + || Ok(()), + ) + .await + .unwrap() + .try_into() + .unwrap(); + self.harness.chain.recompute_head_at_current_slot().await; + } +} + +fn build_rpc_block( + block: Arc>, + data_sidecars: &Option>, + spec: &ChainSpec, +) -> RpcBlock { + match data_sidecars { + Some(DataSidecars::Blobs(blobs)) => { + RpcBlock::new(None, block, Some(blobs.clone())).unwrap() + } + Some(DataSidecars::DataColumns(columns)) => { + RpcBlock::new_with_custody_columns(None, block, columns.clone(), spec).unwrap() + } + // Block has no data, expects zero columns + None => RpcBlock::new_without_blobs(None, block), + } +} + +/// To attempt to finalize the peer's status finalized checkpoint we synced to its finalized epoch + +/// 2 epochs + 1 slot. +const EXTRA_SYNCED_EPOCHS: u64 = 2 + 1; + +// Same test with different types of peers: +// - 100 peers +// - 1 supernode +// - perfectly distributed peer ids + +#[test] +fn finalized_sync_not_enough_custody_peers_on_start_supernode_only() { + finalized_sync_not_enough_custody_peers_on_start(Config { + peers: PeersConfig::SupernodeOnly, + }); +} + +#[test] +fn finalized_sync_not_enough_custody_peers_on_start_supernode_and_random() { + finalized_sync_not_enough_custody_peers_on_start(Config { + peers: PeersConfig::SupernodeAndRandom, + }); +} + +fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { + let mut r = TestRig::test_setup_as_supernode(); + // Only run post-PeerDAS + if !r.fork_name.fulu_enabled() { + return; + } + + let advanced_epochs: u64 = 2; + let remote_info = r.finalized_remote_info_advanced_by(advanced_epochs.into()); + + // Unikely that the single peer we added has enough columns for us. 
Tests are determinstic and + // this error should never be hit + r.add_connected_sync_peer_not_supernode(remote_info.clone()); + + // The SyncingChain has a single peer, so it can issue blocks_by_range requests. However, it + // doesn't have enough peers to cover all columns + r.progress_until_no_events(NO_FILTER, complete()); + r.expect_no_active_rpc_requests(); + + // Here we have a batch with partially completed block_components_by_range requests. The batch + // should not have failed, we are still syncing, and there are no downscoring events. + r.expect_no_penalty_for_anyone(); + r.expect_active_block_components_requests_on_custody_step(); + + // Generate enough peers and supernodes to cover all custody columns + r.add_sync_peers(config.peers, remote_info.clone()); + // Note: not necessary to add this peers to the chain, as we draw from the global pool + // We still need to add enough peers to trigger batch downloads with idle peers. Same issue as + // the test above. + + r.progress_until_no_events(NO_FILTER, complete()); + r.expect_no_active_rpc_requests(); + r.expect_no_active_block_components_requests(); + // TOOD(das): For now this tests don't complete sync. We can't track beacon processor Work + // events from here easily. What we pop from the beacon processor queue is an opaque closure + // wihtout any information. We don't know what batch it is for. 
+} + +#[test] +fn finalized_sync_single_custody_peer_failure() { + let mut r = TestRig::test_setup(); + // Only run post-PeerDAS + if !r.fork_name.fulu_enabled() { + return; + } + + let advanced_epochs: u64 = 2; + let remote_info = r.finalized_remote_info_advanced_by(advanced_epochs.into()); + let column_index_to_fail = r.our_custody_indices().first().copied().unwrap(); + + r.add_sync_peer(true, remote_info.clone()); + + // Progress all blocks_by_range and columns_by_range requests but respond empty for a single + // column index + r.progress_until_no_events( + NO_FILTER, + complete().custody_failure_at_index(column_index_to_fail), + ); + r.expect_penalties("custody_failure"); + + // Some peer had a custody failure, but since there's a single peer in the batch we won't issue + // another request yet. + r.expect_no_active_rpc_requests(); + // Ensure that the block components by range request have not failed + r.expect_active_block_components_requests_on_custody_step(); + + // After adding a new peer we will try to fetch from it + r.add_sync_peer(true, remote_info.clone()); + r.progress_until_no_events( + // Find the requests first to assert that this is the only request that exists + filter().column_index(column_index_to_fail), + // complete this one request without the custody failure now + complete(), + ); + + r.expect_no_active_rpc_requests(); + r.expect_no_active_block_components_requests(); +} + +#[tokio::test] +async fn tree_sync_happy_path() { + let mut r = TestRig::test_setup(); + let (head_root, head_slot) = r.create_unimported_parent_chain(8).await; + let remote_info = SyncInfo { + finalized_epoch: Epoch::new(0), + finalized_root: Hash256::ZERO, + head_slot, + head_root, + }; + r.add_sync_peer(false, remote_info.clone()); + r.progress_until_no_events(NO_FILTER, complete()); + r.add_sync_peer(true, remote_info); + r.progress_until_no_events(NO_FILTER, complete()); + r.expect_empty_network(); +} diff --git a/Cargo.lock b/Cargo.lock index 
70c910aadc9..55b407cc8c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6373,6 +6373,7 @@ dependencies = [ "ethereum_ssz", "execution_layer", "fnv", + "fork_choice", "futures", "genesis", "hex", diff --git a/beacon_node/beacon_chain/src/test_utils.rs b/beacon_node/beacon_chain/src/test_utils.rs index 369145a615a..18be6f592d7 100644 --- a/beacon_node/beacon_chain/src/test_utils.rs +++ b/beacon_node/beacon_chain/src/test_utils.rs @@ -3145,10 +3145,14 @@ pub enum NumBlobs { pub fn generate_rand_block_and_blobs( fork_name: ForkName, num_blobs: NumBlobs, + parent_root: Option, rng: &mut impl Rng, spec: &ChainSpec, ) -> (SignedBeaconBlock>, Vec>) { - let inner = map_fork_name!(fork_name, BeaconBlock, <_>::random_for_test(rng)); + let mut inner = map_fork_name!(fork_name, BeaconBlock, <_>::random_for_test(rng)); + if let Some(parent_root) = parent_root { + *inner.parent_root_mut() = parent_root; + } let mut block = SignedBeaconBlock::from_block(inner, types::Signature::random_for_test(rng)); let max_blobs = spec.max_blobs_per_block(block.epoch()) as usize; @@ -3247,13 +3251,15 @@ pub fn generate_rand_block_and_blobs( pub fn generate_rand_block_and_data_columns( fork_name: ForkName, num_blobs: NumBlobs, + parent_root: Option, rng: &mut impl Rng, spec: &ChainSpec, ) -> ( SignedBeaconBlock>, DataColumnSidecarList, ) { - let (block, _blobs) = generate_rand_block_and_blobs(fork_name, num_blobs, rng, spec); + let (block, _blobs) = + generate_rand_block_and_blobs(fork_name, num_blobs, parent_root, rng, spec); let data_columns = generate_data_column_sidecars_from_block(&block, spec); (block, data_columns) } diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index bed846f1f07..7754e38d458 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -32,7 +32,10 @@ pub struct BlocksByRootRequestId { } #[derive(Debug, Hash, PartialEq, Eq, 
Clone, Copy)] -pub struct HeaderLookupId(pub Hash256, pub Id); +pub struct HeaderLookupId { + pub id: Id, + pub block_root: Hash256, +} #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub struct BatchId(pub Id); @@ -210,6 +213,9 @@ impl_display!(DataColumnsByRootRequestId, "{}/{}", id, parent_request_id); impl_display!(SingleLookupReqId, "{}/Lookup/{}", req_id, lookup_id); impl_display!(CustodyByRootRequestId, "{}", parent_request_id); impl_display!(SamplingId, "{}/{}", sampling_request_id, id); +// Print only the ID to make logs succint. On lookup creation we log the ID and the block root to +// link them. +impl_display!(HeaderLookupId, "{}", id); impl Display for DataColumnsByRootRequester { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { @@ -220,12 +226,6 @@ impl Display for DataColumnsByRootRequester { } } -impl Display for HeaderLookupId { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{}/{}", self.0, self.1) - } -} - impl Display for BatchId { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) diff --git a/beacon_node/network/Cargo.toml b/beacon_node/network/Cargo.toml index cdb6ba7a83f..ad0684bb91e 100644 --- a/beacon_node/network/Cargo.toml +++ b/beacon_node/network/Cargo.toml @@ -57,3 +57,4 @@ kzg = { workspace = true } matches = "0.1.8" rand_chacha = "0.3.1" serde_json = { workspace = true } +fork_choice = { workspace = true } diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 80436a64dee..f5c6d36f7f8 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -204,11 +204,13 @@ impl BackFillSync { self.status.remove_peer(peer_id); if self.status.peer_count() == 0 { - info!( - "reason" = "insufficient_synced_peers", - "Backfill sync paused" - ); - self.set_state(BackFillState::Paused); + if self.state() == BackFillState::Syncing { + info!( + "reason" = 
"insufficient_synced_peers", + "Backfill sync paused" + ); + self.set_state(BackFillState::Paused); + } } } @@ -260,13 +262,12 @@ impl BackFillSync { // Do nothing wait for future event } Err(e) => match e { - SyncBlockError::InternalError(_) | SyncBlockError::TooManyErrors => { + SyncBlockError::InternalError(_) | SyncBlockError::TooManyErrors(_) => { debug!(error = ?e, "Backfill synced failed"); self.set_state(BackFillState::Failed); } }, } - self.continue_syncing_blocks(cx); } /// Updates the global network state indicating the current state of a backfill sync. diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index 85853737335..429655e0fc4 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -49,7 +49,7 @@ enum Status { impl ForwardSyncBlock { fn new(block_root: Hash256, id: Id, peers: &[PeerId]) -> Self { Self { - id: HeaderLookupId(block_root, id), + id: HeaderLookupId { id, block_root }, status: Status::BackfillHeader { peers: HashSet::from_iter(peers.iter().copied()), request: DownloadRequest::new(), @@ -57,14 +57,11 @@ impl ForwardSyncBlock { } } - fn add_peer(&mut self, peer: PeerId) { + /// Returns whether the value was newly inserted + fn add_peer(&mut self, peer: PeerId) -> bool { match &mut self.status { - Status::BackfillHeader { peers, .. } => { - peers.insert(peer); - } - Status::ForwardSyncBlock { request, .. } => { - request.add_peer(peer); - } + Status::BackfillHeader { peers, .. } => peers.insert(peer), + Status::ForwardSyncBlock { request, .. 
} => request.add_peer(peer), } } @@ -208,7 +205,7 @@ impl ForwardSyncBlock { #[derive(Debug)] pub enum Error { InternalError(String), - TooManyErrors, + TooManyErrors(String), BlockConflictsWithFinality(String), } @@ -216,6 +213,7 @@ impl From for Error { fn from(e: DownloadRequestError) -> Self { match e { DownloadRequestError::InternalError(e) => Self::InternalError(e), + DownloadRequestError::TooManyErrors(e) => Self::TooManyErrors(format!("{e:?}")), } } } @@ -234,7 +232,7 @@ impl From for Error { fn from(e: SyncBlockError) -> Self { match e { SyncBlockError::InternalError(e) => Self::InternalError(e), - SyncBlockError::TooManyErrors => Self::TooManyErrors, + SyncBlockError::TooManyErrors(e) => Self::TooManyErrors(e), } } } @@ -252,6 +250,16 @@ impl ForwardSync { } } + #[cfg(test)] + pub fn block_peers(&self, block_root: &Hash256) -> Option> { + self.blocks.get(block_root).map(|block| block.get_peers()) + } + + #[cfg(test)] + pub fn get_lookups(&self) -> Vec { + self.blocks.keys().copied().collect() + } + pub fn block_count(&self) -> usize { self.blocks.len() } @@ -308,9 +316,14 @@ impl ForwardSync { while let Some(lookup) = self.blocks.get_mut(&target_block_root) { for peer in peers { // TODO(tree-sync): If peer already in set no need to add to its ancestors - lookup.add_peer(*peer); - // TODO(tree-sync): This log can be very noisy maybe log once per peer - debug!(block_root = ?target_block_root, ?peer, "Adding peer to existing header lookup"); + if lookup.add_peer(*peer) { + // TODO(tree-sync): This log can be very noisy maybe log once per peer + debug!(block_root = ?target_block_root, ?peer, "Adding peer to existing header lookup"); + } else { + // Peer already part of this lookup, therefore it must be part of the peer + // set of all of its ancestors: stop + break; + } } if let Some(parent_root) = lookup.parent_root() { target_block_root = parent_root; @@ -323,9 +336,10 @@ impl ForwardSync { self.prune_least_popular_lookups(); } - debug!(?block_root, ?peers, 
"Creating new header lookup"); + let id = cx.next_id(); + debug!(?block_root, id, ?peers, "Creating new header lookup"); - let mut lookup = ForwardSyncBlock::new(block_root, cx.next_id(), peers); + let mut lookup = ForwardSyncBlock::new(block_root, id, peers); match lookup.send_block_header_request(block_root, cx) { Ok(_) => { self.blocks.insert(block_root, lookup); @@ -345,7 +359,7 @@ impl ForwardSync { peer_id: PeerId, cx: &mut SyncNetworkContext, ) { - let block_root = id.0; + let block_root = id.block_root; let result: Result = (|| { let Some(lookup) = self.blocks.get_mut(&block_root) else { @@ -412,8 +426,10 @@ impl ForwardSync { } } Err(e) => { - debug!(%req_id, error = ?e, "Forward sync block header downloaded error"); - lookup.header_request()?.on_download_error(req_id)?; + // Request errors are logged in `SyncNetworkContext::on_rpc_response_result` + lookup + .header_request()? + .on_download_error(req_id, Some(e))?; lookup.send_block_header_request(block_root, cx)?; } } @@ -422,7 +438,7 @@ impl ForwardSync { // Map result Ok to Wait as completing the header request does not complete the overall // ForwardSyncBlock request. 
- self.handle_result(id.0, result.map(|_| SyncBlockResult::Wait), cx); + self.handle_result(id.block_root, result.map(|_| SyncBlockResult::Wait), cx); } pub fn on_block_download_result( @@ -432,7 +448,7 @@ impl ForwardSync { result: Result<(RpcBlock, BatchPeers), RpcResponseError>, cx: &mut SyncNetworkContext, ) { - let Some(lookup) = self.blocks.get_mut(&id.0) else { + let Some(lookup) = self.blocks.get_mut(&id.block_root) else { // TODO(tree-sync): register metric debug!(?id, "Received block request for unknown lookup"); return; @@ -445,7 +461,7 @@ impl ForwardSync { let outcome = lookup .block_request() .and_then(|block| Ok(block.on_download_result(req_id, result, cx)?)); - self.handle_result(id.0, outcome, cx); + self.handle_result(id.block_root, outcome, cx); } pub fn on_block_process_result( @@ -454,7 +470,7 @@ impl ForwardSync { result: BatchProcessResult, cx: &mut SyncNetworkContext, ) { - let Some(lookup) = self.blocks.get_mut(&id.0) else { + let Some(lookup) = self.blocks.get_mut(&id.block_root) else { debug!(?id, "Received block process result for unknown lookup"); return; }; @@ -466,7 +482,7 @@ impl ForwardSync { let outcome = lookup .block_request() .and_then(|block| Ok(block.on_process_result(result, cx)?)); - self.handle_result(id.0, outcome, cx); + self.handle_result(id.block_root, outcome, cx); } pub fn prune(&mut self) { @@ -499,9 +515,9 @@ impl ForwardSync { // Wait for next event Ok(SyncBlockResult::Wait) => {} Err(e) => { - debug!(error = ?e, "Dropping forward sync block header lookup"); + debug!(error = ?e, ?block_root, "Dropping forward sync block lookup"); match e { - Error::InternalError(_) | Error::TooManyErrors => { + Error::InternalError(_) | Error::TooManyErrors(_) => { let block_to_children = self.compute_children(); self.drop_lookup_and_children(block_root, &block_to_children); } diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 7f152f99f0c..1274c8fc721 100644 --- 
a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -353,8 +353,7 @@ impl SyncManager { .finalized_epoch .start_slot(T::EthSpec::slots_per_epoch()) { - self.forward_sync - .search(remote.head_root, &[peer_id], &mut self.network); + self.add_peer_with_imported_block_root(peer_id, remote.head_root); } let sync_type = remote_sync_type(&local, &remote, &self.chain); @@ -371,6 +370,18 @@ impl SyncManager { } } + // Adds a peer to forward sync. Since its possible that a lookup just gained a new peer we + // attempt to continue idle custody by root requests that are waiting for peers. + fn add_peer_with_imported_block_root(&mut self, peer_id: PeerId, block_root: Hash256) { + self.forward_sync + .search(block_root, &[peer_id], &mut self.network); + + // Try to make progress on custody requests that are waiting for peers + for (id, result) in self.network.continue_custody_by_root_requests() { + self.on_custody_by_root_result(id, result); + } + } + fn updated_peer_cgc(&mut self, _peer_id: PeerId) { // Try to make progress on custody requests that are waiting for peers for (id, result) in self.network.continue_custody_by_root_requests() { @@ -733,8 +744,7 @@ impl SyncManager { ) { match self.should_search_for_block(Some(slot), &peer_id) { Ok(_) => { - self.forward_sync - .search(block_root, &[peer_id], &mut self.network); + self.add_peer_with_imported_block_root(peer_id, block_root); } Err(reason) => { debug!(%block_root, %parent_root, reason, "Ignoring unknown parent request"); @@ -745,8 +755,7 @@ impl SyncManager { fn handle_unknown_block_root(&mut self, peer_id: PeerId, block_root: Hash256) { match self.should_search_for_block(None, &peer_id) { Ok(_) => { - self.forward_sync - .search(block_root, &[peer_id], &mut self.network); + self.add_peer_with_imported_block_root(peer_id, block_root); } Err(reason) => { debug!(%block_root, reason, "Ignoring unknown block request"); diff --git a/beacon_node/network/src/sync/network_context.rs 
b/beacon_node/network/src/sync/network_context.rs index 73494191d8d..7a8d277da0c 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -548,6 +548,8 @@ impl SyncNetworkContext { let id = CustodyByRootRequestId { parent_request_id }; debug!( %id, + ?block_root, + peers = lookup_peers.read().len(), "Starting custody columns request" ); diff --git a/beacon_node/network/src/sync/network_context/custody_by_root.rs b/beacon_node/network/src/sync/network_context/custody_by_root.rs index 70a9107224c..faf30f3f67f 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_root.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_root.rs @@ -8,7 +8,7 @@ use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::BeaconChainTypes; use fnv::FnvHashMap; use lighthouse_network::service::api_types::{CustodyByRootRequestId, DataColumnsByRootRequester}; -use lighthouse_network::PeerId; +use lighthouse_network::{PeerAction, PeerId}; use lru_cache::LRUTimeCache; use parking_lot::RwLock; use rand::Rng; @@ -44,6 +44,8 @@ pub struct ActiveCustodyByRootRequest { failed_peers: LRUTimeCache, /// Set of peers that claim to have imported this block and their custody columns lookup_peers: Arc>>, + /// Log that request is idle once + logged_idle_request: bool, _phantom: PhantomData, } @@ -85,6 +87,7 @@ impl From for Error { fn from(e: DownloadRequestError) -> Self { match e { DownloadRequestError::InternalError(e) => Self::InternalError(e), + DownloadRequestError::TooManyErrors(e) => Self::TooManyDownloadErrors(e), } } } @@ -115,6 +118,7 @@ impl ActiveCustodyByRootRequest { active_batch_columns_requests: <_>::default(), failed_peers: LRUTimeCache::new(Duration::from_secs(FAILED_PEERS_CACHE_EXPIRY_SECONDS)), lookup_peers, + logged_idle_request: false, _phantom: PhantomData, } } @@ -180,7 +184,7 @@ impl ActiveCustodyByRootRequest { // TODO(das): Should track which columns are missing and eventually 
give up // TODO(das): If the peer is in the lookup peer set it claims to have imported // the block AND its custody columns. So in this case we can downscore - column_request.on_download_error(req_id)?; + column_request.on_download_error(req_id, None)?; missing_column_indexes.push(column_index); } } @@ -199,6 +203,14 @@ impl ActiveCustodyByRootRequest { ); self.failed_peers.insert(peer_id); + + // If peer is in the lookup peer set, it claims to have imported the block and + // must have its columns in custody. In that case, set `true = enforce max_requests` + // and downscore if data_columns_by_root does not returned the expected custody + // columns. For the rest of peers, don't downscore if columns are missing. + if self.lookup_peers.read().contains(&peer_id) { + cx.report_peer(peer_id, PeerAction::MidToleranceError, "custody_failure"); + } } } Err(err) => { @@ -214,7 +226,7 @@ impl ActiveCustodyByRootRequest { self.column_requests .get_mut(column_index) .ok_or(Error::InternalError("unknown column_index".to_owned()))? - .on_download_error_and_mark_failure(req_id, err.clone())?; + .on_download_error(req_id, Some(err.clone()))?; } self.failed_peers.insert(peer_id); @@ -261,10 +273,6 @@ impl ActiveCustodyByRootRequest { for (column_index, request) in self.column_requests.iter_mut() { if request.is_awaiting_download() { - if let Some(last_error) = request.too_many_failures() { - return Err(Error::TooManyDownloadErrors(last_error)); - } - // TODO(das): When is a fork and only a subset of your peers know about a block, we should // only query the peers on that fork. Should this case be handled? How to handle it? 
let custodial_peers = cx.get_custodial_peers(*column_index); @@ -276,13 +284,11 @@ impl ActiveCustodyByRootRequest { // custody peers on a given column let mut priorized_peers = custodial_peers .iter() + .filter(|peer| !self.failed_peers.contains(peer)) .map(|peer| { ( // Prioritize peers that claim to know have imported this block if lookup_peers.contains(peer) { 0 } else { 1 }, - // De-prioritize peers that have failed to successfully respond to - // requests recently - self.failed_peers.contains(peer), // Prefer peers with fewer requests to load balance across peers. // We batch requests to the same peer, so count existence in the // `columns_to_request_by_peer` as a single 1 request. @@ -296,7 +302,7 @@ impl ActiveCustodyByRootRequest { .collect::>(); priorized_peers.sort_unstable(); - if let Some((_, _, _, _, peer_id)) = priorized_peers.first() { + if let Some((_, _, _, peer_id)) = priorized_peers.first() { columns_to_request_by_peer .entry(*peer_id) .or_default() @@ -319,11 +325,7 @@ impl ActiveCustodyByRootRequest { peer_id, self.block_root, indices.clone(), - // If peer is in the lookup peer set, it claims to have imported the block and - // must have its columns in custody. In that case, set `true = enforce max_requests` - // and downscore if data_columns_by_root does not returned the expected custody - // columns. For the rest of peers, don't downscore if columns are missing. 
- lookup_peers.contains(&peer_id), + false, ) .map_err(|e| { Error::InternalError(format!("Send failed data_columns_by_root {e:?}")) @@ -341,10 +343,15 @@ impl ActiveCustodyByRootRequest { self.active_batch_columns_requests .insert(req_id, ActiveBatchColumnsRequest { indices }); + + // Reset the idle request log, for the next time this request completes + self.logged_idle_request = false; } + let no_active_request = !self.column_requests.values().any(|r| r.is_downloading()); + if self.start_time.elapsed() > Duration::from_secs(REQUEST_EXPIRY_SECONDS) - && !self.column_requests.values().any(|r| r.is_downloading()) + && no_active_request { let awaiting_peers_indicies = self .column_requests @@ -355,6 +362,16 @@ impl ActiveCustodyByRootRequest { return Err(Error::ExpiredNoCustodyPeers(awaiting_peers_indicies)); } + if no_active_request && !self.logged_idle_request { + self.logged_idle_request = true; + debug!( + id = ?self.custody_id, + failed_peers = self.failed_peers.keys().count(), + peers = self.lookup_peers.read().len(), + "Custody by root request idle waiting for peers" + ); + } + Ok(None) } } diff --git a/beacon_node/network/src/sync/network_context/download_request.rs b/beacon_node/network/src/sync/network_context/download_request.rs index 60427bc679c..95bb8c8f161 100644 --- a/beacon_node/network/src/sync/network_context/download_request.rs +++ b/beacon_node/network/src/sync/network_context/download_request.rs @@ -5,7 +5,7 @@ use strum::IntoStaticStr; /// TODO(das): Reconsider this retry count, it was choosen as a placeholder value. 
Each /// `custody_by_*` request is already retried multiple inside of a lookup or batch -const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; +const MAX_DOWNLOAD_ATTEMPTS: usize = 5; pub struct DownloadRequest { status: Status, @@ -22,6 +22,7 @@ pub enum Status { #[derive(Debug)] pub enum Error { InternalError(String), + TooManyErrors(RpcResponseError), } impl DownloadRequest { @@ -54,19 +55,6 @@ impl DownloadRequest { } } - pub fn too_many_failures(&self) -> Option { - if self.download_failures.len() > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { - Some( - self.download_failures - .last() - .cloned() - .expect("download_failures is not empty"), - ) - } else { - None - } - } - pub fn on_download_start(&mut self, req_id: I) -> Result<(), Error> { match &self.status { Status::NotStarted => { @@ -80,7 +68,11 @@ impl DownloadRequest { } } - pub fn on_download_error(&mut self, req_id: I) -> Result<(), Error> { + pub fn on_download_error( + &mut self, + req_id: I, + error_to_register: Option, + ) -> Result<(), Error> { match &self.status { Status::Downloading(expected_req_id) => { if req_id != *expected_req_id { @@ -88,6 +80,16 @@ impl DownloadRequest { "Received download result for req_id {req_id} expecting {expected_req_id}" ))); } + + if let Some(e) = error_to_register { + self.download_failures.push(e); + if self.download_failures.len() > MAX_DOWNLOAD_ATTEMPTS { + if let Some(last_error) = self.download_failures.pop() { + return Err(Error::TooManyErrors(last_error)); + } + } + } + self.status = Status::NotStarted; Ok(()) } @@ -98,15 +100,6 @@ impl DownloadRequest { } } - pub fn on_download_error_and_mark_failure( - &mut self, - req_id: I, - e: RpcResponseError, - ) -> Result<(), Error> { - self.download_failures.push(e); - self.on_download_error(req_id) - } - pub fn on_download_success( &mut self, req_id: I, diff --git a/beacon_node/network/src/sync/sync_block.rs b/beacon_node/network/src/sync/sync_block.rs index 7bd2928adc4..639e059eb98 100644 --- 
a/beacon_node/network/src/sync/sync_block.rs +++ b/beacon_node/network/src/sync/sync_block.rs @@ -42,7 +42,7 @@ pub enum SyncBlockResult { #[derive(Debug)] pub enum Error { InternalError(String), - TooManyErrors, + TooManyErrors(String), } impl SyncBlock { @@ -68,6 +68,7 @@ impl SyncBlock { self.peers.read().clone() } + /// Returns whether the value was newly inserted pub fn add_peer(&self, peer: PeerId) -> bool { self.peers.write().insert(peer) } @@ -107,7 +108,7 @@ impl SyncBlock { self.download_errors += 1; if self.download_errors > MAX_DOWNLOAD_ATTEMPTS { - return Err(Error::TooManyErrors); + return Err(Error::TooManyErrors("download errors".to_owned())); } self.continue_request(cx) @@ -146,7 +147,7 @@ impl SyncBlock { self.process_errors += 1; if self.process_errors > MAX_PROCESS_ATTEMPTS { - return Err(Error::TooManyErrors); + return Err(Error::TooManyErrors("process errors".to_owned())); } self.request = SyncingStatus::AwaitingDownload; diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index 318309cc195..b3e7334a0ed 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -25,6 +25,7 @@ use beacon_chain::{ PayloadVerificationOutcome, PayloadVerificationStatus, }; use beacon_processor::WorkEvent; +use fork_choice::ForkChoiceStore; use lighthouse_network::discovery::CombinedKey; use lighthouse_network::{ rpc::{RPCError, RequestType, RpcErrorResponse}, @@ -203,7 +204,7 @@ impl TestRig { self.send_sync_message(SyncMessage::UnknownParentBlob(peer_id, blob.into())); } - fn trigger_unknown_block_from_attestation(&mut self, block_root: Hash256, peer_id: PeerId) { + pub fn trigger_unknown_block_from_attestation(&mut self, block_root: Hash256, peer_id: PeerId) { self.send_sync_message(SyncMessage::UnknownBlockHashFromAttestation( peer_id, block_root, )); @@ -230,14 +231,17 @@ impl TestRig { ) -> (SignedBeaconBlock, Vec>) { let fork_name = self.fork_name; 
let rng = &mut self.rng; - generate_rand_block_and_blobs::(fork_name, num_blobs, rng, &self.spec) + let head_root = self.harness.chain.head().head_block_root(); + generate_rand_block_and_blobs::(fork_name, num_blobs, Some(head_root), rng, &self.spec) } fn rand_block_and_data_columns(&mut self) -> (SignedBeaconBlock, DataColumnSidecarList) { let num_blobs = NumBlobs::Number(1); + let head_root = self.harness.chain.head().head_block_root(); generate_rand_block_and_data_columns::( self.fork_name, num_blobs, + Some(head_root), &mut self.rng, &self.harness.spec, ) @@ -258,8 +262,16 @@ impl TestRig { self.sync_manager.handle_message(sync_message); } - fn assert_active_lookup(&self, block_root: Hash256) { - todo!(); + fn assert_active_lookup(&mut self, block_root: Hash256) { + let lookups = self.sync_manager.forward_sync().get_lookups(); + if !lookups.contains(&block_root) { + panic!("Expected lookup {block_root} not found, active lookups: {lookups:?}"); + } + } + + fn assert_active_lookups(&mut self, expected_lookups: &[Hash256]) { + let lookups = self.sync_manager.forward_sync().get_lookups(); + assert_eq!(lookups, expected_lookups, "Unexpected lookups"); } fn expect_no_active_sampling(&mut self) { @@ -283,39 +295,24 @@ impl TestRig { self.expect_no_active_sampling(); } - fn assert_lookup_peers(&self, block_root: Hash256, expected_peers: &[PeerId]) { - todo!(); - } - - fn insert_failed_chain(&mut self, block_root: Hash256) { - todo!(); - } - - fn assert_not_failed_chain(&mut self, chain_hash: Hash256) { - let failed_chains = self.get_failed_chains(); - if failed_chains.contains(&chain_hash) { - panic!("failed chains contain {chain_hash:?}: {failed_chains:?}"); - } - } - - fn get_failed_chains(&mut self) -> Vec { - todo!(); - } - - fn assert_failed_chain(&mut self, chain_hash: Hash256) { - let failed_chains = self.get_failed_chains(); - if !failed_chains.contains(&chain_hash) { - panic!("expected failed chains to contain {chain_hash:?}: {failed_chains:?}"); - } - } - - 
fn find_single_lookup_for(&self, block_root: Hash256) -> Id { - todo!(); + fn assert_lookup_peers(&mut self, block_root: Hash256, expected_peers: &[PeerId]) { + let mut peers = self + .sync_manager + .forward_sync() + .block_peers(&block_root) + .unwrap_or_else(|| panic!("Unknown block {block_root}")); + peers.sort_unstable(); + let mut expected_peers = expected_peers.to_vec(); + expected_peers.sort_unstable(); + assert_eq!(peers, expected_peers, "Unexpected block {block_root} peers"); } #[track_caller] - fn expect_no_active_lookups(&self) { - todo!(); + pub fn expect_no_active_lookups(&mut self) { + let lookups = self.sync_manager.forward_sync().get_lookups(); + if !lookups.is_empty() { + panic!("expected no active lookups but found {lookups:?}") + } } fn expect_no_active_lookups_empty_network(&mut self) { @@ -325,13 +322,13 @@ impl TestRig { // Note: prefer to use `add_connected_peer_testing_only`. This is currently extensively used in // lookup tests. We should consolidate this "add peer" methods in a future refactor - fn new_connected_peer(&mut self) -> PeerId { + pub fn new_connected_peer(&mut self) -> PeerId { self.add_connected_peer_testing_only(false) } // Note: prefer to use `add_connected_peer_testing_only`. This is currently extensively used in // lookup tests. 
We should consolidate this "add peer" methods in a future refactor - fn new_connected_supernode_peer(&mut self) -> PeerId { + pub fn new_connected_supernode_peer(&mut self) -> PeerId { self.add_connected_peer_testing_only(true) } @@ -498,7 +495,7 @@ impl TestRig { /// Return RPCErrors for all active requests of peer fn rpc_error_all_active_requests(&mut self, disconnected_peer_id: PeerId) { self.drain_network_rx(); - while let Ok(sync_request_id) = self.pop_received_network_event(|ev| match ev { + while let Ok(sync_request_id) = self.pop_received_network_event(&mut |ev| match ev { NetworkMessage::SendRequest { peer_id, app_request_id: AppRequestId::Sync(id), @@ -532,7 +529,7 @@ impl TestRig { pub fn pop_received_network_event) -> Option>( &mut self, - predicate_transform: F, + predicate_transform: &mut F, ) -> Result { self.drain_network_rx(); @@ -550,6 +547,17 @@ impl TestRig { } } + pub fn pop_received_network_events) -> Option>( + &mut self, + predicate_transform: &mut F, + ) -> Vec { + let mut events = vec![]; + while let Ok(ev) = self.pop_received_network_event(predicate_transform) { + events.push(ev) + } + events + } + /// Similar to `pop_received_network_events` but finds matching events without removing them. 
pub fn filter_received_network_events) -> Option>( &mut self, @@ -597,60 +605,6 @@ impl TestRig { } } - fn find_block_lookup_request( - &mut self, - _for_block: Hash256, - ) -> Result { - todo!(); - } - - #[track_caller] - fn expect_block_lookup_request(&mut self, for_block: Hash256) -> SingleLookupReqId { - self.find_block_lookup_request(for_block) - .unwrap_or_else(|e| panic!("Expected block request for {for_block:?}: {e}")) - } - - fn find_blob_lookup_request( - &mut self, - for_block: Hash256, - ) -> Result { - self.pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id: _, - request: RequestType::BlobsByRoot(request), - app_request_id: AppRequestId::Sync(SyncRequestId::BlobsByRoot(id)), - } if request - .blob_ids - .to_vec() - .iter() - .any(|r| r.block_root == for_block) => - { - todo!(); - } - _ => None, - }) - } - - #[track_caller] - fn expect_blob_lookup_request(&mut self, for_block: Hash256) -> SingleLookupReqId { - self.find_blob_lookup_request(for_block) - .unwrap_or_else(|e| panic!("Expected blob request for {for_block:?}: {e}")) - } - - #[track_caller] - fn expect_block_parent_request(&mut self, _for_block: Hash256) -> SingleLookupReqId { - todo!(); - } - - fn expect_no_requests_for(&mut self, block_root: Hash256) { - if let Ok(request) = self.find_block_lookup_request(block_root) { - panic!("Expected no block request for {block_root:?} found {request:?}"); - } - if let Ok(request) = self.find_blob_lookup_request(block_root) { - panic!("Expected no blob request for {block_root:?} found {request:?}"); - } - } - /// Retrieves an unknown number of requests for data columns of `block_root`. Because peer ENRs /// are random, and peer selection is random, the total number of batched requests is unknown. 
fn expect_data_columns_by_root_requests( @@ -658,36 +612,22 @@ impl TestRig { block_root: Hash256, count: usize, ) -> DCByRootIds { - let mut requests: DCByRootIds = vec![]; - loop { - let req = self - .pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id: _, - request: RequestType::DataColumnsByRoot(request), - app_request_id: - AppRequestId::Sync(id @ SyncRequestId::DataColumnsByRoot { .. }), - } => { - let matching = request - .data_column_ids - .iter() - .find(|id| id.block_root == block_root)?; - - let indices = matching.columns.iter().copied().collect(); - Some((*id, indices)) - } - _ => None, - }) - .unwrap_or_else(|e| { - panic!("Expected more DataColumnsByRoot requests for {block_root:?}: {e}") - }); - requests.push(req); - - // Should never infinite loop because sync does not send requests for 0 columns - if requests.iter().map(|r| r.1.len()).sum::() >= count { - return requests; + self.pop_received_network_events(&mut |ev| match ev { + NetworkMessage::SendRequest { + peer_id: _, + request: RequestType::DataColumnsByRoot(request), + app_request_id: AppRequestId::Sync(id @ SyncRequestId::DataColumnsByRoot { .. }), + } => { + let matching = request + .data_column_ids + .iter() + .find(|id| id.block_root == block_root)?; + + let indices = matching.columns.iter().copied().collect(); + Some((*id, indices)) } - } + _ => None, + }) } fn expect_only_data_columns_by_root_requests( @@ -829,13 +769,14 @@ impl TestRig { #[track_caller] pub fn expect_penalties(&mut self, expected_penalty_msg: &'static str) { - let all_penalties = self.filter_received_network_events(|ev| match ev { + let all_penalties = self.pop_received_network_events(&mut |ev| match ev { NetworkMessage::ReportPeer { peer_id, msg, .. 
} => Some((*peer_id, *msg)), _ => None, }); - if all_penalties - .iter() - .any(|(_, msg)| *msg != expected_penalty_msg) + if !all_penalties.is_empty() + && all_penalties + .iter() + .any(|(_, msg)| *msg != expected_penalty_msg) { panic!( "Expected penalties only of {expected_penalty_msg}, but found {all_penalties:?}" @@ -859,7 +800,7 @@ impl TestRig { #[track_caller] pub fn expect_penalty(&mut self, peer_id: PeerId, expect_penalty_msg: &'static str) { let penalty_msg = self - .pop_received_network_event(|ev| match ev { + .pop_received_network_event(&mut |ev| match ev { NetworkMessage::ReportPeer { peer_id: p_id, msg, .. } if p_id == &peer_id => Some(msg.to_owned()), @@ -965,16 +906,18 @@ impl TestRig { async fn single_lookup_from_attestation_setup(&mut self) -> (Hash256, PeerId) { let (head_root, head_slot) = self.create_unimported_parent_chain(1).await; - let peer_id = self.new_connected_peer(); + // Use a supernode so Fulu tests can pass without edits + let peer_id = self.new_connected_supernode_peer(); // Trigger the request self.trigger_unknown_block_from_attestation(head_root, peer_id); - self.expect_block_lookup_request(head_root); + self.assert_active_lookup(head_root); (head_root, peer_id) } - async fn parent_lookup_from_unknown_block_parent_setup(&mut self) -> (Hash256, PeerId) { + pub async fn parent_lookup_from_unknown_block_parent_setup(&mut self) -> (Hash256, PeerId) { let (head_root, head_slot) = self.create_unimported_parent_chain(2).await; - let peer_id = self.new_connected_peer(); + // Use a supernode so Fulu tests can pass without edits + let peer_id = self.new_connected_supernode_peer(); let head_block = self .blocks_by_root .get(&head_root) @@ -990,22 +933,34 @@ impl TestRig { } fn assert_head(&self, expected_head: Hash256) { - let head = self.harness.chain.head(); - assert_eq!( - head.head_block_root(), - expected_head, - "Not expected head root" - ); + let mut fork_choice = self.harness.chain.canonical_head.fork_choice_write_lock(); + let 
current_slot = fork_choice.fc_store().get_current_slot(); + let head_root = fork_choice + .get_head(current_slot, &self.harness.spec) + .expect("error computing head"); + assert_eq!(head_root, expected_head, "Not expected head root"); } - fn fetch_ancestor_chain(&self, mut block_root: Hash256) -> Vec { + fn fetch_unimported_ancestor_chain(&self, mut block_root: Hash256) -> Vec { let mut chain = vec![]; while let Some(block) = self.blocks_by_root.get(&block_root) { + if self + .harness + .chain + .block_is_known_to_fork_choice(&block_root) + { + break; + } + chain.push(block_root); block_root = block.parent_root(); } chain } + + pub fn complete_header_chain(&mut self) { + self.progress_until_no_events(filter().header_requests_only(), complete()); + } } #[test] @@ -1013,7 +968,7 @@ fn stable_rng() { let spec = types::MainnetEthSpec::default_spec(); let mut rng = XorShiftRng::from_seed([42; 16]); let (block, _) = - generate_rand_block_and_blobs::(ForkName::Base, NumBlobs::None, &mut rng, &spec); + generate_rand_block_and_blobs::(ForkName::Base, NumBlobs::None, None, &mut rng, &spec); assert_eq!( block.canonical_root(), Hash256::from_slice( @@ -1033,35 +988,50 @@ async fn test_single_block_lookup_happy_path() { // Tests that if a peer does not respond with a block, we downscore and retry the block only #[tokio::test] -async fn test_single_block_lookup_empty_response() { +async fn test_single_block_lookup_empty_response_until_failure() { let mut r = TestRig::test_setup(); - let (new_head_root, _) = r.single_lookup_from_attestation_setup().await; + let (_, _) = r.single_lookup_from_attestation_setup().await; r.progress_until_no_events(NO_FILTER, complete().return_no_blocks()); r.expect_penalties("NotEnoughResponsesReturned"); + // Test will loop until reaching max download attempts and remove the lookup + r.expect_no_active_lookups(); +} + +#[tokio::test] +async fn test_single_block_lookup_empty_response_some_times() { + let mut r = TestRig::test_setup(); + let 
(new_head_root, _) = r.single_lookup_from_attestation_setup().await; + r.progress_until_no_events(NO_FILTER, complete().return_no_blocks_n_times(3)); + r.expect_penalties("NotEnoughResponsesReturned"); r.expect_fully_complete_sync(new_head_root); } #[tokio::test] async fn test_single_block_lookup_wrong_response() { let mut r = TestRig::test_setup(); - let (new_head_root, _) = r.single_lookup_from_attestation_setup().await; + let (_, _) = r.single_lookup_from_attestation_setup().await; r.progress_until_no_events(NO_FILTER, complete().return_wrong_blocks()); r.expect_penalties("UnrequestedBlockRoot"); - r.expect_fully_complete_sync(new_head_root); + r.expect_no_active_lookups(); + // Test will loop until reaching max download attempts and remove the lookup + r.expect_no_active_lookups(); } #[tokio::test] -async fn test_single_block_lookup_failure() { +async fn test_single_block_lookup_rpc_error() { let mut r = TestRig::test_setup(); - let (new_head_root, _) = r.single_lookup_from_attestation_setup().await; + let (_, _) = r.single_lookup_from_attestation_setup().await; r.progress_until_no_events( NO_FILTER, complete().rpc_error(RPCError::UnsupportedProtocol), ); r.expect_no_penalties(); - r.expect_fully_complete_sync(new_head_root); + // Test will loop until reaching max download attempts and remove the lookup + r.expect_no_active_lookups(); } +// TODO(tree-sync): Current behaviour drops the lookup if there's no peers left +#[ignore] #[tokio::test] async fn test_single_block_lookup_peer_disconnected_then_rpc_error() { let mut r = TestRig::test_setup(); @@ -1083,42 +1053,41 @@ async fn test_parent_lookup_happy_path() { } #[tokio::test] -async fn test_parent_lookup_wrong_response() { +async fn test_parent_lookup_drop_parent() { let mut r = TestRig::test_setup(); - let (new_head_root, _) = r.parent_lookup_from_unknown_block_parent_setup().await; - r.progress_until_no_events(NO_FILTER, complete().return_wrong_blocks()); + let (head_root, _) = 
r.parent_lookup_from_unknown_block_parent_setup().await; + // Complete the header chain so the first block can start syncing + r.complete_header_chain(); + let chain = r.fetch_unimported_ancestor_chain(head_root); + // Return wrong blocks for the parent of `head_root` = chain[1] + r.progress_until_no_events( + filter().block_root(chain[1]), + complete().return_wrong_blocks(), + ); r.expect_penalties("UnrequestedBlockRoot"); - r.expect_fully_complete_sync(new_head_root); + // It should drop all lookups + r.expect_no_active_lookups(); } #[tokio::test] -async fn test_parent_lookup_rpc_failure() { +async fn test_parent_lookup_drop_child() { let mut r = TestRig::test_setup(); - let (new_head_root, _) = r.parent_lookup_from_unknown_block_parent_setup().await; + let (head_root, _) = r.parent_lookup_from_unknown_block_parent_setup().await; + // Complete the header chain so the first block can start syncing + r.complete_header_chain(); + let chain = r.fetch_unimported_ancestor_chain(head_root); + // Return wrong blocks for the parent of `head_root` = chain[1] r.progress_until_no_events( - NO_FILTER, - complete().rpc_error_response(RpcErrorResponse::ResourceUnavailable), + filter().block_root(chain[0]), + complete().return_wrong_blocks(), ); - r.expect_no_penalties(); - r.expect_fully_complete_sync(new_head_root); -} - -// TODO(tree-sync): test blacklist feature -#[tokio::test] -async fn test_parent_lookup_too_many_attempts() { - let mut r = TestRig::test_setup(); - let (new_head_root, _) = r.parent_lookup_from_unknown_block_parent_setup().await; - - for i in 1..=PARENT_FAIL_TOLERANCE { - r.progress_until_no_events( - NO_FILTER, - complete().rpc_error_response(RpcErrorResponse::ResourceUnavailable), - ); - } - - r.expect_no_active_lookups_empty_network(); + r.expect_penalties("UnrequestedBlockRoot"); + // It should only drop the newest lookup + r.assert_active_lookups(&[chain[1]]); } +// TODO(tree-sync): Current behaviour drops the lookup if there's no peers left +#[ignore] 
#[tokio::test] async fn test_lookup_peer_disconnected_no_peers_left_while_request() { let mut r = TestRig::test_setup(); @@ -1146,18 +1115,21 @@ async fn test_lookup_disconnection_peer_left() { async fn test_lookup_add_peers_to_parent() { let mut r = TestRig::test_setup(); let (head_root, _) = r.create_unimported_parent_chain(4).await; - let chain = r.fetch_ancestor_chain(head_root); + let chain = r.fetch_unimported_ancestor_chain(head_root); let peer_id = r.new_connected_peer(); r.trigger_unknown_block_from_attestation(head_root, peer_id); - r.progress_until_no_events(filter().header_requests_only(), complete()); + r.complete_header_chain(); let new_peers = (0..2).map(|_| r.new_connected_peer()).collect::>(); for peer in &new_peers { r.trigger_unknown_block_from_attestation(head_root, *peer); } + + let mut expected_peers = new_peers.clone(); + expected_peers.push(peer_id); for block in chain { // Parent has the original unknown parent event peer + new peer - r.assert_lookup_peers(block, &new_peers); + r.assert_lookup_peers(block, &expected_peers); } } @@ -1171,7 +1143,10 @@ fn sampling_happy_path() { let block_root = block.canonical_root(); r.trigger_sample_block(block_root, block.slot()); // Retrieve all outgoing sample requests for random column indexes - r.progress_until_no_events(NO_FILTER, complete()); + let sampling_ids = + r.expect_only_data_columns_by_root_requests(block_root, SAMPLING_REQUIRED_SUCCESSES); + // Resolve all of them one by one + r.complete_valid_sampling_column_requests(sampling_ids, data_columns); r.expect_clean_finished_sampling(); } @@ -1186,7 +1161,14 @@ fn sampling_with_retries() { let (block, data_columns) = r.rand_block_and_data_columns(); let block_root = block.canonical_root(); r.trigger_sample_block(block_root, block.slot()); - r.progress_until_no_events(NO_FILTER, complete().empty_sampling_response_once()); + // Retrieve all outgoing sample requests for random column indexes, and return empty responses + let sampling_ids = + 
r.expect_only_data_columns_by_root_requests(block_root, SAMPLING_REQUIRED_SUCCESSES); + r.return_empty_sampling_requests(sampling_ids); + // Expect retries for all of them, and resolve them + let sampling_ids = + r.expect_only_data_columns_by_root_requests(block_root, SAMPLING_REQUIRED_SUCCESSES); + r.complete_valid_sampling_column_requests(sampling_ids, data_columns); r.expect_clean_finished_sampling(); } @@ -1195,20 +1177,18 @@ fn sampling_avoid_retrying_same_peer() { let Some(mut r) = TestRig::test_setup_after_fulu() else { return; }; - let peer_1 = r.new_connected_supernode_peer(); + let peer_id_1 = r.new_connected_supernode_peer(); + let peer_id_2 = r.new_connected_supernode_peer(); let block_root = Hash256::random(); r.trigger_sample_block(block_root, Slot::new(0)); // Retrieve all outgoing sample requests for random column indexes, and return empty responses - r.progress_until_no_events( - filter().peer(peer_1), - complete().rpc_error(RPCError::Disconnected), - ); + let sampling_ids = + r.expect_only_data_columns_by_root_requests(block_root, SAMPLING_REQUIRED_SUCCESSES); + r.sampling_requests_failed(sampling_ids, peer_id_1, RPCError::Disconnected); // Should retry the other peer - let peer_2 = r.new_connected_supernode_peer(); - r.progress_until_no_events( - filter().peer(peer_2), - complete().rpc_error(RPCError::Disconnected), - ); + let sampling_ids = + r.expect_only_data_columns_by_root_requests(block_root, SAMPLING_REQUIRED_SUCCESSES); + r.sampling_requests_failed(sampling_ids, peer_id_2, RPCError::Disconnected); // Expect no more retries r.expect_empty_network(); } @@ -1224,14 +1204,18 @@ fn sampling_batch_requests() { r.trigger_sample_block(block_root, block.slot()); // Retrieve the sample request, which should be batched. 
- let (_, column_indexes) = r + let (sync_request_id, column_indexes) = r .expect_only_data_columns_by_root_requests(block_root, 1) .pop() .unwrap(); assert_eq!(column_indexes.len(), SAMPLING_REQUIRED_SUCCESSES); r.assert_sampling_request_ongoing(block_root, &column_indexes); - r.progress_until_no_events(NO_FILTER, complete()); + // Resolve the request. + r.complete_valid_sampling_column_requests( + vec![(sync_request_id, column_indexes.clone())], + data_columns, + ); r.expect_clean_finished_sampling(); } @@ -1256,14 +1240,18 @@ fn sampling_batch_requests_not_enough_responses_returned() { r.assert_sampling_request_ongoing(block_root, &column_indexes); // Split the indexes to simulate the case where the supernode doesn't have the requested column. - let missing_custody_index = *column_indexes.first().unwrap(); let (column_indexes_supernode_does_not_have, column_indexes_to_complete) = column_indexes.split_at(1); // Complete the requests but only partially, so a NotEnoughResponsesReturned error occurs. - r.progress_until_no_events( - NO_FILTER, - complete().custody_failure_at_index(missing_custody_index), + let data_columns_to_complete = data_columns + .iter() + .filter(|d| column_indexes_to_complete.contains(&d.index)) + .cloned() + .collect::>(); + r.complete_data_columns_by_root_request( + (sync_request_id, column_indexes.clone()), + &data_columns_to_complete, ); // The request status should be set to NoPeers since the supernode, the only peer, returned not enough responses. 
diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 1211036f227..36c8b0801e1 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -80,6 +80,7 @@ type DataColumnsByRootRequestData = (DataColumnsByRootRequestId, PeerId, DataCol pub struct RequestFilter { peer: Option, epoch: Option, + block_root: Option, column_index: Option, header_requests_only: bool, } @@ -87,6 +88,7 @@ pub struct RequestFilter { pub const NO_FILTER: RequestFilter = RequestFilter { peer: None, epoch: None, + block_root: None, column_index: None, header_requests_only: false, }; @@ -102,6 +104,11 @@ impl RequestFilter { self } + pub fn block_root(mut self, block_root: Hash256) -> Self { + self.block_root = Some(block_root); + self + } + pub fn column_index(mut self, index: u64) -> Self { self.column_index = Some(index); self @@ -147,7 +154,7 @@ impl RequestFilter { fn matches_blocks_by_root( &self, peer: &PeerId, - _req: &BlocksByRootRequest, + req: &BlocksByRootRequest, id: &BlocksByRootRequestId, ) -> bool { if self.header_requests_only { @@ -156,6 +163,12 @@ impl RequestFilter { } } + if let Some(block_root) = self.block_root { + if !req.block_roots().iter().any(|b| *b == block_root) { + return false; + } + } + self.matches_peer(peer) } @@ -209,7 +222,7 @@ pub struct CompleteConfig { empty_sampling_response_once: bool, stop_at_block: Option, return_wrong_blocks: bool, - return_no_blocks: bool, + return_no_blocks_n_times: usize, process_error: bool, } @@ -244,7 +257,11 @@ impl CompleteConfig { } pub fn return_no_blocks(mut self) -> Self { - self.return_no_blocks = true; + self.return_no_blocks_n_times(usize::MAX) + } + + pub fn return_no_blocks_n_times(mut self, n_times: usize) -> Self { + self.return_no_blocks_n_times = n_times; self } } @@ -258,7 +275,7 @@ pub fn complete() -> CompleteConfig { empty_sampling_response_once: false, stop_at_block: None, return_wrong_blocks: false, - 
return_no_blocks: false, + return_no_blocks_n_times: 0, process_error: false, } } @@ -541,7 +558,7 @@ impl TestRig { fn complete_blocks_by_root_request( &mut self, request: BlocksByRootRequestData, - config: &CompleteConfig, + config: &mut CompleteConfig, ) { let (req_id, peer, req) = request; if let Some(error) = &config.rpc_error { @@ -549,7 +566,8 @@ impl TestRig { return; } - if config.return_no_blocks { + if config.return_no_blocks_n_times > 0 { + config.return_no_blocks_n_times -= 1; self.send_blocks_by_root_response(req_id, peer, &[]); return; } @@ -662,7 +680,7 @@ impl TestRig { .map(|id| { let block = self .blocks_by_root - .get(&id.0) + .get(&id.block_root) .cloned() .expect("unknown block"); (id, block) @@ -685,7 +703,7 @@ impl TestRig { .on_block( block.slot(), block.message(), - id.0, + id.block_root, Duration::from_secs(0), &head_state, PayloadVerificationStatus::Verified, @@ -704,19 +722,20 @@ impl TestRig { pub fn progress_until_no_events( &mut self, request_filter: RequestFilter, - complete_config: CompleteConfig, + mut complete_config: CompleteConfig, ) { + self.log(&format!("progress until no events {request_filter:?}")); loop { - if let Ok(request) = - self.pop_received_network_event(|ev| request_filter.blocks_by_root_requests(ev)) + if let Ok(request) = self + .pop_received_network_event(&mut |ev| request_filter.blocks_by_root_requests(ev)) { - self.complete_blocks_by_root_request(request, &complete_config); + self.complete_blocks_by_root_request(request, &mut complete_config); continue; } - if let Ok(request) = self - .pop_received_network_event(|ev| request_filter.data_columns_by_root_requests(ev)) - { + if let Ok(request) = self.pop_received_network_event(&mut |ev| { + request_filter.data_columns_by_root_requests(ev) + }) { self.complete_data_columns_by_root_request_range_sync(request, &complete_config); continue; } @@ -817,6 +836,15 @@ fn build_rpc_block( } } +fn sync_info_with_head_root(head_root: Hash256) -> SyncInfo { + SyncInfo { + 
head_slot: Slot::new(1), + head_root, + finalized_epoch: Epoch::new(0), + finalized_root: Hash256::ZERO, + } +} + /// To attempt to finalize the peer's status finalized checkpoint we synced to its finalized epoch + /// 2 epochs + 1 slot. const EXTRA_SYNCED_EPOCHS: u64 = 2 + 1; @@ -826,36 +854,33 @@ const EXTRA_SYNCED_EPOCHS: u64 = 2 + 1; // - 1 supernode // - perfectly distributed peer ids -#[test] -fn finalized_sync_not_enough_custody_peers_on_start_supernode_only() { +#[tokio::test] +async fn finalized_sync_not_enough_custody_peers_on_start_supernode_only() { finalized_sync_not_enough_custody_peers_on_start(Config { peers: PeersConfig::SupernodeOnly, }); } -#[test] -fn finalized_sync_not_enough_custody_peers_on_start_supernode_and_random() { +#[tokio::test] +async fn finalized_sync_not_enough_custody_peers_on_start_supernode_and_random() { finalized_sync_not_enough_custody_peers_on_start(Config { peers: PeersConfig::SupernodeAndRandom, }); } -fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { +async fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { let mut r = TestRig::test_setup_as_supernode(); // Only run post-PeerDAS if !r.fork_name.fulu_enabled() { return; } - let advanced_epochs: u64 = 2; - let remote_info = r.finalized_remote_info_advanced_by(advanced_epochs.into()); - - // Unikely that the single peer we added has enough columns for us. Tests are determinstic and - // this error should never be hit - r.add_connected_sync_peer_not_supernode(remote_info.clone()); + let (head_root, head_slot) = r.create_unimported_parent_chain(2).await; + let remote_info = sync_info_with_head_root(head_root); + r.add_sync_peer(false, remote_info.clone()); - // The SyncingChain has a single peer, so it can issue blocks_by_range requests. However, it - // doesn't have enough peers to cover all columns + // We are a supernode, and just added a single non-supernode peer. 
The custody by root request + // will stall as many columns have zero peers. r.progress_until_no_events(NO_FILTER, complete()); r.expect_no_active_rpc_requests(); @@ -878,19 +903,21 @@ fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { // wihtout any information. We don't know what batch it is for. } -#[test] -fn finalized_sync_single_custody_peer_failure() { +#[tokio::test] +async fn finalized_sync_single_custody_peer_failure() { let mut r = TestRig::test_setup(); // Only run post-PeerDAS if !r.fork_name.fulu_enabled() { return; } - let advanced_epochs: u64 = 2; - let remote_info = r.finalized_remote_info_advanced_by(advanced_epochs.into()); - let column_index_to_fail = r.our_custody_indices().first().copied().unwrap(); + let (head_root, head_slot) = r.create_unimported_parent_chain(2).await; + let peer_1 = r.new_connected_supernode_peer(); + // Trigger the request + r.trigger_unknown_block_from_attestation(head_root, peer_1); - r.add_sync_peer(true, remote_info.clone()); + let column_index_to_fail = r.our_custody_indices().first().copied().unwrap(); + r.complete_header_chain(); // Progress all blocks_by_range and columns_by_range requests but respond empty for a single // column index @@ -907,13 +934,10 @@ fn finalized_sync_single_custody_peer_failure() { r.expect_active_block_components_requests_on_custody_step(); // After adding a new peer we will try to fetch from it - r.add_sync_peer(true, remote_info.clone()); - r.progress_until_no_events( - // Find the requests first to assert that this is the only request that exists - filter().column_index(column_index_to_fail), - // complete this one request without the custody failure now - complete(), - ); + let peer_2 = r.new_connected_supernode_peer(); + r.trigger_unknown_block_from_attestation(head_root, peer_2); + // complete this one request without the custody failure now + r.progress_until_no_events(NO_FILTER, complete()); r.expect_no_active_rpc_requests(); 
r.expect_no_active_block_components_requests(); From 397de5aaea4609cf726659792355b745bad26c59 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sun, 22 Jun 2025 22:15:33 +0200 Subject: [PATCH 38/66] Remove unnecessary async from tests --- beacon_node/network/src/sync/tests/lookups.rs | 80 +++++++++---------- beacon_node/network/src/sync/tests/range.rs | 68 +++++----------- 2 files changed, 58 insertions(+), 90 deletions(-) diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index b3e7334a0ed..f428def6004 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -904,8 +904,8 @@ impl TestRig { )); } - async fn single_lookup_from_attestation_setup(&mut self) -> (Hash256, PeerId) { - let (head_root, head_slot) = self.create_unimported_parent_chain(1).await; + fn single_lookup_from_attestation_setup(&mut self) -> (Hash256, PeerId) { + let (head_root, head_slot) = self.create_unimported_parent_chain(1); // Use a supernode so Fulu tests can pass without edits let peer_id = self.new_connected_supernode_peer(); // Trigger the request @@ -914,8 +914,8 @@ impl TestRig { (head_root, peer_id) } - pub async fn parent_lookup_from_unknown_block_parent_setup(&mut self) -> (Hash256, PeerId) { - let (head_root, head_slot) = self.create_unimported_parent_chain(2).await; + pub fn parent_lookup_from_unknown_block_parent_setup(&mut self) -> (Hash256, PeerId) { + let (head_root, head_slot) = self.create_unimported_parent_chain(2); // Use a supernode so Fulu tests can pass without edits let peer_id = self.new_connected_supernode_peer(); let head_block = self @@ -979,37 +979,37 @@ fn stable_rng() { ); } -#[tokio::test] -async fn test_single_block_lookup_happy_path() { +#[test] +fn test_single_block_lookup_happy_path() { let mut r = TestRig::test_setup(); - let (new_head_root, _) = r.single_lookup_from_attestation_setup().await; + let 
(new_head_root, _) = r.single_lookup_from_attestation_setup(); r.expect_fully_complete_sync(new_head_root); } // Tests that if a peer does not respond with a block, we downscore and retry the block only -#[tokio::test] -async fn test_single_block_lookup_empty_response_until_failure() { +#[test] +fn test_single_block_lookup_empty_response_until_failure() { let mut r = TestRig::test_setup(); - let (_, _) = r.single_lookup_from_attestation_setup().await; + let (_, _) = r.single_lookup_from_attestation_setup(); r.progress_until_no_events(NO_FILTER, complete().return_no_blocks()); r.expect_penalties("NotEnoughResponsesReturned"); // Test will loop until reaching max download attempts and remove the lookup r.expect_no_active_lookups(); } -#[tokio::test] -async fn test_single_block_lookup_empty_response_some_times() { +#[test] +fn test_single_block_lookup_empty_response_some_times() { let mut r = TestRig::test_setup(); - let (new_head_root, _) = r.single_lookup_from_attestation_setup().await; + let (new_head_root, _) = r.single_lookup_from_attestation_setup(); r.progress_until_no_events(NO_FILTER, complete().return_no_blocks_n_times(3)); r.expect_penalties("NotEnoughResponsesReturned"); r.expect_fully_complete_sync(new_head_root); } -#[tokio::test] -async fn test_single_block_lookup_wrong_response() { +#[test] +fn test_single_block_lookup_wrong_response() { let mut r = TestRig::test_setup(); - let (_, _) = r.single_lookup_from_attestation_setup().await; + let (_, _) = r.single_lookup_from_attestation_setup(); r.progress_until_no_events(NO_FILTER, complete().return_wrong_blocks()); r.expect_penalties("UnrequestedBlockRoot"); r.expect_no_active_lookups(); @@ -1017,10 +1017,10 @@ async fn test_single_block_lookup_wrong_response() { r.expect_no_active_lookups(); } -#[tokio::test] -async fn test_single_block_lookup_rpc_error() { +#[test] +fn test_single_block_lookup_rpc_error() { let mut r = TestRig::test_setup(); - let (_, _) = r.single_lookup_from_attestation_setup().await; 
+ let (_, _) = r.single_lookup_from_attestation_setup(); r.progress_until_no_events( NO_FILTER, complete().rpc_error(RPCError::UnsupportedProtocol), @@ -1032,10 +1032,10 @@ async fn test_single_block_lookup_rpc_error() { // TODO(tree-sync): Current behaviour drops the lookup if there's no peers left #[ignore] -#[tokio::test] -async fn test_single_block_lookup_peer_disconnected_then_rpc_error() { +#[test] +fn test_single_block_lookup_peer_disconnected_then_rpc_error() { let mut r = TestRig::test_setup(); - let (new_head_root, peer_id) = r.single_lookup_from_attestation_setup().await; + let (new_head_root, peer_id) = r.single_lookup_from_attestation_setup(); // The peer disconnect event reaches sync before the rpc error. r.peer_disconnected(peer_id); // The lookup is not removed as it can still potentially make progress. @@ -1045,17 +1045,17 @@ async fn test_single_block_lookup_peer_disconnected_then_rpc_error() { r.expect_fully_complete_sync(new_head_root); } -#[tokio::test] -async fn test_parent_lookup_happy_path() { +#[test] +fn test_parent_lookup_happy_path() { let mut r = TestRig::test_setup(); - let (new_head_root, _) = r.parent_lookup_from_unknown_block_parent_setup().await; + let (new_head_root, _) = r.parent_lookup_from_unknown_block_parent_setup(); r.expect_fully_complete_sync(new_head_root); } -#[tokio::test] -async fn test_parent_lookup_drop_parent() { +#[test] +fn test_parent_lookup_drop_parent() { let mut r = TestRig::test_setup(); - let (head_root, _) = r.parent_lookup_from_unknown_block_parent_setup().await; + let (head_root, _) = r.parent_lookup_from_unknown_block_parent_setup(); // Complete the header chain so the first block can start syncing r.complete_header_chain(); let chain = r.fetch_unimported_ancestor_chain(head_root); @@ -1069,10 +1069,10 @@ async fn test_parent_lookup_drop_parent() { r.expect_no_active_lookups(); } -#[tokio::test] -async fn test_parent_lookup_drop_child() { +#[test] +fn test_parent_lookup_drop_child() { let mut r = 
TestRig::test_setup(); - let (head_root, _) = r.parent_lookup_from_unknown_block_parent_setup().await; + let (head_root, _) = r.parent_lookup_from_unknown_block_parent_setup(); // Complete the header chain so the first block can start syncing r.complete_header_chain(); let chain = r.fetch_unimported_ancestor_chain(head_root); @@ -1088,10 +1088,10 @@ async fn test_parent_lookup_drop_child() { // TODO(tree-sync): Current behaviour drops the lookup if there's no peers left #[ignore] -#[tokio::test] -async fn test_lookup_peer_disconnected_no_peers_left_while_request() { +#[test] +fn test_lookup_peer_disconnected_no_peers_left_while_request() { let mut r = TestRig::test_setup(); - let (head_root, peer_id) = r.single_lookup_from_attestation_setup().await; + let (head_root, peer_id) = r.single_lookup_from_attestation_setup(); r.peer_disconnected(peer_id); r.rpc_error_all_active_requests(peer_id); // Erroring all rpc requests and disconnecting the peer shouldn't remove the requests @@ -1099,10 +1099,10 @@ async fn test_lookup_peer_disconnected_no_peers_left_while_request() { r.assert_active_lookup(head_root); } -#[tokio::test] -async fn test_lookup_disconnection_peer_left() { +#[test] +fn test_lookup_disconnection_peer_left() { let mut r = TestRig::test_setup(); - let (head_root, peer_1) = r.single_lookup_from_attestation_setup().await; + let (head_root, peer_1) = r.single_lookup_from_attestation_setup(); let peer_2 = r.new_connected_peer(); r.trigger_unknown_block_from_attestation(head_root, peer_2); // Disconnect the first peer only, which is the one handling the request @@ -1111,10 +1111,10 @@ async fn test_lookup_disconnection_peer_left() { r.assert_active_lookup(head_root); } -#[tokio::test] -async fn test_lookup_add_peers_to_parent() { +#[test] +fn test_lookup_add_peers_to_parent() { let mut r = TestRig::test_setup(); - let (head_root, _) = r.create_unimported_parent_chain(4).await; + let (head_root, _) = r.create_unimported_parent_chain(4); let chain = 
r.fetch_unimported_ancestor_chain(head_root); let peer_id = r.new_connected_peer(); r.trigger_unknown_block_from_attestation(head_root, peer_id); diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 36c8b0801e1..b0ab1d39068 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -432,48 +432,16 @@ impl TestRig { SignedBeaconBlock::from_block(block, Signature::empty()) } - pub async fn create_unimported_parent_chain(&mut self, block_count: usize) -> (Hash256, Slot) { + pub fn create_unimported_parent_chain(&mut self, block_count: usize) -> (Hash256, Slot) { self.log(&format!( "Creating unimported chain of {block_count} blocks" )); - let mut r = TestRig::test_setup(); - - r.harness.advance_slot(); - let head_root = r - .harness - .extend_chain( - block_count, - BlockStrategy::OnCanonicalHead, - AttestationStrategy::AllValidators, - ) - .await; - - let store = &r.harness.chain.store; - let head_block = store.get_full_block(&head_root).unwrap().unwrap(); - - let mut target_block_root = head_root; - while let Some(block) = store.get_full_block(&target_block_root).unwrap() { - self.log(&format!( - "Adding block {target_block_root:?} slot {} to known blocks", - block.slot() - )); - let parent_root = block.parent_root(); - self.blocks_by_root.insert(target_block_root, block.into()); - if parent_root == Hash256::ZERO { - break; - } - target_block_root = parent_root; - } - - (head_root, head_block.slot()) - } - - fn create_not_rooted_parent_chain(&mut self) -> (Hash256, Slot) { let current_head = self.harness.chain.head(); let mut parent_root = current_head.head_block_root(); - let mut slot = current_head.head_slot(); - for _ in 0..64 { + let mut prev_slot = current_head.head_slot(); + for _ in 0..block_count { + let slot = prev_slot + Slot::new(1); let mut block = self.zero_block_at_slot(slot, true); *block.message_mut().parent_root_mut() = parent_root; 
*block.message_mut().slot_mut() = slot; @@ -481,9 +449,9 @@ impl TestRig { self.blocks_by_root.insert(block_root, block.into()); parent_root = block_root; - slot = slot + Slot::new(1); + prev_slot = slot; } - (parent_root, slot) + (parent_root, prev_slot) } fn send_rpc_error(&mut self, id: SyncRequestId, peer_id: PeerId, error: RPCError) { @@ -854,28 +822,28 @@ const EXTRA_SYNCED_EPOCHS: u64 = 2 + 1; // - 1 supernode // - perfectly distributed peer ids -#[tokio::test] -async fn finalized_sync_not_enough_custody_peers_on_start_supernode_only() { +#[test] +fn finalized_sync_not_enough_custody_peers_on_start_supernode_only() { finalized_sync_not_enough_custody_peers_on_start(Config { peers: PeersConfig::SupernodeOnly, }); } -#[tokio::test] -async fn finalized_sync_not_enough_custody_peers_on_start_supernode_and_random() { +#[test] +fn finalized_sync_not_enough_custody_peers_on_start_supernode_and_random() { finalized_sync_not_enough_custody_peers_on_start(Config { peers: PeersConfig::SupernodeAndRandom, }); } -async fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { +fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { let mut r = TestRig::test_setup_as_supernode(); // Only run post-PeerDAS if !r.fork_name.fulu_enabled() { return; } - let (head_root, head_slot) = r.create_unimported_parent_chain(2).await; + let (head_root, head_slot) = r.create_unimported_parent_chain(2); let remote_info = sync_info_with_head_root(head_root); r.add_sync_peer(false, remote_info.clone()); @@ -903,15 +871,15 @@ async fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { // wihtout any information. We don't know what batch it is for. 
} -#[tokio::test] -async fn finalized_sync_single_custody_peer_failure() { +#[test] +fn finalized_sync_single_custody_peer_failure() { let mut r = TestRig::test_setup(); // Only run post-PeerDAS if !r.fork_name.fulu_enabled() { return; } - let (head_root, head_slot) = r.create_unimported_parent_chain(2).await; + let (head_root, head_slot) = r.create_unimported_parent_chain(2); let peer_1 = r.new_connected_supernode_peer(); // Trigger the request r.trigger_unknown_block_from_attestation(head_root, peer_1); @@ -943,10 +911,10 @@ async fn finalized_sync_single_custody_peer_failure() { r.expect_no_active_block_components_requests(); } -#[tokio::test] -async fn tree_sync_happy_path() { +#[test] +fn tree_sync_happy_path() { let mut r = TestRig::test_setup(); - let (head_root, head_slot) = r.create_unimported_parent_chain(8).await; + let (head_root, head_slot) = r.create_unimported_parent_chain(8); let remote_info = SyncInfo { finalized_epoch: Epoch::new(0), finalized_root: Hash256::ZERO, From ed4171bfbb5f4d827ede886fa31d2172effc95e0 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sun, 22 Jun 2025 22:29:25 +0200 Subject: [PATCH 39/66] pass deneb tests --- beacon_node/network/src/sync/tests/range.rs | 120 +++++++++++++++++++- 1 file changed, 117 insertions(+), 3 deletions(-) diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index b0ab1d39068..c1e5c08506a 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -26,9 +26,9 @@ use lighthouse_network::{PeerId, SyncInfo}; use std::collections::HashSet; use std::time::Duration; use types::{ - BeaconBlock, BlobSidecarList, BlockImportSource, ColumnIndex, DataColumnSidecar, Epoch, - EthSpec, Hash256, KzgCommitment, MinimalEthSpec as E, Signature, SignedBeaconBlock, - SignedBeaconBlockHash, Slot, VariableList, + BeaconBlock, Blob, BlobSidecar, BlobSidecarList, BlockImportSource, 
ColumnIndex, + DataColumnSidecar, Epoch, EthSpec, Hash256, KzgCommitment, KzgProof, MinimalEthSpec as E, + Signature, SignedBeaconBlock, SignedBeaconBlockHash, Slot, VariableList, }; const D: Duration = Duration::new(0, 0); @@ -66,6 +66,8 @@ struct Config { type BlocksByRootRequestData = (BlocksByRootRequestId, PeerId, BlocksByRootRequest); +type BlobsByRootRequestData = (BlobsByRootRequestId, PeerId, BlobsByRootRequest); + type DataColumnsByRootRequestData = (DataColumnsByRootRequestId, PeerId, DataColumnsByRootRequest); /// Sync tests are usually written in the form: @@ -135,6 +137,20 @@ impl RequestFilter { } } + fn blobs_by_root_requests( + &self, + ev: &NetworkMessage, + ) -> Option { + match ev { + NetworkMessage::SendRequest { + peer_id, + request: RequestType::BlobsByRoot(req), + app_request_id: AppRequestId::Sync(SyncRequestId::BlobsByRoot(id)), + } if self.matches_blobs_by_root(peer_id, req) => Some((*id, *peer_id, req.clone())), + _ => None, + } + } + fn data_columns_by_root_requests( &self, ev: &NetworkMessage, @@ -172,6 +188,20 @@ impl RequestFilter { self.matches_peer(peer) } + fn matches_blobs_by_root(&self, peer: &PeerId, req: &BlobsByRootRequest) -> bool { + if self.header_requests_only { + return false; + } + + if let Some(block_root) = self.block_root { + if !req.blob_ids.iter().any(|id| id.block_root == block_root) { + return false; + } + } + + self.matches_peer(peer) + } + fn matches_data_columns_by_root(&self, peer: &PeerId, req: &DataColumnsByRootRequest) -> bool { if self.header_requests_only { return false; @@ -492,6 +522,37 @@ impl TestRig { }); } + fn send_blobs_by_root_response( + &mut self, + id: BlobsByRootRequestId, + peer_id: PeerId, + blobs: &[Arc>], + ) { + let mut ids = blobs + .iter() + .map(|d| (d.slot().as_u64(), d.index)) + .collect::>(); + ids.sort_unstable(); + self.log(&format!( + "Completing BlobsByRoot request {id} to {peer_id} with data_columns {ids:?}" + )); + + for blob in blobs { + 
self.send_sync_message(SyncMessage::RpcBlob { + sync_request_id: SyncRequestId::BlobsByRoot(id), + peer_id, + blob_sidecar: Some(blob.clone()), + seen_timestamp: D, + }); + } + self.send_sync_message(SyncMessage::RpcBlob { + sync_request_id: SyncRequestId::BlobsByRoot(id), + peer_id, + blob_sidecar: None, + seen_timestamp: D, + }); + } + fn send_data_columns_by_root_response( &mut self, id: DataColumnsByRootRequestId, @@ -558,6 +619,52 @@ impl TestRig { self.send_blocks_by_root_response(req_id, peer, &blocks); } + fn complete_blobs_by_root_request_range_sync( + &mut self, + (id, peer_id, req): BlobsByRootRequestData, + complete_config: &CompleteConfig, + ) { + let blobs = req + .blob_ids + .iter() + .flat_map(|blob_id| { + let block = self + .blocks_by_root + .get(&blob_id.block_root) + .expect("Test consumer requested unknown block") + .clone(); + + let kzg_commitment_inclusion_proof = block + .message() + .body() + .kzg_commitment_merkle_proof(blob_id.index as usize) + .unwrap(); + let kzg_commitment = block + .message() + .body() + .blob_kzg_commitments() + .unwrap() + .get(blob_id.index as usize) + .unwrap() + .clone(); + let signed_block_header = block.signed_block_header(); + + // We need to produce a DataColumn with valid inclusion proof, but can + // be with random KZG proof and data as we won't send it for processing + Some(Arc::new(BlobSidecar { + index: blob_id.index, + blob: Blob::::default(), + kzg_commitment, + kzg_proof: KzgProof::empty(), + signed_block_header, + kzg_commitment_inclusion_proof, + })) + }) + .collect::>(); + + self.send_blobs_by_root_response(id, peer_id, &blobs); + } + fn complete_data_columns_by_root_request_range_sync( &mut self, (id, peer_id, req): DataColumnsByRootRequestData, @@ -701,6 +808,13 @@ impl TestRig { continue; } + if let Ok(request) = + self.pop_received_network_event(&mut |ev| request_filter.blobs_by_root_requests(ev)) + { + self.complete_blobs_by_root_request_range_sync(request, &complete_config); + continue; + } + 
if let Ok(request) = self.pop_received_network_event(&mut |ev| { request_filter.data_columns_by_root_requests(ev) }) { From bf0015cc9c60809ae3f97a2f26588ecfa4f22b61 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sun, 22 Jun 2025 22:47:33 +0200 Subject: [PATCH 40/66] WIP processor tests --- .../src/network_beacon_processor/tests.rs | 85 ++----------------- 1 file changed, 6 insertions(+), 79 deletions(-) diff --git a/beacon_node/network/src/network_beacon_processor/tests.rs b/beacon_node/network/src/network_beacon_processor/tests.rs index f6a1069a7f4..9fe556781e4 100644 --- a/beacon_node/network/src/network_beacon_processor/tests.rs +++ b/beacon_node/network/src/network_beacon_processor/tests.rs @@ -22,6 +22,7 @@ use gossipsub::MessageAcceptance; use itertools::Itertools; use lighthouse_network::rpc::methods::{BlobsByRangeRequest, MetaDataV3}; use lighthouse_network::rpc::InboundRequestId; +use lighthouse_network::service::api_types::HeaderLookupId; use lighthouse_network::{ discv5::enr::{self, CombinedKey}, rpc::methods::{MetaData, MetaDataV2}, @@ -374,54 +375,13 @@ impl TestRig { pub fn enqueue_rpc_block(&self) { let block_root = self.next_block.canonical_root(); self.network_beacon_processor - .send_rpc_beacon_block( - block_root, - RpcBlock::new_without_blobs(Some(block_root), self.next_block.clone()), - std::time::Duration::default(), - BlockProcessType::SingleBlock { id: 0 }, - ) - .unwrap(); - } - - pub fn enqueue_single_lookup_rpc_block(&self) { - let block_root = self.next_block.canonical_root(); - self.network_beacon_processor - .send_rpc_beacon_block( - block_root, - RpcBlock::new_without_blobs(Some(block_root), self.next_block.clone()), - std::time::Duration::default(), - BlockProcessType::SingleBlock { id: 1 }, + .send_chain_segment( + ChainSegmentProcessId::ForwardSync(HeaderLookupId { id: 0, block_root }), + vec![], ) .unwrap(); } - pub fn enqueue_single_lookup_rpc_blobs(&self) { - if let Some(blobs) = 
self.next_blobs.clone() { - let blobs = FixedBlobSidecarList::new(blobs.into_iter().map(Some).collect::>()); - self.network_beacon_processor - .send_rpc_blobs( - self.next_block.canonical_root(), - blobs, - std::time::Duration::default(), - BlockProcessType::SingleBlob { id: 1 }, - ) - .unwrap(); - } - } - - pub fn enqueue_single_lookup_rpc_data_columns(&self) { - if let Some(data_columns) = self.next_data_columns.clone() { - self.network_beacon_processor - .send_rpc_custody_columns( - self.next_block.canonical_root(), - data_columns, - Duration::default(), - BlockProcessType::SingleCustodyColumn(1), - ) - .unwrap(); - } - } - pub fn enqueue_blobs_by_range_request(&self, count: u64) { self.network_beacon_processor .send_blobs_by_range_request( @@ -437,10 +397,7 @@ impl TestRig { pub fn enqueue_backfill_batch(&self) { self.network_beacon_processor - .send_chain_segment( - ChainSegmentProcessId::BackSyncBatchId(Epoch::default()), - Vec::default(), - ) + .send_chain_segment(ChainSegmentProcessId::BackfillSync(0), Vec::default()) .unwrap(); } @@ -945,14 +902,6 @@ async fn attestation_to_unknown_block_processed(import_method: BlockImportMethod BlockImportMethod::Rpc => { rig.enqueue_rpc_block(); events.push(WorkType::RpcBlock); - if num_blobs > 0 { - rig.enqueue_single_lookup_rpc_blobs(); - events.push(WorkType::RpcBlobs); - } - if num_data_columns > 0 { - rig.enqueue_single_lookup_rpc_data_columns(); - events.push(WorkType::RpcCustodyColumn); - } } }; @@ -1031,14 +980,6 @@ async fn aggregate_attestation_to_unknown_block(import_method: BlockImportMethod BlockImportMethod::Rpc => { rig.enqueue_rpc_block(); events.push(WorkType::RpcBlock); - if num_blobs > 0 { - rig.enqueue_single_lookup_rpc_blobs(); - events.push(WorkType::RpcBlobs); - } - if num_data_columns > 0 { - rig.enqueue_single_lookup_rpc_data_columns(); - events.push(WorkType::RpcCustodyColumn); - } } }; @@ -1219,24 +1160,10 @@ async fn test_rpc_block_reprocessing() { let next_block_root = 
rig.next_block.canonical_root(); // Insert the next block into the duplicate cache manually let handle = rig.duplicate_cache.check_and_insert(next_block_root); - rig.enqueue_single_lookup_rpc_block(); + rig.enqueue_rpc_block(); rig.assert_event_journal_completes(&[WorkType::RpcBlock]) .await; - let num_blobs = rig.next_blobs.as_ref().map(|b| b.len()).unwrap_or(0); - if num_blobs > 0 { - rig.enqueue_single_lookup_rpc_blobs(); - rig.assert_event_journal_completes(&[WorkType::RpcBlobs]) - .await; - } - - let num_data_columns = rig.next_data_columns.as_ref().map(|c| c.len()).unwrap_or(0); - if num_data_columns > 0 { - rig.enqueue_single_lookup_rpc_data_columns(); - rig.assert_event_journal_completes(&[WorkType::RpcCustodyColumn]) - .await; - } - // next_block shouldn't be processed since it couldn't get the // duplicate cache handle assert_ne!(next_block_root, rig.head_root()); From 10953a8b42aa2dcfd8a02b544fbbfffee5b23fbe Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 23 Jun 2025 03:07:57 +0200 Subject: [PATCH 41/66] Remove artifact file --- 0w | 956 ------------------------------------------------------------- 1 file changed, 956 deletions(-) delete mode 100644 0w diff --git a/0w b/0w deleted file mode 100644 index abc21c03d37..00000000000 --- a/0w +++ /dev/null @@ -1,956 +0,0 @@ -use super::*; -use crate::network_beacon_processor::ChainSegmentProcessId; -use crate::status::ToStatusMessage; -use crate::sync::manager::{BlockProcessingResult, SLOT_IMPORT_TOLERANCE}; -use crate::sync::network_context::{BlockComponentsByRootRequestStep, RangeRequestId}; -use crate::sync::tests::lookups::TestOptions; -use crate::sync::BatchProcessResult; -use crate::sync::SyncMessage; -use beacon_chain::data_column_verification::CustodyDataColumn; -use beacon_chain::test_utils::{AttestationStrategy, BlockStrategy}; -use beacon_chain::{ - block_verification_types::RpcBlock, EngineState, NotifyExecutionLayer, - PayloadVerificationStatus, -}; 
-use beacon_processor::WorkType; -use lighthouse_network::rpc::methods::{ - BlobsByRootRequest, BlocksByRootRequest, DataColumnsByRootRequest, -}; -use lighthouse_network::rpc::{RPCError, RequestType, RpcErrorResponse, StatusMessage}; -use lighthouse_network::service::api_types::{ - AppRequestId, BlobsByRootRequestId, BlocksByRootRequestId, BlocksByRootRequester, - ComponentsByRootRequestId, DataColumnsByRootRequestId, HeaderLookupId, SyncRequestId, -}; -use lighthouse_network::types::SyncState; -use lighthouse_network::{PeerId, SyncInfo}; -use std::collections::HashSet; -use std::time::Duration; -use types::{ - BeaconBlock, BlobSidecarList, BlockImportSource, ColumnIndex, DataColumnSidecar, Epoch, - EthSpec, Hash256, KzgCommitment, MinimalEthSpec as E, Signature, SignedBeaconBlock, - SignedBeaconBlockHash, Slot, VariableList, -}; - -const D: Duration = Duration::new(0, 0); - -pub(crate) enum DataSidecars { - Blobs(BlobSidecarList), - DataColumns(Vec>), -} - -enum ByRootDataRequestIds { - PreDeneb, - PrePeerDAS(BlobsByRootRequestId, PeerId, BlobsByRootRequest), - PostPeerDAS(Vec<(DataColumnsByRootRequestId, PeerId, DataColumnsByRootRequest)>), -} - -impl ByRootDataRequestIds { - /// If there's a single active request, returns its peer, else panics - fn peer(&self) -> PeerId { - match self { - Self::PreDeneb => panic!("no requests PreDeneb"), - Self::PrePeerDAS(_, peer, _) => *peer, - Self::PostPeerDAS(reqs) => { - if reqs.len() != 1 { - panic!("Should have 1 PostPeerDAS request"); - } - reqs.first().expect("no PostPeerDAS requests").1 - } - } - } -} - -struct Config { - peers: PeersConfig, -} - -type BlocksByRootRequestData = (BlocksByRootRequestId, PeerId, BlocksByRootRequest); - -type DataColumnsByRootRequestData = (DataColumnsByRootRequestId, PeerId, DataColumnsByRootRequest); - -/// Sync tests are usually written in the form: -/// - Do some action -/// - Expect a request to be sent -/// - Complete the above request -/// -/// To make writting tests succint, the 
machinery in this testing rig automatically identifies -/// _which_ request to complete. Picking the right request is critical for tests to pass, so this -/// filter allows better expressivity on the criteria to identify the right request. -#[derive(Default, Debug, Clone, Copy)] -pub struct RequestFilter { - peer: Option, - epoch: Option, - block_root: Option, - column_index: Option, - header_requests_only: bool, -} - -pub const NO_FILTER: RequestFilter = RequestFilter { - peer: None, - epoch: None, - block_root: None, - column_index: None, - header_requests_only: false, -}; - -impl RequestFilter { - pub fn peer(mut self, peer: PeerId) -> Self { - self.peer = Some(peer); - self - } - - pub fn epoch(mut self, epoch: u64) -> Self { - self.epoch = Some(epoch); - self - } - - pub fn block_root(mut self, block_root: Hash256) -> Self { - self.block_root = Some(block_root); - self - } - - pub fn column_index(mut self, index: u64) -> Self { - self.column_index = Some(index); - self - } - - pub fn header_requests_only(mut self) -> Self { - self.header_requests_only = true; - self - } - - fn blocks_by_root_requests( - &self, - ev: &NetworkMessage, - ) -> Option { - match ev { - NetworkMessage::SendRequest { - peer_id, - request: RequestType::BlocksByRoot(req), - app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRoot(id)), - } if self.matches_blocks_by_root(peer_id, req, id) => { - Some((*id, *peer_id, req.clone())) - } - _ => None, - } - } - - fn data_columns_by_root_requests( - &self, - ev: &NetworkMessage, - ) -> Option { - match ev { - NetworkMessage::SendRequest { - peer_id, - request: RequestType::DataColumnsByRoot(req), - app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRoot(id)), - } if self.matches_data_columns_by_root(peer_id, req) => { - Some((*id, *peer_id, req.clone())) - } - _ => None, - } - } - - fn matches_blocks_by_root( - &self, - peer: &PeerId, - req: &BlocksByRootRequest, - id: &BlocksByRootRequestId, - ) -> bool { - if 
self.header_requests_only { - if !matches!(id.parent_request_id, BlocksByRootRequester::Header(_)) { - return false; - } - } - - if let Some(block_root) = self.block_root { - if !req.block_roots().iter().any(|b| *b == block_root) { - return false; - } - } - - self.matches_peer(peer) - } - - fn matches_data_columns_by_root(&self, peer: &PeerId, req: &DataColumnsByRootRequest) -> bool { - if self.header_requests_only { - return false; - } - - if let Some(index) = self.column_index { - if !req - .data_column_ids - .iter() - .any(|id| id.columns.iter().any(|i| *i == index)) - { - return false; - } - } - self.matches_peer(peer) - } - - fn matches_common(&self, peer: &PeerId, start_slot: u64) -> bool { - if let Some(expected_epoch) = self.epoch { - let epoch = Slot::new(start_slot).epoch(E::slots_per_epoch()).as_u64(); - if epoch != expected_epoch { - return false; - } - } - self.matches_peer(peer) - } - - fn matches_peer(&self, peer: &PeerId) -> bool { - if let Some(expected_peer) = self.peer { - if *peer != expected_peer { - return false; - } - } - true - } -} - -pub fn filter() -> RequestFilter { - RequestFilter::default() -} - -/// Instruct the testing rig how to complete requests for _by_range requests -pub struct CompleteConfig { - block_count: usize, - with_data: bool, - custody_failure_at_index: Option, - rpc_error: Option, - empty_sampling_response_once: bool, - stop_at_block: Option, - return_wrong_blocks: bool, - return_no_blocks_n_times: usize, - process_error: bool, -} - -impl CompleteConfig { - pub fn custody_failure_at_index(mut self, index: u64) -> Self { - self.custody_failure_at_index = Some(index); - self - } - - pub fn rpc_error(mut self, error: RPCError) -> Self { - self.rpc_error = Some(error); - self - } - - pub fn rpc_error_response(mut self, error: RpcErrorResponse) -> Self { - self.rpc_error(RPCError::ErrorResponse(error, "".to_owned())) - } - - pub fn empty_sampling_response_once(mut self) -> Self { - self.empty_sampling_response_once = true; - 
self - } - - pub fn stop_at_block(mut self, block: Hash256) -> Self { - self.stop_at_block = Some(block); - self - } - - pub fn return_wrong_blocks(mut self) -> Self { - self.return_wrong_blocks = true; - self - } - - pub fn return_no_blocks(mut self) -> Self { - self.return_no_blocks_n_times(usize::MAX) - } - - pub fn return_no_blocks_n_times(mut self, n_times: usize) -> Self { - self.return_no_blocks_n_times = n_times; - self - } -} - -pub fn complete() -> CompleteConfig { - CompleteConfig { - block_count: 1, - with_data: true, - custody_failure_at_index: None, - rpc_error: None, - empty_sampling_response_once: false, - stop_at_block: None, - return_wrong_blocks: false, - return_no_blocks_n_times: 0, - process_error: false, - } -} - -impl TestRig { - fn our_custody_indices(&self) -> Vec { - self.network_globals - .sampling_columns() - .iter() - .copied() - .collect() - } - - /// Produce a head peer with an advanced head - fn add_head_peer(&mut self) -> PeerId { - self.add_head_peer_with_root(Hash256::random()) - } - - /// Produce a head peer with an advanced head - fn add_head_peer_with_root(&mut self, head_root: Hash256) -> PeerId { - let local_info = self.local_info(); - self.add_connected_sync_random_peer(SyncInfo { - head_root, - head_slot: local_info.head_slot + 1 + Slot::new(SLOT_IMPORT_TOLERANCE as u64), - ..local_info - }) - } - - // Produce a finalized peer with an advanced finalized epoch - fn add_finalized_peer(&mut self) -> PeerId { - self.add_finalized_peer_with_root(Hash256::random()) - } - - // Produce a finalized peer with an advanced finalized epoch - fn add_finalized_peer_with_root(&mut self, finalized_root: Hash256) -> PeerId { - let local_info = self.local_info(); - let finalized_epoch = local_info.finalized_epoch + 2; - self.add_connected_sync_random_peer(SyncInfo { - finalized_epoch, - finalized_root, - head_slot: finalized_epoch.start_slot(E::slots_per_epoch()), - head_root: Hash256::random(), - }) - } - - fn 
finalized_remote_info_advanced_by(&self, advanced_epochs: Epoch) -> SyncInfo { - let local_info = self.local_info(); - let finalized_epoch = local_info.finalized_epoch + advanced_epochs; - SyncInfo { - finalized_epoch, - finalized_root: Hash256::random(), - head_slot: finalized_epoch.start_slot(E::slots_per_epoch()), - head_root: Hash256::random(), - } - } - - fn local_info(&self) -> SyncInfo { - let StatusMessage { - fork_digest: _, - finalized_root, - finalized_epoch, - head_root, - head_slot, - } = self.harness.chain.status_message(); - SyncInfo { - head_slot, - head_root, - finalized_epoch, - finalized_root, - } - } - - fn add_connected_sync_peer_not_supernode(&mut self, remote_info: SyncInfo) -> PeerId { - self.add_sync_peer(false, remote_info) - } - - fn add_connected_sync_random_peer(&mut self, remote_info: SyncInfo) -> PeerId { - // Create valid peer known to network globals - // TODO(fulu): Using supernode peers to ensure we have peer across all column - // subnets for syncing. Should add tests connecting to full node peers. 
- self.add_sync_peer(true, remote_info) - } - - fn get_sync_state(&mut self) -> SyncState { - self.sync_manager.network().network_globals().sync_state() - } - - fn assert_sync_state(&mut self, expected_state: SyncState) { - let current_state = self.sync_manager.network().network_globals().sync_state(); - assert_eq!(current_state, expected_state); - } - - #[track_caller] - fn expect_chain_segments(&mut self, count: usize) { - for i in 0..count { - self.pop_received_processor_event(|ev| { - (ev.work_type() == beacon_processor::WorkType::ChainSegment).then_some(()) - }) - .unwrap_or_else(|e| panic!("Expect ChainSegment work event count {i}: {e:?}")); - } - } - - fn expect_active_block_components_requests_on_custody_step(&mut self) { - let requests = self - .sync_manager - .network() - .active_block_components_requests(); - if requests.is_empty() { - panic!("No active block components requests"); - } - for (id, step) in requests { - if !matches!(step, BlockComponentsByRootRequestStep::CustodyRequest) { - panic!("block components request {id} is not on CustodyRequest step: {step:?}"); - } - } - } - - fn expect_no_active_block_components_requests(&mut self) { - let requests = self - .sync_manager - .network() - .active_block_components_requests(); - if !requests.is_empty() { - panic!("Still active block components requests {requests:?}"); - } - } - - fn expect_no_active_rpc_requests(&mut self) { - let requests = self - .sync_manager - .network() - .active_requests() - .collect::>(); - if !requests.is_empty() { - panic!("There are still active RPC requests {requests:?}"); - } - } - - fn update_execution_engine_state(&mut self, state: EngineState) { - self.log(&format!("execution engine state updated: {state:?}")); - self.sync_manager.update_execution_engine_state(state); - } - - fn zero_block_at_slot(&mut self, slot: Slot, with_data: bool) -> SignedBeaconBlock { - let mut block = BeaconBlock::empty(&self.spec); - if with_data { - if let Ok(blob_kzg_commitments) = 
block.body_mut().blob_kzg_commitments_mut() { - blob_kzg_commitments - .push(KzgCommitment([0; 48])) - .expect("pushed to empty kzg commitments"); - } - } - *block.slot_mut() = slot; - SignedBeaconBlock::from_block(block, Signature::empty()) - } - - pub async fn create_unimported_parent_chain(&mut self, block_count: usize) -> (Hash256, Slot) { - self.log(&format!( - "Creating unimported chain of {block_count} blocks" - )); - - let mut r = TestRig::test_setup(); - - r.harness.advance_slot(); - let head_root = r - .harness - .extend_chain( - block_count, - BlockStrategy::OnCanonicalHead, - AttestationStrategy::AllValidators, - ) - .await; - - let store = &r.harness.chain.store; - let head_block = store.get_full_block(&head_root).unwrap().unwrap(); - - let mut target_block_root = head_root; - while let Some(block) = store.get_full_block(&target_block_root).unwrap() { - self.log(&format!( - "Adding block {target_block_root:?} slot {} to known blocks", - block.slot() - )); - let parent_root = block.parent_root(); - self.blocks_by_root.insert(target_block_root, block.into()); - if parent_root == Hash256::ZERO { - break; - } - target_block_root = parent_root; - } - - (head_root, head_block.slot()) - } - - fn create_not_rooted_parent_chain(&mut self) -> (Hash256, Slot) { - let current_head = self.harness.chain.head(); - let mut parent_root = current_head.head_block_root(); - let mut slot = current_head.head_slot(); - for _ in 0..64 { - let mut block = self.zero_block_at_slot(slot, true); - *block.message_mut().parent_root_mut() = parent_root; - *block.message_mut().slot_mut() = slot; - let block_root = block.canonical_root(); - self.blocks_by_root.insert(block_root, block.into()); - - parent_root = block_root; - slot = slot + Slot::new(1); - } - (parent_root, slot) - } - - fn send_rpc_error(&mut self, id: SyncRequestId, peer_id: PeerId, error: RPCError) { - self.log(&format!( - "Completing request {id:?} to {peer_id} with RPCError {error:?}" - )); - 
self.send_sync_message(SyncMessage::RpcError { - sync_request_id: id, - peer_id, - error, - }); - } - - fn send_blocks_by_root_response( - &mut self, - req_id: BlocksByRootRequestId, - peer_id: PeerId, - blocks: &[Arc>], - ) { - let slots = blocks.iter().map(|block| block.slot()).collect::>(); - self.log(&format!( - "Completing BlocksByRoot request {req_id} to {peer_id} with blocks {slots:?}" - )); - - for block in blocks { - self.send_sync_message(SyncMessage::RpcBlock { - sync_request_id: SyncRequestId::BlocksByRoot(req_id), - peer_id, - beacon_block: Some(block.clone()), - seen_timestamp: D, - }); - } - self.send_sync_message(SyncMessage::RpcBlock { - sync_request_id: SyncRequestId::BlocksByRoot(req_id), - peer_id, - beacon_block: None, - seen_timestamp: D, - }); - } - - fn send_data_columns_by_root_response( - &mut self, - id: DataColumnsByRootRequestId, - peer_id: PeerId, - data_columns: &[Arc>], - ) { - let mut ids = data_columns - .iter() - .map(|d| (d.slot().as_u64(), d.index)) - .collect::>(); - ids.sort_unstable(); - self.log(&format!( - "Completing DataColumnsByRoot request {id} to {peer_id} with data_columns {ids:?}" - )); - - for data_column in data_columns { - self.send_sync_message(SyncMessage::RpcDataColumn { - sync_request_id: SyncRequestId::DataColumnsByRoot(id), - peer_id, - data_column: Some(data_column.clone()), - seen_timestamp: D, - }); - } - self.send_sync_message(SyncMessage::RpcDataColumn { - sync_request_id: SyncRequestId::DataColumnsByRoot(id), - peer_id, - data_column: None, - seen_timestamp: D, - }); - } - - fn complete_blocks_by_root_request( - &mut self, - request: BlocksByRootRequestData, - config: &mut CompleteConfig, - ) { - let (req_id, peer, req) = request; - if let Some(error) = &config.rpc_error { - self.send_rpc_error(SyncRequestId::BlocksByRoot(req_id), peer, error.clone()); - return; - } - - if config.return_no_blocks_n_times > 0 { - config.return_no_blocks_n_times -= 1; - self.send_blocks_by_root_response(req_id, peer, 
&[]); - return; - } - - let blocks = req - .block_roots() - .iter() - .map(|block_root| { - if config.return_wrong_blocks { - Arc::new(self.rand_block()) - } else { - self.blocks_by_root - .get(block_root) - .expect("Test consumer requested unknown block") - .clone() - } - }) - .collect::>(); - - self.send_blocks_by_root_response(req_id, peer, &blocks); - } - - fn complete_data_columns_by_root_request_range_sync( - &mut self, - (id, peer_id, req): DataColumnsByRootRequestData, - complete_config: &CompleteConfig, - ) { - // To reply with a valid DataColumnsByRange we need to construct - // DataColumnsByRange for the block root that we requested the block peer, plus - // figure out which exact columns we requested this peer - let mut triggered_custody_failure = false; - - let data_columns = req - .data_column_ids - .iter() - .flat_map(|column_id| { - let block = self - .blocks_by_root - .get(&column_id.block_root) - .expect("Test consumer requested unknown block") - .clone(); - - let kzg_commitments_inclusion_proof = block - .message() - .body() - .kzg_commitments_merkle_proof() - .unwrap(); - let kzg_commitments = block - .message() - .body() - .blob_kzg_commitments() - .unwrap() - .clone(); - let signed_block_header = block.signed_block_header(); - - column_id.columns.iter().filter_map(move |index| { - // Skip column generation if index is marked as failure - if complete_config.custody_failure_at_index == Some(*index) { - triggered_custody_failure = true; - return None; - } - - // We need to produce a DataColumn with valid inclusion proof, but can - // be with random KZG proof and data as we won't send it for processing - Some(Arc::new(DataColumnSidecar { - index: *index, - column: VariableList::empty(), - kzg_commitments: kzg_commitments.clone(), - kzg_proofs: VariableList::from(vec![]), - signed_block_header: signed_block_header.clone(), - kzg_commitments_inclusion_proof: kzg_commitments_inclusion_proof.clone(), - })) - }) - }) - .collect::>(); - - // Need to log 
here because I can't capture &mut self inside the columns iter - if let Some(target_index) = complete_config.custody_failure_at_index { - if req - .data_column_ids - .iter() - .any(|id| id.columns.iter().any(|index| *index == target_index)) - { - self.log(&format!( - "Forced custody failure at request {id} for peer {peer_id} index {target_index:?}" - )); - } - } - - self.send_data_columns_by_root_response(id, peer_id, &data_columns); - } - - fn complete_block_processing(&mut self, ids: Vec, config: &CompleteConfig) { - if config.process_error { - for id in &ids { - self.send_sync_message(SyncMessage::BatchProcessed { - sync_type: ChainSegmentProcessId::ForwardSync(*id), - result: BatchProcessResult::Failure { - peer_action: None, - error: "test error".to_owned(), - }, - }); - } - } - - // Sort ids first as we need to process blocks in order of ancestors. This only works if the - // test does not send blocks of two parallel chains at once. - let mut blocks = ids - .into_iter() - .map(|id| { - let block = self - .blocks_by_root - .get(&id.block_root) - .cloned() - .expect("unknown block"); - (id, block) - }) - .collect::>(); - blocks.sort_by_key(|(_, block)| block.slot()); - - for (id, block) in blocks { - self.log(&format!( - "Completing block processing {id} slot {}", - block.slot() - )); - - { - let mut head_state = self.harness.chain.head().snapshot.beacon_state.clone(); - *head_state.slot_mut() = block.slot(); - - let mut fork_choice = self.harness.chain.canonical_head.fork_choice_write_lock(); - fork_choice - .on_block( - block.slot(), - block.message(), - id.block_root, - Duration::from_secs(0), - &head_state, - PayloadVerificationStatus::Verified, - &self.spec, - ) - .expect("error importing block to fork-choice"); - } - - self.send_sync_message(SyncMessage::BatchProcessed { - sync_type: ChainSegmentProcessId::ForwardSync(id), - result: BatchProcessResult::Success, - }); - } - } - - pub fn progress_until_no_events( - &mut self, - request_filter: 
RequestFilter, - mut complete_config: CompleteConfig, - ) { - self.log(format!("progress until no events {request_filter:?}")); - loop { - if let Ok(request) = self - .pop_received_network_event(&mut |ev| request_filter.blocks_by_root_requests(ev)) - { - self.complete_blocks_by_root_request(request, &mut complete_config); - continue; - } - - if let Ok(request) = self.pop_received_network_event(&mut |ev| { - request_filter.data_columns_by_root_requests(ev) - }) { - self.complete_data_columns_by_root_request_range_sync(request, &complete_config); - continue; - } - - // TODO(tree-sync): find a way to get this info from the beacon processor events - let ids = self.sync_manager.forward_sync().get_processing_ids(); - if !ids.is_empty() { - self.complete_block_processing(ids, &complete_config); - continue; - } - - let sync_state = self.get_sync_state(); - self.log(&format!("Progressed sync, current state: {:?}", sync_state,)); - - return; - } - } - - async fn create_canonical_block(&mut self) -> (SignedBeaconBlock, Option>) { - self.harness.advance_slot(); - - let block_root = self - .harness - .extend_chain( - 1, - BlockStrategy::OnCanonicalHead, - AttestationStrategy::AllValidators, - ) - .await; - - let store = &self.harness.chain.store; - let block = store.get_full_block(&block_root).unwrap().unwrap(); - let fork = block.fork_name_unchecked(); - - let data_sidecars = if fork.fulu_enabled() { - store - .get_data_columns(&block_root) - .unwrap() - .map(|columns| { - columns - .into_iter() - .map(CustodyDataColumn::from_asserted_custody) - .collect() - }) - .map(DataSidecars::DataColumns) - } else if fork.deneb_enabled() { - store - .get_blobs(&block_root) - .unwrap() - .blobs() - .map(DataSidecars::Blobs) - } else { - None - }; - - (block, data_sidecars) - } - - async fn remember_block( - &mut self, - (block, data_sidecars): (SignedBeaconBlock, Option>), - ) { - // This code is kind of duplicated from Harness::process_block, but takes sidecars directly. 
- let block_root = block.canonical_root(); - self.harness.set_current_slot(block.slot()); - let _: SignedBeaconBlockHash = self - .harness - .chain - .process_block( - block_root, - build_rpc_block(block.into(), &data_sidecars, &self.spec), - NotifyExecutionLayer::Yes, - BlockImportSource::RangeSync, - || Ok(()), - ) - .await - .unwrap() - .try_into() - .unwrap(); - self.harness.chain.recompute_head_at_current_slot().await; - } -} - -fn build_rpc_block( - block: Arc>, - data_sidecars: &Option>, - spec: &ChainSpec, -) -> RpcBlock { - match data_sidecars { - Some(DataSidecars::Blobs(blobs)) => { - RpcBlock::new(None, block, Some(blobs.clone())).unwrap() - } - Some(DataSidecars::DataColumns(columns)) => { - RpcBlock::new_with_custody_columns(None, block, columns.clone(), spec).unwrap() - } - // Block has no data, expects zero columns - None => RpcBlock::new_without_blobs(None, block), - } -} - -/// To attempt to finalize the peer's status finalized checkpoint we synced to its finalized epoch + -/// 2 epochs + 1 slot. -const EXTRA_SYNCED_EPOCHS: u64 = 2 + 1; - -// Same test with different types of peers: -// - 100 peers -// - 1 supernode -// - perfectly distributed peer ids - -#[test] -fn finalized_sync_not_enough_custody_peers_on_start_supernode_only() { - finalized_sync_not_enough_custody_peers_on_start(Config { - peers: PeersConfig::SupernodeOnly, - }); -} - -#[test] -fn finalized_sync_not_enough_custody_peers_on_start_supernode_and_random() { - finalized_sync_not_enough_custody_peers_on_start(Config { - peers: PeersConfig::SupernodeAndRandom, - }); -} - -fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { - let mut r = TestRig::test_setup_as_supernode(); - // Only run post-PeerDAS - if !r.fork_name.fulu_enabled() { - return; - } - - let advanced_epochs: u64 = 2; - let remote_info = r.finalized_remote_info_advanced_by(advanced_epochs.into()); - - // Unikely that the single peer we added has enough columns for us. 
Tests are determinstic and - // this error should never be hit - r.add_connected_sync_peer_not_supernode(remote_info.clone()); - - // The SyncingChain has a single peer, so it can issue blocks_by_range requests. However, it - // doesn't have enough peers to cover all columns - r.progress_until_no_events(NO_FILTER, complete()); - r.expect_no_active_rpc_requests(); - - // Here we have a batch with partially completed block_components_by_range requests. The batch - // should not have failed, we are still syncing, and there are no downscoring events. - r.expect_no_penalty_for_anyone(); - r.expect_active_block_components_requests_on_custody_step(); - - // Generate enough peers and supernodes to cover all custody columns - r.add_sync_peers(config.peers, remote_info.clone()); - // Note: not necessary to add this peers to the chain, as we draw from the global pool - // We still need to add enough peers to trigger batch downloads with idle peers. Same issue as - // the test above. - - r.progress_until_no_events(NO_FILTER, complete()); - r.expect_no_active_rpc_requests(); - r.expect_no_active_block_components_requests(); - // TOOD(das): For now this tests don't complete sync. We can't track beacon processor Work - // events from here easily. What we pop from the beacon processor queue is an opaque closure - // wihtout any information. We don't know what batch it is for. 
-} - -#[test] -fn finalized_sync_single_custody_peer_failure() { - let mut r = TestRig::test_setup(); - // Only run post-PeerDAS - if !r.fork_name.fulu_enabled() { - return; - } - - let advanced_epochs: u64 = 2; - let remote_info = r.finalized_remote_info_advanced_by(advanced_epochs.into()); - let column_index_to_fail = r.our_custody_indices().first().copied().unwrap(); - - r.add_sync_peer(true, remote_info.clone()); - - // Progress all blocks_by_range and columns_by_range requests but respond empty for a single - // column index - r.progress_until_no_events( - NO_FILTER, - complete().custody_failure_at_index(column_index_to_fail), - ); - r.expect_penalties("custody_failure"); - - // Some peer had a custody failure, but since there's a single peer in the batch we won't issue - // another request yet. - r.expect_no_active_rpc_requests(); - // Ensure that the block components by range request have not failed - r.expect_active_block_components_requests_on_custody_step(); - - // After adding a new peer we will try to fetch from it - r.add_sync_peer(true, remote_info.clone()); - r.progress_until_no_events( - // Find the requests first to assert that this is the only request that exists - filter().column_index(column_index_to_fail), - // complete this one request without the custody failure now - complete(), - ); - - r.expect_no_active_rpc_requests(); - r.expect_no_active_block_components_requests(); -} - -#[tokio::test] -async fn tree_sync_happy_path() { - let mut r = TestRig::test_setup(); - let (head_root, head_slot) = r.create_unimported_parent_chain(8).await; - let remote_info = SyncInfo { - finalized_epoch: Epoch::new(0), - finalized_root: Hash256::ZERO, - head_slot, - head_root, - }; - r.add_sync_peer(false, remote_info.clone()); - r.progress_until_no_events(NO_FILTER, complete()); - r.add_sync_peer(true, remote_info); - r.progress_until_no_events(NO_FILTER, complete()); - r.expect_empty_network(); -} From bff65e79a5c4a2480973fab39a94281db26b3cd3 Mon Sep 17 
00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 23 Jun 2025 16:21:45 +0200 Subject: [PATCH 42/66] Add basic metrics --- beacon_node/network/src/metrics.rs | 48 +++++++++++++++++++ beacon_node/network/src/sync/forward_sync.rs | 28 +++++++++++ beacon_node/network/src/sync/manager.rs | 1 + .../src/sync/network_context/requests.rs | 27 +++++++++-- 4 files changed, 100 insertions(+), 4 deletions(-) diff --git a/beacon_node/network/src/metrics.rs b/beacon_node/network/src/metrics.rs index 05c7dc287b0..d7db27452b1 100644 --- a/beacon_node/network/src/metrics.rs +++ b/beacon_node/network/src/metrics.rs @@ -494,6 +494,54 @@ pub static SYNC_UNKNOWN_NETWORK_REQUESTS: LazyLock> = Lazy &["type"], ) }); +pub static SYNC_RPC_REQUEST_SUCCESSES: LazyLock> = LazyLock::new(|| { + try_create_int_counter_vec( + "sync_rpc_requests_success_total", + "Total count of sync RPC requests successes", + &["protocol"], + ) +}); +pub static SYNC_RPC_REQUEST_ERRORS: LazyLock> = LazyLock::new(|| { + try_create_int_counter_vec( + "sync_rpc_requests_error_total", + "Total count of sync RPC requests errors", + &["protocol"], + ) +}); +pub static SYNC_RPC_REQUEST_TIME: LazyLock> = LazyLock::new(|| { + try_create_histogram_vec_with_buckets( + "sync_rpc_request_duration_sec", + "Time to complete a successful sync RPC requesst", + Ok(vec![ + 0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 1.0, 2.0, + ]), + &["protocol"], + ) +}); +pub static SYNC_HEADERS_DOWNLOADED: LazyLock> = LazyLock::new(|| { + try_create_int_counter( + "sync_headers_downloaded_total", + "Total count of forward sync headers downloaded", + ) +}); +pub static SYNC_BLOCKS_PROCESSED: LazyLock> = LazyLock::new(|| { + try_create_int_counter( + "sync_blocks_processed_total", + "Total count of forward sync blocks processed", + ) +}); +pub static SYNC_HEADER_MIN_SLOT: LazyLock> = LazyLock::new(|| { + try_create_int_gauge( + "sync_header_min_slot", + "Current min slot of foward sync headers", + ) 
+}); +pub static SYNC_HEADER_MAX_SLOT: LazyLock> = LazyLock::new(|| { + try_create_int_gauge( + "sync_header_max_slot", + "Current max slot of foward sync headers", + ) +}); /* * Block Delay Metrics diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index 429655e0fc4..502df5ffa47 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -2,11 +2,13 @@ use super::network_context::{ DownloadRequest, DownloadRequestError, RpcRequestSendError, RpcResponseError, SyncNetworkContext, }; +use crate::metrics; use crate::sync::network_context::{BatchPeers, RpcResponseResult}; use crate::sync::sync_block::{Error as SyncBlockError, SyncBlock, SyncBlockResult}; use crate::sync::BatchProcessResult; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; +use itertools::Itertools; use lighthouse_network::service::api_types::{ BlocksByRootRequestId, BlocksByRootRequester, ComponentsByRootRequestId, HeaderLookupId, Id, RangeRequestId, @@ -393,6 +395,8 @@ impl ForwardSync { received, )?; + metrics::inc_counter(&metrics::SYNC_HEADERS_DOWNLOADED); + // Once we discover the parent_root of this block three things can happen // 1. The parent root is a known block -> stop // 2. We conflicts with finality -> reject @@ -509,6 +513,7 @@ impl ForwardSync { ) { match result { Ok(SyncBlockResult::Done { .. }) => { + metrics::inc_counter(&metrics::SYNC_BLOCKS_PROCESSED); self.blocks.remove(&block_root); self.trigger_forward_sync(cx); } @@ -694,4 +699,27 @@ impl ForwardSync { } } } + + pub fn register_metrics(&self) { + if let Some((min_slot, max_slot)) = self + .blocks + .values() + .filter_map(|lookup| { + if let Status::BackfillHeader { request, .. 
} = &lookup.status { + request.is_complete().map(|header| header.slot) + } else { + None + } + }) + .minmax() + .into_option() + { + metrics::set_gauge(&metrics::SYNC_HEADER_MIN_SLOT, min_slot.as_u64() as i64); + metrics::set_gauge(&metrics::SYNC_HEADER_MAX_SLOT, max_slot.as_u64() as i64); + } + + // Min header + // Highest known header + // Current head + } } diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 1274c8fc721..d5e837a4204 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -621,6 +621,7 @@ impl SyncManager { } _ = register_metrics_interval.tick() => { self.network.register_metrics(); + self.forward_sync.register_metrics(); } } } diff --git a/beacon_node/network/src/sync/network_context/requests.rs b/beacon_node/network/src/sync/network_context/requests.rs index d648978cc26..deba1ada54d 100644 --- a/beacon_node/network/src/sync/network_context/requests.rs +++ b/beacon_node/network/src/sync/network_context/requests.rs @@ -1,3 +1,4 @@ +use std::time::Instant; use std::{collections::hash_map::Entry, hash::Hash}; use beacon_chain::validator_monitor::timestamp_now; @@ -48,6 +49,7 @@ struct ActiveRequest { peer_id: PeerId, // Error if the request terminates before receiving max expected responses expect_max_responses: bool, + start_instant: Instant, } enum State { @@ -71,6 +73,7 @@ impl ActiveRequests { state: State::Active(items), peer_id, expect_max_responses, + start_instant: Instant::now(), }, ); } @@ -94,7 +97,7 @@ impl ActiveRequests { return None; }; - match rpc_event { + let result = match rpc_event { // Handler of a success ReqResp chunk. Adds the item to the request accumulator. // `ActiveRequestItems` validates the item before appending to its internal state. 
RpcEvent::Response(item, seen_timestamp) => { @@ -107,7 +110,7 @@ impl ActiveRequests { Ok(true) => { let items = items.consume(); request.state = State::CompletedEarly; - Some(Ok((items, seen_timestamp))) + Some(Ok((items, seen_timestamp, request.start_instant.elapsed()))) } // Received item, but we are still expecting more Ok(false) => None, @@ -143,7 +146,11 @@ impl ActiveRequests { } .into())) } else { - Some(Ok((items.consume(), timestamp_now()))) + Some(Ok(( + items.consume(), + timestamp_now(), + request.start_instant.elapsed(), + ))) } } // Items already returned, ignore stream termination @@ -166,7 +173,19 @@ impl ActiveRequests { State::Errored => None, } } - } + }; + + result.map(|result| match result { + Ok((items, seen_timestamp, duration)) => { + metrics::inc_counter_vec(&metrics::SYNC_RPC_REQUEST_SUCCESSES, &[self.name]); + metrics::observe_timer_vec(&metrics::SYNC_RPC_REQUEST_TIME, &[self.name], duration); + Ok((items, seen_timestamp)) + } + Err(e) => { + metrics::inc_counter_vec(&metrics::SYNC_RPC_REQUEST_ERRORS, &[self.name]); + Err(e) + } + }) } pub fn active_requests(&self) -> impl Iterator { From 7c903b94e42d75cd36e596ea7048f7fd110fa3d0 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 23 Jun 2025 20:21:47 +0200 Subject: [PATCH 43/66] Fix logs and metrics --- beacon_node/network/src/metrics.rs | 12 ++++ beacon_node/network/src/sync/forward_sync.rs | 68 +++++++++++++++----- beacon_node/network/src/sync/sync_block.rs | 2 +- 3 files changed, 64 insertions(+), 18 deletions(-) diff --git a/beacon_node/network/src/metrics.rs b/beacon_node/network/src/metrics.rs index d7db27452b1..34f58435a36 100644 --- a/beacon_node/network/src/metrics.rs +++ b/beacon_node/network/src/metrics.rs @@ -530,6 +530,18 @@ pub static SYNC_BLOCKS_PROCESSED: LazyLock> = LazyLock::new(| "Total count of forward sync blocks processed", ) }); +pub static SYNC_LOOKUPS_CREATED: LazyLock> = LazyLock::new(|| { + try_create_int_counter( + 
"sync_lookups_created_total", + "Total count of forward sync lookups created", + ) +}); +pub static SYNC_LOOKUPS_DROPPED: LazyLock> = LazyLock::new(|| { + try_create_int_counter( + "sync_lookups_dropped_total", + "Total count of forward sync lookups dropped", + ) +}); pub static SYNC_HEADER_MIN_SLOT: LazyLock> = LazyLock::new(|| { try_create_int_gauge( "sync_header_min_slot", diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index 502df5ffa47..ff049b118f8 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -37,6 +37,7 @@ enum Status { // TODO(tree-sync): Make the "waiting" completed header requests as memory cheap as possible BackfillHeader { peers: HashSet, + failed_peers: HashSet, request: DownloadRequest, }, ForwardSyncBlock { @@ -54,6 +55,7 @@ impl ForwardSyncBlock { id: HeaderLookupId { id, block_root }, status: Status::BackfillHeader { peers: HashSet::from_iter(peers.iter().copied()), + failed_peers: <_>::default(), request: DownloadRequest::new(), }, } @@ -143,7 +145,7 @@ impl ForwardSyncBlock { fn to_foward_sync_block(&mut self, block_root: Hash256) -> Result<(), Error> { let (peers, request) = match &mut self.status { - Status::BackfillHeader { peers, request } => (peers, request), + Status::BackfillHeader { peers, request, .. } => (peers, request), _ => { return Err(Error::InternalError( "Expected lookup to be in DownloadingHeader state".to_owned(), @@ -179,27 +181,45 @@ impl ForwardSyncBlock { block_root: Hash256, cx: &mut SyncNetworkContext, ) -> Result<(), Error> { - let peers = match &self.status { - Status::BackfillHeader { peers, .. } => peers, - Status::ForwardSyncBlock { request, .. } => { + let (peers, failed_peers, request) = match &mut self.status { + Status::BackfillHeader { + peers, + failed_peers, + request, + } => (peers, failed_peers, request), + Status::ForwardSyncBlock { .. 
} => { return Err(Error::InternalError( "Lookup not in forward sync block status".to_owned(), )) } }; - // TODO(tree-sync): have good peer selection - let Some(peer) = peers.iter().next() else { + let Some(peer) = peers + .iter() + .map(|peer| { + ( + // If contains -> 1 (order after), not contains -> 0 (order first) + failed_peers.contains(peer), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::random::(), + peer, + ) + }) + .min() + .map(|(_, _, peer)| *peer) + else { + // When a peer disconnects and is removed from the SyncingChain peer set, if the set + // reaches zero the lookup is removed return Err(Error::InternalError("No peers".to_owned())); }; let req_id = cx.send_blocks_by_root_request( - *peer, + peer, block_root, BlocksByRootRequester::Header(self.id), )?; - self.header_request()?.on_download_start(req_id)?; + request.on_download_start(req_id)?; Ok(()) } } @@ -313,6 +333,7 @@ impl ForwardSync { cx: &mut SyncNetworkContext, ) { if self.blocks.contains_key(&block_root) { + let mut counts = HashMap::<&PeerId, usize>::new(); // Add peer to `block`'s entry and all its ancestors let mut target_block_root = block_root; while let Some(lookup) = self.blocks.get_mut(&target_block_root) { @@ -320,7 +341,7 @@ impl ForwardSync { // TODO(tree-sync): If peer already in set no need to add to its ancestors if lookup.add_peer(*peer) { // TODO(tree-sync): This log can be very noisy maybe log once per peer - debug!(block_root = ?target_block_root, ?peer, "Adding peer to existing header lookup"); + *counts.entry(peer).or_default() += 1; } else { // Peer already part of this lookup, therefore it must be part of the peer // set of all of its ancestors: stop @@ -333,18 +354,30 @@ impl ForwardSync { break; } } + for (peer, count) in counts { + debug!(block_root = ?target_block_root, %peer, count, "Adding peer to existing header lookup and ancestors"); + } } else { if self.blocks.len() > MAX_LOOKUP_COUNT { self.prune_least_popular_lookups(); } let id = 
cx.next_id(); - debug!(?block_root, id, ?peers, "Creating new header lookup"); + match peers { + [peer] => debug!(?block_root, id, %peer, "Creating new header lookup"), + _ => debug!( + ?block_root, + id, + peers = peers.len(), + "Creating new header lookup" + ), + } let mut lookup = ForwardSyncBlock::new(block_root, id, peers); match lookup.send_block_header_request(block_root, cx) { Ok(_) => { self.blocks.insert(block_root, lookup); + metrics::inc_counter(&metrics::SYNC_LOOKUPS_CREATED); } Err(e) => { warn!(id = ?lookup.id, error = ?e, "Error sending initial lookup request"); @@ -644,18 +677,19 @@ impl ForwardSync { /// Drop lookup `block_root` if it exists and all its children fn drop_lookup_and_children( &mut self, - block_root: Hash256, + initial_block_root: Hash256, block_to_children: &HashMap>, ) { - // Change to `Vec::new()` if you want depth-first order. - let mut queue: VecDeque = VecDeque::from([block_root]); + let mut queue: VecDeque = VecDeque::from([initial_block_root]); - while let Some(node) = queue.pop_front() { + while let Some(block_root) = queue.pop_front() { // Remove the node itself. - if self.blocks.remove(&node).is_some() { + if let Some(block) = self.blocks.remove(&block_root) { + debug!(?block_root, id = %block.id, "Dropping forward sync block lookup"); + metrics::inc_counter(&metrics::SYNC_LOOKUPS_DROPPED); // Only remove children if the node still existed // Push its children—if any—onto the work list. - if let Some(children) = block_to_children.get(&node) { + if let Some(children) = block_to_children.get(&block_root) { queue.extend(children.iter().cloned()); } } @@ -683,7 +717,7 @@ impl ForwardSync { .iter() .filter_map(|(block_root, block)| match &block.status { // Prune only lookups that are not syncing and we know the header - Status::BackfillHeader { peers, request } => request + Status::BackfillHeader { peers, request, .. 
} => request .is_complete() .map(|header| (block.peer_count(), header.slot, *block_root)), Status::ForwardSyncBlock { .. } => None, diff --git a/beacon_node/network/src/sync/sync_block.rs b/beacon_node/network/src/sync/sync_block.rs index 639e059eb98..7d8d76dab85 100644 --- a/beacon_node/network/src/sync/sync_block.rs +++ b/beacon_node/network/src/sync/sync_block.rs @@ -136,7 +136,7 @@ impl SyncBlock { }) } BatchProcessResult::Failure { peer_action, error } => { - debug!(id = %self.id, "Sync block process error"); + debug!(id = %self.id, error, "Sync block process error"); if let Some(peer_action) = peer_action { for (peer, penalty) in peers.blame(peer_action) { From 3c301976429218801dc0a8c7fc71593dc9604c31 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 24 Jun 2025 11:04:39 +0200 Subject: [PATCH 44/66] Add todo --- beacon_node/network/src/sync/forward_sync.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index ff049b118f8..cd80937d901 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -442,6 +442,12 @@ impl ForwardSync { let finalized_checkpoint = self.chain.head().finalized_checkpoint(); let parent_known = self.blocks.contains_key(&parent_root); + // TODO(tree-sync): check that the slots are decreasing, so we don't end up in + // an infinite loop. But note that the wrong block will be the descendant. 
+ // - We get header A with parent B and slot 10 + // - We get header B with parent C and slot 11 + // - That makes header A invalid + if block_header.slot <= finalized_checkpoint .epoch From c567a950473a28eec09f45052d72ffe32a162852 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 9 Jul 2025 11:33:53 +0200 Subject: [PATCH 45/66] Introduce chains to group blocks with same peer set --- .../overflow_lru_cache.rs | 9 +- .../src/rpc/self_limiter.rs | 5 +- .../src/service/api_types.rs | 5 +- beacon_node/network/src/metrics.rs | 34 +- .../network_beacon_processor/sync_methods.rs | 4 +- .../network/src/sync/backfill_sync/mod.rs | 11 +- beacon_node/network/src/sync/forward_sync.rs | 1048 +++++++++++------ beacon_node/network/src/sync/manager.rs | 13 +- .../network/src/sync/network_context.rs | 2 + beacon_node/network/src/sync/sync_block.rs | 33 +- beacon_node/network/src/sync/tests/lookups.rs | 33 +- beacon_node/network/src/sync/tests/range.rs | 12 +- 12 files changed, 795 insertions(+), 414 deletions(-) diff --git a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs index 36c4f2cdc1e..67f2f155a74 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs @@ -1183,8 +1183,13 @@ mod pending_components_tests { pub fn pre_setup() -> Setup { let mut rng = StdRng::seed_from_u64(0xDEADBEEF0BAD5EEDu64); let spec = test_spec::(); - let (block, blobs_vec) = - generate_rand_block_and_blobs::(ForkName::Deneb, NumBlobs::Random, &mut rng, &spec); + let (block, blobs_vec) = generate_rand_block_and_blobs::( + ForkName::Deneb, + NumBlobs::Random, + None, + &mut rng, + &spec, + ); let max_len = spec.max_blobs_per_block(block.epoch()) as usize; let mut blobs: RuntimeFixedVector>>> = RuntimeFixedVector::default(max_len); diff --git 
a/beacon_node/lighthouse_network/src/rpc/self_limiter.rs b/beacon_node/lighthouse_network/src/rpc/self_limiter.rs index 30c26e2c7af..f28ae8d5e09 100644 --- a/beacon_node/lighthouse_network/src/rpc/self_limiter.rs +++ b/beacon_node/lighthouse_network/src/rpc/self_limiter.rs @@ -323,7 +323,10 @@ mod tests { use types::{EthSpec, ForkContext, Hash256, MainnetEthSpec, Slot}; fn get_parent_request_id() -> BlocksByRootRequester { - BlocksByRootRequester::Header(HeaderLookupId(Hash256::ZERO, 0)) + BlocksByRootRequester::Header(HeaderLookupId { + id: 0, + block_root: Hash256::ZERO, + }) } /// Test that `next_peer_request_ready` correctly maintains the queue. diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index 7754e38d458..251f899da36 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -275,7 +275,10 @@ mod tests { parent_request_id: DataColumnsByRootRequester::Custody(CustodyByRootRequestId { parent_request_id: ComponentsByRootRequestId { id: 121, - requester: RangeRequestId::ForwardSync(HeaderLookupId(Hash256::ZERO, 1)), + requester: RangeRequestId::ForwardSync(HeaderLookupId { + id: 1, + block_root: Hash256::ZERO, + }), }, }), }; diff --git a/beacon_node/network/src/metrics.rs b/beacon_node/network/src/metrics.rs index 34f58435a36..eeb2e888dff 100644 --- a/beacon_node/network/src/metrics.rs +++ b/beacon_node/network/src/metrics.rs @@ -403,25 +403,16 @@ pub static SYNCING_CHAINS_COUNT: LazyLock> = LazyLock::new(| &["range_type"], ) }); -pub static SYNCING_CHAINS_REMOVED: LazyLock> = LazyLock::new(|| { - try_create_int_counter_vec( - "sync_range_removed_chains_total", - "Total count of range syncing chains removed per range type", - &["range_type"], - ) -}); -pub static SYNCING_CHAINS_ADDED: LazyLock> = LazyLock::new(|| { - try_create_int_counter_vec( - "sync_range_added_chains_total", - "Total count of range 
syncing chains added per range type", - &["range_type"], +pub static SYNC_CHAINS_REMOVED: LazyLock> = LazyLock::new(|| { + try_create_int_counter( + "sync_removed_chains_total", + "Total count of forward sync chains removed", ) }); -pub static SYNCING_CHAINS_DROPPED_BLOCKS: LazyLock> = LazyLock::new(|| { - try_create_int_counter_vec( - "sync_range_chains_dropped_blocks_total", - "Total count of dropped blocks when removing a syncing chain per range type", - &["range_type"], +pub static SYNC_CHAINS_ADDED: LazyLock> = LazyLock::new(|| { + try_create_int_counter( + "sync_added_chains_total", + "Total count of forward sync chains added", ) }); pub static SYNCING_CHAINS_IGNORED_BLOCKS: LazyLock> = LazyLock::new(|| { @@ -554,6 +545,15 @@ pub static SYNC_HEADER_MAX_SLOT: LazyLock> = LazyLock::new(|| { "Current max slot of foward sync headers", ) }); +pub static SYNC_HEADERS_COUNT: LazyLock> = LazyLock::new(|| { + try_create_int_gauge("sync_headers_count", "Current count of headers in memory") +}); +pub static SYNC_CHAINS_COUNT: LazyLock> = LazyLock::new(|| { + try_create_int_gauge( + "sync_chains_count", + "Current count of forward sync chains in memory", + ) +}); /* * Block Delay Metrics diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 3dd38d4ce2b..ca16fbf39fa 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -126,7 +126,9 @@ impl NetworkBeaconProcessor { .process_blocks(downloaded_blocks.iter(), notify_execution_layer) .await { - (_imported_blocks, Ok(_)) => { + (imported_blocks, Ok(_)) => { + let ignored_blocks = sent_blocks - imported_blocks; + metrics::inc_gauge(&metrics::SYNCING_CHAINS_IGNORED_BLOCKS); debug!( %id, first_block_slot = start_slot, diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 
f5c6d36f7f8..9e7e5582c07 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -220,8 +220,9 @@ impl BackFillSync { result: Result<(RpcBlock, BatchPeers), RpcResponseError>, cx: &mut SyncNetworkContext, ) { - let outcome = self.status.on_download_result(req_id, result, cx); - self.handle_outcome(outcome, cx); + if let Err(e) = self.status.on_download_result(req_id, result, cx) { + self.handle_outcome(Err(e), cx); + } } pub fn on_block_process_result( @@ -235,8 +236,10 @@ impl BackFillSync { } fn continue_syncing_blocks(&mut self, cx: &mut SyncNetworkContext) { - let outcome = self.status.continue_request(cx); - self.handle_outcome(outcome, cx); + // TODO(tree-sync): only ok to import the newest block + let ok_to_import = true; + let outcome = self.status.continue_request(cx, ok_to_import); + self.handle_outcome(outcome.map(|_| SyncBlockResult::Wait), cx); } fn handle_outcome( diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index cd80937d901..ca96c1a6223 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -16,16 +16,36 @@ use lighthouse_network::service::api_types::{ use lighthouse_network::PeerId; use std::collections::{HashMap, HashSet, VecDeque}; use std::sync::Arc; -use tracing::{debug, error, warn}; +use tracing::{debug, error}; use types::{BeaconBlockHeader, EthSpec, Hash256, SignedBeaconBlock, Slot}; const MAX_LOOKUP_COUNT: usize = 1_000_000; const PRUNE_COUNT: usize = 100_000; const BLOCK_BUFFER_SIZE: usize = 2; +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, PartialOrd, Ord)] +struct TipId(u32); + +/// Roots are added to ForwardSync via: +/// 1. Peers referencing an unknown block root +/// 2. 
When fetching the next ancestor of a chain, the parent is unknown +/// +/// Invariants: +/// - Each block references a single chain +/// - Each block root exists in exactly one `Chain::block_roots` list +/// - A block can change what chain it belongs to at any moment, including during an active request +/// +/// Goals +/// - Download multiple blocks at once to increase download speed pub struct ForwardSync { - blocks: HashMap>, - chain: Arc>, + block_to_tip: HashMap, + chains: HashMap>, +} + +/// Chain of consecutive blocks that are imported by the same set of peers +struct Chain { + peers: HashSet, + status: ChainStatus, } struct ForwardSyncBlock { @@ -33,10 +53,33 @@ struct ForwardSyncBlock { status: Status, } +enum ChainStatus { + // Recursively fetch headers until discovering a parent_root that is known, then transition + // state to `ForwardSync`. + BackfillHeaders { + /// Headers descendant of `next_block_root` that are already downloaded. + /// Sorting: tip first, oldest ancestor last + block_roots: Vec, + /// Oldest ancestor block root of this Chain. 
+ next_header_request: HeaderRequest, + }, + WaitingParentChain { + parent_root: Hash256, + /// Sorting: tip first, oldest ancestor last + block_roots: Vec, + }, + // Sync blocks from old to new buffering some blocks + ForwardSync { + /// Sorting: tip first, oldest ancestor last + block_roots: Vec, + /// Sorting: oldest ancestor first + syncing_blocks: VecDeque>, + }, +} + enum Status { // TODO(tree-sync): Make the "waiting" completed header requests as memory cheap as possible BackfillHeader { - peers: HashSet, failed_peers: HashSet, request: DownloadRequest, }, @@ -46,69 +89,288 @@ enum Status { }, } +struct HeaderRequest { + id: HeaderLookupId, + block_root: Hash256, + failed_peers: HashSet, + request: DownloadRequest, +} + +impl HeaderRequest { + fn new(block_root: Hash256, id: Id) -> Self { + Self { + id: HeaderLookupId { id, block_root }, + block_root, + failed_peers: <_>::default(), + request: DownloadRequest::new(), + } + } + + fn empty() -> Self { + Self::new(Hash256::ZERO, 0) + } + + fn continue_request( + &mut self, + peers: &HashSet, + cx: &mut SyncNetworkContext, + ) -> Result<(), Error> { + if self.request.is_awaiting_download() { + let Some(peer) = peers + .iter() + .map(|peer| { + ( + // If contains -> 1 (order after), not contains -> 0 (order first) + self.failed_peers.contains(peer), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::random::(), + peer, + ) + }) + .min() + .map(|(_, _, peer)| *peer) + else { + // When a peer disconnects and is removed from the SyncingChain peer set, if the set + // reaches zero the lookup is removed + return Err(Error::InternalError("No peers".to_owned())); + }; + + let req_id = cx.send_blocks_by_root_request( + peer, + self.block_root, + BlocksByRootRequester::Header(self.id), + )?; + + self.request.on_download_start(req_id)?; + } + Ok(()) + } +} + // TODO(tree-sync): Re-add the reprocessing cache, so we don't process twice a block that we got // through gossip and sync. 
-impl ForwardSyncBlock { - fn new(block_root: Hash256, id: Id, peers: &[PeerId]) -> Self { +impl Chain { + fn new(block_root: Hash256, id: Id, initial_peers: &[PeerId]) -> Self { Self { - id: HeaderLookupId { id, block_root }, - status: Status::BackfillHeader { - peers: HashSet::from_iter(peers.iter().copied()), - failed_peers: <_>::default(), - request: DownloadRequest::new(), + peers: HashSet::from_iter(initial_peers.iter().copied()), + status: ChainStatus::BackfillHeaders { + block_roots: vec![], + next_header_request: HeaderRequest::new(block_root, id), }, } } /// Returns whether the value was newly inserted fn add_peer(&mut self, peer: PeerId) -> bool { - match &mut self.status { - Status::BackfillHeader { peers, .. } => peers.insert(peer), - Status::ForwardSyncBlock { request, .. } => request.add_peer(peer), + self.peers.insert(peer) + } + + /// Returns whether the value was present in the set. + fn remove_peer(&mut self, peer: &PeerId) -> bool { + self.peers.remove(peer) + } + + fn get_peers(&self) -> Vec { + self.peers.iter().copied().collect() + } + + fn peer_count(&self) -> usize { + self.peers.len() + } + + fn parent_root(&self) -> Option { + match &self.status { + ChainStatus::BackfillHeaders { + next_header_request, + .. + } => Some(next_header_request.block_root), + ChainStatus::WaitingParentChain { parent_root, .. } => Some(*parent_root), + ChainStatus::ForwardSync { .. } => None, } } - fn remove_peer(&mut self, peer: &PeerId) { + fn tip(&self) -> Hash256 { + match &self.status { + ChainStatus::BackfillHeaders { + next_header_request, + block_roots, + } => block_roots + .first() + .copied() + .unwrap_or(next_header_request.block_root), + ChainStatus::WaitingParentChain { block_roots, .. 
} => block_roots + .first() + .copied() + .expect("block roots is not empty"), + ChainStatus::ForwardSync { + block_roots, + syncing_blocks, + } => block_roots.first().copied().unwrap_or_else(|| { + syncing_blocks + .back() + .map(|block| *block.block_root()) + .expect("blocks are not empty") + }), + } + } + + /// Split chain by `block_root` returning a new Self that includes `block_root` and all of its + /// ancestors, and leaves `self` with only the descendants of `block_root` excluding + /// `block_root` + fn split_by(&mut self, block_root: Hash256) -> Result { + let status = match &mut self.status { + ChainStatus::BackfillHeaders { + block_roots, + next_header_request, + } => { + // Take ownership of BackfillHeaders fields without having to add a Poisoned state + let mut block_roots = std::mem::take(block_roots); + let next_header_request = + std::mem::replace(next_header_request, HeaderRequest::empty()); + + let new_block_roots = + if let Some(idx) = block_roots.iter().position(|b| b == &block_root) { + // ..= to keep the block_root on the left + block_roots.drain(0..=idx).collect::>() + } else { + // TODO(tree-sync): check that block_root is the next_root or error + vec![] + }; + self.status = ChainStatus::WaitingParentChain { + parent_root: block_root, + block_roots, + }; + ChainStatus::BackfillHeaders { + block_roots: new_block_roots, + next_header_request, + } + } + ChainStatus::WaitingParentChain { + parent_root, + block_roots, + } => { + let idx = + block_roots + .iter() + .position(|b| b == &block_root) + .ok_or(InternalError(format!( + "block_root {block_root:?} no in chain" + )))?; + // ..= to keep the block_root on the left + let new_block_roots = block_roots.drain(0..=idx).collect::>(); + let parent_root = *parent_root; + self.status = ChainStatus::WaitingParentChain { + parent_root: block_root, + block_roots: std::mem::take(block_roots), + }; + ChainStatus::WaitingParentChain { + parent_root, + block_roots: new_block_roots, + } + } + 
ChainStatus::ForwardSync { .. } => { + todo!("How to split a chain that's already syncing?"); + } + }; + + Ok(Self { + peers: self.peers.clone(), + // What to set the status to?? + status, + }) + } + + fn to_foward_sync_block(&mut self) -> Result { + todo!(); + } + + fn on_block_imported(&mut self, block_root: &Hash256) { match &mut self.status { - Status::BackfillHeader { peers, .. } => { - peers.remove(peer); + ChainStatus::BackfillHeaders { .. } => {} + ChainStatus::WaitingParentChain { + block_roots, + parent_root, + } => { + if block_root == parent_root { + self.status = ChainStatus::ForwardSync { + block_roots: std::mem::take(block_roots), + syncing_blocks: <_>::default(), + }; + } } - Status::ForwardSyncBlock { request, .. } => { - request.remove_peer(peer); + ChainStatus::ForwardSync { .. } => {} + } + } + + fn to_forward_sync(&mut self, parent_root: Hash256) -> Result<(), InternalError> { + match &mut self.status { + ChainStatus::BackfillHeaders { + block_roots, + next_header_request, + } => { + block_roots.push(next_header_request.block_root); + self.status = ChainStatus::ForwardSync { + block_roots: std::mem::take(block_roots), + syncing_blocks: <_>::default(), + }; + Ok(()) } + _ => Err(InternalError("Not in BackfillHeaders state".to_string())), } } - fn peer_count(&self) -> usize { + fn block_count(&self) -> usize { match &self.status { - Status::BackfillHeader { peers, .. } => peers.len(), - Status::ForwardSyncBlock { request, .. } => request.peer_count(), + ChainStatus::BackfillHeaders { block_roots, .. } + | ChainStatus::WaitingParentChain { block_roots, .. } => block_roots.len(), + ChainStatus::ForwardSync { + block_roots, + syncing_blocks, + } => block_roots.len() + syncing_blocks.len(), } } - fn get_peers(&self) -> Vec { + /// Returns all block roots part of this chain + fn iter_block_roots(&self) -> Box + '_> { match &self.status { - Status::BackfillHeader { peers, .. } => peers.iter().copied().collect(), - Status::ForwardSyncBlock { request, .. 
} => { - request.clone_peers().iter().copied().collect() + ChainStatus::BackfillHeaders { + block_roots, + next_header_request, + } => { + Box::new(std::iter::once(&next_header_request.block_root).chain(block_roots.iter())) } + ChainStatus::WaitingParentChain { block_roots, .. } => Box::new(block_roots.iter()), + ChainStatus::ForwardSync { + syncing_blocks, + block_roots, + } => Box::new( + syncing_blocks + .iter() + .map(|block| block.block_root()) + .chain(block_roots.iter()), + ), } } - fn is_syncing(&self) -> bool { - match self.status { - Status::BackfillHeader { .. } => false, - Status::ForwardSyncBlock { .. } => true, - } + /// Returns true if this chain has no blocks + fn is_empty(&self) -> bool { + self.iter_block_roots().is_empty() } - fn parent_root(&self) -> Option { + fn min_slot(&self) -> Option { + todo!(); + } + + fn max_slot(&self) -> Option { + todo!(); + } + + fn syncing_blocks_count(&self) -> usize { match &self.status { - Status::BackfillHeader { request, .. } => { - request.is_complete().map(|header| header.parent_root) - } - Status::ForwardSyncBlock { header, .. } => Some(header.parent_root), + ChainStatus::BackfillHeaders { .. } => 0, + ChainStatus::WaitingParentChain { .. } => 0, + ChainStatus::ForwardSync { syncing_blocks, .. } => syncing_blocks.len(), } } @@ -116,111 +378,129 @@ impl ForwardSyncBlock { &mut self, ) -> Result<&mut DownloadRequest, Error> { match &mut self.status { - Status::BackfillHeader { request, .. } => Ok(request), + ChainStatus::BackfillHeaders { + next_header_request, + .. + } => Ok(&mut next_header_request.request), _ => Err(Error::InternalError( "Expected lookup to be in DownloadingHeader state".to_owned(), )), } } - fn block_request(&mut self) -> Result<&mut SyncBlock, Error> { + fn add_ancestor(&mut self, parent_root: Hash256, id: Id) -> Result<(), InternalError> { match &mut self.status { - Status::ForwardSyncBlock { request, .. 
} => Ok(request), - _ => Err(Error::InternalError( - "Expected lookup to be in Syncing state".to_owned(), + ChainStatus::BackfillHeaders { + block_roots, + next_header_request, + } => { + block_roots.push(next_header_request.block_root); + *next_header_request = HeaderRequest::new(parent_root, id); + Ok(()) + } + _ => Err(InternalError( + "Expected lookup to be in DownloadingHeader state".to_owned(), )), } } - fn assert_expected_lookup_id(&self, lookup_id: HeaderLookupId) -> Result<(), Error> { - if self.id == lookup_id { - Ok(()) - } else { - Err(Error::InternalError(format!( - "Unexpected lookup ID {} != {}", - self.id, lookup_id - ))) + fn to_waiting_parent(&mut self, parent_root: Hash256) -> Result<(), Error> { + match &mut self.status { + ChainStatus::BackfillHeaders { block_roots, .. } => { + self.status = ChainStatus::WaitingParentChain { + parent_root, + block_roots: std::mem::take(block_roots), + }; + Ok(()) + } + _ => Err(Error::InternalError( + "Expected lookup to be in DownloadingHeader state".to_owned(), + )), } } - fn to_foward_sync_block(&mut self, block_root: Hash256) -> Result<(), Error> { - let (peers, request) = match &mut self.status { - Status::BackfillHeader { peers, request, .. 
} => (peers, request), - _ => { - return Err(Error::InternalError( - "Expected lookup to be in DownloadingHeader state".to_owned(), - )) - } - }; - - let header = match request.is_complete() { - Some(header) => header.clone(), - None => { - return Err(Error::InternalError( - "Expected request to be complete".to_owned(), - )) - } - }; - - // We are replacing the `status` field below, so peers will never be read again - let initial_peers = std::mem::take(peers).into_iter().collect::>(); - - self.status = Status::ForwardSyncBlock { - header, - request: SyncBlock::new( - RangeRequestId::ForwardSync(self.id), - block_root, - &initial_peers, - ), - }; + fn on_download_result( + &mut self, + req_id: ComponentsByRootRequestId, + result: Result<(RpcBlock, BatchPeers), RpcResponseError>, + cx: &mut SyncNetworkContext, + ) -> Result<(), Error> { + let (ok_to_import, block) = self.block_request(req_id.requester)?; + block.on_download_result(req_id, result, cx)?; + block.continue_request(cx, ok_to_import)?; Ok(()) } - fn send_block_header_request( + fn on_process_result( &mut self, - block_root: Hash256, + id: HeaderLookupId, + result: BatchProcessResult, cx: &mut SyncNetworkContext, - ) -> Result<(), Error> { - let (peers, failed_peers, request) = match &mut self.status { - Status::BackfillHeader { - peers, - failed_peers, - request, - } => (peers, failed_peers, request), - Status::ForwardSyncBlock { .. } => { - return Err(Error::InternalError( - "Lookup not in forward sync block status".to_owned(), - )) + ) -> Result { + let (ok_to_import, block) = self.block_request(RangeRequestId::ForwardSync(id))?; + match block.on_process_result(result, cx)? { + SyncBlockResult::Done { parent_root, slot } => { + // This block is complete, remove it from chain + if !ok_to_import { + return Err(Error::InternalError(format!( + "Block {id} is not the first block" + ))); + } + if let ChainStatus::ForwardSync { syncing_blocks, .. 
} = &mut self.status { + if let Some(block) = syncing_blocks.pop_front() { + debug!("Dropping syncing block {}", block.id()); + } else { + return Err(Error::InternalError("syncing_blocks is empty".to_string())); + } + } + Ok(SyncBlockResult::Done { parent_root, slot }) } - }; + SyncBlockResult::Wait => { + // Not complete yet, continue requests + block.continue_request(cx, ok_to_import)?; + Ok(SyncBlockResult::Wait) + } + } + } - let Some(peer) = peers - .iter() - .map(|peer| { - ( - // If contains -> 1 (order after), not contains -> 0 (order first) - failed_peers.contains(peer), - // Random factor to break ties, otherwise the PeerID breaks ties - rand::random::(), - peer, - ) - }) - .min() - .map(|(_, _, peer)| *peer) - else { - // When a peer disconnects and is removed from the SyncingChain peer set, if the set - // reaches zero the lookup is removed - return Err(Error::InternalError("No peers".to_owned())); - }; + fn block_request(&mut self, id: RangeRequestId) -> Result<(bool, &mut SyncBlock), Error> { + match &mut self.status { + ChainStatus::ForwardSync { syncing_blocks, .. } => { + if let Some(index) = syncing_blocks.iter().position(|b| b.id() == id) { + let block = syncing_blocks.get_mut(index).expect("index just found"); + return Ok((index == 0, block)); + } - let req_id = cx.send_blocks_by_root_request( - peer, - block_root, - BlocksByRootRequester::Header(self.id), - )?; + let first_ids: Vec<_> = syncing_blocks.iter().take(5).map(|b| b.id()).collect(); + Err(Error::InternalError(format!( + "Unknown block for {id}, first few blocks {first_ids:?}" + ))) + } - request.on_download_start(req_id)?; - Ok(()) + _ => Err(Error::InternalError( + "Expected lookup to be in Syncing state".to_owned(), + )), + } + } + + fn continue_requests(&mut self, cx: &mut SyncNetworkContext) -> Result<(), Error> { + match &mut self.status { + ChainStatus::BackfillHeaders { + next_header_request, + .. 
+ } => Ok(next_header_request.continue_request(&self.peers, cx)?), + ChainStatus::WaitingParentChain { .. } => Ok(()), + ChainStatus::ForwardSync { + block_roots, + syncing_blocks, + } => { + for (index, block) in syncing_blocks.iter_mut().enumerate() { + let ok_to_import = index == 0; + block.continue_request(cx, ok_to_import)?; + } + Ok(()) + } + } } } @@ -231,6 +511,15 @@ pub enum Error { BlockConflictsWithFinality(String), } +#[derive(Debug)] +struct InternalError(String); + +impl From for Error { + fn from(e: InternalError) -> Self { + Self::InternalError(e.0) + } +} + impl From for Error { fn from(e: DownloadRequestError) -> Self { match e { @@ -245,7 +534,7 @@ impl From for Error { match e { RpcRequestSendError::InternalError(e) => Self::InternalError(e), // TODO(tree-sync): Should we allow lookups to have zero peers - RpcRequestSendError::NoPeers => Self::InternalError(format!("No peers")), + RpcRequestSendError::NoPeers => Self::InternalError("No peers".to_string()), } } } @@ -267,50 +556,56 @@ pub(crate) enum SyncState { impl ForwardSync { pub fn new(chain: Arc>) -> Self { Self { - blocks: <_>::default(), - chain, + block_to_tip: <_>::default(), + chains: <_>::default(), } } #[cfg(test)] - pub fn block_peers(&self, block_root: &Hash256) -> Option> { - self.blocks.get(block_root).map(|block| block.get_peers()) + pub fn block_peers(&self, block_root: &Hash256) -> Result>, String> { + let Some(chain) = self.block_to_tip.get(block_root) else { + return Ok(None); + }; + Ok(Some( + self.chains + .get(chain) + .ok_or(format!("Unknown chain {chain:?}"))? 
+ .get_peers(), + )) } #[cfg(test)] pub fn get_lookups(&self) -> Vec { - self.blocks.keys().copied().collect() + self.block_to_tip.keys().copied().collect() } pub fn block_count(&self) -> usize { - self.blocks.len() + self.block_to_tip.len() } + /// Returns the highest known slot that we are attempting to sync pub fn max_slot_to_sync(&self) -> Option { // TODO(tree-sync): weak metric, who have a better heuristic for sync? Now that lookups // count here - self.blocks - .values() - .filter_map(|block| match &block.status { - Status::BackfillHeader { request, .. } => { - request.is_complete().map(|header| header.slot) - } - Status::ForwardSyncBlock { .. } => None, - }) - .max() + todo!(); } #[cfg(test)] pub fn get_processing_ids(&mut self) -> Vec { let mut ids = vec![]; - for block in self.blocks.values_mut() { - if block - .block_request() - .ok() - .map(|request| request.is_processing()) - .unwrap_or(false) - { - ids.push(block.id); + for chain in self.chains.values() { + match &chain.status { + ChainStatus::BackfillHeaders { .. } => {} + ChainStatus::WaitingParentChain { .. } => {} + ChainStatus::ForwardSync { syncing_blocks, .. 
} => { + for block in syncing_blocks { + if block.is_processing() { + if let RangeRequestId::ForwardSync(id) = block.id() { + ids.push(id); + } + } + } + } } } ids @@ -321,8 +616,8 @@ impl ForwardSync { } pub fn remove_peer(&mut self, peer: PeerId) { - for block in self.blocks.values_mut() { - block.remove_peer(&peer); + for chain in self.chains.values_mut() { + chain.remove_peer(&peer); } } @@ -331,15 +626,40 @@ impl ForwardSync { block_root: Hash256, peers: &[PeerId], cx: &mut SyncNetworkContext, - ) { - if self.blocks.contains_key(&block_root) { + ) -> Result<(), Error> { + if let Some(initial_chain_id) = self.block_to_tip.get(&block_root) { let mut counts = HashMap::<&PeerId, usize>::new(); // Add peer to `block`'s entry and all its ancestors let mut target_block_root = block_root; - while let Some(lookup) = self.blocks.get_mut(&target_block_root) { + while let Some(chain_id) = self.block_to_tip.get_mut(&target_block_root) { + let chain = self + .chains + .get_mut(chain_id) + .ok_or(InternalError(format!("Unknown chain {chain_id}")))?; + + // If target_block_root is not the tip of chain, we have to split the chain + let chain_to_add_peers = if chain.tip() != target_block_root { + let new_chain = chain.split_by(target_block_root)?; + let new_chain_id = TipId(cx.next_id()); + + // Update all block references to the new chain + for block_root in new_chain.iter_block_roots() { + *self + .block_to_tip + .get_mut(block_root) + .ok_or(InternalError(format!("No block {block_root:?}")))? 
= + new_chain_id; + } + + self.chains.insert(new_chain_id, new_chain); + self.chains.get_mut(&new_chain_id).expect("key just added") + } else { + chain + }; + for peer in peers { // TODO(tree-sync): If peer already in set no need to add to its ancestors - if lookup.add_peer(*peer) { + if chain_to_add_peers.add_peer(*peer) { // TODO(tree-sync): This log can be very noisy maybe log once per peer *counts.entry(peer).or_default() += 1; } else { @@ -348,7 +668,7 @@ impl ForwardSync { break; } } - if let Some(parent_root) = lookup.parent_root() { + if let Some(parent_root) = chain_to_add_peers.parent_root() { target_block_root = parent_root; } else { break; @@ -358,32 +678,33 @@ impl ForwardSync { debug!(block_root = ?target_block_root, %peer, count, "Adding peer to existing header lookup and ancestors"); } } else { - if self.blocks.len() > MAX_LOOKUP_COUNT { - self.prune_least_popular_lookups(); + if self.block_to_tip.len() > MAX_LOOKUP_COUNT { + if let Err(e) = self.prune_least_popular_lookups() { + error!("Error on prune_least_popular_lookups {e:?}"); + } } let id = cx.next_id(); + let chain_id = TipId(cx.next_id()); match peers { - [peer] => debug!(?block_root, id, %peer, "Creating new header lookup"), + [peer] => debug!(?block_root, id, %chain_id, %peer, "Creating new header lookup"), _ => debug!( ?block_root, id, + %chain_id, peers = peers.len(), "Creating new header lookup" ), } - let mut lookup = ForwardSyncBlock::new(block_root, id, peers); - match lookup.send_block_header_request(block_root, cx) { - Ok(_) => { - self.blocks.insert(block_root, lookup); - metrics::inc_counter(&metrics::SYNC_LOOKUPS_CREATED); - } - Err(e) => { - warn!(id = ?lookup.id, error = ?e, "Error sending initial lookup request"); - } - } + let mut chain = Chain::new(block_root, id, peers); + chain.continue_requests(cx)?; + // Don't insert until first request is successful + metrics::inc_counter(&metrics::SYNC_CHAINS_ADDED); + self.chains.insert(chain_id, chain); + 
self.block_to_tip.insert(block_root, chain_id); } + Ok(()) } pub fn on_header_download_result( @@ -397,12 +718,14 @@ impl ForwardSync { let block_root = id.block_root; let result: Result = (|| { - let Some(lookup) = self.blocks.get_mut(&block_root) else { + let Some(chain_id) = self.block_to_tip.get(&block_root) else { // TODO(tree-sync): register metric - debug!(id = ?req_id, "Received header request for unknown lookup"); + debug!(id = ?req_id, "Received header request for unknown block_root"); return Ok(SyncBlockResult::Wait); }; - lookup.assert_expected_lookup_id(id)?; + let chain = self.chains.get_mut(chain_id).ok_or(InternalError(format!( + "block_root {block_root:?} references unknown chain {chain_id}" + )))?; let response = response.and_then(|(blocks, timestamp)| { let block = blocks @@ -414,6 +737,10 @@ impl ForwardSync { Ok((block, timestamp)) }); + // TODO(tree-sync): add some check to make sure that distinct lookups for the same + // block root don't mess with each other. That check must happen before triggering + // errors for bad state + match response { Ok((block, received)) => { debug!(%req_id, "Forward sync block header downloaded success"); @@ -421,7 +748,7 @@ impl ForwardSync { let block_header = block.message().block_header(); let parent_root = block_header.parent_root; - lookup.header_request()?.on_download_success( + chain.header_request()?.on_download_success( req_id, peer_id, block_header.clone(), @@ -438,9 +765,7 @@ impl ForwardSync { // TODO(tree-sync): should check if the block is descendant of finalized // TODO(tree-sync): on finalization or every interval we should drop branches that // conflict with finality - let parent_imported = self.chain.block_is_known_to_fork_choice(&parent_root); - let finalized_checkpoint = self.chain.head().finalized_checkpoint(); - let parent_known = self.blocks.contains_key(&parent_root); + let finalized_checkpoint = cx.chain.head().finalized_checkpoint(); // TODO(tree-sync): check that the slots are 
decreasing, so we don't end up in // an infinite loop. But note that the wrong block will be the descendant. @@ -459,21 +784,36 @@ impl ForwardSync { block_root, block_header.slot, finalized_checkpoint ))); } - if parent_imported || parent_known { + + if cx.chain.block_is_known_to_fork_choice(&parent_root) { + // Parent is imported, we can forward sync this chain // Stop search we reached a known block - self.trigger_forward_sync(cx); + chain.to_forward_sync(parent_root)?; + debug!(%chain_id, ?parent_root, block_count = chain.block_count(), "Forward sync chain reached imported block"); + // Trigger potential foward sync for this chain + self.continue_requests(cx); + } else if let Some(parent_chain_id) = self.block_to_tip.get(&parent_root) { + debug!(%chain_id, %parent_chain_id, ?parent_root, "Forward sync chain reached known block"); + // Parent is part of another chain, stop search + // Stop search we reached a known block + chain.to_waiting_parent(parent_root)?; + // TODO(tree-sync): Add peers recursively to the chain_id, potentially + // splitting the chain when adding peers. } else { - let lookup = self.blocks.get_mut(&block_root).expect("lookup exists"); - let peers = lookup.get_peers(); - self.search(parent_root, &peers, cx); + chain.add_ancestor(parent_root, cx.next_id())?; + // Add to the block_to_tip mapping to respect the invariant "Each block + // root exists in exactly one `Chain::block_roots` list". + self.block_to_tip.insert(parent_root, *chain_id); + // Since the block already points to `chain` we don't need to add peers. + // Just trigger header download for this new root. + self.continue_requests(cx); } } Err(e) => { // Request errors are logged in `SyncNetworkContext::on_rpc_response_result` - lookup - .header_request()? 
- .on_download_error(req_id, Some(e))?; - lookup.send_block_header_request(block_root, cx)?; + chain.header_request()?.on_download_error(req_id, Some(e))?; + // Continue this request to potentially resend the header request + self.continue_requests(cx); } } Ok(SyncBlockResult::Wait) @@ -481,7 +821,9 @@ impl ForwardSync { // Map result Ok to Wait as completing the header request does not complete the overall // ForwardSyncBlock request. - self.handle_result(id.block_root, result.map(|_| SyncBlockResult::Wait), cx); + if let Err(e) = result { + self.handle_result(id.block_root, e, cx); + } } pub fn on_block_download_result( @@ -491,20 +833,18 @@ impl ForwardSync { result: Result<(RpcBlock, BatchPeers), RpcResponseError>, cx: &mut SyncNetworkContext, ) { - let Some(lookup) = self.blocks.get_mut(&id.block_root) else { - // TODO(tree-sync): register metric - debug!(?id, "Received block request for unknown lookup"); + let Some(chain_id) = self.block_to_tip.get(&id.block_root) else { + debug!(?id, "Received block process result for unknown lookup"); return; }; - if let Err(e) = lookup.assert_expected_lookup_id(id) { - debug!(?id, "Unexpected lookup ID"); + let Some(chain) = self.chains.get_mut(chain_id) else { + error!(%chain_id, block_root = ?id.block_root, "Block references unknown chain"); return; - } + }; - let outcome = lookup - .block_request() - .and_then(|block| Ok(block.on_download_result(req_id, result, cx)?)); - self.handle_result(id.block_root, outcome, cx); + if let Err(e) = chain.on_download_result(req_id, result, cx) { + self.handle_result(id.block_root, e, cx); + } } pub fn on_block_process_result( @@ -513,19 +853,36 @@ impl ForwardSync { result: BatchProcessResult, cx: &mut SyncNetworkContext, ) { - let Some(lookup) = self.blocks.get_mut(&id.block_root) else { + let Some(chain_id) = self.block_to_tip.get(&id.block_root) else { debug!(?id, "Received block process result for unknown lookup"); return; }; - if let Err(e) = lookup.assert_expected_lookup_id(id) 
{ - debug!(?id, "Unexpected lookup ID"); + let Some(chain) = self.chains.get_mut(chain_id) else { + error!(%chain_id, block_root = ?id.block_root, "Block references unknown chain"); return; - } + }; - let outcome = lookup - .block_request() - .and_then(|block| Ok(block.on_process_result(result, cx)?)); - self.handle_result(id.block_root, outcome, cx); + match chain.on_process_result(id, result, cx) { + Ok(SyncBlockResult::Done { .. }) => { + metrics::inc_counter(&metrics::SYNC_BLOCKS_PROCESSED); + self.block_to_tip.remove(&id.block_root); + // Find all chains that are awaiting this block to process and continue them + for other_chain in self.chains.values_mut() { + other_chain.on_block_imported(&id.block_root); + } + self.continue_requests(cx); + // If the chain is empty, remove it + if chain.is_empty() { + self.chains.remove(&chain_id); + metrics::inc_counter(&metrics::SYNC_CHAINS_REMOVED); + } + } + // Wait for next event + Ok(SyncBlockResult::Wait) => {} + Err(e) => { + self.handle_result(id.block_root, e, cx); + } + } } pub fn prune(&mut self) { @@ -533,45 +890,30 @@ impl ForwardSync { } pub fn prune_imported_block(&mut self, block_root: Hash256, _imported: bool) { - let mut block_to_delete = block_root; - while let Some(block) = self.blocks.remove(&block_root) { - debug!(?block_root, "Deleted imported block lookup"); - if let Some(parent_root) = block.parent_root() { - block_to_delete = parent_root; - } else { - break; - } - } + // Recursively prune this block and all their ancestors + todo!(); } - fn handle_result( - &mut self, - block_root: Hash256, - result: Result, - cx: &mut SyncNetworkContext, - ) { - match result { - Ok(SyncBlockResult::Done { .. 
}) => { - metrics::inc_counter(&metrics::SYNC_BLOCKS_PROCESSED); - self.blocks.remove(&block_root); - self.trigger_forward_sync(cx); + fn handle_result(&mut self, block_root: Hash256, error: Error, cx: &mut SyncNetworkContext) { + debug!(?error, ?block_root, "Dropping forward sync block lookup"); + let Some(chain_id) = self.block_to_tip.get(&block_root).copied() else { + debug!(?block_root, "Handling error for unknown block_root"); + return; + }; + match error { + Error::InternalError(_) | Error::TooManyErrors(_) => { + let block_to_children = self + .compute_children() + .expect("TODO: handle this error if it can't be avoided"); + self.drop_chain_and_children(chain_id, &block_to_children); } - // Wait for next event - Ok(SyncBlockResult::Wait) => {} - Err(e) => { - debug!(error = ?e, ?block_root, "Dropping forward sync block lookup"); - match e { - Error::InternalError(_) | Error::TooManyErrors(_) => { - let block_to_children = self.compute_children(); - self.drop_lookup_and_children(block_root, &block_to_children); - } - Error::BlockConflictsWithFinality(_e) => { - let block_to_children = self.compute_children(); - self.drop_lookup_and_children(block_root, &block_to_children); - // TODO(tree-sync): penalize peers of this lookups - // TODO(tree-sync): add blocks to a failed cache to prevent re-sync - } - } + Error::BlockConflictsWithFinality(_e) => { + let block_to_children = self + .compute_children() + .expect("TODO: handle this error if it can't be avoided"); + self.drop_chain_and_children(chain_id, &block_to_children); + // TODO(tree-sync): penalize peers of this lookups + // TODO(tree-sync): add blocks to a failed cache to prevent re-sync } } } @@ -591,111 +933,106 @@ impl ForwardSync { // TODO(tree-sync): don't build on demand, cache roots somewhere - let blocks_syncing = self - .blocks + let mut blocks_syncing = self + .chains .values() - .filter(|block| block.is_syncing()) - .count(); + .map(|chain| chain.syncing_blocks_count()) + .sum::(); + + // A chain 
can be in two states: + // - Active backfill + // - Oldest ancestor known + let mut new_syncing_blocks = false; // Have up to 2 blocks syncing - for _ in blocks_syncing..BLOCK_BUFFER_SIZE { - // Find the block range with most peers and highest slot. This is the block - // to be used as tip of the chain of blocks to fetch. - let Some(block_root) = self - .blocks - .iter() - .filter_map(|(root, block)| { - let header = match &block.status { - // Ignore blocks that are still downloading - Status::BackfillHeader { request, .. } => match request.is_complete() { - Some(header) => header, - None => return None, - }, - // Ignore blocks already syncing - Status::ForwardSyncBlock { .. } => return None, - }; - // Check if the parent is known in the header tree - let is_candidate = if let Some(parent) = self.blocks.get(&header.parent_root) { - parent.is_syncing() - } else { - // TODO(tree-sync): cache this calls in the struct - cx.chain.block_is_known_to_fork_choice(&header.parent_root) - }; + // Find the block range with most peers and highest slot. This is the block + // to be used as tip of the chain of blocks to fetch. + let mut chains_by_peer_count = self + .chains + .iter_mut() + .filter_map(|(_, chain)| { + if matches!(chain.status, ChainStatus::ForwardSync { .. 
}) { + Some((chain.peer_count(), chain)) + } else { + None + } + }) + .collect::>(); - if is_candidate { - Some((block.peer_count(), Slot::new(u64::MAX) - header.slot, root)) - } else { - None - } - }) - .max() - .map(|(_, _, root)| *root) - else { - break; - }; + chains_by_peer_count.sort_by_key(|(peer_count, _)| *peer_count); - // Start syncing `block_root` - match self - .blocks - .get_mut(&block_root) - .ok_or(Error::InternalError(format!( - "self.blocks must contain an entry with {block_root}" - ))) - .and_then(|block| { - block.to_foward_sync_block(block_root)?; - Ok(block.id) - }) { - Ok(id) => debug!(?id, "Starting forward sync of block"), - // Should never error - Err(e) => error!("Unable to transition header to forward sync block: {e:?}"), + for (_, chain) in chains_by_peer_count { + if let ChainStatus::ForwardSync { + block_roots, + syncing_blocks, + } = &mut chain.status + { + /// block_roots sorting: tip first, oldest ancestor last => pop + if let Some(next_block) = block_roots.pop() { + syncing_blocks.push_back(SyncBlock::new( + RangeRequestId::ForwardSync(HeaderLookupId { + id: cx.next_id(), + block_root: next_block, + }), + next_block, + &chain.peers.iter().copied().collect::>(), + )); + blocks_syncing += 1; + new_syncing_blocks = true; + if blocks_syncing >= BLOCK_BUFFER_SIZE { + break; + } + } } - - new_syncing_blocks = true; } if new_syncing_blocks { - self.continue_syncing_blocks(cx); + self.continue_requests(cx); } } - fn continue_syncing_blocks(&mut self, cx: &mut SyncNetworkContext) { - let mut lookups_to_drop = vec![]; + fn continue_requests(&mut self, cx: &mut SyncNetworkContext) { + // TODO(tree-sync): optimize this call to maybe not do it everytime + self.trigger_forward_sync(cx); - for (block_root, lookup) in self.blocks.iter_mut() { - let result = match &mut lookup.status { - Status::BackfillHeader { .. } => continue, - Status::ForwardSyncBlock { request, .. 
} => request.continue_request(cx), - }; + let mut chains_to_drop = vec![]; - if let Err(_e) = result { + for (chain_id, chain) in self.chains.iter_mut() { + if let Err(_e) = chain.continue_requests(cx) { // TODO(tree-sync): should log error? - lookups_to_drop.push(*block_root); + chains_to_drop.push(*chain_id); } } - let block_to_children = self.compute_children(); - for block_root in lookups_to_drop { - self.drop_lookup_and_children(block_root, &block_to_children); + let chain_to_children = self + .compute_children() + .expect("Handle this error if it can't be avoided"); + for chain_id in chains_to_drop { + self.drop_chain_and_children(chain_id, &chain_to_children); } } - /// Drop lookup `block_root` if it exists and all its children - fn drop_lookup_and_children( + /// Drop chain if it exists and all its children + fn drop_chain_and_children( &mut self, - initial_block_root: Hash256, - block_to_children: &HashMap>, + initial_chain_id: TipId, + chain_to_children: &HashMap>, ) { - let mut queue: VecDeque = VecDeque::from([initial_block_root]); + let mut queue: VecDeque = VecDeque::from([initial_chain_id]); - while let Some(block_root) = queue.pop_front() { + while let Some(chain_id) = queue.pop_front() { // Remove the node itself. - if let Some(block) = self.blocks.remove(&block_root) { - debug!(?block_root, id = %block.id, "Dropping forward sync block lookup"); - metrics::inc_counter(&metrics::SYNC_LOOKUPS_DROPPED); + if let Some(chain) = self.chains.remove(&chain_id) { + metrics::inc_counter(&metrics::SYNC_CHAINS_REMOVED); + for block_root in chain.iter_block_roots() { + self.block_to_tip.remove(block_root); + debug!(?block_root, id = %chain_id, "Dropping forward sync block lookup"); + metrics::inc_counter(&metrics::SYNC_LOOKUPS_DROPPED); + } // Only remove children if the node still existed // Push its children—if any—onto the work list. 
- if let Some(children) = block_to_children.get(&block_root) { + if let Some(children) = chain_to_children.get(&chain_id) { queue.extend(children.iter().cloned()); } } @@ -703,63 +1040,82 @@ impl ForwardSync { } /// Drop lookup `block_root` if it exists and all its children - fn compute_children(&mut self) -> HashMap> { - let mut block_to_children = HashMap::>::new(); - for (block_root, block) in self.blocks.iter() { - if let Some(parent_root) = block.parent_root() { - block_to_children - .entry(parent_root) + fn compute_children(&mut self) -> Result>, InternalError> { + let mut chain_to_children = HashMap::>::new(); + for (chain_id, chain) in self.chains.iter() { + if let Some(parent_root) = chain.parent_root() { + // TODO(tree-sync): Is this error impossible? + let parent_chain_id = self.block_to_tip + .get(&parent_root) + .ok_or(InternalError(format!( + "Chain {chain_id} has a parent root that points to an unknown block {parent_root:?}" + )))?; + + chain_to_children + .entry(*parent_chain_id) .or_default() - .push(*block_root); + .push(*chain_id); } } - block_to_children + Ok(chain_to_children) } /// Drop lookups with least amount of peers and slot until we pruned PRUNE_COUNT lookups - fn prune_least_popular_lookups(&mut self) { - let mut blocks = self - .blocks + fn prune_least_popular_lookups(&mut self) -> Result<(), InternalError> { + let mut chains = self + .chains .iter() - .filter_map(|(block_root, block)| match &block.status { - // Prune only lookups that are not syncing and we know the header - Status::BackfillHeader { peers, request, .. } => request - .is_complete() - .map(|header| (block.peer_count(), header.slot, *block_root)), - Status::ForwardSyncBlock { .. 
} => None, - }) + // TODO: Prune only lookups that are not syncing and we know the header + .map(|(chain_id, chain)| (chain.peer_count(), *chain_id)) .collect::>(); - blocks.sort_unstable(); + chains.sort_unstable(); - let block_to_children = self.compute_children(); - for (_, _, block_root) in blocks { - self.drop_lookup_and_children(block_root, &block_to_children); - if self.blocks.len() < MAX_LOOKUP_COUNT - PRUNE_COUNT { + let chain_to_children = self.compute_children()?; + for (_, chain_id) in chains { + self.drop_chain_and_children(chain_id, &chain_to_children); + if self.block_to_tip.len() < MAX_LOOKUP_COUNT - PRUNE_COUNT { break; } } + Ok(()) } pub fn register_metrics(&self) { - if let Some((min_slot, max_slot)) = self - .blocks - .values() - .filter_map(|lookup| { - if let Status::BackfillHeader { request, .. } = &lookup.status { - request.is_complete().map(|header| header.slot) - } else { - None - } - }) - .minmax() - .into_option() - { + let (min_slot, max_slot) = + self.chains + .values() + .fold((None::, None::), |(gmin, gmax), chain| { + let gmin = match (gmin, chain.min_slot()) { + (Some(a), Some(b)) => Some(a.min(b)), + (None, some @ Some(_)) => some, // first non-None wins + (x, None) => x, + }; + + let gmax = match (gmax, chain.max_slot()) { + (Some(a), Some(b)) => Some(a.max(b)), + (None, some @ Some(_)) => some, + (x, None) => x, + }; + + (gmin, gmax) + }); + + if let (Some(min_slot), Some(max_slot)) = (min_slot, max_slot) { metrics::set_gauge(&metrics::SYNC_HEADER_MIN_SLOT, min_slot.as_u64() as i64); metrics::set_gauge(&metrics::SYNC_HEADER_MAX_SLOT, max_slot.as_u64() as i64); } + metrics::set_gauge(&metrics::SYNC_HEADERS_COUNT, self.block_to_tip.len() as i64); + metrics::set_gauge(&metrics::SYNC_CHAINS_COUNT, self.chains.len() as i64); + // Min header // Highest known header // Current head } } + +impl std::fmt::Display for TipId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} diff --git 
a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index d5e837a4204..ffe90b7cafe 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -373,8 +373,12 @@ impl SyncManager { // Adds a peer to forward sync. Since its possible that a lookup just gained a new peer we // attempt to continue idle custody by root requests that are waiting for peers. fn add_peer_with_imported_block_root(&mut self, peer_id: PeerId, block_root: Hash256) { - self.forward_sync - .search(block_root, &[peer_id], &mut self.network); + if let Err(e) = self + .forward_sync + .search(block_root, &[peer_id], &mut self.network) + { + error!("Error adding peer to forward sync {block_root:?} {peer_id} {e:?}"); + } // Try to make progress on custody requests that are waiting for peers for (id, result) in self.network.continue_custody_by_root_requests() { @@ -571,10 +575,7 @@ impl SyncManager { // We don't need to subscribe if the old state is a state that would have already // invoked this call. if new_state.is_synced() - && !matches!( - old_state, - SyncState::Synced | SyncState::BackFillSyncing { .. 
} - ) + && !matches!(old_state, SyncState::Synced | SyncState::BackFillSyncing) { self.network.subscribe_core_topics(); } diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 7a8d277da0c..1b8ecdf66fd 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -808,6 +808,7 @@ impl SyncNetworkContext { /// - If the event completes the request, it returns `Some(Ok)` with a vec of blocks /// - If the event is an error it fails the request and returns `Some(Err)` /// - else it appends the response chunk to the active request state and returns `None` + #[allow(clippy::type_complexity)] pub(crate) fn on_blocks_by_root_response( &mut self, id: BlocksByRootRequestId, @@ -820,6 +821,7 @@ impl SyncNetworkContext { /// Processes a single `RpcEvent` blobs_by_root RPC request. /// Same logic as [`on_blocks_by_root_response`] + #[allow(clippy::type_complexity)] pub(crate) fn on_blobs_by_root_response( &mut self, id: BlobsByRootRequestId, diff --git a/beacon_node/network/src/sync/sync_block.rs b/beacon_node/network/src/sync/sync_block.rs index 7d8d76dab85..2d87cea5283 100644 --- a/beacon_node/network/src/sync/sync_block.rs +++ b/beacon_node/network/src/sync/sync_block.rs @@ -21,6 +21,7 @@ pub struct SyncBlock { id: RangeRequestId, block_root: Hash256, failed_peers: HashSet, + // TODO(tree-sync): deprecate this shared state for manual addition and removal peers: Arc>>, request: SyncingStatus, download_errors: usize, @@ -34,6 +35,7 @@ enum SyncingStatus { Processing(RpcBlock, BatchPeers), } +#[must_use] pub enum SyncBlockResult { Done { parent_root: Hash256, slot: Slot }, Wait, @@ -60,6 +62,14 @@ impl SyncBlock { } } + pub fn block_root(&self) -> &Hash256 { + &self.block_root + } + + pub fn id(&self) -> RangeRequestId { + self.id + } + pub fn peer_count(&self) -> usize { self.peers.read().len() } @@ -87,7 +97,7 @@ impl SyncBlock { req_id: 
ComponentsByRootRequestId, result: Result<(RpcBlock, BatchPeers), RpcResponseError>, cx: &mut SyncNetworkContext, - ) -> Result { + ) -> Result<(), Error> { match &mut self.request { SyncingStatus::Downloading(expected_id) => { if req_id != *expected_id { @@ -100,7 +110,7 @@ impl SyncBlock { Ok((block, peers)) => { debug!(id = %self.id, "Sync block downloaded"); self.request = SyncingStatus::AwaitingProcessing(block, peers); - self.continue_request(cx) + Ok(()) } Err(e) => { debug!(id = %self.id, error = ?e, "Sync block download error"); @@ -111,7 +121,7 @@ impl SyncBlock { return Err(Error::TooManyErrors("download errors".to_owned())); } - self.continue_request(cx) + Ok(()) } } } @@ -151,7 +161,7 @@ impl SyncBlock { } self.request = SyncingStatus::AwaitingDownload; - self.continue_request(cx) + Ok(SyncBlockResult::Wait) } }, _ => Err(Error::InternalError( @@ -160,10 +170,13 @@ impl SyncBlock { } } + /// Make progress on the request. Note that a request can never finish on this call, thus it + /// does not return `SyncBlockResult`. pub fn continue_request( &mut self, cx: &mut SyncNetworkContext, - ) -> Result { + ok_to_import: bool, + ) -> Result<(), Error> { match &mut self.request { SyncingStatus::AwaitingDownload => { match cx.block_components_by_range_request( @@ -174,7 +187,7 @@ impl SyncBlock { ) { Ok(req_id) => { self.request = SyncingStatus::Downloading(req_id); - Ok(SyncBlockResult::Wait) + Ok(()) } Err(e) => match e { RpcRequestSendError::NoPeers | RpcRequestSendError::InternalError(_) => { @@ -185,7 +198,7 @@ impl SyncBlock { }, } } - SyncingStatus::Downloading(_) => Ok(SyncBlockResult::Wait), + SyncingStatus::Downloading(_) => Ok(()), SyncingStatus::AwaitingProcessing(block, peers) => { // No need to check if block is already imported here, we'll get an error // from the beacon processor anyway. 
No need to add more code to handle this @@ -197,7 +210,7 @@ impl SyncBlock { .chain .block_is_known_to_fork_choice(&block.as_block().parent_root()) { - return Ok(SyncBlockResult::Wait); + return Ok(()); } if let Some(beacon_processor) = cx.beacon_processor_if_enabled() { @@ -212,7 +225,7 @@ impl SyncBlock { ))) } else { self.request = SyncingStatus::Processing(block.clone(), peers.clone()); - Ok(SyncBlockResult::Wait) + Ok(()) } } else { // TODO(tree-sync): This error will cause the full chain of headers to @@ -223,7 +236,7 @@ impl SyncBlock { )) } } - SyncingStatus::Processing(..) => Ok(SyncBlockResult::Wait), + SyncingStatus::Processing(..) => Ok(()), } } } diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index f428def6004..0bf19ec871a 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -300,6 +300,7 @@ impl TestRig { .sync_manager .forward_sync() .block_peers(&block_root) + .expect("Error getting block peers") .unwrap_or_else(|| panic!("Unknown block {block_root}")); peers.sort_unstable(); let mut expected_peers = expected_peers.to_vec(); @@ -607,11 +608,7 @@ impl TestRig { /// Retrieves an unknown number of requests for data columns of `block_root`. Because peer ENRs /// are random, and peer selection is random, the total number of batched requests is unknown. 
- fn expect_data_columns_by_root_requests( - &mut self, - block_root: Hash256, - count: usize, - ) -> DCByRootIds { + fn expect_data_columns_by_root_requests(&mut self, block_root: Hash256) -> DCByRootIds { self.pop_received_network_events(&mut |ev| match ev { NetworkMessage::SendRequest { peer_id: _, @@ -630,12 +627,8 @@ impl TestRig { }) } - fn expect_only_data_columns_by_root_requests( - &mut self, - for_block: Hash256, - count: usize, - ) -> DCByRootIds { - let ids = self.expect_data_columns_by_root_requests(for_block, count); + fn expect_only_data_columns_by_root_requests(&mut self, for_block: Hash256) -> DCByRootIds { + let ids = self.expect_data_columns_by_root_requests(for_block); self.expect_empty_network(); ids } @@ -905,7 +898,7 @@ impl TestRig { } fn single_lookup_from_attestation_setup(&mut self) -> (Hash256, PeerId) { - let (head_root, head_slot) = self.create_unimported_parent_chain(1); + let (head_root, _) = self.create_unimported_parent_chain(1); // Use a supernode so Fulu tests can pass without edits let peer_id = self.new_connected_supernode_peer(); // Trigger the request @@ -915,7 +908,7 @@ impl TestRig { } pub fn parent_lookup_from_unknown_block_parent_setup(&mut self) -> (Hash256, PeerId) { - let (head_root, head_slot) = self.create_unimported_parent_chain(2); + let (head_root, _) = self.create_unimported_parent_chain(2); // Use a supernode so Fulu tests can pass without edits let peer_id = self.new_connected_supernode_peer(); let head_block = self @@ -1058,10 +1051,10 @@ fn test_parent_lookup_drop_parent() { let (head_root, _) = r.parent_lookup_from_unknown_block_parent_setup(); // Complete the header chain so the first block can start syncing r.complete_header_chain(); - let chain = r.fetch_unimported_ancestor_chain(head_root); + let blocks = r.fetch_unimported_ancestor_chain(head_root); // Return wrong blocks for the parent of `head_root` = chain[1] r.progress_until_no_events( - filter().block_root(chain[1]), + 
filter().block_root(blocks[1]), complete().return_wrong_blocks(), ); r.expect_penalties("UnrequestedBlockRoot"); @@ -1075,15 +1068,15 @@ fn test_parent_lookup_drop_child() { let (head_root, _) = r.parent_lookup_from_unknown_block_parent_setup(); // Complete the header chain so the first block can start syncing r.complete_header_chain(); - let chain = r.fetch_unimported_ancestor_chain(head_root); + let blocks = r.fetch_unimported_ancestor_chain(head_root); // Return wrong blocks for the parent of `head_root` = chain[1] r.progress_until_no_events( - filter().block_root(chain[0]), + filter().block_root(blocks[0]), complete().return_wrong_blocks(), ); r.expect_penalties("UnrequestedBlockRoot"); // It should only drop the newest lookup - r.assert_active_lookups(&[chain[1]]); + r.assert_active_lookups(&[blocks[1]]); } // TODO(tree-sync): Current behaviour drops the lookup if there's no peers left @@ -1115,7 +1108,7 @@ fn test_lookup_disconnection_peer_left() { fn test_lookup_add_peers_to_parent() { let mut r = TestRig::test_setup(); let (head_root, _) = r.create_unimported_parent_chain(4); - let chain = r.fetch_unimported_ancestor_chain(head_root); + let blocks = r.fetch_unimported_ancestor_chain(head_root); let peer_id = r.new_connected_peer(); r.trigger_unknown_block_from_attestation(head_root, peer_id); r.complete_header_chain(); @@ -1127,7 +1120,7 @@ fn test_lookup_add_peers_to_parent() { let mut expected_peers = new_peers.clone(); expected_peers.push(peer_id); - for block in chain { + for block in blocks { // Parent has the original unknown parent event peer + new peer r.assert_lookup_peers(block, &expected_peers); } diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index c1e5c08506a..2d94c7db0ec 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -267,7 +267,7 @@ impl CompleteConfig { self } - pub fn rpc_error_response(mut self, error: RpcErrorResponse) -> 
Self { + pub fn rpc_error_response(self, error: RpcErrorResponse) -> Self { self.rpc_error(RPCError::ErrorResponse(error, "".to_owned())) } @@ -286,7 +286,7 @@ impl CompleteConfig { self } - pub fn return_no_blocks(mut self) -> Self { + pub fn return_no_blocks(self) -> Self { self.return_no_blocks_n_times(usize::MAX) } @@ -476,6 +476,7 @@ impl TestRig { *block.message_mut().parent_root_mut() = parent_root; *block.message_mut().slot_mut() = slot; let block_root = block.canonical_root(); + self.log(&format!("Block slot {slot} root {block_root:?}")); self.blocks_by_root.insert(block_root, block.into()); parent_root = block_root; @@ -645,8 +646,7 @@ impl TestRig { .blob_kzg_commitments() .unwrap() .get(blob_id.index as usize) - .unwrap() - .clone(); + .unwrap(); let signed_block_header = block.signed_block_header(); // We need to produce a DataColumn with valid inclusion proof, but can @@ -957,7 +957,7 @@ fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { return; } - let (head_root, head_slot) = r.create_unimported_parent_chain(2); + let (head_root, _) = r.create_unimported_parent_chain(2); let remote_info = sync_info_with_head_root(head_root); r.add_sync_peer(false, remote_info.clone()); @@ -993,7 +993,7 @@ fn finalized_sync_single_custody_peer_failure() { return; } - let (head_root, head_slot) = r.create_unimported_parent_chain(2); + let (head_root, _) = r.create_unimported_parent_chain(2); let peer_1 = r.new_connected_supernode_peer(); // Trigger the request r.trigger_unknown_block_from_attestation(head_root, peer_1); From 8c857253aac7be0107acc345a0f68984ff0e9c0a Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 9 Jul 2025 20:48:59 +0200 Subject: [PATCH 46/66] Fix some lints and document --- beacon_node/network/src/metrics.rs | 24 +- .../network_beacon_processor/sync_methods.rs | 5 +- .../network/src/sync/backfill_sync/mod.rs | 16 +- beacon_node/network/src/sync/forward_sync.rs | 358 +++++++++++------- 
beacon_node/network/src/sync/manager.rs | 14 +- .../sync/network_context/download_request.rs | 2 +- beacon_node/network/src/sync/sync_block.rs | 9 +- beacon_node/network/src/sync/tests/lookups.rs | 6 +- beacon_node/network/src/sync/tests/range.rs | 10 +- 9 files changed, 248 insertions(+), 196 deletions(-) diff --git a/beacon_node/network/src/metrics.rs b/beacon_node/network/src/metrics.rs index eeb2e888dff..aea9f9aba10 100644 --- a/beacon_node/network/src/metrics.rs +++ b/beacon_node/network/src/metrics.rs @@ -415,11 +415,10 @@ pub static SYNC_CHAINS_ADDED: LazyLock> = LazyLock::new(|| { "Total count of forward sync chains added", ) }); -pub static SYNCING_CHAINS_IGNORED_BLOCKS: LazyLock> = LazyLock::new(|| { - try_create_int_counter_vec( +pub static SYNCING_CHAINS_IGNORED_BLOCKS: LazyLock> = LazyLock::new(|| { + try_create_int_counter( "sync_range_chains_ignored_blocks_total", "Total count of ignored blocks when processing a syncing chain batch per chain type", - &["chain_type"], ) }); pub static SYNCING_CHAINS_PROCESSED_BATCHES: LazyLock> = @@ -452,23 +451,10 @@ pub static SYNC_LOOKUP_CREATED: LazyLock> = LazyLock::new(|| "Total count of sync lookups created", ) }); -pub static SYNC_LOOKUP_DROPPED: LazyLock> = LazyLock::new(|| { - try_create_int_counter_vec( - "sync_lookups_dropped_total", - "Total count of sync lookups dropped by reason", - &["reason"], - ) -}); -pub static SYNC_LOOKUP_COMPLETED: LazyLock> = LazyLock::new(|| { - try_create_int_counter( - "sync_lookups_completed_total", - "Total count of sync lookups completed", - ) -}); -pub static SYNC_LOOKUPS_STUCK: LazyLock> = LazyLock::new(|| { +pub static SYNC_FORWARD_BLOCKS_DROPPED: LazyLock> = LazyLock::new(|| { try_create_int_counter( - "sync_lookups_stuck_total", - "Total count of sync lookups that are stuck and dropped", + "sync_forward_lookups_dropped_total", + "Total count of forward sync blocks dropped by reason", ) }); pub static SYNC_ACTIVE_NETWORK_REQUESTS: LazyLock> = LazyLock::new(|| { diff 
--git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index ca16fbf39fa..01df2304b4b 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -128,7 +128,10 @@ impl NetworkBeaconProcessor { { (imported_blocks, Ok(_)) => { let ignored_blocks = sent_blocks - imported_blocks; - metrics::inc_gauge(&metrics::SYNCING_CHAINS_IGNORED_BLOCKS); + metrics::inc_counter_by( + &metrics::SYNCING_CHAINS_IGNORED_BLOCKS, + ignored_blocks as u64, + ); debug!( %id, first_block_slot = start_slot, diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 9e7e5582c07..efe7ddf59d9 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -203,14 +203,12 @@ impl BackFillSync { pub fn peer_disconnected(&mut self, peer_id: &PeerId) { self.status.remove_peer(peer_id); - if self.status.peer_count() == 0 { - if self.state() == BackFillState::Syncing { - info!( - "reason" = "insufficient_synced_peers", - "Backfill sync paused" - ); - self.set_state(BackFillState::Paused); - } + if self.status.peer_count() == 0 && self.state() == BackFillState::Syncing { + info!( + "reason" = "insufficient_synced_peers", + "Backfill sync paused" + ); + self.set_state(BackFillState::Paused); } } @@ -227,7 +225,7 @@ impl BackFillSync { pub fn on_block_process_result( &mut self, - id: Id, + _id: Id, result: BatchProcessResult, cx: &mut SyncNetworkContext, ) { diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index ca96c1a6223..bd0a3881e06 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -7,8 +7,7 @@ use crate::sync::network_context::{BatchPeers, RpcResponseResult}; use 
crate::sync::sync_block::{Error as SyncBlockError, SyncBlock, SyncBlockResult}; use crate::sync::BatchProcessResult; use beacon_chain::block_verification_types::RpcBlock; -use beacon_chain::{BeaconChain, BeaconChainTypes}; -use itertools::Itertools; +use beacon_chain::BeaconChainTypes; use lighthouse_network::service::api_types::{ BlocksByRootRequestId, BlocksByRootRequester, ComponentsByRootRequestId, HeaderLookupId, Id, RangeRequestId, @@ -37,6 +36,39 @@ struct TipId(u32); /// /// Goals /// - Download multiple blocks at once to increase download speed +/// +/// +/// ## da_checker oracle +/// +/// TODO(tree-sync): re-implement if necessary +/// +/// +/// ## duplicate_cache with gossip blocks +/// +/// Gossip may receive and process the same block that ForwardSync attempts to process. +/// +/// a. Gossip receives block X and sends to process +/// b. ForwardSync downloads block X +/// c. ForwardSync sends block X for process +/// +/// Consider the order of events +/// - [a,b,c]: the gossip block is inserted in the `duplicate_cache` and the RPC block is queued. +/// Step b could be skipped, but we accept the inneficiency for simplicity. +/// - [b,a,c]: the RPC block is downloaded, gossip block into `duplicate_cache` and RPC block queued +/// - [b,c,a]: the RPC block is inserted in the `duplicate_cache` and the gossip block is queued +/// +/// ## Pruning +/// +/// So chose to not explicitly implement pruning for forward sync. Chains can be pruned by: +/// +/// 1. Checking if the conflict with finality once finality advances: If this happens once we +/// attempt to import the first block of the chain we'll get an unknown parent error. The chain +/// will fail and be dropped = so this pruning happens by default. +/// 2. If their blocks are imported through another source: If this happens when we attempt to +/// process the block we'll get a duplicate_cache hit or a block already known error. 
In either +/// case the processing result for the block with be an Ok, and we'll move to the next block. +/// +/// pub struct ForwardSync { block_to_tip: HashMap, chains: HashMap>, @@ -45,50 +77,45 @@ pub struct ForwardSync { /// Chain of consecutive blocks that are imported by the same set of peers struct Chain { peers: HashSet, - status: ChainStatus, -} - -struct ForwardSyncBlock { - id: HeaderLookupId, status: Status, } -enum ChainStatus { - // Recursively fetch headers until discovering a parent_root that is known, then transition - // state to `ForwardSync`. +type PendingBlock = (Hash256, Slot, Id); + +#[allow(clippy::large_enum_variant)] +enum Status { + /// Recursively fetch headers until discovering a parent_root that is known. Its list of + /// block_roots can grow by appending ancestors. + /// - Transition to `WaitingParentChain` if the parent is known but not imported + /// - Transition to `ForwardSync` if the parent is imported BackfillHeaders { - /// Headers descendant of `next_block_root` that are already downloaded. + /// Headers descendant of `next_header_request.block_root` that are already downloaded. + /// Does not include `next_header_request.block_root`. /// Sorting: tip first, oldest ancestor last - block_roots: Vec, + block_roots: Vec, /// Oldest ancestor block root of this Chain. next_header_request: HeaderRequest, }, + /// Waits for a parent block in a different chain to be imported. Its block_root list does not + /// change. + /// - Transitions to `ForwardSync` once `parent_root` is imported. WaitingParentChain { + /// Parent root of the last block_root in `block_roots` parent_root: Hash256, /// Sorting: tip first, oldest ancestor last - block_roots: Vec, + block_roots: Vec, }, - // Sync blocks from old to new buffering some blocks + /// Download and process block_roots from oldest ancestor to tip. Its list of block_roots does + /// not grow, only removed block roots once processed. 
ForwardSync { /// Sorting: tip first, oldest ancestor last - block_roots: Vec, + block_roots: Vec, /// Sorting: oldest ancestor first syncing_blocks: VecDeque>, }, } -enum Status { - // TODO(tree-sync): Make the "waiting" completed header requests as memory cheap as possible - BackfillHeader { - failed_peers: HashSet, - request: DownloadRequest, - }, - ForwardSyncBlock { - header: BeaconBlockHeader, - request: SyncBlock, - }, -} - +/// Tracks a request to download a BeaconBlockHeader by block root struct HeaderRequest { id: HeaderLookupId, block_root: Hash256, @@ -154,7 +181,7 @@ impl Chain { fn new(block_root: Hash256, id: Id, initial_peers: &[PeerId]) -> Self { Self { peers: HashSet::from_iter(initial_peers.iter().copied()), - status: ChainStatus::BackfillHeaders { + status: Status::BackfillHeaders { block_roots: vec![], next_header_request: HeaderRequest::new(block_root, id), }, @@ -171,47 +198,51 @@ impl Chain { self.peers.remove(peer) } + /// Returns a Vec of peers that have imported the blocks in this chain fn get_peers(&self) -> Vec { self.peers.iter().copied().collect() } + /// Returns the count of peers that have imported the blocks in this chain fn peer_count(&self) -> usize { self.peers.len() } + /// Returns the parent root of the oldest ancestor of this chain. Returns None if the chain is + /// already processing = its parent has already been imported. fn parent_root(&self) -> Option { match &self.status { - ChainStatus::BackfillHeaders { + Status::BackfillHeaders { next_header_request, .. } => Some(next_header_request.block_root), - ChainStatus::WaitingParentChain { parent_root, .. } => Some(*parent_root), - ChainStatus::ForwardSync { .. } => None, + Status::WaitingParentChain { parent_root, .. } => Some(*parent_root), + Status::ForwardSync { .. } => None, } } - fn tip(&self) -> Hash256 { + /// Returns the tip of this chain. 
Returns None if the chain is empty (should not happen) + fn tip(&self) -> Option { match &self.status { - ChainStatus::BackfillHeaders { + Status::BackfillHeaders { next_header_request, block_roots, - } => block_roots - .first() - .copied() - .unwrap_or(next_header_request.block_root), - ChainStatus::WaitingParentChain { block_roots, .. } => block_roots - .first() - .copied() - .expect("block roots is not empty"), - ChainStatus::ForwardSync { + } => Some( + block_roots + .first() + .map(|block| block.0) + .unwrap_or(next_header_request.block_root), + ), + Status::WaitingParentChain { block_roots, .. } => { + block_roots.first().map(|block| block.0) + } + Status::ForwardSync { block_roots, syncing_blocks, - } => block_roots.first().copied().unwrap_or_else(|| { - syncing_blocks - .back() - .map(|block| *block.block_root()) - .expect("blocks are not empty") - }), + } => block_roots + .first() + .map(|block| block.0) + .or_else(|| syncing_blocks.back().map(|block| *block.block_root())), } } @@ -219,8 +250,9 @@ impl Chain { /// ancestors, and leaves `self` with only the descendants of `block_root` excluding /// `block_root` fn split_by(&mut self, block_root: Hash256) -> Result { + // TODO(tree-sync): Review this logic, it's sensitive and not trivial let status = match &mut self.status { - ChainStatus::BackfillHeaders { + Status::BackfillHeaders { block_roots, next_header_request, } => { @@ -230,46 +262,46 @@ impl Chain { std::mem::replace(next_header_request, HeaderRequest::empty()); let new_block_roots = - if let Some(idx) = block_roots.iter().position(|b| b == &block_root) { + if let Some(idx) = block_roots.iter().position(|b| b.0 == block_root) { // ..= to keep the block_root on the left block_roots.drain(0..=idx).collect::>() } else { // TODO(tree-sync): check that block_root is the next_root or error vec![] }; - self.status = ChainStatus::WaitingParentChain { + self.status = Status::WaitingParentChain { parent_root: block_root, block_roots, }; - 
ChainStatus::BackfillHeaders { + Status::BackfillHeaders { block_roots: new_block_roots, next_header_request, } } - ChainStatus::WaitingParentChain { + Status::WaitingParentChain { parent_root, block_roots, } => { let idx = block_roots .iter() - .position(|b| b == &block_root) + .position(|b| b.0 == block_root) .ok_or(InternalError(format!( "block_root {block_root:?} no in chain" )))?; // ..= to keep the block_root on the left let new_block_roots = block_roots.drain(0..=idx).collect::>(); let parent_root = *parent_root; - self.status = ChainStatus::WaitingParentChain { + self.status = Status::WaitingParentChain { parent_root: block_root, block_roots: std::mem::take(block_roots), }; - ChainStatus::WaitingParentChain { + Status::WaitingParentChain { parent_root, block_roots: new_block_roots, } } - ChainStatus::ForwardSync { .. } => { + Status::ForwardSync { .. } => { todo!("How to split a chain that's already syncing?"); } }; @@ -281,36 +313,41 @@ impl Chain { }) } - fn to_foward_sync_block(&mut self) -> Result { - todo!(); - } - + /// If this chain is waiting for `block_root` it transitions to forward sync. fn on_block_imported(&mut self, block_root: &Hash256) { match &mut self.status { - ChainStatus::BackfillHeaders { .. } => {} - ChainStatus::WaitingParentChain { + Status::BackfillHeaders { .. } => {} + Status::WaitingParentChain { block_roots, parent_root, } => { if block_root == parent_root { - self.status = ChainStatus::ForwardSync { + self.status = Status::ForwardSync { block_roots: std::mem::take(block_roots), syncing_blocks: <_>::default(), }; } } - ChainStatus::ForwardSync { .. } => {} + Status::ForwardSync { .. 
} => {} } } - fn to_forward_sync(&mut self, parent_root: Hash256) -> Result<(), InternalError> { + /// Transitions to forward sync + fn backfill_headers_to_forward_sync( + &mut self, + block: BeaconBlockHeader, + ) -> Result<(), InternalError> { match &mut self.status { - ChainStatus::BackfillHeaders { + Status::BackfillHeaders { block_roots, next_header_request, } => { - block_roots.push(next_header_request.block_root); - self.status = ChainStatus::ForwardSync { + block_roots.push(( + block.canonical_root(), + block.slot, + next_header_request.id.id, + )); + self.status = Status::ForwardSync { block_roots: std::mem::take(block_roots), syncing_blocks: <_>::default(), }; @@ -322,9 +359,9 @@ impl Chain { fn block_count(&self) -> usize { match &self.status { - ChainStatus::BackfillHeaders { block_roots, .. } - | ChainStatus::WaitingParentChain { block_roots, .. } => block_roots.len(), - ChainStatus::ForwardSync { + Status::BackfillHeaders { block_roots, .. } + | Status::WaitingParentChain { block_roots, .. } => block_roots.len(), + Status::ForwardSync { block_roots, syncing_blocks, } => block_roots.len() + syncing_blocks.len(), @@ -334,28 +371,31 @@ impl Chain { /// Returns all block roots part of this chain fn iter_block_roots(&self) -> Box + '_> { match &self.status { - ChainStatus::BackfillHeaders { + Status::BackfillHeaders { block_roots, next_header_request, - } => { - Box::new(std::iter::once(&next_header_request.block_root).chain(block_roots.iter())) + } => Box::new( + std::iter::once(&next_header_request.block_root) + .chain(block_roots.iter().map(|(root, _, _)| root)), + ), + Status::WaitingParentChain { block_roots, .. } => { + Box::new(block_roots.iter().map(|(root, _, _)| root)) } - ChainStatus::WaitingParentChain { block_roots, .. 
} => Box::new(block_roots.iter()), - ChainStatus::ForwardSync { + Status::ForwardSync { syncing_blocks, block_roots, } => Box::new( syncing_blocks .iter() .map(|block| block.block_root()) - .chain(block_roots.iter()), + .chain(block_roots.iter().map(|(root, _, _)| root)), ), } } /// Returns true if this chain has no blocks fn is_empty(&self) -> bool { - self.iter_block_roots().is_empty() + self.iter_block_roots().next().is_none() } fn min_slot(&self) -> Option { @@ -368,9 +408,9 @@ impl Chain { fn syncing_blocks_count(&self) -> usize { match &self.status { - ChainStatus::BackfillHeaders { .. } => 0, - ChainStatus::WaitingParentChain { .. } => 0, - ChainStatus::ForwardSync { syncing_blocks, .. } => syncing_blocks.len(), + Status::BackfillHeaders { .. } => 0, + Status::WaitingParentChain { .. } => 0, + Status::ForwardSync { syncing_blocks, .. } => syncing_blocks.len(), } } @@ -378,7 +418,7 @@ impl Chain { &mut self, ) -> Result<&mut DownloadRequest, Error> { match &mut self.status { - ChainStatus::BackfillHeaders { + Status::BackfillHeaders { next_header_request, .. 
} => Ok(&mut next_header_request.request), @@ -388,14 +428,20 @@ impl Chain { } } - fn add_ancestor(&mut self, parent_root: Hash256, id: Id) -> Result<(), InternalError> { + fn add_ancestor(&mut self, block: BeaconBlockHeader, id: Id) -> Result<(), InternalError> { match &mut self.status { - ChainStatus::BackfillHeaders { + Status::BackfillHeaders { block_roots, next_header_request, } => { - block_roots.push(next_header_request.block_root); - *next_header_request = HeaderRequest::new(parent_root, id); + block_roots.push(( + // Should be the same as `next_header_request.block_root` + block.canonical_root(), + block.slot, + // Persist the request ID of the header for better traceability + next_header_request.id.id, + )); + *next_header_request = HeaderRequest::new(block.parent_root, id); Ok(()) } _ => Err(InternalError( @@ -406,8 +452,8 @@ impl Chain { fn to_waiting_parent(&mut self, parent_root: Hash256) -> Result<(), Error> { match &mut self.status { - ChainStatus::BackfillHeaders { block_roots, .. } => { - self.status = ChainStatus::WaitingParentChain { + Status::BackfillHeaders { block_roots, .. } => { + self.status = Status::WaitingParentChain { parent_root, block_roots: std::mem::take(block_roots), }; @@ -431,6 +477,7 @@ impl Chain { Ok(()) } + /// Handle the result of a block processing. fn on_process_result( &mut self, id: HeaderLookupId, @@ -440,13 +487,14 @@ impl Chain { let (ok_to_import, block) = self.block_request(RangeRequestId::ForwardSync(id))?; match block.on_process_result(result, cx)? { SyncBlockResult::Done { parent_root, slot } => { - // This block is complete, remove it from chain + // Sanity check: the processed block must be the oldest block in the chain if !ok_to_import { return Err(Error::InternalError(format!( "Block {id} is not the first block" ))); } - if let ChainStatus::ForwardSync { syncing_blocks, .. } = &mut self.status { + // This block processing is complete, remove it from chain + if let Status::ForwardSync { syncing_blocks, .. 
} = &mut self.status { if let Some(block) = syncing_blocks.pop_front() { debug!("Dropping syncing block {}", block.id()); } else { @@ -465,7 +513,7 @@ impl Chain { fn block_request(&mut self, id: RangeRequestId) -> Result<(bool, &mut SyncBlock), Error> { match &mut self.status { - ChainStatus::ForwardSync { syncing_blocks, .. } => { + Status::ForwardSync { syncing_blocks, .. } => { if let Some(index) = syncing_blocks.iter().position(|b| b.id() == id) { let block = syncing_blocks.get_mut(index).expect("index just found"); return Ok((index == 0, block)); @@ -476,24 +524,21 @@ impl Chain { "Unknown block for {id}, first few blocks {first_ids:?}" ))) } - _ => Err(Error::InternalError( "Expected lookup to be in Syncing state".to_owned(), )), } } + /// Continues the header or blocks requests of this chain fn continue_requests(&mut self, cx: &mut SyncNetworkContext) -> Result<(), Error> { match &mut self.status { - ChainStatus::BackfillHeaders { + Status::BackfillHeaders { next_header_request, .. } => Ok(next_header_request.continue_request(&self.peers, cx)?), - ChainStatus::WaitingParentChain { .. } => Ok(()), - ChainStatus::ForwardSync { - block_roots, - syncing_blocks, - } => { + Status::WaitingParentChain { .. } => Ok(()), + Status::ForwardSync { syncing_blocks, .. 
} => { for (index, block) in syncing_blocks.iter_mut().enumerate() { let ok_to_import = index == 0; block.continue_request(cx, ok_to_import)?; @@ -506,11 +551,15 @@ impl Chain { #[derive(Debug)] pub enum Error { + /// Unexpected and unrecoverable error InternalError(String), + /// Expected and unrecoverable error TooManyErrors(String), + /// Block is not descendant of the finalized checkpoint BlockConflictsWithFinality(String), } +/// Unexpected and unrecoverable error #[derive(Debug)] struct InternalError(String); @@ -554,13 +603,14 @@ pub(crate) enum SyncState { } impl ForwardSync { - pub fn new(chain: Arc>) -> Self { + pub fn new() -> Self { Self { block_to_tip: <_>::default(), chains: <_>::default(), } } + /// Returns the peers that claim to have imported a specific block_root #[cfg(test)] pub fn block_peers(&self, block_root: &Hash256) -> Result>, String> { let Some(chain) = self.block_to_tip.get(block_root) else { @@ -574,11 +624,13 @@ impl ForwardSync { )) } + /// Get all blocks that forward sync intends to sync #[cfg(test)] pub fn get_lookups(&self) -> Vec { self.block_to_tip.keys().copied().collect() } + /// Total count of blocks that forward sync intends to sync pub fn block_count(&self) -> usize { self.block_to_tip.len() } @@ -590,14 +642,15 @@ impl ForwardSync { todo!(); } + /// Return all processing ids of syncing blocks #[cfg(test)] pub fn get_processing_ids(&mut self) -> Vec { let mut ids = vec![]; for chain in self.chains.values() { match &chain.status { - ChainStatus::BackfillHeaders { .. } => {} - ChainStatus::WaitingParentChain { .. } => {} - ChainStatus::ForwardSync { syncing_blocks, .. } => { + Status::BackfillHeaders { .. } => {} + Status::WaitingParentChain { .. } => {} + Status::ForwardSync { syncing_blocks, .. 
} => { for block in syncing_blocks { if block.is_processing() { if let RangeRequestId::ForwardSync(id) = block.id() { @@ -615,20 +668,25 @@ impl ForwardSync { todo!(); } + /// Remove a disconnected peer from all chains pub fn remove_peer(&mut self, peer: PeerId) { for chain in self.chains.values_mut() { chain.remove_peer(&peer); } } + /// A set of peers claim to have imported a block_root. Create a new lookup for it or add them + /// to an existing one + its ancestors pub fn search( &mut self, block_root: Hash256, peers: &[PeerId], cx: &mut SyncNetworkContext, ) -> Result<(), Error> { - if let Some(initial_chain_id) = self.block_to_tip.get(&block_root) { + if let Some(_) = self.block_to_tip.get(&block_root) { + let mut peers = HashSet::<&PeerId>::from_iter(peers); let mut counts = HashMap::<&PeerId, usize>::new(); + // Add peer to `block`'s entry and all its ancestors let mut target_block_root = block_root; while let Some(chain_id) = self.block_to_tip.get_mut(&target_block_root) { @@ -637,8 +695,13 @@ impl ForwardSync { .get_mut(chain_id) .ok_or(InternalError(format!("Unknown chain {chain_id}")))?; - // If target_block_root is not the tip of chain, we have to split the chain - let chain_to_add_peers = if chain.tip() != target_block_root { + let should_split_chain = match chain.tip() { + // If target_block_root is not the tip of chain, we have to split the chain + Some(tip) => tip != target_block_root, + // If the chain has no tip (should not happen) don't split the chain + None => false, + }; + let chain_to_add_peers = if should_split_chain { let new_chain = chain.split_by(target_block_root)?; let new_chain_id = TipId(cx.next_id()); @@ -657,23 +720,29 @@ impl ForwardSync { chain }; - for peer in peers { - // TODO(tree-sync): If peer already in set no need to add to its ancestors - if chain_to_add_peers.add_peer(*peer) { - // TODO(tree-sync): This log can be very noisy maybe log once per peer + peers.retain(|peer| { + if chain_to_add_peers.add_peer(**peer) { 
*counts.entry(peer).or_default() += 1; + // We added peer to the lookup, retain it for the next ancestor chain + true } else { // Peer already part of this lookup, therefore it must be part of the peer // set of all of its ancestors: stop - break; + false } + }); + // No peers need to be added to ancestors, stop + if peers.is_empty() { + break; } + if let Some(parent_root) = chain_to_add_peers.parent_root() { target_block_root = parent_root; } else { break; } } + // Log once per peer, as we could add it to a very large number of lookups for (peer, count) in counts { debug!(block_root = ?target_block_root, %peer, count, "Adding peer to existing header lookup and ancestors"); } @@ -700,13 +769,14 @@ impl ForwardSync { let mut chain = Chain::new(block_root, id, peers); chain.continue_requests(cx)?; // Don't insert until first request is successful - metrics::inc_counter(&metrics::SYNC_CHAINS_ADDED); self.chains.insert(chain_id, chain); self.block_to_tip.insert(block_root, chain_id); + metrics::inc_counter(&metrics::SYNC_CHAINS_ADDED); } Ok(()) } + /// Handle the result of a header download. pub fn on_header_download_result( &mut self, req_id: BlocksByRootRequestId, @@ -717,11 +787,12 @@ impl ForwardSync { ) { let block_root = id.block_root; - let result: Result = (|| { + // Invoke a closure to use the ? 
operator and handle the result consistenlty + let result: Result<(), Error> = (|| { let Some(chain_id) = self.block_to_tip.get(&block_root) else { // TODO(tree-sync): register metric debug!(id = ?req_id, "Received header request for unknown block_root"); - return Ok(SyncBlockResult::Wait); + return Ok(()); }; let chain = self.chains.get_mut(chain_id).ok_or(InternalError(format!( "block_root {block_root:?} references unknown chain {chain_id}" @@ -788,7 +859,7 @@ impl ForwardSync { if cx.chain.block_is_known_to_fork_choice(&parent_root) { // Parent is imported, we can forward sync this chain // Stop search we reached a known block - chain.to_forward_sync(parent_root)?; + chain.backfill_headers_to_forward_sync(block_header)?; debug!(%chain_id, ?parent_root, block_count = chain.block_count(), "Forward sync chain reached imported block"); // Trigger potential foward sync for this chain self.continue_requests(cx); @@ -800,7 +871,7 @@ impl ForwardSync { // TODO(tree-sync): Add peers recursively to the chain_id, potentially // splitting the chain when adding peers. } else { - chain.add_ancestor(parent_root, cx.next_id())?; + chain.add_ancestor(block_header, cx.next_id())?; // Add to the block_to_tip mapping to respect the invariant "Each block // root exists in exactly one `Chain::block_roots` list". self.block_to_tip.insert(parent_root, *chain_id); @@ -816,16 +887,15 @@ impl ForwardSync { self.continue_requests(cx); } } - Ok(SyncBlockResult::Wait) + Ok(()) })(); - // Map result Ok to Wait as completing the header request does not complete the overall - // ForwardSyncBlock request. if let Err(e) = result { - self.handle_result(id.block_root, e, cx); + self.handle_error(id.block_root, e); } } + /// Handle the result of a block download. 
pub fn on_block_download_result( &mut self, req_id: ComponentsByRootRequestId, @@ -843,21 +913,22 @@ impl ForwardSync { }; if let Err(e) = chain.on_download_result(req_id, result, cx) { - self.handle_result(id.block_root, e, cx); + self.handle_error(id.block_root, e); } } + /// Handle the result of a block processing. pub fn on_block_process_result( &mut self, id: HeaderLookupId, result: BatchProcessResult, cx: &mut SyncNetworkContext, ) { - let Some(chain_id) = self.block_to_tip.get(&id.block_root) else { + let Some(chain_id) = self.block_to_tip.get(&id.block_root).copied() else { debug!(?id, "Received block process result for unknown lookup"); return; }; - let Some(chain) = self.chains.get_mut(chain_id) else { + let Some(chain) = self.chains.get_mut(&chain_id) else { error!(%chain_id, block_root = ?id.block_root, "Block references unknown chain"); return; }; @@ -866,35 +937,29 @@ impl ForwardSync { Ok(SyncBlockResult::Done { .. }) => { metrics::inc_counter(&metrics::SYNC_BLOCKS_PROCESSED); self.block_to_tip.remove(&id.block_root); - // Find all chains that are awaiting this block to process and continue them - for other_chain in self.chains.values_mut() { - other_chain.on_block_imported(&id.block_root); - } - self.continue_requests(cx); // If the chain is empty, remove it if chain.is_empty() { self.chains.remove(&chain_id); metrics::inc_counter(&metrics::SYNC_CHAINS_REMOVED); } + + // Find all chains that are awaiting this block to process and continue them + for other_chain in self.chains.values_mut() { + other_chain.on_block_imported(&id.block_root); + } + self.continue_requests(cx); } // Wait for next event Ok(SyncBlockResult::Wait) => {} Err(e) => { - self.handle_result(id.block_root, e, cx); + self.handle_error(id.block_root, e); } } } - pub fn prune(&mut self) { - // Prune blocks once imported, and once finality advances - } - - pub fn prune_imported_block(&mut self, block_root: Hash256, _imported: bool) { - // Recursively prune this block and all their 
ancestors - todo!(); - } - - fn handle_result(&mut self, block_root: Hash256, error: Error, cx: &mut SyncNetworkContext) { + /// Common handler for any `forward_sync::Error`. For simplicity it drops the chain that includes + /// the block and all of its descendants. + fn handle_error(&mut self, block_root: Hash256, error: Error) { debug!(?error, ?block_root, "Dropping forward sync block lookup"); let Some(chain_id) = self.block_to_tip.get(&block_root).copied() else { debug!(?block_root, "Handling error for unknown block_root"); @@ -952,7 +1017,7 @@ impl ForwardSync { .chains .iter_mut() .filter_map(|(_, chain)| { - if matches!(chain.status, ChainStatus::ForwardSync { .. }) { + if matches!(chain.status, Status::ForwardSync { .. }) { Some((chain.peer_count(), chain)) } else { None @@ -963,19 +1028,20 @@ impl ForwardSync { chains_by_peer_count.sort_by_key(|(peer_count, _)| *peer_count); for (_, chain) in chains_by_peer_count { - if let ChainStatus::ForwardSync { + if let Status::ForwardSync { block_roots, syncing_blocks, } = &mut chain.status { - /// block_roots sorting: tip first, oldest ancestor last => pop + // block_roots sorting: tip first, oldest ancestor last => pop if let Some(next_block) = block_roots.pop() { syncing_blocks.push_back(SyncBlock::new( RangeRequestId::ForwardSync(HeaderLookupId { - id: cx.next_id(), - block_root: next_block, + // Reuse the request ID of the header for better traceability + id: next_block.2, + block_root: next_block.0, }), - next_block, + next_block.0, &chain.peers.iter().copied().collect::>(), )); blocks_syncing += 1; @@ -1028,7 +1094,7 @@ impl ForwardSync { for block_root in chain.iter_block_roots() { self.block_to_tip.remove(block_root); debug!(?block_root, id = %chain_id, "Dropping forward sync block lookup"); - metrics::inc_counter(&metrics::SYNC_LOOKUPS_DROPPED); + metrics::inc_counter(&metrics::SYNC_FORWARD_BLOCKS_DROPPED); } // Only remove children if the node still existed // Push its children—if any—onto the work list. 
diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index ffe90b7cafe..34d09ffe19c 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -39,7 +39,7 @@ use super::network_context::{ CustodyRequestResult, RangeBlockComponent, RangeRequestId, RpcEvent, SyncNetworkContext, }; use super::peer_sampling::{Sampling, SamplingConfig, SamplingResult}; -use super::peer_sync_info::{remote_sync_type, PeerSyncType}; +use super::peer_sync_info::PeerSyncType; use crate::network_beacon_processor::{ ChainSegmentProcessId, NetworkBeaconProcessor, PeerGroupAction, }; @@ -278,7 +278,7 @@ impl SyncManager { beacon_chain.clone(), fork_context.clone(), ), - forward_sync: ForwardSync::new(beacon_chain.clone()), + forward_sync: ForwardSync::new(), backfill_sync: BackFillSync::new(beacon_chain.clone(), network_globals), notified_unknown_roots: LRUTimeCache::new(Duration::from_secs( NOTIFIED_UNKNOWN_ROOT_EXPIRY_SECONDS, @@ -356,8 +356,6 @@ impl SyncManager { self.add_peer_with_imported_block_root(peer_id, remote.head_root); } - let sync_type = remote_sync_type(&local, &remote, &self.chain); - // TODO(tree-sync): Okay to add all peers to backfill sync? How can we know which have the // blocks we need? self.backfill_sync.add_peer(peer_id); @@ -615,7 +613,7 @@ impl SyncManager { self.handle_new_execution_engine_state(engine_state); } _ = prune_lookups_interval.tick() => { - self.forward_sync.prune(); + // TODO(tree-sync): should prune stuck lookups? 
} _ = prune_requests.tick() => { self.prune_requests(); @@ -713,7 +711,9 @@ impl SyncManager { SyncMessage::GossipBlockProcessResult { block_root, imported, - } => self.forward_sync.prune_imported_block(block_root, imported), + } => { + // Not used + } SyncMessage::BatchProcessed { sync_type, result } => match sync_type { ChainSegmentProcessId::ForwardSync(id) => { self.forward_sync @@ -1032,7 +1032,7 @@ impl SyncManager { &mut self.network, ); } - RangeRequestId::BackfillSync(id) => { + RangeRequestId::BackfillSync(_) => { self.backfill_sync .on_block_download_result(req_id, result, &mut self.network) } diff --git a/beacon_node/network/src/sync/network_context/download_request.rs b/beacon_node/network/src/sync/network_context/download_request.rs index 95bb8c8f161..b610c8da488 100644 --- a/beacon_node/network/src/sync/network_context/download_request.rs +++ b/beacon_node/network/src/sync/network_context/download_request.rs @@ -127,7 +127,7 @@ impl DownloadRequest { pub fn is_complete(&self) -> Option<&T> { match &self.status { Status::Downloaded(_, data, _) => Some(data), - other => None, + _ => None, } } diff --git a/beacon_node/network/src/sync/sync_block.rs b/beacon_node/network/src/sync/sync_block.rs index 2d87cea5283..68efa58cd05 100644 --- a/beacon_node/network/src/sync/sync_block.rs +++ b/beacon_node/network/src/sync/sync_block.rs @@ -96,7 +96,7 @@ impl SyncBlock { &mut self, req_id: ComponentsByRootRequestId, result: Result<(RpcBlock, BatchPeers), RpcResponseError>, - cx: &mut SyncNetworkContext, + _cx: &mut SyncNetworkContext, ) -> Result<(), Error> { match &mut self.request { SyncingStatus::Downloading(expected_id) => { @@ -204,12 +204,7 @@ impl SyncBlock { // from the beacon processor anyway. No need to add more code to handle this // edge case faster. 
- let expect_parent_to_be_imported = false; - if expect_parent_to_be_imported - && !cx - .chain - .block_is_known_to_fork_choice(&block.as_block().parent_root()) - { + if !ok_to_import { return Ok(()); } diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index 0bf19ec871a..d832d3b68fb 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -627,7 +627,11 @@ impl TestRig { }) } - fn expect_only_data_columns_by_root_requests(&mut self, for_block: Hash256) -> DCByRootIds { + fn expect_only_data_columns_by_root_requests( + &mut self, + for_block: Hash256, + _count: usize, + ) -> DCByRootIds { let ids = self.expect_data_columns_by_root_requests(for_block); self.expect_empty_network(); ids diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 2d94c7db0ec..ae2a2416c11 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -173,10 +173,10 @@ impl RequestFilter { req: &BlocksByRootRequest, id: &BlocksByRootRequestId, ) -> bool { - if self.header_requests_only { - if !matches!(id.parent_request_id, BlocksByRootRequester::Header(_)) { - return false; - } + if self.header_requests_only + && !matches!(id.parent_request_id, BlocksByRootRequester::Header(_)) + { + return false; } if let Some(block_root) = self.block_root { @@ -640,7 +640,7 @@ impl TestRig { .body() .kzg_commitment_merkle_proof(blob_id.index as usize) .unwrap(); - let kzg_commitment = block + let kzg_commitment = *block .message() .body() .blob_kzg_commitments() From ef7c6b2984b613b976d25f98a24479a75d7b4caa Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Fri, 11 Jul 2025 00:44:35 +0200 Subject: [PATCH 47/66] Remove remaining todo --- .../network/src/sync/backfill_sync/mod.rs | 5 ++ beacon_node/network/src/sync/forward_sync.rs | 66 +++++++++++++++++-- 
beacon_node/network/src/sync/sync_block.rs | 17 ++++- 3 files changed, 81 insertions(+), 7 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index efe7ddf59d9..f61ec36fee7 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -107,6 +107,8 @@ impl BackFillSync { status: SyncBlock::new( RangeRequestId::BackfillSync(0), anchor_info.oldest_block_parent, + // TODO(tree-sync): not correct fetch the corrent slot + anchor_info.oldest_block_slot, &[], ), restart_failed_sync: false, @@ -252,9 +254,12 @@ impl BackFillSync { self.set_state(BackFillState::Completed); } else { let peers = self.status.clone_peers(); + // TODO(tree-sync): retrieve correct slot from fetching headers first + let parent_block_slot = Slot::new(0); self.status = SyncBlock::new( RangeRequestId::BackfillSync(cx.next_id()), parent_root, + parent_block_slot, &peers.into_iter().collect::>(), ) } diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index bd0a3881e06..f491f2acdf7 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -107,6 +107,12 @@ enum Status { }, /// Download and process block_roots from oldest ancestor to tip. Its list of block_roots does /// not grow, only removed block roots once processed. 
+ /// + /// Note: Keeping block_roots and syncing_blocks in separate Vecs instead of a single Vec with + /// an enum shows the following invariants: + /// - The set of PendingBlocks is consecutive + /// - The set of SyncBlocks is consecutive + /// - The parent of the last item in `block_roots` is the first item in `syncing_blocks` ForwardSync { /// Sorting: tip first, oldest ancestor last block_roots: Vec, @@ -251,6 +257,7 @@ impl Chain { /// `block_root` fn split_by(&mut self, block_root: Hash256) -> Result { // TODO(tree-sync): Review this logic, it's sensitive and not trivial + // TODO(tree-sync): write a prop test for this, check milhouse tests as inspo let status = match &mut self.status { Status::BackfillHeaders { block_roots, @@ -301,8 +308,41 @@ impl Chain { block_roots: new_block_roots, } } - Status::ForwardSync { .. } => { - todo!("How to split a chain that's already syncing?"); + Status::ForwardSync { + block_roots, + syncing_blocks, + } => { + // block_root may be in `block_roots` or in `syncing_blocks`. + let block_roots_idx = block_roots.iter().position(|b| b.0 == block_root); + let new_block_roots = if let Some(idx) = block_roots_idx { + // ..= to keep the block_root on the left + block_roots.drain(0..=idx).collect::>() + } else { + // `block_root` must be in `syncing_blocks` so the new splitted chain will have + // no `block_roots` items. 
+ vec![] + }; + + let new_syncing_blocks = if block_roots_idx.is_some() { + // If `block_root` is in `block_roots` all syncing_blocks go to the new chain + std::mem::take(syncing_blocks) + } else { + // else find the position + let idx = syncing_blocks + .iter() + .position(|b| *b.block_root() == block_root) + .ok_or(InternalError(format!( + "block_root {block_root:?} not found in chain" + )))?; + // ..= to keep the block_root on the left + syncing_blocks.drain(0..=idx).collect::>() + }; + // This chain remains ForwardSync + // New chain is ForwardSync with the splitted Vecs + Status::ForwardSync { + block_roots: new_block_roots, + syncing_blocks: new_syncing_blocks, + } } }; @@ -399,11 +439,21 @@ impl Chain { } fn min_slot(&self) -> Option { - todo!(); + match &self.status { + // TODO(tree-sync): include syncing_blocks for ForwardSync + Status::BackfillHeaders { block_roots, .. } + | Status::WaitingParentChain { block_roots, .. } + | Status::ForwardSync { block_roots, .. } => block_roots.last().map(|b| b.1), + } } fn max_slot(&self) -> Option { - todo!(); + match &self.status { + // TODO(tree-sync): include syncing_blocks for ForwardSync + Status::BackfillHeaders { block_roots, .. } + | Status::WaitingParentChain { block_roots, .. } + | Status::ForwardSync { block_roots, .. } => block_roots.first().map(|b| b.1), + } } fn syncing_blocks_count(&self) -> usize { @@ -639,7 +689,10 @@ impl ForwardSync { pub fn max_slot_to_sync(&self) -> Option { // TODO(tree-sync): weak metric, who have a better heuristic for sync? 
Now that lookups // count here - todo!(); + self.chains + .values() + .filter_map(|chain| chain.max_slot()) + .max() } /// Return all processing ids of syncing blocks @@ -665,7 +718,7 @@ impl ForwardSync { } pub fn pause(&mut self) { - todo!(); + // TODO(tree-sync): consider if we really need a pausing mechanism for when EL offline } /// Remove a disconnected peer from all chains @@ -1042,6 +1095,7 @@ impl ForwardSync { block_root: next_block.0, }), next_block.0, + next_block.1, &chain.peers.iter().copied().collect::>(), )); blocks_syncing += 1; diff --git a/beacon_node/network/src/sync/sync_block.rs b/beacon_node/network/src/sync/sync_block.rs index 68efa58cd05..dbe0bee5317 100644 --- a/beacon_node/network/src/sync/sync_block.rs +++ b/beacon_node/network/src/sync/sync_block.rs @@ -20,6 +20,7 @@ const MAX_PROCESS_ATTEMPTS: usize = 5; pub struct SyncBlock { id: RangeRequestId, block_root: Hash256, + block_slot: Slot, failed_peers: HashSet, // TODO(tree-sync): deprecate this shared state for manual addition and removal peers: Arc>>, @@ -48,10 +49,16 @@ pub enum Error { } impl SyncBlock { - pub fn new(id: RangeRequestId, block_root: Hash256, initial_peers: &[PeerId]) -> Self { + pub fn new( + id: RangeRequestId, + block_root: Hash256, + block_slot: Slot, + initial_peers: &[PeerId], + ) -> Self { Self { id, block_root, + block_slot, failed_peers: <_>::default(), peers: Arc::new(RwLock::new(HashSet::from_iter( initial_peers.iter().copied(), @@ -66,6 +73,10 @@ impl SyncBlock { &self.block_root } + pub fn slot(&self) -> Slot { + self.block_slot + } + pub fn id(&self) -> RangeRequestId { self.id } @@ -87,6 +98,10 @@ impl SyncBlock { self.peers.write().remove(peer) } + pub fn is_syncing(&self) -> bool { + !matches!(self.request, SyncingStatus::AwaitingDownload) + } + #[cfg(test)] pub fn is_processing(&self) -> bool { matches!(self.request, SyncingStatus::Processing(..)) From 7d864ab7e631eb200ff9d4960675d78ab5f69947 Mon Sep 17 00:00:00 2001 From: dapplion 
<35266934+dapplion@users.noreply.github.com> Date: Mon, 14 Jul 2025 13:27:05 +0200 Subject: [PATCH 48/66] Add better logging --- beacon_node/network/src/metrics.rs | 7 +++++ beacon_node/network/src/sync/forward_sync.rs | 10 +++++-- .../network/src/sync/network_context.rs | 30 +++---------------- .../src/sync/network_context/requests.rs | 17 ++++++++++- 4 files changed, 35 insertions(+), 29 deletions(-) diff --git a/beacon_node/network/src/metrics.rs b/beacon_node/network/src/metrics.rs index aea9f9aba10..4f81cee7f34 100644 --- a/beacon_node/network/src/metrics.rs +++ b/beacon_node/network/src/metrics.rs @@ -540,6 +540,13 @@ pub static SYNC_CHAINS_COUNT: LazyLock> = LazyLock::new(|| { "Current count of forward sync chains in memory", ) }); +pub static SYNC_CHAIN_ERROR_COUNT: LazyLock> = LazyLock::new(|| { + try_create_int_counter_vec( + "sync_forward_chain_error_total", + "Total count of forward sync chain errors", + &["error"], + ) +}); /* * Block Delay Metrics diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index f491f2acdf7..e7d85977d0c 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -15,6 +15,7 @@ use lighthouse_network::service::api_types::{ use lighthouse_network::PeerId; use std::collections::{HashMap, HashSet, VecDeque}; use std::sync::Arc; +use strum::IntoStaticStr; use tracing::{debug, error}; use types::{BeaconBlockHeader, EthSpec, Hash256, SignedBeaconBlock, Slot}; @@ -599,7 +600,7 @@ impl Chain { } } -#[derive(Debug)] +#[derive(Debug, IntoStaticStr)] pub enum Error { /// Unexpected and unrecoverable error InternalError(String), @@ -993,6 +994,7 @@ impl ForwardSync { // If the chain is empty, remove it if chain.is_empty() { self.chains.remove(&chain_id); + debug!(%chain_id, "Removed completed chain"); metrics::inc_counter(&metrics::SYNC_CHAINS_REMOVED); } @@ -1018,6 +1020,10 @@ impl ForwardSync { debug!(?block_root, "Handling error for 
unknown block_root"); return; }; + + debug!(%chain_id, ?block_root, ?error, "Dropping forward sync chain on error"); + metrics::inc_counter_vec(&metrics::SYNC_CHAIN_ERROR_COUNT, &[(&error).into()]); + match error { Error::InternalError(_) | Error::TooManyErrors(_) => { let block_to_children = self @@ -1147,7 +1153,7 @@ impl ForwardSync { metrics::inc_counter(&metrics::SYNC_CHAINS_REMOVED); for block_root in chain.iter_block_roots() { self.block_to_tip.remove(block_root); - debug!(?block_root, id = %chain_id, "Dropping forward sync block lookup"); + debug!(?block_root, %chain_id, "Dropping forward sync block lookup"); metrics::inc_counter(&metrics::SYNC_FORWARD_BLOCKS_DROPPED); } // Only remove children if the node still existed diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 1b8ecdf66fd..f64551016a2 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -816,7 +816,7 @@ impl SyncNetworkContext { rpc_event: RpcEvent>>, ) -> Option>>>> { let resp = self.blocks_by_root_requests.on_response(id, rpc_event); - self.on_rpc_response_result(id, "BlocksByRoot", resp, peer_id, |_| 1) + self.on_rpc_response_result(resp, peer_id) } /// Processes a single `RpcEvent` blobs_by_root RPC request. @@ -829,7 +829,7 @@ impl SyncNetworkContext { rpc_event: RpcEvent>>, ) -> Option>>>> { let resp = self.blobs_by_root_requests.on_response(id, rpc_event); - self.on_rpc_response_result(id, "BlobsByRoot", resp, peer_id, |_| 1) + self.on_rpc_response_result(resp, peer_id) } /// Processes a single `RpcEvent` for a data_columns_by_root RPC request. @@ -844,38 +844,16 @@ impl SyncNetworkContext { let resp = self .data_columns_by_root_requests .on_response(id, rpc_event); - self.on_rpc_response_result(id, "DataColumnsByRoot", resp, peer_id, |_| 1) + self.on_rpc_response_result(resp, peer_id) } /// Common logic for `on_*_response` handlers. 
Ensures we have consistent logging and metrics /// and peer reporting for all request types. - fn on_rpc_response_result usize>( + fn on_rpc_response_result( &mut self, - id: I, - method: &'static str, resp: Option>, peer_id: PeerId, - get_count: F, ) -> Option> { - match &resp { - None => {} - Some(Ok((v, _))) => { - debug!( - %id, - method, - count = get_count(v), - "Sync RPC request completed" - ); - } - Some(Err(e)) => { - debug!( - %id, - method, - error = ?e, - "Sync RPC request error" - ); - } - } if let Some(Err(RpcResponseError::VerifyError(e))) = &resp { self.report_peer(peer_id, PeerAction::LowToleranceError, e.into()); } diff --git a/beacon_node/network/src/sync/network_context/requests.rs b/beacon_node/network/src/sync/network_context/requests.rs index deba1ada54d..cc9d7bbb372 100644 --- a/beacon_node/network/src/sync/network_context/requests.rs +++ b/beacon_node/network/src/sync/network_context/requests.rs @@ -5,6 +5,7 @@ use beacon_chain::validator_monitor::timestamp_now; use fnv::FnvHashMap; use lighthouse_network::PeerId; use strum::IntoStaticStr; +use tracing::debug; use types::{Hash256, Slot}; pub use blobs_by_root::BlobsByRootRequestItems; @@ -58,7 +59,7 @@ enum State { Errored, } -impl ActiveRequests { +impl ActiveRequests { pub fn new(name: &'static str) -> Self { Self { requests: <_>::default(), @@ -179,10 +180,24 @@ impl ActiveRequests { Ok((items, seen_timestamp, duration)) => { metrics::inc_counter_vec(&metrics::SYNC_RPC_REQUEST_SUCCESSES, &[self.name]); metrics::observe_timer_vec(&metrics::SYNC_RPC_REQUEST_TIME, &[self.name], duration); + debug!( + %id, + method = self.name, + count = items.len(), + "Sync RPC request completed" + ); + Ok((items, seen_timestamp)) } Err(e) => { metrics::inc_counter_vec(&metrics::SYNC_RPC_REQUEST_ERRORS, &[self.name]); + debug!( + %id, + method = self.name, + error = ?e, + "Sync RPC request error" + ); + Err(e) } }) From be99c7ad1c9280873863eb4d31ad64e79165e455 Mon Sep 17 00:00:00 2001 From: dapplion 
<35266934+dapplion@users.noreply.github.com> Date: Mon, 14 Jul 2025 13:38:57 +0200 Subject: [PATCH 49/66] Add metrics for stage duration --- beacon_node/network/src/metrics.rs | 21 ++++++ beacon_node/network/src/sync/sync_block.rs | 81 +++++++++++++--------- 2 files changed, 71 insertions(+), 31 deletions(-) diff --git a/beacon_node/network/src/metrics.rs b/beacon_node/network/src/metrics.rs index 4f81cee7f34..6e5f41cbff4 100644 --- a/beacon_node/network/src/metrics.rs +++ b/beacon_node/network/src/metrics.rs @@ -547,6 +547,27 @@ pub static SYNC_CHAIN_ERROR_COUNT: LazyLock> = LazyLock::n &["error"], ) }); +pub static SYNC_BLOCK_DOWNLOADING_TIME: LazyLock> = LazyLock::new(|| { + try_create_histogram_with_buckets( + "sync_block_downloading_time_seconds", + "Time to complete SyncBlock Downloading state", + decimal_buckets(-3, -1), + ) +}); +pub static SYNC_BLOCK_AWAITING_PROCESSING_TIME: LazyLock> = LazyLock::new(|| { + try_create_histogram_with_buckets( + "sync_block_awaiting_processing_time_seconds", + "Time to complete SyncBlock AwaitingProcessing state", + decimal_buckets(-3, -1), + ) +}); +pub static SYNC_BLOCK_PROCESSING_TIME: LazyLock> = LazyLock::new(|| { + try_create_histogram_with_buckets( + "sync_block_processing_time_seconds", + "Time to complete SyncBlock Processing state", + decimal_buckets(-3, -1), + ) +}); /* * Block Delay Metrics diff --git a/beacon_node/network/src/sync/sync_block.rs b/beacon_node/network/src/sync/sync_block.rs index dbe0bee5317..611c5e60514 100644 --- a/beacon_node/network/src/sync/sync_block.rs +++ b/beacon_node/network/src/sync/sync_block.rs @@ -1,4 +1,5 @@ use super::network_context::{RpcRequestSendError, RpcResponseError, SyncNetworkContext}; +use crate::metrics; use crate::network_beacon_processor::ChainSegmentProcessId; use crate::sync::network_context::BatchPeers; use crate::sync::BatchProcessResult; @@ -9,6 +10,7 @@ use lighthouse_network::PeerId; use parking_lot::RwLock; use std::collections::HashSet; use std::sync::Arc; 
+use std::time::Instant; use tracing::debug; use types::{EthSpec, Hash256, Slot}; @@ -31,9 +33,9 @@ pub struct SyncBlock { enum SyncingStatus { AwaitingDownload, - Downloading(ComponentsByRootRequestId), - AwaitingProcessing(RpcBlock, BatchPeers), - Processing(RpcBlock, BatchPeers), + Downloading(ComponentsByRootRequestId, Instant), + AwaitingProcessing(RpcBlock, BatchPeers, Instant), + Processing(RpcBlock, BatchPeers, Instant), } #[must_use] @@ -114,7 +116,11 @@ impl SyncBlock { _cx: &mut SyncNetworkContext, ) -> Result<(), Error> { match &mut self.request { - SyncingStatus::Downloading(expected_id) => { + SyncingStatus::Downloading(expected_id, start_time) => { + metrics::observe_duration( + &metrics::SYNC_BLOCK_DOWNLOADING_TIME, + start_time.elapsed(), + ); if req_id != *expected_id { return Err(Error::InternalError(format!( "Unexpected request ID {} != {}", @@ -124,7 +130,8 @@ impl SyncBlock { match result { Ok((block, peers)) => { debug!(id = %self.id, "Sync block downloaded"); - self.request = SyncingStatus::AwaitingProcessing(block, peers); + self.request = + SyncingStatus::AwaitingProcessing(block, peers, Instant::now()); Ok(()) } Err(e) => { @@ -152,33 +159,39 @@ impl SyncBlock { cx: &mut SyncNetworkContext, ) -> Result { match &mut self.request { - SyncingStatus::Processing(block, peers) => match result { - BatchProcessResult::Success => { - debug!(id = %self.id, "Sync block process success"); - Ok(SyncBlockResult::Done { - parent_root: block.as_block().parent_root(), - slot: block.as_block().slot(), - }) - } - BatchProcessResult::Failure { peer_action, error } => { - debug!(id = %self.id, error, "Sync block process error"); + SyncingStatus::Processing(block, peers, start_time) => { + metrics::observe_duration( + &metrics::SYNC_BLOCK_PROCESSING_TIME, + start_time.elapsed(), + ); + match result { + BatchProcessResult::Success => { + debug!(id = %self.id, "Sync block process success"); + Ok(SyncBlockResult::Done { + parent_root: 
block.as_block().parent_root(), + slot: block.as_block().slot(), + }) + } + BatchProcessResult::Failure { peer_action, error } => { + debug!(id = %self.id, error, "Sync block process error"); - if let Some(peer_action) = peer_action { - for (peer, penalty) in peers.blame(peer_action) { - cx.report_peer(peer, penalty, "faulty_batch"); - self.failed_peers.insert(peer); + if let Some(peer_action) = peer_action { + for (peer, penalty) in peers.blame(peer_action) { + cx.report_peer(peer, penalty, "faulty_batch"); + self.failed_peers.insert(peer); + } } - } - self.process_errors += 1; - if self.process_errors > MAX_PROCESS_ATTEMPTS { - return Err(Error::TooManyErrors("process errors".to_owned())); - } + self.process_errors += 1; + if self.process_errors > MAX_PROCESS_ATTEMPTS { + return Err(Error::TooManyErrors("process errors".to_owned())); + } - self.request = SyncingStatus::AwaitingDownload; - Ok(SyncBlockResult::Wait) + self.request = SyncingStatus::AwaitingDownload; + Ok(SyncBlockResult::Wait) + } } - }, + } _ => Err(Error::InternalError( "Lookup not in expected state Processing".to_owned(), )), @@ -201,7 +214,7 @@ impl SyncBlock { &self.failed_peers, ) { Ok(req_id) => { - self.request = SyncingStatus::Downloading(req_id); + self.request = SyncingStatus::Downloading(req_id, Instant::now()); Ok(()) } Err(e) => match e { @@ -213,8 +226,8 @@ impl SyncBlock { }, } } - SyncingStatus::Downloading(_) => Ok(()), - SyncingStatus::AwaitingProcessing(block, peers) => { + SyncingStatus::Downloading(..) => Ok(()), + SyncingStatus::AwaitingProcessing(block, peers, start_time) => { // No need to check if block is already imported here, we'll get an error // from the beacon processor anyway. No need to add more code to handle this // edge case faster. 
@@ -234,7 +247,13 @@ impl SyncBlock { "Error sending block to processor: {e:?}" ))) } else { - self.request = SyncingStatus::Processing(block.clone(), peers.clone()); + metrics::observe_duration( + &metrics::SYNC_BLOCK_AWAITING_PROCESSING_TIME, + start_time.elapsed(), + ); + + self.request = + SyncingStatus::Processing(block.clone(), peers.clone(), Instant::now()); Ok(()) } } else { From b16a427bd169d4fc1becedc5a68a4d0deb752287 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 14 Jul 2025 13:57:10 +0200 Subject: [PATCH 50/66] Remove empty chains --- beacon_node/network/src/sync/forward_sync.rs | 25 +++++++++++++++++--- beacon_node/network/src/sync/manager.rs | 4 +++- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index e7d85977d0c..28398877b0a 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -723,10 +723,29 @@ impl ForwardSync { } /// Remove a disconnected peer from all chains - pub fn remove_peer(&mut self, peer: PeerId) { - for chain in self.chains.values_mut() { - chain.remove_peer(&peer); + pub fn remove_peer(&mut self, peer: PeerId) -> Result<(), Error> { + let chains_to_remove = self + .chains + .iter_mut() + .filter_map(|(chain_id, chain)| { + chain.remove_peer(&peer); + // TODO(tree-sync): research if it actually useful to keep chains with zero peers for + // some time. + if chain.peer_count() == 0 { + Some(*chain_id) + } else { + None + } + }) + .collect::>(); + + if !chains_to_remove.is_empty() { + let chain_to_children = self.compute_children()?; + for chain_id in chains_to_remove { + self.drop_chain_and_children(chain_id, &chain_to_children); + } } + Ok(()) } /// A set of peers claim to have imported a block_root. 
Create a new lookup for it or add them diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 34d09ffe19c..05068f55c3d 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -421,7 +421,9 @@ impl SyncManager { // Remove peer from all data structures self.backfill_sync.peer_disconnected(peer_id); - self.forward_sync.remove_peer(*peer_id); + if let Err(e) = self.forward_sync.remove_peer(*peer_id) { + error!("Error removing peer from forward sync {peer_id} {e:?}"); + } // Regardless of the outcome, we update the sync status. self.update_sync_state(); From c3373c99d7528a832f2ff0376203063999368ab0 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 14 Jul 2025 14:00:24 +0200 Subject: [PATCH 51/66] Ignore AddPeer message on race condition --- beacon_node/network/src/sync/manager.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 05068f55c3d..609ca067e85 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -343,6 +343,13 @@ impl SyncManager { finalized_root: status.finalized_root, }; + // Handle race condition where peer may disconnect between the peer manager sending the + // AddPeer message and sync handling a subsequent Disconnect message + if !self.network_globals().peers.read().is_connected(&peer_id) { + debug!(%peer_id, "Ignoring AddPeer message for already disconnected peer"); + return; + } + // Search for any block that is unknown and more recent than finality // TODO(tree-sync): we could prioritize the finalized_root if it's unknown as a way to // detect finalized sync From 9a758e8179a056f7d91a88064bea3a31e4874aa3 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 14 Jul 2025 14:08:21 +0200 Subject: [PATCH 52/66] Track the reason chains are 
removed --- beacon_node/network/src/metrics.rs | 5 +- beacon_node/network/src/sync/forward_sync.rs | 55 +++++++++++--------- 2 files changed, 33 insertions(+), 27 deletions(-) diff --git a/beacon_node/network/src/metrics.rs b/beacon_node/network/src/metrics.rs index 6e5f41cbff4..4495aedd8f9 100644 --- a/beacon_node/network/src/metrics.rs +++ b/beacon_node/network/src/metrics.rs @@ -403,10 +403,11 @@ pub static SYNCING_CHAINS_COUNT: LazyLock> = LazyLock::new(| &["range_type"], ) }); -pub static SYNC_CHAINS_REMOVED: LazyLock> = LazyLock::new(|| { - try_create_int_counter( +pub static SYNC_CHAINS_REMOVED: LazyLock> = LazyLock::new(|| { + try_create_int_counter_vec( "sync_removed_chains_total", "Total count of forward sync chains removed", + &["reason"], ) }); pub static SYNC_CHAINS_ADDED: LazyLock> = LazyLock::new(|| { diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index 28398877b0a..a2ea64ae7b4 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -742,7 +742,7 @@ impl ForwardSync { if !chains_to_remove.is_empty() { let chain_to_children = self.compute_children()?; for chain_id in chains_to_remove { - self.drop_chain_and_children(chain_id, &chain_to_children); + self.drop_chain_and_children(chain_id, &chain_to_children, "no_peers"); } } Ok(()) @@ -1014,7 +1014,7 @@ impl ForwardSync { if chain.is_empty() { self.chains.remove(&chain_id); debug!(%chain_id, "Removed completed chain"); - metrics::inc_counter(&metrics::SYNC_CHAINS_REMOVED); + metrics::inc_counter_vec(&metrics::SYNC_CHAINS_REMOVED, &["completed"]); } // Find all chains that are awaiting this block to process and continue them @@ -1043,18 +1043,16 @@ impl ForwardSync { debug!(%chain_id, ?block_root, ?error, "Dropping forward sync chain on error"); metrics::inc_counter_vec(&metrics::SYNC_CHAIN_ERROR_COUNT, &[(&error).into()]); + let block_to_children = self + .compute_children() + .expect("TODO: 
handle this error if it can't be avoided"); + self.drop_chain_and_children(chain_id, &block_to_children, (&error).into()); + match error { Error::InternalError(_) | Error::TooManyErrors(_) => { - let block_to_children = self - .compute_children() - .expect("TODO: handle this error if it can't be avoided"); - self.drop_chain_and_children(chain_id, &block_to_children); + // } Error::BlockConflictsWithFinality(_e) => { - let block_to_children = self - .compute_children() - .expect("TODO: handle this error if it can't be avoided"); - self.drop_chain_and_children(chain_id, &block_to_children); // TODO(tree-sync): penalize peers of this lookups // TODO(tree-sync): add blocks to a failed cache to prevent re-sync } @@ -1141,21 +1139,27 @@ impl ForwardSync { // TODO(tree-sync): optimize this call to maybe not do it everytime self.trigger_forward_sync(cx); - let mut chains_to_drop = vec![]; + let chains_to_drop = self + .chains + .iter_mut() + .filter_map(|(chain_id, chain)| { + if let Err(e) = chain.continue_requests(cx) { + // TODO(tree-sync): should log error? + Some((*chain_id, e)) + } else { + None + } + }) + .collect::>(); - for (chain_id, chain) in self.chains.iter_mut() { - if let Err(_e) = chain.continue_requests(cx) { - // TODO(tree-sync): should log error? 
- chains_to_drop.push(*chain_id); + if !chains_to_drop.is_empty() { + let chain_to_children = self + .compute_children() + .expect("Handle this error if it can't be avoided"); + for (chain_id, e) in chains_to_drop { + self.drop_chain_and_children(chain_id, &chain_to_children, e.into()); } } - - let chain_to_children = self - .compute_children() - .expect("Handle this error if it can't be avoided"); - for chain_id in chains_to_drop { - self.drop_chain_and_children(chain_id, &chain_to_children); - } } /// Drop chain if it exists and all its children @@ -1163,16 +1167,17 @@ impl ForwardSync { &mut self, initial_chain_id: TipId, chain_to_children: &HashMap>, + reason: &'static str, ) { let mut queue: VecDeque = VecDeque::from([initial_chain_id]); while let Some(chain_id) = queue.pop_front() { // Remove the node itself. if let Some(chain) = self.chains.remove(&chain_id) { - metrics::inc_counter(&metrics::SYNC_CHAINS_REMOVED); + metrics::inc_counter_vec(&metrics::SYNC_CHAINS_REMOVED, &[reason]); for block_root in chain.iter_block_roots() { self.block_to_tip.remove(block_root); - debug!(?block_root, %chain_id, "Dropping forward sync block lookup"); + debug!(?block_root, %chain_id, reason, "Dropping forward sync block lookup"); metrics::inc_counter(&metrics::SYNC_FORWARD_BLOCKS_DROPPED); } // Only remove children if the node still existed @@ -1217,7 +1222,7 @@ impl ForwardSync { let chain_to_children = self.compute_children()?; for (_, chain_id) in chains { - self.drop_chain_and_children(chain_id, &chain_to_children); + self.drop_chain_and_children(chain_id, &chain_to_children, "too_many_blocks"); if self.block_to_tip.len() < MAX_LOOKUP_COUNT - PRUNE_COUNT { break; } From ab36b78aebe5b2ef2c98b3cf4d4f0075ff6aa63b Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 14 Jul 2025 17:35:06 +0200 Subject: [PATCH 53/66] Add extra logging for dropping --- beacon_node/network/src/sync/forward_sync.rs | 10 ++++++---- 1 file changed, 6 
insertions(+), 4 deletions(-) diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index a2ea64ae7b4..3f422bb44b6 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -887,7 +887,7 @@ impl ForwardSync { match response { Ok((block, received)) => { - debug!(%req_id, "Forward sync block header downloaded success"); + debug!(%req_id, %chain_id, "Forward sync block header downloaded success"); let block_header = block.message().block_header(); let parent_root = block_header.parent_root; @@ -937,14 +937,15 @@ impl ForwardSync { // Trigger potential foward sync for this chain self.continue_requests(cx); } else if let Some(parent_chain_id) = self.block_to_tip.get(&parent_root) { - debug!(%chain_id, %parent_chain_id, ?parent_root, "Forward sync chain reached known block"); // Parent is part of another chain, stop search // Stop search we reached a known block chain.to_waiting_parent(parent_root)?; + debug!(%chain_id, %parent_chain_id, ?parent_root, "Forward sync chain reached known block"); // TODO(tree-sync): Add peers recursively to the chain_id, potentially // splitting the chain when adding peers. } else { chain.add_ancestor(block_header, cx.next_id())?; + debug!(%chain_id, ?parent_root, "Forward sync chain continues fetching ancestor"); // Add to the block_to_tip mapping to respect the invariant "Each block // root exists in exactly one `Chain::block_roots` list". 
self.block_to_tip.insert(parent_root, *chain_id); @@ -1040,12 +1041,13 @@ impl ForwardSync { return; }; - debug!(%chain_id, ?block_root, ?error, "Dropping forward sync chain on error"); metrics::inc_counter_vec(&metrics::SYNC_CHAIN_ERROR_COUNT, &[(&error).into()]); let block_to_children = self .compute_children() .expect("TODO: handle this error if it can't be avoided"); + // TODO(tree-sync): logging `block_to_children` for debugging + debug!(%chain_id, ?block_root, ?error, ?block_to_children, "Dropping forward sync chain on error"); self.drop_chain_and_children(chain_id, &block_to_children, (&error).into()); match error { @@ -1177,7 +1179,7 @@ impl ForwardSync { metrics::inc_counter_vec(&metrics::SYNC_CHAINS_REMOVED, &[reason]); for block_root in chain.iter_block_roots() { self.block_to_tip.remove(block_root); - debug!(?block_root, %chain_id, reason, "Dropping forward sync block lookup"); + debug!(?block_root, %chain_id, %initial_chain_id, reason, "Dropping forward sync block lookup"); metrics::inc_counter(&metrics::SYNC_FORWARD_BLOCKS_DROPPED); } // Only remove children if the node still existed From dba590123ffdb606435d705272753924db82d26b Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 14 Jul 2025 18:21:24 +0200 Subject: [PATCH 54/66] Simplify compute children --- beacon_node/network/src/sync/forward_sync.rs | 63 ++++++++------------ beacon_node/network/src/sync/manager.rs | 4 +- 2 files changed, 25 insertions(+), 42 deletions(-) diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index 3f422bb44b6..9d2e5f84017 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -219,10 +219,7 @@ impl Chain { /// already processing = its parent has already been imported. fn parent_root(&self) -> Option { match &self.status { - Status::BackfillHeaders { - next_header_request, - .. 
- } => Some(next_header_request.block_root), + Status::BackfillHeaders { .. } => None, Status::WaitingParentChain { parent_root, .. } => Some(*parent_root), Status::ForwardSync { .. } => None, } @@ -723,7 +720,7 @@ impl ForwardSync { } /// Remove a disconnected peer from all chains - pub fn remove_peer(&mut self, peer: PeerId) -> Result<(), Error> { + pub fn remove_peer(&mut self, peer: PeerId) { let chains_to_remove = self .chains .iter_mut() @@ -740,12 +737,11 @@ impl ForwardSync { .collect::>(); if !chains_to_remove.is_empty() { - let chain_to_children = self.compute_children()?; + let chain_to_children = self.compute_children(); for chain_id in chains_to_remove { self.drop_chain_and_children(chain_id, &chain_to_children, "no_peers"); } } - Ok(()) } /// A set of peers claim to have imported a block_root. Create a new lookup for it or add them @@ -821,9 +817,7 @@ impl ForwardSync { } } else { if self.block_to_tip.len() > MAX_LOOKUP_COUNT { - if let Err(e) = self.prune_least_popular_lookups() { - error!("Error on prune_least_popular_lookups {e:?}"); - } + self.prune_least_popular_lookups(); } let id = cx.next_id(); @@ -887,8 +881,6 @@ impl ForwardSync { match response { Ok((block, received)) => { - debug!(%req_id, %chain_id, "Forward sync block header downloaded success"); - let block_header = block.message().block_header(); let parent_root = block_header.parent_root; @@ -900,6 +892,7 @@ impl ForwardSync { )?; metrics::inc_counter(&metrics::SYNC_HEADERS_DOWNLOADED); + debug!(%req_id, %chain_id, "Forward sync block header downloaded success"); // Once we discover the parent_root of this block three things can happen // 1. The parent root is a known block -> stop @@ -1007,6 +1000,8 @@ impl ForwardSync { return; }; + debug!(%id, %chain_id, ?result, "Forward sync block process result"); + match chain.on_process_result(id, result, cx) { Ok(SyncBlockResult::Done { .. 
}) => { metrics::inc_counter(&metrics::SYNC_BLOCKS_PROCESSED); @@ -1043,9 +1038,7 @@ impl ForwardSync { metrics::inc_counter_vec(&metrics::SYNC_CHAIN_ERROR_COUNT, &[(&error).into()]); - let block_to_children = self - .compute_children() - .expect("TODO: handle this error if it can't be avoided"); + let block_to_children = self.compute_children(); // TODO(tree-sync): logging `block_to_children` for debugging debug!(%chain_id, ?block_root, ?error, ?block_to_children, "Dropping forward sync chain on error"); self.drop_chain_and_children(chain_id, &block_to_children, (&error).into()); @@ -1155,9 +1148,7 @@ impl ForwardSync { .collect::>(); if !chains_to_drop.is_empty() { - let chain_to_children = self - .compute_children() - .expect("Handle this error if it can't be avoided"); + let chain_to_children = self.compute_children(); for (chain_id, e) in chains_to_drop { self.drop_chain_and_children(chain_id, &chain_to_children, e.into()); } @@ -1168,52 +1159,47 @@ impl ForwardSync { fn drop_chain_and_children( &mut self, initial_chain_id: TipId, - chain_to_children: &HashMap>, + chain_to_children: &HashMap>, reason: &'static str, ) { let mut queue: VecDeque = VecDeque::from([initial_chain_id]); while let Some(chain_id) = queue.pop_front() { // Remove the node itself. + // Only continue if the node was removed. This prevents infinite loops even if + // `chain_to_children` items reference themselves if let Some(chain) = self.chains.remove(&chain_id) { metrics::inc_counter_vec(&metrics::SYNC_CHAINS_REMOVED, &[reason]); for block_root in chain.iter_block_roots() { self.block_to_tip.remove(block_root); debug!(?block_root, %chain_id, %initial_chain_id, reason, "Dropping forward sync block lookup"); metrics::inc_counter(&metrics::SYNC_FORWARD_BLOCKS_DROPPED); - } - // Only remove children if the node still existed - // Push its children—if any—onto the work list. 
- if let Some(children) = chain_to_children.get(&chain_id) { - queue.extend(children.iter().cloned()); + // Only remove children if the node still existed + // Push its children‚Äîif any‚Äîonto the work list. + if let Some(children) = chain_to_children.get(block_root) { + queue.extend(children.iter().cloned()); + } } } } } /// Drop lookup `block_root` if it exists and all its children - fn compute_children(&mut self) -> Result>, InternalError> { - let mut chain_to_children = HashMap::>::new(); + fn compute_children(&mut self) -> HashMap> { + let mut parent_to_children = HashMap::>::new(); for (chain_id, chain) in self.chains.iter() { if let Some(parent_root) = chain.parent_root() { - // TODO(tree-sync): Is this error impossible? - let parent_chain_id = self.block_to_tip - .get(&parent_root) - .ok_or(InternalError(format!( - "Chain {chain_id} has a parent root that points to an unknown block {parent_root:?}" - )))?; - - chain_to_children - .entry(*parent_chain_id) + parent_to_children + .entry(parent_root) .or_default() .push(*chain_id); } } - Ok(chain_to_children) + parent_to_children } /// Drop lookups with least amount of peers and slot until we pruned PRUNE_COUNT lookups - fn prune_least_popular_lookups(&mut self) -> Result<(), InternalError> { + fn prune_least_popular_lookups(&mut self) { let mut chains = self .chains .iter() @@ -1222,14 +1208,13 @@ impl ForwardSync { .collect::>(); chains.sort_unstable(); - let chain_to_children = self.compute_children()?; + let chain_to_children = self.compute_children(); for (_, chain_id) in chains { self.drop_chain_and_children(chain_id, &chain_to_children, "too_many_blocks"); if self.block_to_tip.len() < MAX_LOOKUP_COUNT - PRUNE_COUNT { break; } } - Ok(()) } pub fn register_metrics(&self) { diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 609ca067e85..38143e1b21d 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -428,9 
+428,7 @@ impl SyncManager { // Remove peer from all data structures self.backfill_sync.peer_disconnected(peer_id); - if let Err(e) = self.forward_sync.remove_peer(*peer_id) { - error!("Error removing peer from forward sync {peer_id} {e:?}"); - } + self.forward_sync.remove_peer(*peer_id); // Regardless of the outcome, we update the sync status. self.update_sync_state(); From 72b5e46c6eb45c1e42259767e240622ea5708424 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 14 Jul 2025 18:33:28 +0200 Subject: [PATCH 55/66] More logs with chain_id --- beacon_node/network/src/sync/forward_sync.rs | 29 ++++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index 9d2e5f84017..8d79208b6b0 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -352,9 +352,10 @@ impl Chain { } /// If this chain is waiting for `block_root` it transitions to forward sync. - fn on_block_imported(&mut self, block_root: &Hash256) { + /// Returns true if the chain transitioned to ForwardSync + fn on_block_imported(&mut self, block_root: &Hash256) -> bool { match &mut self.status { - Status::BackfillHeaders { .. } => {} + Status::BackfillHeaders { .. } => false, Status::WaitingParentChain { block_roots, parent_root, @@ -364,9 +365,12 @@ impl Chain { block_roots: std::mem::take(block_roots), syncing_blocks: <_>::default(), }; + true + } else { + false } } - Status::ForwardSync { .. } => {} + Status::ForwardSync { .. 
} => false, } } @@ -979,6 +983,8 @@ impl ForwardSync { return; }; + debug!(%id, %chain_id, result = render_result(&result), "Forward sync block download result"); + if let Err(e) = chain.on_download_result(req_id, result, cx) { self.handle_error(id.block_root, e); } @@ -1000,7 +1006,7 @@ impl ForwardSync { return; }; - debug!(%id, %chain_id, ?result, "Forward sync block process result"); + debug!(%id, %chain_id, ?result, "Forward sync block download result"); match chain.on_process_result(id, result, cx) { Ok(SyncBlockResult::Done { .. }) => { @@ -1015,7 +1021,13 @@ impl ForwardSync { // Find all chains that are awaiting this block to process and continue them for other_chain in self.chains.values_mut() { - other_chain.on_block_imported(&id.block_root); + if other_chain.on_block_imported(&id.block_root) { + debug!( + %chain_id, + parent_root = ?id.block_root, + "Forward sync chain awaiting parent transitioned to forward sync" + ); + } } self.continue_requests(cx); } @@ -1256,3 +1268,10 @@ impl std::fmt::Display for TipId { write!(f, "{}", self.0) } } + +fn render_result(result: &Result) -> String { + match result { + Ok(_) => format!("Ok"), + Err(e) => format!("Err({e:?})"), + } +} From b4058ab9ac9b5f0a23bb22d9077faa754d16206a Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 14 Jul 2025 20:40:27 +0200 Subject: [PATCH 56/66] Log chains every interval --- beacon_node/network/src/sync/forward_sync.rs | 39 ++++++++++++++++++- .../sync/network_context/download_request.rs | 4 ++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index 8d79208b6b0..2087e600706 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -1181,10 +1181,11 @@ impl ForwardSync { // Only continue if the node was removed. 
This prevents infinite loops even if // `chain_to_children` items reference themselves if let Some(chain) = self.chains.remove(&chain_id) { + debug!(%chain_id, %initial_chain_id, reason, "Dropping forward sync chain"); metrics::inc_counter_vec(&metrics::SYNC_CHAINS_REMOVED, &[reason]); for block_root in chain.iter_block_roots() { self.block_to_tip.remove(block_root); - debug!(?block_root, %chain_id, %initial_chain_id, reason, "Dropping forward sync block lookup"); + debug!(?block_root, %chain_id, %initial_chain_id, reason, "Dropping forward sync block"); metrics::inc_counter(&metrics::SYNC_FORWARD_BLOCKS_DROPPED); // Only remove children if the node still existed // Push its children‚Äîif any‚Äîonto the work list. @@ -1257,6 +1258,42 @@ impl ForwardSync { metrics::set_gauge(&metrics::SYNC_HEADERS_COUNT, self.block_to_tip.len() as i64); metrics::set_gauge(&metrics::SYNC_CHAINS_COUNT, self.chains.len() as i64); + for (chain_id, chain) in &self.chains { + let status = match &chain.status { + Status::BackfillHeaders { + block_roots, + next_header_request, + } => { + format!( + "block_roots {block_roots:?} next_header_request {} {} {}", + next_header_request.id, + next_header_request.block_root, + next_header_request.request.status_str() + ) + } + Status::WaitingParentChain { + parent_root, + block_roots, + } => { + format!("parent_root {parent_root:?} block_roots {block_roots:?}") + } + Status::ForwardSync { + block_roots, + syncing_blocks, + } => { + format!( + "block_roots {block_roots:?} syncing_blocks {:?}", + syncing_blocks + .iter() + .map(|b| b.block_root()) + .collect::>() + ) + } + }; + + debug!(%chain_id, peers = chain.peers.len(), status, "DEBUG chain"); + } + // Min header // Highest known header // Current head diff --git a/beacon_node/network/src/sync/network_context/download_request.rs b/beacon_node/network/src/sync/network_context/download_request.rs index b610c8da488..71eebbd11c5 100644 --- 
a/beacon_node/network/src/sync/network_context/download_request.rs +++ b/beacon_node/network/src/sync/network_context/download_request.rs @@ -33,6 +33,10 @@ impl DownloadRequest { } } + pub fn status_str(&self) -> &'static str { + (&self.status).into() + } + pub fn is_awaiting_download(&self) -> bool { match self.status { Status::NotStarted => true, From bc30717ab53239cb0d08dae7843f0df49d99c641 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 15 Jul 2025 01:50:22 +0200 Subject: [PATCH 57/66] Simplify transition to forward sync --- .../network/src/sync/backfill_sync/mod.rs | 6 +- beacon_node/network/src/sync/forward_sync.rs | 438 ++++++++---------- beacon_node/network/src/sync/sync_block.rs | 23 +- 3 files changed, 213 insertions(+), 254 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index f61ec36fee7..6d58ddc1a3e 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -12,7 +12,7 @@ use crate::sync::manager::BatchProcessResult; use crate::sync::network_context::{ BatchPeers, RangeRequestId, RpcResponseError, SyncNetworkContext, }; -use crate::sync::sync_block::{Error as SyncBlockError, SyncBlock, SyncBlockResult}; +use crate::sync::sync_block::{Error as SyncBlockError, OkToImport, SyncBlock, SyncBlockResult}; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; use lighthouse_network::service::api_types::{ComponentsByRootRequestId, Id}; @@ -238,7 +238,9 @@ impl BackFillSync { fn continue_syncing_blocks(&mut self, cx: &mut SyncNetworkContext) { // TODO(tree-sync): only ok to import the newest block let ok_to_import = true; - let outcome = self.status.continue_request(cx, ok_to_import); + let outcome = self + .status + .continue_request(cx, OkToImport::Bool(ok_to_import)); self.handle_outcome(outcome.map(|_| 
SyncBlockResult::Wait), cx); } diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index 2087e600706..1fd087ed44b 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -4,7 +4,7 @@ use super::network_context::{ }; use crate::metrics; use crate::sync::network_context::{BatchPeers, RpcResponseResult}; -use crate::sync::sync_block::{Error as SyncBlockError, SyncBlock, SyncBlockResult}; +use crate::sync::sync_block::{Error as SyncBlockError, OkToImport, SyncBlock, SyncBlockResult}; use crate::sync::BatchProcessResult; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::BeaconChainTypes; @@ -15,13 +15,14 @@ use lighthouse_network::service::api_types::{ use lighthouse_network::PeerId; use std::collections::{HashMap, HashSet, VecDeque}; use std::sync::Arc; +use std::time::Duration; use strum::IntoStaticStr; use tracing::{debug, error}; use types::{BeaconBlockHeader, EthSpec, Hash256, SignedBeaconBlock, Slot}; const MAX_LOOKUP_COUNT: usize = 1_000_000; const PRUNE_COUNT: usize = 100_000; -const BLOCK_BUFFER_SIZE: usize = 2; +const BLOCK_BUFFER_SIZE: usize = 4; #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, PartialOrd, Ord)] struct TipId(u32); @@ -81,7 +82,7 @@ struct Chain { status: Status, } -type PendingBlock = (Hash256, Slot, Id); +type PendingBlock = (HeaderLookupId, Slot); #[allow(clippy::large_enum_variant)] enum Status { @@ -105,6 +106,8 @@ enum Status { parent_root: Hash256, /// Sorting: tip first, oldest ancestor last block_roots: Vec, + /// True if the oldest ancestor can start downloading + ready_to_sync: bool, }, /// Download and process block_roots from oldest ancestor to tip. Its list of block_roots does /// not grow, only removed block roots once processed. 
@@ -115,10 +118,9 @@ enum Status { /// - The set of SyncBlocks is consecutive /// - The parent of the last item in `block_roots` is the first item in `syncing_blocks` ForwardSync { - /// Sorting: tip first, oldest ancestor last - block_roots: Vec, /// Sorting: oldest ancestor first - syncing_blocks: VecDeque>, + block: SyncBlock, + parent_root: Hash256, }, } @@ -221,7 +223,7 @@ impl Chain { match &self.status { Status::BackfillHeaders { .. } => None, Status::WaitingParentChain { parent_root, .. } => Some(*parent_root), - Status::ForwardSync { .. } => None, + Status::ForwardSync { parent_root, .. } => Some(*parent_root), } } @@ -234,19 +236,13 @@ impl Chain { } => Some( block_roots .first() - .map(|block| block.0) + .map(|block| block.0.block_root) .unwrap_or(next_header_request.block_root), ), Status::WaitingParentChain { block_roots, .. } => { - block_roots.first().map(|block| block.0) + block_roots.first().map(|block| block.0.block_root) } - Status::ForwardSync { - block_roots, - syncing_blocks, - } => block_roots - .first() - .map(|block| block.0) - .or_else(|| syncing_blocks.back().map(|block| *block.block_root())), + Status::ForwardSync { block, .. 
} => Some(*block.block_root()), } } @@ -266,17 +262,20 @@ impl Chain { let next_header_request = std::mem::replace(next_header_request, HeaderRequest::empty()); - let new_block_roots = - if let Some(idx) = block_roots.iter().position(|b| b.0 == block_root) { - // ..= to keep the block_root on the left - block_roots.drain(0..=idx).collect::>() - } else { - // TODO(tree-sync): check that block_root is the next_root or error - vec![] - }; + let new_block_roots = if let Some(idx) = block_roots + .iter() + .position(|b| b.0.block_root == block_root) + { + // ..= to keep the block_root on the left + block_roots.drain(0..=idx).collect::>() + } else { + // TODO(tree-sync): check that block_root is the next_root or error + vec![] + }; self.status = Status::WaitingParentChain { parent_root: block_root, block_roots, + ready_to_sync: false, }; Status::BackfillHeaders { block_roots: new_block_roots, @@ -286,61 +285,31 @@ impl Chain { Status::WaitingParentChain { parent_root, block_roots, + ready_to_sync, } => { - let idx = - block_roots - .iter() - .position(|b| b.0 == block_root) - .ok_or(InternalError(format!( - "block_root {block_root:?} no in chain" - )))?; + let idx = block_roots + .iter() + .position(|b| b.0.block_root == block_root) + .ok_or(InternalError(format!( + "block_root {block_root:?} no in chain" + )))?; // ..= to keep the block_root on the left let new_block_roots = block_roots.drain(0..=idx).collect::>(); let parent_root = *parent_root; + let ready_to_sync = *ready_to_sync; self.status = Status::WaitingParentChain { parent_root: block_root, block_roots: std::mem::take(block_roots), + ready_to_sync: false, }; Status::WaitingParentChain { parent_root, block_roots: new_block_roots, + ready_to_sync, } } - Status::ForwardSync { - block_roots, - syncing_blocks, - } => { - // block_root may be in `block_roots` or in `syncing_blocks`. 
- let block_roots_idx = block_roots.iter().position(|b| b.0 == block_root); - let new_block_roots = if let Some(idx) = block_roots_idx { - // ..= to keep the block_root on the left - block_roots.drain(0..=idx).collect::>() - } else { - // `block_root` must be in `syncing_blocks` so the new splitted chain will have - // no `block_roots` items. - vec![] - }; - - let new_syncing_blocks = if block_roots_idx.is_some() { - // If `block_root` is in `block_roots` all syncing_blocks go to the new chain - std::mem::take(syncing_blocks) - } else { - // else find the position - let idx = syncing_blocks - .iter() - .position(|b| *b.block_root() == block_root) - .ok_or(InternalError(format!( - "block_root {block_root:?} not found in chain" - )))?; - // ..= to keep the block_root on the left - syncing_blocks.drain(0..=idx).collect::>() - }; - // This chain remains ForwardSync - // New chain is ForwardSync with the splitted Vecs - Status::ForwardSync { - block_roots: new_block_roots, - syncing_blocks: new_syncing_blocks, - } + Status::ForwardSync { .. } => { + todo!("cannot split single block"); } }; @@ -351,62 +320,20 @@ impl Chain { }) } - /// If this chain is waiting for `block_root` it transitions to forward sync. - /// Returns true if the chain transitioned to ForwardSync - fn on_block_imported(&mut self, block_root: &Hash256) -> bool { - match &mut self.status { + /// Return true if this chain is awaiting `block_root` + fn is_waiting_parent(&self, block_root: &Hash256) -> bool { + match &self.status { Status::BackfillHeaders { .. } => false, - Status::WaitingParentChain { - block_roots, - parent_root, - } => { - if block_root == parent_root { - self.status = Status::ForwardSync { - block_roots: std::mem::take(block_roots), - syncing_blocks: <_>::default(), - }; - true - } else { - false - } - } + Status::WaitingParentChain { parent_root, .. } => block_root == parent_root, Status::ForwardSync { .. 
} => false, } } - /// Transitions to forward sync - fn backfill_headers_to_forward_sync( - &mut self, - block: BeaconBlockHeader, - ) -> Result<(), InternalError> { - match &mut self.status { - Status::BackfillHeaders { - block_roots, - next_header_request, - } => { - block_roots.push(( - block.canonical_root(), - block.slot, - next_header_request.id.id, - )); - self.status = Status::ForwardSync { - block_roots: std::mem::take(block_roots), - syncing_blocks: <_>::default(), - }; - Ok(()) - } - _ => Err(InternalError("Not in BackfillHeaders state".to_string())), - } - } - fn block_count(&self) -> usize { match &self.status { Status::BackfillHeaders { block_roots, .. } | Status::WaitingParentChain { block_roots, .. } => block_roots.len(), - Status::ForwardSync { - block_roots, - syncing_blocks, - } => block_roots.len() + syncing_blocks.len(), + Status::ForwardSync { .. } => 1, } } @@ -418,20 +345,12 @@ impl Chain { next_header_request, } => Box::new( std::iter::once(&next_header_request.block_root) - .chain(block_roots.iter().map(|(root, _, _)| root)), + .chain(block_roots.iter().map(|(id, _)| &id.block_root)), ), Status::WaitingParentChain { block_roots, .. } => { - Box::new(block_roots.iter().map(|(root, _, _)| root)) + Box::new(block_roots.iter().map(|(id, _)| &id.block_root)) } - Status::ForwardSync { - syncing_blocks, - block_roots, - } => Box::new( - syncing_blocks - .iter() - .map(|block| block.block_root()) - .chain(block_roots.iter().map(|(root, _, _)| root)), - ), + Status::ForwardSync { block, .. } => Box::new(std::iter::once(block.block_root())), } } @@ -442,19 +361,17 @@ impl Chain { fn min_slot(&self) -> Option { match &self.status { - // TODO(tree-sync): include syncing_blocks for ForwardSync Status::BackfillHeaders { block_roots, .. } - | Status::WaitingParentChain { block_roots, .. } - | Status::ForwardSync { block_roots, .. } => block_roots.last().map(|b| b.1), + | Status::WaitingParentChain { block_roots, .. 
} => block_roots.last().map(|b| b.1), + Status::ForwardSync { block, .. } => Some(block.slot()), } } fn max_slot(&self) -> Option { match &self.status { - // TODO(tree-sync): include syncing_blocks for ForwardSync Status::BackfillHeaders { block_roots, .. } - | Status::WaitingParentChain { block_roots, .. } - | Status::ForwardSync { block_roots, .. } => block_roots.first().map(|b| b.1), + | Status::WaitingParentChain { block_roots, .. } => block_roots.first().map(|b| b.1), + Status::ForwardSync { block, .. } => Some(block.slot()), } } @@ -462,7 +379,7 @@ impl Chain { match &self.status { Status::BackfillHeaders { .. } => 0, Status::WaitingParentChain { .. } => 0, - Status::ForwardSync { syncing_blocks, .. } => syncing_blocks.len(), + Status::ForwardSync { .. } => 1, } } @@ -480,20 +397,13 @@ impl Chain { } } - fn add_ancestor(&mut self, block: BeaconBlockHeader, id: Id) -> Result<(), InternalError> { + fn add_ancestor(&mut self, parent_root: Hash256, id: Id) -> Result<(), InternalError> { match &mut self.status { Status::BackfillHeaders { block_roots, next_header_request, } => { - block_roots.push(( - // Should be the same as `next_header_request.block_root` - block.canonical_root(), - block.slot, - // Persist the request ID of the header for better traceability - next_header_request.id.id, - )); - *next_header_request = HeaderRequest::new(block.parent_root, id); + *next_header_request = HeaderRequest::new(parent_root, id); Ok(()) } _ => Err(InternalError( @@ -502,12 +412,17 @@ impl Chain { } } - fn to_waiting_parent(&mut self, parent_root: Hash256) -> Result<(), Error> { + fn to_waiting_parent( + &mut self, + parent_root: Hash256, + ready_to_sync: bool, + ) -> Result<(), Error> { match &mut self.status { Status::BackfillHeaders { block_roots, .. 
} => { self.status = Status::WaitingParentChain { parent_root, block_roots: std::mem::take(block_roots), + ready_to_sync, }; Ok(()) } @@ -517,15 +432,80 @@ impl Chain { } } + fn pop_next_block_to_sync(&mut self) -> Option { + match &mut self.status { + Status::WaitingParentChain { + block_roots, + parent_root, + ready_to_sync, + } => { + if !*ready_to_sync { + return None; + } + let Some(last_block) = block_roots.pop() else { + return None; + }; + + let last_block_parent_root = *parent_root; + *parent_root = last_block.0.block_root; + + let block = SyncBlock::new( + // Reuse the request ID of the header for better traceability + RangeRequestId::ForwardSync(last_block.0), + last_block.0.block_root, + last_block.1, + &self.peers.iter().copied().collect::>(), + ); + + Some(Self { + peers: self.peers.clone(), + status: Status::ForwardSync { + block, + parent_root: last_block_parent_root, + }, + }) + } + _ => None, + } + } + + fn on_header_download( + &mut self, + req_id: BlocksByRootRequestId, + block: BeaconBlockHeader, + ) -> Result<(), Error> { + match &mut self.status { + Status::BackfillHeaders { + next_header_request, + block_roots, + } => { + // Call `on_download_success` to assert that the req_id is the expected on + next_header_request.request.on_download_success( + req_id, + PeerId::random(), + block.clone(), + Duration::from_secs(0), + )?; + // Add the downloaded block + // Persist the request ID of the header for better traceability + block_roots.push((next_header_request.id, block.slot)); + Ok(()) + } + _ => Err(Error::InternalError( + "Expected lookup to be in DownloadingHeader state".to_owned(), + )), + } + } + fn on_download_result( &mut self, req_id: ComponentsByRootRequestId, result: Result<(RpcBlock, BatchPeers), RpcResponseError>, cx: &mut SyncNetworkContext, ) -> Result<(), Error> { - let (ok_to_import, block) = self.block_request(req_id.requester)?; + let block = self.block_request(req_id.requester)?; block.on_download_result(req_id, result, cx)?; 
- block.continue_request(cx, ok_to_import)?; + block.continue_request(cx, OkToImport::IfParentImported)?; Ok(()) } @@ -536,45 +516,31 @@ impl Chain { result: BatchProcessResult, cx: &mut SyncNetworkContext, ) -> Result { - let (ok_to_import, block) = self.block_request(RangeRequestId::ForwardSync(id))?; + let block = self.block_request(RangeRequestId::ForwardSync(id))?; match block.on_process_result(result, cx)? { SyncBlockResult::Done { parent_root, slot } => { - // Sanity check: the processed block must be the oldest block in the chain - if !ok_to_import { - return Err(Error::InternalError(format!( - "Block {id} is not the first block" - ))); - } - // This block processing is complete, remove it from chain - if let Status::ForwardSync { syncing_blocks, .. } = &mut self.status { - if let Some(block) = syncing_blocks.pop_front() { - debug!("Dropping syncing block {}", block.id()); - } else { - return Err(Error::InternalError("syncing_blocks is empty".to_string())); - } - } + // Single block, drop the chain Ok(SyncBlockResult::Done { parent_root, slot }) } SyncBlockResult::Wait => { // Not complete yet, continue requests - block.continue_request(cx, ok_to_import)?; + block.continue_request(cx, OkToImport::IfParentImported)?; Ok(SyncBlockResult::Wait) } } } - fn block_request(&mut self, id: RangeRequestId) -> Result<(bool, &mut SyncBlock), Error> { + fn block_request(&mut self, id: RangeRequestId) -> Result<&mut SyncBlock, Error> { match &mut self.status { - Status::ForwardSync { syncing_blocks, .. } => { - if let Some(index) = syncing_blocks.iter().position(|b| b.id() == id) { - let block = syncing_blocks.get_mut(index).expect("index just found"); - return Ok((index == 0, block)); + Status::ForwardSync { block, .. 
} => { + if block.id() == id { + Ok(block) + } else { + Err(Error::InternalError(format!( + "Unknown block for {id} current ID {}", + block.id(), + ))) } - - let first_ids: Vec<_> = syncing_blocks.iter().take(5).map(|b| b.id()).collect(); - Err(Error::InternalError(format!( - "Unknown block for {id}, first few blocks {first_ids:?}" - ))) } _ => Err(Error::InternalError( "Expected lookup to be in Syncing state".to_owned(), @@ -590,11 +556,8 @@ impl Chain { .. } => Ok(next_header_request.continue_request(&self.peers, cx)?), Status::WaitingParentChain { .. } => Ok(()), - Status::ForwardSync { syncing_blocks, .. } => { - for (index, block) in syncing_blocks.iter_mut().enumerate() { - let ok_to_import = index == 0; - block.continue_request(cx, ok_to_import)?; - } + Status::ForwardSync { block, .. } => { + block.continue_request(cx, OkToImport::IfParentImported)?; Ok(()) } } @@ -705,12 +668,10 @@ impl ForwardSync { match &chain.status { Status::BackfillHeaders { .. } => {} Status::WaitingParentChain { .. } => {} - Status::ForwardSync { syncing_blocks, .. } => { - for block in syncing_blocks { - if block.is_processing() { - if let RangeRequestId::ForwardSync(id) = block.id() { - ids.push(id); - } + Status::ForwardSync { block, .. } => { + if block.is_processing() { + if let RangeRequestId::ForwardSync(id) = block.id() { + ids.push(id); } } } @@ -780,15 +741,10 @@ impl ForwardSync { // Update all block references to the new chain for block_root in new_chain.iter_block_roots() { - *self - .block_to_tip - .get_mut(block_root) - .ok_or(InternalError(format!("No block {block_root:?}")))? 
= - new_chain_id; + self.block_to_tip.insert(*block_root, new_chain_id); } - self.chains.insert(new_chain_id, new_chain); - self.chains.get_mut(&new_chain_id).expect("key just added") + self.chains.entry(new_chain_id).or_insert(new_chain) } else { chain }; @@ -888,12 +844,7 @@ impl ForwardSync { let block_header = block.message().block_header(); let parent_root = block_header.parent_root; - chain.header_request()?.on_download_success( - req_id, - peer_id, - block_header.clone(), - received, - )?; + chain.on_header_download(req_id, block_header.clone())?; metrics::inc_counter(&metrics::SYNC_HEADERS_DOWNLOADED); debug!(%req_id, %chain_id, "Forward sync block header downloaded success"); @@ -929,19 +880,19 @@ impl ForwardSync { if cx.chain.block_is_known_to_fork_choice(&parent_root) { // Parent is imported, we can forward sync this chain // Stop search we reached a known block - chain.backfill_headers_to_forward_sync(block_header)?; + chain.to_waiting_parent(parent_root, true)?; debug!(%chain_id, ?parent_root, block_count = chain.block_count(), "Forward sync chain reached imported block"); // Trigger potential foward sync for this chain self.continue_requests(cx); } else if let Some(parent_chain_id) = self.block_to_tip.get(&parent_root) { // Parent is part of another chain, stop search // Stop search we reached a known block - chain.to_waiting_parent(parent_root)?; + chain.to_waiting_parent(parent_root, false)?; debug!(%chain_id, %parent_chain_id, ?parent_root, "Forward sync chain reached known block"); // TODO(tree-sync): Add peers recursively to the chain_id, potentially // splitting the chain when adding peers. } else { - chain.add_ancestor(block_header, cx.next_id())?; + chain.add_ancestor(block_header.parent_root, cx.next_id())?; debug!(%chain_id, ?parent_root, "Forward sync chain continues fetching ancestor"); // Add to the block_to_tip mapping to respect the invariant "Each block // root exists in exactly one `Chain::block_roots` list". 
@@ -987,6 +938,8 @@ impl ForwardSync { if let Err(e) = chain.on_download_result(req_id, result, cx) { self.handle_error(id.block_root, e); + // Some syncing blocks may have been dropped so there's space for new chains to sync + self.continue_requests(cx); } } @@ -1012,16 +965,14 @@ impl ForwardSync { Ok(SyncBlockResult::Done { .. }) => { metrics::inc_counter(&metrics::SYNC_BLOCKS_PROCESSED); self.block_to_tip.remove(&id.block_root); - // If the chain is empty, remove it - if chain.is_empty() { - self.chains.remove(&chain_id); - debug!(%chain_id, "Removed completed chain"); - metrics::inc_counter_vec(&metrics::SYNC_CHAINS_REMOVED, &["completed"]); - } + // ForwardSync chains have a single block, remove them on Done + self.chains.remove(&chain_id); + debug!(%id, %chain_id, "Removed completed chain"); + metrics::inc_counter_vec(&metrics::SYNC_CHAINS_REMOVED, &["completed"]); // Find all chains that are awaiting this block to process and continue them for other_chain in self.chains.values_mut() { - if other_chain.on_block_imported(&id.block_root) { + if other_chain.is_waiting_parent(&id.block_root) { debug!( %chain_id, parent_root = ?id.block_root, @@ -1035,6 +986,8 @@ impl ForwardSync { Ok(SyncBlockResult::Wait) => {} Err(e) => { self.handle_error(id.block_root, e); + // Some syncing blocks may have been dropped so there's space for new chains to sync + self.continue_requests(cx); } } } @@ -1081,18 +1034,21 @@ impl ForwardSync { // TODO(tree-sync): don't build on demand, cache roots somewhere - let mut blocks_syncing = self - .chains - .values() - .map(|chain| chain.syncing_blocks_count()) - .sum::(); + let new_blocks_to_sync = BLOCK_BUFFER_SIZE.saturating_sub( + self.chains + .values() + .map(|chain| chain.syncing_blocks_count()) + .sum::(), + ); + + if new_blocks_to_sync == 0 { + return; + } // A chain can be in two states: // - Active backfill // - Oldest ancestor known - let mut new_syncing_blocks = false; - // Have up to 2 blocks syncing // Find the block range 
with most peers and highest slot. This is the block // to be used as tip of the chain of blocks to fetch. @@ -1100,44 +1056,36 @@ impl ForwardSync { .chains .iter_mut() .filter_map(|(_, chain)| { - if matches!(chain.status, Status::ForwardSync { .. }) { + if matches!(chain.status, Status::WaitingParentChain { .. }) { Some((chain.peer_count(), chain)) } else { None } }) .collect::>(); - chains_by_peer_count.sort_by_key(|(peer_count, _)| *peer_count); - for (_, chain) in chains_by_peer_count { - if let Status::ForwardSync { - block_roots, - syncing_blocks, - } = &mut chain.status - { - // block_roots sorting: tip first, oldest ancestor last => pop - if let Some(next_block) = block_roots.pop() { - syncing_blocks.push_back(SyncBlock::new( - RangeRequestId::ForwardSync(HeaderLookupId { - // Reuse the request ID of the header for better traceability - id: next_block.2, - block_root: next_block.0, - }), - next_block.0, - next_block.1, - &chain.peers.iter().copied().collect::>(), - )); - blocks_syncing += 1; - new_syncing_blocks = true; - if blocks_syncing >= BLOCK_BUFFER_SIZE { - break; - } + let mut new_chains = vec![]; + + 'o: for (id, chain) in chains_by_peer_count { + while let Some(new_chain) = chain.pop_next_block_to_sync() { + let new_chain_id = TipId(cx.next_id()); + // Update all block references to the new chain + for block_root in new_chain.iter_block_roots() { + self.block_to_tip.insert(*block_root, new_chain_id); + debug!(%new_chain_id, ?block_root, "Transitioned block to forward sync"); + } + new_chains.push((new_chain_id, new_chain)); + if new_chains.len() >= new_blocks_to_sync { + break 'o; } } } - if new_syncing_blocks { + if !new_chains.is_empty() { + for (chain_id, chain) in new_chains { + self.chains.insert(chain_id, chain); + } self.continue_requests(cx); } } @@ -1265,7 +1213,7 @@ impl ForwardSync { next_header_request, } => { format!( - "block_roots {block_roots:?} next_header_request {} {} {}", + "BackfillHeaders block_roots {block_roots:?} 
next_header_request {} {} {}", next_header_request.id, next_header_request.block_root, next_header_request.request.status_str() @@ -1274,20 +1222,12 @@ impl ForwardSync { Status::WaitingParentChain { parent_root, block_roots, + ready_to_sync, } => { - format!("parent_root {parent_root:?} block_roots {block_roots:?}") + format!("WaitingParentChain ready_to_sync {ready_to_sync} parent_root {parent_root:?} block_roots {block_roots:?}") } - Status::ForwardSync { - block_roots, - syncing_blocks, - } => { - format!( - "block_roots {block_roots:?} syncing_blocks {:?}", - syncing_blocks - .iter() - .map(|b| b.block_root()) - .collect::>() - ) + Status::ForwardSync { block, .. } => { + format!("ForwardSync sync_block {:?}", block.block_root()) } }; diff --git a/beacon_node/network/src/sync/sync_block.rs b/beacon_node/network/src/sync/sync_block.rs index 611c5e60514..5d181d99fd8 100644 --- a/beacon_node/network/src/sync/sync_block.rs +++ b/beacon_node/network/src/sync/sync_block.rs @@ -38,6 +38,11 @@ enum SyncingStatus { Processing(RpcBlock, BatchPeers, Instant), } +pub enum OkToImport { + IfParentImported, + Bool(bool), +} + #[must_use] pub enum SyncBlockResult { Done { parent_root: Hash256, slot: Slot }, @@ -203,7 +208,7 @@ impl SyncBlock { pub fn continue_request( &mut self, cx: &mut SyncNetworkContext, - ok_to_import: bool, + ok_to_import: OkToImport, ) -> Result<(), Error> { match &mut self.request { SyncingStatus::AwaitingDownload => { @@ -232,8 +237,20 @@ impl SyncBlock { // from the beacon processor anyway. No need to add more code to handle this // edge case faster. 
- if !ok_to_import { - return Ok(()); + match ok_to_import { + OkToImport::IfParentImported => { + if !cx + .chain + .block_is_known_to_fork_choice(&block.as_block().parent_root()) + { + return Ok(()); + } + } + OkToImport::Bool(ok_to_import) => { + if !ok_to_import { + return Ok(()); + } + } } if let Some(beacon_processor) = cx.beacon_processor_if_enabled() { From 8e791e5b8992c67b4bb334f71cf600a12a024258 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 15 Jul 2025 02:11:05 +0200 Subject: [PATCH 58/66] Fix to_ready_to_sync --- beacon_node/network/src/sync/forward_sync.rs | 22 ++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index 1fd087ed44b..9c71688bcbd 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -315,16 +315,26 @@ impl Chain { Ok(Self { peers: self.peers.clone(), - // What to set the status to?? status, }) } /// Return true if this chain is awaiting `block_root` - fn is_waiting_parent(&self, block_root: &Hash256) -> bool { - match &self.status { + fn to_ready_to_sync(&mut self, block_root: &Hash256) -> bool { + match &mut self.status { Status::BackfillHeaders { .. } => false, - Status::WaitingParentChain { parent_root, .. } => block_root == parent_root, + Status::WaitingParentChain { + parent_root, + ready_to_sync, + .. + } => { + if block_root == parent_root && !*ready_to_sync { + *ready_to_sync = true; + true + } else { + false + } + } Status::ForwardSync { .. 
} => false, } } @@ -972,11 +982,11 @@ impl ForwardSync { // Find all chains that are awaiting this block to process and continue them for other_chain in self.chains.values_mut() { - if other_chain.is_waiting_parent(&id.block_root) { + if other_chain.to_ready_to_sync(&id.block_root) { debug!( %chain_id, parent_root = ?id.block_root, - "Forward sync chain awaiting parent transitioned to forward sync" + "Forward sync marked chain as ready to sync" ); } } From fd304d445578e18808dbf72fcfb8b2d1ea0c6827 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 15 Jul 2025 10:42:38 +0200 Subject: [PATCH 59/66] Log when chain splits --- beacon_node/network/src/sync/forward_sync.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index 9c71688bcbd..9c55740848a 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -748,6 +748,14 @@ impl ForwardSync { let chain_to_add_peers = if should_split_chain { let new_chain = chain.split_by(target_block_root)?; let new_chain_id = TipId(cx.next_id()); + debug!( + block_root = ?target_block_root, + %chain_id, + %new_chain_id, + chain_block_count = chain.block_count(), + new_chain_block_count = new_chain.block_count(), + "Forward sync chain split" + ); // Update all block references to the new chain for block_root in new_chain.iter_block_roots() { @@ -1077,13 +1085,19 @@ impl ForwardSync { let mut new_chains = vec![]; - 'o: for (id, chain) in chains_by_peer_count { + 'o: for (chain_id, chain) in chains_by_peer_count { while let Some(new_chain) = chain.pop_next_block_to_sync() { let new_chain_id = TipId(cx.next_id()); // Update all block references to the new chain for block_root in new_chain.iter_block_roots() { self.block_to_tip.insert(*block_root, new_chain_id); - debug!(%new_chain_id, ?block_root, "Transitioned block 
to forward sync"); + debug!( + %chain_id, + %new_chain_id, + ?block_root, + chain_block_count = chain.block_count(), + "Transitioned block to forward sync" + ); } new_chains.push((new_chain_id, new_chain)); if new_chains.len() >= new_blocks_to_sync { From 6d48a1ec5b3c59e5eb2f33afe01614d799b81ae7 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 15 Jul 2025 18:16:57 +0200 Subject: [PATCH 60/66] Log recursive chains --- beacon_node/network/src/sync/forward_sync.rs | 139 ++++++++++++++++++- 1 file changed, 137 insertions(+), 2 deletions(-) diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index 9c55740848a..02c785370b7 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -272,6 +272,7 @@ impl Chain { // TODO(tree-sync): check that block_root is the next_root or error vec![] }; + // Mutate this chain, which keeps all descendant roots of `block_root` self.status = Status::WaitingParentChain { parent_root: block_root, block_roots, @@ -287,6 +288,7 @@ impl Chain { block_roots, ready_to_sync, } => { + let mut block_roots = std::mem::take(block_roots); let idx = block_roots .iter() .position(|b| b.0.block_root == block_root) @@ -297,9 +299,10 @@ impl Chain { let new_block_roots = block_roots.drain(0..=idx).collect::>(); let parent_root = *parent_root; let ready_to_sync = *ready_to_sync; + // Mutate this chain, which keeps all descendant roots of `block_root` self.status = Status::WaitingParentChain { parent_root: block_root, - block_roots: std::mem::take(block_roots), + block_roots, ready_to_sync: false, }; Status::WaitingParentChain { @@ -1106,6 +1109,9 @@ impl ForwardSync { } } + // Prune chains that become empty after pop_next_block_to_sync + self.chains.retain(|_, chain| !chain.is_empty()); + if !new_chains.is_empty() { for (chain_id, chain) in new_chains { self.chains.insert(chain_id, chain); @@ -1255,7 +1261,32 
@@ impl ForwardSync { } }; - debug!(%chain_id, peers = chain.peers.len(), status, "DEBUG chain"); + let recursive_parent_chain = (|| { + let mut next_chain_id = *chain_id; + loop { + let Some(next_chain) = self.chains.get(&next_chain_id) else { + return Err(format!("Unknown chain {next_chain_id}")); + }; + if let Status::WaitingParentChain { parent_root, .. } = next_chain.status { + let Some(parent_chain_id) = self.block_to_tip.get(&parent_root) else { + return Err(format!("Unknown block {parent_root:?}")); + }; + next_chain_id = *parent_chain_id; + } else if next_chain_id == *chain_id { + return Ok(format!("itself")); + } else { + return Ok(format!("recursive_parent_chain: {}", next_chain_id)); + } + } + })(); + + debug!(%chain_id, peers = chain.peers.len(), status, ?recursive_parent_chain, "DEBUG chain"); + } + + for (block_root, chain_id) in &self.block_to_tip { + if !self.chains.contains_key(chain_id) { + debug!("DEBUG block {block_root} points to unknown chain {chain_id}"); + } } // Min header @@ -1276,3 +1307,107 @@ fn render_result(result: &Result) -> String { Err(e) => format!("Err({e:?})"), } } + +#[cfg(test)] +mod tests { + use super::*; + use beacon_chain::builder::Witness; + use beacon_chain::eth1_chain::CachingEth1Backend; + use slot_clock::ManualSlotClock; + use store::MemoryStore; + use types::FixedBytesExtended; + use types::MinimalEthSpec as E; + + type T = Witness, E, MemoryStore, MemoryStore>; + + fn to_roots(input: &[u64]) -> Vec { + input.iter().map(to_root).collect() + } + + fn to_root(u: &u64) -> Hash256 { + Hash256::from_low_u64_le(*u) + } + + fn from_root(r: &Hash256) -> u64 { + r.to_low_u64_le() + } + + fn to_block(u: &u64) -> PendingBlock { + ( + HeaderLookupId { + id: *u as u32, + block_root: to_root(u), + }, + Slot::new(*u), + ) + } + + fn get_roots(chain: &Chain) -> Vec { + chain.iter_block_roots().map(from_root).collect() + } + + /* ------- BackfillHeaders ------------------------------------------------ */ + + fn test_split_by(input: 
&[u64], split: u64, roots_left: &[u64], roots_right: &[u64]) { + let mut initial_chain = Chain:: { + peers: <_>::default(), + status: Status::BackfillHeaders { + block_roots: input.iter().skip(1).map(to_block).collect::>(), + next_header_request: HeaderRequest::new(to_root(&input[0]), 0), + }, + }; + let new_chain = initial_chain + .split_by(to_root(&split)) + .expect("error spliting backfill headers"); + + assert_eq!(get_roots(&initial_chain), roots_right, "initial backfill"); + assert_eq!(get_roots(&new_chain), roots_left, "new backfill"); + + let mut initial_chain = Chain:: { + peers: <_>::default(), + status: Status::WaitingParentChain { + parent_root: to_root(&0), + block_roots: input.iter().map(to_block).collect::>(), + ready_to_sync: false, + }, + }; + let new_chain = initial_chain + .split_by(to_root(&split)) + .expect("error spliting backfill headers"); + + assert_eq!(get_roots(&initial_chain), roots_right, "initial waiting"); + assert_eq!(get_roots(&new_chain), roots_left, "new waiting"); + assert_eq!( + from_root(&initial_chain.parent_root().unwrap()), + *roots_left.last().unwrap(), + "parent_right" + ); + } + + #[test] + fn split_by_only_elem() { + // input [A,B] split A → [A] | [B] + test_split_by(&[0, 1], 0, &[0], &[1]); + } + + #[test] + fn split_by_first() { + // split first of many + test_split_by(&[0, 1, 2, 3], 0, &[0], &[1, 2, 3]); + } + + #[test] + fn split_by_middle_a() { + test_split_by(&[0, 1, 2, 3], 2, &[0, 1, 2], &[3]); + } + + #[test] + fn split_by_middle_b() { + test_split_by(&[0, 1, 2], 1, &[0, 1], &[2]); + } + + #[test] + fn split_by_last() { + test_split_by(&[0, 1, 2], 2, &[0, 1, 2], &[]); + } +} From d40cc35a107b47acabb4830d2e442a8abab1cf7e Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 15 Jul 2025 18:36:21 +0200 Subject: [PATCH 61/66] Recurse forward sync block too --- beacon_node/network/src/sync/forward_sync.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index 02c785370b7..fc5fe7b4f43 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -1267,7 +1267,9 @@ impl ForwardSync { let Some(next_chain) = self.chains.get(&next_chain_id) else { return Err(format!("Unknown chain {next_chain_id}")); }; - if let Status::WaitingParentChain { parent_root, .. } = next_chain.status { + if let Status::WaitingParentChain { parent_root, .. } + | Status::ForwardSync { parent_root, .. } = next_chain.status + { let Some(parent_chain_id) = self.block_to_tip.get(&parent_root) else { return Err(format!("Unknown block {parent_root:?}")); }; From 9cf94a733beed157702be80197be8e8b9073b243 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 15 Jul 2025 18:52:03 +0200 Subject: [PATCH 62/66] Consider imported blocks --- beacon_node/network/src/sync/forward_sync.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index fc5fe7b4f43..0c57f0336a2 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -1271,13 +1271,20 @@ impl ForwardSync { | Status::ForwardSync { parent_root, .. } = next_chain.status { let Some(parent_chain_id) = self.block_to_tip.get(&parent_root) else { - return Err(format!("Unknown block {parent_root:?}")); + if matches!(next_chain.status, Status::ForwardSync { .. 
}) { + // A ForwardSync chain may point to an already imported block + return Err(format!("{next_chain_id} unknown/imported")); + } else { + return Err(format!( + "{next_chain_id} Unknown block {parent_root:?}" + )); + } }; next_chain_id = *parent_chain_id; } else if next_chain_id == *chain_id { return Ok(format!("itself")); } else { - return Ok(format!("recursive_parent_chain: {}", next_chain_id)); + return Ok(format!("{next_chain_id}")); } } })(); From def7886c85905f934823a05206847f4d24c3753d Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 15 Jul 2025 20:20:09 +0200 Subject: [PATCH 63/66] Fix split_by logic --- beacon_node/network/src/sync/forward_sync.rs | 89 ++++++++++++-------- 1 file changed, 55 insertions(+), 34 deletions(-) diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index 0c57f0336a2..c3df906907c 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -118,8 +118,9 @@ enum Status { /// - The set of SyncBlocks is consecutive /// - The parent of the last item in `block_roots` is the first item in `syncing_blocks` ForwardSync { - /// Sorting: oldest ancestor first block: SyncBlock, + /// The parent root of `block`. Note that it may point to a block that is already imported, + /// and is not in the sync headers DAG. 
parent_root: Hash256, }, } @@ -266,15 +267,20 @@ impl Chain { .iter() .position(|b| b.0.block_root == block_root) { - // ..= to keep the block_root on the left - block_roots.drain(0..=idx).collect::>() + // block_roots sorting: tip first, oldest ancestor last + // We want to return the set of blocks including `block_root` and all its + // ancestors into `new_block_roots`, and keep the rest in `block_roots` + block_roots.drain(idx..).collect::>() } else { // TODO(tree-sync): check that block_root is the next_root or error vec![] }; // Mutate this chain, which keeps all descendant roots of `block_root` self.status = Status::WaitingParentChain { + // This chain keeps the descendants of `block_root` so the oldest parent root is + // `block_root` parent_root: block_root, + // `block_roots` has been mutated to have only the descendants of `block_root` block_roots, ready_to_sync: false, }; @@ -295,12 +301,14 @@ impl Chain { .ok_or(InternalError(format!( "block_root {block_root:?} no in chain" )))?; - // ..= to keep the block_root on the left - let new_block_roots = block_roots.drain(0..=idx).collect::>(); + // See comments in BackfillHeaders variant above + let new_block_roots = block_roots.drain(idx..).collect::>(); let parent_root = *parent_root; let ready_to_sync = *ready_to_sync; // Mutate this chain, which keeps all descendant roots of `block_root` self.status = Status::WaitingParentChain { + // This chain keeps the descendants of `block_root` so the oldest parent root is + // `block_root` parent_root: block_root, block_roots, ready_to_sync: false, @@ -350,15 +358,18 @@ impl Chain { } } - /// Returns all block roots part of this chain + /// Returns all block roots part of this chain, in descending slot order fn iter_block_roots(&self) -> Box + '_> { match &self.status { Status::BackfillHeaders { block_roots, next_header_request, } => Box::new( - std::iter::once(&next_header_request.block_root) - .chain(block_roots.iter().map(|(id, _)| &id.block_root)), + block_roots 
+ .iter() + .map(|(id, _)| &id.block_root) + // next_header_request is the oldest ancestor, so chain last + .chain(std::iter::once(&next_header_request.block_root)), ), Status::WaitingParentChain { block_roots, .. } => { Box::new(block_roots.iter().map(|(id, _)| &id.block_root)) @@ -1355,22 +1366,24 @@ mod tests { chain.iter_block_roots().map(from_root).collect() } - /* ------- BackfillHeaders ------------------------------------------------ */ - - fn test_split_by(input: &[u64], split: u64, roots_left: &[u64], roots_right: &[u64]) { - let mut initial_chain = Chain:: { - peers: <_>::default(), - status: Status::BackfillHeaders { - block_roots: input.iter().skip(1).map(to_block).collect::>(), - next_header_request: HeaderRequest::new(to_root(&input[0]), 0), - }, + fn test_split_by(input: &[u64], split: u64, roots_new: &[u64], roots_initial: &[u64]) { + let mut initial_chain = { + /// input sorting: tip first, oldest ancestor last + let (last, rest) = input.split_last().unwrap(); + Chain:: { + peers: <_>::default(), + status: Status::BackfillHeaders { + block_roots: rest.iter().map(to_block).collect::>(), + next_header_request: HeaderRequest::new(to_root(&last), 0), + }, + } }; let new_chain = initial_chain .split_by(to_root(&split)) .expect("error spliting backfill headers"); - assert_eq!(get_roots(&initial_chain), roots_right, "initial backfill"); - assert_eq!(get_roots(&new_chain), roots_left, "new backfill"); + assert_eq!(get_roots(&new_chain), roots_new, "new backfill"); + assert_eq!(get_roots(&initial_chain), roots_initial, "initial backfill"); let mut initial_chain = Chain:: { peers: <_>::default(), @@ -1384,39 +1397,47 @@ mod tests { .split_by(to_root(&split)) .expect("error spliting backfill headers"); - assert_eq!(get_roots(&initial_chain), roots_right, "initial waiting"); - assert_eq!(get_roots(&new_chain), roots_left, "new waiting"); + assert_eq!(get_roots(&new_chain), roots_new, "new waiting"); + assert_eq!(get_roots(&initial_chain), roots_initial, 
"initial waiting"); assert_eq!( from_root(&initial_chain.parent_root().unwrap()), - *roots_left.last().unwrap(), - "parent_right" + // The tip of the new chain is the parent of the initial chain + *roots_new.first().unwrap(), + "parent_initial" ); } #[test] - fn split_by_only_elem() { - // input [A,B] split A → [A] | [B] - test_split_by(&[0, 1], 0, &[0], &[1]); + fn split_by_only_elem_a() { + // input [0,1] sorted by tip first + test_split_by(&[1, 0], 0, &[0], &[1]); + } + + #[test] + fn split_by_only_elem_b() { + test_split_by(&[1, 0], 1, &[1, 0], &[]); } #[test] fn split_by_first() { - // split first of many - test_split_by(&[0, 1, 2, 3], 0, &[0], &[1, 2, 3]); + test_split_by(&[3, 2, 1, 0], 0, &[0], &[3, 2, 1]); } #[test] - fn split_by_middle_a() { - test_split_by(&[0, 1, 2, 3], 2, &[0, 1, 2], &[3]); + fn split_by_last() { + test_split_by(&[3, 2, 1, 0], 3, &[3, 2, 1, 0], &[]); } + #[test] + fn split_by_middle_a() { + test_split_by(&[3, 2, 1, 0], 1, &[1, 0], &[3, 2]); + } #[test] fn split_by_middle_b() { - test_split_by(&[0, 1, 2], 1, &[0, 1], &[2]); + test_split_by(&[3, 2, 1, 0], 2, &[2, 1, 0], &[3]); } - #[test] - fn split_by_last() { - test_split_by(&[0, 1, 2], 2, &[0, 1, 2], &[]); + fn split_by_middle_c() { + test_split_by(&[2, 1, 0], 1, &[1, 0], &[2]); } } From b009b37d75b38064a509863159d1dd58c4ea4d5f Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 24 Jul 2025 15:02:45 +0200 Subject: [PATCH 64/66] Implement merge operation --- beacon_node/network/src/metrics.rs | 6 + beacon_node/network/src/sync/forward_sync.rs | 149 +++++++++++++++++++ beacon_node/network/src/sync/manager.rs | 2 +- 3 files changed, 156 insertions(+), 1 deletion(-) diff --git a/beacon_node/network/src/metrics.rs b/beacon_node/network/src/metrics.rs index 4495aedd8f9..925917bb2ce 100644 --- a/beacon_node/network/src/metrics.rs +++ b/beacon_node/network/src/metrics.rs @@ -541,6 +541,12 @@ pub static SYNC_CHAINS_COUNT: LazyLock> = LazyLock::new(|| 
{ "Current count of forward sync chains in memory", ) }); +pub static SYNC_CHAIN_MERGES_COUNT: LazyLock> = LazyLock::new(|| { + try_create_int_counter( + "sync_forward_chain_merges_total", + "Total count of forward sync chain merges", + ) +}); pub static SYNC_CHAIN_ERROR_COUNT: LazyLock> = LazyLock::new(|| { try_create_int_counter_vec( "sync_forward_chain_error_total", diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index c3df906907c..78a602ac1b7 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -330,6 +330,34 @@ impl Chain { }) } + /// Given another chain whose parent is the tip of this chain, merge the block of `other` into + /// `self`. + fn merge(&mut self, child_chain: Self) -> Result<(), InternalError> { + let Status::WaitingParentChain { + block_roots: child_block_roots, + .. + } = child_chain.status + else { + return Err(InternalError("Other not in WaitingParentChain".to_string())); + }; + + match &mut self.status { + Status::BackfillHeaders { block_roots, .. } + | Status::WaitingParentChain { block_roots, .. } => { + // child_block_roots and block_roots are sorted as tip first, so do + // child_block_roots + block_roots + *block_roots = child_block_roots + .into_iter() + .chain(block_roots.drain(..)) + .collect::>(); + Ok(()) + } + Status::ForwardSync { .. } => { + Err(InternalError("Cannot merge into ForwardSync".to_string())) + } + } + } + /// Return true if this chain is awaiting `block_root` fn to_ready_to_sync(&mut self, block_root: &Hash256) -> bool { match &mut self.status { @@ -1219,6 +1247,82 @@ impl ForwardSync { } } + fn merge_chains(&mut self) -> Result<(), InternalError> { + // To prevent O(n^2) ops, first compute a hashmap of tips to chains. 
Each block belongs + // exactly to one chain so there must be a single tip -> chain relationship + let tip_to_chain = HashMap::)>::from_iter( + self.chains.iter().filter_map(|(chain_id, chain)| { + // TODO(tree-sync): exclude ForwardSync + if let Some(tip) = chain.tip() { + Some((tip, (chain_id, chain))) + } else { + None + } + }), + ); + + // Now collect all chains waiting for a parent to sort them by peer_count and block count + let mut chains = self + .chains + .iter() + .filter_map(|(chain_id, chain)| { + if let Status::WaitingParentChain { parent_root, .. } = chain.status { + Some((chain_id, chain, parent_root)) + } else { + None + } + }) + .collect::>(); + chains.sort_unstable_by_key(|(_, chain, _)| (chain.peer_count(), chain.block_count())); + + // Iterate from highest peer count and highest block count first + let chains_to_merge = + chains + .into_iter() + .rev() + .find_map(|(chain_id, chain, parent_root)| { + // The parent root of chain is exactly the tip of parent_chain + if let Some((parent_chain_id, parent_chain)) = tip_to_chain.get(&parent_root) { + if chain.peers == parent_chain.peers { + // The peer set is the same, schedule to merge them + return Some((**parent_chain_id, *chain_id)); + } + } + None + }); + + // Execute the merge operation. Do a single merge operation per loop as we remove a + // chain from the chains map. Is possible that chains are childs of each other so to + // safely merge them we would need to iterate them in topological order. For + // simplicity we just do one merge at a time. 
+ if let Some((parent_chain_id, chain_id)) = chains_to_merge { + debug!(%parent_chain_id, %chain_id, "Merging forward sync chains"); + metrics::inc_counter(&metrics::SYNC_CHAIN_MERGES_COUNT); + + let Some(chain) = self.chains.remove(&chain_id) else { + return Err(InternalError(format!("chain {chain_id} does not exist"))); + }; + let Some(parent_chain) = self.chains.get_mut(&parent_chain_id) else { + return Err(InternalError(format!( + "parent_chain {parent_chain_id} does not exist" + ))); + }; + // Update all block references to the new chain + for block_root in chain.iter_block_roots() { + self.block_to_tip.insert(*block_root, parent_chain_id); + } + parent_chain.merge(chain)?; + } + + Ok(()) + } + + pub fn prune(&mut self) { + if let Err(e) = self.merge_chains() { + error!(error = ?e, "Error merging forward sync chains"); + } + } + pub fn register_metrics(&self) { let (min_slot, max_slot) = self.chains @@ -1407,6 +1511,36 @@ mod tests { ); } + fn test_merge(left: &[u64], right: &[u64], expected_merged: &[u64]) { + let peers = HashSet::from_iter([PeerId::random()]); + // Left chain has descendant roots of right + let mut left_chain = Chain:: { + peers: peers.clone(), + status: Status::WaitingParentChain { + parent_root: to_root(right.first().unwrap()), + block_roots: left.iter().map(to_block).collect::>(), + ready_to_sync: false, + }, + }; + // Right chain has no known parent, so set it to 0xff + let mut right_chain = Chain:: { + peers: peers.clone(), + status: Status::WaitingParentChain { + parent_root: to_root(&0xff), // rand root to not have conflicts + block_roots: right.iter().map(to_block).collect::>(), + ready_to_sync: false, + }, + }; + let mut sync = ForwardSync { + block_to_tip: <_>::default(), + chains: HashMap::from_iter([(TipId(0), left_chain), (TipId(1), right_chain)]), + }; + sync.merge_chains(); + assert_eq!(sync.chains.len(), 1, "Should merge 2 chains into 1"); + let merged_chain = sync.chains.values().next().unwrap(); + 
assert_eq!(get_roots(merged_chain), expected_merged, "merged roots"); + } + #[test] fn split_by_only_elem_a() { // input [0,1] sorted by tip first @@ -1440,4 +1574,19 @@ mod tests { fn split_by_middle_c() { test_split_by(&[2, 1, 0], 1, &[1, 0], &[2]); } + + #[test] + fn merge_left_long() { + test_merge(&[2, 1], &[0], &[2, 1, 0]); + } + + #[test] + fn merge_right_long() { + test_merge(&[2], &[1, 0], &[2, 1, 0]); + } + + #[test] + fn merge_same() { + test_merge(&[3, 2], &[1, 0], &[3, 2, 1, 0]); + } } diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 38143e1b21d..765d532c3d7 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -620,7 +620,7 @@ impl SyncManager { self.handle_new_execution_engine_state(engine_state); } _ = prune_lookups_interval.tick() => { - // TODO(tree-sync): should prune stuck lookups? + self.forward_sync.prune(); } _ = prune_requests.tick() => { self.prune_requests(); From 47c93578c418bdac5c3beb3064ab5f675c3c177d Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 24 Jul 2025 23:43:36 +0200 Subject: [PATCH 65/66] Implement headers_by_root consumer side --- .../src/service/api_types.rs | 9 + beacon_node/network/src/sync/forward_sync.rs | 259 +++++++++--------- beacon_node/network/src/sync/manager.rs | 82 +++++- .../network/src/sync/network_context.rs | 33 ++- .../src/sync/network_context/requests.rs | 2 + .../requests/headers_by_root.rs | 45 +++ 6 files changed, 288 insertions(+), 142 deletions(-) create mode 100644 beacon_node/network/src/sync/network_context/requests/headers_by_root.rs diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index 251f899da36..abed618d1b9 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -23,6 +23,8 @@ pub enum 
SyncRequestId { BlobsByRoot(BlobsByRootRequestId), /// Request searching for a set of data columns given a hash and list of column indices. DataColumnsByRoot(DataColumnsByRootRequestId), + /// Request for headers_by_root + HeadersByRoot(HeadersByRootRequestId), } #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] @@ -31,6 +33,12 @@ pub struct BlocksByRootRequestId { pub parent_request_id: BlocksByRootRequester, } +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub struct HeadersByRootRequestId { + pub id: Id, + pub parent_request_id: HeaderLookupId, +} + #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub struct HeaderLookupId { pub id: Id, @@ -210,6 +218,7 @@ impl_display!(ComponentsByRootRequestId, "{}/{}", id, requester); impl_display!(BlocksByRootRequestId, "{}/{}", id, parent_request_id); impl_display!(BlobsByRootRequestId, "{}/{}", id, parent_request_id); impl_display!(DataColumnsByRootRequestId, "{}/{}", id, parent_request_id); +impl_display!(HeadersByRootRequestId, "{}/{}", id, parent_request_id); impl_display!(SingleLookupReqId, "{}/Lookup/{}", req_id, lookup_id); impl_display!(CustodyByRootRequestId, "{}", parent_request_id); impl_display!(SamplingId, "{}/{}", sampling_request_id, id); diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index 78a602ac1b7..b084e0fa82f 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -3,7 +3,7 @@ use super::network_context::{ SyncNetworkContext, }; use crate::metrics; -use crate::sync::network_context::{BatchPeers, RpcResponseResult}; +use crate::sync::network_context::{BatchPeers, LookupVerifyError, RpcResponseResult}; use crate::sync::sync_block::{Error as SyncBlockError, OkToImport, SyncBlock, SyncBlockResult}; use crate::sync::BatchProcessResult; use beacon_chain::block_verification_types::RpcBlock; @@ -14,11 +14,10 @@ use lighthouse_network::service::api_types::{ }; use lighthouse_network::PeerId; 
use std::collections::{HashMap, HashSet, VecDeque}; -use std::sync::Arc; use std::time::Duration; use strum::IntoStaticStr; use tracing::{debug, error}; -use types::{BeaconBlockHeader, EthSpec, Hash256, SignedBeaconBlock, Slot}; +use types::{BeaconBlockHeader, EthSpec, Hash256, Slot}; const MAX_LOOKUP_COUNT: usize = 1_000_000; const PRUNE_COUNT: usize = 100_000; @@ -82,7 +81,7 @@ struct Chain { status: Status, } -type PendingBlock = (HeaderLookupId, Slot); +type PendingBlock = (Hash256, Slot); #[allow(clippy::large_enum_variant)] enum Status { @@ -127,16 +126,16 @@ enum Status { /// Tracks a request to download a BeaconBlockHeader by block root struct HeaderRequest { - id: HeaderLookupId, + id: Option, block_root: Hash256, failed_peers: HashSet, request: DownloadRequest, } impl HeaderRequest { - fn new(block_root: Hash256, id: Id) -> Self { + fn new(block_root: Hash256) -> Self { Self { - id: HeaderLookupId { id, block_root }, + id: None, block_root, failed_peers: <_>::default(), request: DownloadRequest::new(), @@ -144,7 +143,7 @@ impl HeaderRequest { } fn empty() -> Self { - Self::new(Hash256::ZERO, 0) + Self::new(Hash256::ZERO) } fn continue_request( @@ -172,10 +171,16 @@ impl HeaderRequest { return Err(Error::InternalError("No peers".to_owned())); }; + let id = self.id.get_or_insert_with(|| cx.next_id()).clone(); + + // TODO(tree-sync): send headers_by_root request if available let req_id = cx.send_blocks_by_root_request( peer, self.block_root, - BlocksByRootRequester::Header(self.id), + BlocksByRootRequester::Header(HeaderLookupId { + id, + block_root: self.block_root, + }), )?; self.request.on_download_start(req_id)?; @@ -188,12 +193,12 @@ impl HeaderRequest { // through gossip and sync. 
impl Chain { - fn new(block_root: Hash256, id: Id, initial_peers: &[PeerId]) -> Self { + fn new(block_root: Hash256, initial_peers: &[PeerId]) -> Self { Self { peers: HashSet::from_iter(initial_peers.iter().copied()), status: Status::BackfillHeaders { block_roots: vec![], - next_header_request: HeaderRequest::new(block_root, id), + next_header_request: HeaderRequest::new(block_root), }, } } @@ -237,11 +242,11 @@ impl Chain { } => Some( block_roots .first() - .map(|block| block.0.block_root) + .map(|block| block.0) .unwrap_or(next_header_request.block_root), ), Status::WaitingParentChain { block_roots, .. } => { - block_roots.first().map(|block| block.0.block_root) + block_roots.first().map(|block| block.0) } Status::ForwardSync { block, .. } => Some(*block.block_root()), } @@ -263,18 +268,16 @@ impl Chain { let next_header_request = std::mem::replace(next_header_request, HeaderRequest::empty()); - let new_block_roots = if let Some(idx) = block_roots - .iter() - .position(|b| b.0.block_root == block_root) - { - // block_roots sorting: tip first, oldest ancestor last - // We want to return the set of blocks including `block_root` and all its - // ancestors into `new_block_roots`, and keep the rest in `block_roots` - block_roots.drain(idx..).collect::>() - } else { - // TODO(tree-sync): check that block_root is the next_root or error - vec![] - }; + let new_block_roots = + if let Some(idx) = block_roots.iter().position(|b| b.0 == block_root) { + // block_roots sorting: tip first, oldest ancestor last + // We want to return the set of blocks including `block_root` and all its + // ancestors into `new_block_roots`, and keep the rest in `block_roots` + block_roots.drain(idx..).collect::>() + } else { + // TODO(tree-sync): check that block_root is the next_root or error + vec![] + }; // Mutate this chain, which keeps all descendant roots of `block_root` self.status = Status::WaitingParentChain { // This chain keeps the descendants of `block_root` so the oldest parent root 
is @@ -295,12 +298,13 @@ impl Chain { ready_to_sync, } => { let mut block_roots = std::mem::take(block_roots); - let idx = block_roots - .iter() - .position(|b| b.0.block_root == block_root) - .ok_or(InternalError(format!( - "block_root {block_root:?} no in chain" - )))?; + let idx = + block_roots + .iter() + .position(|b| b.0 == block_root) + .ok_or(InternalError(format!( + "block_root {block_root:?} no in chain" + )))?; // See comments in BackfillHeaders variant above let new_block_roots = block_roots.drain(idx..).collect::>(); let parent_root = *parent_root; @@ -395,12 +399,12 @@ impl Chain { } => Box::new( block_roots .iter() - .map(|(id, _)| &id.block_root) + .map(|(block_root, _)| block_root) // next_header_request is the oldest ancestor, so chain last .chain(std::iter::once(&next_header_request.block_root)), ), Status::WaitingParentChain { block_roots, .. } => { - Box::new(block_roots.iter().map(|(id, _)| &id.block_root)) + Box::new(block_roots.iter().map(|(block_root, _)| block_root)) } Status::ForwardSync { block, .. 
} => Box::new(std::iter::once(block.block_root())), } @@ -449,13 +453,14 @@ impl Chain { } } - fn add_ancestor(&mut self, parent_root: Hash256, id: Id) -> Result<(), InternalError> { + fn add_ancestor(&mut self, header: BeaconBlockHeader) -> Result<(), InternalError> { match &mut self.status { Status::BackfillHeaders { block_roots, next_header_request, } => { - *next_header_request = HeaderRequest::new(parent_root, id); + block_roots.push((next_header_request.block_root, header.slot)); + *next_header_request = HeaderRequest::new(header.parent_root); Ok(()) } _ => Err(InternalError( @@ -484,7 +489,7 @@ impl Chain { } } - fn pop_next_block_to_sync(&mut self) -> Option { + fn pop_next_block_to_sync(&mut self, cx: &mut SyncNetworkContext) -> Option { match &mut self.status { Status::WaitingParentChain { block_roots, @@ -494,18 +499,19 @@ impl Chain { if !*ready_to_sync { return None; } - let Some(last_block) = block_roots.pop() else { + let Some((block_root, block_slot)) = block_roots.pop() else { return None; }; let last_block_parent_root = *parent_root; - *parent_root = last_block.0.block_root; + *parent_root = block_root; + let id = cx.next_id(); let block = SyncBlock::new( // Reuse the request ID of the header for better traceability - RangeRequestId::ForwardSync(last_block.0), - last_block.0.block_root, - last_block.1, + RangeRequestId::ForwardSync(HeaderLookupId { id, block_root }), + block_root, + block_slot, &self.peers.iter().copied().collect::>(), ); @@ -521,26 +527,19 @@ impl Chain { } } - fn on_header_download( - &mut self, - req_id: BlocksByRootRequestId, - block: BeaconBlockHeader, - ) -> Result<(), Error> { + fn assert_download_req_id(&mut self, req_id: BlocksByRootRequestId) -> Result<(), Error> { match &mut self.status { Status::BackfillHeaders { next_header_request, - block_roots, + .. 
} => { // Call `on_download_success` to assert that the req_id is the expected on next_header_request.request.on_download_success( req_id, PeerId::random(), - block.clone(), + BeaconBlockHeader::empty(), Duration::from_secs(0), )?; - // Add the downloaded block - // Persist the request ID of the header for better traceability - block_roots.push((next_header_request.id, block.slot)); Ok(()) } _ => Err(Error::InternalError( @@ -840,20 +839,18 @@ impl ForwardSync { self.prune_least_popular_lookups(); } - let id = cx.next_id(); let chain_id = TipId(cx.next_id()); match peers { - [peer] => debug!(?block_root, id, %chain_id, %peer, "Creating new header lookup"), + [peer] => debug!(?block_root, %chain_id, %peer, "Creating new header lookup"), _ => debug!( ?block_root, - id, %chain_id, peers = peers.len(), "Creating new header lookup" ), } - let mut chain = Chain::new(block_root, id, peers); + let mut chain = Chain::new(block_root, peers); chain.continue_requests(cx)?; // Don't insert until first request is successful self.chains.insert(chain_id, chain); @@ -864,35 +861,34 @@ impl ForwardSync { } /// Handle the result of a header download. - pub fn on_header_download_result( + pub fn on_headers_download_result( &mut self, req_id: BlocksByRootRequestId, id: HeaderLookupId, - response: RpcResponseResult>>>, - peer_id: PeerId, + response: RpcResponseResult>, + _peer_id: PeerId, cx: &mut SyncNetworkContext, ) { - let block_root = id.block_root; - // Invoke a closure to use the ? 
operator and handle the result consistenlty let result: Result<(), Error> = (|| { - let Some(chain_id) = self.block_to_tip.get(&block_root) else { + let Some(chain_id) = self.block_to_tip.get(&id.block_root).copied() else { // TODO(tree-sync): register metric debug!(id = ?req_id, "Received header request for unknown block_root"); return Ok(()); }; - let chain = self.chains.get_mut(chain_id).ok_or(InternalError(format!( - "block_root {block_root:?} references unknown chain {chain_id}" + let chain = self.chains.get_mut(&chain_id).ok_or(InternalError(format!( + "block_root {:?} references unknown chain {chain_id}", + id.block_root )))?; let response = response.and_then(|(blocks, timestamp)| { - let block = blocks - .first() - .cloned() - .ok_or(RpcResponseError::InternalError( - "blocks_by_root response contains zero blocks".to_owned(), - ))?; - Ok((block, timestamp)) + if blocks.is_empty() { + Err(RpcResponseError::VerifyError( + LookupVerifyError::NotEnoughResponsesReturned { actual: 0 }, + )) + } else { + Ok((blocks, timestamp)) + } }); // TODO(tree-sync): add some check to make sure that distinct lookups for the same @@ -900,75 +896,78 @@ impl ForwardSync { // errors for bad state match response { - Ok((block, received)) => { - let block_header = block.message().block_header(); - let parent_root = block_header.parent_root; - - chain.on_header_download(req_id, block_header.clone())?; - - metrics::inc_counter(&metrics::SYNC_HEADERS_DOWNLOADED); + Ok((headers, _received)) => { + chain.assert_download_req_id(req_id)?; debug!(%req_id, %chain_id, "Forward sync block header downloaded success"); - // Once we discover the parent_root of this block three things can happen - // 1. The parent root is a known block -> stop - // 2. We conflicts with finality -> reject - // 3. 
The parent root is unknown -> continue search - - // TODO(tree-sync): should check if the block is descendant of finalized - // TODO(tree-sync): on finalization or every interval we should drop branches that - // conflict with finality - let finalized_checkpoint = cx.chain.head().finalized_checkpoint(); - - // TODO(tree-sync): check that the slots are decreasing, so we don't end up in - // an infinite loop. But note that the wrong block will be the descendant. - // - We get header A with parent B and slot 10 - // - We get header B with parent C and slot 11 - // - That makes header A invalid - - if block_header.slot - <= finalized_checkpoint - .epoch - .start_slot(T::EthSpec::slots_per_epoch()) - && block_root != finalized_checkpoint.root - { - return Err(Error::BlockConflictsWithFinality(format!( - "Block {:?} {} conflicts with finalized checkpoint {:?}", - block_root, block_header.slot, finalized_checkpoint - ))); - } + for header in headers { + let parent_root = header.parent_root; + let block_root = header.canonical_root(); + chain.add_ancestor(header.clone())?; + + metrics::inc_counter(&metrics::SYNC_HEADERS_DOWNLOADED); + + // Once we discover the parent_root of this block three things can happen + // 1. The parent root is a known block -> stop + // 2. We conflicts with finality -> reject + // 3. The parent root is unknown -> continue search + + // TODO(tree-sync): should check if the block is descendant of finalized + // TODO(tree-sync): on finalization or every interval we should drop branches that + // conflict with finality + let finalized_checkpoint = cx.chain.head().finalized_checkpoint(); + + // TODO(tree-sync): check that the slots are decreasing, so we don't end up in + // an infinite loop. But note that the wrong block will be the descendant. 
+ // - We get header A with parent B and slot 10 + // - We get header B with parent C and slot 11 + // - That makes header A invalid + + if header.slot + <= finalized_checkpoint + .epoch + .start_slot(T::EthSpec::slots_per_epoch()) + && block_root != finalized_checkpoint.root + { + return Err(Error::BlockConflictsWithFinality(format!( + "Block {:?} {} conflicts with finalized checkpoint {:?}", + block_root, header.slot, finalized_checkpoint + ))); + } - if cx.chain.block_is_known_to_fork_choice(&parent_root) { - // Parent is imported, we can forward sync this chain - // Stop search we reached a known block - chain.to_waiting_parent(parent_root, true)?; - debug!(%chain_id, ?parent_root, block_count = chain.block_count(), "Forward sync chain reached imported block"); - // Trigger potential foward sync for this chain - self.continue_requests(cx); - } else if let Some(parent_chain_id) = self.block_to_tip.get(&parent_root) { - // Parent is part of another chain, stop search - // Stop search we reached a known block - chain.to_waiting_parent(parent_root, false)?; - debug!(%chain_id, %parent_chain_id, ?parent_root, "Forward sync chain reached known block"); - // TODO(tree-sync): Add peers recursively to the chain_id, potentially - // splitting the chain when adding peers. - } else { - chain.add_ancestor(block_header.parent_root, cx.next_id())?; - debug!(%chain_id, ?parent_root, "Forward sync chain continues fetching ancestor"); - // Add to the block_to_tip mapping to respect the invariant "Each block - // root exists in exactly one `Chain::block_roots` list". - self.block_to_tip.insert(parent_root, *chain_id); - // Since the block already points to `chain` we don't need to add peers. - // Just trigger header download for this new root. 
- self.continue_requests(cx); + if cx.chain.block_is_known_to_fork_choice(&parent_root) { + // Parent is imported, we can forward sync this chain + // Stop search we reached a known block + chain.to_waiting_parent(parent_root, true)?; + debug!(%chain_id, ?parent_root, block_count = chain.block_count(), "Forward sync chain reached imported block"); + // Trigger potential forward sync for this chain + self.continue_requests(cx); + break; + } else if let Some(parent_chain_id) = self.block_to_tip.get(&parent_root) { + // Parent is part of another chain, stop search + // Stop search we reached a known block + chain.to_waiting_parent(parent_root, false)?; + debug!(%chain_id, %parent_chain_id, ?parent_root, "Forward sync chain reached known block"); + // TODO(tree-sync): Add peers recursively to the chain_id, potentially + // splitting the chain when adding peers. + break; + } else { + debug!(%chain_id, ?parent_root, "Forward sync chain continues fetching ancestor"); + // Add to the block_to_tip mapping to respect the invariant "Each block + // root exists in exactly one `Chain::block_roots` list". + self.block_to_tip.insert(parent_root, chain_id); + // Since the block already points to `chain` we don't need to add peers. + // Just trigger header download for this new root. 
+ } } } Err(e) => { // Request errors are logged in `SyncNetworkContext::on_rpc_response_result` chain.header_request()?.on_download_error(req_id, Some(e))?; // Continue this request to potentially resend the header request - self.continue_requests(cx); } } + self.continue_requests(cx); Ok(()) })(); @@ -1128,7 +1127,7 @@ impl ForwardSync { let mut new_chains = vec![]; 'o: for (chain_id, chain) in chains_by_peer_count { - while let Some(new_chain) = chain.pop_next_block_to_sync() { + while let Some(new_chain) = chain.pop_next_block_to_sync(cx) { let new_chain_id = TipId(cx.next_id()); // Update all block references to the new chain for block_root in new_chain.iter_block_roots() { @@ -1358,7 +1357,7 @@ impl ForwardSync { next_header_request, } => { format!( - "BackfillHeaders block_roots {block_roots:?} next_header_request {} {} {}", + "BackfillHeaders block_roots {block_roots:?} next_header_request {:?} {} {}", next_header_request.id, next_header_request.block_root, next_header_request.request.status_str() diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 765d532c3d7..6b75eb506d8 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -54,8 +54,8 @@ use futures::StreamExt; use lighthouse_network::rpc::RPCError; use lighthouse_network::service::api_types::{ BlobsByRootRequestId, BlocksByRootRequestId, BlocksByRootRequester, ComponentsByRootRequestId, - CustodyByRootRequestId, DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, SamplingId, - SamplingRequester, SyncRequestId, + CustodyByRootRequestId, DataColumnsByRootRequestId, DataColumnsByRootRequester, + HeadersByRootRequestId, Id, SamplingId, SamplingRequester, SyncRequestId, }; use lighthouse_network::types::{NetworkGlobals, SyncState}; use lighthouse_network::{PeerId, SyncInfo}; @@ -67,7 +67,8 @@ use std::time::Duration; use tokio::sync::mpsc; use tracing::{debug, error, info, info_span, trace, warn, 
Instrument}; use types::{ - BlobSidecar, DataColumnSidecar, EthSpec, ForkContext, Hash256, SignedBeaconBlock, Slot, + BeaconBlockHeader, BlobSidecar, DataColumnSidecar, EthSpec, ForkContext, Hash256, + SignedBeaconBlock, Slot, }; #[cfg(test)] @@ -120,6 +121,13 @@ pub enum SyncMessage { seen_timestamp: Duration, }, + BlockHeader { + id: SyncRequestId, + peer_id: PeerId, + header: Option, + seen_timestamp: Duration, + }, + /// A block with an unknown parent has been received. UnknownParentBlock(PeerId, Arc>, Hash256), @@ -411,6 +419,9 @@ impl SyncManager { SyncRequestId::DataColumnsByRoot(req_id) => { self.on_data_columns_by_root_response(req_id, peer_id, RpcEvent::RPCError(error)) } + SyncRequestId::HeadersByRoot(req_id) => { + self.on_headers_by_root_response(req_id, peer_id, RpcEvent::RPCError(error)) + } } } @@ -667,6 +678,12 @@ impl SyncManager { } => { self.rpc_data_column_received(sync_request_id, peer_id, data_column, seen_timestamp) } + SyncMessage::BlockHeader { + id, + peer_id, + header, + seen_timestamp, + } => self.rpc_block_header_received(id, peer_id, header, seen_timestamp), SyncMessage::UnknownParentBlock(peer_id, block, block_root) => { let block_slot = block.slot(); let parent_root = block.parent_root(); @@ -716,8 +733,8 @@ impl SyncManager { error, } => self.inject_error(peer_id, sync_request_id, error), SyncMessage::GossipBlockProcessResult { - block_root, - imported, + block_root: _, + imported: _, } => { // Not used } @@ -877,10 +894,16 @@ impl SyncManager { { match req_id.parent_request_id { BlocksByRootRequester::Header(lookup_id) => { - self.forward_sync.on_header_download_result( + self.forward_sync.on_headers_download_result( req_id, lookup_id, - result, + result.map(|(blocks, seen_timestamp)| { + let blocks = blocks + .into_iter() + .map(|block| block.message().block_header()) + .collect::>(); + (blocks, seen_timestamp) + }), peer_id, &mut self.network, ); @@ -952,6 +975,27 @@ impl SyncManager { } } + fn rpc_block_header_received( + &mut 
self, + id: SyncRequestId, + peer_id: PeerId, + header: Option, + seen_timestamp: Duration, + ) { + match id { + SyncRequestId::HeadersByRoot(req_id) => { + self.on_headers_by_root_response( + req_id, + peer_id, + RpcEvent::from_chunk(header, seen_timestamp), + ); + } + _ => { + crit!(%peer_id, "bad request id for beacon_block_header"); + } + } + } + fn on_data_columns_by_root_response( &mut self, req_id: DataColumnsByRootRequestId, @@ -983,6 +1027,30 @@ impl SyncManager { } } + fn on_headers_by_root_response( + &mut self, + req_id: HeadersByRootRequestId, + peer_id: PeerId, + header: RpcEvent, + ) { + if let Some(resp) = self + .network + .on_headers_by_root_response(req_id, peer_id, header) + { + self.forward_sync.on_headers_download_result( + // TODO(tree-sync): handle the two type of requests with distinct IDs + BlocksByRootRequestId { + id: req_id.id, + parent_request_id: BlocksByRootRequester::Header(req_id.parent_request_id), + }, + req_id.parent_request_id, + resp, + peer_id, + &mut self.network, + ); + } + } + fn on_custody_by_root_result( &mut self, id: CustodyByRootRequestId, diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index f64551016a2..5250803fca3 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -25,14 +25,14 @@ pub use lighthouse_network::service::api_types::RangeRequestId; use lighthouse_network::service::api_types::{ AppRequestId, BlobsByRootRequestId, BlocksByRootRequestId, BlocksByRootRequester, ComponentsByRootRequestId, CustodyByRootRequestId, DataColumnsByRootRequestId, - DataColumnsByRootRequester, Id, SyncRequestId, + DataColumnsByRootRequester, HeadersByRootRequestId, Id, SyncRequestId, }; use lighthouse_network::{Client, NetworkGlobals, PeerAction, PeerId, ReportSource}; use parking_lot::RwLock; pub use requests::LookupVerifyError; use requests::{ ActiveRequests, BlobsByRootRequestItems, 
BlocksByRootRequestItems, - DataColumnsByRootRequestItems, + DataColumnsByRootRequestItems, HeadersByRootRequestItems, }; #[cfg(test)] use slot_clock::SlotClock; @@ -45,9 +45,9 @@ use task_executor::TaskExecutor; use tokio::sync::mpsc; use tracing::{debug, span, warn, Level}; use types::{ - BlobIdentifier, BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, - DataColumnsByRootIdentifier, EthSpec, ForkContext, ForkName, Hash256, RuntimeVariableList, - SignedBeaconBlock, + BeaconBlockHeader, BlobIdentifier, BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, + DataColumnSidecarList, DataColumnsByRootIdentifier, EthSpec, ForkContext, ForkName, Hash256, + RuntimeVariableList, SignedBeaconBlock, }; pub mod block_components_by_range; @@ -217,6 +217,8 @@ pub struct SyncNetworkContext { /// A mapping of active DataColumnsByRoot requests data_columns_by_root_requests: ActiveRequests>, + /// A mapping of active HeadersByRoot requests + headers_by_root_requests: ActiveRequests, /// Mapping of active custody column by root requests for a block root custody_by_root_requests: FnvHashMap>, @@ -301,6 +303,7 @@ impl SyncNetworkContext { blocks_by_root_requests: ActiveRequests::new("blocks_by_root"), blobs_by_root_requests: ActiveRequests::new("blobs_by_root"), data_columns_by_root_requests: ActiveRequests::new("data_columns_by_root"), + headers_by_root_requests: ActiveRequests::new("headers_by_root"), custody_by_root_requests: <_>::default(), block_components_by_root_requests: <_>::default(), network_beacon_processor, @@ -333,6 +336,7 @@ impl SyncNetworkContext { blocks_by_root_requests, blobs_by_root_requests, data_columns_by_root_requests, + headers_by_root_requests, // custody_by_root_requests is a meta request of data_columns_by_root_requests custody_by_root_requests: _, // components_by_root_requests is a meta request of various _by_root requests @@ -352,10 +356,14 @@ impl SyncNetworkContext { let data_column_by_root_ids = 
data_columns_by_root_requests .active_requests() .map(|(id, peer)| (SyncRequestId::DataColumnsByRoot(*id), peer)); + let headers_by_root_ids = headers_by_root_requests + .active_requests() + .map(|(id, peer)| (SyncRequestId::HeadersByRoot(*id), peer)); blocks_by_root_ids .chain(blobs_by_root_ids) .chain(data_column_by_root_ids) + .chain(headers_by_root_ids) } #[cfg(test)] @@ -428,6 +436,7 @@ impl SyncNetworkContext { blocks_by_root_requests, blobs_by_root_requests, data_columns_by_root_requests, + headers_by_root_requests, // custody_by_root_requests is a meta request of data_columns_by_root_requests custody_by_root_requests: _, // components_by_range_requests is a meta request of various _by_range requests @@ -446,6 +455,7 @@ impl SyncNetworkContext { .iter_request_peers() .chain(blobs_by_root_requests.iter_request_peers()) .chain(data_columns_by_root_requests.iter_request_peers()) + .chain(headers_by_root_requests.iter_request_peers()) { *active_request_count_by_peer.entry(peer_id).or_default() += 1; } @@ -847,6 +857,19 @@ impl SyncNetworkContext { self.on_rpc_response_result(resp, peer_id) } + /// Processes a single `RpcEvent` for a headers_by_root RPC request. + /// Same logic as [`on_blocks_by_root_response`] + #[allow(clippy::type_complexity)] + pub(crate) fn on_headers_by_root_response( + &mut self, + id: HeadersByRootRequestId, + peer_id: PeerId, + rpc_event: RpcEvent, + ) -> Option>> { + let resp = self.headers_by_root_requests.on_response(id, rpc_event); + self.on_rpc_response_result(resp, peer_id) + } + /// Common logic for `on_*_response` handlers. Ensures we have consistent logging and metrics /// and peer reporting for all request types. 
fn on_rpc_response_result( diff --git a/beacon_node/network/src/sync/network_context/requests.rs b/beacon_node/network/src/sync/network_context/requests.rs index cc9d7bbb372..e9eb8654bb2 100644 --- a/beacon_node/network/src/sync/network_context/requests.rs +++ b/beacon_node/network/src/sync/network_context/requests.rs @@ -11,6 +11,7 @@ use types::{Hash256, Slot}; pub use blobs_by_root::BlobsByRootRequestItems; pub use blocks_by_root::BlocksByRootRequestItems; pub use data_columns_by_root::DataColumnsByRootRequestItems; +pub use headers_by_root::HeadersByRootRequestItems; use crate::metrics; @@ -19,6 +20,7 @@ use super::{RpcEvent, RpcResponseResult}; mod blobs_by_root; mod blocks_by_root; mod data_columns_by_root; +mod headers_by_root; #[derive(Debug, Clone, PartialEq, Eq, IntoStaticStr)] pub enum LookupVerifyError { diff --git a/beacon_node/network/src/sync/network_context/requests/headers_by_root.rs b/beacon_node/network/src/sync/network_context/requests/headers_by_root.rs new file mode 100644 index 00000000000..7620c72fdca --- /dev/null +++ b/beacon_node/network/src/sync/network_context/requests/headers_by_root.rs @@ -0,0 +1,45 @@ +use super::{ActiveRequestItems, LookupVerifyError}; +use types::{BeaconBlockHeader, Hash256}; + +pub struct HeadersByRootRequestItems { + next_block_root: Hash256, + max_count: usize, + items: Vec, +} + +impl HeadersByRootRequestItems { + pub fn new(block_root: Hash256, max_count: usize) -> Self { + Self { + next_block_root: block_root, + max_count, + items: vec![], + } + } +} + +impl ActiveRequestItems for HeadersByRootRequestItems { + type Item = BeaconBlockHeader; + + /// Append a response to the single chunk request. If the chunk is valid, the request is + /// resolved immediately. 
+ /// The active request SHOULD be dropped after `add_response` returns an error + fn add(&mut self, header: Self::Item) -> Result { + let block_root = header.canonical_root(); + if self.next_block_root != block_root { + return Err(LookupVerifyError::UnrequestedBlockRoot(block_root)); + } + + if self.items.len() >= self.max_count { + return Err(LookupVerifyError::TooManyResponses); + } + + self.next_block_root = header.parent_root; + self.items.push(header); + + Ok(false) + } + + fn consume(&mut self) -> Vec { + std::mem::take(&mut self.items) + } +} From cdfb74fa880bf93f40966824224deaef7dd73fb8 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 27 Nov 2025 19:09:03 -0300 Subject: [PATCH 66/66] Simplify forward sync design --- .../src/service/api_types.rs | 20 +- beacon_node/network/src/metrics.rs | 12 +- .../network_beacon_processor/sync_methods.rs | 4 +- beacon_node/network/src/sync/forward_sync.rs | 1254 +++++++---------- 4 files changed, 538 insertions(+), 752 deletions(-) diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index abed618d1b9..5b1281efe06 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -42,12 +42,21 @@ pub struct HeadersByRootRequestId { #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub struct HeaderLookupId { pub id: Id, - pub block_root: Hash256, + pub chain_id: HeaderChainId, } +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, PartialOrd, Ord)] +pub struct HeaderChainId(pub Id); + #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub struct BatchId(pub Id); +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub struct ForwardSyncLookupId { + pub id: Id, + pub block_root: Hash256, +} + /// Request ID for data_columns_by_root requests. Block lookups do not issue this request directly. 
/// Wrapping this particular req_id, ensures not mixing this request with a custody req_id. #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] @@ -78,7 +87,7 @@ pub struct ComponentsByRootRequestId { /// Range sync chain or backfill batch #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub enum RangeRequestId { - ForwardSync(HeaderLookupId), + ForwardSync(ForwardSyncLookupId), BackfillSync(Id), } @@ -225,6 +234,7 @@ impl_display!(SamplingId, "{}/{}", sampling_request_id, id); // Print only the ID to make logs succint. On lookup creation we log the ID and the block root to // link them. impl_display!(HeaderLookupId, "{}", id); +impl_display!(ForwardSyncLookupId, "{}/{}", id, block_root); impl Display for DataColumnsByRootRequester { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { @@ -235,6 +245,12 @@ impl Display for DataColumnsByRootRequester { } } +impl Display for HeaderChainId { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + impl Display for BatchId { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) diff --git a/beacon_node/network/src/metrics.rs b/beacon_node/network/src/metrics.rs index 925917bb2ce..2588ca6b12c 100644 --- a/beacon_node/network/src/metrics.rs +++ b/beacon_node/network/src/metrics.rs @@ -535,10 +535,16 @@ pub static SYNC_HEADER_MAX_SLOT: LazyLock> = LazyLock::new(|| { pub static SYNC_HEADERS_COUNT: LazyLock> = LazyLock::new(|| { try_create_int_gauge("sync_headers_count", "Current count of headers in memory") }); -pub static SYNC_CHAINS_COUNT: LazyLock> = LazyLock::new(|| { +pub static SYNC_HEADER_CHAINS_COUNT: LazyLock> = LazyLock::new(|| { try_create_int_gauge( - "sync_chains_count", - "Current count of forward sync chains in memory", + "sync_header_chains_count", + "Current count of header chains in memory", + ) +}); +pub static SYNC_FORWARD_SYNC_BLOCKS_COUNT: LazyLock> = LazyLock::new(|| { + try_create_int_gauge( + 
"sync_forward_sync_blocks_count", + "Current count of forward sync blocks in memory", ) }); pub static SYNC_CHAIN_MERGES_COUNT: LazyLock> = LazyLock::new(|| { diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 01df2304b4b..a3ab3459709 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -8,7 +8,7 @@ use beacon_chain::data_column_verification::verify_kzg_for_data_column_list; use beacon_chain::{ BeaconChainTypes, BlockError, ChainSegmentResult, HistoricalBlockError, NotifyExecutionLayer, }; -use lighthouse_network::service::api_types::{HeaderLookupId, Id}; +use lighthouse_network::service::api_types::{ForwardSyncLookupId, Id}; use lighthouse_network::PeerAction; use std::collections::HashMap; use std::fmt::{Display, Formatter}; @@ -21,7 +21,7 @@ use types::{ColumnIndex, DataColumnSidecar, Hash256}; #[derive(Clone, Debug, PartialEq)] pub enum ChainSegmentProcessId { /// Processing Id of a range syncing batch. - ForwardSync(HeaderLookupId), + ForwardSync(ForwardSyncLookupId), /// Processing ID for a backfill syncing batch. 
BackfillSync(Id), } diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs index b084e0fa82f..56b2ebc1a77 100644 --- a/beacon_node/network/src/sync/forward_sync.rs +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -9,12 +9,11 @@ use crate::sync::BatchProcessResult; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::BeaconChainTypes; use lighthouse_network::service::api_types::{ - BlocksByRootRequestId, BlocksByRootRequester, ComponentsByRootRequestId, HeaderLookupId, Id, - RangeRequestId, + BlocksByRootRequestId, BlocksByRootRequester, ComponentsByRootRequestId, ForwardSyncLookupId, + HeaderChainId, HeaderLookupId, Id, RangeRequestId, }; use lighthouse_network::PeerId; use std::collections::{HashMap, HashSet, VecDeque}; -use std::time::Duration; use strum::IntoStaticStr; use tracing::{debug, error}; use types::{BeaconBlockHeader, EthSpec, Hash256, Slot}; @@ -23,8 +22,11 @@ const MAX_LOOKUP_COUNT: usize = 1_000_000; const PRUNE_COUNT: usize = 100_000; const BLOCK_BUFFER_SIZE: usize = 4; -#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, PartialOrd, Ord)] -struct TipId(u32); +#[derive(Debug, Copy, Clone)] +enum BlockPointer { + HeaderChain(HeaderChainId), + SyncBlock(Hash256), +} /// Roots are added to ForwardSync via: /// 1. Peers referencing an unknown block root @@ -71,18 +73,48 @@ struct TipId(u32); /// /// pub struct ForwardSync { - block_to_tip: HashMap, - chains: HashMap>, + block_to_tip: HashMap, + header_chains: HashMap, + syncing_blocks: HashMap>, } /// Chain of consecutive blocks that are imported by the same set of peers struct Chain { - peers: HashSet, status: Status, } type PendingBlock = (Hash256, Slot); +#[derive(Copy, Clone, Debug)] +pub struct PeerStatusSummary { + pub max_slot: Slot, + pub min_slot: Slot, +} + +struct HeaderChain { + id: HeaderChainId, + /// Headers descendant of `next_header_request.block_root` that are already downloaded. 
+ /// Does not include `next_header_request.block_root`. + /// Sorting: tip first, oldest ancestor last + block_roots: VecDeque, + status: HeaderChainStatus, + /// Peers that claim to have imported the oldest ancestor of this chain + peers: HashMap, +} + +enum HeaderChainStatus { + Backfill { + /// Oldest ancestor block root of this Chain. + next_request: HeaderRequest, + }, + WaitingParent { + /// Parent root of the last block_root in `block_roots` + parent_root: Hash256, + /// True if the oldest ancestor can start downloading + ready_to_sync: bool, + }, +} + #[allow(clippy::large_enum_variant)] enum Status { /// Recursively fetch headers until discovering a parent_root that is known. Its list of @@ -127,15 +159,17 @@ enum Status { /// Tracks a request to download a BeaconBlockHeader by block root struct HeaderRequest { id: Option, + chain_id: HeaderChainId, block_root: Hash256, failed_peers: HashSet, request: DownloadRequest, } impl HeaderRequest { - fn new(block_root: Hash256) -> Self { + fn new(block_root: Hash256, chain_id: HeaderChainId) -> Self { Self { id: None, + chain_id, block_root, failed_peers: <_>::default(), request: DownloadRequest::new(), @@ -143,17 +177,20 @@ impl HeaderRequest { } fn empty() -> Self { - Self::new(Hash256::ZERO) + Self::new(Hash256::ZERO, HeaderChainId(0)) } - fn continue_request( + fn continue_request( &mut self, - peers: &HashSet, + peers: I, cx: &mut SyncNetworkContext, - ) -> Result<(), Error> { + ) -> Result<(), Error> + where + T: BeaconChainTypes, + I: Iterator, + { if self.request.is_awaiting_download() { let Some(peer) = peers - .iter() .map(|peer| { ( // If contains -> 1 (order after), not contains -> 0 (order first) @@ -179,7 +216,7 @@ impl HeaderRequest { self.block_root, BlocksByRootRequester::Header(HeaderLookupId { id, - block_root: self.block_root, + chain_id: self.chain_id, }), )?; @@ -189,432 +226,156 @@ impl HeaderRequest { } } -// TODO(tree-sync): Re-add the reprocessing cache, so we don't process twice a block 
that we got -// through gossip and sync. - -impl Chain { - fn new(block_root: Hash256, initial_peers: &[PeerId]) -> Self { +impl HeaderChain { + fn new( + initial_block_root: Hash256, + id: HeaderChainId, + initial_peers: &[(PeerId, PeerStatusSummary)], + ) -> Self { Self { - peers: HashSet::from_iter(initial_peers.iter().copied()), - status: Status::BackfillHeaders { - block_roots: vec![], - next_header_request: HeaderRequest::new(block_root), + id, + block_roots: <_>::default(), + status: HeaderChainStatus::Backfill { + next_request: HeaderRequest::new(initial_block_root, id), }, + peers: HashMap::from_iter(initial_peers.iter().copied()), } } - /// Returns whether the value was newly inserted - fn add_peer(&mut self, peer: PeerId) -> bool { - self.peers.insert(peer) - } - - /// Returns whether the value was present in the set. - fn remove_peer(&mut self, peer: &PeerId) -> bool { - self.peers.remove(peer) - } - - /// Returns a Vec of peers that have imported the blocks in this chain - fn get_peers(&self) -> Vec { - self.peers.iter().copied().collect() - } - - /// Returns the count of peers that have imported the blocks in this chain - fn peer_count(&self) -> usize { - self.peers.len() - } - - /// Returns the parent root of the oldest ancestor of this chain. Returns None if the chain is - /// already processing = its parent has already been imported. - fn parent_root(&self) -> Option { - match &self.status { - Status::BackfillHeaders { .. } => None, - Status::WaitingParentChain { parent_root, .. } => Some(*parent_root), - Status::ForwardSync { parent_root, .. } => Some(*parent_root), - } - } - - /// Returns the tip of this chain. Returns None if the chain is empty (should not happen) - fn tip(&self) -> Option { - match &self.status { - Status::BackfillHeaders { - next_header_request, - block_roots, - } => Some( - block_roots - .first() - .map(|block| block.0) - .unwrap_or(next_header_request.block_root), - ), - Status::WaitingParentChain { block_roots, .. 
} => { - block_roots.first().map(|block| block.0) - } - Status::ForwardSync { block, .. } => Some(*block.block_root()), - } - } - - /// Split chain by `block_root` returning a new Self that includes `block_root` and all of its - /// ancestors, and leaves `self` with only the descendants of `block_root` excluding - /// `block_root` - fn split_by(&mut self, block_root: Hash256) -> Result { - // TODO(tree-sync): Review this logic, it's sensitive and not trivial - // TODO(tree-sync): write a prop test for this, check milhouse tests as inspo - let status = match &mut self.status { - Status::BackfillHeaders { - block_roots, - next_header_request, - } => { - // Take ownership of BackfillHeaders fields without having to add a Poisoned state - let mut block_roots = std::mem::take(block_roots); - let next_header_request = - std::mem::replace(next_header_request, HeaderRequest::empty()); - - let new_block_roots = - if let Some(idx) = block_roots.iter().position(|b| b.0 == block_root) { - // block_roots sorting: tip first, oldest ancestor last - // We want to return the set of blocks including `block_root` and all its - // ancestors into `new_block_roots`, and keep the rest in `block_roots` - block_roots.drain(idx..).collect::>() - } else { - // TODO(tree-sync): check that block_root is the next_root or error - vec![] - }; - // Mutate this chain, which keeps all descendant roots of `block_root` - self.status = Status::WaitingParentChain { - // This chain keeps the descendants of `block_root` so the oldest parent root is - // `block_root` - parent_root: block_root, - // `block_roots` has been mutated to have only the descendants of `block_root` - block_roots, - ready_to_sync: false, - }; - Status::BackfillHeaders { - block_roots: new_block_roots, - next_header_request, - } - } - Status::WaitingParentChain { - parent_root, - block_roots, - ready_to_sync, - } => { - let mut block_roots = std::mem::take(block_roots); - let idx = - block_roots - .iter() - .position(|b| b.0 == 
block_root) - .ok_or(InternalError(format!( - "block_root {block_root:?} no in chain" - )))?; - // See comments in BackfillHeaders variant above - let new_block_roots = block_roots.drain(idx..).collect::>(); - let parent_root = *parent_root; - let ready_to_sync = *ready_to_sync; - // Mutate this chain, which keeps all descendant roots of `block_root` - self.status = Status::WaitingParentChain { - // This chain keeps the descendants of `block_root` so the oldest parent root is - // `block_root` - parent_root: block_root, - block_roots, - ready_to_sync: false, - }; - Status::WaitingParentChain { - parent_root, - block_roots: new_block_roots, - ready_to_sync, - } - } - Status::ForwardSync { .. } => { - todo!("cannot split single block"); - } - }; - - Ok(Self { - peers: self.peers.clone(), - status, - }) - } - - /// Given another chain whose parent is the tip of this chain, merge the block of `other` into - /// `self`. - fn merge(&mut self, child_chain: Self) -> Result<(), InternalError> { - let Status::WaitingParentChain { - block_roots: child_block_roots, - .. - } = child_chain.status - else { - return Err(InternalError("Other not in WaitingParentChain".to_string())); - }; - + /// Continues the header or blocks requests of this chain + fn continue_requests( + &mut self, + cx: &mut SyncNetworkContext, + ) -> Result<(), Error> { match &mut self.status { - Status::BackfillHeaders { block_roots, .. } - | Status::WaitingParentChain { block_roots, .. } => { - // child_block_roots and block_roots are sorted as tip first, so do - // child_block_roots + block_roots - *block_roots = child_block_roots - .into_iter() - .chain(block_roots.drain(..)) - .collect::>(); - Ok(()) - } - Status::ForwardSync { .. } => { - Err(InternalError("Cannot merge into ForwardSync".to_string())) + HeaderChainStatus::Backfill { next_request } => { + Ok(next_request.continue_request(self.peers.keys(), cx)?) 
} + _ => Ok(()), } } - /// Return true if this chain is awaiting `block_root` - fn to_ready_to_sync(&mut self, block_root: &Hash256) -> bool { + fn add_ancestor(&mut self, header: BeaconBlockHeader) -> Result<(), InternalError> { match &mut self.status { - Status::BackfillHeaders { .. } => false, - Status::WaitingParentChain { - parent_root, - ready_to_sync, - .. - } => { - if block_root == parent_root && !*ready_to_sync { - *ready_to_sync = true; - true - } else { - false - } + HeaderChainStatus::Backfill { next_request, .. } => { + self.block_roots + .push_back((next_request.block_root, header.slot)); + *next_request = HeaderRequest::new(header.parent_root, self.id); + Ok(()) } - Status::ForwardSync { .. } => false, - } - } - - fn block_count(&self) -> usize { - match &self.status { - Status::BackfillHeaders { block_roots, .. } - | Status::WaitingParentChain { block_roots, .. } => block_roots.len(), - Status::ForwardSync { .. } => 1, + _ => Err(InternalError( + "Expected lookup to be in DownloadingHeader state".to_owned(), + )), } } - /// Returns all block roots part of this chain, in descending slot order - fn iter_block_roots(&self) -> Box + '_> { - match &self.status { - Status::BackfillHeaders { - block_roots, - next_header_request, - } => Box::new( - block_roots - .iter() - .map(|(block_root, _)| block_root) - // next_header_request is the oldest ancestor, so chain last - .chain(std::iter::once(&next_header_request.block_root)), - ), - Status::WaitingParentChain { block_roots, .. } => { - Box::new(block_roots.iter().map(|(block_root, _)| block_root)) - } - Status::ForwardSync { block, .. 
} => Box::new(std::iter::once(block.block_root())), + fn extend_with_children(&mut self, mut child_chain: Self) { + while let Some(block) = child_chain.block_roots.pop_back() { + // pop_back gives oldest first, pushing to front restores tip-first + self.block_roots.push_front(block); } - } - /// Returns true if this chain has no blocks - fn is_empty(&self) -> bool { - self.iter_block_roots().next().is_none() + // All the peers of the child chain have imported the ancestors + self.peers.extend(child_chain.peers.drain()); } - fn min_slot(&self) -> Option { - match &self.status { - Status::BackfillHeaders { block_roots, .. } - | Status::WaitingParentChain { block_roots, .. } => block_roots.last().map(|b| b.1), - Status::ForwardSync { block, .. } => Some(block.slot()), - } - } - - fn max_slot(&self) -> Option { - match &self.status { - Status::BackfillHeaders { block_roots, .. } - | Status::WaitingParentChain { block_roots, .. } => block_roots.first().map(|b| b.1), - Status::ForwardSync { block, .. } => Some(block.slot()), - } + fn to_waiting_parent( + &mut self, + parent_root: Hash256, + ready_to_sync: bool, + ) -> Result<(), Error> { + self.status = HeaderChainStatus::WaitingParent { + parent_root, + ready_to_sync, + }; + Ok(()) } - fn syncing_blocks_count(&self) -> usize { + fn parent_root(&self) -> Option { match &self.status { - Status::BackfillHeaders { .. } => 0, - Status::WaitingParentChain { .. } => 0, - Status::ForwardSync { .. } => 1, + HeaderChainStatus::Backfill { .. } => None, + HeaderChainStatus::WaitingParent { parent_root, .. } => Some(*parent_root), } } - fn header_request( - &mut self, - ) -> Result<&mut DownloadRequest, Error> { - match &mut self.status { - Status::BackfillHeaders { - next_header_request, - .. 
- } => Ok(&mut next_header_request.request), - _ => Err(Error::InternalError( - "Expected lookup to be in DownloadingHeader state".to_owned(), - )), - } - } - - fn add_ancestor(&mut self, header: BeaconBlockHeader) -> Result<(), InternalError> { - match &mut self.status { - Status::BackfillHeaders { - block_roots, - next_header_request, - } => { - block_roots.push((next_header_request.block_root, header.slot)); - *next_header_request = HeaderRequest::new(header.parent_root); - Ok(()) - } - _ => Err(InternalError( - "Expected lookup to be in DownloadingHeader state".to_owned(), - )), - } + /// Returns true if the peer has been added to the map + fn add_peer(&mut self, peer: PeerId, status: PeerStatusSummary) -> bool { + let contains_key = self.peers.contains_key(&peer); + self.peers.insert(peer, status); + !contains_key } - fn to_waiting_parent( - &mut self, - parent_root: Hash256, - ready_to_sync: bool, - ) -> Result<(), Error> { - match &mut self.status { - Status::BackfillHeaders { block_roots, .. 
} => { - self.status = Status::WaitingParentChain { - parent_root, - block_roots: std::mem::take(block_roots), - ready_to_sync, - }; - Ok(()) - } - _ => Err(Error::InternalError( - "Expected lookup to be in DownloadingHeader state".to_owned(), - )), - } + /// Returns true if a peer was removed from the map + fn remove_peer(&mut self, peer: &PeerId) -> bool { + self.peers.remove(peer).is_some() } - fn pop_next_block_to_sync(&mut self, cx: &mut SyncNetworkContext) -> Option { + fn pop_oldest_ancestor(&mut self) -> Option { match &mut self.status { - Status::WaitingParentChain { - block_roots, + HeaderChainStatus::WaitingParent { parent_root, ready_to_sync, } => { if !*ready_to_sync { return None; } - let Some((block_root, block_slot)) = block_roots.pop() else { - return None; - }; - - let last_block_parent_root = *parent_root; - *parent_root = block_root; - - let id = cx.next_id(); - let block = SyncBlock::new( - // Reuse the request ID of the header for better traceability - RangeRequestId::ForwardSync(HeaderLookupId { id, block_root }), - block_root, - block_slot, - &self.peers.iter().copied().collect::>(), - ); - - Some(Self { - peers: self.peers.clone(), - status: Status::ForwardSync { - block, - parent_root: last_block_parent_root, - }, - }) + if let Some((block_root, block_slot)) = self.block_roots.pop_back() { + *parent_root = block_root; + Some((block_root, block_slot)) + } else { + None + } } _ => None, } } - fn assert_download_req_id(&mut self, req_id: BlocksByRootRequestId) -> Result<(), Error> { + fn peers_of_block_slot(&self, block_slot: Slot) -> Vec { + self.peers + .iter() + .filter(|(_, status)| block_slot >= status.min_slot && block_slot < status.max_slot) + .map(|(peer, _)| *peer) + .collect() + } + + /// Returns true if this chain transitioned into ready to sync + fn on_parent_imported(&mut self, imported_block_root: &Hash256) -> bool { match &mut self.status { - Status::BackfillHeaders { - next_header_request, - .. 
+ HeaderChainStatus::WaitingParent { + parent_root, + ready_to_sync, } => { - // Call `on_download_success` to assert that the req_id is the expected on - next_header_request.request.on_download_success( - req_id, - PeerId::random(), - BeaconBlockHeader::empty(), - Duration::from_secs(0), - )?; - Ok(()) + if parent_root == imported_block_root && !*ready_to_sync { + *ready_to_sync = true; + true + } else { + false + } } - _ => Err(Error::InternalError( - "Expected lookup to be in DownloadingHeader state".to_owned(), - )), + _ => false, } } - fn on_download_result( - &mut self, - req_id: ComponentsByRootRequestId, - result: Result<(RpcBlock, BatchPeers), RpcResponseError>, - cx: &mut SyncNetworkContext, - ) -> Result<(), Error> { - let block = self.block_request(req_id.requester)?; - block.on_download_result(req_id, result, cx)?; - block.continue_request(cx, OkToImport::IfParentImported)?; - Ok(()) + fn block_count(&self) -> usize { + self.block_roots.len() } - /// Handle the result of a block processing. - fn on_process_result( - &mut self, - id: HeaderLookupId, - result: BatchProcessResult, - cx: &mut SyncNetworkContext, - ) -> Result { - let block = self.block_request(RangeRequestId::ForwardSync(id))?; - match block.on_process_result(result, cx)? { - SyncBlockResult::Done { parent_root, slot } => { - // Single block, drop the chain - Ok(SyncBlockResult::Done { parent_root, slot }) - } - SyncBlockResult::Wait => { - // Not complete yet, continue requests - block.continue_request(cx, OkToImport::IfParentImported)?; - Ok(SyncBlockResult::Wait) - } - } + fn min_slot(&self) -> Option { + self.block_roots.back().map(|b| b.1) } - fn block_request(&mut self, id: RangeRequestId) -> Result<&mut SyncBlock, Error> { - match &mut self.status { - Status::ForwardSync { block, .. 
} => { - if block.id() == id { - Ok(block) - } else { - Err(Error::InternalError(format!( - "Unknown block for {id} current ID {}", - block.id(), - ))) - } - } - _ => Err(Error::InternalError( - "Expected lookup to be in Syncing state".to_owned(), - )), - } + fn max_slot(&self) -> Option { + self.block_roots.front().map(|b| b.1) } - /// Continues the header or blocks requests of this chain - fn continue_requests(&mut self, cx: &mut SyncNetworkContext) -> Result<(), Error> { - match &mut self.status { - Status::BackfillHeaders { - next_header_request, - .. - } => Ok(next_header_request.continue_request(&self.peers, cx)?), - Status::WaitingParentChain { .. } => Ok(()), - Status::ForwardSync { block, .. } => { - block.continue_request(cx, OkToImport::IfParentImported)?; - Ok(()) - } - } + fn peer_count(&self) -> usize { + self.peers.len() } } +// TODO(tree-sync): Re-add the reprocessing cache, so we don't process twice a block that we got +// through gossip and sync. + #[derive(Debug, IntoStaticStr)] pub enum Error { /// Unexpected and unrecoverable error @@ -672,22 +433,26 @@ impl ForwardSync { pub fn new() -> Self { Self { block_to_tip: <_>::default(), - chains: <_>::default(), + header_chains: <_>::default(), + syncing_blocks: <_>::default(), } } /// Returns the peers that claim to have imported a specific block_root #[cfg(test)] pub fn block_peers(&self, block_root: &Hash256) -> Result>, String> { - let Some(chain) = self.block_to_tip.get(block_root) else { + let Some(block_ptr) = self.block_to_tip.get(block_root) else { return Ok(None); }; - Ok(Some( - self.chains - .get(chain) - .ok_or(format!("Unknown chain {chain:?}"))? - .get_peers(), - )) + match block_ptr { + BlockPointer::HeaderChain(id) => Err(format!("Block {id} is a header chain")), + BlockPointer::SyncBlock(id) => Ok(Some( + self.syncing_blocks + .get(id) + .ok_or(format!("Unknown chain {id}"))? 
+ .get_peers(), + )), + } } /// Get all blocks that forward sync intends to sync @@ -705,7 +470,7 @@ impl ForwardSync { pub fn max_slot_to_sync(&self) -> Option { // TODO(tree-sync): weak metric, who have a better heuristic for sync? Now that lookups // count here - self.chains + self.header_chains .values() .filter_map(|chain| chain.max_slot()) .max() @@ -713,18 +478,12 @@ impl ForwardSync { /// Return all processing ids of syncing blocks #[cfg(test)] - pub fn get_processing_ids(&mut self) -> Vec { + pub fn get_processing_ids(&mut self) -> Vec { let mut ids = vec![]; - for chain in self.chains.values() { - match &chain.status { - Status::BackfillHeaders { .. } => {} - Status::WaitingParentChain { .. } => {} - Status::ForwardSync { block, .. } => { - if block.is_processing() { - if let RangeRequestId::ForwardSync(id) = block.id() { - ids.push(id); - } - } + for block in self.syncing_blocks.values() { + if block.is_processing() { + if let RangeRequestId::ForwardSync(id) = block.id() { + ids.push(id); } } } @@ -738,20 +497,24 @@ impl ForwardSync { /// Remove a disconnected peer from all chains pub fn remove_peer(&mut self, peer: PeerId) { let chains_to_remove = self - .chains + .header_chains .iter_mut() .filter_map(|(chain_id, chain)| { chain.remove_peer(&peer); // TODO(tree-sync): research if it actually useful to keep chains with zero peers for // some time. 
if chain.peer_count() == 0 { - Some(*chain_id) + Some((*chain_id).into()) } else { None } }) .collect::>(); + for (id, block) in self.syncing_blocks.iter_mut() { + block.remove_peer(&peer); + } + if !chains_to_remove.is_empty() { let chain_to_children = self.compute_children(); for chain_id in chains_to_remove { @@ -765,83 +528,20 @@ impl ForwardSync { pub fn search( &mut self, block_root: Hash256, - peers: &[PeerId], + peers: &[(PeerId, PeerStatusSummary)], cx: &mut SyncNetworkContext, ) -> Result<(), Error> { if let Some(_) = self.block_to_tip.get(&block_root) { - let mut peers = HashSet::<&PeerId>::from_iter(peers); - let mut counts = HashMap::<&PeerId, usize>::new(); - - // Add peer to `block`'s entry and all its ancestors - let mut target_block_root = block_root; - while let Some(chain_id) = self.block_to_tip.get_mut(&target_block_root) { - let chain = self - .chains - .get_mut(chain_id) - .ok_or(InternalError(format!("Unknown chain {chain_id}")))?; - - let should_split_chain = match chain.tip() { - // If target_block_root is not the tip of chain, we have to split the chain - Some(tip) => tip != target_block_root, - // If the chain has no tip (should not happen) don't split the chain - None => false, - }; - let chain_to_add_peers = if should_split_chain { - let new_chain = chain.split_by(target_block_root)?; - let new_chain_id = TipId(cx.next_id()); - debug!( - block_root = ?target_block_root, - %chain_id, - %new_chain_id, - chain_block_count = chain.block_count(), - new_chain_block_count = new_chain.block_count(), - "Forward sync chain split" - ); - - // Update all block references to the new chain - for block_root in new_chain.iter_block_roots() { - self.block_to_tip.insert(*block_root, new_chain_id); - } - - self.chains.entry(new_chain_id).or_insert(new_chain) - } else { - chain - }; - - peers.retain(|peer| { - if chain_to_add_peers.add_peer(**peer) { - *counts.entry(peer).or_default() += 1; - // We added peer to the lookup, retain it for the next ancestor 
chain - true - } else { - // Peer already part of this lookup, therefore it must be part of the peer - // set of all of its ancestors: stop - false - } - }); - // No peers need to be added to ancestors, stop - if peers.is_empty() { - break; - } - - if let Some(parent_root) = chain_to_add_peers.parent_root() { - target_block_root = parent_root; - } else { - break; - } - } - // Log once per peer, as we could add it to a very large number of lookups - for (peer, count) in counts { - debug!(block_root = ?target_block_root, %peer, count, "Adding peer to existing header lookup and ancestors"); - } + debug!(block_root = ?block_root, ?peers, "Adding peer to existing header lookup and ancestors"); + self.add_peers_recursively(block_root, peers)?; } else { if self.block_to_tip.len() > MAX_LOOKUP_COUNT { self.prune_least_popular_lookups(); } - let chain_id = TipId(cx.next_id()); + let chain_id = HeaderChainId(cx.next_id()); match peers { - [peer] => debug!(?block_root, %chain_id, %peer, "Creating new header lookup"), + [peer] => debug!(?block_root, %chain_id, ?peer, "Creating new header lookup"), _ => debug!( ?block_root, %chain_id, @@ -850,11 +550,12 @@ impl ForwardSync { ), } - let mut chain = Chain::new(block_root, peers); + let mut chain = HeaderChain::new(block_root, chain_id, peers); chain.continue_requests(cx)?; // Don't insert until first request is successful - self.chains.insert(chain_id, chain); - self.block_to_tip.insert(block_root, chain_id); + self.header_chains.insert(chain_id, chain); + self.block_to_tip + .insert(block_root, BlockPointer::HeaderChain(chain_id)); metrics::inc_counter(&metrics::SYNC_CHAINS_ADDED); } Ok(()) @@ -871,15 +572,11 @@ impl ForwardSync { ) { // Invoke a closure to use the ? 
operator and handle the result consistenlty let result: Result<(), Error> = (|| { - let Some(chain_id) = self.block_to_tip.get(&id.block_root).copied() else { - // TODO(tree-sync): register metric - debug!(id = ?req_id, "Received header request for unknown block_root"); - return Ok(()); - }; - let chain = self.chains.get_mut(&chain_id).ok_or(InternalError(format!( - "block_root {:?} references unknown chain {chain_id}", - id.block_root - )))?; + let chain_id = id.chain_id; + let chain = self + .header_chains + .get_mut(&chain_id) + .ok_or(InternalError(format!("Request for unknown chain {id}")))?; let response = response.and_then(|(blocks, timestamp)| { if blocks.is_empty() { @@ -891,15 +588,33 @@ impl ForwardSync { } }); + let header_request = match &mut chain.status { + HeaderChainStatus::Backfill { next_request, .. } => next_request, + HeaderChainStatus::WaitingParent { .. } => { + debug!(%req_id, %chain_id, "Unexpected request for header chain waiting parent"); + return Ok(()); + } + }; + // TODO(tree-sync): add some check to make sure that distinct lookups for the same // block root don't mess with each other. That check must happen before triggering // errors for bad state match response { - Ok((headers, _received)) => { - chain.assert_download_req_id(req_id)?; + Ok((headers, received)) => { + header_request.request.on_download_success( + req_id, + PeerId::random(), + BeaconBlockHeader::empty(), + received, + )?; debug!(%req_id, %chain_id, "Forward sync block header downloaded success"); + // TODO(tree-sync): should check if the block is descendant of finalized + // TODO(tree-sync): on finalization or every interval we should drop branches that + // conflict with finality + let finalized_checkpoint = cx.chain.head().finalized_checkpoint(); + for header in headers { let parent_root = header.parent_root; let block_root = header.canonical_root(); @@ -912,11 +627,6 @@ impl ForwardSync { // 2. We conflicts with finality -> reject // 3. 
The parent root is unknown -> continue search - // TODO(tree-sync): should check if the block is descendant of finalized - // TODO(tree-sync): on finalization or every interval we should drop branches that - // conflict with finality - let finalized_checkpoint = cx.chain.head().finalized_checkpoint(); - // TODO(tree-sync): check that the slots are decreasing, so we don't end up in // an infinite loop. But note that the wrong block will be the descendant. // - We get header A with parent B and slot 10 @@ -943,19 +653,58 @@ impl ForwardSync { // Trigger potential foward sync for this chain self.continue_requests(cx); break; - } else if let Some(parent_chain_id) = self.block_to_tip.get(&parent_root) { + } else if let Some(parent_chain_ptr) = + self.block_to_tip.get(&parent_root).copied() + { // Parent is part of another chain, stop search // Stop search we reached a known block - chain.to_waiting_parent(parent_root, false)?; - debug!(%chain_id, %parent_chain_id, ?parent_root, "Forward sync chain reached known block"); - // TODO(tree-sync): Add peers recursively to the chain_id, potentially - // splitting the chain when adding peers. + debug!(%chain_id, ?parent_chain_ptr, ?parent_root, "Forward sync chain reached known block"); + + // If this is the only child of `parent_root` we can insert the block + // in the parent chain, and "merge" them. This is the common case in + // single fork chains. The main chain keeps producing new blocks while + // we backfill headers. 
+ if match self.compute_children().get(&parent_root) { + Some(children) => children.is_empty(), + None => false, + } { + if let BlockPointer::HeaderChain(parent_chain_id) = parent_chain_ptr + { + // Add new tip to `parent_chain` + let chain = self.header_chains.remove(&chain_id).ok_or( + InternalError(format!("missing chain {chain_id}")), + )?; + + let parent_chain = self + .header_chains + .get_mut(&parent_chain_id) + .ok_or(InternalError(format!( + "missing chain {parent_chain_id}" + )))?; + + for (block_root, _) in &chain.block_roots { + self.block_to_tip.insert(*block_root, parent_chain_ptr); + } + parent_chain.extend_with_children(chain); + } + } else { + let chain = self + .header_chains + .get_mut(&chain_id) + .ok_or(InternalError(format!("missing chain {chain_id}")))?; + + // `parent_root` has multiple children, keep `chain` as a fork and + // mark it awaiting parent + chain.to_waiting_parent(parent_root, false)?; + } + + // The rest of headers of this response are known, ignore break; } else { debug!(%chain_id, ?parent_root, "Forward sync chain continues fetching ancestor"); // Add to the block_to_tip mapping to respect the invariant "Each block // root exists in exactly one `Chain::block_roots` list". - self.block_to_tip.insert(parent_root, chain_id); + self.block_to_tip.insert(parent_root, chain_id.into()); // Since the block already points to `chain` we don't need to add peers. // Just trigger header download for this new root. 
} @@ -963,7 +712,7 @@ impl ForwardSync { } Err(e) => { // Request errors are logged in `SyncNetworkContext::on_rpc_response_result` - chain.header_request()?.on_download_error(req_id, Some(e))?; + header_request.request.on_download_error(req_id, Some(e))?; // Continue this request to potentially resend the header request } } @@ -972,7 +721,7 @@ impl ForwardSync { })(); if let Err(e) = result { - self.handle_error(id.block_root, e); + self.handle_error(id.chain_id.into(), e); } } @@ -980,91 +729,99 @@ impl ForwardSync { pub fn on_block_download_result( &mut self, req_id: ComponentsByRootRequestId, - id: HeaderLookupId, + id: ForwardSyncLookupId, result: Result<(RpcBlock, BatchPeers), RpcResponseError>, cx: &mut SyncNetworkContext, ) { - let Some(chain_id) = self.block_to_tip.get(&id.block_root) else { - debug!(?id, "Received block process result for unknown lookup"); - return; - }; - let Some(chain) = self.chains.get_mut(chain_id) else { - error!(%chain_id, block_root = ?id.block_root, "Block references unknown chain"); + let block_root = id.block_root; + let Some(block) = self.syncing_blocks.get_mut(&block_root) else { + error!(?block_root, "Unknown forward sync block"); return; }; - debug!(%id, %chain_id, result = render_result(&result), "Forward sync block download result"); + let result: Result<(), Error> = (|| { + // let block = self.block_request(req_id.requester)?; + debug!(%id, ?block_root, result = render_result(&result), "Forward sync block download result"); + block.on_download_result(req_id, result, cx)?; + block.continue_request(cx, OkToImport::IfParentImported)?; + Ok(()) + })(); - if let Err(e) = chain.on_download_result(req_id, result, cx) { - self.handle_error(id.block_root, e); + if let Err(e) = result { + self.handle_error(block_root.into(), e); // Some syncing blocks may have been dropped so there's space for new chains to sync self.continue_requests(cx); } } /// Handle the result of a block processing. 
+ /// We known this block's parent is imported, so we don't explicitly handle a ParentUnknown error. pub fn on_block_process_result( &mut self, - id: HeaderLookupId, + id: ForwardSyncLookupId, result: BatchProcessResult, cx: &mut SyncNetworkContext, ) { - let Some(chain_id) = self.block_to_tip.get(&id.block_root).copied() else { - debug!(?id, "Received block process result for unknown lookup"); - return; - }; - let Some(chain) = self.chains.get_mut(&chain_id) else { - error!(%chain_id, block_root = ?id.block_root, "Block references unknown chain"); - return; - }; + let result: Result<(), Error> = (|| { + let block_root = id.block_root; + let Some(block) = self.syncing_blocks.get_mut(&block_root) else { + error!(?block_root, "Unknown forward sync block"); + return Ok(()); + }; - debug!(%id, %chain_id, ?result, "Forward sync block download result"); - - match chain.on_process_result(id, result, cx) { - Ok(SyncBlockResult::Done { .. }) => { - metrics::inc_counter(&metrics::SYNC_BLOCKS_PROCESSED); - self.block_to_tip.remove(&id.block_root); - // ForwardSync chains have a single block, remove them on Done - self.chains.remove(&chain_id); - debug!(%id, %chain_id, "Removed completed chain"); - metrics::inc_counter_vec(&metrics::SYNC_CHAINS_REMOVED, &["completed"]); - - // Find all chains that are awaiting this block to process and continue them - for other_chain in self.chains.values_mut() { - if other_chain.to_ready_to_sync(&id.block_root) { - debug!( - %chain_id, - parent_root = ?id.block_root, - "Forward sync marked chain as ready to sync" - ); + debug!(%id, ?block_root, ?result, "Forward sync block process result"); + + // TODO(tree-sync): use id to ensure results for other roots don't mix up + match block.on_process_result(result, cx)? { + SyncBlockResult::Done { .. 
} => { + metrics::inc_counter(&metrics::SYNC_BLOCKS_PROCESSED); + self.block_to_tip.remove(&block_root); + // ForwardSync chains have a single block, remove them on Done + self.syncing_blocks.remove(&block_root); + debug!(%id, ?block_root, "Removed completed forward sync block"); + metrics::inc_counter_vec(&metrics::SYNC_CHAINS_REMOVED, &["completed"]); + + // Find all chains that are awaiting this block to process and continue them + for (chain_id, other_chain) in self.header_chains.iter_mut() { + if other_chain.on_parent_imported(&id.block_root) { + debug!( + %chain_id, + parent_root = ?id.block_root, + "Forward sync marked chain as ready to sync" + ); + } } + self.continue_requests(cx); + } + // Not complete yet, continue requests + SyncBlockResult::Wait => { + block.continue_request(cx, OkToImport::IfParentImported)?; } - self.continue_requests(cx); - } - // Wait for next event - Ok(SyncBlockResult::Wait) => {} - Err(e) => { - self.handle_error(id.block_root, e); - // Some syncing blocks may have been dropped so there's space for new chains to sync - self.continue_requests(cx); } + Ok(()) + })(); + + if let Err(e) = result { + self.handle_error(id.block_root.into(), e); + // Some syncing blocks may have been dropped so there's space for new chains to sync + self.continue_requests(cx); } } + pub fn prune(&mut self) { + // TODO(tree-sync): should prune? Based on finality and expired head chains + } + /// Common handler for any `forward_sync::Error`. For simplicity it drops the chain that includes /// the block and all of its descendants. 
- fn handle_error(&mut self, block_root: Hash256, error: Error) { - debug!(?error, ?block_root, "Dropping forward sync block lookup"); - let Some(chain_id) = self.block_to_tip.get(&block_root).copied() else { - debug!(?block_root, "Handling error for unknown block_root"); - return; - }; + fn handle_error(&mut self, chain_id: BlockPointer, error: Error) { + debug!(?error, ?chain_id, "Dropping forward sync block lookup"); metrics::inc_counter_vec(&metrics::SYNC_CHAIN_ERROR_COUNT, &[(&error).into()]); let block_to_children = self.compute_children(); // TODO(tree-sync): logging `block_to_children` for debugging - debug!(%chain_id, ?block_root, ?error, ?block_to_children, "Dropping forward sync chain on error"); + debug!(%chain_id, ?chain_id, ?error, ?block_to_children, "Dropping forward sync chain on error"); self.drop_chain_and_children(chain_id, &block_to_children, (&error).into()); match error { @@ -1093,14 +850,7 @@ impl ForwardSync { // TODO(tree-sync): don't build on demand, cache roots somewhere - let new_blocks_to_sync = BLOCK_BUFFER_SIZE.saturating_sub( - self.chains - .values() - .map(|chain| chain.syncing_blocks_count()) - .sum::(), - ); - - if new_blocks_to_sync == 0 { + if self.syncing_blocks.len() > BLOCK_BUFFER_SIZE { return; } @@ -1108,15 +858,14 @@ impl ForwardSync { // - Active backfill // - Oldest ancestor known - // Have up to 2 blocks syncing // Find the block range with most peers and highest slot. This is the block // to be used as tip of the chain of blocks to fetch. let mut chains_by_peer_count = self - .chains + .header_chains .iter_mut() .filter_map(|(_, chain)| { - if matches!(chain.status, Status::WaitingParentChain { .. 
}) { - Some((chain.peer_count(), chain)) + if chain.parent_root().is_some() { + Some((chain.peers.len(), chain)) } else { None } @@ -1124,36 +873,45 @@ impl ForwardSync { .collect::>(); chains_by_peer_count.sort_by_key(|(peer_count, _)| *peer_count); - let mut new_chains = vec![]; + let mut blocks_to_add = vec![]; 'o: for (chain_id, chain) in chains_by_peer_count { - while let Some(new_chain) = chain.pop_next_block_to_sync(cx) { - let new_chain_id = TipId(cx.next_id()); - // Update all block references to the new chain - for block_root in new_chain.iter_block_roots() { - self.block_to_tip.insert(*block_root, new_chain_id); - debug!( - %chain_id, - %new_chain_id, - ?block_root, - chain_block_count = chain.block_count(), - "Transitioned block to forward sync" - ); - } - new_chains.push((new_chain_id, new_chain)); - if new_chains.len() >= new_blocks_to_sync { + while let Some((block_root, block_slot)) = chain.pop_oldest_ancestor() { + let block_peers = chain.peers_of_block_slot(block_slot); + blocks_to_add.push((block_root, block_slot, block_peers)); + debug!(%chain_id, ?block_root, %block_slot, "Transitioned block to forward sync"); + if blocks_to_add.len() + self.syncing_blocks.len() > BLOCK_BUFFER_SIZE { break 'o; } } } + let should_continue_requests = !blocks_to_add.is_empty(); + for (block_root, block_slot, block_peers) in blocks_to_add { + // Need to compute the peer of the block here since header chains only track peers + // that have imported the oldest ancestor. 
+ + let block = SyncBlock::new( + // Reuse the request ID of the header for better traceability + RangeRequestId::ForwardSync(ForwardSyncLookupId { + id: cx.next_id(), + block_root, + }), + block_root, + block_slot, + &block_peers, + ); + // Update all block references to the new chain + self.block_to_tip + .insert(block_root, BlockPointer::SyncBlock(block_root)); + self.syncing_blocks.insert(block_root, block); + } + // Prune chains that become empty after pop_next_block_to_sync - self.chains.retain(|_, chain| !chain.is_empty()); + self.header_chains + .retain(|_, chain| !chain.block_roots.is_empty()); - if !new_chains.is_empty() { - for (chain_id, chain) in new_chains { - self.chains.insert(chain_id, chain); - } + if should_continue_requests { self.continue_requests(cx); } } @@ -1162,18 +920,21 @@ impl ForwardSync { // TODO(tree-sync): optimize this call to maybe not do it everytime self.trigger_forward_sync(cx); - let chains_to_drop = self - .chains - .iter_mut() - .filter_map(|(chain_id, chain)| { - if let Err(e) = chain.continue_requests(cx) { - // TODO(tree-sync): should log error? - Some((*chain_id, e)) - } else { - None - } - }) - .collect::>(); + let mut chains_to_drop = vec![]; + + for (chain_id, block) in self.syncing_blocks.iter_mut() { + if let Err(e) = block.continue_request(cx, OkToImport::IfParentImported) { + // TODO(tree-sync): should log error? + chains_to_drop.push(((*chain_id).into(), e.into())); + } + } + + for (chain_id, chain) in self.header_chains.iter_mut() { + if let Err(e) = chain.continue_requests(cx) { + // TODO(tree-sync): should log error? 
+ chains_to_drop.push(((*chain_id).into(), e)); + } + } if !chains_to_drop.is_empty() { let chain_to_children = self.compute_children(); @@ -1183,45 +944,107 @@ impl ForwardSync { } } + fn add_peers_recursively( + &mut self, + block_root: Hash256, + peers: &[(PeerId, PeerStatusSummary)], + ) -> Result<(), Error> { + let Some(id) = self.block_to_tip.get(&block_root) else { + return Ok(()); + }; + match id { + BlockPointer::HeaderChain(chain_id) => { + // The peer claims to have imported some block in this header chain. Header + // chain requests always the oldest ancestor. So we can guarantee that this peer + // has imported the oldest ancestor of the chain. + let chain = self + .header_chains + .get_mut(chain_id) + .ok_or(InternalError(format!("Unknown chain {chain_id}")))?; + for (peer, status) in peers { + chain.add_peer(*peer, *status); + } + if let Some(parent_root) = chain.parent_root() { + self.add_peers_recursively(parent_root, peers)?; + } + Ok(()) + } + BlockPointer::SyncBlock(id) => { + let block = self + .syncing_blocks + .get_mut(id) + .ok_or(InternalError(format!("Unknown syncing block {id:?}")))?; + for (peer, _) in peers { + block.add_peer(*peer); + } + if let Some(parent_root) = block.parent_root() { + self.add_peers_recursively(parent_root, peers)?; + } + Ok(()) + } + } + } + /// Drop chain if it exists and all its children fn drop_chain_and_children( &mut self, - initial_chain_id: TipId, - chain_to_children: &HashMap>, + initial_chain_id: BlockPointer, + chain_to_children: &HashMap>, reason: &'static str, ) { - let mut queue: VecDeque = VecDeque::from([initial_chain_id]); + let mut queue: VecDeque = VecDeque::from([initial_chain_id]); - while let Some(chain_id) = queue.pop_front() { + while let Some(block_ptr) = queue.pop_front() { // Remove the node itself. // Only continue if the node was removed. 
This prevents infinite loops even if // `chain_to_children` items reference themselves - if let Some(chain) = self.chains.remove(&chain_id) { - debug!(%chain_id, %initial_chain_id, reason, "Dropping forward sync chain"); - metrics::inc_counter_vec(&metrics::SYNC_CHAINS_REMOVED, &[reason]); - for block_root in chain.iter_block_roots() { - self.block_to_tip.remove(block_root); - debug!(?block_root, %chain_id, %initial_chain_id, reason, "Dropping forward sync block"); - metrics::inc_counter(&metrics::SYNC_FORWARD_BLOCKS_DROPPED); - // Only remove children if the node still existed - // Push its children‚Äîif any‚Äîonto the work list. - if let Some(children) = chain_to_children.get(block_root) { - queue.extend(children.iter().cloned()); + match block_ptr { + BlockPointer::HeaderChain(chain_id) => { + if let Some(chain) = self.header_chains.remove(&chain_id) { + debug!(%chain_id, %initial_chain_id, reason, "Dropping forward sync chain"); + metrics::inc_counter_vec(&metrics::SYNC_CHAINS_REMOVED, &[reason]); + + for (block_root, _) in chain.block_roots { + self.block_to_tip.remove(&block_root); + debug!(?block_root, %chain_id, %initial_chain_id, reason, "Dropping forward sync block"); + metrics::inc_counter(&metrics::SYNC_FORWARD_BLOCKS_DROPPED); + + // Only remove children if the node still existed + // Push its children‚ if any‚ onto the work list. 
+ if let Some(children) = chain_to_children.get(&block_root) { + queue.extend(children.iter().cloned()); + } + } + } + } + BlockPointer::SyncBlock(id) => { + if let Some(block) = self.syncing_blocks.remove(&id) { + if let Some(children) = chain_to_children.get(&id) { + queue.extend(children.iter().cloned()); + } } } } } } - /// Drop lookup `block_root` if it exists and all its children - fn compute_children(&mut self) -> HashMap> { - let mut parent_to_children = HashMap::>::new(); - for (chain_id, chain) in self.chains.iter() { + /// Compute the map of block_roots -> chain IDs + fn compute_children(&self) -> HashMap> { + let mut parent_to_children = HashMap::>::new(); + for (chain_id, chain) in self.header_chains.iter() { if let Some(parent_root) = chain.parent_root() { parent_to_children .entry(parent_root) .or_default() - .push(*chain_id); + .push(BlockPointer::HeaderChain(*chain_id)); + } + } + for (chain_id, chain) in self.syncing_blocks.iter() { + if let Some(parent_root) = chain.parent_root() { + parent_to_children + .entry(parent_root) + .or_default() + .push(BlockPointer::SyncBlock(*chain_id)); } } parent_to_children @@ -1230,7 +1053,7 @@ impl ForwardSync { /// Drop lookups with least amount of peers and slot until we pruned PRUNE_COUNT lookups fn prune_least_popular_lookups(&mut self) { let mut chains = self - .chains + .header_chains .iter() // TODO: Prune only lookups that are not syncing and we know the header .map(|(chain_id, chain)| (chain.peer_count(), *chain_id)) @@ -1239,108 +1062,32 @@ impl ForwardSync { let chain_to_children = self.compute_children(); for (_, chain_id) in chains { - self.drop_chain_and_children(chain_id, &chain_to_children, "too_many_blocks"); + self.drop_chain_and_children(chain_id.into(), &chain_to_children, "too_many_blocks"); if self.block_to_tip.len() < MAX_LOOKUP_COUNT - PRUNE_COUNT { break; } } } - fn merge_chains(&mut self) -> Result<(), InternalError> { - // To prevent O(n^2) ops, first compute a hashmap of tips to 
chains. Each block belongs - // exactly to one chain so there must be a single tip -> chain relationship - let tip_to_chain = HashMap::)>::from_iter( - self.chains.iter().filter_map(|(chain_id, chain)| { - // TODO(tree-sync): exclude ForwardSync - if let Some(tip) = chain.tip() { - Some((tip, (chain_id, chain))) - } else { - None - } - }), - ); - - // Now collect all chains waiting for a parent to sort them by peer_count and block count - let mut chains = self - .chains - .iter() - .filter_map(|(chain_id, chain)| { - if let Status::WaitingParentChain { parent_root, .. } = chain.status { - Some((chain_id, chain, parent_root)) - } else { - None - } - }) - .collect::>(); - chains.sort_unstable_by_key(|(_, chain, _)| (chain.peer_count(), chain.block_count())); - - // Iterate from highest peer count and highest block count first - let chains_to_merge = - chains - .into_iter() - .rev() - .find_map(|(chain_id, chain, parent_root)| { - // The parent root of chain is exactly the tip of parent_chain - if let Some((parent_chain_id, parent_chain)) = tip_to_chain.get(&parent_root) { - if chain.peers == parent_chain.peers { - // The peer set is the same, schedule to merge them - return Some((**parent_chain_id, *chain_id)); - } - } - None - }); - - // Execute the merge operation. Do a single merge operation per loop as we remove a - // chain from the chains map. Is possible that chains are childs of each other so to - // safely merge them we would need to iterate them in topological order. For - // simplicity we just do one merge at a time. 
- if let Some((parent_chain_id, chain_id)) = chains_to_merge { - debug!(%parent_chain_id, %chain_id, "Merging forward sync chains"); - metrics::inc_counter(&metrics::SYNC_CHAIN_MERGES_COUNT); - - let Some(chain) = self.chains.remove(&chain_id) else { - return Err(InternalError(format!("chain {chain_id} does not exist"))); - }; - let Some(parent_chain) = self.chains.get_mut(&parent_chain_id) else { - return Err(InternalError(format!( - "parent_chain {parent_chain_id} does not exist" - ))); - }; - // Update all block references to the new chain - for block_root in chain.iter_block_roots() { - self.block_to_tip.insert(*block_root, parent_chain_id); - } - parent_chain.merge(chain)?; - } - - Ok(()) - } - - pub fn prune(&mut self) { - if let Err(e) = self.merge_chains() { - error!(error = ?e, "Error merging forward sync chains"); - } - } - pub fn register_metrics(&self) { - let (min_slot, max_slot) = - self.chains - .values() - .fold((None::, None::), |(gmin, gmax), chain| { - let gmin = match (gmin, chain.min_slot()) { - (Some(a), Some(b)) => Some(a.min(b)), - (None, some @ Some(_)) => some, // first non-None wins - (x, None) => x, - }; + let (min_slot, max_slot) = self.header_chains.values().fold( + (None::, None::), + |(gmin, gmax), chain| { + let gmin = match (gmin, chain.min_slot()) { + (Some(a), Some(b)) => Some(a.min(b)), + (None, some @ Some(_)) => some, // first non-None wins + (x, None) => x, + }; - let gmax = match (gmax, chain.max_slot()) { - (Some(a), Some(b)) => Some(a.max(b)), - (None, some @ Some(_)) => some, - (x, None) => x, - }; + let gmax = match (gmax, chain.max_slot()) { + (Some(a), Some(b)) => Some(a.max(b)), + (None, some @ Some(_)) => some, + (x, None) => x, + }; - (gmin, gmax) - }); + (gmin, gmax) + }, + ); if let (Some(min_slot), Some(max_slot)) = (min_slot, max_slot) { metrics::set_gauge(&metrics::SYNC_HEADER_MIN_SLOT, min_slot.as_u64() as i64); @@ -1348,50 +1095,49 @@ impl ForwardSync { } metrics::set_gauge(&metrics::SYNC_HEADERS_COUNT, 
self.block_to_tip.len() as i64); - metrics::set_gauge(&metrics::SYNC_CHAINS_COUNT, self.chains.len() as i64); + metrics::set_gauge( + &metrics::SYNC_HEADER_CHAINS_COUNT, + self.header_chains.len() as i64, + ); + metrics::set_gauge( + &metrics::SYNC_FORWARD_SYNC_BLOCKS_COUNT, + self.syncing_blocks.len() as i64, + ); - for (chain_id, chain) in &self.chains { + for (chain_id, chain) in &self.header_chains { let status = match &chain.status { - Status::BackfillHeaders { - block_roots, - next_header_request, - } => { + HeaderChainStatus::Backfill { next_request, .. } => { format!( - "BackfillHeaders block_roots {block_roots:?} next_header_request {:?} {} {}", - next_header_request.id, - next_header_request.block_root, - next_header_request.request.status_str() + "BackfillHeaders block_roots {:?} next_header_request {:?} {} {}", + chain.block_roots, + next_request.id, + next_request.block_root, + next_request.request.status_str() ) } - Status::WaitingParentChain { + HeaderChainStatus::WaitingParent { parent_root, - block_roots, ready_to_sync, } => { - format!("WaitingParentChain ready_to_sync {ready_to_sync} parent_root {parent_root:?} block_roots {block_roots:?}") - } - Status::ForwardSync { block, .. } => { - format!("ForwardSync sync_block {:?}", block.block_root()) + format!("WaitingParentChain ready_to_sync {ready_to_sync} parent_root {parent_root:?} block_roots {:?}",chain.block_roots) } }; let recursive_parent_chain = (|| { let mut next_chain_id = *chain_id; loop { - let Some(next_chain) = self.chains.get(&next_chain_id) else { + let Some(next_chain) = self.header_chains.get(&next_chain_id) else { return Err(format!("Unknown chain {next_chain_id}")); }; - if let Status::WaitingParentChain { parent_root, .. } - | Status::ForwardSync { parent_root, .. } = next_chain.status + if let HeaderChainStatus::WaitingParent { parent_root, .. 
} = next_chain.status { - let Some(parent_chain_id) = self.block_to_tip.get(&parent_root) else { - if matches!(next_chain.status, Status::ForwardSync { .. }) { - // A ForwardSync chain may point to an already imported block + let Some(parent_ptr_id) = self.block_to_tip.get(&parent_root) else { + return Err(format!("{next_chain_id} Unknown block {parent_root:?}")); + }; + let parent_chain_id = match parent_ptr_id { + BlockPointer::HeaderChain(id) => id, + BlockPointer::SyncBlock(id) => { return Err(format!("{next_chain_id} unknown/imported")); - } else { - return Err(format!( - "{next_chain_id} Unknown block {parent_root:?}" - )); } }; next_chain_id = *parent_chain_id; @@ -1403,11 +1149,14 @@ impl ForwardSync { } })(); - debug!(%chain_id, peers = chain.peers.len(), status, ?recursive_parent_chain, "DEBUG chain"); + debug!(%chain_id, status, ?recursive_parent_chain, "DEBUG chain"); } for (block_root, chain_id) in &self.block_to_tip { - if !self.chains.contains_key(chain_id) { + if !match chain_id { + BlockPointer::HeaderChain(id) => self.header_chains.contains_key(id), + BlockPointer::SyncBlock(id) => self.syncing_blocks.contains_key(id), + } { debug!("DEBUG block {block_root} points to unknown chain {chain_id}"); } } @@ -1418,9 +1167,24 @@ impl ForwardSync { } } -impl std::fmt::Display for TipId { +impl std::fmt::Display for BlockPointer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.0) + match self { + Self::HeaderChain(id) => write!(f, "Header/{id}"), + Self::SyncBlock(id) => write!(f, "Block/{id:?}"), + } + } +} + +impl From for BlockPointer { + fn from(id: HeaderChainId) -> Self { + Self::HeaderChain(id) + } +} + +impl From for BlockPointer { + fn from(id: Hash256) -> Self { + Self::SyncBlock(id) } } @@ -1532,7 +1296,7 @@ mod tests { }; let mut sync = ForwardSync { block_to_tip: <_>::default(), - chains: HashMap::from_iter([(TipId(0), left_chain), (TipId(1), right_chain)]), + chains: 
HashMap::from_iter([(HeaderChainId(0), left_chain), (TipId(1), right_chain)]), }; sync.merge_chains(); assert_eq!(sync.chains.len(), 1, "Should merge 2 chains into 1");