diff --git a/Cargo.lock b/Cargo.lock index 70c910aadc9..55b407cc8c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6373,6 +6373,7 @@ dependencies = [ "ethereum_ssz", "execution_layer", "fnv", + "fork_choice", "futures", "genesis", "hex", diff --git a/beacon_node/beacon_chain/src/block_verification_types.rs b/beacon_node/beacon_chain/src/block_verification_types.rs index d29576fa899..03452fbf6ec 100644 --- a/beacon_node/beacon_chain/src/block_verification_types.rs +++ b/beacon_node/beacon_chain/src/block_verification_types.rs @@ -188,7 +188,7 @@ impl RpcBlock { block: Arc>, custody_columns: Vec>, spec: &ChainSpec, - ) -> Result { + ) -> Result { let block_root = block_root.unwrap_or_else(|| get_block_root(&block)); let inner = RpcBlockInner::BlockAndCustodyColumns { @@ -197,11 +197,7 @@ impl RpcBlock { custody_columns, spec.number_of_columns as usize, ) - .map_err(|e| { - AvailabilityCheckError::Unexpected(format!( - "custody_columns len exceeds number_of_columns: {e:?}" - )) - })?, + .map_err(|e| format!("custody_columns len exceeds number_of_columns: {e:?}"))?, }; Ok(Self { block_root, diff --git a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs index 36c4f2cdc1e..67f2f155a74 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs @@ -1183,8 +1183,13 @@ mod pending_components_tests { pub fn pre_setup() -> Setup { let mut rng = StdRng::seed_from_u64(0xDEADBEEF0BAD5EEDu64); let spec = test_spec::(); - let (block, blobs_vec) = - generate_rand_block_and_blobs::(ForkName::Deneb, NumBlobs::Random, &mut rng, &spec); + let (block, blobs_vec) = generate_rand_block_and_blobs::( + ForkName::Deneb, + NumBlobs::Random, + None, + &mut rng, + &spec, + ); let max_len = spec.max_blobs_per_block(block.epoch()) as usize; let mut blobs: RuntimeFixedVector>>> = 
RuntimeFixedVector::default(max_len); diff --git a/beacon_node/beacon_chain/src/test_utils.rs b/beacon_node/beacon_chain/src/test_utils.rs index c2c5d8d6266..18be6f592d7 100644 --- a/beacon_node/beacon_chain/src/test_utils.rs +++ b/beacon_node/beacon_chain/src/test_utils.rs @@ -2392,7 +2392,8 @@ where .take(sampling_column_count) .map(CustodyDataColumn::from_asserted_custody) .collect::>(); - RpcBlock::new_with_custody_columns(Some(block_root), block, columns, &self.spec)? + RpcBlock::new_with_custody_columns(Some(block_root), block, columns, &self.spec) + .map_err(BlockError::InternalError)? } else { RpcBlock::new_without_blobs(Some(block_root), block) } @@ -3144,10 +3145,14 @@ pub enum NumBlobs { pub fn generate_rand_block_and_blobs( fork_name: ForkName, num_blobs: NumBlobs, + parent_root: Option, rng: &mut impl Rng, spec: &ChainSpec, ) -> (SignedBeaconBlock>, Vec>) { - let inner = map_fork_name!(fork_name, BeaconBlock, <_>::random_for_test(rng)); + let mut inner = map_fork_name!(fork_name, BeaconBlock, <_>::random_for_test(rng)); + if let Some(parent_root) = parent_root { + *inner.parent_root_mut() = parent_root; + } let mut block = SignedBeaconBlock::from_block(inner, types::Signature::random_for_test(rng)); let max_blobs = spec.max_blobs_per_block(block.epoch()) as usize; @@ -3246,13 +3251,15 @@ pub fn generate_rand_block_and_blobs( pub fn generate_rand_block_and_data_columns( fork_name: ForkName, num_blobs: NumBlobs, + parent_root: Option, rng: &mut impl Rng, spec: &ChainSpec, ) -> ( SignedBeaconBlock>, DataColumnSidecarList, ) { - let (block, _blobs) = generate_rand_block_and_blobs(fork_name, num_blobs, rng, spec); + let (block, _blobs) = + generate_rand_block_and_blobs(fork_name, num_blobs, parent_root, rng, spec); let data_columns = generate_data_column_sidecars_from_block(&block, spec); (block, data_columns) } diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index 9f8c14f3398..73e2a9025c7 100644 
--- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -2603,7 +2603,7 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { .deconstruct(); if wss_fork.fulu_enabled() { info!(block_slot = %block.slot(), ?block_root, "Corrupting data column KZG proof"); - let (mut data_columns, expected_column_indices) = cols.unwrap(); + let mut data_columns = cols.unwrap(); assert!( !data_columns.is_empty(), "data column sidecars shouldn't be empty" @@ -2618,7 +2618,6 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { Some(block_root), block, data_columns.to_vec(), - expected_column_indices, &harness.spec, ) .unwrap() @@ -3819,7 +3818,6 @@ fn available_to_rpc_block(block: AvailableBlock, spec: &ChainSpec .into_iter() .map(|d| CustodyDataColumn::from_asserted_custody(d)) .collect(), - vec![], spec, ) .unwrap(), diff --git a/beacon_node/client/src/notifier.rs b/beacon_node/client/src/notifier.rs index 53c9c85c001..57dd8b0a34a 100644 --- a/beacon_node/client/src/notifier.rs +++ b/beacon_node/client/src/notifier.rs @@ -146,9 +146,7 @@ pub fn spawn_notifier( Instant::now(), ); } - SyncState::SyncingFinalized { .. } - | SyncState::SyncingHead { .. } - | SyncState::SyncTransition => { + SyncState::Syncing { .. } | SyncState::SyncTransition => { speedo.observe(head_slot, Instant::now()); } SyncState::Stalled | SyncState::Synced => {} diff --git a/beacon_node/http_api/src/lib.rs b/beacon_node/http_api/src/lib.rs index a4ec41ac06c..5d764464167 100644 --- a/beacon_node/http_api/src/lib.rs +++ b/beacon_node/http_api/src/lib.rs @@ -472,7 +472,8 @@ pub fn serve( move |network_globals: Arc>, chain: Arc>| async move { match *network_globals.sync_state.read() { - SyncState::SyncingFinalized { .. } => { + // TODO(tree-sync): review, we don't have a notion of finalized sync now + SyncState::Syncing { .. 
} => { let head_slot = chain.canonical_head.cached_head().head_slot(); let current_slot = @@ -494,9 +495,7 @@ pub fn serve( ))) } } - SyncState::SyncingHead { .. } - | SyncState::SyncTransition - | SyncState::BackFillSyncing { .. } => Ok(()), + SyncState::SyncTransition | SyncState::BackFillSyncing { .. } => Ok(()), SyncState::Synced => Ok(()), SyncState::Stalled => Ok(()), } diff --git a/beacon_node/lighthouse_network/src/peer_manager/network_behaviour.rs b/beacon_node/lighthouse_network/src/peer_manager/network_behaviour.rs index 1ad55ce5c4a..ca5dfafa352 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/network_behaviour.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/network_behaviour.rs @@ -79,10 +79,7 @@ impl NetworkBehaviour for PeerManager { } } - if !matches!( - self.network_globals.sync_state(), - SyncState::SyncingFinalized { .. } | SyncState::SyncingHead { .. } - ) { + if !matches!(self.network_globals.sync_state(), SyncState::Syncing { .. }) { loop { match self.status_peers.poll_next_unpin(cx) { Poll::Ready(Some(Ok(peer_id))) => { diff --git a/beacon_node/lighthouse_network/src/rpc/codec.rs b/beacon_node/lighthouse_network/src/rpc/codec.rs index f24074118eb..e37f1ad01c6 100644 --- a/beacon_node/lighthouse_network/src/rpc/codec.rs +++ b/beacon_node/lighthouse_network/src/rpc/codec.rs @@ -1088,7 +1088,12 @@ mod tests { } fn bbroot_request_v2(fork_name: ForkName) -> BlocksByRootRequest { - BlocksByRootRequest::new(vec![Hash256::zero()], &fork_context(fork_name)) + let fork_context = fork_context(fork_name); + BlocksByRootRequest::new( + vec![Hash256::zero()], + &fork_context.spec, + fork_context.current_fork(), + ) } fn blbroot_request(fork_name: ForkName) -> BlobsByRootRequest { diff --git a/beacon_node/lighthouse_network/src/rpc/methods.rs b/beacon_node/lighthouse_network/src/rpc/methods.rs index 8a11a6f29d6..1f9ad0868b4 100644 --- a/beacon_node/lighthouse_network/src/rpc/methods.rs +++ 
b/beacon_node/lighthouse_network/src/rpc/methods.rs @@ -16,9 +16,9 @@ use types::blob_sidecar::BlobIdentifier; use types::light_client_update::MAX_REQUEST_LIGHT_CLIENT_UPDATES; use types::{ blob_sidecar::BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, - DataColumnsByRootIdentifier, Epoch, EthSpec, ForkContext, Hash256, LightClientBootstrap, - LightClientFinalityUpdate, LightClientOptimisticUpdate, LightClientUpdate, RuntimeVariableList, - SignedBeaconBlock, Slot, + DataColumnsByRootIdentifier, Epoch, EthSpec, ForkContext, ForkName, Hash256, + LightClientBootstrap, LightClientFinalityUpdate, LightClientOptimisticUpdate, + LightClientUpdate, RuntimeVariableList, SignedBeaconBlock, Slot, }; /// Maximum length of error message. @@ -440,10 +440,8 @@ pub struct BlocksByRootRequest { } impl BlocksByRootRequest { - pub fn new(block_roots: Vec, fork_context: &ForkContext) -> Self { - let max_request_blocks = fork_context - .spec - .max_request_blocks(fork_context.current_fork()); + pub fn new(block_roots: Vec, spec: &ChainSpec, current_fork: ForkName) -> Self { + let max_request_blocks = spec.max_request_blocks(current_fork); let block_roots = RuntimeVariableList::from_vec(block_roots, max_request_blocks); Self::V2(BlocksByRootRequestV2 { block_roots }) } diff --git a/beacon_node/lighthouse_network/src/rpc/self_limiter.rs b/beacon_node/lighthouse_network/src/rpc/self_limiter.rs index f26dc4c7a84..f28ae8d5e09 100644 --- a/beacon_node/lighthouse_network/src/rpc/self_limiter.rs +++ b/beacon_node/lighthouse_network/src/rpc/self_limiter.rs @@ -313,13 +313,22 @@ mod tests { use crate::rpc::rate_limiter::Quota; use crate::rpc::self_limiter::SelfRateLimiter; use crate::rpc::{Ping, Protocol, RPCSend, RequestType}; - use crate::service::api_types::{AppRequestId, SingleLookupReqId, SyncRequestId}; + use crate::service::api_types::{ + AppRequestId, BlocksByRootRequestId, BlocksByRootRequester, HeaderLookupId, SyncRequestId, + }; use libp2p::PeerId; use 
logging::create_test_tracing_subscriber; use std::num::NonZeroU64; use std::time::Duration; use types::{EthSpec, ForkContext, Hash256, MainnetEthSpec, Slot}; + fn get_parent_request_id() -> BlocksByRootRequester { + BlocksByRootRequester::Header(HeaderLookupId { + id: 0, + block_root: Hash256::ZERO, + }) + } + /// Test that `next_peer_request_ready` correctly maintains the queue. #[tokio::test] async fn test_next_peer_request_ready() { @@ -336,17 +345,15 @@ mod tests { let mut limiter: SelfRateLimiter = SelfRateLimiter::new(Some(config), fork_context).unwrap(); let peer_id = PeerId::random(); - let lookup_id = 0; + let parent_request_id = get_parent_request_id(); for i in 1..=5u32 { let _ = limiter.allows( peer_id, - AppRequestId::Sync(SyncRequestId::SingleBlock { - id: SingleLookupReqId { - lookup_id, - req_id: i, - }, - }), + AppRequestId::Sync(SyncRequestId::BlocksByRoot(BlocksByRootRequestId { + id: i, + parent_request_id, + })), RequestType::Ping(Ping { data: i as u64 }), ); } @@ -363,9 +370,7 @@ mod tests { for i in 2..=5u32 { assert!(matches!( iter.next().unwrap().request_id, - AppRequestId::Sync(SyncRequestId::SingleBlock { - id: SingleLookupReqId { req_id, .. }, - }) if req_id == i, + AppRequestId::Sync(SyncRequestId::BlocksByRoot(BlocksByRootRequestId{id,..})) if id == i, )); } @@ -388,9 +393,7 @@ mod tests { for i in 3..=5 { assert!(matches!( iter.next().unwrap().request_id, - AppRequestId::Sync(SyncRequestId::SingleBlock { - id: SingleLookupReqId { req_id, .. 
}, - }) if req_id == i, + AppRequestId::Sync(SyncRequestId::BlocksByRoot(BlocksByRootRequestId{id,..})) if id == i, )); } @@ -409,16 +412,15 @@ mod tests { let mut limiter: SelfRateLimiter = SelfRateLimiter::new(None, fork_context).unwrap(); let peer_id = PeerId::random(); + let parent_request_id = get_parent_request_id(); for i in 1..=5u32 { let result = limiter.allows( peer_id, - AppRequestId::Sync(SyncRequestId::SingleBlock { - id: SingleLookupReqId { - lookup_id: i, - req_id: i, - }, - }), + AppRequestId::Sync(SyncRequestId::BlocksByRoot(BlocksByRootRequestId { + id: i, + parent_request_id, + })), RequestType::Ping(Ping { data: i as u64 }), ); @@ -469,9 +471,7 @@ mod tests { assert!(matches!( request_id, - AppRequestId::Sync(SyncRequestId::SingleBlock { - id: SingleLookupReqId { req_id, .. }, - }) if *req_id == i + AppRequestId::Sync(SyncRequestId::BlocksByRoot(BlocksByRootRequestId {id,..})) if *id == i )); } } @@ -487,17 +487,16 @@ mod tests { SelfRateLimiter::new(None, fork_context).unwrap(); let peer1 = PeerId::random(); let peer2 = PeerId::random(); + let parent_request_id = get_parent_request_id(); for peer in [peer1, peer2] { for i in 1..=5u32 { let result = limiter.allows( peer, - AppRequestId::Sync(SyncRequestId::SingleBlock { - id: SingleLookupReqId { - lookup_id: i, - req_id: i, - }, - }), + AppRequestId::Sync(SyncRequestId::BlocksByRoot(BlocksByRootRequestId { + id: i, + parent_request_id, + })), RequestType::Ping(Ping { data: i as u64 }), ); @@ -525,9 +524,7 @@ mod tests { let (request_id, _) = failed_requests.remove(0); assert!(matches!( request_id, - AppRequestId::Sync(SyncRequestId::SingleBlock { - id: SingleLookupReqId { req_id, .. 
}, - }) if req_id == i + AppRequestId::Sync(SyncRequestId::BlocksByRoot(BlocksByRootRequestId{id,..})) if id == i )); } diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index b36f8cc2154..5b1281efe06 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -2,7 +2,7 @@ use crate::rpc::methods::{ResponseTermination, RpcResponse, RpcSuccessResponse, use std::fmt::{Display, Formatter}; use std::sync::Arc; use types::{ - BlobSidecar, DataColumnSidecar, Epoch, EthSpec, Hash256, LightClientBootstrap, + BlobSidecar, DataColumnSidecar, EthSpec, Hash256, LightClientBootstrap, LightClientFinalityUpdate, LightClientOptimisticUpdate, LightClientUpdate, SignedBeaconBlock, }; @@ -18,55 +18,65 @@ pub struct SingleLookupReqId { #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub enum SyncRequestId { /// Request searching for a block given a hash. - SingleBlock { id: SingleLookupReqId }, + BlocksByRoot(BlocksByRootRequestId), /// Request searching for a set of blobs given a hash. - SingleBlob { id: SingleLookupReqId }, + BlobsByRoot(BlobsByRootRequestId), /// Request searching for a set of data columns given a hash and list of column indices. DataColumnsByRoot(DataColumnsByRootRequestId), - /// Blocks by range request - BlocksByRange(BlocksByRangeRequestId), - /// Blobs by range request - BlobsByRange(BlobsByRangeRequestId), - /// Data columns by range request - DataColumnsByRange(DataColumnsByRangeRequestId), + /// Request for headers_by_root + HeadersByRoot(HeadersByRootRequestId), } -/// Request ID for data_columns_by_root requests. Block lookups do not issue this request directly. -/// Wrapping this particular req_id, ensures not mixing this request with a custody req_id. 
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] -pub struct DataColumnsByRootRequestId { +pub struct BlocksByRootRequestId { pub id: Id, - pub requester: DataColumnsByRootRequester, + pub parent_request_id: BlocksByRootRequester, } #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] -pub struct BlocksByRangeRequestId { - /// Id to identify this attempt at a blocks_by_range request for `parent_request_id` +pub struct HeadersByRootRequestId { pub id: Id, - /// The Id of the overall By Range request for block components. - pub parent_request_id: ComponentsByRangeRequestId, + pub parent_request_id: HeaderLookupId, } #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] -pub struct BlobsByRangeRequestId { - /// Id to identify this attempt at a blobs_by_range request for `parent_request_id` +pub struct HeaderLookupId { pub id: Id, - /// The Id of the overall By Range request for block components. - pub parent_request_id: ComponentsByRangeRequestId, + pub chain_id: HeaderChainId, } +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, PartialOrd, Ord)] +pub struct HeaderChainId(pub Id); + +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub struct BatchId(pub Id); + #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] -pub struct DataColumnsByRangeRequestId { - /// Id to identify this attempt at a data_columns_by_range request for `parent_request_id` +pub struct ForwardSyncLookupId { + pub id: Id, + pub block_root: Hash256, +} + +/// Request ID for data_columns_by_root requests. Block lookups do not issue this request directly. +/// Wrapping this particular req_id, ensures not mixing this request with a custody req_id. 
+#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub struct DataColumnsByRootRequestId { + pub id: Id, + pub parent_request_id: DataColumnsByRootRequester, +} + +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub struct BlobsByRootRequestId { + /// Id to identify this attempt at a blobs_by_root request for `parent_request_id` pub id: Id, /// The Id of the overall By Range request for block components. - pub parent_request_id: ComponentsByRangeRequestId, + pub parent_request_id: ComponentsByRootRequestId, } /// Block components by range request for range sync. Includes an ID for downstream consumers to /// handle retries and tie all their sub requests together. #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] -pub struct ComponentsByRangeRequestId { +pub struct ComponentsByRootRequestId { /// Each `RangeRequestId` may request the same data in a later retry. This Id identifies the /// current attempt. pub id: Id, @@ -77,20 +87,20 @@ pub struct ComponentsByRangeRequestId { /// Range sync chain or backfill batch #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub enum RangeRequestId { - RangeSync { chain_id: Id, batch_id: Epoch }, - BackfillSync { batch_id: Epoch }, + ForwardSync(ForwardSyncLookupId), + BackfillSync(Id), } #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] -pub enum DataColumnsByRootRequester { - Sampling(SamplingId), - Custody(CustodyId), +pub enum BlocksByRootRequester { + Header(HeaderLookupId), + ForwardSync(ComponentsByRootRequestId), } #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] -pub enum RangeRequester { - RangeSync { chain_id: u64, batch_id: Epoch }, - BackfillSync { batch_id: Epoch }, +pub enum DataColumnsByRootRequester { + Sampling(SamplingId), + Custody(CustodyByRootRequestId), } #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] @@ -109,15 +119,10 @@ pub enum SamplingRequester { pub struct SamplingRequestId(pub usize); #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] -pub struct CustodyId { - pub requester: 
CustodyRequester, +pub struct CustodyByRootRequestId { + pub parent_request_id: ComponentsByRootRequestId, } -/// Downstream components that perform custody by root requests. -/// Currently, it's only single block lookups, so not using an enum -#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] -pub struct CustodyRequester(pub SingleLookupReqId); - /// Application level requests sent to the network. #[derive(Debug, Clone, Copy)] pub enum AppRequestId { @@ -218,14 +223,18 @@ macro_rules! impl_display { // Since each request Id is deeply nested with various types, if rendered with Debug on logs they // take too much visual space. This custom Display implementations make the overall Id short while // not losing information -impl_display!(BlocksByRangeRequestId, "{}/{}", id, parent_request_id); -impl_display!(BlobsByRangeRequestId, "{}/{}", id, parent_request_id); -impl_display!(DataColumnsByRangeRequestId, "{}/{}", id, parent_request_id); -impl_display!(ComponentsByRangeRequestId, "{}/{}", id, requester); -impl_display!(DataColumnsByRootRequestId, "{}/{}", id, requester); +impl_display!(ComponentsByRootRequestId, "{}/{}", id, requester); +impl_display!(BlocksByRootRequestId, "{}/{}", id, parent_request_id); +impl_display!(BlobsByRootRequestId, "{}/{}", id, parent_request_id); +impl_display!(DataColumnsByRootRequestId, "{}/{}", id, parent_request_id); +impl_display!(HeadersByRootRequestId, "{}/{}", id, parent_request_id); impl_display!(SingleLookupReqId, "{}/Lookup/{}", req_id, lookup_id); -impl_display!(CustodyId, "{}", requester); +impl_display!(CustodyByRootRequestId, "{}", parent_request_id); impl_display!(SamplingId, "{}/{}", sampling_request_id, id); +// Print only the ID to make logs succinct. On lookup creation we log the ID and the block root to // link them. 
+impl_display!(HeaderLookupId, "{}", id); +impl_display!(ForwardSyncLookupId, "{}/{}", id, block_root); impl Display for DataColumnsByRootRequester { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { @@ -236,7 +245,13 @@ impl Display for DataColumnsByRootRequester { } } -impl Display for CustodyRequester { +impl Display for HeaderChainId { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl Display for BatchId { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) } @@ -245,8 +260,17 @@ impl Display for CustodyRequester { impl Display for RangeRequestId { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { - Self::RangeSync { chain_id, batch_id } => write!(f, "RangeSync/{batch_id}/{chain_id}"), - Self::BackfillSync { batch_id } => write!(f, "BackfillSync/{batch_id}"), + Self::ForwardSync(id) => write!(f, "ForwardSync/{id}"), + Self::BackfillSync(id) => write!(f, "BackfillSync/{id}"), + } + } +} + +impl Display for BlocksByRootRequester { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Header(id) => write!(f, "Header/{id}"), + Self::ForwardSync(id) => write!(f, "ForwardSync/{id}"), } } } @@ -273,11 +297,14 @@ mod tests { fn display_id_data_columns_by_root_custody() { let id = DataColumnsByRootRequestId { id: 123, - requester: DataColumnsByRootRequester::Custody(CustodyId { - requester: CustodyRequester(SingleLookupReqId { - req_id: 121, - lookup_id: 101, - }), + parent_request_id: DataColumnsByRootRequester::Custody(CustodyByRootRequestId { + parent_request_id: ComponentsByRootRequestId { + id: 121, + requester: RangeRequestId::ForwardSync(HeaderLookupId { + id: 1, + block_root: Hash256::ZERO, + }), + }, }), }; assert_eq!(format!("{id}"), "123/Custody/121/Lookup/101"); @@ -287,26 +314,11 @@ mod tests { fn display_id_data_columns_by_root_sampling() { let id = DataColumnsByRootRequestId { id: 123, - requester: 
DataColumnsByRootRequester::Sampling(SamplingId { + parent_request_id: DataColumnsByRootRequester::Sampling(SamplingId { id: SamplingRequester::ImportedBlock(Hash256::ZERO), sampling_request_id: SamplingRequestId(101), }), }; assert_eq!(format!("{id}"), "123/Sampling/101/ImportedBlock/0x0000000000000000000000000000000000000000000000000000000000000000"); } - - #[test] - fn display_id_data_columns_by_range() { - let id = DataColumnsByRangeRequestId { - id: 123, - parent_request_id: ComponentsByRangeRequestId { - id: 122, - requester: RangeRequestId::RangeSync { - chain_id: 54, - batch_id: Epoch::new(0), - }, - }, - }; - assert_eq!(format!("{id}"), "123/122/RangeSync/0/54"); - } } diff --git a/beacon_node/lighthouse_network/src/types/globals.rs b/beacon_node/lighthouse_network/src/types/globals.rs index d1ed1c33b07..1c11e7aa1f0 100644 --- a/beacon_node/lighthouse_network/src/types/globals.rs +++ b/beacon_node/lighthouse_network/src/types/globals.rs @@ -248,6 +248,10 @@ impl NetworkGlobals { } } + pub fn sampling_columns_count(&self) -> usize { + self.sampling_columns.read().len() + } + pub fn sampling_columns(&self) -> HashSet { self.sampling_columns.read().clone() } @@ -271,6 +275,25 @@ impl NetworkGlobals { Self::new_test_globals_with_metadata(trusted_peers, metadata, config, spec) } + pub fn new_test_globals_as_supernode( + trusted_peers: Vec, + config: Arc, + spec: Arc, + is_supernode: bool, + ) -> NetworkGlobals { + let metadata = MetaData::V3(MetaDataV3 { + seq_number: 0, + attnets: Default::default(), + syncnets: Default::default(), + custody_group_count: if is_supernode { + spec.number_of_custody_groups + } else { + spec.custody_requirement + }, + }); + Self::new_test_globals_with_metadata(trusted_peers, metadata, config, spec) + } + pub(crate) fn new_test_globals_with_metadata( trusted_peers: Vec, metadata: MetaData, diff --git a/beacon_node/network/Cargo.toml b/beacon_node/network/Cargo.toml index cdb6ba7a83f..ad0684bb91e 100644 --- 
a/beacon_node/network/Cargo.toml +++ b/beacon_node/network/Cargo.toml @@ -57,3 +57,4 @@ kzg = { workspace = true } matches = "0.1.8" rand_chacha = "0.3.1" serde_json = { workspace = true } +fork_choice = { workspace = true } diff --git a/beacon_node/network/src/metrics.rs b/beacon_node/network/src/metrics.rs index 05c7dc287b0..2588ca6b12c 100644 --- a/beacon_node/network/src/metrics.rs +++ b/beacon_node/network/src/metrics.rs @@ -403,32 +403,23 @@ pub static SYNCING_CHAINS_COUNT: LazyLock> = LazyLock::new(| &["range_type"], ) }); -pub static SYNCING_CHAINS_REMOVED: LazyLock> = LazyLock::new(|| { +pub static SYNC_CHAINS_REMOVED: LazyLock> = LazyLock::new(|| { try_create_int_counter_vec( - "sync_range_removed_chains_total", - "Total count of range syncing chains removed per range type", - &["range_type"], - ) -}); -pub static SYNCING_CHAINS_ADDED: LazyLock> = LazyLock::new(|| { - try_create_int_counter_vec( - "sync_range_added_chains_total", - "Total count of range syncing chains added per range type", - &["range_type"], + "sync_removed_chains_total", + "Total count of forward sync chains removed", + &["reason"], ) }); -pub static SYNCING_CHAINS_DROPPED_BLOCKS: LazyLock> = LazyLock::new(|| { - try_create_int_counter_vec( - "sync_range_chains_dropped_blocks_total", - "Total count of dropped blocks when removing a syncing chain per range type", - &["range_type"], +pub static SYNC_CHAINS_ADDED: LazyLock> = LazyLock::new(|| { + try_create_int_counter( + "sync_added_chains_total", + "Total count of forward sync chains added", ) }); -pub static SYNCING_CHAINS_IGNORED_BLOCKS: LazyLock> = LazyLock::new(|| { - try_create_int_counter_vec( +pub static SYNCING_CHAINS_IGNORED_BLOCKS: LazyLock> = LazyLock::new(|| { + try_create_int_counter( "sync_range_chains_ignored_blocks_total", "Total count of ignored blocks when processing a syncing chain batch per chain type", - &["chain_type"], ) }); pub static SYNCING_CHAINS_PROCESSED_BATCHES: LazyLock> = @@ -461,23 +452,10 @@ pub static 
SYNC_LOOKUP_CREATED: LazyLock> = LazyLock::new(|| "Total count of sync lookups created", ) }); -pub static SYNC_LOOKUP_DROPPED: LazyLock> = LazyLock::new(|| { - try_create_int_counter_vec( - "sync_lookups_dropped_total", - "Total count of sync lookups dropped by reason", - &["reason"], - ) -}); -pub static SYNC_LOOKUP_COMPLETED: LazyLock> = LazyLock::new(|| { +pub static SYNC_FORWARD_BLOCKS_DROPPED: LazyLock> = LazyLock::new(|| { try_create_int_counter( - "sync_lookups_completed_total", - "Total count of sync lookups completed", - ) -}); -pub static SYNC_LOOKUPS_STUCK: LazyLock> = LazyLock::new(|| { - try_create_int_counter( - "sync_lookups_stuck_total", - "Total count of sync lookups that are stuck and dropped", + "sync_forward_lookups_dropped_total", + "Total count of forward sync blocks dropped by reason", ) }); pub static SYNC_ACTIVE_NETWORK_REQUESTS: LazyLock> = LazyLock::new(|| { @@ -494,6 +472,115 @@ pub static SYNC_UNKNOWN_NETWORK_REQUESTS: LazyLock> = Lazy &["type"], ) }); +pub static SYNC_RPC_REQUEST_SUCCESSES: LazyLock> = LazyLock::new(|| { + try_create_int_counter_vec( + "sync_rpc_requests_success_total", + "Total count of sync RPC requests successes", + &["protocol"], + ) +}); +pub static SYNC_RPC_REQUEST_ERRORS: LazyLock> = LazyLock::new(|| { + try_create_int_counter_vec( + "sync_rpc_requests_error_total", + "Total count of sync RPC requests errors", + &["protocol"], + ) +}); +pub static SYNC_RPC_REQUEST_TIME: LazyLock> = LazyLock::new(|| { + try_create_histogram_vec_with_buckets( + "sync_rpc_request_duration_sec", + "Time to complete a successful sync RPC requesst", + Ok(vec![ + 0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 1.0, 2.0, + ]), + &["protocol"], + ) +}); +pub static SYNC_HEADERS_DOWNLOADED: LazyLock> = LazyLock::new(|| { + try_create_int_counter( + "sync_headers_downloaded_total", + "Total count of forward sync headers downloaded", + ) +}); +pub static SYNC_BLOCKS_PROCESSED: LazyLock> = LazyLock::new(|| { + 
try_create_int_counter( + "sync_blocks_processed_total", + "Total count of forward sync blocks processed", + ) +}); +pub static SYNC_LOOKUPS_CREATED: LazyLock> = LazyLock::new(|| { + try_create_int_counter( + "sync_lookups_created_total", + "Total count of forward sync lookups created", + ) +}); +pub static SYNC_LOOKUPS_DROPPED: LazyLock> = LazyLock::new(|| { + try_create_int_counter( + "sync_lookups_dropped_total", + "Total count of forward sync lookups dropped", + ) +}); +pub static SYNC_HEADER_MIN_SLOT: LazyLock> = LazyLock::new(|| { + try_create_int_gauge( + "sync_header_min_slot", + "Current min slot of foward sync headers", + ) +}); +pub static SYNC_HEADER_MAX_SLOT: LazyLock> = LazyLock::new(|| { + try_create_int_gauge( + "sync_header_max_slot", + "Current max slot of foward sync headers", + ) +}); +pub static SYNC_HEADERS_COUNT: LazyLock> = LazyLock::new(|| { + try_create_int_gauge("sync_headers_count", "Current count of headers in memory") +}); +pub static SYNC_HEADER_CHAINS_COUNT: LazyLock> = LazyLock::new(|| { + try_create_int_gauge( + "sync_header_chains_count", + "Current count of header chains in memory", + ) +}); +pub static SYNC_FORWARD_SYNC_BLOCKS_COUNT: LazyLock> = LazyLock::new(|| { + try_create_int_gauge( + "sync_forward_sync_blocks_count", + "Current count of forward sync blocks in memory", + ) +}); +pub static SYNC_CHAIN_MERGES_COUNT: LazyLock> = LazyLock::new(|| { + try_create_int_counter( + "sync_forward_chain_merges_total", + "Total count of forward sync chain merges", + ) +}); +pub static SYNC_CHAIN_ERROR_COUNT: LazyLock> = LazyLock::new(|| { + try_create_int_counter_vec( + "sync_forward_chain_error_total", + "Total count of forward sync chain errors", + &["error"], + ) +}); +pub static SYNC_BLOCK_DOWNLOADING_TIME: LazyLock> = LazyLock::new(|| { + try_create_histogram_with_buckets( + "sync_block_downloading_time_seconds", + "Time to complete SyncBlock Downloading state", + decimal_buckets(-3, -1), + ) +}); +pub static 
SYNC_BLOCK_AWAITING_PROCESSING_TIME: LazyLock> = LazyLock::new(|| { + try_create_histogram_with_buckets( + "sync_block_awaiting_processing_time_seconds", + "Time to complete SyncBlock AwaitingProcessing state", + decimal_buckets(-3, -1), + ) +}); +pub static SYNC_BLOCK_PROCESSING_TIME: LazyLock> = LazyLock::new(|| { + try_create_histogram_with_buckets( + "sync_block_processing_time_seconds", + "Time to complete SyncBlock Processing state", + decimal_buckets(-3, -1), + ) +}); /* * Block Delay Metrics diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index 37f3b00e443..c8eb5c5571d 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -1,4 +1,3 @@ -use crate::sync::manager::BlockProcessType; use crate::sync::SamplingId; use crate::{service::NetworkMessage, sync::manager::SyncMessage}; use beacon_chain::blob_verification::{GossipBlobError, GossipVerifiedBlob}; @@ -34,7 +33,6 @@ use tracing::{debug, error, trace, warn, Instrument}; use types::*; pub use sync_methods::{ChainSegmentProcessId, PeerGroupAction}; -use types::blob_sidecar::FixedBlobSidecarList; pub type Error = TrySendError>; @@ -479,82 +477,12 @@ impl NetworkBeaconProcessor { }) } - /// Create a new `Work` event for some block, where the result from computation (if any) is - /// sent to the other side of `result_tx`. 
- pub fn send_rpc_beacon_block( - self: &Arc, - block_root: Hash256, - block: RpcBlock, - seen_timestamp: Duration, - process_type: BlockProcessType, - ) -> Result<(), Error> { - let process_fn = self.clone().generate_rpc_beacon_block_process_fn( - block_root, - block, - seen_timestamp, - process_type, - ); - self.try_send(BeaconWorkEvent { - drop_during_sync: false, - work: Work::RpcBlock { process_fn }, - }) - } - - /// Create a new `Work` event for some blobs, where the result from computation (if any) is - /// sent to the other side of `result_tx`. - pub fn send_rpc_blobs( - self: &Arc, - block_root: Hash256, - blobs: FixedBlobSidecarList, - seen_timestamp: Duration, - process_type: BlockProcessType, - ) -> Result<(), Error> { - let blob_count = blobs.iter().filter(|b| b.is_some()).count(); - if blob_count == 0 { - return Ok(()); - } - let process_fn = self.clone().generate_rpc_blobs_process_fn( - block_root, - blobs, - seen_timestamp, - process_type, - ); - self.try_send(BeaconWorkEvent { - drop_during_sync: false, - work: Work::RpcBlobs { process_fn }, - }) - } - - /// Create a new `Work` event for some custody columns. `process_rpc_custody_columns` reports - /// the result back to sync. - pub fn send_rpc_custody_columns( - self: &Arc, - block_root: Hash256, - custody_columns: DataColumnSidecarList, - seen_timestamp: Duration, - process_type: BlockProcessType, - ) -> Result<(), Error> { - let s = self.clone(); - self.try_send(BeaconWorkEvent { - drop_during_sync: false, - work: Work::RpcCustodyColumn(Box::pin(async move { - s.process_rpc_custody_columns( - block_root, - custody_columns, - seen_timestamp, - process_type, - ) - .await; - })), - }) - } - /// Create a new `Work` event for some sampling columns, and reports the verification result /// back to sync. 
pub fn send_rpc_validate_data_columns( self: &Arc, block_root: Hash256, - data_columns: Vec>>, + data_columns: DataColumnSidecarList, seen_timestamp: Duration, id: SamplingId, ) -> Result<(), Error> { @@ -592,21 +520,16 @@ impl NetworkBeaconProcessor { process_id: ChainSegmentProcessId, blocks: Vec>, ) -> Result<(), Error> { - let is_backfill = matches!(&process_id, ChainSegmentProcessId::BackSyncBatchId { .. }); - debug!(blocks = blocks.len(), id = ?process_id, "Batch sending for process"); + let is_backfill = matches!(&process_id, ChainSegmentProcessId::BackfillSync { .. }); + debug!(blocks = blocks.len(), id = %process_id, "Batch sending for process"); let processor = self.clone(); let process_fn = async move { - let notify_execution_layer = if processor - .network_globals - .sync_state - .read() - .is_syncing_finalized() - { - NotifyExecutionLayer::No - } else { - NotifyExecutionLayer::Yes - }; + // TODO(tree-sync): Now that we group peers in a header tree they could have diverging + // opinions on what's finalized and what's not. So we don't have a clear yes / no to guess + // if this block is finalized or not. Review the optimization of NOT notifying the + // execution layer if we believe this block is finalized. 
+ let notify_execution_layer = NotifyExecutionLayer::Yes; processor .process_chain_segment(process_id, blocks, notify_execution_layer) .await; diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index b1777cef792..a3ab3459709 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -1,45 +1,32 @@ -use crate::metrics::{self, register_process_result_metrics}; +use crate::metrics::{self}; use crate::network_beacon_processor::{NetworkBeaconProcessor, FUTURE_SLOT_TOLERANCE}; +use crate::sync::manager::SyncMessage; use crate::sync::BatchProcessResult; -use crate::sync::{ - manager::{BlockProcessType, SyncMessage}, - ChainId, -}; use beacon_chain::block_verification_types::{AsBlock, RpcBlock}; use beacon_chain::data_availability_checker::AvailabilityCheckError; use beacon_chain::data_column_verification::verify_kzg_for_data_column_list; use beacon_chain::{ - validator_monitor::get_slot_delay_ms, AvailabilityProcessingStatus, BeaconChainTypes, - BlockError, ChainSegmentResult, HistoricalBlockError, NotifyExecutionLayer, -}; -use beacon_processor::{ - work_reprocessing_queue::{QueuedRpcBlock, ReprocessQueueMessage}, - AsyncFn, BlockingFn, DuplicateCache, + BeaconChainTypes, BlockError, ChainSegmentResult, HistoricalBlockError, NotifyExecutionLayer, }; +use lighthouse_network::service::api_types::{ForwardSyncLookupId, Id}; use lighthouse_network::PeerAction; use std::collections::HashMap; +use std::fmt::{Display, Formatter}; use std::sync::Arc; use std::time::Duration; -use store::KzgCommitment; -use tokio::sync::mpsc; -use tracing::{debug, error, info, warn}; -use types::beacon_block_body::format_kzg_commitments; -use types::blob_sidecar::FixedBlobSidecarList; -use types::{ - BlockImportSource, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, Hash256, -}; +use tracing::{debug, 
warn}; +use types::{ColumnIndex, DataColumnSidecar, Hash256}; /// Id associated to a batch processing request, either a sync batch or a parent lookup. #[derive(Clone, Debug, PartialEq)] pub enum ChainSegmentProcessId { /// Processing Id of a range syncing batch. - RangeBatchId(ChainId, Epoch), + ForwardSync(ForwardSyncLookupId), /// Processing ID for a backfill syncing batch. - BackSyncBatchId(Epoch), + BackfillSync(Id), } /// Returned when a chain segment import fails. -#[derive(Debug)] pub struct ChainSegmentFailed { /// To be displayed in logs. pub message: String, @@ -101,376 +88,6 @@ impl PeerGroupAction { } impl NetworkBeaconProcessor { - /// Returns an async closure which processes a beacon block received via RPC. - /// - /// This separate function was required to prevent a cycle during compiler - /// type checking. - pub fn generate_rpc_beacon_block_process_fn( - self: Arc, - block_root: Hash256, - block: RpcBlock, - seen_timestamp: Duration, - process_type: BlockProcessType, - ) -> AsyncFn { - let process_fn = async move { - let reprocess_tx = self.reprocess_tx.clone(); - let duplicate_cache = self.duplicate_cache.clone(); - self.process_rpc_block( - block_root, - block, - seen_timestamp, - process_type, - reprocess_tx, - duplicate_cache, - ) - .await; - }; - Box::pin(process_fn) - } - - /// Returns the `process_fn` and `ignore_fn` required when requeuing an RPC block. - pub fn generate_rpc_beacon_block_fns( - self: Arc, - block_root: Hash256, - block: RpcBlock, - seen_timestamp: Duration, - process_type: BlockProcessType, - ) -> (AsyncFn, BlockingFn) { - // An async closure which will import the block. - let process_fn = self.clone().generate_rpc_beacon_block_process_fn( - block_root, - block, - seen_timestamp, - process_type.clone(), - ); - // A closure which will ignore the block. 
- let ignore_fn = move || { - // Sync handles these results - self.send_sync_message(SyncMessage::BlockComponentProcessed { - process_type, - result: crate::sync::manager::BlockProcessingResult::Ignored, - }); - }; - (process_fn, Box::new(ignore_fn)) - } - - /// Attempt to process a block received from a direct RPC request. - #[allow(clippy::too_many_arguments)] - pub async fn process_rpc_block( - self: Arc>, - block_root: Hash256, - block: RpcBlock, - seen_timestamp: Duration, - process_type: BlockProcessType, - reprocess_tx: mpsc::Sender, - duplicate_cache: DuplicateCache, - ) { - // Check if the block is already being imported through another source - let Some(handle) = duplicate_cache.check_and_insert(block_root) else { - debug!( - action = "sending rpc block to reprocessing queue", - %block_root, - ?process_type, - "Gossip block is being processed" - ); - - // Send message to work reprocess queue to retry the block - let (process_fn, ignore_fn) = self.clone().generate_rpc_beacon_block_fns( - block_root, - block, - seen_timestamp, - process_type, - ); - let reprocess_msg = ReprocessQueueMessage::RpcBlock(QueuedRpcBlock { - beacon_block_root: block_root, - process_fn, - ignore_fn, - }); - - if reprocess_tx.try_send(reprocess_msg).is_err() { - error!(source = "rpc", %block_root,"Failed to inform block import") - }; - return; - }; - - let slot = block.slot(); - let block_has_data = block.as_block().num_expected_blobs() > 0; - let parent_root = block.message().parent_root(); - let commitments_formatted = block.as_block().commitments_formatted(); - - debug!( - ?block_root, - proposer = block.message().proposer_index(), - slot = %block.slot(), - commitments_formatted, - ?process_type, - "Processing RPC block" - ); - - let signed_beacon_block = block.block_cloned(); - let result = self - .chain - .process_block_with_early_caching( - block_root, - block, - BlockImportSource::Lookup, - NotifyExecutionLayer::Yes, - ) - .await; - register_process_result_metrics(&result, 
metrics::BlockSource::Rpc, "block"); - - // RPC block imported, regardless of process type - match result.as_ref() { - Ok(AvailabilityProcessingStatus::Imported(hash)) => { - info!( - %slot, - %hash, - "New RPC block received", - ); - // Trigger processing for work referencing this block. - let reprocess_msg = ReprocessQueueMessage::BlockImported { - block_root: *hash, - parent_root, - }; - if reprocess_tx.try_send(reprocess_msg).is_err() { - error!( - source = "rpc", - block_root = %hash, - "Failed to inform block import" - ); - }; - self.chain.block_times_cache.write().set_time_observed( - *hash, - slot, - seen_timestamp, - None, - None, - ); - - self.chain.recompute_head_at_current_slot().await; - } - Ok(AvailabilityProcessingStatus::MissingComponents(..)) => { - // Block is valid, we can now attempt fetching blobs from EL using version hashes - // derived from kzg commitments from the block, without having to wait for all blobs - // to be sent from the peers if we already have them. - let publish_blobs = false; - self.fetch_engine_blobs_and_publish(signed_beacon_block, block_root, publish_blobs) - .await - } - _ => {} - } - - // RPC block imported or execution validated. If the block was already imported by gossip we - // receive Err(BlockError::AlreadyKnown). - if result.is_ok() && - // Block has at least one blob, so it produced columns - block_has_data && - // Block slot is within the DA boundary (should always be the case) and PeerDAS is activated - self.chain.should_sample_slot(slot) - { - self.send_sync_message(SyncMessage::SampleBlock(block_root, slot)); - } - - // Sync handles these results - self.send_sync_message(SyncMessage::BlockComponentProcessed { - process_type, - result: result.into(), - }); - - // Drop the handle to remove the entry from the cache - drop(handle); - } - - /// Returns an async closure which processes a list of blobs received via RPC. 
- /// - /// This separate function was required to prevent a cycle during compiler - /// type checking. - pub fn generate_rpc_blobs_process_fn( - self: Arc, - block_root: Hash256, - blobs: FixedBlobSidecarList, - seen_timestamp: Duration, - process_type: BlockProcessType, - ) -> AsyncFn { - let process_fn = async move { - self.clone() - .process_rpc_blobs(block_root, blobs, seen_timestamp, process_type) - .await; - }; - Box::pin(process_fn) - } - - /// Attempt to process a list of blobs received from a direct RPC request. - pub async fn process_rpc_blobs( - self: Arc>, - block_root: Hash256, - blobs: FixedBlobSidecarList, - seen_timestamp: Duration, - process_type: BlockProcessType, - ) { - let Some(slot) = blobs - .iter() - .find_map(|blob| blob.as_ref().map(|blob| blob.slot())) - else { - return; - }; - - let (indices, commitments): (Vec, Vec) = blobs - .iter() - .filter_map(|blob_opt| { - blob_opt - .as_ref() - .map(|blob| (blob.index, blob.kzg_commitment)) - }) - .unzip(); - let commitments = format_kzg_commitments(&commitments); - - debug!( - ?indices, - %block_root, - %slot, - commitments, - "RPC blobs received" - ); - - if let Ok(current_slot) = self.chain.slot() { - if current_slot == slot { - // Note: this metric is useful to gauge how long it takes to receive blobs requested - // over rpc. Since we always send the request for block components at `slot_clock.single_lookup_delay()` - // we can use that as a baseline to measure against. 
- let delay = get_slot_delay_ms(seen_timestamp, slot, &self.chain.slot_clock); - - metrics::observe_duration(&metrics::BEACON_BLOB_RPC_SLOT_START_DELAY_TIME, delay); - } - } - - let result = self.chain.process_rpc_blobs(slot, block_root, blobs).await; - register_process_result_metrics(&result, metrics::BlockSource::Rpc, "blobs"); - - match &result { - Ok(AvailabilityProcessingStatus::Imported(hash)) => { - debug!( - result = "imported block and blobs", - %slot, - block_hash = %hash, - "Block components retrieved" - ); - self.chain.recompute_head_at_current_slot().await; - } - Ok(AvailabilityProcessingStatus::MissingComponents(_, _)) => { - debug!( - block_hash = %block_root, - %slot, - "Missing components over rpc" - ); - } - Err(BlockError::DuplicateFullyImported(_)) => { - debug!( - block_hash = %block_root, - %slot, - "Blobs have already been imported" - ); - } - Err(e) => { - warn!( - error = ?e, - block_hash = %block_root, - %slot, - "Error when importing rpc blobs" - ); - } - } - - // Sync handles these results - self.send_sync_message(SyncMessage::BlockComponentProcessed { - process_type, - result: result.into(), - }); - } - - pub async fn process_rpc_custody_columns( - self: Arc>, - block_root: Hash256, - custody_columns: DataColumnSidecarList, - seen_timestamp: Duration, - process_type: BlockProcessType, - ) { - // custody_columns must always have at least one element - let Some(slot) = custody_columns.first().map(|d| d.slot()) else { - return; - }; - - if let Ok(current_slot) = self.chain.slot() { - if current_slot == slot { - let delay = get_slot_delay_ms(seen_timestamp, slot, &self.chain.slot_clock); - metrics::observe_duration(&metrics::BEACON_BLOB_RPC_SLOT_START_DELAY_TIME, delay); - } - } - - let mut indices = custody_columns.iter().map(|d| d.index).collect::>(); - indices.sort_unstable(); - debug!( - ?indices, - %block_root, - %slot, - "RPC custody data columns received" - ); - - let mut result = self - .chain - 
.process_rpc_custody_columns(custody_columns) - .await; - register_process_result_metrics(&result, metrics::BlockSource::Rpc, "custody_columns"); - - match &result { - Ok(availability) => match availability { - AvailabilityProcessingStatus::Imported(hash) => { - debug!( - result = "imported block and custody columns", - block_hash = %hash, - "Block components retrieved" - ); - self.chain.recompute_head_at_current_slot().await; - } - AvailabilityProcessingStatus::MissingComponents(_, _) => { - debug!( - block_hash = %block_root, - "Missing components over rpc" - ); - // Attempt reconstruction here before notifying sync, to avoid sending out more requests - // that we may no longer need. - // We don't publish columns reconstructed from rpc columns to the gossip network, - // as these are likely historic columns. - let publish_columns = false; - if let Some(availability) = self - .attempt_data_column_reconstruction(block_root, publish_columns) - .await - { - result = Ok(availability) - } - } - }, - Err(BlockError::DuplicateFullyImported(_)) => { - debug!( - block_hash = %block_root, - "Custody columns have already been imported" - ); - } - Err(e) => { - warn!( - error = ?e, - block_hash = %block_root, - "Error when importing rpc custody columns" - ); - } - } - - self.send_sync_message(SyncMessage::BlockComponentProcessed { - process_type, - result: result.into(), - }); - } - /// Validate a list of data columns received from RPC requests pub async fn validate_rpc_data_columns( self: Arc>, @@ -500,7 +117,7 @@ impl NetworkBeaconProcessor { ) { let result = match sync_type { // this a request from the range sync - ChainSegmentProcessId::RangeBatchId(chain_id, epoch) => { + ChainSegmentProcessId::ForwardSync(id) => { let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64()); let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64()); let sent_blocks = downloaded_blocks.len(); @@ -510,42 +127,37 @@ impl NetworkBeaconProcessor { .await { (imported_blocks, 
Ok(_)) => { + let ignored_blocks = sent_blocks - imported_blocks; + metrics::inc_counter_by( + &metrics::SYNCING_CHAINS_IGNORED_BLOCKS, + ignored_blocks as u64, + ); debug!( - batch_epoch = %epoch, + %id, first_block_slot = start_slot, - chain = chain_id, last_block_slot = end_slot, processed_blocks = sent_blocks, service= "sync", "Batch processed"); - BatchProcessResult::Success { - sent_blocks, - imported_blocks, - } + BatchProcessResult::Success } - (imported_blocks, Err(e)) => { + (_imported_blocks, Err(e)) => { debug!( - batch_epoch = %epoch, + %id, first_block_slot = start_slot, - chain = chain_id, last_block_slot = end_slot, - imported_blocks, error = %e.message, service = "sync", "Batch processing failed"); - match e.peer_action { - Some(penalty) => BatchProcessResult::FaultyFailure { - imported_blocks, - peer_action: penalty, - error: e.message, - }, - None => BatchProcessResult::NonFaultyFailure, + BatchProcessResult::Failure { + peer_action: e.peer_action, + error: e.message, } } } } // this a request from the Backfill sync - ChainSegmentProcessId::BackSyncBatchId(epoch) => { + ChainSegmentProcessId::BackfillSync(epoch) => { let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64()); let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64()); let sent_blocks = downloaded_blocks.len(); @@ -559,7 +171,7 @@ impl NetworkBeaconProcessor { .sum::(); match self.process_backfill_blocks(downloaded_blocks) { - Ok(imported_blocks) => { + Ok(_imported_blocks) => { debug!( batch_epoch = %epoch, first_block_slot = start_slot, @@ -570,10 +182,7 @@ impl NetworkBeaconProcessor { processed_data_columns = n_data_columns, service= "sync", "Backfill batch processed"); - BatchProcessResult::Success { - sent_blocks, - imported_blocks, - } + BatchProcessResult::Success } Err(e) => { debug!( @@ -585,13 +194,9 @@ impl NetworkBeaconProcessor { service = "sync", "Backfill batch processing failed" ); - match e.peer_action { - Some(peer_action) => 
BatchProcessResult::FaultyFailure { - imported_blocks: 0, - peer_action, - error: e.message, - }, - None => BatchProcessResult::NonFaultyFailure, + BatchProcessResult::Failure { + peer_action: e.peer_action, + error: e.message, } } } @@ -813,3 +418,12 @@ impl NetworkBeaconProcessor { }) } } + +impl Display for ChainSegmentProcessId { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::ForwardSync(id) => write!(f, "ForwardSync/{id}"), + Self::BackfillSync(id) => write!(f, "BackfillSync/{id}"), + } + } +} diff --git a/beacon_node/network/src/network_beacon_processor/tests.rs b/beacon_node/network/src/network_beacon_processor/tests.rs index f6a1069a7f4..9fe556781e4 100644 --- a/beacon_node/network/src/network_beacon_processor/tests.rs +++ b/beacon_node/network/src/network_beacon_processor/tests.rs @@ -22,6 +22,7 @@ use gossipsub::MessageAcceptance; use itertools::Itertools; use lighthouse_network::rpc::methods::{BlobsByRangeRequest, MetaDataV3}; use lighthouse_network::rpc::InboundRequestId; +use lighthouse_network::service::api_types::HeaderLookupId; use lighthouse_network::{ discv5::enr::{self, CombinedKey}, rpc::methods::{MetaData, MetaDataV2}, @@ -374,54 +375,13 @@ impl TestRig { pub fn enqueue_rpc_block(&self) { let block_root = self.next_block.canonical_root(); self.network_beacon_processor - .send_rpc_beacon_block( - block_root, - RpcBlock::new_without_blobs(Some(block_root), self.next_block.clone()), - std::time::Duration::default(), - BlockProcessType::SingleBlock { id: 0 }, - ) - .unwrap(); - } - - pub fn enqueue_single_lookup_rpc_block(&self) { - let block_root = self.next_block.canonical_root(); - self.network_beacon_processor - .send_rpc_beacon_block( - block_root, - RpcBlock::new_without_blobs(Some(block_root), self.next_block.clone()), - std::time::Duration::default(), - BlockProcessType::SingleBlock { id: 1 }, + .send_chain_segment( + ChainSegmentProcessId::ForwardSync(HeaderLookupId { id: 0, block_root }), + vec![], ) 
.unwrap(); } - pub fn enqueue_single_lookup_rpc_blobs(&self) { - if let Some(blobs) = self.next_blobs.clone() { - let blobs = FixedBlobSidecarList::new(blobs.into_iter().map(Some).collect::>()); - self.network_beacon_processor - .send_rpc_blobs( - self.next_block.canonical_root(), - blobs, - std::time::Duration::default(), - BlockProcessType::SingleBlob { id: 1 }, - ) - .unwrap(); - } - } - - pub fn enqueue_single_lookup_rpc_data_columns(&self) { - if let Some(data_columns) = self.next_data_columns.clone() { - self.network_beacon_processor - .send_rpc_custody_columns( - self.next_block.canonical_root(), - data_columns, - Duration::default(), - BlockProcessType::SingleCustodyColumn(1), - ) - .unwrap(); - } - } - pub fn enqueue_blobs_by_range_request(&self, count: u64) { self.network_beacon_processor .send_blobs_by_range_request( @@ -437,10 +397,7 @@ impl TestRig { pub fn enqueue_backfill_batch(&self) { self.network_beacon_processor - .send_chain_segment( - ChainSegmentProcessId::BackSyncBatchId(Epoch::default()), - Vec::default(), - ) + .send_chain_segment(ChainSegmentProcessId::BackfillSync(0), Vec::default()) .unwrap(); } @@ -945,14 +902,6 @@ async fn attestation_to_unknown_block_processed(import_method: BlockImportMethod BlockImportMethod::Rpc => { rig.enqueue_rpc_block(); events.push(WorkType::RpcBlock); - if num_blobs > 0 { - rig.enqueue_single_lookup_rpc_blobs(); - events.push(WorkType::RpcBlobs); - } - if num_data_columns > 0 { - rig.enqueue_single_lookup_rpc_data_columns(); - events.push(WorkType::RpcCustodyColumn); - } } }; @@ -1031,14 +980,6 @@ async fn aggregate_attestation_to_unknown_block(import_method: BlockImportMethod BlockImportMethod::Rpc => { rig.enqueue_rpc_block(); events.push(WorkType::RpcBlock); - if num_blobs > 0 { - rig.enqueue_single_lookup_rpc_blobs(); - events.push(WorkType::RpcBlobs); - } - if num_data_columns > 0 { - rig.enqueue_single_lookup_rpc_data_columns(); - events.push(WorkType::RpcCustodyColumn); - } } }; @@ -1219,24 +1160,10 @@ 
async fn test_rpc_block_reprocessing() { let next_block_root = rig.next_block.canonical_root(); // Insert the next block into the duplicate cache manually let handle = rig.duplicate_cache.check_and_insert(next_block_root); - rig.enqueue_single_lookup_rpc_block(); + rig.enqueue_rpc_block(); rig.assert_event_journal_completes(&[WorkType::RpcBlock]) .await; - let num_blobs = rig.next_blobs.as_ref().map(|b| b.len()).unwrap_or(0); - if num_blobs > 0 { - rig.enqueue_single_lookup_rpc_blobs(); - rig.assert_event_journal_completes(&[WorkType::RpcBlobs]) - .await; - } - - let num_data_columns = rig.next_data_columns.as_ref().map(|c| c.len()).unwrap_or(0); - if num_data_columns > 0 { - rig.enqueue_single_lookup_rpc_data_columns(); - rig.assert_event_journal_completes(&[WorkType::RpcCustodyColumn]) - .await; - } - // next_block shouldn't be processed since it couldn't get the // duplicate cache handle assert_ne!(next_block_root, rig.head_root()); diff --git a/beacon_node/network/src/router.rs b/beacon_node/network/src/router.rs index 2a7bc597c26..71c3de95949 100644 --- a/beacon_node/network/src/router.rs +++ b/beacon_node/network/src/router.rs @@ -296,14 +296,14 @@ impl Router { .send_status_message(peer_id, status_message), ) } - Response::BlocksByRange(beacon_block) => { - self.on_blocks_by_range_response(peer_id, app_request_id, beacon_block); + Response::BlocksByRange(_) => { + crit!(id = ?app_request_id, "No BlocksByRange response expected"); } Response::BlocksByRoot(beacon_block) => { self.on_blocks_by_root_response(peer_id, app_request_id, beacon_block); } - Response::BlobsByRange(blob) => { - self.on_blobs_by_range_response(peer_id, app_request_id, blob); + Response::BlobsByRange(_) => { + crit!(id = ?app_request_id, "No BlobsByRange response expected"); } Response::BlobsByRoot(blob) => { self.on_blobs_by_root_response(peer_id, app_request_id, blob); @@ -311,8 +311,8 @@ impl Router { Response::DataColumnsByRoot(data_column) => { 
self.on_data_columns_by_root_response(peer_id, app_request_id, data_column); } - Response::DataColumnsByRange(data_column) => { - self.on_data_columns_by_range_response(peer_id, app_request_id, data_column); + Response::DataColumnsByRange(_) => { + crit!(id = ?app_request_id, "No DataColumnsByRange response expected"); } // Light client responses should not be received Response::LightClientBootstrap(_) @@ -559,66 +559,6 @@ impl Router { ) } - /// Handle a `BlocksByRange` response from the peer. - /// A `beacon_block` behaves as a stream which is terminated on a `None` response. - pub fn on_blocks_by_range_response( - &mut self, - peer_id: PeerId, - app_request_id: AppRequestId, - beacon_block: Option>>, - ) { - let sync_request_id = match app_request_id { - AppRequestId::Sync(sync_request_id) => match sync_request_id { - id @ SyncRequestId::BlocksByRange { .. } => id, - other => { - crit!(request = ?other, "BlocksByRange response on incorrect request"); - return; - } - }, - AppRequestId::Router => { - crit!(%peer_id, "All BBRange requests belong to sync"); - return; - } - AppRequestId::Internal => unreachable!("Handled internally"), - }; - - trace!( - %peer_id, - "Received BlocksByRange Response" - - ); - - self.send_to_sync(SyncMessage::RpcBlock { - peer_id, - sync_request_id, - beacon_block, - seen_timestamp: timestamp_now(), - }); - } - - pub fn on_blobs_by_range_response( - &mut self, - peer_id: PeerId, - app_request_id: AppRequestId, - blob_sidecar: Option>>, - ) { - trace!( - %peer_id, - "Received BlobsByRange Response" - ); - - if let AppRequestId::Sync(sync_request_id) = app_request_id { - self.send_to_sync(SyncMessage::RpcBlob { - peer_id, - sync_request_id, - blob_sidecar, - seen_timestamp: timestamp_now(), - }); - } else { - crit!("All blobs by range responses should belong to sync"); - } - } - /// Handle a `BlocksByRoot` response from the peer. 
pub fn on_blocks_by_root_response( &mut self, @@ -628,7 +568,7 @@ impl Router { ) { let sync_request_id = match app_request_id { AppRequestId::Sync(sync_id) => match sync_id { - id @ SyncRequestId::SingleBlock { .. } => id, + id @ SyncRequestId::BlocksByRoot { .. } => id, other => { crit!(request = ?other, "BlocksByRoot response on incorrect request"); return; @@ -662,7 +602,7 @@ impl Router { ) { let sync_request_id = match app_request_id { AppRequestId::Sync(sync_id) => match sync_id { - id @ SyncRequestId::SingleBlob { .. } => id, + id @ SyncRequestId::BlobsByRoot { .. } => id, other => { crit!(request = ?other, "BlobsByRoot response on incorrect request"); return; @@ -721,29 +661,6 @@ impl Router { }); } - pub fn on_data_columns_by_range_response( - &mut self, - peer_id: PeerId, - app_request_id: AppRequestId, - data_column: Option>>, - ) { - trace!( - %peer_id, - "Received DataColumnsByRange Response" - ); - - if let AppRequestId::Sync(sync_request_id) = app_request_id { - self.send_to_sync(SyncMessage::RpcDataColumn { - peer_id, - sync_request_id, - data_column, - seen_timestamp: timestamp_now(), - }); - } else { - crit!("All data columns by range responses should belong to sync"); - } - } - fn handle_beacon_processor_send_result( &mut self, result: Result<(), crate::network_beacon_processor::Error>, diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 7b5701cc8d2..6d58ddc1a3e 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -8,41 +8,19 @@ //! If a batch fails, the backfill sync cannot progress. In this scenario, we mark the backfill //! sync as failed, log an error and attempt to retry once a new peer joins the node. 
-use crate::network_beacon_processor::ChainSegmentProcessId; use crate::sync::manager::BatchProcessResult; use crate::sync::network_context::{ - RangeRequestId, RpcRequestSendError, RpcResponseError, SyncNetworkContext, -}; -use crate::sync::range_sync::{ - BatchConfig, BatchId, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, + BatchPeers, RangeRequestId, RpcResponseError, SyncNetworkContext, }; +use crate::sync::sync_block::{Error as SyncBlockError, OkToImport, SyncBlock, SyncBlockResult}; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; -use itertools::Itertools; -use lighthouse_network::service::api_types::Id; +use lighthouse_network::service::api_types::{ComponentsByRootRequestId, Id}; use lighthouse_network::types::{BackFillState, NetworkGlobals}; -use lighthouse_network::{PeerAction, PeerId}; -use logging::crit; -use std::collections::{ - btree_map::{BTreeMap, Entry}, - HashSet, -}; +use lighthouse_network::PeerId; use std::sync::Arc; -use tracing::{debug, error, info, instrument, warn}; -use types::{Epoch, EthSpec}; - -use super::range_sync::BatchPeers; - -/// Blocks are downloaded in batches from peers. This constant specifies how many epochs worth of -/// blocks per batch are requested _at most_. A batch may request less blocks to account for -/// already requested slots. There is a timeout for each batch request. If this value is too high, -/// we will negatively report peers with poor bandwidth. This can be set arbitrarily high, in which -/// case the responder will fill the response up to the max request size, assuming they have the -/// bandwidth to do so. -pub const BACKFILL_EPOCHS_PER_BATCH: u64 = 1; - -/// The maximum number of batches to queue before requesting more. -const BACKFILL_BATCH_BUFFER_SIZE: u8 = 20; +use tracing::{debug, info, instrument, warn}; +use types::{EthSpec, Hash256, Slot}; /// The number of times to retry a batch before it is considered failed. 
const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 10; @@ -51,34 +29,10 @@ const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 10; /// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty. const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 10; -/// Custom configuration for the batch object. -struct BackFillBatchConfig {} - -impl BatchConfig for BackFillBatchConfig { - fn max_batch_download_attempts() -> u8 { - MAX_BATCH_DOWNLOAD_ATTEMPTS - } - fn max_batch_processing_attempts() -> u8 { - MAX_BATCH_PROCESSING_ATTEMPTS - } - fn batch_attempt_hash(blocks: &[RpcBlock]) -> u64 { - use std::collections::hash_map::DefaultHasher; - use std::hash::{Hash, Hasher}; - let mut hasher = DefaultHasher::new(); - blocks.hash(&mut hasher); - hasher.finish() - } -} - /// Return type when attempting to start the backfill sync process. pub enum SyncStart { /// The chain started syncing or is already syncing. - Syncing { - /// The number of slots that have been processed so far. - completed: usize, - /// The number of slots still to be processed. - remaining: usize, - }, + Syncing, /// The chain didn't start syncing. NotSyncing, } @@ -96,40 +50,26 @@ pub enum ProcessResult { #[derive(Debug)] pub enum BackFillError { /// A batch failed to be downloaded. - BatchDownloadFailed(#[allow(dead_code)] BatchId), + BatchDownloadFailed(#[allow(dead_code)] Id), /// A batch could not be processed. - BatchProcessingFailed(#[allow(dead_code)] BatchId), + BatchProcessingFailed(#[allow(dead_code)] Id), /// A batch entered an invalid state. - BatchInvalidState(#[allow(dead_code)] BatchId, #[allow(dead_code)] String), + BatchInvalidState(#[allow(dead_code)] Id, #[allow(dead_code)] String), /// The sync algorithm entered an invalid state. InvalidSyncState(#[allow(dead_code)] String), /// The chain became paused. Paused, } -pub struct BackFillSync { - /// Keeps track of the current progress of the backfill. - /// This only gets refreshed from the beacon chain if we enter a failed state. 
- current_start: BatchId, - - /// Starting epoch of the batch that needs to be processed next. - /// This is incremented as the chain advances. - processing_target: BatchId, - - /// Starting epoch of the next batch that needs to be downloaded. - to_be_downloaded: BatchId, - - /// Keeps track if we have requested the final batch. - last_batch_downloaded: bool, - - /// Sorted map of batches undergoing some kind of processing. - batches: BTreeMap>, - - /// The current processing batch, if any. - current_processing_batch: Option, +enum SyncingStatus { + AwaitingDownload(Hash256), + Downloading(Hash256, Id), + AwaitingProcessing(RpcBlock, BatchPeers), + Processing(RpcBlock, BatchPeers), +} - /// Batches validated by this chain. - validated_batches: u64, +pub struct BackFillSync { + status: SyncBlock, /// When a backfill sync fails, we keep track of whether a new fully synced peer has joined. /// This signifies that we are able to attempt to restart a failed chain. @@ -157,29 +97,23 @@ impl BackFillSync { // If, for some reason a backfill has already been completed (or we've used a trusted // genesis root) then backfill has been completed. 
let anchor_info = beacon_chain.store.get_anchor_info(); - let (state, current_start) = - if anchor_info.block_backfill_complete(beacon_chain.genesis_backfill_slot) { - (BackFillState::Completed, Epoch::new(0)) - } else { - ( - BackFillState::Paused, - anchor_info - .oldest_block_slot - .epoch(T::EthSpec::slots_per_epoch()), - ) - }; + let state = if anchor_info.block_backfill_complete(beacon_chain.genesis_backfill_slot) { + BackFillState::Completed + } else { + BackFillState::Paused + }; let bfs = BackFillSync { - batches: BTreeMap::new(), - processing_target: current_start, - current_start, - last_batch_downloaded: false, - to_be_downloaded: current_start, - network_globals, - current_processing_batch: None, - validated_batches: 0, + status: SyncBlock::new( + RangeRequestId::BackfillSync(0), + anchor_info.oldest_block_parent, + // TODO(tree-sync): not correct; fetch the correct slot + anchor_info.oldest_block_slot, + &[], + ), restart_failed_sync: false, beacon_chain, + network_globals, }; // Update the global network state with the current backfill state. @@ -196,7 +130,7 @@ impl BackFillSync { )] pub fn pause(&mut self) { if let BackFillState::Syncing = self.state() { - debug!(processed_epochs = %self.validated_batches, to_be_processed = %self.current_start,"Backfill sync paused"); + debug!("Backfill sync paused"); self.set_state(BackFillState::Paused); } } @@ -218,24 +152,11 @@ impl BackFillSync { match self.state() { BackFillState::Syncing => {} // already syncing ignore. BackFillState::Paused => { - if self - .network_globals - .peers - .read() - .synced_peers() - .next() - .is_some() - { + if self.status.peer_count() > 0 { // If there are peers to resume with, begin the resume. - debug!(start_epoch = ?self.current_start, awaiting_batches = self.batches.len(), processing_target = ?self.processing_target, "Resuming backfill sync"); + debug!("Resuming backfill sync"); self.set_state(BackFillState::Syncing); - // Resume any previously failed batches.
- self.resume_batches(network)?; - // begin requesting blocks from the peer pool, until all peers are exhausted. - self.request_batches(network)?; - - // start processing batches if needed - self.process_completed_batches(network)?; + self.continue_syncing_blocks(network); } else { return Ok(SyncStart::NotSyncing); } @@ -251,36 +172,15 @@ impl BackFillSync { self.set_state(BackFillState::Syncing); - // Obtain a new start slot, from the beacon chain and handle possible errors. - if let Err(e) = self.reset_start_epoch() { - // This infallible match exists to force us to update this code if a future - // refactor of `ResetEpochError` adds a variant. - let ResetEpochError::SyncCompleted = e; - error!("Backfill sync completed whilst in failed status"); - self.set_state(BackFillState::Completed); - return Err(BackFillError::InvalidSyncState(String::from( - "chain completed", - ))); - } - - debug!(start_epoch = %self.current_start, "Resuming a failed backfill sync"); + debug!("Resuming a failed backfill sync"); // begin requesting blocks from the peer pool, until all peers are exhausted. - self.request_batches(network)?; + self.continue_syncing_blocks(network); } BackFillState::Completed => return Ok(SyncStart::NotSyncing), } - Ok(SyncStart::Syncing { - completed: (self.validated_batches - * BACKFILL_EPOCHS_PER_BATCH - * T::EthSpec::slots_per_epoch()) as usize, - remaining: self - .current_start - .start_slot(T::EthSpec::slots_per_epoch()) - .saturating_sub(self.beacon_chain.genesis_backfill_slot) - .as_usize(), - }) + Ok(SyncStart::Syncing) } /// A fully synced peer has joined us. @@ -298,881 +198,112 @@ impl BackFillSync { } } - /// An RPC error has occurred. - /// - /// If the batch exists it is re-requested. 
- #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - #[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"] - pub fn inject_error( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - peer_id: &PeerId, - request_id: Id, - err: RpcResponseError, - ) -> Result<(), BackFillError> { - if let Some(batch) = self.batches.get_mut(&batch_id) { - // A batch could be retried without the peer failing the request (disconnecting/ - // sending an error /timeout) if the peer is removed from the chain for other - // reasons. Check that this block belongs to the expected peer - // TODO(das): removed peer_id matching as the node may request a different peer for data - // columns. - if !batch.is_expecting_block(&request_id) { - return Ok(()); - } - debug!(batch_epoch = %batch_id, error = ?err, "Batch download failed"); - match batch.download_failed(Some(*peer_id)) { - Err(e) => self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)), - Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { - self.fail_sync(BackFillError::BatchDownloadFailed(batch_id)) - } - Ok(BatchOperationOutcome::Continue) => self.send_batch(network, batch_id), - } - } else { - // this could be an error for an old batch, removed when the chain advances - Ok(()) - } - } - - /// A block has been received for a batch relating to this backfilling chain. - /// If the block correctly completes the batch it will be processed if possible. - /// If this returns an error, the backfill sync has failed and will be restarted once new peers - /// join the system. - /// The sync manager should update the global sync state on failure. 
- #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - #[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"] - pub fn on_block_response( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - batch_peers: BatchPeers, - request_id: Id, - blocks: Vec>, - ) -> Result { - // check if we have this batch - let Some(batch) = self.batches.get_mut(&batch_id) else { - if !matches!(self.state(), BackFillState::Failed) { - // A batch might get removed when the chain advances, so this is non fatal. - debug!(epoch = %batch_id, "Received a block for unknown batch"); - } - return Ok(ProcessResult::Successful); - }; - - // A batch could be retried without the peer failing the request (disconnecting/ - // sending an error /timeout) if the peer is removed from the chain for other - // reasons. Check that this block belongs to the expected peer, and that the - // request_id matches - if !batch.is_expecting_block(&request_id) { - return Ok(ProcessResult::Successful); - } - - match batch.download_completed(blocks, batch_peers) { - Ok(received) => { - let awaiting_batches = - self.processing_target.saturating_sub(batch_id) / BACKFILL_EPOCHS_PER_BATCH; - debug!( - epoch = %batch_id, - blocks = received, - %awaiting_batches, - "Completed batch received" - ); - - // pre-emptively request more blocks from peers whilst we process current blocks, - self.request_batches(network)?; - self.process_completed_batches(network) - } - Err(e) => { - self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))?; - Ok(ProcessResult::Successful) - } - } - } - - /// The syncing process has failed. - /// - /// This resets past variables, to allow for a fresh start when resuming. 
- #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn fail_sync(&mut self, error: BackFillError) -> Result<(), BackFillError> { - // Some errors shouldn't fail the chain. - if matches!(error, BackFillError::Paused) { - return Ok(()); - } - - // Set the state - self.set_state(BackFillState::Failed); - // Remove all batches and active requests and participating peers. - self.batches.clear(); - self.restart_failed_sync = false; - - // Reset all downloading and processing targets - self.processing_target = self.current_start; - self.to_be_downloaded = self.current_start; - self.last_batch_downloaded = false; - self.current_processing_batch = None; - - // NOTE: Lets keep validated_batches for posterity - - // Emit the log here - error!(?error, "Backfill sync failed"); - - // Return the error, kinda weird pattern, but I want to use - // `self.fail_chain(_)?` in other parts of the code. - Err(error) + pub fn add_peer(&mut self, peer_id: PeerId) { + self.status.add_peer(peer_id); } - /// Processes the batch with the given id. 
- /// The batch must exist and be ready for processing - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn process_batch( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - ) -> Result { - // Only process batches if this chain is Syncing, and only one at a time - if self.state() != BackFillState::Syncing || self.current_processing_batch.is_some() { - return Ok(ProcessResult::Successful); - } + pub fn peer_disconnected(&mut self, peer_id: &PeerId) { + self.status.remove_peer(peer_id); - let Some(batch) = self.batches.get_mut(&batch_id) else { - return self - .fail_sync(BackFillError::InvalidSyncState(format!( - "Trying to process a batch that does not exist: {}", - batch_id - ))) - .map(|_| ProcessResult::Successful); - }; - - // NOTE: We send empty batches to the processor in order to trigger the block processor - // result callback. This is done, because an empty batch could end a chain and the logic - // for removing chains and checking completion is in the callback. - - let (blocks, _) = match batch.start_processing() { - Err(e) => { - return self - .fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)) - .map(|_| ProcessResult::Successful) - } - Ok(v) => v, - }; - - let process_id = ChainSegmentProcessId::BackSyncBatchId(batch_id); - self.current_processing_batch = Some(batch_id); - - if let Err(e) = network - .beacon_processor() - .send_chain_segment(process_id, blocks) - { - crit!( - msg = "process_batch", - error = %e, - batch = ?self.processing_target, - "Failed to send backfill segment to processor." + if self.status.peer_count() == 0 && self.state() == BackFillState::Syncing { + info!( + "reason" = "insufficient_synced_peers", + "Backfill sync paused" ); - // This is unlikely to happen but it would stall syncing since the batch now has no - // blocks to continue, and the chain is expecting a processing result that won't - // arrive. 
To mitigate this, (fake) fail this processing so that the batch is - // re-downloaded. - self.on_batch_process_result(network, batch_id, &BatchProcessResult::NonFaultyFailure) - } else { - Ok(ProcessResult::Successful) + self.set_state(BackFillState::Paused); } } - /// The block processor has completed processing a batch. This function handles the result - /// of the batch processor. - /// If an error is returned the BackFill sync has failed. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - #[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"] - pub fn on_batch_process_result( + pub fn on_block_download_result( &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - result: &BatchProcessResult, - ) -> Result { - // The first two cases are possible in regular sync, should not occur in backfill, but we - // keep this logic for handling potential processing race conditions. - // result - let batch = match &self.current_processing_batch { - Some(processing_id) if *processing_id != batch_id => { - debug!( - batch_epoch = %batch_id.as_u64(), - expected_batch_epoch = processing_id.as_u64(), - "Unexpected batch result" - ); - return Ok(ProcessResult::Successful); - } - None => { - debug!(%batch_id, "Chain was not expecting a batch result"); - return Ok(ProcessResult::Successful); - } - _ => { - // batch_id matches, continue - self.current_processing_batch = None; - - match self.batches.get_mut(&batch_id) { - Some(batch) => batch, - None => { - // This is an error. Fail the sync algorithm. 
- return self - .fail_sync(BackFillError::InvalidSyncState(format!( - "Current processing batch not found: {}", - batch_id - ))) - .map(|_| ProcessResult::Successful); - } - } - } - }; - - let Some(batch_peers) = batch.processing_peers() else { - self.fail_sync(BackFillError::BatchInvalidState( - batch_id, - String::from("Peer does not exist"), - ))?; - return Ok(ProcessResult::Successful); - }; - - debug!( - ?result, - %batch, - batch_epoch = %batch_id, - "Backfill batch processed" - ); - - match result { - BatchProcessResult::Success { - imported_blocks, .. - } => { - if let Err(e) = batch.processing_completed(BatchProcessingResult::Success) { - self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))?; - } - // If the processed batch was not empty, we can validate previous unvalidated - // blocks. - if *imported_blocks > 0 { - self.advance_chain(network, batch_id); - } - - if batch_id == self.processing_target { - self.processing_target = self - .processing_target - .saturating_sub(BACKFILL_EPOCHS_PER_BATCH); - } - - // check if the chain has completed syncing - if self.check_completed() { - // chain is completed - info!( - blocks_processed = self.validated_batches * T::EthSpec::slots_per_epoch(), - "Backfill sync completed" - ); - self.set_state(BackFillState::Completed); - Ok(ProcessResult::SyncCompleted) - } else { - // chain is not completed - // attempt to request more batches - self.request_batches(network)?; - // attempt to process more batches - self.process_completed_batches(network) - } - } - BatchProcessResult::FaultyFailure { - imported_blocks, - peer_action, - error, - } => { - // TODO(sync): De-dup between back and forwards sync - if let Some(penalty) = peer_action.block_peer { - // Penalize the peer appropiately. - network.report_peer(batch_peers.block(), penalty, "faulty_batch"); - } - - // Penalize each peer only once. Currently a peer_action does not mix different - // PeerAction levels. 
- for (peer, penalty) in peer_action - .column_peer - .iter() - .filter_map(|(column_index, penalty)| { - batch_peers - .column(column_index) - .map(|peer| (*peer, *penalty)) - }) - .unique() - { - network.report_peer(peer, penalty, "faulty_batch_column"); - } - - match batch.processing_completed(BatchProcessingResult::FaultyFailure) { - Err(e) => { - // Batch was in the wrong state - self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)) - .map(|_| ProcessResult::Successful) - } - Ok(BatchOperationOutcome::Failed { .. }) => { - // When backfill syncing post-PeerDAS we can't attribute fault to previous - // peers if a batch fails to process too many times. We have strict peer - // scoring for faulty errors, so participating peers that sent invalid - // data are already downscored. - // - // Because backfill sync deals with historical data that we can assert - // to be correct, once we import a batch that contains at least one - // block we are sure we got the right data. There's no need to penalize - // all participating peers in backfill sync if a batch fails - warn!( - batch_epoch = %batch_id, - error, - "Backfill sync failed after attempting to process batch too many times" - ); - - self.fail_sync(BackFillError::BatchProcessingFailed(batch_id)) - .map(|_| ProcessResult::Successful) - } - - Ok(BatchOperationOutcome::Continue) => { - // chain can continue. Check if it can be progressed - if *imported_blocks > 0 { - // At least one block was successfully verified and imported, then we can be sure all - // previous batches are valid and we only need to download the current failed - // batch. - self.advance_chain(network, batch_id); - } - // Handle this invalid batch, that is within the re-process retries limit. 
- self.handle_invalid_batch(network, batch_id) - .map(|_| ProcessResult::Successful) - } - } - } - BatchProcessResult::NonFaultyFailure => { - if let Err(e) = batch.processing_completed(BatchProcessingResult::NonFaultyFailure) - { - self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))?; - } - self.send_batch(network, batch_id)?; - Ok(ProcessResult::Successful) - } + req_id: ComponentsByRootRequestId, + result: Result<(RpcBlock, BatchPeers), RpcResponseError>, + cx: &mut SyncNetworkContext, + ) { + if let Err(e) = self.status.on_download_result(req_id, result, cx) { + self.handle_outcome(Err(e), cx); } } - /// Processes the next ready batch. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn process_completed_batches( + pub fn on_block_process_result( &mut self, - network: &mut SyncNetworkContext, - ) -> Result { - // Only process batches if backfill is syncing and only process one batch at a time - if self.state() != BackFillState::Syncing || self.current_processing_batch.is_some() { - return Ok(ProcessResult::Successful); - } - - // Find the id of the batch we are going to process. - if let Some(batch) = self.batches.get(&self.processing_target) { - let state = batch.state(); - match state { - BatchState::AwaitingProcessing(..) => { - return self.process_batch(network, self.processing_target); - } - BatchState::Downloading(..) => { - // Batch is not ready, nothing to process - } - BatchState::Poisoned => unreachable!("Poisoned batch"), - BatchState::Failed | BatchState::AwaitingDownload | BatchState::Processing(_) => { - // these are all inconsistent states: - // - Failed -> non recoverable batch. Chain should have been removed - // - AwaitingDownload -> A recoverable failed batch should have been - // re-requested. 
- // - Processing -> `self.current_processing_batch` is None - self.fail_sync(BackFillError::InvalidSyncState(String::from( - "Invalid expected batch state", - )))?; - return Ok(ProcessResult::Successful); - } - BatchState::AwaitingValidation(_) => { - // TODO: I don't think this state is possible, log a CRIT just in case. - // If this is not observed, add it to the failed state branch above. - crit!( - batch = ?self.processing_target, - "Chain encountered a robust batch awaiting validation" - ); - - self.processing_target -= BACKFILL_EPOCHS_PER_BATCH; - if self.to_be_downloaded >= self.processing_target { - self.to_be_downloaded = self.processing_target - BACKFILL_EPOCHS_PER_BATCH; - } - self.request_batches(network)?; - } - } - } else { - self.fail_sync(BackFillError::InvalidSyncState(format!( - "Batch not found for current processing target {}", - self.processing_target - )))?; - return Ok(ProcessResult::Successful); - } - Ok(ProcessResult::Successful) + _id: Id, + result: BatchProcessResult, + cx: &mut SyncNetworkContext, + ) { + let outcome = self.status.on_process_result(result, cx); + self.handle_outcome(outcome, cx); } - /// Removes any batches previous to the given `validating_epoch` and updates the current - /// boundaries of the chain. - /// - /// The `validating_epoch` must align with batch boundaries. - /// - /// If a previous batch has been validated and it had been re-processed, penalize the original - /// peer. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn advance_chain(&mut self, network: &mut SyncNetworkContext, validating_epoch: Epoch) { - // make sure this epoch produces an advancement - if validating_epoch >= self.current_start { - return; - } - - // We can now validate higher batches that the current batch. Here we remove all - // batches that are higher than the current batch. We add on an extra - // `BACKFILL_EPOCHS_PER_BATCH` as `split_off` is inclusive. 
- let removed_batches = self - .batches - .split_off(&(validating_epoch + BACKFILL_EPOCHS_PER_BATCH)); - - for (id, batch) in removed_batches.into_iter() { - self.validated_batches = self.validated_batches.saturating_add(1); - // only for batches awaiting validation can we be sure the last attempt is - // right, and thus, that any different attempt is wrong - match batch.state() { - BatchState::AwaitingValidation(ref processed_attempt) => { - for attempt in batch.attempts() { - // The validated batch has been re-processed - if attempt.hash != processed_attempt.hash { - // The re-downloaded version was different. - // TODO(das): should penalize other peers? - let valid_attempt_peer = processed_attempt.block_peer(); - let bad_attempt_peer = attempt.block_peer(); - if valid_attempt_peer != bad_attempt_peer { - // A different peer sent the correct batch, the previous peer did not - // We negatively score the original peer. - let action = PeerAction::LowToleranceError; - debug!( - batch_epoch = %id, score_adjustment = %action, - original_peer = %bad_attempt_peer, new_peer = %valid_attempt_peer, - "Re-processed batch validated. Scoring original peer" - ); - network.report_peer( - bad_attempt_peer, - action, - "batch_reprocessed_original_peer", - ); - } else { - // The same peer corrected it's previous mistake. There was an error, so we - // negative score the original peer. - let action = PeerAction::MidToleranceError; - debug!( - batch_epoch = %id, - score_adjustment = %action, - original_peer = %bad_attempt_peer, - new_peer = %valid_attempt_peer, - "Re-processed batch validated by the same peer" - ); - network.report_peer( - bad_attempt_peer, - action, - "batch_reprocessed_same_peer", - ); - } - } - } - } - BatchState::Downloading(..) => {} - BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => { - crit!("batch indicates inconsistent chain state while advancing chain") - } - BatchState::AwaitingProcessing(..) 
=> {} - BatchState::Processing(_) => { - debug!(batch = %id, %batch, "Advancing chain while processing a batch"); - if let Some(processing_id) = self.current_processing_batch { - if id >= processing_id { - self.current_processing_batch = None; - } - } - } - } - } - - self.processing_target = self.processing_target.min(validating_epoch); - self.current_start = validating_epoch; - self.to_be_downloaded = self.to_be_downloaded.min(validating_epoch); - if self.batches.contains_key(&self.to_be_downloaded) { - // if a chain is advanced by Range beyond the previous `self.to_be_downloaded`, we - // won't have this batch, so we need to request it. - self.to_be_downloaded -= BACKFILL_EPOCHS_PER_BATCH; - } - debug!(?validating_epoch, processing_target = ?self.processing_target, "Backfill advanced"); + fn continue_syncing_blocks(&mut self, cx: &mut SyncNetworkContext) { + // TODO(tree-sync): only ok to import the newest block + let ok_to_import = true; + let outcome = self + .status + .continue_request(cx, OkToImport::Bool(ok_to_import)); + self.handle_outcome(outcome.map(|_| SyncBlockResult::Wait), cx); } - /// An invalid batch has been received that could not be processed, but that can be retried. - /// - /// These events occur when a peer has successfully responded with blocks, but the blocks we - /// have received are incorrect or invalid. This indicates the peer has not performed as - /// intended and can result in downvoting a peer. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn handle_invalid_batch( + fn handle_outcome( &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - ) -> Result<(), BackFillError> { - // The current batch could not be processed, indicating either the current or previous - // batches are invalid. 
- - // The previous batch could be incomplete due to the block sizes being too large to fit in - // a single RPC request or there could be consecutive empty batches which are not supposed - // to be there - - // The current (sub-optimal) strategy is to simply re-request all batches that could - // potentially be faulty. If a batch returns a different result than the original and - // results in successful processing, we downvote the original peer that sent us the batch. - - // this is our robust `processing_target`. All previous batches must be awaiting - // validation - let mut redownload_queue = Vec::new(); - - for (id, batch) in self - .batches - .iter_mut() - .filter(|(&id, _batch)| id > batch_id) - { - match batch - .validation_failed() - .map_err(|e| BackFillError::BatchInvalidState(batch_id, e.0))? - { - BatchOperationOutcome::Failed { blacklist: _ } => { - // Batch has failed and cannot be redownloaded. - return self.fail_sync(BackFillError::BatchProcessingFailed(batch_id)); - } - BatchOperationOutcome::Continue => { - redownload_queue.push(*id); + result: Result, + cx: &mut SyncNetworkContext, + ) { + match result { + Ok(SyncBlockResult::Done { parent_root, slot }) => { + if self.is_complete(slot) { + info!("Backfill sync completed"); + self.set_state(BackFillState::Completed); + } else { + let peers = self.status.clone_peers(); + // TODO(tree-sync): retrieve correct slot from fetching headers first + let parent_block_slot = Slot::new(0); + self.status = SyncBlock::new( + RangeRequestId::BackfillSync(cx.next_id()), + parent_root, + parent_block_slot, + &peers.into_iter().collect::>(), + ) } } - } - - // no batch maxed out it process attempts, so now the chain's volatile progress must be - // reset - self.processing_target = self.current_start; - - for id in redownload_queue { - self.send_batch(network, id)?; - } - // finally, re-request the failed batch. 
- self.send_batch(network, batch_id) - } - - /// Requests the batch assigned to the given id from a given peer. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn send_batch( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - ) -> Result<(), BackFillError> { - if let Some(batch) = self.batches.get_mut(&batch_id) { - let synced_peers = self - .network_globals - .peers - .read() - .synced_peers() - .cloned() - .collect::>(); - - let request = batch.to_blocks_by_range_request(); - let failed_peers = batch.failed_block_peers(); - match network.block_components_by_range_request( - request, - RangeRequestId::BackfillSync { batch_id }, - &synced_peers, - &failed_peers, - ) { - Ok(request_id) => { - // inform the batch about the new request - if let Err(e) = batch.start_downloading(request_id) { - return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)); - } - debug!(epoch = %batch_id, %batch, "Requesting batch"); - - return Ok(()); - } - Err(e) => match e { - RpcRequestSendError::NoPeer(no_peer) => { - // If we are here the chain has no more synced peers - info!( - "reason" = format!("insufficient_synced_peers({no_peer:?})"), - "Backfill sync paused" - ); - self.set_state(BackFillState::Paused); - return Err(BackFillError::Paused); - } - RpcRequestSendError::InternalError(e) => { - // NOTE: under normal conditions this shouldn't happen but we handle it anyway - warn!(%batch_id, error = ?e, %batch,"Could not send batch request"); - // register the failed download and check if the batch can be retried - if let Err(e) = batch.start_downloading(1) { - return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)); - } - - match batch.download_failed(None) { - Err(e) => { - self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))? 
- } - Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { - self.fail_sync(BackFillError::BatchDownloadFailed(batch_id))? - } - Ok(BatchOperationOutcome::Continue) => { - return self.send_batch(network, batch_id) - } - } - } - }, + Ok(SyncBlockResult::Wait) => { + // Do nothing wait for future event } - } - - Ok(()) - } - - /// When resuming a chain, this function searches for batches that need to be re-downloaded and - /// transitions their state to redownload the batch. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn resume_batches(&mut self, network: &mut SyncNetworkContext) -> Result<(), BackFillError> { - let batch_ids_to_retry = self - .batches - .iter() - .filter_map(|(batch_id, batch)| { - // In principle there should only ever be on of these, and we could terminate the - // loop early, however the processing is negligible and we continue the search - // for robustness to handle potential future modification - if matches!(batch.state(), BatchState::AwaitingDownload) { - Some(*batch_id) - } else { - None + Err(e) => match e { + SyncBlockError::InternalError(_) | SyncBlockError::TooManyErrors(_) => { + debug!(error = ?e, "Backfill synced failed"); + self.set_state(BackFillState::Failed); } - }) - .collect::>(); - - for batch_id in batch_ids_to_retry { - self.send_batch(network, batch_id)?; + }, } - Ok(()) } - /// Attempts to request the next required batches from the peer pool if the chain is syncing. It will exhaust the peer - /// pool and left over batches until the batch buffer is reached or all peers are exhausted. + /// Updates the global network state indicating the current state of a backfill sync. 
#[instrument(parent = None, - level = "info", fields(service = "backfill_sync"), name = "backfill_sync", skip_all )] - fn request_batches( - &mut self, - network: &mut SyncNetworkContext, - ) -> Result<(), BackFillError> { - if !matches!(self.state(), BackFillState::Syncing) { - return Ok(()); - } - - // find the next pending batch and request it from the peer - // Note: for this function to not infinite loop we must: - // - If `include_next_batch` returns Some we MUST increase the count of batches that are - // accounted in the `BACKFILL_BATCH_BUFFER_SIZE` limit in the `matches!` statement of - // that function. - while let Some(batch_id) = self.include_next_batch(network) { - // send the batch - self.send_batch(network, batch_id)?; - } - - // No more batches, simply stop - Ok(()) + fn set_state(&self, state: BackFillState) { + *self.network_globals.backfill_state.write() = state; } - /// Creates the next required batch from the chain. If there are no more batches required, - /// `false` is returned. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn include_next_batch(&mut self, network: &mut SyncNetworkContext) -> Option { - // don't request batches beyond genesis; - if self.last_batch_downloaded { - return None; - } - - // only request batches up to the buffer size limit - // NOTE: we don't count batches in the AwaitingValidation state, to prevent stalling sync - // if the current processing window is contained in a long range of skip slots. - let in_buffer = |batch: &BatchInfo| { - matches!( - batch.state(), - BatchState::Downloading(..) | BatchState::AwaitingProcessing(..) 
- ) - }; - if self - .batches - .iter() - .filter(|&(_epoch, batch)| in_buffer(batch)) - .count() - > BACKFILL_BATCH_BUFFER_SIZE as usize - { - return None; - } - - let batch_id = self.to_be_downloaded; - // this batch could have been included already being an optimistic batch - match self.batches.entry(batch_id) { - Entry::Occupied(_) => { - // this batch doesn't need downloading, let this same function decide the next batch - if self.would_complete(batch_id) { - self.last_batch_downloaded = true; - } - - self.to_be_downloaded = self - .to_be_downloaded - .saturating_sub(BACKFILL_EPOCHS_PER_BATCH); - self.include_next_batch(network) - } - Entry::Vacant(entry) => { - entry.insert(BatchInfo::new(&batch_id, BACKFILL_EPOCHS_PER_BATCH)); - if self.would_complete(batch_id) { - self.last_batch_downloaded = true; - } - self.to_be_downloaded = self - .to_be_downloaded - .saturating_sub(BACKFILL_EPOCHS_PER_BATCH); - Some(batch_id) - } - } + fn state(&self) -> BackFillState { + self.network_globals.backfill_state.read().clone() } - /// Resets the start epoch based on the beacon chain. - /// - /// This errors if the beacon chain indicates that backfill sync has already completed or is - /// not required. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn reset_start_epoch(&mut self) -> Result<(), ResetEpochError> { + fn is_complete(&self, slot: Slot) -> bool { let anchor_info = self.beacon_chain.store.get_anchor_info(); - if anchor_info.block_backfill_complete(self.beacon_chain.genesis_backfill_slot) { - Err(ResetEpochError::SyncCompleted) - } else { - self.current_start = anchor_info - .oldest_block_slot - .epoch(T::EthSpec::slots_per_epoch()); - Ok(()) - } - } - /// Checks with the beacon chain if backfill sync has completed. 
- #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn check_completed(&mut self) -> bool { - if self.would_complete(self.current_start) { - // Check that the beacon chain agrees - let anchor_info = self.beacon_chain.store.get_anchor_info(); - // Conditions that we have completed a backfill sync - if anchor_info.block_backfill_complete(self.beacon_chain.genesis_backfill_slot) { - return true; - } else { - error!("Backfill out of sync with beacon chain"); - } + if anchor_info.oldest_block_slot != slot { + warn!( + "oldest_block_slot not at expected value {} != {}", + anchor_info.oldest_block_slot, slot + ); } - false - } - /// Checks if backfill would complete by syncing to `start_epoch`. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn would_complete(&self, start_epoch: Epoch) -> bool { - start_epoch - <= self - .beacon_chain - .genesis_backfill_slot - .epoch(T::EthSpec::slots_per_epoch()) - } - - /// Updates the global network state indicating the current state of a backfill sync. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn set_state(&self, state: BackFillState) { - *self.network_globals.backfill_state.write() = state; - } - - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn state(&self) -> BackFillState { - self.network_globals.backfill_state.read().clone() + // Conditions that we have completed a backfill sync + anchor_info.block_backfill_complete(self.beacon_chain.genesis_backfill_slot) } } @@ -1181,73 +312,3 @@ enum ResetEpochError { /// The chain has already completed. 
SyncCompleted, } - -#[cfg(test)] -mod tests { - use super::*; - use beacon_chain::test_utils::BeaconChainHarness; - use bls::Hash256; - use lighthouse_network::{NetworkConfig, SyncInfo, SyncStatus}; - use rand::prelude::StdRng; - use rand::SeedableRng; - use types::MinimalEthSpec; - - #[test] - fn request_batches_should_not_loop_infinitely() { - let harness = BeaconChainHarness::builder(MinimalEthSpec) - .default_spec() - .deterministic_keypairs(4) - .fresh_ephemeral_store() - .build(); - - let beacon_chain = harness.chain.clone(); - let slots_per_epoch = MinimalEthSpec::slots_per_epoch(); - - let network_globals = Arc::new(NetworkGlobals::new_test_globals( - vec![], - Arc::new(NetworkConfig::default()), - beacon_chain.spec.clone(), - )); - - { - let mut rng = StdRng::seed_from_u64(0xDEADBEEF0BAD5EEDu64); - let peer_id = network_globals - .peers - .write() - .__add_connected_peer_testing_only( - true, - &beacon_chain.spec, - k256::ecdsa::SigningKey::random(&mut rng).into(), - ); - - // Simulate finalized epoch and head being 2 epochs ahead - let finalized_epoch = Epoch::new(40); - let head_epoch = finalized_epoch + 2; - let head_slot = head_epoch.start_slot(slots_per_epoch) + 1; - - network_globals.peers.write().update_sync_status( - &peer_id, - SyncStatus::Synced { - info: SyncInfo { - head_slot, - head_root: Hash256::random(), - finalized_epoch, - finalized_root: Hash256::random(), - }, - }, - ); - } - - let mut network = SyncNetworkContext::new_for_testing( - beacon_chain.clone(), - network_globals.clone(), - harness.runtime.task_executor.clone(), - ); - - let mut backfill = BackFillSync::new(beacon_chain, network_globals); - backfill.set_state(BackFillState::Syncing); - - // if this ends up running into an infinite loop, the test will overflow the stack pretty quickly. 
- let _ = backfill.request_batches(&mut network); - } -} diff --git a/beacon_node/network/src/sync/block_lookups/common.rs b/beacon_node/network/src/sync/block_lookups/common.rs deleted file mode 100644 index 86b6894bac4..00000000000 --- a/beacon_node/network/src/sync/block_lookups/common.rs +++ /dev/null @@ -1,217 +0,0 @@ -use crate::sync::block_lookups::single_block_lookup::{ - LookupRequestError, SingleBlockLookup, SingleLookupRequestState, -}; -use crate::sync::block_lookups::{ - BlobRequestState, BlockRequestState, CustodyRequestState, PeerId, -}; -use crate::sync::manager::BlockProcessType; -use crate::sync::network_context::{LookupRequestResult, SyncNetworkContext}; -use beacon_chain::BeaconChainTypes; -use lighthouse_network::service::api_types::Id; -use parking_lot::RwLock; -use std::collections::HashSet; -use std::sync::Arc; -use types::blob_sidecar::FixedBlobSidecarList; -use types::{DataColumnSidecarList, SignedBeaconBlock}; - -use super::single_block_lookup::{ComponentRequests, DownloadResult}; -use super::SingleLookupId; - -#[derive(Debug, Copy, Clone)] -pub enum ResponseType { - Block, - Blob, - CustodyColumn, -} - -/// This trait unifies common single block lookup functionality across blocks and blobs. This -/// includes making requests, verifying responses, and handling processing results. A -/// `SingleBlockLookup` includes both a `BlockRequestState` and a `BlobRequestState`, this trait is -/// implemented for each. -/// -/// The use of the `ResponseType` associated type gives us a degree of type -/// safety when handling a block/blob response ensuring we only mutate the correct corresponding -/// state. -pub trait RequestState { - /// The type created after validation. - type VerifiedResponseType: Clone; - - /// Request the network context to prepare a request of a component of `block_root`. If the - /// request is not necessary because the component is already known / processed, return false. 
- /// Return true if it sent a request and we can expect an event back from the network. - fn make_request( - &self, - id: Id, - lookup_peers: Arc>>, - expected_blobs: usize, - cx: &mut SyncNetworkContext, - ) -> Result; - - /* Response handling methods */ - - /// Send the response to the beacon processor. - fn send_for_processing( - id: Id, - result: DownloadResult, - cx: &SyncNetworkContext, - ) -> Result<(), LookupRequestError>; - - /* Utility methods */ - - /// Returns the `ResponseType` associated with this trait implementation. Useful in logging. - fn response_type() -> ResponseType; - - /// A getter for the `BlockRequestState` or `BlobRequestState` associated with this trait. - fn request_state_mut(request: &mut SingleBlockLookup) -> Result<&mut Self, &'static str>; - - /// A getter for a reference to the `SingleLookupRequestState` associated with this trait. - fn get_state(&self) -> &SingleLookupRequestState; - - /// A getter for a mutable reference to the SingleLookupRequestState associated with this trait. - fn get_state_mut(&mut self) -> &mut SingleLookupRequestState; -} - -impl RequestState for BlockRequestState { - type VerifiedResponseType = Arc>; - - fn make_request( - &self, - id: SingleLookupId, - lookup_peers: Arc>>, - _: usize, - cx: &mut SyncNetworkContext, - ) -> Result { - cx.block_lookup_request(id, lookup_peers, self.requested_block_root) - .map_err(LookupRequestError::SendFailedNetwork) - } - - fn send_for_processing( - id: SingleLookupId, - download_result: DownloadResult, - cx: &SyncNetworkContext, - ) -> Result<(), LookupRequestError> { - let DownloadResult { - value, - block_root, - seen_timestamp, - .. 
- } = download_result; - cx.send_block_for_processing(id, block_root, value, seen_timestamp) - .map_err(LookupRequestError::SendFailedProcessor) - } - - fn response_type() -> ResponseType { - ResponseType::Block - } - fn request_state_mut(request: &mut SingleBlockLookup) -> Result<&mut Self, &'static str> { - Ok(&mut request.block_request_state) - } - fn get_state(&self) -> &SingleLookupRequestState { - &self.state - } - fn get_state_mut(&mut self) -> &mut SingleLookupRequestState { - &mut self.state - } -} - -impl RequestState for BlobRequestState { - type VerifiedResponseType = FixedBlobSidecarList; - - fn make_request( - &self, - id: Id, - lookup_peers: Arc>>, - expected_blobs: usize, - cx: &mut SyncNetworkContext, - ) -> Result { - cx.blob_lookup_request(id, lookup_peers, self.block_root, expected_blobs) - .map_err(LookupRequestError::SendFailedNetwork) - } - - fn send_for_processing( - id: Id, - download_result: DownloadResult, - cx: &SyncNetworkContext, - ) -> Result<(), LookupRequestError> { - let DownloadResult { - value, - block_root, - seen_timestamp, - .. - } = download_result; - cx.send_blobs_for_processing(id, block_root, value, seen_timestamp) - .map_err(LookupRequestError::SendFailedProcessor) - } - - fn response_type() -> ResponseType { - ResponseType::Blob - } - fn request_state_mut(request: &mut SingleBlockLookup) -> Result<&mut Self, &'static str> { - match &mut request.component_requests { - ComponentRequests::WaitingForBlock => Err("waiting for block"), - ComponentRequests::ActiveBlobRequest(request, _) => Ok(request), - ComponentRequests::ActiveCustodyRequest { .. } => Err("expecting custody request"), - ComponentRequests::NotNeeded { .. 
} => Err("not needed"), - } - } - fn get_state(&self) -> &SingleLookupRequestState { - &self.state - } - fn get_state_mut(&mut self) -> &mut SingleLookupRequestState { - &mut self.state - } -} - -impl RequestState for CustodyRequestState { - type VerifiedResponseType = DataColumnSidecarList; - - fn make_request( - &self, - id: Id, - lookup_peers: Arc>>, - _: usize, - cx: &mut SyncNetworkContext, - ) -> Result { - cx.custody_lookup_request(id, self.block_root, lookup_peers) - .map_err(LookupRequestError::SendFailedNetwork) - } - - fn send_for_processing( - id: Id, - download_result: DownloadResult, - cx: &SyncNetworkContext, - ) -> Result<(), LookupRequestError> { - let DownloadResult { - value, - block_root, - seen_timestamp, - .. - } = download_result; - cx.send_custody_columns_for_processing( - id, - block_root, - value, - seen_timestamp, - BlockProcessType::SingleCustodyColumn(id), - ) - .map_err(LookupRequestError::SendFailedProcessor) - } - - fn response_type() -> ResponseType { - ResponseType::CustodyColumn - } - fn request_state_mut(request: &mut SingleBlockLookup) -> Result<&mut Self, &'static str> { - match &mut request.component_requests { - ComponentRequests::WaitingForBlock => Err("waiting for block"), - ComponentRequests::ActiveBlobRequest { .. } => Err("expecting blob request"), - ComponentRequests::ActiveCustodyRequest(request) => Ok(request), - ComponentRequests::NotNeeded { .. } => Err("not needed"), - } - } - fn get_state(&self) -> &SingleLookupRequestState { - &self.state - } - fn get_state_mut(&mut self) -> &mut SingleLookupRequestState { - &mut self.state - } -} diff --git a/beacon_node/network/src/sync/block_lookups/mod.rs b/beacon_node/network/src/sync/block_lookups/mod.rs deleted file mode 100644 index 96b088747b9..00000000000 --- a/beacon_node/network/src/sync/block_lookups/mod.rs +++ /dev/null @@ -1,1161 +0,0 @@ -//! Implements block lookup sync. -//! -//! 
Block lookup sync is triggered when a peer claims to have imported a block we don't know about. -//! For example, a peer attesting to a head block root that is not in our fork-choice. Lookup sync -//! is recursive in nature, as we may discover that this attested head block root has a parent that -//! is also unknown to us. -//! -//! Block lookup is implemented as an event-driven state machine. It sends events to the network and -//! beacon processor, and expects some set of events back. A discrepancy in the expected event API -//! will result in lookups getting "stuck". A lookup becomes stuck when there is no future event -//! that will trigger the lookup to make progress. There's a fallback mechanism that drops lookups -//! that live for too long, logging the line "Notify the devs a sync lookup is stuck". -//! -//! The expected event API is documented in the code paths that are making assumptions with the -//! comment prefix "Lookup sync event safety:" -//! -//! Block lookup sync attempts to not re-download or re-process data that we already have. Block -//! components are cached temporarily in multiple places before they are imported into fork-choice. -//! Therefore, block lookup sync must peek these caches correctly to decide when to skip a download -//! or consider a lookup complete. These caches are read from the `SyncNetworkContext` and its state -//! returned to this module as `LookupRequestResult` variants. 
- -use self::parent_chain::{compute_parent_chains, NodeChain}; -pub use self::single_block_lookup::DownloadResult; -use self::single_block_lookup::{LookupRequestError, LookupResult, SingleBlockLookup}; -use super::manager::{BlockProcessType, BlockProcessingResult, SLOT_IMPORT_TOLERANCE}; -use super::network_context::{PeerGroup, RpcResponseError, SyncNetworkContext}; -use crate::metrics; -use crate::sync::block_lookups::common::ResponseType; -use crate::sync::block_lookups::parent_chain::find_oldest_fork_ancestor; -use crate::sync::SyncMessage; -use beacon_chain::block_verification_types::AsBlock; -use beacon_chain::data_availability_checker::{ - AvailabilityCheckError, AvailabilityCheckErrorCategory, -}; -use beacon_chain::{AvailabilityProcessingStatus, BeaconChainTypes, BlockError}; -pub use common::RequestState; -use fnv::FnvHashMap; -use itertools::Itertools; -use lighthouse_network::service::api_types::SingleLookupReqId; -use lighthouse_network::{PeerAction, PeerId}; -use lru_cache::LRUTimeCache; -pub use single_block_lookup::{BlobRequestState, BlockRequestState, CustodyRequestState}; -use std::collections::hash_map::Entry; -use std::sync::Arc; -use std::time::Duration; -use store::Hash256; -use tracing::{debug, error, instrument, warn}; -use types::{BlobSidecar, DataColumnSidecar, EthSpec, SignedBeaconBlock}; - -pub mod common; -pub mod parent_chain; -mod single_block_lookup; - -/// The maximum depth we will search for a parent block. In principle we should have sync'd any -/// canonical chain to its head once the peer connects. A chain should not appear where it's depth -/// is further back than the most recent head slot. -/// -/// Have the same value as range's sync tolerance to consider a peer synced. Once sync lookup -/// reaches the maximum depth it will force trigger range sync. 
-pub(crate) const PARENT_DEPTH_TOLERANCE: usize = SLOT_IMPORT_TOLERANCE; - -const FAILED_CHAINS_CACHE_EXPIRY_SECONDS: u64 = 60; -pub const SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS: u8 = 4; - -/// Maximum time we allow a lookup to exist before assuming it is stuck and will never make -/// progress. Assume the worse case processing time per block component set * times max depth. -/// 15 * 2 * 32 = 16 minutes. -const LOOKUP_MAX_DURATION_STUCK_SECS: u64 = 15 * PARENT_DEPTH_TOLERANCE as u64; -/// The most common case of child-lookup without peers is receiving block components before the -/// attestation deadline when the node is lagging behind. Once peers start attesting for the child -/// lookup at most after 4 seconds, the lookup should gain peers. -const LOOKUP_MAX_DURATION_NO_PEERS_SECS: u64 = 10; - -/// Lookups contain untrusted data, including blocks that have not yet been validated. In case of -/// bugs or malicious activity we want to bound how much memory these lookups can consume. Aprox the -/// max size of a lookup is ~ 10 MB (current max size of gossip and RPC blocks). 200 lookups can -/// take at most 2 GB. 200 lookups allow 3 parallel chains of depth 64 (current maximum). 
-const MAX_LOOKUPS: usize = 200; - -pub enum BlockComponent { - Block(DownloadResult>>), - Blob(DownloadResult>>), - DataColumn(DownloadResult>>), -} - -impl BlockComponent { - fn parent_root(&self) -> Hash256 { - match self { - BlockComponent::Block(block) => block.value.parent_root(), - BlockComponent::Blob(blob) => blob.value.block_parent_root(), - BlockComponent::DataColumn(column) => column.value.block_parent_root(), - } - } - fn get_type(&self) -> &'static str { - match self { - BlockComponent::Block(_) => "block", - BlockComponent::Blob(_) => "blob", - BlockComponent::DataColumn(_) => "data_column", - } - } -} - -pub type SingleLookupId = u32; - -enum Action { - Retry, - ParentUnknown { parent_root: Hash256 }, - Drop(/* reason: */ String), - Continue, -} - -pub struct BlockLookups { - /// A cache of failed chain lookups to prevent duplicate searches. - failed_chains: LRUTimeCache, - - // TODO: Why not index lookups by block_root? - single_block_lookups: FnvHashMap>, -} - -#[cfg(test)] -use lighthouse_network::service::api_types::Id; - -#[cfg(test)] -/// Tuple of `SingleLookupId`, requested block root, awaiting parent block root (if any), -/// and list of peers that claim to have imported this set of block components. 
-pub(crate) type BlockLookupSummary = (Id, Hash256, Option, Vec); - -impl BlockLookups { - #[instrument(parent = None,level = "info", fields(service = "lookup_sync"), name = "lookup_sync")] - pub fn new() -> Self { - Self { - failed_chains: LRUTimeCache::new(Duration::from_secs( - FAILED_CHAINS_CACHE_EXPIRY_SECONDS, - )), - single_block_lookups: Default::default(), - } - } - - #[cfg(test)] - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub(crate) fn insert_failed_chain(&mut self, block_root: Hash256) { - self.failed_chains.insert(block_root); - } - - #[cfg(test)] - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub(crate) fn get_failed_chains(&mut self) -> Vec { - self.failed_chains.keys().cloned().collect() - } - - #[cfg(test)] - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub(crate) fn active_single_lookups(&self) -> Vec { - self.single_block_lookups - .iter() - .map(|(id, l)| (*id, l.block_root(), l.awaiting_parent(), l.all_peers())) - .collect() - } - - /// Returns a vec of all parent lookup chains by tip, in descending slot order (tip first) - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub(crate) fn active_parent_lookups(&self) -> Vec { - compute_parent_chains( - &self - .single_block_lookups - .values() - .map(|lookup| lookup.into()) - .collect::>(), - ) - } - - /* Lookup requests */ - - /// Creates a parent lookup for the block with the given `block_root` and immediately triggers it. - /// If a parent lookup exists or is triggered, a current lookup will be created. 
- /// - /// Returns true if the lookup is created or already exists - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - #[must_use = "only reference the new lookup if returns true"] - pub fn search_child_and_parent( - &mut self, - block_root: Hash256, - block_component: BlockComponent, - peer_id: PeerId, - cx: &mut SyncNetworkContext, - ) -> bool { - let parent_root = block_component.parent_root(); - - let parent_lookup_exists = - self.search_parent_of_child(parent_root, block_root, &[peer_id], cx); - // Only create the child lookup if the parent exists - if parent_lookup_exists { - // `search_parent_of_child` ensures that parent root is not a failed chain - self.new_current_lookup( - block_root, - Some(block_component), - Some(parent_root), - // On a `UnknownParentBlock` or `UnknownParentBlob` event the peer is not required - // to have the rest of the block components (refer to decoupled blob gossip). Create - // the lookup with zero peers to house the block components. - &[], - cx, - ) - } else { - false - } - } - - /// Seach a block whose parent root is unknown. - /// - /// Returns true if the lookup is created or already exists - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - #[must_use = "only reference the new lookup if returns true"] - pub fn search_unknown_block( - &mut self, - block_root: Hash256, - peer_source: &[PeerId], - cx: &mut SyncNetworkContext, - ) -> bool { - self.new_current_lookup(block_root, None, None, peer_source, cx) - } - - /// A block or blob triggers the search of a parent. 
- /// Check if this new lookup extends a bad chain: - /// - Extending `child_block_root_trigger` would exceed the max depth - /// - `block_root_to_search` is a failed chain - /// - /// Returns true if the lookup is created or already exists - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - #[must_use = "only reference the new lookup if returns true"] - pub fn search_parent_of_child( - &mut self, - block_root_to_search: Hash256, - child_block_root_trigger: Hash256, - peers: &[PeerId], - cx: &mut SyncNetworkContext, - ) -> bool { - let parent_chains = self.active_parent_lookups(); - - for (chain_idx, parent_chain) in parent_chains.iter().enumerate() { - // `block_root_to_search` will trigger a new lookup, and it will extend a parent_chain - // beyond its max length - let block_would_extend_chain = parent_chain.ancestor() == child_block_root_trigger; - // `block_root_to_search` already has a lookup, and with the block trigger it extends - // the parent_chain beyond its length. This can happen because when creating a lookup - // for a new root we don't do any parent chain length checks - let trigger_is_chain_tip = parent_chain.tip == child_block_root_trigger; - - if (block_would_extend_chain || trigger_is_chain_tip) - && parent_chain.len() >= PARENT_DEPTH_TOLERANCE - { - debug!(block_root = ?block_root_to_search, "Parent lookup chain too long"); - - // Searching for this parent would extend a parent chain over the max - // Insert the tip only to failed chains - self.failed_chains.insert(parent_chain.tip); - - // Note: Drop only the chain that's too long until it merges with another chain - // that's not too long. Consider this attack: there's a chain of valid unknown - // blocks A -> B. A malicious peer builds `PARENT_DEPTH_TOLERANCE` garbage - // blocks on top of A forming A -> C. 
The malicious peer forces us to fetch C - // from it, which will result in parent A hitting the chain_too_long error. Then - // the valid chain A -> B is dropped too. - // - // `find_oldest_fork_ancestor` should never return Err, unwrapping to tip for - // complete-ness - let parent_chain_tip = parent_chain.tip; - let block_to_drop = - find_oldest_fork_ancestor(parent_chains, chain_idx).unwrap_or(parent_chain_tip); - // Drop all lookups descending from the child of the too long parent chain - if let Some((lookup_id, lookup)) = self - .single_block_lookups - .iter() - .find(|(_, l)| l.block_root() == block_to_drop) - { - // If a lookup chain is too long, we can't distinguish a valid chain from a - // malicious one. We must attempt to sync this chain to not lose liveness. If - // the chain grows too long, we stop lookup sync and transition this head to - // forward range sync. We need to tell range sync which head to sync to, and - // from which peers. The lookup of the very tip of this chain may contain zero - // peers if it's the parent-child lookup. So we do a bit of a trick here: - // - Tell range sync to sync to the tip's root (if available, else its ancestor) - // - But use all peers in the ancestor lookup, which should have at least one - // peer, and its peer set is a strict superset of the tip's lookup. - if let Some((_, tip_lookup)) = self - .single_block_lookups - .iter() - .find(|(_, l)| l.block_root() == parent_chain_tip) - { - cx.send_sync_message(SyncMessage::AddPeersForceRangeSync { - peers: lookup.all_peers(), - head_slot: tip_lookup.peek_downloaded_block_slot(), - head_root: parent_chain_tip, - }); - } else { - // Should never happen, log error and continue the lookup drop - error!( - error = "Parent chain tip lookup not found", - block_root = ?parent_chain_tip, - "Unable to transition lookup to range sync" - ); - } - - // Do not downscore peers here. 
Because we can't distinguish a valid chain from - // a malicious one we may penalize honest peers for attempting to discover us a - // valid chain. Until blocks_by_range allows to specify a tip, for example with - // https://github.com/ethereum/consensus-specs/pull/3845 we will have poor - // attributability. A peer can send us garbage blocks over blocks_by_root, and - // then correct blocks via blocks_by_range. - - self.drop_lookup_and_children(*lookup_id); - } else { - // Should never happen - error!( - error = "Block to drop lookup not found", - block_root = ?block_to_drop, - "Unable to transition lookup to range sync" - ); - } - - return false; - } - } - - // `block_root_to_search` is a failed chain check happens inside new_current_lookup - self.new_current_lookup(block_root_to_search, None, None, peers, cx) - } - - /// Searches for a single block hash. If the blocks parent is unknown, a chain of blocks is - /// constructed. - /// Returns true if the lookup is created or already exists - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - #[must_use = "only reference the new lookup if returns true"] - fn new_current_lookup( - &mut self, - block_root: Hash256, - block_component: Option>, - awaiting_parent: Option, - peers: &[PeerId], - cx: &mut SyncNetworkContext, - ) -> bool { - // If this block or it's parent is part of a known failed chain, ignore it. - if self.failed_chains.contains(&block_root) { - debug!(?block_root, "Block is from a past failed chain. 
Dropping"); - for peer_id in peers { - cx.report_peer(*peer_id, PeerAction::MidToleranceError, "failed_chain"); - } - return false; - } - - // Do not re-request a block that is already being requested - if let Some((&lookup_id, lookup)) = self - .single_block_lookups - .iter_mut() - .find(|(_id, lookup)| lookup.is_for_block(block_root)) - { - if let Some(block_component) = block_component { - let component_type = block_component.get_type(); - let imported = lookup.add_child_components(block_component); - if !imported { - debug!( - ?block_root, - component_type, "Lookup child component ignored" - ); - } - } - - if let Err(e) = self.add_peers_to_lookup_and_ancestors(lookup_id, peers, cx) { - warn!(error = ?e, "Error adding peers to ancestor lookup"); - } - - return true; - } - - // Ensure that awaiting parent exists, otherwise this lookup won't be able to make progress - if let Some(awaiting_parent) = awaiting_parent { - if !self - .single_block_lookups - .iter() - .any(|(_, lookup)| lookup.is_for_block(awaiting_parent)) - { - warn!(block_root = ?awaiting_parent, "Ignoring child lookup parent lookup not found"); - return false; - } - } - - // Lookups contain untrusted data, bound the total count of lookups hold in memory to reduce - // the risk of OOM in case of bugs of malicious activity. - if self.single_block_lookups.len() > MAX_LOOKUPS { - warn!(?block_root, "Dropping lookup reached max"); - return false; - } - - // If we know that this lookup has unknown parent (is awaiting a parent lookup to resolve), - // signal here to hold processing downloaded data. 
- let mut lookup = SingleBlockLookup::new(block_root, peers, cx.next_id(), awaiting_parent); - - // Add block components to the new request - if let Some(block_component) = block_component { - lookup.add_child_components(block_component); - } - - let id = lookup.id; - let lookup = match self.single_block_lookups.entry(id) { - Entry::Vacant(entry) => entry.insert(lookup), - Entry::Occupied(_) => { - // Should never happen - warn!(id, "Lookup exists with same id"); - return false; - } - }; - - debug!( - ?peers, - ?block_root, - awaiting_parent = awaiting_parent - .map(|root| root.to_string()) - .unwrap_or("none".to_owned()), - id = lookup.id, - "Created block lookup" - ); - metrics::inc_counter(&metrics::SYNC_LOOKUP_CREATED); - - let result = lookup.continue_requests(cx); - if self.on_lookup_result(id, result, "new_current_lookup", cx) { - self.update_metrics(); - true - } else { - false - } - } - - /* Lookup responses */ - - /// Process a block or blob response received from a single lookup request. - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub fn on_download_response>( - &mut self, - id: SingleLookupReqId, - response: Result<(R::VerifiedResponseType, PeerGroup, Duration), RpcResponseError>, - cx: &mut SyncNetworkContext, - ) { - let result = self.on_download_response_inner::(id, response, cx); - self.on_lookup_result(id.lookup_id, result, "download_response", cx); - } - - /// Process a block or blob response received from a single lookup request. 
- pub fn on_download_response_inner>( - &mut self, - id: SingleLookupReqId, - response: Result<(R::VerifiedResponseType, PeerGroup, Duration), RpcResponseError>, - cx: &mut SyncNetworkContext, - ) -> Result { - // Note: no need to downscore peers here, already downscored on network context - - let response_type = R::response_type(); - let Some(lookup) = self.single_block_lookups.get_mut(&id.lookup_id) else { - // We don't have the ability to cancel in-flight RPC requests. So this can happen - // if we started this RPC request, and later saw the block/blobs via gossip. - debug!(?id, "Block returned for single block lookup not present"); - return Err(LookupRequestError::UnknownLookup); - }; - - let block_root = lookup.block_root(); - let request_state = R::request_state_mut(lookup) - .map_err(|e| LookupRequestError::BadState(e.to_owned()))? - .get_state_mut(); - - match response { - Ok((response, peer_group, seen_timestamp)) => { - debug!( - ?block_root, - ?id, - ?peer_group, - ?response_type, - "Received lookup download success" - ); - - // Here we could check if response extends a parent chain beyond its max length. - // However we defer that check to the handling of a processing error ParentUnknown. - // - // Here we could check if there's already a lookup for parent_root of `response`. In - // that case we know that sending the response for processing will likely result in - // a `ParentUnknown` error. However, for simplicity we choose to not implement this - // optimization. - - // Register the download peer here. Once we have received some data over the wire we - // attribute it to this peer for scoring latter regardless of how the request was - // done. - request_state.on_download_success( - id.req_id, - DownloadResult { - value: response, - block_root, - seen_timestamp, - peer_group, - }, - )?; - // continue_request will send for processing as the request state is AwaitingProcessing - } - Err(e) => { - // No need to log peer source here. 
When sending a DataColumnsByRoot request we log - // the peer and the request ID which is linked to this `id` value here. - debug!( - ?block_root, - ?id, - ?response_type, - error = ?e, - "Received lookup download failure" - ); - - request_state.on_download_failure(id.req_id)?; - // continue_request will retry a download as the request state is AwaitingDownload - } - } - - lookup.continue_requests(cx) - } - - /* Error responses */ - - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub fn peer_disconnected(&mut self, peer_id: &PeerId) { - for (_, lookup) in self.single_block_lookups.iter_mut() { - lookup.remove_peer(peer_id); - } - } - - /* Processing responses */ - - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub fn on_processing_result( - &mut self, - process_type: BlockProcessType, - result: BlockProcessingResult, - cx: &mut SyncNetworkContext, - ) { - let lookup_result = match process_type { - BlockProcessType::SingleBlock { id } => { - self.on_processing_result_inner::>(id, result, cx) - } - BlockProcessType::SingleBlob { id } => { - self.on_processing_result_inner::>(id, result, cx) - } - BlockProcessType::SingleCustodyColumn(id) => { - self.on_processing_result_inner::>(id, result, cx) - } - }; - self.on_lookup_result(process_type.id(), lookup_result, "processing_result", cx); - } - - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub fn on_processing_result_inner>( - &mut self, - lookup_id: SingleLookupId, - result: BlockProcessingResult, - cx: &mut SyncNetworkContext, - ) -> Result { - let Some(lookup) = self.single_block_lookups.get_mut(&lookup_id) else { - debug!(id = lookup_id, "Unknown single block lookup"); - return Err(LookupRequestError::UnknownLookup); - }; - - let block_root = lookup.block_root(); - let request_state = 
R::request_state_mut(lookup) - .map_err(|e| LookupRequestError::BadState(e.to_owned()))? - .get_state_mut(); - - debug!( - component = ?R::response_type(), - ?block_root, - id = lookup_id, - ?result, - "Received lookup processing result" - ); - - let action = match result { - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported(_)) - | BlockProcessingResult::Err(BlockError::DuplicateFullyImported(..)) => { - // Successfully imported - request_state.on_processing_success()?; - Action::Continue - } - - BlockProcessingResult::Ok(AvailabilityProcessingStatus::MissingComponents { - .. - }) => { - // `on_processing_success` is called here to ensure the request state is updated prior to checking - // if both components have been processed. - request_state.on_processing_success()?; - - if lookup.all_components_processed() { - // We don't request for other block components until being sure that the block has - // data. If we request blobs / columns to a peer we are sure those must exist. - // Therefore if all components are processed and we still receive `MissingComponents` - // it indicates an internal bug. - return Err(LookupRequestError::MissingComponentsAfterAllProcessed); - } else { - // Continue request, potentially request blobs - Action::Retry - } - } - BlockProcessingResult::Err(BlockError::DuplicateImportStatusUnknown(..)) => { - // This is unreachable because RPC blocks do not undergo gossip verification, and - // this error can *only* come from gossip verification. - error!(?block_root, "Single block lookup hit unreachable condition"); - Action::Drop("DuplicateImportStatusUnknown".to_owned()) - } - BlockProcessingResult::Ignored => { - // Beacon processor signalled to ignore the block processing result. - // This implies that the cpu is overloaded. Drop the request. 
- warn!( - component = ?R::response_type(), - "Lookup component processing ignored, cpu might be overloaded" - ); - Action::Drop("Block processing ignored".to_owned()) - } - BlockProcessingResult::Err(e) => { - match e { - BlockError::BeaconChainError(e) => { - // Internal error - error!(%block_root, error = ?e, "Beacon chain error processing lookup component"); - Action::Drop(format!("{e:?}")) - } - BlockError::ParentUnknown { parent_root, .. } => { - // Reverts the status of this request to `AwaitingProcessing` holding the - // downloaded data. A future call to `continue_requests` will re-submit it - // once there are no pending parent requests. - // Note: `BlockError::ParentUnknown` is only returned when processing - // blocks, not blobs. - request_state.revert_to_awaiting_processing()?; - Action::ParentUnknown { parent_root } - } - ref e @ BlockError::ExecutionPayloadError(ref epe) if !epe.penalize_peer() => { - // These errors indicate that the execution layer is offline - // and failed to validate the execution payload. Do not downscore peer. - debug!( - ?block_root, - error = ?e, - "Single block lookup failed. Execution layer is offline / unsynced / misconfigured" - ); - Action::Drop(format!("{e:?}")) - } - BlockError::AvailabilityCheck(e) - if e.category() == AvailabilityCheckErrorCategory::Internal => - { - // There errors indicate internal problems and should not downscore the peer - warn!(?block_root, error = ?e, "Internal availability check failure"); - - // Here we choose *not* to call `on_processing_failure` because this could result in a bad - // lookup state transition. This error invalidates both blob and block requests, and we don't know the - // state of both requests. Blobs may have already successfullly processed for example. - // We opt to drop the lookup instead. 
- Action::Drop(format!("{e:?}")) - } - other => { - debug!( - ?block_root, - component = ?R::response_type(), - error = ?other, - "Invalid lookup component" - ); - let peer_group = request_state.on_processing_failure()?; - let peers_to_penalize: Vec<_> = match other { - // Note: currenlty only InvalidColumn errors have index granularity, - // but future errors may follow the same pattern. Generalize this - // pattern with https://github.com/sigp/lighthouse/pull/6321 - BlockError::AvailabilityCheck( - AvailabilityCheckError::InvalidColumn(errors), - ) => errors - .iter() - // Collect all peers that sent a column that was invalid. Must - // run .unique as a single peer can send multiple invalid - // columns. Penalize once to avoid insta-bans - .flat_map(|(index, _)| peer_group.of_index((*index) as usize)) - .unique() - .collect(), - _ => peer_group.all().collect(), - }; - for peer in peers_to_penalize { - cx.report_peer( - *peer, - PeerAction::MidToleranceError, - match R::response_type() { - ResponseType::Block => "lookup_block_processing_failure", - ResponseType::Blob => "lookup_blobs_processing_failure", - ResponseType::CustodyColumn => { - "lookup_custody_column_processing_failure" - } - }, - ); - } - - Action::Retry - } - } - } - }; - - match action { - Action::Retry => { - // Trigger download for all components in case `MissingComponents` failed the blob - // request. Also if blobs are `AwaitingProcessing` and need to be progressed - lookup.continue_requests(cx) - } - Action::ParentUnknown { parent_root } => { - let peers = lookup.all_peers(); - // Mark lookup as awaiting **before** creating the parent lookup. At this point the - // lookup maybe inconsistent. - lookup.set_awaiting_parent(parent_root); - let parent_lookup_exists = - self.search_parent_of_child(parent_root, block_root, &peers, cx); - if parent_lookup_exists { - // The parent lookup exist or has been created. It's safe for `lookup` to - // reference the parent as awaiting. 
- debug!( - id = lookup_id, - ?block_root, - ?parent_root, - "Marking lookup as awaiting parent" - ); - Ok(LookupResult::Pending) - } else { - // The parent lookup is faulty and was not created, we must drop the `lookup` as - // it's in an inconsistent state. We must drop all of its children too. - Err(LookupRequestError::Failed(format!( - "Parent lookup is faulty {parent_root:?}" - ))) - } - } - Action::Drop(reason) => { - // Drop with noop - Err(LookupRequestError::Failed(reason)) - } - Action::Continue => { - // Drop this completed lookup only - Ok(LookupResult::Completed) - } - } - } - - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub fn on_external_processing_result( - &mut self, - block_root: Hash256, - imported: bool, - cx: &mut SyncNetworkContext, - ) { - let Some((id, lookup)) = self - .single_block_lookups - .iter_mut() - .find(|(_, lookup)| lookup.is_for_block(block_root)) - else { - // Ok to ignore gossip process events - return; - }; - - let lookup_result = if imported { - Ok(LookupResult::Completed) - } else { - lookup.continue_requests(cx) - }; - let id = *id; - self.on_lookup_result(id, lookup_result, "external_processing_result", cx); - } - - /// Makes progress on the immediate children of `block_root` - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub fn continue_child_lookups(&mut self, block_root: Hash256, cx: &mut SyncNetworkContext) { - let mut lookup_results = vec![]; // < need to buffer lookup results to not re-borrow &mut self - - for (id, lookup) in self.single_block_lookups.iter_mut() { - if lookup.awaiting_parent() == Some(block_root) { - lookup.resolve_awaiting_parent(); - debug!( - parent_root = ?block_root, - id, - block_root = ?lookup.block_root(), - "Continuing child lookup" - ); - let result = lookup.continue_requests(cx); - lookup_results.push((*id, result)); - } - } - - for 
(id, result) in lookup_results { - self.on_lookup_result(id, result, "continue_child_lookups", cx); - } - } - - /// Drops `dropped_id` lookup and all its children recursively. Lookups awaiting a parent need - /// the parent to make progress to resolve, therefore we must drop them if the parent is - /// dropped. - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub fn drop_lookup_and_children(&mut self, dropped_id: SingleLookupId) { - if let Some(dropped_lookup) = self.single_block_lookups.remove(&dropped_id) { - debug!( - id = ?dropped_id, - block_root = ?dropped_lookup.block_root(), - awaiting_parent = ?dropped_lookup.awaiting_parent(), - "Dropping lookup" - ); - - let child_lookups = self - .single_block_lookups - .iter() - .filter(|(_, lookup)| lookup.awaiting_parent() == Some(dropped_lookup.block_root())) - .map(|(id, _)| *id) - .collect::>(); - - for id in child_lookups { - self.drop_lookup_and_children(id); - } - } - } - - /// Common handler a lookup request error, drop it and update metrics - /// Returns true if the lookup is created or already exists - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - fn on_lookup_result( - &mut self, - id: SingleLookupId, - result: Result, - source: &str, - cx: &mut SyncNetworkContext, - ) -> bool { - match result { - Ok(LookupResult::Pending) => true, // no action - Ok(LookupResult::Completed) => { - if let Some(lookup) = self.single_block_lookups.remove(&id) { - debug!(block = ?lookup.block_root(), id, "Dropping completed lookup"); - metrics::inc_counter(&metrics::SYNC_LOOKUP_COMPLETED); - // Block imported, continue the requests of pending child blocks - self.continue_child_lookups(lookup.block_root(), cx); - self.update_metrics(); - } else { - debug!(id, "Attempting to drop non-existent lookup"); - } - false - } - // If UnknownLookup do not log the request error. 
No need to drop child lookups nor - // update metrics because the lookup does not exist. - Err(LookupRequestError::UnknownLookup) => false, - Err(error) => { - debug!(id, source, ?error, "Dropping lookup on request error"); - metrics::inc_counter_vec(&metrics::SYNC_LOOKUP_DROPPED, &[error.into()]); - self.drop_lookup_and_children(id); - self.update_metrics(); - false - } - } - } - - /* Helper functions */ - - /// Drops all the single block requests and returns how many requests were dropped. - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub fn drop_single_block_requests(&mut self) -> usize { - let requests_to_drop = self.single_block_lookups.len(); - self.single_block_lookups.clear(); - requests_to_drop - } - - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub fn update_metrics(&self) { - metrics::set_gauge( - &metrics::SYNC_SINGLE_BLOCK_LOOKUPS, - self.single_block_lookups.len() as i64, - ); - } - - /// Perform some prune operations on lookups on some interval - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - pub fn prune_lookups(&mut self) { - self.drop_lookups_without_peers(); - self.drop_stuck_lookups(); - } - - /// Lookups without peers are allowed to exist for some time. See this common race condition: - /// - /// 1. Receive unknown block parent event - /// 2. Create child lookup with zero peers - /// 3. Parent is processed, before receiving any attestation for the child block - /// 4. Child lookup is attempted to make progress but has no peers - /// 5. We receive an attestion for child block and add a peer to the child block lookup - /// - /// On step 4 we could drop the lookup because we attempt to issue a request with no peers - /// available. 
This has two issues: - /// - We may drop the lookup while some other block component is processing, triggering an - /// unknown lookup error. This can potentially cause un-related child lookups to also be - /// dropped when calling `drop_lookup_and_children`. - /// - We lose all progress of the lookup, and have to re-download its components that we may - /// already have there cached. - /// - /// Instead there's no negative for keeping lookups with no peers around for some time. If we - /// regularly prune them, it should not be a memory concern (TODO: maybe yes!). - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - fn drop_lookups_without_peers(&mut self) { - for (lookup_id, block_root) in self - .single_block_lookups - .values() - .filter(|lookup| { - // Do not drop lookup that are awaiting events to prevent inconsinstencies. If a - // lookup gets stuck, it will be eventually pruned by `drop_stuck_lookups` - lookup.has_no_peers() - && lookup.elapsed_since_created() - > Duration::from_secs(LOOKUP_MAX_DURATION_NO_PEERS_SECS) - && !lookup.is_awaiting_event() - }) - .map(|lookup| (lookup.id, lookup.block_root())) - .collect::>() - { - debug!( - id = lookup_id, - %block_root, - "Dropping lookup with no peers" - ); - self.drop_lookup_and_children(lookup_id); - } - } - - /// Safety mechanism to unstuck lookup sync. Lookup sync if purely event driven and depends on - /// external components to feed it events to make progress. If there is a bug in network, in - /// beacon processor, or here internally: lookups can get stuck forever. A stuck lookup can - /// stall a node indefinitely as other lookup will be awaiting on a parent lookup to make - /// progress. - /// - /// If a lookup lasts more than LOOKUP_MAX_DURATION_SECS this function will find its oldest - /// ancestor and then drop it and all its children. This action will allow the node to unstuck - /// itself. 
Bugs that cause lookups to get stuck may be triggered consistently. So this strategy - /// is useful for two reasons: - /// - /// - One single clear warn level log per stuck incident - /// - If the original bug is sporadic, it reduces the time a node is stuck from forever to 15 min - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - fn drop_stuck_lookups(&mut self) { - // While loop to find and drop all disjoint trees of potentially stuck lookups. - while let Some(stuck_lookup) = self.single_block_lookups.values().find(|lookup| { - lookup.elapsed_since_created() > Duration::from_secs(LOOKUP_MAX_DURATION_STUCK_SECS) - }) { - let ancestor_stuck_lookup = match self.find_oldest_ancestor_lookup(stuck_lookup) { - Ok(lookup) => lookup, - Err(e) => { - warn!(error = ?e,"Error finding oldest ancestor lookup"); - // Default to dropping the lookup that exceeds the max duration so at least - // eventually sync should be unstuck - stuck_lookup - } - }; - - if stuck_lookup.id == ancestor_stuck_lookup.id { - warn!( - block_root = ?stuck_lookup.block_root(), - lookup = ?stuck_lookup, - "Notify the devs a sync lookup is stuck" - ); - } else { - warn!( - block_root = ?stuck_lookup.block_root(), - lookup = ?stuck_lookup, - ancestor_block_root = ?ancestor_stuck_lookup.block_root(), - ancestor_lookup = ?ancestor_stuck_lookup, - "Notify the devs a sync lookup is stuck" - ); - } - - metrics::inc_counter(&metrics::SYNC_LOOKUPS_STUCK); - self.drop_lookup_and_children(ancestor_stuck_lookup.id); - } - } - - /// Recursively find the oldest ancestor lookup of another lookup - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - fn find_oldest_ancestor_lookup<'a>( - &'a self, - lookup: &'a SingleBlockLookup, - ) -> Result<&'a SingleBlockLookup, String> { - if let Some(awaiting_parent) = lookup.awaiting_parent() { - if let Some(lookup) = self - 
.single_block_lookups - .values() - .find(|l| l.block_root() == awaiting_parent) - { - self.find_oldest_ancestor_lookup(lookup) - } else { - Err(format!( - "Lookup references unknown parent {awaiting_parent:?}" - )) - } - } else { - Ok(lookup) - } - } - - /// Adds peers to a lookup and its ancestors recursively. - /// Note: Takes a `lookup_id` as argument to allow recursion on mutable lookups, without having - /// to duplicate the code to add peers to a lookup - #[instrument(parent = None, - level = "info", - fields(service = "lookup_sync"), - name = "lookup_sync", - skip_all - )] - fn add_peers_to_lookup_and_ancestors( - &mut self, - lookup_id: SingleLookupId, - peers: &[PeerId], - cx: &mut SyncNetworkContext, - ) -> Result<(), String> { - let lookup = self - .single_block_lookups - .get_mut(&lookup_id) - .ok_or(format!("Unknown lookup for id {lookup_id}"))?; - - let mut added_some_peer = false; - for peer in peers { - if lookup.add_peer(*peer) { - added_some_peer = true; - debug!( - block_root = ?lookup.block_root(), - ?peer, - "Adding peer to existing single block lookup" - ); - } - } - - if let Some(parent_root) = lookup.awaiting_parent() { - if let Some((&child_id, _)) = self - .single_block_lookups - .iter() - .find(|(_, l)| l.block_root() == parent_root) - { - self.add_peers_to_lookup_and_ancestors(child_id, peers, cx) - } else { - Err(format!("Lookup references unknown parent {parent_root:?}")) - } - } else if added_some_peer { - // If this lookup is not awaiting a parent and we added at least one peer, attempt to - // make progress. It is possible that a lookup is created with zero peers, attempted to - // make progress, and then receives peers. After that time the lookup will never be - // pruned with `drop_lookups_without_peers` because it has peers. This is rare corner - // case, but it can result in stuck lookups. 
- let result = lookup.continue_requests(cx); - self.on_lookup_result(lookup_id, result, "add_peers", cx); - Ok(()) - } else { - Ok(()) - } - } -} diff --git a/beacon_node/network/src/sync/block_lookups/parent_chain.rs b/beacon_node/network/src/sync/block_lookups/parent_chain.rs deleted file mode 100644 index 009b5e2ff74..00000000000 --- a/beacon_node/network/src/sync/block_lookups/parent_chain.rs +++ /dev/null @@ -1,198 +0,0 @@ -use super::single_block_lookup::SingleBlockLookup; -use beacon_chain::BeaconChainTypes; -use std::collections::{HashMap, HashSet}; -use types::Hash256; - -/// Summary of a lookup of which we may not know it's parent_root yet -pub(crate) struct Node { - block_root: Hash256, - parent_root: Option, -} - -impl From<&SingleBlockLookup> for Node { - fn from(value: &SingleBlockLookup) -> Self { - Self { - block_root: value.block_root(), - parent_root: value.awaiting_parent(), - } - } -} - -/// Wrapper around a chain of block roots that have a least one element (tip) -pub(crate) struct NodeChain { - // Parent chain blocks in descending slot order - pub(crate) chain: Vec, - pub(crate) tip: Hash256, -} - -impl NodeChain { - /// Returns the block_root of the oldest ancestor (min slot) of this chain - pub(crate) fn ancestor(&self) -> Hash256 { - self.chain.last().copied().unwrap_or(self.tip) - } - pub(crate) fn len(&self) -> usize { - self.chain.len() - } -} - -/// Given a set of nodes that reference each other, returns a list of chains with unique tips that -/// contain at least two elements. In descending slot order (tip first). 
-pub(crate) fn compute_parent_chains(nodes: &[Node]) -> Vec { - let mut child_to_parent = HashMap::new(); - let mut parent_to_child = HashMap::>::new(); - for node in nodes { - child_to_parent.insert(node.block_root, node.parent_root); - if let Some(parent_root) = node.parent_root { - parent_to_child - .entry(parent_root) - .or_default() - .push(node.block_root); - } - } - - let mut parent_chains = vec![]; - - // Iterate blocks with no children - for tip in nodes { - let mut block_root = tip.block_root; - if !parent_to_child.contains_key(&block_root) { - let mut chain = vec![]; - - // Resolve chain of blocks - while let Some(parent_root) = child_to_parent.get(&block_root) { - // block_root is a known block that may or may not have a parent root - chain.push(block_root); - if let Some(parent_root) = parent_root { - block_root = *parent_root; - } else { - break; - } - } - - if chain.len() > 1 { - parent_chains.push(NodeChain { - chain, - tip: tip.block_root, - }); - } - } - } - - parent_chains -} - -/// Given a list of node chains, find the oldest node of a specific chain that is not contained in -/// any other chain. -pub(crate) fn find_oldest_fork_ancestor( - parent_chains: Vec, - chain_idx: usize, -) -> Result { - let mut other_blocks = HashSet::new(); - - // Register blocks from other chains - for (i, parent_chain) in parent_chains.iter().enumerate() { - if i != chain_idx { - for block in &parent_chain.chain { - other_blocks.insert(block); - } - } - } - - // Should never happen - let parent_chain = parent_chains - .get(chain_idx) - .ok_or("chain_idx out of bounds")?; - // Find the first block in the target parent chain that is not in other parent chains - // Iterate in ascending slot order - for block in parent_chain.chain.iter().rev() { - if !other_blocks.contains(block) { - return Ok(*block); - } - } - - // No match means that the chain is fully contained within another chain. 
This should never - // happen, but if that was the case just return the tip - Ok(parent_chain.tip) -} - -#[cfg(test)] -mod tests { - use super::{compute_parent_chains, find_oldest_fork_ancestor, Node}; - use types::{FixedBytesExtended, Hash256}; - - fn h(n: u64) -> Hash256 { - Hash256::from_low_u64_be(n) - } - - fn n(block: u64) -> Node { - Node { - block_root: h(block), - parent_root: None, - } - } - - fn np(parent: u64, block: u64) -> Node { - Node { - block_root: h(block), - parent_root: Some(h(parent)), - } - } - - fn compute_parent_chains_test(nodes: &[Node], expected_chain: Vec>) { - assert_eq!( - compute_parent_chains(nodes) - .iter() - .map(|c| c.chain.clone()) - .collect::>(), - expected_chain - ); - } - - fn find_oldest_fork_ancestor_test(nodes: &[Node], expected: Hash256) { - let chains = compute_parent_chains(nodes); - println!( - "chains {:?}", - chains.iter().map(|c| &c.chain).collect::>() - ); - assert_eq!(find_oldest_fork_ancestor(chains, 0).unwrap(), expected); - } - - #[test] - fn compute_parent_chains_empty_case() { - compute_parent_chains_test(&[], vec![]); - } - - #[test] - fn compute_parent_chains_single_branch() { - compute_parent_chains_test(&[n(0), np(0, 1), np(1, 2)], vec![vec![h(2), h(1), h(0)]]); - } - - #[test] - fn compute_parent_chains_single_branch_with_solo() { - compute_parent_chains_test( - &[n(0), np(0, 1), np(1, 2), np(3, 4)], - vec![vec![h(2), h(1), h(0)]], - ); - } - - #[test] - fn compute_parent_chains_two_forking_branches() { - compute_parent_chains_test( - &[n(0), np(0, 1), np(1, 2), np(1, 3)], - vec![vec![h(2), h(1), h(0)], vec![h(3), h(1), h(0)]], - ); - } - - #[test] - fn compute_parent_chains_two_independent_branches() { - compute_parent_chains_test( - &[n(0), np(0, 1), np(1, 2), n(3), np(3, 4)], - vec![vec![h(2), h(1), h(0)], vec![h(4), h(3)]], - ); - } - - #[test] - fn find_oldest_fork_ancestor_simple_case() { - find_oldest_fork_ancestor_test(&[n(0), np(0, 1), np(1, 2), np(0, 3)], h(1)) - } -} diff --git 
a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs b/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs deleted file mode 100644 index 30947cf1f0a..00000000000 --- a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs +++ /dev/null @@ -1,678 +0,0 @@ -use super::{BlockComponent, PeerId, SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS}; -use crate::sync::block_lookups::common::RequestState; -use crate::sync::network_context::{ - LookupRequestResult, PeerGroup, ReqId, RpcRequestSendError, SendErrorProcessor, - SyncNetworkContext, -}; -use beacon_chain::{BeaconChainTypes, BlockProcessStatus}; -use derivative::Derivative; -use lighthouse_network::service::api_types::Id; -use parking_lot::RwLock; -use std::collections::HashSet; -use std::fmt::Debug; -use std::sync::Arc; -use std::time::{Duration, Instant}; -use store::Hash256; -use strum::IntoStaticStr; -use types::blob_sidecar::FixedBlobSidecarList; -use types::{DataColumnSidecarList, EthSpec, SignedBeaconBlock, Slot}; - -// Dedicated enum for LookupResult to force its usage -#[must_use = "LookupResult must be handled with on_lookup_result"] -pub enum LookupResult { - /// Lookup completed successfully - Completed, - /// Lookup is expecting some future event from the network - Pending, -} - -#[derive(Debug, PartialEq, Eq, IntoStaticStr)] -pub enum LookupRequestError { - /// Too many failed attempts - TooManyAttempts { - /// The failed attempts were primarily due to processing failures. - cannot_process: bool, - }, - /// Error sending event to network - SendFailedNetwork(RpcRequestSendError), - /// Error sending event to processor - SendFailedProcessor(SendErrorProcessor), - /// Inconsistent lookup request state - BadState(String), - /// Lookup failed for some other reason and should be dropped - Failed(/* reason: */ String), - /// Received MissingComponents when all components have been processed. 
This should never - /// happen, and indicates some internal bug - MissingComponentsAfterAllProcessed, - /// Attempted to retrieve a not known lookup id - UnknownLookup, - /// Received a download result for a different request id than the in-flight request. - /// There should only exist a single request at a time. Having multiple requests is a bug and - /// can result in undefined state, so it's treated as a hard error and the lookup is dropped. - UnexpectedRequestId { - expected_req_id: ReqId, - req_id: ReqId, - }, -} - -#[derive(Derivative)] -#[derivative(Debug(bound = "T: BeaconChainTypes"))] -pub struct SingleBlockLookup { - pub id: Id, - pub block_request_state: BlockRequestState, - pub component_requests: ComponentRequests, - /// Peers that claim to have imported this set of block components. This state is shared with - /// the custody request to have an updated view of the peers that claim to have imported the - /// block associated with this lookup. The peer set of a lookup can change rapidly, and faster - /// than the lifetime of a custody request. 
- #[derivative(Debug(format_with = "fmt_peer_set_as_len"))] - peers: Arc>>, - block_root: Hash256, - awaiting_parent: Option, - created: Instant, -} - -#[derive(Debug)] -pub(crate) enum ComponentRequests { - WaitingForBlock, - ActiveBlobRequest(BlobRequestState, usize), - ActiveCustodyRequest(CustodyRequestState), - // When printing in debug this state display the reason why it's not needed - #[allow(dead_code)] - NotNeeded(&'static str), -} - -impl SingleBlockLookup { - pub fn new( - requested_block_root: Hash256, - peers: &[PeerId], - id: Id, - awaiting_parent: Option, - ) -> Self { - Self { - id, - block_request_state: BlockRequestState::new(requested_block_root), - component_requests: ComponentRequests::WaitingForBlock, - peers: Arc::new(RwLock::new(HashSet::from_iter(peers.iter().copied()))), - block_root: requested_block_root, - awaiting_parent, - created: Instant::now(), - } - } - - /// Return the slot of this lookup's block if it's currently cached as `AwaitingProcessing` - pub fn peek_downloaded_block_slot(&self) -> Option { - self.block_request_state - .state - .peek_downloaded_data() - .map(|block| block.slot()) - } - - /// Get the block root that is being requested. - pub fn block_root(&self) -> Hash256 { - self.block_root - } - - pub fn awaiting_parent(&self) -> Option { - self.awaiting_parent - } - - /// Mark this lookup as awaiting a parent lookup from being processed. Meanwhile don't send - /// components for processing. - pub fn set_awaiting_parent(&mut self, parent_root: Hash256) { - self.awaiting_parent = Some(parent_root) - } - - /// Mark this lookup as no longer awaiting a parent lookup. Components can be sent for - /// processing. - pub fn resolve_awaiting_parent(&mut self) { - self.awaiting_parent = None; - } - - /// Returns the time elapsed since this lookup was created - pub fn elapsed_since_created(&self) -> Duration { - self.created.elapsed() - } - - /// Maybe insert a verified response into this lookup. 
Returns true if imported - pub fn add_child_components(&mut self, block_component: BlockComponent) -> bool { - match block_component { - BlockComponent::Block(block) => self - .block_request_state - .state - .insert_verified_response(block), - BlockComponent::Blob(_) | BlockComponent::DataColumn(_) => { - // For now ignore single blobs and columns, as the blob request state assumes all blobs are - // attributed to the same peer = the peer serving the remaining blobs. Ignoring this - // block component has a minor effect, causing the node to re-request this blob - // once the parent chain is successfully resolved - false - } - } - } - - /// Check the block root matches the requested block root. - pub fn is_for_block(&self, block_root: Hash256) -> bool { - self.block_root() == block_root - } - - /// Returns true if the block has already been downloaded. - pub fn all_components_processed(&self) -> bool { - self.block_request_state.state.is_processed() - && match &self.component_requests { - ComponentRequests::WaitingForBlock => false, - ComponentRequests::ActiveBlobRequest(request, _) => request.state.is_processed(), - ComponentRequests::ActiveCustodyRequest(request) => request.state.is_processed(), - ComponentRequests::NotNeeded { .. } => true, - } - } - - /// Returns true if this request is expecting some event to make progress - pub fn is_awaiting_event(&self) -> bool { - self.awaiting_parent.is_some() - || self.block_request_state.state.is_awaiting_event() - || match &self.component_requests { - // If components are waiting for the block request to complete, here we should - // check if the`block_request_state.state.is_awaiting_event(). However we already - // checked that above, so `WaitingForBlock => false` is equivalent. 
- ComponentRequests::WaitingForBlock => false, - ComponentRequests::ActiveBlobRequest(request, _) => { - request.state.is_awaiting_event() - } - ComponentRequests::ActiveCustodyRequest(request) => { - request.state.is_awaiting_event() - } - ComponentRequests::NotNeeded { .. } => false, - } - } - - /// Makes progress on all requests of this lookup. Any error is not recoverable and must result - /// in dropping the lookup. May mark the lookup as completed. - pub fn continue_requests( - &mut self, - cx: &mut SyncNetworkContext, - ) -> Result { - // TODO: Check what's necessary to download, specially for blobs - self.continue_request::>(cx, 0)?; - - if let ComponentRequests::WaitingForBlock = self.component_requests { - let downloaded_block = self - .block_request_state - .state - .peek_downloaded_data() - .cloned(); - - if let Some(block) = downloaded_block.or_else(|| { - // If the block is already being processed or fully validated, retrieve how many blobs - // it expects. Consider any stage of the block. If the block root has been validated, we - // can assert that this is the correct value of `blob_kzg_commitments_count`. 
- match cx.chain.get_block_process_status(&self.block_root) { - BlockProcessStatus::Unknown => None, - BlockProcessStatus::NotValidated(block) - | BlockProcessStatus::ExecutionValidated(block) => Some(block.clone()), - } - }) { - let expected_blobs = block.num_expected_blobs(); - let block_epoch = block.slot().epoch(T::EthSpec::slots_per_epoch()); - if expected_blobs == 0 { - self.component_requests = ComponentRequests::NotNeeded("no data"); - } else if cx.chain.should_fetch_blobs(block_epoch) { - self.component_requests = ComponentRequests::ActiveBlobRequest( - BlobRequestState::new(self.block_root), - expected_blobs, - ); - } else if cx.chain.should_fetch_custody_columns(block_epoch) { - self.component_requests = ComponentRequests::ActiveCustodyRequest( - CustodyRequestState::new(self.block_root), - ); - } else { - self.component_requests = ComponentRequests::NotNeeded("outside da window"); - } - } else { - // Wait to download the block before downloading blobs. Then we can be sure that the - // block has data, so there's no need to do "blind" requests for all possible blobs and - // latter handle the case where if the peer sent no blobs, penalize. - // - // Lookup sync event safety: Reaching this code means that a block is not in any pre-import - // cache nor in the request state of this lookup. Therefore, the block must either: (1) not - // be downloaded yet or (2) the block is already imported into the fork-choice. - // In case (1) the lookup must either successfully download the block or get dropped. - // In case (2) the block will be downloaded, processed, reach `DuplicateFullyImported` - // and get dropped as completed. - } - } - - match &self.component_requests { - ComponentRequests::WaitingForBlock => {} // do nothing - ComponentRequests::ActiveBlobRequest(_, expected_blobs) => { - self.continue_request::>(cx, *expected_blobs)? - } - ComponentRequests::ActiveCustodyRequest(_) => { - self.continue_request::>(cx, 0)? - } - ComponentRequests::NotNeeded { .. 
} => {} // do nothing - } - - // If all components of this lookup are already processed, there will be no future events - // that can make progress so it must be dropped. Consider the lookup completed. - // This case can happen if we receive the components from gossip during a retry. - if self.all_components_processed() { - Ok(LookupResult::Completed) - } else { - Ok(LookupResult::Pending) - } - } - - /// Potentially makes progress on this request if it's in a progress-able state - fn continue_request>( - &mut self, - cx: &mut SyncNetworkContext, - expected_blobs: usize, - ) -> Result<(), LookupRequestError> { - let id = self.id; - let awaiting_parent = self.awaiting_parent.is_some(); - let request = - R::request_state_mut(self).map_err(|e| LookupRequestError::BadState(e.to_owned()))?; - - // Attempt to progress awaiting downloads - if request.get_state().is_awaiting_download() { - // Verify the current request has not exceeded the maximum number of attempts. - let request_state = request.get_state(); - if request_state.failed_attempts() >= SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS { - let cannot_process = request_state.more_failed_processing_attempts(); - return Err(LookupRequestError::TooManyAttempts { cannot_process }); - } - - let peers = self.peers.clone(); - let request = R::request_state_mut(self) - .map_err(|e| LookupRequestError::BadState(e.to_owned()))?; - - match request.make_request(id, peers, expected_blobs, cx)? { - LookupRequestResult::RequestSent(req_id) => { - // Lookup sync event safety: If make_request returns `RequestSent`, we are - // guaranteed that `BlockLookups::on_download_response` will be called exactly - // with this `req_id`. - request.get_state_mut().on_download_start(req_id)? - } - LookupRequestResult::NoRequestNeeded(reason) => { - // Lookup sync event safety: Advances this request to the terminal `Processed` - // state. If all requests reach this state, the request is marked as completed - // in `Self::continue_requests`. 
- request.get_state_mut().on_completed_request(reason)? - } - // Sync will receive a future event to make progress on the request, do nothing now - LookupRequestResult::Pending(reason) => { - // Lookup sync event safety: Refer to the code paths constructing - // `LookupRequestResult::Pending` - request - .get_state_mut() - .update_awaiting_download_status(reason); - return Ok(()); - } - } - - // Otherwise, attempt to progress awaiting processing - // If this request is awaiting a parent lookup to be processed, do not send for processing. - // The request will be rejected with unknown parent error. - } else if !awaiting_parent { - // maybe_start_processing returns Some if state == AwaitingProcess. This pattern is - // useful to conditionally access the result data. - if let Some(result) = request.get_state_mut().maybe_start_processing() { - // Lookup sync event safety: If `send_for_processing` returns Ok() we are guaranteed - // that `BlockLookups::on_processing_result` will be called exactly once with this - // lookup_id - return R::send_for_processing(id, result, cx); - } - // Lookup sync event safety: If the request is not in `AwaitingDownload` or - // `AwaitingProcessing` state it is guaranteed to receive some event to make progress. - } - - // Lookup sync event safety: If a lookup is awaiting a parent we are guaranteed to either: - // (1) attempt to make progress with `BlockLookups::continue_child_lookups` if the parent - // lookup completes, or (2) get dropped if the parent fails and is dropped. - - Ok(()) - } - - /// Get all unique peers that claim to have imported this set of block components - pub fn all_peers(&self) -> Vec { - self.peers.read().iter().copied().collect() - } - - /// Add peer to all request states. The peer must be able to serve this request. - /// Returns true if the peer was newly inserted into some request state. 
- pub fn add_peer(&mut self, peer_id: PeerId) -> bool { - self.peers.write().insert(peer_id) - } - - /// Remove peer from available peers. - pub fn remove_peer(&mut self, peer_id: &PeerId) { - self.peers.write().remove(peer_id); - } - - /// Returns true if this lookup has zero peers - pub fn has_no_peers(&self) -> bool { - self.peers.read().is_empty() - } -} - -/// The state of the blob request component of a `SingleBlockLookup`. -#[derive(Derivative)] -#[derivative(Debug)] -pub struct BlobRequestState { - #[derivative(Debug = "ignore")] - pub block_root: Hash256, - pub state: SingleLookupRequestState>, -} - -impl BlobRequestState { - pub fn new(block_root: Hash256) -> Self { - Self { - block_root, - state: SingleLookupRequestState::new(), - } - } -} - -/// The state of the custody request component of a `SingleBlockLookup`. -#[derive(Derivative)] -#[derivative(Debug)] -pub struct CustodyRequestState { - #[derivative(Debug = "ignore")] - pub block_root: Hash256, - pub state: SingleLookupRequestState>, -} - -impl CustodyRequestState { - pub fn new(block_root: Hash256) -> Self { - Self { - block_root, - state: SingleLookupRequestState::new(), - } - } -} - -/// The state of the block request component of a `SingleBlockLookup`. 
-#[derive(Derivative)] -#[derivative(Debug)] -pub struct BlockRequestState { - #[derivative(Debug = "ignore")] - pub requested_block_root: Hash256, - pub state: SingleLookupRequestState>>, -} - -impl BlockRequestState { - pub fn new(block_root: Hash256) -> Self { - Self { - requested_block_root: block_root, - state: SingleLookupRequestState::new(), - } - } -} - -#[derive(Debug, Clone)] -pub struct DownloadResult { - pub value: T, - pub block_root: Hash256, - pub seen_timestamp: Duration, - pub peer_group: PeerGroup, -} - -#[derive(IntoStaticStr)] -pub enum State { - AwaitingDownload(/* reason */ &'static str), - Downloading(ReqId), - AwaitingProcess(DownloadResult), - /// Request is processing, sent by lookup sync - Processing(DownloadResult), - /// Request is processed - Processed(/* reason */ &'static str), -} - -/// Object representing the state of a single block or blob lookup request. -#[derive(Debug)] -pub struct SingleLookupRequestState { - /// State of this request. - state: State, - /// How many times have we attempted to process this block or blob. - failed_processing: u8, - /// How many times have we attempted to download this block or blob. - failed_downloading: u8, -} - -impl SingleLookupRequestState { - pub fn new() -> Self { - Self { - state: State::AwaitingDownload("not started"), - failed_processing: 0, - failed_downloading: 0, - } - } - - pub fn is_awaiting_download(&self) -> bool { - match self.state { - State::AwaitingDownload { .. } => true, - State::Downloading { .. } - | State::AwaitingProcess { .. } - | State::Processing { .. } - | State::Processed { .. } => false, - } - } - - pub fn is_processed(&self) -> bool { - match self.state { - State::AwaitingDownload { .. } - | State::Downloading { .. } - | State::AwaitingProcess { .. } - | State::Processing { .. } => false, - State::Processed { .. } => true, - } - } - - /// Returns true if we can expect some future event to progress this block component request - /// specifically. 
- pub fn is_awaiting_event(&self) -> bool { - match self.state { - // No event will progress this request specifically, but the request may be put on hold - // due to some external event - State::AwaitingDownload { .. } => false, - // Network will emit a download success / error event - State::Downloading { .. } => true, - // Not awaiting any external event - State::AwaitingProcess { .. } => false, - // Beacon processor will emit a processing result event - State::Processing { .. } => true, - // Request complete, no future event left - State::Processed { .. } => false, - } - } - - pub fn peek_downloaded_data(&self) -> Option<&T> { - match &self.state { - State::AwaitingDownload { .. } => None, - State::Downloading { .. } => None, - State::AwaitingProcess(result) => Some(&result.value), - State::Processing(result) => Some(&result.value), - State::Processed { .. } => None, - } - } - - /// Switch to `AwaitingProcessing` if the request is in `AwaitingDownload` state, otherwise - /// ignore. - pub fn insert_verified_response(&mut self, result: DownloadResult) -> bool { - if let State::AwaitingDownload { .. } = &self.state { - self.state = State::AwaitingProcess(result); - true - } else { - false - } - } - - /// Append metadata on why this request is in AwaitingDownload status. Very helpful to debug - /// stuck lookups. Not fallible as it's purely informational. - pub fn update_awaiting_download_status(&mut self, new_status: &'static str) { - if let State::AwaitingDownload(status) = &mut self.state { - *status = new_status - } - } - - /// Switch to `Downloading` if the request is in `AwaitingDownload` state, otherwise returns None. - pub fn on_download_start(&mut self, req_id: ReqId) -> Result<(), LookupRequestError> { - match &self.state { - State::AwaitingDownload { .. 
} => { - self.state = State::Downloading(req_id); - Ok(()) - } - other => Err(LookupRequestError::BadState(format!( - "Bad state on_download_start expected AwaitingDownload got {other}" - ))), - } - } - - /// Registers a failure in downloading a block. This might be a peer disconnection or a wrong - /// block. - pub fn on_download_failure(&mut self, req_id: ReqId) -> Result<(), LookupRequestError> { - match &self.state { - State::Downloading(expected_req_id) => { - if req_id != *expected_req_id { - return Err(LookupRequestError::UnexpectedRequestId { - expected_req_id: *expected_req_id, - req_id, - }); - } - self.failed_downloading = self.failed_downloading.saturating_add(1); - self.state = State::AwaitingDownload("not started"); - Ok(()) - } - other => Err(LookupRequestError::BadState(format!( - "Bad state on_download_failure expected Downloading got {other}" - ))), - } - } - - pub fn on_download_success( - &mut self, - req_id: ReqId, - result: DownloadResult, - ) -> Result<(), LookupRequestError> { - match &self.state { - State::Downloading(expected_req_id) => { - if req_id != *expected_req_id { - return Err(LookupRequestError::UnexpectedRequestId { - expected_req_id: *expected_req_id, - req_id, - }); - } - self.state = State::AwaitingProcess(result); - Ok(()) - } - other => Err(LookupRequestError::BadState(format!( - "Bad state on_download_success expected Downloading got {other}" - ))), - } - } - - /// Switch to `Processing` if the request is in `AwaitingProcess` state, otherwise returns None. - pub fn maybe_start_processing(&mut self) -> Option> { - // For 2 lines replace state with placeholder to gain ownership of `result` - match &self.state { - State::AwaitingProcess(result) => { - let result = result.clone(); - self.state = State::Processing(result.clone()); - Some(result) - } - _ => None, - } - } - - /// Revert into `AwaitingProcessing`, if the payload if not invalid and can be submitted for - /// processing latter. 
- pub fn revert_to_awaiting_processing(&mut self) -> Result<(), LookupRequestError> { - match &self.state { - State::Processing(result) => { - self.state = State::AwaitingProcess(result.clone()); - Ok(()) - } - other => Err(LookupRequestError::BadState(format!( - "Bad state on revert_to_awaiting_processing expected Processing got {other}" - ))), - } - } - - /// Registers a failure in processing a block. - pub fn on_processing_failure(&mut self) -> Result { - match &self.state { - State::Processing(result) => { - let peers_source = result.peer_group.clone(); - self.failed_processing = self.failed_processing.saturating_add(1); - self.state = State::AwaitingDownload("not started"); - Ok(peers_source) - } - other => Err(LookupRequestError::BadState(format!( - "Bad state on_processing_failure expected Processing got {other}" - ))), - } - } - - pub fn on_processing_success(&mut self) -> Result<(), LookupRequestError> { - match &self.state { - State::Processing(_) => { - self.state = State::Processed("processing success"); - Ok(()) - } - other => Err(LookupRequestError::BadState(format!( - "Bad state on_processing_success expected Processing got {other}" - ))), - } - } - - /// Mark a request as complete without any download or processing - pub fn on_completed_request(&mut self, reason: &'static str) -> Result<(), LookupRequestError> { - match &self.state { - State::AwaitingDownload { .. } => { - self.state = State::Processed(reason); - Ok(()) - } - other => Err(LookupRequestError::BadState(format!( - "Bad state on_completed_request expected AwaitingDownload got {other}" - ))), - } - } - - /// The total number of failures, whether it be processing or downloading. 
- pub fn failed_attempts(&self) -> u8 { - self.failed_processing + self.failed_downloading - } - - pub fn more_failed_processing_attempts(&self) -> bool { - self.failed_processing >= self.failed_downloading - } -} - -// Display is used in the BadState assertions above -impl std::fmt::Display for State { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", Into::<&'static str>::into(self)) - } -} - -// Debug is used in the log_stuck_lookups print to include some more info. Implements custom Debug -// to not dump an entire block or blob to terminal which don't add valuable data. -impl std::fmt::Debug for State { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::AwaitingDownload(reason) => write!(f, "AwaitingDownload({})", reason), - Self::Downloading(req_id) => write!(f, "Downloading({:?})", req_id), - Self::AwaitingProcess(d) => write!(f, "AwaitingProcess({:?})", d.peer_group), - Self::Processing(d) => write!(f, "Processing({:?})", d.peer_group), - Self::Processed(reason) => write!(f, "Processed({})", reason), - } - } -} - -fn fmt_peer_set_as_len( - peer_set: &Arc>>, - f: &mut std::fmt::Formatter, -) -> Result<(), std::fmt::Error> { - write!(f, "{}", peer_set.read().len()) -} diff --git a/beacon_node/network/src/sync/forward_sync.rs b/beacon_node/network/src/sync/forward_sync.rs new file mode 100644 index 00000000000..56b2ebc1a77 --- /dev/null +++ b/beacon_node/network/src/sync/forward_sync.rs @@ -0,0 +1,1355 @@ +use super::network_context::{ + DownloadRequest, DownloadRequestError, RpcRequestSendError, RpcResponseError, + SyncNetworkContext, +}; +use crate::metrics; +use crate::sync::network_context::{BatchPeers, LookupVerifyError, RpcResponseResult}; +use crate::sync::sync_block::{Error as SyncBlockError, OkToImport, SyncBlock, SyncBlockResult}; +use crate::sync::BatchProcessResult; +use beacon_chain::block_verification_types::RpcBlock; +use beacon_chain::BeaconChainTypes; +use 
lighthouse_network::service::api_types::{ + BlocksByRootRequestId, BlocksByRootRequester, ComponentsByRootRequestId, ForwardSyncLookupId, + HeaderChainId, HeaderLookupId, Id, RangeRequestId, +}; +use lighthouse_network::PeerId; +use std::collections::{HashMap, HashSet, VecDeque}; +use strum::IntoStaticStr; +use tracing::{debug, error}; +use types::{BeaconBlockHeader, EthSpec, Hash256, Slot}; + +const MAX_LOOKUP_COUNT: usize = 1_000_000; +const PRUNE_COUNT: usize = 100_000; +const BLOCK_BUFFER_SIZE: usize = 4; + +#[derive(Debug, Copy, Clone)] +enum BlockPointer { + HeaderChain(HeaderChainId), + SyncBlock(Hash256), +} + +/// Roots are added to ForwardSync via: +/// 1. Peers referencing an unknown block root +/// 2. When fetching the next ancestor of a chain, the parent is unknown +/// +/// Invariants: +/// - Each block references a single chain +/// - Each block root exists in exactly one `Chain::block_roots` list +/// - A block can change what chain it belongs to at any moment, including during an active request +/// +/// Goals +/// - Download multiple blocks at once to increase download speed +/// +/// +/// ## da_checker oracle +/// +/// TODO(tree-sync): re-implement if necessary +/// +/// +/// ## duplicate_cache with gossip blocks +/// +/// Gossip may receive and process the same block that ForwardSync attempts to process. +/// +/// a. Gossip receives block X and sends to process +/// b. ForwardSync downloads block X +/// c. ForwardSync sends block X for process +/// +/// Consider the order of events +/// - [a,b,c]: the gossip block is inserted in the `duplicate_cache` and the RPC block is queued. +/// Step b could be skipped, but we accept the inneficiency for simplicity. +/// - [b,a,c]: the RPC block is downloaded, gossip block into `duplicate_cache` and RPC block queued +/// - [b,c,a]: the RPC block is inserted in the `duplicate_cache` and the gossip block is queued +/// +/// ## Pruning +/// +/// So chose to not explicitly implement pruning for forward sync. 
Chains can be pruned by: +/// +/// 1. Checking if the conflict with finality once finality advances: If this happens once we +/// attempt to import the first block of the chain we'll get an unknown parent error. The chain +/// will fail and be dropped = so this pruning happens by default. +/// 2. If their blocks are imported through another source: If this happens when we attempt to +/// process the block we'll get a duplicate_cache hit or a block already known error. In either +/// case the processing result for the block with be an Ok, and we'll move to the next block. +/// +/// +pub struct ForwardSync { + block_to_tip: HashMap, + header_chains: HashMap, + syncing_blocks: HashMap>, +} + +/// Chain of consecutive blocks that are imported by the same set of peers +struct Chain { + status: Status, +} + +type PendingBlock = (Hash256, Slot); + +#[derive(Copy, Clone, Debug)] +pub struct PeerStatusSummary { + pub max_slot: Slot, + pub min_slot: Slot, +} + +struct HeaderChain { + id: HeaderChainId, + /// Headers descendant of `next_header_request.block_root` that are already downloaded. + /// Does not include `next_header_request.block_root`. + /// Sorting: tip first, oldest ancestor last + block_roots: VecDeque, + status: HeaderChainStatus, + /// Peers that claim to have imported the oldest ancestor of this chain + peers: HashMap, +} + +enum HeaderChainStatus { + Backfill { + /// Oldest ancestor block root of this Chain. + next_request: HeaderRequest, + }, + WaitingParent { + /// Parent root of the last block_root in `block_roots` + parent_root: Hash256, + /// True if the oldest ancestor can start downloading + ready_to_sync: bool, + }, +} + +#[allow(clippy::large_enum_variant)] +enum Status { + /// Recursively fetch headers until discovering a parent_root that is known. Its list of + /// block_roots can grow by appending ancestors. 
+ /// - Transition to `WaitingParentChain` if the parent is known but not imported + /// - Transition to `ForwardSync` if the parent is imported + BackfillHeaders { + /// Headers descendant of `next_header_request.block_root` that are already downloaded. + /// Does not include `next_header_request.block_root`. + /// Sorting: tip first, oldest ancestor last + block_roots: Vec, + /// Oldest ancestor block root of this Chain. + next_header_request: HeaderRequest, + }, + /// Waits for a parent block in a different chain to be imported. Its block_root list does not + /// change. + /// - Transitions to `ForwardSync` once `parent_root` is imported. + WaitingParentChain { + /// Parent root of the last block_root in `block_roots` + parent_root: Hash256, + /// Sorting: tip first, oldest ancestor last + block_roots: Vec, + /// True if the oldest ancestor can start downloading + ready_to_sync: bool, + }, + /// Download and process block_roots from oldest ancestor to tip. Its list of block_roots does + /// not grow, only removed block roots once processed. + /// + /// Note: Keeping block_roots and syncing_blocks in separate Vecs instead of a single Vec with + /// an enum shows the following invariants: + /// - The set of PendingBlocks is consecutive + /// - The set of SyncBlocks is consecutive + /// - The parent of the last item in `block_roots` is the first item in `syncing_blocks` + ForwardSync { + block: SyncBlock, + /// The parent root of `block`. Note that it may point to a block that is already imported, + /// and is not in the sync headers DAG. 
+ parent_root: Hash256, + }, +} + +/// Tracks a request to download a BeaconBlockHeader by block root +struct HeaderRequest { + id: Option, + chain_id: HeaderChainId, + block_root: Hash256, + failed_peers: HashSet, + request: DownloadRequest, +} + +impl HeaderRequest { + fn new(block_root: Hash256, chain_id: HeaderChainId) -> Self { + Self { + id: None, + chain_id, + block_root, + failed_peers: <_>::default(), + request: DownloadRequest::new(), + } + } + + fn empty() -> Self { + Self::new(Hash256::ZERO, HeaderChainId(0)) + } + + fn continue_request( + &mut self, + peers: I, + cx: &mut SyncNetworkContext, + ) -> Result<(), Error> + where + T: BeaconChainTypes, + I: Iterator, + { + if self.request.is_awaiting_download() { + let Some(peer) = peers + .map(|peer| { + ( + // If contains -> 1 (order after), not contains -> 0 (order first) + self.failed_peers.contains(peer), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::random::(), + peer, + ) + }) + .min() + .map(|(_, _, peer)| *peer) + else { + // When a peer disconnects and is removed from the SyncingChain peer set, if the set + // reaches zero the lookup is removed + return Err(Error::InternalError("No peers".to_owned())); + }; + + let id = self.id.get_or_insert_with(|| cx.next_id()).clone(); + + // TODO(tree-sync): send headers_by_root request if available + let req_id = cx.send_blocks_by_root_request( + peer, + self.block_root, + BlocksByRootRequester::Header(HeaderLookupId { + id, + chain_id: self.chain_id, + }), + )?; + + self.request.on_download_start(req_id)?; + } + Ok(()) + } +} + +impl HeaderChain { + fn new( + initial_block_root: Hash256, + id: HeaderChainId, + initial_peers: &[(PeerId, PeerStatusSummary)], + ) -> Self { + Self { + id, + block_roots: <_>::default(), + status: HeaderChainStatus::Backfill { + next_request: HeaderRequest::new(initial_block_root, id), + }, + peers: HashMap::from_iter(initial_peers.iter().copied()), + } + } + + /// Continues the header or blocks 
requests of this chain + fn continue_requests( + &mut self, + cx: &mut SyncNetworkContext, + ) -> Result<(), Error> { + match &mut self.status { + HeaderChainStatus::Backfill { next_request } => { + Ok(next_request.continue_request(self.peers.keys(), cx)?) + } + _ => Ok(()), + } + } + + fn add_ancestor(&mut self, header: BeaconBlockHeader) -> Result<(), InternalError> { + match &mut self.status { + HeaderChainStatus::Backfill { next_request, .. } => { + self.block_roots + .push_back((next_request.block_root, header.slot)); + *next_request = HeaderRequest::new(header.parent_root, self.id); + Ok(()) + } + _ => Err(InternalError( + "Expected lookup to be in DownloadingHeader state".to_owned(), + )), + } + } + + fn extend_with_children(&mut self, mut child_chain: Self) { + while let Some(block) = child_chain.block_roots.pop_back() { + // pop_back gives oldest first, pushing to front restores tip-first + self.block_roots.push_front(block); + } + + // All the peers of the child chain have imported the ancestors + self.peers.extend(child_chain.peers.drain()); + } + + fn to_waiting_parent( + &mut self, + parent_root: Hash256, + ready_to_sync: bool, + ) -> Result<(), Error> { + self.status = HeaderChainStatus::WaitingParent { + parent_root, + ready_to_sync, + }; + Ok(()) + } + + fn parent_root(&self) -> Option { + match &self.status { + HeaderChainStatus::Backfill { .. } => None, + HeaderChainStatus::WaitingParent { parent_root, .. 
} => Some(*parent_root), + } + } + + /// Returns true if the peer has been added to the map + fn add_peer(&mut self, peer: PeerId, status: PeerStatusSummary) -> bool { + let contains_key = self.peers.contains_key(&peer); + self.peers.insert(peer, status); + !contains_key + } + + /// Returns true if a peer was removed from the map + fn remove_peer(&mut self, peer: &PeerId) -> bool { + self.peers.remove(peer).is_some() + } + + fn pop_oldest_ancestor(&mut self) -> Option { + match &mut self.status { + HeaderChainStatus::WaitingParent { + parent_root, + ready_to_sync, + } => { + if !*ready_to_sync { + return None; + } + if let Some((block_root, block_slot)) = self.block_roots.pop_back() { + *parent_root = block_root; + Some((block_root, block_slot)) + } else { + None + } + } + _ => None, + } + } + + fn peers_of_block_slot(&self, block_slot: Slot) -> Vec { + self.peers + .iter() + .filter(|(_, status)| block_slot >= status.min_slot && block_slot < status.max_slot) + .map(|(peer, _)| *peer) + .collect() + } + + /// Returns true if this chain transitioned into ready to sync + fn on_parent_imported(&mut self, imported_block_root: &Hash256) -> bool { + match &mut self.status { + HeaderChainStatus::WaitingParent { + parent_root, + ready_to_sync, + } => { + if parent_root == imported_block_root && !*ready_to_sync { + *ready_to_sync = true; + true + } else { + false + } + } + _ => false, + } + } + + fn block_count(&self) -> usize { + self.block_roots.len() + } + + fn min_slot(&self) -> Option { + self.block_roots.back().map(|b| b.1) + } + + fn max_slot(&self) -> Option { + self.block_roots.front().map(|b| b.1) + } + + fn peer_count(&self) -> usize { + self.peers.len() + } +} + +// TODO(tree-sync): Re-add the reprocessing cache, so we don't process twice a block that we got +// through gossip and sync. 
+ +#[derive(Debug, IntoStaticStr)] +pub enum Error { + /// Unexpected and unrecoverable error + InternalError(String), + /// Expected and unrecoverable error + TooManyErrors(String), + /// Block is not descendant of the finalized checkpoint + BlockConflictsWithFinality(String), +} + +/// Unexpected and unrecoverable error +#[derive(Debug)] +struct InternalError(String); + +impl From for Error { + fn from(e: InternalError) -> Self { + Self::InternalError(e.0) + } +} + +impl From for Error { + fn from(e: DownloadRequestError) -> Self { + match e { + DownloadRequestError::InternalError(e) => Self::InternalError(e), + DownloadRequestError::TooManyErrors(e) => Self::TooManyErrors(format!("{e:?}")), + } + } +} + +impl From for Error { + fn from(e: RpcRequestSendError) -> Self { + match e { + RpcRequestSendError::InternalError(e) => Self::InternalError(e), + // TODO(tree-sync): Should we allow lookups to have zero peers + RpcRequestSendError::NoPeers => Self::InternalError("No peers".to_string()), + } + } +} + +impl From for Error { + fn from(e: SyncBlockError) -> Self { + match e { + SyncBlockError::InternalError(e) => Self::InternalError(e), + SyncBlockError::TooManyErrors(e) => Self::TooManyErrors(e), + } + } +} + +pub(crate) enum SyncState { + Synced, + Syncing { max_slot: Slot }, +} + +impl ForwardSync { + pub fn new() -> Self { + Self { + block_to_tip: <_>::default(), + header_chains: <_>::default(), + syncing_blocks: <_>::default(), + } + } + + /// Returns the peers that claim to have imported a specific block_root + #[cfg(test)] + pub fn block_peers(&self, block_root: &Hash256) -> Result>, String> { + let Some(block_ptr) = self.block_to_tip.get(block_root) else { + return Ok(None); + }; + match block_ptr { + BlockPointer::HeaderChain(id) => Err(format!("Block {id} is a header chain")), + BlockPointer::SyncBlock(id) => Ok(Some( + self.syncing_blocks + .get(id) + .ok_or(format!("Unknown chain {id}"))? 
+ .get_peers(), + )), + } + } + + /// Get all blocks that forward sync intends to sync + #[cfg(test)] + pub fn get_lookups(&self) -> Vec { + self.block_to_tip.keys().copied().collect() + } + + /// Total count of blocks that forward sync intends to sync + pub fn block_count(&self) -> usize { + self.block_to_tip.len() + } + + /// Returns the highest known slot that we are attempting to sync + pub fn max_slot_to_sync(&self) -> Option { + // TODO(tree-sync): weak metric, who have a better heuristic for sync? Now that lookups + // count here + self.header_chains + .values() + .filter_map(|chain| chain.max_slot()) + .max() + } + + /// Return all processing ids of syncing blocks + #[cfg(test)] + pub fn get_processing_ids(&mut self) -> Vec { + let mut ids = vec![]; + for block in self.syncing_blocks.values() { + if block.is_processing() { + if let RangeRequestId::ForwardSync(id) = block.id() { + ids.push(id); + } + } + } + ids + } + + pub fn pause(&mut self) { + // TODO(tree-sync): consider if we really need a pausing mechanism for when EL offline + } + + /// Remove a disconnected peer from all chains + pub fn remove_peer(&mut self, peer: PeerId) { + let chains_to_remove = self + .header_chains + .iter_mut() + .filter_map(|(chain_id, chain)| { + chain.remove_peer(&peer); + // TODO(tree-sync): research if it actually useful to keep chains with zero peers for + // some time. + if chain.peer_count() == 0 { + Some((*chain_id).into()) + } else { + None + } + }) + .collect::>(); + + for (id, block) in self.syncing_blocks.iter_mut() { + block.remove_peer(&peer); + } + + if !chains_to_remove.is_empty() { + let chain_to_children = self.compute_children(); + for chain_id in chains_to_remove { + self.drop_chain_and_children(chain_id, &chain_to_children, "no_peers"); + } + } + } + + /// A set of peers claim to have imported a block_root. 
Create a new lookup for it or add them + /// to an existing one + its ancestors + pub fn search( + &mut self, + block_root: Hash256, + peers: &[(PeerId, PeerStatusSummary)], + cx: &mut SyncNetworkContext, + ) -> Result<(), Error> { + if let Some(_) = self.block_to_tip.get(&block_root) { + debug!(block_root = ?block_root, ?peers, "Adding peer to existing header lookup and ancestors"); + self.add_peers_recursively(block_root, peers)?; + } else { + if self.block_to_tip.len() > MAX_LOOKUP_COUNT { + self.prune_least_popular_lookups(); + } + + let chain_id = HeaderChainId(cx.next_id()); + match peers { + [peer] => debug!(?block_root, %chain_id, ?peer, "Creating new header lookup"), + _ => debug!( + ?block_root, + %chain_id, + peers = peers.len(), + "Creating new header lookup" + ), + } + + let mut chain = HeaderChain::new(block_root, chain_id, peers); + chain.continue_requests(cx)?; + // Don't insert until first request is successful + self.header_chains.insert(chain_id, chain); + self.block_to_tip + .insert(block_root, BlockPointer::HeaderChain(chain_id)); + metrics::inc_counter(&metrics::SYNC_CHAINS_ADDED); + } + Ok(()) + } + + /// Handle the result of a header download. + pub fn on_headers_download_result( + &mut self, + req_id: BlocksByRootRequestId, + id: HeaderLookupId, + response: RpcResponseResult>, + _peer_id: PeerId, + cx: &mut SyncNetworkContext, + ) { + // Invoke a closure to use the ? operator and handle the result consistenlty + let result: Result<(), Error> = (|| { + let chain_id = id.chain_id; + let chain = self + .header_chains + .get_mut(&chain_id) + .ok_or(InternalError(format!("Request for unknown chain {id}")))?; + + let response = response.and_then(|(blocks, timestamp)| { + if blocks.is_empty() { + Err(RpcResponseError::VerifyError( + LookupVerifyError::NotEnoughResponsesReturned { actual: 0 }, + )) + } else { + Ok((blocks, timestamp)) + } + }); + + let header_request = match &mut chain.status { + HeaderChainStatus::Backfill { next_request, .. 
} => next_request, + HeaderChainStatus::WaitingParent { .. } => { + debug!(%req_id, %chain_id, "Unexpected request for header chain waiting parent"); + return Ok(()); + } + }; + + // TODO(tree-sync): add some check to make sure that distinct lookups for the same + // block root don't mess with each other. That check must happen before triggering + // errors for bad state + + match response { + Ok((headers, received)) => { + header_request.request.on_download_success( + req_id, + PeerId::random(), + BeaconBlockHeader::empty(), + received, + )?; + debug!(%req_id, %chain_id, "Forward sync block header downloaded success"); + + // TODO(tree-sync): should check if the block is descendant of finalized + // TODO(tree-sync): on finalization or every interval we should drop branches that + // conflict with finality + let finalized_checkpoint = cx.chain.head().finalized_checkpoint(); + + for header in headers { + let parent_root = header.parent_root; + let block_root = header.canonical_root(); + chain.add_ancestor(header.clone())?; + + metrics::inc_counter(&metrics::SYNC_HEADERS_DOWNLOADED); + + // Once we discover the parent_root of this block three things can happen + // 1. The parent root is a known block -> stop + // 2. We conflicts with finality -> reject + // 3. The parent root is unknown -> continue search + + // TODO(tree-sync): check that the slots are decreasing, so we don't end up in + // an infinite loop. But note that the wrong block will be the descendant. 
+ // - We get header A with parent B and slot 10 + // - We get header B with parent C and slot 11 + // - That makes header A invalid + + if header.slot + <= finalized_checkpoint + .epoch + .start_slot(T::EthSpec::slots_per_epoch()) + && block_root != finalized_checkpoint.root + { + return Err(Error::BlockConflictsWithFinality(format!( + "Block {:?} {} conflicts with finalized checkpoint {:?}", + block_root, header.slot, finalized_checkpoint + ))); + } + + if cx.chain.block_is_known_to_fork_choice(&parent_root) { + // Parent is imported, we can forward sync this chain + // Stop search we reached a known block + chain.to_waiting_parent(parent_root, true)?; + debug!(%chain_id, ?parent_root, block_count = chain.block_count(), "Forward sync chain reached imported block"); + // Trigger potential foward sync for this chain + self.continue_requests(cx); + break; + } else if let Some(parent_chain_ptr) = + self.block_to_tip.get(&parent_root).copied() + { + // Parent is part of another chain, stop search + // Stop search we reached a known block + debug!(%chain_id, ?parent_chain_ptr, ?parent_root, "Forward sync chain reached known block"); + + // If this is the only child of `parent_root` we can insert the block + // in the parent chain, and "merge" them. This is the common case in + // single fork chains. The main chain keeps producing new blocks while + // we backfill headers. 
+ if match self.compute_children().get(&parent_root) { + Some(children) => children.is_empty(), + None => false, + } { + if let BlockPointer::HeaderChain(parent_chain_id) = parent_chain_ptr + { + // Add new tip to `parent_chain` + let chain = self.header_chains.remove(&chain_id).ok_or( + InternalError(format!("missing chain {chain_id}")), + )?; + + let parent_chain = self + .header_chains + .get_mut(&parent_chain_id) + .ok_or(InternalError(format!( + "missing chain {parent_chain_id}" + )))?; + + for (block_root, _) in &chain.block_roots { + self.block_to_tip.insert(*block_root, parent_chain_ptr); + } + parent_chain.extend_with_children(chain); + } + } else { + let chain = self + .header_chains + .get_mut(&chain_id) + .ok_or(InternalError(format!("missing chain {chain_id}")))?; + + // `parent_root` has multiple children, keep `chain` as a fork and + // mark it awaiting parent + chain.to_waiting_parent(parent_root, false)?; + } + + // The rest of headers of this response are known, ignore + break; + } else { + debug!(%chain_id, ?parent_root, "Forward sync chain continues fetching ancestor"); + // Add to the block_to_tip mapping to respect the invariant "Each block + // root exists in exactly one `Chain::block_roots` list". + self.block_to_tip.insert(parent_root, chain_id.into()); + // Since the block already points to `chain` we don't need to add peers. + // Just trigger header download for this new root. + } + } + } + Err(e) => { + // Request errors are logged in `SyncNetworkContext::on_rpc_response_result` + header_request.request.on_download_error(req_id, Some(e))?; + // Continue this request to potentially resend the header request + } + } + self.continue_requests(cx); + Ok(()) + })(); + + if let Err(e) = result { + self.handle_error(id.chain_id.into(), e); + } + } + + /// Handle the result of a block download. 
+ pub fn on_block_download_result( + &mut self, + req_id: ComponentsByRootRequestId, + id: ForwardSyncLookupId, + result: Result<(RpcBlock, BatchPeers), RpcResponseError>, + cx: &mut SyncNetworkContext, + ) { + let block_root = id.block_root; + let Some(block) = self.syncing_blocks.get_mut(&block_root) else { + error!(?block_root, "Unknown forward sync block"); + return; + }; + + let result: Result<(), Error> = (|| { + // let block = self.block_request(req_id.requester)?; + debug!(%id, ?block_root, result = render_result(&result), "Forward sync block download result"); + block.on_download_result(req_id, result, cx)?; + block.continue_request(cx, OkToImport::IfParentImported)?; + Ok(()) + })(); + + if let Err(e) = result { + self.handle_error(block_root.into(), e); + // Some syncing blocks may have been dropped so there's space for new chains to sync + self.continue_requests(cx); + } + } + + /// Handle the result of a block processing. + /// We known this block's parent is imported, so we don't explicitly handle a ParentUnknown error. + pub fn on_block_process_result( + &mut self, + id: ForwardSyncLookupId, + result: BatchProcessResult, + cx: &mut SyncNetworkContext, + ) { + let result: Result<(), Error> = (|| { + let block_root = id.block_root; + let Some(block) = self.syncing_blocks.get_mut(&block_root) else { + error!(?block_root, "Unknown forward sync block"); + return Ok(()); + }; + + debug!(%id, ?block_root, ?result, "Forward sync block process result"); + + // TODO(tree-sync): use id to ensure results for other roots don't mix up + match block.on_process_result(result, cx)? { + SyncBlockResult::Done { .. 
} => { + metrics::inc_counter(&metrics::SYNC_BLOCKS_PROCESSED); + self.block_to_tip.remove(&block_root); + // ForwardSync chains have a single block, remove them on Done + self.syncing_blocks.remove(&block_root); + debug!(%id, ?block_root, "Removed completed forward sync block"); + metrics::inc_counter_vec(&metrics::SYNC_CHAINS_REMOVED, &["completed"]); + + // Find all chains that are awaiting this block to process and continue them + for (chain_id, other_chain) in self.header_chains.iter_mut() { + if other_chain.on_parent_imported(&id.block_root) { + debug!( + %chain_id, + parent_root = ?id.block_root, + "Forward sync marked chain as ready to sync" + ); + } + } + self.continue_requests(cx); + } + // Not complete yet, continue requests + SyncBlockResult::Wait => { + block.continue_request(cx, OkToImport::IfParentImported)?; + } + } + Ok(()) + })(); + + if let Err(e) = result { + self.handle_error(id.block_root.into(), e); + // Some syncing blocks may have been dropped so there's space for new chains to sync + self.continue_requests(cx); + } + } + + pub fn prune(&mut self) { + // TODO(tree-sync): should prune? Based on finality and expired head chains + } + + /// Common handler for any `forward_sync::Error`. For simplicity it drops the chain that includes + /// the block and all of its descendants. 
+ fn handle_error(&mut self, chain_id: BlockPointer, error: Error) { + debug!(?error, ?chain_id, "Dropping forward sync block lookup"); + + metrics::inc_counter_vec(&metrics::SYNC_CHAIN_ERROR_COUNT, &[(&error).into()]); + + let block_to_children = self.compute_children(); + // TODO(tree-sync): logging `block_to_children` for debugging + debug!(%chain_id, ?chain_id, ?error, ?block_to_children, "Dropping forward sync chain on error"); + self.drop_chain_and_children(chain_id, &block_to_children, (&error).into()); + + match error { + Error::InternalError(_) | Error::TooManyErrors(_) => { + // + } + Error::BlockConflictsWithFinality(_e) => { + // TODO(tree-sync): penalize peers of this lookups + // TODO(tree-sync): add blocks to a failed cache to prevent re-sync + } + } + } + + /// Marks blocks ready for download as syncing + /// Should be called anytime: + /// - A new block is imported to fork-choice + /// - A block in the header tree is advanced to Syncing + /// - A new header is downloaded with a parent that is imported or syncing + fn trigger_forward_sync(&mut self, cx: &mut SyncNetworkContext) { + // We want to download and import blocks whose parent is imported in our fork-choice. Also + // to buffer we want to download children of blocks that are awaiting import. + // + // We may want to avoid 1M calls into fork-choice to check if a block is imported. We only + // need to work of roots. Once a root is processed we have re-compute roots, or track + // children. + + // TODO(tree-sync): don't build on demand, cache roots somewhere + + if self.syncing_blocks.len() > BLOCK_BUFFER_SIZE { + return; + } + + // A chain can be in two states: + // - Active backfill + // - Oldest ancestor known + + // Find the block range with most peers and highest slot. This is the block + // to be used as tip of the chain of blocks to fetch. 
+ let mut chains_by_peer_count = self + .header_chains + .iter_mut() + .filter_map(|(_, chain)| { + if chain.parent_root().is_some() { + Some((chain.peers.len(), chain)) + } else { + None + } + }) + .collect::>(); + chains_by_peer_count.sort_by_key(|(peer_count, _)| *peer_count); + + let mut blocks_to_add = vec![]; + + 'o: for (chain_id, chain) in chains_by_peer_count { + while let Some((block_root, block_slot)) = chain.pop_oldest_ancestor() { + let block_peers = chain.peers_of_block_slot(block_slot); + blocks_to_add.push((block_root, block_slot, block_peers)); + debug!(%chain_id, ?block_root, %block_slot, "Transitioned block to forward sync"); + if blocks_to_add.len() + self.syncing_blocks.len() > BLOCK_BUFFER_SIZE { + break 'o; + } + } + } + + let should_continue_requests = !blocks_to_add.is_empty(); + for (block_root, block_slot, block_peers) in blocks_to_add { + // Need to compute the peer of the block here since header chains only track peers + // that have imported the oldest ancestor. 
+ + let block = SyncBlock::new( + // Reuse the request ID of the header for better traceability + RangeRequestId::ForwardSync(ForwardSyncLookupId { + id: cx.next_id(), + block_root, + }), + block_root, + block_slot, + &block_peers, + ); + // Update all block references to the new chain + self.block_to_tip + .insert(block_root, BlockPointer::SyncBlock(block_root)); + self.syncing_blocks.insert(block_root, block); + } + + // Prune chains that become empty after pop_next_block_to_sync + self.header_chains + .retain(|_, chain| !chain.block_roots.is_empty()); + + if should_continue_requests { + self.continue_requests(cx); + } + } + + fn continue_requests(&mut self, cx: &mut SyncNetworkContext) { + // TODO(tree-sync): optimize this call to maybe not do it everytime + self.trigger_forward_sync(cx); + + let mut chains_to_drop = vec![]; + + for (chain_id, block) in self.syncing_blocks.iter_mut() { + if let Err(e) = block.continue_request(cx, OkToImport::IfParentImported) { + // TODO(tree-sync): should log error? + chains_to_drop.push(((*chain_id).into(), e.into())); + } + } + + for (chain_id, chain) in self.header_chains.iter_mut() { + if let Err(e) = chain.continue_requests(cx) { + // TODO(tree-sync): should log error? + chains_to_drop.push(((*chain_id).into(), e)); + } + } + + if !chains_to_drop.is_empty() { + let chain_to_children = self.compute_children(); + for (chain_id, e) in chains_to_drop { + self.drop_chain_and_children(chain_id, &chain_to_children, e.into()); + } + } + } + + fn add_peers_recursively( + &mut self, + block_root: Hash256, + peers: &[(PeerId, PeerStatusSummary)], + ) -> Result<(), Error> { + let Some(id) = self.block_to_tip.get(&block_root) else { + return Ok(()); + }; + match id { + BlockPointer::HeaderChain(chain_id) => { + // The peer claims to have imported some block in this header chain. Header + // chain requests always the oldest ancestor. So we can guarantee that this peer + // has imported the oldest ancestor of the chain. 
+ let chain = self + .header_chains + .get_mut(chain_id) + .ok_or(InternalError(format!("Unknown chain {chain_id}")))?; + for (peer, status) in peers { + chain.add_peer(*peer, *status); + } + if let Some(parent_root) = chain.parent_root() { + self.add_peers_recursively(parent_root, peers)?; + } + Ok(()) + } + BlockPointer::SyncBlock(id) => { + let block = self + .syncing_blocks + .get_mut(id) + .ok_or(InternalError(format!("Unknown syncing block {id:?}")))?; + for (peer, _) in peers { + block.add_peer(*peer); + } + if let Some(parent_root) = block.parent_root() { + self.add_peers_recursively(parent_root, peers)?; + } + Ok(()) + } + } + } + + /// Drop chain if it exists and all its children + fn drop_chain_and_children( + &mut self, + initial_chain_id: BlockPointer, + chain_to_children: &HashMap>, + reason: &'static str, + ) { + let mut queue: VecDeque = VecDeque::from([initial_chain_id]); + + while let Some(block_ptr) = queue.pop_front() { + // Remove the node itself. + // Only continue if the node was removed. This prevents infinite loops even if + // `chain_to_children` items reference themselves + match block_ptr { + BlockPointer::HeaderChain(chain_id) => { + if let Some(chain) = self.header_chains.remove(&chain_id) { + debug!(%chain_id, %initial_chain_id, reason, "Dropping forward sync chain"); + metrics::inc_counter_vec(&metrics::SYNC_CHAINS_REMOVED, &[reason]); + + for (block_root, _) in chain.block_roots { + self.block_to_tip.remove(&block_root); + debug!(?block_root, %chain_id, %initial_chain_id, reason, "Dropping forward sync block"); + metrics::inc_counter(&metrics::SYNC_FORWARD_BLOCKS_DROPPED); + + // Only remove children if the node still existed + // Push its children‚ if any‚ onto the work list. 
+ if let Some(children) = chain_to_children.get(&block_root) { + queue.extend(children.iter().cloned()); + } + } + } + } + BlockPointer::SyncBlock(id) => { + if let Some(block) = self.syncing_blocks.remove(&id) { + if let Some(children) = chain_to_children.get(&id) { + queue.extend(children.iter().cloned()); + } + } + } + } + } + } + + /// Compute the map of block_roots -> chain IDs + fn compute_children(&self) -> HashMap> { + let mut parent_to_children = HashMap::>::new(); + for (chain_id, chain) in self.header_chains.iter() { + if let Some(parent_root) = chain.parent_root() { + parent_to_children + .entry(parent_root) + .or_default() + .push(BlockPointer::HeaderChain(*chain_id)); + } + } + for (chain_id, chain) in self.syncing_blocks.iter() { + if let Some(parent_root) = chain.parent_root() { + parent_to_children + .entry(parent_root) + .or_default() + .push(BlockPointer::SyncBlock(*chain_id)); + } + } + parent_to_children + } + + /// Drop lookups with least amount of peers and slot until we pruned PRUNE_COUNT lookups + fn prune_least_popular_lookups(&mut self) { + let mut chains = self + .header_chains + .iter() + // TODO: Prune only lookups that are not syncing and we know the header + .map(|(chain_id, chain)| (chain.peer_count(), *chain_id)) + .collect::>(); + chains.sort_unstable(); + + let chain_to_children = self.compute_children(); + for (_, chain_id) in chains { + self.drop_chain_and_children(chain_id.into(), &chain_to_children, "too_many_blocks"); + if self.block_to_tip.len() < MAX_LOOKUP_COUNT - PRUNE_COUNT { + break; + } + } + } + + pub fn register_metrics(&self) { + let (min_slot, max_slot) = self.header_chains.values().fold( + (None::, None::), + |(gmin, gmax), chain| { + let gmin = match (gmin, chain.min_slot()) { + (Some(a), Some(b)) => Some(a.min(b)), + (None, some @ Some(_)) => some, // first non-None wins + (x, None) => x, + }; + + let gmax = match (gmax, chain.max_slot()) { + (Some(a), Some(b)) => Some(a.max(b)), + (None, some @ Some(_)) => 
some, + (x, None) => x, + }; + + (gmin, gmax) + }, + ); + + if let (Some(min_slot), Some(max_slot)) = (min_slot, max_slot) { + metrics::set_gauge(&metrics::SYNC_HEADER_MIN_SLOT, min_slot.as_u64() as i64); + metrics::set_gauge(&metrics::SYNC_HEADER_MAX_SLOT, max_slot.as_u64() as i64); + } + + metrics::set_gauge(&metrics::SYNC_HEADERS_COUNT, self.block_to_tip.len() as i64); + metrics::set_gauge( + &metrics::SYNC_HEADER_CHAINS_COUNT, + self.header_chains.len() as i64, + ); + metrics::set_gauge( + &metrics::SYNC_FORWARD_SYNC_BLOCKS_COUNT, + self.syncing_blocks.len() as i64, + ); + + for (chain_id, chain) in &self.header_chains { + let status = match &chain.status { + HeaderChainStatus::Backfill { next_request, .. } => { + format!( + "BackfillHeaders block_roots {:?} next_header_request {:?} {} {}", + chain.block_roots, + next_request.id, + next_request.block_root, + next_request.request.status_str() + ) + } + HeaderChainStatus::WaitingParent { + parent_root, + ready_to_sync, + } => { + format!("WaitingParentChain ready_to_sync {ready_to_sync} parent_root {parent_root:?} block_roots {:?}",chain.block_roots) + } + }; + + let recursive_parent_chain = (|| { + let mut next_chain_id = *chain_id; + loop { + let Some(next_chain) = self.header_chains.get(&next_chain_id) else { + return Err(format!("Unknown chain {next_chain_id}")); + }; + if let HeaderChainStatus::WaitingParent { parent_root, .. 
} = next_chain.status + { + let Some(parent_ptr_id) = self.block_to_tip.get(&parent_root) else { + return Err(format!("{next_chain_id} Unknown block {parent_root:?}")); + }; + let parent_chain_id = match parent_ptr_id { + BlockPointer::HeaderChain(id) => id, + BlockPointer::SyncBlock(id) => { + return Err(format!("{next_chain_id} unknown/imported")); + } + }; + next_chain_id = *parent_chain_id; + } else if next_chain_id == *chain_id { + return Ok(format!("itself")); + } else { + return Ok(format!("{next_chain_id}")); + } + } + })(); + + debug!(%chain_id, status, ?recursive_parent_chain, "DEBUG chain"); + } + + for (block_root, chain_id) in &self.block_to_tip { + if !match chain_id { + BlockPointer::HeaderChain(id) => self.header_chains.contains_key(id), + BlockPointer::SyncBlock(id) => self.syncing_blocks.contains_key(id), + } { + debug!("DEBUG block {block_root} points to unknown chain {chain_id}"); + } + } + + // Min header + // Highest known header + // Current head + } +} + +impl std::fmt::Display for BlockPointer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::HeaderChain(id) => write!(f, "Header/{id}"), + Self::SyncBlock(id) => write!(f, "Block/{id:?}"), + } + } +} + +impl From for BlockPointer { + fn from(id: HeaderChainId) -> Self { + Self::HeaderChain(id) + } +} + +impl From for BlockPointer { + fn from(id: Hash256) -> Self { + Self::SyncBlock(id) + } +} + +fn render_result(result: &Result) -> String { + match result { + Ok(_) => format!("Ok"), + Err(e) => format!("Err({e:?})"), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use beacon_chain::builder::Witness; + use beacon_chain::eth1_chain::CachingEth1Backend; + use slot_clock::ManualSlotClock; + use store::MemoryStore; + use types::FixedBytesExtended; + use types::MinimalEthSpec as E; + + type T = Witness, E, MemoryStore, MemoryStore>; + + fn to_roots(input: &[u64]) -> Vec { + input.iter().map(to_root).collect() + } + + fn to_root(u: &u64) -> Hash256 
{ + Hash256::from_low_u64_le(*u) + } + + fn from_root(r: &Hash256) -> u64 { + r.to_low_u64_le() + } + + fn to_block(u: &u64) -> PendingBlock { + ( + HeaderLookupId { + id: *u as u32, + block_root: to_root(u), + }, + Slot::new(*u), + ) + } + + fn get_roots(chain: &Chain) -> Vec { + chain.iter_block_roots().map(from_root).collect() + } + + fn test_split_by(input: &[u64], split: u64, roots_new: &[u64], roots_initial: &[u64]) { + let mut initial_chain = { + /// input sorting: tip first, oldest ancestor last + let (last, rest) = input.split_last().unwrap(); + Chain:: { + peers: <_>::default(), + status: Status::BackfillHeaders { + block_roots: rest.iter().map(to_block).collect::>(), + next_header_request: HeaderRequest::new(to_root(&last), 0), + }, + } + }; + let new_chain = initial_chain + .split_by(to_root(&split)) + .expect("error spliting backfill headers"); + + assert_eq!(get_roots(&new_chain), roots_new, "new backfill"); + assert_eq!(get_roots(&initial_chain), roots_initial, "initial backfill"); + + let mut initial_chain = Chain:: { + peers: <_>::default(), + status: Status::WaitingParentChain { + parent_root: to_root(&0), + block_roots: input.iter().map(to_block).collect::>(), + ready_to_sync: false, + }, + }; + let new_chain = initial_chain + .split_by(to_root(&split)) + .expect("error spliting backfill headers"); + + assert_eq!(get_roots(&new_chain), roots_new, "new waiting"); + assert_eq!(get_roots(&initial_chain), roots_initial, "initial waiting"); + assert_eq!( + from_root(&initial_chain.parent_root().unwrap()), + // The tip of the new chain is the parent of the initial chain + *roots_new.first().unwrap(), + "parent_initial" + ); + } + + fn test_merge(left: &[u64], right: &[u64], expected_merged: &[u64]) { + let peers = HashSet::from_iter([PeerId::random()]); + // Left chain has descendant roots of right + let mut left_chain = Chain:: { + peers: peers.clone(), + status: Status::WaitingParentChain { + parent_root: to_root(right.first().unwrap()), + 
block_roots: left.iter().map(to_block).collect::>(), + ready_to_sync: false, + }, + }; + // Right chain has no known parent, so set it to 0xff + let mut right_chain = Chain:: { + peers: peers.clone(), + status: Status::WaitingParentChain { + parent_root: to_root(&0xff), // rand root to not have conflicts + block_roots: right.iter().map(to_block).collect::>(), + ready_to_sync: false, + }, + }; + let mut sync = ForwardSync { + block_to_tip: <_>::default(), + chains: HashMap::from_iter([(HeaderChainId(0), left_chain), (TipId(1), right_chain)]), + }; + sync.merge_chains(); + assert_eq!(sync.chains.len(), 1, "Should merge 2 chains into 1"); + let merged_chain = sync.chains.values().next().unwrap(); + assert_eq!(get_roots(merged_chain), expected_merged, "merged roots"); + } + + #[test] + fn split_by_only_elem_a() { + // input [0,1] sorted by tip first + test_split_by(&[1, 0], 0, &[0], &[1]); + } + + #[test] + fn split_by_only_elem_b() { + test_split_by(&[1, 0], 1, &[1, 0], &[]); + } + + #[test] + fn split_by_first() { + test_split_by(&[3, 2, 1, 0], 0, &[0], &[3, 2, 1]); + } + + #[test] + fn split_by_last() { + test_split_by(&[3, 2, 1, 0], 3, &[3, 2, 1, 0], &[]); + } + + #[test] + fn split_by_middle_a() { + test_split_by(&[3, 2, 1, 0], 1, &[1, 0], &[3, 2]); + } + #[test] + fn split_by_middle_b() { + test_split_by(&[3, 2, 1, 0], 2, &[2, 1, 0], &[3]); + } + #[test] + fn split_by_middle_c() { + test_split_by(&[2, 1, 0], 1, &[1, 0], &[2]); + } + + #[test] + fn merge_left_long() { + test_merge(&[2, 1], &[0], &[2, 1, 0]); + } + + #[test] + fn merge_right_long() { + test_merge(&[2], &[1, 0], &[2, 1, 0]); + } + + #[test] + fn merge_same() { + test_merge(&[3, 2], &[1, 0], &[3, 2, 1, 0]); + } +} diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 13145401e4c..6b75eb506d8 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -33,38 +33,32 @@ //! 
needs to be searched for (i.e if an attestation references an unknown block) this manager can //! search for the block and subsequently search for parents if needed. -use super::backfill_sync::{BackFillSync, ProcessResult, SyncStart}; -use super::block_lookups::BlockLookups; +use super::backfill_sync::BackFillSync; +use super::forward_sync::ForwardSync; use super::network_context::{ - CustodyByRootResult, RangeBlockComponent, RangeRequestId, RpcEvent, SyncNetworkContext, + CustodyRequestResult, RangeBlockComponent, RangeRequestId, RpcEvent, SyncNetworkContext, }; use super::peer_sampling::{Sampling, SamplingConfig, SamplingResult}; -use super::peer_sync_info::{remote_sync_type, PeerSyncType}; -use super::range_sync::{RangeSync, RangeSyncType, EPOCHS_PER_BATCH}; +use super::peer_sync_info::PeerSyncType; use crate::network_beacon_processor::{ ChainSegmentProcessId, NetworkBeaconProcessor, PeerGroupAction, }; use crate::service::NetworkMessage; use crate::status::ToStatusMessage; -use crate::sync::block_lookups::{ - BlobRequestState, BlockComponent, BlockRequestState, CustodyRequestState, DownloadResult, -}; -use crate::sync::network_context::PeerGroup; +use crate::sync::backfill_sync::SyncStart; use beacon_chain::block_verification_types::AsBlock; -use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::{ AvailabilityProcessingStatus, BeaconChain, BeaconChainTypes, BlockError, EngineState, }; use futures::StreamExt; use lighthouse_network::rpc::RPCError; use lighthouse_network::service::api_types::{ - BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, CustodyRequester, - DataColumnsByRangeRequestId, DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, - SamplingId, SamplingRequester, SingleLookupReqId, SyncRequestId, + BlobsByRootRequestId, BlocksByRootRequestId, BlocksByRootRequester, ComponentsByRootRequestId, + CustodyByRootRequestId, DataColumnsByRootRequestId, DataColumnsByRootRequester, + HeadersByRootRequestId, Id, 
SamplingId, SamplingRequester, SyncRequestId, }; use lighthouse_network::types::{NetworkGlobals, SyncState}; -use lighthouse_network::PeerId; -use lighthouse_network::SyncInfo; +use lighthouse_network::{PeerId, SyncInfo}; use logging::crit; use lru_cache::LRUTimeCache; use std::ops::Sub; @@ -73,7 +67,8 @@ use std::time::Duration; use tokio::sync::mpsc; use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use types::{ - BlobSidecar, DataColumnSidecar, EthSpec, ForkContext, Hash256, SignedBeaconBlock, Slot, + BeaconBlockHeader, BlobSidecar, DataColumnSidecar, EthSpec, ForkContext, Hash256, + SignedBeaconBlock, Slot, }; #[cfg(test)] @@ -99,15 +94,6 @@ pub enum SyncMessage { /// A useful peer has been discovered. AddPeer(PeerId, SyncInfo), - /// Force trigger range sync for a set of peers given a head they claim to have imported. Used - /// by block lookup to trigger range sync if a parent chain grows too large. - AddPeersForceRangeSync { - peers: Vec, - head_root: Hash256, - /// Sync lookup may not know the Slot of this head. However this situation is very rare. - head_slot: Option, - }, - /// Peer manager has received a MetaData of a peer with a new or updated CGC value. UpdatedPeerCgc(PeerId), @@ -135,6 +121,13 @@ pub enum SyncMessage { seen_timestamp: Duration, }, + BlockHeader { + id: SyncRequestId, + peer_id: PeerId, + header: Option, + seen_timestamp: Duration, + }, + /// A block with an unknown parent has been received. UnknownParentBlock(PeerId, Arc>, Hash256), @@ -168,12 +161,6 @@ pub enum SyncMessage { result: BatchProcessResult, }, - /// Block processed - BlockComponentProcessed { - process_type: BlockProcessType, - result: BlockProcessingResult, - }, - /// Sample data column verified SampleVerified { id: SamplingId, @@ -187,16 +174,16 @@ pub enum SyncMessage { /// The type of processing specified for a received block. 
#[derive(Debug, Clone)] pub enum BlockProcessType { - SingleBlock { id: Id }, - SingleBlob { id: Id }, + BlocksByRoot { id: Id }, + BlobsByRoot { id: Id }, SingleCustodyColumn(Id), } impl BlockProcessType { pub fn id(&self) -> Id { match self { - BlockProcessType::SingleBlock { id } - | BlockProcessType::SingleBlob { id } + BlockProcessType::BlocksByRoot { id } + | BlockProcessType::BlobsByRoot { id } | BlockProcessType::SingleCustodyColumn(id) => *id, } } @@ -213,17 +200,12 @@ pub enum BlockProcessingResult { #[derive(Debug)] pub enum BatchProcessResult { /// The batch was completed successfully. It carries whether the sent batch contained blocks. - Success { - sent_blocks: usize, - imported_blocks: usize, - }, + Success, /// The batch processing failed. It carries whether the processing imported any block. - FaultyFailure { - imported_blocks: usize, - peer_action: PeerGroupAction, + Failure { + peer_action: Option, error: String, }, - NonFaultyFailure, } /// The primary object for handling and driving all the current syncing logic. It maintains the @@ -240,13 +222,10 @@ pub struct SyncManager { /// A network context to contact the network service. network: SyncNetworkContext, - /// The object handling long-range batch load-balanced syncing. - range_sync: RangeSync, - /// Backfill syncing. backfill_sync: BackFillSync, + forward_sync: ForwardSync, - block_lookups: BlockLookups, /// debounce duplicated `UnknownBlockHashFromAttestation` for the same root peer tuple. A peer /// may forward us thousands of a attestations, each one triggering an individual event. 
Only /// one event is useful, the rest generating log noise and wasted cycles @@ -266,11 +245,6 @@ pub fn spawn( sync_recv: mpsc::UnboundedReceiver>, fork_context: Arc, ) { - assert!( - beacon_chain.spec.max_request_blocks(fork_context.current_fork()) as u64 >= T::EthSpec::slots_per_epoch() * EPOCHS_PER_BATCH, - "Max blocks that can be requested in a single batch greater than max allowed blocks in a single request" - ); - // create an instance of the SyncManager let mut sync_manager = SyncManager::new( beacon_chain, @@ -312,9 +286,8 @@ impl SyncManager { beacon_chain.clone(), fork_context.clone(), ), - range_sync: RangeSync::new(beacon_chain.clone()), + forward_sync: ForwardSync::new(), backfill_sync: BackFillSync::new(beacon_chain.clone(), network_globals), - block_lookups: BlockLookups::new(), notified_unknown_roots: LRUTimeCache::new(Duration::from_secs( NOTIFIED_UNKNOWN_ROOT_EXPIRY_SECONDS, )), @@ -322,47 +295,6 @@ impl SyncManager { } } - #[cfg(test)] - pub(crate) fn active_single_lookups(&self) -> Vec { - self.block_lookups.active_single_lookups() - } - - #[cfg(test)] - pub(crate) fn active_parent_lookups(&self) -> Vec> { - self.block_lookups - .active_parent_lookups() - .iter() - .map(|c| c.chain.clone()) - .collect() - } - - #[cfg(test)] - pub(crate) fn get_range_sync_chains( - &self, - ) -> Result, &'static str> { - self.range_sync.state() - } - - #[cfg(test)] - pub(crate) fn range_sync_state(&self) -> super::range_sync::SyncChainStatus { - self.range_sync.state() - } - - #[cfg(test)] - pub(crate) fn __range_failed_chains(&mut self) -> Vec { - self.range_sync.__failed_chains() - } - - #[cfg(test)] - pub(crate) fn get_failed_chains(&mut self) -> Vec { - self.block_lookups.get_failed_chains() - } - - #[cfg(test)] - pub(crate) fn insert_failed_chain(&mut self, block_root: Hash256) { - self.block_lookups.insert_failed_chain(block_root); - } - #[cfg(test)] pub(crate) fn active_sampling_requests(&self) -> Vec { self.sampling.active_sampling_requests() @@ -377,6 
+309,18 @@ impl SyncManager { self.sampling.get_request_status(block_root, index) } + // Leak the full network context to prevent having to add many cfg(test) methods here + #[cfg(test)] + pub(crate) fn network(&mut self) -> &mut SyncNetworkContext { + &mut self.network + } + + // Leak the full struct to prevent having to add many cfg(test) methods here + #[cfg(test)] + pub(crate) fn forward_sync(&mut self) -> &mut ForwardSync { + &mut self.forward_sync + } + #[cfg(test)] pub(crate) fn update_execution_engine_state(&mut self, state: EngineState) { self.handle_new_execution_engine_state(state); @@ -407,35 +351,30 @@ impl SyncManager { finalized_root: status.finalized_root, }; - let sync_type = remote_sync_type(&local, &remote, &self.chain); + // Handle race condition where peer may disconnect between the peer manager sending the + // AddPeer message and sync handling a subsequent Disconnect message + if !self.network_globals().peers.read().is_connected(&peer_id) { + debug!(%peer_id, "Ignoring AddPeer message for already disconnected peer"); + return; + } - // update the state of the peer. - let is_still_connected = self.update_peer_sync_state(&peer_id, &local, &remote, &sync_type); - if is_still_connected { - match sync_type { - PeerSyncType::Behind => {} // Do nothing - PeerSyncType::Advanced => { - self.range_sync - .add_peer(&mut self.network, local, peer_id, remote); - } - PeerSyncType::FullySynced => { - // Sync considers this peer close enough to the head to not trigger range sync. - // Range sync handles well syncing large ranges of blocks, of a least a few blocks. - // However this peer may be in a fork that we should sync but we have not discovered - // yet. If the head of the peer is unknown, attempt block lookup first. If the - // unknown head turns out to be on a longer fork, it will trigger range sync. - // - // A peer should always be considered `Advanced` if its finalized root is - // unknown and ahead of ours, so we don't check for that root here. 
- // - // TODO: This fork-choice check is potentially duplicated, review code - if !self.chain.block_is_known_to_fork_choice(&remote.head_root) { - self.handle_unknown_block_root(peer_id, remote.head_root); - } - } - } + // Search for any block that is unknown and more recent than finality + // TODO(tree-sync): we could prioritize the finalized_root if it's unknown as a way to + // detect finalized sync + debug!(?remote, ?local, "new peer"); + if !self.chain.block_is_known_to_fork_choice(&remote.head_root) + && remote.head_slot + > status + .finalized_epoch + .start_slot(T::EthSpec::slots_per_epoch()) + { + self.add_peer_with_imported_block_root(peer_id, remote.head_root); } + // TODO(tree-sync): Okay to add all peers to backfill sync? How can we know which have the + // blocks we need? + self.backfill_sync.add_peer(peer_id); + self.update_sync_state(); // Try to make progress on custody requests that are waiting for peers @@ -444,41 +383,19 @@ impl SyncManager { } } - /// Trigger range sync for a set of peers that claim to have imported a head unknown to us. - fn add_peers_force_range_sync( - &mut self, - peers: &[PeerId], - head_root: Hash256, - head_slot: Option, - ) { - let status = self.chain.status_message(); - let local = SyncInfo { - head_slot: status.head_slot, - head_root: status.head_root, - finalized_epoch: status.finalized_epoch, - finalized_root: status.finalized_root, - }; - - let head_slot = head_slot.unwrap_or_else(|| { - debug!( - local_head_slot = %local.head_slot, - ?head_root, - "On add peers force range sync assuming local head_slot" - ); - local.head_slot - }); - - let remote = SyncInfo { - head_slot, - head_root, - // Set finalized to same as local to trigger Head sync - finalized_epoch: local.finalized_epoch, - finalized_root: local.finalized_root, - }; + // Adds a peer to forward sync. Since its possible that a lookup just gained a new peer we + // attempt to continue idle custody by root requests that are waiting for peers. 
+ fn add_peer_with_imported_block_root(&mut self, peer_id: PeerId, block_root: Hash256) { + if let Err(e) = self + .forward_sync + .search(block_root, &[peer_id], &mut self.network) + { + error!("Error adding peer to forward sync {block_root:?} {peer_id} {e:?}"); + } - for peer_id in peers { - self.range_sync - .add_peer(&mut self.network, local.clone(), *peer_id, remote.clone()); + // Try to make progress on custody requests that are waiting for peers + for (id, result) in self.network.continue_custody_by_root_requests() { + self.on_custody_by_root_result(id, result); } } @@ -487,32 +404,23 @@ impl SyncManager { for (id, result) in self.network.continue_custody_by_root_requests() { self.on_custody_by_root_result(id, result); } - - // Attempt to resume range sync too - self.range_sync.resume(&mut self.network); } /// Handles RPC errors related to requests that were emitted from the sync manager. fn inject_error(&mut self, peer_id: PeerId, sync_request_id: SyncRequestId, error: RPCError) { trace!("Sync manager received a failed RPC"); match sync_request_id { - SyncRequestId::SingleBlock { id } => { - self.on_single_block_response(id, peer_id, RpcEvent::RPCError(error)) + SyncRequestId::BlocksByRoot(req_id) => { + self.on_blocks_by_root_response(req_id, peer_id, RpcEvent::RPCError(error)) } - SyncRequestId::SingleBlob { id } => { - self.on_single_blob_response(id, peer_id, RpcEvent::RPCError(error)) + SyncRequestId::BlobsByRoot(req_id) => { + self.on_blobs_by_root_response(req_id, peer_id, RpcEvent::RPCError(error)) } SyncRequestId::DataColumnsByRoot(req_id) => { self.on_data_columns_by_root_response(req_id, peer_id, RpcEvent::RPCError(error)) } - SyncRequestId::BlocksByRange(req_id) => { - self.on_blocks_by_range_response(req_id, peer_id, RpcEvent::RPCError(error)) - } - SyncRequestId::BlobsByRange(req_id) => { - self.on_blobs_by_range_response(req_id, peer_id, RpcEvent::RPCError(error)) - } - SyncRequestId::DataColumnsByRange(req_id) => { - 
self.on_data_columns_by_range_response(req_id, peer_id, RpcEvent::RPCError(error)) + SyncRequestId::HeadersByRoot(req_id) => { + self.on_headers_by_root_response(req_id, peer_id, RpcEvent::RPCError(error)) } } } @@ -530,8 +438,8 @@ impl SyncManager { } // Remove peer from all data structures - self.range_sync.peer_disconnect(&mut self.network, peer_id); - self.block_lookups.peer_disconnected(peer_id); + self.backfill_sync.peer_disconnected(peer_id); + self.forward_sync.remove_peer(*peer_id); // Regardless of the outcome, we update the sync status. self.update_sync_state(); @@ -606,86 +514,73 @@ impl SyncManager { /// - If there is no range sync and no required backfill and we have synced up to the currently /// known peers, we consider ourselves synced. fn update_sync_state(&mut self) { - let new_state: SyncState = match self.range_sync.state() { - Err(e) => { - crit!(error = %e, "Error getting range sync state"); - return; - } - Ok(state) => match state { - None => { - // No range sync, so we decide if we are stalled or synced. - // For this we check if there is at least one advanced peer. An advanced peer - // with Idle range is possible since a peer's status is updated periodically. - // If we synced a peer between status messages, most likely the peer has - // advanced and will produce a head chain on re-status. 
Otherwise it will shift - // to being synced - let mut sync_state = { - let head = self.chain.best_slot(); - let current_slot = self.chain.slot().unwrap_or_else(|_| Slot::new(0)); - - let peers = self.network_globals().peers.read(); - if current_slot >= head - && current_slot.sub(head) <= (SLOT_IMPORT_TOLERANCE as u64) - && head > 0 - { - SyncState::Synced - } else if peers.advanced_peers().next().is_some() { - SyncState::SyncTransition - } else if peers.synced_peers().next().is_none() { - SyncState::Stalled - } else { - // There are no peers that require syncing and we have at least one synced - // peer - SyncState::Synced + // TODO(tree-sync): We could just iterate the PeerDB and count the most common head as the + // sync target. + + let forward_sync_active = if self.forward_sync.block_count() > 32 { + self.forward_sync.max_slot_to_sync() + } else { + None + }; + + let new_state: SyncState = match forward_sync_active { + None => { + // No range sync, so we decide if we are stalled or synced. + // For this we check if there is at least one advanced peer. An advanced peer + // with Idle range is possible since a peer's status is updated periodically. + // If we synced a peer between status messages, most likely the peer has + // advanced and will produce a head chain on re-status. 
Otherwise it will shift + // to being synced + let mut sync_state = { + let head = self.chain.best_slot(); + let current_slot = self.chain.slot().unwrap_or_else(|_| Slot::new(0)); + + let peers = self.network_globals().peers.read(); + if current_slot >= head + && current_slot.sub(head) <= (SLOT_IMPORT_TOLERANCE as u64) + && head > 0 + { + SyncState::Synced + } else if peers.advanced_peers().next().is_some() { + SyncState::SyncTransition + } else if peers.synced_peers().next().is_none() { + SyncState::Stalled + } else { + // There are no peers that require syncing and we have at least one synced + // peer + SyncState::Synced + } + }; + + // If we would otherwise be synced, first check if we need to perform or + // complete a backfill sync. + #[cfg(not(feature = "disable-backfill"))] + if matches!(sync_state, SyncState::Synced) { + // Determine if we need to start/resume/restart a backfill sync. + match self.backfill_sync.start(&mut self.network) { + Ok(SyncStart::Syncing) => { + sync_state = SyncState::BackFillSyncing; } - }; - - // If we would otherwise be synced, first check if we need to perform or - // complete a backfill sync. - #[cfg(not(feature = "disable-backfill"))] - if matches!(sync_state, SyncState::Synced) { - // Determine if we need to start/resume/restart a backfill sync. - match self.backfill_sync.start(&mut self.network) { - Ok(SyncStart::Syncing { - completed, - remaining, - }) => { - sync_state = SyncState::BackFillSyncing { - completed, - remaining, - }; - } - Ok(SyncStart::NotSyncing) => {} // Ignore updating the state if the backfill sync state didn't start. - Err(e) => { - error!(error = ?e, "Backfill sync failed to start"); - } + Ok(SyncStart::NotSyncing) => {} // Ignore updating the state if the backfill sync state didn't start. + Err(e) => { + error!(error = ?e, "Backfill sync failed to start"); } } - - // Return the sync state if backfilling is not required. 
- sync_state - } - Some((RangeSyncType::Finalized, start_slot, target_slot)) => { - // If there is a backfill sync in progress pause it. - #[cfg(not(feature = "disable-backfill"))] - self.backfill_sync.pause(); - - SyncState::SyncingFinalized { - start_slot, - target_slot, - } } - Some((RangeSyncType::Head, start_slot, target_slot)) => { - // If there is a backfill sync in progress pause it. - #[cfg(not(feature = "disable-backfill"))] - self.backfill_sync.pause(); - - SyncState::SyncingHead { - start_slot, - target_slot, - } + + // Return the sync state if backfilling is not required. + sync_state + } + Some(target_slot) => { + // If there is a backfill sync in progress pause it. + #[cfg(not(feature = "disable-backfill"))] + self.backfill_sync.pause(); + + SyncState::Syncing { + start_slot: self.chain.best_slot(), + target_slot, } - }, + } }; let old_state = self.network_globals().set_sync_state(new_state); @@ -696,10 +591,7 @@ impl SyncManager { // We don't need to subscribe if the old state is a state that would have already // invoked this call. if new_state.is_synced() - && !matches!( - old_state, - SyncState::Synced | SyncState::BackFillSyncing { .. 
} - ) + && !matches!(old_state, SyncState::Synced | SyncState::BackFillSyncing) { self.network.subscribe_core_topics(); } @@ -739,13 +631,14 @@ impl SyncManager { self.handle_new_execution_engine_state(engine_state); } _ = prune_lookups_interval.tick() => { - self.block_lookups.prune_lookups(); + self.forward_sync.prune(); } _ = prune_requests.tick() => { self.prune_requests(); } _ = register_metrics_interval.tick() => { self.network.register_metrics(); + self.forward_sync.register_metrics(); } } } @@ -756,13 +649,6 @@ impl SyncManager { SyncMessage::AddPeer(peer_id, info) => { self.add_peer(peer_id, info); } - SyncMessage::AddPeersForceRangeSync { - peers, - head_root, - head_slot, - } => { - self.add_peers_force_range_sync(&peers, head_root, head_slot); - } SyncMessage::UpdatedPeerCgc(peer_id) => { debug!( peer_id = ?peer_id, @@ -792,58 +678,34 @@ impl SyncManager { } => { self.rpc_data_column_received(sync_request_id, peer_id, data_column, seen_timestamp) } + SyncMessage::BlockHeader { + id, + peer_id, + header, + seen_timestamp, + } => self.rpc_block_header_received(id, peer_id, header, seen_timestamp), SyncMessage::UnknownParentBlock(peer_id, block, block_root) => { let block_slot = block.slot(); let parent_root = block.parent_root(); debug!(%block_root, %parent_root, "Received unknown parent block message"); - self.handle_unknown_parent( - peer_id, - block_root, - parent_root, - block_slot, - BlockComponent::Block(DownloadResult { - value: block.block_cloned(), - block_root, - seen_timestamp: timestamp_now(), - peer_group: PeerGroup::from_single(peer_id), - }), - ); + self.handle_unknown_parent(peer_id, block_root, parent_root, block_slot); + // TODO(tree-sync): Consider caching this block somewhere for re-processing } SyncMessage::UnknownParentBlob(peer_id, blob) => { let blob_slot = blob.slot(); let block_root = blob.block_root(); let parent_root = blob.block_parent_root(); debug!(%block_root, %parent_root, "Received unknown parent blob message"); - 
self.handle_unknown_parent( - peer_id, - block_root, - parent_root, - blob_slot, - BlockComponent::Blob(DownloadResult { - value: blob, - block_root, - seen_timestamp: timestamp_now(), - peer_group: PeerGroup::from_single(peer_id), - }), - ); + self.handle_unknown_parent(peer_id, block_root, parent_root, blob_slot); + // TODO(tree-sync): Consider caching this blob somewhere for re-processing } SyncMessage::UnknownParentDataColumn(peer_id, data_column) => { let data_column_slot = data_column.slot(); let block_root = data_column.block_root(); let parent_root = data_column.block_parent_root(); debug!(%block_root, %parent_root, "Received unknown parent data column message"); - self.handle_unknown_parent( - peer_id, - block_root, - parent_root, - data_column_slot, - BlockComponent::DataColumn(DownloadResult { - value: data_column, - block_root, - seen_timestamp: timestamp_now(), - peer_group: PeerGroup::from_single(peer_id), - }), - ); + self.handle_unknown_parent(peer_id, block_root, parent_root, data_column_slot); + // TODO(tree-sync): Consider caching this column somewhere for re-processing } SyncMessage::UnknownBlockHashFromAttestation(peer_id, block_root) => { if !self.notified_unknown_roots.contains(&(peer_id, block_root)) { @@ -870,44 +732,22 @@ impl SyncManager { sync_request_id, error, } => self.inject_error(peer_id, sync_request_id, error), - SyncMessage::BlockComponentProcessed { - process_type, - result, - } => self - .block_lookups - .on_processing_result(process_type, result, &mut self.network), SyncMessage::GossipBlockProcessResult { - block_root, - imported, - } => self.block_lookups.on_external_processing_result( - block_root, - imported, - &mut self.network, - ), + block_root: _, + imported: _, + } => { + // Not used + } SyncMessage::BatchProcessed { sync_type, result } => match sync_type { - ChainSegmentProcessId::RangeBatchId(chain_id, epoch) => { - self.range_sync.handle_block_process_result( - &mut self.network, - chain_id, - epoch, - result, - ); 
+ ChainSegmentProcessId::ForwardSync(id) => { + self.forward_sync + .on_block_process_result(id, result, &mut self.network); self.update_sync_state(); } - ChainSegmentProcessId::BackSyncBatchId(epoch) => { - match self.backfill_sync.on_batch_process_result( - &mut self.network, - epoch, - &result, - ) { - Ok(ProcessResult::Successful) => {} - Ok(ProcessResult::SyncCompleted) => self.update_sync_state(), - Err(error) => { - error!(error = ?error, "Backfill sync failed"); - // Update the global status - self.update_sync_state(); - } - } + ChainSegmentProcessId::BackfillSync(id) => { + self.backfill_sync + .on_block_process_result(id, result, &mut self.network); + self.update_sync_state(); } }, SyncMessage::SampleVerified { id, result } => { @@ -927,24 +767,10 @@ impl SyncManager { block_root: Hash256, parent_root: Hash256, slot: Slot, - block_component: BlockComponent, ) { match self.should_search_for_block(Some(slot), &peer_id) { Ok(_) => { - if self.block_lookups.search_child_and_parent( - block_root, - block_component, - peer_id, - &mut self.network, - ) { - // Lookup created. No need to log here it's logged in `new_current_lookup` - } else { - debug!( - ?block_root, - ?parent_root, - "No lookup created for child and parent" - ); - } + self.add_peer_with_imported_block_root(peer_id, block_root); } Err(reason) => { debug!(%block_root, %parent_root, reason, "Ignoring unknown parent request"); @@ -955,15 +781,7 @@ impl SyncManager { fn handle_unknown_block_root(&mut self, peer_id: PeerId, block_root: Hash256) { match self.should_search_for_block(None, &peer_id) { Ok(_) => { - if self.block_lookups.search_unknown_block( - block_root, - &[peer_id], - &mut self.network, - ) { - // Lookup created. 
No need to log here it's logged in `new_current_lookup` - } else { - debug!(?block_root, "No lookup created for unknown block"); - } + self.add_peer_with_imported_block_root(peer_id, block_root); } Err(reason) => { debug!(%block_root, reason, "Ignoring unknown block request"); @@ -1010,6 +828,7 @@ impl SyncManager { EngineState::Online => { // Resume sync components. + // TODO(tree-sync): review this // - Block lookups: // We start searching for blocks again. This is done by updating the stored ee online // state. No further action required. @@ -1018,10 +837,6 @@ impl SyncManager { // We start searching for parents again. This is done by updating the stored ee // online state. No further action required. - // - Range: - // Actively resume. - self.range_sync.resume(&mut self.network); - // - Backfill: // Not affected by ee states, nothing to do. } @@ -1032,8 +847,8 @@ impl SyncManager { // - Block lookups: // Disabled while in this state. We drop current requests and don't search for new // blocks. - let dropped_single_blocks_requests = - self.block_lookups.drop_single_block_requests(); + // TODO(tree-sync): should we pause it instead? + self.forward_sync.pause(); // - Range: // We still send found peers to range so that it can keep track of potential chains @@ -1043,12 +858,7 @@ impl SyncManager { // - Backfill: Not affected by ee states, nothing to do. // Some logs. - if dropped_single_blocks_requests > 0 { - debug!( - dropped_single_blocks_requests, - "Execution engine not online. Dropping active requests." - ); - } + debug!("Execution engine not online. 
Stopping active sync requests."); } } } @@ -1061,12 +871,7 @@ impl SyncManager { seen_timestamp: Duration, ) { match sync_request_id { - SyncRequestId::SingleBlock { id } => self.on_single_block_response( - id, - peer_id, - RpcEvent::from_chunk(block, seen_timestamp), - ), - SyncRequestId::BlocksByRange(id) => self.on_blocks_by_range_response( + SyncRequestId::BlocksByRoot(id) => self.on_blocks_by_root_response( id, peer_id, RpcEvent::from_chunk(block, seen_timestamp), @@ -1077,21 +882,56 @@ impl SyncManager { } } - fn on_single_block_response( + fn on_blocks_by_root_response( &mut self, - id: SingleLookupReqId, + req_id: BlocksByRootRequestId, peer_id: PeerId, block: RpcEvent>>, ) { - if let Some(resp) = self.network.on_single_block_response(id, peer_id, block) { - self.block_lookups - .on_download_response::>( - id, - resp.map(|(value, seen_timestamp)| { - (value, PeerGroup::from_single(peer_id), seen_timestamp) - }), - &mut self.network, - ) + if let Some(result) = self + .network + .on_blocks_by_root_response(req_id, peer_id, block) + { + match req_id.parent_request_id { + BlocksByRootRequester::Header(lookup_id) => { + self.forward_sync.on_headers_download_result( + req_id, + lookup_id, + result.map(|(blocks, seen_timestamp)| { + let blocks = blocks + .into_iter() + .map(|block| block.message().block_header()) + .collect::>(); + (blocks, seen_timestamp) + }), + peer_id, + &mut self.network, + ); + } + BlocksByRootRequester::ForwardSync(batch_id) => { + self.on_block_components_by_root_response( + batch_id, + RangeBlockComponent::Block(req_id, result, peer_id), + ); + } + } + } + } + + fn on_blobs_by_root_response( + &mut self, + req_id: BlobsByRootRequestId, + peer_id: PeerId, + block: RpcEvent>>, + ) { + if let Some(result) = self + .network + .on_blobs_by_root_response(req_id, peer_id, block) + { + self.on_block_components_by_root_response( + req_id.parent_request_id, + RangeBlockComponent::Blob(req_id, result, peer_id), + ); } } @@ -1103,12 +943,7 @@ impl 
SyncManager { seen_timestamp: Duration, ) { match sync_request_id { - SyncRequestId::SingleBlob { id } => self.on_single_blob_response( - id, - peer_id, - RpcEvent::from_chunk(blob, seen_timestamp), - ), - SyncRequestId::BlobsByRange(id) => self.on_blobs_by_range_response( + SyncRequestId::BlobsByRoot(id) => self.on_blobs_by_root_response( id, peer_id, RpcEvent::from_chunk(blob, seen_timestamp), @@ -1134,32 +969,30 @@ impl SyncManager { RpcEvent::from_chunk(data_column, seen_timestamp), ); } - SyncRequestId::DataColumnsByRange(id) => self.on_data_columns_by_range_response( - id, - peer_id, - RpcEvent::from_chunk(data_column, seen_timestamp), - ), _ => { crit!(%peer_id, "bad request id for data_column"); } } } - fn on_single_blob_response( + fn rpc_block_header_received( &mut self, - id: SingleLookupReqId, + id: SyncRequestId, peer_id: PeerId, - blob: RpcEvent>>, + header: Option, + seen_timestamp: Duration, ) { - if let Some(resp) = self.network.on_single_blob_response(id, peer_id, blob) { - self.block_lookups - .on_download_response::>( - id, - resp.map(|(value, seen_timestamp)| { - (value, PeerGroup::from_single(peer_id), seen_timestamp) - }), - &mut self.network, - ) + match id { + SyncRequestId::HeadersByRoot(req_id) => { + self.on_headers_by_root_response( + req_id, + peer_id, + RpcEvent::from_chunk(header, seen_timestamp), + ); + } + _ => { + crit!(%peer_id, "bad request id for beacon_block_header"); + } } } @@ -1173,7 +1006,7 @@ impl SyncManager { self.network .on_data_columns_by_root_response(req_id, peer_id, data_column) { - match req_id.requester { + match req_id.parent_request_id { DataColumnsByRootRequester::Sampling(id) => { if let Some((requester, result)) = self.sampling @@ -1187,72 +1020,46 @@ impl SyncManager { .network .on_custody_by_root_response(custody_id, req_id, peer_id, resp) { - self.on_custody_by_root_result(custody_id.requester, result); + self.on_custody_by_root_result(custody_id, result); } } } } } - fn on_blocks_by_range_response( - 
&mut self, - id: BlocksByRangeRequestId, - peer_id: PeerId, - block: RpcEvent>>, - ) { - if let Some(resp) = self.network.on_blocks_by_range_response(id, peer_id, block) { - self.on_range_components_response( - id.parent_request_id, - peer_id, - RangeBlockComponent::Block(id, resp), - ); - } - } - - fn on_blobs_by_range_response( - &mut self, - id: BlobsByRangeRequestId, - peer_id: PeerId, - blob: RpcEvent>>, - ) { - if let Some(resp) = self.network.on_blobs_by_range_response(id, peer_id, blob) { - self.on_range_components_response( - id.parent_request_id, - peer_id, - RangeBlockComponent::Blob(id, resp), - ); - } - } - - fn on_data_columns_by_range_response( + fn on_headers_by_root_response( &mut self, - id: DataColumnsByRangeRequestId, + req_id: HeadersByRootRequestId, peer_id: PeerId, - data_column: RpcEvent>>, + header: RpcEvent, ) { if let Some(resp) = self .network - .on_data_columns_by_range_response(id, peer_id, data_column) + .on_headers_by_root_response(req_id, peer_id, header) { - self.on_range_components_response( - id.parent_request_id, + self.forward_sync.on_headers_download_result( + // TODO(tree-sync): handle the two type of requests with distinct IDs + BlocksByRootRequestId { + id: req_id.id, + parent_request_id: BlocksByRootRequester::Header(req_id.parent_request_id), + }, + req_id.parent_request_id, + resp, peer_id, - RangeBlockComponent::CustodyColumns(id, resp), + &mut self.network, ); } } fn on_custody_by_root_result( &mut self, - requester: CustodyRequester, - response: CustodyByRootResult, + id: CustodyByRootRequestId, + result: CustodyRequestResult, ) { - self.block_lookups - .on_download_response::>( - requester.0, - response, - &mut self.network, - ); + self.on_block_components_by_root_response( + id.parent_request_id, + RangeBlockComponent::CustodyColumns(id, result), + ); } fn on_sampling_result(&mut self, requester: SamplingRequester, result: SamplingResult) { @@ -1282,75 +1089,28 @@ impl SyncManager { /// Handles receiving a response 
for a range sync request that should have both blocks and /// blobs. - fn on_range_components_response( + fn on_block_components_by_root_response( &mut self, - range_request_id: ComponentsByRangeRequestId, - peer_id: PeerId, + req_id: ComponentsByRootRequestId, range_block_component: RangeBlockComponent, ) { - if let Some(resp) = self.network.range_block_component_response( - range_request_id, - peer_id, - range_block_component, - ) { - match resp { - Ok((blocks, batch_peers)) => { - match range_request_id.requester { - RangeRequestId::RangeSync { chain_id, batch_id } => { - self.range_sync.blocks_by_range_response( - &mut self.network, - batch_peers, - chain_id, - batch_id, - range_request_id.id, - blocks, - ); - self.update_sync_state(); - } - RangeRequestId::BackfillSync { batch_id } => { - match self.backfill_sync.on_block_response( - &mut self.network, - batch_id, - batch_peers, - range_request_id.id, - blocks, - ) { - Ok(ProcessResult::SyncCompleted) => self.update_sync_state(), - Ok(ProcessResult::Successful) => {} - Err(_error) => { - // The backfill sync has failed, errors are reported - // within. 
- self.update_sync_state(); - } - } - } - } + if let Some(result) = self + .network + .on_block_components_by_root_response(req_id, range_block_component) + { + match req_id.requester { + RangeRequestId::ForwardSync(id) => { + self.forward_sync.on_block_download_result( + req_id, + id, + result, + &mut self.network, + ); + } + RangeRequestId::BackfillSync(_) => { + self.backfill_sync + .on_block_download_result(req_id, result, &mut self.network) } - Err(e) => match range_request_id.requester { - RangeRequestId::RangeSync { chain_id, batch_id } => { - self.range_sync.inject_error( - &mut self.network, - peer_id, - batch_id, - chain_id, - range_request_id.id, - e, - ); - self.update_sync_state(); - } - RangeRequestId::BackfillSync { batch_id } => { - match self.backfill_sync.inject_error( - &mut self.network, - batch_id, - &peer_id, - range_request_id.id, - e, - ) { - Ok(_) => {} - Err(_) => self.update_sync_state(), - } - } - }, } } } diff --git a/beacon_node/network/src/sync/mod.rs b/beacon_node/network/src/sync/mod.rs index 0f5fd6fb9f1..23f4700baaf 100644 --- a/beacon_node/network/src/sync/mod.rs +++ b/beacon_node/network/src/sync/mod.rs @@ -2,16 +2,14 @@ //! //! Stores the various syncing methods for the beacon chain. mod backfill_sync; -mod block_lookups; -mod block_sidecar_coupling; +mod forward_sync; pub mod manager; mod network_context; mod peer_sampling; mod peer_sync_info; -mod range_sync; +mod sync_block; #[cfg(test)] mod tests; pub use lighthouse_network::service::api_types::SamplingId; pub use manager::{BatchProcessResult, SyncMessage}; -pub use range_sync::{BatchOperationOutcome, ChainId}; diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index c7a483f33d1..5250803fca3 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -1,42 +1,41 @@ //! Provides network functionality for the Syncing thread. 
This fundamentally wraps a network //! channel and stores a global RPC ID to perform requests. -use self::custody::{ActiveCustodyRequest, Error as CustodyRequestError}; -pub use self::requests::{BlocksByRootSingleRequest, DataColumnsByRootSingleBlockRequest}; -use super::block_sidecar_coupling::RangeBlockComponentsRequest; -use super::manager::BlockProcessType; -use super::range_sync::{BatchPeers, ByRangeRequestType}; +use self::custody_by_root::ActiveCustodyByRootRequest; use super::SyncMessage; use crate::metrics; -use crate::network_beacon_processor::NetworkBeaconProcessor; #[cfg(test)] use crate::network_beacon_processor::TestBeaconChainType; +use crate::network_beacon_processor::{NetworkBeaconProcessor, PeerGroupAction}; use crate::service::NetworkMessage; use crate::status::ToStatusMessage; -use crate::sync::block_lookups::SingleLookupId; -use crate::sync::network_context::requests::BlobsByRootSingleBlockRequest; use beacon_chain::block_verification_types::RpcBlock; -use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessStatus, EngineState}; -use custody::CustodyRequestResult; +use beacon_chain::{BeaconChain, BeaconChainTypes, EngineState}; +pub use block_components_by_range::BlockComponentsByRootRequest; +#[cfg(test)] +pub use block_components_by_range::BlockComponentsByRootRequestStep; +pub use download_request::{DownloadRequest, Error as DownloadRequestError}; use fnv::FnvHashMap; -use lighthouse_network::rpc::methods::{BlobsByRangeRequest, DataColumnsByRangeRequest}; -use lighthouse_network::rpc::{BlocksByRangeRequest, GoodbyeReason, RPCError, RequestType}; +use itertools::Itertools; +use lighthouse_network::rpc::methods::{ + BlobsByRootRequest, BlocksByRootRequest, DataColumnsByRootRequest, +}; +use lighthouse_network::rpc::{GoodbyeReason, RPCError, RequestType}; pub use lighthouse_network::service::api_types::RangeRequestId; use lighthouse_network::service::api_types::{ - AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, 
ComponentsByRangeRequestId, - CustodyId, CustodyRequester, DataColumnsByRangeRequestId, DataColumnsByRootRequestId, - DataColumnsByRootRequester, Id, SingleLookupReqId, SyncRequestId, + AppRequestId, BlobsByRootRequestId, BlocksByRootRequestId, BlocksByRootRequester, + ComponentsByRootRequestId, CustodyByRootRequestId, DataColumnsByRootRequestId, + DataColumnsByRootRequester, HeadersByRootRequestId, Id, SyncRequestId, }; use lighthouse_network::{Client, NetworkGlobals, PeerAction, PeerId, ReportSource}; use parking_lot::RwLock; pub use requests::LookupVerifyError; use requests::{ - ActiveRequests, BlobsByRangeRequestItems, BlobsByRootRequestItems, BlocksByRangeRequestItems, - BlocksByRootRequestItems, DataColumnsByRangeRequestItems, DataColumnsByRootRequestItems, + ActiveRequests, BlobsByRootRequestItems, BlocksByRootRequestItems, + DataColumnsByRootRequestItems, HeadersByRootRequestItems, }; #[cfg(test)] use slot_clock::SlotClock; -use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::fmt::Debug; use std::sync::Arc; @@ -44,14 +43,16 @@ use std::time::Duration; #[cfg(test)] use task_executor::TaskExecutor; use tokio::sync::mpsc; -use tracing::{debug, error, span, warn, Level}; -use types::blob_sidecar::FixedBlobSidecarList; +use tracing::{debug, span, warn, Level}; use types::{ - BlobSidecar, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, ForkContext, - Hash256, SignedBeaconBlock, Slot, + BeaconBlockHeader, BlobIdentifier, BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, + DataColumnSidecarList, DataColumnsByRootIdentifier, EthSpec, ForkContext, ForkName, Hash256, + RuntimeVariableList, SignedBeaconBlock, }; -pub mod custody; +pub mod block_components_by_range; +pub mod custody_by_root; +mod download_request; mod requests; #[derive(Debug)] @@ -73,31 +74,29 @@ impl RpcEvent { pub type RpcResponseResult = Result<(T, Duration), RpcResponseError>; /// Duration = latest seen timestamp of all received data 
columns -pub type CustodyByRootResult = - Result<(DataColumnSidecarList, PeerGroup, Duration), RpcResponseError>; +pub type RpcResponseBatchResult = Result<(T, PeerGroup, Duration), RpcResponseError>; -#[derive(Debug)] +/// Common result type for `custody_by_root` and `custody_by_range` requests. The peers are part of +/// the `Ok` response since they are not known until the entire request succeeds. +pub type CustodyRequestResult = RpcResponseBatchResult>; + +#[derive(Debug, Clone)] pub enum RpcResponseError { RpcError(#[allow(dead_code)] RPCError), VerifyError(LookupVerifyError), - CustodyRequestError(#[allow(dead_code)] CustodyRequestError), - BlockComponentCouplingError(#[allow(dead_code)] String), + RequestExpired(String), + InternalError(#[allow(dead_code)] String), } #[derive(Debug, PartialEq, Eq)] pub enum RpcRequestSendError { - /// No peer available matching the required criteria - NoPeer(NoPeerError), /// These errors should never happen, including unreachable custody errors or network send /// errors. InternalError(String), -} - -/// Type of peer missing that caused a `RpcRequestSendError::NoPeers` -#[derive(Debug, PartialEq, Eq)] -pub enum NoPeerError { - BlockPeer, - CustodyPeer(ColumnIndex), + // If RpcRequestSendError has a single variant `InternalError` it's to signal to downstream + // consumers that sends are expected to be infallible. If this assumption changes in the future, + // add a new variant. + NoPeers, } #[derive(Debug, PartialEq, Eq)] @@ -124,51 +123,83 @@ pub struct PeerGroup { /// Peers group by which indexed section of the block component they served. For example: /// - PeerA served = [blob index 0, blob index 2] /// - PeerA served = [blob index 1] - peers: HashMap>, + peers: HashMap, } impl PeerGroup { - /// Return a peer group where a single peer returned all parts of a block component. For - /// example, a block has a single component (the block = index 0/1). 
- pub fn from_single(peer: PeerId) -> Self { + pub(crate) fn empty() -> Self { Self { - peers: HashMap::from_iter([(peer, vec![0])]), + peers: HashMap::new(), } } - pub fn from_set(peers: HashMap>) -> Self { + + pub(crate) fn from_set(peer_to_indices: HashMap>) -> Self { + let mut peers = HashMap::new(); + for (peer, indices) in peer_to_indices { + for index in indices { + peers.insert(index, peer); + } + } Self { peers } } - pub fn all(&self) -> impl Iterator + '_ { - self.peers.keys() + + pub(crate) fn of_index(&self, index: &usize) -> Option<&PeerId> { + self.peers.get(index) } - pub fn of_index(&self, index: usize) -> impl Iterator + '_ { - self.peers.iter().filter_map(move |(peer, indices)| { - if indices.contains(&index) { - Some(peer) - } else { - None - } - }) +} + +#[derive(Clone, Debug)] +pub struct BatchPeers { + block_peer: PeerId, + column_peers: PeerGroup, +} + +impl BatchPeers { + pub(crate) fn new_from_block_peer(block_peer: PeerId) -> Self { + Self { + block_peer, + column_peers: PeerGroup::empty(), + } + } + pub(crate) fn new(block_peer: PeerId, column_peers: PeerGroup) -> Self { + Self { + block_peer, + column_peers, + } + } + + pub(crate) fn blame(&self, peer_action: PeerGroupAction) -> Vec<(PeerId, PeerAction)> { + // Penalize each peer only once. Currently a peer_action does not mix different + // PeerAction levels. + let mut peer_penalties = peer_action + .column_peer + .iter() + .filter_map(|(column_index, penalty)| { + self.column(column_index).map(|peer| (*peer, *penalty)) + }) + .unique() + .collect::>(); + + if let Some(penalty) = peer_action.block_peer { + // Penalize the peer appropiately. 
+ peer_penalties.push((self.block(), penalty)); + } + + peer_penalties + } + + fn block(&self) -> PeerId { + self.block_peer + } + + fn column(&self, index: &ColumnIndex) -> Option<&PeerId> { + self.column_peers.of_index(&((*index) as usize)) } } /// Sequential ID that uniquely identifies ReqResp outgoing requests pub type ReqId = u32; -pub enum LookupRequestResult { - /// A request is sent. Sync MUST receive an event from the network in the future for either: - /// completed response or failed request - RequestSent(I), - /// No request is sent, and no further action is necessary to consider this request completed. - /// Includes a reason why this request is not needed. - NoRequestNeeded(&'static str), - /// No request is sent, but the request is not completed. Sync MUST receive some future event - /// that makes progress on the request. For example: request is processing from a different - /// source (i.e. block received from gossip) and sync MUST receive an event with that processing - /// result. - Pending(&'static str), -} - /// Wraps a Network channel to employ various RPC related network functionality for the Sync manager. This includes management of a global RPC request Id. pub struct SyncNetworkContext { /// The network channel to relay messages to the Network service. @@ -179,28 +210,22 @@ pub struct SyncNetworkContext { /// A mapping of active BlocksByRoot requests, including both current slot and parent lookups. blocks_by_root_requests: - ActiveRequests>, + ActiveRequests>, /// A mapping of active BlobsByRoot requests, including both current slot and parent lookups. 
- blobs_by_root_requests: ActiveRequests>, + blobs_by_root_requests: + ActiveRequests>, /// A mapping of active DataColumnsByRoot requests data_columns_by_root_requests: ActiveRequests>, - /// A mapping of active BlocksByRange requests - blocks_by_range_requests: - ActiveRequests>, - /// A mapping of active BlobsByRange requests - blobs_by_range_requests: - ActiveRequests>, - /// A mapping of active DataColumnsByRange requests - data_columns_by_range_requests: - ActiveRequests>, - - /// Mapping of active custody column requests for a block root - custody_by_root_requests: FnvHashMap>, - - /// BlocksByRange requests paired with other ByRange requests for data components - components_by_range_requests: - FnvHashMap>, + /// A mapping of active HeadersByRoot requests + headers_by_root_requests: ActiveRequests, + + /// Mapping of active custody column by root requests for a block root + custody_by_root_requests: FnvHashMap>, + + /// BlocksByRoot requests paired with other ByRoot requests for data components + block_components_by_root_requests: + FnvHashMap>, /// Whether the ee is online. If it's not, we don't allow access to the /// `beacon_processor_send`. @@ -217,17 +242,16 @@ pub struct SyncNetworkContext { /// Small enumeration to make dealing with block and blob requests easier. 
pub enum RangeBlockComponent { Block( - BlocksByRangeRequestId, + BlocksByRootRequestId, RpcResponseResult>>>, + PeerId, ), Blob( - BlobsByRangeRequestId, + BlobsByRootRequestId, RpcResponseResult>>>, + PeerId, ), - CustodyColumns( - DataColumnsByRangeRequestId, - RpcResponseResult>>>, - ), + CustodyColumns(CustodyByRootRequestId, CustodyRequestResult), } #[cfg(test)] @@ -238,7 +262,7 @@ impl SyncNetworkContext> { task_executor: TaskExecutor, ) -> Self { let fork_context = Arc::new(ForkContext::new::( - beacon_chain.slot_clock.now().unwrap_or(Slot::new(0)), + beacon_chain.slot_clock.now().unwrap_or(types::Slot::new(0)), beacon_chain.genesis_validators_root, &beacon_chain.spec, )); @@ -279,11 +303,9 @@ impl SyncNetworkContext { blocks_by_root_requests: ActiveRequests::new("blocks_by_root"), blobs_by_root_requests: ActiveRequests::new("blobs_by_root"), data_columns_by_root_requests: ActiveRequests::new("data_columns_by_root"), - blocks_by_range_requests: ActiveRequests::new("blocks_by_range"), - blobs_by_range_requests: ActiveRequests::new("blobs_by_range"), - data_columns_by_range_requests: ActiveRequests::new("data_columns_by_range"), + headers_by_root_requests: ActiveRequests::new("headers_by_root"), custody_by_root_requests: <_>::default(), - components_by_range_requests: FnvHashMap::default(), + block_components_by_root_requests: <_>::default(), network_beacon_processor, chain, fork_context, @@ -297,6 +319,14 @@ impl SyncNetworkContext { /// Returns the ids of all the requests made to the given peer_id. pub fn peer_disconnected(&mut self, peer_id: &PeerId) -> Vec { + self.active_requests() + .filter(|(_, request_peer)| *request_peer == peer_id) + .map(|(id, _)| id) + .collect() + } + + /// Returns the ids of all active requests + pub fn active_requests(&mut self) -> impl Iterator { // Note: using destructuring pattern without a default case to make sure we don't forget to // add new request types to this function. 
Otherwise, lookup sync can break and lookups // will get stuck if a peer disconnects during an active requests. @@ -306,13 +336,11 @@ impl SyncNetworkContext { blocks_by_root_requests, blobs_by_root_requests, data_columns_by_root_requests, - blocks_by_range_requests, - blobs_by_range_requests, - data_columns_by_range_requests, + headers_by_root_requests, // custody_by_root_requests is a meta request of data_columns_by_root_requests custody_by_root_requests: _, - // components_by_range_requests is a meta request of various _by_range requests - components_by_range_requests: _, + // components_by_root_requests is a meta request of various _by_root requests + block_components_by_root_requests: _, execution_engine_state: _, network_beacon_processor: _, chain: _, @@ -320,36 +348,31 @@ impl SyncNetworkContext { } = self; let blocks_by_root_ids = blocks_by_root_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|id| SyncRequestId::SingleBlock { id: *id }); + .active_requests() + .map(|(id, peer)| (SyncRequestId::BlocksByRoot(*id), peer)); let blobs_by_root_ids = blobs_by_root_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|id| SyncRequestId::SingleBlob { id: *id }); + .active_requests() + .map(|(id, peer)| (SyncRequestId::BlobsByRoot(*id), peer)); let data_column_by_root_ids = data_columns_by_root_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::DataColumnsByRoot(*req_id)); - let blocks_by_range_ids = blocks_by_range_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::BlocksByRange(*req_id)); - let blobs_by_range_ids = blobs_by_range_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::BlobsByRange(*req_id)); - let data_column_by_range_ids = data_columns_by_range_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::DataColumnsByRange(*req_id)); + .active_requests() + .map(|(id, 
peer)| (SyncRequestId::DataColumnsByRoot(*id), peer)); + let headers_by_root_ids = headers_by_root_requests + .active_requests() + .map(|(id, peer)| (SyncRequestId::HeadersByRoot(*id), peer)); blocks_by_root_ids .chain(blobs_by_root_ids) .chain(data_column_by_root_ids) - .chain(blocks_by_range_ids) - .chain(blobs_by_range_ids) - .chain(data_column_by_range_ids) + .chain(headers_by_root_ids) + } + + #[cfg(test)] + pub fn active_block_components_requests( + &self, + ) -> Vec<(ComponentsByRootRequestId, BlockComponentsByRootRequestStep)> { + self.block_components_by_root_requests + .iter() + .map(|(id, req)| (*id, req.state_step())) .collect() } @@ -362,6 +385,10 @@ impl SyncNetworkContext { &self.network_beacon_processor.network_globals } + pub fn spec(&self) -> &ChainSpec { + &self.chain.spec + } + /// Returns the Client type of the peer if known pub fn client_type(&self, peer_id: &PeerId) -> Client { self.network_globals() @@ -409,13 +436,11 @@ impl SyncNetworkContext { blocks_by_root_requests, blobs_by_root_requests, data_columns_by_root_requests, - blocks_by_range_requests, - blobs_by_range_requests, - data_columns_by_range_requests, + headers_by_root_requests, // custody_by_root_requests is a meta request of data_columns_by_root_requests custody_by_root_requests: _, // components_by_range_requests is a meta request of various _by_range requests - components_by_range_requests: _, + block_components_by_root_requests: _, execution_engine_state: _, network_beacon_processor: _, chain: _, @@ -430,9 +455,7 @@ impl SyncNetworkContext { .iter_request_peers() .chain(blobs_by_root_requests.iter_request_peers()) .chain(data_columns_by_root_requests.iter_request_peers()) - .chain(blocks_by_range_requests.iter_request_peers()) - .chain(blobs_by_range_requests.iter_request_peers()) - .chain(data_columns_by_range_requests.iter_request_peers()) + .chain(headers_by_root_requests.iter_request_peers()) { *active_request_count_by_peer.entry(peer_id).or_default() += 1; } @@ -443,423 
+466,33 @@ impl SyncNetworkContext { /// A blocks by range request sent by the range sync algorithm pub fn block_components_by_range_request( &mut self, - request: BlocksByRangeRequest, + block_root: Hash256, requester: RangeRequestId, - peers: &HashSet, + peers: Arc>>, peers_to_deprioritize: &HashSet, - ) -> Result { - let batch_epoch = Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()); - let batch_type = self.batch_type(batch_epoch); - - let active_request_count_by_peer = self.active_request_count_by_peer(); - - let Some(block_peer) = peers - .iter() - .map(|peer| { - ( - // If contains -> 1 (order after), not contains -> 0 (order first) - peers_to_deprioritize.contains(peer), - // Prefer peers with less overall requests - active_request_count_by_peer.get(peer).copied().unwrap_or(0), - // Random factor to break ties, otherwise the PeerID breaks ties - rand::random::(), - peer, - ) - }) - .min() - .map(|(_, _, _, peer)| *peer) - else { - // Backfill and forward sync handle this condition gracefully. - // - Backfill sync: will pause waiting for more peers to join - // - Forward sync: can never happen as the chain is dropped when removing the last peer. - return Err(RpcRequestSendError::NoPeer(NoPeerError::BlockPeer)); - }; - - // Attempt to find all required custody peers before sending any request or creating an ID - let columns_by_range_peers_to_request = - if matches!(batch_type, ByRangeRequestType::BlocksAndColumns) { - let column_indexes = self.network_globals().sampling_columns(); - Some(self.select_columns_by_range_peers_to_request( - &column_indexes, - peers, - active_request_count_by_peer, - peers_to_deprioritize, - )?) 
- } else { - None - }; - - // Create the overall components_by_range request ID before its individual components - let id = ComponentsByRangeRequestId { + ) -> Result { + let id = ComponentsByRootRequestId { id: self.next_id(), requester, }; - let blocks_req_id = self.send_blocks_by_range_request(block_peer, request.clone(), id)?; - - let blobs_req_id = if matches!(batch_type, ByRangeRequestType::BlocksAndBlobs) { - Some(self.send_blobs_by_range_request( - block_peer, - BlobsByRangeRequest { - start_slot: *request.start_slot(), - count: *request.count(), - }, - id, - )?) - } else { - None - }; - - let data_column_requests = columns_by_range_peers_to_request - .map(|columns_by_range_peers_to_request| { - let column_to_peer_map = columns_by_range_peers_to_request - .iter() - .flat_map(|(peer_id, columns)| columns.iter().map(|column| (*column, *peer_id))) - .collect::>(); - - let requests = columns_by_range_peers_to_request - .into_iter() - .map(|(peer_id, columns)| { - self.send_data_columns_by_range_request( - peer_id, - DataColumnsByRangeRequest { - start_slot: *request.start_slot(), - count: *request.count(), - columns, - }, - id, - ) - }) - .collect::, _>>()?; - - Ok((requests, column_to_peer_map)) - }) - .transpose()?; - - let info = - RangeBlockComponentsRequest::new(blocks_req_id, blobs_req_id, data_column_requests); - self.components_by_range_requests.insert(id, info); - - Ok(id.id) - } - - fn select_columns_by_range_peers_to_request( - &self, - custody_indexes: &HashSet, - peers: &HashSet, - active_request_count_by_peer: HashMap, - peers_to_deprioritize: &HashSet, - ) -> Result>, RpcRequestSendError> { - let mut columns_to_request_by_peer = HashMap::>::new(); - - for column_index in custody_indexes { - // Strictly consider peers that are custodials of this column AND are part of this - // syncing chain. If the forward range sync chain has few peers, it's likely that this - // function will not be able to find peers on our custody columns. 
- let Some(custody_peer) = peers - .iter() - .filter(|peer| { - self.network_globals() - .is_custody_peer_of(*column_index, peer) - }) - .map(|peer| { - ( - // If contains -> 1 (order after), not contains -> 0 (order first) - peers_to_deprioritize.contains(peer), - // Prefer peers with less overall requests - // Also account for requests that are not yet issued tracked in peer_id_to_request_map - // We batch requests to the same peer, so count existance in the - // `columns_to_request_by_peer` as a single 1 request. - active_request_count_by_peer.get(peer).copied().unwrap_or(0) - + columns_to_request_by_peer.get(peer).map(|_| 1).unwrap_or(0), - // Random factor to break ties, otherwise the PeerID breaks ties - rand::random::(), - peer, - ) - }) - .min() - .map(|(_, _, _, peer)| *peer) - else { - // TODO(das): this will be pretty bad UX. To improve we should: - // - Handle the no peers case gracefully, maybe add some timeout and give a few - // minutes / seconds to the peer manager to locate peers on this subnet before - // abandoing progress on the chain completely. - return Err(RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer( - *column_index, - ))); - }; - - columns_to_request_by_peer - .entry(custody_peer) - .or_default() - .push(*column_index); - } - - Ok(columns_to_request_by_peer) - } - - /// Received a _by_range response for a request that couples blocks and its data - /// - /// `peer_id` is the peer that served this individual RPC _by_range response. 
- #[allow(clippy::type_complexity)] - pub fn range_block_component_response( - &mut self, - id: ComponentsByRangeRequestId, - peer_id: PeerId, - range_block_component: RangeBlockComponent, - ) -> Option>, BatchPeers), RpcResponseError>> { - let Entry::Occupied(mut entry) = self.components_by_range_requests.entry(id) else { - metrics::inc_counter_vec(&metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, &["range_blocks"]); - return None; - }; - - if let Err(e) = { - let request = entry.get_mut(); - match range_block_component { - RangeBlockComponent::Block(req_id, resp) => resp.and_then(|(blocks, _)| { - request - .add_blocks(req_id, blocks, peer_id) - .map_err(RpcResponseError::BlockComponentCouplingError) - }), - RangeBlockComponent::Blob(req_id, resp) => resp.and_then(|(blobs, _)| { - request - .add_blobs(req_id, blobs, peer_id) - .map_err(RpcResponseError::BlockComponentCouplingError) - }), - RangeBlockComponent::CustodyColumns(req_id, resp) => { - resp.and_then(|(custody_columns, _)| { - request - .add_custody_columns(req_id, custody_columns, peer_id) - .map_err(RpcResponseError::BlockComponentCouplingError) - }) - } - } - } { - entry.remove(); - return Some(Err(e)); - } - - if let Some(blocks_result) = entry.get().responses(&self.chain.spec) { - entry.remove(); - // If the request is finished, dequeue everything - Some(blocks_result.map_err(RpcResponseError::BlockComponentCouplingError)) - } else { - None - } - } - - /// Request block of `block_root` if necessary by checking: - /// - If the da_checker has a pending block from gossip or a previous request - /// - /// Returns false if no request was made, because the block is already imported - pub fn block_lookup_request( - &mut self, - lookup_id: SingleLookupId, - lookup_peers: Arc>>, - block_root: Hash256, - ) -> Result { - let active_request_count_by_peer = self.active_request_count_by_peer(); - let Some(peer_id) = lookup_peers - .read() - .iter() - .map(|peer| { - ( - // Prefer peers with less overall requests - 
active_request_count_by_peer.get(peer).copied().unwrap_or(0), - // Random factor to break ties, otherwise the PeerID breaks ties - rand::random::(), - peer, - ) - }) - .min() - .map(|(_, _, peer)| *peer) - else { - // Allow lookup to not have any peers and do nothing. This is an optimization to not - // lose progress of lookups created from a block with unknown parent before we receive - // attestations for said block. - // Lookup sync event safety: If a lookup requires peers to make progress, and does - // not receive any new peers for some time it will be dropped. If it receives a new - // peer it must attempt to make progress. - return Ok(LookupRequestResult::Pending("no peers")); - }; - - let span = span!( - Level::INFO, - "SyncNetworkContext", - service = "network_context" - ); - let _enter = span.enter(); - - match self.chain.get_block_process_status(&block_root) { - // Unknown block, continue request to download - BlockProcessStatus::Unknown => {} - // Block is known are currently processing, expect a future event with the result of - // processing. - BlockProcessStatus::NotValidated { .. } => { - // Lookup sync event safety: If the block is currently in the processing cache, we - // are guaranteed to receive a `SyncMessage::GossipBlockProcessResult` that will - // make progress on this lookup - return Ok(LookupRequestResult::Pending("block in processing cache")); - } - // Block is fully validated. If it's not yet imported it's waiting for missing block - // components. Consider this request completed and do nothing. - BlockProcessStatus::ExecutionValidated { .. 
} => { - return Ok(LookupRequestResult::NoRequestNeeded( - "block execution validated", - )) - } - } - - let id = SingleLookupReqId { - lookup_id, - req_id: self.next_id(), - }; + let req = + BlockComponentsByRootRequest::new(id, block_root, peers, peers_to_deprioritize, self)?; - let request = BlocksByRootSingleRequest(block_root); + self.block_components_by_root_requests.insert(id, req); - // Lookup sync event safety: If network_send.send() returns Ok(_) we are guaranteed that - // eventually at least one this 3 events will be received: - // - StreamTermination(request_id): handled by `Self::on_single_block_response` - // - RPCError(request_id): handled by `Self::on_single_block_response` - // - Disconnect(peer_id) handled by `Self::peer_disconnected``which converts it to a - // ` RPCError(request_id)`event handled by the above method - self.network_send - .send(NetworkMessage::SendRequest { - peer_id, - request: RequestType::BlocksByRoot(request.into_request(&self.fork_context)), - app_request_id: AppRequestId::Sync(SyncRequestId::SingleBlock { id }), - }) - .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; - - debug!( - method = "BlocksByRoot", - ?block_root, - peer = %peer_id, - %id, - "Sync RPC request sent" - ); - - self.blocks_by_root_requests.insert( - id, - peer_id, - // true = enforce max_requests as returned for blocks_by_root. We always request a single - // block and the peer must have it. - true, - BlocksByRootRequestItems::new(request), - ); - - Ok(LookupRequestResult::RequestSent(id.req_id)) - } - - /// Request necessary blobs for `block_root`. Requests only the necessary blobs by checking: - /// - If we have a downloaded but not yet processed block - /// - If the da_checker has a pending block - /// - If the da_checker has pending blobs from gossip - /// - /// Returns false if no request was made, because we don't need to import (more) blobs. 
- pub fn blob_lookup_request( - &mut self, - lookup_id: SingleLookupId, - lookup_peers: Arc>>, - block_root: Hash256, - expected_blobs: usize, - ) -> Result { - let active_request_count_by_peer = self.active_request_count_by_peer(); - let Some(peer_id) = lookup_peers - .read() - .iter() - .map(|peer| { - ( - // Prefer peers with less overall requests - active_request_count_by_peer.get(peer).copied().unwrap_or(0), - // Random factor to break ties, otherwise the PeerID breaks ties - rand::random::(), - peer, - ) - }) - .min() - .map(|(_, _, peer)| *peer) - else { - // Allow lookup to not have any peers and do nothing. This is an optimization to not - // lose progress of lookups created from a block with unknown parent before we receive - // attestations for said block. - // Lookup sync event safety: If a lookup requires peers to make progress, and does - // not receive any new peers for some time it will be dropped. If it receives a new - // peer it must attempt to make progress. - return Ok(LookupRequestResult::Pending("no peers")); - }; - - let span = span!( - Level::INFO, - "SyncNetworkContext", - service = "network_context" - ); - let _enter = span.enter(); - - let imported_blob_indexes = self - .chain - .data_availability_checker - .cached_blob_indexes(&block_root) - .unwrap_or_default(); - // Include only the blob indexes not yet imported (received through gossip) - let indices = (0..expected_blobs as u64) - .filter(|index| !imported_blob_indexes.contains(index)) - .collect::>(); - - if indices.is_empty() { - // No blobs required, do not issue any request - return Ok(LookupRequestResult::NoRequestNeeded("no indices to fetch")); - } - - let id = SingleLookupReqId { - lookup_id, - req_id: self.next_id(), - }; - - let request = BlobsByRootSingleBlockRequest { - block_root, - indices: indices.clone(), - }; - - // Lookup sync event safety: Refer to `Self::block_lookup_request` `network_send.send` call - self.network_send - .send(NetworkMessage::SendRequest { - 
peer_id, - request: RequestType::BlobsByRoot(request.clone().into_request(&self.fork_context)), - app_request_id: AppRequestId::Sync(SyncRequestId::SingleBlob { id }), - }) - .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; - - debug!( - method = "BlobsByRoot", - ?block_root, - blob_indices = ?indices, - peer = %peer_id, - %id, - "Sync RPC request sent" - ); - - self.blobs_by_root_requests.insert( - id, - peer_id, - // true = enforce max_requests are returned for blobs_by_root. We only issue requests for - // blocks after we know the block has data, and only request peers after they claim to - // have imported the block+blobs. - true, - BlobsByRootRequestItems::new(request), - ); - - Ok(LookupRequestResult::RequestSent(id.req_id)) + Ok(id) } /// Request to send a single `data_columns_by_root` request to the network. - pub fn data_column_lookup_request( + pub fn data_columns_by_root_request( &mut self, requester: DataColumnsByRootRequester, peer_id: PeerId, - request: DataColumnsByRootSingleBlockRequest, + block_root: Hash256, + indices: Vec, expect_max_responses: bool, - ) -> Result, &'static str> { + ) -> Result { let span = span!( Level::INFO, "SyncNetworkContext", @@ -869,24 +502,28 @@ impl SyncNetworkContext { let id = DataColumnsByRootRequestId { id: self.next_id(), - requester, + parent_request_id: requester, }; + let request = DataColumnsByRootRequest::new( + vec![DataColumnsByRootIdentifier { + block_root, + columns: RuntimeVariableList::from_vec(indices.clone(), usize::MAX), + }], + usize::MAX, + ); + self.send_network_msg(NetworkMessage::SendRequest { peer_id, - request: RequestType::DataColumnsByRoot( - request - .clone() - .try_into_request(self.fork_context.current_fork(), &self.chain.spec)?, - ), + request: RequestType::DataColumnsByRoot(request), app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRoot(id)), })?; debug!( method = "DataColumnsByRoot", - block_root = ?request.block_root, - indices = 
?request.indices, peer = %peer_id, + ?block_root, + ?indices, %id, "Sync RPC request sent" ); @@ -895,22 +532,22 @@ impl SyncNetworkContext { id, peer_id, expect_max_responses, - DataColumnsByRootRequestItems::new(request), + DataColumnsByRootRequestItems::new(block_root, indices), ); - Ok(LookupRequestResult::RequestSent(id)) + Ok(id) } /// Request to fetch all needed custody columns of a specific block. This function may not send /// any request to the network if no columns have to be fetched based on the import state of the /// node. A custody request is a "super request" that may trigger 0 or more `data_columns_by_root` /// requests. - pub fn custody_lookup_request( + pub fn send_custody_by_root_request( &mut self, - lookup_id: SingleLookupId, + parent_request_id: ComponentsByRootRequestId, block_root: Hash256, lookup_peers: Arc>>, - ) -> Result { + ) -> Result { let span = span!( Level::INFO, "SyncNetworkContext", @@ -918,44 +555,22 @@ impl SyncNetworkContext { ); let _enter = span.enter(); - let custody_indexes_imported = self - .chain - .data_availability_checker - .cached_data_column_indexes(&block_root) - .unwrap_or_default(); + let id = CustodyByRootRequestId { parent_request_id }; + debug!( + %id, + ?block_root, + peers = lookup_peers.read().len(), + "Starting custody columns request" + ); - // Include only the blob indexes not yet imported (received through gossip) - let custody_indexes_to_fetch = self + let custody_indices = self .network_globals() .sampling_columns() .into_iter() - .filter(|index| !custody_indexes_imported.contains(index)) .collect::>(); - if custody_indexes_to_fetch.is_empty() { - // No indexes required, do not issue any request - return Ok(LookupRequestResult::NoRequestNeeded("no indices to fetch")); - } - - let id = SingleLookupReqId { - lookup_id, - req_id: self.next_id(), - }; - - debug!( - ?block_root, - indices = ?custody_indexes_to_fetch, - %id, - "Starting custody columns request" - ); - - let requester = 
CustodyRequester(id); - let mut request = ActiveCustodyRequest::new( - block_root, - CustodyId { requester }, - &custody_indexes_to_fetch, - lookup_peers, - ); + let mut request = + ActiveCustodyByRootRequest::new(block_root, id, &custody_indices, lookup_peers); // Note that you can only send, but not handle a response here match request.continue_requests(self) { @@ -963,146 +578,103 @@ impl SyncNetworkContext { // Ignoring the result of `continue_requests` is okay. A request that has just been // created cannot return data immediately, it must send some request to the network // first. And there must exist some request, `custody_indexes_to_fetch` is not empty. - self.custody_by_root_requests.insert(requester, request); - Ok(LookupRequestResult::RequestSent(id.req_id)) + self.custody_by_root_requests.insert(id, request); + Ok(id) } - Err(e) => Err(match e { - CustodyRequestError::NoPeer(column_index) => { - RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer(column_index)) - } - // - TooManyFailures: Should never happen, `request` has just been created, it's - // count of download_failures is 0 here - // - BadState: Should never happen, a bad state can only happen when handling a - // network response - // - UnexpectedRequestId: Never happens: this Err is only constructed handling a - // download or processing response - // - SendFailed: Should never happen unless in a bad drop sequence when shutting - // down the node - e @ (CustodyRequestError::TooManyFailures - | CustodyRequestError::BadState { .. } - | CustodyRequestError::UnexpectedRequestId { .. } - | CustodyRequestError::SendFailed { .. 
}) => { - RpcRequestSendError::InternalError(format!("{e:?}")) - } - }), + Err(e) => Err(e.into()), } } - fn send_blocks_by_range_request( + pub fn send_blocks_by_root_request( &mut self, peer_id: PeerId, - request: BlocksByRangeRequest, - parent_request_id: ComponentsByRangeRequestId, - ) -> Result { - let id = BlocksByRangeRequestId { + block_root: Hash256, + parent_request_id: BlocksByRootRequester, + ) -> Result { + let id = BlocksByRootRequestId { id: self.next_id(), parent_request_id, }; + + let request = BlocksByRootRequest::new(vec![block_root], self.spec(), ForkName::Fulu); + + // Lookup sync event safety: If network_send.send() returns Ok(_) we are guaranteed that + // eventually at least one of these 3 events will be received: + // - StreamTermination(request_id): handled by `Self::on_blocks_by_root_response` + // - RPCError(request_id): handled by `Self::on_blocks_by_root_response` + // - Disconnect(peer_id): handled by `Self::peer_disconnected` which converts it to a + // `RPCError(request_id)` event handled by the above method self.network_send .send(NetworkMessage::SendRequest { peer_id, - request: RequestType::BlocksByRange(request.clone().into()), - app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRange(id)), + request: RequestType::BlocksByRoot(request), + app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRoot(id)), }) .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; debug!( - method = "BlocksByRange", - slots = request.count(), - epoch = %Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()), + method = "BlocksByRoot", peer = %peer_id, %id, "Sync RPC request sent" ); - self.blocks_by_range_requests.insert( + self.blocks_by_root_requests.insert( id, peer_id, - // false = do not enforce max_requests are returned for *_by_range methods. We don't - // know if there are missed blocks.
- false, - BlocksByRangeRequestItems::new(request), + // true = enforce max_requests are returned for blocks_by_root. We always request from + // peers that claim to have these blocks + true, + BlocksByRootRequestItems::new(block_root), ); Ok(id) } - fn send_blobs_by_range_request( + fn send_blobs_by_root_request( &mut self, peer_id: PeerId, - request: BlobsByRangeRequest, - parent_request_id: ComponentsByRangeRequestId, - ) -> Result { - let id = BlobsByRangeRequestId { + block_root: Hash256, + blobs_per_block: usize, + parent_request_id: ComponentsByRootRequestId, + ) -> Result { + let id = BlobsByRootRequestId { id: self.next_id(), parent_request_id, }; - let request_epoch = Slot::new(request.start_slot).epoch(T::EthSpec::slots_per_epoch()); + + let indices = (0..(blobs_per_block as u64)).collect::>(); + let blob_identifiers = indices + .iter() + .map(|index| BlobIdentifier { + block_root, + index: *index, + }) + .collect::>(); // Create the blob request based on the blocks request. self.network_send .send(NetworkMessage::SendRequest { peer_id, - request: RequestType::BlobsByRange(request.clone()), - app_request_id: AppRequestId::Sync(SyncRequestId::BlobsByRange(id)), + request: RequestType::BlobsByRoot(BlobsByRootRequest { + blob_ids: RuntimeVariableList::new(blob_identifiers, usize::MAX).unwrap(), + }), + app_request_id: AppRequestId::Sync(SyncRequestId::BlobsByRoot(id)), }) .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; debug!( - method = "BlobsByRange", - slots = request.count, - epoch = %request_epoch, - peer = %peer_id, - %id, - "Sync RPC request sent" - ); - - let max_blobs_per_block = self.chain.spec.max_blobs_per_block(request_epoch); - self.blobs_by_range_requests.insert( - id, - peer_id, - // false = do not enforce max_requests are returned for *_by_range methods. We don't - // know if there are missed blocks.
- false, - BlobsByRangeRequestItems::new(request, max_blobs_per_block), - ); - Ok(id) - } - - fn send_data_columns_by_range_request( - &mut self, - peer_id: PeerId, - request: DataColumnsByRangeRequest, - parent_request_id: ComponentsByRangeRequestId, - ) -> Result { - let id = DataColumnsByRangeRequestId { - id: self.next_id(), - parent_request_id, - }; - - self.send_network_msg(NetworkMessage::SendRequest { - peer_id, - request: RequestType::DataColumnsByRange(request.clone()), - app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), - }) - .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; - - debug!( - method = "DataColumnsByRange", - slots = request.count, - epoch = %Slot::new(request.start_slot).epoch(T::EthSpec::slots_per_epoch()), - columns = ?request.columns, + method = "BlobsByRoot", peer = %peer_id, %id, "Sync RPC request sent" ); - self.data_columns_by_range_requests.insert( + self.blobs_by_root_requests.insert( id, peer_id, - // false = do not enforce max_requests are returned for *_by_range methods. We don't - // know if there are missed blocks. - false, - DataColumnsByRangeRequestItems::new(request), + // true = we know exactly how many blobs total we expect + true, + BlobsByRootRequestItems::new(block_root, indices), ); Ok(id) } @@ -1211,40 +783,12 @@ impl SyncNetworkContext { id } - /// Check whether a batch for this epoch (and only this epoch) should request just blocks or - /// blocks and blobs. - fn batch_type(&self, epoch: types::Epoch) -> ByRangeRequestType { - // Induces a compile time panic if this doesn't hold true. 
- #[allow(clippy::assertions_on_constants)] - const _: () = assert!( - super::backfill_sync::BACKFILL_EPOCHS_PER_BATCH == 1 - && super::range_sync::EPOCHS_PER_BATCH == 1, - "To deal with alignment with deneb boundaries, batches need to be of just one epoch" - ); - - if self - .chain - .data_availability_checker - .data_columns_required_for_epoch(epoch) - { - ByRangeRequestType::BlocksAndColumns - } else if self - .chain - .data_availability_checker - .blobs_required_for_epoch(epoch) - { - ByRangeRequestType::BlocksAndBlobs - } else { - ByRangeRequestType::Blocks - } - } - /// Attempt to make progress on all custody_by_root requests. Some request may be stale waiting /// for custody peers. Returns a Vec of results as zero or more requests may fail in this /// attempt. pub fn continue_custody_by_root_requests( &mut self, - ) -> Vec<(CustodyRequester, CustodyByRootResult)> { + ) -> Vec<(CustodyByRootRequestId, CustodyRequestResult)> { let ids = self .custody_by_root_requests .keys() @@ -1258,7 +802,10 @@ impl SyncNetworkContext { .custody_by_root_requests .remove(&id) .expect("key of hashmap"); - let result = request.continue_requests(self); + let result = request + .continue_requests(self) + .map_err(Into::::into) + .transpose(); self.handle_custody_by_root_result(id, request, result) .map(|result| (id, result)) }) @@ -1267,132 +814,69 @@ impl SyncNetworkContext { // Request handlers - pub(crate) fn on_single_block_response( + /// Processes a single `RpcEvent` for a blocks_by_root RPC request. 
+ /// - If the event completes the request, it returns `Some(Ok)` with a vec of blocks + /// - If the event is an error it fails the request and returns `Some(Err)` + /// - else it appends the response chunk to the active request state and returns `None` + #[allow(clippy::type_complexity)] + pub(crate) fn on_blocks_by_root_response( &mut self, - id: SingleLookupReqId, + id: BlocksByRootRequestId, peer_id: PeerId, rpc_event: RpcEvent>>, - ) -> Option>>> { + ) -> Option>>>> { let resp = self.blocks_by_root_requests.on_response(id, rpc_event); - let resp = resp.map(|res| { - res.and_then(|(mut blocks, seen_timestamp)| { - // Enforce that exactly one chunk = one block is returned. ReqResp behavior limits the - // response count to at most 1. - match blocks.pop() { - Some(block) => Ok((block, seen_timestamp)), - // Should never happen, `blocks_by_root_requests` enforces that we receive at least - // 1 chunk. - None => Err(LookupVerifyError::NotEnoughResponsesReturned { actual: 0 }.into()), - } - }) - }); - self.on_rpc_response_result(id, "BlocksByRoot", resp, peer_id, |_| 1) + self.on_rpc_response_result(resp, peer_id) } - pub(crate) fn on_single_blob_response( + /// Processes a single `RpcEvent` blobs_by_root RPC request. 
+ /// Same logic as [`on_blocks_by_root_response`] + #[allow(clippy::type_complexity)] + pub(crate) fn on_blobs_by_root_response( &mut self, - id: SingleLookupReqId, + id: BlobsByRootRequestId, peer_id: PeerId, rpc_event: RpcEvent>>, - ) -> Option>> { + ) -> Option>>>> { let resp = self.blobs_by_root_requests.on_response(id, rpc_event); - let resp = resp.map(|res| { - res.and_then(|(blobs, seen_timestamp)| { - if let Some(max_len) = blobs - .first() - .map(|blob| self.chain.spec.max_blobs_per_block(blob.epoch()) as usize) - { - match to_fixed_blob_sidecar_list(blobs, max_len) { - Ok(blobs) => Ok((blobs, seen_timestamp)), - Err(e) => Err(e.into()), - } - } else { - Err(RpcResponseError::VerifyError( - LookupVerifyError::InternalError( - "Requested blobs for a block that has no blobs".to_string(), - ), - )) - } - }) - }); - self.on_rpc_response_result(id, "BlobsByRoot", resp, peer_id, |_| 1) + self.on_rpc_response_result(resp, peer_id) } + /// Processes a single `RpcEvent` for a data_columns_by_root RPC request. 
+ /// Same logic as [`on_blocks_by_root_response`] #[allow(clippy::type_complexity)] pub(crate) fn on_data_columns_by_root_response( &mut self, id: DataColumnsByRootRequestId, peer_id: PeerId, rpc_event: RpcEvent>>, - ) -> Option>>>> { + ) -> Option>> { let resp = self .data_columns_by_root_requests .on_response(id, rpc_event); - self.on_rpc_response_result(id, "DataColumnsByRoot", resp, peer_id, |_| 1) - } - - #[allow(clippy::type_complexity)] - pub(crate) fn on_blocks_by_range_response( - &mut self, - id: BlocksByRangeRequestId, - peer_id: PeerId, - rpc_event: RpcEvent>>, - ) -> Option>>>> { - let resp = self.blocks_by_range_requests.on_response(id, rpc_event); - self.on_rpc_response_result(id, "BlocksByRange", resp, peer_id, |b| b.len()) - } - - #[allow(clippy::type_complexity)] - pub(crate) fn on_blobs_by_range_response( - &mut self, - id: BlobsByRangeRequestId, - peer_id: PeerId, - rpc_event: RpcEvent>>, - ) -> Option>>>> { - let resp = self.blobs_by_range_requests.on_response(id, rpc_event); - self.on_rpc_response_result(id, "BlobsByRangeRequest", resp, peer_id, |b| b.len()) + self.on_rpc_response_result(resp, peer_id) } + /// Processes a single `RpcEvent` for a data_columns_by_root RPC request. + /// Same logic as [`on_blocks_by_root_response`] #[allow(clippy::type_complexity)] - pub(crate) fn on_data_columns_by_range_response( + pub(crate) fn on_headers_by_root_response( &mut self, - id: DataColumnsByRangeRequestId, + id: HeadersByRootRequestId, peer_id: PeerId, - rpc_event: RpcEvent>>, - ) -> Option>> { - let resp = self - .data_columns_by_range_requests - .on_response(id, rpc_event); - self.on_rpc_response_result(id, "DataColumnsByRange", resp, peer_id, |d| d.len()) + rpc_event: RpcEvent, + ) -> Option>> { + let resp = self.headers_by_root_requests.on_response(id, rpc_event); + self.on_rpc_response_result(resp, peer_id) } - fn on_rpc_response_result usize>( + /// Common logic for `on_*_response` handlers. 
Ensures we have consistent logging and metrics + /// and peer reporting for all request types. + fn on_rpc_response_result( &mut self, - id: I, - method: &'static str, resp: Option>, peer_id: PeerId, - get_count: F, ) -> Option> { - match &resp { - None => {} - Some(Ok((v, _))) => { - debug!( - %id, - method, - count = get_count(v), - "Sync RPC request completed" - ); - } - Some(Err(e)) => { - debug!( - %id, - method, - error = ?e, - "Sync RPC request error" - ); - } - } if let Some(Err(RpcResponseError::VerifyError(e))) = &resp { self.report_peer(peer_id, PeerAction::LowToleranceError, e.into()); } @@ -1409,11 +893,11 @@ impl SyncNetworkContext { #[allow(clippy::type_complexity)] pub fn on_custody_by_root_response( &mut self, - id: CustodyId, + id: CustodyByRootRequestId, req_id: DataColumnsByRootRequestId, peer_id: PeerId, - resp: RpcResponseResult>>>, - ) -> Option> { + resp: RpcResponseResult>, + ) -> Option> { let span = span!( Level::INFO, "SyncNetworkContext", @@ -1423,23 +907,28 @@ impl SyncNetworkContext { // Note: need to remove the request to borrow self again below. 
Otherwise we can't // do nested requests - let Some(mut request) = self.custody_by_root_requests.remove(&id.requester) else { - // TOOD(das): This log can happen if the request is error'ed early and dropped - debug!(?id, "Custody column downloaded event for unknown request"); + let Some(mut request) = self.custody_by_root_requests.remove(&id) else { + metrics::inc_counter_vec( + &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, + &["custody_by_root"], + ); return None; }; - let result = request.on_data_column_downloaded(peer_id, req_id, resp, self); + let result = request + .on_data_column_downloaded(peer_id, req_id, resp, self) + .map_err(Into::::into) + .transpose(); - self.handle_custody_by_root_result(id.requester, request, result) + self.handle_custody_by_root_result(id, request, result) } fn handle_custody_by_root_result( &mut self, - id: CustodyRequester, - request: ActiveCustodyRequest, - result: CustodyRequestResult, - ) -> Option> { + id: CustodyByRootRequestId, + request: ActiveCustodyByRootRequest, + result: Option>, + ) -> Option> { let span = span!( Level::INFO, "SyncNetworkContext", @@ -1447,18 +936,12 @@ impl SyncNetworkContext { ); let _enter = span.enter(); - let result = result - .map_err(RpcResponseError::CustodyRequestError) - .transpose(); - - // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to - // an Option first to use in an `if let Some() { act on result }` block. 
- match result.as_ref() { + match &result { Some(Ok((columns, peer_group, _))) => { - debug!(?id, count = columns.len(), peers = ?peer_group, "Custody request success, removing") + debug!(%id, count = columns.len(), peers = ?peer_group, "Custody by root request success, removing") } Some(Err(e)) => { - debug!(?id, error = ?e, "Custody request failure, removing" ) + debug!(%id, error = ?e, "Custody by root request failure, removing") } None => { self.custody_by_root_requests.insert(id, request); @@ -1467,116 +950,76 @@ impl SyncNetworkContext { result } - pub fn send_block_for_processing( - &self, - id: Id, - block_root: Hash256, - block: Arc>, - seen_timestamp: Duration, - ) -> Result<(), SendErrorProcessor> { - let span = span!( - Level::INFO, - "SyncNetworkContext", - service = "network_context" - ); - let _enter = span.enter(); - - let beacon_processor = self - .beacon_processor_if_enabled() - .ok_or(SendErrorProcessor::ProcessorNotAvailable)?; - - let block = RpcBlock::new_without_blobs(Some(block_root), block); - - debug!(block = ?block_root, id, "Sending block for processing"); - // Lookup sync event safety: If `beacon_processor.send_rpc_beacon_block` returns Ok() sync - // must receive a single `SyncMessage::BlockComponentProcessed` with this process type - beacon_processor - .send_rpc_beacon_block( - block_root, - block, - seen_timestamp, - BlockProcessType::SingleBlock { id }, - ) - .map_err(|e| { - error!( - error = ?e, - "Failed to send sync block to processor" - ); - SendErrorProcessor::SendError - }) - } - - pub fn send_blobs_for_processing( - &self, - id: Id, - block_root: Hash256, - blobs: FixedBlobSidecarList, - seen_timestamp: Duration, - ) -> Result<(), SendErrorProcessor> { - let span = span!( - Level::INFO, - "SyncNetworkContext", - service = "network_context" - ); - let _enter = span.enter(); - - let beacon_processor = self - .beacon_processor_if_enabled() - .ok_or(SendErrorProcessor::ProcessorNotAvailable)?; - - debug!(?block_root, ?id, 
"Sending blobs for processing"); - // Lookup sync event safety: If `beacon_processor.send_rpc_blobs` returns Ok() sync - // must receive a single `SyncMessage::BlockComponentProcessed` event with this process type - beacon_processor - .send_rpc_blobs( - block_root, - blobs, - seen_timestamp, - BlockProcessType::SingleBlob { id }, - ) - .map_err(|e| { - error!( - error = ?e, - "Failed to send sync blobs to processor" - ); - SendErrorProcessor::SendError - }) - } - - pub fn send_custody_columns_for_processing( - &self, - _id: Id, - block_root: Hash256, - custody_columns: DataColumnSidecarList, - seen_timestamp: Duration, - process_type: BlockProcessType, - ) -> Result<(), SendErrorProcessor> { - let span = span!( - Level::INFO, - "SyncNetworkContext", - service = "network_context" - ); - let _enter = span.enter(); - - let beacon_processor = self - .beacon_processor_if_enabled() - .ok_or(SendErrorProcessor::ProcessorNotAvailable)?; + /// Processes the result of an `*_by_range` RPC request issued by a + /// block_components_by_range_request. + /// + /// - If the result completes the request, it returns `Some(Ok)` with a vec of coupled RpcBlocks + /// - If the result fails the request, it returns `Some(Err)`. Note that a failed request may + /// not fail the block_components_by_range_request as it implements retries. + /// - else it appends the result to the active request state and returns `None` + #[allow(clippy::type_complexity)] + pub fn on_block_components_by_root_response( + &mut self, + id: ComponentsByRootRequestId, + range_block_component: RangeBlockComponent, + ) -> Option, BatchPeers), RpcResponseError>> { + // Note: need to remove the request to borrow self again below. 
Otherwise we can't + // do nested requests + let Some(mut request) = self.block_components_by_root_requests.remove(&id) else { + metrics::inc_counter_vec( + &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, + &["block_components_by_range"], + ); + return None; + }; - debug!( - ?block_root, - ?process_type, - "Sending custody columns for processing" - ); + let result = match range_block_component { + RangeBlockComponent::Block(req_id, resp, peer_id) => resp.and_then(|(blocks, _)| { + let block = blocks.first().ok_or(RpcResponseError::InternalError( + "blocks_by_root returned zero blocks".to_owned(), + ))?; + request + .on_blocks_by_root_result(req_id, block.clone(), peer_id, self) + .map_err(Into::::into) + }), + RangeBlockComponent::Blob(req_id, resp, peer_id) => resp.and_then(|(blobs, _)| { + request + .on_blobs_by_root_result(req_id, blobs, peer_id, self) + .map_err(Into::::into) + }), + RangeBlockComponent::CustodyColumns(req_id, resp) => { + resp.and_then(|(custody_columns, peers, _)| { + request + .on_custody_by_root_result(req_id, custody_columns, peers, self) + .map_err(Into::::into) + }) + } + } + // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to + // an Option first to use in an `if let Some() { act on result }` block. + .transpose(); - beacon_processor - .send_rpc_custody_columns(block_root, custody_columns, seen_timestamp, process_type) - .map_err(|e| { - error!( - error = ?e, - "Failed to send sync custody columns to processor" - ); - SendErrorProcessor::SendError - }) + match result.as_ref() { + Some(Ok((block, peer_group))) => { + // Don't log the peer_group here, it's very long (could be up to 128 peers). 
If you + // want to trace which peer sent the column at index X, search for the log: + // `Sync RPC request sent method="DataColumnsByRoot" ...` + debug!( + %id, + slot = %block.as_block().slot(), + block_has_data = block.as_block().has_data(), + block_peer = ?peer_group.block(), + "Block components by range request success, removing" + ) + } + Some(Err(e)) => { + debug!(%id, error = ?e, "Block components by range request failure, removing" ) + } + None => { + self.block_components_by_root_requests.insert(id, request); + } + } + result } pub(crate) fn register_metrics(&self) { @@ -1587,33 +1030,13 @@ impl SyncNetworkContext { "data_columns_by_root", self.data_columns_by_root_requests.len(), ), - ("blocks_by_range", self.blocks_by_range_requests.len()), - ("blobs_by_range", self.blobs_by_range_requests.len()), - ( - "data_columns_by_range", - self.data_columns_by_range_requests.len(), - ), ("custody_by_root", self.custody_by_root_requests.len()), ( - "components_by_range", - self.components_by_range_requests.len(), + "block_components_by_root", + self.block_components_by_root_requests.len(), ), ] { metrics::set_gauge_vec(&metrics::SYNC_ACTIVE_NETWORK_REQUESTS, &[id], count as i64); } } } - -fn to_fixed_blob_sidecar_list( - blobs: Vec>>, - max_len: usize, -) -> Result, LookupVerifyError> { - let mut fixed_list = FixedBlobSidecarList::new(vec![None; max_len]); - for blob in blobs.into_iter() { - let index = blob.index as usize; - *fixed_list - .get_mut(index) - .ok_or(LookupVerifyError::UnrequestedIndex(index as u64))? 
= Some(blob) - } - Ok(fixed_list) -} diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs new file mode 100644 index 00000000000..a2a2387e2b2 --- /dev/null +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -0,0 +1,438 @@ +use crate::sync::network_context::{ + BatchPeers, PeerGroup, RpcRequestSendError, RpcResponseError, SyncNetworkContext, +}; +use beacon_chain::block_verification_types::RpcBlock; +use beacon_chain::data_column_verification::CustodyDataColumn; +use beacon_chain::{get_block_root, BeaconChainTypes}; +use lighthouse_network::service::api_types::{ + BlobsByRootRequestId, BlocksByRootRequestId, BlocksByRootRequester, ComponentsByRootRequestId, + CustodyByRootRequestId, +}; +use lighthouse_network::PeerId; +use parking_lot::RwLock; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use types::{ + BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecarList, EthSpec, Hash256, + RuntimeVariableList, SignedBeaconBlock, +}; + +/// Given a `BlocksByRootRequest` (a collection of block roots) fetches all necessary data to +/// return potentially available RpcBlocks. +/// +/// See [`State`] for the set of `*_by_root` it may issue depending on the fork. +pub struct BlockComponentsByRootRequest { + id: ComponentsByRootRequestId, + peers: Arc>>, + block_root: Hash256, + state: State, +} + +// Request blocks first, then columns. Assuming the block peer is honest we can attribute +// custody failures to the peers serving us columns. 
We want to get rid of the honest block +// peer assumption in the future, see https://github.com/sigp/lighthouse/issues/6258 +enum State { + BlocksRequest { + blocks_request: Request>>, + }, + DataRequest { + block: Arc>, + block_peer: PeerId, + data_request: DataRequest, + }, +} + +enum DataRequest { + Deneb { + blobs_request: Request>>>, + }, + Fulu { + custody_request: Request, PeerGroup>, + }, +} + +enum Request { + /// Active(RequestIndex) + Active(I), + /// Complete(DownloadedData, Peers) + Complete(T, P), +} + +pub type BlockComponentsByRootRequestResult = Result, BatchPeers)>, Error>; + +pub enum Error { + InternalError(String), +} + +impl From for RpcResponseError { + fn from(e: Error) -> Self { + match e { + Error::InternalError(e) => RpcResponseError::InternalError(e), + } + } +} + +impl From for RpcRequestSendError { + fn from(e: Error) -> Self { + match e { + Error::InternalError(e) => RpcRequestSendError::InternalError(e), + } + } +} + +/// Used to typesafe assertions of state in range sync tests +#[cfg(test)] +#[derive(Debug)] +pub enum BlockComponentsByRootRequestStep { + BlocksRequest, + CustodyRequest, +} + +impl BlockComponentsByRootRequest { + pub fn new( + id: ComponentsByRootRequestId, + block_root: Hash256, + peers: Arc>>, + peers_to_deprioritize: &HashSet, + cx: &mut SyncNetworkContext, + ) -> Result { + // TODO(das): a change of behaviour here is that if the SyncingChain has a single peer we + // will request all blocks for the first 5 epochs to that same single peer. Before we would + // query only idle peers in the syncing chain. 
+ let Some(block_peer) = peers + .read() + .iter() + .map(|peer| { + ( + // If contains -> 1 (order after), not contains -> 0 (order first) + peers_to_deprioritize.contains(peer), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::random::(), + peer, + ) + }) + .min() + .map(|(_, _, peer)| *peer) + else { + // When a peer disconnects and is removed from the SyncingChain peer set, if the set + // reaches zero the SyncingChain is removed. + return Err(RpcRequestSendError::NoPeers); + }; + + let blocks_req_id = cx.send_blocks_by_root_request( + block_peer, + block_root, + BlocksByRootRequester::ForwardSync(id), + )?; + + let state = State::BlocksRequest { + blocks_request: Request::Active(blocks_req_id), + }; + + Ok(Self { + id, + peers, + block_root, + state, + }) + } + + pub fn continue_requests( + &mut self, + cx: &mut SyncNetworkContext, + ) -> BlockComponentsByRootRequestResult { + match &mut self.state { + State::BlocksRequest { + blocks_request: blocks_by_range_request, + } => { + if let Some((block, block_peer)) = blocks_by_range_request.to_finished() { + let fork = cx.spec().fork_name_at_slot::(block.slot()); + let block_has_data = block.has_data(); + + if block_has_data && fork.fulu_enabled() { + let mut column_indices = cx + .network_globals() + .sampling_columns() + .iter() + .copied() + .collect::>(); + column_indices.sort_unstable(); + + let req_id = cx + .send_custody_by_root_request( + self.id, + self.block_root, + self.peers.clone(), + ) + .map_err(|e| match e { + RpcRequestSendError::InternalError(e) => Error::InternalError(e), + RpcRequestSendError::NoPeers => Error::InternalError( + "send_custody_by_range_request does not error with NoPeers" + .to_owned(), + ), + })?; + + self.state = State::DataRequest { + block: block.clone(), + block_peer: *block_peer, + data_request: DataRequest::Fulu { + custody_request: Request::Active(req_id), + }, + }; + Ok(None) + } else if block_has_data && fork.deneb_enabled() { + // TODO(deneb): 
is it okay to send blobs_by_range requests outside the DA window? I + // would like the beacon processor / da_checker to be the one that decides if an + // RpcBlock is valid or not with respect to containing blobs. Having sync not even + // attempt a requests seems like an added limitation. + let req_id = cx + .send_blobs_by_root_request( + *block_peer, + self.block_root, + block.num_expected_blobs(), + self.id, + ) + .map_err(|e| match e { + RpcRequestSendError::InternalError(e) => Error::InternalError(e), + RpcRequestSendError::NoPeers => Error::InternalError( + "send_custody_by_range_request does not error with NoPeers" + .to_owned(), + ), + })?; + + self.state = State::DataRequest { + block: block.clone(), + block_peer: *block_peer, + data_request: DataRequest::Deneb { + blobs_request: Request::Active(req_id), + }, + }; + Ok(None) + } else { + let peer_group = BatchPeers::new_from_block_peer(*block_peer); + let rpc_block = couple_block_base(block.clone()); + Ok(Some((rpc_block, peer_group))) + } + } else { + // Wait for blocks_by_range requests to complete + Ok(None) + } + } + State::DataRequest { + block, + block_peer, + data_request, + } => match data_request { + DataRequest::Deneb { + blobs_request: blobs_by_range_request, + } => { + if let Some((blobs, _)) = blobs_by_range_request.to_finished() { + // We use the same block_peer for the blobs request + let peer_group = BatchPeers::new_from_block_peer(*block_peer); + let rpc_block = + couple_block_deneb(block.clone(), blobs.to_vec(), cx.spec())?; + Ok(Some((rpc_block, peer_group))) + } else { + // Wait for blocks_by_range and blobs_by_range requests to complete + Ok(None) + } + } + DataRequest::Fulu { + custody_request: custody_by_range_request, + } => { + if let Some((columns, column_peers)) = custody_by_range_request.to_finished() { + let custody_column_indices = cx + .network_globals() + .sampling_columns() + .iter() + .copied() + .collect(); + + let peer_group = BatchPeers::new(*block_peer, 
column_peers.clone()); + let rpc_block = couple_block_fulu( + block.clone(), + columns.to_vec(), + custody_column_indices, + cx.spec(), + )?; + Ok(Some((rpc_block, peer_group))) + } else { + // Wait for the custody_by_range request to complete + Ok(None) + } + } + }, + } + } + + pub fn on_blocks_by_root_result( + &mut self, + id: BlocksByRootRequestId, + data: Arc>, + peer_id: PeerId, + cx: &mut SyncNetworkContext, + ) -> BlockComponentsByRootRequestResult { + match &mut self.state { + State::BlocksRequest { blocks_request } => { + blocks_request.finish(id, data, peer_id)?; + } + _ => { + return Err(Error::InternalError( + "Received unexpected blocks_by_range response".to_string(), + )) + } + } + + self.continue_requests(cx) + } + + pub fn on_blobs_by_root_result( + &mut self, + id: BlobsByRootRequestId, + data: Vec>>, + peer_id: PeerId, + cx: &mut SyncNetworkContext, + ) -> BlockComponentsByRootRequestResult { + match &mut self.state { + State::DataRequest { + data_request: DataRequest::Deneb { blobs_request }, + .. + } => { + blobs_request.finish(id, data, peer_id)?; + } + _ => { + return Err(Error::InternalError( + "Received unexpected blobs_by_range response".to_string(), + )) + } + } + + self.continue_requests(cx) + } + + pub fn on_custody_by_root_result( + &mut self, + id: CustodyByRootRequestId, + data: DataColumnSidecarList, + peers: PeerGroup, + cx: &mut SyncNetworkContext, + ) -> BlockComponentsByRootRequestResult { + match &mut self.state { + State::DataRequest { + data_request: DataRequest::Fulu { custody_request }, + .. + } => { + custody_request.finish(id, data, peers)?; + } + _ => { + return Err(Error::InternalError( + "Received unexpected custody_by_range response".to_string(), + )) + } + } + + self.continue_requests(cx) + } + + #[cfg(test)] + pub fn state_step(&self) -> BlockComponentsByRootRequestStep { + match &self.state { + State::BlocksRequest { .. } => BlockComponentsByRootRequestStep::BlocksRequest, + State::DataRequest { .. 
} => BlockComponentsByRootRequestStep::CustodyRequest, + } + } +} + +fn couple_block_base(block: Arc>) -> RpcBlock { + RpcBlock::new_without_blobs(None, block) +} + +fn couple_block_deneb( + block: Arc>, + blobs: Vec>>, + spec: &ChainSpec, +) -> Result, Error> { + let mut blobs_by_block = HashMap::>>>::new(); + for blob in blobs { + let block_root = blob.block_root(); + blobs_by_block.entry(block_root).or_default().push(blob); + } + + // Now collect all blobs that match to the block by block root. BlobsByRange request checks + // the inclusion proof so we know that the commitment is the expected. + // + // BlobsByRange request handler ensures that we don't receive more blobs than possible. + // If the peer serving the request sends us blobs that don't pair well we'll send to the + // processor blocks without expected blobs, resulting in a downscoring event. A serving peer + // could serve fake blobs for blocks that don't have data, but it would gain nothing by it + // wasting theirs and our bandwidth 1:1. Therefore blobs that don't pair well are just ignored. 
+ // + // RpcBlock::new ensures that the count of blobs is consistent with the block + let block_root = get_block_root(&block); + let max_blobs_per_block = spec.max_blobs_per_block(block.epoch()) as usize; + let blobs = blobs_by_block.remove(&block_root).unwrap_or_default(); + // BlobsByRange request handler enforces that blobs are sorted by index + let blobs = RuntimeVariableList::new(blobs, max_blobs_per_block) + .map_err(|_| Error::InternalError("Blobs returned exceeds max length".to_string()))?; + Ok(RpcBlock::new(Some(block_root), block, Some(blobs)).expect("TODO: don't do matching here")) +} + +fn couple_block_fulu( + block: Arc>, + data_columns: DataColumnSidecarList, + custody_column_indices: Vec, + spec: &ChainSpec, +) -> Result, Error> { + // Group data columns by block_root and index + let mut custody_columns_by_block = HashMap::>>::new(); + + for column in data_columns { + let block_root = column.block_root(); + + if custody_column_indices.contains(&column.index) { + custody_columns_by_block + .entry(block_root) + .or_default() + // Safe to convert to `CustodyDataColumn`: we have asserted that the index of + // this column is in the set of `expects_custody_columns` and with the expected + // block root, so for the expected epoch of this batch. 
+ .push(CustodyDataColumn::from_asserted_custody(column)); + } + } + + // Now iterate all blocks ensuring that the block roots of each block and data column match, + let block_root = get_block_root(&block); + let data_columns_with_block_root = custody_columns_by_block + // Remove to only use columns once + .remove(&block_root) + .unwrap_or_default(); + + RpcBlock::new_with_custody_columns(Some(block_root), block, data_columns_with_block_root, spec) + .map_err(Error::InternalError) +} + +impl Request { + fn finish(&mut self, id: I, data: T, peer_id: P) -> Result<(), Error> { + match self { + Self::Active(expected_id) => { + if expected_id != &id { + return Err(Error::InternalError(format!( + "unexpected req_id expected {expected_id} got {id}" + ))); + } + *self = Self::Complete(data, peer_id); + Ok(()) + } + Self::Complete(_, _) => Err(Error::InternalError(format!( + "request already complete {id}" + ))), + } + } + + fn to_finished(&self) -> Option<(&T, &P)> { + match self { + Self::Active(_) => None, + Self::Complete(data, peer_id) => Some((data, peer_id)), + } + } +} diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody_by_root.rs similarity index 57% rename from beacon_node/network/src/sync/network_context/custody.rs rename to beacon_node/network/src/sync/network_context/custody_by_root.rs index f4d010b881e..faf30f3f67f 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_root.rs @@ -1,11 +1,14 @@ +use crate::sync::network_context::download_request::{ + DownloadRequest, Error as DownloadRequestError, +}; use crate::sync::network_context::{ - DataColumnsByRootRequestId, DataColumnsByRootSingleBlockRequest, + DataColumnsByRootRequestId, RpcRequestSendError, RpcResponseError, }; use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::BeaconChainTypes; use fnv::FnvHashMap; -use 
lighthouse_network::service::api_types::{CustodyId, DataColumnsByRootRequester}; -use lighthouse_network::PeerId; +use lighthouse_network::service::api_types::{CustodyByRootRequestId, DataColumnsByRootRequester}; +use lighthouse_network::{PeerAction, PeerId}; use lru_cache::LRUTimeCache; use parking_lot::RwLock; use rand::Rng; @@ -13,21 +16,26 @@ use std::collections::HashSet; use std::time::{Duration, Instant}; use std::{collections::HashMap, marker::PhantomData, sync::Arc}; use tracing::{debug, warn}; -use types::EthSpec; -use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, Hash256}; +use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Hash256}; -use super::{LookupRequestResult, PeerGroup, RpcResponseResult, SyncNetworkContext}; +use super::{PeerGroup, RpcResponseResult, SyncNetworkContext}; const FAILED_PEERS_CACHE_EXPIRY_SECONDS: u64 = 5; -const MAX_STALE_NO_PEERS_DURATION: Duration = Duration::from_secs(30); - -type DataColumnSidecarList = Vec>>; +const REQUEST_EXPIRY_SECONDS: u64 = 300; +/// TODO(das): Reconsider this retry count, it was choosen as a placeholder value. 
Each +/// `custody_by_*` request is already retried multiple inside of a lookup or batch +const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; -pub struct ActiveCustodyRequest { +pub struct ActiveCustodyByRootRequest { + start_time: Instant, block_root: Hash256, - custody_id: CustodyId, + custody_id: CustodyByRootRequestId, /// List of column indices this request needs to download to complete successfully - column_requests: FnvHashMap>, + #[allow(clippy::type_complexity)] + column_requests: FnvHashMap< + ColumnIndex, + DownloadRequest>>, + >, /// Active requests for 1 or more columns each active_batch_columns_requests: FnvHashMap, @@ -36,50 +44,81 @@ pub struct ActiveCustodyRequest { failed_peers: LRUTimeCache, /// Set of peers that claim to have imported this block and their custody columns lookup_peers: Arc>>, + /// Log that request is idle once + logged_idle_request: bool, _phantom: PhantomData, } -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug)] pub enum Error { - SendFailed(&'static str), - TooManyFailures, - BadState(String), - NoPeer(ColumnIndex), - /// Received a download result for a different request id than the in-flight request. - /// There should only exist a single request at a time. Having multiple requests is a bug and - /// can result in undefined state, so it's treated as a hard error and the lookup is dropped. 
- UnexpectedRequestId { - expected_req_id: DataColumnsByRootRequestId, - req_id: DataColumnsByRootRequestId, - }, + InternalError(String), + TooManyDownloadErrors(RpcResponseError), + ExpiredNoCustodyPeers(Vec), +} + +impl From for RpcResponseError { + fn from(e: Error) -> Self { + match e { + Error::InternalError(e) => RpcResponseError::InternalError(e), + Error::TooManyDownloadErrors(e) => e, + Error::ExpiredNoCustodyPeers(indices) => RpcResponseError::RequestExpired(format!( + "Expired waiting for custody peers {indices:?}" + )), + } + } +} + +impl From for RpcRequestSendError { + fn from(e: Error) -> Self { + match e { + Error::TooManyDownloadErrors(_) => { + RpcRequestSendError::InternalError("Download error in request send".to_string()) + } + Error::InternalError(e) => RpcRequestSendError::InternalError(e), + Error::ExpiredNoCustodyPeers(_) => RpcRequestSendError::InternalError( + "Request can not expire when requesting it".to_string(), + ), + } + } +} + +impl From for Error { + fn from(e: DownloadRequestError) -> Self { + match e { + DownloadRequestError::InternalError(e) => Self::InternalError(e), + DownloadRequestError::TooManyErrors(e) => Self::TooManyDownloadErrors(e), + } + } } struct ActiveBatchColumnsRequest { indices: Vec, } -pub type CustodyRequestResult = +pub type CustodyByRootRequestResult = Result, PeerGroup, Duration)>, Error>; -impl ActiveCustodyRequest { +impl ActiveCustodyByRootRequest { pub(crate) fn new( block_root: Hash256, - custody_id: CustodyId, + custody_id: CustodyByRootRequestId, column_indices: &[ColumnIndex], lookup_peers: Arc>>, ) -> Self { Self { + start_time: Instant::now(), block_root, custody_id, column_requests: HashMap::from_iter( column_indices .iter() - .map(|index| (*index, ColumnRequest::new())), + .map(|index| (*index, DownloadRequest::new())), ), active_batch_columns_requests: <_>::default(), failed_peers: LRUTimeCache::new(Duration::from_secs(FAILED_PEERS_CACHE_EXPIRY_SECONDS)), lookup_peers, + logged_idle_request: 
false, _phantom: PhantomData, } } @@ -98,10 +137,9 @@ impl ActiveCustodyRequest { req_id: DataColumnsByRootRequestId, resp: RpcResponseResult>, cx: &mut SyncNetworkContext, - ) -> CustodyRequestResult { + ) -> CustodyByRootRequestResult { let Some(batch_request) = self.active_batch_columns_requests.get_mut(&req_id) else { warn!( - block_root = ?self.block_root, %req_id, "Received custody column response for unrequested index" ); @@ -111,7 +149,6 @@ impl ActiveCustodyRequest { match resp { Ok((data_columns, seen_timestamp)) => { debug!( - block_root = ?self.block_root, %req_id, %peer_id, count = data_columns.len(), @@ -131,7 +168,7 @@ impl ActiveCustodyRequest { let column_request = self .column_requests .get_mut(column_index) - .ok_or(Error::BadState("unknown column_index".to_owned()))?; + .ok_or(Error::InternalError("unknown column_index".to_owned()))?; if let Some(data_column) = data_columns.remove(column_index) { column_request.on_download_success( @@ -147,7 +184,7 @@ impl ActiveCustodyRequest { // TODO(das): Should track which columns are missing and eventually give up // TODO(das): If the peer is in the lookup peer set it claims to have imported // the block AND its custody columns. So in this case we can downscore - column_request.on_download_error(req_id)?; + column_request.on_download_error(req_id, None)?; missing_column_indexes.push(column_index); } } @@ -158,7 +195,6 @@ impl ActiveCustodyRequest { if !missing_column_indexes.is_empty() { // Note: Batch logging that columns are missing to not spam logger debug!( - block_root = ?self.block_root, %req_id, %peer_id, // TODO(das): this property can become very noisy, being the full range 0..128 @@ -167,14 +203,21 @@ impl ActiveCustodyRequest { ); self.failed_peers.insert(peer_id); + + // If peer is in the lookup peer set, it claims to have imported the block and + // must have its columns in custody. 
In that case, set `true = enforce max_requests` + // and downscore if data_columns_by_root does not returned the expected custody + // columns. For the rest of peers, don't downscore if columns are missing. + if self.lookup_peers.read().contains(&peer_id) { + cx.report_peer(peer_id, PeerAction::MidToleranceError, "custody_failure"); + } } } Err(err) => { debug!( - block_root = ?self.block_root, %req_id, - %peer_id, - error = ?err, + %peer_id, + error = ?err, "Custody column download error" ); @@ -182,8 +225,8 @@ impl ActiveCustodyRequest { for column_index in &batch_request.indices { self.column_requests .get_mut(column_index) - .ok_or(Error::BadState("unknown column_index".to_owned()))? - .on_download_error_and_mark_failure(req_id)?; + .ok_or(Error::InternalError("unknown column_index".to_owned()))? + .on_download_error(req_id, Some(err.clone()))?; } self.failed_peers.insert(peer_id); @@ -196,7 +239,7 @@ impl ActiveCustodyRequest { pub(crate) fn continue_requests( &mut self, cx: &mut SyncNetworkContext, - ) -> CustodyRequestResult { + ) -> CustodyByRootRequestResult { if self.column_requests.values().all(|r| r.is_downloaded()) { // All requests have completed successfully. 
let mut peers = HashMap::>::new(); @@ -212,7 +255,7 @@ impl ActiveCustodyRequest { seen_timestamps.push(seen_timestamp); Ok(data_column) }) - .collect::, _>>()?; + .collect::, Error>>()?; let peer_group = PeerGroup::from_set(peers); let max_seen_timestamp = seen_timestamps.into_iter().max().unwrap_or(timestamp_now()); @@ -229,11 +272,7 @@ impl ActiveCustodyRequest { // - which peer returned what to have PeerGroup attributability for (column_index, request) in self.column_requests.iter_mut() { - if let Some(wait_duration) = request.is_awaiting_download() { - if request.download_failures > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { - return Err(Error::TooManyFailures); - } - + if request.is_awaiting_download() { // TODO(das): When is a fork and only a subset of your peers know about a block, we should // only query the peers on that fork. Should this case be handled? How to handle it? let custodial_peers = cx.get_custodial_peers(*column_index); @@ -245,13 +284,11 @@ impl ActiveCustodyRequest { // custody peers on a given column let mut priorized_peers = custodial_peers .iter() + .filter(|peer| !self.failed_peers.contains(peer)) .map(|peer| { ( // Prioritize peers that claim to know have imported this block if lookup_peers.contains(peer) { 0 } else { 1 }, - // De-prioritize peers that have failed to successfully respond to - // requests recently - self.failed_peers.contains(peer), // Prefer peers with fewer requests to load balance across peers. // We batch requests to the same peer, so count existence in the // `columns_to_request_by_peer` as a single 1 request. 
@@ -265,172 +302,76 @@ impl ActiveCustodyRequest { .collect::>(); priorized_peers.sort_unstable(); - if let Some((_, _, _, _, peer_id)) = priorized_peers.first() { + if let Some((_, _, _, peer_id)) = priorized_peers.first() { columns_to_request_by_peer .entry(*peer_id) .or_default() .push(*column_index); - } else if wait_duration > MAX_STALE_NO_PEERS_DURATION { - // Allow to request to sit stale in `NotStarted` state for at most - // `MAX_STALE_NO_PEERS_DURATION`, else error and drop the request. Note that - // lookup will naturally retry when other peers send us attestations for - // descendants of this un-available lookup. - return Err(Error::NoPeer(*column_index)); } else { - // Do not issue requests if there is no custody peer on this column + // Do not issue requests if there is no custody peer on this column. The request + // will sit idle without making progress. The only way to make to progress is: + // - Add a new peer that custodies the missing columns + // - Call `continue_requests` + // + // Otherwise this request will be dropped and failed after some time. } } } for (peer_id, indices) in columns_to_request_by_peer.into_iter() { - let request_result = cx - .data_column_lookup_request( + let req_id = cx + .data_columns_by_root_request( DataColumnsByRootRequester::Custody(self.custody_id), peer_id, - DataColumnsByRootSingleBlockRequest { - block_root: self.block_root, - indices: indices.clone(), - }, - // If peer is in the lookup peer set, it claims to have imported the block and - // must have its columns in custody. In that case, set `true = enforce max_requests` - // and downscore if data_columns_by_root does not returned the expected custody - // columns. For the rest of peers, don't downscore if columns are missing. 
- lookup_peers.contains(&peer_id), + self.block_root, + indices.clone(), + false, ) - .map_err(Error::SendFailed)?; - - match request_result { - LookupRequestResult::RequestSent(req_id) => { - for column_index in &indices { - let column_request = self - .column_requests - .get_mut(column_index) - // Should never happen: column_index is iterated from column_requests - .ok_or(Error::BadState("unknown column_index".to_owned()))?; - - column_request.on_download_start(req_id)?; - } - - self.active_batch_columns_requests - .insert(req_id, ActiveBatchColumnsRequest { indices }); - } - LookupRequestResult::NoRequestNeeded(_) => unreachable!(), - LookupRequestResult::Pending(_) => unreachable!(), + .map_err(|e| { + Error::InternalError(format!("Send failed data_columns_by_root {e:?}")) + })?; + + for column_index in &indices { + let column_request = self + .column_requests + .get_mut(column_index) + // Should never happen: column_index is iterated from column_requests + .ok_or(Error::InternalError("unknown column_index".to_owned()))?; + + column_request.on_download_start(req_id)?; } - } - Ok(None) - } -} - -/// TODO(das): this attempt count is nested into the existing lookup request count. -const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; - -struct ColumnRequest { - status: Status, - download_failures: usize, -} - -#[derive(Debug, Clone)] -enum Status { - NotStarted(Instant), - Downloading(DataColumnsByRootRequestId), - Downloaded(PeerId, Arc>, Duration), -} - -impl ColumnRequest { - fn new() -> Self { - Self { - status: Status::NotStarted(Instant::now()), - download_failures: 0, - } - } - - fn is_awaiting_download(&self) -> Option { - match self.status { - Status::NotStarted(start_time) => Some(start_time.elapsed()), - Status::Downloading { .. } | Status::Downloaded { .. } => None, - } - } + self.active_batch_columns_requests + .insert(req_id, ActiveBatchColumnsRequest { indices }); - fn is_downloaded(&self) -> bool { - match self.status { - Status::NotStarted { .. 
} | Status::Downloading { .. } => false, - Status::Downloaded { .. } => true, - } - } - - fn on_download_start(&mut self, req_id: DataColumnsByRootRequestId) -> Result<(), Error> { - match &self.status { - Status::NotStarted { .. } => { - self.status = Status::Downloading(req_id); - Ok(()) - } - other => Err(Error::BadState(format!( - "bad state on_download_start expected NotStarted got {other:?}" - ))), + // Reset the idle request log, for the next time this request completes + self.logged_idle_request = false; } - } - fn on_download_error(&mut self, req_id: DataColumnsByRootRequestId) -> Result<(), Error> { - match &self.status { - Status::Downloading(expected_req_id) => { - if req_id != *expected_req_id { - return Err(Error::UnexpectedRequestId { - expected_req_id: *expected_req_id, - req_id, - }); - } - self.status = Status::NotStarted(Instant::now()); - Ok(()) - } - other => Err(Error::BadState(format!( - "bad state on_download_error expected Downloading got {other:?}" - ))), + let no_active_request = !self.column_requests.values().any(|r| r.is_downloading()); + + if self.start_time.elapsed() > Duration::from_secs(REQUEST_EXPIRY_SECONDS) + && no_active_request + { + let awaiting_peers_indicies = self + .column_requests + .iter() + .filter(|(_, r)| r.is_awaiting_download()) + .map(|(id, _)| *id) + .collect::>(); + return Err(Error::ExpiredNoCustodyPeers(awaiting_peers_indicies)); } - } - fn on_download_error_and_mark_failure( - &mut self, - req_id: DataColumnsByRootRequestId, - ) -> Result<(), Error> { - // TODO(das): Should track which peers don't have data - self.download_failures += 1; - self.on_download_error(req_id) - } - - fn on_download_success( - &mut self, - req_id: DataColumnsByRootRequestId, - peer_id: PeerId, - data_column: Arc>, - seen_timestamp: Duration, - ) -> Result<(), Error> { - match &self.status { - Status::Downloading(expected_req_id) => { - if req_id != *expected_req_id { - return Err(Error::UnexpectedRequestId { - expected_req_id: 
*expected_req_id, - req_id, - }); - } - self.status = Status::Downloaded(peer_id, data_column, seen_timestamp); - Ok(()) - } - other => Err(Error::BadState(format!( - "bad state on_download_success expected Downloading got {other:?}" - ))), + if no_active_request && !self.logged_idle_request { + self.logged_idle_request = true; + debug!( + id = ?self.custody_id, + failed_peers = self.failed_peers.keys().count(), + peers = self.lookup_peers.read().len(), + "Custody by root request idle waiting for peers" + ); } - } - fn complete(self) -> Result<(PeerId, Arc>, Duration), Error> { - match self.status { - Status::Downloaded(peer_id, data_column, seen_timestamp) => { - Ok((peer_id, data_column, seen_timestamp)) - } - other => Err(Error::BadState(format!( - "bad state complete expected Downloaded got {other:?}" - ))), - } + Ok(None) } } diff --git a/beacon_node/network/src/sync/network_context/download_request.rs b/beacon_node/network/src/sync/network_context/download_request.rs new file mode 100644 index 00000000000..71eebbd11c5 --- /dev/null +++ b/beacon_node/network/src/sync/network_context/download_request.rs @@ -0,0 +1,149 @@ +use crate::sync::network_context::RpcResponseError; +use lighthouse_network::PeerId; +use std::time::Duration; +use strum::IntoStaticStr; + +/// TODO(das): Reconsider this retry count, it was choosen as a placeholder value. 
Each +/// `custody_by_*` request is already retried multiple inside of a lookup or batch +const MAX_DOWNLOAD_ATTEMPTS: usize = 5; + +pub struct DownloadRequest { + status: Status, + download_failures: Vec, +} + +#[derive(Debug, Clone, IntoStaticStr)] +pub enum Status { + NotStarted, + Downloading(I), + Downloaded(PeerId, T, Duration), +} + +#[derive(Debug)] +pub enum Error { + InternalError(String), + TooManyErrors(RpcResponseError), +} + +impl DownloadRequest { + pub fn new() -> Self { + Self { + status: Status::NotStarted, + download_failures: vec![], + } + } + + pub fn status_str(&self) -> &'static str { + (&self.status).into() + } + + pub fn is_awaiting_download(&self) -> bool { + match self.status { + Status::NotStarted => true, + Status::Downloading { .. } | Status::Downloaded { .. } => false, + } + } + + pub fn is_downloading(&self) -> bool { + match self.status { + Status::NotStarted => false, + Status::Downloading { .. } => true, + Status::Downloaded { .. } => false, + } + } + + pub fn is_downloaded(&self) -> bool { + match self.status { + Status::NotStarted | Status::Downloading { .. } => false, + Status::Downloaded { .. 
} => true, + } + } + + pub fn on_download_start(&mut self, req_id: I) -> Result<(), Error> { + match &self.status { + Status::NotStarted => { + self.status = Status::Downloading(req_id); + Ok(()) + } + other => Err(Error::InternalError(format!( + "bad state on_download_start expected NotStarted got {}", + Into::<&'static str>::into(other), + ))), + } + } + + pub fn on_download_error( + &mut self, + req_id: I, + error_to_register: Option, + ) -> Result<(), Error> { + match &self.status { + Status::Downloading(expected_req_id) => { + if req_id != *expected_req_id { + return Err(Error::InternalError(format!( + "Received download result for req_id {req_id} expecting {expected_req_id}" + ))); + } + + if let Some(e) = error_to_register { + self.download_failures.push(e); + if self.download_failures.len() > MAX_DOWNLOAD_ATTEMPTS { + if let Some(last_error) = self.download_failures.pop() { + return Err(Error::TooManyErrors(last_error)); + } + } + } + + self.status = Status::NotStarted; + Ok(()) + } + other => Err(Error::InternalError(format!( + "bad state on_download_error expected Downloading got {}", + Into::<&'static str>::into(other), + ))), + } + } + + pub fn on_download_success( + &mut self, + req_id: I, + peer_id: PeerId, + data: T, + seen_timestamp: Duration, + ) -> Result<(), Error> { + match &self.status { + Status::Downloading(expected_req_id) => { + if req_id != *expected_req_id { + return Err(Error::InternalError(format!( + "Received download result for req_id {req_id} expecting {expected_req_id}" + ))); + } + self.status = Status::Downloaded(peer_id, data, seen_timestamp); + Ok(()) + } + other => Err(Error::InternalError(format!( + "bad state on_download_success expected Downloading got {}", + Into::<&'static str>::into(other), + ))), + } + } + + pub fn is_complete(&self) -> Option<&T> { + match &self.status { + Status::Downloaded(_, data, _) => Some(data), + _ => None, + } + } + + pub fn complete(self) -> Result<(PeerId, T, Duration), Error> { + match 
self.status { + Status::Downloaded(peer_id, data, seen_timestamp) => { + Ok((peer_id, data, seen_timestamp)) + } + other => Err(Error::InternalError(format!( + "bad state complete expected Downloaded got {}", + Into::<&'static str>::into(other), + ))), + } + } +} diff --git a/beacon_node/network/src/sync/network_context/requests.rs b/beacon_node/network/src/sync/network_context/requests.rs index cd70a2e7ebc..e9eb8654bb2 100644 --- a/beacon_node/network/src/sync/network_context/requests.rs +++ b/beacon_node/network/src/sync/network_context/requests.rs @@ -1,32 +1,28 @@ +use std::time::Instant; use std::{collections::hash_map::Entry, hash::Hash}; use beacon_chain::validator_monitor::timestamp_now; use fnv::FnvHashMap; use lighthouse_network::PeerId; use strum::IntoStaticStr; +use tracing::debug; use types::{Hash256, Slot}; -pub use blobs_by_range::BlobsByRangeRequestItems; -pub use blobs_by_root::{BlobsByRootRequestItems, BlobsByRootSingleBlockRequest}; -pub use blocks_by_range::BlocksByRangeRequestItems; -pub use blocks_by_root::{BlocksByRootRequestItems, BlocksByRootSingleRequest}; -pub use data_columns_by_range::DataColumnsByRangeRequestItems; -pub use data_columns_by_root::{ - DataColumnsByRootRequestItems, DataColumnsByRootSingleBlockRequest, -}; +pub use blobs_by_root::BlobsByRootRequestItems; +pub use blocks_by_root::BlocksByRootRequestItems; +pub use data_columns_by_root::DataColumnsByRootRequestItems; +pub use headers_by_root::HeadersByRootRequestItems; use crate::metrics; use super::{RpcEvent, RpcResponseResult}; -mod blobs_by_range; mod blobs_by_root; -mod blocks_by_range; mod blocks_by_root; -mod data_columns_by_range; mod data_columns_by_root; +mod headers_by_root; -#[derive(Debug, PartialEq, Eq, IntoStaticStr)] +#[derive(Debug, Clone, PartialEq, Eq, IntoStaticStr)] pub enum LookupVerifyError { NotEnoughResponsesReturned { actual: usize, @@ -56,6 +52,7 @@ struct ActiveRequest { peer_id: PeerId, // Error if the request terminates before receiving max 
expected responses expect_max_responses: bool, + start_instant: Instant, } enum State { @@ -64,7 +61,7 @@ enum State { Errored, } -impl ActiveRequests { +impl ActiveRequests { pub fn new(name: &'static str) -> Self { Self { requests: <_>::default(), @@ -79,6 +76,7 @@ impl ActiveRequests { state: State::Active(items), peer_id, expect_max_responses, + start_instant: Instant::now(), }, ); } @@ -102,7 +100,7 @@ impl ActiveRequests { return None; }; - match rpc_event { + let result = match rpc_event { // Handler of a success ReqResp chunk. Adds the item to the request accumulator. // `ActiveRequestItems` validates the item before appending to its internal state. RpcEvent::Response(item, seen_timestamp) => { @@ -115,7 +113,7 @@ impl ActiveRequests { Ok(true) => { let items = items.consume(); request.state = State::CompletedEarly; - Some(Ok((items, seen_timestamp))) + Some(Ok((items, seen_timestamp, request.start_instant.elapsed()))) } // Received item, but we are still expecting more Ok(false) => None, @@ -151,7 +149,11 @@ impl ActiveRequests { } .into())) } else { - Some(Ok((items.consume(), timestamp_now()))) + Some(Ok(( + items.consume(), + timestamp_now(), + request.start_instant.elapsed(), + ))) } } // Items already returned, ignore stream termination @@ -174,15 +176,39 @@ impl ActiveRequests { State::Errored => None, } } - } + }; + + result.map(|result| match result { + Ok((items, seen_timestamp, duration)) => { + metrics::inc_counter_vec(&metrics::SYNC_RPC_REQUEST_SUCCESSES, &[self.name]); + metrics::observe_timer_vec(&metrics::SYNC_RPC_REQUEST_TIME, &[self.name], duration); + debug!( + %id, + method = self.name, + count = items.len(), + "Sync RPC request completed" + ); + + Ok((items, seen_timestamp)) + } + Err(e) => { + metrics::inc_counter_vec(&metrics::SYNC_RPC_REQUEST_ERRORS, &[self.name]); + debug!( + %id, + method = self.name, + error = ?e, + "Sync RPC request error" + ); + + Err(e) + } + }) } - pub fn active_requests_of_peer(&self, peer_id: &PeerId) -> 
Vec<&K> { + pub fn active_requests(&self) -> impl Iterator { self.requests .iter() - .filter(|(_, request)| &request.peer_id == peer_id) - .map(|(id, _)| id) - .collect() + .map(|(id, request)| (id, &request.peer_id)) } pub fn iter_request_peers(&self) -> impl Iterator + '_ { diff --git a/beacon_node/network/src/sync/network_context/requests/blobs_by_range.rs b/beacon_node/network/src/sync/network_context/requests/blobs_by_range.rs deleted file mode 100644 index 8a9a8c9813c..00000000000 --- a/beacon_node/network/src/sync/network_context/requests/blobs_by_range.rs +++ /dev/null @@ -1,61 +0,0 @@ -use super::{ActiveRequestItems, LookupVerifyError}; -use lighthouse_network::rpc::methods::BlobsByRangeRequest; -use std::sync::Arc; -use types::{BlobSidecar, EthSpec, Slot}; - -/// Accumulates results of a blobs_by_range request. Only returns items after receiving the -/// stream termination. -pub struct BlobsByRangeRequestItems { - request: BlobsByRangeRequest, - items: Vec>>, - max_blobs_per_block: u64, -} - -impl BlobsByRangeRequestItems { - pub fn new(request: BlobsByRangeRequest, max_blobs_per_block: u64) -> Self { - Self { - request, - items: vec![], - max_blobs_per_block, - } - } -} - -impl ActiveRequestItems for BlobsByRangeRequestItems { - type Item = Arc>; - - fn add(&mut self, blob: Self::Item) -> Result { - let start_slot = Slot::new(self.request.start_slot); - let end_slot = start_slot + Slot::new(self.request.count); - - if blob.slot() < start_slot || blob.slot() >= end_slot { - return Err(LookupVerifyError::UnrequestedSlot { - slot: blob.slot(), - start_slot, - end_slot, - }); - } - if blob.index >= self.max_blobs_per_block { - return Err(LookupVerifyError::UnrequestedIndex(blob.index)); - } - if !blob.verify_blob_sidecar_inclusion_proof() { - return Err(LookupVerifyError::InvalidInclusionProof); - } - if self - .items - .iter() - .any(|existing| existing.slot() == blob.slot() && existing.index == blob.index) - { - return 
Err(LookupVerifyError::DuplicatedData(blob.slot(), blob.index)); - } - - self.items.push(blob); - - // Skip check if blobs are ready as it's rare that all blocks have max blobs - Ok(false) - } - - fn consume(&mut self) -> Vec { - std::mem::take(&mut self.items) - } -} diff --git a/beacon_node/network/src/sync/network_context/requests/blobs_by_root.rs b/beacon_node/network/src/sync/network_context/requests/blobs_by_root.rs index 547c51198e4..1fa9763cf0f 100644 --- a/beacon_node/network/src/sync/network_context/requests/blobs_by_root.rs +++ b/beacon_node/network/src/sync/network_context/requests/blobs_by_root.rs @@ -1,39 +1,19 @@ -use lighthouse_network::rpc::methods::BlobsByRootRequest; use std::sync::Arc; -use types::{blob_sidecar::BlobIdentifier, BlobSidecar, EthSpec, ForkContext, Hash256}; +use types::{BlobSidecar, EthSpec, Hash256}; use super::{ActiveRequestItems, LookupVerifyError}; -#[derive(Debug, Clone)] -pub struct BlobsByRootSingleBlockRequest { - pub block_root: Hash256, - pub indices: Vec, -} - -impl BlobsByRootSingleBlockRequest { - pub fn into_request(self, spec: &ForkContext) -> BlobsByRootRequest { - BlobsByRootRequest::new( - self.indices - .into_iter() - .map(|index| BlobIdentifier { - block_root: self.block_root, - index, - }) - .collect(), - spec, - ) - } -} - pub struct BlobsByRootRequestItems { - request: BlobsByRootSingleBlockRequest, + block_root: Hash256, + indices: Vec, items: Vec>>, } impl BlobsByRootRequestItems { - pub fn new(request: BlobsByRootSingleBlockRequest) -> Self { + pub fn new(block_root: Hash256, indices: Vec) -> Self { Self { - request, + block_root, + indices, items: vec![], } } @@ -47,13 +27,13 @@ impl ActiveRequestItems for BlobsByRootRequestItems { /// The active request SHOULD be dropped after `add_response` returns an error fn add(&mut self, blob: Self::Item) -> Result { let block_root = blob.block_root(); - if self.request.block_root != block_root { + if self.block_root != block_root { return 
Err(LookupVerifyError::UnrequestedBlockRoot(block_root)); } if !blob.verify_blob_sidecar_inclusion_proof() { return Err(LookupVerifyError::InvalidInclusionProof); } - if !self.request.indices.contains(&blob.index) { + if !self.indices.contains(&blob.index) { return Err(LookupVerifyError::UnrequestedIndex(blob.index)); } if self.items.iter().any(|b| b.index == blob.index) { @@ -62,7 +42,7 @@ impl ActiveRequestItems for BlobsByRootRequestItems { self.items.push(blob); - Ok(self.items.len() >= self.request.indices.len()) + Ok(self.items.len() >= self.indices.len()) } fn consume(&mut self) -> Vec { diff --git a/beacon_node/network/src/sync/network_context/requests/blocks_by_range.rs b/beacon_node/network/src/sync/network_context/requests/blocks_by_range.rs deleted file mode 100644 index ae39ac1d766..00000000000 --- a/beacon_node/network/src/sync/network_context/requests/blocks_by_range.rs +++ /dev/null @@ -1,53 +0,0 @@ -use super::{ActiveRequestItems, LookupVerifyError}; -use lighthouse_network::rpc::BlocksByRangeRequest; -use std::sync::Arc; -use types::{EthSpec, SignedBeaconBlock, Slot}; - -/// Accumulates results of a blocks_by_range request. Only returns items after receiving the -/// stream termination. 
-pub struct BlocksByRangeRequestItems { - request: BlocksByRangeRequest, - items: Vec>>, -} - -impl BlocksByRangeRequestItems { - pub fn new(request: BlocksByRangeRequest) -> Self { - Self { - request, - items: vec![], - } - } -} - -impl ActiveRequestItems for BlocksByRangeRequestItems { - type Item = Arc>; - - fn add(&mut self, block: Self::Item) -> Result { - let start_slot = Slot::new(*self.request.start_slot()); - let end_slot = start_slot + Slot::new(*self.request.count()); - - if block.slot() < start_slot || block.slot() >= end_slot { - return Err(LookupVerifyError::UnrequestedSlot { - slot: block.slot(), - start_slot, - end_slot, - }); - } - if self - .items - .iter() - .any(|existing| existing.slot() == block.slot()) - { - // DuplicatedData is a common error for all components, default index to 0 - return Err(LookupVerifyError::DuplicatedData(block.slot(), 0)); - } - - self.items.push(block); - - Ok(self.items.len() >= *self.request.count() as usize) - } - - fn consume(&mut self) -> Vec { - std::mem::take(&mut self.items) - } -} diff --git a/beacon_node/network/src/sync/network_context/requests/blocks_by_root.rs b/beacon_node/network/src/sync/network_context/requests/blocks_by_root.rs index 6d7eabf909f..e80e70d9c3d 100644 --- a/beacon_node/network/src/sync/network_context/requests/blocks_by_root.rs +++ b/beacon_node/network/src/sync/network_context/requests/blocks_by_root.rs @@ -1,28 +1,18 @@ use beacon_chain::get_block_root; -use lighthouse_network::rpc::BlocksByRootRequest; use std::sync::Arc; -use types::{EthSpec, ForkContext, Hash256, SignedBeaconBlock}; +use types::{EthSpec, Hash256, SignedBeaconBlock}; use super::{ActiveRequestItems, LookupVerifyError}; -#[derive(Debug, Copy, Clone)] -pub struct BlocksByRootSingleRequest(pub Hash256); - -impl BlocksByRootSingleRequest { - pub fn into_request(self, fork_context: &ForkContext) -> BlocksByRootRequest { - BlocksByRootRequest::new(vec![self.0], fork_context) - } -} - pub struct BlocksByRootRequestItems { - 
request: BlocksByRootSingleRequest, + block_root: Hash256, items: Vec>>, } impl BlocksByRootRequestItems { - pub fn new(request: BlocksByRootSingleRequest) -> Self { + pub fn new(block_root: Hash256) -> Self { Self { - request, + block_root, items: vec![], } } @@ -36,7 +26,7 @@ impl ActiveRequestItems for BlocksByRootRequestItems { /// The active request SHOULD be dropped after `add_response` returns an error fn add(&mut self, block: Self::Item) -> Result { let block_root = get_block_root(&block); - if self.request.0 != block_root { + if self.block_root != block_root { return Err(LookupVerifyError::UnrequestedBlockRoot(block_root)); } diff --git a/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs b/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs deleted file mode 100644 index 276ede93c12..00000000000 --- a/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs +++ /dev/null @@ -1,59 +0,0 @@ -use super::{ActiveRequestItems, LookupVerifyError}; -use lighthouse_network::rpc::methods::DataColumnsByRangeRequest; -use std::sync::Arc; -use types::{DataColumnSidecar, EthSpec, Slot}; - -/// Accumulates results of a data_columns_by_range request. Only returns items after receiving the -/// stream termination. 
-pub struct DataColumnsByRangeRequestItems { - request: DataColumnsByRangeRequest, - items: Vec>>, -} - -impl DataColumnsByRangeRequestItems { - pub fn new(request: DataColumnsByRangeRequest) -> Self { - Self { - request, - items: vec![], - } - } -} - -impl ActiveRequestItems for DataColumnsByRangeRequestItems { - type Item = Arc>; - - fn add(&mut self, data_column: Self::Item) -> Result { - let start_slot = Slot::new(self.request.start_slot); - let end_slot = start_slot + Slot::new(self.request.count); - - if data_column.slot() < start_slot || data_column.slot() >= end_slot { - return Err(LookupVerifyError::UnrequestedSlot { - slot: data_column.slot(), - start_slot, - end_slot, - }); - } - if !self.request.columns.contains(&data_column.index) { - return Err(LookupVerifyError::UnrequestedIndex(data_column.index)); - } - if !data_column.verify_inclusion_proof() { - return Err(LookupVerifyError::InvalidInclusionProof); - } - if self.items.iter().any(|existing| { - existing.slot() == data_column.slot() && existing.index == data_column.index - }) { - return Err(LookupVerifyError::DuplicatedData( - data_column.slot(), - data_column.index, - )); - } - - self.items.push(data_column); - - Ok(self.items.len() >= self.request.count as usize * self.request.columns.len()) - } - - fn consume(&mut self) -> Vec { - std::mem::take(&mut self.items) - } -} diff --git a/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs b/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs index 09d7f4b3b77..dba4a71794a 100644 --- a/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs +++ b/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs @@ -1,46 +1,19 @@ -use lighthouse_network::rpc::methods::DataColumnsByRootRequest; use std::sync::Arc; -use types::{ - ChainSpec, DataColumnSidecar, DataColumnsByRootIdentifier, EthSpec, ForkName, Hash256, - RuntimeVariableList, -}; +use 
types::{DataColumnSidecar, EthSpec, Hash256}; use super::{ActiveRequestItems, LookupVerifyError}; -#[derive(Debug, Clone)] -pub struct DataColumnsByRootSingleBlockRequest { - pub block_root: Hash256, - pub indices: Vec, -} - -impl DataColumnsByRootSingleBlockRequest { - pub fn try_into_request( - self, - fork_name: ForkName, - spec: &ChainSpec, - ) -> Result { - let number_of_columns = spec.number_of_columns as usize; - let columns = RuntimeVariableList::new(self.indices, number_of_columns) - .map_err(|_| "Number of indices exceeds total number of columns")?; - Ok(DataColumnsByRootRequest::new( - vec![DataColumnsByRootIdentifier { - block_root: self.block_root, - columns, - }], - spec.max_request_blocks(fork_name), - )) - } -} - pub struct DataColumnsByRootRequestItems { - request: DataColumnsByRootSingleBlockRequest, + block_root: Hash256, + indices: Vec, items: Vec>>, } impl DataColumnsByRootRequestItems { - pub fn new(request: DataColumnsByRootSingleBlockRequest) -> Self { + pub fn new(block_root: Hash256, indices: Vec) -> Self { Self { - request, + block_root, + indices, items: vec![], } } @@ -54,13 +27,13 @@ impl ActiveRequestItems for DataColumnsByRootRequestItems { /// The active request SHOULD be dropped after `add_response` returns an error fn add(&mut self, data_column: Self::Item) -> Result { let block_root = data_column.block_root(); - if self.request.block_root != block_root { + if self.block_root != block_root { return Err(LookupVerifyError::UnrequestedBlockRoot(block_root)); } if !data_column.verify_inclusion_proof() { return Err(LookupVerifyError::InvalidInclusionProof); } - if !self.request.indices.contains(&data_column.index) { + if !self.indices.contains(&data_column.index) { return Err(LookupVerifyError::UnrequestedIndex(data_column.index)); } if self.items.iter().any(|d| d.index == data_column.index) { @@ -72,7 +45,7 @@ impl ActiveRequestItems for DataColumnsByRootRequestItems { self.items.push(data_column); - Ok(self.items.len() >= 
self.request.indices.len()) + Ok(self.items.len() >= self.indices.len()) } fn consume(&mut self) -> Vec { diff --git a/beacon_node/network/src/sync/network_context/requests/headers_by_root.rs b/beacon_node/network/src/sync/network_context/requests/headers_by_root.rs new file mode 100644 index 00000000000..7620c72fdca --- /dev/null +++ b/beacon_node/network/src/sync/network_context/requests/headers_by_root.rs @@ -0,0 +1,45 @@ +use super::{ActiveRequestItems, LookupVerifyError}; +use types::{BeaconBlockHeader, Hash256}; + +pub struct HeadersByRootRequestItems { + next_block_root: Hash256, + max_count: usize, + items: Vec, +} + +impl HeadersByRootRequestItems { + pub fn new(block_root: Hash256, max_count: usize) -> Self { + Self { + next_block_root: block_root, + max_count, + items: vec![], + } + } +} + +impl ActiveRequestItems for HeadersByRootRequestItems { + type Item = BeaconBlockHeader; + + /// Append a response to the single chunk request. If the chunk is valid, the request is + /// resolved immediately. 
+ /// The active request SHOULD be dropped after `add_response` returns an error + fn add(&mut self, header: Self::Item) -> Result { + let block_root = header.canonical_root(); + if self.next_block_root != block_root { + return Err(LookupVerifyError::UnrequestedBlockRoot(block_root)); + } + + if self.items.len() >= self.max_count { + return Err(LookupVerifyError::TooManyResponses); + } + + self.next_block_root = header.parent_root; + self.items.push(header); + + Ok(false) + } + + fn consume(&mut self) -> Vec { + std::mem::take(&mut self.items) + } +} diff --git a/beacon_node/network/src/sync/peer_sampling.rs b/beacon_node/network/src/sync/peer_sampling.rs index 59b751787e3..e92e5365f9b 100644 --- a/beacon_node/network/src/sync/peer_sampling.rs +++ b/beacon_node/network/src/sync/peer_sampling.rs @@ -1,9 +1,7 @@ use self::request::ActiveColumnSampleRequest; #[cfg(test)] pub(crate) use self::request::Status; -use super::network_context::{ - DataColumnsByRootSingleBlockRequest, RpcResponseError, SyncNetworkContext, -}; +use super::network_context::{RpcResponseError, SyncNetworkContext}; use crate::metrics; use beacon_chain::BeaconChainTypes; use fnv::FnvHashMap; @@ -98,13 +96,13 @@ impl Sampling { // TODO(das): Should track failed sampling request for some time? Otherwise there's // a risk of a loop with multiple triggers creating the request, then failing, // and repeat. 
- debug!(?id, "Ignoring duplicate sampling request"); + debug!(%id, "Ignoring duplicate sampling request"); return None; } }; debug!( - ?id, + %id, column_selection = ?request.column_selection(), "Created new sample request" ); @@ -138,7 +136,7 @@ impl Sampling { ) -> Option<(SamplingRequester, SamplingResult)> { let Some(request) = self.requests.get_mut(&id.id) else { // TOOD(das): This log can happen if the request is error'ed early and dropped - debug!(?id, "Sample downloaded event for unknown request"); + debug!(%id, "Sample downloaded event for unknown request"); return None; }; @@ -167,7 +165,7 @@ impl Sampling { ) -> Option<(SamplingRequester, SamplingResult)> { let Some(request) = self.requests.get_mut(&id.id) else { // TOOD(das): This log can happen if the request is error'ed early and dropped - debug!(?id, "Sample verified event for unknown request"); + debug!(%id, "Sample verified event for unknown request"); return None; }; @@ -191,7 +189,7 @@ impl Sampling { ) -> Option<(SamplingRequester, SamplingResult)> { let result = result.transpose(); if let Some(result) = result { - debug!(?id, ?result, "Sampling request completed, removing"); + debug!(%id, ?result, "Sampling request completed, removing"); metrics::inc_counter_vec( &metrics::SAMPLING_REQUEST_RESULT, &[metrics::from_result(&result)], @@ -570,16 +568,14 @@ impl ActiveSamplingRequest { // Send requests. let mut sent_request = false; for (peer_id, column_indexes) in column_indexes_to_request { - cx.data_column_lookup_request( + cx.data_columns_by_root_request( DataColumnsByRootRequester::Sampling(SamplingId { id: self.requester_id, sampling_request_id: self.current_sampling_request_id, }), peer_id, - DataColumnsByRootSingleBlockRequest { - block_root: self.block_root, - indices: column_indexes.clone(), - }, + self.block_root, + column_indexes.clone(), // false = We issue request to custodians who may or may not have received the // samples yet. 
We don't any signal (like an attestation or status messages that the // custodian has received data). diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs deleted file mode 100644 index 72598a25405..00000000000 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ /dev/null @@ -1,524 +0,0 @@ -use beacon_chain::block_verification_types::RpcBlock; -use lighthouse_network::rpc::methods::BlocksByRangeRequest; -use lighthouse_network::service::api_types::Id; -use lighthouse_network::PeerId; -use std::collections::{HashMap, HashSet}; -use std::fmt; -use std::hash::{Hash, Hasher}; -use std::ops::Sub; -use std::time::{Duration, Instant}; -use strum::Display; -use types::{ColumnIndex, Epoch, EthSpec, Slot}; - -/// The number of times to retry a batch before it is considered failed. -const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5; - -/// Invalid batches are attempted to be re-downloaded from other peers. If a batch cannot be processed -/// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty. -const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3; - -/// Type of expected batch. -#[derive(Debug, Copy, Clone, Display)] -#[strum(serialize_all = "snake_case")] -pub enum ByRangeRequestType { - BlocksAndColumns, - BlocksAndBlobs, - Blocks, -} - -#[derive(Clone, Debug)] -pub struct BatchPeers { - block_peer: PeerId, - column_peers: HashMap, -} - -impl BatchPeers { - pub fn new_from_block_peer(block_peer: PeerId) -> Self { - Self { - block_peer, - column_peers: <_>::default(), - } - } - pub fn new(block_peer: PeerId, column_peers: HashMap) -> Self { - Self { - block_peer, - column_peers, - } - } - - pub fn block(&self) -> PeerId { - self.block_peer - } - - pub fn column(&self, index: &ColumnIndex) -> Option<&PeerId> { - self.column_peers.get(index) - } -} - -/// Allows customisation of the above constants used in other sync methods such as BackFillSync. 
-pub trait BatchConfig { - /// The maximum batch download attempts. - fn max_batch_download_attempts() -> u8; - /// The max batch processing attempts. - fn max_batch_processing_attempts() -> u8; - /// Hashing function of a batch's attempt. Used for scoring purposes. - /// - /// When a batch fails processing, it is possible that the batch is wrong (faulty or - /// incomplete) or that a previous one is wrong. For this reason we need to re-download and - /// re-process the batches awaiting validation and the current one. Consider this scenario: - /// - /// ```ignore - /// BatchA BatchB BatchC BatchD - /// -----X Empty Empty Y----- - /// ``` - /// - /// BatchA declares that it refers X, but BatchD declares that it's first block is Y. There is no - /// way to know if BatchD is faulty/incomplete or if batches B and/or C are missing blocks. It is - /// also possible that BatchA belongs to a different chain to the rest starting in some block - /// midway in the batch's range. For this reason, the four batches would need to be re-downloaded - /// and re-processed. - /// - /// If batchD was actually good, it will still register two processing attempts for the same set of - /// blocks. In this case, we don't want to penalize the peer that provided the first version, since - /// it's equal to the successfully processed one. - /// - /// The function `batch_attempt_hash` provides a way to compare two batch attempts without - /// storing the full set of blocks. - /// - /// Note that simpler hashing functions considered in the past (hash of first block, hash of last - /// block, number of received blocks) are not good enough to differentiate attempts. For this - /// reason, we hash the complete set of blocks both in RangeSync and BackFillSync. 
- fn batch_attempt_hash(blocks: &[RpcBlock]) -> u64; -} - -#[derive(Debug)] -pub struct RangeSyncBatchConfig {} - -impl BatchConfig for RangeSyncBatchConfig { - fn max_batch_download_attempts() -> u8 { - MAX_BATCH_DOWNLOAD_ATTEMPTS - } - fn max_batch_processing_attempts() -> u8 { - MAX_BATCH_PROCESSING_ATTEMPTS - } - fn batch_attempt_hash(blocks: &[RpcBlock]) -> u64 { - let mut hasher = std::collections::hash_map::DefaultHasher::new(); - blocks.hash(&mut hasher); - hasher.finish() - } -} - -/// Error type of a batch in a wrong state. -// Such errors should never be encountered. -pub struct WrongState(pub(crate) String); - -/// After batch operations, we use this to communicate whether a batch can continue or not -pub enum BatchOperationOutcome { - Continue, - Failed { blacklist: bool }, -} - -pub enum BatchProcessingResult { - Success, - FaultyFailure, - NonFaultyFailure, -} - -#[derive(Debug)] -/// A segment of a chain. -pub struct BatchInfo { - /// Start slot of the batch. - start_slot: Slot, - /// End slot of the batch. - end_slot: Slot, - /// The `Attempts` that have been made and failed to send us this batch. - failed_processing_attempts: Vec, - /// Number of processing attempts that have failed but we do not count. - non_faulty_processing_attempts: u8, - /// The number of download retries this batch has undergone due to a failed request. - failed_download_attempts: Vec>, - /// State of the batch. - state: BatchState, - /// Pin the generic - marker: std::marker::PhantomData, -} - -impl fmt::Display for BatchInfo { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "Start Slot: {}, End Slot: {}, State: {}", - self.start_slot, self.end_slot, self.state - ) - } -} - -#[derive(Display)] -/// Current state of a batch -pub enum BatchState { - /// The batch has failed either downloading or processing, but can be requested again. - AwaitingDownload, - /// The batch is being downloaded. 
- Downloading(Id), - /// The batch has been completely downloaded and is ready for processing. - AwaitingProcessing(BatchPeers, Vec>, Instant), - /// The batch is being processed. - Processing(Attempt), - /// The batch was successfully processed and is waiting to be validated. - /// - /// It is not sufficient to process a batch successfully to consider it correct. This is - /// because batches could be erroneously empty, or incomplete. Therefore, a batch is considered - /// valid, only if the next sequential batch imports at least a block. - AwaitingValidation(Attempt), - /// Intermediate state for inner state handling. - Poisoned, - /// The batch has maxed out the allowed attempts for either downloading or processing. It - /// cannot be recovered. - Failed, -} - -impl BatchState { - /// Helper function for poisoning a state. - pub fn poison(&mut self) -> BatchState { - std::mem::replace(self, BatchState::Poisoned) - } -} - -impl BatchInfo { - /// Batches are downloaded excluding the first block of the epoch assuming it has already been - /// downloaded. - /// - /// For example: - /// - /// Epoch boundary | | - /// ... | 30 | 31 | 32 | 33 | 34 | ... | 61 | 62 | 63 | 64 | 65 | - /// Batch 1 | Batch 2 | Batch 3 - /// - /// NOTE: Removed the shift by one for deneb because otherwise the last batch before the blob - /// fork boundary will be of mixed type (all blocks and one last blockblob), and I don't want to - /// deal with this for now. 
- /// This means finalization might be slower in deneb - pub fn new(start_epoch: &Epoch, num_of_epochs: u64) -> Self { - let start_slot = start_epoch.start_slot(E::slots_per_epoch()); - let end_slot = start_slot + num_of_epochs * E::slots_per_epoch(); - BatchInfo { - start_slot, - end_slot, - failed_processing_attempts: Vec::new(), - failed_download_attempts: Vec::new(), - non_faulty_processing_attempts: 0, - state: BatchState::AwaitingDownload, - marker: std::marker::PhantomData, - } - } - - /// Gives a list of peers from which this batch has had a failed download or processing - /// attempt. - /// - /// TODO(das): Returns only block peers to keep the mainnet path equivalent. The failed peers - /// mechanism is broken for PeerDAS and will be fixed with https://github.com/sigp/lighthouse/issues/6258 - pub fn failed_block_peers(&self) -> HashSet { - let mut peers = HashSet::with_capacity( - self.failed_processing_attempts.len() + self.failed_download_attempts.len(), - ); - - for attempt in &self.failed_processing_attempts { - peers.insert(attempt.peers.block()); - } - - for peer in self.failed_download_attempts.iter().flatten() { - peers.insert(*peer); - } - - peers - } - - /// Verifies if an incoming block belongs to this batch. - pub fn is_expecting_block(&self, request_id: &Id) -> bool { - if let BatchState::Downloading(expected_id) = &self.state { - return expected_id == request_id; - } - false - } - - /// Returns the peers that provided this batch's downloaded contents - pub fn processing_peers(&self) -> Option<&BatchPeers> { - match &self.state { - BatchState::AwaitingDownload | BatchState::Failed | BatchState::Downloading(..) => None, - BatchState::AwaitingProcessing(peers, _, _) - | BatchState::Processing(Attempt { peers, .. }) - | BatchState::AwaitingValidation(Attempt { peers, .. 
}) => Some(peers), - BatchState::Poisoned => unreachable!("Poisoned batch"), - } - } - - /// Returns the count of stored pending blocks if in awaiting processing state - pub fn pending_blocks(&self) -> usize { - match &self.state { - BatchState::AwaitingProcessing(_, blocks, _) => blocks.len(), - BatchState::AwaitingDownload - | BatchState::Downloading { .. } - | BatchState::Processing { .. } - | BatchState::AwaitingValidation { .. } - | BatchState::Poisoned - | BatchState::Failed => 0, - } - } - - /// Returns a BlocksByRange request associated with the batch. - pub fn to_blocks_by_range_request(&self) -> BlocksByRangeRequest { - BlocksByRangeRequest::new( - self.start_slot.into(), - self.end_slot.sub(self.start_slot).into(), - ) - } - - /// After different operations over a batch, this could be in a state that allows it to - /// continue, or in failed state. When the batch has failed, we check if it did mainly due to - /// processing failures. In this case the batch is considered failed and faulty. - pub fn outcome(&self) -> BatchOperationOutcome { - match self.state { - BatchState::Poisoned => unreachable!("Poisoned batch"), - BatchState::Failed => BatchOperationOutcome::Failed { - blacklist: self.failed_processing_attempts.len() - > self.failed_download_attempts.len(), - }, - _ => BatchOperationOutcome::Continue, - } - } - - pub fn state(&self) -> &BatchState { - &self.state - } - - pub fn attempts(&self) -> &[Attempt] { - &self.failed_processing_attempts - } - - /// Marks the batch as ready to be processed if the blocks are in the range. 
The number of - /// received blocks is returned, or the wrong batch end on failure - #[must_use = "Batch may have failed"] - pub fn download_completed( - &mut self, - blocks: Vec>, - batch_peers: BatchPeers, - ) -> Result { - match self.state.poison() { - BatchState::Downloading(_request_id) => { - let received = blocks.len(); - self.state = BatchState::AwaitingProcessing(batch_peers, blocks, Instant::now()); - Ok(received) - } - BatchState::Poisoned => unreachable!("Poisoned batch"), - other => { - self.state = other; - Err(WrongState(format!( - "Download completed for batch in wrong state {:?}", - self.state - ))) - } - } - } - - /// Mark the batch as failed and return whether we can attempt a re-download. - /// - /// This can happen if a peer disconnects or some error occurred that was not the peers fault. - /// The `peer` parameter, when set to None, does not increment the failed attempts of - /// this batch and register the peer, rather attempts a re-download. - #[must_use = "Batch may have failed"] - pub fn download_failed( - &mut self, - peer: Option, - ) -> Result { - match self.state.poison() { - BatchState::Downloading(_request_id) => { - // register the attempt and check if the batch can be tried again - self.failed_download_attempts.push(peer); - self.state = if self.failed_download_attempts.len() - >= B::max_batch_download_attempts() as usize - { - BatchState::Failed - } else { - // drop the blocks - BatchState::AwaitingDownload - }; - Ok(self.outcome()) - } - BatchState::Poisoned => unreachable!("Poisoned batch"), - other => { - self.state = other; - Err(WrongState(format!( - "Download failed for batch in wrong state {:?}", - self.state - ))) - } - } - } - - pub fn start_downloading(&mut self, request_id: Id) -> Result<(), WrongState> { - match self.state.poison() { - BatchState::AwaitingDownload => { - self.state = BatchState::Downloading(request_id); - Ok(()) - } - BatchState::Poisoned => unreachable!("Poisoned batch"), - other => { - self.state = 
other; - Err(WrongState(format!( - "Starting download for batch in wrong state {:?}", - self.state - ))) - } - } - } - - pub fn start_processing(&mut self) -> Result<(Vec>, Duration), WrongState> { - match self.state.poison() { - BatchState::AwaitingProcessing(peers, blocks, start_instant) => { - self.state = BatchState::Processing(Attempt::new::(peers, &blocks)); - Ok((blocks, start_instant.elapsed())) - } - BatchState::Poisoned => unreachable!("Poisoned batch"), - other => { - self.state = other; - Err(WrongState(format!( - "Starting procesing batch in wrong state {:?}", - self.state - ))) - } - } - } - - #[must_use = "Batch may have failed"] - pub fn processing_completed( - &mut self, - procesing_result: BatchProcessingResult, - ) -> Result { - match self.state.poison() { - BatchState::Processing(attempt) => { - self.state = match procesing_result { - BatchProcessingResult::Success => BatchState::AwaitingValidation(attempt), - BatchProcessingResult::FaultyFailure => { - // register the failed attempt - self.failed_processing_attempts.push(attempt); - - // check if the batch can be downloaded again - if self.failed_processing_attempts.len() - >= B::max_batch_processing_attempts() as usize - { - BatchState::Failed - } else { - BatchState::AwaitingDownload - } - } - BatchProcessingResult::NonFaultyFailure => { - self.non_faulty_processing_attempts = - self.non_faulty_processing_attempts.saturating_add(1); - BatchState::AwaitingDownload - } - }; - Ok(self.outcome()) - } - BatchState::Poisoned => unreachable!("Poisoned batch"), - other => { - self.state = other; - Err(WrongState(format!( - "Procesing completed for batch in wrong state: {:?}", - self.state - ))) - } - } - } - - #[must_use = "Batch may have failed"] - pub fn validation_failed(&mut self) -> Result { - match self.state.poison() { - BatchState::AwaitingValidation(attempt) => { - self.failed_processing_attempts.push(attempt); - - // check if the batch can be downloaded again - self.state = if 
self.failed_processing_attempts.len() - >= B::max_batch_processing_attempts() as usize - { - BatchState::Failed - } else { - BatchState::AwaitingDownload - }; - Ok(self.outcome()) - } - BatchState::Poisoned => unreachable!("Poisoned batch"), - other => { - self.state = other; - Err(WrongState(format!( - "Validation failed for batch in wrong state: {:?}", - self.state - ))) - } - } - } - - // Visualizes the state of this batch using state::visualize() - pub fn visualize(&self) -> char { - self.state.visualize() - } -} - -/// Represents a batch attempt awaiting validation -/// -/// Invalid attempts will downscore its peers -#[derive(Debug)] -pub struct Attempt { - /// The peers that served this batch contents - peers: BatchPeers, - /// The hash of the blocks of the attempt. - pub hash: u64, -} - -impl Attempt { - fn new(peers: BatchPeers, blocks: &[RpcBlock]) -> Self { - let hash = B::batch_attempt_hash(blocks); - Attempt { peers, hash } - } - - pub fn block_peer(&self) -> PeerId { - self.peers.block() - } -} - -impl std::fmt::Debug for BatchState { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - BatchState::Processing(Attempt { ref peers, hash: _ }) => { - write!(f, "Processing({})", peers.block()) - } - BatchState::AwaitingValidation(Attempt { ref peers, hash: _ }) => { - write!(f, "AwaitingValidation({})", peers.block()) - } - BatchState::AwaitingDownload => f.write_str("AwaitingDownload"), - BatchState::Failed => f.write_str("Failed"), - BatchState::AwaitingProcessing(_, ref blocks, _) => { - write!(f, "AwaitingProcessing({} blocks)", blocks.len()) - } - BatchState::Downloading(request_id) => { - write!(f, "Downloading({})", request_id) - } - BatchState::Poisoned => f.write_str("Poisoned"), - } - } -} - -impl BatchState { - /// Creates a character representation/visualization for the batch state to display in logs for quicker and - /// easier recognition - fn visualize(&self) -> char { - match self { - 
BatchState::Downloading(..) => 'D', - BatchState::Processing(_) => 'P', - BatchState::AwaitingValidation(_) => 'v', - BatchState::AwaitingDownload => 'd', - BatchState::Failed => 'F', - BatchState::AwaitingProcessing(..) => 'p', - BatchState::Poisoned => 'X', - } - } -} diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs deleted file mode 100644 index e7d9f46679e..00000000000 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ /dev/null @@ -1,1201 +0,0 @@ -use super::batch::{BatchInfo, BatchPeers, BatchProcessingResult, BatchState}; -use super::RangeSyncType; -use crate::metrics; -use crate::network_beacon_processor::ChainSegmentProcessId; -use crate::sync::network_context::{RangeRequestId, RpcRequestSendError, RpcResponseError}; -use crate::sync::{network_context::SyncNetworkContext, BatchOperationOutcome, BatchProcessResult}; -use beacon_chain::block_verification_types::RpcBlock; -use beacon_chain::BeaconChainTypes; -use itertools::Itertools; -use lighthouse_network::service::api_types::Id; -use lighthouse_network::{PeerAction, PeerId}; -use logging::crit; -use std::collections::{btree_map::Entry, BTreeMap, HashSet}; -use strum::IntoStaticStr; -use tracing::{debug, instrument, warn}; -use types::{Epoch, EthSpec, Hash256, Slot}; - -/// Blocks are downloaded in batches from peers. This constant specifies how many epochs worth of -/// blocks per batch are requested _at most_. A batch may request less blocks to account for -/// already requested slots. There is a timeout for each batch request. If this value is too high, -/// we will negatively report peers with poor bandwidth. This can be set arbitrarily high, in which -/// case the responder will fill the response up to the max request size, assuming they have the -/// bandwidth to do so. -pub const EPOCHS_PER_BATCH: u64 = 1; - -/// The maximum number of batches to queue before requesting more. 
-const BATCH_BUFFER_SIZE: u8 = 5; - -/// A return type for functions that act on a `Chain` which informs the caller whether the chain -/// has been completed and should be removed or to be kept if further processing is -/// required. -/// -/// Should be checked, since a failed chain must be removed. A chain that requested being removed -/// and continued is now in an inconsistent state. -pub type ProcessingResult = Result; - -/// Reasons for removing a chain -#[derive(Debug)] -#[allow(dead_code)] -pub enum RemoveChain { - EmptyPeerPool, - ChainCompleted, - /// A chain has failed. This boolean signals whether the chain should be blacklisted. - ChainFailed { - blacklist: bool, - failing_batch: BatchId, - }, - WrongBatchState(String), - WrongChainState(String), -} - -#[derive(Debug)] -pub struct KeepChain; - -/// A chain identifier -pub type ChainId = Id; -pub type BatchId = Epoch; - -#[derive(Debug, Copy, Clone, IntoStaticStr)] -pub enum SyncingChainType { - Head, - Finalized, - Backfill, -} - -/// A chain of blocks that need to be downloaded. Peers who claim to contain the target head -/// root are grouped into the peer pool and queried for batches when downloading the -/// chain. -#[derive(Debug)] -pub struct SyncingChain { - /// A random id used to identify this chain. - id: ChainId, - - /// SyncingChain type - pub chain_type: SyncingChainType, - - /// The start of the chain segment. Any epoch previous to this one has been validated. - pub start_epoch: Epoch, - - /// The target head slot. - pub target_head_slot: Slot, - - /// The target head root. - pub target_head_root: Hash256, - - /// Sorted map of batches undergoing some kind of processing. - batches: BTreeMap>, - - /// The peers that agree on the `target_head_slot` and `target_head_root` as a canonical chain - /// and thus available to download this chain from, as well as the batches we are currently - /// requesting. - peers: HashSet, - - /// Starting epoch of the next batch that needs to be downloaded. 
- to_be_downloaded: BatchId, - - /// Starting epoch of the batch that needs to be processed next. - /// This is incremented as the chain advances. - processing_target: BatchId, - - /// Optimistic head to sync. - /// If a block is imported for this batch, the chain advances to this point. - optimistic_start: Option, - - /// When a batch for an optimistic start is tried (either successful or not), it is stored to - /// avoid trying it again due to chain stopping/re-starting on chain switching. - attempted_optimistic_starts: HashSet, - - /// The current state of the chain. - pub state: ChainSyncingState, - - /// The current processing batch, if any. - current_processing_batch: Option, -} - -#[derive(PartialEq, Debug)] -pub enum ChainSyncingState { - /// The chain is not being synced. - Stopped, - /// The chain is undergoing syncing. - Syncing, -} - -impl SyncingChain { - #[allow(clippy::too_many_arguments)] - pub fn new( - id: Id, - start_epoch: Epoch, - target_head_slot: Slot, - target_head_root: Hash256, - peer_id: PeerId, - chain_type: SyncingChainType, - ) -> Self { - SyncingChain { - id, - chain_type, - start_epoch, - target_head_slot, - target_head_root, - batches: BTreeMap::new(), - peers: HashSet::from_iter([peer_id]), - to_be_downloaded: start_epoch, - processing_target: start_epoch, - optimistic_start: None, - attempted_optimistic_starts: HashSet::default(), - state: ChainSyncingState::Stopped, - current_processing_batch: None, - } - } - - /// Returns true if this chain has the same target - pub fn has_same_target(&self, target_head_slot: Slot, target_head_root: Hash256) -> bool { - self.target_head_slot == target_head_slot && self.target_head_root == target_head_root - } - - /// Check if the chain has peers from which to process batches. - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn available_peers(&self) -> usize { - self.peers.len() - } - - /// Get the chain's id. 
- #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn id(&self) -> ChainId { - self.id - } - - /// Peers currently syncing this chain. - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn peers(&self) -> impl Iterator + '_ { - self.peers.iter().cloned() - } - - /// Progress in epochs made by the chain - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn processed_epochs(&self) -> u64 { - self.processing_target - .saturating_sub(self.start_epoch) - .into() - } - - /// Returns the total count of pending blocks in all the batches of this chain - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn pending_blocks(&self) -> usize { - self.batches - .values() - .map(|batch| batch.pending_blocks()) - .sum() - } - - /// Removes a peer from the chain. - /// If the peer has active batches, those are considered failed and re-requested. - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn remove_peer(&mut self, peer_id: &PeerId) -> ProcessingResult { - self.peers.remove(peer_id); - - if self.peers.is_empty() { - Err(RemoveChain::EmptyPeerPool) - } else { - Ok(KeepChain) - } - } - - /// Returns the latest slot number that has been processed. - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - fn current_processed_slot(&self) -> Slot { - // the last slot we processed was included in the previous batch, and corresponds to the - // first slot of the current target epoch - self.processing_target - .start_slot(T::EthSpec::slots_per_epoch()) - } - - /// A block has been received for a batch on this chain. - /// If the block correctly completes the batch it will be processed if possible. 
- #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn on_block_response( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - batch_peers: BatchPeers, - request_id: Id, - blocks: Vec>, - ) -> ProcessingResult { - // check if we have this batch - let batch = match self.batches.get_mut(&batch_id) { - None => { - debug!(epoch = %batch_id, "Received a block for unknown batch"); - // A batch might get removed when the chain advances, so this is non fatal. - return Ok(KeepChain); - } - Some(batch) => { - // A batch could be retried without the peer failing the request (disconnecting/ - // sending an error /timeout) if the peer is removed from the chain for other - // reasons. Check that this block belongs to the expected peer, and that the - // request_id matches - // TODO(das): removed peer_id matching as the node may request a different peer for data - // columns. - if !batch.is_expecting_block(&request_id) { - return Ok(KeepChain); - } - batch - } - }; - - // A stream termination has been sent. This batch has ended. Process a completed batch. - // Remove the request from the peer's active batches - - let received = batch.download_completed(blocks, batch_peers)?; - let awaiting_batches = batch_id - .saturating_sub(self.optimistic_start.unwrap_or(self.processing_target)) - / EPOCHS_PER_BATCH; - debug!(epoch = %batch_id, blocks = received, batch_state = self.visualize_batch_state(), %awaiting_batches,"Batch downloaded"); - - // pre-emptively request more blocks from peers whilst we process current blocks, - self.request_batches(network)?; - self.process_completed_batches(network) - } - - /// Processes the batch with the given id. 
- /// The batch must exist and be ready for processing - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - fn process_batch( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - ) -> ProcessingResult { - // Only process batches if this chain is Syncing, and only one at a time - if self.state != ChainSyncingState::Syncing || self.current_processing_batch.is_some() { - return Ok(KeepChain); - } - - let Some(beacon_processor) = network.beacon_processor_if_enabled() else { - return Ok(KeepChain); - }; - - let Some(batch) = self.batches.get_mut(&batch_id) else { - return Err(RemoveChain::WrongChainState(format!( - "Trying to process a batch that does not exist: {}", - batch_id - ))); - }; - - // NOTE: We send empty batches to the processor in order to trigger the block processor - // result callback. This is done, because an empty batch could end a chain and the logic - // for removing chains and checking completion is in the callback. - - let (blocks, duration_in_awaiting_processing) = batch.start_processing()?; - metrics::observe_duration( - &metrics::SYNCING_CHAIN_BATCH_AWAITING_PROCESSING, - duration_in_awaiting_processing, - ); - - let process_id = ChainSegmentProcessId::RangeBatchId(self.id, batch_id); - self.current_processing_batch = Some(batch_id); - - if let Err(e) = beacon_processor.send_chain_segment(process_id, blocks) { - crit!(msg = "process_batch",error = %e, batch = ?self.processing_target, "Failed to send chain segment to processor."); - // This is unlikely to happen but it would stall syncing since the batch now has no - // blocks to continue, and the chain is expecting a processing result that won't - // arrive. To mitigate this, (fake) fail this processing so that the batch is - // re-downloaded. 
- self.on_batch_process_result(network, batch_id, &BatchProcessResult::NonFaultyFailure) - } else { - Ok(KeepChain) - } - } - - /// Processes the next ready batch, prioritizing optimistic batches over the processing target. - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - fn process_completed_batches( - &mut self, - network: &mut SyncNetworkContext, - ) -> ProcessingResult { - // Only process batches if this chain is Syncing and only process one batch at a time - if self.state != ChainSyncingState::Syncing || self.current_processing_batch.is_some() { - return Ok(KeepChain); - } - - // Find the id of the batch we are going to process. - // - // First try our optimistic start, if any. If this batch is ready, we process it. If the - // batch has not already been completed, check the current chain target. - if let Some(epoch) = self.optimistic_start { - if let Some(batch) = self.batches.get(&epoch) { - let state = batch.state(); - match state { - BatchState::AwaitingProcessing(..) => { - // this batch is ready - debug!(%epoch, "Processing optimistic start"); - return self.process_batch(network, epoch); - } - BatchState::Downloading(..) => { - // The optimistic batch is being downloaded. We wait for this before - // attempting to process other batches. - return Ok(KeepChain); - } - BatchState::Poisoned => unreachable!("Poisoned batch"), - BatchState::Processing(_) - | BatchState::AwaitingDownload - | BatchState::Failed => { - // these are all inconsistent states: - // - Processing -> `self.current_processing_batch` is None - // - Failed -> non recoverable batch. For an optimistic batch, it should - // have been removed - // - AwaitingDownload -> A recoverable failed batch should have been - // re-requested. 
- return Err(RemoveChain::WrongChainState(format!( - "Optimistic batch indicates inconsistent chain state: {:?}", - state - ))); - } - BatchState::AwaitingValidation(_) => { - // If an optimistic start is given to the chain after the corresponding - // batch has been requested and processed we can land here. We drop the - // optimistic candidate since we can't conclude whether the batch included - // blocks or not at this point - debug!(batch = %epoch, "Dropping optimistic candidate"); - self.optimistic_start = None; - } - } - } - } - - // if the optimistic target can't be processed, check the processing target - if let Some(batch) = self.batches.get(&self.processing_target) { - let state = batch.state(); - match state { - BatchState::AwaitingProcessing(..) => { - return self.process_batch(network, self.processing_target); - } - BatchState::Downloading(..) => { - // Batch is not ready, nothing to process - } - BatchState::Poisoned => unreachable!("Poisoned batch"), - BatchState::Failed | BatchState::AwaitingDownload | BatchState::Processing(_) => { - // these are all inconsistent states: - // - Failed -> non recoverable batch. Chain should have beee removed - // - AwaitingDownload -> A recoverable failed batch should have been - // re-requested. - // - Processing -> `self.current_processing_batch` is None - return Err(RemoveChain::WrongChainState(format!( - "Robust target batch indicates inconsistent chain state: {:?}", - state - ))); - } - BatchState::AwaitingValidation(_) => { - // we can land here if an empty optimistic batch succeeds processing and is - // inside the download buffer (between `self.processing_target` and - // `self.to_be_downloaded`). In this case, eventually the chain advances to the - // batch (`self.processing_target` reaches this point). 
- debug!( - batch = %self.processing_target, - "Chain encountered a robust batch awaiting validation" - ); - - self.processing_target += EPOCHS_PER_BATCH; - if self.to_be_downloaded <= self.processing_target { - self.to_be_downloaded = self.processing_target + EPOCHS_PER_BATCH; - } - self.request_batches(network)?; - } - } - } else if !self.good_peers_on_sampling_subnets(self.processing_target, network) { - // This is to handle the case where no batch was sent for the current processing - // target when there is no sampling peers available. This is a valid state and should not - // return an error. - return Ok(KeepChain); - } else { - return Err(RemoveChain::WrongChainState(format!( - "Batch not found for current processing target {}", - self.processing_target - ))); - } - Ok(KeepChain) - } - - /// The block processor has completed processing a batch. This function handles the result - /// of the batch processor. - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn on_batch_process_result( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - result: &BatchProcessResult, - ) -> ProcessingResult { - // the first two cases are possible if the chain advances while waiting for a processing - // result - let batch_state = self.visualize_batch_state(); - let batch = match &self.current_processing_batch { - Some(processing_id) if *processing_id != batch_id => { - debug!(batch_epoch = %batch_id, expected_batch_epoch = %processing_id,"Unexpected batch result"); - return Ok(KeepChain); - } - None => { - debug!(batch_epoch = %batch_id,"Chain was not expecting a batch result"); - return Ok(KeepChain); - } - _ => { - // batch_id matches, continue - self.current_processing_batch = None; - self.batches.get_mut(&batch_id).ok_or_else(|| { - RemoveChain::WrongChainState(format!( - "Current processing batch not found: {}", - batch_id - )) - })? 
- } - }; - - let batch_peers = batch.processing_peers().ok_or_else(|| { - RemoveChain::WrongBatchState(format!( - "Processing target is in wrong state: {:?}", - batch.state(), - )) - })?; - - // Log the process result and the batch for debugging purposes. - debug!( - result = ?result, - batch_epoch = %batch_id, - batch_state = ?batch_state, - ?batch, - "Batch processing result" - ); - - // We consider three cases. Batch was successfully processed, Batch failed processing due - // to a faulty peer, or batch failed processing but the peer can't be deemed faulty. - match result { - BatchProcessResult::Success { - sent_blocks, - imported_blocks, - } => { - if sent_blocks > imported_blocks { - let ignored_blocks = sent_blocks - imported_blocks; - metrics::inc_counter_vec_by( - &metrics::SYNCING_CHAINS_IGNORED_BLOCKS, - &[self.chain_type.into()], - ignored_blocks as u64, - ); - } - metrics::inc_counter_vec( - &metrics::SYNCING_CHAINS_PROCESSED_BATCHES, - &[self.chain_type.into()], - ); - - batch.processing_completed(BatchProcessingResult::Success)?; - - // was not empty = sent_blocks > 0 - if *sent_blocks > 0 { - // If the processed batch was not empty, we can validate previous unvalidated - // blocks. - self.advance_chain(network, batch_id); - // we register so that on chain switching we don't try it again - self.attempted_optimistic_starts.insert(batch_id); - } else if self.optimistic_start == Some(batch_id) { - // check if this batch corresponds to an optimistic batch. 
In this case, we - // reject it as an optimistic candidate since the batch was empty - self.reject_optimistic_batch( - network, - false, /* do not re-request */ - "batch was empty", - )?; - } - - if batch_id == self.processing_target { - self.processing_target += EPOCHS_PER_BATCH; - } - - // check if the chain has completed syncing - if self.current_processed_slot() >= self.target_head_slot { - // chain is completed - Err(RemoveChain::ChainCompleted) - } else { - // chain is not completed - // attempt to request more batches - self.request_batches(network)?; - // attempt to process more batches - self.process_completed_batches(network) - } - } - BatchProcessResult::FaultyFailure { - imported_blocks, - peer_action, - // TODO(sync): propagate error in logs - error: _, - } => { - // TODO(sync): De-dup between back and forwards sync - if let Some(penalty) = peer_action.block_peer { - // Penalize the peer appropiately. - network.report_peer(batch_peers.block(), penalty, "faulty_batch"); - } - - // Penalize each peer only once. Currently a peer_action does not mix different - // PeerAction levels. - for (peer, penalty) in peer_action - .column_peer - .iter() - .filter_map(|(column_index, penalty)| { - batch_peers - .column(column_index) - .map(|peer| (*peer, *penalty)) - }) - .unique() - { - network.report_peer(peer, penalty, "faulty_batch_column"); - } - - // Check if this batch is allowed to continue - match batch.processing_completed(BatchProcessingResult::FaultyFailure)? { - BatchOperationOutcome::Continue => { - // Chain can continue. Check if it can be moved forward. - if *imported_blocks > 0 { - // At least one block was successfully verified and imported, so we can be sure all - // previous batches are valid and we only need to download the current failed - // batch. - self.advance_chain(network, batch_id); - } - // Handle this invalid batch, that is within the re-process retries limit. 
- self.handle_invalid_batch(network, batch_id) - } - BatchOperationOutcome::Failed { blacklist } => { - // TODO(das): what peer action should we apply to the rest of - // peers? Say a batch repeatedly fails because a custody peer is not - // sending us its custody columns - let penalty = PeerAction::LowToleranceError; - - // Check that we have not exceeded the re-process retry counter, - // If a batch has exceeded the invalid batch lookup attempts limit, it means - // that it is likely all peers in this chain are are sending invalid batches - // repeatedly and are either malicious or faulty. We drop the chain and - // report all peers. - // There are some edge cases with forks that could land us in this situation. - // This should be unlikely, so we tolerate these errors, but not often. - warn!( - score_adjustment = %penalty, - batch_epoch = %batch_id, - "Batch failed to download. Dropping chain scoring peers" - ); - - for peer in self.peers.drain() { - network.report_peer(peer, penalty, "faulty_chain"); - } - Err(RemoveChain::ChainFailed { - blacklist, - failing_batch: batch_id, - }) - } - } - } - BatchProcessResult::NonFaultyFailure => { - batch.processing_completed(BatchProcessingResult::NonFaultyFailure)?; - // Simply redownload the batch. - self.send_batch(network, batch_id) - } - } - } - - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - fn reject_optimistic_batch( - &mut self, - network: &mut SyncNetworkContext, - redownload: bool, - reason: &str, - ) -> ProcessingResult { - if let Some(epoch) = self.optimistic_start.take() { - self.attempted_optimistic_starts.insert(epoch); - // if this batch is inside the current processing range, keep it, otherwise drop - // it. 
NOTE: this is done to prevent non-sequential batches coming from optimistic - // starts from filling up the buffer size - if epoch < self.to_be_downloaded { - debug!(%epoch, reason, "Rejected optimistic batch left for future use"); - // this batch is now treated as any other batch, and re-requested for future use - if redownload { - return self.send_batch(network, epoch); - } - } else { - debug!(%epoch, reason, "Rejected optimistic batch"); - self.batches.remove(&epoch); - } - } - - Ok(KeepChain) - } - - /// Removes any batches previous to the given `validating_epoch` and updates the current - /// boundaries of the chain. - /// - /// The `validating_epoch` must align with batch boundaries. - /// - /// If a previous batch has been validated and it had been re-processed, penalize the original - /// peer. - #[allow(clippy::modulo_one)] - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - fn advance_chain(&mut self, network: &mut SyncNetworkContext, validating_epoch: Epoch) { - // make sure this epoch produces an advancement - if validating_epoch <= self.start_epoch { - return; - } - - // safety check for batch boundaries - if validating_epoch % EPOCHS_PER_BATCH != self.start_epoch % EPOCHS_PER_BATCH { - crit!("Validating Epoch is not aligned"); - return; - } - - // batches in the range [BatchId, ..) 
(not yet validated) - let remaining_batches = self.batches.split_off(&validating_epoch); - // batches less than `validating_epoch` - let removed_batches = std::mem::replace(&mut self.batches, remaining_batches); - - for (id, batch) in removed_batches.into_iter() { - // only for batches awaiting validation can we be sure the last attempt is - // right, and thus, that any different attempt is wrong - match batch.state() { - BatchState::AwaitingValidation(ref processed_attempt) => { - for attempt in batch.attempts() { - // The validated batch has been re-processed - if attempt.hash != processed_attempt.hash { - // The re-downloaded version was different - // TODO(das): should penalize other peers? - let valid_attempt_peer = processed_attempt.block_peer(); - let bad_attempt_peer = attempt.block_peer(); - if valid_attempt_peer != bad_attempt_peer { - // A different peer sent the correct batch, the previous peer did not - // We negatively score the original peer. - let action = PeerAction::LowToleranceError; - debug!( - batch_epoch = %id, score_adjustment = %action, - original_peer = %bad_attempt_peer, new_peer = %valid_attempt_peer, - "Re-processed batch validated. Scoring original peer" - ); - network.report_peer( - bad_attempt_peer, - action, - "batch_reprocessed_original_peer", - ); - } else { - // The same peer corrected it's previous mistake. There was an error, so we - // negative score the original peer. - let action = PeerAction::MidToleranceError; - debug!( - batch_epoch = %id, - score_adjustment = %action, - original_peer = %bad_attempt_peer, - new_peer = %valid_attempt_peer, - "Re-processed batch validated by the same peer" - ); - network.report_peer( - bad_attempt_peer, - action, - "batch_reprocessed_same_peer", - ); - } - } - } - } - BatchState::Downloading(..) 
=> {} - BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => { - crit!("batch indicates inconsistent chain state while advancing chain") - } - BatchState::AwaitingProcessing(..) => {} - BatchState::Processing(_) => { - debug!(batch = %id, %batch, "Advancing chain while processing a batch"); - if let Some(processing_id) = self.current_processing_batch { - if id <= processing_id { - self.current_processing_batch = None; - } - } - } - } - } - - self.processing_target = self.processing_target.max(validating_epoch); - let old_start = self.start_epoch; - self.start_epoch = validating_epoch; - self.to_be_downloaded = self.to_be_downloaded.max(validating_epoch); - if self.batches.contains_key(&self.to_be_downloaded) { - // if a chain is advanced by Range beyond the previous `self.to_be_downloaded`, we - // won't have this batch, so we need to request it. - self.to_be_downloaded += EPOCHS_PER_BATCH; - } - if let Some(epoch) = self.optimistic_start { - if epoch <= validating_epoch { - self.optimistic_start = None; - } - } - debug!( - previous_start = %old_start, - new_start = %self.start_epoch, - processing_target = %self.processing_target, - "Chain advanced" - ); - } - - /// An invalid batch has been received that could not be processed, but that can be retried. - /// - /// These events occur when a peer has successfully responded with blocks, but the blocks we - /// have received are incorrect or invalid. This indicates the peer has not performed as - /// intended and can result in downvoting a peer. - #[instrument(parent = None,level = "info", fields(service = self.id, network), skip_all)] - fn handle_invalid_batch( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - ) -> ProcessingResult { - // The current batch could not be processed, indicating either the current or previous - // batches are invalid. 
- - // The previous batch could be incomplete due to the block sizes being too large to fit in - // a single RPC request or there could be consecutive empty batches which are not supposed - // to be there - - // The current (sub-optimal) strategy is to simply re-request all batches that could - // potentially be faulty. If a batch returns a different result than the original and - // results in successful processing, we downvote the original peer that sent us the batch. - - if let Some(epoch) = self.optimistic_start { - // If this batch is an optimistic batch, we reject this epoch as an optimistic - // candidate and try to re download it - if epoch == batch_id { - return self.reject_optimistic_batch(network, true, "batch was invalid"); - // since this is the optimistic batch, we can't consider previous batches as - // invalid. - } - } - // this is our robust `processing_target`. All previous batches must be awaiting - // validation - let mut redownload_queue = Vec::new(); - - for (id, batch) in self.batches.range_mut(..batch_id) { - if let BatchOperationOutcome::Failed { blacklist } = batch.validation_failed()? { - // remove the chain early - return Err(RemoveChain::ChainFailed { - blacklist, - failing_batch: *id, - }); - } - redownload_queue.push(*id); - } - - // no batch maxed out it process attempts, so now the chain's volatile progress must be - // reset - self.processing_target = self.start_epoch; - - for id in redownload_queue { - self.send_batch(network, id)?; - } - // finally, re-request the failed batch. - self.send_batch(network, batch_id) - } - - pub fn stop_syncing(&mut self) { - self.state = ChainSyncingState::Stopped; - } - - /// Either a new chain, or an old one with a peer list - /// This chain has been requested to start syncing. - /// - /// This could be new chain, or an old chain that is being resumed. 
- #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn start_syncing( - &mut self, - network: &mut SyncNetworkContext, - local_finalized_epoch: Epoch, - optimistic_start_epoch: Epoch, - ) -> ProcessingResult { - // to avoid dropping local progress, we advance the chain wrt its batch boundaries. This - let align = |epoch| { - // start_epoch + (number of batches in between)*length_of_batch - self.start_epoch + ((epoch - self.start_epoch) / EPOCHS_PER_BATCH) * EPOCHS_PER_BATCH - }; - // get the *aligned* epoch that produces a batch containing the `local_finalized_epoch` - let validating_epoch = align(local_finalized_epoch); - // align the optimistic_start too. - let optimistic_epoch = align(optimistic_start_epoch); - - // advance the chain to the new validating epoch - self.advance_chain(network, validating_epoch); - if self.optimistic_start.is_none() - && optimistic_epoch > self.processing_target - && !self.attempted_optimistic_starts.contains(&optimistic_epoch) - { - self.optimistic_start = Some(optimistic_epoch); - } - - // update the state - self.state = ChainSyncingState::Syncing; - - // begin requesting blocks from the peer pool, until all peers are exhausted. - self.request_batches(network)?; - - // start processing batches if needed - self.process_completed_batches(network) - } - - /// Add a peer to the chain. - /// - /// If the chain is active, this starts requesting batches from this peer. - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn add_peer( - &mut self, - network: &mut SyncNetworkContext, - peer_id: PeerId, - ) -> ProcessingResult { - self.peers.insert(peer_id); - self.request_batches(network) - } - - /// An RPC error has occurred. - /// - /// If the batch exists it is re-requested. 
- #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn inject_error( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - peer_id: &PeerId, - request_id: Id, - err: RpcResponseError, - ) -> ProcessingResult { - let batch_state = self.visualize_batch_state(); - if let Some(batch) = self.batches.get_mut(&batch_id) { - // A batch could be retried without the peer failing the request (disconnecting/ - // sending an error /timeout) if the peer is removed from the chain for other - // reasons. Check that this block belongs to the expected peer - // TODO(das): removed peer_id matching as the node may request a different peer for data - // columns. - if !batch.is_expecting_block(&request_id) { - debug!( - batch_epoch = %batch_id, - batch_state = ?batch.state(), - %peer_id, - %request_id, - ?batch_state, - "Batch not expecting block" - ); - return Ok(KeepChain); - } - debug!( - batch_epoch = %batch_id, - batch_state = ?batch.state(), - error = ?err, - %peer_id, - %request_id, - "Batch download error" - ); - if let BatchOperationOutcome::Failed { blacklist } = - batch.download_failed(Some(*peer_id))? - { - return Err(RemoveChain::ChainFailed { - blacklist, - failing_batch: batch_id, - }); - } - self.send_batch(network, batch_id) - } else { - debug!( - batch_epoch = %batch_id, - %peer_id, - %request_id, - batch_state, - "Batch not found" - ); - // this could be an error for an old batch, removed when the chain advances - Ok(KeepChain) - } - } - - /// Requests the batch assigned to the given id from a given peer. 
- #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn send_batch( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - ) -> ProcessingResult { - let batch_state = self.visualize_batch_state(); - if let Some(batch) = self.batches.get_mut(&batch_id) { - let request = batch.to_blocks_by_range_request(); - let failed_peers = batch.failed_block_peers(); - - // TODO(das): we should request only from peers that are part of this SyncingChain. - // However, then we hit the NoPeer error frequently which causes the batch to fail and - // the SyncingChain to be dropped. We need to handle this case more gracefully. - let synced_peers = network - .network_globals() - .peers - .read() - .synced_peers() - .cloned() - .collect::>(); - - match network.block_components_by_range_request( - request, - RangeRequestId::RangeSync { - chain_id: self.id, - batch_id, - }, - &synced_peers, - &failed_peers, - ) { - Ok(request_id) => { - // inform the batch about the new request - batch.start_downloading(request_id)?; - if self - .optimistic_start - .map(|epoch| epoch == batch_id) - .unwrap_or(false) - { - debug!(epoch = %batch_id, %batch, %batch_state, "Requesting optimistic batch"); - } else { - debug!(epoch = %batch_id, %batch, %batch_state, "Requesting batch"); - } - return Ok(KeepChain); - } - Err(e) => match e { - // TODO(das): Handle the NoPeer case explicitly and don't drop the batch. For - // sync to work properly it must be okay to have "stalled" batches in - // AwaitingDownload state. Currently it will error with invalid state if - // that happens. Sync manager must periodicatlly prune stalled batches like - // we do for lookup sync. Then we can deprecate the redundant - // `good_peers_on_sampling_subnets` checks. 
- e - @ (RpcRequestSendError::NoPeer(_) | RpcRequestSendError::InternalError(_)) => { - // NOTE: under normal conditions this shouldn't happen but we handle it anyway - warn!(%batch_id, error = ?e, "batch_id" = %batch_id, %batch, "Could not send batch request"); - // register the failed download and check if the batch can be retried - batch.start_downloading(1)?; // fake request_id = 1 is not relevant - match batch.download_failed(None)? { - BatchOperationOutcome::Failed { blacklist } => { - return Err(RemoveChain::ChainFailed { - blacklist, - failing_batch: batch_id, - }) - } - BatchOperationOutcome::Continue => { - return self.send_batch(network, batch_id) - } - } - } - }, - } - } - - Ok(KeepChain) - } - - /// Returns true if this chain is currently syncing. - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn is_syncing(&self) -> bool { - match self.state { - ChainSyncingState::Syncing => true, - ChainSyncingState::Stopped => false, - } - } - - /// Kickstarts the chain by sending for processing batches that are ready and requesting more - /// batches if needed. - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn resume( - &mut self, - network: &mut SyncNetworkContext, - ) -> Result { - // Request more batches if needed. - self.request_batches(network)?; - // If there is any batch ready for processing, send it. - self.process_completed_batches(network) - } - - /// Attempts to request the next required batches from the peer pool if the chain is syncing. It will exhaust the peer - /// pool and left over batches until the batch buffer is reached or all peers are exhausted. 
- #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - fn request_batches(&mut self, network: &mut SyncNetworkContext) -> ProcessingResult { - if !matches!(self.state, ChainSyncingState::Syncing) { - return Ok(KeepChain); - } - - // find the next pending batch and request it from the peer - - // check if we have the batch for our optimistic start. If not, request it first. - // We wait for this batch before requesting any other batches. - if let Some(epoch) = self.optimistic_start { - if !self.good_peers_on_sampling_subnets(epoch, network) { - debug!("Waiting for peers to be available on sampling column subnets"); - return Ok(KeepChain); - } - - if let Entry::Vacant(entry) = self.batches.entry(epoch) { - let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH); - entry.insert(optimistic_batch); - self.send_batch(network, epoch)?; - } - return Ok(KeepChain); - } - - // find the next pending batch and request it from the peer - // Note: for this function to not infinite loop we must: - // - If `include_next_batch` returns Some we MUST increase the count of batches that are - // accounted in the `BACKFILL_BATCH_BUFFER_SIZE` limit in the `matches!` statement of - // that function. - while let Some(batch_id) = self.include_next_batch(network) { - // send the batch - self.send_batch(network, batch_id)?; - } - - // No more batches, simply stop - Ok(KeepChain) - } - - /// Checks all sampling column subnets for peers. Returns `true` if there is at least one peer in - /// every sampling column subnet. 
- fn good_peers_on_sampling_subnets( - &self, - epoch: Epoch, - network: &SyncNetworkContext, - ) -> bool { - if network.chain.spec.is_peer_das_enabled_for_epoch(epoch) { - // Require peers on all sampling column subnets before sending batches - let peers_on_all_custody_subnets = network - .network_globals() - .sampling_subnets() - .iter() - .all(|subnet_id| { - let peer_count = network - .network_globals() - .peers - .read() - .good_custody_subnet_peer(*subnet_id) - .count(); - - peer_count > 0 - }); - peers_on_all_custody_subnets - } else { - true - } - } - - /// Creates the next required batch from the chain. If there are no more batches required, - /// `false` is returned. - #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - fn include_next_batch(&mut self, network: &mut SyncNetworkContext) -> Option { - // don't request batches beyond the target head slot - if self - .to_be_downloaded - .start_slot(T::EthSpec::slots_per_epoch()) - >= self.target_head_slot - { - return None; - } - - // only request batches up to the buffer size limit - // NOTE: we don't count batches in the AwaitingValidation state, to prevent stalling sync - // if the current processing window is contained in a long range of skip slots. - let in_buffer = |batch: &BatchInfo| { - matches!( - batch.state(), - BatchState::Downloading(..) | BatchState::AwaitingProcessing(..) - ) - }; - if self - .batches - .iter() - .filter(|&(_epoch, batch)| in_buffer(batch)) - .count() - > BATCH_BUFFER_SIZE as usize - { - return None; - } - - // don't send batch requests until we have peers on sampling subnets - // TODO(das): this is a workaround to avoid sending out excessive block requests because - // block and data column requests are currently coupled. This can be removed once we find a - // way to decouple the requests and do retries individually, see issue #6258. 
- if !self.good_peers_on_sampling_subnets(self.to_be_downloaded, network) { - debug!("Waiting for peers to be available on custody column subnets"); - return None; - } - - // If no batch needs a retry, attempt to send the batch of the next epoch to download - let next_batch_id = self.to_be_downloaded; - // this batch could have been included already being an optimistic batch - match self.batches.entry(next_batch_id) { - Entry::Occupied(_) => { - // this batch doesn't need downloading, let this same function decide the next batch - self.to_be_downloaded += EPOCHS_PER_BATCH; - self.include_next_batch(network) - } - Entry::Vacant(entry) => { - entry.insert(BatchInfo::new(&next_batch_id, EPOCHS_PER_BATCH)); - self.to_be_downloaded += EPOCHS_PER_BATCH; - Some(next_batch_id) - } - } - } - - /// Creates a string visualization of the current state of the chain, to make it easier for debugging and understanding - /// where sync is up to from glancing at the logs. - /// - /// This produces a string of the form: [D,E,E,E,E] - /// to indicate the current buffer state of the chain. The symbols are defined on each of the - /// batch states. See [BatchState::visualize] for symbol definitions. 
- #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - fn visualize_batch_state(&self) -> String { - let mut visualization_string = String::with_capacity((BATCH_BUFFER_SIZE * 3) as usize); - - // Start of the block - visualization_string.push('['); - - for mut batch_index in 0..BATCH_BUFFER_SIZE { - if let Some(batch) = self - .batches - .get(&(self.processing_target + batch_index as u64 * EPOCHS_PER_BATCH)) - { - visualization_string.push(batch.visualize()); - if batch_index != BATCH_BUFFER_SIZE { - // Add a comma in between elements - visualization_string.push(','); - } - } else { - // No batch exists, it is on our list to be downloaded - // Fill in the rest of the gaps - while batch_index < BATCH_BUFFER_SIZE { - visualization_string.push('E'); - // Add a comma between the empty batches - if batch_index < BATCH_BUFFER_SIZE.saturating_sub(1) { - visualization_string.push(',') - } - batch_index += 1; - } - break; - } - } - visualization_string.push(']'); - visualization_string - } -} - -use super::batch::WrongState as WrongBatchState; -impl From for RemoveChain { - fn from(err: WrongBatchState) -> Self { - RemoveChain::WrongBatchState(err.0) - } -} - -impl RemoveChain { - pub fn is_critical(&self) -> bool { - matches!( - self, - RemoveChain::WrongBatchState(..) | RemoveChain::WrongChainState(..) - ) - } -} - -impl From for SyncingChainType { - fn from(value: RangeSyncType) -> Self { - match value { - RangeSyncType::Head => Self::Head, - RangeSyncType::Finalized => Self::Finalized, - } - } -} diff --git a/beacon_node/network/src/sync/range_sync/chain_collection.rs b/beacon_node/network/src/sync/range_sync/chain_collection.rs deleted file mode 100644 index 9f500c61e0b..00000000000 --- a/beacon_node/network/src/sync/range_sync/chain_collection.rs +++ /dev/null @@ -1,530 +0,0 @@ -//! This provides the logic for the finalized and head chains. -//! -//! Each chain type is stored in it's own map. 
A variety of helper functions are given along with -//! this struct to simplify the logic of the other layers of sync. - -use super::chain::{ChainId, ProcessingResult, RemoveChain, SyncingChain}; -use super::sync_type::RangeSyncType; -use crate::metrics; -use crate::sync::network_context::SyncNetworkContext; -use beacon_chain::{BeaconChain, BeaconChainTypes}; -use fnv::FnvHashMap; -use lighthouse_network::service::api_types::Id; -use lighthouse_network::PeerId; -use lighthouse_network::SyncInfo; -use logging::crit; -use smallvec::SmallVec; -use std::collections::hash_map::Entry; -use std::collections::HashMap; -use std::sync::Arc; -use tracing::{debug, error}; -use types::EthSpec; -use types::{Epoch, Hash256, Slot}; - -/// The number of head syncing chains to sync at a time. -const PARALLEL_HEAD_CHAINS: usize = 2; - -/// Minimum work we require a finalized chain to do before picking a chain with more peers. -const MIN_FINALIZED_CHAIN_PROCESSED_EPOCHS: u64 = 10; - -/// The state of the long range/batch sync. -#[derive(Clone)] -pub enum RangeSyncState { - /// A finalized chain is being synced. - Finalized(Id), - /// There are no finalized chains and we are syncing one more head chains. - Head(SmallVec<[Id; PARALLEL_HEAD_CHAINS]>), - /// There are no head or finalized chains and no long range sync is in progress. - Idle, -} - -pub type SyncChainStatus = - Result, &'static str>; - -/// A collection of finalized and head chains currently being processed. -pub struct ChainCollection { - /// The beacon chain for processing. - beacon_chain: Arc>, - /// The set of finalized chains being synced. - finalized_chains: FnvHashMap>, - /// The set of head chains being synced. - head_chains: FnvHashMap>, - /// The current sync state of the process. 
- state: RangeSyncState, -} - -impl ChainCollection { - pub fn new(beacon_chain: Arc>) -> Self { - ChainCollection { - beacon_chain, - finalized_chains: FnvHashMap::default(), - head_chains: FnvHashMap::default(), - state: RangeSyncState::Idle, - } - } - - /// Updates the Syncing state of the collection after a chain is removed. - fn on_chain_removed(&mut self, id: &ChainId, was_syncing: bool, sync_type: RangeSyncType) { - metrics::inc_counter_vec(&metrics::SYNCING_CHAINS_REMOVED, &[sync_type.as_str()]); - self.update_metrics(); - - match self.state { - RangeSyncState::Finalized(ref syncing_id) => { - if syncing_id == id { - // the finalized chain that was syncing was removed - debug_assert!(was_syncing && sync_type == RangeSyncType::Finalized); - let syncing_head_ids: SmallVec<[Id; PARALLEL_HEAD_CHAINS]> = self - .head_chains - .iter() - .filter(|(_id, chain)| chain.is_syncing()) - .map(|(id, _)| *id) - .collect(); - self.state = if syncing_head_ids.is_empty() { - RangeSyncState::Idle - } else { - RangeSyncState::Head(syncing_head_ids) - }; - } else { - // we removed a head chain, or a stopped finalized chain - debug_assert!(!was_syncing || sync_type != RangeSyncType::Finalized); - } - } - RangeSyncState::Head(ref mut syncing_head_ids) => { - if let Some(index) = syncing_head_ids - .iter() - .enumerate() - .find(|(_, &chain_id)| &chain_id == id) - .map(|(i, _)| i) - { - // a syncing head chain was removed - debug_assert!(was_syncing); - syncing_head_ids.swap_remove(index); - if syncing_head_ids.is_empty() { - self.state = RangeSyncState::Idle; - } - } else { - debug_assert!(!was_syncing); - } - } - RangeSyncState::Idle => { - // the removed chain should not be syncing - debug_assert!(!was_syncing) - } - } - } - - /// Calls `func` on every chain of the collection. If the result is - /// `ProcessingResult::RemoveChain`, the chain is removed and returned. - /// NOTE: `func` must not change the syncing state of a chain. 
- pub fn call_all(&mut self, mut func: F) -> Vec<(SyncingChain, RangeSyncType, RemoveChain)> - where - F: FnMut(&mut SyncingChain) -> ProcessingResult, - { - let mut to_remove = Vec::new(); - - for (id, chain) in self.finalized_chains.iter_mut() { - if let Err(remove_reason) = func(chain) { - to_remove.push((*id, RangeSyncType::Finalized, remove_reason)); - } - } - - for (id, chain) in self.head_chains.iter_mut() { - if let Err(remove_reason) = func(chain) { - to_remove.push((*id, RangeSyncType::Head, remove_reason)); - } - } - - let mut results = Vec::with_capacity(to_remove.len()); - for (id, sync_type, reason) in to_remove.into_iter() { - let chain = match sync_type { - RangeSyncType::Finalized => self.finalized_chains.remove(&id), - RangeSyncType::Head => self.head_chains.remove(&id), - }; - let chain = chain.expect("Chain exists"); - self.on_chain_removed(&id, chain.is_syncing(), sync_type); - results.push((chain, sync_type, reason)); - } - results - } - - /// Executes a function on the chain with the given id. - /// - /// If the function returns `ProcessingResult::RemoveChain`, the chain is removed and returned. - /// If the chain is found, its syncing type is returned, or an error otherwise. - /// NOTE: `func` should not change the sync state of a chain. 
- #[allow(clippy::type_complexity)] - pub fn call_by_id( - &mut self, - id: ChainId, - func: F, - ) -> Result<(Option<(SyncingChain, RemoveChain)>, RangeSyncType), ()> - where - F: FnOnce(&mut SyncingChain) -> ProcessingResult, - { - if let Entry::Occupied(mut entry) = self.finalized_chains.entry(id) { - // Search in our finalized chains first - if let Err(remove_reason) = func(entry.get_mut()) { - let chain = entry.remove(); - self.on_chain_removed(&id, chain.is_syncing(), RangeSyncType::Finalized); - Ok((Some((chain, remove_reason)), RangeSyncType::Finalized)) - } else { - Ok((None, RangeSyncType::Finalized)) - } - } else if let Entry::Occupied(mut entry) = self.head_chains.entry(id) { - // Search in our head chains next - if let Err(remove_reason) = func(entry.get_mut()) { - let chain = entry.remove(); - self.on_chain_removed(&id, chain.is_syncing(), RangeSyncType::Head); - Ok((Some((chain, remove_reason)), RangeSyncType::Head)) - } else { - Ok((None, RangeSyncType::Head)) - } - } else { - // Chain was not found in the finalized collection, nor the head collection - Err(()) - } - } - - /// Updates the state of the chain collection. - /// - /// This removes any out-dated chains, swaps to any higher priority finalized chains and - /// updates the state of the collection. This starts head chains syncing if any are required to - /// do so. - pub fn update( - &mut self, - network: &mut SyncNetworkContext, - local: &SyncInfo, - awaiting_head_peers: &mut HashMap, - ) { - // Remove any outdated finalized/head chains - self.purge_outdated_chains(local, awaiting_head_peers); - - let local_head_epoch = local.head_slot.epoch(T::EthSpec::slots_per_epoch()); - // Choose the best finalized chain if one needs to be selected. - self.update_finalized_chains(network, local.finalized_epoch, local_head_epoch); - - if !matches!(self.state, RangeSyncState::Finalized(_)) { - // Handle head syncing chains if there are no finalized chains left. 
- self.update_head_chains( - network, - local.finalized_epoch, - local_head_epoch, - awaiting_head_peers, - ); - } - } - - pub fn state(&self) -> SyncChainStatus { - match self.state { - RangeSyncState::Finalized(ref syncing_id) => { - let chain = self - .finalized_chains - .get(syncing_id) - .ok_or("Finalized syncing chain not found")?; - Ok(Some(( - RangeSyncType::Finalized, - chain.start_epoch.start_slot(T::EthSpec::slots_per_epoch()), - chain.target_head_slot, - ))) - } - RangeSyncState::Head(ref syncing_head_ids) => { - let mut range: Option<(Slot, Slot)> = None; - for id in syncing_head_ids { - let chain = self - .head_chains - .get(id) - .ok_or("Head syncing chain not found")?; - let start = chain.start_epoch.start_slot(T::EthSpec::slots_per_epoch()); - let target = chain.target_head_slot; - - range = range - .map(|(min_start, max_slot)| (min_start.min(start), max_slot.max(target))) - .or(Some((start, target))); - } - let (start_slot, target_slot) = range.ok_or("Syncing head with empty head ids")?; - Ok(Some((RangeSyncType::Head, start_slot, target_slot))) - } - RangeSyncState::Idle => Ok(None), - } - } - - /// This looks at all current finalized chains and decides if a new chain should be prioritised - /// or not. 
- fn update_finalized_chains( - &mut self, - network: &mut SyncNetworkContext, - local_epoch: Epoch, - local_head_epoch: Epoch, - ) { - // Find the chain with most peers and check if it is already syncing - if let Some((mut new_id, max_peers)) = self - .finalized_chains - .iter() - .max_by_key(|(_, chain)| chain.available_peers()) - .map(|(id, chain)| (*id, chain.available_peers())) - { - let mut old_id = None; - if let RangeSyncState::Finalized(syncing_id) = self.state { - if syncing_id == new_id { - // best chain is already syncing - old_id = Some(None); - } else { - // chains are different, check that they don't have the same number of peers - if let Some(syncing_chain) = self.finalized_chains.get_mut(&syncing_id) { - if max_peers > syncing_chain.available_peers() - && syncing_chain.processed_epochs() - > MIN_FINALIZED_CHAIN_PROCESSED_EPOCHS - { - syncing_chain.stop_syncing(); - old_id = Some(Some(syncing_id)); - } else { - // chains have the same number of peers, pick the currently syncing - // chain to avoid unnecessary switchings and try to advance it - new_id = syncing_id; - old_id = Some(None); - } - } - } - } - - let chain = self - .finalized_chains - .get_mut(&new_id) - .expect("Chain exists"); - - match old_id { - Some(Some(old_id)) => debug!(old_id, id = chain.id(), "Switching finalized chains"), - None => debug!(id = chain.id(), "Syncing new finalized chain"), - Some(None) => { - // this is the same chain. We try to advance it. 
- } - } - - // update the state to a new finalized state - self.state = RangeSyncState::Finalized(new_id); - - if let Err(remove_reason) = chain.start_syncing(network, local_epoch, local_head_epoch) - { - if remove_reason.is_critical() { - crit!(chain = new_id, reason = ?remove_reason, "Chain removed while switching chains"); - } else { - // this happens only if sending a batch over the `network` fails a lot - error!(chain = new_id, reason = ?remove_reason, "Chain removed while switching chains"); - } - self.finalized_chains.remove(&new_id); - self.on_chain_removed(&new_id, true, RangeSyncType::Finalized); - } - } - } - - /// Start syncing any head chains if required. - fn update_head_chains( - &mut self, - network: &mut SyncNetworkContext, - local_epoch: Epoch, - local_head_epoch: Epoch, - awaiting_head_peers: &mut HashMap, - ) { - // Include the awaiting head peers - for (peer_id, peer_sync_info) in awaiting_head_peers.drain() { - debug!("including head peer"); - self.add_peer_or_create_chain( - local_epoch, - peer_sync_info.head_root, - peer_sync_info.head_slot, - peer_id, - RangeSyncType::Head, - network, - ); - } - - if self.head_chains.is_empty() { - // There are no finalized chains, update the state. 
- self.state = RangeSyncState::Idle; - return; - } - - // Order chains by available peers, if two chains have the same number of peers, prefer one - // that is already syncing - let mut preferred_ids = self - .head_chains - .iter() - .map(|(id, chain)| (chain.available_peers(), !chain.is_syncing(), *id)) - .collect::>(); - preferred_ids.sort_unstable(); - - let mut syncing_chains = SmallVec::<[Id; PARALLEL_HEAD_CHAINS]>::new(); - for (_, _, id) in preferred_ids { - let chain = self.head_chains.get_mut(&id).expect("known chain"); - if syncing_chains.len() < PARALLEL_HEAD_CHAINS { - // start this chain if it's not already syncing - if !chain.is_syncing() { - debug!(id = chain.id(), "New head chain started syncing"); - } - if let Err(remove_reason) = - chain.start_syncing(network, local_epoch, local_head_epoch) - { - self.head_chains.remove(&id); - if remove_reason.is_critical() { - crit!(chain = id, reason = ?remove_reason, "Chain removed while switching head chains"); - } else { - error!(chain = id, reason = ?remove_reason, "Chain removed while switching head chains"); - } - } else { - syncing_chains.push(id); - } - } else { - // stop any other chain - chain.stop_syncing(); - } - } - - self.state = if syncing_chains.is_empty() { - RangeSyncState::Idle - } else { - RangeSyncState::Head(syncing_chains) - }; - } - - /// Returns if `true` if any finalized chains exist, `false` otherwise. - pub fn is_finalizing_sync(&self) -> bool { - !self.finalized_chains.is_empty() - } - - /// Removes any outdated finalized or head chains. - /// This removes chains with no peers, or chains whose start block slot is less than our current - /// finalized block slot. Peers that would create outdated chains are removed too. 
- pub fn purge_outdated_chains( - &mut self, - local_info: &SyncInfo, - awaiting_head_peers: &mut HashMap, - ) { - let local_finalized_slot = local_info - .finalized_epoch - .start_slot(T::EthSpec::slots_per_epoch()); - - let beacon_chain = &self.beacon_chain; - - let is_outdated = |target_slot: &Slot, target_root: &Hash256| { - target_slot <= &local_finalized_slot - || beacon_chain.block_is_known_to_fork_choice(target_root) - }; - - // Retain only head peers that remain relevant - awaiting_head_peers.retain(|_peer_id, peer_sync_info| { - !is_outdated(&peer_sync_info.head_slot, &peer_sync_info.head_root) - }); - - // Remove chains that are out-dated - let mut removed_chains = Vec::new(); - removed_chains.extend(self.finalized_chains.iter().filter_map(|(id, chain)| { - if is_outdated(&chain.target_head_slot, &chain.target_head_root) - || chain.available_peers() == 0 - { - debug!(id, "Purging out of finalized chain"); - Some((*id, chain.is_syncing(), RangeSyncType::Finalized)) - } else { - None - } - })); - - removed_chains.extend(self.head_chains.iter().filter_map(|(id, chain)| { - if is_outdated(&chain.target_head_slot, &chain.target_head_root) - || chain.available_peers() == 0 - { - debug!(id, "Purging out of date head chain"); - Some((*id, chain.is_syncing(), RangeSyncType::Head)) - } else { - None - } - })); - - // update the state of the collection - for (id, was_syncing, sync_type) in removed_chains { - // remove each chain, updating the state for each removal. - match sync_type { - RangeSyncType::Finalized => self.finalized_chains.remove(&id), - RangeSyncType::Head => self.head_chains.remove(&id), - }; - self.on_chain_removed(&id, was_syncing, sync_type); - } - } - - /// Adds a peer to a chain with the given target, or creates a new syncing chain if it doesn't - /// exists. 
- #[allow(clippy::too_many_arguments)] - pub fn add_peer_or_create_chain( - &mut self, - start_epoch: Epoch, - target_head_root: Hash256, - target_head_slot: Slot, - peer: PeerId, - sync_type: RangeSyncType, - network: &mut SyncNetworkContext, - ) { - let collection = if let RangeSyncType::Finalized = sync_type { - &mut self.finalized_chains - } else { - &mut self.head_chains - }; - - match collection - .iter_mut() - .find(|(_, chain)| chain.has_same_target(target_head_slot, target_head_root)) - { - Some((&id, chain)) => { - debug!(peer_id = %peer, ?sync_type, id, "Adding peer to known chain"); - debug_assert_eq!(chain.target_head_root, target_head_root); - debug_assert_eq!(chain.target_head_slot, target_head_slot); - if let Err(remove_reason) = chain.add_peer(network, peer) { - if remove_reason.is_critical() { - crit!(id, reason = ?remove_reason, "Chain removed after adding peer"); - } else { - error!(id, reason = ?remove_reason, "Chain removed after adding peer"); - } - let is_syncing = chain.is_syncing(); - collection.remove(&id); - self.on_chain_removed(&id, is_syncing, sync_type); - } - } - None => { - let peer_rpr = peer.to_string(); - let id = network.next_id(); - let new_chain = SyncingChain::new( - id, - start_epoch, - target_head_slot, - target_head_root, - peer, - sync_type.into(), - ); - - debug!( - peer_id = peer_rpr, - ?sync_type, - id, - %start_epoch, - %target_head_slot, - ?target_head_root, - "New chain added to sync" - ); - collection.insert(id, new_chain); - metrics::inc_counter_vec(&metrics::SYNCING_CHAINS_ADDED, &[sync_type.as_str()]); - self.update_metrics(); - } - } - } - - fn update_metrics(&self) { - metrics::set_gauge_vec( - &metrics::SYNCING_CHAINS_COUNT, - &[RangeSyncType::Finalized.as_str()], - self.finalized_chains.len() as i64, - ); - metrics::set_gauge_vec( - &metrics::SYNCING_CHAINS_COUNT, - &[RangeSyncType::Head.as_str()], - self.head_chains.len() as i64, - ); - } -} diff --git a/beacon_node/network/src/sync/range_sync/mod.rs 
b/beacon_node/network/src/sync/range_sync/mod.rs deleted file mode 100644 index 1218e0cd09c..00000000000 --- a/beacon_node/network/src/sync/range_sync/mod.rs +++ /dev/null @@ -1,18 +0,0 @@ -//! This provides the logic for syncing a chain when the local node is far behind it's current -//! peers. - -mod batch; -mod chain; -mod chain_collection; -mod range; -mod sync_type; - -pub use batch::{ - BatchConfig, BatchInfo, BatchOperationOutcome, BatchPeers, BatchProcessingResult, BatchState, - ByRangeRequestType, -}; -pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; -#[cfg(test)] -pub use chain_collection::SyncChainStatus; -pub use range::RangeSync; -pub use sync_type::RangeSyncType; diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs deleted file mode 100644 index e2c076484a5..00000000000 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ /dev/null @@ -1,446 +0,0 @@ -//! This contains the logic for the long range (batch) sync strategy. -//! -//! The general premise is to group peers by their self-proclaimed finalized blocks and head -//! blocks. Once grouped, the peers become sources to download a specific `Chain`. A `Chain` is a -//! collection of blocks that terminates at the specified target head. -//! -//! This sync strategy can be separated into two distinct forms: -//! - Finalized Chain Sync -//! - Head Chain Sync -//! -//! ## Finalized chain sync -//! -//! This occurs when a peer connects that claims to have a finalized head slot that is greater -//! than our own. In this case, we form a chain from our last finalized epoch, to their claimed -//! finalized slot. Any peer that also claims to have this last finalized slot is added to a pool -//! of peers from which batches of blocks may be downloaded. Blocks are downloaded until the -//! finalized slot of the chain is reached. Once reached, all peers within the pool are sent a -//! 
STATUS message to potentially start a head chain sync, or check if further finalized chains -//! need to be downloaded. -//! -//! A few interesting notes about finalized chain syncing: -//! - Only one finalized chain can sync at a time -//! - The finalized chain with the largest peer pool takes priority. -//! - As one finalized chain completes, others are checked to see if we they can be continued, -//! otherwise they are removed. -//! -//! ## Head Chain Sync -//! -//! If a peer joins and there is no active finalized chains being synced, and it's head is beyond -//! our `SLOT_IMPORT_TOLERANCE` a chain is formed starting from this peers finalized epoch (this -//! has been necessarily downloaded by our node, otherwise we would start a finalized chain sync) -//! to this peers head slot. Any other peers that match this head slot and head root, are added to -//! this chain's peer pool, which will be downloaded in parallel. -//! -//! Unlike finalized chains, head chains can be synced in parallel. -//! -//! ## Batch Syncing -//! -//! Each chain is downloaded in batches of blocks. The batched blocks are processed sequentially -//! and further batches are requested as current blocks are being processed. 
- -use super::chain::{BatchId, ChainId, RemoveChain, SyncingChain}; -use super::chain_collection::{ChainCollection, SyncChainStatus}; -use super::sync_type::RangeSyncType; -use super::BatchPeers; -use crate::metrics; -use crate::status::ToStatusMessage; -use crate::sync::network_context::{RpcResponseError, SyncNetworkContext}; -use crate::sync::BatchProcessResult; -use beacon_chain::block_verification_types::RpcBlock; -use beacon_chain::{BeaconChain, BeaconChainTypes}; -use lighthouse_network::rpc::GoodbyeReason; -use lighthouse_network::service::api_types::Id; -use lighthouse_network::{PeerId, SyncInfo}; -use logging::crit; -use lru_cache::LRUTimeCache; -use std::collections::HashMap; -use std::sync::Arc; -use tracing::{debug, instrument, trace, warn}; -use types::{Epoch, EthSpec, Hash256}; - -/// For how long we store failed finalized chains to prevent retries. -const FAILED_CHAINS_EXPIRY_SECONDS: u64 = 30; - -/// The primary object dealing with long range/batch syncing. This contains all the active and -/// non-active chains that need to be processed before the syncing is considered complete. This -/// holds the current state of the long range sync. -pub struct RangeSync { - /// The beacon chain for processing. - beacon_chain: Arc>, - /// Last known sync info of our useful connected peers. We use this information to create Head - /// chains after all finalized chains have ended. - awaiting_head_peers: HashMap, - /// A collection of chains that need to be downloaded. This stores any head or finalized chains - /// that need to be downloaded. - chains: ChainCollection, - /// Chains that have failed and are stored to prevent being retried. 
- failed_chains: LRUTimeCache, -} - -impl RangeSync -where - T: BeaconChainTypes, -{ - #[instrument(parent = None, - level = "info", - fields(component = "range_sync"), - name = "range_sync", - skip_all - )] - pub fn new(beacon_chain: Arc>) -> Self { - RangeSync { - beacon_chain: beacon_chain.clone(), - chains: ChainCollection::new(beacon_chain), - failed_chains: LRUTimeCache::new(std::time::Duration::from_secs( - FAILED_CHAINS_EXPIRY_SECONDS, - )), - awaiting_head_peers: HashMap::new(), - } - } - - #[cfg(test)] - pub(crate) fn __failed_chains(&mut self) -> Vec { - self.failed_chains.keys().copied().collect() - } - - #[instrument(parent = None, - level = "info", - fields(component = "range_sync"), - name = "range_sync", - skip_all - )] - pub fn state(&self) -> SyncChainStatus { - self.chains.state() - } - - /// A useful peer has been added. The SyncManager has identified this peer as needing either - /// a finalized or head chain sync. This processes the peer and starts/resumes any chain that - /// may need to be synced as a result. A new peer, may increase the peer pool of a finalized - /// chain, this may result in a different finalized chain from syncing as finalized chains are - /// prioritised by peer-pool size. - #[instrument(parent = None, - level = "info", - fields(component = "range_sync"), - name = "range_sync", - skip_all - )] - pub fn add_peer( - &mut self, - network: &mut SyncNetworkContext, - local_info: SyncInfo, - peer_id: PeerId, - remote_info: SyncInfo, - ) { - // evaluate which chain to sync from - - // determine if we need to run a sync to the nearest finalized state or simply sync to - // its current head - - // convenience variable - let remote_finalized_slot = remote_info - .finalized_epoch - .start_slot(T::EthSpec::slots_per_epoch()); - - // NOTE: A peer that has been re-status'd may now exist in multiple finalized chains. This - // is OK since we since only one finalized chain at a time. 
- - // determine which kind of sync to perform and set up the chains - match RangeSyncType::new(self.beacon_chain.as_ref(), &local_info, &remote_info) { - RangeSyncType::Finalized => { - // Make sure we have not recently tried this chain - if self.failed_chains.contains(&remote_info.finalized_root) { - debug!(failed_root = ?remote_info.finalized_root, %peer_id,"Disconnecting peer that belongs to previously failed chain"); - network.goodbye_peer(peer_id, GoodbyeReason::IrrelevantNetwork); - return; - } - - // Finalized chain search - debug!(%peer_id, "Finalization sync peer joined"); - self.awaiting_head_peers.remove(&peer_id); - - // Because of our change in finalized sync batch size from 2 to 1 and our transition - // to using exact epoch boundaries for batches (rather than one slot past the epoch - // boundary), we need to sync finalized sync to 2 epochs + 1 slot past our peer's - // finalized slot in order to finalize the chain locally. - let target_head_slot = - remote_finalized_slot + (2 * T::EthSpec::slots_per_epoch()) + 1; - - // Note: We keep current head chains. These can continue syncing whilst we complete - // this new finalized chain. - - self.chains.add_peer_or_create_chain( - local_info.finalized_epoch, - remote_info.finalized_root, - target_head_slot, - peer_id, - RangeSyncType::Finalized, - network, - ); - - self.chains - .update(network, &local_info, &mut self.awaiting_head_peers); - } - RangeSyncType::Head => { - // This peer requires a head chain sync - - if self.chains.is_finalizing_sync() { - // If there are finalized chains to sync, finish these first, before syncing head - // chains. - trace!(%peer_id, awaiting_head_peers = &self.awaiting_head_peers.len(),"Waiting for finalized sync to complete"); - self.awaiting_head_peers.insert(peer_id, remote_info); - return; - } - - // if the peer existed in any other head chain, remove it. 
- self.remove_peer(network, &peer_id); - self.awaiting_head_peers.remove(&peer_id); - - // The new peer has the same finalized (earlier filters should prevent a peer with an - // earlier finalized chain from reaching here). - - let start_epoch = std::cmp::min(local_info.head_slot, remote_finalized_slot) - .epoch(T::EthSpec::slots_per_epoch()); - self.chains.add_peer_or_create_chain( - start_epoch, - remote_info.head_root, - remote_info.head_slot, - peer_id, - RangeSyncType::Head, - network, - ); - self.chains - .update(network, &local_info, &mut self.awaiting_head_peers); - } - } - } - - /// A `BlocksByRange` response has been received from the network. - /// - /// This function finds the chain that made this request. Once found, processes the result. - /// This request could complete a chain or simply add to its progress. - #[instrument(parent = None, - level = "info", - fields(component = "range_sync"), - name = "range_sync", - skip_all - )] - pub fn blocks_by_range_response( - &mut self, - network: &mut SyncNetworkContext, - batch_peers: BatchPeers, - chain_id: ChainId, - batch_id: BatchId, - request_id: Id, - blocks: Vec>, - ) { - // check if this chunk removes the chain - match self.chains.call_by_id(chain_id, |chain| { - chain.on_block_response(network, batch_id, batch_peers, request_id, blocks) - }) { - Ok((removed_chain, sync_type)) => { - if let Some((removed_chain, remove_reason)) = removed_chain { - self.on_chain_removed( - removed_chain, - sync_type, - remove_reason, - network, - "block response", - ); - } - } - Err(_) => { - trace!(%chain_id, "BlocksByRange response for removed chain") - } - } - } - - #[instrument(parent = None, - level = "info", - fields(component = "range_sync"), - name = "range_sync", - skip_all - )] - pub fn handle_block_process_result( - &mut self, - network: &mut SyncNetworkContext, - chain_id: ChainId, - batch_id: Epoch, - result: BatchProcessResult, - ) { - // check if this response removes the chain - match 
self.chains.call_by_id(chain_id, |chain| { - chain.on_batch_process_result(network, batch_id, &result) - }) { - Ok((None, _sync_type)) => { - // Chain was found and not removed - } - Ok((Some((removed_chain, remove_reason)), sync_type)) => { - self.on_chain_removed( - removed_chain, - sync_type, - remove_reason, - network, - "batch processing result", - ); - } - - Err(_) => { - trace!(%chain_id, "BlocksByRange response for removed chain") - } - } - } - - /// A peer has disconnected. This removes the peer from any ongoing chains and mappings. A - /// disconnected peer could remove a chain - #[instrument(parent = None, - level = "info", - fields(component = "range_sync"), - name = "range_sync", - skip_all - )] - pub fn peer_disconnect(&mut self, network: &mut SyncNetworkContext, peer_id: &PeerId) { - // if the peer is in the awaiting head mapping, remove it - self.awaiting_head_peers.remove(peer_id); - - // remove the peer from any peer pool, failing its batches - self.remove_peer(network, peer_id); - } - - /// When a peer gets removed, both the head and finalized chains need to be searched to check - /// which pool the peer is in. The chain may also have a batch or batches awaiting - /// for this peer. If so we mark the batch as failed. The batch may then hit it's maximum - /// retries. In this case, we need to remove the chain. - #[instrument(parent = None, - level = "info", - fields(component = "range_sync"), - name = "range_sync", - skip_all - )] - fn remove_peer(&mut self, network: &mut SyncNetworkContext, peer_id: &PeerId) { - for (removed_chain, sync_type, remove_reason) in - self.chains.call_all(|chain| chain.remove_peer(peer_id)) - { - self.on_chain_removed( - removed_chain, - sync_type, - remove_reason, - network, - "peer removed", - ); - } - } - - /// An RPC error has occurred. - /// - /// Check to see if the request corresponds to a pending batch. 
If so, re-request it if possible, if there have - /// been too many failed attempts for the batch, remove the chain. - #[instrument(parent = None, - level = "info", - fields(component = "range_sync"), - name = "range_sync", - skip_all - )] - pub fn inject_error( - &mut self, - network: &mut SyncNetworkContext, - peer_id: PeerId, - batch_id: BatchId, - chain_id: ChainId, - request_id: Id, - err: RpcResponseError, - ) { - // check that this request is pending - match self.chains.call_by_id(chain_id, |chain| { - chain.inject_error(network, batch_id, &peer_id, request_id, err) - }) { - Ok((removed_chain, sync_type)) => { - if let Some((removed_chain, remove_reason)) = removed_chain { - self.on_chain_removed( - removed_chain, - sync_type, - remove_reason, - network, - "RPC error", - ); - } - } - Err(_) => { - trace!(%chain_id, "BlocksByRange response for removed chain") - } - } - } - - #[instrument(parent = None, - level = "info", - fields(component = "range_sync"), - name = "range_sync", - skip_all - )] - fn on_chain_removed( - &mut self, - chain: SyncingChain, - sync_type: RangeSyncType, - remove_reason: RemoveChain, - network: &mut SyncNetworkContext, - op: &'static str, - ) { - if remove_reason.is_critical() { - crit!(id = chain.id(), ?sync_type, reason = ?remove_reason, op, "Chain removed"); - } else { - debug!(id = chain.id(), ?sync_type, reason = ?remove_reason, op, "Chain removed"); - } - - if let RemoveChain::ChainFailed { blacklist, .. } = remove_reason { - if RangeSyncType::Finalized == sync_type && blacklist { - warn!( - id = chain.id(), - "Chain failed! 
Syncing to its head won't be retried for at least the next {} seconds", - FAILED_CHAINS_EXPIRY_SECONDS - ); - self.failed_chains.insert(chain.target_head_root); - } - } - - metrics::inc_counter_vec_by( - &metrics::SYNCING_CHAINS_DROPPED_BLOCKS, - &[sync_type.as_str()], - chain.pending_blocks() as u64, - ); - - network.status_peers(self.beacon_chain.as_ref(), chain.peers()); - - let status = self.beacon_chain.status_message(); - let local = SyncInfo { - head_slot: status.head_slot, - head_root: status.head_root, - finalized_epoch: status.finalized_epoch, - finalized_root: status.finalized_root, - }; - - // update the state of the collection - self.chains - .update(network, &local, &mut self.awaiting_head_peers); - } - - /// Kickstarts sync. - #[instrument(parent = None, - level = "info", - fields(component = "range_sync"), - name = "range_sync", - skip_all - )] - pub fn resume(&mut self, network: &mut SyncNetworkContext) { - for (removed_chain, sync_type, remove_reason) in - self.chains.call_all(|chain| chain.resume(network)) - { - self.on_chain_removed( - removed_chain, - sync_type, - remove_reason, - network, - "chain resumed", - ); - } - } -} diff --git a/beacon_node/network/src/sync/range_sync/sync_type.rs b/beacon_node/network/src/sync/range_sync/sync_type.rs deleted file mode 100644 index 4ff7e393101..00000000000 --- a/beacon_node/network/src/sync/range_sync/sync_type.rs +++ /dev/null @@ -1,46 +0,0 @@ -//! Contains logic about identifying which Sync to perform given PeerSyncInfo of ourselves and -//! of a remote. - -use beacon_chain::{BeaconChain, BeaconChainTypes}; -use lighthouse_network::SyncInfo; - -/// The type of Range sync that should be done relative to our current state. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum RangeSyncType { - /// A finalized chain sync should be started with this peer. - Finalized, - /// A head chain sync should be started with this peer. 
- Head, -} - -impl RangeSyncType { - /// Determines the type of sync given our local `PeerSyncInfo` and the remote's - /// `PeerSyncInfo`. - pub fn new( - chain: &BeaconChain, - local_info: &SyncInfo, - remote_info: &SyncInfo, - ) -> RangeSyncType { - // Check for finalized chain sync - // - // The condition is: - // - The remotes finalized epoch is greater than our current finalized epoch and we have - // not seen the finalized hash before. - - if remote_info.finalized_epoch > local_info.finalized_epoch - && !chain.block_is_known_to_fork_choice(&remote_info.finalized_root) - { - RangeSyncType::Finalized - } else { - RangeSyncType::Head - } - } - - /// Get a `str` representation of the `RangeSyncType`. - pub fn as_str(&self) -> &'static str { - match self { - RangeSyncType::Finalized => "Finalized", - RangeSyncType::Head => "Head", - } - } -} diff --git a/beacon_node/network/src/sync/sync_block.rs b/beacon_node/network/src/sync/sync_block.rs new file mode 100644 index 00000000000..5d181d99fd8 --- /dev/null +++ b/beacon_node/network/src/sync/sync_block.rs @@ -0,0 +1,288 @@ +use super::network_context::{RpcRequestSendError, RpcResponseError, SyncNetworkContext}; +use crate::metrics; +use crate::network_beacon_processor::ChainSegmentProcessId; +use crate::sync::network_context::BatchPeers; +use crate::sync::BatchProcessResult; +use beacon_chain::block_verification_types::RpcBlock; +use beacon_chain::BeaconChainTypes; +use lighthouse_network::service::api_types::{ComponentsByRootRequestId, RangeRequestId}; +use lighthouse_network::PeerId; +use parking_lot::RwLock; +use std::collections::HashSet; +use std::sync::Arc; +use std::time::Instant; +use tracing::debug; +use types::{EthSpec, Hash256, Slot}; + +const MAX_DOWNLOAD_ATTEMPTS: usize = 5; +const MAX_PROCESS_ATTEMPTS: usize = 5; + +// TODO(tree-sync): have the peer set inside here when syncing add dedup logic +// TODO(tree-sync): for backfill sync use the sync state to check the peers have this block or not +pub 
struct SyncBlock { + id: RangeRequestId, + block_root: Hash256, + block_slot: Slot, + failed_peers: HashSet, + // TODO(tree-sync): deprecate this shared state for manual addition and removal + peers: Arc>>, + request: SyncingStatus, + download_errors: usize, + process_errors: usize, +} + +enum SyncingStatus { + AwaitingDownload, + Downloading(ComponentsByRootRequestId, Instant), + AwaitingProcessing(RpcBlock, BatchPeers, Instant), + Processing(RpcBlock, BatchPeers, Instant), +} + +pub enum OkToImport { + IfParentImported, + Bool(bool), +} + +#[must_use] +pub enum SyncBlockResult { + Done { parent_root: Hash256, slot: Slot }, + Wait, +} + +#[derive(Debug)] +pub enum Error { + InternalError(String), + TooManyErrors(String), +} + +impl SyncBlock { + pub fn new( + id: RangeRequestId, + block_root: Hash256, + block_slot: Slot, + initial_peers: &[PeerId], + ) -> Self { + Self { + id, + block_root, + block_slot, + failed_peers: <_>::default(), + peers: Arc::new(RwLock::new(HashSet::from_iter( + initial_peers.iter().copied(), + ))), + request: SyncingStatus::AwaitingDownload, + download_errors: 0, + process_errors: 0, + } + } + + pub fn block_root(&self) -> &Hash256 { + &self.block_root + } + + pub fn slot(&self) -> Slot { + self.block_slot + } + + pub fn id(&self) -> RangeRequestId { + self.id + } + + pub fn peer_count(&self) -> usize { + self.peers.read().len() + } + + pub fn clone_peers(&self) -> HashSet { + self.peers.read().clone() + } + + /// Returns whether the value was newly inserted + pub fn add_peer(&self, peer: PeerId) -> bool { + self.peers.write().insert(peer) + } + + pub fn remove_peer(&self, peer: &PeerId) -> bool { + self.peers.write().remove(peer) + } + + pub fn is_syncing(&self) -> bool { + !matches!(self.request, SyncingStatus::AwaitingDownload) + } + + #[cfg(test)] + pub fn is_processing(&self) -> bool { + matches!(self.request, SyncingStatus::Processing(..)) + } + + pub fn on_download_result( + &mut self, + req_id: ComponentsByRootRequestId, + result: 
Result<(RpcBlock, BatchPeers), RpcResponseError>, + _cx: &mut SyncNetworkContext, + ) -> Result<(), Error> { + match &mut self.request { + SyncingStatus::Downloading(expected_id, start_time) => { + metrics::observe_duration( + &metrics::SYNC_BLOCK_DOWNLOADING_TIME, + start_time.elapsed(), + ); + if req_id != *expected_id { + return Err(Error::InternalError(format!( + "Unexpected request ID {} != {}", + req_id, expected_id, + ))); + } + match result { + Ok((block, peers)) => { + debug!(id = %self.id, "Sync block downloaded"); + self.request = + SyncingStatus::AwaitingProcessing(block, peers, Instant::now()); + Ok(()) + } + Err(e) => { + debug!(id = %self.id, error = ?e, "Sync block download error"); + self.request = SyncingStatus::AwaitingDownload; + + self.download_errors += 1; + if self.download_errors > MAX_DOWNLOAD_ATTEMPTS { + return Err(Error::TooManyErrors("download errors".to_owned())); + } + + Ok(()) + } + } + } + _ => Err(Error::InternalError( + "Lookup not in expected state Downloading".to_owned(), + )), + } + } + + pub fn on_process_result( + &mut self, + result: BatchProcessResult, + cx: &mut SyncNetworkContext, + ) -> Result { + match &mut self.request { + SyncingStatus::Processing(block, peers, start_time) => { + metrics::observe_duration( + &metrics::SYNC_BLOCK_PROCESSING_TIME, + start_time.elapsed(), + ); + match result { + BatchProcessResult::Success => { + debug!(id = %self.id, "Sync block process success"); + Ok(SyncBlockResult::Done { + parent_root: block.as_block().parent_root(), + slot: block.as_block().slot(), + }) + } + BatchProcessResult::Failure { peer_action, error } => { + debug!(id = %self.id, error, "Sync block process error"); + + if let Some(peer_action) = peer_action { + for (peer, penalty) in peers.blame(peer_action) { + cx.report_peer(peer, penalty, "faulty_batch"); + self.failed_peers.insert(peer); + } + } + + self.process_errors += 1; + if self.process_errors > MAX_PROCESS_ATTEMPTS { + return Err(Error::TooManyErrors("process 
errors".to_owned())); + } + + self.request = SyncingStatus::AwaitingDownload; + Ok(SyncBlockResult::Wait) + } + } + } + _ => Err(Error::InternalError( + "Lookup not in expected state Processing".to_owned(), + )), + } + } + + /// Make progress on the request. Note that a request can never finish on this call, thus it + /// does not return `SyncBlockResult`. + pub fn continue_request( + &mut self, + cx: &mut SyncNetworkContext, + ok_to_import: OkToImport, + ) -> Result<(), Error> { + match &mut self.request { + SyncingStatus::AwaitingDownload => { + match cx.block_components_by_range_request( + self.block_root, + self.id, + self.peers.clone(), + &self.failed_peers, + ) { + Ok(req_id) => { + self.request = SyncingStatus::Downloading(req_id, Instant::now()); + Ok(()) + } + Err(e) => match e { + RpcRequestSendError::NoPeers | RpcRequestSendError::InternalError(_) => { + Err(Error::InternalError(format!( + "Error sending block components request: {e:?}" + ))) + } + }, + } + } + SyncingStatus::Downloading(..) => Ok(()), + SyncingStatus::AwaitingProcessing(block, peers, start_time) => { + // No need to check if block is already imported here, we'll get an error + // from the beacon processor anyway. No need to add more code to handle this + // edge case faster. 
+ + match ok_to_import { + OkToImport::IfParentImported => { + if !cx + .chain + .block_is_known_to_fork_choice(&block.as_block().parent_root()) + { + return Ok(()); + } + } + OkToImport::Bool(ok_to_import) => { + if !ok_to_import { + return Ok(()); + } + } + } + + if let Some(beacon_processor) = cx.beacon_processor_if_enabled() { + let id = match self.id { + RangeRequestId::ForwardSync(id) => ChainSegmentProcessId::ForwardSync(id), + RangeRequestId::BackfillSync(id) => ChainSegmentProcessId::BackfillSync(id), + }; + + if let Err(e) = beacon_processor.send_chain_segment(id, vec![block.clone()]) { + Err(Error::InternalError(format!( + "Error sending block to processor: {e:?}" + ))) + } else { + metrics::observe_duration( + &metrics::SYNC_BLOCK_AWAITING_PROCESSING_TIME, + start_time.elapsed(), + ); + + self.request = + SyncingStatus::Processing(block.clone(), peers.clone(), Instant::now()); + Ok(()) + } + } else { + // TODO(tree-sync): This error will cause the full chain of headers to + // be dropped if the beacon processor goes offline. When can that + // happen? + Err(Error::InternalError( + "Beacon processor is disabled".to_owned(), + )) + } + } + SyncingStatus::Processing(..) 
=> Ok(()), + } + } +} diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index a2c359c87e7..d832d3b68fb 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -1,9 +1,7 @@ +use super::range::{complete, filter, NO_FILTER}; use crate::network_beacon_processor::NetworkBeaconProcessor; -use crate::sync::block_lookups::{ - BlockLookupSummary, PARENT_DEPTH_TOLERANCE, SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS, -}; use crate::sync::{ - manager::{BlockProcessType, BlockProcessingResult, SyncManager}, + manager::{BlockProcessingResult, SyncManager}, peer_sampling::SamplingConfig, SamplingId, SyncMessage, }; @@ -13,7 +11,6 @@ use std::time::Duration; use super::*; -use crate::sync::block_lookups::common::ResponseType; use beacon_chain::observed_data_sidecars::Observe; use beacon_chain::{ blob_verification::GossipVerifiedBlob, @@ -28,6 +25,7 @@ use beacon_chain::{ PayloadVerificationOutcome, PayloadVerificationStatus, }; use beacon_processor::WorkEvent; +use fork_choice::ForkChoiceStore; use lighthouse_network::discovery::CombinedKey; use lighthouse_network::{ rpc::{RPCError, RequestType, RpcErrorResponse}, @@ -36,7 +34,7 @@ use lighthouse_network::{ SamplingRequester, SingleLookupReqId, SyncRequestId, }, types::SyncState, - NetworkConfig, NetworkGlobals, PeerId, + NetworkConfig, NetworkGlobals, PeerId, SyncInfo, }; use slot_clock::{SlotClock, TestingSlotClock}; use tokio::sync::mpsc; @@ -44,18 +42,48 @@ use tracing::info; use types::{ data_column_sidecar::ColumnIndex, test_utils::{SeedableRng, TestRandom, XorShiftRng}, - BeaconState, BeaconStateBase, BlobSidecar, DataColumnSidecar, EthSpec, ForkContext, ForkName, - Hash256, MinimalEthSpec as E, SignedBeaconBlock, Slot, + BeaconState, BeaconStateBase, BlobSidecar, DataColumnSidecar, DataColumnSidecarList, EthSpec, + ForkContext, ForkName, Hash256, MinimalEthSpec as E, SignedBeaconBlock, Slot, }; const D: Duration = 
Duration::new(0, 0); +const SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS: u8 = 5; const PARENT_FAIL_TOLERANCE: u8 = SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS; +const PARENT_DEPTH_TOLERANCE: usize = 32; const SAMPLING_REQUIRED_SUCCESSES: usize = 2; type DCByRootIds = Vec; type DCByRootId = (SyncRequestId, Vec); +pub enum PeersConfig { + SupernodeAndRandom, + SupernodeOnly, +} + +pub enum ResponseType { + Block, + Blob, + CustodyColumn, +} + +struct BlockLookupSummary {} + +pub struct TestOptions { + /// If the node created by this test harness is a supernode + pub is_supernode: bool, +} + impl TestRig { pub fn test_setup() -> Self { + Self::test_setup_with_options(TestOptions { + is_supernode: false, + }) + } + + pub fn test_setup_as_supernode() -> Self { + Self::test_setup_with_options(TestOptions { is_supernode: true }) + } + + pub fn test_setup_with_options(options: TestOptions) -> Self { // Use `fork_from_env` logic to set correct fork epochs let spec = test_spec::(); @@ -84,10 +112,11 @@ impl TestRig { // TODO(das): make the generation of the ENR use the deterministic rng to have consistent // column assignments let network_config = Arc::new(NetworkConfig::default()); - let globals = Arc::new(NetworkGlobals::new_test_globals( + let globals = Arc::new(NetworkGlobals::new_test_globals_as_supernode( Vec::new(), network_config, chain.spec.clone(), + options.is_supernode, )); let (beacon_processor, beacon_processor_rx) = NetworkBeaconProcessor::null_for_testing( globals, @@ -116,6 +145,7 @@ impl TestRig { network_rx, network_rx_queue: vec![], sync_rx, + blocks_by_root: <_>::default(), rng, network_globals: beacon_processor.network_globals.clone(), sync_manager: SyncManager::new( @@ -174,7 +204,7 @@ impl TestRig { self.send_sync_message(SyncMessage::UnknownParentBlob(peer_id, blob.into())); } - fn trigger_unknown_block_from_attestation(&mut self, block_root: Hash256, peer_id: PeerId) { + pub fn trigger_unknown_block_from_attestation(&mut self, block_root: Hash256, peer_id: PeerId) { 
self.send_sync_message(SyncMessage::UnknownBlockHashFromAttestation( peer_id, block_root, )); @@ -191,7 +221,7 @@ impl TestRig { } } - fn rand_block(&mut self) -> SignedBeaconBlock { + pub fn rand_block(&mut self) -> SignedBeaconBlock { self.rand_block_and_blobs(NumBlobs::None).0 } @@ -201,16 +231,17 @@ impl TestRig { ) -> (SignedBeaconBlock, Vec>) { let fork_name = self.fork_name; let rng = &mut self.rng; - generate_rand_block_and_blobs::(fork_name, num_blobs, rng, &self.spec) + let head_root = self.harness.chain.head().head_block_root(); + generate_rand_block_and_blobs::(fork_name, num_blobs, Some(head_root), rng, &self.spec) } - fn rand_block_and_data_columns( - &mut self, - ) -> (SignedBeaconBlock, Vec>>) { + fn rand_block_and_data_columns(&mut self) -> (SignedBeaconBlock, DataColumnSidecarList) { let num_blobs = NumBlobs::Number(1); + let head_root = self.harness.chain.head().head_block_root(); generate_rand_block_and_data_columns::( self.fork_name, num_blobs, + Some(head_root), &mut self.rng, &self.harness.spec, ) @@ -231,33 +262,16 @@ impl TestRig { self.sync_manager.handle_message(sync_message); } - fn active_single_lookups(&self) -> Vec { - self.sync_manager.active_single_lookups() - } - - fn active_single_lookups_count(&self) -> usize { - self.sync_manager.active_single_lookups().len() - } - - fn active_parent_lookups(&self) -> Vec> { - self.sync_manager.active_parent_lookups() - } - - fn active_parent_lookups_count(&self) -> usize { - self.sync_manager.active_parent_lookups().len() - } - - fn active_range_sync_chain(&self) -> (RangeSyncType, Slot, Slot) { - self.sync_manager.get_range_sync_chains().unwrap().unwrap() + fn assert_active_lookup(&mut self, block_root: Hash256) { + let lookups = self.sync_manager.forward_sync().get_lookups(); + if !lookups.contains(&block_root) { + panic!("Expected lookup {block_root} not found, active lookups: {lookups:?}"); + } } - fn assert_single_lookups_count(&self, count: usize) { - assert_eq!( - 
self.active_single_lookups_count(), - count, - "Unexpected count of single lookups. Current lookups: {:?}", - self.active_single_lookups() - ); + fn assert_active_lookups(&mut self, expected_lookups: &[Hash256]) { + let lookups = self.sync_manager.forward_sync().get_lookups(); + assert_eq!(lookups, expected_lookups, "Unexpected lookups"); } fn expect_no_active_sampling(&mut self) { @@ -281,76 +295,25 @@ impl TestRig { self.expect_no_active_sampling(); } - fn assert_parent_lookups_count(&self, count: usize) { - assert_eq!( - self.active_parent_lookups_count(), - count, - "Unexpected count of parent lookups. Parent lookups: {:?}. Current lookups: {:?}", - self.active_parent_lookups(), - self.active_single_lookups() - ); - } - - fn assert_lookup_is_active(&self, block_root: Hash256) { - let lookups = self.sync_manager.active_single_lookups(); - if !lookups.iter().any(|l| l.1 == block_root) { - panic!("Expected lookup {block_root} to be the only active: {lookups:?}"); - } - } - - fn assert_lookup_peers(&self, block_root: Hash256, mut expected_peers: Vec) { - let mut lookup = self + fn assert_lookup_peers(&mut self, block_root: Hash256, expected_peers: &[PeerId]) { + let mut peers = self .sync_manager - .active_single_lookups() - .into_iter() - .find(|l| l.1 == block_root) - .unwrap_or_else(|| panic!("no lookup for {block_root}")); - lookup.3.sort(); - expected_peers.sort(); - assert_eq!( - lookup.3, expected_peers, - "unexpected peers on lookup {block_root}" - ); - } - - fn insert_failed_chain(&mut self, block_root: Hash256) { - self.sync_manager.insert_failed_chain(block_root); - } - - fn assert_not_failed_chain(&mut self, chain_hash: Hash256) { - let failed_chains = self.sync_manager.get_failed_chains(); - if failed_chains.contains(&chain_hash) { - panic!("failed chains contain {chain_hash:?}: {failed_chains:?}"); - } - } - - fn assert_failed_chain(&mut self, chain_hash: Hash256) { - let failed_chains = self.sync_manager.get_failed_chains(); - if 
!failed_chains.contains(&chain_hash) { - panic!("expected failed chains to contain {chain_hash:?}: {failed_chains:?}"); - } - } - - fn find_single_lookup_for(&self, block_root: Hash256) -> Id { - self.active_single_lookups() - .iter() - .find(|l| l.1 == block_root) - .unwrap_or_else(|| panic!("no single block lookup found for {block_root}")) - .0 - } - - #[track_caller] - fn expect_no_active_single_lookups(&self) { - assert!( - self.active_single_lookups().is_empty(), - "expect no single block lookups: {:?}", - self.active_single_lookups() - ); + .forward_sync() + .block_peers(&block_root) + .expect("Error getting block peers") + .unwrap_or_else(|| panic!("Unknown block {block_root}")); + peers.sort_unstable(); + let mut expected_peers = expected_peers.to_vec(); + expected_peers.sort_unstable(); + assert_eq!(peers, expected_peers, "Unexpected block {block_root} peers"); } #[track_caller] - fn expect_no_active_lookups(&self) { - self.expect_no_active_single_lookups(); + pub fn expect_no_active_lookups(&mut self) { + let lookups = self.sync_manager.forward_sync().get_lookups(); + if !lookups.is_empty() { + panic!("expected no active lookups but found {lookups:?}") + } } fn expect_no_active_lookups_empty_network(&mut self) { @@ -358,29 +321,67 @@ impl TestRig { self.expect_empty_network(); } + // Note: prefer to use `add_connected_peer_testing_only`. This is currently extensively used in + // lookup tests. We should consolidate this "add peer" methods in a future refactor pub fn new_connected_peer(&mut self) -> PeerId { + self.add_connected_peer_testing_only(false) + } + + // Note: prefer to use `add_connected_peer_testing_only`. This is currently extensively used in + // lookup tests. 
We should consolidate this "add peer" methods in a future refactor + pub fn new_connected_supernode_peer(&mut self) -> PeerId { + self.add_connected_peer_testing_only(true) + } + + /// Add a random connected peer that is not known by the sync module + pub fn add_connected_peer_testing_only(&mut self, supernode: bool) -> PeerId { let key = self.determinstic_key(); let peer_id = self .network_globals .peers .write() - .__add_connected_peer_testing_only(false, &self.harness.spec, key); - self.log(&format!("Added new peer for testing {peer_id:?}")); + .__add_connected_peer_testing_only(supernode, &self.harness.spec, key); + let mut peer_custody_subnets = self + .network_globals + .peers + .read() + .peer_info(&peer_id) + .expect("peer was just added") + .custody_subnets_iter() + .map(|subnet| **subnet) + .collect::>(); + peer_custody_subnets.sort_unstable(); + self.log(&format!( + "Added new peer for testing {peer_id:?} custody subnets {peer_custody_subnets:?}" + )); peer_id } - pub fn new_connected_supernode_peer(&mut self) -> PeerId { - let key = self.determinstic_key(); - self.network_globals - .peers - .write() - .__add_connected_peer_testing_only(true, &self.harness.spec, key) + /// Add a random connected peer + add it to sync with a specific remote Status + pub fn add_sync_peer(&mut self, supernode: bool, remote_info: SyncInfo) -> PeerId { + let peer_id = self.add_connected_peer_testing_only(supernode); + self.send_sync_message(SyncMessage::AddPeer(peer_id, remote_info)); + peer_id } fn determinstic_key(&mut self) -> CombinedKey { k256::ecdsa::SigningKey::random(&mut self.rng).into() } + pub fn add_sync_peers(&mut self, config: PeersConfig, remote_info: SyncInfo) { + match config { + PeersConfig::SupernodeAndRandom => { + for _ in 0..100 { + self.add_sync_peer(false, remote_info.clone()); + } + self.add_sync_peer(true, remote_info); + } + PeersConfig::SupernodeOnly => { + self.add_sync_peer(true, remote_info); + } + } + } + pub fn 
new_connected_peers_for_peerdas(&mut self) { // Enough sampling peers with few columns for _ in 0..100 { @@ -390,229 +391,6 @@ impl TestRig { self.new_connected_supernode_peer(); } - fn parent_chain_processed_success( - &mut self, - chain_hash: Hash256, - blocks: &[Arc>], - ) { - // Send import events for all pending parent blocks - for _ in blocks { - self.parent_block_processed_imported(chain_hash); - } - // Send final import event for the block that triggered the lookup - self.single_block_component_processed_imported(chain_hash); - } - - /// Locate a parent lookup chain with tip hash `chain_hash` - fn find_oldest_parent_lookup(&self, chain_hash: Hash256) -> Hash256 { - let parent_chain = self - .active_parent_lookups() - .into_iter() - .find(|chain| chain.first() == Some(&chain_hash)) - .unwrap_or_else(|| { - panic!( - "No parent chain with chain_hash {chain_hash:?}: Parent lookups {:?} Single lookups {:?}", - self.active_parent_lookups(), - self.active_single_lookups(), - ) - }); - *parent_chain.last().unwrap() - } - - fn parent_block_processed(&mut self, chain_hash: Hash256, result: BlockProcessingResult) { - let id = self.find_single_lookup_for(self.find_oldest_parent_lookup(chain_hash)); - self.single_block_component_processed(id, result); - } - - fn parent_blob_processed(&mut self, chain_hash: Hash256, result: BlockProcessingResult) { - let id = self.find_single_lookup_for(self.find_oldest_parent_lookup(chain_hash)); - self.single_blob_component_processed(id, result); - } - - fn parent_block_processed_imported(&mut self, chain_hash: Hash256) { - self.parent_block_processed( - chain_hash, - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported(chain_hash)), - ); - } - - fn single_block_component_processed(&mut self, id: Id, result: BlockProcessingResult) { - self.send_sync_message(SyncMessage::BlockComponentProcessed { - process_type: BlockProcessType::SingleBlock { id }, - result, - }) - } - - fn single_block_component_processed_imported(&mut 
self, block_root: Hash256) { - let id = self.find_single_lookup_for(block_root); - self.single_block_component_processed( - id, - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported(block_root)), - ) - } - - fn single_blob_component_processed(&mut self, id: Id, result: BlockProcessingResult) { - self.send_sync_message(SyncMessage::BlockComponentProcessed { - process_type: BlockProcessType::SingleBlob { id }, - result, - }) - } - - fn parent_lookup_block_response( - &mut self, - id: SingleLookupReqId, - peer_id: PeerId, - beacon_block: Option>>, - ) { - self.log("parent_lookup_block_response"); - self.send_sync_message(SyncMessage::RpcBlock { - sync_request_id: SyncRequestId::SingleBlock { id }, - peer_id, - beacon_block, - seen_timestamp: D, - }); - } - - fn single_lookup_block_response( - &mut self, - id: SingleLookupReqId, - peer_id: PeerId, - beacon_block: Option>>, - ) { - self.log("single_lookup_block_response"); - self.send_sync_message(SyncMessage::RpcBlock { - sync_request_id: SyncRequestId::SingleBlock { id }, - peer_id, - beacon_block, - seen_timestamp: D, - }); - } - - fn parent_lookup_blob_response( - &mut self, - id: SingleLookupReqId, - peer_id: PeerId, - blob_sidecar: Option>>, - ) { - self.log(&format!( - "parent_lookup_blob_response {:?}", - blob_sidecar.as_ref().map(|b| b.index) - )); - self.send_sync_message(SyncMessage::RpcBlob { - sync_request_id: SyncRequestId::SingleBlob { id }, - peer_id, - blob_sidecar, - seen_timestamp: D, - }); - } - - fn single_lookup_blob_response( - &mut self, - id: SingleLookupReqId, - peer_id: PeerId, - blob_sidecar: Option>>, - ) { - self.send_sync_message(SyncMessage::RpcBlob { - sync_request_id: SyncRequestId::SingleBlob { id }, - peer_id, - blob_sidecar, - seen_timestamp: D, - }); - } - - fn complete_single_lookup_blob_download( - &mut self, - id: SingleLookupReqId, - peer_id: PeerId, - blobs: Vec>, - ) { - for blob in blobs { - self.single_lookup_blob_response(id, peer_id, Some(blob.into())); - } - 
self.single_lookup_blob_response(id, peer_id, None); - } - - fn complete_single_lookup_blob_lookup_valid( - &mut self, - id: SingleLookupReqId, - peer_id: PeerId, - blobs: Vec>, - import: bool, - ) { - let block_root = blobs.first().unwrap().block_root(); - let block_slot = blobs.first().unwrap().slot(); - self.complete_single_lookup_blob_download(id, peer_id, blobs); - self.expect_block_process(ResponseType::Blob); - self.single_blob_component_processed( - id.lookup_id, - if import { - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported(block_root)) - } else { - BlockProcessingResult::Ok(AvailabilityProcessingStatus::MissingComponents( - block_slot, block_root, - )) - }, - ); - } - - fn complete_lookup_block_download(&mut self, block: SignedBeaconBlock) { - let block_root = block.canonical_root(); - let id = self.expect_block_lookup_request(block_root); - self.expect_empty_network(); - let peer_id = self.new_connected_peer(); - self.single_lookup_block_response(id, peer_id, Some(block.into())); - self.single_lookup_block_response(id, peer_id, None); - } - - fn complete_lookup_block_import_valid(&mut self, block_root: Hash256, import: bool) { - self.expect_block_process(ResponseType::Block); - let id = self.find_single_lookup_for(block_root); - self.single_block_component_processed( - id, - if import { - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported(block_root)) - } else { - BlockProcessingResult::Ok(AvailabilityProcessingStatus::MissingComponents( - Slot::new(0), - block_root, - )) - }, - ) - } - - fn complete_single_lookup_block_valid(&mut self, block: SignedBeaconBlock, import: bool) { - let block_root = block.canonical_root(); - self.complete_lookup_block_download(block); - self.complete_lookup_block_import_valid(block_root, import) - } - - fn parent_lookup_failed(&mut self, id: SingleLookupReqId, peer_id: PeerId, error: RPCError) { - self.send_sync_message(SyncMessage::RpcError { - peer_id, - sync_request_id: 
SyncRequestId::SingleBlock { id }, - error, - }) - } - - fn parent_lookup_failed_unavailable(&mut self, id: SingleLookupReqId, peer_id: PeerId) { - self.parent_lookup_failed( - id, - peer_id, - RPCError::ErrorResponse( - RpcErrorResponse::ResourceUnavailable, - "older than deneb".into(), - ), - ); - } - - fn single_lookup_failed(&mut self, id: SingleLookupReqId, peer_id: PeerId, error: RPCError) { - self.send_sync_message(SyncMessage::RpcError { - peer_id, - sync_request_id: SyncRequestId::SingleBlock { id }, - error, - }) - } - fn return_empty_sampling_requests(&mut self, ids: DCByRootIds) { for id in ids { self.log(&format!("return empty data column for {id:?}")); @@ -646,36 +424,10 @@ impl TestRig { } } - fn complete_valid_block_request( - &mut self, - id: SingleLookupReqId, - block: Arc>, - missing_components: bool, - ) { - // Complete download - let peer_id = PeerId::random(); - let slot = block.slot(); - let block_root = block.canonical_root(); - self.single_lookup_block_response(id, peer_id, Some(block)); - self.single_lookup_block_response(id, peer_id, None); - // Expect processing and resolve with import - self.expect_block_process(ResponseType::Block); - self.single_block_component_processed( - id.lookup_id, - if missing_components { - BlockProcessingResult::Ok(AvailabilityProcessingStatus::MissingComponents( - slot, block_root, - )) - } else { - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported(block_root)) - }, - ) - } - fn complete_valid_sampling_column_requests( &mut self, ids: DCByRootIds, - data_columns: Vec>>, + data_columns: DataColumnSidecarList, ) { for id in ids { self.log(&format!("return valid data column for {id:?}")); @@ -697,7 +449,7 @@ impl TestRig { let block_root = first_dc.block_root(); let sampling_request_id = match id.0 { SyncRequestId::DataColumnsByRoot(DataColumnsByRootRequestId { - requester: DataColumnsByRootRequester::Sampling(sampling_id), + parent_request_id: DataColumnsByRootRequester::Sampling(sampling_id), 
.. }) => sampling_id.sampling_request_id, _ => unreachable!(), @@ -717,53 +469,6 @@ impl TestRig { }) } - fn complete_valid_custody_request( - &mut self, - ids: DCByRootIds, - data_columns: Vec>>, - missing_components: bool, - ) { - let lookup_id = if let SyncRequestId::DataColumnsByRoot(DataColumnsByRootRequestId { - requester: DataColumnsByRootRequester::Custody(id), - .. - }) = ids.first().unwrap().0 - { - id.requester.0.lookup_id - } else { - panic!("not a custody requester") - }; - - let first_column = data_columns.first().cloned().unwrap(); - - for id in ids { - self.log(&format!("return valid data column for {id:?}")); - let indices = &id.1; - let columns_to_send = indices - .iter() - .map(|&i| data_columns[i as usize].clone()) - .collect::>(); - self.complete_data_columns_by_root_request(id, &columns_to_send); - } - - // Expect work event - self.expect_rpc_custody_column_work_event(); - - // Respond with valid result - self.send_sync_message(SyncMessage::BlockComponentProcessed { - process_type: BlockProcessType::SingleCustodyColumn(lookup_id), - result: if missing_components { - BlockProcessingResult::Ok(AvailabilityProcessingStatus::MissingComponents( - first_column.slot(), - first_column.block_root(), - )) - } else { - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported( - first_column.block_root(), - )) - }, - }); - } - fn complete_data_columns_by_root_request( &mut self, (sync_request_id, _): DCByRootId, @@ -791,7 +496,7 @@ impl TestRig { /// Return RPCErrors for all active requests of peer fn rpc_error_all_active_requests(&mut self, disconnected_peer_id: PeerId) { self.drain_network_rx(); - while let Ok(sync_request_id) = self.pop_received_network_event(|ev| match ev { + while let Ok(sync_request_id) = self.pop_received_network_event(&mut |ev| match ev { NetworkMessage::SendRequest { peer_id, app_request_id: AppRequestId::Sync(id), @@ -825,7 +530,7 @@ impl TestRig { pub fn pop_received_network_event) -> Option>( &mut self, - 
predicate_transform: F, + predicate_transform: &mut F, ) -> Result { self.drain_network_rx(); @@ -843,6 +548,30 @@ impl TestRig { } } + pub fn pop_received_network_events) -> Option>( + &mut self, + predicate_transform: &mut F, + ) -> Vec { + let mut events = vec![]; + while let Ok(ev) = self.pop_received_network_event(predicate_transform) { + events.push(ev) + } + events + } + + /// Similar to `pop_received_network_events` but finds matching events without removing them. + pub fn filter_received_network_events) -> Option>( + &mut self, + predicate_transform: F, + ) -> Vec { + self.drain_network_rx(); + + self.network_rx_queue + .iter() + .filter_map(predicate_transform) + .collect() + } + pub fn pop_received_processor_event) -> Option>( &mut self, predicate_transform: F, @@ -877,163 +606,56 @@ impl TestRig { } } - fn find_block_lookup_request( - &mut self, - for_block: Hash256, - ) -> Result { - self.pop_received_network_event(|ev| match ev { + /// Retrieves an unknown number of requests for data columns of `block_root`. Because peer ENRs + /// are random, and peer selection is random, the total number of batched requests is unknown. + fn expect_data_columns_by_root_requests(&mut self, block_root: Hash256) -> DCByRootIds { + self.pop_received_network_events(&mut |ev| match ev { NetworkMessage::SendRequest { peer_id: _, - request: RequestType::BlocksByRoot(request), - app_request_id: AppRequestId::Sync(SyncRequestId::SingleBlock { id }), - } if request.block_roots().to_vec().contains(&for_block) => Some(*id), + request: RequestType::DataColumnsByRoot(request), + app_request_id: AppRequestId::Sync(id @ SyncRequestId::DataColumnsByRoot { .. 
}), + } => { + let matching = request + .data_column_ids + .iter() + .find(|id| id.block_root == block_root)?; + + let indices = matching.columns.iter().copied().collect(); + Some((*id, indices)) + } _ => None, }) } - #[track_caller] - fn expect_block_lookup_request(&mut self, for_block: Hash256) -> SingleLookupReqId { - self.find_block_lookup_request(for_block) - .unwrap_or_else(|e| panic!("Expected block request for {for_block:?}: {e}")) - } - - fn find_blob_lookup_request( + fn expect_only_data_columns_by_root_requests( &mut self, for_block: Hash256, - ) -> Result { - self.pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id: _, - request: RequestType::BlobsByRoot(request), - app_request_id: AppRequestId::Sync(SyncRequestId::SingleBlob { id }), - } if request - .blob_ids - .to_vec() - .iter() - .any(|r| r.block_root == for_block) => - { - Some(*id) - } - _ => None, - }) + _count: usize, + ) -> DCByRootIds { + let ids = self.expect_data_columns_by_root_requests(for_block); + self.expect_empty_network(); + ids } #[track_caller] - fn expect_blob_lookup_request(&mut self, for_block: Hash256) -> SingleLookupReqId { - self.find_blob_lookup_request(for_block) - .unwrap_or_else(|e| panic!("Expected blob request for {for_block:?}: {e}")) - } - - #[track_caller] - fn expect_block_parent_request(&mut self, for_block: Hash256) -> SingleLookupReqId { - self.pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id: _, - request: RequestType::BlocksByRoot(request), - app_request_id: AppRequestId::Sync(SyncRequestId::SingleBlock { id }), - } if request.block_roots().to_vec().contains(&for_block) => Some(*id), - _ => None, - }) - .unwrap_or_else(|e| panic!("Expected block parent request for {for_block:?}: {e}")) - } - - fn expect_no_requests_for(&mut self, block_root: Hash256) { - if let Ok(request) = self.find_block_lookup_request(block_root) { - panic!("Expected no block request for {block_root:?} found {request:?}"); 
- } - if let Ok(request) = self.find_blob_lookup_request(block_root) { - panic!("Expected no blob request for {block_root:?} found {request:?}"); - } - } - - #[track_caller] - fn expect_blob_parent_request(&mut self, for_block: Hash256) -> SingleLookupReqId { - self.pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id: _, - request: RequestType::BlobsByRoot(request), - app_request_id: AppRequestId::Sync(SyncRequestId::SingleBlob { id }), - } if request - .blob_ids - .to_vec() - .iter() - .all(|r| r.block_root == for_block) => - { - Some(*id) - } - _ => None, - }) - .unwrap_or_else(|e| panic!("Expected blob parent request for {for_block:?}: {e}")) - } - - /// Retrieves an unknown number of requests for data columns of `block_root`. Because peer ENRs - /// are random, and peer selection is random, the total number of batched requests is unknown. - fn expect_data_columns_by_root_requests( - &mut self, - block_root: Hash256, - count: usize, - ) -> DCByRootIds { - let mut requests: DCByRootIds = vec![]; - loop { - let req = self - .pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id: _, - request: RequestType::DataColumnsByRoot(request), - app_request_id: - AppRequestId::Sync(id @ SyncRequestId::DataColumnsByRoot { .. 
}), - } => { - let matching = request - .data_column_ids - .iter() - .find(|id| id.block_root == block_root)?; - - let indices = matching.columns.iter().copied().collect(); - Some((*id, indices)) - } - _ => None, - }) - .unwrap_or_else(|e| { - panic!("Expected more DataColumnsByRoot requests for {block_root:?}: {e}") - }); - requests.push(req); - - // Should never infinite loop because sync does not send requests for 0 columns - if requests.iter().map(|r| r.1.len()).sum::() >= count { - return requests; - } - } - } - - fn expect_only_data_columns_by_root_requests( - &mut self, - for_block: Hash256, - count: usize, - ) -> DCByRootIds { - let ids = self.expect_data_columns_by_root_requests(for_block, count); - self.expect_empty_network(); - ids - } - - #[track_caller] - fn expect_block_process(&mut self, response_type: ResponseType) { - match response_type { - ResponseType::Block => self - .pop_received_processor_event(|ev| { - (ev.work_type() == beacon_processor::WorkType::RpcBlock).then_some(()) - }) - .unwrap_or_else(|e| panic!("Expected block work event: {e}")), - ResponseType::Blob => self - .pop_received_processor_event(|ev| { - (ev.work_type() == beacon_processor::WorkType::RpcBlobs).then_some(()) - }) - .unwrap_or_else(|e| panic!("Expected blobs work event: {e}")), - ResponseType::CustodyColumn => self - .pop_received_processor_event(|ev| { - (ev.work_type() == beacon_processor::WorkType::RpcCustodyColumn).then_some(()) - }) - .unwrap_or_else(|e| panic!("Expected column work event: {e}")), - } + fn expect_block_process(&mut self, response_type: ResponseType) { + match response_type { + ResponseType::Block => self + .pop_received_processor_event(|ev| { + (ev.work_type() == beacon_processor::WorkType::RpcBlock).then_some(()) + }) + .unwrap_or_else(|e| panic!("Expected block work event: {e}")), + ResponseType::Blob => self + .pop_received_processor_event(|ev| { + (ev.work_type() == beacon_processor::WorkType::RpcBlobs).then_some(()) + }) + .unwrap_or_else(|e| 
panic!("Expected blobs work event: {e}")), + ResponseType::CustodyColumn => self + .pop_received_processor_event(|ev| { + (ev.work_type() == beacon_processor::WorkType::RpcCustodyColumn).then_some(()) + }) + .unwrap_or_else(|e| panic!("Expected column work event: {e}")), + } } fn expect_rpc_custody_column_work_event(&mut self) { @@ -1091,6 +713,16 @@ impl TestRig { } } + pub fn expect_no_penalty_for_anyone(&mut self) { + let downscore_events = self.filter_received_network_events(|ev| match ev { + NetworkMessage::ReportPeer { peer_id, msg, .. } => Some((*peer_id, *msg)), + _ => None, + }); + if !downscore_events.is_empty() { + panic!("Expected no downscoring events but found: {downscore_events:?}"); + } + } + #[track_caller] fn expect_parent_chain_process(&mut self) { match self.beacon_processor_rx.try_recv() { @@ -1126,10 +758,46 @@ impl TestRig { } } + #[track_caller] + fn expect_empty_network_fully_synced(&mut self) { + self.expect_empty_network(); + self.expect_no_active_lookups(); + } + + #[track_caller] + pub fn expect_penalties(&mut self, expected_penalty_msg: &'static str) { + let all_penalties = self.pop_received_network_events(&mut |ev| match ev { + NetworkMessage::ReportPeer { peer_id, msg, .. } => Some((*peer_id, *msg)), + _ => None, + }); + if !all_penalties.is_empty() + && all_penalties + .iter() + .any(|(_, msg)| *msg != expected_penalty_msg) + { + panic!( + "Expected penalties only of {expected_penalty_msg}, but found {all_penalties:?}" + ); + } + self.log(&format!( + "Found expected penalties {expected_penalty_msg}: {all_penalties:?}" + )); + } + + pub fn expect_no_penalties(&mut self) { + let penalties = self.filter_received_network_events(|ev| match ev { + NetworkMessage::ReportPeer { peer_id, msg, .. 
} => Some((*peer_id, *msg)), + _ => None, + }); + if !penalties.is_empty() { + panic!("Expected no penalties but found {penalties:?}"); + } + } + #[track_caller] pub fn expect_penalty(&mut self, peer_id: PeerId, expect_penalty_msg: &'static str) { let penalty_msg = self - .pop_received_network_event(|ev| match ev { + .pop_received_network_event(&mut |ev| match ev { NetworkMessage::ReportPeer { peer_id: p_id, msg, .. } if p_id == &peer_id => Some(msg.to_owned()), @@ -1193,94 +861,6 @@ impl TestRig { blocks } - fn insert_block_to_da_checker(&mut self, block: Arc>) { - let state = BeaconState::Base(BeaconStateBase::random_for_test(&mut self.rng)); - let parent_block = self.rand_block(); - let import_data = BlockImportData::::__new_for_test( - block.canonical_root(), - state, - parent_block.into(), - ); - let payload_verification_outcome = PayloadVerificationOutcome { - payload_verification_status: PayloadVerificationStatus::Verified, - is_valid_merge_transition_block: false, - }; - let executed_block = - AvailabilityPendingExecutedBlock::new(block, import_data, payload_verification_outcome); - match self - .harness - .chain - .data_availability_checker - .put_pending_executed_block(executed_block) - .unwrap() - { - Availability::Available(_) => panic!("block removed from da_checker, available"), - Availability::MissingComponents(block_root) => { - self.log(&format!("inserted block to da_checker {block_root:?}")) - } - }; - } - - fn insert_blob_to_da_checker(&mut self, blob: BlobSidecar) { - match self - .harness - .chain - .data_availability_checker - .put_gossip_verified_blobs( - blob.block_root(), - std::iter::once(GossipVerifiedBlob::<_, Observe>::__assumed_valid( - blob.into(), - )), - ) - .unwrap() - { - Availability::Available(_) => panic!("blob removed from da_checker, available"), - Availability::MissingComponents(block_root) => { - self.log(&format!("inserted blob to da_checker {block_root:?}")) - } - }; - } - - fn insert_block_to_processing_cache(&mut self, 
block: Arc>) { - self.harness - .chain - .reqresp_pre_import_cache - .write() - .insert(block.canonical_root(), block); - } - - fn simulate_block_gossip_processing_becomes_invalid(&mut self, block_root: Hash256) { - self.harness - .chain - .reqresp_pre_import_cache - .write() - .remove(&block_root); - - self.send_sync_message(SyncMessage::GossipBlockProcessResult { - block_root, - imported: false, - }); - } - - fn simulate_block_gossip_processing_becomes_valid_missing_components( - &mut self, - block: Arc>, - ) { - let block_root = block.canonical_root(); - self.harness - .chain - .reqresp_pre_import_cache - .write() - .remove(&block_root); - - self.insert_block_to_da_checker(block); - - self.send_sync_message(SyncMessage::GossipBlockProcessResult { - block_root, - imported: false, - }); - } - fn assert_sampling_request_ongoing(&self, block_root: Hash256, indices: &[ColumnIndex]) { for index in indices { let status = self @@ -1320,6 +900,64 @@ impl TestRig { "Sampling request status for {block_root}: {statuses:?}" )); } + + fn single_lookup_from_attestation_setup(&mut self) -> (Hash256, PeerId) { + let (head_root, _) = self.create_unimported_parent_chain(1); + // Use a supernode so Fulu tests can pass without edits + let peer_id = self.new_connected_supernode_peer(); + // Trigger the request + self.trigger_unknown_block_from_attestation(head_root, peer_id); + self.assert_active_lookup(head_root); + (head_root, peer_id) + } + + pub fn parent_lookup_from_unknown_block_parent_setup(&mut self) -> (Hash256, PeerId) { + let (head_root, _) = self.create_unimported_parent_chain(2); + // Use a supernode so Fulu tests can pass without edits + let peer_id = self.new_connected_supernode_peer(); + let head_block = self + .blocks_by_root + .get(&head_root) + .expect("block should exist"); + self.trigger_unknown_parent_block(peer_id, head_block.clone()); + (head_root, peer_id) + } + + fn expect_fully_complete_sync(&mut self, expected_head_root: Hash256) { + 
self.progress_until_no_events(NO_FILTER, complete()); + self.assert_head(expected_head_root); + self.expect_empty_network_fully_synced(); + } + + fn assert_head(&self, expected_head: Hash256) { + let mut fork_choice = self.harness.chain.canonical_head.fork_choice_write_lock(); + let current_slot = fork_choice.fc_store().get_current_slot(); + let head_root = fork_choice + .get_head(current_slot, &self.harness.spec) + .expect("error computing head"); + assert_eq!(head_root, expected_head, "Not expected head root"); + } + + fn fetch_unimported_ancestor_chain(&self, mut block_root: Hash256) -> Vec { + let mut chain = vec![]; + while let Some(block) = self.blocks_by_root.get(&block_root) { + if self + .harness + .chain + .block_is_known_to_fork_choice(&block_root) + { + break; + } + + chain.push(block_root); + block_root = block.parent_root(); + } + chain + } + + pub fn complete_header_chain(&mut self) { + self.progress_until_no_events(filter().header_requests_only(), complete()); + } } #[test] @@ -1327,7 +965,7 @@ fn stable_rng() { let spec = types::MainnetEthSpec::default_spec(); let mut rng = XorShiftRng::from_seed([42; 16]); let (block, _) = - generate_rand_block_and_blobs::(ForkName::Base, NumBlobs::None, &mut rng, &spec); + generate_rand_block_and_blobs::(ForkName::Base, NumBlobs::None, None, &mut rng, &spec); assert_eq!( block.canonical_root(), Hash256::from_slice( @@ -1340,738 +978,156 @@ fn stable_rng() { #[test] fn test_single_block_lookup_happy_path() { - let mut rig = TestRig::test_setup(); - let block = rig.rand_block(); - let peer_id = rig.new_connected_peer(); - let block_root = block.canonical_root(); - // Trigger the request - rig.trigger_unknown_block_from_attestation(block_root, peer_id); - let id = rig.expect_block_lookup_request(block_root); - - // The peer provides the correct block, should not be penalized. Now the block should be sent - // for processing. 
- rig.single_lookup_block_response(id, peer_id, Some(block.into())); - rig.expect_empty_network(); - rig.expect_block_process(ResponseType::Block); - - // The request should still be active. - assert_eq!(rig.active_single_lookups_count(), 1); - - // Send the stream termination. Peer should have not been penalized, and the request removed - // after processing. - rig.single_lookup_block_response(id, peer_id, None); - rig.single_block_component_processed_imported(block_root); - rig.expect_empty_network(); - rig.expect_no_active_lookups(); + let mut r = TestRig::test_setup(); + let (new_head_root, _) = r.single_lookup_from_attestation_setup(); + r.expect_fully_complete_sync(new_head_root); } // Tests that if a peer does not respond with a block, we downscore and retry the block only #[test] -fn test_single_block_lookup_empty_response() { +fn test_single_block_lookup_empty_response_until_failure() { let mut r = TestRig::test_setup(); - - let block = r.rand_block(); - let block_root = block.canonical_root(); - let peer_id = r.new_connected_peer(); - - // Trigger the request - r.trigger_unknown_block_from_attestation(block_root, peer_id); - let id = r.expect_block_lookup_request(block_root); - - // The peer does not have the block. It should be penalized. - r.single_lookup_block_response(id, peer_id, None); - r.expect_penalty(peer_id, "NotEnoughResponsesReturned"); - // it should be retried - let id = r.expect_block_lookup_request(block_root); - // Send the right block this time. 
- r.single_lookup_block_response(id, peer_id, Some(block.into())); - r.expect_block_process(ResponseType::Block); - r.single_block_component_processed_imported(block_root); + let (_, _) = r.single_lookup_from_attestation_setup(); + r.progress_until_no_events(NO_FILTER, complete().return_no_blocks()); + r.expect_penalties("NotEnoughResponsesReturned"); + // Test will loop until reaching max download attempts and remove the lookup r.expect_no_active_lookups(); } #[test] -fn test_single_block_lookup_wrong_response() { - let mut rig = TestRig::test_setup(); - - let block_hash = Hash256::random(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_block_from_attestation(block_hash, peer_id); - let id = rig.expect_block_lookup_request(block_hash); - - // Peer sends something else. It should be penalized. - let bad_block = rig.rand_block(); - rig.single_lookup_block_response(id, peer_id, Some(bad_block.into())); - rig.expect_penalty(peer_id, "UnrequestedBlockRoot"); - rig.expect_block_lookup_request(block_hash); // should be retried - - // Send the stream termination. This should not produce an additional penalty. 
- rig.single_lookup_block_response(id, peer_id, None); - rig.expect_empty_network(); +fn test_single_block_lookup_empty_response_some_times() { + let mut r = TestRig::test_setup(); + let (new_head_root, _) = r.single_lookup_from_attestation_setup(); + r.progress_until_no_events(NO_FILTER, complete().return_no_blocks_n_times(3)); + r.expect_penalties("NotEnoughResponsesReturned"); + r.expect_fully_complete_sync(new_head_root); } #[test] -fn test_single_block_lookup_failure() { - let mut rig = TestRig::test_setup(); - - let block_hash = Hash256::random(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_block_from_attestation(block_hash, peer_id); - let id = rig.expect_block_lookup_request(block_hash); +fn test_single_block_lookup_wrong_response() { + let mut r = TestRig::test_setup(); + let (_, _) = r.single_lookup_from_attestation_setup(); + r.progress_until_no_events(NO_FILTER, complete().return_wrong_blocks()); + r.expect_penalties("UnrequestedBlockRoot"); + r.expect_no_active_lookups(); + // Test will loop until reaching max download attempts and remove the lookup + r.expect_no_active_lookups(); +} - // The request fails. RPC failures are handled elsewhere so we should not penalize the peer. 
- rig.single_lookup_failed(id, peer_id, RPCError::UnsupportedProtocol); - rig.expect_block_lookup_request(block_hash); - rig.expect_empty_network(); +#[test] +fn test_single_block_lookup_rpc_error() { + let mut r = TestRig::test_setup(); + let (_, _) = r.single_lookup_from_attestation_setup(); + r.progress_until_no_events( + NO_FILTER, + complete().rpc_error(RPCError::UnsupportedProtocol), + ); + r.expect_no_penalties(); + // Test will loop until reaching max download attempts and remove the lookup + r.expect_no_active_lookups(); } +// TODO(tree-sync): Current behaviour drops the lookup if there's no peers left +#[ignore] #[test] fn test_single_block_lookup_peer_disconnected_then_rpc_error() { - let mut rig = TestRig::test_setup(); - - let block_hash = Hash256::random(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request. - rig.trigger_unknown_block_from_attestation(block_hash, peer_id); - let id = rig.expect_block_lookup_request(block_hash); - + let mut r = TestRig::test_setup(); + let (new_head_root, peer_id) = r.single_lookup_from_attestation_setup(); // The peer disconnect event reaches sync before the rpc error. - rig.peer_disconnected(peer_id); + r.peer_disconnected(peer_id); // The lookup is not removed as it can still potentially make progress. - rig.assert_single_lookups_count(1); + r.assert_active_lookup(new_head_root); // The request fails. - rig.single_lookup_failed(id, peer_id, RPCError::Disconnected); - rig.expect_block_lookup_request(block_hash); - // The request should be removed from the network context on disconnection. 
- rig.expect_empty_network(); -} - -#[test] -fn test_single_block_lookup_becomes_parent_request() { - let mut rig = TestRig::test_setup(); - - let block = Arc::new(rig.rand_block()); - let block_root = block.canonical_root(); - let parent_root = block.parent_root(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_block_from_attestation(block.canonical_root(), peer_id); - let id = rig.expect_block_parent_request(block_root); - - // The peer provides the correct block, should not be penalized. Now the block should be sent - // for processing. - rig.single_lookup_block_response(id, peer_id, Some(block.clone())); - rig.expect_empty_network(); - rig.expect_block_process(ResponseType::Block); - - // The request should still be active. - assert_eq!(rig.active_single_lookups_count(), 1); - - // Send the stream termination. Peer should have not been penalized, and the request moved to a - // parent request after processing. - rig.single_block_component_processed( - id.lookup_id, - BlockProcessingResult::Err(BlockError::ParentUnknown { - parent_root: block.parent_root(), - }), - ); - assert_eq!(rig.active_single_lookups_count(), 2); // 2 = current + parent - rig.expect_block_parent_request(parent_root); - rig.expect_empty_network(); - assert_eq!(rig.active_parent_lookups_count(), 1); + r.progress_until_no_events(NO_FILTER, complete().rpc_error(RPCError::Disconnected)); + r.expect_fully_complete_sync(new_head_root); } #[test] fn test_parent_lookup_happy_path() { - let mut rig = TestRig::test_setup(); - - let (parent, block, parent_root, block_root) = rig.rand_block_and_parent(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_parent_block(peer_id, block.into()); - let id = rig.expect_block_parent_request(parent_root); - - // Peer sends the right block, it should be sent for processing. Peer should not be penalized. 
- rig.parent_lookup_block_response(id, peer_id, Some(parent.into())); - // No request of blobs because the block has not data - rig.expect_empty_network(); - rig.expect_block_process(ResponseType::Block); - rig.expect_empty_network(); - - // Add peer to child lookup to prevent it being dropped - rig.trigger_unknown_block_from_attestation(block_root, peer_id); - // Processing succeeds, now the rest of the chain should be sent for processing. - rig.parent_block_processed( - block_root, - BlockError::DuplicateFullyImported(block_root).into(), - ); - rig.expect_parent_chain_process(); - rig.parent_chain_processed_success(block_root, &[]); - rig.expect_no_active_lookups_empty_network(); -} - -#[test] -fn test_parent_lookup_wrong_response() { - let mut rig = TestRig::test_setup(); - - let (parent, block, parent_root, block_root) = rig.rand_block_and_parent(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_parent_block(peer_id, block.into()); - let id1 = rig.expect_block_parent_request(parent_root); - - // Peer sends the wrong block, peer should be penalized and the block re-requested. - let bad_block = rig.rand_block(); - rig.parent_lookup_block_response(id1, peer_id, Some(bad_block.into())); - rig.expect_penalty(peer_id, "UnrequestedBlockRoot"); - let id2 = rig.expect_block_parent_request(parent_root); - - // Send the stream termination for the first request. This should not produce extra penalties. - rig.parent_lookup_block_response(id1, peer_id, None); - rig.expect_empty_network(); - - // Send the right block this time. - rig.parent_lookup_block_response(id2, peer_id, Some(parent.into())); - rig.expect_block_process(ResponseType::Block); - - // Add peer to child lookup to prevent it being dropped - rig.trigger_unknown_block_from_attestation(block_root, peer_id); - // Processing succeeds, now the rest of the chain should be sent for processing. 
- rig.parent_block_processed_imported(block_root); - rig.expect_parent_chain_process(); - rig.parent_chain_processed_success(block_root, &[]); - rig.expect_no_active_lookups_empty_network(); -} - -#[test] -fn test_parent_lookup_rpc_failure() { - let mut rig = TestRig::test_setup(); - - let (parent, block, parent_root, block_root) = rig.rand_block_and_parent(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_parent_block(peer_id, block.into()); - let id = rig.expect_block_parent_request(parent_root); - - // The request fails. It should be tried again. - rig.parent_lookup_failed_unavailable(id, peer_id); - let id = rig.expect_block_parent_request(parent_root); - - // Send the right block this time. - rig.parent_lookup_block_response(id, peer_id, Some(parent.into())); - rig.expect_block_process(ResponseType::Block); - - // Add peer to child lookup to prevent it being dropped - rig.trigger_unknown_block_from_attestation(block_root, peer_id); - // Processing succeeds, now the rest of the chain should be sent for processing. - rig.parent_block_processed_imported(block_root); - rig.expect_parent_chain_process(); - rig.parent_chain_processed_success(block_root, &[]); - rig.expect_no_active_lookups_empty_network(); -} - -#[test] -fn test_parent_lookup_too_many_attempts() { - let mut rig = TestRig::test_setup(); - - let block = rig.rand_block(); - let parent_root = block.parent_root(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_parent_block(peer_id, block.into()); - for i in 1..=PARENT_FAIL_TOLERANCE { - let id = rig.expect_block_parent_request(parent_root); - // Blobs are only requested in the first iteration as this test only retries blocks - - if i % 2 == 0 { - // make sure every error is accounted for - // The request fails. It should be tried again. - rig.parent_lookup_failed_unavailable(id, peer_id); - } else { - // Send a bad block this time. It should be tried again. 
- let bad_block = rig.rand_block(); - rig.parent_lookup_block_response(id, peer_id, Some(bad_block.into())); - // Send the stream termination - - // Note, previously we would send the same lookup id with a stream terminator, - // we'd ignore it because we'd intrepret it as an unrequested response, since - // we already got one response for the block. I'm not sure what the intent is - // for having this stream terminator line in this test at all. Receiving an invalid - // block and a stream terminator with the same Id now results in two failed attempts, - // I'm unsure if this is how it should behave? - // - rig.parent_lookup_block_response(id, peer_id, None); - rig.expect_penalty(peer_id, "UnrequestedBlockRoot"); - } - } - - rig.expect_no_active_lookups_empty_network(); -} - -#[test] -fn test_parent_lookup_too_many_download_attempts_no_blacklist() { - let mut rig = TestRig::test_setup(); - - let (parent, block, parent_root, block_root) = rig.rand_block_and_parent(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_parent_block(peer_id, block.into()); - for i in 1..=PARENT_FAIL_TOLERANCE { - rig.assert_not_failed_chain(block_root); - let id = rig.expect_block_parent_request(parent_root); - if i % 2 != 0 { - // The request fails. It should be tried again. - rig.parent_lookup_failed_unavailable(id, peer_id); - } else { - // Send a bad block this time. It should be tried again. 
- let bad_block = rig.rand_block(); - rig.parent_lookup_block_response(id, peer_id, Some(bad_block.into())); - rig.expect_penalty(peer_id, "UnrequestedBlockRoot"); - } - } - - rig.assert_not_failed_chain(block_root); - rig.assert_not_failed_chain(parent.canonical_root()); - rig.expect_no_active_lookups_empty_network(); -} - -#[test] -fn test_parent_lookup_too_many_processing_attempts_must_blacklist() { - const PROCESSING_FAILURES: u8 = PARENT_FAIL_TOLERANCE / 2 + 1; - let mut rig = TestRig::test_setup(); - let (parent, block, parent_root, block_root) = rig.rand_block_and_parent(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_parent_block(peer_id, block.into()); - - rig.log("Fail downloading the block"); - for _ in 0..(PARENT_FAIL_TOLERANCE - PROCESSING_FAILURES) { - let id = rig.expect_block_parent_request(parent_root); - // The request fails. It should be tried again. - rig.parent_lookup_failed_unavailable(id, peer_id); - } - - rig.log("Now fail processing a block in the parent request"); - for _ in 0..PROCESSING_FAILURES { - let id = rig.expect_block_parent_request(parent_root); - // Blobs are only requested in the previous first iteration as this test only retries blocks - rig.assert_not_failed_chain(block_root); - // send the right parent but fail processing - rig.parent_lookup_block_response(id, peer_id, Some(parent.clone().into())); - rig.parent_block_processed(block_root, BlockError::BlockSlotLimitReached.into()); - rig.parent_lookup_block_response(id, peer_id, None); - rig.expect_penalty(peer_id, "lookup_block_processing_failure"); - } - - rig.assert_not_failed_chain(block_root); - rig.expect_no_active_lookups_empty_network(); -} - -#[test] -fn test_parent_lookup_too_deep_grow_ancestor() { - let mut rig = TestRig::test_setup(); - let mut blocks = rig.rand_blockchain(PARENT_DEPTH_TOLERANCE); - - let peer_id = rig.new_connected_peer(); - let trigger_block = blocks.pop().unwrap(); - let chain_hash = 
trigger_block.canonical_root(); - rig.trigger_unknown_parent_block(peer_id, trigger_block); - - for block in blocks.into_iter().rev() { - let id = rig.expect_block_parent_request(block.canonical_root()); - // the block - rig.parent_lookup_block_response(id, peer_id, Some(block.clone())); - // the stream termination - rig.parent_lookup_block_response(id, peer_id, None); - // the processing request - rig.expect_block_process(ResponseType::Block); - // the processing result - rig.parent_block_processed( - chain_hash, - BlockProcessingResult::Err(BlockError::ParentUnknown { - parent_root: block.parent_root(), - }), - ) - } - - // Should create a new syncing chain - rig.drain_sync_rx(); - assert_eq!( - rig.active_range_sync_chain(), - ( - RangeSyncType::Head, - Slot::new(0), - Slot::new(PARENT_DEPTH_TOLERANCE as u64 - 1) - ) - ); - // Should not penalize peer, but network is not clear because of the blocks_by_range requests - rig.expect_no_penalty_for(peer_id); - rig.assert_failed_chain(chain_hash); + let mut r = TestRig::test_setup(); + let (new_head_root, _) = r.parent_lookup_from_unknown_block_parent_setup(); + r.expect_fully_complete_sync(new_head_root); } -// Regression test for https://github.com/sigp/lighthouse/pull/7118 #[test] -fn test_child_lookup_not_created_for_failed_chain_parent_after_processing() { - // GIVEN: A parent chain longer than PARENT_DEPTH_TOLERANCE. - let mut rig = TestRig::test_setup(); - let mut blocks = rig.rand_blockchain(PARENT_DEPTH_TOLERANCE + 1); - let peer_id = rig.new_connected_peer(); - - // The child of the trigger block to be used to extend the chain. - let trigger_block_child = blocks.pop().unwrap(); - // The trigger block that starts the lookup. - let trigger_block = blocks.pop().unwrap(); - let tip_root = trigger_block.canonical_root(); - - // Trigger the initial unknown parent block for the tip. 
- rig.trigger_unknown_parent_block(peer_id, trigger_block.clone()); - - // Simulate the lookup chain building up via `ParentUnknown` errors. - for block in blocks.into_iter().rev() { - let id = rig.expect_block_parent_request(block.canonical_root()); - rig.parent_lookup_block_response(id, peer_id, Some(block.clone())); - rig.parent_lookup_block_response(id, peer_id, None); - rig.expect_block_process(ResponseType::Block); - rig.parent_block_processed( - tip_root, - BlockProcessingResult::Err(BlockError::ParentUnknown { - parent_root: block.parent_root(), - }), - ); - } - - // At this point, the chain should have been deemed too deep and pruned. - // The tip root should have been inserted into failed chains. - rig.assert_failed_chain(tip_root); - rig.expect_no_penalty_for(peer_id); - - // WHEN: Trigger the extending block that points to the tip. - let trigger_block_child_root = trigger_block_child.canonical_root(); - rig.trigger_unknown_block_from_attestation(trigger_block_child_root, peer_id); - let id = rig.expect_block_lookup_request(trigger_block_child_root); - rig.single_lookup_block_response(id, peer_id, Some(trigger_block_child.clone())); - rig.single_lookup_block_response(id, peer_id, None); - rig.expect_block_process(ResponseType::Block); - rig.single_block_component_processed( - id.lookup_id, - BlockProcessingResult::Err(BlockError::ParentUnknown { - parent_root: tip_root, - }), +fn test_parent_lookup_drop_parent() { + let mut r = TestRig::test_setup(); + let (head_root, _) = r.parent_lookup_from_unknown_block_parent_setup(); + // Complete the header chain so the first block can start syncing + r.complete_header_chain(); + let blocks = r.fetch_unimported_ancestor_chain(head_root); + // Return wrong blocks for the parent of `head_root` = chain[1] + r.progress_until_no_events( + filter().block_root(blocks[1]), + complete().return_wrong_blocks(), ); - - // THEN: The extending block should not create a lookup because the tip was inserted into failed chains. 
- rig.expect_no_active_lookups(); - // AND: The peer should be penalized for extending a failed chain. - rig.expect_single_penalty(peer_id, "failed_chain"); - rig.expect_empty_network(); + r.expect_penalties("UnrequestedBlockRoot"); + // It should drop all lookups + r.expect_no_active_lookups(); } #[test] -fn test_parent_lookup_too_deep_grow_tip() { - let mut rig = TestRig::test_setup(); - let blocks = rig.rand_blockchain(PARENT_DEPTH_TOLERANCE - 1); - let peer_id = rig.new_connected_peer(); - let tip = blocks.last().unwrap().clone(); - - for block in blocks.into_iter() { - let block_root = block.canonical_root(); - rig.trigger_unknown_block_from_attestation(block_root, peer_id); - let id = rig.expect_block_parent_request(block_root); - rig.single_lookup_block_response(id, peer_id, Some(block.clone())); - rig.single_lookup_block_response(id, peer_id, None); - rig.expect_block_process(ResponseType::Block); - rig.single_block_component_processed( - id.lookup_id, - BlockError::ParentUnknown { - parent_root: block.parent_root(), - } - .into(), - ); - } - - // Should create a new syncing chain - rig.drain_sync_rx(); - assert_eq!( - rig.active_range_sync_chain(), - ( - RangeSyncType::Head, - Slot::new(0), - Slot::new(PARENT_DEPTH_TOLERANCE as u64 - 2) - ) +fn test_parent_lookup_drop_child() { + let mut r = TestRig::test_setup(); + let (head_root, _) = r.parent_lookup_from_unknown_block_parent_setup(); + // Complete the header chain so the first block can start syncing + r.complete_header_chain(); + let blocks = r.fetch_unimported_ancestor_chain(head_root); + // Return wrong blocks for the parent of `head_root` = chain[1] + r.progress_until_no_events( + filter().block_root(blocks[0]), + complete().return_wrong_blocks(), ); - // Should not penalize peer, but network is not clear because of the blocks_by_range requests - rig.expect_no_penalty_for(peer_id); - rig.assert_failed_chain(tip.canonical_root()); + r.expect_penalties("UnrequestedBlockRoot"); + // It should only drop 
the newest lookup + r.assert_active_lookups(&[blocks[1]]); } +// TODO(tree-sync): Current behaviour drops the lookup if there's no peers left +#[ignore] #[test] fn test_lookup_peer_disconnected_no_peers_left_while_request() { - let mut rig = TestRig::test_setup(); - let peer_id = rig.new_connected_peer(); - let trigger_block = rig.rand_block(); - rig.trigger_unknown_parent_block(peer_id, trigger_block.into()); - rig.peer_disconnected(peer_id); - rig.rpc_error_all_active_requests(peer_id); + let mut r = TestRig::test_setup(); + let (head_root, peer_id) = r.single_lookup_from_attestation_setup(); + r.peer_disconnected(peer_id); + r.rpc_error_all_active_requests(peer_id); // Erroring all rpc requests and disconnecting the peer shouldn't remove the requests // from the lookups map as they can still progress. - rig.assert_single_lookups_count(2); + r.assert_active_lookup(head_root); } #[test] fn test_lookup_disconnection_peer_left() { - let mut rig = TestRig::test_setup(); - let peer_ids = (0..2).map(|_| rig.new_connected_peer()).collect::>(); - let disconnecting_peer = *peer_ids.first().unwrap(); - let block_root = Hash256::random(); - // lookup should have two peers associated with the same block - for peer_id in peer_ids.iter() { - rig.trigger_unknown_block_from_attestation(block_root, *peer_id); - } + let mut r = TestRig::test_setup(); + let (head_root, peer_1) = r.single_lookup_from_attestation_setup(); + let peer_2 = r.new_connected_peer(); + r.trigger_unknown_block_from_attestation(head_root, peer_2); // Disconnect the first peer only, which is the one handling the request - rig.peer_disconnected(disconnecting_peer); - rig.rpc_error_all_active_requests(disconnecting_peer); - rig.assert_single_lookups_count(1); + r.peer_disconnected(peer_1); + r.rpc_error_all_active_requests(peer_1); + r.assert_active_lookup(head_root); } #[test] fn test_lookup_add_peers_to_parent() { let mut r = TestRig::test_setup(); - let peer_id_1 = r.new_connected_peer(); - let peer_id_2 = 
r.new_connected_peer(); - let blocks = r.rand_blockchain(5); - let last_block_root = blocks.last().unwrap().canonical_root(); - // Create a chain of lookups - for block in &blocks { - r.trigger_unknown_parent_block(peer_id_1, block.clone()); - } - r.trigger_unknown_block_from_attestation(last_block_root, peer_id_2); - for block in blocks.iter().take(blocks.len() - 1) { - // Parent has the original unknown parent event peer + new peer - r.assert_lookup_peers(block.canonical_root(), vec![peer_id_1, peer_id_2]); - } - // Child lookup only has the unknown attestation peer - r.assert_lookup_peers(last_block_root, vec![peer_id_2]); -} - -#[test] -fn test_skip_creating_failed_parent_lookup() { - let mut rig = TestRig::test_setup(); - let (_, block, parent_root, _) = rig.rand_block_and_parent(); - let peer_id = rig.new_connected_peer(); - rig.insert_failed_chain(parent_root); - rig.trigger_unknown_parent_block(peer_id, block.into()); - // Expect single penalty for peer, despite dropping two lookups - rig.expect_single_penalty(peer_id, "failed_chain"); - // Both current and parent lookup should be rejected - rig.expect_no_active_lookups(); -} - -#[test] -fn test_single_block_lookup_ignored_response() { - let mut rig = TestRig::test_setup(); - - let block = rig.rand_block(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_block_from_attestation(block.canonical_root(), peer_id); - let id = rig.expect_block_lookup_request(block.canonical_root()); - - // The peer provides the correct block, should not be penalized. Now the block should be sent - // for processing. - rig.single_lookup_block_response(id, peer_id, Some(block.into())); - rig.expect_empty_network(); - rig.expect_block_process(ResponseType::Block); - - // The request should still be active. - assert_eq!(rig.active_single_lookups_count(), 1); - - // Send the stream termination. Peer should have not been penalized, and the request removed - // after processing. 
- rig.single_lookup_block_response(id, peer_id, None); - // Send an Ignored response, the request should be dropped - rig.single_block_component_processed(id.lookup_id, BlockProcessingResult::Ignored); - rig.expect_no_active_lookups_empty_network(); -} - -#[test] -fn test_parent_lookup_ignored_response() { - let mut rig = TestRig::test_setup(); - - let (parent, block, parent_root, block_root) = rig.rand_block_and_parent(); - let peer_id = rig.new_connected_peer(); - - // Trigger the request - rig.trigger_unknown_parent_block(peer_id, block.clone().into()); - let id = rig.expect_block_parent_request(parent_root); - // Note: single block lookup for current `block` does not trigger any request because it does - // not have blobs, and the block is already cached - - // Peer sends the right block, it should be sent for processing. Peer should not be penalized. - rig.parent_lookup_block_response(id, peer_id, Some(parent.into())); - rig.expect_block_process(ResponseType::Block); - rig.expect_empty_network(); - - // Return an Ignored result. The request should be dropped - rig.parent_block_processed(block_root, BlockProcessingResult::Ignored); - rig.expect_empty_network(); - rig.expect_no_active_lookups(); -} - -/// This is a regression test. -#[test] -fn test_same_chain_race_condition() { - let mut rig = TestRig::test_setup(); - - // if we use one or two blocks it will match on the hash or the parent hash, so make a longer - // chain. 
- let depth = 4; - let mut blocks = rig.rand_blockchain(depth); - let peer_id = rig.new_connected_peer(); - let trigger_block = blocks.pop().unwrap(); - let chain_hash = trigger_block.canonical_root(); - rig.trigger_unknown_parent_block(peer_id, trigger_block.clone()); - - for (i, block) in blocks.clone().into_iter().rev().enumerate() { - let id = rig.expect_block_parent_request(block.canonical_root()); - // the block - rig.parent_lookup_block_response(id, peer_id, Some(block.clone())); - // the stream termination - rig.parent_lookup_block_response(id, peer_id, None); - // the processing request - rig.expect_block_process(ResponseType::Block); - // the processing result - if i + 2 == depth { - rig.log(&format!("Block {i} was removed and is already known")); - rig.parent_block_processed( - chain_hash, - BlockError::DuplicateFullyImported(block.canonical_root()).into(), - ) - } else { - rig.log(&format!("Block {i} ParentUnknown")); - rig.parent_block_processed( - chain_hash, - BlockProcessingResult::Err(BlockError::ParentUnknown { - parent_root: block.parent_root(), - }), - ) - } - } - - // Try to get this block again while the chain is being processed. We should not request it again. 
- let peer_id = rig.new_connected_peer(); - rig.trigger_unknown_parent_block(peer_id, trigger_block.clone()); - rig.expect_empty_network(); - - // Add a peer to the tip child lookup which has zero peers - rig.trigger_unknown_block_from_attestation(trigger_block.canonical_root(), peer_id); - - rig.log("Processing succeeds, now the rest of the chain should be sent for processing."); - for block in blocks.iter().skip(1).chain(&[trigger_block]) { - rig.expect_parent_chain_process(); - rig.single_block_component_processed_imported(block.canonical_root()); - } - rig.expect_no_active_lookups_empty_network(); -} - -#[test] -fn block_in_da_checker_skips_download() { - let Some(mut r) = TestRig::test_setup_after_deneb_before_fulu() else { - return; - }; - let (block, blobs) = r.rand_block_and_blobs(NumBlobs::Number(1)); - let block_root = block.canonical_root(); + let (head_root, _) = r.create_unimported_parent_chain(4); + let blocks = r.fetch_unimported_ancestor_chain(head_root); let peer_id = r.new_connected_peer(); - r.insert_block_to_da_checker(block.into()); - r.trigger_unknown_block_from_attestation(block_root, peer_id); - // Should not trigger block request - let id = r.expect_blob_lookup_request(block_root); - r.expect_empty_network(); - // Resolve blob and expect lookup completed - r.complete_single_lookup_blob_lookup_valid(id, peer_id, blobs, true); - r.expect_no_active_lookups(); -} - -#[test] -fn block_in_processing_cache_becomes_invalid() { - let Some(mut r) = TestRig::test_setup_after_deneb_before_fulu() else { - return; - }; - let (block, blobs) = r.rand_block_and_blobs(NumBlobs::Number(1)); - let block_root = block.canonical_root(); - let peer_id = r.new_connected_peer(); - r.insert_block_to_processing_cache(block.clone().into()); - r.trigger_unknown_block_from_attestation(block_root, peer_id); - // Should trigger blob request - let id = r.expect_blob_lookup_request(block_root); - // Should not trigger block request - r.expect_empty_network(); - // Simulate 
invalid block, removing it from processing cache - r.simulate_block_gossip_processing_becomes_invalid(block_root); - // Should download block, then issue blobs request - r.complete_lookup_block_download(block); - // Should not trigger block or blob request - r.expect_empty_network(); - r.complete_lookup_block_import_valid(block_root, false); - // Resolve blob and expect lookup completed - r.complete_single_lookup_blob_lookup_valid(id, peer_id, blobs, true); - r.expect_no_active_lookups(); -} + r.trigger_unknown_block_from_attestation(head_root, peer_id); + r.complete_header_chain(); -#[test] -fn block_in_processing_cache_becomes_valid_imported() { - let Some(mut r) = TestRig::test_setup_after_deneb_before_fulu() else { - return; - }; - let (block, blobs) = r.rand_block_and_blobs(NumBlobs::Number(1)); - let block_root = block.canonical_root(); - let peer_id = r.new_connected_peer(); - r.insert_block_to_processing_cache(block.clone().into()); - r.trigger_unknown_block_from_attestation(block_root, peer_id); - // Should trigger blob request - let id = r.expect_blob_lookup_request(block_root); - // Should not trigger block request - r.expect_empty_network(); - // Resolve the block from processing step - r.simulate_block_gossip_processing_becomes_valid_missing_components(block.into()); - // Should not trigger block or blob request - r.expect_empty_network(); - // Resolve blob and expect lookup completed - r.complete_single_lookup_blob_lookup_valid(id, peer_id, blobs, true); - r.expect_no_active_lookups(); -} + let new_peers = (0..2).map(|_| r.new_connected_peer()).collect::>(); + for peer in &new_peers { + r.trigger_unknown_block_from_attestation(head_root, *peer); + } -// IGNORE: wait for change that delays blob fetching to knowing the block -#[ignore] -#[test] -fn blobs_in_da_checker_skip_download() { - let Some(mut r) = TestRig::test_setup_after_deneb_before_fulu() else { - return; - }; - let (block, blobs) = r.rand_block_and_blobs(NumBlobs::Number(1)); - let 
block_root = block.canonical_root(); - let peer_id = r.new_connected_peer(); - for blob in blobs { - r.insert_blob_to_da_checker(blob); + let mut expected_peers = new_peers.clone(); + expected_peers.push(peer_id); + for block in blocks { + // Parent has the original unknown parent event peer + new peer + r.assert_lookup_peers(block, &expected_peers); } - r.trigger_unknown_block_from_attestation(block_root, peer_id); - // Should download and process the block - r.complete_single_lookup_block_valid(block, true); - // Should not trigger blob request - r.expect_empty_network(); - r.expect_no_active_lookups(); } #[test] @@ -2204,828 +1260,3 @@ fn sampling_batch_requests_not_enough_responses_returned() { r.expect_no_work_event(); r.expect_active_sampling(&block_root); } - -#[test] -fn custody_lookup_happy_path() { - let Some(mut r) = TestRig::test_setup_after_fulu() else { - return; - }; - let spec = E::default_spec(); - r.new_connected_peers_for_peerdas(); - let (block, data_columns) = r.rand_block_and_data_columns(); - let block_root = block.canonical_root(); - let peer_id = r.new_connected_peer(); - r.trigger_unknown_block_from_attestation(block_root, peer_id); - // Should not request blobs - let id = r.expect_block_lookup_request(block.canonical_root()); - r.complete_valid_block_request(id, block.into(), true); - // for each slot we download `samples_per_slot` columns - let sample_column_count = spec.samples_per_slot * spec.data_columns_per_group(); - let custody_ids = - r.expect_only_data_columns_by_root_requests(block_root, sample_column_count as usize); - r.complete_valid_custody_request(custody_ids, data_columns, false); - r.expect_no_active_lookups(); -} - -// TODO(das): Test retries of DataColumnByRoot: -// - Expect request for column_index -// - Respond with bad data -// - Respond with stream terminator -// ^ The stream terminator should be ignored and not close the next retry - -// TODO(das): Test error early a sampling request and it getting drop + then 
receiving responses -// from pending requests. - -mod deneb_only { - use super::*; - use beacon_chain::{ - block_verification_types::{AsBlock, RpcBlock}, - data_availability_checker::AvailabilityCheckError, - }; - use std::collections::VecDeque; - use types::RuntimeVariableList; - - struct DenebTester { - rig: TestRig, - block: Arc>, - blobs: Vec>>, - parent_block_roots: Vec, - parent_block: VecDeque>>, - parent_blobs: VecDeque>>>, - unknown_parent_block: Option>>, - unknown_parent_blobs: Option>>>, - peer_id: PeerId, - block_req_id: Option, - parent_block_req_id: Option, - blob_req_id: Option, - parent_blob_req_id: Option, - slot: Slot, - block_root: Hash256, - } - - enum RequestTrigger { - AttestationUnknownBlock, - GossipUnknownParentBlock(usize), - GossipUnknownParentBlob(usize), - } - - impl RequestTrigger { - fn num_parents(&self) -> usize { - match self { - RequestTrigger::AttestationUnknownBlock => 0, - RequestTrigger::GossipUnknownParentBlock(num_parents) => *num_parents, - RequestTrigger::GossipUnknownParentBlob(num_parents) => *num_parents, - } - } - } - - impl DenebTester { - fn new(request_trigger: RequestTrigger) -> Option { - let Some(mut rig) = TestRig::test_setup_after_deneb_before_fulu() else { - return None; - }; - let (block, blobs) = rig.rand_block_and_blobs(NumBlobs::Random); - let mut block = Arc::new(block); - let mut blobs = blobs.into_iter().map(Arc::new).collect::>(); - let slot = block.slot(); - - let num_parents = request_trigger.num_parents(); - let mut parent_block_chain = VecDeque::with_capacity(num_parents); - let mut parent_blobs_chain = VecDeque::with_capacity(num_parents); - let mut parent_block_roots = vec![]; - for _ in 0..num_parents { - // Set the current block as the parent. 
- let parent_root = block.canonical_root(); - let parent_block = block.clone(); - let parent_blobs = blobs.clone(); - parent_block_chain.push_front(parent_block); - parent_blobs_chain.push_front(parent_blobs); - parent_block_roots.push(parent_root); - - // Create the next block. - let (child_block, child_blobs) = - rig.block_with_parent_and_blobs(parent_root, NumBlobs::Random); - let mut child_block = Arc::new(child_block); - let mut child_blobs = child_blobs.into_iter().map(Arc::new).collect::>(); - - // Update the new block to the current block. - std::mem::swap(&mut child_block, &mut block); - std::mem::swap(&mut child_blobs, &mut blobs); - } - let block_root = block.canonical_root(); - - let peer_id = rig.new_connected_peer(); - - // Trigger the request - let (block_req_id, blob_req_id, parent_block_req_id, parent_blob_req_id) = - match request_trigger { - RequestTrigger::AttestationUnknownBlock => { - rig.send_sync_message(SyncMessage::UnknownBlockHashFromAttestation( - peer_id, block_root, - )); - let block_req_id = rig.expect_block_lookup_request(block_root); - (Some(block_req_id), None, None, None) - } - RequestTrigger::GossipUnknownParentBlock { .. } => { - rig.send_sync_message(SyncMessage::UnknownParentBlock( - peer_id, - block.clone(), - block_root, - )); - - let parent_root = block.parent_root(); - let parent_block_req_id = rig.expect_block_parent_request(parent_root); - rig.expect_empty_network(); // expect no more requests - (None, None, Some(parent_block_req_id), None) - } - RequestTrigger::GossipUnknownParentBlob { .. 
} => { - let single_blob = blobs.first().cloned().unwrap(); - let parent_root = single_blob.block_parent_root(); - rig.send_sync_message(SyncMessage::UnknownParentBlob(peer_id, single_blob)); - - let parent_block_req_id = rig.expect_block_parent_request(parent_root); - rig.expect_empty_network(); // expect no more requests - (None, None, Some(parent_block_req_id), None) - } - }; - - Some(Self { - rig, - block, - blobs, - parent_block: parent_block_chain, - parent_blobs: parent_blobs_chain, - parent_block_roots, - unknown_parent_block: None, - unknown_parent_blobs: None, - peer_id, - block_req_id, - parent_block_req_id, - blob_req_id, - parent_blob_req_id, - slot, - block_root, - }) - } - - fn trigger_unknown_block_from_attestation(mut self) -> Self { - let block_root = self.block.canonical_root(); - self.rig - .trigger_unknown_block_from_attestation(block_root, self.peer_id); - self - } - - fn parent_block_response(mut self) -> Self { - self.rig.expect_empty_network(); - let block = self.parent_block.pop_front().unwrap().clone(); - let _ = self.unknown_parent_block.insert(block.clone()); - self.rig.parent_lookup_block_response( - self.parent_block_req_id.expect("parent request id"), - self.peer_id, - Some(block), - ); - - self.rig.assert_parent_lookups_count(1); - self - } - - fn parent_block_response_expect_blobs(mut self) -> Self { - self.rig.expect_empty_network(); - let block = self.parent_block.pop_front().unwrap().clone(); - let _ = self.unknown_parent_block.insert(block.clone()); - self.rig.parent_lookup_block_response( - self.parent_block_req_id.expect("parent request id"), - self.peer_id, - Some(block), - ); - - // Expect blobs request after sending block - let s = self.expect_parent_blobs_request(); - - s.rig.assert_parent_lookups_count(1); - s - } - - fn parent_blob_response(mut self) -> Self { - let blobs = self.parent_blobs.pop_front().unwrap(); - let _ = self.unknown_parent_blobs.insert(blobs.clone()); - for blob in &blobs { - 
self.rig.parent_lookup_blob_response( - self.parent_blob_req_id.expect("parent blob request id"), - self.peer_id, - Some(blob.clone()), - ); - assert_eq!(self.rig.active_parent_lookups_count(), 1); - } - self.rig.parent_lookup_blob_response( - self.parent_blob_req_id.expect("parent blob request id"), - self.peer_id, - None, - ); - - self - } - - fn block_response_triggering_process(self) -> Self { - let mut me = self.block_response_and_expect_blob_request(); - me.rig.expect_block_process(ResponseType::Block); - - // The request should still be active. - assert_eq!(me.rig.active_single_lookups_count(), 1); - me - } - - fn block_response_and_expect_blob_request(mut self) -> Self { - // The peer provides the correct block, should not be penalized. Now the block should be sent - // for processing. - self.rig.single_lookup_block_response( - self.block_req_id.expect("block request id"), - self.peer_id, - Some(self.block.clone()), - ); - // After responding with block the node will issue a blob request - let mut s = self.expect_blobs_request(); - - s.rig.expect_empty_network(); - - // The request should still be active. 
- s.rig.assert_lookup_is_active(s.block.canonical_root()); - s - } - - fn blobs_response(mut self) -> Self { - self.rig - .log(&format!("blobs response {}", self.blobs.len())); - for blob in &self.blobs { - self.rig.single_lookup_blob_response( - self.blob_req_id.expect("blob request id"), - self.peer_id, - Some(blob.clone()), - ); - self.rig - .assert_lookup_is_active(self.block.canonical_root()); - } - self.rig.single_lookup_blob_response( - self.blob_req_id.expect("blob request id"), - self.peer_id, - None, - ); - self - } - - fn blobs_response_was_valid(mut self) -> Self { - self.rig.expect_empty_network(); - if !self.blobs.is_empty() { - self.rig.expect_block_process(ResponseType::Blob); - } - self - } - - fn expect_empty_beacon_processor(mut self) -> Self { - self.rig.expect_empty_beacon_processor(); - self - } - - fn empty_block_response(mut self) -> Self { - self.rig.single_lookup_block_response( - self.block_req_id.expect("block request id"), - self.peer_id, - None, - ); - self - } - - fn empty_blobs_response(mut self) -> Self { - self.rig.single_lookup_blob_response( - self.blob_req_id.expect("blob request id"), - self.peer_id, - None, - ); - self - } - - fn empty_parent_blobs_response(mut self) -> Self { - self.rig.parent_lookup_blob_response( - self.parent_blob_req_id.expect("blob request id"), - self.peer_id, - None, - ); - self - } - - fn block_missing_components(mut self) -> Self { - self.rig.single_block_component_processed( - self.block_req_id.expect("block request id").lookup_id, - BlockProcessingResult::Ok(AvailabilityProcessingStatus::MissingComponents( - self.block.slot(), - self.block_root, - )), - ); - self.rig.expect_empty_network(); - self.rig.assert_single_lookups_count(1); - self - } - - fn blob_imported(mut self) -> Self { - self.rig.single_blob_component_processed( - self.blob_req_id.expect("blob request id").lookup_id, - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported(self.block_root)), - ); - 
self.rig.expect_empty_network(); - self.rig.assert_single_lookups_count(0); - self - } - - fn block_imported(mut self) -> Self { - // Missing blobs should be the request is not removed, the outstanding blobs request should - // mean we do not send a new request. - self.rig.single_block_component_processed( - self.block_req_id - .or(self.blob_req_id) - .expect("block request id") - .lookup_id, - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported(self.block_root)), - ); - self.rig.expect_empty_network(); - self.rig.assert_single_lookups_count(0); - self - } - - fn parent_block_imported(mut self) -> Self { - let parent_root = *self.parent_block_roots.first().unwrap(); - self.rig - .log(&format!("parent_block_imported {parent_root:?}")); - self.rig.parent_block_processed( - self.block_root, - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported(parent_root)), - ); - self.rig.expect_no_requests_for(parent_root); - self.rig.assert_parent_lookups_count(0); - self - } - - fn parent_block_missing_components(mut self) -> Self { - let parent_root = *self.parent_block_roots.first().unwrap(); - self.rig - .log(&format!("parent_block_missing_components {parent_root:?}")); - self.rig.parent_block_processed( - self.block_root, - BlockProcessingResult::Ok(AvailabilityProcessingStatus::MissingComponents( - Slot::new(0), - parent_root, - )), - ); - self.rig.expect_no_requests_for(parent_root); - self - } - - fn parent_blob_imported(mut self) -> Self { - let parent_root = *self.parent_block_roots.first().unwrap(); - self.rig - .log(&format!("parent_blob_imported {parent_root:?}")); - self.rig.parent_blob_processed( - self.block_root, - BlockProcessingResult::Ok(AvailabilityProcessingStatus::Imported(parent_root)), - ); - - self.rig.expect_no_requests_for(parent_root); - self.rig.assert_parent_lookups_count(0); - self - } - - fn parent_block_unknown_parent(mut self) -> Self { - self.rig.log("parent_block_unknown_parent"); - let block = 
self.unknown_parent_block.take().unwrap(); - let max_len = self.rig.spec.max_blobs_per_block(block.epoch()) as usize; - // Now this block is the one we expect requests from - self.block = block.clone(); - let block = RpcBlock::new( - Some(block.canonical_root()), - block, - self.unknown_parent_blobs - .take() - .map(|vec| RuntimeVariableList::from_vec(vec, max_len)), - ) - .unwrap(); - self.rig.parent_block_processed( - self.block_root, - BlockProcessingResult::Err(BlockError::ParentUnknown { - parent_root: block.parent_root(), - }), - ); - assert_eq!(self.rig.active_parent_lookups_count(), 1); - self - } - - fn invalid_parent_processed(mut self) -> Self { - self.rig.parent_block_processed( - self.block_root, - BlockProcessingResult::Err(BlockError::BlockSlotLimitReached), - ); - assert_eq!(self.rig.active_parent_lookups_count(), 1); - self - } - - fn invalid_block_processed(mut self) -> Self { - self.rig.single_block_component_processed( - self.block_req_id.expect("block request id").lookup_id, - BlockProcessingResult::Err(BlockError::BlockSlotLimitReached), - ); - self.rig.assert_single_lookups_count(1); - self - } - - fn invalid_blob_processed(mut self) -> Self { - self.rig.log("invalid_blob_processed"); - self.rig.single_blob_component_processed( - self.blob_req_id.expect("blob request id").lookup_id, - BlockProcessingResult::Err(BlockError::AvailabilityCheck( - AvailabilityCheckError::InvalidBlobs(kzg::Error::KzgVerificationFailed), - )), - ); - self.rig.assert_single_lookups_count(1); - self - } - - fn missing_components_from_block_request(mut self) -> Self { - self.rig.single_block_component_processed( - self.block_req_id.expect("block request id").lookup_id, - BlockProcessingResult::Ok(AvailabilityProcessingStatus::MissingComponents( - self.slot, - self.block_root, - )), - ); - // Add block to da_checker so blobs request can continue - self.rig.insert_block_to_da_checker(self.block.clone()); - - self.rig.assert_single_lookups_count(1); - self - } - - fn 
complete_current_block_and_blobs_lookup(self) -> Self { - self.expect_block_request() - .block_response_and_expect_blob_request() - .blobs_response() - // TODO: Should send blobs for processing - .expect_block_process() - .block_imported() - } - - fn log(self, msg: &str) -> Self { - self.rig.log(msg); - self - } - - fn parent_block_then_empty_parent_blobs(self) -> Self { - self.log( - " Return empty blobs for parent, block errors with missing components, downscore", - ) - .parent_block_response() - .expect_parent_blobs_request() - .empty_parent_blobs_response() - .expect_penalty("NotEnoughResponsesReturned") - .log("Re-request parent blobs, succeed and import parent") - .expect_parent_blobs_request() - .parent_blob_response() - .expect_block_process() - .parent_block_missing_components() - // Insert new peer into child request before completing parent - .trigger_unknown_block_from_attestation() - .parent_blob_imported() - } - - fn expect_penalty(mut self, expect_penalty_msg: &'static str) -> Self { - self.rig.expect_penalty(self.peer_id, expect_penalty_msg); - self - } - fn expect_no_penalty(mut self) -> Self { - self.rig.expect_empty_network(); - self - } - fn expect_no_penalty_and_no_requests(mut self) -> Self { - self.rig.expect_empty_network(); - self - } - fn expect_block_request(mut self) -> Self { - let id = self - .rig - .expect_block_lookup_request(self.block.canonical_root()); - self.block_req_id = Some(id); - self - } - fn expect_blobs_request(mut self) -> Self { - let id = self - .rig - .expect_blob_lookup_request(self.block.canonical_root()); - self.blob_req_id = Some(id); - self - } - fn expect_parent_block_request(mut self) -> Self { - let id = self - .rig - .expect_block_parent_request(self.block.parent_root()); - self.parent_block_req_id = Some(id); - self - } - fn expect_parent_blobs_request(mut self) -> Self { - let id = self - .rig - .expect_blob_parent_request(self.block.parent_root()); - self.parent_blob_req_id = Some(id); - self - } - fn 
expect_no_blobs_request(mut self) -> Self { - self.rig.expect_empty_network(); - self - } - fn expect_no_block_request(mut self) -> Self { - self.rig.expect_empty_network(); - self - } - fn invalidate_blobs_too_few(mut self) -> Self { - self.blobs.pop().expect("blobs"); - self - } - fn expect_block_process(mut self) -> Self { - self.rig.expect_block_process(ResponseType::Block); - self - } - fn expect_no_active_lookups(self) -> Self { - self.rig.expect_no_active_lookups(); - self - } - fn search_parent_dup(mut self) -> Self { - self.rig - .trigger_unknown_parent_block(self.peer_id, self.block.clone()); - self - } - } - - #[test] - fn single_block_and_blob_lookup_block_returned_first_attestation() { - let Some(tester) = DenebTester::new(RequestTrigger::AttestationUnknownBlock) else { - return; - }; - tester - .block_response_and_expect_blob_request() - .blobs_response() - .block_missing_components() // blobs not yet imported - .blobs_response_was_valid() - .blob_imported(); // now blobs resolve as imported - } - - #[test] - fn single_block_response_then_empty_blob_response_attestation() { - let Some(tester) = DenebTester::new(RequestTrigger::AttestationUnknownBlock) else { - return; - }; - tester - .block_response_and_expect_blob_request() - .missing_components_from_block_request() - .empty_blobs_response() - .expect_penalty("NotEnoughResponsesReturned") - .expect_blobs_request() - .expect_no_block_request(); - } - - #[test] - fn single_invalid_block_response_then_blob_response_attestation() { - let Some(tester) = DenebTester::new(RequestTrigger::AttestationUnknownBlock) else { - return; - }; - tester - .block_response_triggering_process() - .invalid_block_processed() - .expect_penalty("lookup_block_processing_failure") - .expect_block_request() - .expect_no_blobs_request() - .blobs_response() - // blobs not sent for processing until the block is processed - .expect_no_penalty_and_no_requests(); - } - - #[test] - fn 
single_block_response_then_invalid_blob_response_attestation() { - let Some(tester) = DenebTester::new(RequestTrigger::AttestationUnknownBlock) else { - return; - }; - tester - .block_response_triggering_process() - .missing_components_from_block_request() - .blobs_response() - .invalid_blob_processed() - .expect_penalty("lookup_blobs_processing_failure") - .expect_blobs_request() - .expect_no_block_request(); - } - - #[test] - fn single_block_response_then_too_few_blobs_response_attestation() { - let Some(tester) = DenebTester::new(RequestTrigger::AttestationUnknownBlock) else { - return; - }; - tester - .block_response_triggering_process() - .missing_components_from_block_request() - .invalidate_blobs_too_few() - .blobs_response() - .expect_penalty("NotEnoughResponsesReturned") - .expect_blobs_request() - .expect_no_block_request(); - } - - // Test peer returning block that has unknown parent, and a new lookup is created - #[test] - fn parent_block_unknown_parent() { - let Some(tester) = DenebTester::new(RequestTrigger::GossipUnknownParentBlock(1)) else { - return; - }; - tester - .expect_empty_beacon_processor() - .parent_block_response_expect_blobs() - .parent_blob_response() - .expect_block_process() - .parent_block_unknown_parent() - .expect_parent_block_request() - .expect_empty_beacon_processor(); - } - - // Test peer returning invalid (processing) block, expect retry - #[test] - fn parent_block_invalid_parent() { - let Some(tester) = DenebTester::new(RequestTrigger::GossipUnknownParentBlock(1)) else { - return; - }; - tester - .parent_block_response_expect_blobs() - .parent_blob_response() - .expect_block_process() - .invalid_parent_processed() - .expect_penalty("lookup_block_processing_failure") - .expect_parent_block_request() - .expect_empty_beacon_processor(); - } - - // Tests that if a peer does not respond with a block, we downscore and retry the block only - #[test] - fn empty_block_is_retried() { - let Some(tester) = 
DenebTester::new(RequestTrigger::AttestationUnknownBlock) else { - return; - }; - tester - .empty_block_response() - .expect_penalty("NotEnoughResponsesReturned") - .expect_block_request() - .expect_no_blobs_request() - .block_response_and_expect_blob_request() - .blobs_response() - .block_imported() - .expect_no_active_lookups(); - } - - #[test] - fn parent_block_then_empty_parent_blobs() { - let Some(tester) = DenebTester::new(RequestTrigger::GossipUnknownParentBlock(1)) else { - return; - }; - tester - .parent_block_then_empty_parent_blobs() - .log("resolve original block trigger blobs request and import") - // Should not have block request, it is cached - .expect_blobs_request() - // TODO: Should send blobs for processing - .block_imported() - .expect_no_active_lookups(); - } - - #[test] - fn parent_blob_unknown_parent() { - let Some(tester) = DenebTester::new(RequestTrigger::GossipUnknownParentBlob(1)) else { - return; - }; - tester - .expect_empty_beacon_processor() - .parent_block_response_expect_blobs() - .parent_blob_response() - .expect_block_process() - .parent_block_unknown_parent() - .expect_parent_block_request() - .expect_empty_beacon_processor(); - } - - #[test] - fn parent_blob_invalid_parent() { - let Some(tester) = DenebTester::new(RequestTrigger::GossipUnknownParentBlob(1)) else { - return; - }; - tester - .expect_empty_beacon_processor() - .parent_block_response_expect_blobs() - .parent_blob_response() - .expect_block_process() - .invalid_parent_processed() - .expect_penalty("lookup_block_processing_failure") - .expect_parent_block_request() - // blobs are not sent until block is processed - .expect_empty_beacon_processor(); - } - - #[test] - fn parent_block_and_blob_lookup_parent_returned_first_blob_trigger() { - let Some(tester) = DenebTester::new(RequestTrigger::GossipUnknownParentBlob(1)) else { - return; - }; - tester - .parent_block_response() - .expect_parent_blobs_request() - .parent_blob_response() - .expect_block_process() - 
.trigger_unknown_block_from_attestation() - .parent_block_imported() - .complete_current_block_and_blobs_lookup() - .expect_no_active_lookups(); - } - - #[test] - fn parent_block_then_empty_parent_blobs_blob_trigger() { - let Some(tester) = DenebTester::new(RequestTrigger::GossipUnknownParentBlob(1)) else { - return; - }; - tester - .parent_block_then_empty_parent_blobs() - .log("resolve original block trigger blobs request and import") - .complete_current_block_and_blobs_lookup() - .expect_no_active_lookups(); - } - - #[test] - fn parent_blob_unknown_parent_chain() { - let Some(tester) = DenebTester::new(RequestTrigger::GossipUnknownParentBlob(2)) else { - return; - }; - tester - .expect_empty_beacon_processor() - .parent_block_response_expect_blobs() - .parent_blob_response() - .expect_no_penalty() - .expect_block_process() - .parent_block_unknown_parent() - .expect_parent_block_request() - .expect_empty_beacon_processor() - .parent_block_response() - .expect_parent_blobs_request() - .parent_blob_response() - .expect_no_penalty() - .expect_block_process(); - } - - #[test] - fn unknown_parent_block_dup() { - let Some(tester) = DenebTester::new(RequestTrigger::GossipUnknownParentBlock(1)) else { - return; - }; - tester - .search_parent_dup() - .expect_no_blobs_request() - .expect_no_block_request(); - } - - #[test] - fn unknown_parent_blob_dup() { - let Some(tester) = DenebTester::new(RequestTrigger::GossipUnknownParentBlob(1)) else { - return; - }; - tester - .search_parent_dup() - .expect_no_blobs_request() - .expect_no_block_request(); - } - - // This test no longer applies, we don't issue requests for child lookups - // Keep for after updating rules on fetching blocks only first - #[ignore] - #[test] - fn no_peer_penalty_when_rpc_response_already_known_from_gossip() { - let Some(mut r) = TestRig::test_setup_after_deneb_before_fulu() else { - return; - }; - let (block, blobs) = r.rand_block_and_blobs(NumBlobs::Number(2)); - let block_root = 
block.canonical_root(); - let blob_0 = blobs[0].clone(); - let blob_1 = blobs[1].clone(); - let peer_a = r.new_connected_peer(); - let peer_b = r.new_connected_peer(); - // Send unknown parent block lookup - r.trigger_unknown_parent_block(peer_a, block.into()); - // Expect network request for blobs - let id = r.expect_blob_lookup_request(block_root); - // Peer responses with blob 0 - r.single_lookup_blob_response(id, peer_a, Some(blob_0.into())); - // Blob 1 is received via gossip unknown parent blob from a different peer - r.trigger_unknown_parent_blob(peer_b, blob_1.clone()); - // Original peer sends blob 1 via RPC - r.single_lookup_blob_response(id, peer_a, Some(blob_1.into())); - // Assert no downscore event for original peer - r.expect_no_penalty_for(peer_a); - } -} diff --git a/beacon_node/network/src/sync/tests/mod.rs b/beacon_node/network/src/sync/tests/mod.rs index 3dca4571086..e9c2e84e4cc 100644 --- a/beacon_node/network/src/sync/tests/mod.rs +++ b/beacon_node/network/src/sync/tests/mod.rs @@ -1,14 +1,16 @@ use crate::sync::manager::SyncManager; -use crate::sync::range_sync::RangeSyncType; use crate::sync::SyncMessage; use crate::NetworkMessage; use beacon_chain::builder::Witness; use beacon_chain::eth1_chain::CachingEth1Backend; use beacon_chain::test_utils::{BeaconChainHarness, EphemeralHarnessType}; use beacon_processor::WorkEvent; +use lighthouse_network::service::api_types::ComponentsByRootRequestId; use lighthouse_network::NetworkGlobals; +pub use lookups::PeersConfig; use rand_chacha::ChaCha20Rng; use slot_clock::ManualSlotClock; +use std::collections::HashMap; use std::fs::OpenOptions; use std::io::Write; use std::sync::{Arc, Once}; @@ -17,7 +19,7 @@ use tokio::sync::mpsc; use tracing_subscriber::fmt::MakeWriter; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::util::SubscriberInitExt; -use types::{ChainSpec, ForkName, MinimalEthSpec as E}; +use types::{ChainSpec, ForkName, Hash256, MinimalEthSpec as E, SignedBeaconBlock}; mod 
lookups; mod range; @@ -69,6 +71,9 @@ struct TestRig { rng: ChaCha20Rng, fork_name: ForkName, spec: Arc, + + // Cache for produced blocks to serve + blocks_by_root: HashMap>>, } // Environment variable to read if `fork_from_env` feature is enabled. diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index c114eca555f..ae2a2416c11 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -1,28 +1,34 @@ use super::*; use crate::network_beacon_processor::ChainSegmentProcessId; use crate::status::ToStatusMessage; -use crate::sync::manager::SLOT_IMPORT_TOLERANCE; -use crate::sync::network_context::RangeRequestId; -use crate::sync::range_sync::RangeSyncType; +use crate::sync::manager::{BlockProcessingResult, SLOT_IMPORT_TOLERANCE}; +use crate::sync::network_context::{BlockComponentsByRootRequestStep, RangeRequestId}; +use crate::sync::tests::lookups::TestOptions; +use crate::sync::BatchProcessResult; use crate::sync::SyncMessage; use beacon_chain::data_column_verification::CustodyDataColumn; use beacon_chain::test_utils::{AttestationStrategy, BlockStrategy}; -use beacon_chain::{block_verification_types::RpcBlock, EngineState, NotifyExecutionLayer}; +use beacon_chain::{ + block_verification_types::RpcBlock, EngineState, NotifyExecutionLayer, + PayloadVerificationStatus, +}; use beacon_processor::WorkType; use lighthouse_network::rpc::methods::{ - BlobsByRangeRequest, DataColumnsByRangeRequest, OldBlocksByRangeRequest, - OldBlocksByRangeRequestV2, + BlobsByRootRequest, BlocksByRootRequest, DataColumnsByRootRequest, }; -use lighthouse_network::rpc::{RequestType, StatusMessage}; +use lighthouse_network::rpc::{RPCError, RequestType, RpcErrorResponse, StatusMessage}; use lighthouse_network::service::api_types::{ - AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, DataColumnsByRangeRequestId, - SyncRequestId, + AppRequestId, BlobsByRootRequestId, 
BlocksByRootRequestId, BlocksByRootRequester, + ComponentsByRootRequestId, DataColumnsByRootRequestId, HeaderLookupId, SyncRequestId, }; +use lighthouse_network::types::SyncState; use lighthouse_network::{PeerId, SyncInfo}; +use std::collections::HashSet; use std::time::Duration; use types::{ - BlobSidecarList, BlockImportSource, Epoch, EthSpec, Hash256, MinimalEthSpec as E, - SignedBeaconBlock, SignedBeaconBlockHash, Slot, + BeaconBlock, Blob, BlobSidecar, BlobSidecarList, BlockImportSource, ColumnIndex, + DataColumnSidecar, Epoch, EthSpec, Hash256, KzgCommitment, KzgProof, MinimalEthSpec as E, + Signature, SignedBeaconBlock, SignedBeaconBlockHash, Slot, VariableList, }; const D: Duration = Duration::new(0, 0); @@ -32,12 +38,38 @@ pub(crate) enum DataSidecars { DataColumns(Vec>), } -enum ByRangeDataRequestIds { +enum ByRootDataRequestIds { PreDeneb, - PrePeerDAS(BlobsByRangeRequestId, PeerId), - PostPeerDAS(Vec<(DataColumnsByRangeRequestId, PeerId)>), + PrePeerDAS(BlobsByRootRequestId, PeerId, BlobsByRootRequest), + PostPeerDAS(Vec<(DataColumnsByRootRequestId, PeerId, DataColumnsByRootRequest)>), +} + +impl ByRootDataRequestIds { + /// If there's a single active request, returns its peer, else panics + fn peer(&self) -> PeerId { + match self { + Self::PreDeneb => panic!("no requests PreDeneb"), + Self::PrePeerDAS(_, peer, _) => *peer, + Self::PostPeerDAS(reqs) => { + if reqs.len() != 1 { + panic!("Should have 1 PostPeerDAS request"); + } + reqs.first().expect("no PostPeerDAS requests").1 + } + } + } +} + +struct Config { + peers: PeersConfig, } +type BlocksByRootRequestData = (BlocksByRootRequestId, PeerId, BlocksByRootRequest); + +type BlobsByRootRequestData = (BlobsByRootRequestId, PeerId, BlobsByRootRequest); + +type DataColumnsByRootRequestData = (DataColumnsByRootRequestId, PeerId, DataColumnsByRootRequest); + /// Sync tests are usually written in the form: /// - Do some action /// - Expect a request to be sent @@ -46,29 +78,247 @@ enum ByRangeDataRequestIds 
{ /// To make writting tests succint, the machinery in this testing rig automatically identifies /// _which_ request to complete. Picking the right request is critical for tests to pass, so this /// filter allows better expressivity on the criteria to identify the right request. -#[derive(Default, Debug, Clone)] -struct RequestFilter { +#[derive(Default, Debug, Clone, Copy)] +pub struct RequestFilter { peer: Option, epoch: Option, + block_root: Option, + column_index: Option, + header_requests_only: bool, } +pub const NO_FILTER: RequestFilter = RequestFilter { + peer: None, + epoch: None, + block_root: None, + column_index: None, + header_requests_only: false, +}; + impl RequestFilter { - fn peer(mut self, peer: PeerId) -> Self { + pub fn peer(mut self, peer: PeerId) -> Self { self.peer = Some(peer); self } - fn epoch(mut self, epoch: u64) -> Self { + pub fn epoch(mut self, epoch: u64) -> Self { self.epoch = Some(epoch); self } + + pub fn block_root(mut self, block_root: Hash256) -> Self { + self.block_root = Some(block_root); + self + } + + pub fn column_index(mut self, index: u64) -> Self { + self.column_index = Some(index); + self + } + + pub fn header_requests_only(mut self) -> Self { + self.header_requests_only = true; + self + } + + fn blocks_by_root_requests( + &self, + ev: &NetworkMessage, + ) -> Option { + match ev { + NetworkMessage::SendRequest { + peer_id, + request: RequestType::BlocksByRoot(req), + app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRoot(id)), + } if self.matches_blocks_by_root(peer_id, req, id) => { + Some((*id, *peer_id, req.clone())) + } + _ => None, + } + } + + fn blobs_by_root_requests( + &self, + ev: &NetworkMessage, + ) -> Option { + match ev { + NetworkMessage::SendRequest { + peer_id, + request: RequestType::BlobsByRoot(req), + app_request_id: AppRequestId::Sync(SyncRequestId::BlobsByRoot(id)), + } if self.matches_blobs_by_root(peer_id, req) => Some((*id, *peer_id, req.clone())), + _ => None, + } + } + + fn 
data_columns_by_root_requests( + &self, + ev: &NetworkMessage, + ) -> Option { + match ev { + NetworkMessage::SendRequest { + peer_id, + request: RequestType::DataColumnsByRoot(req), + app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRoot(id)), + } if self.matches_data_columns_by_root(peer_id, req) => { + Some((*id, *peer_id, req.clone())) + } + _ => None, + } + } + + fn matches_blocks_by_root( + &self, + peer: &PeerId, + req: &BlocksByRootRequest, + id: &BlocksByRootRequestId, + ) -> bool { + if self.header_requests_only + && !matches!(id.parent_request_id, BlocksByRootRequester::Header(_)) + { + return false; + } + + if let Some(block_root) = self.block_root { + if !req.block_roots().iter().any(|b| *b == block_root) { + return false; + } + } + + self.matches_peer(peer) + } + + fn matches_blobs_by_root(&self, peer: &PeerId, req: &BlobsByRootRequest) -> bool { + if self.header_requests_only { + return false; + } + + if let Some(block_root) = self.block_root { + if !req.blob_ids.iter().any(|id| id.block_root == block_root) { + return false; + } + } + + self.matches_peer(peer) + } + + fn matches_data_columns_by_root(&self, peer: &PeerId, req: &DataColumnsByRootRequest) -> bool { + if self.header_requests_only { + return false; + } + + if let Some(index) = self.column_index { + if !req + .data_column_ids + .iter() + .any(|id| id.columns.iter().any(|i| *i == index)) + { + return false; + } + } + self.matches_peer(peer) + } + + fn matches_common(&self, peer: &PeerId, start_slot: u64) -> bool { + if let Some(expected_epoch) = self.epoch { + let epoch = Slot::new(start_slot).epoch(E::slots_per_epoch()).as_u64(); + if epoch != expected_epoch { + return false; + } + } + self.matches_peer(peer) + } + + fn matches_peer(&self, peer: &PeerId) -> bool { + if let Some(expected_peer) = self.peer { + if *peer != expected_peer { + return false; + } + } + true + } } -fn filter() -> RequestFilter { +pub fn filter() -> RequestFilter { RequestFilter::default() } +/// 
Instruct the testing rig how to complete requests for _by_range requests +pub struct CompleteConfig { + block_count: usize, + with_data: bool, + custody_failure_at_index: Option, + rpc_error: Option, + empty_sampling_response_once: bool, + stop_at_block: Option, + return_wrong_blocks: bool, + return_no_blocks_n_times: usize, + process_error: bool, +} + +impl CompleteConfig { + pub fn custody_failure_at_index(mut self, index: u64) -> Self { + self.custody_failure_at_index = Some(index); + self + } + + pub fn rpc_error(mut self, error: RPCError) -> Self { + self.rpc_error = Some(error); + self + } + + pub fn rpc_error_response(self, error: RpcErrorResponse) -> Self { + self.rpc_error(RPCError::ErrorResponse(error, "".to_owned())) + } + + pub fn empty_sampling_response_once(mut self) -> Self { + self.empty_sampling_response_once = true; + self + } + + pub fn stop_at_block(mut self, block: Hash256) -> Self { + self.stop_at_block = Some(block); + self + } + + pub fn return_wrong_blocks(mut self) -> Self { + self.return_wrong_blocks = true; + self + } + + pub fn return_no_blocks(self) -> Self { + self.return_no_blocks_n_times(usize::MAX) + } + + pub fn return_no_blocks_n_times(mut self, n_times: usize) -> Self { + self.return_no_blocks_n_times = n_times; + self + } +} + +pub fn complete() -> CompleteConfig { + CompleteConfig { + block_count: 1, + with_data: true, + custody_failure_at_index: None, + rpc_error: None, + empty_sampling_response_once: false, + stop_at_block: None, + return_wrong_blocks: false, + return_no_blocks_n_times: 0, + process_error: false, + } +} + impl TestRig { + fn our_custody_indices(&self) -> Vec { + self.network_globals + .sampling_columns() + .iter() + .copied() + .collect() + } + /// Produce a head peer with an advanced head fn add_head_peer(&mut self) -> PeerId { self.add_head_peer_with_root(Hash256::random()) @@ -77,7 +327,7 @@ impl TestRig { /// Produce a head peer with an advanced head fn add_head_peer_with_root(&mut self, head_root: 
Hash256) -> PeerId { let local_info = self.local_info(); - self.add_random_peer(SyncInfo { + self.add_connected_sync_random_peer(SyncInfo { head_root, head_slot: local_info.head_slot + 1 + Slot::new(SLOT_IMPORT_TOLERANCE as u64), ..local_info @@ -93,7 +343,7 @@ impl TestRig { fn add_finalized_peer_with_root(&mut self, finalized_root: Hash256) -> PeerId { let local_info = self.local_info(); let finalized_epoch = local_info.finalized_epoch + 2; - self.add_random_peer(SyncInfo { + self.add_connected_sync_random_peer(SyncInfo { finalized_epoch, finalized_root, head_slot: finalized_epoch.start_slot(E::slots_per_epoch()), @@ -128,57 +378,24 @@ impl TestRig { } } - fn add_random_peer_not_supernode(&mut self, remote_info: SyncInfo) -> PeerId { - let peer_id = self.new_connected_peer(); - self.send_sync_message(SyncMessage::AddPeer(peer_id, remote_info)); - peer_id + fn add_connected_sync_peer_not_supernode(&mut self, remote_info: SyncInfo) -> PeerId { + self.add_sync_peer(false, remote_info) } - fn add_random_peer(&mut self, remote_info: SyncInfo) -> PeerId { + fn add_connected_sync_random_peer(&mut self, remote_info: SyncInfo) -> PeerId { // Create valid peer known to network globals // TODO(fulu): Using supernode peers to ensure we have peer across all column // subnets for syncing. Should add tests connecting to full node peers. 
- let peer_id = self.new_connected_supernode_peer(); - // Send peer to sync - self.send_sync_message(SyncMessage::AddPeer(peer_id, remote_info)); - peer_id + self.add_sync_peer(true, remote_info) } - fn add_random_peers(&mut self, remote_info: SyncInfo, count: usize) { - for _ in 0..count { - let peer = self.new_connected_peer(); - self.add_peer(peer, remote_info.clone()); - } + fn get_sync_state(&mut self) -> SyncState { + self.sync_manager.network().network_globals().sync_state() } - fn add_peer(&mut self, peer: PeerId, remote_info: SyncInfo) { - self.send_sync_message(SyncMessage::AddPeer(peer, remote_info)); - } - - fn assert_state(&self, state: RangeSyncType) { - assert_eq!( - self.sync_manager - .range_sync_state() - .expect("State is ok") - .expect("Range should be syncing, there are no chains") - .0, - state, - "not expected range sync state" - ); - } - - fn assert_no_chains_exist(&self) { - if let Some(chain) = self.sync_manager.get_range_sync_chains().unwrap() { - panic!("There still exists a chain {chain:?}"); - } - } - - fn assert_no_failed_chains(&mut self) { - assert_eq!( - self.sync_manager.__range_failed_chains(), - Vec::::new(), - "Expected no failed chains" - ) + fn assert_sync_state(&mut self, expected_state: SyncState) { + let current_state = self.sync_manager.network().network_globals().sync_state(); + assert_eq!(current_state, expected_state); } #[track_caller] @@ -191,186 +408,431 @@ impl TestRig { } } + fn expect_active_block_components_requests_on_custody_step(&mut self) { + let requests = self + .sync_manager + .network() + .active_block_components_requests(); + if requests.is_empty() { + panic!("No active block components requests"); + } + for (id, step) in requests { + if !matches!(step, BlockComponentsByRootRequestStep::CustodyRequest) { + panic!("block components request {id} is not on CustodyRequest step: {step:?}"); + } + } + } + + fn expect_no_active_block_components_requests(&mut self) { + let requests = self + .sync_manager + 
.network() + .active_block_components_requests(); + if !requests.is_empty() { + panic!("Still active block components requests {requests:?}"); + } + } + + fn expect_no_active_rpc_requests(&mut self) { + let requests = self + .sync_manager + .network() + .active_requests() + .collect::>(); + if !requests.is_empty() { + panic!("There are still active RPC requests {requests:?}"); + } + } + fn update_execution_engine_state(&mut self, state: EngineState) { self.log(&format!("execution engine state updated: {state:?}")); self.sync_manager.update_execution_engine_state(state); } - fn find_blocks_by_range_request( - &mut self, - request_filter: RequestFilter, - ) -> ((BlocksByRangeRequestId, PeerId), ByRangeDataRequestIds) { - let filter_f = |peer: PeerId, start_slot: u64| { - if let Some(expected_epoch) = request_filter.epoch { - let epoch = Slot::new(start_slot).epoch(E::slots_per_epoch()).as_u64(); - if epoch != expected_epoch { - return false; - } - } - if let Some(expected_peer) = request_filter.peer { - if peer != expected_peer { - return false; - } + fn zero_block_at_slot(&mut self, slot: Slot, with_data: bool) -> SignedBeaconBlock { + let mut block = BeaconBlock::empty(&self.spec); + if with_data { + if let Ok(blob_kzg_commitments) = block.body_mut().blob_kzg_commitments_mut() { + blob_kzg_commitments + .push(KzgCommitment([0; 48])) + .expect("pushed to empty kzg commitments"); } - true - }; + } + *block.slot_mut() = slot; + SignedBeaconBlock::from_block(block, Signature::empty()) + } - let block_req = self - .pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id, - request: - RequestType::BlocksByRange(OldBlocksByRangeRequest::V2( - OldBlocksByRangeRequestV2 { start_slot, .. 
}, - )), - app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRange(id)), - } if filter_f(*peer_id, *start_slot) => Some((*id, *peer_id)), - _ => None, - }) - .unwrap_or_else(|e| { - panic!("Should have a BlocksByRange request, filter {request_filter:?}: {e:?}") - }); + pub fn create_unimported_parent_chain(&mut self, block_count: usize) -> (Hash256, Slot) { + self.log(&format!( + "Creating unimported chain of {block_count} blocks" + )); - let by_range_data_requests = if self.after_fulu() { - let mut data_columns_requests = vec![]; - while let Ok(data_columns_request) = self.pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id, - request: - RequestType::DataColumnsByRange(DataColumnsByRangeRequest { - start_slot, .. - }), - app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), - } if filter_f(*peer_id, *start_slot) => Some((*id, *peer_id)), - _ => None, - }) { - data_columns_requests.push(data_columns_request); - } - if data_columns_requests.is_empty() { - panic!("Found zero DataColumnsByRange requests, filter {request_filter:?}"); - } - ByRangeDataRequestIds::PostPeerDAS(data_columns_requests) - } else if self.after_deneb() { - let (id, peer) = self - .pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id, - request: RequestType::BlobsByRange(BlobsByRangeRequest { start_slot, .. 
}), - app_request_id: AppRequestId::Sync(SyncRequestId::BlobsByRange(id)), - } if filter_f(*peer_id, *start_slot) => Some((*id, *peer_id)), - _ => None, - }) - .unwrap_or_else(|e| { - panic!("Should have a blobs by range request, filter {request_filter:?}: {e:?}") - }); - ByRangeDataRequestIds::PrePeerDAS(id, peer) - } else { - ByRangeDataRequestIds::PreDeneb - }; + let current_head = self.harness.chain.head(); + let mut parent_root = current_head.head_block_root(); + let mut prev_slot = current_head.head_slot(); + for _ in 0..block_count { + let slot = prev_slot + Slot::new(1); + let mut block = self.zero_block_at_slot(slot, true); + *block.message_mut().parent_root_mut() = parent_root; + *block.message_mut().slot_mut() = slot; + let block_root = block.canonical_root(); + self.log(&format!("Block slot {slot} root {block_root:?}")); + self.blocks_by_root.insert(block_root, block.into()); + + parent_root = block_root; + prev_slot = slot; + } + (parent_root, prev_slot) + } - (block_req, by_range_data_requests) + fn send_rpc_error(&mut self, id: SyncRequestId, peer_id: PeerId, error: RPCError) { + self.log(&format!( + "Completing request {id:?} to {peer_id} with RPCError {error:?}" + )); + self.send_sync_message(SyncMessage::RpcError { + sync_request_id: id, + peer_id, + error, + }); } - fn find_and_complete_blocks_by_range_request( + fn send_blocks_by_root_response( &mut self, - request_filter: RequestFilter, - ) -> RangeRequestId { - let ((blocks_req_id, block_peer), by_range_data_request_ids) = - self.find_blocks_by_range_request(request_filter); - - // Complete the request with a single stream termination + req_id: BlocksByRootRequestId, + peer_id: PeerId, + blocks: &[Arc>], + ) { + let slots = blocks.iter().map(|block| block.slot()).collect::>(); self.log(&format!( - "Completing BlocksByRange request {blocks_req_id:?} with empty stream" + "Completing BlocksByRoot request {req_id} to {peer_id} with blocks {slots:?}" )); + + for block in blocks { + 
self.send_sync_message(SyncMessage::RpcBlock { + sync_request_id: SyncRequestId::BlocksByRoot(req_id), + peer_id, + beacon_block: Some(block.clone()), + seen_timestamp: D, + }); + } self.send_sync_message(SyncMessage::RpcBlock { - sync_request_id: SyncRequestId::BlocksByRange(blocks_req_id), - peer_id: block_peer, + sync_request_id: SyncRequestId::BlocksByRoot(req_id), + peer_id, beacon_block: None, seen_timestamp: D, }); + } + + fn send_blobs_by_root_response( + &mut self, + id: BlobsByRootRequestId, + peer_id: PeerId, + blobs: &[Arc>], + ) { + let mut ids = blobs + .iter() + .map(|d| (d.slot().as_u64(), d.index)) + .collect::>(); + ids.sort_unstable(); + self.log(&format!( + "Completing BlobsByRoot request {id} to {peer_id} with data_columns {ids:?}" + )); - match by_range_data_request_ids { - ByRangeDataRequestIds::PreDeneb => {} - ByRangeDataRequestIds::PrePeerDAS(id, peer_id) => { - // Complete the request with a single stream termination + for blob in blobs { + self.send_sync_message(SyncMessage::RpcBlob { + sync_request_id: SyncRequestId::BlobsByRoot(id), + peer_id, + blob_sidecar: Some(blob.clone()), + seen_timestamp: D, + }); + } + self.send_sync_message(SyncMessage::RpcBlob { + sync_request_id: SyncRequestId::BlobsByRoot(id), + peer_id, + blob_sidecar: None, + seen_timestamp: D, + }); + } + + fn send_data_columns_by_root_response( + &mut self, + id: DataColumnsByRootRequestId, + peer_id: PeerId, + data_columns: &[Arc>], + ) { + let mut ids = data_columns + .iter() + .map(|d| (d.slot().as_u64(), d.index)) + .collect::>(); + ids.sort_unstable(); + self.log(&format!( + "Completing DataColumnsByRoot request {id} to {peer_id} with data_columns {ids:?}" + )); + + for data_column in data_columns { + self.send_sync_message(SyncMessage::RpcDataColumn { + sync_request_id: SyncRequestId::DataColumnsByRoot(id), + peer_id, + data_column: Some(data_column.clone()), + seen_timestamp: D, + }); + } + self.send_sync_message(SyncMessage::RpcDataColumn { + sync_request_id: 
SyncRequestId::DataColumnsByRoot(id), + peer_id, + data_column: None, + seen_timestamp: D, + }); + } + + fn complete_blocks_by_root_request( + &mut self, + request: BlocksByRootRequestData, + config: &mut CompleteConfig, + ) { + let (req_id, peer, req) = request; + if let Some(error) = &config.rpc_error { + self.send_rpc_error(SyncRequestId::BlocksByRoot(req_id), peer, error.clone()); + return; + } + + if config.return_no_blocks_n_times > 0 { + config.return_no_blocks_n_times -= 1; + self.send_blocks_by_root_response(req_id, peer, &[]); + return; + } + + let blocks = req + .block_roots() + .iter() + .map(|block_root| { + if config.return_wrong_blocks { + Arc::new(self.rand_block()) + } else { + self.blocks_by_root + .get(block_root) + .expect("Test consumer requested unknown block") + .clone() + } + }) + .collect::>(); + + self.send_blocks_by_root_response(req_id, peer, &blocks); + } + + fn complete_blobs_by_root_request_range_sync( + &mut self, + (id, peer_id, req): BlobsByRootRequestData, + complete_config: &CompleteConfig, + ) { + let blobs = req + .blob_ids + .iter() + .flat_map(|blob_id| { + let block = self + .blocks_by_root + .get(&blob_id.block_root) + .expect("Test consumer requested unknown block") + .clone(); + + let kzg_commitment_inclusion_proof = block + .message() + .body() + .kzg_commitment_merkle_proof(blob_id.index as usize) + .unwrap(); + let kzg_commitment = *block + .message() + .body() + .blob_kzg_commitments() + .unwrap() + .get(blob_id.index as usize) + .unwrap(); + let signed_block_header = block.signed_block_header(); + + // We need to produce a DataColumn with valid inclusion proof, but can + // be with random KZG proof and data as we won't send it for processing + Some(Arc::new(BlobSidecar { + index: blob_id.index, + blob: Blob::::default(), + kzg_commitment, + kzg_proof: KzgProof::empty(), + signed_block_header, + kzg_commitment_inclusion_proof, + })) + }) + .collect::>(); + + self.send_blobs_by_root_response(id, peer_id, &blobs); + } + 
+ fn complete_data_columns_by_root_request_range_sync( + &mut self, + (id, peer_id, req): DataColumnsByRootRequestData, + complete_config: &CompleteConfig, + ) { + // To reply with a valid DataColumnsByRange we need to construct + // DataColumnsByRange for the block root that we requested the block peer, plus + // figure out which exact columns we requested this peer + let mut triggered_custody_failure = false; + + let data_columns = req + .data_column_ids + .iter() + .flat_map(|column_id| { + let block = self + .blocks_by_root + .get(&column_id.block_root) + .expect("Test consumer requested unknown block") + .clone(); + + let kzg_commitments_inclusion_proof = block + .message() + .body() + .kzg_commitments_merkle_proof() + .unwrap(); + let kzg_commitments = block + .message() + .body() + .blob_kzg_commitments() + .unwrap() + .clone(); + let signed_block_header = block.signed_block_header(); + + column_id.columns.iter().filter_map(move |index| { + // Skip column generation if index is marked as failure + if complete_config.custody_failure_at_index == Some(*index) { + triggered_custody_failure = true; + return None; + } + + // We need to produce a DataColumn with valid inclusion proof, but can + // be with random KZG proof and data as we won't send it for processing + Some(Arc::new(DataColumnSidecar { + index: *index, + column: VariableList::empty(), + kzg_commitments: kzg_commitments.clone(), + kzg_proofs: VariableList::from(vec![]), + signed_block_header: signed_block_header.clone(), + kzg_commitments_inclusion_proof: kzg_commitments_inclusion_proof.clone(), + })) + }) + }) + .collect::>(); + + // Need to log here because I can't capture &mut self inside the columns iter + if let Some(target_index) = complete_config.custody_failure_at_index { + if req + .data_column_ids + .iter() + .any(|id| id.columns.iter().any(|index| *index == target_index)) + { self.log(&format!( - "Completing BlobsByRange request {id:?} with empty stream" + "Forced custody failure at request 
{id} for peer {peer_id} index {target_index:?}" )); - self.send_sync_message(SyncMessage::RpcBlob { - sync_request_id: SyncRequestId::BlobsByRange(id), - peer_id, - blob_sidecar: None, - seen_timestamp: D, - }); - } - ByRangeDataRequestIds::PostPeerDAS(data_column_req_ids) => { - // Complete the request with a single stream termination - for (id, peer_id) in data_column_req_ids { - self.log(&format!( - "Completing DataColumnsByRange request {id:?} with empty stream" - )); - self.send_sync_message(SyncMessage::RpcDataColumn { - sync_request_id: SyncRequestId::DataColumnsByRange(id), - peer_id, - data_column: None, - seen_timestamp: D, - }); - } } } - blocks_req_id.parent_request_id.requester + self.send_data_columns_by_root_response(id, peer_id, &data_columns); } - fn find_and_complete_processing_chain_segment(&mut self, id: ChainSegmentProcessId) { - self.pop_received_processor_event(|ev| { - (ev.work_type() == WorkType::ChainSegment).then_some(()) - }) - .unwrap_or_else(|e| panic!("Expected chain segment work event: {e}")); + fn complete_block_processing(&mut self, ids: Vec, config: &CompleteConfig) { + if config.process_error { + for id in &ids { + self.send_sync_message(SyncMessage::BatchProcessed { + sync_type: ChainSegmentProcessId::ForwardSync(*id), + result: BatchProcessResult::Failure { + peer_action: None, + error: "test error".to_owned(), + }, + }); + } + } - self.log(&format!( - "Completing ChainSegment processing work {id:?} with success" - )); - self.send_sync_message(SyncMessage::BatchProcessed { - sync_type: id, - result: crate::sync::BatchProcessResult::Success { - sent_blocks: 8, - imported_blocks: 8, - }, - }); + // Sort ids first as we need to process blocks in order of ancestors. This only works if the + // test does not send blocks of two parallel chains at once. 
+ let mut blocks = ids + .into_iter() + .map(|id| { + let block = self + .blocks_by_root + .get(&id.block_root) + .cloned() + .expect("unknown block"); + (id, block) + }) + .collect::>(); + blocks.sort_by_key(|(_, block)| block.slot()); + + for (id, block) in blocks { + self.log(&format!( + "Completing block processing {id} slot {}", + block.slot() + )); + + { + let mut head_state = self.harness.chain.head().snapshot.beacon_state.clone(); + *head_state.slot_mut() = block.slot(); + + let mut fork_choice = self.harness.chain.canonical_head.fork_choice_write_lock(); + fork_choice + .on_block( + block.slot(), + block.message(), + id.block_root, + Duration::from_secs(0), + &head_state, + PayloadVerificationStatus::Verified, + &self.spec, + ) + .expect("error importing block to fork-choice"); + } + + self.send_sync_message(SyncMessage::BatchProcessed { + sync_type: ChainSegmentProcessId::ForwardSync(id), + result: BatchProcessResult::Success, + }); + } } - fn complete_and_process_range_sync_until( + pub fn progress_until_no_events( &mut self, - last_epoch: u64, request_filter: RequestFilter, + mut complete_config: CompleteConfig, ) { - for epoch in 0..last_epoch { - // Note: In this test we can't predict the block peer - let id = - self.find_and_complete_blocks_by_range_request(request_filter.clone().epoch(epoch)); - if let RangeRequestId::RangeSync { batch_id, .. 
} = id { - assert_eq!(batch_id.as_u64(), epoch, "Unexpected batch_id"); - } else { - panic!("unexpected RangeRequestId {id:?}"); + self.log(&format!("progress until no events {request_filter:?}")); + loop { + if let Ok(request) = self + .pop_received_network_event(&mut |ev| request_filter.blocks_by_root_requests(ev)) + { + self.complete_blocks_by_root_request(request, &mut complete_config); + continue; } - let id = match id { - RangeRequestId::RangeSync { chain_id, batch_id } => { - ChainSegmentProcessId::RangeBatchId(chain_id, batch_id) - } - RangeRequestId::BackfillSync { batch_id } => { - ChainSegmentProcessId::BackSyncBatchId(batch_id) - } - }; - - self.find_and_complete_processing_chain_segment(id); - if epoch < last_epoch - 1 { - self.assert_state(RangeSyncType::Finalized); - } else { - self.assert_no_chains_exist(); - self.assert_no_failed_chains(); + if let Ok(request) = + self.pop_received_network_event(&mut |ev| request_filter.blobs_by_root_requests(ev)) + { + self.complete_blobs_by_root_request_range_sync(request, &complete_config); + continue; + } + + if let Ok(request) = self.pop_received_network_event(&mut |ev| { + request_filter.data_columns_by_root_requests(ev) + }) { + self.complete_data_columns_by_root_request_range_sync(request, &complete_config); + continue; + } + + // TODO(tree-sync): find a way to get this info from the beacon processor events + let ids = self.sync_manager.forward_sync().get_processing_ids(); + if !ids.is_empty() { + self.complete_block_processing(ids, &complete_config); + continue; } + + let sync_state = self.get_sync_state(); + self.log(&format!("Progressed sync, current state: {:?}", sync_state,)); + + return; } } @@ -456,158 +918,126 @@ fn build_rpc_block( } } -#[test] -fn head_chain_removed_while_finalized_syncing() { - // NOTE: this is a regression test. 
- // Added in PR https://github.com/sigp/lighthouse/pull/2821 - let mut rig = TestRig::test_setup(); - - // Get a peer with an advanced head - let head_peer = rig.add_head_peer(); - rig.assert_state(RangeSyncType::Head); - - // Sync should have requested a batch, grab the request. - let _ = rig.find_blocks_by_range_request(filter().peer(head_peer)); - - // Now get a peer with an advanced finalized epoch. - let finalized_peer = rig.add_finalized_peer(); - rig.assert_state(RangeSyncType::Finalized); - - // Sync should have requested a batch, grab the request - let _ = rig.find_blocks_by_range_request(filter().peer(finalized_peer)); - - // Fail the head chain by disconnecting the peer. - rig.peer_disconnected(head_peer); - rig.assert_state(RangeSyncType::Finalized); -} - -#[tokio::test] -async fn state_update_while_purging() { - // NOTE: this is a regression test. - // Added in PR https://github.com/sigp/lighthouse/pull/2827 - let mut rig = TestRig::test_setup(); - - // Create blocks on a separate harness - let mut rig_2 = TestRig::test_setup(); - // Need to create blocks that can be inserted into the fork-choice and fit the "known - // conditions" below. - let head_peer_block = rig_2.create_canonical_block().await; - let head_peer_root = head_peer_block.0.canonical_root(); - let finalized_peer_block = rig_2.create_canonical_block().await; - let finalized_peer_root = finalized_peer_block.0.canonical_root(); - - // Get a peer with an advanced head - let head_peer = rig.add_head_peer_with_root(head_peer_root); - rig.assert_state(RangeSyncType::Head); - - // Sync should have requested a batch, grab the request. - let _ = rig.find_blocks_by_range_request(filter().peer(head_peer)); - - // Now get a peer with an advanced finalized epoch. 
- let finalized_peer = rig.add_finalized_peer_with_root(finalized_peer_root); - rig.assert_state(RangeSyncType::Finalized); - - // Sync should have requested a batch, grab the request - let _ = rig.find_blocks_by_range_request(filter().peer(finalized_peer)); - - // Now the chain knows both chains target roots. - rig.remember_block(head_peer_block).await; - rig.remember_block(finalized_peer_block).await; - - // Add an additional peer to the second chain to make range update it's status - rig.add_finalized_peer(); -} - -#[test] -fn pause_and_resume_on_ee_offline() { - let mut rig = TestRig::test_setup(); - - // add some peers - let peer1 = rig.add_head_peer(); - // make the ee offline - rig.update_execution_engine_state(EngineState::Offline); - // send the response to the request - rig.find_and_complete_blocks_by_range_request(filter().peer(peer1).epoch(0)); - // the beacon processor shouldn't have received any work - rig.expect_empty_processor(); - - // while the ee is offline, more peers might arrive. Add a new finalized peer. - let _peer2 = rig.add_finalized_peer(); - - // send the response to the request - // Don't filter requests and the columns requests may be sent to peer1 or peer2 - // We need to filter by epoch, because the previous batch eagerly sent requests for the next - // epoch for the other batch. So we can either filter by epoch of by sync type. - rig.find_and_complete_blocks_by_range_request(filter().epoch(0)); - // the beacon processor shouldn't have received any work - rig.expect_empty_processor(); - // make the beacon processor available again. - // update_execution_engine_state implicitly calls resume - // now resume range, we should have two processing requests in the beacon processor. 
- rig.update_execution_engine_state(EngineState::Online); - - // The head chain and finalized chain (2) should be in the processing queue - rig.expect_chain_segments(2); +fn sync_info_with_head_root(head_root: Hash256) -> SyncInfo { + SyncInfo { + head_slot: Slot::new(1), + head_root, + finalized_epoch: Epoch::new(0), + finalized_root: Hash256::ZERO, + } } /// To attempt to finalize the peer's status finalized checkpoint we synced to its finalized epoch + /// 2 epochs + 1 slot. const EXTRA_SYNCED_EPOCHS: u64 = 2 + 1; -#[test] -fn finalized_sync_enough_global_custody_peers_few_chain_peers() { - // Run for all forks - let mut r = TestRig::test_setup(); - // This test creates enough global custody peers to satisfy column queries but only adds few - // peers to the chain - r.new_connected_peers_for_peerdas(); - - let advanced_epochs: u64 = 2; - let remote_info = r.finalized_remote_info_advanced_by(advanced_epochs.into()); - - // Current priorization only sends batches to idle peers, so we need enough peers for each batch - // TODO: Test this with a single peer in the chain, it should still work - r.add_random_peers( - remote_info, - (advanced_epochs + EXTRA_SYNCED_EPOCHS) as usize, - ); - r.assert_state(RangeSyncType::Finalized); +// Same test with different types of peers: +// - 100 peers +// - 1 supernode +// - perfectly distributed peer ids - let last_epoch = advanced_epochs + EXTRA_SYNCED_EPOCHS; - r.complete_and_process_range_sync_until(last_epoch, filter()); +#[test] +fn finalized_sync_not_enough_custody_peers_on_start_supernode_only() { + finalized_sync_not_enough_custody_peers_on_start(Config { + peers: PeersConfig::SupernodeOnly, + }); } #[test] -fn finalized_sync_not_enough_custody_peers_on_start() { - let mut r = TestRig::test_setup(); +fn finalized_sync_not_enough_custody_peers_on_start_supernode_and_random() { + finalized_sync_not_enough_custody_peers_on_start(Config { + peers: PeersConfig::SupernodeAndRandom, + }); +} + +fn 
finalized_sync_not_enough_custody_peers_on_start(config: Config) { + let mut r = TestRig::test_setup_as_supernode(); // Only run post-PeerDAS if !r.fork_name.fulu_enabled() { return; } - let advanced_epochs: u64 = 2; - let remote_info = r.finalized_remote_info_advanced_by(advanced_epochs.into()); + let (head_root, _) = r.create_unimported_parent_chain(2); + let remote_info = sync_info_with_head_root(head_root); + r.add_sync_peer(false, remote_info.clone()); - // Unikely that the single peer we added has enough columns for us. Tests are determinstic and - // this error should never be hit - r.add_random_peer_not_supernode(remote_info.clone()); - r.assert_state(RangeSyncType::Finalized); + // We are a supernode, and just added a single non-supernode peer. The custody by root request + // will stall as many columns have zero peers. + r.progress_until_no_events(NO_FILTER, complete()); + r.expect_no_active_rpc_requests(); - // Because we don't have enough peers on all columns we haven't sent any request. - // NOTE: There's a small chance that this single peer happens to custody exactly the set we - // expect, in that case the test will fail. Find a way to make the test deterministic. - r.expect_empty_network(); + // Here we have a batch with partially completed block_components_by_range requests. The batch + // should not have failed, we are still syncing, and there are no downscoring events. + r.expect_no_penalty_for_anyone(); + r.expect_active_block_components_requests_on_custody_step(); // Generate enough peers and supernodes to cover all custody columns - r.new_connected_peers_for_peerdas(); + r.add_sync_peers(config.peers, remote_info.clone()); // Note: not necessary to add this peers to the chain, as we draw from the global pool // We still need to add enough peers to trigger batch downloads with idle peers. Same issue as // the test above. 
- r.add_random_peers( - remote_info, - (advanced_epochs + EXTRA_SYNCED_EPOCHS - 1) as usize, + + r.progress_until_no_events(NO_FILTER, complete()); + r.expect_no_active_rpc_requests(); + r.expect_no_active_block_components_requests(); + // TODO(das): For now these tests don't complete sync. We can't track beacon processor Work + // events from here easily. What we pop from the beacon processor queue is an opaque closure + // without any information. We don't know what batch it is for. +} + +#[test] +fn finalized_sync_single_custody_peer_failure() { + let mut r = TestRig::test_setup(); + // Only run post-PeerDAS + if !r.fork_name.fulu_enabled() { + return; + } + + let (head_root, _) = r.create_unimported_parent_chain(2); + let peer_1 = r.new_connected_supernode_peer(); + // Trigger the request + r.trigger_unknown_block_from_attestation(head_root, peer_1); + + let column_index_to_fail = r.our_custody_indices().first().copied().unwrap(); + r.complete_header_chain(); + + // Progress all blocks_by_range and columns_by_range requests but respond empty for a single + // column index + r.progress_until_no_events( + NO_FILTER, + complete().custody_failure_at_index(column_index_to_fail), ); + r.expect_penalties("custody_failure"); + + // Some peer had a custody failure, but since there's a single peer in the batch we won't issue + // another request yet. 
+ r.expect_no_active_rpc_requests(); + // Ensure that the block components by range request have not failed + r.expect_active_block_components_requests_on_custody_step(); + + // After adding a new peer we will try to fetch from it + let peer_2 = r.new_connected_supernode_peer(); + r.trigger_unknown_block_from_attestation(head_root, peer_2); + // complete this one request without the custody failure now + r.progress_until_no_events(NO_FILTER, complete()); + + r.expect_no_active_rpc_requests(); + r.expect_no_active_block_components_requests(); +} - let last_epoch = advanced_epochs + EXTRA_SYNCED_EPOCHS; - r.complete_and_process_range_sync_until(last_epoch, filter()); +#[test] +fn tree_sync_happy_path() { + let mut r = TestRig::test_setup(); + let (head_root, head_slot) = r.create_unimported_parent_chain(8); + let remote_info = SyncInfo { + finalized_epoch: Epoch::new(0), + finalized_root: Hash256::ZERO, + head_slot, + head_root, + }; + r.add_sync_peer(false, remote_info.clone()); + r.progress_until_no_events(NO_FILTER, complete()); + r.add_sync_peer(true, remote_info); + r.progress_until_no_events(NO_FILTER, complete()); + r.expect_empty_network(); } diff --git a/common/eth2/src/lighthouse/sync_state.rs b/common/eth2/src/lighthouse/sync_state.rs index 0327f7073fa..b6677e5f636 100644 --- a/common/eth2/src/lighthouse/sync_state.rs +++ b/common/eth2/src/lighthouse/sync_state.rs @@ -4,17 +4,13 @@ use types::Slot; /// The current state of the node. #[derive(Clone, Debug, Serialize, Deserialize)] pub enum SyncState { - /// The node is performing a long-range (batch) sync over a finalized chain. - /// In this state, parent lookups are disabled. - SyncingFinalized { start_slot: Slot, target_slot: Slot }, - /// The node is performing a long-range (batch) sync over one or many head chains. - /// In this state parent lookups are disabled. 
- SyncingHead { start_slot: Slot, target_slot: Slot }, + /// The node is syncing one or many chains, either finalized or not + Syncing { start_slot: Slot, target_slot: Slot }, /// The node is undertaking a backfill sync. This occurs when a user has specified a trusted /// state. The node first syncs "forward" by downloading blocks up to the current head as /// specified by its peers. Once completed, the node enters this sync state and attempts to /// download all required historical blocks. - BackFillSyncing { completed: usize, remaining: usize }, + BackFillSyncing, /// The node has completed syncing a finalized chain and is in the process of re-evaluating /// which sync state to progress to. SyncTransition, @@ -43,17 +39,11 @@ impl PartialEq for SyncState { fn eq(&self, other: &Self) -> bool { matches!( (self, other), - ( - SyncState::SyncingFinalized { .. }, - SyncState::SyncingFinalized { .. } - ) | (SyncState::SyncingHead { .. }, SyncState::SyncingHead { .. }) + (SyncState::Syncing { .. }, SyncState::Syncing { .. }) | (SyncState::Synced, SyncState::Synced) | (SyncState::Stalled, SyncState::Stalled) | (SyncState::SyncTransition, SyncState::SyncTransition) - | ( - SyncState::BackFillSyncing { .. }, - SyncState::BackFillSyncing { .. } - ) + | (SyncState::BackFillSyncing, SyncState::BackFillSyncing) ) } } @@ -62,22 +52,10 @@ impl SyncState { /// Returns a boolean indicating the node is currently performing a long-range sync. pub fn is_syncing(&self) -> bool { match self { - SyncState::SyncingFinalized { .. } => true, - SyncState::SyncingHead { .. } => true, + SyncState::Syncing { .. } => true, SyncState::SyncTransition => true, // Backfill doesn't effect any logic, we consider this state, not syncing. - SyncState::BackFillSyncing { .. } => false, - SyncState::Synced => false, - SyncState::Stalled => false, - } - } - - pub fn is_syncing_finalized(&self) -> bool { - match self { - SyncState::SyncingFinalized { .. } => true, - SyncState::SyncingHead { .. 
} => false, - SyncState::SyncTransition => false, - SyncState::BackFillSyncing { .. } => false, + SyncState::BackFillSyncing => false, SyncState::Synced => false, SyncState::Stalled => false, } @@ -87,7 +65,7 @@ impl SyncState { /// /// NOTE: We consider the node synced if it is fetching old historical blocks. pub fn is_synced(&self) -> bool { - matches!(self, SyncState::Synced | SyncState::BackFillSyncing { .. }) + matches!(self, SyncState::Synced | SyncState::BackFillSyncing) } /// Returns true if the node is *stalled*, i.e. has no synced peers. @@ -102,12 +80,11 @@ impl SyncState { impl std::fmt::Display for SyncState { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - SyncState::SyncingFinalized { .. } => write!(f, "Syncing Finalized Chain"), - SyncState::SyncingHead { .. } => write!(f, "Syncing Head Chain"), + SyncState::Syncing { .. } => write!(f, "Syncing"), SyncState::Synced => write!(f, "Synced"), SyncState::Stalled => write!(f, "Stalled"), SyncState::SyncTransition => write!(f, "Evaluating known peers"), - SyncState::BackFillSyncing { .. } => write!(f, "Syncing Historical Blocks"), + SyncState::BackFillSyncing => write!(f, "Syncing Historical Blocks"), } } } diff --git a/consensus/types/src/signed_beacon_block.rs b/consensus/types/src/signed_beacon_block.rs index 85bed35a19c..de572014edc 100644 --- a/consensus/types/src/signed_beacon_block.rs +++ b/consensus/types/src/signed_beacon_block.rs @@ -321,6 +321,10 @@ impl> SignedBeaconBlock .unwrap_or(0) } + pub fn has_data(&self) -> bool { + self.num_expected_blobs() > 0 + } + /// Used for displaying commitments in logs. pub fn commitments_formatted(&self) -> String { let Ok(commitments) = self.message().body().blob_kzg_commitments() else {