From fbab82947307e7e022647dbe1c54bb31b2d5fedb Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Fri, 24 Jan 2025 17:51:36 +0800 Subject: [PATCH 01/64] Remove request tracking inside syncing chains --- .../network/src/sync/backfill_sync/mod.rs | 161 ++++-------------- beacon_node/network/src/sync/manager.rs | 6 +- .../network/src/sync/network_context.rs | 12 +- .../network/src/sync/range_sync/batch.rs | 75 ++++---- .../network/src/sync/range_sync/chain.rs | 155 +++-------------- .../network/src/sync/range_sync/range.rs | 5 +- 6 files changed, 101 insertions(+), 313 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index a3d2c826429..fb2fb5c47f9 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -10,8 +10,7 @@ use crate::network_beacon_processor::ChainSegmentProcessId; use crate::sync::manager::BatchProcessResult; -use crate::sync::network_context::RangeRequestId; -use crate::sync::network_context::SyncNetworkContext; +use crate::sync::network_context::{RangeRequestId, RpcRequestSendError, SyncNetworkContext}; use crate::sync::range_sync::{ BatchConfig, BatchId, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, }; @@ -20,11 +19,10 @@ use beacon_chain::{BeaconChain, BeaconChainTypes}; use lighthouse_network::service::api_types::Id; use lighthouse_network::types::{BackFillState, NetworkGlobals}; use lighthouse_network::{PeerAction, PeerId}; -use rand::seq::SliceRandom; use slog::{crit, debug, error, info, warn}; use std::collections::{ btree_map::{BTreeMap, Entry}, - HashMap, HashSet, + HashSet, }; use std::sync::Arc; use types::{Epoch, EthSpec}; @@ -121,9 +119,6 @@ pub struct BackFillSync { /// Sorted map of batches undergoing some kind of processing. batches: BTreeMap>, - /// List of peers we are currently awaiting a response for. 
- active_requests: HashMap>, - /// The current processing batch, if any. current_processing_batch: Option, @@ -175,7 +170,6 @@ impl BackFillSync { let bfs = BackFillSync { batches: BTreeMap::new(), - active_requests: HashMap::new(), processing_target: current_start, current_start, last_batch_downloaded: false, @@ -290,47 +284,11 @@ impl BackFillSync { /// A peer has disconnected. /// If the peer has active batches, those are considered failed and re-requested. #[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"] - pub fn peer_disconnected( - &mut self, - peer_id: &PeerId, - network: &mut SyncNetworkContext, - ) -> Result<(), BackFillError> { + pub fn peer_disconnected(&mut self, peer_id: &PeerId) -> Result<(), BackFillError> { if matches!(self.state(), BackFillState::Failed) { return Ok(()); } - if let Some(batch_ids) = self.active_requests.remove(peer_id) { - // fail the batches. - for id in batch_ids { - if let Some(batch) = self.batches.get_mut(&id) { - match batch.download_failed(false) { - Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { - self.fail_sync(BackFillError::BatchDownloadFailed(id))?; - } - Ok(BatchOperationOutcome::Continue) => {} - Err(e) => { - self.fail_sync(BackFillError::BatchInvalidState(id, e.0))?; - } - } - // If we have run out of peers in which to retry this batch, the backfill state - // transitions to a paused state. - // We still need to reset the state for all the affected batches, so we should not - // short circuit early. 
- if self.retry_batch_download(network, id).is_err() { - debug!( - self.log, - "Batch could not be retried"; - "batch_id" => id, - "error" => "no synced peers" - ); - } - } else { - debug!(self.log, "Batch not found while removing peer"; - "peer" => %peer_id, "batch" => id) - } - } - } - // Remove the peer from the participation list self.participating_peers.remove(peer_id); Ok(()) @@ -344,7 +302,6 @@ impl BackFillSync { &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, - peer_id: &PeerId, request_id: Id, ) -> Result<(), BackFillError> { if let Some(batch) = self.batches.get_mut(&batch_id) { @@ -357,9 +314,6 @@ impl BackFillSync { return Ok(()); } debug!(self.log, "Batch failed"; "batch_epoch" => batch_id, "error" => "rpc_error"); - if let Some(active_requests) = self.active_requests.get_mut(peer_id) { - active_requests.remove(&batch_id); - } match batch.download_failed(true) { Err(e) => self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)), Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { @@ -406,13 +360,7 @@ impl BackFillSync { return Ok(ProcessResult::Successful); } - // A stream termination has been sent. This batch has ended. Process a completed batch. - // Remove the request from the peer's active batches - self.active_requests - .get_mut(peer_id) - .map(|active_requests| active_requests.remove(&batch_id)); - - match batch.download_completed(blocks) { + match batch.download_completed(blocks, *peer_id) { Ok(received) => { let awaiting_batches = self.processing_target.saturating_sub(batch_id) / BACKFILL_EPOCHS_PER_BATCH; @@ -458,7 +406,6 @@ impl BackFillSync { self.set_state(BackFillState::Failed); // Remove all batches and active requests and participating peers. 
self.batches.clear(); - self.active_requests.clear(); self.participating_peers.clear(); self.restart_failed_sync = false; @@ -574,7 +521,7 @@ impl BackFillSync { } }; - let Some(peer) = batch.current_peer() else { + let Some(peer) = batch.processing_peer() else { self.fail_sync(BackFillError::BatchInvalidState( batch_id, String::from("Peer does not exist"), @@ -642,6 +589,8 @@ impl BackFillSync { ); for peer in self.participating_peers.drain() { + // TODO(das): this participating peers is broken with custody columns backfill, consider + // a different mechanism network.report_peer(peer, *penalty, "backfill_batch_failed"); } self.fail_sync(BackFillError::BatchProcessingFailed(batch_id)) @@ -787,12 +736,7 @@ impl BackFillSync { } } } - BatchState::Downloading(peer, ..) => { - // remove this batch from the peer's active requests - if let Some(active_requests) = self.active_requests.get_mut(peer) { - active_requests.remove(&id); - } - } + BatchState::Downloading(..) => {} BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => { crit!( self.log, @@ -883,39 +827,9 @@ impl BackFillSync { network: &mut SyncNetworkContext, batch_id: BatchId, ) -> Result<(), BackFillError> { - let Some(batch) = self.batches.get_mut(&batch_id) else { - return Ok(()); - }; - - // Find a peer to request the batch - let failed_peers = batch.failed_peers(); - - let new_peer = self - .network_globals - .peers - .read() - .synced_peers() - .map(|peer| { - ( - failed_peers.contains(peer), - self.active_requests.get(peer).map(|v| v.len()).unwrap_or(0), - rand::random::(), - *peer, - ) - }) - // Sort peers prioritizing unrelated peers with less active requests. 
- .min() - .map(|(_, _, _, peer)| peer); - - if let Some(peer) = new_peer { - self.participating_peers.insert(peer); - self.send_batch(network, batch_id, peer) - } else { - // If we are here the chain has no more synced peers - info!(self.log, "Backfill sync paused"; "reason" => "insufficient_synced_peers"); - self.set_state(BackFillState::Paused); - Err(BackFillError::Paused) - } + // TODO(das): previously here we de-prioritize peers that had failed to download or + // process a batch + self.send_batch(network, batch_id) } /// Requests the batch assigned to the given id from a given peer. @@ -923,41 +837,46 @@ impl BackFillSync { &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, - peer: PeerId, ) -> Result<(), BackFillError> { if let Some(batch) = self.batches.get_mut(&batch_id) { + let synced_peers = self + .network_globals + .peers + .read() + .synced_peers() + .cloned() + .collect::>(); + let (request, is_blob_batch) = batch.to_blocks_by_range_request(); match network.block_components_by_range_request( - peer, is_blob_batch, request, RangeRequestId::BackfillSync { batch_id }, + &synced_peers, ) { Ok(request_id) => { // inform the batch about the new request - if let Err(e) = batch.start_downloading_from_peer(peer, request_id) { + if let Err(e) = batch.start_downloading_from_peer(request_id) { return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)); } debug!(self.log, "Requesting batch"; "epoch" => batch_id, &batch); - // register the batch for this peer - self.active_requests - .entry(peer) - .or_default() - .insert(batch_id); return Ok(()); } + Err(RpcRequestSendError::NoCustodyPeers) => { + // If we are here the chain has no more synced peers + info!(self.log, "Backfill sync paused"; "reason" => "insufficient_synced_peers"); + self.set_state(BackFillState::Paused); + return Err(BackFillError::Paused); + } Err(e) => { // NOTE: under normal conditions this shouldn't happen but we handle it anyway warn!(self.log, "Could not send 
batch request"; "batch_id" => batch_id, "error" => ?e, &batch); // register the failed download and check if the batch can be retried - if let Err(e) = batch.start_downloading_from_peer(peer, 1) { + if let Err(e) = batch.start_downloading_from_peer(1) { return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)); } - self.active_requests - .get_mut(&peer) - .map(|request| request.remove(&batch_id)); match batch.download_failed(true) { Err(e) => { @@ -1012,35 +931,15 @@ impl BackFillSync { } // find the next pending batch and request it from the peer - - // randomize the peers for load balancing - let mut rng = rand::thread_rng(); - let mut idle_peers = self - .network_globals - .peers - .read() - .synced_peers() - .filter(|peer_id| { - self.active_requests - .get(peer_id) - .map(|requests| requests.is_empty()) - .unwrap_or(true) - }) - .cloned() - .collect::>(); - - idle_peers.shuffle(&mut rng); - - while let Some(peer) = idle_peers.pop() { + loop { if let Some(batch_id) = self.include_next_batch(network) { // send the batch - self.send_batch(network, batch_id, peer)?; + self.send_batch(network, batch_id)?; } else { // No more batches, simply stop return Ok(()); } } - Ok(()) } /// Creates the next required batch from the chain. If there are no more batches required, diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index fc31e837277..ac8bda31281 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -517,9 +517,7 @@ impl SyncManager { // Remove peer from all data structures self.range_sync.peer_disconnect(&mut self.network, peer_id); - let _ = self - .backfill_sync - .peer_disconnected(peer_id, &mut self.network); + let _ = self.backfill_sync.peer_disconnected(peer_id); self.block_lookups.peer_disconnected(peer_id); // Regardless of the outcome, we update the sync status. 
@@ -1305,7 +1303,7 @@ impl SyncManager { } RangeRequestId::BackfillSync { batch_id } => match self .backfill_sync - .inject_error(&mut self.network, batch_id, &peer_id, range_request_id.id) + .inject_error(&mut self.network, batch_id, range_request_id.id) { Ok(_) => {} Err(_) => self.update_sync_state(), diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 0cd21de7f41..8d36fe0aa89 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -353,10 +353,10 @@ impl SyncNetworkContext { /// A blocks by range request sent by the range sync algorithm pub fn block_components_by_range_request( &mut self, - peer_id: PeerId, batch_type: ByRangeRequestType, request: BlocksByRangeRequest, requester: RangeRequestId, + peers: &HashSet, ) -> Result { // Create the overall components_by_range request ID before its individual components let id = ComponentsByRangeRequestId { @@ -364,6 +364,16 @@ impl SyncNetworkContext { requester, }; + let Some(peer_id) = peers + .iter() + .map(|peer| (rand::random::(), *peer)) + .min() + .map(|(_, peer)| peer) + else { + // TODO(das): is it safe to error here? + return Err(RpcRequestSendError::NoCustodyPeers); + }; + let _blocks_req_id = self.send_blocks_by_range_request(peer_id, request.clone(), id)?; let blobs_req_id = if matches!(batch_type, ByRangeRequestType::BlocksAndBlobs) { diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 818fde07b83..d51e6e16a43 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -104,7 +104,7 @@ pub struct BatchInfo { /// Number of processing attempts that have failed but we do not count. non_faulty_processing_attempts: u8, /// The number of download retries this batch has undergone due to a failed request. 
- failed_download_attempts: Vec, + failed_download_attempts: usize, /// State of the batch. state: BatchState, /// Whether this batch contains all blocks or all blocks and blobs. @@ -118,7 +118,7 @@ pub enum BatchState { /// The batch has failed either downloading or processing, but can be requested again. AwaitingDownload, /// The batch is being downloaded. - Downloading(PeerId, Id), + Downloading(Id), /// The batch has been completely downloaded and is ready for processing. AwaitingProcessing(PeerId, Vec>, Instant), /// The batch is being processed. @@ -164,7 +164,7 @@ impl BatchInfo { start_slot, end_slot, failed_processing_attempts: Vec::new(), - failed_download_attempts: Vec::new(), + failed_download_attempts: 0, non_faulty_processing_attempts: 0, state: BatchState::AwaitingDownload, batch_type, @@ -174,45 +174,35 @@ impl BatchInfo { /// Gives a list of peers from which this batch has had a failed download or processing /// attempt. - pub fn failed_peers(&self) -> HashSet { - let mut peers = HashSet::with_capacity( - self.failed_processing_attempts.len() + self.failed_download_attempts.len(), - ); - - for attempt in &self.failed_processing_attempts { - peers.insert(attempt.peer_id); - } - - for download in &self.failed_download_attempts { - peers.insert(*download); - } - - peers + pub fn failed_processing_peers(&self) -> HashSet { + self.failed_processing_attempts + .iter() + .map(|attempt| attempt.peer_id) + .collect() } /// Return the number of times this batch has failed downloading and failed processing, in this /// order. pub fn failed_attempts(&self) -> (usize, usize) { ( - self.failed_download_attempts.len(), + self.failed_download_attempts, self.failed_processing_attempts.len(), ) } /// Verifies if an incoming block belongs to this batch. 
pub fn is_expecting_block(&self, request_id: &Id) -> bool { - if let BatchState::Downloading(_, expected_id) = &self.state { + if let BatchState::Downloading(expected_id) = &self.state { return expected_id == request_id; } false } /// Returns the peer that is currently responsible for progressing the state of the batch. - pub fn current_peer(&self) -> Option<&PeerId> { + pub fn processing_peer(&self) -> Option<&PeerId> { match &self.state { - BatchState::AwaitingDownload | BatchState::Failed => None, - BatchState::Downloading(peer_id, _) - | BatchState::AwaitingProcessing(peer_id, _, _) + BatchState::AwaitingDownload | BatchState::Failed | BatchState::Downloading(..) => None, + BatchState::AwaitingProcessing(peer_id, _, _) | BatchState::Processing(Attempt { peer_id, .. }) | BatchState::AwaitingValidation(Attempt { peer_id, .. }) => Some(peer_id), BatchState::Poisoned => unreachable!("Poisoned batch"), @@ -250,8 +240,7 @@ impl BatchInfo { match self.state { BatchState::Poisoned => unreachable!("Poisoned batch"), BatchState::Failed => BatchOperationOutcome::Failed { - blacklist: self.failed_processing_attempts.len() - > self.failed_download_attempts.len(), + blacklist: self.failed_processing_attempts.len() > self.failed_download_attempts, }, _ => BatchOperationOutcome::Continue, } @@ -271,12 +260,13 @@ impl BatchInfo { pub fn download_completed( &mut self, blocks: Vec>, + peer: PeerId, ) -> Result< usize, /* Received blocks */ Result<(Slot, Slot, BatchOperationOutcome), WrongState>, > { match self.state.poison() { - BatchState::Downloading(peer, _request_id) => { + BatchState::Downloading(_request_id) => { let received = blocks.len(); self.state = BatchState::AwaitingProcessing(peer, blocks, Instant::now()); Ok(received) @@ -303,19 +293,18 @@ impl BatchInfo { mark_failed: bool, ) -> Result { match self.state.poison() { - BatchState::Downloading(peer, _request_id) => { + BatchState::Downloading(_request_id) => { // register the attempt and check if the batch can be 
tried again if mark_failed { - self.failed_download_attempts.push(peer); + self.failed_download_attempts += 1; } - self.state = if self.failed_download_attempts.len() - >= B::max_batch_download_attempts() as usize - { - BatchState::Failed - } else { - // drop the blocks - BatchState::AwaitingDownload - }; + self.state = + if self.failed_download_attempts >= B::max_batch_download_attempts() as usize { + BatchState::Failed + } else { + // drop the blocks + BatchState::AwaitingDownload + }; Ok(self.outcome()) } BatchState::Poisoned => unreachable!("Poisoned batch"), @@ -329,14 +318,10 @@ impl BatchInfo { } } - pub fn start_downloading_from_peer( - &mut self, - peer: PeerId, - request_id: Id, - ) -> Result<(), WrongState> { + pub fn start_downloading_from_peer(&mut self, request_id: Id) -> Result<(), WrongState> { match self.state.poison() { BatchState::AwaitingDownload => { - self.state = BatchState::Downloading(peer, request_id); + self.state = BatchState::Downloading(request_id); Ok(()) } BatchState::Poisoned => unreachable!("Poisoned batch"), @@ -483,7 +468,7 @@ impl slog::KV for BatchInfo { "end_slot", serializer, )?; - serializer.emit_usize("downloaded", self.failed_download_attempts.len())?; + serializer.emit_usize("downloaded", self.failed_download_attempts)?; serializer.emit_usize("processed", self.failed_processing_attempts.len())?; serializer.emit_u8("processed_no_penalty", self.non_faulty_processing_attempts)?; serializer.emit_arguments("state", &format_args!("{:?}", self.state))?; @@ -508,8 +493,8 @@ impl std::fmt::Debug for BatchState { BatchState::AwaitingProcessing(ref peer, ref blocks, _) => { write!(f, "AwaitingProcessing({}, {} blocks)", peer, blocks.len()) } - BatchState::Downloading(peer, request_id) => { - write!(f, "Downloading({}, {})", peer, request_id) + BatchState::Downloading(request_id) => { + write!(f, "Downloading({})", request_id) } BatchState::Poisoned => f.write_str("Poisoned"), } diff --git 
a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 4eb73f54839..b9f68eeac89 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -7,12 +7,9 @@ use crate::sync::network_context::RangeRequestId; use crate::sync::{network_context::SyncNetworkContext, BatchOperationOutcome, BatchProcessResult}; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::BeaconChainTypes; -use fnv::FnvHashMap; use lighthouse_network::service::api_types::Id; use lighthouse_network::{PeerAction, PeerId}; use metrics::set_int_gauge; -use rand::seq::SliceRandom; -use rand::Rng; use slog::{crit, debug, o, warn}; use std::collections::{btree_map::Entry, BTreeMap, HashSet}; use strum::IntoStaticStr; @@ -90,7 +87,7 @@ pub struct SyncingChain { /// The peers that agree on the `target_head_slot` and `target_head_root` as a canonical chain /// and thus available to download this chain from, as well as the batches we are currently /// requesting. - peers: FnvHashMap>, + peers: HashSet, /// Starting epoch of the next batch that needs to be downloaded. to_be_downloaded: BatchId, @@ -136,9 +133,6 @@ impl SyncingChain { chain_type: SyncingChainType, log: &slog::Logger, ) -> Self { - let mut peers = FnvHashMap::default(); - peers.insert(peer_id, Default::default()); - SyncingChain { id, chain_type, @@ -146,7 +140,7 @@ impl SyncingChain { target_head_slot, target_head_root, batches: BTreeMap::new(), - peers, + peers: HashSet::from_iter([peer_id]), to_be_downloaded: start_epoch, processing_target: start_epoch, optimistic_start: None, @@ -174,7 +168,7 @@ impl SyncingChain { /// Peers currently syncing this chain. pub fn peers(&self) -> impl Iterator + '_ { - self.peers.keys().cloned() + self.peers.iter().cloned() } /// Progress in epochs made by the chain @@ -194,30 +188,8 @@ impl SyncingChain { /// Removes a peer from the chain. 
/// If the peer has active batches, those are considered failed and re-requested. - pub fn remove_peer( - &mut self, - peer_id: &PeerId, - network: &mut SyncNetworkContext, - ) -> ProcessingResult { - if let Some(batch_ids) = self.peers.remove(peer_id) { - // fail the batches. - for id in batch_ids { - if let Some(batch) = self.batches.get_mut(&id) { - if let BatchOperationOutcome::Failed { blacklist } = - batch.download_failed(true)? - { - return Err(RemoveChain::ChainFailed { - blacklist, - failing_batch: id, - }); - } - self.retry_batch_download(network, id)?; - } else { - debug!(self.log, "Batch not found while removing peer"; - "peer" => %peer_id, "batch" => id) - } - } - } + pub fn remove_peer(&mut self, peer_id: &PeerId) -> ProcessingResult { + self.peers.remove(peer_id); if self.peers.is_empty() { Err(RemoveChain::EmptyPeerPool) @@ -268,11 +240,9 @@ impl SyncingChain { { // A stream termination has been sent. This batch has ended. Process a completed batch. // Remove the request from the peer's active batches - self.peers - .get_mut(peer_id) - .map(|active_requests| active_requests.remove(&batch_id)); - match batch.download_completed(blocks) { + // TODO(das): should use peer group here + match batch.download_completed(blocks, *peer_id) { Ok(received) => { let awaiting_batches = batch_id .saturating_sub(self.optimistic_start.unwrap_or(self.processing_target)) @@ -489,7 +459,7 @@ impl SyncingChain { } }; - let peer = batch.current_peer().cloned().ok_or_else(|| { + let peer = batch.processing_peer().cloned().ok_or_else(|| { RemoveChain::WrongBatchState(format!( "Processing target is in wrong state: {:?}", batch.state(), @@ -590,7 +560,7 @@ impl SyncingChain { "batch_epoch"=> batch_id, ); - for (peer, _) in self.peers.drain() { + for peer in self.peers.drain() { network.report_peer(peer, *penalty, "faulty_chain"); } Err(RemoveChain::ChainFailed { @@ -698,12 +668,7 @@ impl SyncingChain { } } } - BatchState::Downloading(peer, ..) 
=> { - // remove this batch from the peer's active requests - if let Some(active_batches) = self.peers.get_mut(peer) { - active_batches.remove(&id); - } - } + BatchState::Downloading(..) => {} BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => crit!( self.log, "batch indicates inconsistent chain state while advancing chain" @@ -845,13 +810,9 @@ impl SyncingChain { network: &mut SyncNetworkContext, peer_id: PeerId, ) -> ProcessingResult { - // add the peer without overwriting its active requests - if self.peers.entry(peer_id).or_default().is_empty() { - // Either new or not, this peer is idle, try to request more batches - self.request_batches(network) - } else { - Ok(KeepChain) - } + self.peers.insert(peer_id); + // Attempt to request more batches regardless of peer status + self.request_batches(network) } /// An RPC error has occurred. @@ -892,9 +853,6 @@ impl SyncingChain { "request_id" => %request_id, "batch_state" => batch_state ); - if let Some(active_requests) = self.peers.get_mut(peer_id) { - active_requests.remove(&batch_id); - } if let BatchOperationOutcome::Failed { blacklist } = batch.download_failed(true)? { return Err(RemoveChain::ChainFailed { blacklist, @@ -922,34 +880,9 @@ impl SyncingChain { network: &mut SyncNetworkContext, batch_id: BatchId, ) -> ProcessingResult { - let Some(batch) = self.batches.get_mut(&batch_id) else { - return Ok(KeepChain); - }; - - // Find a peer to request the batch - let failed_peers = batch.failed_peers(); - - let new_peer = self - .peers - .iter() - .map(|(peer, requests)| { - ( - failed_peers.contains(peer), - requests.len(), - rand::thread_rng().gen::(), - *peer, - ) - }) - // Sort peers prioritizing unrelated peers with less active requests. 
- .min() - .map(|(_, _, _, peer)| peer); - - if let Some(peer) = new_peer { - self.send_batch(network, batch_id, peer) - } else { - // If we are here the chain has no more peers - Err(RemoveChain::EmptyPeerPool) - } + // TODO(das): Previously here we de-prioritize peers that had either failed to download + // a batch or failed in processing. + self.send_batch(network, batch_id) } /// Requests the batch assigned to the given id from a given peer. @@ -957,23 +890,22 @@ impl SyncingChain { &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, - peer: PeerId, ) -> ProcessingResult { let batch_state = self.visualize_batch_state(); if let Some(batch) = self.batches.get_mut(&batch_id) { let (request, batch_type) = batch.to_blocks_by_range_request(); match network.block_components_by_range_request( - peer, batch_type, request, RangeRequestId::RangeSync { chain_id: self.id, batch_id, }, + &self.peers, ) { Ok(request_id) => { // inform the batch about the new request - batch.start_downloading_from_peer(peer, request_id)?; + batch.start_downloading_from_peer(request_id)?; if self .optimistic_start .map(|epoch| epoch == batch_id) @@ -983,30 +915,14 @@ impl SyncingChain { } else { debug!(self.log, "Requesting batch"; "epoch" => batch_id, &batch, "batch_state" => batch_state); } - // register the batch for this peer - return self - .peers - .get_mut(&peer) - .map(|requests| { - requests.insert(batch_id); - Ok(KeepChain) - }) - .unwrap_or_else(|| { - Err(RemoveChain::WrongChainState(format!( - "Sending batch to a peer that is not in the chain: {}", - peer - ))) - }); + return Ok(KeepChain); } Err(e) => { // NOTE: under normal conditions this shouldn't happen but we handle it anyway warn!(self.log, "Could not send batch request"; "batch_id" => batch_id, "error" => ?e, &batch); // register the failed download and check if the batch can be retried - batch.start_downloading_from_peer(peer, 1)?; // fake request_id is not relevant - self.peers - .get_mut(&peer) - 
.map(|request| request.remove(&batch_id)); + batch.start_downloading_from_peer(1)?; // fake request_id = 1 is not relevant match batch.download_failed(true)? { BatchOperationOutcome::Failed { blacklist } => { return Err(RemoveChain::ChainFailed { @@ -1054,21 +970,6 @@ impl SyncingChain { // find the next pending batch and request it from the peer - // randomize the peers for load balancing - let mut rng = rand::thread_rng(); - let mut idle_peers = self - .peers - .iter() - .filter_map(|(peer, requests)| { - if requests.is_empty() { - Some(*peer) - } else { - None - } - }) - .collect::>(); - idle_peers.shuffle(&mut rng); - // check if we have the batch for our optimistic start. If not, request it first. // We wait for this batch before requesting any other batches. if let Some(epoch) = self.optimistic_start { @@ -1081,27 +982,23 @@ impl SyncingChain { } if let Entry::Vacant(entry) = self.batches.entry(epoch) { - if let Some(peer) = idle_peers.pop() { - let batch_type = network.batch_type(epoch); - let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH, batch_type); - entry.insert(optimistic_batch); - self.send_batch(network, epoch, peer)?; - } + let batch_type = network.batch_type(epoch); + let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH, batch_type); + entry.insert(optimistic_batch); + self.send_batch(network, epoch)?; } return Ok(KeepChain); } - while let Some(peer) = idle_peers.pop() { + loop { if let Some(batch_id) = self.include_next_batch(network) { // send the batch - self.send_batch(network, batch_id, peer)?; + self.send_batch(network, batch_id)?; } else { // No more batches, simply stop return Ok(KeepChain); } } - - Ok(KeepChain) } /// Checks all sampling column subnets for peers. 
Returns `true` if there is at least one peer in diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index 78679403bb4..3b641e13bc1 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs @@ -274,9 +274,8 @@ where /// for this peer. If so we mark the batch as failed. The batch may then hit it's maximum /// retries. In this case, we need to remove the chain. fn remove_peer(&mut self, network: &mut SyncNetworkContext, peer_id: &PeerId) { - for (removed_chain, sync_type, remove_reason) in self - .chains - .call_all(|chain| chain.remove_peer(peer_id, network)) + for (removed_chain, sync_type, remove_reason) in + self.chains.call_all(|chain| chain.remove_peer(peer_id)) { self.on_chain_removed( removed_chain, From 2faf5f105a9120740003744a1e427c9b23d4623d Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Fri, 24 Jan 2025 17:54:02 +0700 Subject: [PATCH 02/64] Prioritize by range peers in network context --- .../network/src/sync/network_context.rs | 50 ++++++++++++++++++- .../src/sync/network_context/requests.rs | 4 ++ 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 8d36fe0aa89..c7cbbdecd87 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -350,6 +350,42 @@ impl SyncNetworkContext { } } + fn active_request_count_by_peer(&self) -> HashMap { + let Self { + network_send: _, + request_id: _, + blocks_by_root_requests, + blobs_by_root_requests, + data_columns_by_root_requests, + blocks_by_range_requests, + blobs_by_range_requests, + data_columns_by_range_requests, + // custody_by_root_requests is a meta request of data_columns_by_root_requests + custody_by_root_requests: _, + // components_by_range_requests is a meta request of various 
_by_range requests + components_by_range_requests: _, + execution_engine_state: _, + network_beacon_processor: _, + chain: _, + log: _, + } = self; + + let mut active_request_count_by_peer = HashMap::::new(); + + for peer_id in blocks_by_root_requests + .iter_request_peers() + .chain(blobs_by_root_requests.iter_request_peers()) + .chain(data_columns_by_root_requests.iter_request_peers()) + .chain(blocks_by_range_requests.iter_request_peers()) + .chain(blobs_by_range_requests.iter_request_peers()) + .chain(data_columns_by_range_requests.iter_request_peers()) + { + *active_request_count_by_peer.entry(peer_id).or_default() += 1; + } + + active_request_count_by_peer + } + /// A blocks by range request sent by the range sync algorithm pub fn block_components_by_range_request( &mut self, @@ -364,11 +400,21 @@ impl SyncNetworkContext { requester, }; + let active_request_count_by_peer = self.active_request_count_by_peer(); + let Some(peer_id) = peers .iter() - .map(|peer| (rand::random::(), *peer)) + .map(|peer| { + ( + // Prefer peers with less overall requests + active_request_count_by_peer.get(peer).copied().unwrap_or(0), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::random::(), + *peer, + ) + }) .min() - .map(|(_, peer)| peer) + .map(|(_, _, peer)| peer) else { // TODO(das): is it safe to error here? 
return Err(RpcRequestSendError::NoCustodyPeers); diff --git a/beacon_node/network/src/sync/network_context/requests.rs b/beacon_node/network/src/sync/network_context/requests.rs index c9b85e47b69..963b633ed6d 100644 --- a/beacon_node/network/src/sync/network_context/requests.rs +++ b/beacon_node/network/src/sync/network_context/requests.rs @@ -179,6 +179,10 @@ impl ActiveRequests { .collect() } + pub fn iter_request_peers(&self) -> impl Iterator + '_ { + self.requests.values().map(|request| request.peer_id) + } + pub fn len(&self) -> usize { self.requests.len() } From 0e13a8ddc0d5633256c4775a6ec93d89fec9f9c2 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sat, 25 Jan 2025 15:20:18 +0400 Subject: [PATCH 03/64] Prioritize custody peers for columns by range --- .../lighthouse_network/src/types/globals.rs | 14 ++++ .../network/src/sync/network_context.rs | 73 +++++++++++++------ .../src/sync/network_context/custody.rs | 5 ++ 3 files changed, 68 insertions(+), 24 deletions(-) diff --git a/beacon_node/lighthouse_network/src/types/globals.rs b/beacon_node/lighthouse_network/src/types/globals.rs index 2800b75133b..9c93e4422a2 100644 --- a/beacon_node/lighthouse_network/src/types/globals.rs +++ b/beacon_node/lighthouse_network/src/types/globals.rs @@ -184,6 +184,20 @@ impl NetworkGlobals { .collect::>() } + /// Returns true if the peer is known and is a custodial of `column_index` + pub fn is_custody_peer_of(&self, column_index: ColumnIndex, peer_id: &PeerId) -> bool { + self.peers + .read() + .peer_info(peer_id) + .map(|info| { + info.is_assigned_to_custody_subnet(&DataColumnSubnetId::from_column_index( + column_index, + &self.spec, + )) + }) + .unwrap_or(false) + } + /// Returns the TopicConfig to compute the set of Gossip topics for a given fork pub fn as_topic_config(&self) -> TopicConfig { TopicConfig { diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 
c7cbbdecd87..af142593b15 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -402,7 +402,7 @@ impl SyncNetworkContext { let active_request_count_by_peer = self.active_request_count_by_peer(); - let Some(peer_id) = peers + let Some(block_peer) = peers .iter() .map(|peer| { ( @@ -410,21 +410,21 @@ impl SyncNetworkContext { active_request_count_by_peer.get(peer).copied().unwrap_or(0), // Random factor to break ties, otherwise the PeerID breaks ties rand::random::(), - *peer, + peer, ) }) .min() - .map(|(_, _, peer)| peer) + .map(|(_, _, peer)| *peer) else { // TODO(das): is it safe to error here? return Err(RpcRequestSendError::NoCustodyPeers); }; - let _blocks_req_id = self.send_blocks_by_range_request(peer_id, request.clone(), id)?; + let _blocks_req_id = self.send_blocks_by_range_request(block_peer, request.clone(), id)?; let blobs_req_id = if matches!(batch_type, ByRangeRequestType::BlocksAndBlobs) { Some(self.send_blobs_by_range_request( - peer_id, + block_peer, BlobsByRangeRequest { start_slot: *request.start_slot(), count: *request.count(), @@ -440,12 +440,16 @@ impl SyncNetworkContext { let column_indexes = self.network_globals().sampling_columns.clone(); let data_column_requests = self - .make_columns_by_range_requests(request, &column_indexes)? + .select_columns_by_range_peers_to_request(&column_indexes, peers)? 
.into_iter() - .map(|(peer_id, columns_by_range_request)| { + .map(|(peer_id, columns)| { self.send_data_columns_by_range_request( peer_id, - columns_by_range_request, + DataColumnsByRangeRequest { + start_slot: *request.start_slot(), + count: *request.count(), + columns, + }, id, ) }) @@ -470,18 +474,44 @@ impl SyncNetworkContext { Ok(id.id) } - fn make_columns_by_range_requests( + fn select_columns_by_range_peers_to_request( &self, - request: BlocksByRangeRequest, custody_indexes: &HashSet, - ) -> Result, RpcRequestSendError> { - let mut peer_id_to_request_map = HashMap::new(); + peers: &HashSet, + ) -> Result>, RpcRequestSendError> { + let mut peer_id_to_request_map = HashMap::>::new(); + + // Re-compute here to account for the block peer + let active_request_count_by_peer = self.active_request_count_by_peer(); for column_index in custody_indexes { - // TODO(das): The peer selection logic here needs to be improved - we should probably - // avoid retrying from failed peers, however `BatchState` currently only tracks the peer - // serving the blocks. - let Some(custody_peer) = self.get_random_custodial_peer(*column_index) else { + // Strictly consider peers that are custodials of this column AND are part of this + // syncing chain. If the forward range sync chain has few peers, it's likely that this + // function will not be able to find peers on our custody columns. 
+ let Some(custody_peer) = peers + .iter() + .filter(|peer| { + self.network_globals() + .is_custody_peer_of(*column_index, peer) + }) + .map(|peer| { + ( + // Prefer peers with less overall requests + // Also account for requests that are not yet issued tracked in peer_id_to_request_map + active_request_count_by_peer.get(peer).copied().unwrap_or(0) + + peer_id_to_request_map + .get(peer) + .map(|columns| columns.len()) + .unwrap_or(0), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::random::(), + peer, + ) + }) + .min() + .map(|(_, _, peer)| *peer) + else { + // TODO(das): is it safe to error here? // TODO(das): this will be pretty bad UX. To improve we should: // - Attempt to fetch custody requests first, before requesting blocks // - Handle the no peers case gracefully, maybe add some timeout and give a few @@ -490,15 +520,10 @@ impl SyncNetworkContext { return Err(RpcRequestSendError::NoCustodyPeers); }; - let columns_by_range_request = peer_id_to_request_map + peer_id_to_request_map .entry(custody_peer) - .or_insert_with(|| DataColumnsByRangeRequest { - start_slot: *request.start_slot(), - count: *request.count(), - columns: vec![], - }); - - columns_by_range_request.columns.push(*column_index); + .or_default() + .push(*column_index); } Ok(peer_id_to_request_map) diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody.rs index 8a29545c21d..ed6cbcad39f 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody.rs @@ -247,6 +247,11 @@ impl ActiveCustodyRequest { .or_default() += 1; } + // We draw from the total set of peers, but prioritize those peers who we have + // received an attestation / status / block message claiming to have imported the + // lookup. 
The frequency of those messages is low, so drawing only from lookup_peers + // could cause many lookups to take much longer or fail as they don't have enough + // custody peers on a given column let mut priorized_peers = custodial_peers .iter() .map(|peer| { From 70e6066199bcc1ea3ec8b68bc1fb11796a483a24 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sat, 25 Jan 2025 16:37:42 +0400 Subject: [PATCH 04/64] Explicit error handling of the no peers error case --- .../network/src/sync/backfill_sync/mod.rs | 48 +++++++++-------- .../network/src/sync/network_context.rs | 53 ++++++++++++++----- .../src/sync/network_context/custody.rs | 5 +- .../network/src/sync/range_sync/chain.rs | 44 +++++++++------ 4 files changed, 96 insertions(+), 54 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index fb2fb5c47f9..8e8f39f3b34 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -863,33 +863,35 @@ impl BackFillSync { return Ok(()); } - Err(RpcRequestSendError::NoCustodyPeers) => { - // If we are here the chain has no more synced peers - info!(self.log, "Backfill sync paused"; "reason" => "insufficient_synced_peers"); - self.set_state(BackFillState::Paused); - return Err(BackFillError::Paused); - } - Err(e) => { - // NOTE: under normal conditions this shouldn't happen but we handle it anyway - warn!(self.log, "Could not send batch request"; - "batch_id" => batch_id, "error" => ?e, &batch); - // register the failed download and check if the batch can be retried - if let Err(e) = batch.start_downloading_from_peer(1) { - return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)); + Err(e) => match e { + RpcRequestSendError::NoPeer(no_peer) => { + // If we are here the chain has no more synced peers + info!(self.log, "Backfill sync paused"; "reason" => ?no_peer); + 
self.set_state(BackFillState::Paused); + return Err(BackFillError::Paused); } - - match batch.download_failed(true) { - Err(e) => { - self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))? - } - Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { - self.fail_sync(BackFillError::BatchDownloadFailed(batch_id))? + RpcRequestSendError::InternalError(e) => { + // NOTE: under normal conditions this shouldn't happen but we handle it anyway + warn!(self.log, "Could not send batch request"; + "batch_id" => batch_id, "error" => ?e, &batch); + // register the failed download and check if the batch can be retried + if let Err(e) = batch.start_downloading_from_peer(1) { + return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)); } - Ok(BatchOperationOutcome::Continue) => { - return self.retry_batch_download(network, batch_id) + + match batch.download_failed(true) { + Err(e) => { + self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))? + } + Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { + self.fail_sync(BackFillError::BatchDownloadFailed(batch_id))? + } + Ok(BatchOperationOutcome::Continue) => { + return self.retry_batch_download(network, batch_id) + } } } - } + }, } } diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index af142593b15..b6ae74af33a 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -79,11 +79,18 @@ pub enum RpcResponseError { #[derive(Debug, PartialEq, Eq)] pub enum RpcRequestSendError { - /// Network channel send failed - NetworkSendError, - NoCustodyPeers, - CustodyRequestError(custody::Error), - SlotClockError, + /// No peer available matching the required criteria + NoPeer(NoPeerError), + /// These errors should never happen, including unreachable custody errors or network send + /// errors. 
+ InternalError(String), +} + +/// Type of peer missing that caused a `RpcRequestSendError::NoPeers` +#[derive(Debug, PartialEq, Eq)] +pub enum NoPeerError { + BlockPeer, + CustodyPeer(ColumnIndex), } #[derive(Debug, PartialEq, Eq)] @@ -417,7 +424,7 @@ impl SyncNetworkContext { .map(|(_, _, peer)| *peer) else { // TODO(das): is it safe to error here? - return Err(RpcRequestSendError::NoCustodyPeers); + return Err(RpcRequestSendError::NoPeer(NoPeerError::BlockPeer)); }; let _blocks_req_id = self.send_blocks_by_range_request(block_peer, request.clone(), id)?; @@ -517,7 +524,9 @@ impl SyncNetworkContext { // - Handle the no peers case gracefully, maybe add some timeout and give a few // minutes / seconds to the peer manager to locate peers on this subnet before // abandoing progress on the chain completely. - return Err(RpcRequestSendError::NoCustodyPeers); + return Err(RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer( + *column_index, + ))); }; peer_id_to_request_map @@ -642,7 +651,7 @@ impl SyncNetworkContext { request: RequestType::BlocksByRoot(request.into_request(&self.fork_context)), request_id: AppRequestId::Sync(SyncRequestId::SingleBlock { id }), }) - .map_err(|_| RpcRequestSendError::NetworkSendError)?; + .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; self.blocks_by_root_requests.insert( id, @@ -724,7 +733,7 @@ impl SyncNetworkContext { request: RequestType::BlobsByRoot(request.clone().into_request(&self.fork_context)), request_id: AppRequestId::Sync(SyncRequestId::SingleBlob { id }), }) - .map_err(|_| RpcRequestSendError::NetworkSendError)?; + .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; self.blobs_by_root_requests.insert( id, @@ -837,7 +846,25 @@ impl SyncNetworkContext { self.custody_by_root_requests.insert(requester, request); Ok(LookupRequestResult::RequestSent(req_id)) } - Err(e) => Err(RpcRequestSendError::CustodyRequestError(e)), + Err(e) => Err(match e { + 
CustodyRequestError::NoPeer(column_index) => { + RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer(column_index)) + } + // - TooManyFailures: `request` has just been created, it's count of download_failures + // is 0 here + // - BadState: Should never happen, a bad state can only happen when handling a + // network response + // - UnexpectedRequestId: Never happens: this Err is only constructed handling a + // download or processing response + // - SendFailed: Should never happen unless in a bad drop sequence when shutting + // down the node + e @ (CustodyRequestError::TooManyFailures + | CustodyRequestError::BadState { .. } + | CustodyRequestError::UnexpectedRequestId { .. } + | CustodyRequestError::SendFailed { .. }) => { + RpcRequestSendError::InternalError(format!("{e:?}")) + } + }), } } @@ -866,7 +893,7 @@ impl SyncNetworkContext { request: RequestType::BlocksByRange(request.clone().into()), request_id: AppRequestId::Sync(SyncRequestId::BlocksByRange(id)), }) - .map_err(|_| RpcRequestSendError::NetworkSendError)?; + .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; self.blocks_by_range_requests.insert( id, @@ -907,7 +934,7 @@ impl SyncNetworkContext { request: RequestType::BlobsByRange(request.clone()), request_id: AppRequestId::Sync(SyncRequestId::BlobsByRange(id)), }) - .map_err(|_| RpcRequestSendError::NetworkSendError)?; + .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; let max_blobs_per_block = self.chain.spec.max_blobs_per_block(request_epoch); self.blobs_by_range_requests.insert( @@ -947,7 +974,7 @@ impl SyncNetworkContext { request: RequestType::DataColumnsByRange(request.clone()), request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), }) - .map_err(|_| RpcRequestSendError::NetworkSendError)?; + .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; self.data_columns_by_range_requests.insert( id, diff --git 
a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody.rs index ed6cbcad39f..c340cd08db2 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody.rs @@ -46,7 +46,7 @@ pub enum Error { SendFailed(&'static str), TooManyFailures, BadState(String), - NoPeers(ColumnIndex), + NoPeer(ColumnIndex), /// Received a download result for a different request id than the in-flight request. /// There should only exist a single request at a time. Having multiple requests is a bug and /// can result in undefined state, so it's treated as a hard error and the lookup is dropped. @@ -281,7 +281,7 @@ impl ActiveCustodyRequest { // `MAX_STALE_NO_PEERS_DURATION`, else error and drop the request. Note that // lookup will naturally retry when other peers send us attestations for // descendants of this un-available lookup. - return Err(Error::NoPeers(*column_index)); + return Err(Error::NoPeer(*column_index)); } else { // Do not issue requests if there is no custody peer on this column } @@ -311,6 +311,7 @@ impl ActiveCustodyRequest { let column_request = self .column_requests .get_mut(column_index) + // Should never happen: column_index is iterated from column_requests .ok_or(Error::BadState("unknown column_index".to_owned()))?; column_request.on_download_start(req_id)?; diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index b9f68eeac89..3f2f4ff3b7c 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -3,7 +3,7 @@ use super::RangeSyncType; use crate::metrics; use crate::metrics::PEERS_PER_COLUMN_SUBNET; use crate::network_beacon_processor::ChainSegmentProcessId; -use crate::sync::network_context::RangeRequestId; +use crate::sync::network_context::{RangeRequestId, RpcRequestSendError}; use 
crate::sync::{network_context::SyncNetworkContext, BatchOperationOutcome, BatchProcessResult}; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::BeaconChainTypes; @@ -917,24 +917,36 @@ impl SyncingChain { } return Ok(KeepChain); } - Err(e) => { - // NOTE: under normal conditions this shouldn't happen but we handle it anyway - warn!(self.log, "Could not send batch request"; + Err(e) => match e { + RpcRequestSendError::NoPeer(no_peer) => { + // Not possible to reach this condition with NoPeer::BlockPeer. For this + // case the chain should have 0 peers and would be dropped already + debug!(self.log, "Error sending batch no peers"; "epoch" => batch_id, &batch, "no_peer" => ?no_peer); + // Set batch in stale state + // What to return here? + // Not necessary to return a `RemoveChain::EmptyPeerPool` here. If the + // chain actually has 0 peers it will be removed on the remove_peer call + return Ok(KeepChain); + } + RpcRequestSendError::InternalError(e) => { + // NOTE: under normal conditions this shouldn't happen but we handle it anyway + warn!(self.log, "Could not send batch request"; "batch_id" => batch_id, "error" => ?e, &batch); - // register the failed download and check if the batch can be retried - batch.start_downloading_from_peer(1)?; // fake request_id = 1 is not relevant - match batch.download_failed(true)? { - BatchOperationOutcome::Failed { blacklist } => { - return Err(RemoveChain::ChainFailed { - blacklist, - failing_batch: batch_id, - }) - } - BatchOperationOutcome::Continue => { - return self.retry_batch_download(network, batch_id) + // register the failed download and check if the batch can be retried + batch.start_downloading_from_peer(1)?; // fake request_id = 1 is not relevant + match batch.download_failed(true)? 
{ + BatchOperationOutcome::Failed { blacklist } => { + return Err(RemoveChain::ChainFailed { + blacklist, + failing_batch: batch_id, + }) + } + BatchOperationOutcome::Continue => { + return self.retry_batch_download(network, batch_id) + } } } - } + }, } } From 8cf4e8ca378a36bd051bdcfe732ac19d5704416e Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sat, 25 Jan 2025 16:47:38 +0400 Subject: [PATCH 05/64] Remove good_peers_on_sampling_subnets --- .../src/peer_manager/peerdb/peer_info.rs | 4 ++ beacon_node/network/src/metrics.rs | 22 +++++-- .../network/src/sync/range_sync/chain.rs | 61 ------------------- 3 files changed, 20 insertions(+), 67 deletions(-) diff --git a/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs b/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs index 2e8f462565f..cc61c6a15cb 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs @@ -234,6 +234,10 @@ impl PeerInfo { self.custody_subnets.contains(subnet) } + pub fn custody_subnets_iter(&self) -> impl Iterator { + self.custody_subnets.iter() + } + /// Returns true if the peer is connected to a long-lived subnet. 
pub fn has_long_lived_subnet(&self) -> bool { // Check the meta_data diff --git a/beacon_node/network/src/metrics.rs b/beacon_node/network/src/metrics.rs index 154a59eade7..055302c79a2 100644 --- a/beacon_node/network/src/metrics.rs +++ b/beacon_node/network/src/metrics.rs @@ -746,16 +746,26 @@ pub fn update_sync_metrics(network_globals: &Arc>) // count per sync status, the number of connected peers let mut peers_per_sync_type = FnvHashMap::default(); - for sync_type in network_globals - .peers - .read() - .connected_peers() - .map(|(_peer_id, info)| info.sync_status().as_str()) - { + let mut peers_per_column_subnet = FnvHashMap::default(); + + for (_, info) in network_globals.peers.read().connected_peers() { + let sync_type = info.sync_status().as_str(); *peers_per_sync_type.entry(sync_type).or_default() += 1; + + for subnet in info.custody_subnets_iter() { + *peers_per_column_subnet.entry(*subnet).or_default() += 1; + } } for (sync_type, peer_count) in peers_per_sync_type { set_gauge_entry(&PEERS_PER_SYNC_TYPE, &[sync_type], peer_count); } + + for (subnet, peer_count) in peers_per_column_subnet { + set_gauge_entry( + &PEERS_PER_COLUMN_SUBNET, + &[&format!("{subnet}")], + peer_count, + ); + } } diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 3f2f4ff3b7c..3f0e72cd072 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -1,7 +1,6 @@ use super::batch::{BatchInfo, BatchProcessingResult, BatchState}; use super::RangeSyncType; use crate::metrics; -use crate::metrics::PEERS_PER_COLUMN_SUBNET; use crate::network_beacon_processor::ChainSegmentProcessId; use crate::sync::network_context::{RangeRequestId, RpcRequestSendError}; use crate::sync::{network_context::SyncNetworkContext, BatchOperationOutcome, BatchProcessResult}; @@ -9,7 +8,6 @@ use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::BeaconChainTypes; use 
lighthouse_network::service::api_types::Id; use lighthouse_network::{PeerAction, PeerId}; -use metrics::set_int_gauge; use slog::{crit, debug, o, warn}; use std::collections::{btree_map::Entry, BTreeMap, HashSet}; use strum::IntoStaticStr; @@ -411,11 +409,6 @@ impl SyncingChain { self.request_batches(network)?; } } - } else if !self.good_peers_on_sampling_subnets(self.processing_target, network) { - // This is to handle the case where no batch was sent for the current processing - // target when there is no sampling peers available. This is a valid state and should not - // return an error. - return Ok(KeepChain); } else { return Err(RemoveChain::WrongChainState(format!( "Batch not found for current processing target {}", @@ -985,14 +978,6 @@ impl SyncingChain { // check if we have the batch for our optimistic start. If not, request it first. // We wait for this batch before requesting any other batches. if let Some(epoch) = self.optimistic_start { - if !self.good_peers_on_sampling_subnets(epoch, network) { - debug!( - self.log, - "Waiting for peers to be available on sampling column subnets" - ); - return Ok(KeepChain); - } - if let Entry::Vacant(entry) = self.batches.entry(epoch) { let batch_type = network.batch_type(epoch); let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH, batch_type); @@ -1013,40 +998,6 @@ impl SyncingChain { } } - /// Checks all sampling column subnets for peers. Returns `true` if there is at least one peer in - /// every sampling column subnet. 
- fn good_peers_on_sampling_subnets( - &self, - epoch: Epoch, - network: &SyncNetworkContext, - ) -> bool { - if network.chain.spec.is_peer_das_enabled_for_epoch(epoch) { - // Require peers on all sampling column subnets before sending batches - let peers_on_all_custody_subnets = network - .network_globals() - .sampling_subnets - .iter() - .all(|subnet_id| { - let peer_count = network - .network_globals() - .peers - .read() - .good_custody_subnet_peer(*subnet_id) - .count(); - - set_int_gauge( - &PEERS_PER_COLUMN_SUBNET, - &[&subnet_id.to_string()], - peer_count as i64, - ); - peer_count > 0 - }); - peers_on_all_custody_subnets - } else { - true - } - } - /// Creates the next required batch from the chain. If there are no more batches required, /// `false` is returned. fn include_next_batch(&mut self, network: &mut SyncNetworkContext) -> Option { @@ -1077,18 +1028,6 @@ impl SyncingChain { return None; } - // don't send batch requests until we have peers on sampling subnets - // TODO(das): this is a workaround to avoid sending out excessive block requests because - // block and data column requests are currently coupled. This can be removed once we find a - // way to decouple the requests and do retries individually, see issue #6258. 
- if !self.good_peers_on_sampling_subnets(self.to_be_downloaded, network) { - debug!( - self.log, - "Waiting for peers to be available on custody column subnets" - ); - return None; - } - let batch_id = self.to_be_downloaded; // this batch could have been included already being an optimistic batch match self.batches.entry(batch_id) { From 4da322fa972f701345ce215f0667251f31aa1272 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sat, 25 Jan 2025 16:57:19 +0400 Subject: [PATCH 06/64] Count AwaitingDownload towards the buffer limit --- .../network/src/sync/range_sync/chain.rs | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 3f0e72cd072..46ab93f40ad 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -1009,19 +1009,23 @@ impl SyncingChain { { return None; } + // only request batches up to the buffer size limit - // NOTE: we don't count batches in the AwaitingValidation state, to prevent stalling sync - // if the current processing window is contained in a long range of skip slots. - let in_buffer = |batch: &BatchInfo| { - matches!( - batch.state(), - BatchState::Downloading(..) | BatchState::AwaitingProcessing(..) - ) - }; if self .batches .iter() - .filter(|&(_epoch, batch)| in_buffer(batch)) + .filter(|&(_epoch, batch)| match batch.state() { + // filter batches that count towards the buffer limit + BatchState::AwaitingDownload => true, + BatchState::Downloading { .. } => true, + BatchState::AwaitingProcessing { .. } => true, + BatchState::Processing { .. } => true, + // NOTE: we don't count batches in the AwaitingValidation state, to prevent stalling sync + // if the current processing window is contained in a long range of skip slots. + BatchState::AwaitingValidation { .. 
} => false, + BatchState::Poisoned => false, + BatchState::Failed => false, + }) .count() > BATCH_BUFFER_SIZE as usize { From 9cd238d69d14709484f1db01cd53e601c530bb49 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sat, 25 Jan 2025 17:09:11 +0400 Subject: [PATCH 07/64] Retry syncing chains in AwaitingDownload state --- .../network/src/sync/range_sync/chain.rs | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 46ab93f40ad..38d7d3f7dc7 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -1032,19 +1032,29 @@ impl SyncingChain { return None; } - let batch_id = self.to_be_downloaded; + // Find some batch with epoch equal or less than to_be_downloaded that needs to be sent = is + // AwaitingDownload. Batches reached this state after failing to find peers on `send_batch`. 
+ if let Some((to_retry_batch_id, _)) = self.batches.iter().find(|(batch_epoch, v)| { + **batch_epoch <= self.to_be_downloaded + && matches!(v.state(), BatchState::AwaitingDownload) + }) { + return Some(*to_retry_batch_id); + } + + // If no batch needs a retry, attempt to send the batch of the next epoch to download + let next_batch_id = self.to_be_downloaded; // this batch could have been included already being an optimistic batch - match self.batches.entry(batch_id) { + match self.batches.entry(next_batch_id) { Entry::Occupied(_) => { // this batch doesn't need downloading, let this same function decide the next batch self.to_be_downloaded += EPOCHS_PER_BATCH; self.include_next_batch(network) } Entry::Vacant(entry) => { - let batch_type = network.batch_type(batch_id); - entry.insert(BatchInfo::new(&batch_id, EPOCHS_PER_BATCH, batch_type)); + let batch_type = network.batch_type(next_batch_id); + entry.insert(BatchInfo::new(&next_batch_id, EPOCHS_PER_BATCH, batch_type)); self.to_be_downloaded += EPOCHS_PER_BATCH; - Some(batch_id) + Some(next_batch_id) } } } From 891c8fca6994e0b9e8b0774f09ad691dca1c0385 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 5 Feb 2025 13:23:56 -0300 Subject: [PATCH 08/64] Use same peer priorization for lookups --- .../network/src/sync/network_context.rs | 30 +++++++++++++++---- .../src/sync/network_context/custody.rs | 11 ++++--- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index b6ae74af33a..686e7c49fba 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -374,6 +374,7 @@ impl SyncNetworkContext { execution_engine_state: _, network_beacon_processor: _, chain: _, + fork_context: _, log: _, } = self; @@ -408,7 +409,6 @@ impl SyncNetworkContext { }; let active_request_count_by_peer = 
self.active_request_count_by_peer(); - let Some(block_peer) = peers .iter() .map(|peer| { @@ -590,11 +590,21 @@ impl SyncNetworkContext { lookup_peers: Arc>>, block_root: Hash256, ) -> Result { + let active_request_count_by_peer = self.active_request_count_by_peer(); let Some(peer_id) = lookup_peers .read() .iter() - .choose(&mut rand::thread_rng()) - .copied() + .map(|peer| { + ( + // Prefer peers with less overall requests + active_request_count_by_peer.get(peer).copied().unwrap_or(0), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::random::(), + peer, + ) + }) + .min() + .map(|(_, _, peer)| *peer) else { // Allow lookup to not have any peers and do nothing. This is an optimization to not // lose progress of lookups created from a block with unknown parent before we receive @@ -678,11 +688,21 @@ impl SyncNetworkContext { block_root: Hash256, expected_blobs: usize, ) -> Result { + let active_request_count_by_peer = self.active_request_count_by_peer(); let Some(peer_id) = lookup_peers .read() .iter() - .choose(&mut rand::thread_rng()) - .copied() + .map(|peer| { + ( + // Prefer peers with less overall requests + active_request_count_by_peer.get(peer).copied().unwrap_or(0), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::random::(), + peer, + ) + }) + .min() + .map(|(_, _, peer)| *peer) else { // Allow lookup to not have any peers and do nothing. 
This is an optimization to not // lose progress of lookups created from a block with unknown parent before we receive diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody.rs index c340cd08db2..c2583024925 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody.rs @@ -220,6 +220,7 @@ impl ActiveCustodyRequest { return Ok(Some((columns, peer_group))); } + let active_request_count_by_peer = cx.active_request_count_by_peer(); let mut columns_to_request_by_peer = HashMap::>::new(); let lookup_peers = self.lookup_peers.read(); @@ -238,8 +239,6 @@ impl ActiveCustodyRequest { // only query the peers on that fork. Should this case be handled? How to handle it? let custodial_peers = cx.get_custodial_peers(*column_index); - // TODO(das): cache this computation in a OneCell or similar to prevent having to - // run it every loop let mut active_requests_by_peer = HashMap::::new(); for batch_request in self.active_batch_columns_requests.values() { *active_requests_by_peer @@ -262,8 +261,12 @@ impl ActiveCustodyRequest { // requests recently self.failed_peers.contains(peer), // Prefer peers with less requests to load balance across peers - active_requests_by_peer.get(peer).copied().unwrap_or(0), - // Final random factor to give all peers a shot in each retry + active_request_count_by_peer.get(peer).copied().unwrap_or(0) + + columns_to_request_by_peer + .get(peer) + .map(|columns| columns.len()) + .unwrap_or(0), + // Random factor to break ties, otherwise the PeerID breaks ties rand::thread_rng().gen::(), *peer, ) From 29a5aff595e6c510af0ba961c7153948460dc064 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 6 Feb 2025 02:02:07 -0300 Subject: [PATCH 09/64] Review PR --- .../network/src/sync/backfill_sync/mod.rs | 11 ++-- beacon_node/network/src/sync/manager.rs | 2 +- 
.../network/src/sync/network_context.rs | 5 +- .../src/sync/network_context/custody.rs | 10 +--- .../network/src/sync/range_sync/batch.rs | 58 +++++++++++-------- .../network/src/sync/range_sync/chain.rs | 12 ++-- 6 files changed, 55 insertions(+), 43 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 8e8f39f3b34..ce9da49c00d 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -302,6 +302,7 @@ impl BackFillSync { &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, + peer_id: &PeerId, request_id: Id, ) -> Result<(), BackFillError> { if let Some(batch) = self.batches.get_mut(&batch_id) { @@ -314,7 +315,7 @@ impl BackFillSync { return Ok(()); } debug!(self.log, "Batch failed"; "batch_epoch" => batch_id, "error" => "rpc_error"); - match batch.download_failed(true) { + match batch.download_failed(Some(*peer_id)) { Err(e) => self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)), Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { self.fail_sync(BackFillError::BatchDownloadFailed(batch_id)) @@ -848,15 +849,17 @@ impl BackFillSync { .collect::>(); let (request, is_blob_batch) = batch.to_blocks_by_range_request(); + let failed_peers = batch.failed_peers(); match network.block_components_by_range_request( is_blob_batch, request, RangeRequestId::BackfillSync { batch_id }, &synced_peers, + &failed_peers, ) { Ok(request_id) => { // inform the batch about the new request - if let Err(e) = batch.start_downloading_from_peer(request_id) { + if let Err(e) = batch.start_downloading(request_id) { return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)); } debug!(self.log, "Requesting batch"; "epoch" => batch_id, &batch); @@ -875,11 +878,11 @@ impl BackFillSync { warn!(self.log, "Could not send batch request"; "batch_id" => batch_id, "error" => ?e, &batch); // register the failed download and 
check if the batch can be retried - if let Err(e) = batch.start_downloading_from_peer(1) { + if let Err(e) = batch.start_downloading(1) { return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)); } - match batch.download_failed(true) { + match batch.download_failed(None) { Err(e) => { self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))? } diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index ac8bda31281..454febbcfd2 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -1303,7 +1303,7 @@ impl SyncManager { } RangeRequestId::BackfillSync { batch_id } => match self .backfill_sync - .inject_error(&mut self.network, batch_id, range_request_id.id) + .inject_error(&mut self.network, batch_id, &peer_id, range_request_id.id) { Ok(_) => {} Err(_) => self.update_sync_state(), diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 686e7c49fba..ab909863f85 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -401,6 +401,7 @@ impl SyncNetworkContext { request: BlocksByRangeRequest, requester: RangeRequestId, peers: &HashSet, + peers_to_deprioritize: &HashSet, ) -> Result { // Create the overall components_by_range request ID before its individual components let id = ComponentsByRangeRequestId { @@ -413,6 +414,8 @@ impl SyncNetworkContext { .iter() .map(|peer| { ( + // If contains -> 1 (order after), not contains -> 0 (order first) + peers_to_deprioritize.contains(peer), // Prefer peers with less overall requests active_request_count_by_peer.get(peer).copied().unwrap_or(0), // Random factor to break ties, otherwise the PeerID breaks ties @@ -421,7 +424,7 @@ impl SyncNetworkContext { ) }) .min() - .map(|(_, _, peer)| *peer) + .map(|(_, _, _, peer)| *peer) else { // TODO(das): is it safe to error here? 
return Err(RpcRequestSendError::NoPeer(NoPeerError::BlockPeer)); diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody.rs index c2583024925..6649aa95ef0 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody.rs @@ -57,7 +57,6 @@ pub enum Error { } struct ActiveBatchColumnsRequest { - peer_id: PeerId, indices: Vec, } @@ -239,13 +238,6 @@ impl ActiveCustodyRequest { // only query the peers on that fork. Should this case be handled? How to handle it? let custodial_peers = cx.get_custodial_peers(*column_index); - let mut active_requests_by_peer = HashMap::::new(); - for batch_request in self.active_batch_columns_requests.values() { - *active_requests_by_peer - .entry(batch_request.peer_id) - .or_default() += 1; - } - // We draw from the total set of peers, but prioritize those peers who we have // received an attestation / status / block message claiming to have imported the // lookup. The frequency of those messages is low, so drawing only from lookup_peers @@ -321,7 +313,7 @@ impl ActiveCustodyRequest { } self.active_batch_columns_requests - .insert(req_id, ActiveBatchColumnsRequest { indices, peer_id }); + .insert(req_id, ActiveBatchColumnsRequest { indices }); } LookupRequestResult::NoRequestNeeded(_) => unreachable!(), LookupRequestResult::Pending(_) => unreachable!(), diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index d51e6e16a43..378b6878e8d 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -104,7 +104,7 @@ pub struct BatchInfo { /// Number of processing attempts that have failed but we do not count. non_faulty_processing_attempts: u8, /// The number of download retries this batch has undergone due to a failed request. 
- failed_download_attempts: usize, + failed_download_attempts: Vec>, /// State of the batch. state: BatchState, /// Whether this batch contains all blocks or all blocks and blobs. @@ -164,7 +164,7 @@ impl BatchInfo { start_slot, end_slot, failed_processing_attempts: Vec::new(), - failed_download_attempts: 0, + failed_download_attempts: Vec::new(), non_faulty_processing_attempts: 0, state: BatchState::AwaitingDownload, batch_type, @@ -174,18 +174,27 @@ impl BatchInfo { /// Gives a list of peers from which this batch has had a failed download or processing /// attempt. - pub fn failed_processing_peers(&self) -> HashSet { - self.failed_processing_attempts - .iter() - .map(|attempt| attempt.peer_id) - .collect() + pub fn failed_peers(&self) -> HashSet { + let mut peers = HashSet::with_capacity( + self.failed_processing_attempts.len() + self.failed_download_attempts.len(), + ); + + for attempt in &self.failed_processing_attempts { + peers.insert(attempt.peer_id); + } + + for peer in self.failed_download_attempts.iter().flatten() { + peers.insert(*peer); + } + + peers } /// Return the number of times this batch has failed downloading and failed processing, in this /// order. 
pub fn failed_attempts(&self) -> (usize, usize) { ( - self.failed_download_attempts, + self.failed_download_attempts.len(), self.failed_processing_attempts.len(), ) } @@ -240,7 +249,8 @@ impl BatchInfo { match self.state { BatchState::Poisoned => unreachable!("Poisoned batch"), BatchState::Failed => BatchOperationOutcome::Failed { - blacklist: self.failed_processing_attempts.len() > self.failed_download_attempts, + blacklist: self.failed_processing_attempts.len() + > self.failed_download_attempts.len(), }, _ => BatchOperationOutcome::Continue, } @@ -266,7 +276,7 @@ impl BatchInfo { Result<(Slot, Slot, BatchOperationOutcome), WrongState>, > { match self.state.poison() { - BatchState::Downloading(_request_id) => { + BatchState::Downloading(_) => { let received = blocks.len(); self.state = BatchState::AwaitingProcessing(peer, blocks, Instant::now()); Ok(received) @@ -290,21 +300,21 @@ impl BatchInfo { #[must_use = "Batch may have failed"] pub fn download_failed( &mut self, - mark_failed: bool, + peer: Option, ) -> Result { match self.state.poison() { - BatchState::Downloading(_request_id) => { + BatchState::Downloading(_) => { // register the attempt and check if the batch can be tried again - if mark_failed { - self.failed_download_attempts += 1; - } - self.state = - if self.failed_download_attempts >= B::max_batch_download_attempts() as usize { - BatchState::Failed - } else { - // drop the blocks - BatchState::AwaitingDownload - }; + self.failed_download_attempts.push(peer); + + self.state = if self.failed_download_attempts.len() + >= B::max_batch_download_attempts() as usize + { + BatchState::Failed + } else { + // drop the blocks + BatchState::AwaitingDownload + }; Ok(self.outcome()) } BatchState::Poisoned => unreachable!("Poisoned batch"), @@ -318,7 +328,7 @@ impl BatchInfo { } } - pub fn start_downloading_from_peer(&mut self, request_id: Id) -> Result<(), WrongState> { + pub fn start_downloading(&mut self, request_id: Id) -> Result<(), WrongState> { match 
self.state.poison() { BatchState::AwaitingDownload => { self.state = BatchState::Downloading(request_id); @@ -468,7 +478,7 @@ impl slog::KV for BatchInfo { "end_slot", serializer, )?; - serializer.emit_usize("downloaded", self.failed_download_attempts)?; + serializer.emit_usize("downloaded", self.failed_download_attempts.len())?; serializer.emit_usize("processed", self.failed_processing_attempts.len())?; serializer.emit_u8("processed_no_penalty", self.non_faulty_processing_attempts)?; serializer.emit_arguments("state", &format_args!("{:?}", self.state))?; diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 38d7d3f7dc7..bd4325885ad 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -846,7 +846,9 @@ impl SyncingChain { "request_id" => %request_id, "batch_state" => batch_state ); - if let BatchOperationOutcome::Failed { blacklist } = batch.download_failed(true)? { + if let BatchOperationOutcome::Failed { blacklist } = + batch.download_failed(Some(*peer_id))? 
+ { return Err(RemoveChain::ChainFailed { blacklist, failing_batch: batch_id, @@ -887,6 +889,7 @@ impl SyncingChain { let batch_state = self.visualize_batch_state(); if let Some(batch) = self.batches.get_mut(&batch_id) { let (request, batch_type) = batch.to_blocks_by_range_request(); + let failed_peers = batch.failed_peers(); match network.block_components_by_range_request( batch_type, request, @@ -895,10 +898,11 @@ impl SyncingChain { batch_id, }, &self.peers, + &failed_peers, ) { Ok(request_id) => { // inform the batch about the new request - batch.start_downloading_from_peer(request_id)?; + batch.start_downloading(request_id)?; if self .optimistic_start .map(|epoch| epoch == batch_id) @@ -926,8 +930,8 @@ impl SyncingChain { warn!(self.log, "Could not send batch request"; "batch_id" => batch_id, "error" => ?e, &batch); // register the failed download and check if the batch can be retried - batch.start_downloading_from_peer(1)?; // fake request_id = 1 is not relevant - match batch.download_failed(true)? { + batch.start_downloading(1)?; // fake request_id = 1 is not relevant + match batch.download_failed(None)? 
{ BatchOperationOutcome::Failed { blacklist } => { return Err(RemoveChain::ChainFailed { blacklist, From cd6c5d6ac0a0161b2062fe3dbaa2492129efe5fe Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 6 Feb 2025 02:15:22 -0300 Subject: [PATCH 10/64] Address TODOs --- .../network/src/sync/backfill_sync/mod.rs | 27 +++---- .../network/src/sync/network_context.rs | 71 +++++++++++-------- .../network/src/sync/range_sync/chain.rs | 25 ++----- 3 files changed, 55 insertions(+), 68 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index ce9da49c00d..0c8cc5979ec 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -320,7 +320,7 @@ impl BackFillSync { Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { self.fail_sync(BackFillError::BatchDownloadFailed(batch_id)) } - Ok(BatchOperationOutcome::Continue) => self.retry_batch_download(network, batch_id), + Ok(BatchOperationOutcome::Continue) => self.send_batch(network, batch_id), } } else { // this could be an error for an old batch, removed when the chain advances @@ -388,7 +388,7 @@ impl BackFillSync { return Ok(ProcessResult::Successful); } // this batch can't be used, so we need to request it again. - self.retry_batch_download(network, batch_id)?; + self.send_batch(network, batch_id)?; Ok(ProcessResult::Successful) } } @@ -590,8 +590,8 @@ impl BackFillSync { ); for peer in self.participating_peers.drain() { - // TODO(das): this participating peers is broken with custody columns backfill, consider - // a different mechanism + // TODO(das): `participating_peers` only includes block peers. Should we + // penalize the custody column peers too? 
network.report_peer(peer, *penalty, "backfill_batch_failed"); } self.fail_sync(BackFillError::BatchProcessingFailed(batch_id)) @@ -617,7 +617,7 @@ impl BackFillSync { { self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))?; } - self.retry_batch_download(network, batch_id)?; + self.send_batch(network, batch_id)?; Ok(ProcessResult::Successful) } } @@ -816,20 +816,9 @@ impl BackFillSync { self.processing_target = self.current_start; for id in redownload_queue { - self.retry_batch_download(network, id)?; + self.send_batch(network, id)?; } // finally, re-request the failed batch. - self.retry_batch_download(network, batch_id) - } - - /// Sends and registers the request of a batch awaiting download. - fn retry_batch_download( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - ) -> Result<(), BackFillError> { - // TODO(das): previously here we de-prioritize peers that had failed to download or - // process a batch self.send_batch(network, batch_id) } @@ -890,7 +879,7 @@ impl BackFillSync { self.fail_sync(BackFillError::BatchDownloadFailed(batch_id))? 
} Ok(BatchOperationOutcome::Continue) => { - return self.retry_batch_download(network, batch_id) + return self.send_batch(network, batch_id) } } } @@ -920,7 +909,7 @@ impl BackFillSync { .collect::>(); for batch_id in batch_ids_to_retry { - self.retry_batch_download(network, batch_id)?; + self.send_batch(network, batch_id)?; } Ok(()) } diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index ab909863f85..94b0027a351 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -403,12 +403,6 @@ impl SyncNetworkContext { peers: &HashSet, peers_to_deprioritize: &HashSet, ) -> Result { - // Create the overall components_by_range request ID before its individual components - let id = ComponentsByRangeRequestId { - id: self.next_id(), - requester, - }; - let active_request_count_by_peer = self.active_request_count_by_peer(); let Some(block_peer) = peers .iter() @@ -430,6 +424,21 @@ impl SyncNetworkContext { return Err(RpcRequestSendError::NoPeer(NoPeerError::BlockPeer)); }; + // Attempt to find all required custody peers before sending any request or creating an ID + let columns_by_range_peers_to_request = + if matches!(batch_type, ByRangeRequestType::BlocksAndColumns) { + let column_indexes = self.network_globals().sampling_columns.clone(); + Some(self.select_columns_by_range_peers_to_request(&column_indexes, peers)?) 
+ } else { + None + }; + + // Create the overall components_by_range request ID before its individual components + let id = ComponentsByRangeRequestId { + id: self.next_id(), + requester, + }; + let _blocks_req_id = self.send_blocks_by_range_request(block_peer, request.clone(), id)?; let blobs_req_id = if matches!(batch_type, ByRangeRequestType::BlocksAndBlobs) { @@ -445,38 +454,38 @@ impl SyncNetworkContext { None }; - let (expects_columns, data_column_requests) = - if matches!(batch_type, ByRangeRequestType::BlocksAndColumns) { - let column_indexes = self.network_globals().sampling_columns.clone(); - - let data_column_requests = self - .select_columns_by_range_peers_to_request(&column_indexes, peers)? - .into_iter() - .map(|(peer_id, columns)| { - self.send_data_columns_by_range_request( - peer_id, - DataColumnsByRangeRequest { - start_slot: *request.start_slot(), - count: *request.count(), - columns, - }, - id, - ) - }) - .collect::, _>>()?; - - ( - Some(column_indexes.into_iter().collect::>()), - Some(data_column_requests), + let data_column_requests = + if let Some(columns_by_range_peers_to_request) = columns_by_range_peers_to_request { + Some( + columns_by_range_peers_to_request + .into_iter() + .map(|(peer_id, columns)| { + self.send_data_columns_by_range_request( + peer_id, + DataColumnsByRangeRequest { + start_slot: *request.start_slot(), + count: *request.count(), + columns, + }, + id, + ) + }) + .collect::, _>>()?, ) } else { - (None, None) + None }; let expected_blobs = blobs_req_id.is_some(); let info = RangeBlockComponentsRequest::new( expected_blobs, - expects_columns, + data_column_requests.as_ref().map(|_| { + self.network_globals() + .sampling_columns + .iter() + .copied() + .collect() + }), data_column_requests.map(|items| items.len()), ); self.components_by_range_requests.insert(id, info); diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index bd4325885ad..a4595fea21d 100644 --- 
a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -263,7 +263,7 @@ impl SyncingChain { }); } // this batch can't be used, so we need to request it again. - self.retry_batch_download(network, batch_id) + self.send_batch(network, batch_id) } } } @@ -566,7 +566,7 @@ impl SyncingChain { BatchProcessResult::NonFaultyFailure => { batch.processing_completed(BatchProcessingResult::NonFaultyFailure)?; // Simply redownload the batch. - self.retry_batch_download(network, batch_id) + self.send_batch(network, batch_id) } } } @@ -586,7 +586,7 @@ impl SyncingChain { debug!(self.log, "Rejected optimistic batch left for future use"; "epoch" => %epoch, "reason" => reason); // this batch is now treated as any other batch, and re-requested for future use if redownload { - return self.retry_batch_download(network, epoch); + return self.send_batch(network, epoch); } } else { debug!(self.log, "Rejected optimistic batch"; "epoch" => %epoch, "reason" => reason); @@ -746,10 +746,10 @@ impl SyncingChain { self.processing_target = self.start_epoch; for id in redownload_queue { - self.retry_batch_download(network, id)?; + self.send_batch(network, id)?; } // finally, re-request the failed batch. - self.retry_batch_download(network, batch_id) + self.send_batch(network, batch_id) } pub fn stop_syncing(&mut self) { @@ -854,7 +854,7 @@ impl SyncingChain { failing_batch: batch_id, }); } - self.retry_batch_download(network, batch_id) + self.send_batch(network, batch_id) } else { debug!( self.log, @@ -869,17 +869,6 @@ impl SyncingChain { } } - /// Sends and registers the request of a batch awaiting download. - pub fn retry_batch_download( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - ) -> ProcessingResult { - // TODO(das): Previously here we de-prioritize peers that had either failed to download - // a batch or failed in processing. 
- self.send_batch(network, batch_id) - } - /// Requests the batch assigned to the given id from a given peer. pub fn send_batch( &mut self, @@ -939,7 +928,7 @@ impl SyncingChain { }) } BatchOperationOutcome::Continue => { - return self.retry_batch_download(network, batch_id) + return self.send_batch(network, batch_id) } } } From 2c6a7ccd85786315c3c81e93116fbd328b004fc8 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 6 Feb 2025 11:23:26 -0300 Subject: [PATCH 11/64] Revert changes to peer erroring in range sync --- .../network/src/sync/range_sync/chain.rs | 88 ++++++++++++++----- 1 file changed, 68 insertions(+), 20 deletions(-) diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index a4595fea21d..8cb0f071170 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -1,9 +1,10 @@ use super::batch::{BatchInfo, BatchProcessingResult, BatchState}; use super::RangeSyncType; -use crate::metrics; +use crate::metrics::{self, PEERS_PER_COLUMN_SUBNET}; use crate::network_beacon_processor::ChainSegmentProcessId; use crate::sync::network_context::{RangeRequestId, RpcRequestSendError}; use crate::sync::{network_context::SyncNetworkContext, BatchOperationOutcome, BatchProcessResult}; +use ::metrics::set_int_gauge; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::BeaconChainTypes; use lighthouse_network::service::api_types::Id; @@ -409,6 +410,11 @@ impl SyncingChain { self.request_batches(network)?; } } + } else if !self.good_peers_on_sampling_subnets(self.processing_target, network) { + // This is to handle the case where no batch was sent for the current processing + // target when there is no sampling peers available. This is a valid state and should not + // return an error. 
+ return Ok(KeepChain); } else { return Err(RemoveChain::WrongChainState(format!( "Batch not found for current processing target {}", @@ -971,6 +977,14 @@ impl SyncingChain { // check if we have the batch for our optimistic start. If not, request it first. // We wait for this batch before requesting any other batches. if let Some(epoch) = self.optimistic_start { + if !self.good_peers_on_sampling_subnets(epoch, network) { + debug!( + self.log, + "Waiting for peers to be available on sampling column subnets" + ); + return Ok(KeepChain); + } + if let Entry::Vacant(entry) = self.batches.entry(epoch) { let batch_type = network.batch_type(epoch); let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH, batch_type); @@ -991,6 +1005,40 @@ impl SyncingChain { } } + /// Checks all sampling column subnets for peers. Returns `true` if there is at least one peer in + /// every sampling column subnet. + fn good_peers_on_sampling_subnets( + &self, + epoch: Epoch, + network: &SyncNetworkContext, + ) -> bool { + if network.chain.spec.is_peer_das_enabled_for_epoch(epoch) { + // Require peers on all sampling column subnets before sending batches + let peers_on_all_custody_subnets = network + .network_globals() + .sampling_subnets + .iter() + .all(|subnet_id| { + let peer_count = network + .network_globals() + .peers + .read() + .good_custody_subnet_peer(*subnet_id) + .count(); + + set_int_gauge( + &PEERS_PER_COLUMN_SUBNET, + &[&subnet_id.to_string()], + peer_count as i64, + ); + peer_count > 0 + }); + peers_on_all_custody_subnets + } else { + true + } + } + /// Creates the next required batch from the chain. If there are no more batches required, /// `false` is returned. 
fn include_next_batch(&mut self, network: &mut SyncNetworkContext) -> Option { @@ -1004,34 +1052,34 @@ impl SyncingChain { } // only request batches up to the buffer size limit + // NOTE: we don't count batches in the AwaitingValidation state, to prevent stalling sync + // if the current processing window is contained in a long range of skip slots. + let in_buffer = |batch: &BatchInfo| { + matches!( + batch.state(), + BatchState::Downloading(..) | BatchState::AwaitingProcessing(..) + ) + }; if self .batches .iter() - .filter(|&(_epoch, batch)| match batch.state() { - // filter batches that count towards the buffer limit - BatchState::AwaitingDownload => true, - BatchState::Downloading { .. } => true, - BatchState::AwaitingProcessing { .. } => true, - BatchState::Processing { .. } => true, - // NOTE: we don't count batches in the AwaitingValidation state, to prevent stalling sync - // if the current processing window is contained in a long range of skip slots. - BatchState::AwaitingValidation { .. } => false, - BatchState::Poisoned => false, - BatchState::Failed => false, - }) + .filter(|&(_epoch, batch)| in_buffer(batch)) .count() > BATCH_BUFFER_SIZE as usize { return None; } - // Find some batch with epoch equal or less than to_be_downloaded that needs to be sent = is - // AwaitingDownload. Batches reached this state after failing to find peers on `send_batch`. - if let Some((to_retry_batch_id, _)) = self.batches.iter().find(|(batch_epoch, v)| { - **batch_epoch <= self.to_be_downloaded - && matches!(v.state(), BatchState::AwaitingDownload) - }) { - return Some(*to_retry_batch_id); + // don't send batch requests until we have peers on sampling subnets + // TODO(das): this is a workaround to avoid sending out excessive block requests because + // block and data column requests are currently coupled. This can be removed once we find a + // way to decouple the requests and do retries individually, see issue #6258. 
+ if !self.good_peers_on_sampling_subnets(self.to_be_downloaded, network) { + debug!( + self.log, + "Waiting for peers to be available on custody column subnets" + ); + return None; } // If no batch needs a retry, attempt to send the batch of the next epoch to download From f77bc24b45159189500c0aa1125b422cc5ca4e96 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 6 Feb 2025 12:01:45 -0300 Subject: [PATCH 12/64] Revert metrics changes --- .../src/peer_manager/peerdb/peer_info.rs | 4 ---- beacon_node/network/src/metrics.rs | 22 +++++-------------- 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs b/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs index cc61c6a15cb..2e8f462565f 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs @@ -234,10 +234,6 @@ impl PeerInfo { self.custody_subnets.contains(subnet) } - pub fn custody_subnets_iter(&self) -> impl Iterator { - self.custody_subnets.iter() - } - /// Returns true if the peer is connected to a long-lived subnet. 
pub fn has_long_lived_subnet(&self) -> bool { // Check the meta_data diff --git a/beacon_node/network/src/metrics.rs b/beacon_node/network/src/metrics.rs index 055302c79a2..154a59eade7 100644 --- a/beacon_node/network/src/metrics.rs +++ b/beacon_node/network/src/metrics.rs @@ -746,26 +746,16 @@ pub fn update_sync_metrics(network_globals: &Arc>) // count per sync status, the number of connected peers let mut peers_per_sync_type = FnvHashMap::default(); - let mut peers_per_column_subnet = FnvHashMap::default(); - - for (_, info) in network_globals.peers.read().connected_peers() { - let sync_type = info.sync_status().as_str(); + for sync_type in network_globals + .peers + .read() + .connected_peers() + .map(|(_peer_id, info)| info.sync_status().as_str()) + { *peers_per_sync_type.entry(sync_type).or_default() += 1; - - for subnet in info.custody_subnets_iter() { - *peers_per_column_subnet.entry(*subnet).or_default() += 1; - } } for (sync_type, peer_count) in peers_per_sync_type { set_gauge_entry(&PEERS_PER_SYNC_TYPE, &[sync_type], peer_count); } - - for (subnet, peer_count) in peers_per_column_subnet { - set_gauge_entry( - &PEERS_PER_COLUMN_SUBNET, - &[&format!("{subnet}")], - peer_count, - ); - } } From 4792275cc9b17ddd7426a60011b8d59325d543f3 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 6 Feb 2025 12:09:59 -0300 Subject: [PATCH 13/64] Update comment --- .../network/src/sync/backfill_sync/mod.rs | 2 +- .../network/src/sync/network_context.rs | 10 +++++----- .../network/src/sync/range_sync/chain.rs | 19 ++++++++----------- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 0c8cc5979ec..5474ab2bd28 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -858,7 +858,7 @@ impl BackFillSync { Err(e) => match e { 
RpcRequestSendError::NoPeer(no_peer) => { // If we are here the chain has no more synced peers - info!(self.log, "Backfill sync paused"; "reason" => ?no_peer); + info!(self.log, "Backfill sync paused"; "reason" => format!("insufficient_synced_peers({no_peer:?})")); self.set_state(BackFillState::Paused); return Err(BackFillError::Paused); } diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 94b0027a351..bd4de669fbb 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -420,7 +420,9 @@ impl SyncNetworkContext { .min() .map(|(_, _, _, peer)| *peer) else { - // TODO(das): is it safe to error here? + // Backfill and forward sync handle this condition gracefully. + // - Backfill sync: will pause waiting for more peers to join + // - Forward sync: can never happen as the chain is dropped when removing the last peer. return Err(RpcRequestSendError::NoPeer(NoPeerError::BlockPeer)); }; @@ -530,9 +532,7 @@ impl SyncNetworkContext { .min() .map(|(_, _, peer)| *peer) else { - // TODO(das): is it safe to error here? // TODO(das): this will be pretty bad UX. To improve we should: - // - Attempt to fetch custody requests first, before requesting blocks // - Handle the no peers case gracefully, maybe add some timeout and give a few // minutes / seconds to the peer manager to locate peers on this subnet before // abandoing progress on the chain completely. 
@@ -882,8 +882,8 @@ impl SyncNetworkContext { CustodyRequestError::NoPeer(column_index) => { RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer(column_index)) } - // - TooManyFailures: `request` has just been created, it's count of download_failures - // is 0 here + // - TooManyFailures: Should never happen, `request` has just been created, it's + // count of download_failures is 0 here // - BadState: Should never happen, a bad state can only happen when handling a // network response // - UnexpectedRequestId: Never happens: this Err is only constructed handling a diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 8cb0f071170..6db78828870 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -910,17 +910,14 @@ impl SyncingChain { return Ok(KeepChain); } Err(e) => match e { - RpcRequestSendError::NoPeer(no_peer) => { - // Not possible to reach this condition with NoPeer::BlockPeer. For this - // case the chain should have 0 peers and would be dropped already - debug!(self.log, "Error sending batch no peers"; "epoch" => batch_id, &batch, "no_peer" => ?no_peer); - // Set batch in stale state - // What to return here? - // Not necessary to return a `RemoveChain::EmptyPeerPool` here. If the - // chain actually has 0 peers it will be removed on the remove_peer call - return Ok(KeepChain); - } - RpcRequestSendError::InternalError(e) => { + // TODO(das): Handle the NoPeer case explicitly and don't drop the batch. For + // sync to work properly it must be okay to have "stalled" batches in + // AwaitingDownload state. Currently it will error with invalid state if + // that happens. Sync manager must periodicatlly prune stalled batches like + // we do for lookup sync. Then we can deprecate the redundant + // `good_peers_on_sampling_subnets` checks. 
+ e + @ (RpcRequestSendError::NoPeer(_) | RpcRequestSendError::InternalError(_)) => { // NOTE: under normal conditions this shouldn't happen but we handle it anyway warn!(self.log, "Could not send batch request"; "batch_id" => batch_id, "error" => ?e, &batch); From 45f55282ee5d88310a3f2c9e9acdc1f44000f5e5 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 17 Mar 2025 00:24:12 -0300 Subject: [PATCH 14/64] Pass peers_to_deprioritize to select_columns_by_range_peers_to_request --- beacon_node/network/src/sync/network_context.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index bd4de669fbb..354de827f1f 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -404,6 +404,7 @@ impl SyncNetworkContext { peers_to_deprioritize: &HashSet, ) -> Result { let active_request_count_by_peer = self.active_request_count_by_peer(); + let Some(block_peer) = peers .iter() .map(|peer| { @@ -430,7 +431,12 @@ impl SyncNetworkContext { let columns_by_range_peers_to_request = if matches!(batch_type, ByRangeRequestType::BlocksAndColumns) { let column_indexes = self.network_globals().sampling_columns.clone(); - Some(self.select_columns_by_range_peers_to_request(&column_indexes, peers)?) + Some(self.select_columns_by_range_peers_to_request( + &column_indexes, + peers, + active_request_count_by_peer, + peers_to_deprioritize, + )?) 
} else { None }; @@ -499,12 +505,11 @@ impl SyncNetworkContext { &self, custody_indexes: &HashSet, peers: &HashSet, + active_request_count_by_peer: HashMap, + peers_to_deprioritize: &HashSet, ) -> Result>, RpcRequestSendError> { let mut peer_id_to_request_map = HashMap::>::new(); - // Re-compute here to account for the block peer - let active_request_count_by_peer = self.active_request_count_by_peer(); - for column_index in custody_indexes { // Strictly consider peers that are custodials of this column AND are part of this // syncing chain. If the forward range sync chain has few peers, it's likely that this @@ -517,6 +522,8 @@ impl SyncNetworkContext { }) .map(|peer| { ( + // If contains -> 1 (order after), not contains -> 0 (order first) + peers_to_deprioritize.contains(peer), // Prefer peers with less overall requests // Also account for requests that are not yet issued tracked in peer_id_to_request_map active_request_count_by_peer.get(peer).copied().unwrap_or(0) @@ -530,7 +537,7 @@ impl SyncNetworkContext { ) }) .min() - .map(|(_, _, peer)| *peer) + .map(|(_, _, _, peer)| *peer) else { // TODO(das): this will be pretty bad UX. 
To improve we should: // - Handle the no peers case gracefully, maybe add some timeout and give a few From 7ec350bd04c4212d70cba9d90c3dcc8de2daa411 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 17 Mar 2025 00:28:38 -0300 Subject: [PATCH 15/64] more idiomatic --- .../network/src/sync/network_context.rs | 39 +++++++++---------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 354de827f1f..92347068a3f 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -462,27 +462,24 @@ impl SyncNetworkContext { None }; - let data_column_requests = - if let Some(columns_by_range_peers_to_request) = columns_by_range_peers_to_request { - Some( - columns_by_range_peers_to_request - .into_iter() - .map(|(peer_id, columns)| { - self.send_data_columns_by_range_request( - peer_id, - DataColumnsByRangeRequest { - start_slot: *request.start_slot(), - count: *request.count(), - columns, - }, - id, - ) - }) - .collect::, _>>()?, - ) - } else { - None - }; + let data_column_requests = columns_by_range_peers_to_request + .map(|columns_by_range_peers_to_request| { + columns_by_range_peers_to_request + .into_iter() + .map(|(peer_id, columns)| { + self.send_data_columns_by_range_request( + peer_id, + DataColumnsByRangeRequest { + start_slot: *request.start_slot(), + count: *request.count(), + columns, + }, + id, + ) + }) + .collect::, _>>() + }) + .transpose()?; let expected_blobs = blobs_req_id.is_some(); let info = RangeBlockComponentsRequest::new( From 8962808ae0c6f6c38707527d236435afeedd980a Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 17 Mar 2025 00:54:12 -0300 Subject: [PATCH 16/64] Idiomatic while --- beacon_node/network/src/sync/backfill_sync/mod.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 
deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 92febd2d209..0a1006bd8c4 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -1030,15 +1030,13 @@ impl BackFillSync { } // find the next pending batch and request it from the peer - loop { - if let Some(batch_id) = self.include_next_batch(network) { - // send the batch - self.send_batch(network, batch_id)?; - } else { - // No more batches, simply stop - return Ok(()); - } + while let Some(batch_id) = self.include_next_batch(network) { + // send the batch + self.send_batch(network, batch_id)?; } + + // No more batches, simply stop + Ok(()) } /// Creates the next required batch from the chain. If there are no more batches required, From e9247d2f78bc46bdef7f731a61dfd29ada1672d2 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 17 Mar 2025 00:57:12 -0300 Subject: [PATCH 17/64] Add note about infinite loop --- beacon_node/network/src/sync/backfill_sync/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 0a1006bd8c4..18a6f44051a 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -1030,6 +1030,10 @@ impl BackFillSync { } // find the next pending batch and request it from the peer + // Note: for this function to not infinite loop we must: + // - If `include_next_batch` returns Some we MUST increase the count of batches that are + // accounted in the `BACKFILL_BATCH_BUFFER_SIZE` limit in the `matches!` statement of + // that function. 
while let Some(batch_id) = self.include_next_batch(network) { // send the batch self.send_batch(network, batch_id)?; From 97e75efd9da2350566d870cbf02818440a92b7da Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sat, 22 Mar 2025 22:43:14 -0300 Subject: [PATCH 18/64] Use while let --- .../network/src/sync/range_sync/chain.rs | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 223f90050bc..6d93d7670d1 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -1007,15 +1007,18 @@ impl SyncingChain { return Ok(KeepChain); } - loop { - if let Some(batch_id) = self.include_next_batch(network) { - // send the batch - self.send_batch(network, batch_id)?; - } else { - // No more batches, simply stop - return Ok(KeepChain); - } + // find the next pending batch and request it from the peer + // Note: for this function to not infinite loop we must: + // - If `include_next_batch` returns Some we MUST increase the count of batches that are + // accounted in the `BACKFILL_BATCH_BUFFER_SIZE` limit in the `matches!` statement of + // that function. + while let Some(batch_id) = self.include_next_batch(network) { + // send the batch + self.send_batch(network, batch_id)?; } + + // No more batches, simply stop + Ok(KeepChain) } /// Checks all sampling column subnets for peers. 
Returns `true` if there is at least one peer in From 0af1bd63825d096f46e98b831242917a6770e99f Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 8 Apr 2025 02:51:32 -0300 Subject: [PATCH 19/64] Fix wrong custody column count for lookup blocks --- beacon_node/beacon_chain/src/block_verification.rs | 9 ++++++++- .../beacon_chain/src/block_verification_types.rs | 3 ++- beacon_node/beacon_chain/src/test_utils.rs | 4 ++-- beacon_node/http_api/src/publish_blocks.rs | 8 ++++++-- beacon_node/network/src/sync/block_lookups/common.rs | 10 ++-------- beacon_node/network/src/sync/block_sidecar_coupling.rs | 3 ++- beacon_node/network/src/sync/network_context.rs | 8 +++++++- beacon_node/network/src/sync/tests/range.rs | 3 ++- 8 files changed, 31 insertions(+), 17 deletions(-) diff --git a/beacon_node/beacon_chain/src/block_verification.rs b/beacon_node/beacon_chain/src/block_verification.rs index 39bad34cd6c..de035efd4a7 100644 --- a/beacon_node/beacon_chain/src/block_verification.rs +++ b/beacon_node/beacon_chain/src/block_verification.rs @@ -1272,9 +1272,16 @@ impl IntoExecutionPendingBlock for Arc RpcBlock { pub fn new_without_blobs( block_root: Option, block: Arc>, + custody_columns_count: usize, ) -> Self { let block_root = block_root.unwrap_or_else(|| get_block_root(&block)); @@ -110,7 +111,7 @@ impl RpcBlock { block_root, block: RpcBlockInner::Block(block), // Block has zero columns - custody_columns_count: 0, + custody_columns_count, } } diff --git a/beacon_node/beacon_chain/src/test_utils.rs b/beacon_node/beacon_chain/src/test_utils.rs index fe78d83c03c..c0a38d61430 100644 --- a/beacon_node/beacon_chain/src/test_utils.rs +++ b/beacon_node/beacon_chain/src/test_utils.rs @@ -2366,7 +2366,7 @@ where .blob_kzg_commitments() .is_ok_and(|c| !c.is_empty()); if !has_blobs { - return RpcBlock::new_without_blobs(Some(block_root), block); + return RpcBlock::new_without_blobs(Some(block_root), block, 0); } // Blobs are stored as data 
columns from Fulu (PeerDAS) @@ -2417,7 +2417,7 @@ where &self.spec, )? } else { - RpcBlock::new_without_blobs(Some(block_root), block) + RpcBlock::new_without_blobs(Some(block_root), block, sampling_column_count) } } else { let blobs = blob_items diff --git a/beacon_node/http_api/src/publish_blocks.rs b/beacon_node/http_api/src/publish_blocks.rs index a5cd94536d8..58a28e99ff1 100644 --- a/beacon_node/http_api/src/publish_blocks.rs +++ b/beacon_node/http_api/src/publish_blocks.rs @@ -2,7 +2,7 @@ use crate::metrics; use std::future::Future; use beacon_chain::blob_verification::{GossipBlobError, GossipVerifiedBlob}; -use beacon_chain::block_verification_types::AsBlock; +use beacon_chain::block_verification_types::{AsBlock, RpcBlock}; use beacon_chain::data_column_verification::{GossipDataColumnError, GossipVerifiedDataColumn}; use beacon_chain::validator_monitor::{get_block_delay_ms, timestamp_now}; use beacon_chain::{ @@ -302,7 +302,11 @@ pub async fn publish_block>( ); let import_result = Box::pin(chain.process_block( block_root, - block.clone(), + RpcBlock::new_without_blobs( + Some(block_root), + block.clone(), + network_globals.custody_columns_count() as usize, + ), NotifyExecutionLayer::Yes, BlockImportSource::HttpApi, publish_fn, diff --git a/beacon_node/network/src/sync/block_lookups/common.rs b/beacon_node/network/src/sync/block_lookups/common.rs index 8eefb2d6756..86b6894bac4 100644 --- a/beacon_node/network/src/sync/block_lookups/common.rs +++ b/beacon_node/network/src/sync/block_lookups/common.rs @@ -6,7 +6,6 @@ use crate::sync::block_lookups::{ }; use crate::sync::manager::BlockProcessType; use crate::sync::network_context::{LookupRequestResult, SyncNetworkContext}; -use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::BeaconChainTypes; use lighthouse_network::service::api_types::Id; use parking_lot::RwLock; @@ -97,13 +96,8 @@ impl RequestState for BlockRequestState { seen_timestamp, .. 
} = download_result; - cx.send_block_for_processing( - id, - block_root, - RpcBlock::new_without_blobs(Some(block_root), value), - seen_timestamp, - ) - .map_err(LookupRequestError::SendFailedProcessor) + cx.send_block_for_processing(id, block_root, value, seen_timestamp) + .map_err(LookupRequestError::SendFailedProcessor) } fn response_type() -> ResponseType { diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index ef9285c8dc8..99428b0c805 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -266,7 +266,8 @@ impl RangeBlockComponentsRequest { ) .map_err(|e| format!("{e:?}"))? } else { - RpcBlock::new_without_blobs(Some(block_root), block) + // Block has no data, expects zero columns + RpcBlock::new_without_blobs(Some(block_root), block, 0) }); } diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 69b350f8cba..2cb5ec9a0a6 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -1308,7 +1308,7 @@ impl SyncNetworkContext { &self, id: Id, block_root: Hash256, - block: RpcBlock, + block: Arc>, seen_timestamp: Duration, ) -> Result<(), SendErrorProcessor> { let span = span!( @@ -1322,6 +1322,12 @@ impl SyncNetworkContext { .beacon_processor_if_enabled() .ok_or(SendErrorProcessor::ProcessorNotAvailable)?; + let block = RpcBlock::new_without_blobs( + Some(block_root), + block, + self.network_globals().custody_columns_count() as usize, + ); + debug!(block = ?block_root, id, "Sending block for processing"); // Lookup sync event safety: If `beacon_processor.send_rpc_beacon_block` returns Ok() sync // must receive a single `SyncMessage::BlockComponentProcessed` with this process type diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 
2871ea2a4d2..932f485dd0d 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -459,7 +459,8 @@ fn build_rpc_block( ) .unwrap() } - None => RpcBlock::new_without_blobs(None, block), + // Block has no data, expects zero columns + None => RpcBlock::new_without_blobs(None, block, 0), } } From d6a68a63b52bcdacf7056e500827ef74b5b57264 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 8 Apr 2025 03:09:07 -0300 Subject: [PATCH 20/64] Remove impl --- .../beacon_chain/src/block_verification.rs | 41 ------------------- testing/ef_tests/src/cases/fork_choice.rs | 3 +- 2 files changed, 2 insertions(+), 42 deletions(-) diff --git a/beacon_node/beacon_chain/src/block_verification.rs b/beacon_node/beacon_chain/src/block_verification.rs index de035efd4a7..44424b8242e 100644 --- a/beacon_node/beacon_chain/src/block_verification.rs +++ b/beacon_node/beacon_chain/src/block_verification.rs @@ -1260,47 +1260,6 @@ impl IntoExecutionPendingBlock for SignatureVerifiedBloc } } -impl IntoExecutionPendingBlock for Arc> { - /// Verifies the `SignedBeaconBlock` by first transforming it into a `SignatureVerifiedBlock` - /// and then using that implementation of `IntoExecutionPendingBlock` to complete verification. - fn into_execution_pending_block_slashable( - self, - block_root: Hash256, - chain: &Arc>, - notify_execution_layer: NotifyExecutionLayer, - ) -> Result, BlockSlashInfo> { - // Perform an early check to prevent wasting time on irrelevant blocks. - let block_root = check_block_relevancy(&self, block_root, chain) - .map_err(|e| BlockSlashInfo::SignatureNotChecked(self.signed_block_header(), e))?; - // TODO(das): This is wrong, as the block may have columns. 
However this codepath is - // currently un-used, and should be removed with https://github.com/sigp/lighthouse/pull/7008 - let custody_column_count = 0; - let maybe_available = chain - .data_availability_checker - .verify_kzg_for_rpc_block(RpcBlock::new_without_blobs( - Some(block_root), - self.clone(), - custody_column_count, - )) - .map_err(|e| { - BlockSlashInfo::SignatureNotChecked( - self.signed_block_header(), - BlockError::AvailabilityCheck(e), - ) - })?; - SignatureVerifiedBlock::check_slashable(maybe_available, block_root, chain)? - .into_execution_pending_block_slashable(block_root, chain, notify_execution_layer) - } - - fn block(&self) -> &SignedBeaconBlock { - self - } - - fn block_cloned(&self) -> Arc> { - self.clone() - } -} - impl IntoExecutionPendingBlock for RpcBlock { /// Verifies the `SignedBeaconBlock` by first transforming it into a `SignatureVerifiedBlock` /// and then using that implementation of `IntoExecutionPendingBlock` to complete verification. diff --git a/testing/ef_tests/src/cases/fork_choice.rs b/testing/ef_tests/src/cases/fork_choice.rs index 43e96e3f1e2..b507383190f 100644 --- a/testing/ef_tests/src/cases/fork_choice.rs +++ b/testing/ef_tests/src/cases/fork_choice.rs @@ -3,6 +3,7 @@ use crate::decode::{ssz_decode_file, ssz_decode_file_with, ssz_decode_state, yam use ::fork_choice::{PayloadVerificationStatus, ProposerHeadError}; use beacon_chain::beacon_proposer_cache::compute_proposer_duties_from_head; use beacon_chain::blob_verification::GossipBlobError; +use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::chain_config::{ DisallowedReOrgOffsets, DEFAULT_RE_ORG_HEAD_THRESHOLD, DEFAULT_RE_ORG_MAX_EPOCHS_SINCE_FINALIZATION, DEFAULT_RE_ORG_PARENT_THRESHOLD, @@ -519,7 +520,7 @@ impl Tester { let result: Result, _> = self .block_on_dangerous(self.harness.chain.process_block( block_root, - block.clone(), + RpcBlock::new_without_blobs(Some(block_root), block.clone(), 0), NotifyExecutionLayer::Yes, 
BlockImportSource::Lookup, || Ok(()), From 3131892bc03264bbbd39e93a56cef49fbcb7d532 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Tue, 8 Apr 2025 16:14:06 +1000 Subject: [PATCH 21/64] Remove stale comment --- beacon_node/beacon_chain/src/block_verification_types.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/beacon_node/beacon_chain/src/block_verification_types.rs b/beacon_node/beacon_chain/src/block_verification_types.rs index 87f69d3a9e8..dab54dc823e 100644 --- a/beacon_node/beacon_chain/src/block_verification_types.rs +++ b/beacon_node/beacon_chain/src/block_verification_types.rs @@ -110,7 +110,6 @@ impl RpcBlock { Self { block_root, block: RpcBlockInner::Block(block), - // Block has zero columns custody_columns_count, } } From 017c6abb83f77661b8169edcb3faa7caeedf103b Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Tue, 8 Apr 2025 17:14:31 +1000 Subject: [PATCH 22/64] Fix build errors. --- .../beacon_chain/tests/block_verification.rs | 32 +++++++++++-------- .../tests/payload_invalidation.rs | 26 +++++++++++---- beacon_node/beacon_chain/tests/store_tests.rs | 12 ++++--- .../src/network_beacon_processor/tests.rs | 18 +++++++++-- 4 files changed, 61 insertions(+), 27 deletions(-) diff --git a/beacon_node/beacon_chain/tests/block_verification.rs b/beacon_node/beacon_chain/tests/block_verification.rs index 3dc46be16e0..471fda0ae7f 100644 --- a/beacon_node/beacon_chain/tests/block_verification.rs +++ b/beacon_node/beacon_chain/tests/block_verification.rs @@ -147,7 +147,7 @@ fn build_rpc_block( RpcBlock::new_with_custody_columns(None, block, columns.clone(), columns.len(), spec) .unwrap() } - None => RpcBlock::new_without_blobs(None, block), + None => RpcBlock::new_without_blobs(None, block, 0), } } @@ -367,7 +367,7 @@ async fn chain_segment_non_linear_parent_roots() { let (mut block, signature) = blocks[3].as_block().clone().deconstruct(); *block.parent_root_mut() = Hash256::zero(); - blocks[3] = RpcBlock::new_without_blobs( + blocks[3] = 
harness.build_rpc_block_from_store_blobs( None, Arc::new(SignedBeaconBlock::from_block(block, signature)), ); @@ -404,7 +404,7 @@ async fn chain_segment_non_linear_slots() { .collect(); let (mut block, signature) = blocks[3].as_block().clone().deconstruct(); *block.slot_mut() = Slot::new(0); - blocks[3] = RpcBlock::new_without_blobs( + blocks[3] = harness.build_rpc_block_from_store_blobs( None, Arc::new(SignedBeaconBlock::from_block(block, signature)), ); @@ -431,7 +431,7 @@ async fn chain_segment_non_linear_slots() { .collect(); let (mut block, signature) = blocks[3].as_block().clone().deconstruct(); *block.slot_mut() = blocks[2].slot(); - blocks[3] = RpcBlock::new_without_blobs( + blocks[3] = harness.build_rpc_block_from_store_blobs( None, Arc::new(SignedBeaconBlock::from_block(block, signature)), ); @@ -575,11 +575,12 @@ async fn invalid_signature_gossip_block() { .into_block_error() .expect("should import all blocks prior to the one being tested"); let signed_block = SignedBeaconBlock::from_block(block, junk_signature()); + let rpc_block = harness.build_rpc_block_from_store_blobs(None, Arc::new(signed_block)); let process_res = harness .chain .process_block( - signed_block.canonical_root(), - Arc::new(signed_block), + rpc_block.block_root(), + rpc_block, NotifyExecutionLayer::Yes, BlockImportSource::Lookup, || Ok(()), @@ -1541,12 +1542,13 @@ async fn add_base_block_to_altair_chain() { )); // Ensure that it would be impossible to import via `BeaconChain::process_block`. 
+ let base_rpc_block = RpcBlock::new_without_blobs(None, Arc::new(base_block.clone()), 0); assert!(matches!( harness .chain .process_block( - base_block.canonical_root(), - Arc::new(base_block.clone()), + base_rpc_block.block_root(), + base_rpc_block, NotifyExecutionLayer::Yes, BlockImportSource::Lookup, || Ok(()), @@ -1564,7 +1566,7 @@ async fn add_base_block_to_altair_chain() { harness .chain .process_chain_segment( - vec![RpcBlock::new_without_blobs(None, Arc::new(base_block))], + vec![RpcBlock::new_without_blobs(None, Arc::new(base_block), 0)], NotifyExecutionLayer::Yes, ) .await, @@ -1677,12 +1679,13 @@ async fn add_altair_block_to_base_chain() { )); // Ensure that it would be impossible to import via `BeaconChain::process_block`. + let altair_rpc_block = RpcBlock::new_without_blobs(None, Arc::new(altair_block.clone()), 0); assert!(matches!( harness .chain .process_block( - altair_block.canonical_root(), - Arc::new(altair_block.clone()), + altair_rpc_block.block_root(), + altair_rpc_block, NotifyExecutionLayer::Yes, BlockImportSource::Lookup, || Ok(()), @@ -1700,7 +1703,7 @@ async fn add_altair_block_to_base_chain() { harness .chain .process_chain_segment( - vec![RpcBlock::new_without_blobs(None, Arc::new(altair_block))], + vec![RpcBlock::new_without_blobs(None, Arc::new(altair_block), 0)], NotifyExecutionLayer::Yes ) .await, @@ -1761,11 +1764,12 @@ async fn import_duplicate_block_unrealized_justification() { // Create two verified variants of the block, representing the same block being processed in // parallel. 
let notify_execution_layer = NotifyExecutionLayer::Yes; - let verified_block1 = block + let rpc_block = harness.build_rpc_block_from_store_blobs(Some(block_root), block.clone()); + let verified_block1 = rpc_block .clone() .into_execution_pending_block(block_root, chain, notify_execution_layer) .unwrap(); - let verified_block2 = block + let verified_block2 = rpc_block .into_execution_pending_block(block_root, chain, notify_execution_layer) .unwrap(); diff --git a/beacon_node/beacon_chain/tests/payload_invalidation.rs b/beacon_node/beacon_chain/tests/payload_invalidation.rs index 4c4f0d8c6ad..cf9f43304c9 100644 --- a/beacon_node/beacon_chain/tests/payload_invalidation.rs +++ b/beacon_node/beacon_chain/tests/payload_invalidation.rs @@ -687,12 +687,15 @@ async fn invalidates_all_descendants() { assert_eq!(fork_parent_state.slot(), fork_parent_slot); let ((fork_block, _), _fork_post_state) = rig.harness.make_block(fork_parent_state, fork_slot).await; + let fork_rpc_block = rig + .harness + .build_rpc_block_from_store_blobs(None, fork_block.clone()); let fork_block_root = rig .harness .chain .process_block( - fork_block.canonical_root(), - fork_block, + fork_rpc_block.block_root(), + fork_rpc_block, NotifyExecutionLayer::Yes, BlockImportSource::Lookup, || Ok(()), @@ -788,12 +791,15 @@ async fn switches_heads() { let ((fork_block, _), _fork_post_state) = rig.harness.make_block(fork_parent_state, fork_slot).await; let fork_parent_root = fork_block.parent_root(); + let fork_rpc_block = rig + .harness + .build_rpc_block_from_store_blobs(None, fork_block.clone()); let fork_block_root = rig .harness .chain .process_block( - fork_block.canonical_root(), - fork_block, + fork_rpc_block.block_root(), + fork_rpc_block, NotifyExecutionLayer::Yes, BlockImportSource::Lookup, || Ok(()), @@ -1057,8 +1063,11 @@ async fn invalid_parent() { )); // Ensure the block built atop an invalid payload is invalid for import. 
+ let rpc_block = rig + .harness + .build_rpc_block_from_store_blobs(None, block.clone()); assert!(matches!( - rig.harness.chain.process_block(block.canonical_root(), block.clone(), NotifyExecutionLayer::Yes, BlockImportSource::Lookup, + rig.harness.chain.process_block(rpc_block.block_root(), rpc_block, NotifyExecutionLayer::Yes, BlockImportSource::Lookup, || Ok(()), ).await, Err(BlockError::ParentExecutionPayloadInvalid { parent_root: invalid_root }) @@ -1380,11 +1389,14 @@ async fn recover_from_invalid_head_by_importing_blocks() { } = InvalidHeadSetup::new().await; // Import the fork block, it should become the head. + let fork_rpc_block = rig + .harness + .build_rpc_block_from_store_blobs(None, fork_block.clone()); rig.harness .chain .process_block( - fork_block.canonical_root(), - fork_block.clone(), + fork_rpc_block.block_root(), + fork_rpc_block, NotifyExecutionLayer::Yes, BlockImportSource::Lookup, || Ok(()), diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index e41f547fb5f..c4e52376a96 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -2641,12 +2641,14 @@ async fn process_blocks_and_attestations_for_unaligned_checkpoint() { assert_eq!(split.block_root, valid_fork_block.parent_root()); assert_ne!(split.state_root, unadvanced_split_state_root); + let invalid_fork_rpc_block = + harness.build_rpc_block_from_store_blobs(None, invalid_fork_block.clone()); // Applying the invalid block should fail. let err = harness .chain .process_block( - invalid_fork_block.canonical_root(), - invalid_fork_block.clone(), + invalid_fork_rpc_block.block_root(), + invalid_fork_rpc_block, NotifyExecutionLayer::Yes, BlockImportSource::Lookup, || Ok(()), @@ -2656,11 +2658,13 @@ async fn process_blocks_and_attestations_for_unaligned_checkpoint() { assert!(matches!(err, BlockError::WouldRevertFinalizedSlot { .. 
})); // Applying the valid block should succeed, but it should not become head. + let valid_fork_rpc_block = + harness.build_rpc_block_from_store_blobs(None, valid_fork_block.clone()); harness .chain .process_block( - valid_fork_block.canonical_root(), - valid_fork_block.clone(), + valid_fork_rpc_block.block_root(), + valid_fork_rpc_block, NotifyExecutionLayer::Yes, BlockImportSource::Lookup, || Ok(()), diff --git a/beacon_node/network/src/network_beacon_processor/tests.rs b/beacon_node/network/src/network_beacon_processor/tests.rs index aa5f54ac1fb..5000941b0d1 100644 --- a/beacon_node/network/src/network_beacon_processor/tests.rs +++ b/beacon_node/network/src/network_beacon_processor/tests.rs @@ -323,12 +323,22 @@ impl TestRig { } } + pub fn custody_columns_count(&self) -> usize { + self.network_beacon_processor + .network_globals + .custody_columns_count() as usize + } + pub fn enqueue_rpc_block(&self) { let block_root = self.next_block.canonical_root(); self.network_beacon_processor .send_rpc_beacon_block( block_root, - RpcBlock::new_without_blobs(Some(block_root), self.next_block.clone()), + RpcBlock::new_without_blobs( + Some(block_root), + self.next_block.clone(), + self.custody_columns_count(), + ), std::time::Duration::default(), BlockProcessType::SingleBlock { id: 0 }, ) @@ -340,7 +350,11 @@ impl TestRig { self.network_beacon_processor .send_rpc_beacon_block( block_root, - RpcBlock::new_without_blobs(Some(block_root), self.next_block.clone()), + RpcBlock::new_without_blobs( + Some(block_root), + self.next_block.clone(), + self.custody_columns_count(), + ), std::time::Duration::default(), BlockProcessType::SingleBlock { id: 1 }, ) From cac7f3e92d6843f5e877651feda608a9613513e4 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 8 Apr 2025 14:30:56 -0300 Subject: [PATCH 23/64] Or default --- beacon_node/beacon_chain/src/test_utils.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git 
a/beacon_node/beacon_chain/src/test_utils.rs b/beacon_node/beacon_chain/src/test_utils.rs index c0a38d61430..dc3cba143cc 100644 --- a/beacon_node/beacon_chain/src/test_utils.rs +++ b/beacon_node/beacon_chain/src/test_utils.rs @@ -2371,7 +2371,11 @@ where // Blobs are stored as data columns from Fulu (PeerDAS) if self.spec.is_peer_das_enabled_for_epoch(block.epoch()) { - let columns = self.chain.get_data_columns(&block_root).unwrap().unwrap(); + let columns = self + .chain + .get_data_columns(&block_root) + .unwrap() + .or_default(); let custody_columns = columns .into_iter() .map(CustodyDataColumn::from_asserted_custody) From 5c87e37340a0899a97c343085341d7306e0f2ef4 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 8 Apr 2025 14:47:36 -0300 Subject: [PATCH 24/64] Review PR --- beacon_node/network/src/sync/backfill_sync/mod.rs | 2 -- beacon_node/network/src/sync/network_context.rs | 15 ++++++++------- .../network/src/sync/network_context/custody.rs | 9 ++++----- beacon_node/network/src/sync/range_sync/batch.rs | 2 +- beacon_node/network/src/sync/range_sync/chain.rs | 3 +-- 5 files changed, 14 insertions(+), 17 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 18a6f44051a..9a7c25e6af3 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -395,8 +395,6 @@ impl BackFillSync { // sending an error /timeout) if the peer is removed from the chain for other // reasons. Check that this block belongs to the expected peer, and that the // request_id matches - // TODO(das): removed peer_id matching as the node may request a different peer for data - // columns. 
if !batch.is_expecting_block(&request_id) { return Ok(ProcessResult::Successful); } diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 4271c5025a2..70f5f5a0f23 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -385,6 +385,8 @@ impl SyncNetworkContext { network_beacon_processor: _, chain: _, fork_context: _, + // Don't use a fallback match. We want to be sure that all requests are considered when + // adding new ones } = self; let mut active_request_count_by_peer = HashMap::::new(); @@ -517,7 +519,7 @@ impl SyncNetworkContext { active_request_count_by_peer: HashMap, peers_to_deprioritize: &HashSet, ) -> Result>, RpcRequestSendError> { - let mut peer_id_to_request_map = HashMap::>::new(); + let mut columns_to_request_by_peer = HashMap::>::new(); for column_index in custody_indexes { // Strictly consider peers that are custodials of this column AND are part of this @@ -535,11 +537,10 @@ impl SyncNetworkContext { peers_to_deprioritize.contains(peer), // Prefer peers with less overall requests // Also account for requests that are not yet issued tracked in peer_id_to_request_map + // We batch requests to the same peer, so count existence in the + // `columns_to_request_by_peer` as a single request.
active_request_count_by_peer.get(peer).copied().unwrap_or(0) - + peer_id_to_request_map - .get(peer) - .map(|columns| columns.len()) - .unwrap_or(0), + + columns_to_request_by_peer.get(peer).map(|_| 1).unwrap_or(0), // Random factor to break ties, otherwise the PeerID breaks ties rand::random::(), peer, @@ -557,13 +558,13 @@ impl SyncNetworkContext { ))); }; - peer_id_to_request_map + columns_to_request_by_peer .entry(custody_peer) .or_default() .push(*column_index); } - Ok(peer_id_to_request_map) + Ok(columns_to_request_by_peer) } /// Received a blocks by range or blobs by range response for a request that couples blocks ' diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody.rs index 4e14825fb38..c5183fa896a 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody.rs @@ -256,12 +256,11 @@ impl ActiveCustodyRequest { // De-prioritize peers that have failed to successfully respond to // requests recently self.failed_peers.contains(peer), - // Prefer peers with less requests to load balance across peers + // Prefer peers with less requests to load balance across peers. + // We batch requests to the same peer, so count existence in the + // `columns_to_request_by_peer` as a single request.
active_request_count_by_peer.get(peer).copied().unwrap_or(0) - + columns_to_request_by_peer - .get(peer) - .map(|columns| columns.len()) - .unwrap_or(0), + + columns_to_request_by_peer.get(peer).map(|_| 1).unwrap_or(0), // Random factor to break ties, otherwise the PeerID breaks ties rand::thread_rng().gen::(), *peer, diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 76900246a75..264f83ee820 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -297,7 +297,7 @@ impl BatchInfo { /// Mark the batch as failed and return whether we can attempt a re-download. /// /// This can happen if a peer disconnects or some error occurred that was not the peers fault. - /// THe `mark_failed` parameter, when set to false, does not increment the failed attempts of + /// The `peer` parameter, when set to None, does not increment the failed attempts of /// this batch and register the peer, rather attempts a re-download. #[must_use = "Batch may have failed"] pub fn download_failed( diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 6d93d7670d1..c800f22a7ed 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -255,7 +255,7 @@ impl SyncingChain { // A stream termination has been sent. This batch has ended. Process a completed batch. 
// Remove the request from the peer's active batches - // TODO(das): should use peer group here + // TODO(das): should use peer group here https://github.com/sigp/lighthouse/issues/6258 let received = batch.download_completed(blocks, *peer_id)?; let awaiting_batches = batch_id .saturating_sub(self.optimistic_start.unwrap_or(self.processing_target)) @@ -827,7 +827,6 @@ impl SyncingChain { peer_id: PeerId, ) -> ProcessingResult { self.peers.insert(peer_id); - // Attempt to request more batches regardless of peer status self.request_batches(network) } From 034186f8347f7f1f9ad77b5ffff8982b9ad512f2 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Fri, 21 Mar 2025 04:12:33 -0300 Subject: [PATCH 25/64] BatchPeerGroup --- .../beacon_chain/src/block_verification.rs | 30 +-- .../src/block_verification_types.rs | 62 ++--- .../src/data_availability_checker.rs | 31 ++- .../src/data_availability_checker/error.rs | 6 +- .../beacon_chain/src/historical_blocks.rs | 97 ++++++-- beacon_node/beacon_chain/src/test_utils.rs | 13 +- .../src/network_beacon_processor/mod.rs | 2 +- .../network_beacon_processor/sync_methods.rs | 232 +++++++++--------- .../network/src/sync/backfill_sync/mod.rs | 61 +++-- .../src/sync/block_sidecar_coupling.rs | 226 +++++++++-------- beacon_node/network/src/sync/manager.rs | 24 +- .../network/src/sync/network_context.rs | 42 ++-- .../network/src/sync/range_sync/batch.rs | 60 +++-- .../network/src/sync/range_sync/chain.rs | 49 ++-- .../network/src/sync/range_sync/mod.rs | 4 +- .../network/src/sync/range_sync/range.rs | 5 +- beacon_node/network/src/sync/tests/range.rs | 5 +- 17 files changed, 565 insertions(+), 384 deletions(-) diff --git a/beacon_node/beacon_chain/src/block_verification.rs b/beacon_node/beacon_chain/src/block_verification.rs index 074ae93a790..77fde912efd 100644 --- a/beacon_node/beacon_chain/src/block_verification.rs +++ b/beacon_node/beacon_chain/src/block_verification.rs @@ -632,6 +632,22 @@ pub fn 
signature_verify_chain_segment( &chain.spec, )?; + // verify signatures before matching blocks and data + { + let pubkey_cache = get_validator_pubkey_cache(chain)?; + let mut signature_verifier = get_signature_verifier(&state, &pubkey_cache, &chain.spec); + for (block_root, block) in &chain_segment { + let mut consensus_context = + ConsensusContext::new(block.slot()).set_current_block_root(*block_root); + signature_verifier.include_all_signatures(block.as_block(), &mut consensus_context)?; + } + if signature_verifier.verify().is_err() { + return Err(BlockError::InvalidSignature(InvalidSignature::Unknown)); + } + } + + // Should check correct proposer cheap for added protection if blocks and columns don't match + // unzip chain segment and verify kzg in bulk let (roots, blocks): (Vec<_>, Vec<_>) = chain_segment.into_iter().unzip(); let maybe_available_blocks = chain @@ -653,20 +669,6 @@ pub fn signature_verify_chain_segment( }) .collect::>(); - // verify signatures - let pubkey_cache = get_validator_pubkey_cache(chain)?; - let mut signature_verifier = get_signature_verifier(&state, &pubkey_cache, &chain.spec); - for svb in &mut signature_verified_blocks { - signature_verifier - .include_all_signatures(svb.block.as_block(), &mut svb.consensus_context)?; - } - - if signature_verifier.verify().is_err() { - return Err(BlockError::InvalidSignature(InvalidSignature::Unknown)); - } - - drop(pubkey_cache); - if let Some(signature_verified_block) = signature_verified_blocks.first_mut() { signature_verified_block.parent = Some(parent); } diff --git a/beacon_node/beacon_chain/src/block_verification_types.rs b/beacon_node/beacon_chain/src/block_verification_types.rs index dab54dc823e..d3af68d2c5d 100644 --- a/beacon_node/beacon_chain/src/block_verification_types.rs +++ b/beacon_node/beacon_chain/src/block_verification_types.rs @@ -9,8 +9,8 @@ use std::fmt::{Debug, Formatter}; use std::sync::Arc; use types::blob_sidecar::BlobIdentifier; use types::{ - BeaconBlockRef, 
BeaconState, BlindedPayload, BlobSidecarList, ChainSpec, Epoch, EthSpec, - Hash256, RuntimeVariableList, SignedBeaconBlock, SignedBeaconBlockHeader, Slot, + BeaconBlockRef, BeaconState, BlindedPayload, BlobSidecarList, ChainSpec, ColumnIndex, Epoch, + EthSpec, Hash256, RuntimeVariableList, SignedBeaconBlock, SignedBeaconBlockHeader, Slot, }; /// A block that has been received over RPC. It has 2 internal variants: @@ -53,7 +53,7 @@ impl RpcBlock { match &self.block { RpcBlockInner::Block(block) => block, RpcBlockInner::BlockAndBlobs(block, _) => block, - RpcBlockInner::BlockAndCustodyColumns(block, _) => block, + RpcBlockInner::BlockAndCustodyColumns(block, _, _) => block, } } @@ -61,7 +61,7 @@ impl RpcBlock { match &self.block { RpcBlockInner::Block(block) => block.clone(), RpcBlockInner::BlockAndBlobs(block, _) => block.clone(), - RpcBlockInner::BlockAndCustodyColumns(block, _) => block.clone(), + RpcBlockInner::BlockAndCustodyColumns(block, _, _) => block.clone(), } } @@ -69,7 +69,7 @@ impl RpcBlock { match &self.block { RpcBlockInner::Block(_) => None, RpcBlockInner::BlockAndBlobs(_, blobs) => Some(blobs), - RpcBlockInner::BlockAndCustodyColumns(_, _) => None, + RpcBlockInner::BlockAndCustodyColumns(_, _, _) => None, } } @@ -77,7 +77,7 @@ impl RpcBlock { match &self.block { RpcBlockInner::Block(_) => None, RpcBlockInner::BlockAndBlobs(_, _) => None, - RpcBlockInner::BlockAndCustodyColumns(_, data_columns) => Some(data_columns), + RpcBlockInner::BlockAndCustodyColumns(_, data_columns, _) => Some(data_columns), } } } @@ -95,7 +95,11 @@ enum RpcBlockInner { BlockAndBlobs(Arc>, BlobSidecarList), /// This variant is used with parent lookups and by-range responses. It should have all /// requested data columns, all block roots matching for this block. 
- BlockAndCustodyColumns(Arc>, CustodyDataColumnList), + BlockAndCustodyColumns( + Arc>, + CustodyDataColumnList, + Vec, + ), } impl RpcBlock { @@ -161,24 +165,17 @@ impl RpcBlock { block_root: Option, block: Arc>, custody_columns: Vec>, - custody_columns_count: usize, + expected_custody_indices: Vec, spec: &ChainSpec, ) -> Result { let block_root = block_root.unwrap_or_else(|| get_block_root(&block)); - if block.num_expected_blobs() > 0 && custody_columns.is_empty() { - // The number of required custody columns is out of scope here. - return Err(AvailabilityCheckError::MissingCustodyColumns); - } - // Treat empty data column lists as if they are missing. - let inner = if !custody_columns.is_empty() { - RpcBlockInner::BlockAndCustodyColumns( - block, - RuntimeVariableList::new(custody_columns, spec.number_of_columns as usize)?, - ) - } else { - RpcBlockInner::Block(block) - }; + let custody_columns_count = expected_custody_indices.len(); + let inner = RpcBlockInner::BlockAndCustodyColumns( + block, + RuntimeVariableList::new(custody_columns, spec.number_of_columns as usize)?, + expected_custody_indices, + ); Ok(Self { block_root, block: inner, @@ -193,27 +190,34 @@ impl RpcBlock { Hash256, Arc>, Option>, - Option>, + Option<(CustodyDataColumnList, Vec)>, ) { let block_root = self.block_root(); match self.block { RpcBlockInner::Block(block) => (block_root, block, None, None), RpcBlockInner::BlockAndBlobs(block, blobs) => (block_root, block, Some(blobs), None), - RpcBlockInner::BlockAndCustodyColumns(block, data_columns) => { - (block_root, block, None, Some(data_columns)) - } + RpcBlockInner::BlockAndCustodyColumns( + block, + data_columns, + expected_custody_indices, + ) => ( + block_root, + block, + None, + Some((data_columns, expected_custody_indices)), + ), } } pub fn n_blobs(&self) -> usize { match &self.block { - RpcBlockInner::Block(_) | RpcBlockInner::BlockAndCustodyColumns(_, _) => 0, + RpcBlockInner::Block(_) | RpcBlockInner::BlockAndCustodyColumns(_, _, 
_) => 0, RpcBlockInner::BlockAndBlobs(_, blobs) => blobs.len(), } } pub fn n_data_columns(&self) -> usize { match &self.block { RpcBlockInner::Block(_) | RpcBlockInner::BlockAndBlobs(_, _) => 0, - RpcBlockInner::BlockAndCustodyColumns(_, data_columns) => data_columns.len(), + RpcBlockInner::BlockAndCustodyColumns(_, data_columns, _) => data_columns.len(), } } } @@ -528,14 +532,14 @@ impl AsBlock for RpcBlock { match &self.block { RpcBlockInner::Block(block) => block, RpcBlockInner::BlockAndBlobs(block, _) => block, - RpcBlockInner::BlockAndCustodyColumns(block, _) => block, + RpcBlockInner::BlockAndCustodyColumns(block, _, _) => block, } } fn block_cloned(&self) -> Arc> { match &self.block { RpcBlockInner::Block(block) => block.clone(), RpcBlockInner::BlockAndBlobs(block, _) => block.clone(), - RpcBlockInner::BlockAndCustodyColumns(block, _) => block.clone(), + RpcBlockInner::BlockAndCustodyColumns(block, _, _) => block.clone(), } } fn canonical_root(&self) -> Hash256 { diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs index 033b472da0c..a10d050e9d5 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker.rs @@ -8,6 +8,7 @@ use crate::data_availability_checker::overflow_lru_cache::{ use crate::{metrics, BeaconChain, BeaconChainTypes, BeaconStore}; use kzg::Kzg; use slot_clock::SlotClock; +use std::collections::HashSet; use std::fmt; use std::fmt::Debug; use std::num::NonZeroUsize; @@ -17,8 +18,8 @@ use task_executor::TaskExecutor; use tracing::{debug, error, info_span, Instrument}; use types::blob_sidecar::{BlobIdentifier, BlobSidecar, FixedBlobSidecarList}; use types::{ - BlobSidecarList, ChainSpec, DataColumnIdentifier, DataColumnSidecar, DataColumnSidecarList, - Epoch, EthSpec, Hash256, RuntimeVariableList, SignedBeaconBlock, + BlobSidecarList, ChainSpec, ColumnIndex, DataColumnIdentifier, DataColumnSidecar, 
+ DataColumnSidecarList, Epoch, EthSpec, Hash256, RuntimeVariableList, SignedBeaconBlock, }; mod error; @@ -345,7 +346,7 @@ impl DataAvailabilityChecker { }; } if self.data_columns_required_for_block(&block) { - return if let Some(data_column_list) = data_columns.as_ref() { + return if let Some((data_column_list, _)) = data_columns.as_ref() { verify_kzg_for_data_column_list_with_scoring( data_column_list .iter() @@ -426,6 +427,7 @@ impl DataAvailabilityChecker { .map_err(AvailabilityCheckError::InvalidColumn)?; } + // TODO(das): we could do the matching first before spending CPU cycles on KZG verification for block in blocks { let custody_columns_count = block.custody_columns_count(); let (block_root, block, blobs, data_columns) = block.deconstruct(); @@ -447,7 +449,21 @@ impl DataAvailabilityChecker { } } } else if self.data_columns_required_for_block(&block) { - if let Some(data_columns) = data_columns { + if let Some((data_columns, expected_custody_indices)) = data_columns { + let received_indices = + HashSet::::from_iter(data_columns.iter().map(|d| d.index())); + + let missing_custody_columns = expected_custody_indices + .into_iter() + .filter(|index| !received_indices.contains(index)) + .collect::>(); + + if !missing_custody_columns.is_empty() { + return Err(AvailabilityCheckError::MissingCustodyColumns( + missing_custody_columns, + )); + } + MaybeAvailableBlock::Available(AvailableBlock { block_root, block, @@ -458,11 +474,8 @@ impl DataAvailabilityChecker { spec: self.spec.clone(), }) } else { - MaybeAvailableBlock::AvailabilityPending { - block_root, - block, - custody_columns_count, - } + // Note: strictly asserts blocks to be available instead of returning MaybeAvailableBlock + return Err(AvailabilityCheckError::MissingAllCustodyColumns); } } else { MaybeAvailableBlock::Available(AvailableBlock { diff --git a/beacon_node/beacon_chain/src/data_availability_checker/error.rs b/beacon_node/beacon_chain/src/data_availability_checker/error.rs index 
d091d6fefb5..d4a3b9afb27 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/error.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/error.rs @@ -13,7 +13,8 @@ pub enum Error { Unexpected(String), SszTypes(ssz_types::Error), MissingBlobs, - MissingCustodyColumns, + MissingCustodyColumns(Vec), + MissingAllCustodyColumns, BlobIndexInvalid(u64), DataColumnIndexInvalid(u64), StoreError(store::Error), @@ -37,7 +38,8 @@ impl Error { match self { Error::SszTypes(_) | Error::MissingBlobs - | Error::MissingCustodyColumns + | Error::MissingCustodyColumns(_) + | Error::MissingAllCustodyColumns | Error::StoreError(_) | Error::DecodeError(_) | Error::Unexpected(_) diff --git a/beacon_node/beacon_chain/src/historical_blocks.rs b/beacon_node/beacon_chain/src/historical_blocks.rs index 348e6d52a64..ce5317fc8be 100644 --- a/beacon_node/beacon_chain/src/historical_blocks.rs +++ b/beacon_node/beacon_chain/src/historical_blocks.rs @@ -1,4 +1,5 @@ -use crate::data_availability_checker::{AvailableBlock, AvailableBlockData}; +use crate::block_verification_types::{MaybeAvailableBlock, RpcBlock}; +use crate::data_availability_checker::{AvailabilityCheckError, AvailableBlockData}; use crate::{metrics, BeaconChain, BeaconChainTypes}; use itertools::Itertools; use state_processing::{ @@ -27,15 +28,17 @@ pub enum HistoricalBlockError { expected_block_root: Hash256, }, /// Bad signature, caller should retry with different blocks. - SignatureSet(SignatureSetError), - /// Bad signature, caller should retry with different blocks. - InvalidSignature, + InvalidSignature(String), + /// Unexpected error + Unexpected(String), /// Transitory error, caller should retry with the same blocks. ValidatorPubkeyCacheTimeout, /// Logic error: should never occur. 
IndexOutOfBounds, /// Internal store error StoreError(StoreError), + /// Faulty and internal AvailabilityCheckError + AvailabilityCheckError(AvailabilityCheckError), } impl From for HistoricalBlockError { @@ -44,7 +47,54 @@ impl From for HistoricalBlockError { } } +impl From for HistoricalBlockError { + fn from(err: SignatureSetError) -> Self { + match err { + // The encoding of the signature is invalid, peer fault + e @ SignatureSetError::SignatureInvalid(_) => Self::InvalidSignature(format!("{e:?}")), + // All these variants are internal errors or unreachable for historical block paths, + // which only check the proposer signature. + // BadBlsBytes = Unreachable + e @ (SignatureSetError::BeaconStateError(_) + | SignatureSetError::ValidatorUnknown(_) + | SignatureSetError::ValidatorPubkeyUnknown(_) + | SignatureSetError::IncorrectBlockProposer { .. } + | SignatureSetError::MismatchedPublicKeyLen { .. } + | SignatureSetError::PublicKeyDecompressionFailed + | SignatureSetError::BadBlsBytes { .. } + | SignatureSetError::InconsistentBlockFork(_)) => Self::Unexpected(format!("{e:?}")), + } + } +} + +impl From for HistoricalBlockError { + fn from(e: AvailabilityCheckError) -> Self { + Self::AvailabilityCheckError(e) + } +} + impl BeaconChain { + pub fn assert_correct_historical_block_chain( + &self, + blocks: &[RpcBlock], + ) -> Result<(), HistoricalBlockError> { + let anchor_info = self.store.get_anchor_info(); + let mut expected_block_root = anchor_info.oldest_block_parent; + + for block in blocks.iter().rev() { + if block.block_root() != expected_block_root { + return Err(HistoricalBlockError::MismatchedBlockRoot { + block_root: block.block_root(), + expected_block_root, + }); + } + + expected_block_root = block.as_block().message().parent_root(); + } + + Ok(()) + } + /// Store a batch of historical blocks in the database. /// /// The `blocks` should be given in slot-ascending order. 
One of the blocks should have a block @@ -65,8 +115,32 @@ impl BeaconChain { /// Return the number of blocks successfully imported. pub fn import_historical_block_batch( &self, - mut blocks: Vec>, + blocks: Vec>, ) -> Result { + // First check that the chain of blocks is correct + self.assert_correct_historical_block_chain(&blocks)?; + + // Check that all data columns are present <- faulty failure if missing because we have + // checked the block root is correct first. + let mut blocks = self + .data_availability_checker + .verify_kzg_for_rpc_blocks(blocks) + .and_then(|blocks| { + blocks + .into_iter() + // RpcBlocks must always be Available, otherwise a data peer is faulty or + // malicious. `verify_kzg_for_rpc_blocks` returns errors for those cases, but we + // haven't updated its function signature. This code block can be deleted in a later + // bigger refactor. + .map(|maybe_available| match maybe_available { + MaybeAvailableBlock::Available(block) => Ok(block), + MaybeAvailableBlock::AvailabilityPending { ..
} => Err( + AvailabilityCheckError::Unexpected("block not available".to_string()), + ), + }) + .collect::, _>>() + })?; + let anchor_info = self.store.get_anchor_info(); let blob_info = self.store.get_blob_info(); let data_column_info = self.store.get_data_column_info(); @@ -106,13 +180,6 @@ impl BeaconChain { for available_block in blocks_to_import.into_iter().rev() { let (block_root, block, block_data) = available_block.deconstruct(); - if block_root != expected_block_root { - return Err(HistoricalBlockError::MismatchedBlockRoot { - block_root, - expected_block_root, - }); - } - if !self.store.get_config().prune_payloads { // If prune-payloads is set to false, store the block which includes the execution payload self.store @@ -213,14 +280,16 @@ impl BeaconChain { ) }) .collect::, _>>() - .map_err(HistoricalBlockError::SignatureSet) .map(ParallelSignatureSets::from)?; drop(pubkey_cache); drop(setup_timer); + // TODO: Check that the proposer signature in the blobs and data columns is the same as the + // correct signature in the block. 
+ let verify_timer = metrics::start_timer(&metrics::BACKFILL_SIGNATURE_VERIFY_TIMES); if !signature_set.verify() { - return Err(HistoricalBlockError::InvalidSignature); + return Err(HistoricalBlockError::InvalidSignature("invalid".to_owned())); } drop(verify_timer); drop(sig_timer); diff --git a/beacon_node/beacon_chain/src/test_utils.rs b/beacon_node/beacon_chain/src/test_utils.rs index 759eec79d2b..858aaafcf07 100644 --- a/beacon_node/beacon_chain/src/test_utils.rs +++ b/beacon_node/beacon_chain/src/test_utils.rs @@ -2371,11 +2371,8 @@ where // Blobs are stored as data columns from Fulu (PeerDAS) if self.spec.is_peer_das_enabled_for_epoch(block.epoch()) { - let columns = self - .chain - .get_data_columns(&block_root) - .unwrap() - .or_default(); + let columns = self.chain.get_data_columns(&block_root).unwrap().unwrap(); + let expected_custody_indices = columns.iter().map(|d| d.index).collect::>(); let custody_columns = columns .into_iter() .map(CustodyDataColumn::from_asserted_custody) @@ -2384,7 +2381,7 @@ where Some(block_root), block, custody_columns, - self.get_sampling_column_count(), + expected_custody_indices, &self.spec, ) .unwrap() @@ -2413,11 +2410,13 @@ where .take(sampling_column_count) .map(CustodyDataColumn::from_asserted_custody) .collect::>(); + let expected_custody_indices = + columns.iter().map(|d| d.index()).collect::>(); RpcBlock::new_with_custody_columns( Some(block_root), block, columns, - sampling_column_count, + expected_custody_indices, &self.spec, )? 
} else { diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index 9a8edbfa4c4..53f73e4006e 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -33,7 +33,7 @@ use tokio::sync::mpsc::{self, error::TrySendError}; use tracing::{debug, error, trace, warn, Instrument}; use types::*; -pub use sync_methods::ChainSegmentProcessId; +pub use sync_methods::{ChainSegmentProcessId, PeerGroupAction}; use types::blob_sidecar::FixedBlobSidecarList; pub type Error = TrySendError>; diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 48ae26c8265..88723a48d1c 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -7,7 +7,6 @@ use crate::sync::{ }; use beacon_chain::block_verification_types::{AsBlock, RpcBlock}; use beacon_chain::data_availability_checker::AvailabilityCheckError; -use beacon_chain::data_availability_checker::MaybeAvailableBlock; use beacon_chain::data_column_verification::verify_kzg_for_data_column_list; use beacon_chain::{ validator_monitor::get_slot_delay_ms, AvailabilityProcessingStatus, BeaconChainTypes, @@ -18,6 +17,7 @@ use beacon_processor::{ AsyncFn, BlockingFn, DuplicateCache, }; use lighthouse_network::PeerAction; +use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; use store::KzgCommitment; @@ -25,7 +25,9 @@ use tokio::sync::mpsc; use tracing::{debug, error, info, warn}; use types::beacon_block_body::format_kzg_commitments; use types::blob_sidecar::FixedBlobSidecarList; -use types::{BlockImportSource, DataColumnSidecar, DataColumnSidecarList, Epoch, Hash256}; +use types::{ + BlockImportSource, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, Hash256, +}; /// Id associated to 
a batch processing request, either a sync batch or a parent lookup. #[derive(Clone, Debug, PartialEq)] @@ -37,11 +39,65 @@ pub enum ChainSegmentProcessId { } /// Returned when a chain segment import fails. -struct ChainSegmentFailed { +#[derive(Debug)] +pub struct ChainSegmentFailed { /// To be displayed in logs. - message: String, + pub message: String, /// Used to penalize peers. - peer_action: Option, + pub peer_action: Option, +} + +#[derive(Debug)] +pub struct PeerGroupAction { + pub block_peer: Option, + pub column_peer: HashMap, +} + +impl PeerGroupAction { + fn block_peer(action: PeerAction) -> Self { + Self { + block_peer: Some(action), + column_peer: <_>::default(), + } + } + + fn column_peers(columns: &[ColumnIndex], action: PeerAction) -> Self { + Self { + block_peer: None, + column_peer: HashMap::from_iter(columns.iter().map(|index| (*index, action))), + } + } + + fn from_availability_check_error(e: &AvailabilityCheckError) -> Option { + match e { + AvailabilityCheckError::InvalidBlobs(_) => { + Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)) + } + AvailabilityCheckError::InvalidColumn(errors) => Some(PeerGroupAction::column_peers( + &errors.iter().map(|(index, _)| *index).collect::>(), + PeerAction::LowToleranceError, + )), + AvailabilityCheckError::ReconstructColumnsError(_) => None, // internal error + AvailabilityCheckError::KzgCommitmentMismatch { .. } => None, // should never happen after checking inclusion proof + AvailabilityCheckError::Unexpected(_) => None, // internal + AvailabilityCheckError::SszTypes(_) => None, // ?? 
+ AvailabilityCheckError::MissingBlobs => None, // TODO(das) internal for now + AvailabilityCheckError::MissingCustodyColumns(columns) => Some( + PeerGroupAction::column_peers(columns, PeerAction::LowToleranceError), + ), + AvailabilityCheckError::MissingAllCustodyColumns => todo!(), + AvailabilityCheckError::BlobIndexInvalid(_) => { + Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)) + } + AvailabilityCheckError::DataColumnIndexInvalid(_) => None, // unreachable + AvailabilityCheckError::StoreError(_) => None, // unreachable + AvailabilityCheckError::DecodeError(_) => None, // ?? + AvailabilityCheckError::ParentStateMissing(_) => None, // ?? + AvailabilityCheckError::BlockReplayError(_) => None, // un-reachable ?? + AvailabilityCheckError::RebuildingStateCaches(_) => None, // ?? + AvailabilityCheckError::SlotClockError => None, // internal error + } + } } impl NetworkBeaconProcessor { @@ -476,7 +532,8 @@ impl NetworkBeaconProcessor { match e.peer_action { Some(penalty) => BatchProcessResult::FaultyFailure { imported_blocks, - penalty, + peer_action: penalty, + error: e.message, }, None => BatchProcessResult::NonFaultyFailure, } @@ -498,7 +555,7 @@ impl NetworkBeaconProcessor { .sum::(); match self.process_backfill_blocks(downloaded_blocks) { - (imported_blocks, Ok(_)) => { + Ok(imported_blocks) => { debug!( batch_epoch = %epoch, first_block_slot = start_slot, @@ -514,7 +571,7 @@ impl NetworkBeaconProcessor { imported_blocks, } } - (_, Err(e)) => { + Err(e) => { debug!( batch_epoch = %epoch, first_block_slot = start_slot, @@ -525,9 +582,10 @@ impl NetworkBeaconProcessor { "Backfill batch processing failed" ); match e.peer_action { - Some(penalty) => BatchProcessResult::FaultyFailure { + Some(peer_action) => BatchProcessResult::FaultyFailure { imported_blocks: 0, - penalty, + peer_action, + error: e.message, }, None => BatchProcessResult::NonFaultyFailure, } @@ -585,122 +643,53 @@ impl NetworkBeaconProcessor { fn process_backfill_blocks( &self, 
downloaded_blocks: Vec>, - ) -> (usize, Result<(), ChainSegmentFailed>) { - let total_blocks = downloaded_blocks.len(); - let available_blocks = match self - .chain - .data_availability_checker - .verify_kzg_for_rpc_blocks(downloaded_blocks) - { - Ok(blocks) => blocks - .into_iter() - .filter_map(|maybe_available| match maybe_available { - MaybeAvailableBlock::Available(block) => Some(block), - MaybeAvailableBlock::AvailabilityPending { .. } => None, - }) - .collect::>(), - Err(e) => match e { - AvailabilityCheckError::StoreError(_) => { - return ( - 0, - Err(ChainSegmentFailed { - peer_action: None, - message: "Failed to check block availability".into(), - }), - ); - } - e => { - return ( - 0, - Err(ChainSegmentFailed { - peer_action: Some(PeerAction::LowToleranceError), - message: format!("Failed to check block availability : {:?}", e), - }), - ) - } - }, - }; - - if available_blocks.len() != total_blocks { - return ( - 0, - Err(ChainSegmentFailed { - peer_action: Some(PeerAction::LowToleranceError), - message: format!( - "{} out of {} blocks were unavailable", - (total_blocks - available_blocks.len()), - total_blocks - ), - }), - ); - } - - match self.chain.import_historical_block_batch(available_blocks) { + ) -> Result { + match self.chain.import_historical_block_batch(downloaded_blocks) { Ok(imported_blocks) => { metrics::inc_counter( &metrics::BEACON_PROCESSOR_BACKFILL_CHAIN_SEGMENT_SUCCESS_TOTAL, ); - (imported_blocks, Ok(())) + Ok(imported_blocks) } Err(e) => { metrics::inc_counter( &metrics::BEACON_PROCESSOR_BACKFILL_CHAIN_SEGMENT_FAILED_TOTAL, ); let peer_action = match &e { - HistoricalBlockError::MismatchedBlockRoot { - block_root, - expected_block_root, - } => { - debug!( - error = "mismatched_block_root", - ?block_root, - expected_root = ?expected_block_root, - "Backfill batch processing error" - ); - // The peer is faulty if they send blocks with bad roots. 
- Some(PeerAction::LowToleranceError) + HistoricalBlockError::AvailabilityCheckError(e) => { + PeerGroupAction::from_availability_check_error(e) } - HistoricalBlockError::InvalidSignature - | HistoricalBlockError::SignatureSet(_) => { - warn!( - error = ?e, - "Backfill batch processing error" - ); - // The peer is faulty if they bad signatures. - Some(PeerAction::LowToleranceError) + HistoricalBlockError::MismatchedBlockRoot { .. } + | HistoricalBlockError::InvalidSignature(_) => { + // The peer is faulty if they send blocks with bad roots or invalid + // signatures + // TODO(das): check blobs and columns signatures separately + Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)) } - HistoricalBlockError::ValidatorPubkeyCacheTimeout => { - warn!( - error = "pubkey_cache_timeout", - "Backfill batch processing error" - ); + HistoricalBlockError::ValidatorPubkeyCacheTimeout + | HistoricalBlockError::IndexOutOfBounds + | HistoricalBlockError::StoreError(_) + | HistoricalBlockError::Unexpected(_) => { // This is an internal error, do not penalize the peer. None - } - HistoricalBlockError::IndexOutOfBounds => { - error!( - error = ?e, - "Backfill batch OOB error" - ); - // This should never occur, don't penalize the peer. - None - } - HistoricalBlockError::StoreError(e) => { - warn!(error = ?e, "Backfill batch processing error"); - // This is an internal error, don't penalize the peer. - None - } // - // Do not use a fallback match, handle all errors explicitly + } // Do not use a fallback match, handle all errors explicitly }; - let err_str: &'static str = e.into(); - ( - 0, - Err(ChainSegmentFailed { - message: format!("{:?}", err_str), - // This is an internal error, don't penalize the peer. 
- peer_action, - }), - ) + + if peer_action.is_some() { + // All errors that result in a peer penalty are "expected" external faults the + // node runner can't do anything about + debug!(?e, "Backfill batch processing error"); + } else { + // All others are some type of internal error worth surfacing? + warn!(?e, "Unexpected backfill batch processing error"); + } + + Err(ChainSegmentFailed { + // Render the full error in debug for full details + message: format!("{:?}", e), + // This is an internal error, don't penalize the peer. + peer_action, + }) } } } @@ -713,7 +702,7 @@ impl NetworkBeaconProcessor { Err(ChainSegmentFailed { message: format!("Block has an unknown parent: {}", parent_root), // Peers are faulty if they send non-sequential blocks. - peer_action: Some(PeerAction::LowToleranceError), + peer_action: Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)), }) } BlockError::DuplicateFullyImported(_) @@ -751,7 +740,7 @@ impl NetworkBeaconProcessor { block_slot, present_slot ), // Peers are faulty if they send blocks from the future. - peer_action: Some(PeerAction::LowToleranceError), + peer_action: Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)), }) } BlockError::WouldRevertFinalizedSlot { .. 
} => { @@ -767,7 +756,7 @@ impl NetworkBeaconProcessor { "Block with parent_root {} conflicts with our checkpoint state", block_parent_root ), - peer_action: Some(PeerAction::Fatal), + peer_action: Some(PeerGroupAction::block_peer(PeerAction::Fatal)), }) } BlockError::GenesisBlock => { @@ -787,6 +776,13 @@ impl NetworkBeaconProcessor { peer_action: None, }) } + BlockError::AvailabilityCheck(e) => { + let peer_group_action = PeerGroupAction::from_availability_check_error(&e); + Err(ChainSegmentFailed { + message: format!("Availability check error {:?}", e), + peer_action: peer_group_action, + }) + } ref err @ BlockError::ExecutionPayloadError(ref epe) => { if !epe.penalize_peer() { // These errors indicate an issue with the EL and not the `ChainSegment`. @@ -811,7 +807,9 @@ impl NetworkBeaconProcessor { "Peer sent a block containing invalid execution payload. Reason: {:?}", err ), - peer_action: Some(PeerAction::LowToleranceError), + peer_action: Some(PeerGroupAction::block_peer( + PeerAction::LowToleranceError, + )), }) } } @@ -826,7 +824,7 @@ impl NetworkBeaconProcessor { // We need to penalise harshly in case this represents an actual attack. In case // of a faulty EL it will usually require manual intervention to fix anyway, so // it's not too bad if we drop most of our peers. - peer_action: Some(PeerAction::LowToleranceError), + peer_action: Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)), }) } // Penalise peers for sending us banned blocks. 
@@ -834,7 +832,7 @@ impl NetworkBeaconProcessor { warn!(?block_root, "Received block known to be invalid",); Err(ChainSegmentFailed { message: format!("Banned block: {block_root:?}"), - peer_action: Some(PeerAction::Fatal), + peer_action: Some(PeerGroupAction::block_peer(PeerAction::Fatal)), }) } other => { diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 9a7c25e6af3..b59f7095d32 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -30,6 +30,8 @@ use std::sync::Arc; use tracing::{debug, error, info, instrument, warn}; use types::{Epoch, EthSpec}; +use super::range_sync::BatchPeerGroup; + /// Blocks are downloaded in batches from peers. This constant specifies how many epochs worth of /// blocks per batch are requested _at most_. A batch may request less blocks to account for /// already requested slots. There is a timeout for each batch request. If this value is too high, @@ -378,7 +380,7 @@ impl BackFillSync { &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, - peer_id: &PeerId, + peer: BatchPeerGroup, request_id: Id, blocks: Vec>, ) -> Result { @@ -399,7 +401,7 @@ impl BackFillSync { return Ok(ProcessResult::Successful); } - match batch.download_completed(blocks, *peer_id) { + match batch.download_completed(blocks, peer) { Ok(received) => { let awaiting_batches = self.processing_target.saturating_sub(batch_id) / BACKFILL_EPOCHS_PER_BATCH; @@ -573,7 +575,7 @@ impl BackFillSync { } }; - let Some(peer) = batch.processing_peer() else { + let Some(batch_peers) = batch.processing_peer() else { self.fail_sync(BackFillError::BatchInvalidState( batch_id, String::from("Peer does not exist"), @@ -585,8 +587,6 @@ impl BackFillSync { ?result, %batch, batch_epoch = %batch_id, - %peer, - client = %network.client_type(peer), "Backfill batch processed" ); @@ -628,8 +628,24 @@ impl BackFillSync { } 
BatchProcessResult::FaultyFailure { imported_blocks, - penalty, + peer_action, + error, } => { + // TODO: De-dup between back and forwards sync + if let Some(penalty) = peer_action.block_peer { + // Penalize the peer appropiately. + network.report_peer(batch_peers.block(), penalty, "faulty_batch"); + // TODO(das): downscore the right peer and display the client_type + // client = %network.client_type(peer), + } + for (column_index, penalty) in &peer_action.column_peer { + if let Some(peer) = batch_peers.column(*column_index) { + network.report_peer(peer, *penalty, "faulty_batch"); + } else { + warn!(%batch_id, column_index, "Missing peer in PeerGroup"); + } + } + match batch.processing_completed(BatchProcessingResult::FaultyFailure) { Err(e) => { // Batch was in the wrong state @@ -637,6 +653,11 @@ impl BackFillSync { .map(|_| ProcessResult::Successful) } Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { + // TODO(das): what peer action should we apply to the rest of + // peers? Say a batch repeatedly fails because a custody peer is not + // sending us its custody columns + let penalty = PeerAction::LowToleranceError; + // check that we have not exceeded the re-process retry counter // If a batch has exceeded the invalid batch lookup attempts limit, it means // that it is likely all peers are sending invalid batches @@ -645,13 +666,14 @@ impl BackFillSync { warn!( score_adjustment = %penalty, batch_epoch = %batch_id, + error, "Backfill batch failed to download. Penalizing peers" ); for peer in self.participating_peers.drain() { // TODO(das): `participating_peers` only includes block peers. Should we // penalize the custody column peers too? 
- network.report_peer(peer, *penalty, "backfill_batch_failed"); + network.report_peer(peer, penalty, "backfill_batch_failed"); } self.fail_sync(BackFillError::BatchProcessingFailed(batch_id)) .map(|_| ProcessResult::Successful) @@ -781,37 +803,38 @@ impl BackFillSync { // The validated batch has been re-processed if attempt.hash != processed_attempt.hash { // The re-downloaded version was different. - if processed_attempt.peer_id != attempt.peer_id { + // TODO(das): should penalize other peers? + let valid_attempt_peer = processed_attempt.peer_id.block(); + let bad_attempt_peer = attempt.peer_id.block(); + if valid_attempt_peer != bad_attempt_peer { // A different peer sent the correct batch, the previous peer did not // We negatively score the original peer. let action = PeerAction::LowToleranceError; debug!( - batch_epoch = ?id, - score_adjustment = %action, - original_peer = %attempt.peer_id, - new_peer = %processed_attempt.peer_id, + batch_epoch = %id, score_adjustment = %action, + original_peer = %bad_attempt_peer, new_peer = %valid_attempt_peer, "Re-processed batch validated. Scoring original peer" ); network.report_peer( - attempt.peer_id, + bad_attempt_peer, action, - "backfill_reprocessed_original_peer", + "batch_reprocessed_original_peer", ); } else { // The same peer corrected it's previous mistake. There was an error, so we // negative score the original peer. 
let action = PeerAction::MidToleranceError; debug!( - batch_epoch = ?id, + batch_epoch = %id, score_adjustment = %action, - original_peer = %attempt.peer_id, - new_peer = %processed_attempt.peer_id, + original_peer = %bad_attempt_peer, + new_peer = %valid_attempt_peer, "Re-processed batch validated by the same peer" ); network.report_peer( - attempt.peer_id, + bad_attempt_peer, action, - "backfill_reprocessed_same_peer", + "batch_reprocessed_same_peer", ); } } diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index 99428b0c805..d57bc894995 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -1,15 +1,23 @@ use beacon_chain::{ block_verification_types::RpcBlock, data_column_verification::CustodyDataColumn, get_block_root, }; -use lighthouse_network::service::api_types::{ - BlobsByRangeRequestId, BlocksByRangeRequestId, DataColumnsByRangeRequestId, +use lighthouse_network::{ + service::api_types::{ + BlobsByRangeRequestId, BlocksByRangeRequestId, DataColumnsByRangeRequestId, + }, + PeerId, +}; +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, }; -use std::{collections::HashMap, sync::Arc}; use types::{ BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, - Hash256, RuntimeVariableList, SignedBeaconBlock, + Hash256, RuntimeVariableList, SignedBeaconBlock, Slot, }; +use super::range_sync::BatchPeerGroup; + pub struct RangeBlockComponentsRequest { /// Blocks we have received awaiting for their corresponding sidecar. 
blocks_request: ByRangeRequest>>>, @@ -19,7 +27,7 @@ pub struct RangeBlockComponentsRequest { enum ByRangeRequest { Active(I), - Complete(T), + Complete(T, PeerId), } enum RangeBlockDataRequest { @@ -30,7 +38,7 @@ enum RangeBlockDataRequest { DataColumnsByRangeRequestId, ByRangeRequest>, >, - expected_custody_columns: Vec, + expected_column_to_peer: HashMap, }, } @@ -38,17 +46,20 @@ impl RangeBlockComponentsRequest { pub fn new( blocks_req_id: BlocksByRangeRequestId, blobs_req_id: Option, - data_columns: Option<(Vec, Vec)>, + data_columns: Option<( + Vec, + HashMap, + )>, ) -> Self { let block_data_request = if let Some(blobs_req_id) = blobs_req_id { RangeBlockDataRequest::Blobs(ByRangeRequest::Active(blobs_req_id)) - } else if let Some((requests, expected_custody_columns)) = data_columns { + } else if let Some((requests, expected_column_to_peer)) = data_columns { RangeBlockDataRequest::DataColumns { requests: requests .into_iter() .map(|id| (id, ByRangeRequest::Active(id))) .collect(), - expected_custody_columns, + expected_column_to_peer, } } else { RangeBlockDataRequest::NoData @@ -64,18 +75,20 @@ impl RangeBlockComponentsRequest { &mut self, req_id: BlocksByRangeRequestId, blocks: Vec>>, + peer_id: PeerId, ) -> Result<(), String> { - self.blocks_request.finish(req_id, blocks) + self.blocks_request.finish(req_id, blocks, peer_id) } pub fn add_blobs( &mut self, req_id: BlobsByRangeRequestId, blobs: Vec>>, + peer_id: PeerId, ) -> Result<(), String> { match &mut self.block_data_request { RangeBlockDataRequest::NoData => Err("received blobs but expected no data".to_owned()), - RangeBlockDataRequest::Blobs(ref mut req) => req.finish(req_id, blobs), + RangeBlockDataRequest::Blobs(ref mut req) => req.finish(req_id, blobs, peer_id), RangeBlockDataRequest::DataColumns { .. 
} => { Err("received blobs but expected data columns".to_owned()) } @@ -86,6 +99,7 @@ impl RangeBlockComponentsRequest { &mut self, req_id: DataColumnsByRangeRequestId, columns: Vec>>, + peer_id: PeerId, ) -> Result<(), String> { match &mut self.block_data_request { RangeBlockDataRequest::NoData => { @@ -100,48 +114,55 @@ impl RangeBlockComponentsRequest { let req = requests .get_mut(&req_id) .ok_or(format!("unknown data columns by range req_id {req_id}"))?; - req.finish(req_id, columns) + req.finish(req_id, columns, peer_id) } } } - pub fn responses(&self, spec: &ChainSpec) -> Option>, String>> { - let Some(blocks) = self.blocks_request.to_finished() else { + #[allow(clippy::type_complexity)] + pub fn responses( + &self, + spec: &ChainSpec, + ) -> Option>, BatchPeerGroup), String>> { + let Some((blocks, &block_peer)) = self.blocks_request.to_finished() else { return None; }; match &self.block_data_request { - RangeBlockDataRequest::NoData => { - Some(Self::responses_with_blobs(blocks.to_vec(), vec![], spec)) - } + RangeBlockDataRequest::NoData => Some( + Self::responses_with_blobs(blocks.to_vec(), vec![], spec) + .map(|blocks| (blocks, BatchPeerGroup::new(block_peer))), + ), RangeBlockDataRequest::Blobs(request) => { - let Some(blobs) = request.to_finished() else { + let Some((blobs, _blob_peer)) = request.to_finished() else { return None; }; - Some(Self::responses_with_blobs( - blocks.to_vec(), - blobs.to_vec(), - spec, - )) + Some( + Self::responses_with_blobs(blocks.to_vec(), blobs.to_vec(), spec) + .map(|blocks| (blocks, BatchPeerGroup::new(block_peer))), + ) } RangeBlockDataRequest::DataColumns { requests, - expected_custody_columns, + expected_column_to_peer, } => { let mut data_columns = vec![]; for req in requests.values() { - let Some(data) = req.to_finished() else { + let Some((data, _column_peer)) = req.to_finished() else { return None; }; data_columns.extend(data.clone()) } - Some(Self::responses_with_custody_columns( - blocks.to_vec(), - data_columns, 
- expected_custody_columns, - spec, - )) + Some( + Self::responses_with_custody_columns( + blocks.to_vec(), + data_columns, + expected_column_to_peer.clone(), + spec, + ) + .map(|blocks| (blocks, BatchPeerGroup::new(block_peer))), + ) } } } @@ -199,106 +220,98 @@ impl RangeBlockComponentsRequest { fn responses_with_custody_columns( blocks: Vec>>, data_columns: DataColumnSidecarList, - expects_custody_columns: &[ColumnIndex], + expected_custody_columns: HashMap, spec: &ChainSpec, ) -> Result>, String> { // Group data columns by block_root and index - let mut data_columns_by_block = - HashMap::>>>::new(); + let mut custody_columns_by_block = HashMap::>>::new(); + let mut block_roots_by_slot = HashMap::>::new(); + let expected_custody_indices = expected_custody_columns.keys().cloned().collect::>(); for column in data_columns { let block_root = column.block_root(); let index = column.index; - if data_columns_by_block - .entry(block_root) + + block_roots_by_slot + .entry(column.slot()) .or_default() - .insert(index, column) - .is_some() - { + .insert(block_root); + + // Sanity check before casting to `CustodyDataColumn`. 
But this should never happen + if !expected_custody_columns.contains_key(&index) { return Err(format!( - "Repeated column block_root {block_root:?} index {index}" + "Received column not in expected custody indices {index}" )); } + + custody_columns_by_block + .entry(block_root) + .or_default() + .push(CustodyDataColumn::from_asserted_custody(column)); } // Now iterate all blocks ensuring that the block roots of each block and data column match, // plus we have columns for our custody requirements - let mut rpc_blocks = Vec::with_capacity(blocks.len()); - - for block in blocks { - let block_root = get_block_root(&block); - rpc_blocks.push(if block.num_expected_blobs() > 0 { - let Some(mut data_columns_by_index) = data_columns_by_block.remove(&block_root) - else { - // This PR ignores the fix from https://github.com/sigp/lighthouse/pull/5675 - // which allows blobs to not match blocks. - // TODO(das): on the initial version of PeerDAS the beacon chain does not check - // rpc custody requirements and dropping this check can allow the block to have - // an inconsistent DB. - return Err(format!("No columns for block {block_root:?} with data")); - }; - - let mut custody_columns = vec![]; - for index in expects_custody_columns { - let Some(data_column) = data_columns_by_index.remove(index) else { - return Err(format!("No column for block {block_root:?} index {index}")); - }; - // Safe to convert to `CustodyDataColumn`: we have asserted that the index of - // this column is in the set of `expects_custody_columns` and with the expected - // block root, so for the expected epoch of this batch. 
- custody_columns.push(CustodyDataColumn::from_asserted_custody(data_column)); - } - - // Assert that there are no columns left - if !data_columns_by_index.is_empty() { - let remaining_indices = data_columns_by_index.keys().collect::>(); - return Err(format!( - "Not all columns consumed for block {block_root:?}: {remaining_indices:?}" - )); - } + let rpc_blocks = blocks + .into_iter() + .map(|block| { + let block_root = get_block_root(&block); + block_roots_by_slot + .entry(block.slot()) + .or_default() + .insert(block_root); + + let custody_columns = custody_columns_by_block + .remove(&block_root) + .unwrap_or_default(); RpcBlock::new_with_custody_columns( Some(block_root), block, custody_columns, - expects_custody_columns.len(), + expected_custody_indices.clone(), spec, ) - .map_err(|e| format!("{e:?}"))? - } else { - // Block has no data, expects zero columns - RpcBlock::new_without_blobs(Some(block_root), block, 0) - }); - } + .map_err(|e| format!("{e:?}")) + }) + .collect::, _>>()?; // Assert that there are no columns left for other blocks - if !data_columns_by_block.is_empty() { - let remaining_roots = data_columns_by_block.keys().collect::>(); + if !custody_columns_by_block.is_empty() { + let remaining_roots = custody_columns_by_block.keys().collect::>(); return Err(format!("Not all columns consumed: {remaining_roots:?}")); } + for (_slot, block_roots) in block_roots_by_slot { + if block_roots.len() > 1 { + // TODO: Some peer(s) are faulty or malicious. This batch will fail processing but + // we want to send it to the process to better attribute fault. Maybe warn log for + // now and track it in a metric? 
+ } + } + Ok(rpc_blocks) } } impl ByRangeRequest { - fn finish(&mut self, id: I, data: T) -> Result<(), String> { + fn finish(&mut self, id: I, data: T, peer_id: PeerId) -> Result<(), String> { match self { Self::Active(expected_id) => { if expected_id != &id { return Err(format!("unexpected req_id expected {expected_id} got {id}")); } - *self = Self::Complete(data); + *self = Self::Complete(data, peer_id); Ok(()) } - Self::Complete(_) => Err("request already complete".to_owned()), + Self::Complete(_, _) => Err("request already complete".to_owned()), } } - fn to_finished(&self) -> Option<&T> { + fn to_finished(&self) -> Option<(&T, &PeerId)> { match self { Self::Active(_) => None, - Self::Complete(data) => Some(data), + Self::Complete(data, peer_id) => Some((data, peer_id)), } } } @@ -309,12 +322,15 @@ mod tests { use beacon_chain::test_utils::{ generate_rand_block_and_blobs, generate_rand_block_and_data_columns, test_spec, NumBlobs, }; - use lighthouse_network::service::api_types::{ - BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, - DataColumnsByRangeRequestId, Id, RangeRequestId, + use lighthouse_network::{ + service::api_types::{ + BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, + DataColumnsByRangeRequestId, Id, RangeRequestId, + }, + PeerId, }; use rand::SeedableRng; - use std::sync::Arc; + use std::{collections::HashMap, sync::Arc}; use types::{test_utils::XorShiftRng, Epoch, ForkName, MinimalEthSpec as E, SignedBeaconBlock}; fn components_id() -> ComponentsByRangeRequestId { @@ -359,6 +375,7 @@ mod tests { #[test] fn no_blobs_into_responses() { let spec = test_spec::(); + let peer = PeerId::random(); let mut rng = XorShiftRng::from_seed([42; 16]); let blocks = (0..4) .map(|_| { @@ -372,7 +389,7 @@ mod tests { let mut info = RangeBlockComponentsRequest::::new(blocks_req_id, None, None); // Send blocks and complete terminate response - info.add_blocks(blocks_req_id, blocks).unwrap(); + 
info.add_blocks(blocks_req_id, blocks, peer).unwrap(); // Assert response is finished and RpcBlocks can be constructed info.responses(&test_spec::()).unwrap().unwrap(); @@ -381,6 +398,7 @@ mod tests { #[test] fn empty_blobs_into_responses() { let spec = test_spec::(); + let peer = PeerId::random(); let mut rng = XorShiftRng::from_seed([42; 16]); let blocks = (0..4) .map(|_| { @@ -403,9 +421,9 @@ mod tests { RangeBlockComponentsRequest::::new(blocks_req_id, Some(blobs_req_id), None); // Send blocks and complete terminate response - info.add_blocks(blocks_req_id, blocks).unwrap(); + info.add_blocks(blocks_req_id, blocks, peer).unwrap(); // Expect no blobs returned - info.add_blobs(blobs_req_id, vec![]).unwrap(); + info.add_blobs(blobs_req_id, vec![], peer).unwrap(); // Assert response is finished and RpcBlocks can be constructed, even if blobs weren't returned. // This makes sure we don't expect blobs here when they have expired. Checking this logic should @@ -416,7 +434,8 @@ mod tests { #[test] fn rpc_block_with_custody_columns() { let spec = test_spec::(); - let expects_custody_columns = vec![1, 2, 3, 4]; + let peer = PeerId::random(); + let expects_custody_columns = [1, 2, 3, 4]; let mut rng = XorShiftRng::from_seed([42; 16]); let blocks = (0..4) .map(|_| { @@ -436,15 +455,22 @@ mod tests { .enumerate() .map(|(i, _)| columns_id(i as Id, components_id)) .collect::>(); + + let column_to_peer = expects_custody_columns + .iter() + .map(|index| (*index, peer)) + .collect::>(); + let mut info = RangeBlockComponentsRequest::::new( blocks_req_id, None, - Some((columns_req_id.clone(), expects_custody_columns.clone())), + Some((columns_req_id.clone(), column_to_peer)), ); // Send blocks and complete terminate response info.add_blocks( blocks_req_id, blocks.iter().map(|b| b.0.clone().into()).collect(), + peer, ) .unwrap(); // Assert response is not finished @@ -458,6 +484,7 @@ mod tests { .iter() .flat_map(|b| b.1.iter().filter(|d| d.index == column_index).cloned()) 
.collect(), + peer, ) .unwrap(); @@ -476,12 +503,13 @@ mod tests { #[test] fn rpc_block_with_custody_columns_batched() { let spec = test_spec::(); + let peer = PeerId::random(); let batched_column_requests = [vec![1_u64, 2], vec![3, 4]]; let expects_custody_columns = batched_column_requests .iter() .flatten() - .cloned() - .collect::>(); + .map(|index| (*index, peer)) + .collect::>(); let custody_column_request_ids = (0..batched_column_requests.len() as u32).collect::>(); let num_of_data_column_requests = custody_column_request_ids.len(); @@ -516,6 +544,7 @@ mod tests { info.add_blocks( blocks_req_id, blocks.iter().map(|b| b.0.clone().into()).collect(), + peer, ) .unwrap(); // Assert response is not finished @@ -533,6 +562,7 @@ mod tests { .cloned() }) .collect::>(), + peer, ) .unwrap(); diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 9119b1652c7..70181cb5d1b 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -41,7 +41,9 @@ use super::network_context::{ use super::peer_sampling::{Sampling, SamplingConfig, SamplingResult}; use super::peer_sync_info::{remote_sync_type, PeerSyncType}; use super::range_sync::{RangeSync, RangeSyncType, EPOCHS_PER_BATCH}; -use crate::network_beacon_processor::{ChainSegmentProcessId, NetworkBeaconProcessor}; +use crate::network_beacon_processor::{ + ChainSegmentProcessId, NetworkBeaconProcessor, PeerGroupAction, +}; use crate::service::NetworkMessage; use crate::status::ToStatusMessage; use crate::sync::block_lookups::{ @@ -61,8 +63,8 @@ use lighthouse_network::service::api_types::{ SamplingId, SamplingRequester, SingleLookupReqId, SyncRequestId, }; use lighthouse_network::types::{NetworkGlobals, SyncState}; +use lighthouse_network::PeerId; use lighthouse_network::SyncInfo; -use lighthouse_network::{PeerAction, PeerId}; use logging::crit; use lru_cache::LRUTimeCache; use std::ops::Sub; @@ -215,7 +217,8 @@ pub enum BatchProcessResult { 
/// The batch processing failed. It carries whether the processing imported any block. FaultyFailure { imported_blocks: usize, - penalty: PeerAction, + peer_action: PeerGroupAction, + error: String, }, NonFaultyFailure, } @@ -1251,17 +1254,18 @@ impl SyncManager { peer_id: PeerId, range_block_component: RangeBlockComponent, ) { - if let Some(resp) = self - .network - .range_block_component_response(range_request_id, range_block_component) - { + if let Some(resp) = self.network.range_block_component_response( + range_request_id, + peer_id, + range_block_component, + ) { match resp { - Ok(blocks) => { + Ok((blocks, batch_peers)) => { match range_request_id.requester { RangeRequestId::RangeSync { chain_id, batch_id } => { self.range_sync.blocks_by_range_response( &mut self.network, - peer_id, + batch_peers, chain_id, batch_id, range_request_id.id, @@ -1273,7 +1277,7 @@ impl SyncManager { match self.backfill_sync.on_block_response( &mut self.network, batch_id, - &peer_id, + batch_peers, range_request_id.id, blocks, ) { diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 8e648a600f8..933b08418fc 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -5,7 +5,7 @@ use self::custody::{ActiveCustodyRequest, Error as CustodyRequestError}; pub use self::requests::{BlocksByRootSingleRequest, DataColumnsByRootSingleBlockRequest}; use super::block_sidecar_coupling::RangeBlockComponentsRequest; use super::manager::BlockProcessType; -use super::range_sync::ByRangeRequestType; +use super::range_sync::{BatchPeerGroup, ByRangeRequestType}; use super::SyncMessage; use crate::metrics; use crate::network_beacon_processor::NetworkBeaconProcessor; @@ -475,7 +475,12 @@ impl SyncNetworkContext { let data_column_requests = columns_by_range_peers_to_request .map(|columns_by_range_peers_to_request| { - columns_by_range_peers_to_request + let column_to_peer_map = 
columns_by_range_peers_to_request + .iter() + .flat_map(|(peer_id, columns)| columns.iter().map(|column| (*column, *peer_id))) + .collect::>(); + + let requests = columns_by_range_peers_to_request .into_iter() .map(|(peer_id, columns)| { self.send_data_columns_by_range_request( @@ -488,25 +493,14 @@ impl SyncNetworkContext { id, ) }) - .collect::, _>>() + .collect::, _>>()?; + + Ok((requests, column_to_peer_map)) }) .transpose()?; - let info = RangeBlockComponentsRequest::new( - blocks_req_id, - blobs_req_id, - data_column_requests.map(|data_column_requests| { - ( - data_column_requests, - self.network_globals() - .sampling_columns - .clone() - .iter() - .copied() - .collect(), - ) - }), - ); + let info = + RangeBlockComponentsRequest::new(blocks_req_id, blobs_req_id, data_column_requests); self.components_by_range_requests.insert(id, info); Ok(id.id) @@ -569,11 +563,13 @@ impl SyncNetworkContext { /// Received a blocks by range or blobs by range response for a request that couples blocks ' /// and blobs. 
+ #[allow(clippy::type_complexity)] pub fn range_block_component_response( &mut self, id: ComponentsByRangeRequestId, + peer_id: PeerId, range_block_component: RangeBlockComponent, - ) -> Option>, RpcResponseError>> { + ) -> Option>, BatchPeerGroup), RpcResponseError>> { let Entry::Occupied(mut entry) = self.components_by_range_requests.entry(id) else { metrics::inc_counter_vec(&metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, &["range_blocks"]); return None; @@ -584,18 +580,18 @@ impl SyncNetworkContext { match range_block_component { RangeBlockComponent::Block(req_id, resp) => resp.and_then(|(blocks, _)| { request - .add_blocks(req_id, blocks) + .add_blocks(req_id, blocks, peer_id) .map_err(RpcResponseError::BlockComponentCouplingError) }), RangeBlockComponent::Blob(req_id, resp) => resp.and_then(|(blobs, _)| { request - .add_blobs(req_id, blobs) + .add_blobs(req_id, blobs, peer_id) .map_err(RpcResponseError::BlockComponentCouplingError) }), RangeBlockComponent::CustodyColumns(req_id, resp) => { resp.and_then(|(custody_columns, _)| { request - .add_custody_columns(req_id, custody_columns) + .add_custody_columns(req_id, custody_columns, peer_id) .map_err(RpcResponseError::BlockComponentCouplingError) }) } @@ -1115,7 +1111,7 @@ impl SyncNetworkContext { ); let _enter = span.enter(); - debug!(%peer_id, %action, %msg, "Sync reporting peer"); + debug!(%peer_id, %action, %msg, client = %self.client_type(&peer_id), "Sync reporting peer"); self.network_send .send(NetworkMessage::ReportPeer { peer_id, diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 264f83ee820..c2c4ddf9238 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -8,7 +8,7 @@ use std::hash::{Hash, Hasher}; use std::ops::Sub; use std::time::{Duration, Instant}; use strum::Display; -use types::{Epoch, EthSpec, Slot}; +use types::{ColumnIndex, Epoch, EthSpec, Slot}; /// The number of 
times to retry a batch before it is considered failed. const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5; @@ -26,6 +26,25 @@ pub enum ByRangeRequestType { Blocks, } +#[derive(Clone, Debug)] +pub struct BatchPeerGroup { + block_peer: PeerId, +} + +impl BatchPeerGroup { + pub fn new(block_peer: PeerId) -> Self { + Self { block_peer } + } + + pub fn block(&self) -> PeerId { + self.block_peer + } + + pub fn column(&self, _index: ColumnIndex) -> Option { + todo!(); + } +} + /// Allows customisation of the above constants used in other sync methods such as BackFillSync. pub trait BatchConfig { /// The maximum batch download attempts. @@ -107,7 +126,7 @@ pub struct BatchInfo { /// Number of processing attempts that have failed but we do not count. non_faulty_processing_attempts: u8, /// The number of download retries this batch has undergone due to a failed request. - failed_download_attempts: Vec>, + failed_download_attempts: Vec, /// State of the batch. state: BatchState, /// Whether this batch contains all blocks or all blocks and blobs. @@ -134,7 +153,7 @@ pub enum BatchState { /// The batch is being downloaded. Downloading(Id), /// The batch has been completely downloaded and is ready for processing. - AwaitingProcessing(PeerId, Vec>, Instant), + AwaitingProcessing(BatchPeerGroup, Vec>, Instant), /// The batch is being processed. Processing(Attempt), /// The batch was successfully processed and is waiting to be validated. @@ -194,10 +213,10 @@ impl BatchInfo { ); for attempt in &self.failed_processing_attempts { - peers.insert(attempt.peer_id); + peers.insert(attempt.peer_id.block()); } - for peer in self.failed_download_attempts.iter().flatten() { + for peer in self.failed_download_attempts.iter() { peers.insert(*peer); } @@ -213,7 +232,7 @@ impl BatchInfo { } /// Returns the peer that is currently responsible for progressing the state of the batch. 
- pub fn processing_peer(&self) -> Option<&PeerId> { + pub fn processing_peer(&self) -> Option<&BatchPeerGroup> { match &self.state { BatchState::AwaitingDownload | BatchState::Failed | BatchState::Downloading(..) => None, BatchState::AwaitingProcessing(peer_id, _, _) @@ -275,10 +294,10 @@ impl BatchInfo { pub fn download_completed( &mut self, blocks: Vec>, - peer: PeerId, + peer: BatchPeerGroup, ) -> Result { match self.state.poison() { - BatchState::Downloading(_) => { + BatchState::Downloading(_request_id) => { let received = blocks.len(); self.state = BatchState::AwaitingProcessing(peer, blocks, Instant::now()); Ok(received) @@ -305,10 +324,11 @@ impl BatchInfo { peer: Option, ) -> Result { match self.state.poison() { - BatchState::Downloading(_) => { + BatchState::Downloading(_request_id) => { // register the attempt and check if the batch can be tried again - self.failed_download_attempts.push(peer); - + if let Some(peer) = peer { + self.failed_download_attempts.push(peer); + } self.state = if self.failed_download_attempts.len() >= B::max_batch_download_attempts() as usize { @@ -349,8 +369,8 @@ impl BatchInfo { pub fn start_processing(&mut self) -> Result<(Vec>, Duration), WrongState> { match self.state.poison() { - BatchState::AwaitingProcessing(peer, blocks, start_instant) => { - self.state = BatchState::Processing(Attempt::new::(peer, &blocks)); + BatchState::AwaitingProcessing(peers, blocks, start_instant) => { + self.state = BatchState::Processing(Attempt::new::(peers, &blocks)); Ok((blocks, start_instant.elapsed())) } BatchState::Poisoned => unreachable!("Poisoned batch"), @@ -441,16 +461,16 @@ impl BatchInfo { /// Represents a peer's attempt and providing the result for this batch. /// /// Invalid attempts will downscore a peer. -#[derive(PartialEq, Debug)] +#[derive(Debug)] pub struct Attempt { /// The peer that made the attempt. - pub peer_id: PeerId, + pub peer_id: BatchPeerGroup, /// The hash of the blocks of the attempt. 
pub hash: u64, } impl Attempt { - fn new(peer_id: PeerId, blocks: &[RpcBlock]) -> Self { + fn new(peer_id: BatchPeerGroup, blocks: &[RpcBlock]) -> Self { let hash = B::batch_attempt_hash(blocks); Attempt { peer_id, hash } } @@ -462,15 +482,15 @@ impl std::fmt::Debug for BatchState { BatchState::Processing(Attempt { ref peer_id, hash: _, - }) => write!(f, "Processing({})", peer_id), + }) => write!(f, "Processing({})", peer_id.block()), BatchState::AwaitingValidation(Attempt { ref peer_id, hash: _, - }) => write!(f, "AwaitingValidation({})", peer_id), + }) => write!(f, "AwaitingValidation({})", peer_id.block()), BatchState::AwaitingDownload => f.write_str("AwaitingDownload"), BatchState::Failed => f.write_str("Failed"), - BatchState::AwaitingProcessing(ref peer, ref blocks, _) => { - write!(f, "AwaitingProcessing({}, {} blocks)", peer, blocks.len()) + BatchState::AwaitingProcessing(_, ref blocks, _) => { + write!(f, "AwaitingProcessing({} blocks)", blocks.len()) } BatchState::Downloading(request_id) => { write!(f, "Downloading({})", request_id) diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index c800f22a7ed..68d27758c9a 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -1,4 +1,4 @@ -use super::batch::{BatchInfo, BatchProcessingResult, BatchState}; +use super::batch::{BatchInfo, BatchPeerGroup, BatchProcessingResult, BatchState}; use super::RangeSyncType; use crate::metrics; use crate::network_beacon_processor::ChainSegmentProcessId; @@ -227,7 +227,7 @@ impl SyncingChain { &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, - peer_id: &PeerId, + peer: BatchPeerGroup, request_id: Id, blocks: Vec>, ) -> ProcessingResult { @@ -256,7 +256,7 @@ impl SyncingChain { // Remove the request from the peer's active batches // TODO(das): should use peer group here https://github.com/sigp/lighthouse/issues/6258 - let received = 
batch.download_completed(blocks, *peer_id)?; + let received = batch.download_completed(blocks, peer)?; let awaiting_batches = batch_id .saturating_sub(self.optimistic_start.unwrap_or(self.processing_target)) / EPOCHS_PER_BATCH; @@ -458,7 +458,7 @@ impl SyncingChain { } }; - let peer = batch.processing_peer().cloned().ok_or_else(|| { + let batch_peers = batch.processing_peer().cloned().ok_or_else(|| { RemoveChain::WrongBatchState(format!( "Processing target is in wrong state: {:?}", batch.state(), @@ -469,7 +469,6 @@ impl SyncingChain { debug!( result = ?result, batch_epoch = %batch_id, - client = %network.client_type(&peer), batch_state = ?batch_state, ?batch, "Batch processing result" @@ -532,10 +531,22 @@ impl SyncingChain { } BatchProcessResult::FaultyFailure { imported_blocks, - penalty, + peer_action, + // TODO: propagate error in logs + error: _, } => { - // Penalize the peer appropiately. - network.report_peer(peer, *penalty, "faulty_batch"); + // TODO: De-dup between back and forwards sync + if let Some(penalty) = peer_action.block_peer { + // Penalize the peer appropiately. + network.report_peer(batch_peers.block(), penalty, "faulty_batch"); + } + for (column_index, penalty) in &peer_action.column_peer { + if let Some(peer) = batch_peers.column(*column_index) { + network.report_peer(peer, *penalty, "faulty_batch"); + } else { + warn!(%batch_id, column_index, "Missing peer in PeerGroup"); + } + } // Check if this batch is allowed to continue match batch.processing_completed(BatchProcessingResult::FaultyFailure)? { @@ -551,6 +562,11 @@ impl SyncingChain { self.handle_invalid_batch(network, batch_id) } BatchOperationOutcome::Failed { blacklist } => { + // TODO(das): what peer action should we apply to the rest of + // peers? 
Say a batch repeatedly fails because a custody peer is not + // sending us its custody columns + let penalty = PeerAction::LowToleranceError; + // Check that we have not exceeded the re-process retry counter, // If a batch has exceeded the invalid batch lookup attempts limit, it means // that it is likely all peers in this chain are are sending invalid batches @@ -565,7 +581,7 @@ impl SyncingChain { ); for peer in self.peers.drain() { - network.report_peer(peer, *penalty, "faulty_chain"); + network.report_peer(peer, penalty, "faulty_chain"); } Err(RemoveChain::ChainFailed { blacklist, @@ -644,17 +660,20 @@ impl SyncingChain { // The validated batch has been re-processed if attempt.hash != processed_attempt.hash { // The re-downloaded version was different - if processed_attempt.peer_id != attempt.peer_id { + // TODO(das): should penalize other peers? + let valid_attempt_peer = processed_attempt.peer_id.block(); + let bad_attempt_peer = attempt.peer_id.block(); + if valid_attempt_peer != bad_attempt_peer { // A different peer sent the correct batch, the previous peer did not // We negatively score the original peer. let action = PeerAction::LowToleranceError; debug!( batch_epoch = %id, score_adjustment = %action, - original_peer = %attempt.peer_id, new_peer = %processed_attempt.peer_id, + original_peer = %bad_attempt_peer, new_peer = %valid_attempt_peer, "Re-processed batch validated. 
Scoring original peer" ); network.report_peer( - attempt.peer_id, + bad_attempt_peer, action, "batch_reprocessed_original_peer", ); @@ -665,12 +684,12 @@ impl SyncingChain { debug!( batch_epoch = %id, score_adjustment = %action, - original_peer = %attempt.peer_id, - new_peer = %processed_attempt.peer_id, + original_peer = %bad_attempt_peer, + new_peer = %valid_attempt_peer, "Re-processed batch validated by the same peer" ); network.report_peer( - attempt.peer_id, + bad_attempt_peer, action, "batch_reprocessed_same_peer", ); diff --git a/beacon_node/network/src/sync/range_sync/mod.rs b/beacon_node/network/src/sync/range_sync/mod.rs index 8f881fba90f..f57c1497180 100644 --- a/beacon_node/network/src/sync/range_sync/mod.rs +++ b/beacon_node/network/src/sync/range_sync/mod.rs @@ -8,8 +8,8 @@ mod range; mod sync_type; pub use batch::{ - BatchConfig, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, - ByRangeRequestType, + BatchConfig, BatchInfo, BatchOperationOutcome, BatchPeerGroup, BatchProcessingResult, + BatchState, ByRangeRequestType, }; pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; #[cfg(test)] diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index f77bbde5031..919a321f380 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs @@ -42,6 +42,7 @@ use super::chain::{BatchId, ChainId, RemoveChain, SyncingChain}; use super::chain_collection::{ChainCollection, SyncChainStatus}; use super::sync_type::RangeSyncType; +use super::BatchPeerGroup; use crate::metrics; use crate::status::ToStatusMessage; use crate::sync::network_context::{RpcResponseError, SyncNetworkContext}; @@ -227,7 +228,7 @@ where pub fn blocks_by_range_response( &mut self, network: &mut SyncNetworkContext, - peer_id: PeerId, + peer_id: BatchPeerGroup, chain_id: ChainId, batch_id: BatchId, request_id: Id, @@ -235,7 +236,7 @@ where ) { // check if this chunk 
removes the chain match self.chains.call_by_id(chain_id, |chain| { - chain.on_block_response(network, batch_id, &peer_id, request_id, blocks) + chain.on_block_response(network, batch_id, peer_id, request_id, blocks) }) { Ok((removed_chain, sync_type)) => { if let Some((removed_chain, remove_reason)) = removed_chain { diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 932f485dd0d..06dca355e53 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -449,12 +449,13 @@ fn build_rpc_block( RpcBlock::new(None, block, Some(blobs.clone())).unwrap() } Some(DataSidecars::DataColumns(columns)) => { + // TODO(das): Assumes CGC = max value. Change if we want to do more complex tests + let expected_custody_indices = columns.iter().map(|d| d.index()).collect::>(); RpcBlock::new_with_custody_columns( None, block, columns.clone(), - // TODO(das): Assumes CGC = max value. Change if we want to do more complex tests - columns.len(), + expected_custody_indices, spec, ) .unwrap() From ecc894a7efc5803d17523947162babafb321f0a3 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sat, 12 Apr 2025 10:51:26 -0300 Subject: [PATCH 26/64] Match block and blob signatures --- .../beacon_chain/src/block_verification.rs | 35 ++++++++++++++----- .../src/block_verification_types.rs | 30 ++++++++++++++++ .../beacon_chain/src/historical_blocks.rs | 29 +++++++++++++-- .../gossip_methods.rs | 7 +++- .../network_beacon_processor/sync_methods.rs | 7 ++++ 5 files changed, 96 insertions(+), 12 deletions(-) diff --git a/beacon_node/beacon_chain/src/block_verification.rs b/beacon_node/beacon_chain/src/block_verification.rs index 77fde912efd..2ae18c9bdd8 100644 --- a/beacon_node/beacon_chain/src/block_verification.rs +++ b/beacon_node/beacon_chain/src/block_verification.rs @@ -94,6 +94,7 @@ use store::{Error as DBError, HotStateSummary, KeyValueStore, StoreOp}; 
use strum::AsRefStr; use task_executor::JoinHandle; use tracing::{debug, error}; +use types::ColumnIndex; use types::{ data_column_sidecar::DataColumnSidecarError, BeaconBlockRef, BeaconState, BeaconStateError, BlobsList, ChainSpec, DataColumnSidecarList, Epoch, EthSpec, ExecutionBlockHash, FullPayload, @@ -220,6 +221,10 @@ pub enum BlockError { /// /// The block is invalid and the peer is faulty. InvalidSignature(InvalidSignature), + /// One or more signatures in a BlobSidecar of an RpcBlock are invalid + InvalidBlobsSignature(Vec), + /// One or more signatures in a DataColumnSidecar of an RpcBlock are invalid + InvalidDataColumnsSignature(Vec), /// The provided block is not from a later slot than its parent. /// /// ## Peer scoring @@ -633,16 +638,28 @@ pub fn signature_verify_chain_segment( )?; // verify signatures before matching blocks and data - { - let pubkey_cache = get_validator_pubkey_cache(chain)?; - let mut signature_verifier = get_signature_verifier(&state, &pubkey_cache, &chain.spec); - for (block_root, block) in &chain_segment { - let mut consensus_context = - ConsensusContext::new(block.slot()).set_current_block_root(*block_root); - signature_verifier.include_all_signatures(block.as_block(), &mut consensus_context)?; + let pubkey_cache = get_validator_pubkey_cache(chain)?; + let mut signature_verifier = get_signature_verifier(&state, &pubkey_cache, &chain.spec); + for (block_root, block) in &chain_segment { + let mut consensus_context = + ConsensusContext::new(block.slot()).set_current_block_root(*block_root); + signature_verifier.include_all_signatures(block.as_block(), &mut consensus_context)?; + } + if signature_verifier.verify().is_err() { + return Err(BlockError::InvalidSignature(InvalidSignature::Unknown)); + } + + // Verify that blobs or data columns signatures match + for (_, block) in &chain_segment { + if let Some(indices) = block.non_matching_blobs_signed_headers() { + if !indices.is_empty() { + return 
Err(BlockError::InvalidBlobsSignature(indices)); + } } - if signature_verifier.verify().is_err() { - return Err(BlockError::InvalidSignature(InvalidSignature::Unknown)); + if let Some(indices) = block.non_matching_custody_columns_signed_headers() { + if !indices.is_empty() { + return Err(BlockError::InvalidDataColumnsSignature(indices)); + } } } diff --git a/beacon_node/beacon_chain/src/block_verification_types.rs b/beacon_node/beacon_chain/src/block_verification_types.rs index d3af68d2c5d..b870f1f8027 100644 --- a/beacon_node/beacon_chain/src/block_verification_types.rs +++ b/beacon_node/beacon_chain/src/block_verification_types.rs @@ -80,6 +80,36 @@ impl RpcBlock { RpcBlockInner::BlockAndCustodyColumns(_, data_columns, _) => Some(data_columns), } } + + pub fn non_matching_blobs_signed_headers(&self) -> Option> { + match &self.block { + RpcBlockInner::Block(_) => None, + RpcBlockInner::BlockAndBlobs(block, blobs) => Some( + blobs + .iter() + .filter(|blob| &blob.signed_block_header.signature != block.signature()) + .map(|blob| blob.index) + .collect(), + ), + RpcBlockInner::BlockAndCustodyColumns(..) => None, + } + } + + pub fn non_matching_custody_columns_signed_headers(&self) -> Option> { + match &self.block { + RpcBlockInner::Block(_) => None, + RpcBlockInner::BlockAndBlobs(..) 
=> None, + RpcBlockInner::BlockAndCustodyColumns(block, data_columns, _) => Some( + data_columns + .iter() + .filter(|column| { + &column.as_data_column().signed_block_header.signature != block.signature() + }) + .map(|column| column.index()) + .collect(), + ), + } + } } /// Note: This variant is intentionally private because we want to safely construct the diff --git a/beacon_node/beacon_chain/src/historical_blocks.rs b/beacon_node/beacon_chain/src/historical_blocks.rs index ce5317fc8be..9ea4c76ca60 100644 --- a/beacon_node/beacon_chain/src/historical_blocks.rs +++ b/beacon_node/beacon_chain/src/historical_blocks.rs @@ -13,7 +13,7 @@ use store::metadata::DataColumnInfo; use store::{AnchorInfo, BlobInfo, DBColumn, Error as StoreError, KeyValueStore, KeyValueStoreOp}; use strum::IntoStaticStr; use tracing::debug; -use types::{FixedBytesExtended, Hash256, Slot}; +use types::{ColumnIndex, FixedBytesExtended, Hash256, Slot}; /// Use a longer timeout on the pubkey cache. /// @@ -29,6 +29,10 @@ pub enum HistoricalBlockError { }, /// Bad signature, caller should retry with different blocks. InvalidSignature(String), + /// One or more signatures in a BlobSidecar of an RpcBlock are invalid + InvalidBlobsSignature(Vec), + /// One or more signatures in a DataColumnSidecar of an RpcBlock are invalid + InvalidDataColumnsSignature(Vec), /// Unexpected error Unexpected(String), /// Transitory error, caller should retry with the same blocks. @@ -120,6 +124,26 @@ impl BeaconChain { // First check that chain of blocks is correct self.assert_correct_historical_block_chain(&blocks)?; + // Verify that blobs or data columns signatures match + // Why are we computing the DB ops before verifying the signatures? ¯\_(ツ)_/¯ We have to + // wait to maybe return the invalid block signature error. 
+ let matching_sidecar_signatures_error = blocks + .iter() + .map(|block| { + if let Some(indices) = block.non_matching_blobs_signed_headers() { + if !indices.is_empty() { + return Err(HistoricalBlockError::InvalidBlobsSignature(indices)); + } + } + if let Some(indices) = block.non_matching_custody_columns_signed_headers() { + if !indices.is_empty() { + return Err(HistoricalBlockError::InvalidDataColumnsSignature(indices)); + } + } + Ok(()) + }) + .collect::, _>>(); + // Check that all data columns are present <- faulty failure if missing because we have // checked the block root is correct first. let mut blocks = self @@ -284,8 +308,9 @@ impl BeaconChain { drop(pubkey_cache); drop(setup_timer); - // TODO: Check that the proposer signature in the blobs and data columns is the same as the + // Check that the proposer signature in the blobs and data columns is the same as the // correct signature in the block. + matching_sidecar_signatures_error?; let verify_timer = metrics::start_timer(&metrics::BACKFILL_SIGNATURE_VERIFY_TIMES); if !signature_set.verify() { diff --git a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs index d61ea583772..b7e95947d21 100644 --- a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs @@ -1425,7 +1425,12 @@ impl NetworkBeaconProcessor { return None; } // BlobNotRequired is unreachable. Only constructed in `process_gossip_blob` - Err(e @ BlockError::InternalError(_)) | Err(e @ BlockError::BlobNotRequired(_)) => { + // InvalidBlobsSignature is unreachable. Only constructed in `process_chain_segment` + // InvalidDataColumnsSignature is unreachable. 
Only constructed in `process_chain_segment` + Err(e @ BlockError::InternalError(_)) + | Err(e @ BlockError::BlobNotRequired(_)) + | Err(e @ BlockError::InvalidBlobsSignature(_)) + | Err(e @ BlockError::InvalidDataColumnsSignature(_)) => { error!(error = %e, "Internal block gossip validation error"); return None; } diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 88723a48d1c..47379f38264 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -666,6 +666,13 @@ impl NetworkBeaconProcessor { // TODO(das): check blobs and columns signatures separately Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)) } + // Blobs are served by the block_peer + HistoricalBlockError::InvalidBlobsSignature(_) => { + Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)) + } + HistoricalBlockError::InvalidDataColumnsSignature(indices) => Some( + PeerGroupAction::column_peers(indices, PeerAction::LowToleranceError), + ), HistoricalBlockError::ValidatorPubkeyCacheTimeout | HistoricalBlockError::IndexOutOfBounds | HistoricalBlockError::StoreError(_) From bb6175cd5d9c963f3fd8ee33caa736991df72aae Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sat, 12 Apr 2025 11:57:43 -0300 Subject: [PATCH 27/64] Explicit match statement to BlockError in range sync --- .../network_beacon_processor/sync_methods.rs | 187 +++++++----------- 1 file changed, 68 insertions(+), 119 deletions(-) diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 47379f38264..63cf35427ee 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -685,10 +685,10 @@ impl 
NetworkBeaconProcessor { if peer_action.is_some() { // All errors that result in a peer penalty are "expected" external faults the // node runner can't do anything about - debug!(?e, "Backfill batch processing error"); + debug!(?e, "Backfill sync processing error"); } else { // All others are some type of internal error worth surfacing? - warn!(?e, "Unexpected backfill batch processing error"); + warn!(?e, "Unexpected backfill sync processing error"); } Err(ChainSegmentFailed { @@ -703,26 +703,17 @@ impl NetworkBeaconProcessor { /// Helper function to handle a `BlockError` from `process_chain_segment` fn handle_failed_chain_segment(&self, error: BlockError) -> Result<(), ChainSegmentFailed> { - match error { - BlockError::ParentUnknown { parent_root, .. } => { + let peer_action = match &error { + BlockError::ParentUnknown { .. } => { // blocks should be sequential and all parents should exist - Err(ChainSegmentFailed { - message: format!("Block has an unknown parent: {}", parent_root), - // Peers are faulty if they send non-sequential blocks. - peer_action: Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)), - }) - } - BlockError::DuplicateFullyImported(_) - | BlockError::DuplicateImportStatusUnknown(..) => { - // This can happen for many reasons. Head sync's can download multiples and parent - // lookups can download blocks before range sync - Ok(()) + // Peers are faulty if they send non-sequential blocks. + Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)) } BlockError::FutureSlot { present_slot, block_slot, } => { - if present_slot + FUTURE_SLOT_TOLERANCE >= block_slot { + if *present_slot + FUTURE_SLOT_TOLERANCE >= *block_slot { // The block is too far in the future, drop it. warn!( msg = "block for future slot rejected, check your time", @@ -731,130 +722,88 @@ impl NetworkBeaconProcessor { FUTURE_SLOT_TOLERANCE, "Block is ahead of our slot clock" ); - } else { - // The block is in the future, but not too far. 
- debug!( - %present_slot, - %block_slot, - FUTURE_SLOT_TOLERANCE, - "Block is slightly ahead of our slot clock. Ignoring." - ); } - - Err(ChainSegmentFailed { - message: format!( - "Block with slot {} is higher than the current slot {}", - block_slot, present_slot - ), - // Peers are faulty if they send blocks from the future. - peer_action: Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)), - }) - } - BlockError::WouldRevertFinalizedSlot { .. } => { - debug!("Finalized or earlier block processed"); - Ok(()) + // Peers are faulty if they send blocks from the future. + Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)) } - BlockError::NotFinalizedDescendant { block_parent_root } => { - debug!( - "Not syncing to a chain that conflicts with the canonical or manual finalized checkpoint" - ); - Err(ChainSegmentFailed { - message: format!( - "Block with parent_root {} conflicts with our checkpoint state", - block_parent_root - ), - peer_action: Some(PeerGroupAction::block_peer(PeerAction::Fatal)), - }) + // Block is invalid + BlockError::StateRootMismatch { .. } + | BlockError::BlockSlotLimitReached + | BlockError::IncorrectBlockProposer { .. } + | BlockError::UnknownValidator { .. } + | BlockError::BlockIsNotLaterThanParent { .. 
} + | BlockError::NonLinearParentRoots + | BlockError::NonLinearSlots + | BlockError::PerBlockProcessingError(_) + | BlockError::InconsistentFork(_) + | BlockError::InvalidSignature(_) => { + Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)) } - BlockError::GenesisBlock => { - debug!("Genesis block was processed"); - Ok(()) + // Currently blobs are served by the block peer + BlockError::InvalidBlobsSignature(_) => { + Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)) } - BlockError::BeaconChainError(e) => { - warn!( - msg = "unexpected condition in processing block.", - outcome = ?e, - "BlockProcessingFailure" - ); - - Err(ChainSegmentFailed { - message: format!("Internal error whilst processing block: {:?}", e), - // Do not penalize peers for internal errors. - peer_action: None, - }) + BlockError::InvalidDataColumnsSignature(indices) => Some( + PeerGroupAction::column_peers(indices, PeerAction::LowToleranceError), + ), + BlockError::GenesisBlock + | BlockError::WouldRevertFinalizedSlot { .. } + | BlockError::DuplicateFullyImported(_) + | BlockError::DuplicateImportStatusUnknown(..) => { + // This can happen for many reasons. Head sync's can download multiples and parent + // lookups can download blocks before range sync + return Ok(()); } - BlockError::AvailabilityCheck(e) => { - let peer_group_action = PeerGroupAction::from_availability_check_error(&e); - Err(ChainSegmentFailed { - message: format!("Availability check error {:?}", e), - peer_action: peer_group_action, - }) + // Not syncing to a chain that conflicts with the canonical or manual finalized checkpoint + BlockError::NotFinalizedDescendant { .. 
} | BlockError::WeakSubjectivityConflict => { + Some(PeerGroupAction::block_peer(PeerAction::Fatal)) } - ref err @ BlockError::ExecutionPayloadError(ref epe) => { - if !epe.penalize_peer() { + BlockError::AvailabilityCheck(e) => PeerGroupAction::from_availability_check_error(e), + BlockError::ExecutionPayloadError(e) => { + if !e.penalize_peer() { // These errors indicate an issue with the EL and not the `ChainSegment`. // Pause the syncing while the EL recovers - debug!( - outcome = "pausing sync", - ?err, - "Execution layer verification failed" - ); - Err(ChainSegmentFailed { - message: format!("Execution layer offline. Reason: {:?}", err), - // Do not penalize peers for internal errors. - peer_action: None, - }) + None } else { - debug!( - error = ?err, - "Invalid execution payload" - ); - Err(ChainSegmentFailed { - message: format!( - "Peer sent a block containing invalid execution payload. Reason: {:?}", - err - ), - peer_action: Some(PeerGroupAction::block_peer( - PeerAction::LowToleranceError, - )), - }) + Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)) } } - ref err @ BlockError::ParentExecutionPayloadInvalid { ref parent_root } => { + // We need to penalise harshly in case this represents an actual attack. In case + // of a faulty EL it will usually require manual intervention to fix anyway, so + // it's not too bad if we drop most of our peers. + BlockError::ParentExecutionPayloadInvalid { parent_root } => { warn!( ?parent_root, advice = "check execution node for corruption then restart it and Lighthouse", "Failed to sync chain built on invalid parent" ); - Err(ChainSegmentFailed { - message: format!("Peer sent invalid block. Reason: {err:?}"), - // We need to penalise harshly in case this represents an actual attack. In case - // of a faulty EL it will usually require manual intervention to fix anyway, so - // it's not too bad if we drop most of our peers. 
- peer_action: Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)), - }) + Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)) } // Penalise peers for sending us banned blocks. BlockError::KnownInvalidExecutionPayload(block_root) => { - warn!(?block_root, "Received block known to be invalid",); - Err(ChainSegmentFailed { - message: format!("Banned block: {block_root:?}"), - peer_action: Some(PeerGroupAction::block_peer(PeerAction::Fatal)), - }) + warn!(?block_root, "Received block known to be invalid"); + Some(PeerGroupAction::block_peer(PeerAction::Fatal)) } - other => { - debug!( - msg = "peer sent invalid block", - outcome = %other, - "Invalid block received" - ); + // TODO(sync): Should we penalize slashable blocks? + BlockError::Slashable => None, + // Do not penalize peers for internal errors. + // BlobNotRequired is never constructed on this path + // TODO(sync): Double check that all `BeaconChainError` variants are actually internal + // errors in thie code path + BlockError::BeaconChainError(_) + | BlockError::InternalError(_) + | BlockError::BlobNotRequired(_) => None, + }; - Err(ChainSegmentFailed { - message: format!("Peer sent invalid block. Reason: {:?}", other), - // Do not penalize peers for internal errors. 
- peer_action: None, - }) - } + if peer_action.is_some() { + debug!(?error, "Range sync processing error"); + } else { + warn!(?error, "Unexpected range sync processing error"); } + + Err(ChainSegmentFailed { + message: format!("{error:?}"), + peer_action, + }) } } From dbd23a4a5fb9d34d10353bff943946954f81327c Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sat, 12 Apr 2025 12:31:27 -0300 Subject: [PATCH 28/64] Remove todo in BatchPeerGroup --- .../network/src/sync/backfill_sync/mod.rs | 4 ++-- .../src/sync/block_sidecar_coupling.rs | 14 ++++++++----- .../network/src/sync/range_sync/batch.rs | 20 ++++++++++++++----- .../network/src/sync/range_sync/chain.rs | 6 +++--- 4 files changed, 29 insertions(+), 15 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index b59f7095d32..57d43d9191c 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -639,8 +639,8 @@ impl BackFillSync { // client = %network.client_type(peer), } for (column_index, penalty) in &peer_action.column_peer { - if let Some(peer) = batch_peers.column(*column_index) { - network.report_peer(peer, *penalty, "faulty_batch"); + if let Some(peer) = batch_peers.column(column_index) { + network.report_peer(*peer, *penalty, "faulty_batch"); } else { warn!(%batch_id, column_index, "Missing peer in PeerGroup"); } diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index d57bc894995..585657e7ef2 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -131,7 +131,7 @@ impl RangeBlockComponentsRequest { match &self.block_data_request { RangeBlockDataRequest::NoData => Some( Self::responses_with_blobs(blocks.to_vec(), vec![], spec) - .map(|blocks| (blocks, 
BatchPeerGroup::new(block_peer))), + .map(|blocks| (blocks, BatchPeerGroup::new_from_block_peer(block_peer))), ), RangeBlockDataRequest::Blobs(request) => { let Some((blobs, _blob_peer)) = request.to_finished() else { @@ -139,7 +139,7 @@ impl RangeBlockComponentsRequest { }; Some( Self::responses_with_blobs(blocks.to_vec(), blobs.to_vec(), spec) - .map(|blocks| (blocks, BatchPeerGroup::new(block_peer))), + .map(|blocks| (blocks, BatchPeerGroup::new_from_block_peer(block_peer))), ) } RangeBlockDataRequest::DataColumns { @@ -147,11 +147,15 @@ impl RangeBlockComponentsRequest { expected_column_to_peer, } => { let mut data_columns = vec![]; + let mut column_peers = HashMap::new(); for req in requests.values() { - let Some((data, _column_peer)) = req.to_finished() else { + let Some((resp_columns, column_peer)) = req.to_finished() else { return None; }; - data_columns.extend(data.clone()) + data_columns.extend(resp_columns.clone()); + for column in resp_columns { + column_peers.insert(column.index, *column_peer); + } } Some( @@ -161,7 +165,7 @@ impl RangeBlockComponentsRequest { expected_column_to_peer.clone(), spec, ) - .map(|blocks| (blocks, BatchPeerGroup::new(block_peer))), + .map(|blocks| (blocks, BatchPeerGroup::new(block_peer, column_peers))), ) } } diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index c2c4ddf9238..2bb5c9b8481 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -2,7 +2,7 @@ use beacon_chain::block_verification_types::RpcBlock; use lighthouse_network::rpc::methods::BlocksByRangeRequest; use lighthouse_network::service::api_types::Id; use lighthouse_network::PeerId; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::fmt; use std::hash::{Hash, Hasher}; use std::ops::Sub; @@ -29,19 +29,29 @@ pub enum ByRangeRequestType { #[derive(Clone, Debug)] pub struct BatchPeerGroup { block_peer: 
PeerId, + column_peers: HashMap, } impl BatchPeerGroup { - pub fn new(block_peer: PeerId) -> Self { - Self { block_peer } + pub fn new_from_block_peer(block_peer: PeerId) -> Self { + Self { + block_peer, + column_peers: <_>::default(), + } + } + pub fn new(block_peer: PeerId, column_peers: HashMap) -> Self { + Self { + block_peer, + column_peers, + } } pub fn block(&self) -> PeerId { self.block_peer } - pub fn column(&self, _index: ColumnIndex) -> Option { - todo!(); + pub fn column(&self, index: &ColumnIndex) -> Option<&PeerId> { + self.column_peers.get(index) } } diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 68d27758c9a..822c7bcc30c 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -458,7 +458,7 @@ impl SyncingChain { } }; - let batch_peers = batch.processing_peer().cloned().ok_or_else(|| { + let batch_peers = batch.processing_peer().ok_or_else(|| { RemoveChain::WrongBatchState(format!( "Processing target is in wrong state: {:?}", batch.state(), @@ -541,8 +541,8 @@ impl SyncingChain { network.report_peer(batch_peers.block(), penalty, "faulty_batch"); } for (column_index, penalty) in &peer_action.column_peer { - if let Some(peer) = batch_peers.column(*column_index) { - network.report_peer(peer, *penalty, "faulty_batch"); + if let Some(peer) = batch_peers.column(column_index) { + network.report_peer(*peer, *penalty, "faulty_batch"); } else { warn!(%batch_id, column_index, "Missing peer in PeerGroup"); } From ea6cdb7adcb963a5f7fa5a48e6ba4869005da656 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sat, 12 Apr 2025 12:53:39 -0300 Subject: [PATCH 29/64] Remove participating peers from backfill sync --- .../network_beacon_processor/sync_methods.rs | 5 +- .../network/src/sync/backfill_sync/mod.rs | 60 ++++--------------- beacon_node/network/src/sync/manager.rs | 1 - 
.../network/src/sync/range_sync/chain.rs | 4 +- 4 files changed, 16 insertions(+), 54 deletions(-) diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 63cf35427ee..884d293ef6d 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -659,11 +659,9 @@ impl NetworkBeaconProcessor { HistoricalBlockError::AvailabilityCheckError(e) => { PeerGroupAction::from_availability_check_error(e) } + // The peer is faulty if they send blocks with bad roots or invalid signatures HistoricalBlockError::MismatchedBlockRoot { .. } | HistoricalBlockError::InvalidSignature(_) => { - // The peer is faulty if they send blocks with bad roots or invalid - // signatures - // TODO(das): check blobs and columns signatures separately Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)) } // Blobs are served by the block_peer @@ -793,6 +791,7 @@ impl NetworkBeaconProcessor { BlockError::BeaconChainError(_) | BlockError::InternalError(_) | BlockError::BlobNotRequired(_) => None, + // Do not use a fallback match, handle all errors explicitly }; if peer_action.is_some() { diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 57d43d9191c..a4411cc4fac 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -130,12 +130,6 @@ pub struct BackFillSync { /// Batches validated by this chain. validated_batches: u64, - /// We keep track of peers that are participating in the backfill sync. Unlike RangeSync, - /// BackFillSync uses all synced peers to download the chain from. 
If BackFillSync fails, we don't - /// want to penalize all our synced peers, so we use this variable to keep track of peers that - /// have participated and only penalize these peers if backfill sync fails. - participating_peers: HashSet, - /// When a backfill sync fails, we keep track of whether a new fully synced peer has joined. /// This signifies that we are able to attempt to restart a failed chain. restart_failed_sync: bool, @@ -183,7 +177,6 @@ impl BackFillSync { network_globals, current_processing_batch: None, validated_batches: 0, - participating_peers: HashSet::new(), restart_failed_sync: false, beacon_chain, }; @@ -304,25 +297,6 @@ impl BackFillSync { } } - /// A peer has disconnected. - /// If the peer has active batches, those are considered failed and re-requested. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - #[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"] - pub fn peer_disconnected(&mut self, peer_id: &PeerId) -> Result<(), BackFillError> { - if matches!(self.state(), BackFillState::Failed) { - return Ok(()); - } - - // Remove the peer from the participation list - self.participating_peers.remove(peer_id); - Ok(()) - } - /// An RPC error has occurred. /// /// If the batch exists it is re-requested. @@ -442,7 +416,6 @@ impl BackFillSync { self.set_state(BackFillState::Failed); // Remove all batches and active requests and participating peers. self.batches.clear(); - self.participating_peers.clear(); self.restart_failed_sync = false; // Reset all downloading and processing targets @@ -631,12 +604,10 @@ impl BackFillSync { peer_action, error, } => { - // TODO: De-dup between back and forwards sync + // TODO(sync): De-dup between back and forwards sync if let Some(penalty) = peer_action.block_peer { // Penalize the peer appropiately. 
network.report_peer(batch_peers.block(), penalty, "faulty_batch"); - // TODO(das): downscore the right peer and display the client_type - // client = %network.client_type(peer), } for (column_index, penalty) in &peer_action.column_peer { if let Some(peer) = batch_peers.column(column_index) { @@ -652,29 +623,22 @@ impl BackFillSync { self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)) .map(|_| ProcessResult::Successful) } - Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { - // TODO(das): what peer action should we apply to the rest of - // peers? Say a batch repeatedly fails because a custody peer is not - // sending us its custody columns - let penalty = PeerAction::LowToleranceError; - - // check that we have not exceeded the re-process retry counter - // If a batch has exceeded the invalid batch lookup attempts limit, it means - // that it is likely all peers are sending invalid batches - // repeatedly and are either malicious or faulty. We stop the backfill sync and - // report all synced peers that have participated. + Ok(BatchOperationOutcome::Failed { .. }) => { + // When backfill syncing post-PeerDAS we can't attribute fault to previous + // peers if a batch fails to process too many times. We have strict peer + // scoring for faulty errors, so participating peers that sent invalid + // data are already downscored. + // + // Because backfill sync deals with historical data that we can assert + // to be correct, once we import a batch that contains at least one + // block we are sure we got the right data. There's no need to penalize + // all participating peers in backfill sync if a batch fails warn!( - score_adjustment = %penalty, batch_epoch = %batch_id, error, - "Backfill batch failed to download. Penalizing peers" + "Backfill sync failed after attempting to process batch too many times" ); - for peer in self.participating_peers.drain() { - // TODO(das): `participating_peers` only includes block peers. 
Should we - // penalize the custody column peers too? - network.report_peer(peer, penalty, "backfill_batch_failed"); - } self.fail_sync(BackFillError::BatchProcessingFailed(batch_id)) .map(|_| ProcessResult::Successful) } diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 70181cb5d1b..a490053e316 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -518,7 +518,6 @@ impl SyncManager { // Remove peer from all data structures self.range_sync.peer_disconnect(&mut self.network, peer_id); - let _ = self.backfill_sync.peer_disconnected(peer_id); self.block_lookups.peer_disconnected(peer_id); // Regardless of the outcome, we update the sync status. diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 822c7bcc30c..575f856123d 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -532,10 +532,10 @@ impl SyncingChain { BatchProcessResult::FaultyFailure { imported_blocks, peer_action, - // TODO: propagate error in logs + // TODO(sync): propagate error in logs error: _, } => { - // TODO: De-dup between back and forwards sync + // TODO(sync): De-dup between back and forwards sync if let Some(penalty) = peer_action.block_peer { // Penalize the peer appropiately. 
network.report_peer(batch_peers.block(), penalty, "faulty_batch"); From 9db66f09286a5f6deb69dc2d349b7526df03fdd6 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Sat, 12 Apr 2025 19:13:18 -0300 Subject: [PATCH 30/64] Remove MissingAllCustodyColumns error --- .../beacon_chain/src/block_verification_types.rs | 12 +++++++----- .../src/data_availability_checker.rs | 8 ++++++-- .../src/data_availability_checker/error.rs | 2 -- .../src/network_beacon_processor/sync_methods.rs | 1 - .../network/src/sync/backfill_sync/mod.rs | 10 ++-------- .../network/src/sync/block_sidecar_coupling.rs | 3 +++ beacon_node/network/src/sync/network_context.rs | 6 ++++-- beacon_node/network/src/sync/range_sync/batch.rs | 16 +++++----------- beacon_node/network/src/sync/range_sync/chain.rs | 9 +++------ 9 files changed, 30 insertions(+), 37 deletions(-) diff --git a/beacon_node/beacon_chain/src/block_verification_types.rs b/beacon_node/beacon_chain/src/block_verification_types.rs index b870f1f8027..dbc2494e8b7 100644 --- a/beacon_node/beacon_chain/src/block_verification_types.rs +++ b/beacon_node/beacon_chain/src/block_verification_types.rs @@ -118,13 +118,15 @@ impl RpcBlock { #[derive(Debug, Clone, Derivative)] #[derivative(Hash(bound = "E: EthSpec"))] enum RpcBlockInner { - /// Single block lookup response. This should potentially hit the data availability cache. + /// **Range sync**: Variant for all pre-Deneb blocks + /// **Lookup sync**: Variant used for all blocks of all forks, regardless if the have data or + /// not. Note: this is confusing and should be fixed in a later refactor. Block(Arc>), - /// This variant is used with parent lookups and by-range responses. It should have all blobs - /// ordered, all block roots matching, and the correct number of blobs for this block. + /// Variant for all post-Deneb blocks regardless if they have data or not. 
Only used for chain + /// segments in range sync BlockAndBlobs(Arc>, BlobSidecarList), - /// This variant is used with parent lookups and by-range responses. It should have all - /// requested data columns, all block roots matching for this block. + /// Variant for all post-Fulu blocks regardless if they have data or not. Only used for chain + /// segments in range sync BlockAndCustodyColumns( Arc>, CustodyDataColumnList, diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs index a10d050e9d5..e23973b356d 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker.rs @@ -474,8 +474,12 @@ impl DataAvailabilityChecker { spec: self.spec.clone(), }) } else { - // Note: strictly asserts blocks to be available instead of returning MaybeAvailableBlock - return Err(AvailabilityCheckError::MissingAllCustodyColumns); + // This is unreachable. If a block returns true for + // `data_columns_required_for_block` it must be a Fulu block. 
All Fulu RpcBlocks + // are constructed with the `DataColumns` variant, so `data_columns` must be Some + return Err(AvailabilityCheckError::Unexpected( + "Data columns should be Some for a Fulu block".to_string(), + )); } } else { MaybeAvailableBlock::Available(AvailableBlock { diff --git a/beacon_node/beacon_chain/src/data_availability_checker/error.rs b/beacon_node/beacon_chain/src/data_availability_checker/error.rs index d4a3b9afb27..c7c68e61b1b 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/error.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/error.rs @@ -14,7 +14,6 @@ pub enum Error { SszTypes(ssz_types::Error), MissingBlobs, MissingCustodyColumns(Vec), - MissingAllCustodyColumns, BlobIndexInvalid(u64), DataColumnIndexInvalid(u64), StoreError(store::Error), @@ -39,7 +38,6 @@ impl Error { Error::SszTypes(_) | Error::MissingBlobs | Error::MissingCustodyColumns(_) - | Error::MissingAllCustodyColumns | Error::StoreError(_) | Error::DecodeError(_) | Error::Unexpected(_) diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 884d293ef6d..fc087283743 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -85,7 +85,6 @@ impl PeerGroupAction { AvailabilityCheckError::MissingCustodyColumns(columns) => Some( PeerGroupAction::column_peers(columns, PeerAction::LowToleranceError), ), - AvailabilityCheckError::MissingAllCustodyColumns => todo!(), AvailabilityCheckError::BlobIndexInvalid(_) => { Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)) } diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index a4411cc4fac..25bc0190c2d 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -913,10 +913,9 @@ impl 
BackFillSync { .cloned() .collect::>(); - let (request, is_blob_batch) = batch.to_blocks_by_range_request(); + let request = batch.to_blocks_by_range_request(); let failed_peers = batch.failed_peers(); match network.block_components_by_range_request( - is_blob_batch, request, RangeRequestId::BackfillSync { batch_id }, &synced_peers, @@ -1076,12 +1075,7 @@ impl BackFillSync { self.include_next_batch(network) } Entry::Vacant(entry) => { - let batch_type = network.batch_type(batch_id); - entry.insert(BatchInfo::new( - &batch_id, - BACKFILL_EPOCHS_PER_BATCH, - batch_type, - )); + entry.insert(BatchInfo::new(&batch_id, BACKFILL_EPOCHS_PER_BATCH)); if self.would_complete(batch_id) { self.last_batch_downloaded = true; } diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index 585657e7ef2..90397649cc5 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -31,8 +31,11 @@ enum ByRangeRequest { } enum RangeBlockDataRequest { + /// All pre-deneb blocks NoData, + /// All post-Deneb blocks, regardless of if they have data or not Blobs(ByRangeRequest>>>), + /// All post-Fulu blocks, regardless of if they have data or not DataColumns { requests: HashMap< DataColumnsByRangeRequestId, diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 933b08418fc..8ea9caa9eca 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -408,12 +408,14 @@ impl SyncNetworkContext { /// A blocks by range request sent by the range sync algorithm pub fn block_components_by_range_request( &mut self, - batch_type: ByRangeRequestType, request: BlocksByRangeRequest, requester: RangeRequestId, peers: &HashSet, peers_to_deprioritize: &HashSet, ) -> Result { + let batch_epoch = 
Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()); + let batch_type = self.batch_type(batch_epoch); + let active_request_count_by_peer = self.active_request_count_by_peer(); let Some(block_peer) = peers @@ -1172,7 +1174,7 @@ impl SyncNetworkContext { /// Check whether a batch for this epoch (and only this epoch) should request just blocks or /// blocks and blobs. - pub fn batch_type(&self, epoch: types::Epoch) -> ByRangeRequestType { + fn batch_type(&self, epoch: types::Epoch) -> ByRangeRequestType { // Induces a compile time panic if this doesn't hold true. #[allow(clippy::assertions_on_constants)] const _: () = assert!( diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 2bb5c9b8481..3fed7153fce 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -139,8 +139,6 @@ pub struct BatchInfo { failed_download_attempts: Vec, /// State of the batch. state: BatchState, - /// Whether this batch contains all blocks or all blocks and blobs. - batch_type: ByRangeRequestType, /// Pin the generic marker: std::marker::PhantomData, } @@ -200,7 +198,7 @@ impl BatchInfo { /// fork boundary will be of mixed type (all blocks and one last blockblob), and I don't want to /// deal with this for now. 
/// This means finalization might be slower in deneb - pub fn new(start_epoch: &Epoch, num_of_epochs: u64, batch_type: ByRangeRequestType) -> Self { + pub fn new(start_epoch: &Epoch, num_of_epochs: u64) -> Self { let start_slot = start_epoch.start_slot(E::slots_per_epoch()); let end_slot = start_slot + num_of_epochs * E::slots_per_epoch(); BatchInfo { @@ -210,7 +208,6 @@ impl BatchInfo { failed_download_attempts: Vec::new(), non_faulty_processing_attempts: 0, state: BatchState::AwaitingDownload, - batch_type, marker: std::marker::PhantomData, } } @@ -266,13 +263,10 @@ impl BatchInfo { } /// Returns a BlocksByRange request associated with the batch. - pub fn to_blocks_by_range_request(&self) -> (BlocksByRangeRequest, ByRangeRequestType) { - ( - BlocksByRangeRequest::new( - self.start_slot.into(), - self.end_slot.sub(self.start_slot).into(), - ), - self.batch_type, + pub fn to_blocks_by_range_request(&self) -> BlocksByRangeRequest { + BlocksByRangeRequest::new( + self.start_slot.into(), + self.end_slot.sub(self.start_slot).into(), ) } diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 575f856123d..5eead51b763 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -918,10 +918,9 @@ impl SyncingChain { ) -> ProcessingResult { let batch_state = self.visualize_batch_state(); if let Some(batch) = self.batches.get_mut(&batch_id) { - let (request, batch_type) = batch.to_blocks_by_range_request(); + let request = batch.to_blocks_by_range_request(); let failed_peers = batch.failed_peers(); match network.block_components_by_range_request( - batch_type, request, RangeRequestId::RangeSync { chain_id: self.id, @@ -1017,8 +1016,7 @@ impl SyncingChain { } if let Entry::Vacant(entry) = self.batches.entry(epoch) { - let batch_type = network.batch_type(epoch); - let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH, batch_type); + let 
optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH); entry.insert(optimistic_batch); self.send_batch(network, epoch)?; } @@ -1119,8 +1117,7 @@ impl SyncingChain { self.include_next_batch(network) } Entry::Vacant(entry) => { - let batch_type = network.batch_type(next_batch_id); - entry.insert(BatchInfo::new(&next_batch_id, EPOCHS_PER_BATCH, batch_type)); + entry.insert(BatchInfo::new(&next_batch_id, EPOCHS_PER_BATCH)); self.to_be_downloaded += EPOCHS_PER_BATCH; Some(next_batch_id) } From 4b2bbe32423d989b5c0e38c99d2a575a5f589694 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 13 May 2025 20:15:21 -0500 Subject: [PATCH 31/64] Merge fixes --- beacon_node/beacon_chain/src/data_availability_checker.rs | 4 ++-- beacon_node/network/src/sync/range_sync/batch.rs | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs index 3c61e5348a2..42938097d38 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker.rs @@ -18,8 +18,8 @@ use task_executor::TaskExecutor; use tracing::{debug, error, info_span, Instrument}; use types::blob_sidecar::{BlobIdentifier, BlobSidecar, FixedBlobSidecarList}; use types::{ - BlobSidecarList, ChainSpec, ColumnIndex, DataColumnIdentifier, DataColumnSidecar, - DataColumnSidecarList, Epoch, EthSpec, Hash256, RuntimeVariableList, SignedBeaconBlock, + BlobSidecarList, ChainSpec, ColumnIndex, DataColumnSidecarList, Epoch, EthSpec, Hash256, + RuntimeVariableList, SignedBeaconBlock, }; mod error; diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 785621a311a..20c8407377b 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -223,7 +223,7 @@ impl BatchInfo { 
peers.insert(attempt.peer_id.block()); } - for peer in self.failed_download_attempts.iter() { + for peer in self.failed_download_attempts.iter().flatten() { peers.insert(*peer); } @@ -330,9 +330,7 @@ impl BatchInfo { match self.state.poison() { BatchState::Downloading(_request_id) => { // register the attempt and check if the batch can be tried again - if let Some(peer) = peer { - self.failed_download_attempts.push(peer); - } + self.failed_download_attempts.push(peer); self.state = if self.failed_download_attempts.len() >= B::max_batch_download_attempts() as usize { From e0b36501c82af44bf20262092885d9d8c3769b98 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 13 May 2025 20:29:46 -0500 Subject: [PATCH 32/64] Clean up PR --- beacon_node/beacon_chain/src/block_verification.rs | 6 ++++-- .../beacon_chain/src/block_verification_types.rs | 10 +++++----- beacon_node/beacon_chain/src/historical_blocks.rs | 12 ++++++++---- .../src/network_beacon_processor/sync_methods.rs | 1 + 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/beacon_node/beacon_chain/src/block_verification.rs b/beacon_node/beacon_chain/src/block_verification.rs index 2ae18c9bdd8..b3783342b0a 100644 --- a/beacon_node/beacon_chain/src/block_verification.rs +++ b/beacon_node/beacon_chain/src/block_verification.rs @@ -650,6 +650,10 @@ pub fn signature_verify_chain_segment( } // Verify that blobs or data columns signatures match + // + // TODO(das): Should check correct proposer cheap for added protection if blocks and columns + // don't match. 
This code attributes fault to the blobs / data columns if they don't match the + // block for (_, block) in &chain_segment { if let Some(indices) = block.non_matching_blobs_signed_headers() { if !indices.is_empty() { @@ -663,8 +667,6 @@ pub fn signature_verify_chain_segment( } } - // Should check correct proposer cheap for added protection if blocks and columns don't match - // unzip chain segment and verify kzg in bulk let (roots, blocks): (Vec<_>, Vec<_>) = chain_segment.into_iter().unzip(); let maybe_available_blocks = chain diff --git a/beacon_node/beacon_chain/src/block_verification_types.rs b/beacon_node/beacon_chain/src/block_verification_types.rs index dbc2494e8b7..39899f3cc5a 100644 --- a/beacon_node/beacon_chain/src/block_verification_types.rs +++ b/beacon_node/beacon_chain/src/block_verification_types.rs @@ -120,13 +120,13 @@ impl RpcBlock { enum RpcBlockInner { /// **Range sync**: Variant for all pre-Deneb blocks /// **Lookup sync**: Variant used for all blocks of all forks, regardless if the have data or - /// not. Note: this is confusing and should be fixed in a later refactor. + /// not Block(Arc>), - /// Variant for all post-Deneb blocks regardless if they have data or not. Only used for chain - /// segments in range sync + /// **Range sync**: Variant for all post-Deneb blocks regardless if they have data or not + /// **Lookup sync**: Not used BlockAndBlobs(Arc>, BlobSidecarList), - /// Variant for all post-Fulu blocks regardless if they have data or not. 
Only used for chain - /// segments in range sync + /// **Range sync**: Variant for all post-Fulu blocks regardless if they have data or not + /// **Lookup sync**: Not used BlockAndCustodyColumns( Arc>, CustodyDataColumnList, diff --git a/beacon_node/beacon_chain/src/historical_blocks.rs b/beacon_node/beacon_chain/src/historical_blocks.rs index 9ea4c76ca60..0ff0da7fa52 100644 --- a/beacon_node/beacon_chain/src/historical_blocks.rs +++ b/beacon_node/beacon_chain/src/historical_blocks.rs @@ -55,7 +55,10 @@ impl From for HistoricalBlockError { fn from(err: SignatureSetError) -> Self { match err { // The encoding of the signature is invalid, peer fault - e @ SignatureSetError::SignatureInvalid(_) => Self::InvalidSignature(format!("{e:?}")), + e + @ (SignatureSetError::SignatureInvalid(_) | SignatureSetError::BadBlsBytes { .. }) => { + Self::InvalidSignature(format!("{e:?}")) + } // All these variants are internal errors or unreachable for historical block paths, // which only check the proposer signature. // BadBlsBytes = Unreachable @@ -65,7 +68,6 @@ impl From for HistoricalBlockError { | SignatureSetError::IncorrectBlockProposer { .. } | SignatureSetError::MismatchedPublicKeyLen { .. } | SignatureSetError::PublicKeyDecompressionFailed - | SignatureSetError::BadBlsBytes { .. } | SignatureSetError::InconsistentBlockFork(_)) => Self::Unexpected(format!("{e:?}")), } } @@ -125,8 +127,10 @@ impl BeaconChain { self.assert_correct_historical_block_chain(&blocks)?; // Verify that blobs or data columns signatures match - // Why are we computing the DB ops before verifying the signatures? ¯\_(ツ)_/¯ We have to - // wait to maybe return the invalid block signature error. + // + // TODO(das): We don't raise the `matching_sidecar_signatures_error` yet. We have to wait to + // return an invalid block signature error first. We may want to refactor this order in a + // later code change. 
let matching_sidecar_signatures_error = blocks .iter() .map(|block| { diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 0bb113c4f5f..ae74edf5325 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -47,6 +47,7 @@ pub struct ChainSegmentFailed { pub peer_action: Option, } +/// Tracks which block(s) component caused the block to be invalid. Used to attribute fault in sync. #[derive(Debug)] pub struct PeerGroupAction { pub block_peer: Option, From 6d563238504328df1de2fbed94035f741687c52b Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 14 May 2025 15:45:51 -0500 Subject: [PATCH 33/64] Consistent naming of batch_peers --- .../network/src/sync/backfill_sync/mod.rs | 12 ++--- .../src/sync/block_sidecar_coupling.rs | 10 ++-- .../network/src/sync/network_context.rs | 4 +- .../network/src/sync/range_sync/batch.rs | 46 ++++++++++--------- .../network/src/sync/range_sync/chain.rs | 12 ++--- .../network/src/sync/range_sync/mod.rs | 4 +- .../network/src/sync/range_sync/range.rs | 6 +-- 7 files changed, 48 insertions(+), 46 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index d814171253c..cbcf2a14f0e 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -30,7 +30,7 @@ use std::sync::Arc; use tracing::{debug, error, info, instrument, warn}; use types::{Epoch, EthSpec}; -use super::range_sync::BatchPeerGroup; +use super::range_sync::BatchPeers; /// Blocks are downloaded in batches from peers. This constant specifies how many epochs worth of /// blocks per batch are requested _at most_. 
A batch may request less blocks to account for @@ -354,7 +354,7 @@ impl BackFillSync { &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, - peer: BatchPeerGroup, + batch_peers: BatchPeers, request_id: Id, blocks: Vec>, ) -> Result { @@ -375,7 +375,7 @@ impl BackFillSync { return Ok(ProcessResult::Successful); } - match batch.download_completed(blocks, peer) { + match batch.download_completed(blocks, batch_peers) { Ok(received) => { let awaiting_batches = self.processing_target.saturating_sub(batch_id) / BACKFILL_EPOCHS_PER_BATCH; @@ -548,7 +548,7 @@ impl BackFillSync { } }; - let Some(batch_peers) = batch.processing_peer() else { + let Some(batch_peers) = batch.processing_peers() else { self.fail_sync(BackFillError::BatchInvalidState( batch_id, String::from("Peer does not exist"), @@ -768,8 +768,8 @@ impl BackFillSync { if attempt.hash != processed_attempt.hash { // The re-downloaded version was different. // TODO(das): should penalize other peers? - let valid_attempt_peer = processed_attempt.peer_id.block(); - let bad_attempt_peer = attempt.peer_id.block(); + let valid_attempt_peer = processed_attempt.block_peer(); + let bad_attempt_peer = attempt.block_peer(); if valid_attempt_peer != bad_attempt_peer { // A different peer sent the correct batch, the previous peer did not // We negatively score the original peer. diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index 90397649cc5..f9e962f9a1c 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -16,7 +16,7 @@ use types::{ Hash256, RuntimeVariableList, SignedBeaconBlock, Slot, }; -use super::range_sync::BatchPeerGroup; +use super::range_sync::BatchPeers; pub struct RangeBlockComponentsRequest { /// Blocks we have received awaiting for their corresponding sidecar. 
@@ -126,7 +126,7 @@ impl RangeBlockComponentsRequest { pub fn responses( &self, spec: &ChainSpec, - ) -> Option>, BatchPeerGroup), String>> { + ) -> Option>, BatchPeers), String>> { let Some((blocks, &block_peer)) = self.blocks_request.to_finished() else { return None; }; @@ -134,7 +134,7 @@ impl RangeBlockComponentsRequest { match &self.block_data_request { RangeBlockDataRequest::NoData => Some( Self::responses_with_blobs(blocks.to_vec(), vec![], spec) - .map(|blocks| (blocks, BatchPeerGroup::new_from_block_peer(block_peer))), + .map(|blocks| (blocks, BatchPeers::new_from_block_peer(block_peer))), ), RangeBlockDataRequest::Blobs(request) => { let Some((blobs, _blob_peer)) = request.to_finished() else { @@ -142,7 +142,7 @@ impl RangeBlockComponentsRequest { }; Some( Self::responses_with_blobs(blocks.to_vec(), blobs.to_vec(), spec) - .map(|blocks| (blocks, BatchPeerGroup::new_from_block_peer(block_peer))), + .map(|blocks| (blocks, BatchPeers::new_from_block_peer(block_peer))), ) } RangeBlockDataRequest::DataColumns { @@ -168,7 +168,7 @@ impl RangeBlockComponentsRequest { expected_column_to_peer.clone(), spec, ) - .map(|blocks| (blocks, BatchPeerGroup::new(block_peer, column_peers))), + .map(|blocks| (blocks, BatchPeers::new(block_peer, column_peers))), ) } } diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index eee25d90d79..84e7b5746ce 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -5,7 +5,7 @@ use self::custody::{ActiveCustodyRequest, Error as CustodyRequestError}; pub use self::requests::{BlocksByRootSingleRequest, DataColumnsByRootSingleBlockRequest}; use super::block_sidecar_coupling::RangeBlockComponentsRequest; use super::manager::BlockProcessType; -use super::range_sync::{BatchPeerGroup, ByRangeRequestType}; +use super::range_sync::{BatchPeers, ByRangeRequestType}; use super::SyncMessage; use crate::metrics; use 
crate::network_beacon_processor::NetworkBeaconProcessor; @@ -606,7 +606,7 @@ impl SyncNetworkContext { id: ComponentsByRangeRequestId, peer_id: PeerId, range_block_component: RangeBlockComponent, - ) -> Option>, BatchPeerGroup), RpcResponseError>> { + ) -> Option>, BatchPeers), RpcResponseError>> { let Entry::Occupied(mut entry) = self.components_by_range_requests.entry(id) else { metrics::inc_counter_vec(&metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, &["range_blocks"]); return None; diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 20c8407377b..48ec0eedb79 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -27,12 +27,12 @@ pub enum ByRangeRequestType { } #[derive(Clone, Debug)] -pub struct BatchPeerGroup { +pub struct BatchPeers { block_peer: PeerId, column_peers: HashMap, } -impl BatchPeerGroup { +impl BatchPeers { pub fn new_from_block_peer(block_peer: PeerId) -> Self { Self { block_peer, @@ -161,7 +161,7 @@ pub enum BatchState { /// The batch is being downloaded. Downloading(Id), /// The batch has been completely downloaded and is ready for processing. - AwaitingProcessing(BatchPeerGroup, Vec>, Instant), + AwaitingProcessing(BatchPeers, Vec>, Instant), /// The batch is being processed. Processing(Attempt), /// The batch was successfully processed and is waiting to be validated. @@ -220,7 +220,7 @@ impl BatchInfo { ); for attempt in &self.failed_processing_attempts { - peers.insert(attempt.peer_id.block()); + peers.insert(attempt.peers.block()); } for peer in self.failed_download_attempts.iter().flatten() { @@ -238,13 +238,13 @@ impl BatchInfo { false } - /// Returns the peer that is currently responsible for progressing the state of the batch. 
- pub fn processing_peer(&self) -> Option<&BatchPeerGroup> { + /// Returns the peers that provided this batch's downloaded contents + pub fn processing_peers(&self) -> Option<&BatchPeers> { match &self.state { BatchState::AwaitingDownload | BatchState::Failed | BatchState::Downloading(..) => None, - BatchState::AwaitingProcessing(peer_id, _, _) - | BatchState::Processing(Attempt { peer_id, .. }) - | BatchState::AwaitingValidation(Attempt { peer_id, .. }) => Some(peer_id), + BatchState::AwaitingProcessing(peers, _, _) + | BatchState::Processing(Attempt { peers, .. }) + | BatchState::AwaitingValidation(Attempt { peers, .. }) => Some(peers), BatchState::Poisoned => unreachable!("Poisoned batch"), } } @@ -298,12 +298,12 @@ impl BatchInfo { pub fn download_completed( &mut self, blocks: Vec>, - peer: BatchPeerGroup, + batch_peers: BatchPeers, ) -> Result { match self.state.poison() { BatchState::Downloading(_request_id) => { let received = blocks.len(); - self.state = BatchState::AwaitingProcessing(peer, blocks, Instant::now()); + self.state = BatchState::AwaitingProcessing(batch_peers, blocks, Instant::now()); Ok(received) } BatchState::Poisoned => unreachable!("Poisoned batch"), @@ -466,29 +466,31 @@ impl BatchInfo { #[derive(Debug)] pub struct Attempt { /// The peer that made the attempt. - pub peer_id: BatchPeerGroup, + peers: BatchPeers, /// The hash of the blocks of the attempt. 
pub hash: u64, } impl Attempt { - fn new(peer_id: BatchPeerGroup, blocks: &[RpcBlock]) -> Self { + fn new(peers: BatchPeers, blocks: &[RpcBlock]) -> Self { let hash = B::batch_attempt_hash(blocks); - Attempt { peer_id, hash } + Attempt { peers, hash } + } + + pub fn block_peer(&self) -> PeerId { + self.peers.block() } } impl std::fmt::Debug for BatchState { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - BatchState::Processing(Attempt { - ref peer_id, - hash: _, - }) => write!(f, "Processing({})", peer_id.block()), - BatchState::AwaitingValidation(Attempt { - ref peer_id, - hash: _, - }) => write!(f, "AwaitingValidation({})", peer_id.block()), + BatchState::Processing(Attempt { ref peers, hash: _ }) => { + write!(f, "Processing({})", peers.block()) + } + BatchState::AwaitingValidation(Attempt { ref peers, hash: _ }) => { + write!(f, "AwaitingValidation({})", peers.block()) + } BatchState::AwaitingDownload => f.write_str("AwaitingDownload"), BatchState::Failed => f.write_str("Failed"), BatchState::AwaitingProcessing(_, ref blocks, _) => { diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index d6cfd217a9e..28cef5cc497 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -1,4 +1,4 @@ -use super::batch::{BatchInfo, BatchPeerGroup, BatchProcessingResult, BatchState}; +use super::batch::{BatchInfo, BatchPeers, BatchProcessingResult, BatchState}; use super::RangeSyncType; use crate::metrics; use crate::network_beacon_processor::ChainSegmentProcessId; @@ -216,7 +216,7 @@ impl SyncingChain { &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, - peer: BatchPeerGroup, + batch_peers: BatchPeers, request_id: Id, blocks: Vec>, ) -> ProcessingResult { @@ -245,7 +245,7 @@ impl SyncingChain { // Remove the request from the peer's active batches // TODO(das): should use peer group here 
https://github.com/sigp/lighthouse/issues/6258 - let received = batch.download_completed(blocks, peer)?; + let received = batch.download_completed(blocks, batch_peers)?; let awaiting_batches = batch_id .saturating_sub(self.optimistic_start.unwrap_or(self.processing_target)) / EPOCHS_PER_BATCH; @@ -447,7 +447,7 @@ impl SyncingChain { } }; - let batch_peers = batch.processing_peer().ok_or_else(|| { + let batch_peers = batch.processing_peers().ok_or_else(|| { RemoveChain::WrongBatchState(format!( "Processing target is in wrong state: {:?}", batch.state(), @@ -650,8 +650,8 @@ impl SyncingChain { if attempt.hash != processed_attempt.hash { // The re-downloaded version was different // TODO(das): should penalize other peers? - let valid_attempt_peer = processed_attempt.peer_id.block(); - let bad_attempt_peer = attempt.peer_id.block(); + let valid_attempt_peer = processed_attempt.block_peer(); + let bad_attempt_peer = attempt.block_peer(); if valid_attempt_peer != bad_attempt_peer { // A different peer sent the correct batch, the previous peer did not // We negatively score the original peer. 
diff --git a/beacon_node/network/src/sync/range_sync/mod.rs b/beacon_node/network/src/sync/range_sync/mod.rs index f57c1497180..1218e0cd09c 100644 --- a/beacon_node/network/src/sync/range_sync/mod.rs +++ b/beacon_node/network/src/sync/range_sync/mod.rs @@ -8,8 +8,8 @@ mod range; mod sync_type; pub use batch::{ - BatchConfig, BatchInfo, BatchOperationOutcome, BatchPeerGroup, BatchProcessingResult, - BatchState, ByRangeRequestType, + BatchConfig, BatchInfo, BatchOperationOutcome, BatchPeers, BatchProcessingResult, BatchState, + ByRangeRequestType, }; pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; #[cfg(test)] diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index 8249313f2ea..e2c076484a5 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs @@ -42,7 +42,7 @@ use super::chain::{BatchId, ChainId, RemoveChain, SyncingChain}; use super::chain_collection::{ChainCollection, SyncChainStatus}; use super::sync_type::RangeSyncType; -use super::BatchPeerGroup; +use super::BatchPeers; use crate::metrics; use crate::status::ToStatusMessage; use crate::sync::network_context::{RpcResponseError, SyncNetworkContext}; @@ -228,7 +228,7 @@ where pub fn blocks_by_range_response( &mut self, network: &mut SyncNetworkContext, - peer_id: BatchPeerGroup, + batch_peers: BatchPeers, chain_id: ChainId, batch_id: BatchId, request_id: Id, @@ -236,7 +236,7 @@ where ) { // check if this chunk removes the chain match self.chains.call_by_id(chain_id, |chain| { - chain.on_block_response(network, batch_id, peer_id, request_id, blocks) + chain.on_block_response(network, batch_id, batch_peers, request_id, blocks) }) { Ok((removed_chain, sync_type)) => { if let Some((removed_chain, remove_reason)) = removed_chain { From 1b8a8a21123cf7e8566ef582f696200e4b44985d Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 14 May 2025 16:30:13 
-0500 Subject: [PATCH 34/64] Address multiple review comments --- .../beacon_chain/src/block_verification_types.rs | 8 +++++++- .../src/data_availability_checker/error.rs | 10 +--------- .../src/network_beacon_processor/sync_methods.rs | 5 +++-- beacon_node/network/src/sync/backfill_sync/mod.rs | 2 +- .../network/src/sync/block_sidecar_coupling.rs | 1 + beacon_node/network/src/sync/network_context.rs | 5 +++-- beacon_node/network/src/sync/range_sync/batch.rs | 11 +++++++---- beacon_node/network/src/sync/range_sync/chain.rs | 5 ++--- 8 files changed, 25 insertions(+), 22 deletions(-) diff --git a/beacon_node/beacon_chain/src/block_verification_types.rs b/beacon_node/beacon_chain/src/block_verification_types.rs index 39899f3cc5a..8bf13ea328b 100644 --- a/beacon_node/beacon_chain/src/block_verification_types.rs +++ b/beacon_node/beacon_chain/src/block_verification_types.rs @@ -205,7 +205,13 @@ impl RpcBlock { let custody_columns_count = expected_custody_indices.len(); let inner = RpcBlockInner::BlockAndCustodyColumns( block, - RuntimeVariableList::new(custody_columns, spec.number_of_columns as usize)?, + RuntimeVariableList::new(custody_columns, spec.number_of_columns as usize).map_err( + |e| { + AvailabilityCheckError::Unexpected(format!( + "custody_columns len exceeds number_of_columns: {e:?}" + )) + }, + )?, expected_custody_indices, ); Ok(Self { diff --git a/beacon_node/beacon_chain/src/data_availability_checker/error.rs b/beacon_node/beacon_chain/src/data_availability_checker/error.rs index c7c68e61b1b..f215d0a5a14 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/error.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/error.rs @@ -11,7 +11,6 @@ pub enum Error { block_commitment: KzgCommitment, }, Unexpected(String), - SszTypes(ssz_types::Error), MissingBlobs, MissingCustodyColumns(Vec), BlobIndexInvalid(u64), @@ -35,8 +34,7 @@ pub enum ErrorCategory { impl Error { pub fn category(&self) -> ErrorCategory { match self { - 
Error::SszTypes(_) - | Error::MissingBlobs + Error::MissingBlobs | Error::MissingCustodyColumns(_) | Error::StoreError(_) | Error::DecodeError(_) @@ -55,12 +53,6 @@ impl Error { } } -impl From for Error { - fn from(value: ssz_types::Error) -> Self { - Self::SszTypes(value) - } -} - impl From for Error { fn from(value: store::Error) -> Self { Self::StoreError(value) diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index ae74edf5325..7b92e1968fc 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -81,8 +81,9 @@ impl PeerGroupAction { AvailabilityCheckError::ReconstructColumnsError(_) => None, // internal error AvailabilityCheckError::KzgCommitmentMismatch { .. } => None, // should never happen after checking inclusion proof AvailabilityCheckError::Unexpected(_) => None, // internal - AvailabilityCheckError::SszTypes(_) => None, // ?? 
- AvailabilityCheckError::MissingBlobs => None, // TODO(das) internal for now + AvailabilityCheckError::MissingBlobs => { + Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)) + } AvailabilityCheckError::MissingCustodyColumns(columns) => Some( PeerGroupAction::column_peers(columns, PeerAction::LowToleranceError), ), diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index cbcf2a14f0e..0fde586887f 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -914,7 +914,7 @@ impl BackFillSync { .collect::>(); let request = batch.to_blocks_by_range_request(); - let failed_peers = batch.failed_peers(); + let failed_peers = batch.failed_block_peers(); match network.block_components_by_range_request( request, RangeRequestId::BackfillSync { batch_id }, diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index f9e962f9a1c..68f15491256 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -122,6 +122,7 @@ impl RangeBlockComponentsRequest { } } + /// If all internal requests are complete returns a Vec of coupled RpcBlocks #[allow(clippy::type_complexity)] pub fn responses( &self, diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 84e7b5746ce..50b39fe72ef 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -598,8 +598,9 @@ impl SyncNetworkContext { Ok(columns_to_request_by_peer) } - /// Received a blocks by range or blobs by range response for a request that couples blocks ' - /// and blobs. + /// Received a _by_range response for a request that couples blocks and its data + /// + /// `peer_id` is the peer that served this individual RPC _by_range response. 
#[allow(clippy::type_complexity)] pub fn range_block_component_response( &mut self, diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 48ec0eedb79..72598a25405 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -214,7 +214,10 @@ impl BatchInfo { /// Gives a list of peers from which this batch has had a failed download or processing /// attempt. - pub fn failed_peers(&self) -> HashSet { + /// + /// TODO(das): Returns only block peers to keep the mainnet path equivalent. The failed peers + /// mechanism is broken for PeerDAS and will be fixed with https://github.com/sigp/lighthouse/issues/6258 + pub fn failed_block_peers(&self) -> HashSet { let mut peers = HashSet::with_capacity( self.failed_processing_attempts.len() + self.failed_download_attempts.len(), ); @@ -460,12 +463,12 @@ impl BatchInfo { } } -/// Represents a peer's attempt and providing the result for this batch. +/// Represents a batch attempt awaiting validation /// -/// Invalid attempts will downscore a peer. +/// Invalid attempts will downscore its peers #[derive(Debug)] pub struct Attempt { - /// The peer that made the attempt. + /// The peers that served this batch contents peers: BatchPeers, /// The hash of the blocks of the attempt. pub hash: u64, diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 28cef5cc497..350e46cc463 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -244,7 +244,6 @@ impl SyncingChain { // A stream termination has been sent. This batch has ended. Process a completed batch. 
// Remove the request from the peer's active batches - // TODO(das): should use peer group here https://github.com/sigp/lighthouse/issues/6258 let received = batch.download_completed(blocks, batch_peers)?; let awaiting_batches = batch_id .saturating_sub(self.optimistic_start.unwrap_or(self.processing_target)) @@ -531,7 +530,7 @@ impl SyncingChain { } for (column_index, penalty) in &peer_action.column_peer { if let Some(peer) = batch_peers.column(column_index) { - network.report_peer(*peer, *penalty, "faulty_batch"); + network.report_peer(*peer, *penalty, "faulty_batch_columns"); } else { warn!(%batch_id, column_index, "Missing peer in PeerGroup"); } @@ -908,7 +907,7 @@ impl SyncingChain { let batch_state = self.visualize_batch_state(); if let Some(batch) = self.batches.get_mut(&batch_id) { let request = batch.to_blocks_by_range_request(); - let failed_peers = batch.failed_peers(); + let failed_peers = batch.failed_block_peers(); match network.block_components_by_range_request( request, RangeRequestId::RangeSync { From 6ba7f7ce7a9d1fb790e14fc04f1f0d436ce43755 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 14 May 2025 17:00:05 -0500 Subject: [PATCH 35/64] Better errors for das --- .../src/data_availability_checker/error.rs | 12 +----------- .../src/data_availability_checker/state_lru_cache.rs | 6 +++--- .../src/network_beacon_processor/sync_methods.rs | 10 +++++----- 3 files changed, 9 insertions(+), 19 deletions(-) diff --git a/beacon_node/beacon_chain/src/data_availability_checker/error.rs b/beacon_node/beacon_chain/src/data_availability_checker/error.rs index f215d0a5a14..e602ce2d134 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/error.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/error.rs @@ -1,5 +1,5 @@ use kzg::{Error as KzgError, KzgCommitment}; -use types::{BeaconStateError, ColumnIndex, Hash256}; +use types::{BeaconStateError, ColumnIndex}; #[derive(Debug)] pub enum Error 
{ @@ -16,8 +16,6 @@ pub enum Error { BlobIndexInvalid(u64), DataColumnIndexInvalid(u64), StoreError(store::Error), - DecodeError(ssz::DecodeError), - ParentStateMissing(Hash256), BlockReplayError(state_processing::BlockReplayError), RebuildingStateCaches(BeaconStateError), SlotClockError, @@ -37,9 +35,7 @@ impl Error { Error::MissingBlobs | Error::MissingCustodyColumns(_) | Error::StoreError(_) - | Error::DecodeError(_) | Error::Unexpected(_) - | Error::ParentStateMissing(_) | Error::BlockReplayError(_) | Error::RebuildingStateCaches(_) | Error::SlotClockError => ErrorCategory::Internal, @@ -59,12 +55,6 @@ impl From for Error { } } -impl From for Error { - fn from(value: ssz::DecodeError) -> Self { - Self::DecodeError(value) - } -} - impl From for Error { fn from(value: state_processing::BlockReplayError) -> Self { Self::BlockReplayError(value) diff --git a/beacon_node/beacon_chain/src/data_availability_checker/state_lru_cache.rs b/beacon_node/beacon_chain/src/data_availability_checker/state_lru_cache.rs index 5fe674f30c1..fe8c89e6c3d 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/state_lru_cache.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/state_lru_cache.rs @@ -157,9 +157,9 @@ impl StateLRUCache { parent_block_state_root, ) .map_err(AvailabilityCheckError::StoreError)? 
- .ok_or(AvailabilityCheckError::ParentStateMissing( - parent_block_state_root, - ))?; + .ok_or(AvailabilityCheckError::Unexpected(format!( + "Parent state missing {parent_block_state_root:?}" + )))?; let state_roots = vec![ Ok((parent_state_root, diet_executed_block.parent_block.slot())), diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 7b92e1968fc..aaf3a9caf4d 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -84,18 +84,18 @@ impl PeerGroupAction { AvailabilityCheckError::MissingBlobs => { Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)) } + // TOOD(das): PeerAction::High may be too soft of a penalty. Also may be deprecated + // with https://github.com/sigp/lighthouse/issues/6258 AvailabilityCheckError::MissingCustodyColumns(columns) => Some( - PeerGroupAction::column_peers(columns, PeerAction::LowToleranceError), + PeerGroupAction::column_peers(columns, PeerAction::HighToleranceError), ), AvailabilityCheckError::BlobIndexInvalid(_) => { Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)) } AvailabilityCheckError::DataColumnIndexInvalid(_) => None, // unreachable AvailabilityCheckError::StoreError(_) => None, // unreachable - AvailabilityCheckError::DecodeError(_) => None, // ?? - AvailabilityCheckError::ParentStateMissing(_) => None, // ?? - AvailabilityCheckError::BlockReplayError(_) => None, // un-reachable ?? - AvailabilityCheckError::RebuildingStateCaches(_) => None, // ?? 
+ AvailabilityCheckError::BlockReplayError(_) => None, // internal error + AvailabilityCheckError::RebuildingStateCaches(_) => None, // internal error AvailabilityCheckError::SlotClockError => None, // internal error } } From 675ae07b9ac103518661cb6aad024ca83bdd8c12 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 14 May 2025 17:00:47 -0500 Subject: [PATCH 36/64] Penalize column peers once --- .../src/peer_manager/peerdb/score.rs | 2 +- .../network/src/sync/backfill_sync/mod.rs | 21 +++++++++++++------ .../network/src/sync/range_sync/chain.rs | 21 +++++++++++++------ 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/beacon_node/lighthouse_network/src/peer_manager/peerdb/score.rs b/beacon_node/lighthouse_network/src/peer_manager/peerdb/score.rs index 995ebf90646..517151a06f8 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/peerdb/score.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/peerdb/score.rs @@ -43,7 +43,7 @@ const GOSSIPSUB_POSITIVE_SCORE_WEIGHT: f64 = GOSSIPSUB_NEGATIVE_SCORE_WEIGHT; /// Each variant has an associated score change. // To easily assess the behaviour of scores changes the number of variants should stay low, and // somewhat generic. -#[derive(Debug, Clone, Copy, AsRefStr)] +#[derive(Debug, Clone, Copy, Hash, Eq, PartialEq, AsRefStr)] #[strum(serialize_all = "snake_case")] pub enum PeerAction { /// We should not communicate more with this peer. 
diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 0fde586887f..7b5701cc8d2 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -18,6 +18,7 @@ use crate::sync::range_sync::{ }; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; +use itertools::Itertools; use lighthouse_network::service::api_types::Id; use lighthouse_network::types::{BackFillState, NetworkGlobals}; use lighthouse_network::{PeerAction, PeerId}; @@ -609,12 +610,20 @@ impl BackFillSync { // Penalize the peer appropiately. network.report_peer(batch_peers.block(), penalty, "faulty_batch"); } - for (column_index, penalty) in &peer_action.column_peer { - if let Some(peer) = batch_peers.column(column_index) { - network.report_peer(*peer, *penalty, "faulty_batch"); - } else { - warn!(%batch_id, column_index, "Missing peer in PeerGroup"); - } + + // Penalize each peer only once. Currently a peer_action does not mix different + // PeerAction levels. 
+ for (peer, penalty) in peer_action + .column_peer + .iter() + .filter_map(|(column_index, penalty)| { + batch_peers + .column(column_index) + .map(|peer| (*peer, *penalty)) + }) + .unique() + { + network.report_peer(peer, penalty, "faulty_batch_column"); } match batch.processing_completed(BatchProcessingResult::FaultyFailure) { diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 350e46cc463..37963a7a6dc 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -6,6 +6,7 @@ use crate::sync::network_context::{RangeRequestId, RpcRequestSendError, RpcRespo use crate::sync::{network_context::SyncNetworkContext, BatchOperationOutcome, BatchProcessResult}; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::BeaconChainTypes; +use itertools::Itertools; use lighthouse_network::service::api_types::Id; use lighthouse_network::{PeerAction, PeerId}; use logging::crit; @@ -528,12 +529,20 @@ impl SyncingChain { // Penalize the peer appropiately. network.report_peer(batch_peers.block(), penalty, "faulty_batch"); } - for (column_index, penalty) in &peer_action.column_peer { - if let Some(peer) = batch_peers.column(column_index) { - network.report_peer(*peer, *penalty, "faulty_batch_columns"); - } else { - warn!(%batch_id, column_index, "Missing peer in PeerGroup"); - } + + // Penalize each peer only once. Currently a peer_action does not mix different + // PeerAction levels. 
+ for (peer, penalty) in peer_action + .column_peer + .iter() + .filter_map(|(column_index, penalty)| { + batch_peers + .column(column_index) + .map(|peer| (*peer, *penalty)) + }) + .unique() + { + network.report_peer(peer, penalty, "faulty_batch_column"); } // Check if this batch is allowed to continue From 56f7c36072af10f8c6398a70ed64ead0f86acbf1 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 14 May 2025 17:03:11 -0500 Subject: [PATCH 37/64] Restore fn --- .../src/network_beacon_processor/tests.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/beacon_node/network/src/network_beacon_processor/tests.rs b/beacon_node/network/src/network_beacon_processor/tests.rs index bc3efccf0b7..292e894870f 100644 --- a/beacon_node/network/src/network_beacon_processor/tests.rs +++ b/beacon_node/network/src/network_beacon_processor/tests.rs @@ -348,6 +348,22 @@ impl TestRig { } } + pub fn enqueue_gossip_data_columns(&self, col_index: usize) { + if let Some(data_columns) = self.next_data_columns.as_ref() { + let data_column = data_columns.get(col_index).unwrap(); + self.network_beacon_processor + .send_gossip_data_column_sidecar( + junk_message_id(), + junk_peer_id(), + Client::default(), + DataColumnSubnetId::from_column_index(data_column.index, &self.chain.spec), + data_column.clone(), + Duration::from_secs(0), + ) + .unwrap(); + } + } + pub fn custody_columns_count(&self) -> usize { self.network_beacon_processor .network_globals From f12d210647991c4013f903aca5e53b9654cd6b9c Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 14 May 2025 17:03:21 -0500 Subject: [PATCH 38/64] Fix error enum --- .../network/src/network_beacon_processor/sync_methods.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 
aaf3a9caf4d..58492df255f 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -697,7 +697,6 @@ impl NetworkBeaconProcessor { Err(ChainSegmentFailed { // Render the full error in debug for full details message: format!("{:?}", e), - // This is an internal error, don't penalize the peer. peer_action, }) } @@ -787,8 +786,9 @@ impl NetworkBeaconProcessor { warn!(?block_root, "Received block known to be invalid"); Some(PeerGroupAction::block_peer(PeerAction::Fatal)) } - // TODO(sync): Should we penalize slashable blocks? - BlockError::Slashable => None, + BlockError::Slashable => { + Some(PeerGroupAction::block_peer(PeerAction::MidToleranceError)) + } // Do not penalize peers for internal errors. // BlobNotRequired is never constructed on this path // TODO(sync): Double check that all `BeaconChainError` variants are actually internal From f0c775026925e00946f88cb230a7ee278dd38750 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 14 May 2025 17:04:12 -0500 Subject: [PATCH 39/64] Removed MismatchedPublicKeyLen --- beacon_node/beacon_chain/src/historical_blocks.rs | 1 - .../src/per_block_processing/signature_sets.rs | 3 --- 2 files changed, 4 deletions(-) diff --git a/beacon_node/beacon_chain/src/historical_blocks.rs b/beacon_node/beacon_chain/src/historical_blocks.rs index 0ff0da7fa52..728a49bb072 100644 --- a/beacon_node/beacon_chain/src/historical_blocks.rs +++ b/beacon_node/beacon_chain/src/historical_blocks.rs @@ -66,7 +66,6 @@ impl From for HistoricalBlockError { | SignatureSetError::ValidatorUnknown(_) | SignatureSetError::ValidatorPubkeyUnknown(_) | SignatureSetError::IncorrectBlockProposer { .. } - | SignatureSetError::MismatchedPublicKeyLen { .. 
} | SignatureSetError::PublicKeyDecompressionFailed | SignatureSetError::InconsistentBlockFork(_)) => Self::Unexpected(format!("{e:?}")), } diff --git a/consensus/state_processing/src/per_block_processing/signature_sets.rs b/consensus/state_processing/src/per_block_processing/signature_sets.rs index 39f438f97f6..e954541b592 100644 --- a/consensus/state_processing/src/per_block_processing/signature_sets.rs +++ b/consensus/state_processing/src/per_block_processing/signature_sets.rs @@ -34,9 +34,6 @@ pub enum Error { /// /// The block is invalid. IncorrectBlockProposer { block: u64, local_shuffling: u64 }, - /// The public keys supplied do not match the number of objects requiring keys. Block validity - /// was not determined. - MismatchedPublicKeyLen { pubkey_len: usize, other_len: usize }, /// Pubkey decompression failed. The block is invalid. PublicKeyDecompressionFailed, /// The public key bytes stored in the `BeaconState` were not valid. This is a serious internal From fad3a0a2ddd873c11a59d8aabb42e913827e69ee Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 14 May 2025 18:16:27 -0500 Subject: [PATCH 40/64] Revert testing changes --- .../beacon_chain/tests/block_verification.rs | 18 ++++++++++++----- .../tests/payload_invalidation.rs | 20 ++++++++----------- beacon_node/beacon_chain/tests/store_tests.rs | 14 +++++++++---- 3 files changed, 31 insertions(+), 21 deletions(-) diff --git a/beacon_node/beacon_chain/tests/block_verification.rs b/beacon_node/beacon_chain/tests/block_verification.rs index 42a090bf538..9225ffd9f41 100644 --- a/beacon_node/beacon_chain/tests/block_verification.rs +++ b/beacon_node/beacon_chain/tests/block_verification.rs @@ -367,7 +367,7 @@ async fn chain_segment_non_linear_parent_roots() { let (mut block, signature) = blocks[3].as_block().clone().deconstruct(); *block.parent_root_mut() = Hash256::zero(); - blocks[3] = harness.build_rpc_block_from_store_blobs( + blocks[3] = 
RpcBlock::new_without_blobs( None, Arc::new(SignedBeaconBlock::from_block(block, signature)), harness.sampling_column_count, @@ -405,7 +405,7 @@ async fn chain_segment_non_linear_slots() { .collect(); let (mut block, signature) = blocks[3].as_block().clone().deconstruct(); *block.slot_mut() = Slot::new(0); - blocks[3] = harness.build_rpc_block_from_store_blobs( + blocks[3] = RpcBlock::new_without_blobs( None, Arc::new(SignedBeaconBlock::from_block(block, signature)), harness.sampling_column_count, @@ -433,7 +433,7 @@ async fn chain_segment_non_linear_slots() { .collect(); let (mut block, signature) = blocks[3].as_block().clone().deconstruct(); *block.slot_mut() = blocks[2].slot(); - blocks[3] = harness.build_rpc_block_from_store_blobs( + blocks[3] = RpcBlock::new_without_blobs( None, Arc::new(SignedBeaconBlock::from_block(block, signature)), harness.sampling_column_count, @@ -578,7 +578,11 @@ async fn invalid_signature_gossip_block() { .into_block_error() .expect("should import all blocks prior to the one being tested"); let signed_block = SignedBeaconBlock::from_block(block, junk_signature()); - let rpc_block = harness.build_rpc_block_from_store_blobs(None, Arc::new(signed_block)); + let rpc_block = RpcBlock::new_without_blobs( + None, + Arc::new(signed_block), + harness.sampling_column_count, + ); let process_res = harness .chain .process_block( @@ -1767,7 +1771,11 @@ async fn import_duplicate_block_unrealized_justification() { // Create two verified variants of the block, representing the same block being processed in // parallel. 
let notify_execution_layer = NotifyExecutionLayer::Yes; - let rpc_block = harness.build_rpc_block_from_store_blobs(Some(block_root), block.clone()); + let rpc_block = RpcBlock::new_without_blobs( + Some(block_root), + block.clone(), + harness.sampling_column_count, + ); let verified_block1 = rpc_block .clone() .into_execution_pending_block(block_root, chain, notify_execution_layer) diff --git a/beacon_node/beacon_chain/tests/payload_invalidation.rs b/beacon_node/beacon_chain/tests/payload_invalidation.rs index b8f72f291b1..c6fc3416e05 100644 --- a/beacon_node/beacon_chain/tests/payload_invalidation.rs +++ b/beacon_node/beacon_chain/tests/payload_invalidation.rs @@ -688,9 +688,8 @@ async fn invalidates_all_descendants() { assert_eq!(fork_parent_state.slot(), fork_parent_slot); let ((fork_block, _), _fork_post_state) = rig.harness.make_block(fork_parent_state, fork_slot).await; - let fork_rpc_block = rig - .harness - .build_rpc_block_from_store_blobs(None, fork_block.clone()); + let fork_rpc_block = + RpcBlock::new_without_blobs(None, fork_block.clone(), rig.harness.sampling_column_count); let fork_block_root = rig .harness .chain @@ -792,9 +791,8 @@ async fn switches_heads() { let ((fork_block, _), _fork_post_state) = rig.harness.make_block(fork_parent_state, fork_slot).await; let fork_parent_root = fork_block.parent_root(); - let fork_rpc_block = rig - .harness - .build_rpc_block_from_store_blobs(None, fork_block.clone()); + let fork_rpc_block = + RpcBlock::new_without_blobs(None, fork_block.clone(), rig.harness.sampling_column_count); let fork_block_root = rig .harness .chain @@ -1064,9 +1062,8 @@ async fn invalid_parent() { )); // Ensure the block built atop an invalid payload is invalid for import. 
- let rpc_block = rig - .harness - .build_rpc_block_from_store_blobs(None, block.clone()); + let rpc_block = + RpcBlock::new_without_blobs(None, block.clone(), rig.harness.sampling_column_count); assert!(matches!( rig.harness.chain.process_block(rpc_block.block_root(), rpc_block, NotifyExecutionLayer::Yes, BlockImportSource::Lookup, || Ok(()), @@ -1390,9 +1387,8 @@ async fn recover_from_invalid_head_by_importing_blocks() { } = InvalidHeadSetup::new().await; // Import the fork block, it should become the head. - let fork_rpc_block = rig - .harness - .build_rpc_block_from_store_blobs(None, fork_block.clone()); + let fork_rpc_block = + RpcBlock::new_without_blobs(None, fork_block.clone(), rig.harness.sampling_column_count); rig.harness .chain .process_block( diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index 804ddbe33c4..3343dc101b5 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -2644,8 +2644,11 @@ async fn process_blocks_and_attestations_for_unaligned_checkpoint() { assert_eq!(split.block_root, valid_fork_block.parent_root()); assert_ne!(split.state_root, unadvanced_split_state_root); - let invalid_fork_rpc_block = - harness.build_rpc_block_from_store_blobs(None, invalid_fork_block.clone()); + let invalid_fork_rpc_block = RpcBlock::new_without_blobs( + None, + invalid_fork_block.clone(), + harness.sampling_column_count, + ); // Applying the invalid block should fail. let err = harness .chain @@ -2661,8 +2664,11 @@ async fn process_blocks_and_attestations_for_unaligned_checkpoint() { assert!(matches!(err, BlockError::WouldRevertFinalizedSlot { .. })); // Applying the valid block should succeed, but it should not become head. 
- let valid_fork_rpc_block = - harness.build_rpc_block_from_store_blobs(None, valid_fork_block.clone()); + let valid_fork_rpc_block = RpcBlock::new_without_blobs( + None, + valid_fork_block.clone(), + harness.sampling_column_count, + ); harness .chain .process_block( From f637cbbabd987546bc9a95ea166ec747999ecc05 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 14 May 2025 18:20:13 -0500 Subject: [PATCH 41/64] Change BlockAndCustodyColumns enum variant --- .../src/block_verification_types.rs | 58 ++++++++++--------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/beacon_node/beacon_chain/src/block_verification_types.rs b/beacon_node/beacon_chain/src/block_verification_types.rs index 8bf13ea328b..488b2ac6c56 100644 --- a/beacon_node/beacon_chain/src/block_verification_types.rs +++ b/beacon_node/beacon_chain/src/block_verification_types.rs @@ -53,7 +53,7 @@ impl RpcBlock { match &self.block { RpcBlockInner::Block(block) => block, RpcBlockInner::BlockAndBlobs(block, _) => block, - RpcBlockInner::BlockAndCustodyColumns(block, _, _) => block, + RpcBlockInner::BlockAndCustodyColumns { block, .. } => block, } } @@ -61,7 +61,7 @@ impl RpcBlock { match &self.block { RpcBlockInner::Block(block) => block.clone(), RpcBlockInner::BlockAndBlobs(block, _) => block.clone(), - RpcBlockInner::BlockAndCustodyColumns(block, _, _) => block.clone(), + RpcBlockInner::BlockAndCustodyColumns { block, .. } => block.clone(), } } @@ -69,7 +69,7 @@ impl RpcBlock { match &self.block { RpcBlockInner::Block(_) => None, RpcBlockInner::BlockAndBlobs(_, blobs) => Some(blobs), - RpcBlockInner::BlockAndCustodyColumns(_, _, _) => None, + RpcBlockInner::BlockAndCustodyColumns { .. 
} => None, } } @@ -77,7 +77,7 @@ impl RpcBlock { match &self.block { RpcBlockInner::Block(_) => None, RpcBlockInner::BlockAndBlobs(_, _) => None, - RpcBlockInner::BlockAndCustodyColumns(_, data_columns, _) => Some(data_columns), + RpcBlockInner::BlockAndCustodyColumns { data_columns, .. } => Some(data_columns), } } @@ -91,7 +91,7 @@ impl RpcBlock { .map(|blob| blob.index) .collect(), ), - RpcBlockInner::BlockAndCustodyColumns(..) => None, + RpcBlockInner::BlockAndCustodyColumns { .. } => None, } } @@ -99,7 +99,11 @@ impl RpcBlock { match &self.block { RpcBlockInner::Block(_) => None, RpcBlockInner::BlockAndBlobs(..) => None, - RpcBlockInner::BlockAndCustodyColumns(block, data_columns, _) => Some( + RpcBlockInner::BlockAndCustodyColumns { + block, + data_columns, + .. + } => Some( data_columns .iter() .filter(|column| { @@ -127,11 +131,11 @@ enum RpcBlockInner { BlockAndBlobs(Arc>, BlobSidecarList), /// **Range sync**: Variant for all post-Fulu blocks regardless if they have data or not /// **Lookup sync**: Not used - BlockAndCustodyColumns( - Arc>, - CustodyDataColumnList, - Vec, - ), + BlockAndCustodyColumns { + block: Arc>, + data_columns: CustodyDataColumnList, + expected_custody_indices: Vec, + }, } impl RpcBlock { @@ -203,17 +207,19 @@ impl RpcBlock { let block_root = block_root.unwrap_or_else(|| get_block_root(&block)); let custody_columns_count = expected_custody_indices.len(); - let inner = RpcBlockInner::BlockAndCustodyColumns( + let inner = RpcBlockInner::BlockAndCustodyColumns { block, - RuntimeVariableList::new(custody_columns, spec.number_of_columns as usize).map_err( - |e| { - AvailabilityCheckError::Unexpected(format!( - "custody_columns len exceeds number_of_columns: {e:?}" - )) - }, - )?, + data_columns: RuntimeVariableList::new( + custody_columns, + spec.number_of_columns as usize, + ) + .map_err(|e| { + AvailabilityCheckError::Unexpected(format!( + "custody_columns len exceeds number_of_columns: {e:?}" + )) + })?, expected_custody_indices, - ); + 
}; Ok(Self { block_root, block: inner, @@ -234,11 +240,11 @@ impl RpcBlock { match self.block { RpcBlockInner::Block(block) => (block_root, block, None, None), RpcBlockInner::BlockAndBlobs(block, blobs) => (block_root, block, Some(blobs), None), - RpcBlockInner::BlockAndCustodyColumns( + RpcBlockInner::BlockAndCustodyColumns { block, data_columns, expected_custody_indices, - ) => ( + } => ( block_root, block, None, @@ -248,14 +254,14 @@ impl RpcBlock { } pub fn n_blobs(&self) -> usize { match &self.block { - RpcBlockInner::Block(_) | RpcBlockInner::BlockAndCustodyColumns(_, _, _) => 0, + RpcBlockInner::Block(_) | RpcBlockInner::BlockAndCustodyColumns { .. } => 0, RpcBlockInner::BlockAndBlobs(_, blobs) => blobs.len(), } } pub fn n_data_columns(&self) -> usize { match &self.block { RpcBlockInner::Block(_) | RpcBlockInner::BlockAndBlobs(_, _) => 0, - RpcBlockInner::BlockAndCustodyColumns(_, data_columns, _) => data_columns.len(), + RpcBlockInner::BlockAndCustodyColumns { data_columns, .. } => data_columns.len(), } } } @@ -570,14 +576,14 @@ impl AsBlock for RpcBlock { match &self.block { RpcBlockInner::Block(block) => block, RpcBlockInner::BlockAndBlobs(block, _) => block, - RpcBlockInner::BlockAndCustodyColumns(block, _, _) => block, + RpcBlockInner::BlockAndCustodyColumns { block, .. } => block, } } fn block_cloned(&self) -> Arc> { match &self.block { RpcBlockInner::Block(block) => block.clone(), RpcBlockInner::BlockAndBlobs(block, _) => block.clone(), - RpcBlockInner::BlockAndCustodyColumns(block, _, _) => block.clone(), + RpcBlockInner::BlockAndCustodyColumns { block, .. 
} => block.clone(), } } fn canonical_root(&self) -> Hash256 { From 9292f82c1c23be6d9a1b9d36015aabf60342518e Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 14 May 2025 21:11:05 -0500 Subject: [PATCH 42/64] Revert type change in import_historical_block_batch --- .../beacon_chain/src/block_verification.rs | 3 +- .../src/block_verification_types.rs | 51 ++++++++++------ .../src/data_availability_checker.rs | 21 +++++++ .../beacon_chain/src/historical_blocks.rs | 61 +++++++++++-------- beacon_node/beacon_chain/tests/store_tests.rs | 2 +- .../network_beacon_processor/sync_methods.rs | 5 +- 6 files changed, 97 insertions(+), 46 deletions(-) diff --git a/beacon_node/beacon_chain/src/block_verification.rs b/beacon_node/beacon_chain/src/block_verification.rs index b3783342b0a..66829def1bc 100644 --- a/beacon_node/beacon_chain/src/block_verification.rs +++ b/beacon_node/beacon_chain/src/block_verification.rs @@ -637,7 +637,8 @@ pub fn signature_verify_chain_segment( &chain.spec, )?; - // verify signatures before matching blocks and data + // Verify signatures before matching blocks and data. Otherwise we may penalize blob or column + // peers for valid signatures if the block peer sends us an invalid signature. 
let pubkey_cache = get_validator_pubkey_cache(chain)?; let mut signature_verifier = get_signature_verifier(&state, &pubkey_cache, &chain.spec); for (block_root, block) in &chain_segment { diff --git a/beacon_node/beacon_chain/src/block_verification_types.rs b/beacon_node/beacon_chain/src/block_verification_types.rs index 488b2ac6c56..1dea71acc06 100644 --- a/beacon_node/beacon_chain/src/block_verification_types.rs +++ b/beacon_node/beacon_chain/src/block_verification_types.rs @@ -9,8 +9,9 @@ use std::fmt::{Debug, Formatter}; use std::sync::Arc; use types::blob_sidecar::BlobIdentifier; use types::{ - BeaconBlockRef, BeaconState, BlindedPayload, BlobSidecarList, ChainSpec, ColumnIndex, Epoch, - EthSpec, Hash256, RuntimeVariableList, SignedBeaconBlock, SignedBeaconBlockHeader, Slot, + BeaconBlockRef, BeaconState, BlindedPayload, BlobSidecarList, ChainSpec, ColumnIndex, + DataColumnSidecar, Epoch, EthSpec, Hash256, RuntimeVariableList, SignedBeaconBlock, + SignedBeaconBlockHeader, Slot, }; /// A block that has been received over RPC. It has 2 internal variants: @@ -84,13 +85,9 @@ impl RpcBlock { pub fn non_matching_blobs_signed_headers(&self) -> Option> { match &self.block { RpcBlockInner::Block(_) => None, - RpcBlockInner::BlockAndBlobs(block, blobs) => Some( - blobs - .iter() - .filter(|blob| &blob.signed_block_header.signature != block.signature()) - .map(|blob| blob.index) - .collect(), - ), + RpcBlockInner::BlockAndBlobs(block, blobs) => { + Some(non_matching_blobs_block_signature(block, blobs)) + } RpcBlockInner::BlockAndCustodyColumns { .. } => None, } } @@ -103,15 +100,13 @@ impl RpcBlock { block, data_columns, .. 
- } => Some( - data_columns + } => Some(non_matching_custody_columns_block_signature( + block, + &data_columns .iter() - .filter(|column| { - &column.as_data_column().signed_block_header.signature != block.signature() - }) - .map(|column| column.index()) - .collect(), - ), + .map(|data_column| data_column.clone_arc()) + .collect::>(), + )), } } } @@ -590,3 +585,25 @@ impl AsBlock for RpcBlock { self.as_block().canonical_root() } } + +pub fn non_matching_blobs_block_signature( + block: &SignedBeaconBlock, + blobs: &BlobSidecarList, +) -> Vec { + blobs + .iter() + .filter(|blob| &blob.signed_block_header.signature != block.signature()) + .map(|blob| blob.index) + .collect() +} + +pub fn non_matching_custody_columns_block_signature( + block: &SignedBeaconBlock, + data_columns: &[Arc>], +) -> Vec { + data_columns + .iter() + .filter(|column| &column.signed_block_header.signature != block.signature()) + .map(|column| column.index) + .collect() +} diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs index 42938097d38..c1956c33455 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker.rs @@ -1,5 +1,6 @@ use crate::blob_verification::{verify_kzg_for_blob_list, GossipVerifiedBlob, KzgVerifiedBlobList}; use crate::block_verification_types::{ + non_matching_blobs_block_signature, non_matching_custody_columns_block_signature, AvailabilityPendingExecutedBlock, AvailableExecutedBlock, RpcBlock, }; use crate::data_availability_checker::overflow_lru_cache::{ @@ -801,6 +802,26 @@ impl AvailableBlock { (block_root, block, blob_data) } + pub fn non_matching_blobs_signed_headers(&self) -> Option> { + match &self.blob_data { + AvailableBlockData::NoData => None, + AvailableBlockData::Blobs(blobs) => { + Some(non_matching_blobs_block_signature(&self.block, blobs)) + } + AvailableBlockData::DataColumns(_) => None, + } + } + + pub fn 
non_matching_custody_columns_signed_headers(&self) -> Option> { + match &self.blob_data { + AvailableBlockData::NoData => None, + AvailableBlockData::Blobs(_) => None, + AvailableBlockData::DataColumns(data_columns) => Some( + non_matching_custody_columns_block_signature(&self.block, data_columns), + ), + } + } + /// Only used for testing pub fn __clone_without_recv(&self) -> Result { Ok(Self { diff --git a/beacon_node/beacon_chain/src/historical_blocks.rs b/beacon_node/beacon_chain/src/historical_blocks.rs index 728a49bb072..1be5c8eb6af 100644 --- a/beacon_node/beacon_chain/src/historical_blocks.rs +++ b/beacon_node/beacon_chain/src/historical_blocks.rs @@ -1,5 +1,7 @@ use crate::block_verification_types::{MaybeAvailableBlock, RpcBlock}; -use crate::data_availability_checker::{AvailabilityCheckError, AvailableBlockData}; +use crate::data_availability_checker::{ + AvailabilityCheckError, AvailableBlock, AvailableBlockData, +}; use crate::{metrics, BeaconChain, BeaconChainTypes}; use itertools::Itertools; use state_processing::{ @@ -100,6 +102,37 @@ impl BeaconChain { Ok(()) } + pub fn verify_and_import_historical_block_batch( + &self, + blocks: Vec>, + ) -> Result { + // First check that chain of blocks is correct + self.assert_correct_historical_block_chain(&blocks)?; + + // Check that all data columns are present <- faulty failure if missing because we have + // checked the block root is correct first. + let blocks = self + .data_availability_checker + .verify_kzg_for_rpc_blocks(blocks) + .and_then(|blocks| { + blocks + .into_iter() + // RpcBlocks must always be Available, otherwise a data peer is faulty of + // malicious. `verify_kzg_for_rpc_blocks` returns errors for those cases, but we + // haven't updated its function signature. This code block can be deleted later + // bigger refactor. + .map(|maybe_available| match maybe_available { + MaybeAvailableBlock::Available(block) => Ok(block), + MaybeAvailableBlock::AvailabilityPending { .. 
} => Err( + AvailabilityCheckError::Unexpected("block not available".to_string()), + ), + }) + .collect::, _>>() + })?; + + self.import_historical_block_batch(blocks) + } + /// Store a batch of historical blocks in the database. /// /// The `blocks` should be given in slot-ascending order. One of the blocks should have a block @@ -120,11 +153,8 @@ impl BeaconChain { /// Return the number of blocks successfully imported. pub fn import_historical_block_batch( &self, - blocks: Vec>, + mut blocks: Vec>, ) -> Result { - // First check that chain of blocks is correct - self.assert_correct_historical_block_chain(&blocks)?; - // Verify that blobs or data columns signatures match // // TODO(das): We don't raise the `matching_sidecar_signatures_error` yet. We have to wait to @@ -147,27 +177,6 @@ impl BeaconChain { }) .collect::, _>>(); - // Check that all data columns are present <- faulty failure if missing because we have - // checked the block root is correct first. - let mut blocks = self - .data_availability_checker - .verify_kzg_for_rpc_blocks(blocks) - .and_then(|blocks| { - blocks - .into_iter() - // RpcBlocks must always be Available, otherwise a data peer is faulty of - // malicious. `verify_kzg_for_rpc_blocks` returns errors for those cases, but we - // haven't updated its function signature. This code block can be deleted later - // bigger refactor. - .map(|maybe_available| match maybe_available { - MaybeAvailableBlock::Available(block) => Ok(block), - MaybeAvailableBlock::AvailabilityPending { .. 
} => Err( - AvailabilityCheckError::Unexpected("block not available".to_string()), - ), - }) - .collect::, _>>() - })?; - let anchor_info = self.store.get_anchor_info(); let blob_info = self.store.get_blob_info(); let data_column_info = self.store.get_data_column_info(); diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index 3343dc101b5..559d613c92c 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -2503,7 +2503,7 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { beacon_chain .import_historical_block_batch(batch_with_invalid_first_block) .unwrap_err(), - HistoricalBlockError::InvalidSignature + HistoricalBlockError::InvalidSignature(_) )); // Importing the batch with valid signatures should succeed. diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 58492df255f..98d70f39ae9 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -649,7 +649,10 @@ impl NetworkBeaconProcessor { &self, downloaded_blocks: Vec>, ) -> Result { - match self.chain.import_historical_block_batch(downloaded_blocks) { + match self + .chain + .verify_and_import_historical_block_batch(downloaded_blocks) + { Ok(imported_blocks) => { metrics::inc_counter( &metrics::BEACON_PROCESSOR_BACKFILL_CHAIN_SEGMENT_SUCCESS_TOTAL, From edc7da0a054c8ddc7e5bd92daefe3977d1b3cd07 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 15 May 2025 12:48:45 -0500 Subject: [PATCH 43/64] Drop pubkey cache --- beacon_node/beacon_chain/src/block_verification.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/beacon_node/beacon_chain/src/block_verification.rs b/beacon_node/beacon_chain/src/block_verification.rs index 
66829def1bc..de249206992 100644 --- a/beacon_node/beacon_chain/src/block_verification.rs +++ b/beacon_node/beacon_chain/src/block_verification.rs @@ -649,6 +649,7 @@ pub fn signature_verify_chain_segment( if signature_verifier.verify().is_err() { return Err(BlockError::InvalidSignature(InvalidSignature::Unknown)); } + drop(pubkey_cache); // Verify that blobs or data columns signatures match // From 859f380a9d9c9fd9ab8c4e5084b5e4386a9d9d96 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 15 May 2025 13:01:33 -0500 Subject: [PATCH 44/64] Don't collect Vec --- .../beacon_chain/src/block_verification_types.rs | 10 ++++------ .../beacon_chain/src/data_availability_checker.rs | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/beacon_node/beacon_chain/src/block_verification_types.rs b/beacon_node/beacon_chain/src/block_verification_types.rs index 1dea71acc06..af17e2f5181 100644 --- a/beacon_node/beacon_chain/src/block_verification_types.rs +++ b/beacon_node/beacon_chain/src/block_verification_types.rs @@ -102,10 +102,9 @@ impl RpcBlock { .. 
} => Some(non_matching_custody_columns_block_signature( block, - &data_columns + data_columns .iter() - .map(|data_column| data_column.clone_arc()) - .collect::>(), + .map(|data_column| data_column.as_data_column()), )), } } @@ -597,12 +596,11 @@ pub fn non_matching_blobs_block_signature( .collect() } -pub fn non_matching_custody_columns_block_signature( +pub fn non_matching_custody_columns_block_signature<'a, E: EthSpec>( block: &SignedBeaconBlock, - data_columns: &[Arc>], + data_columns: impl Iterator>>, ) -> Vec { data_columns - .iter() .filter(|column| &column.signed_block_header.signature != block.signature()) .map(|column| column.index) .collect() diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs index c1956c33455..6e0073c81ab 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker.rs @@ -817,7 +817,7 @@ impl AvailableBlock { AvailableBlockData::NoData => None, AvailableBlockData::Blobs(_) => None, AvailableBlockData::DataColumns(data_columns) => Some( - non_matching_custody_columns_block_signature(&self.block, data_columns), + non_matching_custody_columns_block_signature(&self.block, data_columns.iter()), ), } } From 74f137ecd7c291d97b315b34d927b409f7a2a25c Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 15 May 2025 13:01:42 -0500 Subject: [PATCH 45/64] Classify errors --- .../beacon_chain/src/data_availability_checker/error.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/beacon_node/beacon_chain/src/data_availability_checker/error.rs b/beacon_node/beacon_chain/src/data_availability_checker/error.rs index e602ce2d134..534a7692208 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/error.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/error.rs @@ -32,14 +32,14 @@ pub enum ErrorCategory { impl Error 
{ pub fn category(&self) -> ErrorCategory { match self { - Error::MissingBlobs - | Error::MissingCustodyColumns(_) - | Error::StoreError(_) + Error::StoreError(_) | Error::Unexpected(_) | Error::BlockReplayError(_) | Error::RebuildingStateCaches(_) | Error::SlotClockError => ErrorCategory::Internal, - Error::InvalidBlobs { .. } + Error::MissingBlobs + | Error::MissingCustodyColumns(_) + | Error::InvalidBlobs { .. } | Error::InvalidColumn { .. } | Error::ReconstructColumnsError { .. } | Error::BlobIndexInvalid(_) From f48b586f04f84af997853c3ef81f5f8e04b7245e Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 15 May 2025 13:06:17 -0500 Subject: [PATCH 46/64] Remove ReconstructColumnsError --- beacon_node/beacon_chain/src/data_availability_checker.rs | 2 +- beacon_node/beacon_chain/src/data_availability_checker/error.rs | 2 -- .../network/src/network_beacon_processor/sync_methods.rs | 1 - 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs index 6e0073c81ab..2a8f6f5e3d4 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker.rs @@ -589,7 +589,7 @@ impl DataAvailabilityChecker { self.availability_cache .handle_reconstruction_failure(block_root); metrics::inc_counter(&KZG_DATA_COLUMN_RECONSTRUCTION_FAILURES); - AvailabilityCheckError::ReconstructColumnsError(e) + AvailabilityCheckError::Unexpected(format!("Error reconstructing columns: {e:?}")) })?; // Check indices from cache again to make sure we don't publish components we've already received. 
diff --git a/beacon_node/beacon_chain/src/data_availability_checker/error.rs b/beacon_node/beacon_chain/src/data_availability_checker/error.rs index 534a7692208..3388fd75cbd 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/error.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/error.rs @@ -5,7 +5,6 @@ use types::{BeaconStateError, ColumnIndex}; pub enum Error { InvalidBlobs(KzgError), InvalidColumn(Vec<(ColumnIndex, KzgError)>), - ReconstructColumnsError(KzgError), KzgCommitmentMismatch { blob_commitment: KzgCommitment, block_commitment: KzgCommitment, @@ -41,7 +40,6 @@ impl Error { | Error::MissingCustodyColumns(_) | Error::InvalidBlobs { .. } | Error::InvalidColumn { .. } - | Error::ReconstructColumnsError { .. } | Error::BlobIndexInvalid(_) | Error::DataColumnIndexInvalid(_) | Error::KzgCommitmentMismatch { .. } => ErrorCategory::Malicious, diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 98d70f39ae9..88658c2e2ad 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -78,7 +78,6 @@ impl PeerGroupAction { &errors.iter().map(|(index, _)| *index).collect::>(), PeerAction::LowToleranceError, )), - AvailabilityCheckError::ReconstructColumnsError(_) => None, // internal error AvailabilityCheckError::KzgCommitmentMismatch { .. 
} => None, // should never happen after checking inclusion proof AvailabilityCheckError::Unexpected(_) => None, // internal AvailabilityCheckError::MissingBlobs => { From 464921194f8be450b16b43362fc0fe1c783dc343 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 15 May 2025 13:13:05 -0500 Subject: [PATCH 47/64] More detailed UnrequestedSlot error --- .../network/src/sync/network_context/requests.rs | 10 ++++++++-- .../sync/network_context/requests/blobs_by_range.rs | 12 ++++++++---- .../sync/network_context/requests/blocks_by_range.rs | 12 ++++++++---- .../requests/data_columns_by_range.rs | 12 ++++++++---- 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/beacon_node/network/src/sync/network_context/requests.rs b/beacon_node/network/src/sync/network_context/requests.rs index 963b633ed6d..cd70a2e7ebc 100644 --- a/beacon_node/network/src/sync/network_context/requests.rs +++ b/beacon_node/network/src/sync/network_context/requests.rs @@ -28,11 +28,17 @@ mod data_columns_by_root; #[derive(Debug, PartialEq, Eq, IntoStaticStr)] pub enum LookupVerifyError { - NotEnoughResponsesReturned { actual: usize }, + NotEnoughResponsesReturned { + actual: usize, + }, TooManyResponses, UnrequestedBlockRoot(Hash256), UnrequestedIndex(u64), - UnrequestedSlot(Slot), + UnrequestedSlot { + slot: Slot, + start_slot: Slot, + end_slot: Slot, + }, InvalidInclusionProof, DuplicatedData(Slot, u64), InternalError(String), diff --git a/beacon_node/network/src/sync/network_context/requests/blobs_by_range.rs b/beacon_node/network/src/sync/network_context/requests/blobs_by_range.rs index 9c6f516199c..1320c734c91 100644 --- a/beacon_node/network/src/sync/network_context/requests/blobs_by_range.rs +++ b/beacon_node/network/src/sync/network_context/requests/blobs_by_range.rs @@ -25,10 +25,14 @@ impl ActiveRequestItems for BlobsByRangeRequestItems { type Item = Arc>; fn add(&mut self, blob: Self::Item) -> Result { - if blob.slot() < 
self.request.start_slot - || blob.slot() >= self.request.start_slot + self.request.count - { - return Err(LookupVerifyError::UnrequestedSlot(blob.slot())); + let end_slot = self.request.start_slot + self.request.count; + + if blob.slot() < self.request.start_slot || blob.slot() >= end_slot { + return Err(LookupVerifyError::UnrequestedSlot { + slot: blob.slot(), + start_slot: self.request.start_slot, + end_slot, + }); } if blob.index >= self.max_blobs_per_block { return Err(LookupVerifyError::UnrequestedIndex(blob.index)); diff --git a/beacon_node/network/src/sync/network_context/requests/blocks_by_range.rs b/beacon_node/network/src/sync/network_context/requests/blocks_by_range.rs index c7d2dda01ea..9ea763400c0 100644 --- a/beacon_node/network/src/sync/network_context/requests/blocks_by_range.rs +++ b/beacon_node/network/src/sync/network_context/requests/blocks_by_range.rs @@ -23,10 +23,14 @@ impl ActiveRequestItems for BlocksByRangeRequestItems { type Item = Arc>; fn add(&mut self, block: Self::Item) -> Result { - if block.slot().as_u64() < *self.request.start_slot() - || block.slot().as_u64() >= self.request.start_slot() + self.request.count() - { - return Err(LookupVerifyError::UnrequestedSlot(block.slot())); + let end_slot = self.request.start_slot() + self.request.count(); + + if block.slot().as_u64() < *self.request.start_slot() || block.slot().as_u64() >= end_slot { + return Err(LookupVerifyError::UnrequestedSlot { + slot: block.slot(), + start_slot: self.request.start_slot(), + end_slot, + }); } if self .items diff --git a/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs b/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs index 9dabb2defa0..7e3fddc705d 100644 --- a/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs +++ b/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs @@ -23,10 +23,14 @@ impl ActiveRequestItems for 
DataColumnsByRangeRequestItems { type Item = Arc>; fn add(&mut self, data_column: Self::Item) -> Result { - if data_column.slot() < self.request.start_slot - || data_column.slot() >= self.request.start_slot + self.request.count - { - return Err(LookupVerifyError::UnrequestedSlot(data_column.slot())); + let end_slot = self.request.start_slot + self.request.count; + + if data_column.slot() < self.request.start_slot || data_column.slot() >= end_slot { + return Err(LookupVerifyError::UnrequestedSlot { + slot: data_column.slot(), + start_slot: self.request.start_slot, + end_slot, + }); } if !self.request.columns.contains(&data_column.index) { return Err(LookupVerifyError::UnrequestedIndex(data_column.index)); From 10882fdff93b5ce1219c8e69c8bc4d56ba754bbb Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 15 May 2025 13:23:08 -0500 Subject: [PATCH 48/64] Lint test --- beacon_node/beacon_chain/tests/block_verification.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/beacon_node/beacon_chain/tests/block_verification.rs b/beacon_node/beacon_chain/tests/block_verification.rs index 9225ffd9f41..3350aa83363 100644 --- a/beacon_node/beacon_chain/tests/block_verification.rs +++ b/beacon_node/beacon_chain/tests/block_verification.rs @@ -143,10 +143,14 @@ fn build_rpc_block( Some(DataSidecars::Blobs(blobs)) => { RpcBlock::new(None, block, Some(blobs.clone())).unwrap() } - Some(DataSidecars::DataColumns(columns)) => { - RpcBlock::new_with_custody_columns(None, block, columns.clone(), columns.len(), spec) - .unwrap() - } + Some(DataSidecars::DataColumns(columns)) => RpcBlock::new_with_custody_columns( + None, + block, + columns.clone(), + columns.iter().map(|d| d.index).collect(), + spec, + ) + .unwrap(), None => RpcBlock::new_without_blobs(None, block, 0), } } From 50032a467de5fc70e6320a0c43667dab75e9f1db Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 15 
May 2025 13:23:19 -0500 Subject: [PATCH 49/64] Fix slot conversion --- .../src/sync/network_context/requests/blobs_by_range.rs | 9 +++++---- .../src/sync/network_context/requests/blocks_by_range.rs | 9 +++++---- .../network_context/requests/data_columns_by_range.rs | 9 +++++---- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/beacon_node/network/src/sync/network_context/requests/blobs_by_range.rs b/beacon_node/network/src/sync/network_context/requests/blobs_by_range.rs index 1320c734c91..8a9a8c9813c 100644 --- a/beacon_node/network/src/sync/network_context/requests/blobs_by_range.rs +++ b/beacon_node/network/src/sync/network_context/requests/blobs_by_range.rs @@ -1,7 +1,7 @@ use super::{ActiveRequestItems, LookupVerifyError}; use lighthouse_network::rpc::methods::BlobsByRangeRequest; use std::sync::Arc; -use types::{BlobSidecar, EthSpec}; +use types::{BlobSidecar, EthSpec, Slot}; /// Accumulates results of a blobs_by_range request. Only returns items after receiving the /// stream termination. 
@@ -25,12 +25,13 @@ impl ActiveRequestItems for BlobsByRangeRequestItems { type Item = Arc>; fn add(&mut self, blob: Self::Item) -> Result { - let end_slot = self.request.start_slot + self.request.count; + let start_slot = Slot::new(self.request.start_slot); + let end_slot = start_slot + Slot::new(self.request.count); - if blob.slot() < self.request.start_slot || blob.slot() >= end_slot { + if blob.slot() < start_slot || blob.slot() >= end_slot { return Err(LookupVerifyError::UnrequestedSlot { slot: blob.slot(), - start_slot: self.request.start_slot, + start_slot, end_slot, }); } diff --git a/beacon_node/network/src/sync/network_context/requests/blocks_by_range.rs b/beacon_node/network/src/sync/network_context/requests/blocks_by_range.rs index 9ea763400c0..ae39ac1d766 100644 --- a/beacon_node/network/src/sync/network_context/requests/blocks_by_range.rs +++ b/beacon_node/network/src/sync/network_context/requests/blocks_by_range.rs @@ -1,7 +1,7 @@ use super::{ActiveRequestItems, LookupVerifyError}; use lighthouse_network::rpc::BlocksByRangeRequest; use std::sync::Arc; -use types::{EthSpec, SignedBeaconBlock}; +use types::{EthSpec, SignedBeaconBlock, Slot}; /// Accumulates results of a blocks_by_range request. Only returns items after receiving the /// stream termination. 
@@ -23,12 +23,13 @@ impl ActiveRequestItems for BlocksByRangeRequestItems { type Item = Arc>; fn add(&mut self, block: Self::Item) -> Result { - let end_slot = self.request.start_slot() + self.request.count(); + let start_slot = Slot::new(*self.request.start_slot()); + let end_slot = start_slot + Slot::new(*self.request.count()); - if block.slot().as_u64() < *self.request.start_slot() || block.slot().as_u64() >= end_slot { + if block.slot() < start_slot || block.slot() >= end_slot { return Err(LookupVerifyError::UnrequestedSlot { slot: block.slot(), - start_slot: self.request.start_slot(), + start_slot, end_slot, }); } diff --git a/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs b/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs index 7e3fddc705d..276ede93c12 100644 --- a/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs +++ b/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs @@ -1,7 +1,7 @@ use super::{ActiveRequestItems, LookupVerifyError}; use lighthouse_network::rpc::methods::DataColumnsByRangeRequest; use std::sync::Arc; -use types::{DataColumnSidecar, EthSpec}; +use types::{DataColumnSidecar, EthSpec, Slot}; /// Accumulates results of a data_columns_by_range request. Only returns items after receiving the /// stream termination. 
@@ -23,12 +23,13 @@ impl ActiveRequestItems for DataColumnsByRangeRequestItems { type Item = Arc>; fn add(&mut self, data_column: Self::Item) -> Result { - let end_slot = self.request.start_slot + self.request.count; + let start_slot = Slot::new(self.request.start_slot); + let end_slot = start_slot + Slot::new(self.request.count); - if data_column.slot() < self.request.start_slot || data_column.slot() >= end_slot { + if data_column.slot() < start_slot || data_column.slot() >= end_slot { return Err(LookupVerifyError::UnrequestedSlot { slot: data_column.slot(), - start_slot: self.request.start_slot, + start_slot, end_slot, }); } From 06d4076ddff53c4518f04bda6e948ac60b238e8c Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 15 May 2025 13:37:39 -0500 Subject: [PATCH 50/64] Reduce penalty for missing blobs --- .../network/src/network_beacon_processor/sync_methods.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 88658c2e2ad..b1777cef792 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -81,7 +81,7 @@ impl PeerGroupAction { AvailabilityCheckError::KzgCommitmentMismatch { .. } => None, // should never happen after checking inclusion proof AvailabilityCheckError::Unexpected(_) => None, // internal AvailabilityCheckError::MissingBlobs => { - Some(PeerGroupAction::block_peer(PeerAction::LowToleranceError)) + Some(PeerGroupAction::block_peer(PeerAction::HighToleranceError)) } // TOOD(das): PeerAction::High may be too soft of a penalty. 
Also may be deprecated // with https://github.com/sigp/lighthouse/issues/6258 From 91663f4e96a5c29dd36e3c383a4bedc440e8e5ca Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 15 May 2025 13:37:50 -0500 Subject: [PATCH 51/64] Revert changes in peer selection --- beacon_node/network/src/sync/range_sync/chain.rs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 37963a7a6dc..ba809a14ba1 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -917,13 +917,25 @@ impl SyncingChain { if let Some(batch) = self.batches.get_mut(&batch_id) { let request = batch.to_blocks_by_range_request(); let failed_peers = batch.failed_block_peers(); + + // TODO(das): we should request only from peers that are part of this SyncingChain. + // However, then we hit the NoPeer error frequently which causes the batch to fail and + // the SyncingChain to be dropped. We need to handle this case more gracefully. 
+ let synced_peers = network + .network_globals() + .peers + .read() + .synced_peers() + .cloned() + .collect::>(); + match network.block_components_by_range_request( request, RangeRequestId::RangeSync { chain_id: self.id, batch_id, }, - &self.peers, + &synced_peers, &failed_peers, ) { Ok(request_id) => { From 31f46a12be5b6f3eaf20d7eb3d90d15d977d98c1 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 15 May 2025 14:29:36 -0500 Subject: [PATCH 52/64] Lint tests --- beacon_node/beacon_chain/tests/block_verification.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beacon_node/beacon_chain/tests/block_verification.rs b/beacon_node/beacon_chain/tests/block_verification.rs index 3350aa83363..4f3556263fc 100644 --- a/beacon_node/beacon_chain/tests/block_verification.rs +++ b/beacon_node/beacon_chain/tests/block_verification.rs @@ -147,7 +147,7 @@ fn build_rpc_block( None, block, columns.clone(), - columns.iter().map(|d| d.index).collect(), + columns.iter().map(|d| d.index()).collect(), spec, ) .unwrap(), From b11f8a380b3223c44f9296cad990930cda00c5e4 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Fri, 16 May 2025 13:54:22 -0500 Subject: [PATCH 53/64] Rename block matching functions --- .../beacon_chain/src/block_verification.rs | 12 ++--- .../src/block_verification_types.rs | 52 ++++++++++++------- .../src/data_availability_checker.rs | 30 ++++++----- .../beacon_chain/src/historical_blocks.rs | 12 ++--- 4 files changed, 57 insertions(+), 49 deletions(-) diff --git a/beacon_node/beacon_chain/src/block_verification.rs b/beacon_node/beacon_chain/src/block_verification.rs index de249206992..31a9d061d0e 100644 --- a/beacon_node/beacon_chain/src/block_verification.rs +++ b/beacon_node/beacon_chain/src/block_verification.rs @@ -657,15 +657,11 @@ pub fn signature_verify_chain_segment( // don't match. 
This code attributes fault to the blobs / data columns if they don't match the // block for (_, block) in &chain_segment { - if let Some(indices) = block.non_matching_blobs_signed_headers() { - if !indices.is_empty() { - return Err(BlockError::InvalidBlobsSignature(indices)); - } + if let Err(indices) = block.match_block_and_blobs() { + return Err(BlockError::InvalidBlobsSignature(indices)); } - if let Some(indices) = block.non_matching_custody_columns_signed_headers() { - if !indices.is_empty() { - return Err(BlockError::InvalidDataColumnsSignature(indices)); - } + if let Err(indices) = block.match_block_and_data_columns() { + return Err(BlockError::InvalidDataColumnsSignature(indices)); } } diff --git a/beacon_node/beacon_chain/src/block_verification_types.rs b/beacon_node/beacon_chain/src/block_verification_types.rs index af17e2f5181..7abaf09e5e0 100644 --- a/beacon_node/beacon_chain/src/block_verification_types.rs +++ b/beacon_node/beacon_chain/src/block_verification_types.rs @@ -82,30 +82,32 @@ impl RpcBlock { } } - pub fn non_matching_blobs_signed_headers(&self) -> Option> { + /// Returns Err if any of its inner BlobSidecar's signed_block_header does not match the inner + /// block + pub fn match_block_and_blobs(&self) -> Result<(), Vec> { match &self.block { - RpcBlockInner::Block(_) => None, - RpcBlockInner::BlockAndBlobs(block, blobs) => { - Some(non_matching_blobs_block_signature(block, blobs)) - } - RpcBlockInner::BlockAndCustodyColumns { .. } => None, + RpcBlockInner::Block(_) => Ok(()), + RpcBlockInner::BlockAndBlobs(block, blobs) => match_block_and_blobs(block, blobs), + RpcBlockInner::BlockAndCustodyColumns { .. 
} => Ok(()), } } - pub fn non_matching_custody_columns_signed_headers(&self) -> Option> { + /// Returns Err if any of its inner DataColumnSidecar's signed_block_header does not match the + /// inner block + pub fn match_block_and_data_columns(&self) -> Result<(), Vec> { match &self.block { - RpcBlockInner::Block(_) => None, - RpcBlockInner::BlockAndBlobs(..) => None, + RpcBlockInner::Block(_) => Ok(()), + RpcBlockInner::BlockAndBlobs(..) => Ok(()), RpcBlockInner::BlockAndCustodyColumns { block, data_columns, .. - } => Some(non_matching_custody_columns_block_signature( + } => match_block_and_data_columns( block, data_columns .iter() .map(|data_column| data_column.as_data_column()), - )), + ), } } } @@ -585,23 +587,35 @@ impl AsBlock for RpcBlock { } } -pub fn non_matching_blobs_block_signature( +/// Returns Err if any of `blobs` BlobSidecar's signed_block_header does not match +/// block +pub fn match_block_and_blobs( block: &SignedBeaconBlock, blobs: &BlobSidecarList, -) -> Vec { - blobs +) -> Result<(), Vec> { + let indices = blobs .iter() .filter(|blob| &blob.signed_block_header.signature != block.signature()) .map(|blob| blob.index) - .collect() + .collect::>(); + if indices.is_empty() { + Ok(()) + } else { + Err(indices) + } } -pub fn non_matching_custody_columns_block_signature<'a, E: EthSpec>( +pub fn match_block_and_data_columns<'a, E: EthSpec>( block: &SignedBeaconBlock, data_columns: impl Iterator>>, -) -> Vec { - data_columns +) -> Result<(), Vec> { + let indices = data_columns .filter(|column| &column.signed_block_header.signature != block.signature()) .map(|column| column.index) - .collect() + .collect::>(); + if indices.is_empty() { + Ok(()) + } else { + Err(indices) + } } diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs index 2a8f6f5e3d4..e55f9b50f05 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker.rs +++ 
b/beacon_node/beacon_chain/src/data_availability_checker.rs @@ -1,7 +1,7 @@ use crate::blob_verification::{verify_kzg_for_blob_list, GossipVerifiedBlob, KzgVerifiedBlobList}; use crate::block_verification_types::{ - non_matching_blobs_block_signature, non_matching_custody_columns_block_signature, - AvailabilityPendingExecutedBlock, AvailableExecutedBlock, RpcBlock, + match_block_and_blobs, match_block_and_data_columns, AvailabilityPendingExecutedBlock, + AvailableExecutedBlock, RpcBlock, }; use crate::data_availability_checker::overflow_lru_cache::{ DataAvailabilityCheckerInner, ReconstructColumnsDecision, @@ -802,23 +802,25 @@ impl AvailableBlock { (block_root, block, blob_data) } - pub fn non_matching_blobs_signed_headers(&self) -> Option> { + /// Returns Err if any of its inner BlobSidecar's signed_block_header does not match the inner + /// block + pub fn match_block_and_blobs(&self) -> Result<(), Vec> { match &self.blob_data { - AvailableBlockData::NoData => None, - AvailableBlockData::Blobs(blobs) => { - Some(non_matching_blobs_block_signature(&self.block, blobs)) - } - AvailableBlockData::DataColumns(_) => None, + AvailableBlockData::NoData => Ok(()), + AvailableBlockData::Blobs(blobs) => match_block_and_blobs(&self.block, blobs), + AvailableBlockData::DataColumns(_) => Ok(()), } } - pub fn non_matching_custody_columns_signed_headers(&self) -> Option> { + /// Returns Err if any of its inner DataColumnSidecar's signed_block_header does not match the + /// inner block + pub fn match_block_and_data_columns(&self) -> Result<(), Vec> { match &self.blob_data { - AvailableBlockData::NoData => None, - AvailableBlockData::Blobs(_) => None, - AvailableBlockData::DataColumns(data_columns) => Some( - non_matching_custody_columns_block_signature(&self.block, data_columns.iter()), - ), + AvailableBlockData::NoData => Ok(()), + AvailableBlockData::Blobs(_) => Ok(()), + AvailableBlockData::DataColumns(data_columns) => { + match_block_and_data_columns(&self.block, 
data_columns.iter()) + } } } diff --git a/beacon_node/beacon_chain/src/historical_blocks.rs b/beacon_node/beacon_chain/src/historical_blocks.rs index 1be5c8eb6af..50ace054e3c 100644 --- a/beacon_node/beacon_chain/src/historical_blocks.rs +++ b/beacon_node/beacon_chain/src/historical_blocks.rs @@ -163,15 +163,11 @@ impl BeaconChain { let matching_sidecar_signatures_error = blocks .iter() .map(|block| { - if let Some(indices) = block.non_matching_blobs_signed_headers() { - if !indices.is_empty() { - return Err(HistoricalBlockError::InvalidBlobsSignature(indices)); - } + if let Err(indices) = block.match_block_and_blobs() { + return Err(HistoricalBlockError::InvalidBlobsSignature(indices)); } - if let Some(indices) = block.non_matching_custody_columns_signed_headers() { - if !indices.is_empty() { - return Err(HistoricalBlockError::InvalidDataColumnsSignature(indices)); - } + if let Err(indices) = block.match_block_and_data_columns() { + return Err(HistoricalBlockError::InvalidDataColumnsSignature(indices)); } Ok(()) }) From d82bf260d437cc7b5d9d06a6e2ddbf4635318ba4 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Fri, 16 May 2025 13:58:51 -0500 Subject: [PATCH 54/64] Reorder block matching in historical blocks --- .../src/data_availability_checker.rs | 21 ++---------- .../beacon_chain/src/historical_blocks.rs | 33 ++++++++----------- beacon_node/beacon_chain/tests/store_tests.rs | 2 +- 3 files changed, 16 insertions(+), 40 deletions(-) diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs index e55f9b50f05..bdd25b98b13 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker.rs @@ -731,7 +731,7 @@ async fn availability_cache_maintenance_service( } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub enum AvailableBlockData { /// Block is pre-Deneb or has zero blobs NoData, @@ -742,7 
+742,7 @@ pub enum AvailableBlockData { } /// A fully available block that is ready to be imported into fork choice. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct AvailableBlock { block_root: Hash256, block: Arc>, @@ -823,23 +823,6 @@ impl AvailableBlock { } } } - - /// Only used for testing - pub fn __clone_without_recv(&self) -> Result { - Ok(Self { - block_root: self.block_root, - block: self.block.clone(), - blob_data: match &self.blob_data { - AvailableBlockData::NoData => AvailableBlockData::NoData, - AvailableBlockData::Blobs(blobs) => AvailableBlockData::Blobs(blobs.clone()), - AvailableBlockData::DataColumns(data_columns) => { - AvailableBlockData::DataColumns(data_columns.clone()) - } - }, - blobs_available_timestamp: self.blobs_available_timestamp, - spec: self.spec.clone(), - }) - } } #[derive(Debug)] diff --git a/beacon_node/beacon_chain/src/historical_blocks.rs b/beacon_node/beacon_chain/src/historical_blocks.rs index 50ace054e3c..5a933994aa1 100644 --- a/beacon_node/beacon_chain/src/historical_blocks.rs +++ b/beacon_node/beacon_chain/src/historical_blocks.rs @@ -155,24 +155,6 @@ impl BeaconChain { &self, mut blocks: Vec>, ) -> Result { - // Verify that blobs or data columns signatures match - // - // TODO(das): We don't raise the `matching_sidecar_signatures_error` yet. We have to wait to - // return an invalid block signature error first. We may want to refactor this order in a - // later code change. 
- let matching_sidecar_signatures_error = blocks - .iter() - .map(|block| { - if let Err(indices) = block.match_block_and_blobs() { - return Err(HistoricalBlockError::InvalidBlobsSignature(indices)); - } - if let Err(indices) = block.match_block_and_data_columns() { - return Err(HistoricalBlockError::InvalidDataColumnsSignature(indices)); - } - Ok(()) - }) - .collect::, _>>(); - let anchor_info = self.store.get_anchor_info(); let blob_info = self.store.get_blob_info(); let data_column_info = self.store.get_data_column_info(); @@ -209,7 +191,7 @@ impl BeaconChain { let mut hot_batch = Vec::with_capacity(blocks_to_import.len()); let mut signed_blocks = Vec::with_capacity(blocks_to_import.len()); - for available_block in blocks_to_import.into_iter().rev() { + for available_block in blocks_to_import.iter().cloned().rev() { let (block_root, block, block_data) = available_block.deconstruct(); if !self.store.get_config().prune_payloads { @@ -318,7 +300,18 @@ impl BeaconChain { // Check that the proposer signature in the blobs and data columns is the same as the // correct signature in the block. 
- matching_sidecar_signatures_error?; + blocks_to_import + .iter() + .map(|block| { + if let Err(indices) = block.match_block_and_blobs() { + return Err(HistoricalBlockError::InvalidBlobsSignature(indices)); + } + if let Err(indices) = block.match_block_and_data_columns() { + return Err(HistoricalBlockError::InvalidDataColumnsSignature(indices)); + } + Ok(()) + }) + .collect::, _>>()?; let verify_timer = metrics::start_timer(&metrics::BACKFILL_SIGNATURE_VERIFY_TIMES); if !signature_set.verify() { diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index 559d613c92c..ac6009cd10c 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -3678,5 +3678,5 @@ fn get_blocks( } fn clone_block(block: &AvailableBlock) -> AvailableBlock { - block.__clone_without_recv().unwrap() + block.clone() } From 9434046868877c3d7d0e44715e35c337b0690287 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Fri, 16 May 2025 18:24:12 -0500 Subject: [PATCH 55/64] Fix order of block matching --- beacon_node/beacon_chain/src/historical_blocks.rs | 14 +++++++------- beacon_node/beacon_chain/tests/store_tests.rs | 13 +++++++------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/beacon_node/beacon_chain/src/historical_blocks.rs b/beacon_node/beacon_chain/src/historical_blocks.rs index 5a933994aa1..180c5dcab1d 100644 --- a/beacon_node/beacon_chain/src/historical_blocks.rs +++ b/beacon_node/beacon_chain/src/historical_blocks.rs @@ -298,6 +298,13 @@ impl BeaconChain { drop(pubkey_cache); drop(setup_timer); + let verify_timer = metrics::start_timer(&metrics::BACKFILL_SIGNATURE_VERIFY_TIMES); + if !signature_set.verify() { + return Err(HistoricalBlockError::InvalidSignature("invalid".to_owned())); + } + drop(verify_timer); + drop(sig_timer); + // Check that the proposer signature in the blobs and data columns is the same as the // correct 
signature in the block. blocks_to_import @@ -313,13 +320,6 @@ impl BeaconChain { }) .collect::, _>>()?; - let verify_timer = metrics::start_timer(&metrics::BACKFILL_SIGNATURE_VERIFY_TIMES); - if !signature_set.verify() { - return Err(HistoricalBlockError::InvalidSignature("invalid".to_owned())); - } - drop(verify_timer); - drop(sig_timer); - // Write the I/O batches to disk, writing the blocks themselves first, as it's better // for the hot DB to contain extra blocks than for the cold DB to point to blocks that // do not exist. diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index ac6009cd10c..4f64a828486 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -2499,12 +2499,13 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { }; // Importing the invalid batch should error. - assert!(matches!( - beacon_chain - .import_historical_block_batch(batch_with_invalid_first_block) - .unwrap_err(), - HistoricalBlockError::InvalidSignature(_) - )); + let err = beacon_chain + .import_historical_block_batch(batch_with_invalid_first_block) + .unwrap_err(); + match err { + HistoricalBlockError::InvalidSignature(_) => {} // ok + e => panic!("Unexpected error {e:?}"), + } // Importing the batch with valid signatures should succeed. 
let available_blocks_dup = available_blocks.iter().map(clone_block).collect::>(); From 40772ca01f4e807b0ca49509dc0de88e168e0c10 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 19 May 2025 14:44:34 -0500 Subject: [PATCH 56/64] Add store tests --- beacon_node/beacon_chain/tests/store_tests.rs | 170 +++++++++++++++++- 1 file changed, 166 insertions(+), 4 deletions(-) diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index 4f64a828486..fe0d6153163 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -3,7 +3,8 @@ use beacon_chain::attestation_verification::Error as AttnError; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::builder::BeaconChainBuilder; -use beacon_chain::data_availability_checker::AvailableBlock; +use beacon_chain::data_availability_checker::{AvailableBlock, AvailableBlockData}; +use beacon_chain::data_column_verification::CustodyDataColumn; use beacon_chain::schema_change::migrate_schema; use beacon_chain::test_utils::SyncCommitteeStrategy; use beacon_chain::test_utils::{ @@ -11,9 +12,11 @@ use beacon_chain::test_utils::{ BlockStrategy, DiskHarnessType, }; use beacon_chain::{ - data_availability_checker::MaybeAvailableBlock, historical_blocks::HistoricalBlockError, - migrate::MigratorConfig, BeaconChain, BeaconChainError, BeaconChainTypes, BeaconSnapshot, - BlockError, ChainConfig, NotifyExecutionLayer, ServerSentEventHandler, WhenSlotSkipped, + data_availability_checker::{AvailabilityCheckError, MaybeAvailableBlock}, + historical_blocks::HistoricalBlockError, + migrate::MigratorConfig, + BeaconChain, BeaconChainError, BeaconChainTypes, BeaconSnapshot, BlockError, ChainConfig, + NotifyExecutionLayer, ServerSentEventHandler, WhenSlotSkipped, }; use logging::create_test_tracing_subscriber; use maplit::hashset; @@ -2339,6 +2342,7 @@ async fn 
weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { let store = get_store(&temp2); let spec = test_spec::(); let seconds_per_slot = spec.seconds_per_slot; + let wss_fork = harness.spec.fork_name_at_slot::(checkpoint_slot); let kzg = get_kzg(&spec); @@ -2507,6 +2511,142 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { e => panic!("Unexpected error {e:?}"), } + if wss_fork.deneb_enabled() { + // Currently ExecutionBlockGenerator::build_new_execution_payload doesn't accept a parameter + // to generate a fixed number of blob TXs, so it's random. Given the large number of blocks + // in this batch it's very unlikely that no block has data, but it's probable that's it's + // not index 0, so we need to find the first block with data. + let first_block_with_data = available_blocks + .iter() + .position(|block| block.block().num_expected_blobs() > 0) + .expect("No blocks have data, try different RNG"); + + // Test 1: Invalidate sidecar header signature + + let mut batch_with_invalid_header = available_blocks + .iter() + .map(|block| block.clone()) + .collect::>(); + batch_with_invalid_header[first_block_with_data] = { + let (block_root, block, block_data) = batch_with_invalid_header[first_block_with_data] + .clone() + .deconstruct(); + if wss_fork.fulu_enabled() { + let AvailableBlockData::DataColumns(mut data_columns) = block_data else { + panic!("no columns") + }; + assert!( + !data_columns.is_empty(), + "data column sidecars shouldn't be empty" + ); + let mut data_column = (*data_columns[0]).clone(); + data_column.signed_block_header.signature = Signature::empty(); + data_columns[0] = data_column.into(); + AvailableBlock::__new_for_testing( + block_root, + block, + AvailableBlockData::DataColumns(data_columns), + beacon_chain.spec.clone(), + ) + } else { + let AvailableBlockData::Blobs(mut blobs) = block_data else { + let blocks_have_blobs = available_blocks + .into_iter() + .map(|block| (block.block().slot(), block.has_blobs())) 
+ .collect::>(); + panic!( + "no blobs at block {:?} {}. blocks_have_blobs {:?}", + block_root, + block.slot(), + blocks_have_blobs + ); + }; + assert!(!blobs.is_empty(), "blob sidecars shouldn't be empty"); + let mut blob = (*blobs[0]).clone(); + blob.signed_block_header.signature = Signature::empty(); + blobs[0] = blob.into(); + AvailableBlock::__new_for_testing( + block_root, + block, + AvailableBlockData::Blobs(blobs), + beacon_chain.spec.clone(), + ) + } + }; + + // Importing the invalid batch should error. + let err = beacon_chain + .import_historical_block_batch(batch_with_invalid_header) + .unwrap_err(); + if wss_fork.fulu_enabled() { + match err { + HistoricalBlockError::InvalidSignature(_) => {} // ok + e => panic!("Unexpected error {e:?}"), + } + } else { + match err { + HistoricalBlockError::InvalidBlobsSignature(_) => {} // ok + e => panic!("Unexpected error {e:?}"), + } + } + + // Test 2: invalidate KZG proof + + let mut batch_with_invalid_kzg = available_blocks + .iter() + .map(|block| available_to_rpc_block(block.clone(), &harness.spec)) + .collect::>(); + + batch_with_invalid_kzg[first_block_with_data] = { + let (block_root, block, blobs, cols) = batch_with_invalid_kzg[first_block_with_data] + .clone() + .deconstruct(); + if wss_fork.fulu_enabled() { + let (data_columns, expected_column_indices) = cols.unwrap(); + assert!( + !data_columns.is_empty(), + "data column sidecars shouldn't be empty" + ); + let mut sidecar = data_columns[0].clone_arc(); + let mut_sidecar = Arc::make_mut(&mut sidecar); + mut_sidecar.kzg_proofs[0] = KzgProof::empty(); + RpcBlock::new_with_custody_columns( + Some(block_root), + block, + data_columns.to_vec(), + expected_column_indices, + &harness.spec, + ) + .unwrap() + } else { + let mut blobs = blobs.unwrap(); + assert!(!blobs.is_empty(), "blob sidecars shouldn't be empty"); + let mut_sidecar = Arc::make_mut(&mut blobs[0]); + mut_sidecar.kzg_proof = KzgProof::empty(); + RpcBlock::new(Some(block_root), block, 
Some(blobs)).unwrap() + } + }; + + let err = beacon_chain + .verify_and_import_historical_block_batch(batch_with_invalid_kzg) + .unwrap_err(); + if wss_fork.fulu_enabled() { + match err { + HistoricalBlockError::AvailabilityCheckError( + AvailabilityCheckError::InvalidColumn(_), + ) => {} // ok + e => panic!("Unexpected error {e:?}"), + } + } else { + match err { + HistoricalBlockError::AvailabilityCheckError( + AvailabilityCheckError::InvalidBlobs(_), + ) => {} // ok + e => panic!("Unexpected error {e:?}"), + } + } + } + // Importing the batch with valid signatures should succeed. let available_blocks_dup = available_blocks.iter().map(clone_block).collect::>(); beacon_chain @@ -3681,3 +3821,25 @@ fn get_blocks( fn clone_block(block: &AvailableBlock) -> AvailableBlock { block.clone() } + +fn available_to_rpc_block(block: AvailableBlock, spec: &ChainSpec) -> RpcBlock { + let (block_root, block, block_data) = block.deconstruct(); + + match block_data { + AvailableBlockData::NoData => RpcBlock::new(Some(block_root), block, None).unwrap(), + AvailableBlockData::Blobs(blobs) => { + RpcBlock::new(Some(block_root), block, Some(blobs)).unwrap() + } + AvailableBlockData::DataColumns(data_columns) => RpcBlock::new_with_custody_columns( + Some(block_root), + block, + data_columns + .into_iter() + .map(|d| CustodyDataColumn::from_asserted_custody(d)) + .collect(), + vec![], + spec, + ) + .unwrap(), + } +} From 005847136212f1c48c661ca9bf2199941a636bf5 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 20 May 2025 11:47:19 -0500 Subject: [PATCH 57/64] Filter blockchain in assert_correct_historical_block_chain --- beacon_node/beacon_chain/src/historical_blocks.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/beacon_node/beacon_chain/src/historical_blocks.rs b/beacon_node/beacon_chain/src/historical_blocks.rs index 180c5dcab1d..8d8c0569ab5 100644 --- a/beacon_node/beacon_chain/src/historical_blocks.rs +++ 
b/beacon_node/beacon_chain/src/historical_blocks.rs @@ -26,8 +26,10 @@ const PUBKEY_CACHE_LOCK_TIMEOUT: Duration = Duration::from_secs(30); pub enum HistoricalBlockError { /// Block root mismatch, caller should retry with different blocks. MismatchedBlockRoot { + block_slot: Slot, block_root: Hash256, expected_block_root: Hash256, + oldest_block_parent: Hash256, }, /// Bad signature, caller should retry with different blocks. InvalidSignature(String), @@ -89,10 +91,16 @@ impl BeaconChain { let mut expected_block_root = anchor_info.oldest_block_parent; for block in blocks.iter().rev() { + if block.as_block().slot() >= anchor_info.oldest_block_slot { + continue; + } + if block.block_root() != expected_block_root { return Err(HistoricalBlockError::MismatchedBlockRoot { + block_slot: block.as_block().slot(), block_root: block.block_root(), expected_block_root, + oldest_block_parent: anchor_info.oldest_block_parent, }); } From d4046aca25b7580b19fead6b0b161ef05da794df Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 20 May 2025 11:54:42 -0500 Subject: [PATCH 58/64] Also filter before KZG checks --- .../beacon_chain/src/historical_blocks.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/beacon_node/beacon_chain/src/historical_blocks.rs b/beacon_node/beacon_chain/src/historical_blocks.rs index 8d8c0569ab5..d4e015706bb 100644 --- a/beacon_node/beacon_chain/src/historical_blocks.rs +++ b/beacon_node/beacon_chain/src/historical_blocks.rs @@ -114,14 +114,22 @@ impl BeaconChain { &self, blocks: Vec>, ) -> Result { + let anchor_info = self.store.get_anchor_info(); + + // Take all blocks with slots less than the oldest block slot. 
+ let blocks_to_import = blocks + .into_iter() + .filter(|block| block.as_block().slot() < anchor_info.oldest_block_slot) + .collect::>(); + // First check that chain of blocks is correct - self.assert_correct_historical_block_chain(&blocks)?; + self.assert_correct_historical_block_chain(&blocks_to_import)?; // Check that all data columns are present <- faulty failure if missing because we have // checked the block root is correct first. - let blocks = self + let available_blocks_to_import = self .data_availability_checker - .verify_kzg_for_rpc_blocks(blocks) + .verify_kzg_for_rpc_blocks(blocks_to_import) .and_then(|blocks| { blocks .into_iter() @@ -138,7 +146,7 @@ impl BeaconChain { .collect::, _>>() })?; - self.import_historical_block_batch(blocks) + self.import_historical_block_batch(available_blocks_to_import) } /// Store a batch of historical blocks in the database. From fe0a15d099f0180bd4eb9050a8118c8a0ad73dfe Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 20 May 2025 12:03:36 -0500 Subject: [PATCH 59/64] Lint tests --- beacon_node/beacon_chain/tests/store_tests.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index fe0d6153163..0c02091626b 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -2523,10 +2523,7 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { // Test 1: Invalidate sidecar header signature - let mut batch_with_invalid_header = available_blocks - .iter() - .map(|block| block.clone()) - .collect::>(); + let mut batch_with_invalid_header = available_blocks.iter().cloned().collect::>(); batch_with_invalid_header[first_block_with_data] = { let (block_root, block, block_data) = batch_with_invalid_header[first_block_with_data] .clone() From 0659420f7615a56ec5c2a61141af37c5664710b5 Mon Sep 17 00:00:00 
2001 From: Pawan Dhananjay Date: Tue, 20 May 2025 10:20:11 -0700 Subject: [PATCH 60/64] Fix lint --- beacon_node/beacon_chain/tests/store_tests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index 0c02091626b..f9d6a797bdd 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -2523,7 +2523,7 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { // Test 1: Invalidate sidecar header signature - let mut batch_with_invalid_header = available_blocks.iter().cloned().collect::>(); + let mut batch_with_invalid_header = available_blocks.to_vec(); batch_with_invalid_header[first_block_with_data] = { let (block_root, block, block_data) = batch_with_invalid_header[first_block_with_data] .clone() From fb8e6fe58c82bb13f1153c2f487bda2dd28bd4e6 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 20 May 2025 14:39:28 -0500 Subject: [PATCH 61/64] Fix fulu err assertion --- beacon_node/beacon_chain/tests/store_tests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index f9d6a797bdd..5c18f5a6dd1 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -2577,7 +2577,7 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { .unwrap_err(); if wss_fork.fulu_enabled() { match err { - HistoricalBlockError::InvalidSignature(_) => {} // ok + HistoricalBlockError::InvalidDataColumnsSignature(_) => {} // ok e => panic!("Unexpected error {e:?}"), } } else { From eb400f0e5d984c33ad375a00b33c3b0799eeb73c Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 20 May 2025 16:22:25 -0500 Subject: [PATCH 62/64] Check point is not at 
infinity --- beacon_node/beacon_chain/tests/store_tests.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index 5c18f5a6dd1..36b52080e1e 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -2606,6 +2606,9 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { ); let mut sidecar = data_columns[0].clone_arc(); let mut_sidecar = Arc::make_mut(&mut sidecar); + if mut_sidecar.kzg_proofs[0] == KzgProof::empty() { + panic!("kzg_proof is already G1_POINT_AT_INFINITY") + } mut_sidecar.kzg_proofs[0] = KzgProof::empty(); RpcBlock::new_with_custody_columns( Some(block_root), From aabadb549a622d2ddac16131902e260adf749b32 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 20 May 2025 17:26:40 -0500 Subject: [PATCH 63/64] Fix ws sync test --- .../src/data_availability_checker.rs | 8 ++++---- beacon_node/beacon_chain/tests/store_tests.rs | 20 ++++++++++++------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs index bdd25b98b13..947af3aa750 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker.rs @@ -20,7 +20,7 @@ use tracing::{debug, error, info_span, Instrument}; use types::blob_sidecar::{BlobIdentifier, BlobSidecar, FixedBlobSidecarList}; use types::{ BlobSidecarList, ChainSpec, ColumnIndex, DataColumnSidecarList, Epoch, EthSpec, Hash256, - RuntimeVariableList, SignedBeaconBlock, + SignedBeaconBlock, }; mod error; @@ -398,6 +398,9 @@ impl DataAvailabilityChecker { let mut results = Vec::with_capacity(blocks.len()); let all_blobs = blocks .iter() + // TODO(das): we may want to remove this line. If blobs are present they should be + // verified. 
It's the role of another function to ignore blobs. And this blobs may not + // be checked and imported later. .filter(|block| self.blobs_required_for_block(block.as_block())) // this clone is cheap as it's cloning an Arc .filter_map(|block| block.blobs().cloned()) @@ -412,14 +415,11 @@ impl DataAvailabilityChecker { let all_data_columns = blocks .iter() - .filter(|block| self.data_columns_required_for_block(block.as_block())) // this clone is cheap as it's cloning an Arc .filter_map(|block| block.custody_columns().cloned()) .flatten() .map(CustodyDataColumn::into_inner) .collect::>(); - let all_data_columns = - RuntimeVariableList::from_vec(all_data_columns, self.spec.number_of_columns as usize); // verify kzg for all data columns at once if !all_data_columns.is_empty() { diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index 36b52080e1e..98d46482bca 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -36,6 +36,7 @@ use store::{ BlobInfo, DBColumn, HotColdDB, StoreConfig, }; use tempfile::{tempdir, TempDir}; +use tracing::info; use types::test_utils::{SeedableRng, XorShiftRng}; use types::*; @@ -2529,6 +2530,7 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { .clone() .deconstruct(); if wss_fork.fulu_enabled() { + info!(block_slot = %block.slot(), ?block_root, "Corrupting data column header signature"); let AvailableBlockData::DataColumns(mut data_columns) = block_data else { panic!("no columns") }; @@ -2546,6 +2548,7 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { beacon_chain.spec.clone(), ) } else { + info!(block_slot = %block.slot(), ?block_root, "Corrupting blob header signature"); let AvailableBlockData::Blobs(mut blobs) = block_data else { let blocks_have_blobs = available_blocks .into_iter() @@ -2599,17 +2602,18 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: 
Slot) { .clone() .deconstruct(); if wss_fork.fulu_enabled() { - let (data_columns, expected_column_indices) = cols.unwrap(); + info!(block_slot = %block.slot(), ?block_root, "Corrupting data column KZG proof"); + let (mut data_columns, expected_column_indices) = cols.unwrap(); assert!( !data_columns.is_empty(), "data column sidecars shouldn't be empty" ); - let mut sidecar = data_columns[0].clone_arc(); - let mut_sidecar = Arc::make_mut(&mut sidecar); - if mut_sidecar.kzg_proofs[0] == KzgProof::empty() { + let mut data_column = (*(data_columns[0]).clone_arc()).clone(); + if data_column.kzg_proofs[0] == KzgProof::empty() { panic!("kzg_proof is already G1_POINT_AT_INFINITY") } - mut_sidecar.kzg_proofs[0] = KzgProof::empty(); + data_column.kzg_proofs[0] = KzgProof::empty(); + data_columns[0] = CustodyDataColumn::from_asserted_custody(data_column.into()); RpcBlock::new_with_custody_columns( Some(block_root), block, @@ -2619,10 +2623,12 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { ) .unwrap() } else { + info!(block_slot = %block.slot(), ?block_root, "Corrupting blob KZG proof"); let mut blobs = blobs.unwrap(); assert!(!blobs.is_empty(), "blob sidecars shouldn't be empty"); - let mut_sidecar = Arc::make_mut(&mut blobs[0]); - mut_sidecar.kzg_proof = KzgProof::empty(); + let mut blob = (*blobs[0]).clone(); + blob.kzg_proof = KzgProof::empty(); + blobs[0] = blob.into(); RpcBlock::new(Some(block_root), block, Some(blobs)).unwrap() } }; From 6a663d8d3bf118d393f1bd12eafb4a73fab30b60 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 20 May 2025 18:04:19 -0500 Subject: [PATCH 64/64] Revert dropping filter fn --- beacon_node/beacon_chain/src/data_availability_checker.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs index 947af3aa750..26694faf110 100644 --- 
a/beacon_node/beacon_chain/src/data_availability_checker.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker.rs @@ -398,9 +398,6 @@ impl DataAvailabilityChecker { let mut results = Vec::with_capacity(blocks.len()); let all_blobs = blocks .iter() - // TODO(das): we may want to remove this line. If blobs are present they should be - // verified. It's the role of another function to ignore blobs. And this blobs may not - // be checked and imported later. .filter(|block| self.blobs_required_for_block(block.as_block())) // this clone is cheap as it's cloning an Arc .filter_map(|block| block.blobs().cloned()) @@ -415,6 +412,10 @@ impl DataAvailabilityChecker { let all_data_columns = blocks .iter() + // TODO(das): we may want to remove this line. If columns are present they should be + // verified. The outcome of `data_columns_required_for_block` is time dependant. So we + // may end up importing data columns that are not verified. + .filter(|block| self.data_columns_required_for_block(block.as_block())) // this clone is cheap as it's cloning an Arc .filter_map(|block| block.custody_columns().cloned()) .flatten()