From 83e2c6db582fd6c27dffc652dce9a10a98f62849 Mon Sep 17 00:00:00 2001 From: Garand Tyson Date: Fri, 1 May 2026 01:16:56 -0700 Subject: [PATCH 1/2] Add experimental trigger timer --- docs/metrics.md | 1 + src/herder/HerderImpl.cpp | 162 +++++++++++++++++++++++++++++++-- src/herder/HerderImpl.h | 20 ++++ src/herder/HerderSCPDriver.cpp | 2 - src/herder/HerderSCPDriver.h | 7 +- src/main/Config.cpp | 3 + src/main/Config.h | 4 + 7 files changed, 185 insertions(+), 14 deletions(-) diff --git a/docs/metrics.md b/docs/metrics.md index 37c0dadd1d..3ea8afbe88 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -195,6 +195,7 @@ scp.timing.nominated | timer | time spent in nomination scp.timing.externalized | timer | time spent in ballot protocol scp.timing.first-to-self-externalize-lag | timer | delay between first externalize message and local node externalizing scp.timing.self-to-others-externalize-lag | timer | delay between local node externalizing and later externalize messages from other nodes +scp.trigger.prepare-start-fallback | meter | experimental trigger timer fell back from the network-close-time anchor to the local prepare-start anchor scp.value.invalid | meter | SCP value is invalid scp.value.valid | meter | SCP value is valid scp.slot.values-referenced | histogram | number of values referenced per consensus round diff --git a/src/herder/HerderImpl.cpp b/src/herder/HerderImpl.cpp index 98d29f62eb..f105726bb5 100644 --- a/src/herder/HerderImpl.cpp +++ b/src/herder/HerderImpl.cpp @@ -80,6 +80,8 @@ HerderImpl::SCPMetrics::SCPMetrics(Application& app) {"scp", "envelope", "validsig"}, "envelope")) , mEnvelopeInvalidSig(app.getMetrics().NewMeter( {"scp", "envelope", "invalidsig"}, "envelope")) + , mTriggerPrepareStartFallback(app.getMetrics().NewMeter( + {"scp", "trigger", "prepare-start-fallback"}, "trigger")) { } @@ -1253,6 +1255,151 @@ HerderImpl::lastClosedLedgerIncreased(bool latest, TxSetXDRFrameConstPtr txSet, } } +VirtualClock::time_point +HerderImpl::triggerAnchorFromPrepareStart( + uint64_t lastIndex, VirtualClock::time_point now, + std::chrono::milliseconds expectedClose) +{ + auto lastStart = mHerderSCPDriver.getPrepareStart(lastIndex); + if (lastStart) + { + return *lastStart; + } + // Pessimistic estimate: assume the previous ballot protocol started one + // full interval ago, so the next trigger should fire immediately. + return now - expectedClose; +} + +// Compute the anchor for the next trigger. Caller fires at +// `anchor + expectedClose`. Unlike the local ballot cadence trigger, this uses +// the externalized close time from the previous ledger, which is a timestamp +// coming from a different, potentially unsynced node. This is more accurate +// than just using local timers, as those skew late based on latency from the +// leader, but we need to be careful to account for drift. +// +// Constraints: +// 1. Track `expectedClose` on the network timeline as closely as possible. +// 2. Under drift, prefer a longer ledger to a shorter one to prevent fast +// ledgers causing too much strain on the network. +// 3. Never let a badly lagging local clock wedge the node (i.e. never schedule +// a trigger in the far future). +// +// Scenarios (assume last `closeTime` = N, target = 5s): +// 1. Our clock ahead of network: Check how much time our system clock says has +// elapsed since the externalized network ledger closed +// (timeSinceNetworkLedgerStart). If timeSinceNetworkLedgerStart > target, +// either the network itself is slow, or our clock is ahead. +// +// To get a better sense of which, we also take into account +// nomination time, so the check becomes timeSinceNetworkLedgerStart > target +// + nominationBudget, where nomination budget scales with timeouts. +// +// If we think we're drifting ahead after taking nomination into account, we +// fall back to prepare-start anchor, which is based on our local clock and +// can't drift, but is slower. +// +// Note that if nomination is quick, but apply takes a long time, it still +// appears like we're ahead and we fall back to prepare-start. This isn't a +// problem. The prepare-start timer starts before the apply stage, so a long +// apply is "baked in" to the time. If apply was the bottleneck, we'll +// probably trigger immediately even if we use the prepare-start timer because +// we're behind, so using prepare-start is identical to the network-based +// anchor from the perf standpoint if we really are in sync anyway. +// +// This is not true for nomination. The prepare-start timer starts after +// nomination, so it does not reflect a long nomination. This is why we need +// the nomination budget in our check. If nomination took a long time and we +// fall back to the conservative timer, this is much worse from a perf +// perspective, as we are waiting nomination time + target time before +// starting the next ledger. +// +// 2. Our clock behind network: Check whether our local timer says +// more time has elapsed since prepare-start vs how much time has +// elapsed since the externalized network ledger closed. +// +// If true, our system clock is lagging far enough that the network-based +// anchor would schedule the next trigger later than the conservative timer +// based on local ballot cadence. That delay can grow with drift and wedge the +// node if arbitrarily in the future, so we fall back to the prepare-start +// anchor. This anchor is conservative and local, but it gives us a bounded +// trigger time even when system time is badly behind. +// +// 3. Clocks synced, but nomination/apply was slow: With synced clocks, +// time since network close time is large because real time really passed, +// not because our system clock drifted. The goal is to avoid falling back to +// a conservative timer and snowballing the real delay. +// +// See scenario 1, but TL;DR we can look at the nomination timeouts and see +// if the network is slow vs. the node drifting. If nomination is slow, we +// can't fall back to the prepare-apply timer because it would compound the +// delay. If apply is slow, it doesn't matter which timer we use, they both +// will result in triggering immediately. +VirtualClock::time_point +HerderImpl::triggerAnchorFromConsensusCloseTime( + uint64_t lastIndex, VirtualClock::time_point now, + std::chrono::milliseconds expectedClose) +{ + auto fallbackToPrepareStart = [&]() { + mSCPMetrics.mTriggerPrepareStartFallback.Mark(); + return triggerAnchorFromPrepareStart(lastIndex, now, expectedClose); + }; + + auto consensusCloseTime = trackingConsensusCloseTime(); + if (consensusCloseTime == 0) + { + CLOG_WARNING(Herder, "Consensus close time is 0, falling back to " + "prepare-start anchor"); + return fallbackToPrepareStart(); + } + + // Compare elapsed time on the externalized closeTime timeline with elapsed + // time on our local prepare-start timeline. + // Relation, with drift > 0 meaning our clock is ahead of network time: + // + // timeSinceNetworkLedgerStart + // = nominationBudget + timeSinceLocalBallotStart + drift + // + // where nominationBudget is the slow-nomination allowance described above. + auto externalizedSystemTime = VirtualClock::from_time_t(consensusCloseTime); + auto currentSystemTime = mApp.getClock().system_now(); + auto timeSinceNetworkLedgerStart = + std::chrono::duration_cast( + currentSystemTime - externalizedSystemTime); + + auto localBallotStart = + triggerAnchorFromPrepareStart(lastIndex, now, expectedClose); + auto timeSinceLocalBallotStart = + std::chrono::duration_cast(now - + localBallotStart); + + // Scenario 2: if system time is behind the local prepare-start timer, the + // network-based anchor can wedge the node, so use the local fallback. + if (timeSinceLocalBallotStart > timeSinceNetworkLedgerStart) + { + return fallbackToPrepareStart(); + } + + // Scenario 1: widen the ahead-drift bound by the slow nomination we can + // explain from the previous slot's timeout count. + auto nominationTimeouts = + mHerderSCPDriver.getNominationTimeouts(lastIndex).value_or(0); + auto nominationBudget = std::chrono::milliseconds::zero(); + for (int64_t round = 1; round <= nominationTimeouts; ++round) + { + nominationBudget += mHerderSCPDriver.computeTimeout( + static_cast(round), /*isNomination=*/true); + } + + // Scenario 1: if elapsed system time exceeds target plus explainable + // nomination delay, treat it as clock-ahead drift and use the fallback. + if (timeSinceNetworkLedgerStart > expectedClose + nominationBudget) + { + return fallbackToPrepareStart(); + } + + return now - timeSinceNetworkLedgerStart; +} + void HerderImpl::setupTriggerNextLedger() { @@ -1279,19 +1426,16 @@ HerderImpl::setupTriggerNextLedger() std::chrono::milliseconds milliseconds = mLedgerManager.getExpectedLedgerCloseTime(); - // bootstrap with a pessimistic estimate of when - // the ballot protocol started last auto now = mApp.getClock().now(); - auto lastBallotStart = now - milliseconds; - auto lastStart = mHerderSCPDriver.getPrepareStart(lastIndex); - if (lastStart) - { - lastBallotStart = *lastStart; - } + + auto lastLedgerStartingPoint = + mApp.getConfig().EXPERIMENTAL_TRIGGER_TIMER + ? triggerAnchorFromConsensusCloseTime(lastIndex, now, milliseconds) + : triggerAnchorFromPrepareStart(lastIndex, now, milliseconds); // Adjust trigger time in case node's clock has drifted. // This ensures that next value to nominate is valid - auto triggerTime = lastBallotStart + milliseconds; + auto triggerTime = lastLedgerStartingPoint + milliseconds; if (triggerTime < now) { diff --git a/src/herder/HerderImpl.h b/src/herder/HerderImpl.h index 72ff138824..065d65d8bb 100644 --- a/src/herder/HerderImpl.h +++ b/src/herder/HerderImpl.h @@ -258,6 +258,22 @@ class HerderImpl : public Herder void setupTriggerNextLedger(); + // Compute the trigger-timer anchor point using the local node's + // prepare-start timestamp for the previous slot. Returns a pessimistic + // estimate (now - expectedClose) if no prepare-start is recorded. + VirtualClock::time_point + triggerAnchorFromPrepareStart(uint64_t lastIndex, + VirtualClock::time_point now, + std::chrono::milliseconds expectedClose); + + // Compute the trigger-timer anchor point using the network-agreed + // consensus close time on the system clock. Falls back to + // triggerAnchorFromPrepareStart if consensus close time is unavailable + // or if the local clock is significantly drifting from the network time. + VirtualClock::time_point triggerAnchorFromConsensusCloseTime( + uint64_t lastIndex, VirtualClock::time_point now, + std::chrono::milliseconds expectedClose); + void startOutOfSyncTimer(); void outOfSyncRecovery(); void broadcast(SCPEnvelope const& e); @@ -342,6 +358,10 @@ class HerderImpl : public Herder medida::Meter& mEnvelopeValidSig; medida::Meter& mEnvelopeInvalidSig; + // Marked when the experimental trigger timer falls back from the + // network-close-time anchor to the local prepare-start anchor. + medida::Meter& mTriggerPrepareStartFallback; + SCPMetrics(Application& app); }; diff --git a/src/herder/HerderSCPDriver.cpp b/src/herder/HerderSCPDriver.cpp index b1b4f54aee..5a30402b02 100644 --- a/src/herder/HerderSCPDriver.cpp +++ b/src/herder/HerderSCPDriver.cpp @@ -1540,7 +1540,6 @@ HerderSCPDriver::getNodeWeight(NodeID const& nodeID, SCPQuorumSet const& qset, return qualityWeightIt->second / homeDomainSizeIt->second; } -#ifdef BUILD_TESTS std::optional HerderSCPDriver::getNominationTimeouts(uint64_t slotIndex) const { @@ -1551,6 +1550,5 @@ HerderSCPDriver::getNominationTimeouts(uint64_t slotIndex) const } return std::nullopt; } -#endif } diff --git a/src/herder/HerderSCPDriver.h b/src/herder/HerderSCPDriver.h index 1d5871cf26..b82f0bad11 100644 --- a/src/herder/HerderSCPDriver.h +++ b/src/herder/HerderSCPDriver.h @@ -167,15 +167,16 @@ class HerderSCPDriver : public SCPDriver void cacheValidTxSet(ApplicableTxSetFrame const& txSet, LedgerHeaderHistoryEntry const& lcl, uint64_t closeTimeOffset) const; + + // Get the number of nomination timeouts that occurred for a given slot + std::optional getNominationTimeouts(uint64_t slotIndex) const; + #ifdef BUILD_TESTS RandomEvictionCache& getTxSetValidityCache() { return mTxSetValidCache; } - - // Get the number of nomination timeouts that occurred for a given slot - std::optional getNominationTimeouts(uint64_t slotIndex) const; #endif private: diff --git a/src/main/Config.cpp b/src/main/Config.cpp index ed80271b39..a9f3f74cf5 100644 --- a/src/main/Config.cpp +++ b/src/main/Config.cpp @@ -174,6 +174,7 @@ Config::Config() : NODE_SEED(SecretKey::random()) PARALLEL_LEDGER_APPLY = true; DISABLE_SOROBAN_METRICS_FOR_TESTING = false; BACKGROUND_TX_SIG_VERIFICATION = true; + EXPERIMENTAL_TRIGGER_TIMER = false; BUCKETLIST_DB_INDEX_PAGE_SIZE_EXPONENT = 14; // 2^14 == 16 kb BUCKETLIST_DB_INDEX_CUTOFF = 20; // 20 mb BUCKETLIST_DB_MEMORY_FOR_CACHING = 0; @@ -1200,6 +1201,8 @@ Config::processConfig(std::shared_ptr t) }}, {"BACKGROUND_TX_SIG_VERIFICATION", [&]() { BACKGROUND_TX_SIG_VERIFICATION = readBool(item); }}, + {"EXPERIMENTAL_TRIGGER_TIMER", + [&]() { EXPERIMENTAL_TRIGGER_TIMER = readBool(item); }}, {"ARTIFICIALLY_DELAY_LEDGER_CLOSE_FOR_TESTING", [&]() { ARTIFICIALLY_DELAY_LEDGER_CLOSE_FOR_TESTING = diff --git a/src/main/Config.h b/src/main/Config.h index a718a8b843..a5c0e9d4d6 100644 --- a/src/main/Config.h +++ b/src/main/Config.h @@ -559,6 +559,10 @@ class Config : public std::enable_shared_from_this // also enabled. bool BACKGROUND_TX_SIG_VERIFICATION; + // Experimental flag to use externalized close time for trigger timer + // calculation instead of prepare start time. + bool EXPERIMENTAL_TRIGGER_TIMER; + // When set to true, BucketListDB indexes are persisted on-disk so that the // BucketList does not need to be reindexed on startup. Defaults to true. // This should only be set to false for testing purposes From b17483db697389f373d21eaff729b79d36c6606a Mon Sep 17 00:00:00 2001 From: Garand Tyson Date: Fri, 1 May 2026 02:16:13 -0700 Subject: [PATCH 2/2] Added tests for experimental timer --- src/herder/HerderSCPDriver.cpp | 20 ++++ src/herder/HerderSCPDriver.h | 3 + src/herder/test/HerderTests.cpp | 161 ++++++++++++++++++++++++++++++++ src/main/ApplicationImpl.cpp | 11 +++ src/main/Config.cpp | 17 ++++ src/main/Config.h | 8 ++ src/scp/NominationProtocol.cpp | 25 +++++ src/scp/SCPDriver.h | 8 ++ src/scp/Slot.h | 8 +- src/test/test.cpp | 1 + src/util/Timer.cpp | 28 +++++- src/util/Timer.h | 22 +++++ 12 files changed, 307 insertions(+), 5 deletions(-) diff --git a/src/herder/HerderSCPDriver.cpp b/src/herder/HerderSCPDriver.cpp index 5a30402b02..75e4c8b749 100644 --- a/src/herder/HerderSCPDriver.cpp +++ b/src/herder/HerderSCPDriver.cpp @@ -484,6 +484,18 @@ void HerderSCPDriver::timerCallbackWrapper(uint64_t slotIndex, int timerID, std::function cb) { +#ifdef BUILD_TESTS + if (timerID == Slot::NOMINATION_EMIT_TIMER) + { + if (!mHerder.isTracking() || + mHerder.nextConsensusLedgerIndex() == slotIndex) + { + cb(); + } + return; + } +#endif + // reschedule timers for future slots when tracking if (mHerder.isTracking() && mHerder.nextConsensusLedgerIndex() != slotIndex) { @@ -609,6 +621,14 @@ HerderSCPDriver::computeTimeout(uint32 roundNumber, bool isNomination) return std::chrono::milliseconds(timeoutMS); } +#ifdef BUILD_TESTS +std::chrono::milliseconds +HerderSCPDriver::getNominationEmitDelayForTesting() const +{ + return mApp.getConfig().ARTIFICIALLY_DELAY_NOMINATION_EMIT_FOR_TESTING; +} +#endif + // returns true if l < r // lh, rh are the hashes of l,h static bool diff --git a/src/herder/HerderSCPDriver.h b/src/herder/HerderSCPDriver.h index b82f0bad11..506298fc3c 100644 --- a/src/herder/HerderSCPDriver.h +++ b/src/herder/HerderSCPDriver.h @@ -84,6 +84,9 @@ class HerderSCPDriver : public SCPDriver void stopTimer(uint64 slotIndex, int timerID) override; std::chrono::milliseconds computeTimeout(uint32 roundNumber, bool isNomination) override; +#ifdef BUILD_TESTS + std::chrono::milliseconds getNominationEmitDelayForTesting() const override; +#endif // hashing support Hash getHashOf(std::vector> const& vals) const override; diff --git a/src/herder/test/HerderTests.cpp b/src/herder/test/HerderTests.cpp index 75d9f1531e..ae0ee9352e 100644 --- a/src/herder/test/HerderTests.cpp +++ b/src/herder/test/HerderTests.cpp @@ -8297,3 +8297,164 @@ TEST_CASE("far-future slots cleanup", "[herder]") // Check that far-future slots have been removed REQUIRE(herder0.getSCP().getHighestKnownSlotIndex() < FAR_FUTURE_BASE); } + +TEST_CASE("experimental trigger timer", "[herder][hide]") +{ + constexpr uint32_t LEDGERS_TO_RUN = 10; + constexpr int64_t MIN_DRIFT_FALLBACKS = (LEDGERS_TO_RUN + 1) / 2; + auto const driftOffset = std::chrono::seconds(4); + + struct RunResult + { + std::chrono::milliseconds elapsed; + int64_t totalFallbacks{0}; + int64_t driftedNodeFallbacks{0}; + int64_t otherNodeFallbacks{0}; + bool sawNominationTimeout{false}; + }; + + auto fallbackCount = [](Application::pointer const& app) { + auto const metrics = app->getMetrics().GetAllMetrics(); + auto const it = + metrics.find({"scp", "trigger", "prepare-start-fallback"}); + if (it == metrics.end()) + { + return int64_t{0}; + } + auto meter = dynamic_cast(it->second.get()); + releaseAssert(meter); + return static_cast(meter->count()); + }; + + auto runSimulation = + [&](bool experimentalTriggerTimer, + std::chrono::milliseconds nominationEmitDelay, + std::chrono::milliseconds triggerClockOffset = + std::chrono::milliseconds::zero()) -> RunResult { + auto networkID = sha256(getTestConfig().NETWORK_PASSPHRASE); + + auto simulation = Topologies::separateAllHighQuality( + 4, Simulation::OVER_TCP, networkID, [&](int i) { + auto cfg = getTestConfig(i, Config::TESTDB_DEFAULT); + cfg.ARTIFICIALLY_ACCELERATE_TIME_FOR_TESTING = false; + cfg.EXPERIMENTAL_TRIGGER_TIMER = experimentalTriggerTimer; + cfg.ARTIFICIALLY_DELAY_LEDGER_CLOSE_FOR_TESTING = + std::chrono::milliseconds(1000); + cfg.ARTIFICIALLY_DELAY_NOMINATION_EMIT_FOR_TESTING = + nominationEmitDelay; + + // Drift one validator. Note: i == 0 is the Simulation's + // idle app (its config is generated first by the constructor), + // so the first real validator is i == 1. + if (i == 1) + { + cfg.ARTIFICIALLY_SET_SYSTEM_CLOCK_OFFSET_FOR_TESTING = + triggerClockOffset; + } + return cfg; + }); + + simulation->fullyConnectAllPending(); + simulation->startAllNodes(); + auto nodes = simulation->getNodes(); + auto const expectedClose = simulation->getExpectedLedgerCloseTime(); + REQUIRE(expectedClose == std::chrono::seconds(5)); + + std::vector fallbackCounts; + std::transform(nodes.begin(), nodes.end(), + std::back_inserter(fallbackCounts), fallbackCount); + + auto minLedger = [&]() { + return std::min_element(nodes.begin(), nodes.end(), + [](Application::pointer const& lhs, + Application::pointer const& rhs) { + return lhs->getLedgerManager() + .getLastClosedLedgerNum() < + rhs->getLedgerManager() + .getLastClosedLedgerNum(); + }) + ->get() + ->getLedgerManager() + .getLastClosedLedgerNum(); + }; + + auto const startLedger = minLedger(); + auto targetLedger = startLedger + LEDGERS_TO_RUN; + auto const startTime = nodes.front()->getClock().now(); + + simulation->crankUntil( + [&]() { return simulation->haveAllExternalized(targetLedger, 1); }, + 10 * (LEDGERS_TO_RUN + 1) * expectedClose, true); + + RunResult result; + result.elapsed = std::chrono::duration_cast( + nodes.front()->getClock().now() - startTime); + + for (size_t i = 0; i < nodes.size(); ++i) + { + auto const delta = fallbackCount(nodes[i]) - fallbackCounts.at(i); + result.totalFallbacks += delta; + + auto const isDriftedNode = + triggerClockOffset != std::chrono::milliseconds::zero() && + nodes[i]->getConfig() + .ARTIFICIALLY_SET_SYSTEM_CLOCK_OFFSET_FOR_TESTING == + triggerClockOffset; + if (isDriftedNode) + { + result.driftedNodeFallbacks = delta; + } + else + { + result.otherNodeFallbacks += delta; + } + + auto const& driver = + dynamic_cast(nodes[i]->getHerder()) + .getHerderSCPDriver(); + for (uint32_t ledger = startLedger + 1; ledger <= targetLedger; + ++ledger) + { + auto timeouts = driver.getNominationTimeouts(ledger); + result.sawNominationTimeout = + result.sawNominationTimeout || + (timeouts.has_value() && timeouts.value() > 0); + } + } + + return result; + }; + + // New timer is faster without drift. + { + auto const nominationDelay = std::chrono::milliseconds(1000); + auto const oldTimer = runSimulation(false, nominationDelay); + auto const newTimer = runSimulation(true, nominationDelay); + + REQUIRE(oldTimer.elapsed > newTimer.elapsed); + REQUIRE(newTimer.elapsed < oldTimer.elapsed); + REQUIRE(newTimer.totalFallbacks == 0); + } + + // One node drifting ahead falls back. + { + auto const nodeAhead = + runSimulation(true, std::chrono::milliseconds::zero(), driftOffset); + REQUIRE(nodeAhead.driftedNodeFallbacks >= MIN_DRIFT_FALLBACKS); + } + + // One node drifting behind falls back. + { + auto const nodeBehind = runSimulation( + true, std::chrono::milliseconds::zero(), -driftOffset); + REQUIRE(nodeBehind.driftedNodeFallbacks >= MIN_DRIFT_FALLBACKS); + } + + // Long nomination does not cause timer fallback + { + auto const nominationDelay = std::chrono::milliseconds(5000); + auto const slowNomination = runSimulation(true, nominationDelay); + REQUIRE(slowNomination.sawNominationTimeout); + REQUIRE(slowNomination.totalFallbacks == 0); + } +} diff --git a/src/main/ApplicationImpl.cpp b/src/main/ApplicationImpl.cpp index eb3d610fce..27d27d2aae 100644 --- a/src/main/ApplicationImpl.cpp +++ b/src/main/ApplicationImpl.cpp @@ -148,6 +148,17 @@ ApplicationImpl::ApplicationImpl(VirtualClock& clock, Config const& cfg) homeStr += mConfig.NODE_HOME_DOMAIN; TracyAppInfo(homeStr.c_str(), homeStr.size()); +#ifdef BUILD_TESTS + if (mConfig.ARTIFICIALLY_SET_SYSTEM_CLOCK_OFFSET_FOR_TESTING != + std::chrono::milliseconds::zero()) + { + mVirtualClock.setSystemTimeOffset( + std::chrono::duration_cast( + mConfig.ARTIFICIALLY_SET_SYSTEM_CLOCK_OFFSET_FOR_TESTING)); + mStartedOn = clock.system_now(); + } +#endif + mStopSignals.async_wait([this](asio::error_code const& ec, int sig) { if (!ec) { diff --git a/src/main/Config.cpp b/src/main/Config.cpp index a9f3f74cf5..47bc27ae27 100644 --- a/src/main/Config.cpp +++ b/src/main/Config.cpp @@ -68,6 +68,7 @@ static std::unordered_set const TESTING_ONLY_OPTIONS = { "ARTIFICIALLY_SET_SURVEY_PHASE_DURATION_FOR_TESTING", "ARTIFICIALLY_DELAY_BUCKET_APPLICATION_FOR_TESTING", "ARTIFICIALLY_SLEEP_MAIN_THREAD_FOR_TESTING", + "ARTIFICIALLY_DELAY_NOMINATION_EMIT_FOR_TESTING", "ARTIFICIALLY_SKIP_CONNECTION_ADJUSTMENT_FOR_TESTING", "ARTIFICIALLY_DELAY_LEDGER_CLOSE_FOR_TESTING", "SKIP_HIGH_CRITICAL_VALIDATOR_CHECKS_FOR_TESTING", @@ -358,6 +359,10 @@ Config::Config() : NODE_SEED(SecretKey::random()) CATCHUP_SKIP_KNOWN_RESULTS_FOR_TESTING = false; MODE_USES_IN_MEMORY_LEDGER = false; SKIP_HIGH_CRITICAL_VALIDATOR_CHECKS_FOR_TESTING = false; + ARTIFICIALLY_SET_SYSTEM_CLOCK_OFFSET_FOR_TESTING = + std::chrono::milliseconds::zero(); + ARTIFICIALLY_DELAY_NOMINATION_EMIT_FOR_TESTING = + std::chrono::milliseconds::zero(); #endif #ifdef BEST_OFFER_DEBUGGING @@ -1961,6 +1966,18 @@ Config::processConfig(std::shared_ptr t) ARTIFICIALLY_SLEEP_MAIN_THREAD_FOR_TESTING = std::chrono::microseconds(readInt(item)); }}, +#ifdef BUILD_TESTS + {"ARTIFICIALLY_SET_SYSTEM_CLOCK_OFFSET_FOR_TESTING", + [&]() { + ARTIFICIALLY_SET_SYSTEM_CLOCK_OFFSET_FOR_TESTING = + std::chrono::milliseconds(readInt(item)); + }}, + {"ARTIFICIALLY_DELAY_NOMINATION_EMIT_FOR_TESTING", + [&]() { + ARTIFICIALLY_DELAY_NOMINATION_EMIT_FOR_TESTING = + std::chrono::milliseconds(readInt(item)); + }}, +#endif {"MAX_DEX_TX_OPERATIONS_IN_TX_SET", [&]() { auto value = readInt(item); diff --git a/src/main/Config.h b/src/main/Config.h index a5c0e9d4d6..08a7723a70 100644 --- a/src/main/Config.h +++ b/src/main/Config.h @@ -928,6 +928,14 @@ class Config : public std::enable_shared_from_this // bounds on config upgrades (for testing only). bool TESTING_IGNORE_LEDGER_TIME_UPGRADE_BOUNDS; + // Injects a signed wall-clock offset into the node's system clock for + // testing. Expressed in milliseconds. + std::chrono::milliseconds ARTIFICIALLY_SET_SYSTEM_CLOCK_OFFSET_FOR_TESTING; + + // Delay emission of updated nomination messages for testing nomination + // timeout behavior. Expressed in milliseconds. + std::chrono::milliseconds ARTIFICIALLY_DELAY_NOMINATION_EMIT_FOR_TESTING; + // Set QUORUM_SET using automatic quorum set configuration based on // `validators`. void diff --git a/src/scp/NominationProtocol.cpp b/src/scp/NominationProtocol.cpp index 1cdc267a30..89de9ff8be 100644 --- a/src/scp/NominationProtocol.cpp +++ b/src/scp/NominationProtocol.cpp @@ -676,6 +676,31 @@ NominationProtocol::nominate(ValueWrapperPtr value, Value const& previousValue, slot->nominate(value, previousValue, true); }); +#ifdef BUILD_TESTS + // If a nomination-emit delay is configured, defer the emit by + // arming NOMINATION_EMIT_TIMER instead of broadcasting now. A subsequent + // nominate() will replace the timer so the eventual emit reflects the + // latest state. Skip the deferral once the SCP nomination timeout has + // fired (`timedout`) so retries don't compound latency, and cancel any + // still-armed timer in that case. + if (updated) + { + auto delay = mSlot.getSCPDriver().getNominationEmitDelayForTesting(); + if (delay > std::chrono::milliseconds::zero() && !timedout) + { + mSlot.getSCPDriver().setupTimer( + mSlot.getSlotIndex(), Slot::NOMINATION_EMIT_TIMER, delay, + [slot, this]() { emitNomination(); }); + return updated; + } + if (delay > std::chrono::milliseconds::zero()) + { + mSlot.getSCPDriver().stopTimer(mSlot.getSlotIndex(), + Slot::NOMINATION_EMIT_TIMER); + } + } +#endif + if (updated) { emitNomination(); diff --git a/src/scp/SCPDriver.h b/src/scp/SCPDriver.h index 7c6dc01fc8..fe9b24637c 100644 --- a/src/scp/SCPDriver.h +++ b/src/scp/SCPDriver.h @@ -191,6 +191,14 @@ class SCPDriver virtual std::chrono::milliseconds computeTimeout(uint32 roundNumber, bool isNomination) = 0; +#ifdef BUILD_TESTS + virtual std::chrono::milliseconds + getNominationEmitDelayForTesting() const + { + return std::chrono::milliseconds::zero(); + } +#endif + // returns the weight of the node within the qset normalized between // 0-UINT64_MAX. If `nodeID` is the local node, then set `isLocalNode` to // `true`. diff --git a/src/scp/Slot.h b/src/scp/Slot.h index 20eca36d95..ecf1e8fee5 100644 --- a/src/scp/Slot.h +++ b/src/scp/Slot.h @@ -197,7 +197,13 @@ class Slot : public std::enable_shared_from_this enum timerIDs { NOMINATION_TIMER = 0, - BALLOT_PROTOCOL_TIMER = 1 + BALLOT_PROTOCOL_TIMER = 1, +#ifdef BUILD_TESTS + // Test-only: defers broadcast of a nomination vote by + // ARTIFICIALLY_DELAY_NOMINATION_EMIT_FOR_TESTING. Used to simulate + // slow nominators without breaking the protocol. + NOMINATION_EMIT_TIMER = 2, +#endif }; // The number of times the timer has to expire before we consider the node diff --git a/src/test/test.cpp b/src/test/test.cpp index deedcb9e83..9045199576 100644 --- a/src/test/test.cpp +++ b/src/test/test.cpp @@ -548,6 +548,7 @@ getTestConfig(int instanceNumber, Config::TestDbMode mode) thisConfig.MANUAL_CLOSE = true; thisConfig.TEST_CASES_ENABLED = true; + thisConfig.EXPERIMENTAL_TRIGGER_TIMER = true; thisConfig.PEER_PORT = static_cast(DEFAULT_PEER_PORT + instanceNumber * 2); diff --git a/src/util/Timer.cpp b/src/util/Timer.cpp index 2d776faccf..3a90b6a74e 100644 --- a/src/util/Timer.cpp +++ b/src/util/Timer.cpp @@ -43,21 +43,33 @@ VirtualClock::now() const noexcept } } -VirtualClock::system_time_point -VirtualClock::system_now() const noexcept +std::pair +VirtualClock::actual_and_fake_system_now() const noexcept { + system_time_point actual; if (mMode == REAL_TIME) { - return std::chrono::system_clock::now(); + actual = std::chrono::system_clock::now(); } else { LOCK_GUARD(mVirtualNowMutex, lock); auto offset = mVirtualNow.time_since_epoch(); - return std::chrono::system_clock::time_point( + actual = std::chrono::system_clock::time_point( std::chrono::duration_cast< std::chrono::system_clock::time_point::duration>(offset)); } + auto drifted = actual; +#ifdef BUILD_TESTS + drifted += mSystemTimeOffset.load(std::memory_order_relaxed); +#endif + return {actual, drifted}; +} + +VirtualClock::system_time_point +VirtualClock::system_now() const noexcept +{ + return actual_and_fake_system_now().second; } void @@ -288,6 +300,14 @@ VirtualClock::setCurrentVirtualTime(system_time_point t) setCurrentVirtualTime(time_point(offset)); } +#ifdef BUILD_TESTS +void +VirtualClock::setSystemTimeOffset(std::chrono::microseconds offset) +{ + mSystemTimeOffset.store(offset, std::memory_order_relaxed); +} +#endif + void VirtualClock::sleep_for(std::chrono::microseconds us) { diff --git a/src/util/Timer.h b/src/util/Timer.h index 3ae767591c..089b486fe7 100644 --- a/src/util/Timer.h +++ b/src/util/Timer.h @@ -12,11 +12,13 @@ #include "util/Scheduler.h" #include "util/ThreadAnnotations.h" +#include #include #include #include #include #include +#include namespace stellar { @@ -209,6 +211,15 @@ class VirtualClock RealSteadyTimer mRealTimer; mutable ANNOTATED_MUTEX(mVirtualNowMutex); +#ifdef BUILD_TESTS + // Offset applied to system_now() to simulate wall-clock drift without + // affecting steady_clock scheduling. Positive values make system_now() + // return a time in the future; negative values make it return a time in + // the past. + std::atomic mSystemTimeOffset{ + std::chrono::microseconds{0}}; +#endif + public: // A VirtualClock is instantiated in either real or virtual mode. In real // mode, crank() sleeps until the next event, either timer or IO; in virtual @@ -231,6 +242,11 @@ class VirtualClock // someone has set the time forward using setCurrentVirtualTime below). system_time_point system_now() const noexcept; + // Returns both the unshifted and drifted system time samples captured from + // the same underlying timestamp source. + std::pair + actual_and_fake_system_now() const noexcept; + void enqueue(std::shared_ptr ve); void flushCancelledEvents(); bool cancelAllEvents(); @@ -254,6 +270,12 @@ class VirtualClock size_t getActionQueueSize() const; bool actionQueueIsOverloaded() const; Scheduler::ActionType currentSchedulerActionType() const; + +#ifdef BUILD_TESTS + // Inject a wall-clock offset into system_now() to simulate clock drift. + // Does not affect steady_clock (now()) or event scheduling. + void setSystemTimeOffset(std::chrono::microseconds offset); +#endif }; class VirtualClockEvent : public NonMovableOrCopyable