From a971fa19897d5861b1a28bb9d7bdc480c1b02da7 Mon Sep 17 00:00:00 2001
From: NightCrawler
Date: Tue, 10 Mar 2026 13:22:44 +0000
Subject: [PATCH] fix: re-enable legacy metrics reporter for audit bootstrap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The audit module's epoch-end recovery requires peer observations from
active probers. When the module was first activated on testnet, all
supernodes running v2.4.5-testnet had already been POSTPONED by the
legacy staleness handler (they stopped submitting
MsgReportSupernodeMetrics ~500 blocks after upgrading, before the
chain upgrade).

This created a deadlock:
- Recovery needs peer observations from active probers
- No active probers exist (empty active_supernode_accounts in every
  anchor)
- POSTPONED SNs submit epoch reports but cannot recover
- The 3 SNs on old releases bounce ACTIVE↔POSTPONED via legacy metrics
  but are always POSTPONED at epoch start (anchor freeze time)

Fix: run the legacy metrics reporter alongside the audit host_reporter.
Legacy MsgReportSupernodeMetrics recovers POSTPONED SNs to ACTIVE
mid-epoch. Since they also submit audit epoch reports, the audit
EndBlocker won't re-postpone them (report exists, host minimums are
disabled, no peer-port streak). They survive the epoch end and appear
ACTIVE in the next epoch anchor, bootstrapping the peer-observation
cycle for all remaining POSTPONED SNs.

Once the active set stabilizes, the legacy reporter can be removed in
a future release.
--- supernode/cmd/start.go | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/supernode/cmd/start.go b/supernode/cmd/start.go index be1c1c15..2ed3a04c 100644 --- a/supernode/cmd/start.go +++ b/supernode/cmd/start.go @@ -26,9 +26,7 @@ import ( hostReporterService "github.com/LumeraProtocol/supernode/v2/supernode/host_reporter" statusService "github.com/LumeraProtocol/supernode/v2/supernode/status" storageChallengeService "github.com/LumeraProtocol/supernode/v2/supernode/storage_challenge" - // Legacy supernode metrics reporter (MsgReportSupernodeMetrics) has been superseded by - // epoch-scoped audit reporting in `x/audit`. - // supernodeMetrics "github.com/LumeraProtocol/supernode/v2/supernode/supernode_metrics" + supernodeMetrics "github.com/LumeraProtocol/supernode/v2/supernode/supernode_metrics" "github.com/LumeraProtocol/supernode/v2/supernode/transport/gateway" cascadeRPC "github.com/LumeraProtocol/supernode/v2/supernode/transport/grpc/cascade" server "github.com/LumeraProtocol/supernode/v2/supernode/transport/grpc/status" @@ -173,18 +171,22 @@ The supernode will connect to the Lumera network and begin participating in the logtrace.Fatal(ctx, "Failed to initialize host reporter", logtrace.Fields{"error": err.Error()}) } - // Legacy on-chain supernode metrics reporting has been superseded by `x/audit`. - // metricsCollector := supernodeMetrics.NewCollector( - // statusSvc, - // lumeraClient, - // appConfig.SupernodeConfig.Identity, - // Version, - // kr, - // appConfig.SupernodeConfig.Port, - // appConfig.P2PConfig.Port, - // appConfig.SupernodeConfig.GatewayPort, - // ) - // logtrace.Info(ctx, "Metrics collection enabled", logtrace.Fields{}) + // Legacy on-chain supernode metrics reporting (MsgReportSupernodeMetrics) + // runs alongside the audit epoch reporter. 
It is needed to recover + // POSTPONED supernodes via the supernode module's instant-recovery + // path so they appear ACTIVE in the next epoch anchor — which + // bootstraps the audit peer-observation cycle. + metricsCollector := supernodeMetrics.NewCollector( + statusSvc, + lumeraClient, + appConfig.SupernodeConfig.Identity, + Version, + kr, + appConfig.SupernodeConfig.Port, + appConfig.P2PConfig.Port, + appConfig.SupernodeConfig.GatewayPort, + ) + logtrace.Info(ctx, "Legacy metrics collection enabled (audit bootstrap)", logtrace.Fields{}) // Storage challenge history DB (shared by the gRPC handler and runner). historyStore, err := queries.OpenHistoryDB() @@ -253,7 +255,7 @@ The supernode will connect to the Lumera network and begin participating in the // Start the services using the standard runner and capture exit servicesErr := make(chan error, 1) go func() { - services := []service{grpcServer, cService, p2pService, gatewayServer, hostReporter} + services := []service{grpcServer, cService, p2pService, gatewayServer, hostReporter, metricsCollector} if storageChallengeRunner != nil { services = append(services, storageChallengeRunner) }