diff --git a/tests/systemtests/audit_empty_active_set_bootstrap_test.go b/tests/systemtests/audit_empty_active_set_bootstrap_test.go new file mode 100644 index 00000000..c13a8083 --- /dev/null +++ b/tests/systemtests/audit_empty_active_set_bootstrap_test.go @@ -0,0 +1,191 @@ +//go:build system_test + +package system + +// This test validates the "empty active set deadlock" bootstrap scenario: +// +// When ALL supernodes are POSTPONED at epoch start, the epoch anchor has an +// empty active_supernode_accounts set. Without active probers, no peer +// observations are generated, and the audit module's recovery rule +// (compliant host report + peer all-ports-OPEN) can never be satisfied. +// +// The fix is to use legacy MsgReportSupernodeMetrics to recover SNs to +// ACTIVE mid-epoch. Combined with audit epoch reports, the SN survives +// the audit EndBlocker and appears in the next epoch's anchor, seeding +// the active set and bootstrapping the peer-observation cycle. +// +// Scenario: +// 1. Two supernodes register and start ACTIVE. +// 2. Neither submits epoch reports for epoch 0 → both POSTPONED at epoch 0 end. +// 3. Epoch 1: empty active set. Both submit host-only audit reports. +// Verify: audit recovery alone cannot recover them (no peer observations). +// 4. Legacy MsgReportSupernodeMetrics recovers both mid-epoch 2. +// 5. Epoch 2 end: audit enforcement checks them as ACTIVE — they have reports, +// host minimums disabled, no peer-port streak → they stay ACTIVE. +// 6. Epoch 3 (expected outcome, not asserted by this test): both are in the anchor active set → peer observations flow → self-sustaining. 
+ +import ( + "testing" + + sntypes "github.com/LumeraProtocol/lumera/x/supernode/v1/types" + "github.com/stretchr/testify/require" +) + +func TestAuditEmptyActiveSetBootstrap_LegacyMetricsBreaksDeadlock(t *testing.T) { + const ( + epochLengthBlocks = uint64(10) + originHeight = int64(1) + ) + + sut.ModifyGenesisJSON(t, + setSupernodeParamsForAuditTests(t), + setAuditParamsForFastEpochs(t, epochLengthBlocks, 1, 1, 1, []uint32{4444}), + ) + sut.StartChain(t) + + cli := NewLumeradCLI(t, sut, true) + n0 := getNodeIdentity(t, cli, "node0") + n1 := getNodeIdentity(t, cli, "node1") + + registerSupernode(t, cli, n0, "192.168.1.1") + registerSupernode(t, cli, n1, "192.168.1.2") + + // Sanity check: both supernodes must report ACTIVE right after registration. + require.Equal(t, "SUPERNODE_STATE_ACTIVE", querySupernodeLatestState(t, cli, n0.valAddr)) + require.Equal(t, "SUPERNODE_STATE_ACTIVE", querySupernodeLatestState(t, cli, n1.valAddr)) + + // ── Epoch 0: Do NOT submit any epoch reports. ── + // This simulates the testnet scenario where SNs were running releases + // without audit code when the chain upgraded to enable the audit module. "Epoch 0/1/2" are test-relative labels counted from epoch0Start; on-chain epoch IDs are derived from heights below. + currentHeight := sut.AwaitNextBlock(t) + _, epoch0Start := nextEpochAfterHeight(originHeight, epochLengthBlocks, currentHeight) + epoch1Start := epoch0Start + int64(epochLengthBlocks) + epoch2Start := epoch1Start + int64(epochLengthBlocks) + + // Wait for epoch 0 to end → both get POSTPONED for missing reports. + awaitAtLeastHeight(t, epoch1Start) + + require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n0.valAddr), + "node0 should be POSTPONED after missing epoch 0 report") + require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n1.valAddr), + "node1 should be POSTPONED after missing epoch 0 report") + + // ── Epoch 1: Empty active set — the deadlock. ── + epochID1 := uint64((epoch1Start - originHeight) / int64(epochLengthBlocks)) + + // Both submit host-only audit epoch reports for epochID1 (as POSTPONED reporters, no observations). 
+ hostOK := auditHostReportJSON([]string{"PORT_STATE_OPEN"}) + tx0 := submitEpochReport(t, cli, n0.nodeName, epochID1, hostOK, nil) + RequireTxSuccess(t, tx0) + tx1 := submitEpochReport(t, cli, n1.nodeName, epochID1, hostOK, nil) + RequireTxSuccess(t, tx1) + + // Wait for epoch 1 to end WITHOUT legacy metrics recovery. + // Both should remain POSTPONED — audit recovery fails (no peer observations). + awaitAtLeastHeight(t, epoch2Start) + + require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n0.valAddr), + "node0 should still be POSTPONED — audit recovery alone cannot break the deadlock") + require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n1.valAddr), + "node1 should still be POSTPONED — audit recovery alone cannot break the deadlock") + + // ── Epoch 2: Break the deadlock with legacy MsgReportSupernodeMetrics. ── + epochID2 := epochID1 + 1 + epoch3Start := epoch2Start + int64(epochLengthBlocks) + + // Submit legacy metrics → instant recovery to ACTIVE. Port 4444 mirrors the []uint32{4444} passed to setAuditParamsForFastEpochs above. + compliantMetrics := sntypes.SupernodeMetrics{ + VersionMajor: 2, + VersionMinor: 4, + VersionPatch: 5, + OpenPorts: []sntypes.PortStatus{ + {Port: 4444, State: sntypes.PortState_PORT_STATE_OPEN}, + }, + } + + hash0 := reportSupernodeMetrics(t, cli, n0.nodeName, n0.valAddr, n0.accAddr, compliantMetrics) + txJSON0 := waitForTx(t, cli, hash0) + resp0 := decodeTxResponse(t, txJSON0) + require.Equal(t, uint32(0), resp0.Code, "legacy metrics tx for node0 should succeed: %s", resp0.RawLog) + + hash1 := reportSupernodeMetrics(t, cli, n1.nodeName, n1.valAddr, n1.accAddr, compliantMetrics) + txJSON1 := waitForTx(t, cli, hash1) + resp1 := decodeTxResponse(t, txJSON1) + require.Equal(t, uint32(0), resp1.Code, "legacy metrics tx for node1 should succeed: %s", resp1.RawLog) + + // Both should now be ACTIVE (instant recovery via legacy path). 
+ require.Equal(t, "SUPERNODE_STATE_ACTIVE", querySupernodeLatestState(t, cli, n0.valAddr), + "node0 should be ACTIVE after legacy metrics recovery") + require.Equal(t, "SUPERNODE_STATE_ACTIVE", querySupernodeLatestState(t, cli, n1.valAddr), + "node1 should be ACTIVE after legacy metrics recovery") + + // Also submit audit epoch reports for epochID2 (same host-only hostOK payload) so the audit EndBlocker doesn't re-postpone them. + tx0e2 := submitEpochReport(t, cli, n0.nodeName, epochID2, hostOK, nil) + RequireTxSuccess(t, tx0e2) + tx1e2 := submitEpochReport(t, cli, n1.nodeName, epochID2, hostOK, nil) + RequireTxSuccess(t, tx1e2) + + // Wait for epoch 2 to end, i.e. until the chain height reaches epoch3Start. + awaitAtLeastHeight(t, epoch3Start) + + // ── Verify: both survive the audit EndBlocker and remain ACTIVE. ── + require.Equal(t, "SUPERNODE_STATE_ACTIVE", querySupernodeLatestState(t, cli, n0.valAddr), + "node0 should remain ACTIVE after epoch 2 enforcement (legacy metrics + audit report)") + require.Equal(t, "SUPERNODE_STATE_ACTIVE", querySupernodeLatestState(t, cli, n1.valAddr), + "node1 should remain ACTIVE after epoch 2 enforcement (legacy metrics + audit report)") +} + +// TestAuditEmptyActiveSetDeadlock_HostOnlyReportsCannotRecover verifies that +// when all supernodes are POSTPONED, submitting host-only epoch reports across +// three consecutive epochs is insufficient for recovery — proving the deadlock exists. +func TestAuditEmptyActiveSetDeadlock_HostOnlyReportsCannotRecover(t *testing.T) { + const ( + epochLengthBlocks = uint64(10) + originHeight = int64(1) + ) + + sut.ModifyGenesisJSON(t, + setSupernodeParamsForAuditTests(t), + setAuditParamsForFastEpochs(t, epochLengthBlocks, 1, 1, 1, []uint32{4444}), + ) + sut.StartChain(t) + + cli := NewLumeradCLI(t, sut, true) + n0 := getNodeIdentity(t, cli, "node0") + n1 := getNodeIdentity(t, cli, "node1") + + registerSupernode(t, cli, n0, "192.168.1.1") + registerSupernode(t, cli, n1, "192.168.1.2") + + // Epoch 0 (test-relative label): no reports → both POSTPONED. 
+ currentHeight := sut.AwaitNextBlock(t) + _, epoch0Start := nextEpochAfterHeight(originHeight, epochLengthBlocks, currentHeight) + epoch1Start := epoch0Start + int64(epochLengthBlocks) + + awaitAtLeastHeight(t, epoch1Start) + + require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n0.valAddr)) + require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n1.valAddr)) + + // Submit host-only reports for 3 consecutive epochs starting at epoch1Start; after each epoch ends, both must still be POSTPONED. + hostOK := auditHostReportJSON([]string{"PORT_STATE_OPEN"}) + for i := 0; i < 3; i++ { + epochStart := epoch1Start + int64(i)*int64(epochLengthBlocks) + nextEpochStart := epochStart + int64(epochLengthBlocks) + epochID := uint64((epochStart - originHeight) / int64(epochLengthBlocks)) + + awaitAtLeastHeight(t, epochStart) + + tx0 := submitEpochReport(t, cli, n0.nodeName, epochID, hostOK, nil) + RequireTxSuccess(t, tx0) + tx1 := submitEpochReport(t, cli, n1.nodeName, epochID, hostOK, nil) + RequireTxSuccess(t, tx1) + + awaitAtLeastHeight(t, nextEpochStart) + + require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n0.valAddr), + "node0 should remain POSTPONED in epoch %d — no peer observations possible", epochID) + require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n1.valAddr), + "node1 should remain POSTPONED in epoch %d — no peer observations possible", epochID) + } +} diff --git a/x/audit/v1/keeper/enforcement_empty_active_set_test.go b/x/audit/v1/keeper/enforcement_empty_active_set_test.go new file mode 100644 index 00000000..ce3e6fcc --- /dev/null +++ b/x/audit/v1/keeper/enforcement_empty_active_set_test.go @@ -0,0 +1,140 @@ +package keeper_test + +import ( + "testing" + + "github.com/LumeraProtocol/lumera/testutil/cryptotestutils" + "github.com/LumeraProtocol/lumera/x/audit/v1/types" + sntypes "github.com/LumeraProtocol/lumera/x/supernode/v1/types" + sdk "github.com/cosmos/cosmos-sdk/types" + "go.uber.org/mock/gomock" +) 
+ +// TestEnforceEpochEnd_EmptyActiveSet_PostponedCannotRecover verifies that when +// the active set is empty (all supernodes POSTPONED), submitting compliant +// host-only epoch reports is insufficient for recovery because no peer +// observations exist. This is the "empty active set deadlock". +func TestEnforceEpochEnd_EmptyActiveSet_PostponedCannotRecover(t *testing.T) { + f := initFixture(t) + + _, sn0Acc, sn0Val := cryptotestutils.SupernodeAddresses() + _, sn1Acc, sn1Val := cryptotestutils.SupernodeAddresses() + + sn0 := sntypes.SuperNode{ + SupernodeAccount: sn0Acc.String(), + ValidatorAddress: sdk.ValAddress(sn0Val).String(), + } + sn1 := sntypes.SuperNode{ + SupernodeAccount: sn1Acc.String(), + ValidatorAddress: sdk.ValAddress(sn1Val).String(), + } + + params := types.DefaultParams() + params.RequiredOpenPorts = []uint32{4444} + params.ConsecutiveEpochsToPostpone = 1 + + epochID := uint64(1) + + // Both POSTPONED supernodes have host-only reports for epochID, stored directly via keeper.SetReport with a zero-value HostReport. + for _, sn := range []sntypes.SuperNode{sn0, sn1} { + err := f.keeper.SetReport(f.ctx, types.EpochReport{ + SupernodeAccount: sn.SupernodeAccount, + EpochId: epochID, + ReportHeight: f.ctx.BlockHeight(), + HostReport: types.HostReport{}, + }) + if err != nil { + t.Fatalf("failed to set report for %s: %v", sn.SupernodeAccount, err) + } + } + + // No StorageChallengeReportIndex entries — no one probed anyone + // (empty active set means no probers were assigned). + + // Mock: the ACTIVE query returns an empty list and the POSTPONED query returns sn0 and sn1; each query is expected exactly once. + f.supernodeKeeper.EXPECT(). + GetAllSuperNodes(gomock.AssignableToTypeOf(f.ctx), sntypes.SuperNodeStateActive). + Return([]sntypes.SuperNode{}, nil). + Times(1) + f.supernodeKeeper.EXPECT(). + GetAllSuperNodes(gomock.AssignableToTypeOf(f.ctx), sntypes.SuperNodeStatePostponed). + Return([]sntypes.SuperNode{sn0, sn1}, nil). + Times(1) + + // Recovery should NOT be called — no peer observations exist. Times(0) turns any call into a test failure. + f.supernodeKeeper.EXPECT(). + RecoverSuperNodeFromPostponed(gomock.Any(), gomock.Any()). 
+ Times(0) + + err := f.keeper.EnforceEpochEnd(f.ctx, epochID, params) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } +} + +// TestEnforceEpochEnd_LegacyRecoveredSN_SurvivesWithReport verifies that a +// supernode which was recovered to ACTIVE mid-epoch (e.g., by legacy +// MsgReportSupernodeMetrics) and also submitted an audit epoch report +// is NOT re-postponed at epoch end, even when no peer observations exist. +// +// This confirms the fix: legacy metrics recovery + audit epoch report = +// the SN survives enforcement (asserted here as: SetSuperNodePostponed is never called) and can appear in the next epoch's anchor. +func TestEnforceEpochEnd_LegacyRecoveredSN_SurvivesWithReport(t *testing.T) { + f := initFixture(t) + + _, sn0Acc, sn0Val := cryptotestutils.SupernodeAddresses() + _, sn1Acc, sn1Val := cryptotestutils.SupernodeAddresses() + + sn0 := sntypes.SuperNode{ + SupernodeAccount: sn0Acc.String(), + ValidatorAddress: sdk.ValAddress(sn0Val).String(), + } + sn1 := sntypes.SuperNode{ + SupernodeAccount: sn1Acc.String(), + ValidatorAddress: sdk.ValAddress(sn1Val).String(), + } + + params := types.DefaultParams() + params.RequiredOpenPorts = []uint32{4444} + params.ConsecutiveEpochsToPostpone = 1 + + epochID := uint64(1) + + // Both supernodes submitted epoch reports (host-only, as they were + // POSTPONED when submitting — no storage challenge observations). + for _, sn := range []sntypes.SuperNode{sn0, sn1} { + err := f.keeper.SetReport(f.ctx, types.EpochReport{ + SupernodeAccount: sn.SupernodeAccount, + EpochId: epochID, + ReportHeight: f.ctx.BlockHeight(), + HostReport: types.HostReport{}, + }) + if err != nil { + t.Fatalf("failed to set report for %s: %v", sn.SupernodeAccount, err) + } + } + + // Simulate: both were recovered to ACTIVE mid-epoch via legacy metrics. + // At epoch end, the audit enforcement sees them as ACTIVE (the mocked ACTIVE query returns sn0 and sn1). + f.supernodeKeeper.EXPECT(). + GetAllSuperNodes(gomock.AssignableToTypeOf(f.ctx), sntypes.SuperNodeStateActive). + Return([]sntypes.SuperNode{sn0, sn1}, nil). 
+ Times(1) + f.supernodeKeeper.EXPECT(). + GetAllSuperNodes(gomock.AssignableToTypeOf(f.ctx), sntypes.SuperNodeStatePostponed). + Return([]sntypes.SuperNode{}, nil). + Times(1) + + // They have reports → no missing-report postponement. + // Host minimums are all 0 (NOTE(review): relies on types.DefaultParams() — confirm) → no violation. + // No peer observations → peersPortStateMeetsThreshold returns false → no streak → no postponement. + // Expect: SetSuperNodePostponed is NEVER called (Times(0) makes any call fail the test). + f.supernodeKeeper.EXPECT(). + SetSuperNodePostponed(gomock.Any(), gomock.Any(), gomock.Any()). + Times(0) + + err := f.keeper.EnforceEpochEnd(f.ctx, epochID, params) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } +}