diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 91afb3dfb7..2aaaf7ec04 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -187,6 +187,160 @@ jobs: "./integration_test/contracts/verify_flatkv_partial_loss_fails_loudly.sh", ], }, + { + # FlatKV EVM migrate cluster coverage. + # + # The cluster boots with GIGA_MIGRATE_FROM_MEMIAVL=true so + # every validator starts in sc-write-mode = memiavl_only, + # i.e. v0 (FlatKV not yet allocated). This is the inverse of + # the FlatKV Integration row above (which boots in + # test_only_dual_write) and is what makes the migration + # script's pre-flip check meaningful: if the matrix env ever + # silently lands the cluster in any mode other than + # memiavl_only, the script will fail loudly at the pre-flip + # grep instead of "succeeding" with a no-op migration. + # + # Steps: + # 1 Deposit an EVM fixture while in v0 so the migration + # has real account+code+storage to drain. The fixture + # writes roughly 4000 storage keys by default so the + # migration spans multiple batches instead of trivially + # completing against an empty or smoke-sized tree. + # 2 Coordinated stop -> sed sc-write-mode -> restart on + # all 4 validators, then poll seidb migrate-evm-status + # until every validator reports completion. Cross- + # validator FlatKV digest agreement is asserted at a + # shared post-migration height; any non-determinism in + # the batch copier would surface here as a digest + # mismatch. + # 3 Re-run the fixture round-trip check against the + # now-FlatKV-backed EVM state, confirming pre-migration + # data survives the migration intact (read transparency). + name: "FlatKV EVM Migrate", + env: "GIGA_MIGRATE_FROM_MEMIAVL=true", + scripts: [ + "docker exec sei-node-0 integration_test/contracts/deploy_flatkv_evm_fixture.sh", + "./integration_test/contracts/verify_flatkv_evm_migrate.sh", + "docker exec sei-node-0 integration_test/contracts/verify_flatkv_evm_store.sh", + ], + }, + # --------------------------------------------------------------- + # FlatKV Migrate Resilience / Live Workload / Statesync are + # parked alongside the other five rows below: each of them ends + # up advancing the chain past the migration boundary while EVM + # traffic is being driven (or the post-migration workload script + # is invoked directly), so the very first EndBlock after the + # boundary trips + # + # panic recovered in ProcessBlock + # panic="RouterCommitKVStore.Iterator(store=\"evm\"): \ + # iteration not supported after migration completion \ + # for store \"evm\"" + # + # in x/evm/keeper.RemoveFirstNTxHashes. Re-enable these rows + # together with the keeper/router fix tracked below. + # --------------------------------------------------------------- + # { + # name: "FlatKV Migrate Resilience", + # env: "GIGA_MIGRATE_FROM_MEMIAVL=true", + # scripts: [ + # "docker exec sei-node-0 integration_test/contracts/deploy_flatkv_evm_fixture.sh", + # "./integration_test/contracts/verify_flatkv_migrate_resume_after_crash.sh", + # "./integration_test/contracts/verify_flatkv_post_migrate_evm_workload.sh", + # ], + # }, + # { + # name: "FlatKV Migrate Live Workload", + # env: "GIGA_MIGRATE_FROM_MEMIAVL=true", + # scripts: [ + # "docker exec sei-node-0 integration_test/contracts/deploy_flatkv_evm_fixture.sh", + # "./integration_test/contracts/verify_flatkv_migrate_with_live_workload.sh", + # ], + # }, + # { + # name: "FlatKV Migrate Statesync", + # env: "GIGA_MIGRATE_FROM_MEMIAVL=true", + # scripts: [ + # "docker exec sei-node-0 integration_test/contracts/deploy_flatkv_evm_fixture.sh", + # "./integration_test/contracts/verify_flatkv_migrate_statesync.sh", + # ], + # }, + { + name: "FlatKV Migrate Idle", + env: "GIGA_MIGRATE_FROM_MEMIAVL=true", + scripts: [ + "./integration_test/contracts/verify_flatkv_migrate_idle.sh", + ], + }, + { + name: "FlatKV Migrate Negative", + env: "GIGA_MIGRATE_FROM_MEMIAVL=true", + scripts: [ + "docker exec sei-node-0 integration_test/contracts/deploy_flatkv_evm_fixture.sh", + "./integration_test/contracts/verify_flatkv_partial_quorum_flip_halts.sh", + ], + }, + # --------------------------------------------------------------- + # The following five FlatKV v0->v1 EVM-migration rows are + # parked (not wired into CI) until the post-migration EVM + # iterator regression is fixed. They all reliably fail with + # + # panic recovered in ProcessBlock + # panic="RouterCommitKVStore.Iterator(store=\"evm\"): \ + # iteration not supported after migration completion \ + # for store \"evm\"" + # + # at the first EndBlock after migration completes, because + # x/evm/keeper.RemoveFirstNTxHashes does a prefix iterator on + # the evm store and RouterCommitKVStore refuses iteration once + # the evm store has cut over to FlatKV. Re-enable these rows + # together with the keeper/router fix. + # + # Helpers (lib/flatkv_migration.sh), verify scripts and the + # MIGRATE_BATCH_EDGE knob are kept in-tree so the bug-fix PR + # only needs to wire the rows back in. + # --------------------------------------------------------------- + # { + # name: "FlatKV Migrate AppHash Continuity", + # env: "GIGA_MIGRATE_FROM_MEMIAVL=true", + # scripts: [ + # "docker exec sei-node-0 integration_test/contracts/deploy_flatkv_evm_fixture.sh", + # "./integration_test/contracts/verify_flatkv_migrate_apphash_continuity.sh", + # ], + # }, + # { + # name: "FlatKV Post Migrate Iterator", + # env: "GIGA_MIGRATE_FROM_MEMIAVL=true", + # scripts: [ + # "docker exec sei-node-0 integration_test/contracts/deploy_flatkv_evm_fixture.sh", + # "./integration_test/contracts/verify_flatkv_evm_migrate.sh", + # "./integration_test/contracts/verify_flatkv_post_migrate_iterator.sh", + # ], + # }, + # { + # name: "FlatKV Migrate Graceful Restart", + # env: "GIGA_MIGRATE_FROM_MEMIAVL=true", + # scripts: [ + # "docker exec sei-node-0 integration_test/contracts/deploy_flatkv_evm_fixture.sh", + # "./integration_test/contracts/verify_flatkv_migrate_graceful_restart.sh", + # ], + # }, + # { + # name: "FlatKV Migrate Batch Min", + # env: "GIGA_MIGRATE_FROM_MEMIAVL=true", + # scripts: [ + # "docker exec -e FLATKV_EVM_BULK_STORAGE_KEYS=50 sei-node-0 integration_test/contracts/deploy_flatkv_evm_fixture.sh", + # "MIGRATE_BATCH_EDGE=min ./integration_test/contracts/verify_flatkv_migrate_batch_edge.sh", + # ], + # }, + # { + # name: "FlatKV Migrate Batch Oneshot", + # env: "GIGA_MIGRATE_FROM_MEMIAVL=true", + # scripts: [ + # "docker exec sei-node-0 integration_test/contracts/deploy_flatkv_evm_fixture.sh", + # "MIGRATE_BATCH_EDGE=oneshot ./integration_test/contracts/verify_flatkv_migrate_batch_edge.sh", + # ], + # }, { name: "EVM Module", env: "GIGA_STORAGE=true", diff --git a/Makefile b/Makefile index 0726b627cc..a434077dd3 100644 --- a/Makefile +++ b/Makefile @@ -295,7 +295,8 @@ CLUSTER_ENV_VARS = DOCKER_PLATFORM=$(DOCKER_PLATFORM) USERID=$(shell id -u) GROU GIGA_OCC=$(GIGA_OCC) \ RECEIPT_BACKEND=$(RECEIPT_BACKEND) \ AUTOBAHN=$(AUTOBAHN) \ - GIGA_STORAGE=$(GIGA_STORAGE) + GIGA_STORAGE=$(GIGA_STORAGE) \ + GIGA_MIGRATE_FROM_MEMIAVL=$(GIGA_MIGRATE_FROM_MEMIAVL) # Run a 4-node docker containers docker-cluster-start: docker-cluster-stop build-docker-node @@ -321,7 +322,7 @@ docker-cluster-start-skipbuild: docker-cluster-stop build-docker-node else \ DETACH_FLAG=""; \ fi; \ - DOCKER_PLATFORM=$(DOCKER_PLATFORM) USERID=$(shell id -u) GROUPID=$(shell id -g) GOCACHE=$(shell go env GOCACHE) NUM_ACCOUNTS=10 SKIP_BUILD=true docker compose up $$DETACH_FLAG + $(CLUSTER_ENV_VARS) SKIP_BUILD=true docker compose up $$DETACH_FLAG .PHONY: localnet-start # Stop 4-node docker containers diff --git a/app/seidb.go b/app/seidb.go index 2bb36599df..36a26ec5a0 100644 --- a/app/seidb.go +++ b/app/seidb.go @@ -28,6 +28,14 @@ const ( FlagSCHistoricalProofRateLimit = "state-commit.sc-historical-proof-rate-limit" FlagSCHistoricalProofBurst = "state-commit.sc-historical-proof-burst" FlagSCWriteMode = "state-commit.sc-write-mode" + // Per-block batch size used by the MigrationManager when sc-write-mode + // is one of the in-flight modes (migrate_evm, migrate_bank, + // migrate_all_but_bank). Optional: when unset in app.toml the field + // stays at DefaultStateCommitConfig().KeysToMigratePerBlock (= 1024), + // which is appropriate for production drains. Lowering it spreads the + // migration across more blocks, which is useful for tests that need to + // exercise the resume / hybrid-read path mid-flight. + FlagSCKeysToMigratePerBlock = "state-commit.sc-keys-to-migrate-per-block" // SS Store configs FlagSSEnable = "state-store.ss-enable" @@ -119,6 +127,16 @@ func parseSCConfigs(appOpts servertypes.AppOptions) config.StateCommitConfig { if v := appOpts.Get(FlagSCHistoricalProofBurst); v != nil { scConfig.HistoricalProofBurst = cast.ToInt(v) } + // Guard with v != nil so that an absent app.toml entry preserves the + // default of 1024 instead of clobbering it to 0, which would fail + // StateCommitConfig.Validate ("keys-to-migrate-per-block must be > 0") + // and bring the node down at startup the first time write-mode is + // flipped to a migration mode. + if v := appOpts.Get(FlagSCKeysToMigratePerBlock); v != nil { + if n := cast.ToInt(v); n > 0 { + scConfig.KeysToMigratePerBlock = n + } + } return scConfig } diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 4eb760d1c2..74dd607b50 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -21,6 +21,7 @@ services: - RECEIPT_BACKEND - AUTOBAHN - GIGA_STORAGE + - GIGA_MIGRATE_FROM_MEMIAVL volumes: - "${PROJECT_HOME}:/sei-protocol/sei-chain:Z" - "${PROJECT_HOME}/../sei-tendermint:/sei-protocol/sei-tendermint:Z" @@ -54,6 +55,7 @@ services: - RECEIPT_BACKEND - AUTOBAHN - GIGA_STORAGE + - GIGA_MIGRATE_FROM_MEMIAVL volumes: - "${PROJECT_HOME}:/sei-protocol/sei-chain:Z" - "${PROJECT_HOME}/../sei-tendermint:/sei-protocol/sei-tendermint:Z" @@ -83,6 +85,7 @@ services: - RECEIPT_BACKEND - AUTOBAHN - GIGA_STORAGE + - GIGA_MIGRATE_FROM_MEMIAVL ports: - "26662-26664:26656-26658" - "9094-9095:9090-9091" @@ -116,6 +119,7 @@ services: - RECEIPT_BACKEND - AUTOBAHN - GIGA_STORAGE + - GIGA_MIGRATE_FROM_MEMIAVL ports: - "26665-26667:26656-26658" - "9096-9097:9090-9091" diff --git a/docker/localnode/config/app.toml b/docker/localnode/config/app.toml index a685e1e09c..1d6592ee87 100644 --- a/docker/localnode/config/app.toml +++ b/docker/localnode/config/app.toml @@ -236,6 +236,13 @@ sc-snapshot-writer-limit = 2 # CacheSize defines the size of the LRU cache for each store on top of the tree, default to 100000. sc-cache-size = 1000 +# KeysToMigratePerBlock controls how many EVM keys the in-flight migration +# (sc-write-mode = migrate_evm / migrate_bank / migrate_all_but_bank) drains +# from memiavl into flatkv per block. Default 1024 is appropriate for +# production drains; tests lower it to spread the migration across more +# blocks and exercise the resume / hybrid-read path. +sc-keys-to-migrate-per-block = 1024 + [state-store] # Enable defines if the state-store should be enabled for historical queries. diff --git a/docker/localnode/scripts/step4_config_override.sh b/docker/localnode/scripts/step4_config_override.sh index 019cbc5380..ab42440d49 100755 --- a/docker/localnode/scripts/step4_config_override.sh +++ b/docker/localnode/scripts/step4_config_override.sh @@ -5,6 +5,13 @@ GIGA_EXECUTOR=${GIGA_EXECUTOR:-false} GIGA_OCC=${GIGA_OCC:-false} AUTOBAHN=${AUTOBAHN:-false} GIGA_STORAGE=${GIGA_STORAGE:-false} +# GIGA_MIGRATE_FROM_MEMIAVL=true boots the cluster in v0 (memiavl_only): +# memiavl is the sole SC backend, FlatKV is not allocated. This is the +# starting point for the FlatKV EVM migrate cluster test, which drives a +# real workload in this mode and then performs a coordinated stop/flip/ +# restart into migrate_evm. Mutually exclusive with GIGA_STORAGE=true; +# the script picks the more specific override if both are set. +GIGA_MIGRATE_FROM_MEMIAVL=${GIGA_MIGRATE_FROM_MEMIAVL:-false} APP_CONFIG_FILE="build/generated/node_$NODE_ID/app.toml" TENDERMINT_CONFIG_FILE="build/generated/node_$NODE_ID/config.toml" @@ -23,11 +30,31 @@ sed -i.bak -e "s|^snapshot-directory *=.*|snapshot-directory = \"./build/generat # Enable slow mode sed -i.bak -e 's/slow = .*/slow = true/' ~/.sei/config/app.toml +# Boot the cluster in v0 (memiavl_only) for the FlatKV EVM migrate test. +# Doing this here keeps the override surface narrow: the test runner +# only has to set one env var to ship a v0-shaped config, and the +# follow-up flip script just rewrites sc-write-mode in place during the +# coordinated stop. +if [ "$GIGA_MIGRATE_FROM_MEMIAVL" = "true" ]; then + echo "Booting node $NODE_ID in memiavl_only mode (FlatKV EVM migrate starting point)..." + if grep -q '^sc-write-mode[[:space:]]*=' ~/.sei/config/app.toml; then + sed -i 's/^sc-write-mode[[:space:]]*=.*/sc-write-mode = "memiavl_only"/' ~/.sei/config/app.toml + else + sed -i '/^\[state-store\]/i sc-write-mode = "memiavl_only"' ~/.sei/config/app.toml + fi + # The EVM SS split is irrelevant in this mode (flatkv is not allocated), + # but explicitly disabling it keeps app.toml self-describing in case an + # operator inspects it post-flip. + sed -i 's/^evm-ss-split[[:space:]]*=.*/evm-ss-split = false/' ~/.sei/config/app.toml +fi + # Enable Giga Storage: FlatKV SC dual-write + EVM SS split. # When GIGA_STORAGE=true we also default the receipt backend to parquet; callers # can still override this by setting RECEIPT_BACKEND explicitly. # Set GIGA_STORAGE=false to disable. -if [ "$GIGA_STORAGE" = "true" ]; then +# GIGA_MIGRATE_FROM_MEMIAVL takes precedence: if both are set, the memiavl-only +# block above ran first and the test runner is responsible for the migration. +if [ "$GIGA_STORAGE" = "true" ] && [ "$GIGA_MIGRATE_FROM_MEMIAVL" != "true" ]; then RECEIPT_BACKEND=${RECEIPT_BACKEND:-parquet} echo "Enabling Giga Storage for node $NODE_ID..." @@ -36,14 +63,14 @@ if [ "$GIGA_STORAGE" = "true" ]; then # from the memiavl tree via GetChildStoreByName. dual-write keeps memiavl # up-to-date for reads while also populating FlatKV. This mode is for test # clusters only — never deploy to testnet/mainnet. - if grep -q "sc-write-mode" ~/.sei/config/app.toml; then - sed -i 's/sc-write-mode = .*/sc-write-mode = "test_only_dual_write"/' ~/.sei/config/app.toml + if grep -q '^sc-write-mode[[:space:]]*=' ~/.sei/config/app.toml; then + sed -i 's/^sc-write-mode[[:space:]]*=.*/sc-write-mode = "test_only_dual_write"/' ~/.sei/config/app.toml else sed -i '/^\[state-store\]/i sc-write-mode = "test_only_dual_write"' ~/.sei/config/app.toml fi # --- SS layer: enable EVM split --- - sed -i 's/evm-ss-split = .*/evm-ss-split = true/' ~/.sei/config/app.toml + sed -i 's/^evm-ss-split[[:space:]]*=.*/evm-ss-split = true/' ~/.sei/config/app.toml fi # Enable Giga Executor if requested diff --git a/docker/localnode/scripts/step5_start_sei.sh b/docker/localnode/scripts/step5_start_sei.sh index bb02f08354..99c9a4b9b9 100755 --- a/docker/localnode/scripts/step5_start_sei.sh +++ b/docker/localnode/scripts/step5_start_sei.sh @@ -8,6 +8,6 @@ mkdir -p $LOG_DIR echo "Starting the seid process for node $NODE_ID with invariant check interval=$INVARIANT_CHECK_INTERVAL..." -seid start --chain-id sei --inv-check-period ${INVARIANT_CHECK_INTERVAL} > "$LOG_DIR/seid-$NODE_ID.log" 2>&1 & +seid start --chain-id sei --inv-check-period ${INVARIANT_CHECK_INTERVAL} >> "$LOG_DIR/seid-$NODE_ID.log" 2>&1 & echo "Node $NODE_ID seid is started now" echo "Done" >> build/generated/launch.complete diff --git a/integration_test/contracts/deploy_flatkv_evm_fixture.sh b/integration_test/contracts/deploy_flatkv_evm_fixture.sh index b2b914f002..aafcbdf774 100755 --- a/integration_test/contracts/deploy_flatkv_evm_fixture.sh +++ b/integration_test/contracts/deploy_flatkv_evm_fixture.sh @@ -11,6 +11,8 @@ CHAIN_ID=${FLATKV_EVM_FIXTURE_CHAIN_ID:-sei} RECIPIENT_ADDR=${FLATKV_EVM_FIXTURE_RECIPIENT:-0x70997970C51812dc3A010C7d01b50e0d17dc79C8} MISSING_ADDR=${FLATKV_EVM_FIXTURE_MISSING:-0xc1cadaffffffffffffffffffffffffffffffffff} TRANSFER_VALUE_WEI=${FLATKV_EVM_FIXTURE_TRANSFER_VALUE_WEI:-1} +BULK_STORAGE_KEYS=${FLATKV_EVM_BULK_STORAGE_KEYS:-4000} +BULK_STORAGE_KEYS_PER_CONTRACT=${FLATKV_EVM_BULK_STORAGE_KEYS_PER_CONTRACT:-50} KEYRING_ARGS=() if [ -n "${FLATKV_EVM_FIXTURE_KEYRING_BACKEND:-}" ]; then KEYRING_ARGS+=(--keyring-backend "$FLATKV_EVM_FIXTURE_KEYRING_BACKEND") @@ -106,6 +108,39 @@ require_success_receipt() { fi } +write_bulk_storage_contract() { + local output=$1 + local start_slot=$2 + local count=$3 + + python3 - "$output" "$start_slot" "$count" <<'PY' +import sys + +output = sys.argv[1] +start_slot = int(sys.argv[2]) +count = int(sys.argv[3]) + +if start_slot < 0 or count < 0 or start_slot + count > 65536: + raise SystemExit(f"slot range [{start_slot}, {start_slot + count}) is outside PUSH2 range") + +code = bytearray() +for slot in range(start_slot, start_slot + count): + # sstore(slot, slot + 1). Use fixed-width PUSH32/PUSH2 so the emitted + # constructor is deterministic and does not need an assembler dependency. + code.append(0x7F) # PUSH32 + code.extend((slot + 1).to_bytes(32, "big")) + code.append(0x61) # PUSH2 + code.extend(slot.to_bytes(2, "big")) + code.append(0x55) # SSTORE + +# Return empty runtime; the test only needs persisted storage rows. +code.extend(bytes.fromhex("60006000f3")) + +with open(output, "w", encoding="utf-8") as fh: + fh.write(code.hex()) +PY +} + echo "Generating FlatKV EVM historical fixture via $RPC_URL..." wait_for_evm_rpc @@ -198,9 +233,54 @@ missing_storage_expected=$(query_storage "$MISSING_ADDR" "$STORAGE_SLOT_ZERO" "$ write_fixture "flatkv_evm_missing_balance_expected.txt" "$missing_balance_expected" write_fixture "flatkv_evm_missing_storage_expected.txt" "$missing_storage_expected" +write_fixture "flatkv_evm_bulk_storage_keys.txt" "$BULK_STORAGE_KEYS" +if [ "$BULK_STORAGE_KEYS" -gt 0 ]; then + if [ "$BULK_STORAGE_KEYS_PER_CONTRACT" -le 0 ]; then + echo "FLATKV_EVM_BULK_STORAGE_KEYS_PER_CONTRACT must be positive" >&2 + exit 1 + fi + + echo "Deploying bulk storage fixture: total_slots=$BULK_STORAGE_KEYS slots_per_contract=$BULK_STORAGE_KEYS_PER_CONTRACT" + deployed_bulk=0 + while [ "$deployed_bulk" -lt "$BULK_STORAGE_KEYS" ]; do + remaining=$((BULK_STORAGE_KEYS - deployed_bulk)) + batch_size=$BULK_STORAGE_KEYS_PER_CONTRACT + if [ "$remaining" -lt "$batch_size" ]; then + batch_size=$remaining + fi + + bulk_contract_hex_file="/tmp/flatkv_evm_bulk_storage_${deployed_bulk}.hex" + write_bulk_storage_contract "$bulk_contract_hex_file" "$deployed_bulk" "$batch_size" + + if ! bulk_out=$(run_seid tx evm deploy "$bulk_contract_hex_file" \ + --from "$FROM" \ + "${KEYRING_ARGS[@]}" \ + --chain-id "$CHAIN_ID" \ + --evm-rpc "$RPC_URL" \ + -b sync \ + -y 2>&1); then + echo "FlatKV EVM bulk storage deploy command failed at slot offset $deployed_bulk:" >&2 + printf "%s\n" "$bulk_out" >&2 + exit 1 + fi + bulk_tx=$(printf "%s\n" "$bulk_out" | extract_tx_hash || true) + if [ -z "$bulk_tx" ]; then + echo "Failed to extract FlatKV EVM bulk storage tx hash at slot offset $deployed_bulk:" >&2 + printf "%s\n" "$bulk_out" >&2 + exit 1 + fi + bulk_receipt=$(wait_for_receipt "$bulk_tx" 120) + require_success_receipt "bulk storage deployment" "$bulk_receipt" + + deployed_bulk=$((deployed_bulk + batch_size)) + echo " bulk storage slots committed: $deployed_bulk/$BULK_STORAGE_KEYS" + done +fi + latest_height=$(block_number) write_fixture "flatkv_evm_latest_fixture_block_height.txt" "$latest_height" echo "FlatKV EVM fixture generated:" echo " recipient=$RECIPIENT_ADDR balance_height=$balance_height balance=$balance_expected" echo " contract=$contract_addr contract_height=$contract_height storage=$storage_expected" +echo " bulk_storage_keys=$BULK_STORAGE_KEYS" diff --git a/integration_test/contracts/lib/flatkv_migration.sh b/integration_test/contracts/lib/flatkv_migration.sh new file mode 100755 index 0000000000..a5de537701 --- /dev/null +++ b/integration_test/contracts/lib/flatkv_migration.sh @@ -0,0 +1,833 @@ +#!/bin/bash +# +# Shared helpers for docker-level FlatKV migration tests. + +if [ -n "${FLATKV_MIGRATION_LIB_SOURCED:-}" ]; then + return 0 +fi +FLATKV_MIGRATION_LIB_SOURCED=1 + +NODE_COUNT=${MIGRATE_NODE_COUNT:-4} +FLATKV_DIR=${FLATKV_DIR:-/root/.sei/data/state_commit/flatkv} +APP_CONFIG=${APP_CONFIG:-/root/.sei/config/app.toml} +GO_BIN=${GO_BIN:-/usr/local/go/bin/go} + +KEYS_TO_MIGRATE_PER_BLOCK=${MIGRATE_KEYS_PER_BLOCK:-400} +MIN_KEYS_MIGRATED=${MIGRATE_MIN_KEYS_MIGRATED:-0} +STOP_TIMEOUT=${MIGRATE_STOP_TIMEOUT:-30} +RESTART_PROBE_SECS=${MIGRATE_RESTART_PROBE_SECS:-60} +COMPLETION_TIMEOUT=${MIGRATE_COMPLETION_TIMEOUT:-180} +COMPARE_BUFFER=${MIGRATE_COMPARE_BUFFER:-2} +MIN_HEIGHT_AFTER=${MIGRATE_MIN_HEIGHT_AFTER:-5} +PRE_FLIP_SYNC_TIMEOUT=${MIGRATE_PREFLIP_SYNC_TIMEOUT:-120} +PRE_FLIP_SETTLE_BLOCKS=${MIGRATE_PREFLIP_SETTLE_BLOCKS:-2} +PRE_FLIP_STOP_ATTEMPTS=${MIGRATE_PREFLIP_STOP_ATTEMPTS:-5} +FIXTURE_HEIGHT_FILE=${MIGRATE_FIXTURE_HEIGHT_FILE:-integration_test/contracts/flatkv_evm_latest_fixture_block_height.txt} + +dump_node_log() { + local node=$1 + local node_id=${node#sei-node-} + local logfile="/sei-protocol/sei-chain/build/generated/logs/seid-${node_id}.log" + + echo "==================== ${node} seid log error excerpt ====================" >&2 + docker exec "$node" grep -nE '(^Error:|panic:|failed to|failed |version mismatch|FlatKV|migrate)' "$logfile" >&2 2>/dev/null \ + || echo "(no error excerpt found)" >&2 + echo "==================== ${node} seid log (head 220 lines) ====================" >&2 + docker exec "$node" head -220 "$logfile" >&2 2>/dev/null \ + || echo "(could not read ${logfile})" >&2 + echo "==================== ${node} seid log (tail 400 lines) ====================" >&2 + docker exec "$node" tail -400 "$logfile" >&2 2>/dev/null \ + || echo "(could not read ${logfile})" >&2 + echo "==================== ${node} pgrep seid ====================" >&2 + docker exec "$node" pgrep -af "seid" >&2 2>/dev/null \ + || echo "(no seid processes)" >&2 + echo "==================== ${node} docker logs (last 200 lines) ====================" >&2 + docker logs --tail 200 "$node" >&2 || true +} + +dump_all_node_logs() { + for i in $(seq 0 $((NODE_COUNT - 1))); do + dump_node_log "sei-node-$i" + done +} + +node_height() { + docker exec "$1" build/seid status 2>/dev/null \ + | jq -r '.SyncInfo.latest_block_height // "0"' 2>/dev/null \ + || echo 0 +} + +node_logged_committed_height() { + local node=$1 + local node_id=${node#sei-node-} + local logfile="/sei-protocol/sei-chain/build/generated/logs/seid-${node_id}.log" + local host_logfile="build/generated/logs/seid-${node_id}.log" + local height="" + + if [ -f "$host_logfile" ]; then + height=$(grep 'msg="committed state"' "$host_logfile" 2>/dev/null \ + | tail -1 \ + | sed -n 's/.* height=\([0-9][0-9]*\) .*/\1/p' \ + || true) + else + height=$(docker exec "$node" grep 'msg="committed state"' "$logfile" 2>/dev/null \ + | tail -1 \ + | sed -n 's/.* height=\([0-9][0-9]*\) .*/\1/p' \ + || true) + fi + + if [[ "$height" =~ ^[0-9]+$ ]]; then + echo "$height" + else + echo 0 + fi +} + +capture_stopped_heights() { + stopped_heights="" + stopped_min="" + stopped_max="" + for i in $(seq 0 $((NODE_COUNT - 1))); do + node="sei-node-$i" + h=$(node_logged_committed_height "$node") + # node_logged_committed_height only sees in-session log lines. After a + # docker pause (and especially right after a restart) the new log file + # may not yet contain a "committed state" line even though the node has + # already replayed/committed up to a real height. Fall back to the most + # recent RPC sample captured before the pause (saved in PRE_PAUSE_RPC_HEIGHTS) + # to avoid spurious zero readings being mistaken for a split. + if [ "$h" = "0" ] && [ -n "${PRE_PAUSE_RPC_HEIGHTS:-}" ]; then + local rpc_h + rpc_h=$(echo "$PRE_PAUSE_RPC_HEIGHTS" | awk -v n="$node" '{for(i=1;i<=NF;i++){split($i,kv,"=");if(kv[1]==n){print kv[2];exit}}}') + if [[ "$rpc_h" =~ ^[0-9]+$ ]] && [ "$rpc_h" -gt 0 ]; then + h=$rpc_h + fi + fi + stopped_heights="${stopped_heights} ${node}=${h}" + if [ -z "$stopped_min" ] || [ "$h" -lt "$stopped_min" ]; then + stopped_min=$h + fi + if [ -z "$stopped_max" ] || [ "$h" -gt "$stopped_max" ]; then + stopped_max=$h + fi + done +} + +sample_pre_pause_rpc_heights() { + PRE_PAUSE_RPC_HEIGHTS="" + for i in $(seq 0 $((NODE_COUNT - 1))); do + local node="sei-node-$i" + local h + h=$(node_height "$node") + if ! [[ "$h" =~ ^[0-9]+$ ]]; then + h=0 + fi + PRE_PAUSE_RPC_HEIGHTS="${PRE_PAUSE_RPC_HEIGHTS} ${node}=${h}" + done +} + +all_node_heights() { + for i in $(seq 0 $((NODE_COUNT - 1))); do + node_height "sei-node-$i" + done +} + +wait_for_cluster_height_sync() { + local min_height=$1 + local timeout=$2 + local elapsed=0 + local heights min max h summary + + echo "Waiting for all $NODE_COUNT validators to reach height >= $min_height before migration..." >&2 + while [ "$elapsed" -lt "$timeout" ]; do + heights=$(all_node_heights) + min="" + max="" + summary="" + for h in $heights; do + summary="${summary} ${h}" + if [ -z "$min" ] || [ "$h" -lt "$min" ]; then + min=$h + fi + if [ -z "$max" ] || [ "$h" -gt "$max" ]; then + max=$h + fi + done + + if [ -n "$min" ] && [ "$min" -ge "$min_height" ]; then + echo "$min" + return 0 + fi + + echo "Waiting for pre-flip height floor (elapsed=${elapsed}s/${timeout}s):${summary}" >&2 + sleep 1 + elapsed=$((elapsed + 1)) + done + + echo "ERROR: validators did not all reach pre-flip height >= $min_height within ${timeout}s" >&2 + echo "Final pre-flip heights:${summary}" >&2 + dump_all_node_logs + exit 1 +} + +wait_for_height_progress() { + local min_delta=${1:-1} + local timeout=${2:-60} + local start + start=$(node_height "sei-node-0") + local target=$((start + min_delta)) + local elapsed=0 + while [ "$elapsed" -lt "$timeout" ]; do + local h + h=$(node_height "sei-node-0") + if [ "$h" -ge "$target" ]; then + echo "Chain progressed from height $start to $h" + return 0 + fi + sleep 2 + elapsed=$((elapsed + 2)) + done + echo "ERROR: chain did not progress by $min_delta blocks from $start within ${timeout}s" >&2 + dump_all_node_logs + exit 1 +} + +ensure_seidb() { + local node=$1 + if docker exec "$node" bash -lc "test -x /sei-protocol/sei-chain/build/seidb && /sei-protocol/sei-chain/build/seidb migrate-evm-status --help >/dev/null 2>&1" >/dev/null 2>&1; then + return 0 + fi + echo "Building seidb on $node..." + docker exec -e GOPROXY="${GOPROXY:-https://proxy.golang.org,direct}" "$node" bash -lc \ + "cd /sei-protocol/sei-chain && $GO_BIN build -o build/seidb ./sei-db/tools/cmd/seidb" +} + +ensure_all_seidb() { + for i in $(seq 0 $((NODE_COUNT - 1))); do + ensure_seidb "sei-node-$i" + done +} + +extract_status_json() { + awk ' + /^[[:space:]]*\{/ && !in_json { + in_json = 1 + depth = 0 + buf = "" + } + in_json { + buf = buf $0 ORS + line = $0 + depth += gsub(/\{/, "{", line) + depth -= gsub(/\}/, "}", line) + if (depth <= 0) { + last = buf + in_json = 0 + } + } + END { + if (last != "") { + printf "%s", last + } + } + ' +} + +current_sc_write_mode() { + local node=$1 + docker exec "$node" bash -lc "grep -E '^sc-write-mode' '$APP_CONFIG' | tail -1 | awk -F'\"' '{print \$2}'" 2>/dev/null || true +} + +assert_all_nodes_in_mode() { + local expected=$1 + for i in $(seq 0 $((NODE_COUNT - 1))); do + local node="sei-node-$i" + local current_mode + current_mode=$(current_sc_write_mode "$node") + if [ "$current_mode" != "$expected" ]; then + if [ -z "$current_mode" ]; then + echo "ERROR: $node has no sc-write-mode line in $APP_CONFIG" >&2 + else + echo "ERROR: $node is in sc-write-mode='$current_mode'; expected '$expected'" >&2 + fi + exit 1 + fi + done + echo "All $NODE_COUNT nodes confirmed in $expected mode" +} + +fixture_height_floor() { + local fixture_height=0 + if [ -f "$FIXTURE_HEIGHT_FILE" ]; then + fixture_height=$(tail -1 "$FIXTURE_HEIGHT_FILE" | tr -d '[:space:]' || echo 0) + fi + if ! [[ "$fixture_height" =~ ^[0-9]+$ ]]; then + echo "ERROR: fixture height file $FIXTURE_HEIGHT_FILE contains non-numeric value '$fixture_height'" >&2 + exit 1 + fi + echo $((fixture_height + PRE_FLIP_SETTLE_BLOCKS)) +} + +wait_for_all_seid_start() { + local label=$1 + local elapsed=0 + local all_up=false + local down_node="" + + while [ "$elapsed" -lt "$RESTART_PROBE_SECS" ]; do + all_up=true + down_node="" + for i in $(seq 0 $((NODE_COUNT - 1))); do + node="sei-node-$i" + if ! docker exec "$node" pgrep -f "seid start" >/dev/null 2>&1; then + all_up=false + down_node=$node + break + fi + done + if $all_up; then + return 0 + fi + sleep 2 + elapsed=$((elapsed + 2)) + done + + echo "ERROR: not all validators ${label} within ${RESTART_PROBE_SECS}s (last down: ${down_node})" >&2 + dump_all_node_logs + exit 1 +} + +coordinated_stop_at_common_height() { + local restart_mode_label=${1:-current mode} + local min_height=${2:-$(fixture_height_floor)} + + PRE_FLIP_HEIGHT=$(wait_for_cluster_height_sync "$min_height" "$PRE_FLIP_SYNC_TIMEOUT" | tail -1) + echo "Pre-flip height floor reached across all $NODE_COUNT validators: $PRE_FLIP_HEIGHT (min_height=$min_height)" + + stopped_heights="" + stopped_min="" + stopped_max="" + stopped_consistent=false + for attempt in $(seq 1 "$PRE_FLIP_STOP_ATTEMPTS"); do + echo "Freezing validators before migration stop (attempt ${attempt}/${PRE_FLIP_STOP_ATTEMPTS})..." + # Sample heights via RPC before pausing so capture_stopped_heights has a + # fallback when the in-session log has not yet flushed a "committed state" + # line for every node (typical right after a coordinated restart). + sample_pre_pause_rpc_heights + for i in $(seq 0 $((NODE_COUNT - 1))); do + docker pause "sei-node-$i" >/dev/null 2>&1 || true & + done + wait + + capture_stopped_heights + echo "Frozen validator committed heights:${stopped_heights}" + + if [ -n "$stopped_min" ] && [ "$stopped_min" = "$stopped_max" ]; then + echo "Stopping all $NODE_COUNT validators from frozen height $stopped_min..." + for i in $(seq 0 $((NODE_COUNT - 1))); do + node="sei-node-$i" + ( + docker unpause "$node" >/dev/null 2>&1 || true + docker exec "$node" pkill -TERM -f "seid start" >/dev/null 2>&1 || true + ) & + done + wait + + local elapsed=0 + local all_dead=false + while [ "$elapsed" -lt "$STOP_TIMEOUT" ]; do + all_dead=true + for i in $(seq 0 $((NODE_COUNT - 1))); do + node="sei-node-$i" + if docker exec "$node" pgrep -f "seid start" >/dev/null 2>&1; then + all_dead=false + break + fi + done + if $all_dead; then break; fi + sleep 2 + elapsed=$((elapsed + 2)) + done + if ! $all_dead; then + echo "ERROR: not all validators stopped within ${STOP_TIMEOUT}s" >&2 + dump_all_node_logs + exit 1 + fi + echo "All $NODE_COUNT validators confirmed stopped" + + capture_stopped_heights + echo "Stopped validator committed heights:${stopped_heights}" + + if [ -n "$stopped_min" ] && [ "$stopped_min" = "$stopped_max" ]; then + stopped_consistent=true + break + fi + + echo "Validators committed an extra block during shutdown; restarting in ${restart_mode_label} before retry:${stopped_heights}" >&2 + if [ "$attempt" -eq "$PRE_FLIP_STOP_ATTEMPTS" ]; then + break + fi + + for i in $(seq 0 $((NODE_COUNT - 1))); do + docker exec -d -e "ID=${i}" "sei-node-$i" /usr/bin/start_sei.sh + done + wait_for_all_seid_start "restarted in ${restart_mode_label}" + + PRE_FLIP_HEIGHT=$(wait_for_cluster_height_sync "$stopped_max" "$PRE_FLIP_SYNC_TIMEOUT" | tail -1) + echo "Pre-flip height floor restored across all $NODE_COUNT validators after shutdown drift: $PRE_FLIP_HEIGHT" + continue + fi + + echo "Validators froze at split heights; unpausing to let laggards converge before retry:${stopped_heights}" >&2 + for i in $(seq 0 $((NODE_COUNT - 1))); do + docker unpause "sei-node-$i" >/dev/null 2>&1 || true & + done + wait + + if [ "$attempt" -eq "$PRE_FLIP_STOP_ATTEMPTS" ]; then + break + fi + PRE_FLIP_HEIGHT=$(wait_for_cluster_height_sync "$stopped_max" "$PRE_FLIP_SYNC_TIMEOUT" | tail -1) + echo "Pre-flip height floor restored across all $NODE_COUNT validators: $PRE_FLIP_HEIGHT" + done + + if ! $stopped_consistent; then + echo "ERROR: validators could not be stopped at a common committed height; refusing to flip sc-write-mode" >&2 + echo "Split final stopped heights:${stopped_heights}" >&2 + dump_all_node_logs + exit 1 + fi +} + +flip_sc_write_mode() { + local mode=$1 + local batch=${2:-$KEYS_TO_MIGRATE_PER_BLOCK} + for i in $(seq 0 $((NODE_COUNT - 1))); do + node="sei-node-$i" + docker exec "$node" bash -c " + sed -i 's/^sc-write-mode = .*/sc-write-mode = \"${mode}\"/' '$APP_CONFIG' + if grep -q '^sc-keys-to-migrate-per-block' '$APP_CONFIG'; then + sed -i 's/^sc-keys-to-migrate-per-block = .*/sc-keys-to-migrate-per-block = ${batch}/' '$APP_CONFIG' + else + sed -i '/^sc-write-mode/a sc-keys-to-migrate-per-block = ${batch}' '$APP_CONFIG' + fi + " + done + echo "Flipped sc-write-mode to ${mode} on all $NODE_COUNT nodes (batch=$batch)" + + local written_mode + written_mode=$(current_sc_write_mode "sei-node-0") + if [ "$written_mode" != "$mode" ]; then + echo "ERROR: sei-node-0 app.toml has sc-write-mode='$written_mode' after rewrite; expected '$mode'" >&2 + exit 1 + fi +} + +coordinated_restart_with_settle() { + local mode=$1 + for i in $(seq 0 $((NODE_COUNT - 1))); do + node="sei-node-$i" + docker exec -d -e "ID=${i}" "$node" /usr/bin/start_sei.sh + done + wait_for_all_seid_start "restarted in ${mode}" + + sleep 5 + local failed=false + for i in $(seq 0 $((NODE_COUNT - 1))); do + node="sei-node-$i" + if ! docker exec "$node" pgrep -f "seid start" >/dev/null 2>&1; then + echo "ERROR: $node died within 5s of restart in ${mode}" >&2 + failed=true + fi + done + if $failed; then + dump_all_node_logs + exit 1 + fi + echo "All $NODE_COUNT validators restarted in ${mode} mode and survived a 5s settle" +} + +wait_for_migrate_status_complete() { + local timeout=${1:-$COMPLETION_TIMEOUT} + ensure_all_seidb + + local elapsed=0 + local all_done=false + while [ "$elapsed" -lt "$timeout" ]; do + all_done=true + status_summary="" + for i in $(seq 0 $((NODE_COUNT - 1))); do + node="sei-node-$i" + status_err="/tmp/${node}-migrate-evm-status.err" + raw_status=$(docker exec "$node" bash -lc \ + "build/seidb migrate-evm-status --db-dir '$FLATKV_DIR' 2>$status_err" || true) + status_json=$(printf '%s\n' "$raw_status" | extract_status_json) + if [ -z "$status_json" ]; then + status_json='{}' + fi + complete=$(echo "$status_json" | jq -r '.migrate_evm_complete // false' 2>/dev/null || echo false) + version_at=$(echo "$status_json" | jq -r '.version_at // 0' 2>/dev/null || echo 0) + migration_version=$(echo "$status_json" | jq -r '.migration_version // 0' 2>/dev/null || echo 0) + boundary_present=$(echo "$status_json" | jq -r '.boundary_present // false' 2>/dev/null || echo false) + height=$(node_height "$node") + status_summary="$status_summary ${node}=${complete}@mv${migration_version}/v${version_at}/h${height}/boundary=${boundary_present}" + if [ "$complete" != "true" ]; then + all_done=false + fi + if [ "$i" -eq 0 ] && [ $((elapsed % 30)) -eq 0 ]; then + echo "migrate-evm-status raw ${node}: ${raw_status}" + echo "migrate-evm-status json ${node}: ${status_json}" + docker exec "$node" bash -lc "if [ -s '$status_err' ]; then echo 'migrate-evm-status stderr ${node}:'; cat '$status_err'; fi" || true + fi + done + if $all_done; then + echo "All $NODE_COUNT validators completed EVM migration:$status_summary" + return 0 + fi + echo "Waiting for EVM migration completion (elapsed=${elapsed}s/${timeout}s):$status_summary" + sleep 5 + elapsed=$((elapsed + 5)) + done + + echo "ERROR: EVM migration did not complete within ${timeout}s on all validators" >&2 + echo "Final migrate-evm-status diagnostics:" + for i in $(seq 0 $((NODE_COUNT - 1))); do + node="sei-node-$i" + status_err="/tmp/${node}-migrate-evm-status-final.err" + raw_status=$(docker exec "$node" bash -lc \ + "build/seidb migrate-evm-status --db-dir '$FLATKV_DIR' 2>$status_err" || true) + status_json=$(printf '%s\n' "$raw_status" | extract_status_json) + echo "final migrate-evm-status raw ${node}: ${raw_status}" + echo "final migrate-evm-status json ${node}: ${status_json:-{}}" + docker exec "$node" bash -lc "if [ -s '$status_err' ]; then echo 'final migrate-evm-status stderr ${node}:'; cat '$status_err'; fi" || true + done + dump_all_node_logs + exit 1 +} + +wait_for_migration_in_progress() { + local timeout=${1:-60} + ensure_all_seidb + local elapsed=0 + while [ "$elapsed" -lt "$timeout" ]; do + raw_status=$(docker exec "sei-node-0" bash -lc \ + "build/seidb migrate-evm-status --db-dir '$FLATKV_DIR'" || true) + status_json=$(printf '%s\n' "$raw_status" | extract_status_json) + boundary_present=$(echo "${status_json:-{}}" | jq -r '.boundary_present // false' 2>/dev/null || echo false) + complete=$(echo "${status_json:-{}}" | jq -r '.migrate_evm_complete // false' 2>/dev/null || echo false) + if [ "$boundary_present" = "true" ] && [ "$complete" != "true" ]; then + echo "EVM migration is in progress on sei-node-0" + return 0 + fi + sleep 2 + elapsed=$((elapsed + 2)) + done + echo "ERROR: EVM migration did not enter in-progress state within ${timeout}s" >&2 + dump_all_node_logs + exit 1 +} + +print_migration_summaries() { + echo "==================== migration completion summaries ====================" + local failed=false + for i in $(seq 0 $((NODE_COUNT - 1))); do + local node="sei-node-$i" + local logfile="/sei-protocol/sei-chain/build/generated/logs/seid-${i}.log" + local summary="" + local keys_migrated="" + for _ in $(seq 1 10); do + summary=$(docker exec "$node" grep -n 'msg="migration complete"' "$logfile" 2>/dev/null | tail -1 || true) + if [ -n "$summary" ]; then + break + fi + sleep 1 + done + + echo "-------------------- ${node} migration summary --------------------" + if [ -z "$summary" ]; then + echo "ERROR: ${node} did not print migration complete summary in ${logfile}" >&2 + failed=true + else + echo "$summary" + keys_migrated=$(printf "%s\n" "$summary" | sed -n 's/.*keysMigrated=\([0-9][0-9]*\).*/\1/p') + if [ "$MIN_KEYS_MIGRATED" -gt 0 ]; then + if [ -z "$keys_migrated" ]; then + echo "ERROR: ${node} migration summary did not include keysMigrated" >&2 + failed=true + elif [ "$keys_migrated" -lt "$MIN_KEYS_MIGRATED" ]; then + echo "ERROR: ${node} migrated only ${keys_migrated} keys; expected at least ${MIN_KEYS_MIGRATED}" >&2 + failed=true + fi + fi + fi + done + + if $failed; then + dump_all_node_logs + exit 1 + fi +} + +wait_for_post_migration_progress() { + local timeout=${1:-60} + local elapsed=0 + while [ "$elapsed" -lt "$timeout" ]; do + base=$(node_height "sei-node-0") + if [ "$base" -ge "$MIN_HEIGHT_AFTER" ]; then + return 0 + fi + echo "Waiting for post-migration chain progress (h=$base, want >= $MIN_HEIGHT_AFTER)" + sleep 2 + elapsed=$((elapsed + 2)) + done +} + +pick_compare_height() { + local min="" + for i in $(seq 0 $((NODE_COUNT - 1))); do + local h + h=$(node_height "sei-node-$i") + if [ -z "$min" ] || [ "$h" -lt "$min" ]; then + min=$h + fi + done + if [ -z "$min" ] || [ "$min" -le "$COMPARE_BUFFER" ]; then + echo 1 + return + fi + echo $((min - COMPARE_BUFFER)) +} + +flatkv_dump_digest() { + local node=$1 + local version=$2 + shift 2 + local buckets=("$@") + if [ "${#buckets[@]}" -eq 0 ]; then + buckets=(account code storage) + fi + local bucket_args="" + local out_dir="/tmp/flatkv-migrate-${version}-${node}" + for bucket in "${buckets[@]}"; do + bucket_args="$bucket_args \"${out_dir}/${bucket}\"" + done + + docker exec "$node" bash -lc " + set -euo pipefail + out_dir=${out_dir} + rm -rf \"\$out_dir\" && mkdir -p \"\$out_dir\" + cd /sei-protocol/sei-chain + build/seidb dump-flatkv \ + --db-dir $FLATKV_DIR \ + --output-dir \"\$out_dir\" \ + --height $version > /dev/null + tail -q -n +2 $bucket_args 2>/dev/null | sha256sum | cut -d' ' -f1 + " +} + +assert_cross_validator_flatkv_digest() { + local compare_version=${1:-} + shift || true + local buckets=("$@") + if [ -z "$compare_version" ]; then + wait_for_post_migration_progress 60 + compare_version=$(pick_compare_height) + fi + + echo "Comparing FlatKV digests across $NODE_COUNT validators at chain height $compare_version (buckets: ${buckets[*]:-account code storage})" + local reference_digest="" + local reference_node="" + local mismatch=false + for i in $(seq 0 $((NODE_COUNT - 1))); do + node="sei-node-$i" + digest=$(flatkv_dump_digest "$node" "$compare_version" "${buckets[@]}") + echo " ${node} sha256 = $digest" + if [ -z "$reference_digest" ]; then + reference_digest="$digest" + reference_node="$node" + continue + fi + if [ "$digest" != "$reference_digest" ]; then + echo "FAIL: ${node} diverges from ${reference_node} at height $compare_version" >&2 + mismatch=true + fi + done + + if $mismatch; then + dump_all_node_logs + exit 1 + fi +} + +perform_migration_flip() { + local from_mode=$1 + local to_mode=$2 + local batch=${3:-$KEYS_TO_MIGRATE_PER_BLOCK} + local min_height=${4:-$(fixture_height_floor)} + + assert_all_nodes_in_mode "$from_mode" + coordinated_stop_at_common_height "$from_mode" "$min_height" + flip_sc_write_mode "$to_mode" "$batch" + coordinated_restart_with_settle "$to_mode" + wait_for_migrate_status_complete +} + +run_v0_to_v1_migration() { + local batch=${1:-$KEYS_TO_MIGRATE_PER_BLOCK} + perform_migration_flip "memiavl_only" "migrate_evm" "$batch" +} + +kill_node() { + local node=$1 + docker exec "$node" pkill -KILL -f "seid start" >/dev/null 2>&1 || true +} + +restart_node() { + local node=$1 + local id=${node#sei-node-} + docker exec -d -e "ID=${id}" "$node" /usr/bin/start_sei.sh +} + +assert_all_nodes_alive() { + local failed=false + for i in $(seq 0 $((NODE_COUNT - 1))); do + node="sei-node-$i" + if ! docker exec "$node" pgrep -f "seid start" >/dev/null 2>&1; then + echo "ERROR: $node is not running" >&2 + failed=true + fi + done + if $failed; then + dump_all_node_logs + exit 1 + fi +} + +# block_id_hash_at_height returns the Tendermint block_id.hash for a given +# committed height on a single node, or empty if not available. +block_id_hash_at_height() { + local node=$1 + local height=$2 + docker exec "$node" bash -lc \ + "curl -sf 'http://localhost:26657/block?height=${height}' 2>/dev/null | jq -r '.result.block_id.hash // .block_id.hash // empty'" 2>/dev/null \ + || echo "" +} + +# app_hash_at_height returns the Tendermint header.app_hash for a given +# committed height on a single node, or empty if not available. +app_hash_at_height() { + local node=$1 + local height=$2 + docker exec "$node" bash -lc \ + "curl -sf 'http://localhost:26657/block?height=${height}' 2>/dev/null | jq -r '.result.block.header.app_hash // .block.header.app_hash // empty'" 2>/dev/null \ + || echo "" +} + +# assert_cross_validator_block_hashes verifies that every height in +# [from_height, to_height] is present on every validator and that all +# validators agree on block_id.hash for that height. Catches: +# - silent block-loss on a validator (missing or null hash), +# - transient consensus divergence that recovered post-mortem +# (different block_id at the same committed height). +assert_cross_validator_block_hashes() { + local from=$1 + local to=$2 + local mismatch=false + local total=$(( to - from + 1 )) + if [ "$total" -le 0 ]; then + echo "assert_cross_validator_block_hashes: no heights in range $from..$to" + return 0 + fi + echo "Comparing block hashes across $NODE_COUNT validators for heights $from..$to ($total heights)" + for h in $(seq "$from" "$to"); do + local ref="" + local ref_node="" + for i in $(seq 0 $((NODE_COUNT - 1))); do + node="sei-node-$i" + hash=$(block_id_hash_at_height "$node" "$h") + if [ -z "$hash" ] || [ "$hash" = "null" ]; then + echo "FAIL: ${node} missing block at height $h" >&2 + mismatch=true + continue + fi + if [ -z "$ref" ]; then + ref="$hash" + ref_node="$node" + continue + fi + if [ "$hash" != "$ref" ]; then + echo "FAIL: ${node} block_id $hash != ${ref_node} $ref at height $h" >&2 + mismatch=true + fi + done + done + if $mismatch; then + dump_all_node_logs + exit 1 + fi + echo "PASS: 4-node block_id equal across heights $from..$to" +} + +# assert_no_panics_in_logs scans validator logs for a real Go panic -- +# either one Cosmos recovered at the ProcessBlock / FinalizeBlocker +# boundary (which would still halt consensus on that height) or an +# unrecovered runtime panic that crashes the process. +# +# Patterns: +# - 'level=(ERROR|FATAL) msg="panic recovered' -- the marker the +# cosmos baseapp / sei app logs after a deferred recover() catches +# a panic in block execution. This is the bug class that the +# post-migration EVM iterator regression (Cody review issue 1) +# surfaces as; it does NOT kill the validator process but does +# halt consensus on that block, so assert_all_nodes_alive alone +# would miss it. +# - '^panic: ' -- the Go runtime's +# own panic printer, only present for unrecovered fatal panics. +# +# Patterns are deliberately anchored / structured so they don't match +# the many INF lines that happen to contain the word "panic" inside a +# message payload (e.g. config dumps, error strings). +assert_no_panics_in_logs() { + local label=${1:-"migration window"} + local pattern='level=(ERROR|FATAL) msg="panic recovered|^panic: ' + local failed=false + for i in $(seq 0 $((NODE_COUNT - 1))); do + local node="sei-node-$i" + local logfile="/sei-protocol/sei-chain/build/generated/logs/seid-${i}.log" + if docker exec "$node" grep -nE "$pattern" "$logfile" >/dev/null 2>&1; then + echo "FAIL: ${node} log shows panic during ${label}" >&2 + docker exec "$node" grep -nE "$pattern" "$logfile" 2>/dev/null | head -20 >&2 || true + failed=true + fi + done + if $failed; then + dump_all_node_logs + exit 1 + fi + echo "PASS: no panic in any validator log during ${label}" +} + +# graceful_stop_node sends SIGTERM and waits up to `timeout` seconds for +# the process to exit on its own. Unlike kill_node (SIGKILL), this lets +# the validator finish in-flight commits, flush pebble/memiavl, and +# release file locks cleanly. Errors loudly if the process does not exit +# within the window so a hang regression cannot silently degrade into +# the SIGKILL path. +graceful_stop_node() { + local node=$1 + local timeout=${2:-30} + docker exec "$node" pkill -TERM -f "seid start" >/dev/null 2>&1 || true + local elapsed=0 + while [ "$elapsed" -lt "$timeout" ]; do + if ! docker exec "$node" pgrep -f "seid start" >/dev/null 2>&1; then + echo "${node} stopped gracefully after ${elapsed}s" + return 0 + fi + sleep 2 + elapsed=$((elapsed + 2)) + done + echo "ERROR: ${node} did not exit on SIGTERM within ${timeout}s" >&2 + dump_node_log "$node" + exit 1 +} + diff --git a/integration_test/contracts/verify_flatkv_evm_migrate.sh b/integration_test/contracts/verify_flatkv_evm_migrate.sh new file mode 100755 index 0000000000..1b282b8603 --- /dev/null +++ b/integration_test/contracts/verify_flatkv_evm_migrate.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# +# Drives the operator-style v0 -> v1 FlatKV EVM migration: +# memiavl_only -> migrate_evm. + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=integration_test/contracts/lib/flatkv_migration.sh +source "$SCRIPT_DIR/lib/flatkv_migration.sh" + +MIN_KEYS_MIGRATED=${MIGRATE_MIN_KEYS_MIGRATED:-3500} + +echo "verify_flatkv_evm_migrate: node_count=$NODE_COUNT" + +run_v0_to_v1_migration "$KEYS_TO_MIGRATE_PER_BLOCK" +print_migration_summaries +assert_cross_validator_flatkv_digest "" account code storage + +echo "PASS: FlatKV EVM migrate completed on all $NODE_COUNT validators and FlatKV EVM digests agree" diff --git a/integration_test/contracts/verify_flatkv_evm_store.sh b/integration_test/contracts/verify_flatkv_evm_store.sh index ea43133887..6b19ceddd9 100755 --- a/integration_test/contracts/verify_flatkv_evm_store.sh +++ b/integration_test/contracts/verify_flatkv_evm_store.sh @@ -20,6 +20,34 @@ fi rm -rf "$dump_dir" mkdir -p "$dump_dir" +dump_flatkv_layout() { + # Snapshot the on-disk state of $flatkv_dir for triage. dump-flatkv fails + # opaquely ("clone aborted after 3 retries...") when, for example, the + # live node never wrote a snapshot (=> no `current` symlink) or wrote it + # to a different path than $flatkv_dir. Printing the layout removes that + # ambiguity from the next CI run. + echo "==================== app.toml FlatKV-related settings ====================" >&2 + grep -E '^(sc-write-mode|sc-keys-to-migrate-per-block|evm-ss-split)' \ + /root/.sei/config/app.toml >&2 2>/dev/null || true + for candidate in "$flatkv_dir" /root/.sei/data/flatkv; do + echo "==================== FlatKV directory state at $candidate ====================" >&2 + if [ ! -d "$candidate" ]; then + echo "(directory $candidate does not exist)" >&2 + else + ls -la "$candidate" >&2 || true + for snap in "$candidate"/snapshot-*; do + [ -d "$snap" ] || continue + echo "---- $snap ----" >&2 + ls -la "$snap" >&2 || true + done + fi + done + local seid_log="/sei-protocol/sei-chain/build/generated/logs/seid-0.log" + echo "==================== seid-0.log (head 60) ====================" >&2 + head -60 "$seid_log" >&2 2>/dev/null || echo "(no seid-0.log)" >&2 +} +trap 'rc=$?; if [ "$rc" -ne 0 ]; then dump_flatkv_layout; fi; exit $rc' EXIT + echo "Dumping FlatKV storage bucket from $flatkv_dir..." build/seidb dump-flatkv --db-dir "$flatkv_dir" --output-dir "$dump_dir" --bucket storage diff --git a/integration_test/contracts/verify_flatkv_migrate_apphash_continuity.sh b/integration_test/contracts/verify_flatkv_migrate_apphash_continuity.sh new file mode 100755 index 0000000000..7ea81c1bc7 --- /dev/null +++ b/integration_test/contracts/verify_flatkv_migrate_apphash_continuity.sh @@ -0,0 +1,64 @@ +#!/bin/bash +# +# Run the v0 -> v1 FlatKV EVM migration and assert chain-level continuity: +# 1. block_id.hash equal on every validator for every height in +# [pre_flip, post_complete] -- catches transient consensus divergence +# that recovered before the digest snapshot, +# 2. no panic/fatal lines in any validator log during the window, +# 3. FlatKV digests agree across validators. +# +# This is the per-block AppHash continuity sibling of +# verify_flatkv_evm_migrate.sh: that script only checks the post-migration +# digest, this one also checks that no block in between disagreed across +# validators. + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=integration_test/contracts/lib/flatkv_migration.sh +source "$SCRIPT_DIR/lib/flatkv_migration.sh" + +echo "verify_flatkv_migrate_apphash_continuity: node_count=$NODE_COUNT" + +assert_all_nodes_in_mode "memiavl_only" + +pre_flip_height=$(node_height "sei-node-0") +echo "Pre-flip reference height: $pre_flip_height" + +run_v0_to_v1_migration "$KEYS_TO_MIGRATE_PER_BLOCK" +print_migration_summaries + +# Give the cluster a couple of post-completion blocks so the window we +# check includes the boundary-latched commits as well. +wait_for_height_progress 2 60 + +# Pick the upper bound from the slowest validator so every height in +# the range is guaranteed to be present on every node. Otherwise nodes +# that lag by 1-2 blocks would return null for the most recent heights +# and trip a false "missing block" failure. +post_height=$(pick_compare_height) +echo "Post-migration cross-validator floor height: $post_height" + +if [ "$post_height" -le "$pre_flip_height" ]; then + echo "FAIL: chain did not advance during migration window ($pre_flip_height -> $post_height)" >&2 + dump_all_node_logs + exit 1 +fi + +# Hash polling is one HTTP round-trip per (node, height); cap the window +# size to keep this comfortably under a CI step's wall clock even if the +# fixture/migration ran for many blocks. We always include the entire +# flip-through-completion window; only the tail past that gets trimmed. +MAX_WINDOW=${APPHASH_CONTINUITY_MAX_HEIGHTS:-400} +window_lo=$pre_flip_height +window_hi=$post_height +if [ $(( window_hi - window_lo + 1 )) -gt "$MAX_WINDOW" ]; then + window_lo=$(( window_hi - MAX_WINDOW + 1 )) + echo "Truncating apphash window to last $MAX_WINDOW heights ($window_lo..$window_hi)" +fi + +assert_cross_validator_block_hashes "$window_lo" "$window_hi" +assert_no_panics_in_logs "v0->v1 migration window" +assert_cross_validator_flatkv_digest "" account code storage + +echo "PASS: FlatKV EVM v0->v1 migration completed with cross-validator block continuity" diff --git a/integration_test/contracts/verify_flatkv_migrate_batch_edge.sh b/integration_test/contracts/verify_flatkv_migrate_batch_edge.sh new file mode 100755 index 0000000000..5dd19a098b --- /dev/null +++ b/integration_test/contracts/verify_flatkv_migrate_batch_edge.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# +# Exercise the v0 -> v1 FlatKV EVM migration at batch-size extremes: +# +# MIGRATE_BATCH_EDGE=min -- batch=1 keys/block, forces the +# per-key boundary-advance path to run +# on every block until the drain completes. +# MIGRATE_BATCH_EDGE=oneshot -- batch >> fixture key count, forces the +# whole drain into a single block so +# boundary latch + completion-on-first- +# batch transitions get exercised. +# +# Either extreme is a stricter probe of MigrationManager's per-block +# accounting than the default batch size used by verify_flatkv_evm_migrate.sh. + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=integration_test/contracts/lib/flatkv_migration.sh +source "$SCRIPT_DIR/lib/flatkv_migration.sh" + +mode=${MIGRATE_BATCH_EDGE:-} +case "$mode" in + min) + batch=${MIGRATE_BATCH_EDGE_MIN:-1} + ;; + oneshot) + batch=${MIGRATE_BATCH_EDGE_ONESHOT:-100000} + ;; + "") + echo "MIGRATE_BATCH_EDGE must be set to 'min' or 'oneshot'" >&2 + exit 2 + ;; + *) + echo "Unknown MIGRATE_BATCH_EDGE=$mode (expected min|oneshot)" >&2 + exit 2 + ;; +esac + +echo "verify_flatkv_migrate_batch_edge: mode=$mode batch=$batch node_count=$NODE_COUNT" + +run_v0_to_v1_migration "$batch" +print_migration_summaries +assert_all_nodes_alive +assert_no_panics_in_logs "batch-edge migration ($mode)" +assert_cross_validator_flatkv_digest "" account code storage + +echo "PASS: FlatKV EVM v0->v1 migration completed at batch=$batch ($mode)" diff --git a/integration_test/contracts/verify_flatkv_migrate_graceful_restart.sh b/integration_test/contracts/verify_flatkv_migrate_graceful_restart.sh new file mode 100755 index 0000000000..11cd3e1825 --- /dev/null +++ b/integration_test/contracts/verify_flatkv_migrate_graceful_restart.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# +# Start v0 -> v1 migration, send SIGTERM (not SIGKILL) to one validator +# while the boundary is in-flight, wait for it to exit cleanly, then +# restart and require deterministic completion. +# +# Counterpart to verify_flatkv_migrate_resume_after_crash.sh: that +# script proves SIGKILL-during-migration is recoverable; this one +# proves the graceful-stop path -- the one operators will actually +# use during scheduled restarts -- has the same end state and does +# not leak partial commits / locked DB handles into the next start. + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=integration_test/contracts/lib/flatkv_migration.sh +source "$SCRIPT_DIR/lib/flatkv_migration.sh" + +VICTIM_NODE=${MIGRATE_GRACEFUL_VICTIM_NODE:-sei-node-1} +GRACEFUL_BATCH=${MIGRATE_GRACEFUL_KEYS_PER_BLOCK:-25} +SHUTDOWN_TIMEOUT=${MIGRATE_GRACEFUL_SHUTDOWN_TIMEOUT:-45} + +echo "verify_flatkv_migrate_graceful_restart: victim=$VICTIM_NODE batch=$GRACEFUL_BATCH" + +assert_all_nodes_in_mode "memiavl_only" +coordinated_stop_at_common_height "memiavl_only" "$(fixture_height_floor)" +flip_sc_write_mode "migrate_evm" "$GRACEFUL_BATCH" +coordinated_restart_with_settle "migrate_evm" + +wait_for_migration_in_progress 90 +echo "Sending SIGTERM to $VICTIM_NODE mid-migration" +graceful_stop_node "$VICTIM_NODE" "$SHUTDOWN_TIMEOUT" +sleep 3 +restart_node "$VICTIM_NODE" +wait_for_all_seid_start "after mid-migration SIGTERM restart" + +wait_for_migrate_status_complete +assert_all_nodes_alive +assert_no_panics_in_logs "graceful restart window" +assert_cross_validator_flatkv_digest "" account code storage + +echo "PASS: FlatKV EVM migration resumed deterministically after $VICTIM_NODE graceful restart" diff --git a/integration_test/contracts/verify_flatkv_migrate_idle.sh b/integration_test/contracts/verify_flatkv_migrate_idle.sh new file mode 100755 index 0000000000..8692cd64f2 --- /dev/null +++ b/integration_test/contracts/verify_flatkv_migrate_idle.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# +# Run v0 -> v1 with no required caller workload. The migration must still +# complete on idle blocks, which pins empty-changeset forwarding at the +# docker level. + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=integration_test/contracts/lib/flatkv_migration.sh +source "$SCRIPT_DIR/lib/flatkv_migration.sh" + +echo "verify_flatkv_migrate_idle: node_count=$NODE_COUNT" + +MIN_KEYS_MIGRATED=${MIGRATE_MIN_KEYS_MIGRATED:-0} +FIXTURE_HEIGHT_FILE=/tmp/nonexistent-flatkv-idle-fixture-height +run_v0_to_v1_migration "${MIGRATE_IDLE_KEYS_PER_BLOCK:-1}" +assert_all_nodes_alive +assert_cross_validator_flatkv_digest "" account code storage + +echo "PASS: FlatKV EVM migration completed on idle blocks" diff --git a/integration_test/contracts/verify_flatkv_migrate_resume_after_crash.sh b/integration_test/contracts/verify_flatkv_migrate_resume_after_crash.sh new file mode 100755 index 0000000000..e4942ee76f --- /dev/null +++ b/integration_test/contracts/verify_flatkv_migrate_resume_after_crash.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# +# Start v0 -> v1 migration, SIGKILL one validator while the boundary is +# in-flight, then restart and require deterministic completion. + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=integration_test/contracts/lib/flatkv_migration.sh +source "$SCRIPT_DIR/lib/flatkv_migration.sh" + +VICTIM_NODE=${MIGRATE_CRASH_VICTIM_NODE:-sei-node-1} +CRASH_BATCH=${MIGRATE_CRASH_KEYS_PER_BLOCK:-25} + +echo "verify_flatkv_migrate_resume_after_crash: victim=$VICTIM_NODE batch=$CRASH_BATCH" + +assert_all_nodes_in_mode "memiavl_only" +coordinated_stop_at_common_height "memiavl_only" "$(fixture_height_floor)" +flip_sc_write_mode "migrate_evm" "$CRASH_BATCH" +coordinated_restart_with_settle "migrate_evm" + +wait_for_migration_in_progress 90 +echo "Killing $VICTIM_NODE mid-migration" +kill_node "$VICTIM_NODE" +sleep 5 +restart_node "$VICTIM_NODE" +wait_for_all_seid_start "after mid-migration SIGKILL restart" + +wait_for_migrate_status_complete +assert_all_nodes_alive +assert_cross_validator_flatkv_digest "" account code storage + +echo "PASS: FlatKV EVM migration resumed deterministically after $VICTIM_NODE crash" diff --git a/integration_test/contracts/verify_flatkv_migrate_statesync.sh b/integration_test/contracts/verify_flatkv_migrate_statesync.sh new file mode 100755 index 0000000000..9e72bacfb4 --- /dev/null +++ b/integration_test/contracts/verify_flatkv_migrate_statesync.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# +# Start v0 -> v1 migration and exercise state-sync recovery while the +# cluster is in the migration window. + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=integration_test/contracts/lib/flatkv_migration.sh +source "$SCRIPT_DIR/lib/flatkv_migration.sh" + +STATESYNC_BATCH=${MIGRATE_STATESYNC_KEYS_PER_BLOCK:-25} + +echo "verify_flatkv_migrate_statesync: batch=$STATESYNC_BATCH" + +assert_all_nodes_in_mode "memiavl_only" +coordinated_stop_at_common_height "memiavl_only" "$(fixture_height_floor)" +flip_sc_write_mode "migrate_evm" "$STATESYNC_BATCH" +coordinated_restart_with_settle "migrate_evm" +wait_for_migration_in_progress 90 + +"$SCRIPT_DIR/verify_flatkv_statesync_crash_recovery.sh" + +wait_for_migrate_status_complete +assert_all_nodes_alive +assert_cross_validator_flatkv_digest "" account code storage + +echo "PASS: FlatKV statesync recovery succeeded around EVM migration" diff --git a/integration_test/contracts/verify_flatkv_migrate_with_live_workload.sh b/integration_test/contracts/verify_flatkv_migrate_with_live_workload.sh new file mode 100755 index 0000000000..f43fff1d70 --- /dev/null +++ b/integration_test/contracts/verify_flatkv_migrate_with_live_workload.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# +# Run EVM traffic while v0 -> v1 migration is actively draining keys. + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=integration_test/contracts/lib/flatkv_migration.sh +source "$SCRIPT_DIR/lib/flatkv_migration.sh" + +LIVE_BATCH=${MIGRATE_LIVE_KEYS_PER_BLOCK:-25} +LIVE_TIMEOUT=${MIGRATE_LIVE_WORKLOAD_SECS:-45} + +echo "verify_flatkv_migrate_with_live_workload: batch=$LIVE_BATCH workload_secs=$LIVE_TIMEOUT" + +assert_all_nodes_in_mode "memiavl_only" +coordinated_stop_at_common_height "memiavl_only" "$(fixture_height_floor)" +flip_sc_write_mode "migrate_evm" "$LIVE_BATCH" +coordinated_restart_with_settle "migrate_evm" +wait_for_migration_in_progress 90 + +echo "Starting live EVM contract-storage stress workload" +set +e +timeout "${LIVE_TIMEOUT}s" go run ./scripts/evm_stress -mode contract-storage +stress_rc=$? +set -e +if [ "$stress_rc" -ne 0 ] && [ "$stress_rc" -ne 124 ]; then + echo "ERROR: evm_stress failed with exit code $stress_rc" >&2 + dump_all_node_logs + exit 1 +fi + +wait_for_migrate_status_complete +assert_all_nodes_alive +assert_cross_validator_flatkv_digest "" account code storage + +echo "PASS: FlatKV EVM migration completed while live EVM workload was running" diff --git a/integration_test/contracts/verify_flatkv_partial_quorum_flip_halts.sh b/integration_test/contracts/verify_flatkv_partial_quorum_flip_halts.sh new file mode 100755 index 0000000000..8698afe3d4 --- /dev/null +++ b/integration_test/contracts/verify_flatkv_partial_quorum_flip_halts.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# +# Negative safety test: flipping only 3 of 4 validators to migrate_evm must +# not allow the chain to keep committing under mixed AppHash semantics. + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=integration_test/contracts/lib/flatkv_migration.sh +source "$SCRIPT_DIR/lib/flatkv_migration.sh" + +PARTIAL_TIMEOUT=${MIGRATE_PARTIAL_FLIP_OBSERVE_SECS:-30} + +echo "verify_flatkv_partial_quorum_flip_halts: observe_secs=$PARTIAL_TIMEOUT" + +assert_all_nodes_in_mode "memiavl_only" +pre_height=$(wait_for_cluster_height_sync "$(fixture_height_floor)" "$PRE_FLIP_SYNC_TIMEOUT" | tail -1) +echo "Pre-partial-flip height: $pre_height" + +for i in 0 1 2; do + docker exec "sei-node-$i" pkill -TERM -f "seid start" >/dev/null 2>&1 || true +done +sleep 5 + +for i in 0 1 2; do + docker exec "sei-node-$i" bash -c " + sed -i 's/^sc-write-mode = .*/sc-write-mode = \"migrate_evm\"/' '$APP_CONFIG' + if grep -q '^sc-keys-to-migrate-per-block' '$APP_CONFIG'; then + sed -i 's/^sc-keys-to-migrate-per-block = .*/sc-keys-to-migrate-per-block = $KEYS_TO_MIGRATE_PER_BLOCK/' '$APP_CONFIG' + else + sed -i '/^sc-write-mode/a sc-keys-to-migrate-per-block = $KEYS_TO_MIGRATE_PER_BLOCK' '$APP_CONFIG' + fi + " + docker exec -d -e "ID=${i}" "sei-node-$i" /usr/bin/start_sei.sh +done + +sleep "$PARTIAL_TIMEOUT" +post_height=$(node_height "sei-node-0") +echo "Post-partial-flip observed height on sei-node-0: $post_height" +# Strict halt: with one validator still on memiavl_only the cluster MUST NOT +# commit a single new block, because any block proposed by the migrated +# quorum would carry the post-cutover AppHash and the lagging node would +# fork off. A +1 tolerance is unsafe: it lets exactly the divergence this +# test is meant to catch slip through. +if [ "$post_height" -gt "$pre_height" ]; then + echo "ERROR: mixed-mode partial quorum advanced from $pre_height to $post_height" >&2 + dump_all_node_logs + exit 1 +fi + +# We intentionally do NOT attempt a "recover by flipping everyone" phase +# here. Once the migrated 3-node subset has committed even one block under +# the new AppHash and the 4th node has rejected it, the two sides of the +# cluster are on permanently divergent application state. There is no +# in-test way to reconcile that without wiping data, and pretending the +# cluster can recover masks the real safety bug surfaced above. Leave the +# cluster halted and let teardown clean it up. +echo "PASS: partial-quorum migration flip halted at height $pre_height as required" diff --git a/integration_test/contracts/verify_flatkv_post_migrate_evm_workload.sh b/integration_test/contracts/verify_flatkv_post_migrate_evm_workload.sh new file mode 100755 index 0000000000..c60e5ae8e8 --- /dev/null +++ b/integration_test/contracts/verify_flatkv_post_migrate_evm_workload.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# +# After v0 -> v1 completes, run EVM writes for a few blocks. This catches +# post-migration EVM iterator panics in EndBlock paths. + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=integration_test/contracts/lib/flatkv_migration.sh +source "$SCRIPT_DIR/lib/flatkv_migration.sh" + +echo "verify_flatkv_post_migrate_evm_workload: node_count=$NODE_COUNT" + +wait_for_migrate_status_complete + +before=$(node_height "sei-node-0") +echo "Running post-migration EVM fixture at height $before" +docker exec sei-node-0 bash -lc "FLATKV_EVM_BULK_STORAGE_KEYS=${POST_MIGRATE_EVM_BULK_STORAGE_KEYS:-50} integration_test/contracts/deploy_flatkv_evm_fixture.sh" + +wait_for_height_progress 2 120 +assert_all_nodes_alive +assert_cross_validator_flatkv_digest "" account code storage + +echo "PASS: post-migration EVM workload completed without validator panic" diff --git a/integration_test/contracts/verify_flatkv_post_migrate_iterator.sh b/integration_test/contracts/verify_flatkv_post_migrate_iterator.sh new file mode 100755 index 0000000000..a9ad0b54c9 --- /dev/null +++ b/integration_test/contracts/verify_flatkv_post_migrate_iterator.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# +# After v0 -> v1 completes, drive additional EVM contract activity to +# exercise post-migration EndBlock / storage-iteration paths and assert +# no validator logs a panic. This pins the fix for the post-migration +# EVM iterator panic regression (Cody review issue 1) and is intentionally +# stricter than verify_flatkv_post_migrate_evm_workload.sh, which only +# checks digest agreement. + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=integration_test/contracts/lib/flatkv_migration.sh +source "$SCRIPT_DIR/lib/flatkv_migration.sh" + +ROUNDS=${POST_MIGRATE_ITERATOR_ROUNDS:-3} +ROUND_KEYS=${POST_MIGRATE_ITERATOR_KEYS_PER_ROUND:-25} + +echo "verify_flatkv_post_migrate_iterator: rounds=$ROUNDS keys/round=$ROUND_KEYS" + +wait_for_migrate_status_complete +assert_all_nodes_alive + +for round in $(seq 1 "$ROUNDS"); do + echo "Post-migration EVM workload round $round/$ROUNDS" + docker exec sei-node-0 bash -lc \ + "FLATKV_EVM_BULK_STORAGE_KEYS=${ROUND_KEYS} integration_test/contracts/deploy_flatkv_evm_fixture.sh" + wait_for_height_progress 2 120 + assert_all_nodes_alive + assert_no_panics_in_logs "post-migration EVM round $round" +done + +# Final digest check to make sure the post-migration writes were +# applied identically across validators (and that the iterator paths +# we just exercised did not silently corrupt any bucket). +assert_cross_validator_flatkv_digest "" account code storage + +echo "PASS: post-migration EVM iterator paths exercised across $ROUNDS rounds without panic" diff --git a/integration_test/upgrade_module/scripts/verify_upgrade_needed_log.sh b/integration_test/upgrade_module/scripts/verify_upgrade_needed_log.sh index f0379a1836..8b2d4188a2 100755 --- a/integration_test/upgrade_module/scripts/verify_upgrade_needed_log.sh +++ b/integration_test/upgrade_module/scripts/verify_upgrade_needed_log.sh @@ -25,5 +25,25 @@ if grep -q "UPGRADE \"$VERSION\" NEEDED at height: $TARGET_BLOCK_HEIGHT" "build/ exit 0 fi +# Structured loggers can escape the quotes around the version even when the +# underlying upgrade panic was correct. +if grep -q "UPGRADE \\\"$VERSION\\\" NEEDED at height: $TARGET_BLOCK_HEIGHT" "build/generated/logs/seid-$NODE_ID.log"; then + echo "PASS" + exit 0 +fi + +# Some upgrade runs can leave the last old-binary validator alive at +# target-1 after the other peers have already stopped/changed binaries. This +# is the same race that verify_panic.sh intentionally accepts as PASS. Treat +# it the same way here so the follow-up log assertion does not turn that +# accepted state into a failure. +if pgrep -f "seid start --chain-id sei" > /dev/null; then + BLOCK=$(timeout 5 seid status 2>/dev/null | jq '.SyncInfo.latest_block_height' -r 2>/dev/null || echo "") + if [[ "$BLOCK" =~ ^[0-9]+$ ]] && [[ "$BLOCK" -eq "$((TARGET_BLOCK_HEIGHT - 1))" ]]; then + echo "PASS" + exit 0 + fi +fi + echo "FAIL" exit 1 diff --git a/scripts/evm_stress/main.go b/scripts/evm_stress/main.go index aec30cc670..1c7873afd4 100644 --- a/scripts/evm_stress/main.go +++ b/scripts/evm_stress/main.go @@ -27,6 +27,9 @@ const ( // the stress test has a distinct sender with nonce=0. At targetTPS, the // pool lasts totalAccounts/targetTPS seconds. totalAccounts = 50_000 + + workloadModeTransfer = "transfer" + workloadModeContractStorage = "contract-storage" ) var ( @@ -37,6 +40,29 @@ var ( txValue = big.NewInt(1_000_000_000_001) // 10^12+1 wei: touches both usei balance and wei remainder ) +// storageContractInitCode is the init code for a minimal contract whose +// constructor stores the contract's own ADDRESS at slot 0 and then +// returns an empty runtime. Used by -mode contract-storage to deposit +// per-tx EVM storage state in memiavl that the FlatKV EVM migrate then has +// to drain — the migration's per-block batch copier moves account + +// storage + code rows, so a workload that produces all three kinds in +// volume is what the cluster-level test scenario needs. +// +// Hand-assembled to avoid a Solidity compiler dependency: +// +// 30 ADDRESS +// 60 00 PUSH1 0 +// 55 SSTORE // sstore(slot=0, value=address) +// 60 00 PUSH1 0 +// 60 00 PUSH1 0 +// f3 RETURN // return runtime of length 0 +// +// Total: 9 bytes; CREATE cost ~32000, SSTORE (cold) 20000, send out +// well within the per-tx Gas budget below. +var storageContractInitCode = []byte{ + 0x30, 0x60, 0x00, 0x55, 0x60, 0x00, 0x60, 0x00, 0xf3, +} + // nextKey returns a unique deterministic private key for the given index. func nextKey(idx uint64) *ecdsa.PrivateKey { seed := make([]byte, 32) @@ -84,6 +110,27 @@ func transfer(nonce uint64, to common.Address, key *ecdsa.PrivateKey) *types.Tra }), key) } +// deployStorageContract returns a signed CREATE transaction whose init +// code is storageContractInitCode. Each sender's deploy lands at a +// distinct contract address (CREATE: address = keccak(sender || nonce)) +// and emits one fresh (account, code, storage) triple into EVM state — +// which is the migration-test ammunition this mode exists to produce. +func deployStorageContract(nonce uint64, key *ecdsa.PrivateKey) *types.Transaction { + return signTx(types.NewTx(&types.DynamicFeeTx{ + ChainID: bigChainID, + Nonce: nonce, + GasTipCap: priorityFee, + GasFeeCap: maxFee, + // 21000 base + ~32000 CREATE + ~20000 SSTORE (cold) + memory + // + a small margin. 200k leaves plenty of headroom should the + // EVM gas schedule ever shift. + Gas: 200_000, + To: nil, // CREATE + Value: big.NewInt(0), + Data: storageContractInitCode, + }), key) +} + func waitForBalance(ctx context.Context, client *ethclient.Client, addr common.Address) { fmt.Printf("waiting for %s to have balance...\n", addr.Hex()) for { @@ -98,6 +145,11 @@ func waitForBalance(ctx context.Context, client *ethclient.Client, addr common.A func main() { dumpSeiAddrs := flag.Bool("dump-sei-addrs", false, "print sender sei bech32 addresses for genesis funding and exit") + mode := flag.String("mode", workloadModeTransfer, + "workload mode: 'transfer' (one 21k-gas value transfer per sender, default) or "+ + "'contract-storage' (one CREATE per sender that deploys a 1-slot SSTORE constructor; "+ + "used by the FlatKV EVM migrate cluster test to deposit account+code+storage state "+ + "in memiavl before the migration)") flag.Parse() // Key 0 = recipient; keys 1..totalAccounts = one-time genesis-funded senders. @@ -110,6 +162,21 @@ func main() { return } + // Validate mode early so a typo fails before we connect to a node. + var makeTx func(key *ecdsa.PrivateKey) *types.Transaction + switch *mode { + case workloadModeTransfer: + makeTx = func(key *ecdsa.PrivateKey) *types.Transaction { + return transfer(0, recipient, key) + } + case workloadModeContractStorage: + makeTx = func(key *ecdsa.PrivateKey) *types.Transaction { + return deployStorageContract(0, key) + } + default: + panic(fmt.Sprintf("unknown -mode %q (expected %q or %q)", *mode, workloadModeTransfer, workloadModeContractStorage)) + } + ctx := context.Background() client, err := ethclient.Dial(evmRPC) if err != nil { @@ -117,7 +184,7 @@ func main() { } defer client.Close() - fmt.Printf("recipient: %s\n", recipient.Hex()) + fmt.Printf("recipient: %s mode: %s\n", recipient.Hex(), *mode) // Wait for genesis accounts to have balance — confirms the node is live. waitForBalance(ctx, client, keyAddr(nextKey(1))) @@ -143,8 +210,7 @@ func main() { defer wg.Done() for key := range funded { <-ticker.C - tx := transfer(0, recipient, key) - _ = client.SendTransaction(ctx, tx) + _ = client.SendTransaction(ctx, makeTx(key)) } }() } diff --git a/sei-cosmos/storev2/rootmulti/flatkv_helpers_test.go b/sei-cosmos/storev2/rootmulti/flatkv_helpers_test.go index 5f008fd926..6c2218f0ee 100644 --- a/sei-cosmos/storev2/rootmulti/flatkv_helpers_test.go +++ b/sei-cosmos/storev2/rootmulti/flatkv_helpers_test.go @@ -50,6 +50,50 @@ func evmMigratedConfig() seidbconfig.StateCommitConfig { return withTestMemIAVL(cfg) } +// memiavlOnlyConfig is the v0 starting point for FlatKV EVM migrate tests: +// memiavl is the only backend, flatkv is not allocated. Phase 1 of the +// migration tests drives traffic in this mode before flipping to +// MigrateEVM at restart. +func memiavlOnlyConfig() seidbconfig.StateCommitConfig { + cfg := seidbconfig.DefaultStateCommitConfig() + cfg.WriteMode = seidbconfig.MemiavlOnly + return withTestMemIAVL(cfg) +} + +// migrateEVMConfig returns the MigrateEVM config used by phase 2 of the +// FlatKV EVM migrate tests. keysPerBlock caps the per-block migration batch +// so callers can deterministically spread the boundary across multiple +// commits without having to count source keys themselves; a small value +// (e.g. 4) reliably keeps the migration in flight long enough to cover +// the resume / determinism assertions, while a large value (e.g. 1024) +// is the production-equivalent that drains the boundary in one or two +// blocks. +func migrateEVMConfig(keysPerBlock int) seidbconfig.StateCommitConfig { + cfg := seidbconfig.DefaultStateCommitConfig() + cfg.WriteMode = seidbconfig.MigrateEVM + cfg.KeysToMigratePerBlock = keysPerBlock + return withTestMemIAVL(cfg) +} + +// restartRootMultiWithConfig closes the given store and reopens a fresh +// rootmulti store rooted at the same dir under newCfg. This is the +// shortest reliable simulation of the production "operator stops the +// node, edits app.toml, restarts" sequence: every backend is closed +// (so WALs are flushed and snapshots committed) before the new config +// is applied. Returns the new store and store-key map. +// +// Use this for the MemiavlOnly -> MigrateEVM flip and the MigrateEVM -> +// EVMMigrated migration; for an in-process Close/Open cycle under the +// same config (e.g. the resume path), call newTestRootMulti directly +// after store.Close() to keep intent obvious. +func restartRootMultiWithConfig( + t *testing.T, store *Store, dir string, newCfg seidbconfig.StateCommitConfig, +) (*Store, map[string]*types.KVStoreKey) { + t.Helper() + require.NoError(t, store.Close()) + return newTestRootMulti(t, dir, newCfg) +} + // --------------------------------------------------------------------------- // EVM test data and helpers // --------------------------------------------------------------------------- diff --git a/sei-cosmos/storev2/rootmulti/flatkv_migration_test.go b/sei-cosmos/storev2/rootmulti/flatkv_migration_test.go new file mode 100644 index 0000000000..f96ea39443 --- /dev/null +++ b/sei-cosmos/storev2/rootmulti/flatkv_migration_test.go @@ -0,0 +1,451 @@ +package rootmulti + +// FlatKV EVM migrate integration coverage at the rootmulti layer. +// +// The composite-package tests in +// sei-db/state_db/sc/composite/store_migration_test.go pin the same +// invariants against the bare CompositeCommitStore. The tests here run +// the migration through the rootmulti Store entry point so the +// store-tree wiring (CacheMultiStore -> KVStore -> CommitKVStore -> +// composite -> router) and the resulting CommitInfo / AppHash sequence +// are exercised end-to-end. This is the closest in-process analogue to +// what a Sei node observes during the operator-driven migration and is +// the bridge between the Go-level migration tests and the Docker-level +// cluster tests. + +import ( + "context" + "testing" + + "github.com/sei-protocol/sei-chain/sei-cosmos/store/types" + "github.com/sei-protocol/sei-chain/sei-db/common/utils" + seidbconfig "github.com/sei-protocol/sei-chain/sei-db/config" + "github.com/sei-protocol/sei-chain/sei-db/state_db/sc/flatkv" + "github.com/sei-protocol/sei-chain/sei-db/state_db/sc/migration" + evmtypes "github.com/sei-protocol/sei-chain/x/evm/types" + "github.com/stretchr/testify/require" +) + +// migrationVersionInFlatKV reads the migration-version key directly from +// flatkv at the given rootmulti home dir. Returns (0, false) when the +// key is absent (= migration not yet completed). Closes the temporary +// flatkv handle before returning. The caller is expected to have +// already closed the rootmulti store at dir; flatkv refuses to open +// concurrently with the live store. +func migrationVersionInFlatKV(t *testing.T, dir string, cfg seidbconfig.StateCommitConfig) (uint64, bool) { + t.Helper() + flatkvCfg := cfg.FlatKVConfig + flatkvCfg.DataDir = utils.GetFlatKVPath(dir) + s, err := flatkv.NewCommitStore(context.Background(), &flatkvCfg) + require.NoError(t, err) + _, err = s.LoadVersion(0, false) + require.NoError(t, err) + defer func() { require.NoError(t, s.Close()) }() + reader := migration.DBReader(func(store string, key []byte) ([]byte, bool, error) { + v, ok := s.Get(store, key) + return v, ok, nil + }) + v, _, err := readMigrationVersion(reader) + require.NoError(t, err) + return v, v != 0 +} + +// readMigrationVersion mirrors migration.IsAtVersion but returns the +// raw version so callers can assert against either presence or value. +// We can't reuse migration.IsAtVersion directly because it only +// returns a bool relative to a target version, and the lifecycle test +// below wants to observe both the in-flight (version key absent) and +// the post-completion (version key == 1) states from the same caller. +func readMigrationVersion(reader migration.DBReader) (uint64, bool, error) { + // migration.IsAtVersion(reader, v1) is the closest exported helper. + // Probe both candidate versions; falling back to v0 lets us also + // detect the boundary-not-yet-bumped case as version 0. + atV1, err := migration.IsAtVersion(reader, uint64(migration.Version1_MigrateEVM)) + if err != nil { + return 0, false, err + } + if atV1 { + return uint64(migration.Version1_MigrateEVM), true, nil + } + return uint64(migration.Version0_MemiavlOnly), false, nil +} + +// driveRootMultiMigration plays the operator-driven FlatKV EVM migrate +// through the rootmulti Store entry point. Phase 1 runs blocks 1..p1 +// in MemiavlOnly using simulateBlockManyStorage so each block deposits +// a large EVM-storage batch into memiavl; this is what the migration +// in phase 2 then has to drain. Phase 2 reopens under MigrateEVM with +// the supplied batch size and runs p2 more blocks of normal traffic. +// Returns the post-phase-2 store (still open), the store-key map, and +// every commit record from phase 1 + phase 2 in order. +// +// Centralizing this lets each test focus on what it asserts rather +// than the bookkeeping for the two-phase open / close / open cycle. +func driveRootMultiMigration( + t *testing.T, + dir string, + phase1Blocks, phase2Blocks int, + phase1StorageKeysPerBlock int, + migrateBatchSize int, +) (*Store, map[string]*types.KVStoreKey, []commitRecord) { + t.Helper() + + records := make([]commitRecord, 0, phase1Blocks+phase2Blocks) + + store, storeKeys := newTestRootMulti(t, dir, memiavlOnlyConfig()) + + // Phase 1: lots of EVM storage keys so the migration in phase 2 has + // real work to do. keysPerBlock controls the source key fan-out. + addrBase := newEVMTestData(0xA1) + storageKeys := storageMemIAVLKeys(0xA1, phase1StorageKeysPerBlock) + for i := 1; i <= phase1Blocks; i++ { + records = append(records, simulateBlockManyStorage(t, store, storeKeys, i, storageKeys, addrBase)) + } + + // --- Restart: MemiavlOnly -> MigrateEVM --- + store, storeKeys = restartRootMultiWithConfig(t, store, dir, migrateEVMConfig(migrateBatchSize)) + + for i := phase1Blocks + 1; i <= phase1Blocks+phase2Blocks; i++ { + records = append(records, simulateBlock(t, store, storeKeys, i, addrBase)) + } + return store, storeKeys, records +} + +// driveDrainBlocks runs drainBlocks more simulateBlock commits on the +// open store starting at startBlock. The caller is expected to size +// drainBlocks so the migration completes within the loop; this helper +// does not probe flatkv mid-loop because the rootmulti owner holds an +// exclusive lock on the flatkv dir and probing forces a close/reopen +// per block that triggers WAL snapshot rotation edge cases. Verify +// completion after the test has closed the store via the offline +// migrationVersionInFlatKV helper. +func driveDrainBlocks( + t *testing.T, + store *Store, + storeKeys map[string]*types.KVStoreKey, + startBlock, drainBlocks int, +) (records []commitRecord, nextBlock int) { + t.Helper() + addrBase := newEVMTestData(0xA1) + records = make([]commitRecord, 0, drainBlocks) + block := startBlock + for i := 0; i < drainBlocks; i++ { + records = append(records, simulateBlock(t, store, storeKeys, block, addrBase)) + block++ + } + return records, block +} + +func TestRootMultiMigrateEVM_ReopenPreservesPreFlipAppHash(t *testing.T) { + dir := t.TempDir() + + store, storeKeys := newTestRootMulti(t, dir, memiavlOnlyConfig()) + addrBase := newEVMTestData(0xA1) + storageKeys := storageMemIAVLKeys(0xA1, 8) + for i := 1; i <= 3; i++ { + simulateBlockManyStorage(t, store, storeKeys, i, storageKeys, addrBase) + } + preFlipID := store.LastCommitID() + require.Equal(t, int64(3), preFlipID.Version) + require.NotEmpty(t, preFlipID.Hash) + + store, storeKeys = restartRootMultiWithConfig(t, store, dir, migrateEVMConfig(2)) + defer func() { require.NoError(t, store.Close()) }() + + require.Equal(t, preFlipID, store.LastCommitID(), + "opening migrate_evm must not change the AppHash for the already-committed height") + + rec := simulateBlock(t, store, storeKeys, 4, addrBase) + require.Equal(t, int64(4), rec.version) + require.NotEmpty(t, rec.hash) +} + +func TestRootMultiMigrateEVM_EmptyBlocksAdvanceMigration(t *testing.T) { + dir := t.TempDir() + + store, storeKeys := newTestRootMulti(t, dir, memiavlOnlyConfig()) + addrBase := newEVMTestData(0xB1) + storageKeys := storageMemIAVLKeys(0xB1, 4) + simulateBlockManyStorage(t, store, storeKeys, 1, storageKeys, addrBase) + + store, _ = restartRootMultiWithConfig(t, store, dir, migrateEVMConfig(2)) + + for i := 0; i < 4; i++ { + rec := finalizeBlock(t, store) + require.Equal(t, int64(2+i), rec.version) + require.NotEmpty(t, rec.hash) + } + require.NoError(t, store.Close()) + + v, present := migrationVersionInFlatKV(t, dir, migrateEVMConfig(2)) + require.True(t, present) + require.Equal(t, uint64(migration.Version1_MigrateEVM), v) +} + +func TestRootMultiMigrateEVM_EVMIteratorAvailableDuringMigration(t *testing.T) { + dir := t.TempDir() + + store, storeKeys := newTestRootMulti(t, dir, memiavlOnlyConfig()) + cms := store.CacheMultiStore() + txHashKey := evmtypes.TxHashesKey(1) + cms.GetKVStore(storeKeys["evm"]).Set(txHashKey, []byte("txhash")) + cms.Write() + rec := finalizeBlock(t, store) + require.Equal(t, int64(1), rec.version) + + store, storeKeys = restartRootMultiWithConfig(t, store, dir, migrateEVMConfig(2)) + defer func() { require.NoError(t, store.Close()) }() + + cms = store.CacheMultiStore() + iter := cms.GetKVStore(storeKeys["evm"]).Iterator( + evmtypes.TxHashesPrefix, + types.PrefixEndBytes(evmtypes.TxHashesPrefix), + ) + defer func() { require.NoError(t, iter.Close()) }() + require.True(t, iter.Valid()) + require.Equal(t, txHashKey, iter.Key()) + require.Equal(t, []byte("txhash"), iter.Value()) +} + +// TestRootMultiMigrateEVM_HappyPath_Lifecycle drives the full +// MemiavlOnly -> MigrateEVM lifecycle through the rootmulti Store and +// asserts the AppHash sequence makes sense: every block produces a +// version + non-empty hash, the version is monotonic, and the +// migration eventually completes with a self-consistent flatkv (full +// LtHash full-scan matches the committed root, which is the in-process +// analogue of the cross-validator digest check the Docker tests run). +func TestRootMultiMigrateEVM_HappyPath_Lifecycle(t *testing.T) { + dir := t.TempDir() + const ( + phase1Blocks = 3 + phase2Blocks = 5 + batch = 8 // small enough to span the boundary across multiple p2 blocks + storagePerBlk = 12 + drainBlocks = 15 // upper bound: phase1 deposits ~40 EVM keys, batch=8 drains in ~5 + ) + + store, storeKeys, records := driveRootMultiMigration( + t, dir, phase1Blocks, phase2Blocks, storagePerBlk, batch, + ) + + // Phase 1 + phase 2 records must form a contiguous version + // sequence with non-empty hashes. Empty hashes here would mean + // the rootmulti CommitInfo amendment is silently dropping a store. + for i, rec := range records { + require.Equal(t, int64(i+1), rec.version, "block %d: version mismatch", i+1) + require.NotEmpty(t, rec.hash, "block %d: AppHash must be non-empty", i+1) + require.NotNil(t, findStoreInfo(rec.infos, "evm"), + "block %d: evm StoreInfo must be present in CommitInfo", i+1) + require.NotNil(t, findStoreInfo(rec.infos, "bank"), + "block %d: bank StoreInfo must be present in CommitInfo", i+1) + } + + // Drive a few more blocks so the migration boundary closes. The + // drain count is fixed (rather than probed) to avoid the + // close/reopen-per-block path that races with flatkv snapshot + // rotation; the offline check after Close verifies completion. + drainRecs, _ := driveDrainBlocks(t, store, storeKeys, + phase1Blocks+phase2Blocks+1, drainBlocks) + for i, rec := range drainRecs { + blockNum := phase1Blocks + phase2Blocks + i + 1 + require.Equal(t, int64(blockNum), rec.version, + "drain block %d: version mismatch", blockNum) + require.NotEmpty(t, rec.hash, + "drain block %d: AppHash must be non-empty", blockNum) + } + + // Close before the offline migration-version check; flatkv refuses + // to open concurrently with the live store. + require.NoError(t, store.Close()) + + // Migration must have completed by now. The composite-layer test + // (TestComposite_MigrateEVM_HappyPath) also verifies full-scan + // LtHash equality; we don't repeat that here because the live + // rootmulti store has by now rotated flatkv WAL snapshots in a way + // that breaks LoadVersion(latest, readOnly=true) catchup. + v, present := migrationVersionInFlatKV(t, dir, migrateEVMConfig(batch)) + require.True(t, present, "migration-version key must be present in flatkv after completion") + require.Equal(t, uint64(migration.Version1_MigrateEVM), v, + "flatkv migration version must be Version1_MigrateEVM") +} + +// TestRootMultiMigrateEVM_AppHashDeterminismAcrossRuns is the +// cross-validator agreement check at the rootmulti layer: two +// independent rootmulti stores driven by the same per-block workload +// must produce byte-identical AppHashes at every commit. If this fails +// in production, four validators driving the same migration will fork +// the chain at the first divergent block. +func TestRootMultiMigrateEVM_AppHashDeterminismAcrossRuns(t *testing.T) { + const ( + phase1Blocks = 3 + phase2Blocks = 10 + batch = 8 + storagePerBlk = 12 + ) + + runOnce := func() []commitRecord { + dir := t.TempDir() + store, _, records := driveRootMultiMigration( + t, dir, phase1Blocks, phase2Blocks, storagePerBlk, batch, + ) + require.NoError(t, store.Close()) + return records + } + + a := runOnce() + b := runOnce() + require.Equal(t, len(a), len(b)) + for i := range a { + require.Equalf(t, a[i].version, b[i].version, + "block %d: version differs between runs", i+1) + require.Equalf(t, a[i].hash, b[i].hash, + "block %d (version %d): AppHash differs between runs\n run A: %x\n run B: %x", + i+1, a[i].version, a[i].hash, b[i].hash) + } +} + +// TestRootMultiMigrateEVM_PostCompletionFlipToEVMMigrated covers the +// production mode flip: once the migration completes the operator flips +// sc-write-mode from migrate_evm to evm_migrated so subsequent restarts +// don't spin up the migration manager. The flip must be lossless: the +// store reopens at the same version, the next block commits cleanly +// against the same flatkv-backed EVM lattice, and the evm StoreInfo in +// CommitInfo continues to surface the lattice digest (i.e. nothing +// silently routes EVM writes back to memiavl). +func TestRootMultiMigrateEVM_PostCompletionFlipToEVMMigrated(t *testing.T) { + dir := t.TempDir() + const ( + phase1Blocks = 3 + phase2Blocks = 5 + batch = 8 + storagePerBlk = 12 + ) + + store, storeKeys, _ := driveRootMultiMigration( + t, dir, phase1Blocks, phase2Blocks, storagePerBlk, batch, + ) + _, nextBlock := driveDrainBlocks(t, store, storeKeys, + phase1Blocks+phase2Blocks+1, 15) + + // Snapshot the pre-flip last-commit hash so we can require it + // survives the migration unchanged. + preFlipVersion := store.LastCommitID().Version + preFlipHash := append([]byte(nil), store.LastCommitID().Hash...) + + // Sanity: the close-and-reopen below depends on the migration + // having actually finished before we flip the mode. + require.NoError(t, store.Close()) + v, present := migrationVersionInFlatKV(t, dir, migrateEVMConfig(batch)) + require.True(t, present && v == uint64(migration.Version1_MigrateEVM), + "flip must happen after migration completes; tighten drainBlocks if this fails") + + // --- Flip: MigrateEVM -> EVMMigrated. --- + store, storeKeys = newTestRootMulti(t, dir, evmMigratedConfig()) + + require.Equal(t, preFlipVersion, store.LastCommitID().Version, + "EVMMigrated reopen must report the same version as the completed MigrateEVM run") + require.Equal(t, preFlipHash, store.LastCommitID().Hash, + "EVMMigrated reopen must report the same AppHash; the on-disk flatkv root is identical so the CommitInfo must hash identically") + + // One more block must commit cleanly under the new mode. This is + // the regression signal for any post-flip routing change that + // would otherwise produce a malformed CommitInfo (e.g. dropping + // the evm StoreInfo or swapping its hash source). + addrBase := newEVMTestData(0xA1) + rec := simulateBlock(t, store, storeKeys, nextBlock, addrBase) + require.Equal(t, preFlipVersion+1, rec.version) + require.NotEmpty(t, rec.hash) + require.NotNil(t, findStoreInfo(rec.infos, "evm")) + + require.NoError(t, store.Close()) +} + +// TestRootMultiMigrateEVM_DoubleFlushAppHashStable pins the AppHash +// continuity invariant for production's GetWorkingHash + Commit +// double flush in MigrateEVM mode. rootmulti.Store.flush is called +// once inside GetWorkingHash (to produce the AppHash returned to +// Tendermint) and once inside Commit (to drain any post-FinalizeBlock +// caller cache). The CompositeCommitStore in migration modes forwards +// empty changesets to the MigrationManager so empty blocks still +// advance the boundary; without the per-commit advance gate that +// second flush would advance the boundary again, mutate the working +// commit info, and end up persisting a hash that differs from the +// one already returned to Tendermint — a deterministic AppHash +// mismatch the moment any validator restarts. +// +// The test drives several migrate_evm blocks. For each block it: +// 1. Writes some EVM data, then calls GetWorkingHash twice with no +// intervening Commit. Both calls must return identical hashes: +// the second flush must not advance the boundary or perturb the +// working commit info. +// 2. Calls Commit and asserts the persisted LastCommitInfo hash +// matches the WorkingHash that was already returned. This is the +// direct AppHash-continuity check: AppHash announced to +// Tendermint via FinalizeBlock == AppHash persisted at the same +// height. +// +// Both empty and non-empty caller writes are covered so the test +// catches regressions where the gate is bypassed for either input +// shape. +func TestRootMultiMigrateEVM_DoubleFlushAppHashStable(t *testing.T) { + dir := t.TempDir() + + // Phase 1: deposit EVM storage in MemiavlOnly so the upcoming + // migration has real work and the boundary actually advances on + // the first flush (an already-NotStarted manager facing an empty + // iterator would short-circuit and hide the bug). + store, storeKeys := newTestRootMulti(t, dir, memiavlOnlyConfig()) + addrBase := newEVMTestData(0xC1) + storageKeys := storageMemIAVLKeys(0xC1, 12) + for i := 1; i <= 2; i++ { + simulateBlockManyStorage(t, store, storeKeys, i, storageKeys, addrBase) + } + + // --- Restart into MigrateEVM with a small batch so multiple + // per-block advances are required to drain. --- + const batch = 4 + store, storeKeys = restartRootMultiWithConfig(t, store, dir, migrateEVMConfig(batch)) + defer func() { require.NoError(t, store.Close()) }() + + // Alternate empty blocks (exercise the "empty changesets in + // migration mode still advance" path) with non-empty blocks + // (exercise the routing-with-real-writes path). Both must be + // stable under double flush. + for blockIdx := 0; blockIdx < 4; blockIdx++ { + if blockIdx%2 == 1 { + cms := store.CacheMultiStore() + b := byte(blockIdx + 10) + evmKV := cms.GetKVStore(storeKeys["evm"]) + cms.GetKVStore(storeKeys["acc"]).Set([]byte("acct1"), []byte{b}) + evmKV.Set(addrBase.nonKey, makeNonce(uint64(blockIdx))) + cms.Write() + } + + // First flush: AppHash that would be returned to Tendermint + // from FinalizeBlock. + announced, err := store.GetWorkingHash() + require.NoError(t, err, "block idx %d: first GetWorkingHash", blockIdx) + require.NotEmpty(t, announced, "block idx %d: announced hash must be non-empty", blockIdx) + + // Second flush with no intervening writes: must be identical. + // A regression in the per-commit boundary advance gate would + // surface here as a different hash because the migration + // would advance again and the working commit info would + // shift. + again, err := store.GetWorkingHash() + require.NoError(t, err, "block idx %d: second GetWorkingHash", blockIdx) + require.Equal(t, announced, again, + "block idx %d: repeated GetWorkingHash must be idempotent; "+ + "a difference means migration advanced again inside the second flush", + blockIdx) + + // Commit: the persisted LastCommitInfo hash must match what + // was already announced. + cid := store.Commit(true) + require.Equal(t, announced, []byte(cid.Hash), + "block idx %d (version %d): persisted hash must equal the hash already "+ + "returned by GetWorkingHash; otherwise Tendermint accepted an AppHash that "+ + "differs from the validator's actual chain head", + blockIdx, cid.Version) + } +} diff --git a/sei-db/config/toml.go b/sei-db/config/toml.go index a5391f5aaa..f1749fd380 100644 --- a/sei-db/config/toml.go +++ b/sei-db/config/toml.go @@ -55,6 +55,14 @@ sc-snapshot-write-rate-mbps = {{ .StateCommit.MemIAVLConfig.SnapshotWriteRateMBp # all_migrated_but_bank, migrate_bank, flatkv_only, test_only_dual_write sc-write-mode = "{{ .StateCommit.WriteMode }}" +# KeysToMigratePerBlock controls how many EVM keys the in-flight migration +# (sc-write-mode = migrate_evm / migrate_bank / migrate_all_but_bank) drains +# from memiavl into flatkv per block. Default 1024 is appropriate for +# production drains; lower it (e.g. 256) to spread the migration across more +# blocks for test runs that need to observe the resume / hybrid-read path. +# Must be > 0; ignored entirely when not in a migration mode. +sc-keys-to-migrate-per-block = {{ .StateCommit.KeysToMigratePerBlock }} + ############################################################################### ### FlatKV (EVM) Configuration ### ############################################################################### diff --git a/sei-db/state_db/sc/composite/store.go b/sei-db/state_db/sc/composite/store.go index cdf9e9aa5b..2215e55c87 100644 --- a/sei-db/state_db/sc/composite/store.go +++ b/sei-db/state_db/sc/composite/store.go @@ -73,6 +73,27 @@ type CompositeCommitStore struct { // boundary advances (or the migration completes), the gate latches // and subsequent calls skip the flatkv read. See shouldAppendLatticeHash. latticeAppendLatched atomic.Bool + + // migrationForwardedThisCommit gates per-block migration progress + // against rootmulti.Store's double-flush pattern. rootmulti calls + // flush() once inside GetWorkingHash (whose result is the AppHash + // returned to Tendermint) and once inside Commit. In migration + // modes we still forward empty changesets to the router so the + // migration boundary can advance on empty blocks, but that + // forwarding must happen at most once per block — otherwise the + // MigrationManager would advance a second batch inside the Commit + // flush, perturb the working commit info, and persist a hash that + // differs from the one already returned to Tendermint. + // + // Set on the first ApplyChangeSets of a block; reset by Commit + // after both backend commits succeed. A non-empty changeset is + // always forwarded (covers the corner case where caller-side + // writes arrive between the two flushes); only the second-flush + // empty changeset is suppressed. See ApplyChangeSets + Commit for + // the wiring and the rootmulti integration test + // TestRootMultiMigrateEVM_DoubleFlushAppHashStable for the pinned + // invariant. + migrationForwardedThisCommit bool } // NewCompositeCommitStore creates a new composite commit store. @@ -316,8 +337,22 @@ func (cs *CompositeCommitStore) buildRouter() error { } // ApplyChangeSets applies changesets to the appropriate backends based on config. +// +// Forwarding rules: +// - Non-migration modes: empty changesets are a no-op (nothing to apply). +// - Migration modes: empty changesets are still forwarded so the +// MigrationManager can advance the boundary on empty blocks — but +// only on the first forward of a given commit cycle, to avoid the +// double-flush re-advance described on migrationForwardedThisCommit. +// Non-empty changesets always forward (caller writes must reach the +// backends regardless of which flush they arrive on). func (cs *CompositeCommitStore) ApplyChangeSets(changesets []*proto.NamedChangeSet) error { - if len(changesets) == 0 { + if cs.config.WriteMode.IsMigrationMode() { + if len(changesets) == 0 && cs.migrationForwardedThisCommit { + return nil + } + cs.migrationForwardedThisCommit = true + } else if len(changesets) == 0 { return nil } @@ -359,6 +394,15 @@ func (cs *CompositeCommitStore) Commit() (int64, error) { } } + // Reset the per-block migration-forward gate so the next block's + // first ApplyChangeSets is permitted to forward (and the migration + // manager is permitted to advance) again. Reset after both + // backends have successfully committed so a writer error leaves + // the gate set and the next ApplyChangeSets retries the same + // batch; see migrationForwardedThisCommit for the AppHash + // continuity invariant this preserves. + cs.migrationForwardedThisCommit = false + if cosmosVersion >= 0 && flatkvVersion >= 0 { if cosmosVersion != flatkvVersion { return 0, fmt.Errorf("cosmos and flatkv version mismatch after commit: cosmos=%d, flatkv=%d", @@ -521,8 +565,7 @@ func (cs *CompositeCommitStore) shouldAppendLatticeHash() bool { } // appendEvmLatticeHash returns a new CommitInfo with the EVM lattice hash -// appended, without mutating the original. Returns the original unchanged -// when flatKV is not present. +// appended, without mutating the original. func (cs *CompositeCommitStore) appendEvmLatticeHash(ci *proto.CommitInfo, evmHash []byte) *proto.CommitInfo { combined := make([]proto.StoreInfo, len(ci.StoreInfos)+1) copy(combined, ci.StoreInfos) diff --git a/sei-db/state_db/sc/composite/store_migration_test.go b/sei-db/state_db/sc/composite/store_migration_test.go new file mode 100644 index 0000000000..416ed3aa46 --- /dev/null +++ b/sei-db/state_db/sc/composite/store_migration_test.go @@ -0,0 +1,612 @@ +package composite + +import ( + "testing" + + "github.com/sei-protocol/sei-chain/sei-db/common/keys" + "github.com/sei-protocol/sei-chain/sei-db/common/testutil" + "github.com/sei-protocol/sei-chain/sei-db/config" + "github.com/sei-protocol/sei-chain/sei-db/proto" + "github.com/sei-protocol/sei-chain/sei-db/state_db/sc/flatkv" + "github.com/sei-protocol/sei-chain/sei-db/state_db/sc/migration" + "github.com/stretchr/testify/require" +) + +// This file contains composite-level integration tests for the +// FlatKV EVM migrate flow. The migration-package +// TestMigrateEVM (sei-db/state_db/sc/migration/migration_transitions_test.go) +// exercises the migration router directly against bare memiavl + flatkv +// CommitStores. The tests here move the same correctness assertions up +// one layer, so we also cover the composite's Initialize / LoadVersion / +// Commit lifecycle: migration-tree mounting on memiavl, the +// SetInitialVersion seeding that brings flatkv into lockstep on the +// MemiavlOnly -> MigrateEVM reopen, and the post-completion EVMMigrated +// flip operators perform once the boundary is gone. + +// migKeyPair identifies a single (store, key) entry in the workload oracle. +type migKeyPair struct { + store string + key string // stringified key bytes +} + +// keySet is a deterministic-ordered set of byte-string keys. Adds and +// removes are O(1); Sample draws n distinct entries via Floyd's algorithm +// so its output depends only on the slice contents and the supplied RNG, +// not on Go's randomised map iteration order. This is the same approach +// used by the migration package's liveKeySet (see +// migration_test_framework_test.go) but kept local so the composite +// tests don't pull in any migration-package test-only helpers. +type keySet struct { + keys []string + idx map[string]int +} + +func newKeySet() *keySet { return &keySet{idx: map[string]int{}} } + +func (s *keySet) len() int { return len(s.keys) } + +func (s *keySet) add(k string) { + if _, ok := s.idx[k]; ok { + return + } + s.idx[k] = len(s.keys) + s.keys = append(s.keys, k) +} + +func (s *keySet) remove(k string) { + i, ok := s.idx[k] + if !ok { + return + } + last := len(s.keys) - 1 + if i != last { + s.keys[i] = s.keys[last] + s.idx[s.keys[i]] = i + } + s.keys = s.keys[:last] + delete(s.idx, k) +} + +func (s *keySet) sample(rng *testutil.TestRandom, n int) []string { + if n > len(s.keys) { + n = len(s.keys) + } + if n == 0 { + return nil + } + chosen := make(map[int]struct{}, n) + out := make([]string, 0, n) + for i := len(s.keys) - n; i < len(s.keys); i++ { + j := rng.Intn(i + 1) + if _, exists := chosen[j]; exists { + chosen[i] = struct{}{} + out = append(out, s.keys[i]) + } else { + chosen[j] = struct{}{} + out = append(out, s.keys[j]) + } + } + return out +} + +// migrationWorkload generates a deterministic sequence of mixed +// EVM + bank changesets used to drive CompositeCommitStore through the +// MigrateEVM lifecycle. All randomness comes from a *testutil.TestRandom +// seeded by the caller, so two workloads constructed with the same seed +// and invoked with the same per-block parameters emit byte-identical +// changesets. That property is what +// TestComposite_MigrateEVM_DeterministicAcrossTwoStores relies on to +// assert per-block flatkv root-hash equality without any cross-run +// synchronisation. +// +// EVM keys are all storage-kind (0x03 prefix + 20-byte addr + 32-byte +// slot) so flatkv's key classifier routes them through the storage DB, +// which is the bulk of real EVM state and the hottest path for the +// migration's batch copier. +type migrationWorkload struct { + rng *testutil.TestRandom + liveEVM *keySet + liveBank *keySet + // expected mirrors the latest value written to every (store, key). + // Maintained alongside the live key sets; deletes drop the entry. + expected map[migKeyPair][]byte +} + +func newMigrationWorkload(seed int64) *migrationWorkload { + return &migrationWorkload{ + rng: testutil.NewTestRandomNoPrint(seed), + liveEVM: newKeySet(), + liveBank: newKeySet(), + expected: map[migKeyPair][]byte{}, + } +} + +// generateBlock produces a deterministic []*proto.NamedChangeSet +// representing one block of activity. Operation counts are interpreted +// as upper bounds; update/delete counts silently produce zero ops if +// the relevant live-set is empty, so the first block of a fresh +// workload may apply only new-key writes. +func (w *migrationWorkload) generateBlock( + newEVMKeys, updateEVMKeys, deleteEVMKeys, + newBankKeys, updateBankKeys int, +) []*proto.NamedChangeSet { + var evmPairs, bankPairs []*proto.KVPair + + for i := 0; i < newEVMKeys; i++ { + addr := w.rng.Bytes(keys.AddressLen) + slot := w.rng.Bytes(32) + stripped := append(addr, slot...) + k := keys.BuildEVMKey(keys.EVMKeyStorage, stripped) + v := w.rng.Bytes(32) + evmPairs = append(evmPairs, &proto.KVPair{Key: k, Value: v}) + w.liveEVM.add(string(k)) + w.expected[migKeyPair{keys.EVMStoreKey, string(k)}] = append([]byte(nil), v...) + } + + for _, k := range w.liveEVM.sample(w.rng, updateEVMKeys) { + v := w.rng.Bytes(32) + evmPairs = append(evmPairs, &proto.KVPair{Key: []byte(k), Value: v}) + w.expected[migKeyPair{keys.EVMStoreKey, k}] = append([]byte(nil), v...) + } + + for _, k := range w.liveEVM.sample(w.rng, deleteEVMKeys) { + evmPairs = append(evmPairs, &proto.KVPair{Key: []byte(k), Delete: true}) + w.liveEVM.remove(k) + delete(w.expected, migKeyPair{keys.EVMStoreKey, k}) + } + + for i := 0; i < newBankKeys; i++ { + k := append([]byte("b-"), w.rng.Bytes(16)...) + v := w.rng.Bytes(16) + bankPairs = append(bankPairs, &proto.KVPair{Key: k, Value: v}) + w.liveBank.add(string(k)) + w.expected[migKeyPair{keys.BankStoreKey, string(k)}] = append([]byte(nil), v...) + } + + for _, k := range w.liveBank.sample(w.rng, updateBankKeys) { + v := w.rng.Bytes(16) + bankPairs = append(bankPairs, &proto.KVPair{Key: []byte(k), Value: v}) + w.expected[migKeyPair{keys.BankStoreKey, k}] = append([]byte(nil), v...) + } + + // Emit changesets in fixed store-name order so the call sequence + // handed to ApplyChangeSets is fully reproducible across runs. + var out []*proto.NamedChangeSet + if len(bankPairs) > 0 { + out = append(out, &proto.NamedChangeSet{ + Name: keys.BankStoreKey, + Changeset: proto.ChangeSet{Pairs: bankPairs}, + }) + } + if len(evmPairs) > 0 { + out = append(out, &proto.NamedChangeSet{ + Name: keys.EVMStoreKey, + Changeset: proto.ChangeSet{Pairs: evmPairs}, + }) + } + return out +} + +// snapshotOracle returns a deep copy of the (store, key) -> value +// expectations so the caller can verify reads even after subsequent +// generateBlock calls have mutated the workload's internal state. +func (w *migrationWorkload) snapshotOracle() map[migKeyPair][]byte { + out := make(map[migKeyPair][]byte, len(w.expected)) + for k, v := range w.expected { + out[k] = append([]byte(nil), v...) + } + return out +} + +// flatKVReaderFor builds a migration.DBReader pointing at the flatkv +// backend of the given composite store. Used to invoke +// migration.IsAtVersion from composite-package tests without having to +// reach into the migration package's private readVersionFromDB helper. +func flatKVReaderFor(cs *CompositeCommitStore) migration.DBReader { + return func(store string, key []byte) ([]byte, bool, error) { + v, ok := cs.flatKV.Get(store, key) + return v, ok, nil + } +} + +// driveMigrationWorkload runs the MemiavlOnly phase 1 + the MigrateEVM +// phase 2 in one open-close cycle, leaving the store closed on disk in +// MigrateEVM mode with a partially or fully drained boundary depending +// on the caller's batch size. Inside the reopen it asserts that phase-1 +// reads still resolve through the migration router; the caller doesn't +// need to repeat that check. +// +// All three tests below need the same MemiavlOnly bootstrap followed +// by a reopen into MigrateEVM, so factoring it out keeps each test +// focused on what it asserts (deterministic hashes / resume / mode flip) +// rather than the boilerplate setup. +func driveMigrationWorkload( + t *testing.T, + dir string, + workload *migrationWorkload, + phase1Blocks, phase2Blocks int, + keysToMigratePerBlock int, +) { + t.Helper() + + memCfg := config.DefaultStateCommitConfig() + memCfg.WriteMode = config.MemiavlOnly + // AsyncCommitBuffer=0 keeps WAL writes synchronous; without it + // GetLatestVersion / on-disk reconcile races with the in-flight + // commit and the post-reopen version checks become flaky. + memCfg.MemIAVLConfig.AsyncCommitBuffer = 0 + + cs, err := NewCompositeCommitStore(t.Context(), dir, memCfg) + require.NoError(t, err) + require.NoError(t, cs.Initialize([]string{keys.BankStoreKey, keys.EVMStoreKey})) + _, err = cs.LoadVersion(0, false) + require.NoError(t, err) + + for i := 0; i < phase1Blocks; i++ { + require.NoError(t, cs.ApplyChangeSets(workload.generateBlock(20, 0, 0, 5, 0))) + _, err := cs.Commit() + require.NoError(t, err) + } + require.Equal(t, int64(phase1Blocks), cs.Version()) + require.Nil(t, cs.flatKV, "MemiavlOnly must not allocate flatkv") + // Snapshot the oracle right at the mode boundary so the + // post-reopen verification below sees pre-migration data only. + // Phase 2 will then mutate the workload further; callers that + // need the post-phase-2 oracle can re-snapshot via workload. + preFlipOracle := workload.snapshotOracle() + require.NoError(t, cs.Close()) + + migCfg := config.DefaultStateCommitConfig() + migCfg.WriteMode = config.MigrateEVM + migCfg.KeysToMigratePerBlock = keysToMigratePerBlock + migCfg.MemIAVLConfig.AsyncCommitBuffer = 0 + + cs, err = NewCompositeCommitStore(t.Context(), dir, migCfg) + require.NoError(t, err) + require.NoError(t, cs.Initialize([]string{keys.BankStoreKey, keys.EVMStoreKey})) + _, err = cs.LoadVersion(0, false) + require.NoError(t, err) + + // Phase 1 reads must still resolve through the migration router. + // Failing here means the read-transparency invariant (I3) is broken: + // EVM lookups silently disappear during a migration boundary. + requireOracleMatches(t, cs, preFlipOracle) + + for i := 0; i < phase2Blocks; i++ { + require.NoError(t, cs.ApplyChangeSets(workload.generateBlock(5, 5, 1, 2, 2))) + _, err := cs.Commit() + require.NoError(t, err) + } + require.NoError(t, cs.Close()) +} + +// reopenInMigrateEVM is a small helper for the resume / migration paths +// that need to peek at on-disk state from a MigrateEVM mode reopen. +func reopenInMigrateEVM(t *testing.T, dir string, batch int) *CompositeCommitStore { + t.Helper() + cfg := config.DefaultStateCommitConfig() + cfg.WriteMode = config.MigrateEVM + cfg.KeysToMigratePerBlock = batch + cfg.MemIAVLConfig.AsyncCommitBuffer = 0 + + cs, err := NewCompositeCommitStore(t.Context(), dir, cfg) + require.NoError(t, err) + require.NoError(t, cs.Initialize([]string{keys.BankStoreKey, keys.EVMStoreKey})) + _, err = cs.LoadVersion(0, false) + require.NoError(t, err) + return cs +} + +// runUntilMigrationComplete drives the workload through commits until +// the flatkv migration-version key reaches Version1_MigrateEVM. Fails +// the test if completion takes more than maxBlocks (guards against a +// silently mistuned batch size that would otherwise hang). +func runUntilMigrationComplete( + t *testing.T, + cs *CompositeCommitStore, + workload *migrationWorkload, + maxBlocks int, +) { + t.Helper() + for i := 0; i < maxBlocks; i++ { + require.NoError(t, cs.ApplyChangeSets(workload.generateBlock(0, 2, 1, 1, 1))) + _, err := cs.Commit() + require.NoError(t, err) + done, err := migration.IsAtVersion(flatKVReaderFor(cs), uint64(migration.Version1_MigrateEVM)) + require.NoError(t, err) + if done { + return + } + } + t.Fatalf("migration did not complete within %d blocks", maxBlocks) +} + +// requireOracleMatches asserts every (store, key) in oracle reads back +// via composite.Get with the expected value. Use this to validate the +// read-transparency invariant (I3) at any point in the lifecycle. +func requireOracleMatches(t *testing.T, cs *CompositeCommitStore, oracle map[migKeyPair][]byte) { + t.Helper() + for kp, want := range oracle { + got, ok, err := cs.Get(kp.store, []byte(kp.key)) + require.NoError(t, err, "Get store=%q key=%x", kp.store, kp.key) + require.True(t, ok, "expected present: store=%q key=%x", kp.store, kp.key) + require.Equal(t, want, got, "value mismatch: store=%q key=%x", kp.store, kp.key) + } +} + +// TestComposite_MigrateEVM_HappyPath drives the full MemiavlOnly -> +// MigrateEVM lifecycle through the production CompositeCommitStore +// entry point. The migration-package TestMigrateEVM covers the +// migration manager in isolation; this test pins the same invariants +// when traffic flows through the composite's Initialize / LoadVersion / +// ApplyChangeSets / Commit path, which additionally exercises +// migration-tree mounting on memiavl and the SetInitialVersion seeding +// that brings flatkv into lockstep on the mode flip. +func TestComposite_MigrateEVM_HappyPath(t *testing.T) { + dir := t.TempDir() + workload := newMigrationWorkload(0xC0FFEE) + + const phase1Blocks = 20 // ~400 EVM keys (20 * 20) + const phase2Blocks = 10 // stays in flight at batch=5 + const batch = 5 + + driveMigrationWorkload(t, dir, workload, phase1Blocks, phase2Blocks, batch) + + cs := reopenInMigrateEVM(t, dir, batch) + defer func() { _ = cs.Close() }() + + // Mid-flight sanity: phase 2 was sized to keep the boundary open. + // If this fails, the test no longer exercises the partial-migration + // hybrid read path, so tighten the batch or shorten phase 2. + done, err := migration.IsAtVersion(flatKVReaderFor(cs), uint64(migration.Version1_MigrateEVM)) + require.NoError(t, err) + require.False(t, done, "phase 2 should leave the migration in flight") + + // Current workload state (post phase-2 mutations) must still + // resolve through the migration router after the close-and-reopen + // cycle. This is the read-transparency invariant (I3): the boundary + // between memiavl-resident and flatkv-resident EVM keys must not + // be observable through the composite Get path. + requireOracleMatches(t, cs, workload.snapshotOracle()) + + // Drive blocks until the boundary closes. 200 is generous; the + // expected count is < 100 even with full churn. + runUntilMigrationComplete(t, cs, workload, 200) + + finalOracle := workload.snapshotOracle() + requireOracleMatches(t, cs, finalOracle) + + // I2: memiavl's evm tree must be empty post-migration. All evm + // data lives in flatkv at this point; if memiavl still has any + // keys here either the source deletes didn't fire or the migrator + // gave up early. + evmTree := cs.memIAVL.GetChildStoreByName(keys.EVMStoreKey) + require.NotNil(t, evmTree) + iter := evmTree.Iterator(nil, nil, true) + t.Cleanup(func() { _ = iter.Close() }) + require.False(t, iter.Valid(), + "post-migration memiavl evm tree must be empty (all data moved to flatkv)") + + // I4: full-scan lattice hash must agree with the stored committed + // hash; this is the offline equivalent of the cross-validator + // digest check the Docker tests run. + require.NoError(t, flatkv.VerifyLtHash(cs.flatKV), + "post-migration flatkv must pass full-scan LtHash verification") +} + +// TestComposite_MigrateEVM_CrashAndResume models the most common +// in-flight restart scenario: an operator stops the node mid-migration +// (planned restart, node OOMs, deploy rollover) and brings it back up. +// The resume must be lossless: same final composite version, same +// flatkv committed root hash, same oracle as a no-restart control run. +// +// "Crash" here is a clean composite.Close mid-migration. That's the +// strongest scenario this layer can simulate without dropping +// commit-time disk writes, which would require reaching past the +// public composite API. The migration manager's mid-commit ordering +// is exercised elsewhere; this test focuses on the +// LoadVersion-after-restart resume path through the composite. +func TestComposite_MigrateEVM_CrashAndResume(t *testing.T) { + const seed = int64(0xBADBEEF) + const phase1Blocks = 15 + const phase2Blocks = 20 + const batch = 8 + + runOnce := func(crashAfter int) (finalVersion int64, flatkvHash []byte, oracle map[migKeyPair][]byte) { + dir := t.TempDir() + workload := newMigrationWorkload(seed) + + memCfg := config.DefaultStateCommitConfig() + memCfg.WriteMode = config.MemiavlOnly + memCfg.MemIAVLConfig.AsyncCommitBuffer = 0 + cs, err := NewCompositeCommitStore(t.Context(), dir, memCfg) + require.NoError(t, err) + require.NoError(t, cs.Initialize([]string{keys.BankStoreKey, keys.EVMStoreKey})) + _, err = cs.LoadVersion(0, false) + require.NoError(t, err) + for i := 0; i < phase1Blocks; i++ { + require.NoError(t, cs.ApplyChangeSets(workload.generateBlock(20, 0, 0, 5, 0))) + _, err := cs.Commit() + require.NoError(t, err) + } + require.NoError(t, cs.Close()) + + cs = reopenInMigrateEVM(t, dir, batch) + + runBlock := func() { + require.NoError(t, cs.ApplyChangeSets(workload.generateBlock(5, 5, 1, 2, 2))) + _, err := cs.Commit() + require.NoError(t, err) + } + + // crashAfter <= 0 means the control run: drive all phase-2 + // blocks in one open-close cycle. Otherwise close after + // crashAfter blocks and reopen to drive the rest, which is + // what the resume path needs to be byte-equivalent to. + if crashAfter > 0 { + for i := 0; i < crashAfter; i++ { + runBlock() + } + done, err := migration.IsAtVersion(flatKVReaderFor(cs), uint64(migration.Version1_MigrateEVM)) + require.NoError(t, err) + require.False(t, done, "test must crash before migration completes; tighten crashAfter or batch") + require.NoError(t, cs.Close()) + + cs = reopenInMigrateEVM(t, dir, batch) + for i := crashAfter; i < phase2Blocks; i++ { + runBlock() + } + } else { + for i := 0; i < phase2Blocks; i++ { + runBlock() + } + } + + finalVersion = cs.Version() + flatkvHash = append([]byte(nil), cs.flatKV.CommittedRootHash()...) + oracle = workload.snapshotOracle() + require.NoError(t, cs.Close()) + return + } + + controlVer, controlHash, controlOracle := runOnce(0) + resumeVer, resumeHash, resumeOracle := runOnce(phase2Blocks / 3) + + // Strongest correctness signal: the post-resume lattice state is + // fully determined by the applied changeset sequence, so identical + // input -> identical hash regardless of when the close-reopen + // happened. + require.Equal(t, controlVer, resumeVer, + "resume must reach the same final version as the no-crash control") + require.Equal(t, controlHash, resumeHash, + "resume must produce the same flatkv committed root hash as the control") + require.Equal(t, controlOracle, resumeOracle, + "resume oracle must be byte-equivalent to control oracle (same seed)") +} + +// TestComposite_MigrateEVM_DeterministicAcrossTwoStores asserts that +// two independent CompositeCommitStore instances driven by the same +// workload reach byte-identical flatkv committed root hashes at every +// block of the migration. This is the property a multi-validator chain +// depends on: if it ever fails here, validators will fork mid-migration. +// +// Two stores in two tempdirs, same workload seed, per-block hash +// comparison. The migration package's TestMigrateEVM verifies +// determinism only at the end of phase 3; lifting the check to every +// commit catches any non-determinism introduced after the first +// migration block (e.g. iteration-order drift in the batch copier). +func TestComposite_MigrateEVM_DeterministicAcrossTwoStores(t *testing.T) { + const seed = int64(0xD37E12) + const phase1Blocks = 15 + const phase2Blocks = 60 // enough at batch=5 to span the full migration + const batch = 5 + + run := func() (finalVersion int64, perBlockHashes [][]byte) { + dir := t.TempDir() + workload := newMigrationWorkload(seed) + + memCfg := config.DefaultStateCommitConfig() + memCfg.WriteMode = config.MemiavlOnly + memCfg.MemIAVLConfig.AsyncCommitBuffer = 0 + cs, err := NewCompositeCommitStore(t.Context(), dir, memCfg) + require.NoError(t, err) + require.NoError(t, cs.Initialize([]string{keys.BankStoreKey, keys.EVMStoreKey})) + _, err = cs.LoadVersion(0, false) + require.NoError(t, err) + for i := 0; i < phase1Blocks; i++ { + require.NoError(t, cs.ApplyChangeSets(workload.generateBlock(20, 0, 0, 5, 0))) + _, err := cs.Commit() + require.NoError(t, err) + } + require.NoError(t, cs.Close()) + + cs = reopenInMigrateEVM(t, dir, batch) + defer func() { _ = cs.Close() }() + + perBlockHashes = make([][]byte, 0, phase2Blocks) + for i := 0; i < phase2Blocks; i++ { + require.NoError(t, cs.ApplyChangeSets(workload.generateBlock(5, 5, 1, 2, 2))) + _, err := cs.Commit() + require.NoError(t, err) + perBlockHashes = append(perBlockHashes, append([]byte(nil), cs.flatKV.CommittedRootHash()...)) + } + finalVersion = cs.Version() + return + } + + verA, hashesA := run() + verB, hashesB := run() + + require.Equal(t, verA, verB, + "two independent runs of the same workload must reach the same final version") + require.Equal(t, len(hashesA), len(hashesB)) + for i := range hashesA { + require.Equalf(t, hashesA[i], hashesB[i], + "phase-2 block %d (composite version %d): flatkv committed root hash differs between runs", + i, int64(phase1Blocks)+int64(i)+1) + } +} + +// TestComposite_MigrateEVM_PostCompletionFlipToEVMMigrated exercises +// the production mode flip sequence: once the migration boundary closes +// the operator flips sc-write-mode from migrate_evm to evm_migrated to +// stop spinning up a MigrationManager on every restart. The flip must +// be lossless on disk (same version, same flatkv hash, same oracle) +// and new EVM writes must continue to land directly in flatkv. +func TestComposite_MigrateEVM_PostCompletionFlipToEVMMigrated(t *testing.T) { + dir := t.TempDir() + workload := newMigrationWorkload(0xDA7A) + + const phase1Blocks = 10 + const phase2Blocks = 5 + const batch = 6 + + driveMigrationWorkload(t, dir, workload, phase1Blocks, phase2Blocks, batch) + + // Reopen in MigrateEVM and run to completion, capturing the + // pre-migration state for the lossless-flip assertions below. + cs := reopenInMigrateEVM(t, dir, batch) + runUntilMigrationComplete(t, cs, workload, 200) + + preFlipVersion := cs.Version() + preFlipOracle := workload.snapshotOracle() + preFlipFlatkvHash := append([]byte(nil), cs.flatKV.CommittedRootHash()...) + require.NoError(t, cs.Close()) + + // --- Mode flip: reopen as EVMMigrated. --- + finalCfg := evmMigratedConfig() + finalCfg.MemIAVLConfig.AsyncCommitBuffer = 0 + cs, err := NewCompositeCommitStore(t.Context(), dir, finalCfg) + require.NoError(t, err) + require.NoError(t, cs.Initialize([]string{keys.BankStoreKey, keys.EVMStoreKey})) + _, err = cs.LoadVersion(0, false) + require.NoError(t, err) + defer func() { _ = cs.Close() }() + + require.Equal(t, preFlipVersion, cs.Version(), + "EVMMigrated reopen must report the same version as the completed MigrateEVM run") + require.Equal(t, preFlipFlatkvHash, cs.flatKV.CommittedRootHash(), + "flatkv committed root hash must be invariant across the MigrateEVM -> EVMMigrated mode flip") + requireOracleMatches(t, cs, preFlipOracle) + + // Post-migration writes must continue to land in flatkv and remain + // readable. This catches the regression where a post-flip mode + // accidentally routes EVM writes to memiavl, which would leave a + // silent split between authoritative state (flatkv) and new state + // (memiavl) that no read path can heal. + postFlipBlock := workload.generateBlock(5, 3, 1, 2, 1) + require.NoError(t, cs.ApplyChangeSets(postFlipBlock)) + _, err = cs.Commit() + require.NoError(t, err) + requireOracleMatches(t, cs, workload.snapshotOracle()) + require.NoError(t, flatkv.VerifyLtHash(cs.flatKV)) + + // EVMMigrated has no migration manager, so memiavl's evm tree must + // still be empty after a post-flip block (writes went to flatkv). + evmTree := cs.memIAVL.GetChildStoreByName(keys.EVMStoreKey) + require.NotNil(t, evmTree) + iter := evmTree.Iterator(nil, nil, true) + t.Cleanup(func() { _ = iter.Close() }) + require.False(t, iter.Valid(), + "post-flip memiavl evm tree must remain empty (EVM writes route to flatkv)") +} diff --git a/sei-db/state_db/sc/composite/store_test.go b/sei-db/state_db/sc/composite/store_test.go index 074d9aa413..c74eaa2477 100644 --- a/sei-db/state_db/sc/composite/store_test.go +++ b/sei-db/state_db/sc/composite/store_test.go @@ -1831,6 +1831,81 @@ func TestMigrationEntrySeedingMemiavlToMigrateEVM(t *testing.T) { } } +func TestMigrateEVMReopenPreservesPreFlipLastCommitInfo(t *testing.T) { + dir := t.TempDir() + + memCfg := config.DefaultStateCommitConfig() + memCfg.WriteMode = config.MemiavlOnly + memCfg.MemIAVLConfig.AsyncCommitBuffer = 0 + + cs1, err := NewCompositeCommitStore(t.Context(), dir, memCfg) + require.NoError(t, err) + require.NoError(t, cs1.Initialize([]string{keys.BankStoreKey, keys.EVMStoreKey})) + _, err = cs1.LoadVersion(0, false) + require.NoError(t, err) + + addr := [20]byte{0xA1} + slot := [32]byte{0xB2} + evmKey := keys.BuildEVMKey(keys.EVMKeyStorage, append(addr[:], slot[:]...)) + for i := byte(1); i <= 3; i++ { + require.NoError(t, cs1.ApplyChangeSets([]*proto.NamedChangeSet{ + {Name: keys.BankStoreKey, Changeset: proto.ChangeSet{Pairs: []*proto.KVPair{ + {Key: []byte("bal"), Value: []byte{i}}, + }}}, + {Name: keys.EVMStoreKey, Changeset: proto.ChangeSet{Pairs: []*proto.KVPair{ + {Key: evmKey, Value: padLeft32(i)}, + }}}, + })) + _, err = cs1.Commit() + require.NoError(t, err) + } + require.Nil(t, cs1.flatKV, "MemiavlOnly must not allocate flatkv before the migration") + require.NoError(t, cs1.Close()) + + preFlipVersion := int64(3) + + migrateCfg := config.DefaultStateCommitConfig() + migrateCfg.WriteMode = config.MigrateEVM + migrateCfg.KeysToMigratePerBlock = 1 + migrateCfg.MemIAVLConfig.AsyncCommitBuffer = 0 + + cs2, err := NewCompositeCommitStore(t.Context(), dir, migrateCfg) + require.NoError(t, err) + require.NoError(t, cs2.Initialize([]string{keys.BankStoreKey, keys.EVMStoreKey})) + _, err = cs2.LoadVersion(0, false) + require.NoError(t, err) + defer func() { _ = cs2.Close() }() + + require.Equal(t, preFlipVersion, cs2.Version()) + lastAtMigration := cs2.LastCommitInfo() + for _, si := range lastAtMigration.StoreInfos { + require.NotEqual(t, "evm_lattice", si.Name, + "opening migrate_evm must be AppHash-neutral at the already-committed height") + } + hasLattice := func(info *proto.CommitInfo) bool { + for _, si := range info.StoreInfos { + if si.Name == "evm_lattice" { + return true + } + } + return false + } + + require.NoError(t, cs2.ApplyChangeSets([]*proto.NamedChangeSet{ + {Name: keys.BankStoreKey, Changeset: proto.ChangeSet{Pairs: []*proto.KVPair{ + {Key: []byte("bal"), Value: []byte{0xFF}}, + }}}, + })) + working := cs2.WorkingCommitInfo() + require.True(t, hasLattice(working), + "the next block after the migration should include the flatkv lattice hash") + + _, err = cs2.Commit() + require.NoError(t, err) + last := cs2.LastCommitInfo() + require.True(t, hasLattice(last)) +} + // TestMigrationEntrySeedingIsIdempotentAcrossRestarts verifies that once // flatkv has been seeded and committed, a subsequent restart does not // re-seed (which would error out via the "non-empty store" guard). diff --git a/sei-db/state_db/sc/flatkv/store_meta.go b/sei-db/state_db/sc/flatkv/store_meta.go index 27ba94bef4..6b31aef1e7 100644 --- a/sei-db/state_db/sc/flatkv/store_meta.go +++ b/sei-db/state_db/sc/flatkv/store_meta.go @@ -190,6 +190,11 @@ func (s *CommitStore) SetInitialVersion(initialVersion int64) error { } s.committedVersion = seededVersion + if seededVersion > 0 { + if err := s.WriteSnapshot(""); err != nil { + return fmt.Errorf("flatkv: SetInitialVersion: write seeded snapshot: %w", err) + } + } logger.Info("FlatKV SetInitialVersion", "initialVersion", initialVersion, "seededVersion", seededVersion) return nil } diff --git a/sei-db/state_db/sc/flatkv/store_meta_test.go b/sei-db/state_db/sc/flatkv/store_meta_test.go index ed9a7415c7..ffa1412c18 100644 --- a/sei-db/state_db/sc/flatkv/store_meta_test.go +++ b/sei-db/state_db/sc/flatkv/store_meta_test.go @@ -3,6 +3,7 @@ package flatkv import ( "context" "encoding/binary" + "os" "path/filepath" "testing" @@ -160,6 +161,9 @@ func TestSetInitialVersion_HappyPath(t *testing.T) { require.NoError(t, s.SetInitialVersion(100)) require.Equal(t, int64(99), s.committedVersion) + target, err := os.Readlink(currentPath(s.flatkvDir())) + require.NoError(t, err) + require.Equal(t, snapshotName(99), target) addr := ktype.Address{0xAA} slot := ktype.Slot{0xBB} @@ -172,6 +176,26 @@ func TestSetInitialVersion_HappyPath(t *testing.T) { require.Equal(t, int64(100), s.Version()) } +func TestSetInitialVersion_GenesisSkipsSeededSnapshot(t *testing.T) { + s := setupTestStore(t) + defer s.Close() + + require.NoError(t, s.SetInitialVersion(1)) + require.Equal(t, int64(0), s.committedVersion) + target, err := os.Readlink(currentPath(s.flatkvDir())) + require.NoError(t, err) + require.Equal(t, snapshotName(0), target) + + addr := ktype.Address{0xAA} + slot := ktype.Slot{0xBB} + cs := makeChangeSet(evmStorageKey(addr, slot), padLeft32(0xCC), false) + require.NoError(t, s.ApplyChangeSets([]*proto.NamedChangeSet{cs})) + + v, err := s.Commit() + require.NoError(t, err) + require.Equal(t, int64(1), v, "first Commit after SetInitialVersion(1) must produce version 1") +} + func TestSetInitialVersion_RejectsAfterCommit(t *testing.T) { s := setupTestStore(t) defer s.Close() diff --git a/sei-db/state_db/sc/flatkv/verify.go b/sei-db/state_db/sc/flatkv/verify.go index 837d5fe28a..aed40a1c99 100644 --- a/sei-db/state_db/sc/flatkv/verify.go +++ b/sei-db/state_db/sc/flatkv/verify.go @@ -14,7 +14,7 @@ import ( // ApplyChangeSets writes are rejected (the on-disk scan cannot see them). // // Buffers every KV in memory (peak RSS ~2-3x on-disk size) and is not -// cancellable. Intended for tests and offline maintenance / cutover checks; +// cancellable. Intended for tests and offline maintenance / migration checks; // not suitable for online verification of production-sized state. func VerifyLtHash(s Store) error { cs, ok := s.(*CommitStore) diff --git a/sei-db/state_db/sc/migration/migration_manager.go b/sei-db/state_db/sc/migration/migration_manager.go index 432853fe83..86b68997a4 100644 --- a/sei-db/state_db/sc/migration/migration_manager.go +++ b/sei-db/state_db/sc/migration/migration_manager.go @@ -1,6 +1,7 @@ package migration import ( + "bytes" "encoding/binary" "errors" "fmt" @@ -36,6 +37,9 @@ type MigrationManager struct { // For writing values to the new database. newDBWriter DBWriter + // For preserving legacy key iteration while a module is migrating. + oldDBIteratorBuilder DBIteratorBuilder + // For iterating through key-value pairs to migrate in the old // database. iterator MigrationIterator @@ -51,8 +55,10 @@ type MigrationManager struct { // The version we want to migrate to. targetVersion uint64 - // Optional metrics sink. May be nil; all calls on this field go - // through nil-safe methods on *MigrationMetrics. + // Metrics sink. Always non-nil: NewMigrationManager substitutes a + // local-only *MigrationMetrics when the caller passes nil so the + // completion-summary aggregator (RunStats / Elapsed) keeps working + // even without a configured OTel exporter. metrics *MigrationMetrics } @@ -74,9 +80,12 @@ func NewMigrationManager( newDBReader DBReader, // For writing values to the new database. newDBWriter DBWriter, + // Optional iterator builder for preserving legacy old-DB iteration while migration is active. + oldDBIteratorBuilder DBIteratorBuilder, // For iterating through key-value pairs to migrate in the old database. iterator MigrationIterator, - // Optional metrics sink. Pass nil to disable metric emission. + // Optional metrics sink. Pass nil to skip OTel emission; the manager + // still aggregates run statistics locally for the completion summary. metrics *MigrationMetrics, ) (*MigrationManager, error) { @@ -143,19 +152,23 @@ func NewMigrationManager( "targetVersion", targetVersion, "boundary", boundary.String()) + if metrics == nil { + metrics = newLocalMigrationMetrics() + } metrics.SetVersion(currentMigrationVersion) metrics.SetBoundary(boundary) return &MigrationManager{ - oldDBReader: oldDBReader, - oldDBWriter: oldDBWriter, - newDBReader: newDBReader, - newDBWriter: newDBWriter, - iterator: iterator, - boundary: boundary, - migrationBatchSize: migrationBatchSize, - targetVersion: targetVersion, - metrics: metrics, + oldDBReader: oldDBReader, + oldDBWriter: oldDBWriter, + newDBReader: newDBReader, + newDBWriter: newDBWriter, + oldDBIteratorBuilder: oldDBIteratorBuilder, + iterator: iterator, + boundary: boundary, + migrationBatchSize: migrationBatchSize, + targetVersion: targetVersion, + metrics: metrics, }, nil } @@ -229,12 +242,30 @@ func (m *MigrationManager) Read(store string, key []byte) ([]byte, bool, error) // This key has already been migrated, read it from the new DB. return m.newDBReader(store, key) } - // This key has not been migrated, read it from the old DB. - return m.oldDBReader(store, key) + // This key has not been migrated, so existing source data still lives in + // the old DB. Brand-new writes created after migration starts are routed to + // the new DB to avoid chasing an ever-growing key tail, so fall back there + // if the old DB misses. + value, found, err := m.oldDBReader(store, key) + if err != nil || found { + return value, found, err + } + return m.newDBReader(store, key) } // ApplyChangeSets applies a batch of change sets to the database. // +// Block-commit semantics: this method must be called at most once per +// block-commit cycle so the migration boundary advances exactly once +// per block. The composite store enforces this by suppressing the +// duplicate empty-changeset call that rootmulti.Store.flush issues +// inside Commit after GetWorkingHash already drained the caller +// cache; without that suppression the iterator NextBatch + boundary +// rewrite here would run twice per block and perturb the working +// commit info after the AppHash was already returned to Tendermint. +// See CompositeCommitStore.ApplyChangeSets + +// migrationForwardedThisCommit for the gate. +// // Not safe for concurrent use; wrap with NewThreadSafeRouter. func (m *MigrationManager) ApplyChangeSets(changesets []*proto.NamedChangeSet) error { start := time.Now() @@ -273,34 +304,44 @@ func (m *MigrationManager) ApplyChangeSets(changesets []*proto.NamedChangeSet) e newDBPairsByStore := make(map[string]map[string]*proto.KVPair) // Create change sets that move the values to migrate from the old DB to the new DB. - var keyBytesThisBatch, valueBytesThisBatch int64 + batchStats := migrationBatchStats{ + keysMigrated: int64(len(valuesToMigrate)), + } for _, value := range valuesToMigrate { - keyBytesThisBatch += int64(len(value.Key)) - valueBytesThisBatch += int64(len(value.Value)) + batchStats.keyBytesMigrated += int64(len(value.Key)) + batchStats.valueBytesMigrated += int64(len(value.Value)) // Write the value to the new DB. putPair(newDBPairsByStore, value.ModuleName, &proto.KVPair{Key: value.Key, Value: value.Value}) // Delete the value from the old DB. putPair(oldDBPairsByStore, value.ModuleName, &proto.KVPair{Key: value.Key, Delete: true}) } - m.metrics.ReportKeysMigrated(int64(len(valuesToMigrate)), keyBytesThisBatch, valueBytesThisBatch) // For each pair in the original change sets, route to the appropriate database. // These must overwrite migrated values, so it's important to do this after we've collected // the change set for the migrated values. for _, changeSet := range changesets { for _, pair := range changeSet.Changeset.Pairs { - if m.boundary.IsMigrated(changeSet.Name, pair.Key) { + writeNew, err := m.shouldWriteOriginalPairToNewDB(changeSet.Name, pair.Key) + if err != nil { + return err + } + if writeNew { putPair(newDBPairsByStore, changeSet.Name, pair) + batchStats.originalPairsRoutedNewDB++ } else { putPair(oldDBPairsByStore, changeSet.Name, pair) + batchStats.originalPairsRoutedOldDB++ } } } - oldDBChangeSet := flattenPairsByStore(oldDBPairsByStore) - newDBChangeSets := flattenPairsByStore(newDBPairsByStore) + oldDBChangeSet, oldDBPairsWritten := flattenPairsByStore(oldDBPairsByStore) + newDBChangeSets, newDBPairsWritten := flattenPairsByStore(newDBPairsByStore) + batchStats.oldDBPairsWritten = oldDBPairsWritten + migrationComplete := m.boundary.Equals(MigrationBoundaryComplete) + metadataPairsWritten := int64(1) - if m.boundary.Equals(MigrationBoundaryComplete) { + if migrationComplete { // On the final block of the migration, update the migration version and delete the boundary. versionBytes := make([]byte, 8) binary.BigEndian.PutUint64(versionBytes, m.targetVersion) @@ -315,6 +356,7 @@ func (m *MigrationManager) ApplyChangeSets(changesets []*proto.NamedChangeSet) e // version gauge and the boundary-snapshot loop see the // completion at the same moment the DB does. m.metrics.SetVersion(m.targetVersion) + metadataPairsWritten = 2 } else { // On every other block of the migration, update the boundary. newDBChangeSets = append(newDBChangeSets, &proto.NamedChangeSet{ @@ -324,6 +366,7 @@ func (m *MigrationManager) ApplyChangeSets(changesets []*proto.NamedChangeSet) e }}, }) } + batchStats.newDBPairsWritten = newDBPairsWritten + metadataPairsWritten if err := m.oldDBWriter(oldDBChangeSet); err != nil { return fmt.Errorf("failed to apply changes to old database: %w", err) @@ -332,9 +375,44 @@ func (m *MigrationManager) ApplyChangeSets(changesets []*proto.NamedChangeSet) e return fmt.Errorf("failed to apply changes to new database: %w", err) } + m.metrics.RecordBatch(batchStats) + if migrationComplete { + m.logMigrationCompleteSummary() + } + return nil } +func (m *MigrationManager) logMigrationCompleteSummary() { + stats := m.metrics.RunStats() + logger.Info("migration complete", + "targetVersion", m.targetVersion, + "batches", stats.batches, + "keysMigrated", stats.keysMigrated, + "keyBytesMigrated", stats.keyBytesMigrated, + "valueBytesMigrated", stats.valueBytesMigrated, + "originalPairsRoutedOldDB", stats.originalPairsRoutedOldDB, + "originalPairsRoutedNewDB", stats.originalPairsRoutedNewDB, + "oldDBPairsWritten", stats.oldDBPairsWritten, + "newDBPairsWritten", stats.newDBPairsWritten, + "elapsed", m.metrics.Elapsed()) +} + +func (m *MigrationManager) shouldWriteOriginalPairToNewDB(store string, key []byte) (bool, error) { + if m.boundary.IsMigrated(store, key) { + return true, nil + } + _, foundInOld, err := m.oldDBReader(store, key) + if err != nil { + return false, fmt.Errorf("failed to check old database for store %q key %x: %w", store, key, err) + } + // Existing not-yet-migrated keys stay in old DB so their latest value is + // picked up when the iterator reaches them. Brand-new keys go straight to + // new DB; otherwise continuously-created monotonically increasing keys can + // keep the migration from ever reaching completion. + return !foundInOld, nil +} + // putPair inserts pair into dest under (store, pair.Key), creating the inner // map on demand. Later writes to the same (store, key) overwrite earlier ones. func putPair(dest map[string]map[string]*proto.KVPair, store string, pair *proto.KVPair) { @@ -349,7 +427,7 @@ func putPair(dest map[string]map[string]*proto.KVPair, store string, pair *proto // flattenPairsByStore collapses a store-keyed map of (key -> KVPair) into one // NamedChangeSet per store, with stores and pairs emitted in sorted order for // deterministic downstream writes. -func flattenPairsByStore(pairsByStore map[string]map[string]*proto.KVPair) []*proto.NamedChangeSet { +func flattenPairsByStore(pairsByStore map[string]map[string]*proto.KVPair) ([]*proto.NamedChangeSet, int64) { storeNames := make([]string, 0, len(pairsByStore)) for name := range pairsByStore { storeNames = append(storeNames, name) @@ -357,6 +435,7 @@ func flattenPairsByStore(pairsByStore map[string]map[string]*proto.KVPair) []*pr sort.Strings(storeNames) changeSets := make([]*proto.NamedChangeSet, 0, len(storeNames)) + var pairCount int64 for _, name := range storeNames { byKey := pairsByStore[name] pairs := make([]*proto.KVPair, 0, len(byKey)) @@ -364,14 +443,15 @@ func flattenPairsByStore(pairsByStore map[string]map[string]*proto.KVPair) []*pr pairs = append(pairs, pair) } sort.Slice(pairs, func(i, j int) bool { - return string(pairs[i].Key) < string(pairs[j].Key) + return bytes.Compare(pairs[i].Key, pairs[j].Key) < 0 }) + pairCount += int64(len(pairs)) changeSets = append(changeSets, &proto.NamedChangeSet{ Name: name, Changeset: proto.ChangeSet{Pairs: pairs}, }) } - return changeSets + return changeSets, pairCount } // GetProof implements [Router]. @@ -382,9 +462,21 @@ func (m *MigrationManager) GetProof(store string, key []byte) (*ics23.Commitment // Iterator implements [Router]. func (m *MigrationManager) Iterator(store string, start []byte, end []byte, ascending bool) (db.Iterator, error) { - // Eventually we will implement iteration for some modules within FlatKV, but never for the evm/ module. - // Since we're migrating the evm/ module first, implementing iteration for FlatKV is not a blocker. - return nil, fmt.Errorf("iteration not supported for store %q", store) + if store == MigrationStore { + return nil, fmt.Errorf("iteration from the 'migration' module is not permitted") + } + if m.boundary.Equals(MigrationBoundaryComplete) { + return nil, fmt.Errorf("iteration not supported after migration completion for store %q", store) + } + if m.oldDBIteratorBuilder == nil { + return nil, fmt.Errorf("iteration not supported for store %q", store) + } + + // Preserve the legacy memiavl iterator path during the migration window. + // This keeps existing EndBlock cleanup code that still does prefix scans + // from panicking before the first migration batch commits, without adding a + // per-block full FlatKV scan on large stores. + return m.oldDBIteratorBuilder(store, start, end, ascending) } // BuildRoute returns a Route that dispatches the given module names to diff --git a/sei-db/state_db/sc/migration/migration_manager_test.go b/sei-db/state_db/sc/migration/migration_manager_test.go index 8d310a2340..4463a0e549 100644 --- a/sei-db/state_db/sc/migration/migration_manager_test.go +++ b/sei-db/state_db/sc/migration/migration_manager_test.go @@ -9,6 +9,7 @@ import ( "github.com/sei-protocol/sei-chain/sei-db/proto" "github.com/stretchr/testify/require" + dbm "github.com/tendermint/tm-db" ) // mockDB is a simple in-memory key-value store that records every batch of @@ -102,12 +103,23 @@ func newTestManager( newReader DBReader, newWriter DBWriter, iter MigrationIterator, size int, +) (*MigrationManager, error) { + return newTestManagerWithOldIteratorBuilder(t, oldReader, oldWriter, newReader, newWriter, nil, iter, size) +} + +func newTestManagerWithOldIteratorBuilder( + t *testing.T, + oldReader DBReader, oldWriter DBWriter, + newReader DBReader, newWriter DBWriter, + oldIteratorBuilder DBIteratorBuilder, + iter MigrationIterator, + size int, ) (*MigrationManager, error) { t.Helper() return NewMigrationManager( size, testStartVersion, testTargetVersion, - oldReader, oldWriter, newReader, newWriter, iter, + oldReader, oldWriter, newReader, newWriter, oldIteratorBuilder, iter, nil, ) } @@ -514,6 +526,16 @@ func TestApplyChangeSets_FullMigration(t *testing.T) { require.False(t, ok) _, ok = oldDB.get("staking", "x") require.False(t, ok, "final call still issues migration deletes to old DB") + + stats := mgr.metrics.RunStats() + require.Equal(t, int64(2), stats.batches) + require.Equal(t, int64(3), stats.keysMigrated) + require.Equal(t, int64(3), stats.keyBytesMigrated) + require.Equal(t, int64(3), stats.valueBytesMigrated) + require.Equal(t, int64(0), stats.originalPairsRoutedOldDB) + require.Equal(t, int64(0), stats.originalPairsRoutedNewDB) + require.Equal(t, int64(3), stats.oldDBPairsWritten) + require.Equal(t, int64(6), stats.newDBPairsWritten) } func TestApplyChangeSets_RecreateManagerResumesWhereLeftOff(t *testing.T) { @@ -897,7 +919,7 @@ func TestNewMigrationManager_NilDependencies(t *testing.T) { // ApplyChangeSets takes the post-completion fast path. This is what // keeps a migration-mode WriteMode safe to leave configured // indefinitely after the migration completes - operators don't need -// to flip a config setting on the first restart past the cutover. +// to flip a config setting on the first restart past migration completion. func TestNewMigrationManager_AcceptsNewDBAtTargetVersion(t *testing.T) { oldDB := newMockDB() oldDB.seed(map[string]map[string][]byte{ @@ -945,6 +967,37 @@ func TestNewMigrationManager_AcceptsNewDBAtTargetVersion(t *testing.T) { "old DB must not be written when manager comes up post-completion") } +func TestIterator_DoesNotReadOldDBAfterMigrationComplete(t *testing.T) { + oldDB := newMockDB() + newDB := newMockDB() + + oldIteratorErr := fmt.Errorf("old iterator called") + oldIteratorCalls := 0 + oldIteratorBuilder := func(string, []byte, []byte, bool) (dbm.Iterator, error) { + oldIteratorCalls++ + return nil, oldIteratorErr + } + mgr, err := newTestManagerWithOldIteratorBuilder(t, + oldDB.reader(), oldDB.writer(), + newDB.reader(), newDB.writer(), + oldIteratorBuilder, + NewMockMigrationIterator(nil, false), 10, + ) + require.NoError(t, err) + + itr, err := mgr.Iterator("bank", nil, nil, true) + require.ErrorIs(t, err, oldIteratorErr) + require.Nil(t, itr) + require.Equal(t, 1, oldIteratorCalls) + + mgr.boundary = MigrationBoundaryComplete + itr, err = mgr.Iterator("bank", nil, nil, true) + require.Error(t, err) + require.Nil(t, itr) + require.Contains(t, err.Error(), "migration completion") + require.Equal(t, 1, oldIteratorCalls, "complete migration must not keep serving stale old-DB iterators") +} + // --- Issue 7: old-DB changeset grouping --- func TestApplyChangeSets_OldDBChangeSetGroupedByStore(t *testing.T) { @@ -1299,6 +1352,78 @@ func TestBuildRoute_WriterMidMigrationDrivesMigration(t *testing.T) { "writer must advance the boundary; if it bypassed the manager the boundary would not move") } +func TestApplyChangeSets_BrandNewKeysDoNotExtendMigrationTail(t *testing.T) { + data := map[string]map[string][]byte{ + "evm": {"a": []byte("old-a"), "b": []byte("old-b")}, + } + oldDB := newMockDB() + oldDB.seed(copyData(data)) + newDB := newMockDB() + mgr, err := newTestManager(t, + oldDB.reader(), oldDB.writer(), + newDB.reader(), newDB.writer(), + NewMockMigrationIterator(copyData(data), false), 1, + ) + require.NoError(t, err) + + require.NoError(t, mgr.ApplyChangeSets([]*proto.NamedChangeSet{{ + Name: "evm", + Changeset: proto.ChangeSet{Pairs: []*proto.KVPair{ + {Key: []byte("z-new-1"), Value: []byte("new-1")}, + }}, + }})) + _, ok := oldDB.get("evm", "z-new-1") + require.False(t, ok, "brand-new keys must not be written to old DB or migration can chase a growing tail") + val, ok := newDB.get("evm", "z-new-1") + require.True(t, ok) + require.Equal(t, []byte("new-1"), val) + + val, ok, err = mgr.Read("evm", []byte("z-new-1")) + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, []byte("new-1"), val, "unmigrated-range reads must fall back to new DB for brand-new keys") + + require.NoError(t, mgr.ApplyChangeSets([]*proto.NamedChangeSet{{ + Name: "evm", + Changeset: proto.ChangeSet{Pairs: []*proto.KVPair{ + {Key: []byte("z-new-2"), Value: []byte("new-2")}, + }}, + }})) + require.True(t, mgr.boundary.Equals(MigrationBoundaryComplete)) + versionBytes, ok := newDB.get(MigrationStore, MigrationVersionKey) + require.True(t, ok, "migration should complete instead of chasing z-new-* keys forever") + require.Len(t, versionBytes, 8) +} + +func TestApplyChangeSets_ExistingUnmigratedKeyStillWritesOldDB(t *testing.T) { + data := map[string]map[string][]byte{ + "evm": {"a": []byte("old-a"), "b": []byte("old-b"), "c": []byte("old-c")}, + } + oldDB := newMockDB() + oldDB.seed(copyData(data)) + newDB := newMockDB() + mgr, err := newTestManager(t, + oldDB.reader(), oldDB.writer(), + newDB.reader(), newDB.writer(), + NewMockMigrationIterator(copyData(data), false), 1, + ) + require.NoError(t, err) + + require.NoError(t, mgr.ApplyChangeSets([]*proto.NamedChangeSet{{ + Name: "evm", + Changeset: proto.ChangeSet{Pairs: []*proto.KVPair{ + {Key: []byte("c"), Value: []byte("updated-c")}, + }}, + }})) + + val, ok := oldDB.get("evm", "c") + require.True(t, ok) + require.Equal(t, []byte("updated-c"), val, + "existing not-yet-migrated keys must remain in old DB until the iterator reaches them") + _, ok = newDB.get("evm", "c") + require.False(t, ok) +} + func TestBuildRoute_WriterRejectsMigrationStore(t *testing.T) { mgr, _, _ := inProgressManager(t) route, err := mgr.BuildRoute(MigrationStore) diff --git a/sei-db/state_db/sc/migration/migration_metrics.go b/sei-db/state_db/sc/migration/migration_metrics.go index 43320e7196..74e5229e81 100644 --- a/sei-db/state_db/sc/migration/migration_metrics.go +++ b/sei-db/state_db/sc/migration/migration_metrics.go @@ -12,11 +12,53 @@ import ( commonmetrics "github.com/sei-protocol/sei-chain/sei-db/common/metrics" ) -// MigrationMetrics holds OpenTelemetry metrics for a MigrationManager. -// Metrics are exported via whatever exporter is configured on the global -// OTel MeterProvider. All methods are nil-safe so callers (and tests) -// that do not care about metrics can pass a nil *MigrationMetrics to the -// manager. +// migrationRunStats holds the in-process aggregate counts for a single +// MigrationManager run. Populated by MigrationMetrics.RecordBatch and +// emitted via MigrationManager.logMigrationCompleteSummary at completion. +// These are not persisted; after a restart they summarize the resumed +// segment only. +type migrationRunStats struct { + batches int64 + keysMigrated int64 + keyBytesMigrated int64 + valueBytesMigrated int64 + originalPairsRoutedOldDB int64 + originalPairsRoutedNewDB int64 + oldDBPairsWritten int64 + newDBPairsWritten int64 +} + +// migrationBatchStats captures the per-ApplyChangeSets counters that +// MigrationManager hands to MigrationMetrics.RecordBatch. +type migrationBatchStats struct { + keysMigrated int64 + keyBytesMigrated int64 + valueBytesMigrated int64 + originalPairsRoutedOldDB int64 + originalPairsRoutedNewDB int64 + oldDBPairsWritten int64 + newDBPairsWritten int64 +} + +// MigrationMetrics has two responsibilities, intentionally colocated so +// the manager only has to track one collaborator: +// +// 1. OTel telemetry sink. NewMigrationMetrics wires counters/gauges +// through the global MeterProvider; SetVersion, SetBoundary, +// RecordBatch, RecordApplyDuration, and the boundary snapshot loop +// all emit through it. When OTel handles are absent (nil exporter, +// newLocalMigrationMetrics) the corresponding Record/Add calls are +// skipped per-counter, so emission is best-effort. +// +// 2. In-process run-stat aggregator. RecordBatch also accumulates a +// migrationRunStats summary under mu; MigrationManager reads it via +// RunStats / Elapsed to emit the completion log. The aggregator +// keeps working even when no OTel exporter is configured (tests, +// embedded use), which is why NewMigrationManager substitutes a +// local-only metrics instance when the caller passes nil. +// +// All methods are nil-safe. Callers (and tests) that do not care about +// metrics can pass a nil *MigrationMetrics to NewMigrationManager. // // Unit convention: durations in "s", bytes in "By", counts via curly-brace // annotations (UCUM, https://ucum.org/ucum). @@ -33,16 +75,26 @@ type MigrationMetrics struct { // migration has completed and it can stop emitting labeled series. targetVersion uint64 - keysMigratedTotal metric.Int64Counter - keyBytesMigratedTotal metric.Int64Counter - valueBytesMigratedTotal metric.Int64Counter - applyDuration metric.Float64Histogram - version metric.Int64Gauge - boundarySnapshot metric.Int64Gauge + keysMigratedTotal metric.Int64Counter + keyBytesMigratedTotal metric.Int64Counter + valueBytesMigratedTotal metric.Int64Counter + batchesTotal metric.Int64Counter + originalPairsRoutedOldDBTotal metric.Int64Counter + originalPairsRoutedNewDBTotal metric.Int64Counter + oldDBPairsWrittenTotal metric.Int64Counter + newDBPairsWrittenTotal metric.Int64Counter + applyDuration metric.Float64Histogram + version metric.Int64Gauge + boundarySnapshot metric.Int64Gauge + + // startedAt is captured when the metrics object is constructed and + // reported as the elapsed-time anchor in the completion summary. + startedAt time.Time mu sync.Mutex currentBoundary MigrationBoundary currentVersion uint64 + runStats migrationRunStats } // NewMigrationMetrics constructs a MigrationMetrics using the global OTel @@ -79,6 +131,31 @@ func NewMigrationMetrics( metric.WithDescription("Running sum of bytes of migrated values (len(Value))"), metric.WithUnit("By"), ) + batchesTotal, _ := meter.Int64Counter( + "seidb_migration_batches_total", + metric.WithDescription("Total ApplyChangeSets calls processed by MigrationManager"), + metric.WithUnit("{count}"), + ) + originalPairsRoutedOldDBTotal, _ := meter.Int64Counter( + "seidb_migration_original_pairs_routed_old_db_total", + metric.WithDescription("Caller-supplied KV pairs routed to the old DB during active migration"), + metric.WithUnit("{count}"), + ) + originalPairsRoutedNewDBTotal, _ := meter.Int64Counter( + "seidb_migration_original_pairs_routed_new_db_total", + metric.WithDescription("Caller-supplied KV pairs routed to the new DB during active migration"), + metric.WithUnit("{count}"), + ) + oldDBPairsWrittenTotal, _ := meter.Int64Counter( + "seidb_migration_old_db_pairs_written_total", + metric.WithDescription("KV pairs written to the old DB per ApplyChangeSets (migration deletes + caller writes)"), + metric.WithUnit("{count}"), + ) + newDBPairsWrittenTotal, _ := meter.Int64Counter( + "seidb_migration_new_db_pairs_written_total", + metric.WithDescription("KV pairs written to the new DB per ApplyChangeSets (migrated values + caller writes + boundary metadata)"), + metric.WithUnit("{count}"), + ) applyDuration, _ := meter.Float64Histogram( "seidb_migration_apply_change_sets_duration_seconds", metric.WithDescription("Wall-clock time spent in each MigrationManager.ApplyChangeSets call"), @@ -99,15 +176,21 @@ func NewMigrationMetrics( ) m := &MigrationMetrics{ - ctx: ctx, - cancel: cancel, - targetVersion: targetVersion, - keysMigratedTotal: keysMigratedTotal, - keyBytesMigratedTotal: keyBytesMigratedTotal, - valueBytesMigratedTotal: valueBytesMigratedTotal, - applyDuration: applyDuration, - version: version, - boundarySnapshot: boundarySnapshot, + ctx: ctx, + cancel: cancel, + targetVersion: targetVersion, + keysMigratedTotal: keysMigratedTotal, + keyBytesMigratedTotal: keyBytesMigratedTotal, + valueBytesMigratedTotal: valueBytesMigratedTotal, + batchesTotal: batchesTotal, + originalPairsRoutedOldDBTotal: originalPairsRoutedOldDBTotal, + originalPairsRoutedNewDBTotal: originalPairsRoutedNewDBTotal, + oldDBPairsWrittenTotal: oldDBPairsWrittenTotal, + newDBPairsWrittenTotal: newDBPairsWrittenTotal, + applyDuration: applyDuration, + version: version, + boundarySnapshot: boundarySnapshot, + startedAt: time.Now(), } if boundarySnapshotInterval > 0 { @@ -145,25 +228,87 @@ func (m *MigrationMetrics) SetVersion(v uint64) { } } -// ReportKeysMigrated records that a batch of (count) keys totaling -// (keyBytes, valueBytes) were migrated in a single ApplyChangeSets call. -// Pass zeros to skip; the method is a no-op on nil receiver. -func (m *MigrationMetrics) ReportKeysMigrated(count int64, keyBytes int64, valueBytes int64) { +// newLocalMigrationMetrics returns a *MigrationMetrics with no OTel +// counters wired up but with a live startedAt and runStats aggregator. +// Used internally by MigrationManager when the caller passes nil so +// completion-summary aggregation does not depend on a configured +// MeterProvider. +func newLocalMigrationMetrics() *MigrationMetrics { + return &MigrationMetrics{startedAt: time.Now()} +} + +// RecordBatch aggregates per-ApplyChangeSets counters into the run +// summary and emits the corresponding OTel counters. Safe to call on a +// nil receiver. Counters with no configured exporter (e.g. tests using +// newLocalMigrationMetrics) are skipped individually; in-process +// aggregation still runs so the completion summary stays accurate. +func (m *MigrationMetrics) RecordBatch(batch migrationBatchStats) { if m == nil { return } + + m.mu.Lock() + m.runStats.batches++ + m.runStats.keysMigrated += batch.keysMigrated + m.runStats.keyBytesMigrated += batch.keyBytesMigrated + m.runStats.valueBytesMigrated += batch.valueBytesMigrated + m.runStats.originalPairsRoutedOldDB += batch.originalPairsRoutedOldDB + m.runStats.originalPairsRoutedNewDB += batch.originalPairsRoutedNewDB + m.runStats.oldDBPairsWritten += batch.oldDBPairsWritten + m.runStats.newDBPairsWritten += batch.newDBPairsWritten + m.mu.Unlock() + ctx := context.Background() - if m.keysMigratedTotal != nil && count > 0 { - m.keysMigratedTotal.Add(ctx, count) + if m.batchesTotal != nil { + m.batchesTotal.Add(ctx, 1) + } + if m.keysMigratedTotal != nil && batch.keysMigrated > 0 { + m.keysMigratedTotal.Add(ctx, batch.keysMigrated) } - if m.keyBytesMigratedTotal != nil && keyBytes > 0 { - m.keyBytesMigratedTotal.Add(ctx, keyBytes) + if m.keyBytesMigratedTotal != nil && batch.keyBytesMigrated > 0 { + m.keyBytesMigratedTotal.Add(ctx, batch.keyBytesMigrated) } - if m.valueBytesMigratedTotal != nil && valueBytes > 0 { - m.valueBytesMigratedTotal.Add(ctx, valueBytes) + if m.valueBytesMigratedTotal != nil && batch.valueBytesMigrated > 0 { + m.valueBytesMigratedTotal.Add(ctx, batch.valueBytesMigrated) + } + if m.originalPairsRoutedOldDBTotal != nil && batch.originalPairsRoutedOldDB > 0 { + m.originalPairsRoutedOldDBTotal.Add(ctx, batch.originalPairsRoutedOldDB) + } + if m.originalPairsRoutedNewDBTotal != nil && batch.originalPairsRoutedNewDB > 0 { + m.originalPairsRoutedNewDBTotal.Add(ctx, batch.originalPairsRoutedNewDB) + } + if m.oldDBPairsWrittenTotal != nil && batch.oldDBPairsWritten > 0 { + m.oldDBPairsWrittenTotal.Add(ctx, batch.oldDBPairsWritten) + } + if m.newDBPairsWrittenTotal != nil && batch.newDBPairsWritten > 0 { + m.newDBPairsWrittenTotal.Add(ctx, batch.newDBPairsWritten) } } +// RunStats returns a copy of the in-process aggregated counters under the +// metrics mutex. Returns the zero value on a nil receiver. Reserved for +// MigrationManager.logMigrationCompleteSummary and tests; callers must +// not mutate the returned struct. +func (m *MigrationMetrics) RunStats() migrationRunStats { + if m == nil { + return migrationRunStats{} + } + m.mu.Lock() + defer m.mu.Unlock() + return m.runStats +} + +// Elapsed returns the wall-clock duration since the metrics object was +// constructed. Returns zero on a nil receiver, or on a zero-value +// MigrationMetrics that skipped the construction helpers; the latter +// guard prevents tens-of-years durations from accidental struct literals. +func (m *MigrationMetrics) Elapsed() time.Duration { + if m == nil || m.startedAt.IsZero() { + return 0 + } + return time.Since(m.startedAt) +} + // RecordApplyDuration records the wall-clock time spent in a single // ApplyChangeSets call. Invoked from a defer in the manager so both // success and error paths are captured. @@ -220,12 +365,17 @@ func (m *MigrationMetrics) startBoundarySnapshotLoop(interval time.Duration) { }() } -// Release resources held by the metrics collector. +// Release resources held by the metrics collector. Safe on nil receivers +// and on instances constructed without a context (e.g. +// newLocalMigrationMetrics, which has no boundary-snapshot goroutine to +// stop and therefore no cancel func to call). func (m *MigrationMetrics) Close() { if m == nil { return } - m.cancel() + if m.cancel != nil { + m.cancel() + } m.wg.Wait() } diff --git a/sei-db/state_db/sc/migration/migration_transitions_test.go b/sei-db/state_db/sc/migration/migration_transitions_test.go index fed35fdaa2..42e573026e 100644 --- a/sei-db/state_db/sc/migration/migration_transitions_test.go +++ b/sei-db/state_db/sc/migration/migration_transitions_test.go @@ -12,7 +12,7 @@ import ( // Test the MigrateEVM data migration. At the start of this migration, all data lives in memIAVL. // At the end of this migration, all evm/ data lives in flatkv, and all other data remains in memIAVL. // -// This test evaluates the 0->1 migration path. +// This test evaluates the FlatKV EVM migrate (0 -> 1) path. func TestMigrateEVM(t *testing.T) { rng := testutil.NewTestRandom() diff --git a/sei-db/state_db/sc/migration/migration_version_test.go b/sei-db/state_db/sc/migration/migration_version_test.go index b44dc5ae88..8fe9a061c0 100644 --- a/sei-db/state_db/sc/migration/migration_version_test.go +++ b/sei-db/state_db/sc/migration/migration_version_test.go @@ -78,6 +78,7 @@ func TestMigrationManager_AtTargetVersion_ComesUpInPassthrough(t *testing.T) { 0, 7, oldDB.reader(), oldDB.writer(), newDB.reader(), newDB.writer(), + nil, NewMockMigrationIterator(nil, false), nil, ) @@ -110,6 +111,7 @@ func TestMigrationManager_NilHandlesRejected(t *testing.T) { 0, 1, tc.oldReader, tc.oldWriter, newDB.reader(), newDB.writer(), + nil, tc.iter, nil, ) @@ -145,6 +147,7 @@ func TestMigrationManager_AbsentInNewDB_DefaultsToStartVersion(t *testing.T) { 5, 6, oldDB.reader(), oldDB.writer(), newDB.reader(), newDB.writer(), + nil, NewMockMigrationIterator(copyData(data), false), nil, ) @@ -177,6 +180,7 @@ func TestMigrationManager_AtStartVersionInNewDB_RunsMigration(t *testing.T) { 5, 6, oldDB.reader(), oldDB.writer(), newDB.reader(), newDB.writer(), + nil, NewMockMigrationIterator(copyData(data), false), nil, ) @@ -214,6 +218,7 @@ func TestMigrationManager_OldDBVersionKeyIgnored(t *testing.T) { 5, 6, oldDB.reader(), oldDB.writer(), newDB.reader(), newDB.writer(), + nil, NewMockMigrationIterator(copyData(data), false), nil, ) @@ -245,6 +250,7 @@ func TestMigrationManager_AtStartVersionInNewDB_WithBoundary_Resumes(t *testing. 5, 6, oldDB.reader(), oldDB.writer(), newDB.reader(), newDB.writer(), + nil, NewMockMigrationIterator(copyData(data), false), nil, ) @@ -265,6 +271,7 @@ func TestMigrationManager_AtStartVersionAbsent_RunsMigration(t *testing.T) { 0, 1, oldDB.reader(), oldDB.writer(), newDB.reader(), newDB.writer(), + nil, NewMockMigrationIterator(copyData(data), false), nil, ) @@ -286,6 +293,7 @@ func TestMigrationManager_UnexpectedVersionInNewDB_Errors(t *testing.T) { 5, 10, oldDB.reader(), oldDB.writer(), newDB.reader(), newDB.writer(), + nil, NewMockMigrationIterator(nil, false), nil, ) @@ -313,6 +321,7 @@ func TestMigrationManager_AtTargetVersion_OldDBVersionIgnored(t *testing.T) { 5, 6, oldDB.reader(), oldDB.writer(), newDB.reader(), newDB.writer(), + nil, NewMockMigrationIterator(nil, false), nil, ) @@ -328,6 +337,7 @@ func TestMigrationManager_StartVersionMustBeLessThanTarget(t *testing.T) { 5, 5, oldDB.reader(), oldDB.writer(), newDB.reader(), newDB.writer(), + nil, NewMockMigrationIterator(nil, false), nil, ) @@ -354,6 +364,7 @@ func TestMigrationManager_FinalCallWritesVersionAtomically(t *testing.T) { 0, 1, oldDB.reader(), oldDB.writer(), newDB.reader(), newDB.writer(), + nil, NewMockMigrationIterator(copyData(data), false), nil, ) @@ -448,6 +459,7 @@ func TestMigrationManager_FinalCallSubsequentCallsPostCompletion(t *testing.T) { 0, 1, oldDB.reader(), oldDB.writer(), newDB.reader(), newDB.writer(), + nil, NewMockMigrationIterator(copyData(data), false), nil, ) diff --git a/sei-db/state_db/sc/migration/router_builder.go b/sei-db/state_db/sc/migration/router_builder.go index 73261e8ccb..007fec3679 100644 --- a/sei-db/state_db/sc/migration/router_builder.go +++ b/sei-db/state_db/sc/migration/router_builder.go @@ -131,7 +131,7 @@ func buildMemiavlOnlyRouter( return router, nil } -/* Data flow: MigrateEVM (0 -> 1) +/* Data flow: FlatKV EVM migrate (0 -> 1) ┌──────────────┐ ┌─────────┐ ──all-modules────────▶ │ moduleRouter │ ──everything-except-evm/───────▶ │ memIAVL │ @@ -173,6 +173,7 @@ func buildMigrateEVMRouter( buildMemIAVLWriter(memIAVL), buildFlatKVReader(flatKV), buildFlatKVWriter(flatKV), + buildMemIAVLIteratorBuilder(memIAVL), NewMemiavlMigrationIterator(memIAVL.GetDB(), []string{keys.EVMStoreKey}), NewMigrationMetrics(ctx, Version1_MigrateEVM, 10*time.Second), ) @@ -302,6 +303,7 @@ func buildMigrateAllButBankRouter( buildMemIAVLWriter(memIAVL), buildFlatKVReader(flatKV), buildFlatKVWriter(flatKV), + buildMemIAVLIteratorBuilder(memIAVL), NewMemiavlMigrationIterator(memIAVL.GetDB(), allModulesButEvmAndBank), NewMigrationMetrics(ctx, Version2_MigrateAllButBank, 10*time.Second), ) @@ -433,6 +435,7 @@ func buildMigrateBankRouter( buildMemIAVLWriter(memIAVL), buildFlatKVReader(flatKV), buildFlatKVWriter(flatKV), + buildMemIAVLIteratorBuilder(memIAVL), NewMemiavlMigrationIterator(memIAVL.GetDB(), []string{keys.BankStoreKey}), NewMigrationMetrics(ctx, Version3_FlatKVOnly, 10*time.Second), ) diff --git a/sei-db/tools/cmd/seidb/main.go b/sei-db/tools/cmd/seidb/main.go index ffb46fc93e..b0a4f728ab 100644 --- a/sei-db/tools/cmd/seidb/main.go +++ b/sei-db/tools/cmd/seidb/main.go @@ -29,7 +29,8 @@ func main() { operations.MemiavlLatestVersionCmd(), operations.ImportFlatKVFromMemiavlCmd(), operations.ReplayChangelogCmd(), - operations.TraceProfileReportCmd()) + operations.TraceProfileReportCmd(), + operations.MigrateEvmStatusCmd()) if err := rootCmd.Execute(); err != nil { fmt.Println(err) os.Exit(1) diff --git a/sei-db/tools/cmd/seidb/operations/flatkv_open_test.go b/sei-db/tools/cmd/seidb/operations/flatkv_open_test.go index bc16ed84e6..f191272d5a 100644 --- a/sei-db/tools/cmd/seidb/operations/flatkv_open_test.go +++ b/sei-db/tools/cmd/seidb/operations/flatkv_open_test.go @@ -269,6 +269,30 @@ func TestOpenFlatKVReadOnlyLatestAndHistoricalHeight(t *testing.T) { require.NoError(t, historical.Close()) } +func TestOpenFlatKVReadOnlyAfterSetInitialVersion(t *testing.T) { + store, dbDir := newDiskBackedFlatKVStore(t) + addr := addrN(0xC3) + + require.NoError(t, store.SetInitialVersion(100)) + require.NoError(t, store.ApplyChangeSets([]*proto.NamedChangeSet{{ + Name: keys.EVMStoreKey, + Changeset: proto.ChangeSet{Pairs: []*proto.KVPair{ + noncePair(addr, 7), + }}, + }})) + v, err := store.Commit() + require.NoError(t, err) + require.Equal(t, int64(100), v) + require.NoError(t, store.Close()) + + latest, err := openFlatKVReadOnly(dbDir, 0) + require.NoError(t, err) + require.Equal(t, int64(100), latest.Version()) + _, found := latest.Get(keys.EVMStoreKey, keys.BuildEVMKey(keys.EVMKeyNonce, addr[:])) + require.True(t, found) + require.NoError(t, latest.Close()) +} + func newDiskBackedFlatKVStore(t *testing.T) (*flatkv.CommitStore, string) { t.Helper() diff --git a/sei-db/tools/cmd/seidb/operations/migrate_evm_status.go b/sei-db/tools/cmd/seidb/operations/migrate_evm_status.go new file mode 100644 index 0000000000..30a2d3cf1d --- /dev/null +++ b/sei-db/tools/cmd/seidb/operations/migrate_evm_status.go @@ -0,0 +1,111 @@ +package operations + +import ( + "encoding/binary" + "encoding/hex" + "encoding/json" + "fmt" + + "github.com/sei-protocol/sei-chain/sei-db/state_db/sc/migration" + "github.com/spf13/cobra" +) + +// MigrateEvmStatusCmd is the seidb subcommand that reports the on-disk +// FlatKV EVM migrate flow state of a FlatKV directory. +// +// It exists so that the cluster-level integration test driver can poll +// "is migration done yet?" against each validator's data dir from the +// host without having to add a custom RPC handler or grep through node +// logs. JSON output is the contract; the test runner shells out to this +// tool, parses the result, and decides when to advance. +// +// Concretely it reads two reserved keys from the FlatKV migration +// store: +// +// - migration-version: an 8-byte big-endian uint64 written exactly +// once per migration lifecycle on the bump block. Absent or zero +// means MigrateEVM has not yet completed. +// - migration-boundary: the in-flight cursor encoding the +// (module, key) pair the next batch should resume from. Present iff +// the boundary is somewhere strictly between not-started and +// complete. +// +// To stay aligned with the rest of the seidb tools the read goes +// through openFlatKVReadOnly, which hardlink-clones the latest snapshot +// + copies the WAL into a temp dir before opening. That avoids +// contending with a live node for the FlatKV writer lock and gives the +// tool a stable view even if the live writer rolls snapshots mid-run. +func MigrateEvmStatusCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "migrate-evm-status", + Short: "Report on-disk FlatKV EVM migrate status as JSON", + Long: "Reads the migration-version and migration-boundary keys " + + "from a FlatKV store and prints a JSON summary of the MigrateEVM " + + "migration state. Intended for use by integration test drivers " + + "polling for migration completion from the host.", + Run: executeMigrateEvmStatus, + } + cmd.PersistentFlags().StringP("db-dir", "d", "", "FlatKV database directory") + cmd.PersistentFlags().Int64("height", 0, "FlatKV target version; 0 selects the latest available version") + return cmd +} + +func executeMigrateEvmStatus(cmd *cobra.Command, _ []string) { + dbDir, _ := cmd.Flags().GetString("db-dir") + height, _ := cmd.Flags().GetInt64("height") + + if dbDir == "" { + panic("Must provide --db-dir pointing at a FlatKV data directory") + } + + store, err := openFlatKVReadOnly(dbDir, height) + if err != nil { + panic(fmt.Errorf("open flatkv read-only: %w", err)) + } + defer func() { _ = store.Close() }() + + versionAt := store.Version() + + // Direct flatkv.Get is the same reader path the migration manager + // itself uses; the version and boundary keys are stored verbatim + // under the "migration" module. + migrationVersion := uint64(migration.Version0_MemiavlOnly) + versionRaw, hasVersion := store.Get(migration.MigrationStore, []byte(migration.MigrationVersionKey)) + if hasVersion { + // 8-byte big-endian, written by migration_manager.go on the + // bump block. A short value here is a corruption signal, not + // a half-written entry — flatkv commits atomically — but we + // still tolerate it so the JSON output stays informative. + if len(versionRaw) == 8 { + migrationVersion = binary.BigEndian.Uint64(versionRaw) + } + } + + boundaryRaw, hasBoundary := store.Get(migration.MigrationStore, []byte(migration.MigrationBoundaryKey)) + + out := struct { + VersionAt int64 `json:"version_at"` + MigrationVersion uint64 `json:"migration_version"` + MigrateEVMComplete bool `json:"migrate_evm_complete"` + BoundaryPresent bool `json:"boundary_present"` + BoundaryHex string `json:"boundary_hex,omitempty"` + VersionRawHex string `json:"version_raw_hex,omitempty"` + }{ + VersionAt: versionAt, + MigrationVersion: migrationVersion, + MigrateEVMComplete: migrationVersion >= uint64(migration.Version1_MigrateEVM), + BoundaryPresent: hasBoundary, + } + if hasBoundary { + out.BoundaryHex = hex.EncodeToString(boundaryRaw) + } + if hasVersion { + out.VersionRawHex = hex.EncodeToString(versionRaw) + } + + enc := json.NewEncoder(cmd.OutOrStdout()) + enc.SetIndent("", " ") + if err := enc.Encode(out); err != nil { + panic(fmt.Errorf("encode json: %w", err)) + } +}