From 0da6e5ce6f716907e8fe8b9e41478b41720d5396 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Tue, 12 May 2026 22:15:26 -0400 Subject: [PATCH 1/3] catalog v82->v83: normalize live Role rows still in pre-v81 byte form --- src/catalog/src/durable/upgrade/v82_to_v83.rs | 234 ++++++++++++++++-- 1 file changed, 219 insertions(+), 15 deletions(-) diff --git a/src/catalog/src/durable/upgrade/v82_to_v83.rs b/src/catalog/src/durable/upgrade/v82_to_v83.rs index 51249801caefb..ea57c23ced215 100644 --- a/src/catalog/src/durable/upgrade/v82_to_v83.rs +++ b/src/catalog/src/durable/upgrade/v82_to_v83.rs @@ -44,23 +44,38 @@ //! //! # The repair //! -//! For every Role with the structural signature of this bug — a dangling `-1` -//! plus at least one `+1` whose parsed `RoleValue` equals it, plus at most -//! one *other* `+1` with a different parsed value — we emit: +//! Two passes over the snapshot. +//! +//! **Pass 1 — cancel already-dangling retractions.** For every Role with the +//! structural signature of the bug — a dangling `-1` plus at least one `+1` +//! whose parsed `RoleValue` equals it, plus at most one *other* `+1` with a +//! different parsed value — we emit: //! //! 1. `+1` of the dangling row, cancelling the dangling `-1`. //! 2. `-1` of every parsed-equal stale `+1`, completing the retraction the //! original DDL intended. //! -//! After commit, each affected `RoleKey` has either one live `+1` or no rows -//! at all (for the dropped case). +//! **Pass 2 — normalize untouched stale rows.** Every remaining `+1` Role +//! row whose stored form differs from what re-serializing its parsed value +//! through the current proto would produce is retracted and re-inserted in +//! canonical form. Without this, a Role still in pre-v81 form that hasn't +//! yet had any DDL run against it would survive the migration unchanged, +//! and the next `ALTER ROLE`/`DROP ROLE` after v83 would manufacture a +//! fresh dangling `-1` against bytes that no migration runs against +//! anymore. After pass 2, every Role row in the shard has the byte form +//! that future retractions will also produce, so consolidation cancels. +//! +//! After commit, each affected `RoleKey` has either one live canonical +//! `+1` or no rows at all (for the dropped case). //! //! Anything that doesn't fit the fingerprint — no parsed-equal sibling, //! multiple distinct live candidates, non-Role kinds, `|diff| > 1` — is -//! logged at WARN and left for human review. Better to under-clean and +//! logged at WARN and left for human review, and pass 2 also leaves every +//! `+1` for any such key alone (rewriting a subset of an unrepaired key +//! would just produce a new dangling diff). Better to under-clean and //! surface unknown shapes for triage than over-clean and retire live state. -use std::collections::BTreeMap; +use std::collections::{BTreeMap, BTreeSet}; use mz_repr::Diff; @@ -82,6 +97,9 @@ pub(crate) struct RepairStats { /// Stale `+1` rows (alternate forms of the same parsed Role value as a /// repaired phantom) that were retracted as part of the repair. pub stale_retracted: usize, + /// Live `+1` Role rows whose stored form didn't match the canonical + /// re-serialization of their parsed value, rewritten in place. + pub normalized: usize, /// Dangling Role `-1`s that didn't fit the structural signature. pub skipped_role: usize, /// Dangling rows for kinds other than `Role`. The known corruption only @@ -105,6 +123,7 @@ pub async fn upgrade( tracing::info!( repaired = stats.repaired, stale_retracted = stats.stale_retracted, + normalized = stats.normalized, "repairing Role rows left inconsistent by the v80->v81 migration's non-cloud no-op", ); } @@ -139,16 +158,22 @@ pub async fn upgrade( } /// Inspect a consolidated snapshot and return the updates needed to converge -/// every affected Role onto a single live `+1` (or zero rows, for the dropped -/// case). +/// every affected Role onto a single canonical-form `+1` (or zero rows, for +/// the dropped case), and every untouched Role onto canonical form so future +/// writers' retractions consolidate. +/// +/// The returned `Vec` is safe to feed straight into `compare_and_append`. /// -/// The returned `Vec` is safe to feed straight into `compare_and_append`. For -/// each repair site we emit: +/// Two passes: /// -/// * `+1` of the dangling row (cancels the existing `-1`); -/// * one `-1` per *stale* `+1` row whose parsed value equals the dangling -/// row's (completes the retraction the original DDL was supposed to -/// perform). +/// 1. For each dangling `-1` Role row matching the v80-form-drift signature, +/// cancel it (`+1` of the same bytes) and retract every parsed-equal stale +/// `+1` sibling. +/// 2. For each remaining `+1` Role row whose stored bytes don't match the +/// canonical re-serialization of its parsed value, retract it and insert +/// the canonical form. Skipped for any `RoleKey` whose dangling `-1` we +/// declined to repair in pass 1 — partial rewriting there would manufacture +/// a fresh dangling diff against the canonical form. /// /// Separated from `upgrade` so it can be unit-tested without spinning up a /// real catalog handle. @@ -177,6 +202,13 @@ pub(crate) fn compute_repairs( let mut repairs = Vec::new(); let mut stats = RepairStats::default(); + // `+1` rows pass 1 already retracts: pass 2 must not double-retract them. + let mut retracted_in_pass_1: BTreeSet<&StateUpdateKindJson> = BTreeSet::new(); + // Role keys with an unrepaired dangling diff: pass 2 leaves their `+1`s + // alone, since normalizing only some bytes for a key that still has a + // dangling `-1` would just shift the consolidation failure onto the new + // canonical form. + let mut unrepaired_keys: BTreeSet = BTreeSet::new(); for (kind_json, _, diff) in snapshot { if *diff == Diff::ONE { continue; @@ -203,6 +235,7 @@ pub(crate) fn compute_repairs( "Role row with unexpected diff magnitude; not repaired", ); stats.skipped_role += 1; + unrepaired_keys.insert(dangling.key); continue; } @@ -238,6 +271,7 @@ pub(crate) fn compute_repairs( "dangling Role -1 has no parsed-equal +1 sibling; not the v80-form-drift signature", ); stats.skipped_role += 1; + unrepaired_keys.insert(dangling.key); continue; } if ambiguous_live { @@ -247,6 +281,7 @@ pub(crate) fn compute_repairs( "Role key has multiple distinct live +1 rows; refusing to auto-repair", ); stats.skipped_role += 1; + unrepaired_keys.insert(dangling.key); continue; } @@ -268,11 +303,49 @@ pub(crate) fn compute_repairs( continue; } repairs.push((s.bytes.clone(), Diff::MINUS_ONE)); + retracted_in_pass_1.insert(s.bytes); stats.stale_retracted += 1; } stats.repaired += 1; } + // Pass 2: normalize every remaining live `+1` Role row whose stored bytes + // don't equal what re-serializing its parsed value through the current + // proto would produce. Any future retraction will use canonical bytes; + // unless what's stored matches, the retract+insert pair from that future + // write will dangle a fresh `-1`. + // + // Dedupe canonical inserts per parsed value so two distinct stale byte + // forms of the same role collapse to a single `+1` rather than `+2`. + let mut inserted_canonical: BTreeSet = BTreeSet::new(); + for (kind_json, _, diff) in snapshot { + if *diff != Diff::ONE { + continue; + } + let Some(role) = try_as_role(kind_json) else { + continue; + }; + if retracted_in_pass_1.contains(kind_json) { + continue; + } + if unrepaired_keys.contains(&role.key) { + continue; + } + let canonical: StateUpdateKindJson = v83::StateUpdateKind::Role(role.clone()).into(); + if &canonical == kind_json { + continue; + } + tracing::info!( + role_name = %role.value.name, + "normalizing pre-v81 Role row to canonical byte form", + ); + repairs.push((kind_json.clone(), Diff::MINUS_ONE)); + if inserted_canonical.insert(canonical.clone()) { + repairs.push((canonical, Diff::ONE)); + } + stats.normalized += 1; + } + (repairs, stats) } @@ -644,6 +717,137 @@ mod tests { assert_eq!(stats.stale_retracted, 2); } + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // can't call foreign function `decContextDefault` on OS `linux` + fn untouched_pre_v81_role_is_normalized() { + // A Role row stuck in pre-v81 byte form because no DDL has touched + // it since the v80->v81 no-op. No dangling `-1` yet — but the next + // ALTER ROLE under the current proto would manufacture one. Pass 2 + // rewrites the row in canonical form so that doesn't happen. + let stale = stale_role_kind_with_dropped_field(50, "carol@materialize.com", 20100); + let canonical = role_kind(50, "carol@materialize.com", 20100, None, None, None); + assert_ne!( + stale, canonical, + "test fixture broken: stale and canonical must have distinct stored forms", + ); + + let snap = snapshot(vec![(stale.clone(), Diff::ONE)]); + let (repairs, stats) = compute_repairs(&snap); + assert_eq!( + repairs, + vec![(stale, Diff::MINUS_ONE), (canonical, Diff::ONE)], + ); + assert_eq!( + stats, + RepairStats { + normalized: 1, + ..Default::default() + }, + ); + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // can't call foreign function `decContextDefault` on OS `linux` + fn canonical_form_role_is_not_normalized() { + // Belt-and-braces companion to `healthy_snapshot_is_a_noop`: + // explicitly verify that a `+1` whose stored form already matches + // its canonical re-serialization triggers no normalize work. + let canonical = role_kind(51, "dave@materialize.com", 20101, Some(true), None, None); + let snap = snapshot(vec![(canonical, Diff::ONE)]); + let (repairs, stats) = compute_repairs(&snap); + assert!(repairs.is_empty()); + assert_eq!(stats.normalized, 0); + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // can't call foreign function `decContextDefault` on OS `linux` + fn multiple_stale_byte_forms_no_dangling_collapse_to_one_canonical() { + // Pathological: two distinct stale stored forms of the same parsed + // Role value, both `+1`, no dangling `-1`. Pass 2 must retract both + // but only insert the canonical row once — emitting `(canonical, +1)` + // per stale row would leave the shard with `(canonical, +2)`. + let stale_a = stale_role_kind_with_dropped_field(60, "ed@materialize.com", 20110); + let stale_b = stale_role_kind_with_extra_whitespace(60, "ed@materialize.com", 20110); + let canonical = role_kind(60, "ed@materialize.com", 20110, None, None, None); + assert_ne!(stale_a, stale_b); + assert_ne!(stale_a, canonical); + assert_ne!(stale_b, canonical); + + let snap = snapshot(vec![ + (stale_a.clone(), Diff::ONE), + (stale_b.clone(), Diff::ONE), + ]); + let (repairs, stats) = compute_repairs(&snap); + + let plus: Vec<_> = repairs + .iter() + .filter(|(_, d)| *d == Diff::ONE) + .cloned() + .collect(); + let minus: std::collections::BTreeSet<_> = repairs + .iter() + .filter(|(_, d)| *d == Diff::MINUS_ONE) + .map(|(k, _)| k.clone()) + .collect(); + assert_eq!(plus, vec![(canonical, Diff::ONE)]); + let expected_minus: std::collections::BTreeSet<_> = + [stale_a, stale_b].into_iter().collect(); + assert_eq!(minus, expected_minus); + assert_eq!(stats.normalized, 2); + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // can't call foreign function `decContextDefault` on OS `linux` + fn unrepaired_dangling_skips_normalize_for_same_key() { + // Pass 1 declines to repair this key (ambiguous live state). Pass 2 + // must also leave its stale `+1` alone — normalizing only the stale + // row would just shift the dangling diff onto the canonical bytes, + // turning a known-skip into a new dangling row. + let live_a = role_kind(70, "frank", 20120, Some(true), None, None); + let live_b = role_kind(70, "frank", 20120, Some(false), Some(true), None); + let dangling = role_kind(70, "frank", 20120, None, None, None); + let stale = stale_role_kind_with_dropped_field(70, "frank", 20120); + + let snap = snapshot(vec![ + (stale.clone(), Diff::ONE), + (live_a, Diff::ONE), + (live_b, Diff::ONE), + (dangling, Diff::MINUS_ONE), + ]); + let (repairs, stats) = compute_repairs(&snap); + assert!(repairs.is_empty()); + assert_eq!(stats.skipped_role, 1); + assert_eq!(stats.normalized, 0); + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // can't call foreign function `decContextDefault` on OS `linux` + fn normalize_does_not_double_retract_pass_1_stale_rows() { + // A repair-fingerprint role: pass 1 already retracts the stale `+1`. + // Pass 2 must not retract it a second time, or the row goes to + // `-2` and the shard is worse off than before. + let live = role_kind(80, "grace@materialize.com", 20130, Some(true), None, None); + let dangling = role_kind(80, "grace@materialize.com", 20130, None, None, None); + let stale = stale_role_kind_with_dropped_field(80, "grace@materialize.com", 20130); + + let snap = snapshot(vec![ + (stale.clone(), Diff::ONE), + (dangling.clone(), Diff::MINUS_ONE), + (live, Diff::ONE), + ]); + let (repairs, stats) = compute_repairs(&snap); + + // The stale row appears exactly once with diff -1. + let stale_retractions = repairs + .iter() + .filter(|(k, d)| k == &stale && *d == Diff::MINUS_ONE) + .count(); + assert_eq!(stale_retractions, 1); + assert_eq!(stats.repaired, 1); + assert_eq!(stats.stale_retracted, 1); + assert_eq!(stats.normalized, 0); + } + /// Like `stale_role_kind_with_dropped_field` but a different stored /// shape that still parses to the same value, used to exercise the /// repair's multi-stale-row handling. From 6a961df09ea72bbf64d37fd2db34a15270e968f7 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Wed, 13 May 2026 01:59:40 +0000 Subject: [PATCH 2/3] Extend existing incident-1007 test --- .../incident-1007/mzcompose.py | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/test/catalog-migration/incident-1007/mzcompose.py b/test/catalog-migration/incident-1007/mzcompose.py index e07ef88b45c4f..5a918bfcc6896 100644 --- a/test/catalog-migration/incident-1007/mzcompose.py +++ b/test/catalog-migration/incident-1007/mzcompose.py @@ -32,11 +32,13 @@ from textwrap import dedent +from materialize import buildkite from materialize.docker import image_registry from materialize.mz_version import MzVersion from materialize.mzcompose.composition import Composition, WorkflowArgumentParser from materialize.mzcompose.services.materialized import Materialized from materialize.mzcompose.services.minio import Minio +from materialize.mzcompose.services.mz import Mz from materialize.mzcompose.services.postgres import PostgresMetadata from materialize.mzcompose.services.testdrive import Testdrive from materialize.ui import UIError @@ -60,6 +62,7 @@ external_blob_store=True, no_reset=True, ), + Mz(app_password=""), ] @@ -121,6 +124,20 @@ def _ensure_select_roles_fails(c: Composition) -> None: def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: + def process(name: str) -> None: + if name == "default": + return + with c.test_case(name): + c.workflow(name) + c.down() + + workflows = buildkite.shard_list( + list(c.workflows.keys()), lambda workflow: workflow + ) + c.test_parts(workflows, process) + + +def workflow_dangling(c: Composition, parser: WorkflowArgumentParser) -> None: """v26.17 -> v26.18 (expect failure) -> current (expect repair).""" c.up("postgres-metadata", "minio") @@ -173,3 +190,43 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: > SELECT EXISTS (SELECT 1 FROM mz_roles WHERE name = 'user1'); false """)) + + +def workflow_stale_rows(c: Composition, parser: WorkflowArgumentParser) -> None: + """v26.17 -> current (only cause negative diff here) -> current.""" + c.up("postgres-metadata", "minio") + + _start_at(c, PRE_BUG_VERSION) + c.testdrive(dedent(""" + > CREATE ROLE user1 WITH LOGIN PASSWORD 'password'; + """)) + c.kill("materialized") + + _start_at(c, None) + c.testdrive(dedent(""" + $ postgres-execute connection=postgres://mz_system:materialize@${testdrive.materialize-internal-sql-addr} + ALTER ROLE user1 SUPERUSER; + """)) + + c.kill("materialized") + _start_at(c, None) + _select_roles(c) + + c.testdrive(dedent(""" + $ postgres-execute connection=postgres://mz_system:materialize@${testdrive.materialize-internal-sql-addr} + DROP ROLE user1; + """)) + c.testdrive(dedent(""" + > SELECT EXISTS (SELECT 1 FROM mz_roles WHERE name = 'user1'); + false + """)) + _select_roles(c) + + c.kill("materialized") + _start_at(c, None) + _select_roles(c) + + c.testdrive(dedent(""" + > SELECT EXISTS (SELECT 1 FROM mz_roles WHERE name = 'user1'); + false + """)) From e4f621218a1c8f1a26d638f33ae57760f6a9614a Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Tue, 12 May 2026 08:22:52 +0000 Subject: [PATCH 3/3] platform-checks Add check and scenario for incident 1007 --- ci/nightly/pipeline.template.yml | 12 ++++ .../materialize/checks/all_checks/cluster.py | 31 +++++++++ .../checks/all_checks/password_auth.py | 46 +++++++++++++ .../materialize/checks/mzcompose_actions.py | 14 +++- .../materialize/checks/scenarios_upgrade.py | 65 +++++++++++++++++-- .../mzcompose/services/materialized.py | 10 ++- 6 files changed, 169 insertions(+), 9 deletions(-) diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index 8b1dbd1f07390..aae942bab3414 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -1097,6 +1097,18 @@ steps: composition: platform-checks args: [--scenario=UpgradeEntireMzFromPreviousSelfManaged, "--seed=$BUILDKITE_JOB_ID"] + - id: checks-v80-migration + label: "Checks upgrade v80 migration" + depends_on: build-x86_64 + timeout_in_minutes: 60 + parallelism: 4 + agents: + queue: hetzner-x86-64-12cpu-24gb + plugins: + - ./ci/plugins/mzcompose: + composition: platform-checks + args: [--scenario=UpgradeV80Migration, "--seed=$BUILDKITE_JOB_ID"] + - id: checks-preflight-check-rollback label: "Checks preflight-check and roll back upgrade" depends_on: build-x86_64 diff --git a/misc/python/materialize/checks/all_checks/cluster.py b/misc/python/materialize/checks/all_checks/cluster.py index 76ffbffd757b8..9c893083c6069 100644 --- a/misc/python/materialize/checks/all_checks/cluster.py +++ b/misc/python/materialize/checks/all_checks/cluster.py @@ -177,3 +177,34 @@ def validate(self) -> Testdrive: ! SELECT * FROM drop_cluster2_view; contains: unknown catalog item 'drop_cluster2_view' """)) + + +class CreateClusterRF0(Check): + def manipulate(self) -> list[Testdrive]: + # This list MUST be of length 2. + return [ + Testdrive(dedent(s)) + for s in [ + """ + $ postgres-execute connection=postgres://mz_system@${testdrive.materialize-internal-sql-addr} + GRANT CREATECLUSTER ON SYSTEM TO materialize + + > CREATE CLUSTER create_cluster_rf0_1 (SIZE 'scale=2,workers=2', REPLICATION FACTOR 0); + """, + """ + $ postgres-execute connection=postgres://mz_system@${testdrive.materialize-internal-sql-addr} + GRANT CREATECLUSTER ON SYSTEM TO materialize + + > CREATE CLUSTER create_cluster_rf0_2 (SIZE 'scale=2,workers=2', REPLICATION FACTOR 0); + """, + ] + ] + + def validate(self) -> Testdrive: + return Testdrive(dedent(""" + > SHOW CREATE CLUSTER create_cluster_rf0_1; + create_cluster_rf0_1 "CREATE CLUSTER \\"create_cluster_rf0_1\\" (INTROSPECTION DEBUGGING = false, INTROSPECTION INTERVAL = INTERVAL '00:00:01', MANAGED = true, REPLICATION FACTOR = 0, SIZE = 'scale=2,workers=2', SCHEDULE = MANUAL)" + + > SHOW CREATE CLUSTER create_cluster_rf0_2; + create_cluster_rf0_2 "CREATE CLUSTER \\"create_cluster_rf0_2\\" (INTROSPECTION DEBUGGING = false, INTROSPECTION INTERVAL = INTERVAL '00:00:01', MANAGED = true, REPLICATION FACTOR = 0, SIZE = 'scale=2,workers=2', SCHEDULE = MANUAL)" + """)) diff --git a/misc/python/materialize/checks/all_checks/password_auth.py b/misc/python/materialize/checks/all_checks/password_auth.py index 6310d9a0ee72c..a4f50699d9695 100644 --- a/misc/python/materialize/checks/all_checks/password_auth.py +++ b/misc/python/materialize/checks/all_checks/password_auth.py @@ -86,3 +86,49 @@ def validate(self) -> Testdrive: $ postgres-execute connection=postgres://user2:password2@${testdrive.materialize-sasl-sql-addr} SELECT * FROM materialize.schema2.t1 """)) + + +class AlterRoleCatalogCheck(Check): + def _can_run(self, e: Executor) -> bool: + return self.base_version >= MzVersion.parse_mz("v0.147.0-dev") + + def initialize(self) -> Testdrive: + return Testdrive("> CREATE ROLE user_alter1 WITH LOGIN PASSWORD 'password';") + + def manipulate(self) -> list[Testdrive]: + return [ + Testdrive(dedent(s)) + for s in [ + """ + > ALTER ROLE user_alter1 SUPERUSER; + + > CREATE ROLE user_alter2 WITH LOGIN PASSWORD 'password'; + """, + """ + > DROP ROLE user_alter1; + + > ALTER ROLE user_alter2 SUPERUSER; + + > SELECT * FROM mz_roles WHERE name = 'user_alter1'; + + > CREATE ROLE user_alter3 WITH LOGIN PASSWORD 'password'; + """, + ] + ] + + def validate(self) -> Testdrive: + return Testdrive(dedent(""" + > DROP ROLE IF EXISTS user_alter2; + + > ALTER ROLE user_alter3 SUPERUSER; + + $ postgres-execute connection=postgres://mz_system@${testdrive.materialize-internal-sql-addr} + SELECT * FROM mz_internal.mz_catalog_raw WHERE data->>'kind' = 'Role'; + + > SELECT * FROM mz_roles WHERE name = 'user_alter1'; + + > SELECT * FROM mz_roles WHERE name = 'user_alter2'; + + > SELECT name FROM mz_roles WHERE name = 'user_alter3'; + user_alter3 + """)) diff --git a/misc/python/materialize/checks/mzcompose_actions.py b/misc/python/materialize/checks/mzcompose_actions.py index 5b915cf806ddc..6690996327b1d 100644 --- a/misc/python/materialize/checks/mzcompose_actions.py +++ b/misc/python/materialize/checks/mzcompose_actions.py @@ -41,6 +41,9 @@ def __init__( system_parameter_defaults: dict[str, str] | None = None, additional_system_parameter_defaults: dict[str, str] = {}, system_parameter_version: MzVersion | None = None, + soft_assertions: bool = True, + builtin_system_cluster_replication_factor: int | None = None, + builtin_probe_cluster_replication_factor: int | None = None, mz_service: str | None = None, platform: str | None = None, healthcheck: list[str] | None = None, @@ -56,6 +59,13 @@ def __init__( self.system_parameter_defaults = system_parameter_defaults self.additional_system_parameter_defaults = additional_system_parameter_defaults self.system_parameter_version = system_parameter_version or tag + self.soft_assertions = soft_assertions + self.builtin_system_cluster_replication_factor = ( + builtin_system_cluster_replication_factor + ) + self.builtin_probe_cluster_replication_factor = ( + builtin_probe_cluster_replication_factor + ) self.healthcheck = healthcheck self.mz_service = mz_service self.platform = platform @@ -97,6 +107,7 @@ def execute(self, e: Executor) -> None: system_parameter_defaults=self.system_parameter_defaults, additional_system_parameter_defaults=self.additional_system_parameter_defaults, system_parameter_version=self.system_parameter_version, + soft_assertions=self.soft_assertions, sanity_restart=False, platform=self.platform, healthcheck=self.healthcheck, @@ -104,8 +115,9 @@ def execute(self, e: Executor) -> None: restart=self.restart, force_migrations=self.force_migrations, publish=self.publish, - default_replication_factor=2, support_external_clusterd=True, + builtin_system_cluster_replication_factor=self.builtin_system_cluster_replication_factor, + builtin_probe_cluster_replication_factor=self.builtin_probe_cluster_replication_factor, listeners_config_path=listeners_config_path, ) diff --git a/misc/python/materialize/checks/scenarios_upgrade.py b/misc/python/materialize/checks/scenarios_upgrade.py index 9936435219ae8..2b80dda62049b 100644 --- a/misc/python/materialize/checks/scenarios_upgrade.py +++ b/misc/python/materialize/checks/scenarios_upgrade.py @@ -321,6 +321,64 @@ def actions(self) -> list[Action]: ] +class UpgradeV80Migration(Scenario): + """Test upgrade v26.17.1 (catalog 80) -> v26.18.0 (catalog 81) -> X""" + + def __init__( + self, + checks: list[type[Check]], + executor: Executor, + features: Features, + seed: str | None = None, + ): + super().__init__(checks, executor, features, seed) + + def base_version(self) -> MzVersion: + return MzVersion.parse_mz("v26.17.1") + + def actions(self) -> list[Action]: + print( + "Upgrade path: v26.17.1 -> initialize -> v26.18.0 -> manipulate#1 -> restart -> manipulate#2 -> current -> restart -> validate" + ) + return [ + StartMz( + self, + tag=self.base_version(), + soft_assertions=False, + builtin_system_cluster_replication_factor=0, + ), + Initialize(self), + KillMz(capture_logs=True), + StartMz( + self, + tag=MzVersion.parse_mz("v26.18.0"), + soft_assertions=False, + builtin_system_cluster_replication_factor=0, + ), + Manipulate(self, phase=1), + KillMz(capture_logs=True), + StartMz( + self, + tag=MzVersion.parse_mz("v26.18.0"), + soft_assertions=False, + builtin_system_cluster_replication_factor=0, + ), + Manipulate(self, phase=2), + KillMz(capture_logs=True), + StartMz( + self, + tag=None, + soft_assertions=False, + ), + KillMz(), + StartMz( + self, + tag=None, + ), + Validate(self), + ] + + # # We are limited with respect to the different orders in which stuff can be upgraded: # - some sequences of events are invalid @@ -809,11 +867,6 @@ def actions(self) -> list[Action]: Manipulate(self, phase=2, mz_service=mz_services[0].service_name), ) - if both_done_at_position == 0: - actions.append( - Validate(self, mz_service=mz_services[0].service_name), - ) - for i, service_info in enumerate[MzServiceUpgradeInfo]( mz_services[1:], start=1 ): @@ -833,7 +886,7 @@ def actions(self) -> list[Action]: Manipulate(self, phase=2, mz_service=service_info.service_name), ) - if i >= both_done_at_position: + if i >= both_done_at_position and service_info.service_name is None: actions.append( Validate(self, mz_service=service_info.service_name), ) diff --git a/misc/python/materialize/mzcompose/services/materialized.py b/misc/python/materialize/mzcompose/services/materialized.py index e7ca80ef8a34d..0be2b747811b6 100644 --- a/misc/python/materialize/mzcompose/services/materialized.py +++ b/misc/python/materialize/mzcompose/services/materialized.py @@ -99,6 +99,8 @@ def __init__( cluster_replica_size: dict[str, dict[str, Any]] | None = None, bootstrap_replica_size: str | None = None, default_replication_factor: int = 1, + builtin_system_cluster_replication_factor: int | None = None, + builtin_probe_cluster_replication_factor: int | None = None, listeners_config_path: str = f"{MZ_ROOT}/src/materialized/ci/listener_configs/testdrive.json", config_sync_file_path: str | None = None, support_external_clusterd: bool = False, @@ -119,6 +121,10 @@ def __init__( bootstrap_replica_size = bootstrap_cluster_replica_size() if cluster_replica_size is None: cluster_replica_size = cluster_replica_size_map() + if builtin_system_cluster_replication_factor is None: + builtin_system_cluster_replication_factor = default_replication_factor + if builtin_probe_cluster_replication_factor is None: + builtin_probe_cluster_replication_factor = default_replication_factor environment = [ "MZ_NO_TELEMETRY=1", @@ -158,8 +164,8 @@ def __init__( f"MZ_BOOTSTRAP_BUILTIN_ANALYTICS_CLUSTER_REPLICA_SIZE={bootstrap_replica_size}", # Note(SangJunBak): mz_system and mz_probe have no replicas by default in materialized # but we re-enable them here since many of our tests rely on them. - f"MZ_BOOTSTRAP_BUILTIN_SYSTEM_CLUSTER_REPLICATION_FACTOR={default_replication_factor}", - f"MZ_BOOTSTRAP_BUILTIN_PROBE_CLUSTER_REPLICATION_FACTOR={default_replication_factor}", + f"MZ_BOOTSTRAP_BUILTIN_SYSTEM_CLUSTER_REPLICATION_FACTOR={builtin_system_cluster_replication_factor}", + f"MZ_BOOTSTRAP_BUILTIN_PROBE_CLUSTER_REPLICATION_FACTOR={builtin_probe_cluster_replication_factor}", f"MZ_BOOTSTRAP_DEFAULT_CLUSTER_REPLICATION_FACTOR={default_replication_factor}", *environment_extra, *DEFAULT_CRDB_ENVIRONMENT,