Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions ci/nightly/pipeline.template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1097,6 +1097,18 @@ steps:
composition: platform-checks
args: [--scenario=UpgradeEntireMzFromPreviousSelfManaged, "--seed=$BUILDKITE_JOB_ID"]

- id: checks-v80-migration
label: "Checks upgrade v80 migration"
depends_on: build-x86_64
timeout_in_minutes: 60
parallelism: 4
agents:
queue: hetzner-x86-64-12cpu-24gb
plugins:
- ./ci/plugins/mzcompose:
composition: platform-checks
args: [--scenario=UpgradeV80Migration, "--seed=$BUILDKITE_JOB_ID"]

- id: checks-preflight-check-rollback
label: "Checks preflight-check and roll back upgrade"
depends_on: build-x86_64
Expand Down
10 changes: 10 additions & 0 deletions ci/test/pipeline.template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,16 @@ steps:
agents:
queue: hetzner-x86-64-4cpu-8gb

- id: catalog-migration-incident-1007
label: "v80->v81 catalog migration fix (incident-1007)"
depends_on: build-aarch64
timeout_in_minutes: 30
plugins:
- ./ci/plugins/mzcompose:
composition: incident-1007
agents:
queue: hetzner-aarch64-4cpu-8gb

- id: source-sink-errors
label: "S&S error reporting"
depends_on: build-aarch64
Expand Down
31 changes: 31 additions & 0 deletions misc/python/materialize/checks/all_checks/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,3 +177,34 @@ def validate(self) -> Testdrive:
! SELECT * FROM drop_cluster2_view;
contains: unknown catalog item 'drop_cluster2_view'
"""))


class CreateClusterRF0(Check):
def manipulate(self) -> list[Testdrive]:
# This list MUST be of length 2.
return [
Testdrive(dedent(s))
for s in [
"""
$ postgres-execute connection=postgres://mz_system@${testdrive.materialize-internal-sql-addr}
GRANT CREATECLUSTER ON SYSTEM TO materialize

> CREATE CLUSTER create_cluster_rf0_1 (SIZE 'scale=2,workers=2', REPLICATION FACTOR 0);
""",
"""
$ postgres-execute connection=postgres://mz_system@${testdrive.materialize-internal-sql-addr}
GRANT CREATECLUSTER ON SYSTEM TO materialize

> CREATE CLUSTER create_cluster_rf0_2 (SIZE 'scale=2,workers=2', REPLICATION FACTOR 0);
""",
]
]

def validate(self) -> Testdrive:
return Testdrive(dedent("""
> SHOW CREATE CLUSTER create_cluster_rf0_1;
create_cluster_rf0_1 "CREATE CLUSTER \\"create_cluster_rf0_1\\" (INTROSPECTION DEBUGGING = false, INTROSPECTION INTERVAL = INTERVAL '00:00:01', MANAGED = true, REPLICATION FACTOR = 0, SIZE = 'scale=2,workers=2', SCHEDULE = MANUAL)"

> SHOW CREATE CLUSTER create_cluster_rf0_2;
create_cluster_rf0_2 "CREATE CLUSTER \\"create_cluster_rf0_2\\" (INTROSPECTION DEBUGGING = false, INTROSPECTION INTERVAL = INTERVAL '00:00:01', MANAGED = true, REPLICATION FACTOR = 0, SIZE = 'scale=2,workers=2', SCHEDULE = MANUAL)"
"""))
31 changes: 31 additions & 0 deletions misc/python/materialize/checks/all_checks/password_auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,34 @@ def validate(self) -> Testdrive:
$ postgres-execute connection=postgres://user2:password2@${testdrive.materialize-sasl-sql-addr}
SELECT * FROM materialize.schema2.t1
"""))


class AlterRoleCatalogCheck(Check):
def _can_run(self, e: Executor) -> bool:
return self.base_version >= MzVersion.parse_mz("v0.147.0-dev")

def initialize(self) -> Testdrive:
return Testdrive("> CREATE ROLE user WITH LOGIN PASSWORD 'password';")

def manipulate(self) -> list[Testdrive]:
return [
Testdrive(dedent(s))
for s in [
"""
> ALTER ROLE user SUPERUSER;
""",
"""
> DROP ROLE user;

> SELECT * FROM mz_roles WHERE name = 'user';
""",
]
]

def validate(self) -> Testdrive:
return Testdrive(dedent("""
$ postgres-execute connection=postgres://mz_system@${testdrive.materialize-internal-sql-addr}
SELECT * FROM mz_internal.mz_catalog_raw WHERE data->>'kind' = 'Role';

> SELECT * FROM mz_roles WHERE name = 'user';
"""))
1 change: 0 additions & 1 deletion misc/python/materialize/checks/mzcompose_actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@ def execute(self, e: Executor) -> None:
restart=self.restart,
force_migrations=self.force_migrations,
publish=self.publish,
default_replication_factor=2,
support_external_clusterd=True,
listeners_config_path=listeners_config_path,
)
Expand Down
64 changes: 58 additions & 6 deletions misc/python/materialize/checks/scenarios_upgrade.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,63 @@ def actions(self) -> list[Action]:
]


class UpgradeV80Migration(Scenario):
"""Test upgrade v26.17.1 (catalog 80) -> v26.18.0 (catalog 81) -> X"""

def __init__(
self,
checks: list[type[Check]],
executor: Executor,
features: Features,
seed: str | None = None,
):
super().__init__(checks, executor, features, seed)

def base_version(self) -> MzVersion:
return MzVersion.parse_mz("v26.17.1")

def actions(self) -> list[Action]:
print(
"Upgrade path: v26.17.1 -> initialize -> v26.18.0 -> manipulate#1 -> restart -> manipulate#2 -> current -> validate -> restart -> validate"
)
return [
StartMz(
self,
tag=self.base_version(),
),
Initialize(self),
KillMz(capture_logs=True),
StartMz(
self,
tag=MzVersion.parse_mz("v26.18.0"),
),
Manipulate(self, phase=1),
KillMz(capture_logs=True),
StartMz(
self,
tag=MzVersion.parse_mz("v26.18.0"),
),
Manipulate(self, phase=2),
KillMz(capture_logs=True),
StartMz(
self,
tag=get_last_version(),
),
KillMz(capture_logs=True),
StartMz(
self,
tag=None,
),
Validate(self),
KillMz(),
StartMz(
self,
tag=None,
),
Validate(self),
]


#
# We are limited with respect to the different orders in which stuff can be upgraded:
# - some sequences of events are invalid
Expand Down Expand Up @@ -809,11 +866,6 @@ def actions(self) -> list[Action]:
Manipulate(self, phase=2, mz_service=mz_services[0].service_name),
)

if both_done_at_position == 0:
actions.append(
Validate(self, mz_service=mz_services[0].service_name),
)

for i, service_info in enumerate[MzServiceUpgradeInfo](
mz_services[1:], start=1
):
Expand All @@ -833,7 +885,7 @@ def actions(self) -> list[Action]:
Manipulate(self, phase=2, mz_service=service_info.service_name),
)

if i >= both_done_at_position:
if i >= both_done_at_position and service_info is None:
actions.append(
Validate(self, mz_service=service_info.service_name),
)
Expand Down
1 change: 1 addition & 0 deletions misc/python/materialize/data_ingest/transaction_def.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ def generate(self, fields: list[Field]) -> Iterator[Transaction | None]:
restart="on-failure",
healthcheck=LEADER_STATUS_HEALTHCHECK,
sanity_restart=False,
force_migrations="replacement",
),
):
self.composition.up(self.workload.mz_service, detach=True)
Expand Down
14 changes: 14 additions & 0 deletions test/catalog-migration/incident-1007/mzcompose
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

# Copyright Materialize, Inc. and contributors. All rights reserved.
#
# Use of this software is governed by the Business Source License
# included in the LICENSE file at the root of this repository.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0.
#
# mzcompose — runs Docker Compose with Materialize customizations.

exec "$(dirname "$0")"/../../bin/pyactivate -m materialize.cli.mzcompose "$@"
175 changes: 175 additions & 0 deletions test/catalog-migration/incident-1007/mzcompose.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
# Copyright Materialize, Inc. and contributors. All rights reserved.
#
# Use of this software is governed by the Business Source License
# included in the LICENSE file at the root of this repository.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0.

"""
End-to-end repro and repair of the v80-form Role byte-drift bug.

Catalog versions:
v26.17.x -> catalog 80 (pre-bug)
v26.18.0 -> catalog 81, introduces the buggy `v80_to_v81::upgrade` whose
`is_cloud` heuristic silently no-ops on self-managed envs.
Role rows stay in their v80 byte form
(no `auto_provision_source` key) while the catalog version
advances. Any subsequent ALTER reads the v80 form, parses it,
and writes a retract+insert pair re-serialized through the
current proto — which always includes `auto_provision_source:
null` — leaving a dangling `-1`.
current -> catalog 83, this PR's `v82_to_v83::upgrade` scans for the
structural signature of the drift and emits compensating
updates to retire the dangling `-1` and the stale `+1`.

The workflow chains v26.17 -> v26.18 -> current. At v26.18, we expect querying
`mz_internal.mz_catalog_raw` to fail — `PersistPeek` enforces a per-row
non-negativity check that fires on the dangling `-1`. After upgrading to
current, the repair fires and the same query succeeds.
"""

from textwrap import dedent

from materialize.docker import image_registry
from materialize.mz_version import MzVersion
from materialize.mzcompose.composition import Composition, WorkflowArgumentParser
from materialize.mzcompose.services.materialized import Materialized
from materialize.mzcompose.services.minio import Minio
from materialize.mzcompose.services.postgres import PostgresMetadata
from materialize.mzcompose.services.testdrive import Testdrive
from materialize.ui import UIError

# Pre-bug release: catalog 80.
PRE_BUG_VERSION = MzVersion.parse_mz("v26.17.1")

# First release containing the buggy v80->v81 migration: catalog 81.
BUGGY_VERSION = MzVersion.parse_mz("v26.18.0")

SERVICES = [
PostgresMetadata(),
Minio(setup_materialize=True),
Materialized(
external_metadata_store=True,
external_blob_store=True,
metadata_store="postgres-metadata",
sanity_restart=False,
),
Testdrive(
external_blob_store=True,
no_reset=True,
),
]


def _mz(image: str | None) -> Materialized:
return Materialized(
name="materialized",
image=image,
metadata_store="postgres-metadata",
external_metadata_store=True,
external_blob_store=True,
sanity_restart=False,
# Set the default replication factor since
# one of the heuristics of a "non-cloud" environment is that the
# replication factor is 0 for mz_system.
default_replication_factor=0,
# MZ_SOFT_ASSERTIONS=1 turns
# soft asserts into panics, killing environmentd mid-upgrade
# before v82->v83 ever runs. Demote them to log-only so the upgrade
# chain can complete and the repair can do its work.
soft_assertions=False,
)


def _start_at(c: Composition, version: MzVersion | None) -> None:
if version is None:
image: str | None = None
label = "current"
else:
image = f"{image_registry()}/materialized:{version}"
label = str(version)
print(f"Starting Materialize at {label} ({image})")
with c.override(_mz(image=image), fail_on_new_service=False):
c.up("materialized")


def _select_roles(c: Composition) -> None:
# `PersistPeek::do_peek` enforces per-row non-negativity when
# serving `mz_internal.mz_catalog_raw`, so the SELECT should error
# if the row has a negative multiplicity.
c.testdrive(dedent("""
$ postgres-execute connection=postgres://mz_system:materialize@${testdrive.materialize-internal-sql-addr}
SELECT * FROM mz_internal.mz_catalog_raw WHERE data->>'kind' = 'Role';
"""))


def _ensure_select_roles_fails(c: Composition) -> None:
failed_as_expected = False
try:
_select_roles(c)
except Exception as e:
print(f"Got expected failure: {type(e).__name__}: {e}")
failed_as_expected = True

if not failed_as_expected:
raise UIError(
"expected SELECT from mz_internal.mz_catalog_raw to fail, but it "
"succeeded"
)


def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None:
"""v26.17 -> v26.18 (expect failure) -> current (expect repair)."""
c.up("postgres-metadata", "minio")

# Pre-bug release. Role row written in the v80 catalog format.
_start_at(c, PRE_BUG_VERSION)
c.testdrive(dedent("""
> CREATE ROLE user1 WITH LOGIN PASSWORD 'password';
"""))
c.kill("materialized")

# Buggy release. The v80->v81 migration runs and silently no-ops
# on this self-managed env, so the role row doesn't get written in the v81 catalog format.
# The ALTER below causes a negative multiplicity, so consolidation can't merge.=
# the update caused by the ALTER, leaving a dangling `-1` diff.
_start_at(c, BUGGY_VERSION)
c.testdrive(dedent("""
$ postgres-execute connection=postgres://mz_system:materialize@${testdrive.materialize-internal-sql-addr}
ALTER ROLE user1 SUPERUSER;
"""))

# Still on the buggy release. The corruption persists across the
# restart;
c.kill("materialized")
_start_at(c, BUGGY_VERSION)
print("Expecting catalog corruption to surface as a query failure...")
_ensure_select_roles_fails(c)

# Drop the role to ensure its -1 diff gets consolidated with its +1 creation diff.
c.testdrive(dedent("""
$ postgres-execute connection=postgres://mz_system:materialize@${testdrive.materialize-internal-sql-addr}
DROP ROLE user1;
"""))
c.testdrive(dedent("""
> SELECT EXISTS (SELECT 1 FROM mz_roles WHERE name = 'user1');
false
"""))
# Ensure select roles still fails after the drop.
_ensure_select_roles_fails(c)

c.kill("materialized")
# Current release. On startup the catalog is at version 81, so the
# upgrade chain runs v81->v82 (no-op for Role rows) then v82->v83 (the
# repair). After that the dangling `-1` and stale `+1` are gone and the
# SELECT succeeds.
_start_at(c, None)
_select_roles(c)

# Ensure the role is still dropped after the repair.
c.testdrive(dedent("""
> SELECT EXISTS (SELECT 1 FROM mz_roles WHERE name = 'user1');
false
"""))
Loading