diff --git a/Cargo.lock b/Cargo.lock index 86851059fce5e..10cec38aca6fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -172,6 +172,22 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "antithesis_sdk" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18dbd97a5b6c21cc9176891cf715f7f0c273caf3959897f43b9bd1231939e675" +dependencies = [ + "libc", + "libloading", + "linkme", + "once_cell", + "rand 0.8.5", + "rustc_version_runtime", + "serde", + "serde_json", +] + [[package]] name = "anyhow" version = "1.0.102" @@ -5120,6 +5136,26 @@ dependencies = [ "linked-hash-map", ] +[[package]] +name = "linkme" +version = "0.3.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e83272d46373fb8decca684579ac3e7c8f3d71d4cc3aa693df8759e260ae41cf" +dependencies = [ + "linkme-impl", +] + +[[package]] +name = "linkme-impl" +version = "0.3.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32d59e20403c7d08fe62b4376edfe5c7fb2ef1e6b1465379686d0f21c8df444b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "linux-raw-sys" version = "0.4.15" @@ -5779,6 +5815,7 @@ dependencies = [ name = "mz-catalog" version = "0.0.0" dependencies = [ + "antithesis_sdk", "anyhow", "async-trait", "base64 0.22.1", @@ -7167,6 +7204,7 @@ dependencies = [ name = "mz-persist-client" version = "26.25.0-dev.0" dependencies = [ + "antithesis_sdk", "anyhow", "arrayvec 0.7.6", "arrow", @@ -7942,6 +7980,7 @@ dependencies = [ name = "mz-storage" version = "0.0.0" dependencies = [ + "antithesis_sdk", "anyhow", "arrow", "arrow-ipc", @@ -10661,6 +10700,16 @@ dependencies = [ "semver", ] +[[package]] +name = "rustc_version_runtime" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dd18cd2bae1820af0b6ad5e54f4a51d0f3fcc53b05f845675074efcc7af071d" +dependencies = [ + "rustc_version", + "semver", +] + [[package]] name = "rustix" version = "0.38.44" diff --git a/Cargo.toml b/Cargo.toml index 8ba97cb61b290..5d38ff3d8124b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -263,6 +263,7 @@ ahash = { version = "0.8.12", default-features = false } aho-corasick = "1.1.4" allocation-counter = "0" anyhow = "1.0.102" +antithesis_sdk = "0.2.8" array-concat = "0.5.5" arrayvec = "0.7.6" arrow = { version = "57", default-features = false } diff --git a/bin/ci-builder b/bin/ci-builder index 066bf273130a9..6d53be5cad2f5 100755 --- a/bin/ci-builder +++ b/bin/ci-builder @@ -18,6 +18,9 @@ set -euo pipefail NIGHTLY_RUST_DATE=2026-05-06 +# Allow overriding the container runtime (e.g. MZ_DEV_CI_BUILDER_RUNTIME=podman). +DOCKER="${MZ_DEV_CI_BUILDER_RUNTIME:-docker}" + workdir=$(pwd) cd "$(dirname "$0")/.." 
@@ -128,10 +131,14 @@ gid=$(id -g) [[ "$gid" -lt 500 ]] && gid=$uid build() { + local cache_args=() + if [[ "$DOCKER" != "podman" ]]; then + cache_args+=(--cache-from=materialize/ci-builder:"$cache_tag") + cache_args+=(--cache-to=type=inline,mode=max) + fi # shellcheck disable=SC2086 # intentional splitting of build args string - docker buildx build --pull \ - --cache-from=materialize/ci-builder:"$cache_tag" \ - --cache-to=type=inline,mode=max \ + "$DOCKER" buildx build --pull \ + "${cache_args[@]}" \ $docker_build_args \ --tag materialize/ci-builder:"$tag" \ --tag ghcr.io/materializeinc/materialize/ci-builder:"$tag" \ @@ -181,13 +188,13 @@ case "$cmd" in build "$@" ;; exists) - docker manifest inspect "$image_registry"/ci-builder:"$tag" &> /dev/null + "$DOCKER" manifest inspect "$image_registry"/ci-builder:"$tag" &> /dev/null ;; tag) echo "$tag" ;; push) - docker login ghcr.io -u materialize-bot --password "$GITHUB_GHCR_TOKEN" + "$DOCKER" login ghcr.io -u materialize-bot --password "$GITHUB_GHCR_TOKEN" build --push "$@" ;; run) @@ -274,6 +281,7 @@ case "$cmd" in --env AZURE_SERVICE_ACCOUNT_PASSWORD --env AZURE_SERVICE_ACCOUNT_TENANT --env GCP_SERVICE_ACCOUNT_JSON + --env ANTITHESIS_GCP_SERVICE_ACCOUNT_JSON --env GITHUB_TOKEN --env GITHUB_GHCR_TOKEN --env GPG_KEY @@ -372,20 +380,26 @@ case "$cmd" in ) fi if [[ "$(uname -s)" = Linux ]]; then - args+=( - --user "$(id -u):$(stat -c %g /var/run/docker.sock)" - ) + if [[ "${MZ_DEV_CI_BUILDER_RUNTIME:-docker}" == "podman" ]]; then + args+=(--userns=keep-id) + else + args+=( + --user "$(id -u):$(stat -c %g /var/run/docker.sock)" + ) + fi if [[ $secrets == "true" ]]; then # Allow Docker-in-Docker by mounting the Docker socket in the # container. Host networking allows us to see ports created by # containers that we launch. args+=( - --volume "/var/run/docker.sock:/var/run/docker.sock" --network host --env "DOCKER_TLS_VERIFY=${DOCKER_TLS_VERIFY-}" --env "DOCKER_HOST=${DOCKER_HOST-}" ) + if [[ -S /var/run/docker.sock ]]; then + args+=(--volume "/var/run/docker.sock:/var/run/docker.sock") + fi # Forward Docker configuration too, if available. docker_dir=${DOCKER_CONFIG:-$HOME/.docker} @@ -431,14 +445,22 @@ case "$cmd" in image="$image_registry/ci-builder:$tag" # Try downloading the image a few times in case of registry flakiness if [[ "${CI:-}" ]]; then - if ! docker inspect "$image" > /dev/null 2>&1; then - docker pull "$image" || (sleep 3 && docker pull "$image") || (sleep 3 && docker pull "$image") || sleep 3 + if ! "$DOCKER" inspect "$image" > /dev/null 2>&1; then + "$DOCKER" pull "$image" || (sleep 3 && "$DOCKER" pull "$image") || (sleep 3 && "$DOCKER" pull "$image") || sleep 3 fi fi - docker run "${args[@]}" "$image" eatmydata "${docker_command[@]}" + if [[ "$DOCKER" == "podman" ]]; then + # --userns=keep-id already maps the host UID/GID into the + # container, so autouseradd is unnecessary. Override the + # entrypoint to skip it. 
+ args+=(--entrypoint eatmydata) + "$DOCKER" run "${args[@]}" "$image" "${docker_command[@]}" + else + "$DOCKER" run "${args[@]}" "$image" eatmydata "${docker_command[@]}" + fi ;; root-shell) - docker exec --interactive --tty --user 0:0 "$(<"$cid_file")" eatmydata ci/builder/root-shell.sh + "$DOCKER" exec --interactive --tty --user 0:0 "$(<"$cid_file")" eatmydata ci/builder/root-shell.sh ;; *) printf "unknown command %q\n" "$cmd" diff --git a/ci/builder/Dockerfile b/ci/builder/Dockerfile index be1da20d8591f..eb6b71be277a4 100644 --- a/ci/builder/Dockerfile +++ b/ci/builder/Dockerfile @@ -399,6 +399,11 @@ ENV CARGO_HOME=/cargo RUN mkdir /cargo && chmod 777 /cargo VOLUME /cargo +# Antithesis coverage instrumentation library (used when --antithesis is passed) +RUN curl -sSL https://antithesis.com/assets/instrumentation/libvoidstar.so \ + -o /usr/lib/libvoidstar.so \ + && ldconfig + # Stage 3: Build a lightweight CI Builder image for console/playwright jobs. FROM ubuntu:noble-20260324 AS ci-builder-console diff --git a/ci/mkpipeline.py b/ci/mkpipeline.py index 79fcb7bd2a0c9..d6be6018c7532 100644 --- a/ci/mkpipeline.py +++ b/ci/mkpipeline.py @@ -121,6 +121,12 @@ def main() -> int: type=Sanitizer, choices=Sanitizer, ) + parser.add_argument( + "--antithesis", + action="store_true", + default=ui.env_is_truthy("CI_ANTITHESIS"), + help="enable Antithesis coverage instrumentation", + ) parser.add_argument( "--priority", type=int, @@ -166,6 +172,7 @@ def get_hashes(arch: Arch) -> tuple[str, bool]: arch=arch, coverage=args.coverage, sanitizer=args.sanitizer, + antithesis=args.antithesis, ) deps = repo.resolve_dependencies(image for image in repo if image.publish) check = deps.check() @@ -209,6 +216,7 @@ def fetch_hashes() -> None: args.coverage, args.sanitizer, lto, + args.antithesis, ) trim_ci_glue_exempt_steps(pipeline) else: @@ -218,9 +226,11 @@ def fetch_hashes() -> None: args.coverage, args.sanitizer, lto, + args.antithesis, ) truncate_skip_length(pipeline) handle_sanitizer_skip(pipeline, args.sanitizer) + handle_antithesis_skip(pipeline, args.antithesis) increase_agents_timeouts(pipeline, args.sanitizer, args.coverage) prioritize_pipeline(pipeline, args.priority) switch_jobs_to_aws(pipeline, args.priority) @@ -240,6 +250,7 @@ def fetch_hashes() -> None: args.coverage, args.sanitizer, lto, + args.antithesis, ) add_nightly_deploy_dependency(pipeline, args.pipeline) remove_dependencies_on_prs(pipeline, args.pipeline, hash_check) @@ -328,6 +339,21 @@ def handle_sanitizer_skip(pipeline: Any, sanitizer: Sanitizer) -> None: step["skip"] = True +def handle_antithesis_skip(pipeline: Any, antithesis: bool) -> None: + if antithesis: + pipeline.setdefault("env", {})["CI_ANTITHESIS"] = "1" + + for step in steps(pipeline): + if step.get("antithesis") == "skip": + step["skip"] = True + + else: + + for step in steps(pipeline): + if step.get("antithesis") == "only": + step["skip"] = True + + def increase_agents_timeouts( pipeline: Any, sanitizer: Sanitizer, coverage: bool ) -> None: @@ -711,6 +737,7 @@ def trim_tests_pipeline( coverage: bool, sanitizer: Sanitizer, lto: bool, + antithesis: bool = False, ) -> None: """Trim pipeline steps whose inputs have not changed in this branch. 
@@ -731,6 +758,7 @@ def trim_tests_pipeline( profile=mzbuild.Profile.RELEASE if lto else mzbuild.Profile.OPTIMIZED, coverage=coverage, sanitizer=sanitizer, + antithesis=antithesis, ) deps = repo.resolve_dependencies(image for image in repo) @@ -917,6 +945,7 @@ def add_cargo_test_dependency( coverage: bool, sanitizer: Sanitizer, lto: bool, + antithesis: bool = False, ) -> None: """Cargo Test normally doesn't have to wait for the build to complete, but it requires a few images (ubuntu-base, postgres), which are rarely changed. So only add a dependency when those images are not on Dockerhub yet.""" if pipeline_name not in ("test", "nightly"): @@ -933,6 +962,7 @@ def add_cargo_test_dependency( profile=mzbuild.Profile.RELEASE if lto else mzbuild.Profile.OPTIMIZED, coverage=coverage, sanitizer=sanitizer, + antithesis=antithesis, ) composition = Composition(repo, name="cargo-test") deps = composition.dependencies @@ -1090,6 +1120,8 @@ def remove_mz_specific_keys(pipeline: Any) -> None: del step["coverage"] if "sanitizer" in step: del step["sanitizer"] + if "antithesis" in step: + del step["antithesis"] if "ci_glue_exempt" in step: del step["ci_glue_exempt"] if ( diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index d10055451b451..b3c3068e04970 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -65,6 +65,29 @@ steps: branches: "main" skip: "currently broken" + - id: build-x86_64-antithesis + label: ":rust: Build x86_64 (Antithesis)" + # Regenerate the antithesis compose YAML before building so the + # `antithesis-config` image's fingerprint captures the same + # materialized fingerprint we're about to publish — otherwise + # Antithesis would try to pull a stale `materialized:mzbuild-…` + # whenever the committed YAML lagged behind source changes. + command: bin/ci-builder run stable ci/test/build-antithesis.sh + inputs: + - "*" + depends_on: [] + timeout_in_minutes: 90 + agents: + queue: l-builder-linux-x86_64 + env: + CI_ANTITHESIS: "1" + # Antithesis-flavored images get distinct mzbuild fingerprints, so + # they coexist with regular GHCR tags. The build is x86_64-only — + # Antithesis runs amd64 sandboxes. + sanitizer: skip + coverage: skip + antithesis: skip + - id: build-rust-latest-beta label: "Build with Latest Rust Beta" command: bin/ci-builder run stable ci/test/rust-beta-build.sh diff --git a/ci/test/build-antithesis.sh b/ci/test/build-antithesis.sh new file mode 100755 index 0000000000000..23d9480ad8188 --- /dev/null +++ b/ci/test/build-antithesis.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash + +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. +# +# build-antithesis.sh — antithesis-flavored build + Antithesis-registry push. +# +# 1. Write `.env` so `antithesis-config` bakes in compose refs that point +# at the Antithesis GCP Artifact Registry (where we'll mirror to). The +# .env content is one of antithesis-config's mzbuild inputs, so the +# image fingerprint tracks the source it references — self-consistent. +# 2. Run the standard `ci.test.build` to compile antithesis-flavored Rust +# binaries and build the docker images (pushed to GHCR via mzbuild). +# 3. 
`docker login` the Antithesis GCP Artifact Registry using +# `ANTITHESIS_GCP_SERVICE_ACCOUNT_JSON` (a service account scoped to +# `materialize-storage@molten-verve-216720.iam.gserviceaccount.com` — +# kept distinct from `GCP_SERVICE_ACCOUNT_JSON` which is used elsewhere +# for unrelated GCP integrations). +# 4. Retag + push `materialized`, `antithesis-workload`, and +# `antithesis-config` to the Antithesis registry. Public images +# referenced by the compose (postgres, minio, kafka stack) stay on +# their upstream registries — Antithesis can reach those directly. + +set -euo pipefail + +: "${CI_ANTITHESIS:?build-antithesis.sh expects CI_ANTITHESIS=1}" + +# GCP Artifact Registry path for Antithesis. Tags pushed under +# $ANTITHESIS_REGISTRY/<image>:mzbuild-<fingerprint>. +ANTITHESIS_REGISTRY="${ANTITHESIS_REGISTRY:-us-central1-docker.pkg.dev/molten-verve-216720/materialize-repository}" + +echo "--- Writing test/antithesis/config/.env (registry: $ANTITHESIS_REGISTRY)" +bin/pyactivate test/antithesis/export-env.py \ + --registry "$ANTITHESIS_REGISTRY" \ + > test/antithesis/config/.env + +echo "--- Building antithesis-flavored mzbuild images" +bin/pyactivate -m ci.test.build + +echo "--- Authenticating to Antithesis registry" +if [[ -z "${ANTITHESIS_GCP_SERVICE_ACCOUNT_JSON:-}" ]]; then + echo "ANTITHESIS_GCP_SERVICE_ACCOUNT_JSON is unset — pushing to the Antithesis registry will fail." >&2 + echo "Provision it as a Buildkite-agent env var (see bin/ci-builder env-forwarding)." >&2 + exit 1 +fi +echo "$ANTITHESIS_GCP_SERVICE_ACCOUNT_JSON" \ + | docker login -u _json_key --password-stdin "https://${ANTITHESIS_REGISTRY%%/*}" + +echo "--- Pushing Materialize-built images to the Antithesis registry" +bin/pyactivate test/antithesis/push-antithesis.py --registry "$ANTITHESIS_REGISTRY" diff --git a/ci/test/build.py b/ci/test/build.py index d91e82ffe2734..95f4227afbaa7 100755 --- a/ci/test/build.py +++ b/ci/test/build.py @@ -34,18 +34,38 @@ def main() -> None: set_build_status("pending") coverage = ui.env_is_truthy("CI_COVERAGE_ENABLED") sanitizer = Sanitizer[os.getenv("CI_SANITIZER", "none")] + antithesis = ui.env_is_truthy("CI_ANTITHESIS") repo = mzbuild.Repository( Path("."), coverage=coverage, sanitizer=sanitizer, + antithesis=antithesis, image_registry="materialize", ) # Build and push any images that are not already available on Docker Hub, # so they are accessible to other build agents. print("--- Acquiring mzbuild images") - deps = repo.resolve_dependencies(image for image in repo if image.publish) + if antithesis: + # Antithesis only consumes these three images; everything else in + # the repo (balancerd, sqllogictest, testdrive, ...) is wasted CI + # time for this pipeline. resolve_dependencies walks depends_on + # transitively, so anything materialized actually needs still + # comes along. Keep this list in sync with ANTITHESIS_IMAGES in + # test/antithesis/push-antithesis.py.
+ antithesis_images = [ + "materialized", + "antithesis-workload", + "antithesis-config", + ] + deps = repo.resolve_dependencies( + repo.images[name] for name in antithesis_images + ) + else: + deps = repo.resolve_dependencies( + image for image in repo if image.publish + ) deps.ensure(pre_build=lambda images: upload_debuginfo(repo, images)) set_build_status("success") annotate_buildkite_with_tags(repo.rd.arch, deps) diff --git a/ci/test/lint-main/checks/check-antithesis-compose.sh b/ci/test/lint-main/checks/check-antithesis-compose.sh new file mode 100755 index 0000000000000..55c54f0bccfba --- /dev/null +++ b/ci/test/lint-main/checks/check-antithesis-compose.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash + +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. +# +# check-antithesis-compose.sh — ensure test/antithesis/config/docker-compose.yaml +# is in sync with test/antithesis/mzcompose.py. +# +# Image refs in the committed YAML are `${MATERIALIZED_IMAGE}` style +# placeholders (resolved from `.env` at compose-parse time), so the file is +# stable across materialized source changes. A plain diff catches any +# composition (services/ports/env/deps) drift. + +set -euo pipefail + +cd "$(dirname "$0")/../../../.." + +. misc/shlib/shlib.bash + +check_antithesis_compose() { + local committed=test/antithesis/config/docker-compose.yaml + local generated rc=0 + generated=$(mktemp) + + bin/pyactivate test/antithesis/export-compose.py > "$generated" + + if ! diff -u "$committed" "$generated"; then + echo + echo "$committed is out of sync with test/antithesis/mzcompose.py." 
+ echo "Regenerate with:" + echo " bin/pyactivate test/antithesis/export-compose.py > $committed" + rc=1 + fi + + rm -f "$generated" + return $rc +} + +try check_antithesis_compose + +try_status_report diff --git a/ci/test/lint-main/checks/check-pipeline.sh b/ci/test/lint-main/checks/check-pipeline.sh index baed7ae9a717c..95da47ae547c8 100755 --- a/ci/test/lint-main/checks/check-pipeline.sh +++ b/ci/test/lint-main/checks/check-pipeline.sh @@ -28,6 +28,7 @@ unset CI_TEST_IDS unset CI_TEST_SELECTION unset CI_SANITIZER unset CI_COVERAGE_ENABLED +unset CI_ANTITHESIS unset CI_WAITING_FOR_BUILD pids=() diff --git a/misc/python/materialize/mzbuild.py b/misc/python/materialize/mzbuild.py index f653b84abc4a9..2200188139219 100644 --- a/misc/python/materialize/mzbuild.py +++ b/misc/python/materialize/mzbuild.py @@ -187,6 +187,7 @@ def __init__( sanitizer: Sanitizer, image_registry: str, image_prefix: str, + antithesis: bool = False, ): self.root = root self.arch = arch @@ -196,6 +197,7 @@ def __init__( self.cargo_workspace = cargo.Workspace(root) self.image_registry = image_registry self.image_prefix = image_prefix + self.antithesis = antithesis def build( self, @@ -513,6 +515,8 @@ def extra(self) -> str: flags += "optimized" if self.rd.coverage: flags += "coverage" + if self.rd.antithesis: + flags += ["antithesis"] if self.rd.sanitizer != Sanitizer.none: flags += self.rd.sanitizer.value flags.sort() @@ -547,15 +551,14 @@ def generate_cargo_build_command( examples: list[str], features: list[str] | None = None, ) -> list[str]: - rustflags = ( - rustc_flags.coverage - if rd.coverage - else ( - rustc_flags.sanitizer[rd.sanitizer] - if rd.sanitizer != Sanitizer.none - else ["--cfg=tokio_unstable"] - ) - ) + if rd.antithesis: + rustflags = rustc_flags.antithesis + elif rd.coverage: + rustflags = rustc_flags.coverage + elif rd.sanitizer != Sanitizer.none: + rustflags = rustc_flags.sanitizer[rd.sanitizer] + else: + rustflags = ["--cfg=tokio_unstable"] cflags = ( [ f"--target={target(rd.arch)}", @@ -568,8 +571,8 @@ def generate_cargo_build_command( if rd.sanitizer != Sanitizer.none else [] ) - extra_env = ( - { + if rd.sanitizer != Sanitizer.none: + extra_env = { "CFLAGS": " ".join(cflags), "CXXFLAGS": " ".join(cflags), "LDFLAGS": " ".join(cflags), @@ -582,9 +585,8 @@ def generate_cargo_build_command( "PATH": f"/sanshim:/opt/x-tools/{target(rd.arch)}/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", "TSAN_OPTIONS": "report_bugs=0", # build-scripts fail } - if rd.sanitizer != Sanitizer.none - else {} - ) + else: + extra_env = {} cargo_build = rd.build( "build", channel=None, rustflags=rustflags, extra_env=extra_env @@ -672,7 +674,11 @@ def copy(src: Path, relative_dst: Path) -> None: exe_path.parent.mkdir(parents=True, exist_ok=True) shutil.copy(src, exe_path) - if self.strip: + if self.rd.antithesis: + # Antithesis needs full debug symbols for symbolization. + # Don't strip anything. + pass + elif self.strip: # The debug information is large enough that it slows down CI, # since we're packaging these binaries up into Docker images and # shipping them around. 
@@ -945,6 +951,7 @@ def _build_locked( "ARCH_GCC": str(self.image.rd.arch), "ARCH_GO": self.image.rd.arch.go_str(), "CI_SANITIZER": str(self.image.rd.sanitizer), + "ANTITHESIS": "1" if self.image.rd.antithesis else "", } f = self.write_dockerfile() @@ -1416,6 +1423,7 @@ def __init__( sanitizer: Sanitizer = Sanitizer.none, image_registry: str = image_registry(), image_prefix: str = "", + antithesis: bool = False, ): self.rd = RepositoryDetails( root, @@ -1425,6 +1433,7 @@ def __init__( sanitizer, image_registry, image_prefix, + antithesis=antithesis, ) self.images: dict[str, Image] = {} self.compositions: dict[str, Path] = {} @@ -1517,6 +1526,12 @@ def install_arguments(parser: argparse.ArgumentParser) -> None: default="", help="a prefix to apply to all Docker image names", ) + parser.add_argument( + "--antithesis", + help="whether to enable Antithesis coverage instrumentation", + default=ui.env_is_truthy("CI_ANTITHESIS"), + action="store_true", + ) @classmethod def from_arguments(cls, root: Path, args: argparse.Namespace) -> "Repository": @@ -1544,6 +1559,7 @@ def from_arguments(cls, root: Path, args: argparse.Namespace) -> "Repository": image_registry=args.image_registry, image_prefix=args.image_prefix, arch=args.arch, + antithesis=args.antithesis, ) @property diff --git a/misc/python/materialize/rustc_flags.py b/misc/python/materialize/rustc_flags.py index 6353f83d3b68a..f6aac45573e14 100644 --- a/misc/python/materialize/rustc_flags.py +++ b/misc/python/materialize/rustc_flags.py @@ -25,6 +25,20 @@ ] +# Flags to enable Antithesis coverage instrumentation. +# Requires libvoidstar.so at /usr/lib/ (installed in ci-builder and +# the materialized Docker image). +# See: https://antithesis.com/docs/using_antithesis/sdk/rust/instrumentation/ +antithesis = [ + "-Ccodegen-units=1", + "-Cpasses=sancov-module", + "-Cllvm-args=-sanitizer-coverage-level=3", + "-Cllvm-args=-sanitizer-coverage-trace-pc-guard", + "-Clink-args=-Wl,--build-id", + "-lvoidstar", +] + + class Sanitizer(Enum): """What sanitizer to use""" diff --git a/src/catalog/Cargo.toml b/src/catalog/Cargo.toml index 6704bd79d8b06..3553217de30ed 100644 --- a/src/catalog/Cargo.toml +++ b/src/catalog/Cargo.toml @@ -10,6 +10,7 @@ publish = false workspace = true [dependencies] +antithesis_sdk.workspace = true anyhow.workspace = true async-trait.workspace = true base64.workspace = true diff --git a/src/catalog/src/durable/persist.rs b/src/catalog/src/durable/persist.rs index c93830e38d7e3..83d560c98004c 100644 --- a/src/catalog/src/durable/persist.rs +++ b/src/catalog/src/durable/persist.rs @@ -17,6 +17,7 @@ use std::str::FromStr; use std::sync::{Arc, LazyLock}; use std::time::{Duration, Instant}; +use antithesis_sdk::assert_always_greater_than; use async_trait::async_trait; use differential_dataflow::lattice::Lattice; use futures::{FutureExt, StreamExt}; @@ -41,6 +42,7 @@ use mz_repr::Diff; use mz_storage_client::controller::PersistEpoch; use mz_storage_types::StorageDiff; use mz_storage_types::sources::SourceData; +use serde_json::json; use sha2::Digest; use timely::progress::{Antichain, Timestamp as TimelyTimestamp}; use tracing::{debug, info, warn}; @@ -145,6 +147,21 @@ impl FenceableToken { current_token, fence_token, } => { + // The two `assert!` calls below are the natural placement + // for an Antithesis `assert_always!` covering the + // FenceableToken state-machine invariant. 
They are not + // wrapped today because Materialize does not run multiple + // concurrent environmentd processes against the same + // catalog shard, so the `Fenced` state is unreachable in + // every supported topology — including the Antithesis + // topology in this repo. Wrapping them would create + // assertions Antithesis cannot exercise, which is dead + // weight in coverage reports. If we ever ship multi- + // environmentd (e.g. for a 0DT-preflight Antithesis run), + // convert these to `assert_always!` with distinct + // messages so a violation becomes a reportable property + // failure rather than a panic. See the + // `epoch-fencing-prevents-split-brain` catalog entry. assert!( fence_token > current_token, "must be fenced by higher token; current={current_token:?}, fence={fence_token:?}" @@ -1182,12 +1199,43 @@ impl UnopenedPersistCatalogState { "fencing previous catalogs" ); if matches!(self.mode, Mode::Writable) { + // Snapshot the prior durable epoch so the post-CaS anchor + // below can verify monotonicity. Captured before the write + // because `compare_and_append` may call `sync()` which + // reads new state into `self.fenceable_token`. + let prior_durable_epoch = self + .fenceable_token + .token() + .map(|t| t.epoch.get()) + .unwrap_or(0); match self .compare_and_append(fence_updates.clone(), commit_ts) .await { Ok(upper) => { commit_ts = upper; + // Antithesis anchor for `epoch-fencing-prevents- + // split-brain`: after our fence-token CaS commits, + // the freshly-minted epoch we just persisted must + // be strictly greater than the prior durable + // epoch. A regression here would mean a future + // lower-epoch writer would not be fenced out by + // the write we just made, opening the split-brain + // window the catalog is supposed to close. + let new_epoch = current_fenceable_token + .token() + .expect("freshly minted Unfenced token always has a current_token") + .epoch + .get(); + assert_always_greater_than!( + new_epoch, + prior_durable_epoch, + "catalog fencing: new durable epoch did not strictly increase after fence-token CaS", + &json!({ + "prior_durable_epoch": prior_durable_epoch, + "new_epoch": new_epoch, + }) + ); } Err(CompareAndAppendError::Fence(e)) => return Err(e.into()), Err(e @ CompareAndAppendError::UpperMismatch { .. 
}) => { diff --git a/src/materialized/ci/Dockerfile b/src/materialized/ci/Dockerfile index 18686251a7b07..e06aaf6bad0cf 100644 --- a/src/materialized/ci/Dockerfile +++ b/src/materialized/ci/Dockerfile @@ -20,6 +20,17 @@ COPY materialized entrypoint.sh /usr/local/bin/ USER root RUN ln -s /usr/local/bin/materialized /usr/local/bin/environmentd \ && ln -s /usr/local/bin/materialized /usr/local/bin/clusterd + +# Antithesis instrumentation (conditional on --build-arg ANTITHESIS=1) +ARG ANTITHESIS +RUN if [ -n "$ANTITHESIS" ]; then \ + curl -sSL https://antithesis.com/assets/instrumentation/libvoidstar.so \ + -o /usr/lib/libvoidstar.so \ + && ldconfig \ + && mkdir -p /symbols \ + && ln -s /usr/local/bin/materialized /symbols/materialized; \ + fi + USER materialize ENTRYPOINT ["tini", "--", "entrypoint.sh"] diff --git a/src/persist-client/Cargo.toml b/src/persist-client/Cargo.toml index 0fad73a172d71..0d2b068964372 100644 --- a/src/persist-client/Cargo.toml +++ b/src/persist-client/Cargo.toml @@ -28,6 +28,7 @@ name = "benches" harness = false [dependencies] +antithesis_sdk.workspace = true anyhow.workspace = true arrayvec.workspace = true arrow.workspace = true diff --git a/src/persist-client/src/internal/apply.rs b/src/persist-client/src/internal/apply.rs index a48982ff77eb9..5085b24b3d6fb 100644 --- a/src/persist-client/src/internal/apply.rs +++ b/src/persist-client/src/internal/apply.rs @@ -15,6 +15,9 @@ use std::ops::ControlFlow::{self, Break, Continue}; use std::sync::Arc; use std::time::Instant; +use antithesis_sdk::assert_always_greater_than; +use serde_json::json; + use crate::cache::{LockingTypedState, StateCache}; use crate::error::{CodecMismatch, InvalidUsage}; use crate::internal::gc::GcReq; @@ -598,6 +601,21 @@ where } } + // Antithesis-reportable form of the broader `persist-cas-monotonicity` + // catalog property: SeqNo must strictly increase across any committed + // state transition. The narrower equality check below (next == seqno) + // still panics on violation and stays in place to catch skip/regress + // in the same call. 
+ assert_always_greater_than!( + new_state.seqno().0, + expected.0, + "persist: state seqno did not strictly increase across CaS apply", + &json!({ + "expected_prev": expected.0, + "computed_next": new_state.seqno().0, + "cmd": cmd.name, + }) + ); assert_eq!( expected.next(), new_state.seqno(), diff --git a/src/storage/Cargo.toml b/src/storage/Cargo.toml index f96d9991511dc..2e7f4f4a37ab7 100644 --- a/src/storage/Cargo.toml +++ b/src/storage/Cargo.toml @@ -15,6 +15,7 @@ bench = false [dependencies] anyhow.workspace = true +antithesis_sdk.workspace = true async-stream.workspace = true async-trait.workspace = true aws-credential-types.workspace = true diff --git a/src/storage/src/source/kafka.rs b/src/storage/src/source/kafka.rs index 60ab8b8928058..2f6e8d28f960e 100644 --- a/src/storage/src/source/kafka.rs +++ b/src/storage/src/source/kafka.rs @@ -14,6 +14,7 @@ use std::sync::Arc; use std::thread; use std::time::Duration; +use antithesis_sdk::{assert_always, assert_unreachable}; use anyhow::anyhow; use chrono::{DateTime, NaiveDateTime}; use differential_dataflow::{AsCollection, Hashable}; @@ -52,6 +53,7 @@ use rdkafka::statistics::Statistics; use rdkafka::topic_partition_list::Offset; use rdkafka::{ClientContext, Message, TopicPartitionList}; use serde::{Deserialize, Serialize}; +use serde_json::json; use timely::PartialOrder; use timely::container::CapacityContainerBuilder; use timely::dataflow::channels::pact::Pipeline; @@ -273,7 +275,13 @@ fn render_reader<'scope>( .iter() .map(|(_name, kind)| kind.clone()) .collect::>(), - _ => panic!("unexpected source export details: {:?}", details), + _ => { + assert_unreachable!( + "kafka: unexpected source export details", + &json!({"source_id": id.to_string()}) + ); + panic!("unexpected source export details: {:?}", details) + } }; let statistics = config @@ -888,6 +896,11 @@ fn render_reader<'scope>( } } // We can now put them back + assert_always!( + reader.partition_consumers.is_empty(), + "kafka: partition_consumers not drained at shutdown", + &json!({"remaining": reader.partition_consumers.len()}) + ); assert!(reader.partition_consumers.is_empty()); reader.partition_consumers = consumers; @@ -1139,6 +1152,20 @@ impl KafkaSourceReader { // Given the explicit consumer to partition assignment, we should never receive a message // for a partition for which we have no metadata + let partition_known = self + .last_offsets + .get(output_index) + .map(|m| m.contains_key(&partition)) + .unwrap_or(false); + assert_always!( + partition_known, + "kafka: partition missing from last_offsets", + &json!({ + "source_id": self.id.to_string(), + "partition": partition, + "output_index": output_index, + }) + ); assert!( self.last_offsets .get(output_index) @@ -1190,6 +1217,13 @@ fn construct_source_message( ) { let pid = msg.partition(); let Ok(offset) = u64::try_from(msg.offset()) else { + assert_unreachable!( + "kafka: negative offset from non-error message", + &json!({ + "partition": msg.partition(), + "raw_offset": msg.offset(), + }) + ); panic!( "got negative offset ({}) from otherwise non-error'd kafka message", msg.offset() diff --git a/src/storage/src/source/reclock.rs b/src/storage/src/source/reclock.rs index d4ab5ac4b312b..745115e5dbf72 100644 --- a/src/storage/src/source/reclock.rs +++ b/src/storage/src/source/reclock.rs @@ -10,11 +10,13 @@ /// The `ReclockOperator` observes the progress of a stream that is /// timestamped with some source time `FromTime` and generates bindings that describe how the /// collection should evolve in target time 
`IntoTime`. +use antithesis_sdk::assert_reachable; use differential_dataflow::consolidation; use differential_dataflow::lattice::Lattice; use mz_persist_client::error::UpperMismatch; use mz_repr::Diff; use mz_storage_client::util::remap_handle::RemapHandle; +use serde_json::json; use timely::order::PartialOrder; use timely::progress::Timestamp; use timely::progress::frontier::{Antichain, AntichainRef, MutableAntichain}; @@ -128,6 +130,12 @@ where upper: self.upper.clone(), }; + // Counts how many times append_batch hit an UpperMismatch during this + // mint invocation. If it is nonzero and we still exit the while loop + // normally, we've exercised the retry path covered by the catalog + // property `reclock-mint-eventually-succeeds`. + let mut cas_retry_count: u64 = 0; + while *self.upper == [IntoTime::minimum()] || (PartialOrder::less_equal(&self.source_upper.frontier(), &new_from_upper) && PartialOrder::less_than(&self.upper, &new_into_upper) @@ -159,12 +167,28 @@ where let new_batch = match self.append_batch(updates, &new_into_upper).await { Ok(trace_batch) => trace_batch, - Err(UpperMismatch { current, .. }) => self.sync(current.borrow()).await, + Err(UpperMismatch { current, .. }) => { + cas_retry_count = cas_retry_count.saturating_add(1); + self.sync(current.borrow()).await + } }; batch.updates.extend(new_batch.updates); batch.upper = new_batch.upper; } + // Reachability anchor for `reclock-mint-eventually-succeeds`: this + // line fires only when a CaS UpperMismatch was observed and the + // mint loop nonetheless terminated. That's the path the catalog + // wants Antithesis to observe at least once per run; reaching it + // is the signal, so the marker is unconditional `assert_reachable!` + // rather than `assert_sometimes!(true, …)`. + if cas_retry_count > 0 { + assert_reachable!( + "reclock: mint completed after at least one compare_and_append UpperMismatch", + &json!({"cas_retry_count": cas_retry_count}) + ); + } + batch } diff --git a/src/storage/src/source/reclock/compat.rs b/src/storage/src/source/reclock/compat.rs index a260e2dfcf060..607bbc4c5e680 100644 --- a/src/storage/src/source/reclock/compat.rs +++ b/src/storage/src/source/reclock/compat.rs @@ -15,6 +15,7 @@ use std::rc::Rc; use std::sync::Arc; use std::time::Duration; +use antithesis_sdk::assert_unreachable; use anyhow::Context; use differential_dataflow::lattice::Lattice; use fail::fail_point; @@ -33,6 +34,7 @@ use mz_storage_client::util::remap_handle::{RemapHandle, RemapHandleReader}; use mz_storage_types::StorageDiff; use mz_storage_types::controller::CollectionMetadata; use mz_storage_types::sources::{SourceData, SourceTimestamp}; +use serde_json::json; use timely::order::{PartialOrder, TotalOrder}; use timely::progress::Timestamp; use timely::progress::frontier::Antichain; @@ -303,7 +305,13 @@ where *self.shared_write_frontier.borrow_mut() = new_upper; return result; } - Err(invalid_use) => panic!("compare_and_append failed: {invalid_use}"), + Err(invalid_use) => { + assert_unreachable!( + "reclock: compare_and_append InvalidUsage", + &json!({"error": invalid_use.to_string()}) + ); + panic!("compare_and_append failed: {invalid_use}") + } } } diff --git a/src/storage/src/upsert/types.rs b/src/storage/src/upsert/types.rs index 2bf8270aa2c95..57a4b85033563 100644 --- a/src/storage/src/upsert/types.rs +++ b/src/storage/src/upsert/types.rs @@ -88,11 +88,13 @@ use std::num::Wrapping; use std::sync::Arc; use std::time::Instant; +use antithesis_sdk::{assert_always, assert_unreachable}; use bincode::Options; use itertools::Itertools;
use mz_ore::error::ErrorExt; use mz_repr::{Diff, GlobalId}; use serde::{Serialize, de::DeserializeOwned}; +use serde_json::json; use crate::metrics::upsert::{UpsertMetrics, UpsertSharedMetrics}; use crate::statistics::SourceStatistics; @@ -294,6 +296,10 @@ impl StateValue { match self { Self::Value(value) => value, Self::Consolidating(_) => { + assert_unreachable!( + "upsert: into_decoded on Consolidating StateValue", + &json!({"accessor": "into_decoded"}) + ); panic!("called `into_decoded without calling `ensure_decoded`") } } @@ -366,6 +372,10 @@ impl StateValue { }), }), StateValue::Consolidating(_) => { + assert_unreachable!( + "upsert: into_provisional_value on Consolidating StateValue", + &json!({"accessor": "into_provisional_value"}) + ); panic!("called `into_provisional_value` without calling `ensure_decoded`") } } @@ -400,6 +410,10 @@ impl StateValue { }), }), StateValue::Consolidating(_) => { + assert_unreachable!( + "upsert: into_provisional_tombstone on Consolidating StateValue", + &json!({"accessor": "into_provisional_tombstone"}) + ); panic!("called `into_provisional_tombstone` without calling `ensure_decoded`") } } @@ -413,6 +427,10 @@ impl StateValue { _ => None, }, Self::Consolidating(_) => { + assert_unreachable!( + "upsert: provisional_order on Consolidating StateValue", + &json!({"accessor": "provisional_order"}) + ); panic!("called `provisional_order` without calling `ensure_decoded`") } } @@ -427,6 +445,10 @@ impl StateValue { _ => value.finalized.as_ref(), }, Self::Consolidating(_) => { + assert_unreachable!( + "upsert: provisional_value_ref on Consolidating StateValue", + &json!({"accessor": "provisional_value_ref"}) + ); panic!("called `provisional_value_ref` without calling `ensure_decoded`") } } @@ -437,6 +459,10 @@ impl StateValue { match self { Self::Value(v) => v.finalized, Self::Consolidating(_) => { + assert_unreachable!( + "upsert: into_finalized_value on Consolidating StateValue", + &json!({"accessor": "into_finalized_value"}) + ); panic!("called `into_finalized_value` without calling `ensure_decoded`") } } @@ -577,7 +603,13 @@ impl StateValue { *acc ^= val; } } - _ => panic!("`merge_update_state` called with non-consolidating state"), + _ => { + assert_unreachable!( + "upsert: merge_update_state on non-Consolidating state", + &json!({"site": "merge_update_state"}) + ); + panic!("`merge_update_state` called with non-consolidating state") + } } } @@ -618,29 +650,61 @@ impl StateValue { }) .expect("invalid upsert state"); // Truncation is fine (using `as`) as this is just a checksum + let want_checksum = seahash::hash(value) as i64; + assert_always!( + consolidating.checksum_sum.0 == want_checksum, + "upsert: consolidating checksum_sum mismatch (diff_sum=1)", + &json!({ + "source_id": source_id.to_string(), + "checksum_sum": consolidating.checksum_sum.0, + "expected_seahash": want_checksum, + }) + ); assert_eq!( - consolidating.checksum_sum.0, - // Hash the value, not the full buffer, which may have extra 0's - seahash::hash(value) as i64, + consolidating.checksum_sum.0, want_checksum, "invalid upsert state: checksum_sum does not match, state: {}, {}", - consolidating, - source_id, + consolidating, source_id, ); *self = Self::finalized_value(bincode_opts.deserialize(value).unwrap()); } 0 => { + assert_always!( + consolidating.len_sum.0 == 0, + "upsert: consolidating len_sum nonzero (diff_sum=0)", + &json!({ + "source_id": source_id.to_string(), + "len_sum": consolidating.len_sum.0, + }) + ); assert_eq!( consolidating.len_sum.0, 0, "invalid upsert state: 
len_sum is non-0, state: {}, {}", consolidating, source_id, ); + assert_always!( + consolidating.checksum_sum.0 == 0, + "upsert: consolidating checksum_sum nonzero (diff_sum=0)", + &json!({ + "source_id": source_id.to_string(), + "checksum_sum": consolidating.checksum_sum.0, + }) + ); assert_eq!( consolidating.checksum_sum.0, 0, "invalid upsert state: checksum_sum is non-0, state: {}, {}", consolidating, source_id, ); + let all_zero = consolidating.value_xor.iter().all(|&x| x == 0); + assert_always!( + all_zero, + "upsert: consolidating value_xor nonzero (diff_sum=0)", + &json!({ + "source_id": source_id.to_string(), + "value_xor_len": consolidating.value_xor.len(), + }) + ); assert!( - consolidating.value_xor.iter().all(|&x| x == 0), + all_zero, "invalid upsert state: value_xor not all 0s with 0 diff. \ Non-zero positions: {:?}, state: {}, {}", consolidating @@ -669,6 +733,15 @@ impl StateValue { ), Err(_) => "Err(UpsertValueError)".to_string(), }); + assert_unreachable!( + "upsert: consolidating diff_sum not in {0,1}", + &json!({ + "source_id": source_id.to_string(), + "diff_sum": other, + "value_byte_len": value_byte_len, + "decodable": decode_ok, + }) + ); panic!( "invalid upsert state: non 0/1 diff_sum: {}, state: {}, {}, \ key: {:?}, value_byte_len: {:?}, decodable: {:?}", @@ -1059,6 +1132,10 @@ where }); if completed && self.snapshot_completed { + assert_unreachable!( + "upsert: snapshot completion called twice", + &json!({"site": "consolidate_chunk"}) + ); panic!("attempted completion of already completed upsert snapshot") } diff --git a/src/storage/src/upsert_continual_feedback.rs b/src/storage/src/upsert_continual_feedback.rs index a4669d3a80099..5fb562a7aa08a 100644 --- a/src/storage/src/upsert_continual_feedback.rs +++ b/src/storage/src/upsert_continual_feedback.rs @@ -14,6 +14,7 @@ use std::cmp::Reverse; use std::fmt::Debug; use std::sync::Arc; +use antithesis_sdk::{assert_always, assert_unreachable}; use differential_dataflow::hashable::Hashable; use differential_dataflow::{AsCollection, VecCollection}; use indexmap::map::Entry; @@ -23,6 +24,7 @@ use mz_storage_types::errors::{DataflowError, EnvelopeError}; use mz_timely_util::builder_async::{ Event as AsyncEvent, OperatorBuilder as AsyncOperatorBuilder, PressOnDropButton, }; +use serde_json::json; use std::convert::Infallible; use timely::container::CapacityContainerBuilder; use timely::dataflow::StreamVec; @@ -623,6 +625,11 @@ fn stage_input( } stash.extend(data.drain(..).map(|((key, value, order), time, diff)| { + assert_always!( + diff.is_positive(), + "upsert: input diff positive (cf v1)", + &json!({"diff": diff.into_inner()}) + ); assert!(diff.is_positive(), "invalid upsert input"); (time, key, Reverse(order), value) })); @@ -797,6 +804,10 @@ where let mut command_state = if let Entry::Occupied(command_state) = commands_state.entry(key) { command_state } else { + assert_unreachable!( + "upsert: key missing from commands_state (cf v1)", + &json!({"source_id": source_config.id.to_string()}) + ); panic!("key missing from commands_state"); }; diff --git a/src/storage/src/upsert_continual_feedback_v2.rs b/src/storage/src/upsert_continual_feedback_v2.rs index 32de9e3770086..8560ffd614603 100644 --- a/src/storage/src/upsert_continual_feedback_v2.rs +++ b/src/storage/src/upsert_continual_feedback_v2.rs @@ -65,6 +65,7 @@ use std::fmt::Debug; use std::marker::PhantomData; use std::sync::Arc; +use antithesis_sdk::{assert_always, assert_unreachable}; use differential_dataflow::difference::{IsZero, Semigroup}; use 
differential_dataflow::hashable::Hashable; use differential_dataflow::lattice::Lattice; @@ -81,6 +82,7 @@ use mz_storage_types::errors::{DataflowError, EnvelopeError}; use mz_timely_util::builder_async::{ Event as AsyncEvent, OperatorBuilder as AsyncOperatorBuilder, PressOnDropButton, }; +use serde_json::json; use std::convert::Infallible; use timely::container::CapacityContainerBuilder; use timely::dataflow::StreamVec; @@ -312,6 +314,11 @@ where AsyncEvent::Data(cap, data) => { let mut pushed_any = false; for ((key, value, from_time), ts, diff) in data { + assert_always!( + diff.is_positive(), + "upsert: input diff positive (cf v2)", + &json!({"diff": diff.into_inner()}) + ); assert!(diff.is_positive(), "invalid upsert input"); if PartialOrder::less_equal(&input_upper, &resume_upper) && !resume_upper.less_equal(&ts) @@ -480,7 +487,13 @@ where (Some(a), Some(b)) => std::cmp::min(a, b).clone(), (Some(a), None) => a.clone(), (None, Some(b)) => b.clone(), - (None, None) => unreachable!(), + (None, None) => { + assert_unreachable!( + "upsert: cf v2 join produced (None, None)", + &json!({"site": "min_ts join"}) + ); + unreachable!() + } }; cap.downgrade(&min_ts); } else { diff --git a/test/antithesis/AGENTS.md b/test/antithesis/AGENTS.md new file mode 100644 index 0000000000000..b93956df1ea94 --- /dev/null +++ b/test/antithesis/AGENTS.md @@ -0,0 +1,21 @@ +Files relevant to running Materialize under Antithesis. + +Use the `antithesis-setup` skill to scaffold and manage this directory. Use the `antithesis-research` skill to analyze the system and build a property catalog. Use the `antithesis-workload` skill to implement assertions and test commands. + +**mzcompose.py** +Source of truth for the Antithesis topology. Standard mzcompose composition: services (`postgres-metadata`, `minio`, `redpanda`, `materialized`, `workload`), dependencies, env, ports. The generated `config/docker-compose.yaml` is derived from this. + +**export-compose.py** +Renders `mzcompose.py` into a flat docker-compose YAML that Antithesis can consume. Images are emitted as `ghcr.io/materializeinc/materialize/<image>:mzbuild-<fingerprint>` refs that Antithesis pulls directly from public GHCR. + +**workload/** +Mzbuild image (`antithesis-workload`) for the Python test driver. Dockerfile, entrypoint, and test-template scripts (`test/*.sh`) live here. Test command files must be prefixed with one of `parallel_driver_`, `singleton_driver_`, `serial_driver_`, `first_`, `eventually_`, `finally_`, `anytime_`; files prefixed with `helper_` are ignored by Test Composer. + +**config/** +Mzbuild image (`antithesis-config`) — a `FROM scratch` container holding the generated `docker-compose.yaml`. This is the image Antithesis points at to bring up the environment. + +**scratchbook/** +Antithesis scratchbook: system analysis, property catalog, topology plans, per-property evidence files (in `scratchbook/properties/`), property relationship maps, persistent integration notes. Keep up to date as Antithesis-related decisions change. + +**setup-complete.sh** (in `workload/`) +Inject this script into a Dockerfile to notify Antithesis that setup is complete. Should only run once the system under test is ready for testing — Antithesis will not run test commands until it receives this event. diff --git a/test/antithesis/Makefile b/test/antithesis/Makefile new file mode 100644 index 0000000000000..878bf7e384019 --- /dev/null +++ b/test/antithesis/Makefile @@ -0,0 +1,99 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved.
+# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# Local-dev helper for the Materialize Antithesis harness. +# +# Antithesis images ship via the standard mzbuild → GHCR flow; CI publishes +# them the same way it publishes every other mzbuild image, fingerprint-tagged +# with `mzbuild-<fingerprint>`. Locally, we just acquire the mzbuild images, +# regenerate the compose YAML, and let `docker compose` find them by their +# canonical spec. +# +# Targets: +# make build # regenerate compose YAML, acquire local mzbuild images +# make up # build + bring up the stack +# make down # tear down (preserves volumes) +# make smoke # build + up + smoke test +# make test # smoke test against a running stack +# make clean # tear down + remove volumes + +SHELL := /usr/bin/env bash +.SHELLFLAGS := -eu -o pipefail -c + +PROJECT := materialize-antithesis +REPO_ROOT := $(realpath $(dir $(lastword $(MAKEFILE_LIST)))/../..) + +# Pick podman if available, else docker. +ifndef RUNTIME + RUNTIME := $(shell command -v podman >/dev/null 2>&1 && echo podman || (command -v docker >/dev/null 2>&1 && echo docker || echo none)) +endif +ifeq ($(RUNTIME),none) + $(error neither podman nor docker found in PATH; set RUNTIME=docker or install podman) +endif +ifeq ($(RUNTIME),podman) + export MZ_DEV_CI_BUILDER_RUNTIME := podman +endif + +COMPOSE_FILE := $(REPO_ROOT)/test/antithesis/config/docker-compose.yaml +ENV_FILE := $(REPO_ROOT)/test/antithesis/config/.env +COMPOSE := $(RUNTIME) compose -p $(PROJECT) --env-file $(ENV_FILE) -f $(COMPOSE_FILE) +PSQL := $(COMPOSE) exec materialized psql -h localhost -p 6875 -U materialize + +# mzbuild images we need built locally. Third-party images (postgres, minio, +# kafka, …) are pulled by `docker compose` from their upstream registries.
+MZBUILD_IMAGES := materialized antithesis-workload + +# --------------------------------------------------------------------------- +# Build +# --------------------------------------------------------------------------- +.PHONY: build export-compose export-env acquire-images + +build: export-compose export-env acquire-images + +export-compose: + cd $(REPO_ROOT) && bin/pyactivate test/antithesis/export-compose.py > $(COMPOSE_FILE) + @echo "Wrote $(COMPOSE_FILE)" + +export-env: + cd $(REPO_ROOT) && bin/pyactivate test/antithesis/export-env.py > $(ENV_FILE) + @echo "Wrote $(ENV_FILE)" + +acquire-images: + @for image in $(MZBUILD_IMAGES); do \ + echo "--- Acquiring $$image (--antithesis)"; \ + cd $(REPO_ROOT) && bin/mzimage acquire "$$image" --antithesis; \ + done + +# --------------------------------------------------------------------------- +# Up / Down +# --------------------------------------------------------------------------- +.PHONY: up down clean + +up: build + $(COMPOSE) up -d + +down: + $(COMPOSE) down + +clean: down + $(COMPOSE) down -v --remove-orphans 2>/dev/null || true + +# --------------------------------------------------------------------------- +# Test +# --------------------------------------------------------------------------- +.PHONY: test smoke + +test: + $(PSQL) -c "CREATE TABLE IF NOT EXISTS smoke_test (k INT, v TEXT)" + $(PSQL) -c "INSERT INTO smoke_test VALUES (1, 'hello'), (2, 'world')" + $(PSQL) -c "SELECT * FROM smoke_test ORDER BY k" + $(PSQL) -c "DROP TABLE smoke_test" + +smoke: up test + @echo "[smoke] passed" diff --git a/test/antithesis/config/.env b/test/antithesis/config/.env new file mode 100644 index 0000000000000..d4f160a98596f --- /dev/null +++ b/test/antithesis/config/.env @@ -0,0 +1,21 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# Compose env-file for `test/antithesis/config/docker-compose.yaml`. +# Tracked by git only so that the file exists for mzbuild's input +# fingerprinting and survives `git clean -ffdX` between builds. The +# committed values are placeholders — `build-antithesis.sh` overwrites +# them in CI with refs to images pushed to Antithesis's GCP Artifact +# Registry, and `make export-env` does the same with local-dev refs. +# +# If you see these placeholder values on a running cluster, your build +# pipeline did not regenerate this file. Run: +# bin/pyactivate test/antithesis/export-env.py > test/antithesis/config/.env +MATERIALIZED_IMAGE=placeholder-not-built +ANTITHESIS_WORKLOAD_IMAGE=placeholder-not-built diff --git a/test/antithesis/config/Dockerfile b/test/antithesis/config/Dockerfile new file mode 100644 index 0000000000000..32fcb07e30460 --- /dev/null +++ b/test/antithesis/config/Dockerfile @@ -0,0 +1,18 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. 
+ +# Antithesis config image: a FROM-scratch tarball holding the +# docker-compose.yaml that Antithesis uses to bring up the system under +# test, plus a `.env` mapping `${MATERIALIZED_IMAGE}` / +# `${ANTITHESIS_WORKLOAD_IMAGE}` to current mzbuild fingerprints. Compose +# loads `.env` automatically at parse time. See mzbuild.yml for +# regeneration instructions. + +FROM scratch +COPY docker-compose.yaml .env / diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml new file mode 100644 index 0000000000000..0a9c072b81aad --- /dev/null +++ b/test/antithesis/config/docker-compose.yaml @@ -0,0 +1,552 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# GENERATED FILE — do not edit. Regenerate via: +# bin/pyactivate test/antithesis/export-compose.py > test/antithesis/config/docker-compose.yaml +# Source of truth: test/antithesis/mzcompose.py. + +services: + postgres-metadata: + command: + - postgres + - -c + - wal_level=logical + - -c + - max_wal_senders=100 + - -c + - max_replication_slots=100 + - -c + - max_connections=5000 + ports: + - '26257' + environment: + - POSTGRESDB=postgres + - POSTGRES_PASSWORD=postgres + - PGPORT=26257 + - POSTGRES_HOST_AUTH_METHOD=trust + healthcheck: + test: + - CMD + - pg_isready + - -U + - postgres + interval: 1s + start_period: 30s + restart: 'no' + platform: linux/amd64 + image: postgres:17.7 + entrypoint: + - sh + - -c + - 'cat <<''SQL'' > /docker-entrypoint-initdb.d/z_setup_materialize.sql + + CREATE ROLE root WITH LOGIN PASSWORD ''root''; + + CREATE DATABASE root; + + GRANT ALL PRIVILEGES ON DATABASE root TO root; + + \c root + + CREATE SCHEMA IF NOT EXISTS consensus AUTHORIZATION root; + + CREATE SCHEMA IF NOT EXISTS adapter AUTHORIZATION root; + + CREATE SCHEMA IF NOT EXISTS storage AUTHORIZATION root; + + CREATE SCHEMA IF NOT EXISTS tsoracle AUTHORIZATION root; + + GRANT ALL PRIVILEGES ON SCHEMA public TO root; + + SQL + + exec docker-entrypoint.sh "$$@"' + - -- + minio: + entrypoint: + - sh + - -c + command: + - mkdir -p /data/persist && minio server /data --console-address :9001 + ports: + - 9000 + - 9001 + environment: + - MINIO_STORAGE_CLASS_STANDARD=EC:0 + - MINIO_HEAL_DISABLE=on + - MINIO_DISK_WATERMARK_LOW=1 + - MINIO_DISK_WATERMARK_HIGH=1 + healthcheck: + test: + - CMD + - curl + - --fail + - http://localhost:9000/minio/health/live + timeout: 5s + interval: 1s + start_period: 30s + platform: linux/amd64 + image: minio/minio:latest + zookeeper: + image: confluentinc/cp-zookeeper:7.9.4 + ports: + - 2181 + environment: + - ZOOKEEPER_CLIENT_PORT=2181 + healthcheck: + test: + - CMD + - nc + - -z + - localhost + - '2181' + interval: 1s + start_period: 120s + platform: linux/amd64 + kafka: + image: confluentinc/cp-kafka:7.9.4 + ports: + - '9092' + environment: + - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181 + - KAFKA_CONFLUENT_SUPPORT_METRICS_ENABLE=false + - KAFKA_MIN_INSYNC_REPLICAS=1 + - KAFKA_OFFSETS_TOPIC_NUM_PARTITIONS=1 + - KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR=1 + - KAFKA_TRANSACTION_STATE_LOG_MIN_ISR=1 + - KAFKA_MESSAGE_MAX_BYTES=15728640 + - KAFKA_REPLICA_FETCH_MAX_BYTES=15728640 + - KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS=100 + - 
KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092 + - KAFKA_BROKER_ID=1 + - KAFKA_AUTO_CREATE_TOPICS_ENABLE=True + - KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1 + depends_on: + zookeeper: + condition: service_started + healthcheck: + test: + - CMD + - nc + - -z + - localhost + - '9092' + interval: 1s + start_period: 120s + platform: linux/amd64 + schema-registry: + image: confluentinc/cp-schema-registry:7.9.4 + ports: + - 8081 + networks: + default: + aliases: [] + environment: + - SCHEMA_REGISTRY_KAFKASTORE_TIMEOUT_MS=10000 + - SCHEMA_REGISTRY_KAFKASTORE_TOPIC_REPLICATION_FACTOR=1 + - SCHEMA_REGISTRY_HOST_NAME=schema-registry + - SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS=PLAINTEXT://kafka:9092 + command: + - /bin/bash + - -c + - . /etc/confluent/docker/bash-config && . /etc/confluent/docker/mesos-setup.sh + && . /etc/confluent/docker/apply-mesos-overrides && /etc/confluent/docker/configure + && exec /etc/confluent/docker/launch + depends_on: + kafka: + condition: service_started + healthcheck: + test: + - CMD + - curl + - -fu + - materialize:sekurity + - localhost:8081 + interval: 1s + start_period: 120s + platform: linux/amd64 + mysql: + init: true + ports: + - 3306 + environment: + - MYSQL_ROOT_PASSWORD=p@ssw0rd + command: + - --secure-file-priv=/var/lib/mysql-files + - --log-bin=mysql-bin + - --gtid_mode=ON + - --enforce_gtid_consistency=ON + - --binlog-format=row + - --binlog-row-image=full + - --binlog-row-metadata=full + - --server-id=1 + - --max-connections=500 + healthcheck: + test: + - CMD + - mysqladmin + - ping + - --password=p@ssw0rd + - --protocol=TCP + interval: 1s + start_period: 180s + volumes: + - mysqldata_primary:/var/lib/mysql + - mydata:/var/lib/mysql-files + image: mysql:9.5.0 + platform: linux/amd64 + mysql-replica: + init: true + ports: + - 3306 + environment: + - MYSQL_ROOT_PASSWORD=p@ssw0rd + command: + - --secure-file-priv=/var/lib/mysql-files + - --log-bin=mysql-bin + - --gtid_mode=ON + - --enforce_gtid_consistency=ON + - --binlog-format=row + - --binlog-row-image=full + - --binlog-row-metadata=full + - --server-id=2 + - --max-connections=500 + - --log-slave-updates + - --skip-replica-start + - --replica_parallel_workers=4 + - --replica_preserve_commit_order=ON + healthcheck: + test: + - CMD + - mysqladmin + - ping + - --password=p@ssw0rd + - --protocol=TCP + interval: 1s + start_period: 180s + volumes: + - mysqldata_replica:/var/lib/mysql + - mydata:/var/lib/mysql-files + image: mysql:9.5.0 + platform: linux/amd64 + clusterd1: + entrypoint: + - tini + - -- + command: + - clusterd + - --scratch-directory=/scratch + ports: + - 2100 + - 2101 + - 6878 + environment: + - CLUSTERD_GRPC_HOST=clusterd1 + - CLUSTERD_USE_CTP=true + - MZ_SOFT_ASSERTIONS=1 + - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100 + - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101 + - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878 + - CLUSTERD_SECRETS_READER=local-file + - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets + - LD_PRELOAD=libeatmydata.so + - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 + - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 + - CLUSTERD_PROCESS=0 + - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd1:2102"], + "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd1:2103"], + "arrangement_exert_proportionality": 
1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + volumes: + - mzdata:/mzdata + - mydata:/var/lib/mysql-files + - tmp:/share/tmp + - clusterd1_scratch:/scratch + restart: 'no' + stop_grace_period: 120s + platform: linux/amd64 + image: ${MATERIALIZED_IMAGE} + clusterd2: + entrypoint: + - tini + - -- + command: + - clusterd + - --scratch-directory=/scratch + ports: + - 2100 + - 2101 + - 6878 + environment: + - CLUSTERD_GRPC_HOST=clusterd2 + - CLUSTERD_USE_CTP=true + - MZ_SOFT_ASSERTIONS=1 + - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100 + - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101 + - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878 + - CLUSTERD_SECRETS_READER=local-file + - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets + - LD_PRELOAD=libeatmydata.so + - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 + - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 + - CLUSTERD_PROCESS=0 + - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd2:2102"], + "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd2:2103"], + "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + volumes: + - mzdata:/mzdata + - mydata:/var/lib/mysql-files + - tmp:/share/tmp + - clusterd2_scratch:/scratch + restart: 'no' + stop_grace_period: 120s + platform: linux/amd64 + image: ${MATERIALIZED_IMAGE} + materialized: + hostname: materialized + depends_on: + minio: + condition: service_started + postgres-metadata: + condition: service_healthy + command: + - --unsafe-mode + - --environment-id=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 + - --persist-blob-url=s3://minioadmin:minioadmin@persist/persist?endpoint=http://minio:9000/®ion=minio + - --orchestrator-process-propagate-crashes + - --persist-consensus-url=postgres://root@postgres-metadata:26257?options=--search_path=consensus + - --orchestrator-process-tcp-proxy-listen-addr=0.0.0.0 + - --orchestrator-process-prometheus-service-discovery-directory=/mzdata/prometheus + ports: + - 6875 + - 6876 + - 6877 + - 6878 + - 6880 + - 6881 + - 26257 + environment: + - MZ_NO_TELEMETRY=1 + - MZ_NO_BUILTIN_CONSOLE=1 + - MZ_TEST_ONLY_DUMMY_SEGMENT_CLIENT=true + - MZ_SOFT_ASSERTIONS=1 + - MZ_ORCHESTRATOR_PROCESS_TCP_PROXY_LISTEN_ADDR=0.0.0.0 + - MZ_ORCHESTRATOR_PROCESS_PROMETHEUS_SERVICE_DISCOVERY_DIRECTORY=/mzdata/prometheus + - MZ_BOOTSTRAP_ROLE=materialize + - MZ_INTERNAL_PERSIST_PUBSUB_LISTEN_ADDR=0.0.0.0:6879 + - MZ_PERSIST_PUBSUB_URL=http://127.0.0.1:6879 + - MZ_AWS_CONNECTION_ROLE_ARN=arn:aws:iam::123456789000:role/MaterializeConnection + - MZ_AWS_EXTERNAL_ID_PREFIX=eb5cb59b-e2fe-41f3-87ca-d2176a495345 + - MZ_CATALOG_STORE=persist + - 'MZ_CLUSTER_REPLICA_SIZES={"bootstrap": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "1", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", "scale": 1, "workers": 1}, "scale=2,workers=4": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "8", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 2, "workers": 4}, "scale=1,workers=1,legacy": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "1", "disabled": false, "disk_limit": null, 
"is_cc": + false, "memory_limit": "4 GiB", "scale": 1, "workers": 1}, "scale=1,workers=2,legacy": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "2", "disabled": + false, "disk_limit": null, "is_cc": false, "memory_limit": "4 GiB", "scale": + 1, "workers": 2}, "free": {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": + "1", "disabled": true, "disk_limit": null, "is_cc": true, "memory_limit": "4 + GiB", "scale": 1, "workers": 1}, "scale=1,workers=1": {"cpu_exclusive": false, + "cpu_limit": null, "credits_per_hour": "1", "disabled": false, "disk_limit": + null, "is_cc": true, "memory_limit": "4 GiB", "scale": 1, "workers": 1}, "scale=1,workers=1,mem=4GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "1", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 1, "workers": 1}, "scale=1,workers=1,mem=8GiB": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "1", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "8 GiB", "scale": 1, "workers": 1}, "scale=1,workers=1,mem=16GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "1", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "16 GiB", "scale": + 1, "workers": 1}, "scale=1,workers=1,mem=32GiB": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "1", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "32 GiB", "scale": 1, "workers": 1}, "scale=1,workers=1,mem=1GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "1", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "1 GiB", "scale": + 1, "workers": 1}, "scale=1,workers=2": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "2", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", "scale": 1, "workers": 2}, "scale=1,workers=2,mem=4GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "2", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 1, "workers": 2}, "scale=1,workers=2,mem=8GiB": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "2", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "8 GiB", "scale": 1, "workers": 2}, "scale=1,workers=2,mem=16GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "2", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "16 GiB", "scale": + 1, "workers": 2}, "scale=1,workers=2,mem=32GiB": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "2", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "32 GiB", "scale": 1, "workers": 2}, "scale=2,workers=1": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "2", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 2, "workers": 1}, "scale=2,workers=2": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "4", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", "scale": 2, "workers": 2}, "scale=1,workers=2,mem=2GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "2", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "2 GiB", "scale": + 1, "workers": 2}, "scale=1,workers=4": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "4", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", 
"scale": 1, "workers": 4}, "scale=1,workers=4,mem=4GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "4", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 1, "workers": 4}, "scale=1,workers=4,mem=8GiB": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "4", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "8 GiB", "scale": 1, "workers": 4}, "scale=1,workers=4,mem=16GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "4", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "16 GiB", "scale": + 1, "workers": 4}, "scale=1,workers=4,mem=32GiB": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "4", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "32 GiB", "scale": 1, "workers": 4}, "scale=4,workers=1": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "4", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 4, "workers": 1}, "scale=4,workers=4": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "16", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", "scale": 4, "workers": 4}, "scale=1,workers=8": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "8", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 1, "workers": 8}, "scale=1,workers=8,mem=4GiB": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "8", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", "scale": 1, "workers": 8}, "scale=1,workers=8,mem=8GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "8", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "8 GiB", "scale": + 1, "workers": 8}, "scale=1,workers=8,mem=16GiB": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "8", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "16 GiB", "scale": 1, "workers": 8}, "scale=1,workers=8,mem=32GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "8", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "32 GiB", "scale": + 1, "workers": 8}, "scale=8,workers=1": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "8", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", "scale": 8, "workers": 1}, "scale=8,workers=8": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "64", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 8, "workers": 8}, "scale=1,workers=16": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "16", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", "scale": 1, "workers": 16}, "scale=1,workers=16,mem=4GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "16", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 1, "workers": 16}, "scale=1,workers=16,mem=8GiB": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "16", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "8 GiB", "scale": 1, "workers": 16}, "scale=1,workers=16,mem=16GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "16", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "16 GiB", "scale": + 1, 
"workers": 16}, "scale=1,workers=16,mem=32GiB": {"cpu_exclusive": false, + "cpu_limit": null, "credits_per_hour": "16", "disabled": false, "disk_limit": + null, "is_cc": true, "memory_limit": "32 GiB", "scale": 1, "workers": 16}, "scale=16,workers=1": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "16", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 16, "workers": 1}, "scale=16,workers=16": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "256", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", "scale": 16, "workers": 16}, "scale=1,workers=32": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "32", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 1, "workers": 32}, "scale=1,workers=32,mem=4GiB": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "32", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", "scale": 1, "workers": 32}, "scale=1,workers=32,mem=8GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "32", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "8 GiB", "scale": + 1, "workers": 32}, "scale=1,workers=32,mem=16GiB": {"cpu_exclusive": false, + "cpu_limit": null, "credits_per_hour": "32", "disabled": false, "disk_limit": + null, "is_cc": true, "memory_limit": "16 GiB", "scale": 1, "workers": 32}, "scale=1,workers=32,mem=32GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "32", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "32 GiB", "scale": + 1, "workers": 32}, "scale=32,workers=1": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "32", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", "scale": 32, "workers": 1}, "scale=32,workers=32": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "1024", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 32, "workers": 32}}' + - MZ_BOOTSTRAP_DEFAULT_CLUSTER_REPLICA_SIZE=bootstrap + - MZ_BOOTSTRAP_BUILTIN_SYSTEM_CLUSTER_REPLICA_SIZE=bootstrap + - MZ_BOOTSTRAP_BUILTIN_PROBE_CLUSTER_REPLICA_SIZE=bootstrap + - MZ_BOOTSTRAP_BUILTIN_SUPPORT_CLUSTER_REPLICA_SIZE=bootstrap + - MZ_BOOTSTRAP_BUILTIN_CATALOG_SERVER_CLUSTER_REPLICA_SIZE=bootstrap + - MZ_BOOTSTRAP_BUILTIN_ANALYTICS_CLUSTER_REPLICA_SIZE=bootstrap + - MZ_BOOTSTRAP_BUILTIN_SYSTEM_CLUSTER_REPLICATION_FACTOR=1 + - MZ_BOOTSTRAP_BUILTIN_PROBE_CLUSTER_REPLICATION_FACTOR=1 + - MZ_BOOTSTRAP_DEFAULT_CLUSTER_REPLICATION_FACTOR=1 + - COCKROACH_ENGINE_MAX_SYNC_DURATION_DEFAULT=120s + - COCKROACH_LOG_MAX_SYNC_DURATION=120s + - 
MZ_SYSTEM_PARAMETER_DEFAULT=unsafe_enable_unsafe_functions=true;allow_real_time_recency=true;constraint_based_timestamp_selection=verify;enable_compute_peek_response_stash=true;enable_0dt_deployment_panic_after_timeout=true;enable_0dt_deployment_sources=true;enable_alter_swap=true;enable_case_literal_transform=false;enable_cast_elimination=true;enable_coalesce_case_transform=true;enable_columnar_lgalloc=false;enable_columnation_lgalloc=false;enable_compute_correction_v2=true;enable_compute_logical_backpressure=true;enable_connection_validation_syntax=true;enable_copy_to_expr=true;enable_copy_from_remote=true;enable_create_table_from_source=true;enable_eager_delta_joins=true;enable_envelope_debezium_in_subscribe=true;enable_expressions_in_limit_syntax=true;enable_iceberg_sink=true;enable_introspection_subscribes=true;enable_kafka_sink_partition_by=true;enable_lgalloc=false;enable_load_generator_counter=true;enable_logical_compaction_window=true;enable_multi_worker_storage_persist_sink=true;enable_multi_replica_sources=true;enable_rbac_checks=true;enable_reduce_mfp_fusion=true;enable_refresh_every_mvs=true;enable_replacement_materialized_views=true;enable_cluster_schedule_refresh=true;enable_sql_server_source=true;enable_s3_tables_region_check=false;enable_statement_lifecycle_logging=true;enable_storage_introspection_logs=true;enable_compute_temporal_bucketing=true;enable_variadic_left_join_lowering=true;enable_worker_core_affinity=true;grpc_client_http2_keep_alive_timeout=5s;ore_overflowing_behavior=panic;unsafe_enable_table_keys=true;with_0dt_deployment_max_wait=1800s;persist_next_listen_batch_retryer_clamp=16s;persist_next_listen_batch_retryer_initial_backoff=100ms;persist_next_listen_batch_retryer_fixed_sleep=1200ms;persist_enable_arrow_lgalloc_noncc_sizes=true;persist_enable_s3_lgalloc_noncc_sizes=true;compute_correction_v2_chain_proportionality=3;compute_correction_v2_chunk_size=8192;compute_dataflow_max_inflight_bytes=134217728;compute_hydration_concurrency=2;compute_replica_expiration_offset=3d;compute_apply_column_demands=true;compute_peek_response_stash_threshold_bytes=1048576;compute_subscribe_snapshot_optimization=true;enable_compute_sync_mv_sink=true;enable_password_auth=true;enable_frontend_peek_sequencing=true;enable_frontend_subscribes=true;enable_upsert_v2=false;default_timestamp_interval=1s;force_source_table_syntax=false;persist_batch_columnar_format=structured;persist_batch_delete_enabled=true;persist_batch_structured_order=true;persist_batch_builder_structured=true;persist_batch_structured_key_lower_len=256;persist_batch_max_run_len=4;persist_catalog_force_compaction_fuel=1024;persist_catalog_force_compaction_wait=1s;persist_stats_audit_percent=100;persist_stats_audit_panic=true;persist_encoding_enable_dictionary=true;persist_fast_path_limit=1000;persist_fast_path_order=true;persist_gc_use_active_gc=true;persist_gc_min_versions=16;persist_gc_max_versions=128000;persist_inline_writes_single_max_bytes=4096;persist_inline_writes_total_max_bytes=1048576;persist_pubsub_client_enabled=true;persist_pubsub_push_diff_enabled=true;persist_record_compactions=true;persist_record_schema_id=true;persist_rollup_use_active_rollup=true;persist_blob_target_size=16777216;persist_compaction_memory_bound_bytes=83886080;persist_enable_incremental_compaction=true;persist_use_critical_since_catalog=true;persist_use_critical_since_snapshot=false;persist_use_critical_since_source=false;persist_part_decode_format=arrow;persist_blob_cache_scale_with_threads=true;persist_state_update_lease_timeout=1s
;arrangement_size_history_collection_interval=1h;arrangement_size_history_retention_period=7d;persist_validate_part_bounds_on_read=false;persist_validate_part_bounds_on_write=false;statement_logging_default_sample_rate=1.0;statement_logging_max_data_credit=;statement_logging_max_sample_rate=1.0;statement_logging_target_data_rate=;storage_reclock_to_latest=true;storage_source_decode_fuel=100000;storage_statistics_collection_interval=1000;storage_statistics_interval=2000;storage_use_continual_feedback_upsert=true;default_cluster_replication_factor=1;unsafe_enable_unorchestrated_cluster_replicas=true + - MZ_TIMESTAMP_ORACLE_URL=postgres://root@postgres-metadata:26257?options=--search_path=tsoracle + - MZ_NO_BUILTIN_POSTGRES=1 + - MZ_NO_BUILTIN_COCKROACH=1 + - MZ_ADAPTER_STASH_URL=postgres://root@postgres-metadata:26257?options=--search_path=adapter + volumes: + - mzdata:/mzdata + - mydata:/var/lib/mysql-files + - tmp:/share/tmp + - scratch:/scratch + tmpfs: + - /tmp + healthcheck: + test: + - CMD + - curl + - -f + - localhost:6878/api/readyz + interval: 1s + start_period: 600s + stop_grace_period: 120s + platform: linux/amd64 + image: ${MATERIALIZED_IMAGE} + workload: + depends_on: + materialized: + condition: service_healthy + clusterd1: + condition: service_started + clusterd2: + condition: service_started + kafka: + condition: service_healthy + schema-registry: + condition: service_started + mysql: + condition: service_healthy + mysql-replica: + condition: service_healthy + environment: + - PGHOST=materialized + - PGPORT=6875 + - PGUSER=materialize + - PGPORT_INTERNAL=6877 + - PGUSER_INTERNAL=mz_system + - KAFKA_BROKER=kafka:9092 + - SCHEMA_REGISTRY_URL=http://schema-registry:8081 + - MZ_ANTITHESIS_CLUSTER=antithesis_cluster + - MYSQL_HOST=mysql + - MYSQL_REPLICA_HOST=mysql-replica + - MYSQL_PASSWORD=p@ssw0rd + platform: linux/amd64 + image: ${ANTITHESIS_WORKLOAD_IMAGE} +networks: {} +volumes: + mzdata: null + pgdata: null + mysqldata: null + mssqldata: null + sourcedata_512Mb: + driver_opts: + device: tmpfs + type: tmpfs + o: size=512m + mydata: null + tmp: null + secrets: null + scratch: null + mysqldata_primary: null + mysqldata_replica: null + clusterd1_scratch: null + clusterd2_scratch: null diff --git a/test/antithesis/config/mzbuild.yml b/test/antithesis/config/mzbuild.yml new file mode 100644 index 0000000000000..f3491f546dbb5 --- /dev/null +++ b/test/antithesis/config/mzbuild.yml @@ -0,0 +1,33 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# FROM-scratch image holding the docker-compose.yaml + .env for the +# Antithesis environment. Antithesis pulls this image and reads +# `/docker-compose.yaml` to bring up the system under test; `.env` supplies +# `${MATERIALIZED_IMAGE}` / `${ANTITHESIS_WORKLOAD_IMAGE}` at compose-parse +# time. +# +# The compose YAML (committed, topology-only) is generated from +# `test/antithesis/mzcompose.py` via `bin/pyactivate +# test/antithesis/export-compose.py`. Regenerate when topology changes; CI +# verifies the committed copy is up to date. +# +# `.env` (generated, gitignored) is written by +# `bin/pyactivate test/antithesis/export-env.py` at build time. 
Its content +# changes every materialized fingerprint shift, which is what propagates +# fresh fingerprints into this image without touching the committed YAML. +# +# `publish: false` keeps the standard `ci.test.build` flow from trying to +# build this image — it would fail on `COPY docker-compose.yaml .env /` +# because `.env` is gitignored and only `build-antithesis.sh` writes it. +# The antithesis nightly step builds and pushes the image directly via +# push-antithesis.py. + +name: antithesis-config +publish: false diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py new file mode 100644 index 0000000000000..a204a76fdbf87 --- /dev/null +++ b/test/antithesis/export-compose.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python3 + +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Export the resolved docker-compose YAML for the Antithesis composition. + +Loads `test/antithesis/mzcompose.py` and dumps a docker-compose YAML to +stdout where Materialize-built images are emitted as compose env-var +placeholders (`${MATERIALIZED_IMAGE}`, `${ANTITHESIS_WORKLOAD_IMAGE}`). +The actual fingerprint values are supplied separately in a `.env` file +generated by `export-env.py`. This separation lets the committed YAML stay +stable across materialized source changes — only `.env` shifts per +fingerprint. + +Image-reference policy: + + * Materialize-built images (`materialized`, `antithesis-workload`) + become `${MATERIALIZED_IMAGE}` / `${ANTITHESIS_WORKLOAD_IMAGE}`. + Compose interpolates them from `.env` at parse time. The actual specs + are `ghcr.io/materializeinc/materialize/:mzbuild-` with + `antithesis=True` participating in the fingerprint. + + * Third-party `mzbuild` images (`postgres`, `minio`) are replaced with + the public upstream image. Our mzbuild variants bake in test-friendly + patches (eatmydata, no_fsync) that defeat Antithesis's fault injection; + Antithesis runs against vanilla. + +The script also strips mzcompose-only keys, host bind-mounts, and host-path +env vars that don't resolve inside the Antithesis sandbox, and inlines the +postgres bootstrap SQL into the entrypoint (the bind-mount path won't +exist). + +Usage: + bin/pyactivate test/antithesis/export-compose.py \\ + > test/antithesis/config/docker-compose.yaml +""" + +import sys +from pathlib import Path +from typing import Any + +import yaml + +from materialize import MZ_ROOT +from materialize.mzbuild import Repository +from materialize.mzcompose.composition import Composition +from materialize.xcompile import Arch + +# mzbuild image names that we publish under our fingerprint. Each maps to +# the compose env-var placeholder; `.env` (export-env.py) supplies the +# concrete ref at compose-parse time. Keep in sync with `export-env.py`. +MATERIALIZE_IMAGES = { + "materialized": "${MATERIALIZED_IMAGE}", + "antithesis-workload": "${ANTITHESIS_WORKLOAD_IMAGE}", +} + +# Public-image fallbacks for mzbuild images whose Materialize-specific +# customizations subvert Antithesis (eatmydata, fsync no-ops, etc.). +# Antithesis can reach public registries — we just need to make sure the +# compose points at the upstream image, not our patched mzbuild build. 
+PUBLIC_FALLBACKS = { + "postgres": "postgres:17.7", + "minio": "minio/minio:latest", +} + +# Header prepended to the generated YAML so check-copyright passes and +# readers know the file isn't hand-edited. +HEADER = """\ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# GENERATED FILE — do not edit. Regenerate via: +# bin/pyactivate test/antithesis/export-compose.py > test/antithesis/config/docker-compose.yaml +# Source of truth: test/antithesis/mzcompose.py. + +""" + + +def resolve_mzbuild(svc: dict[str, Any]) -> None: + """Replace `mzbuild:` with a concrete or templated `image:` ref.""" + name = svc.pop("mzbuild") + if name in MATERIALIZE_IMAGES: + svc["image"] = MATERIALIZE_IMAGES[name] + elif name in PUBLIC_FALLBACKS: + svc["image"] = PUBLIC_FALLBACKS[name] + else: + raise ValueError( + f"mzbuild image {name!r} has no Antithesis policy — add it to " + f"MATERIALIZE_IMAGES (use a `.env` placeholder) or " + f"PUBLIC_FALLBACKS (swap to a public image) in export-compose.py." + ) + + +def inline_postgres_setup(svc: dict[str, Any]) -> None: + """Replace the bind-mounted setup SQL with an inline entrypoint write. + + Antithesis has no host filesystem, so we can't mount the SQL file. + Read it from misc/postgres/setup_materialize.sql (one source of truth) + and bake it into the service entrypoint. + """ + if not svc.get("image", "").startswith("postgres:"): + return + + env = svc.setdefault("environment", []) + # eatmydata isn't installed in the public postgres image. + env[:] = [e for e in env if not e.startswith("LD_PRELOAD=")] + # Trust auth — Antithesis-internal traffic only. + env.append("POSTGRES_HOST_AUTH_METHOD=trust") + + # Drop the bind-mounted setup SQL; we'll inline it. + vols = svc.get("volumes", []) + vols[:] = [v for v in vols if "setup_materialize.sql" not in v] + if not vols: + svc.pop("volumes", None) + + setup_sql = (MZ_ROOT / "misc" / "postgres" / "setup_materialize.sql").read_text() + # Strip comment lines + collapse to one statement per output line so we + # can safely double-quote it inside the sh -c here. + setup_sql = "\n".join( + line for line in setup_sql.splitlines() if line and not line.startswith("--") + ) + svc["entrypoint"] = [ + "sh", + "-c", + # `$$@` survives compose's $-interpolation and arrives as `$@` at the + # shell, forwarding any args (e.g., the `postgres` CMD) verbatim. + f"cat <<'SQL' > /docker-entrypoint-initdb.d/z_setup_materialize.sql\n" + f"{setup_sql}\n" + f"SQL\n" + f'exec docker-entrypoint.sh "$$@"', + "--", + ] + + +def strip_host_bindmounts(svc: dict[str, Any]) -> None: + """Drop volume entries that bind-mount a host path.""" + if "volumes" not in svc: + return + svc["volumes"] = [ + v + for v in svc["volumes"] + if not isinstance(v, str) + or ":" not in v + or not v.split(":", 1)[0].startswith("/") + ] + if not svc["volumes"]: + del svc["volumes"] + + +def strip_incompatible_env(svc: dict[str, Any]) -> None: + """Drop env vars that are unsafe or unresolvable under Antithesis. + + - `MZ_EAT_MY_DATA` enables `libeatmydata.so` (fsync no-op) — fatal for + crash-recovery testing under fault injection. 
+ - `MZ_LISTENERS_CONFIG_PATH` and `MZ_EXTERNAL_LOGIN_PASSWORD_*` reference + host paths or host secrets that don't exist in the sandbox. + - Bare env vars (no `=`) inherit from the host environment, which is + empty under Antithesis; drop them so materialized's built-in defaults + apply. + """ + if "environment" not in svc: + return + drop_prefixes = ( + "MZ_EAT_MY_DATA=", + "MZ_LISTENERS_CONFIG_PATH=", + "MZ_EXTERNAL_LOGIN_PASSWORD_", + ) + svc["environment"] = [ + e for e in svc["environment"] if "=" in e and not e.startswith(drop_prefixes) + ] + + +def strip_mzcompose_keys(svc: dict[str, Any]) -> None: + """Drop keys understood by mzcompose but not by docker/podman compose.""" + for key in ("propagate_uid_gid", "allow_host_ports", "publish"): + svc.pop(key, None) + + +def register_referenced_named_volumes(compose: dict[str, Any]) -> None: + """Declare any named volume referenced by a service that isn't already + declared at the top level. Docker Compose rejects the file otherwise. + + mzcompose's `Composition` only auto-declares the fixed `DEFAULT_MZ_VOLUMES` + set; per-service custom named volumes (e.g. `clusterd1_scratch`) reference + names that have no top-level entry and fail `docker compose config`. + """ + top_level: dict[str, Any] = compose.setdefault("volumes", {}) or {} + compose["volumes"] = top_level + + for svc in compose.get("services", {}).values(): + for entry in svc.get("volumes", []) or []: + if not isinstance(entry, str): + continue + # Bind mounts (`/host:/container`) start with `/`; named volumes + # are bare identifiers. We only auto-declare the latter. + if entry.startswith("/"): + continue + name = entry.split(":", 1)[0] + if not name or name in top_level: + continue + top_level[name] = None + + +def main() -> None: + # munge_services=False keeps ports bare (e.g., `6875` instead of + # `127.0.0.1::6875`) — Antithesis is container-to-container, no host + # binding. We do our own mzbuild→image substitution below and don't + # need fingerprint resolution since Materialize-built images become + # `${...}` placeholders. + repo = Repository(Path("."), arch=Arch.X86_64, antithesis=True) + c = Composition(repo, "antithesis", munge_services=False) + + for svc in c.compose["services"].values(): + svc["platform"] = "linux/amd64" + if "mzbuild" in svc: + resolve_mzbuild(svc) + inline_postgres_setup(svc) + strip_host_bindmounts(svc) + strip_incompatible_env(svc) + strip_mzcompose_keys(svc) + + register_referenced_named_volumes(c.compose) + + sys.stdout.write(HEADER) + yaml.dump(c.compose, sys.stdout, default_flow_style=False, sort_keys=False) + + +if __name__ == "__main__": + main() diff --git a/test/antithesis/export-env.py b/test/antithesis/export-env.py new file mode 100644 index 0000000000000..5488a0f097673 --- /dev/null +++ b/test/antithesis/export-env.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Emit the `.env` file consumed by Antithesis's docker-compose.yaml. + +The compose YAML (export-compose.py) is committed with `${MATERIALIZED_IMAGE}` +/ `${ANTITHESIS_WORKLOAD_IMAGE}` placeholders so it stays stable across +materialized source changes. 
This script writes the corresponding `.env` +with the current mzbuild fingerprints so compose can interpolate them. + +Run at CI build time (build-antithesis.sh) and at local-dev `make build`. +The `antithesis-config` mzbuild image copies in the .env produced by this +script, so the image's own fingerprint tracks the materialized fingerprint +transitively — same materialized → same .env → same antithesis-config. + +With `--registry`, the emitted refs use that registry prefix instead of +the default (whatever `spec()` returns based on `MZ_GHCR`). CI passes the +Antithesis GCP Artifact Registry so the compose Antithesis pulls +references images at the registry Antithesis can actually reach. + +Usage: + bin/pyactivate test/antithesis/export-env.py \\ + > test/antithesis/config/.env + bin/pyactivate test/antithesis/export-env.py \\ + --registry us-central1-docker.pkg.dev/molten-verve-216720/materialize-repository \\ + > test/antithesis/config/.env +""" + +import argparse +import sys +from pathlib import Path + +from materialize.mzbuild import Repository +from materialize.xcompile import Arch + +# Mapping of `.env` variable name → mzbuild image name. Keep in sync with +# MATERIALIZE_IMAGES in export-compose.py. +ENV_VARS = { + "MATERIALIZED_IMAGE": "materialized", + "ANTITHESIS_WORKLOAD_IMAGE": "antithesis-workload", +} + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + parser.add_argument( + "--registry", + default=None, + help=( + "Registry prefix to use for emitted refs. If unset, uses the " + "default `spec()` (GHCR when MZ_GHCR=1, else Docker Hub)." + ), + ) + args = parser.parse_args() + + repo = Repository(Path("."), arch=Arch.X86_64, antithesis=True) + images = [repo.images[name] for name in ENV_VARS.values()] + deps = repo.resolve_dependencies(images) + + sys.stdout.write( + "# GENERATED FILE — do not edit. Regenerate via:\n" + "# bin/pyactivate test/antithesis/export-env.py > test/antithesis/config/.env\n" + "# Consumed by test/antithesis/config/docker-compose.yaml at compose-parse time.\n" + ) + for var, image_name in ENV_VARS.items(): + if args.registry: + ref = ( + f"{args.registry}/{image_name}:mzbuild-{deps[image_name].fingerprint()}" + ) + else: + ref = deps[image_name].spec() + sys.stdout.write(f"{var}={ref}\n") + + +if __name__ == "__main__": + main() diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py new file mode 100644 index 0000000000000..584f39852dd12 --- /dev/null +++ b/test/antithesis/mzcompose.py @@ -0,0 +1,193 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +""" +Antithesis test composition for Materialize. + +Topology exercised under Antithesis: + - postgres-metadata : consensus/catalog/timestamp-oracle store + - minio : S3-compatible blob storage for persist + - zookeeper + kafka : Kafka broker for source ingestion + - schema-registry : Avro/Protobuf schemas for kafka sources + - clusterd1, clusterd2 : two external compute+storage processes — each + backs one replica of `antithesis_cluster`, so + Antithesis killing either container exercises the + compute/storage-replica recovery and rebalancing + paths without taking the cluster offline. 
+ - materialized : the SUT (environmentd; clusterd is external) + - workload : Python test driver wired to the Antithesis SDK + +Usage: + bin/mzcompose --find antithesis run default # bring up the cluster + bin/pyactivate test/antithesis/export-compose.py > config/... # dump compose YAML +""" + +from materialize.mzcompose.composition import Composition +from materialize.mzcompose.service import Service, ServiceConfig +from materialize.mzcompose.services.clusterd import Clusterd +from materialize.mzcompose.services.kafka import Kafka +from materialize.mzcompose.services.materialized import Materialized +from materialize.mzcompose.services.minio import Minio +from materialize.mzcompose.services.mysql import MySql, create_mysql_server_args +from materialize.mzcompose.services.postgres import PostgresMetadata +from materialize.mzcompose.services.schema_registry import SchemaRegistry +from materialize.mzcompose.services.zookeeper import Zookeeper + + +class Workload(Service): + """Antithesis workload client — Python test driver.""" + + def __init__(self) -> None: + config: ServiceConfig = { + "mzbuild": "antithesis-workload", + "depends_on": { + "materialized": {"condition": "service_healthy"}, + "clusterd1": {"condition": "service_started"}, + "clusterd2": {"condition": "service_started"}, + "kafka": {"condition": "service_healthy"}, + "schema-registry": {"condition": "service_started"}, + "mysql": {"condition": "service_healthy"}, + "mysql-replica": {"condition": "service_healthy"}, + }, + "environment": [ + "PGHOST=materialized", + "PGPORT=6875", + "PGUSER=materialize", + # Internal SQL port for system-privileged setup (CREATE CLUSTER). + "PGPORT_INTERNAL=6877", + "PGUSER_INTERNAL=mz_system", + "KAFKA_BROKER=kafka:9092", + "SCHEMA_REGISTRY_URL=http://schema-registry:8081", + # Name of the unmanaged cluster the workload-entrypoint + # provisions against clusterd1 before emitting setup-complete. + "MZ_ANTITHESIS_CLUSTER=antithesis_cluster", + # MySQL primary and replica connection details. + "MYSQL_HOST=mysql", + "MYSQL_REPLICA_HOST=mysql-replica", + f"MYSQL_PASSWORD={MySql.DEFAULT_ROOT_PASSWORD}", + ], + } + super().__init__(name="workload", config=config) + + +SERVICES = [ + PostgresMetadata(), + Minio(setup_materialize=True), + Zookeeper(), + Kafka(auto_create_topics=True), + SchemaRegistry(), + # MySQL primary — GTID-enabled. WRITESET binlog dependency tracking + # is what lets the replica run parallel workers without losing commit + # order; in MySQL 8.4+ WRITESET is the default and the explicit knob + # was removed (`binlog_transaction_dependency_tracking` is unknown + # past 8.4, and the antithesis image is `mysql:9.5.0`). + MySql( + use_seeded_image=False, + volumes=[ + "mysqldata_primary:/var/lib/mysql", + "mydata:/var/lib/mysql-files", + ], + additional_args=create_mysql_server_args(server_id="1", is_master=True), + ), + # MySQL replica — multithreaded replication (4 workers, commit-order + # preserved). Replication is configured at runtime by + # first_mysql_replica_setup.py after both containers are healthy. + MySql( + name="mysql-replica", + use_seeded_image=False, + volumes=[ + "mysqldata_replica:/var/lib/mysql", + "mydata:/var/lib/mysql-files", + ], + additional_args=create_mysql_server_args(server_id="2", is_master=False) + + [ + "--replica_parallel_workers=4", + "--replica_preserve_commit_order=ON", + ], + ), + # Two clusterd processes, one per replica of the unmanaged + # `antithesis_cluster`. 
Provisioning both replicas in the same cluster + # exercises multi-replica source ingestion and compute paths + # (notably the `compute-replica-epoch-isolation` property), and lets + # Antithesis kill either replica's backing container without taking + # the workload offline. + # + # `workers=4` per clusterd means each replica runs four timely worker + # threads in one process. The extra intra-process parallelism is the + # surface area Antithesis's thread-pausing fault targets — with a + # single worker, "pause one thread" effectively pauses the whole + # process, which the container-pause fault already covers. The matching + # `WORKERS 4` in the CREATE CLUSTER REPLICAS statement must stay in + # lockstep with this value (it's read by the controller, not by + # clusterd). + # + # Each clusterd MUST have its own /scratch volume — the upsert + # operator's RocksDB state lives there and takes an exclusive file + # lock per worker (`/scratch/storage/upsert///LOCK`). + # The DEFAULT_MZ_VOLUMES list uses a single named volume + # `scratch:/scratch` shared across containers; passing per-instance + # named volumes (`clusterd1_scratch`, `clusterd2_scratch`) keeps the + # locks separate while leaving the other volumes shared. Found via + # an Antithesis run where clusterd1 deadlocked retrying to open + # `/scratch/storage/upsert/u3/0/LOCK` because clusterd2 held it, + # which then drove a continuous suspend-and-restart loop that + # corrupted the upsert state. + Clusterd( + name="clusterd1", + workers=4, + volumes=[ + "mzdata:/mzdata", + "mydata:/var/lib/mysql-files", + "tmp:/share/tmp", + "clusterd1_scratch:/scratch", + ], + ), + Clusterd( + name="clusterd2", + workers=4, + volumes=[ + "mzdata:/mzdata", + "mydata:/var/lib/mysql-files", + "tmp:/share/tmp", + "clusterd2_scratch:/scratch", + ], + ), + Materialized( + external_blob_store=True, + external_metadata_store=True, + metadata_store="postgres-metadata", + unsafe_mode=True, + soft_assertions=True, + sanity_restart=False, + support_external_clusterd=True, + # Allow creating an unmanaged cluster pointed at clusterd1 — without + # this, CREATE CLUSTER ... STORAGECTL ADDRESSES is rejected. + additional_system_parameter_defaults={ + "unsafe_enable_unorchestrated_cluster_replicas": "true", + }, + ), + Workload(), +] + + +def workflow_default(c: Composition) -> None: + """Bring up the Antithesis test cluster.""" + c.up( + "postgres-metadata", + "minio", + "zookeeper", + "kafka", + "schema-registry", + "clusterd1", + "clusterd2", + "mysql", + "mysql-replica", + ) + c.up("materialized") + c.up("workload") diff --git a/test/antithesis/push-antithesis.py b/test/antithesis/push-antithesis.py new file mode 100755 index 0000000000000..fe1dc7555ea74 --- /dev/null +++ b/test/antithesis/push-antithesis.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Retag + push antithesis-flavored images to Antithesis's GCP registry. + +Antithesis's sandbox pulls images by reference. Our standard mzbuild flow +publishes to GHCR with `mzbuild-` tags, but new GHCR packages default +to private visibility — Antithesis hits a 4001 (image-not-reachable) when +trying to pull them. 
Pushing to a GCP Artifact Registry whose IAM grants +Antithesis read access avoids the visibility dance entirely. + +This script presumes `ci.test.build` has already run (so the source images +exist locally) and that `docker login` against the target registry has +already happened (build-antithesis.sh handles that via +GCP_SERVICE_ACCOUNT_JSON). + +Usage: + bin/pyactivate test/antithesis/push-antithesis.py \\ + --registry us-central1-docker.pkg.dev/molten-verve-216720/materialize-repository +""" + +import argparse +from pathlib import Path + +from materialize import spawn, ui +from materialize.mzbuild import Repository +from materialize.xcompile import Arch + +# Images Antithesis needs to be able to pull: +# - antithesis-config holds the docker-compose.yaml + .env Antithesis runs. +# - materialized + antithesis-workload are referenced by that compose. +# Keep this list in sync with the `antithesis_images` branch in +# ci/test/build.py — that's where CI_ANTITHESIS scopes the mzbuild walk so +# the nightly doesn't waste time building images Antithesis never consumes. +ANTITHESIS_IMAGES = ["materialized", "antithesis-workload", "antithesis-config"] + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + parser.add_argument( + "--registry", + required=True, + help="Antithesis registry prefix, e.g. us-central1-docker.pkg.dev/molten-verve-216720/materialize-repository", + ) + args = parser.parse_args() + + # Match the Repository configuration used by ci.test.build so that + # `deps[name].spec()` returns the same local tag that build actually + # produced (materialize/:mzbuild-, not the GHCR-prefixed one). + repo = Repository( + Path("."), + arch=Arch.X86_64, + antithesis=True, + image_registry="materialize", + ) + deps = repo.resolve_dependencies([repo.images[name] for name in ANTITHESIS_IMAGES]) + + # Ensure each image is actually present locally before retag — ci.test.build's + # `ensure()` path may short-circuit to "already pushed" without leaving a + # local copy if the fingerprint was already in the cache. + deps.acquire() + + for name in ANTITHESIS_IMAGES: + resolved = deps[name] + source = resolved.spec() + target = f"{args.registry}/{name}:mzbuild-{resolved.fingerprint()}" + ui.section(f"Pushing {name}") + print(f" source: {source}") + print(f" target: {target}") + spawn.runv(["docker", "tag", source, target]) + spawn.runv(["docker", "push", target]) + + +if __name__ == "__main__": + main() diff --git a/test/antithesis/scratchbook/bug-candidates.md b/test/antithesis/scratchbook/bug-candidates.md new file mode 100644 index 0000000000000..f90d8b377a706 --- /dev/null +++ b/test/antithesis/scratchbook/bug-candidates.md @@ -0,0 +1,161 @@ +# Bug Candidates for Antithesis Reproduction + +Bugs found by mining the Materialize git history for timing/concurrency fixes +that Antithesis's deterministic scheduling would reliably find. + +## 1. Persist Lease Race (Best Candidate) + +**Commit**: `43f024da36` — "persist: Make sure to obtain a lease before selecting a batch" +**PR**: #35554 +**Severity**: Production incident — read-time halt +**Category**: TOCTOU race + +### The Bug + +Persist uses "seqno leases" to prevent GC from deleting batches a reader is +still processing. Before the fix, readers selected a batch *then* obtained a +lease. GC could delete the batch in between: + +``` +Reader GC +────── ── +1. snapshot() at SeqNo 5 + → picks BatchA (blob: part-0001) + 2. Compaction merges BatchA away → SeqNo 6 + 3. seqno_since advances (no lease on 5) + 4. 
Deletes part-0001 from blob storage +5. lease_seqno() → SeqNo 7 (too late) +6. fetch(BatchA) → 404 → HALT +``` + +The fix reorders to: lease first, then select batch. The lease prevents GC +from advancing past the leased SeqNo. + +### Code Paths Affected + +- `Listen::next` (read.rs:287) — continuous feed that hydrates MVs. Runs in + the background for every materialized view with an active source. This is the + most natural trigger — always active, exercises the lease path on every new + batch. +- `snapshot_cursor` (read.rs:1176) — used by "persist peeks" (SELECT on + unindexed tables). Less common than the listen path. +- `snapshot_and_fetch` (read.rs:889) — used by catalog ops and txn-WAL reads. + +All three now go through `snapshot_batches()` (read.rs:846), which does +lease-then-snapshot. + +### Workload to Trigger + +Simple mixed read/write traffic exercises the listen path: +- Continuous INSERTs into a table (creates new batches → SeqNo churn → GC pressure) +- A materialized view over that table (its listen is always running) +- Concurrent SELECTs on the MV (served from in-memory arrangements, but + the listen feeding the MV is the actual race target) + +Compaction and GC run automatically in the background. Antithesis's scheduler +can interleave GC between batch selection and lease acquisition. + +### Properties + +- `persist-cas-monotonicity` — batch data should never disappear +- `critical-reader-fence-linearization` — leases should protect batches +- Workload-side: reads never hang or error unexpectedly +- SUT-side: the panic at read.rs:864 fires if a batch is missing after the + upper advanced (added by the fix — would need to be preserved in a + revert-and-detect test) + +### Testing Notes + +A pure `git revert` of `43f024da36` removes both the fix AND the panic that +detects the impossible state. To validate, surgically revert only the ordering +(put lease back after snapshot) while keeping the panic or replacing it with +`assert_unreachable!`. + +--- + +## 2. Compute Dependency Frontier Race + +**Commit**: `42a22b7ff5` — "compute: fix a race condition in collecting dependency frontiers" +**Severity**: Compute controller panic +**Category**: TOCTOU — check-then-act across async boundary + +### The Bug + +The compute controller checked whether storage collections existed, then +collected their frontiers in a second step. Collections could be dropped +between the two steps: + +``` +Step 1: check_exists(collection_id) → true + storage drops collection_id +Step 2: collections_frontiers([collection_id]) → panic! missing key +``` + +Fix: replaced the two-step check-then-read with a single +`collection_frontiers(id).ok()` that handles missing collections atomically. + +**File**: `src/compute-client/src/controller/instance.rs` + +### Workload to Trigger + +Rapid concurrent DDL — CREATE/DROP of sources and MVs while the compute +controller is resolving dependency frontiers. + +### Properties + +- `compute-replica-epoch-isolation` +- System should never panic from DDL operations + +--- + +## 3. Reclock Upper Race with as_of + +**Commit**: `e3805ad790` — "Fetch latest upper in reclock to avoid races with as_of" +**Severity**: Panic (fixes database-issues#8698) +**Category**: Stale cached value in timing-sensitive decision + +A cached `upper` became stale between caching and `as_of` calculation, causing +panic when `as_of > upper`. + +### Properties + +- `strict-serializable-reads` + +--- + +## 4. 
MV-Sink Discarding Valid Batch Descriptions + +**Commit**: `0886c94dc2` — "mv-sink: stop discarding valid batch descriptions" +**Severity**: Silent data loss +**Category**: Stale frontier view + +Incorrect persist frontier view caused valid batch descriptions to be rejected +as "outdated." No crash, no error — just silently dropped data. + +### Properties + +- `mv-reflects-source-updates` + +--- + +## 5. Introspection Collection Frontier Regression + +**Commit**: `ec4f8996bb` — "compute: avoid frontier regressions for introspection collections" +**Severity**: Frontier monotonicity violation +**Category**: Initialization ordering mismatch + +### Properties + +- `persist-cas-monotonicity` + +--- + +## 6. as_of Selection Upper Constraint Bugs + +**Commit**: `e6ca4801fa` — "as_of_selection: fix two bugs around upper constraints" +**Severity**: 0dt upgrade availability blocked +**Category**: Incorrect boundary calculation + +### Properties + +- `deployment-promotion-safety` diff --git a/test/antithesis/scratchbook/deployment-topology.md b/test/antithesis/scratchbook/deployment-topology.md new file mode 100644 index 0000000000000..b03f0aa469449 --- /dev/null +++ b/test/antithesis/scratchbook/deployment-topology.md @@ -0,0 +1,157 @@ +# Deployment Topology: Materialize + +## Approach: mzcompose-Generated Docker Compose + +The most straightforward path is to use Materialize's **mzcompose** framework to generate the Docker Compose configuration for Antithesis. mzcompose already defines all the service classes, health checks, environment variables, and dependencies needed to run a complete Materialize test environment. + +**Strategy**: Write an `mzcompose.py` file that defines the Antithesis test topology, use mzcompose to generate the Docker Compose YAML, then adapt it for Antithesis (adding test template mounts). + +## Topology Overview + +``` ++---------------------+ +---------------------+ +| workload-client | ---> | materialized | +| (test driver, | <--- | (environmentd + | +| Antithesis SDK, | | embedded clusterd) | +| test templates) | | | ++---------------------+ +---------+-----------+ + | + +------------------+------------------+ + | | | + v v v + +----------------+ +----------------+ +----------------+ + | postgres- | | minio | | redpanda | + | metadata | | (blob storage) | | (Kafka-compat) | + | (consensus) | | | | | + +----------------+ +----------------+ +----------------+ +``` + +## Container Specifications + +### 1. postgres-metadata (Dependency) + +| | | +|---|---| +| **Role** | Metadata store / consensus for persist and catalog | +| **Image** | `postgres:16` (or mzcompose's `PostgresMetadata` service) | +| **Why** | Default metadata store in modern mzcompose. Lighter than CockroachDB. Sufficient for single-node testing. | +| **Ports** | 5432 | +| **Health check** | `pg_isready -U postgres` | +| **Network connections** | materialized reads/writes catalog and persist consensus | +| **Replicas** | 1 | + +PostgreSQL is the default metadata store in modern Materialize testing (`EXTERNAL_METADATA_STORE=postgres-metadata`). CockroachDB is an alternative but adds complexity and state space without benefit for single-coordinator testing. + +### 2. minio (Dependency) + +| | | +|---|---| +| **Role** | S3-compatible blob storage for persist data | +| **Image** | `minio/minio` (or mzcompose's `Minio` with `setup_materialize=True`) | +| **Why** | Persist stores all durable data (source data, MV data, catalog snapshots) in blob storage. MinIO is the standard test substitute for S3. 
| **Ports** | 9000 (S3 API), 9001 (console) |
+| **Health check** | `curl --fail http://localhost:9000/minio/health/live` |
+| **Network connections** | materialized writes/reads persist blobs |
+| **Replicas** | 1 |
+| **Config** | Pre-create `/data/persist` bucket. `MINIO_STORAGE_CLASS_STANDARD=EC:0` |
+
+### 3. redpanda (Dependency)
+
+| | |
+|---|---|
+| **Role** | Kafka-compatible message broker for stream source ingestion |
+| **Image** | `redpandadata/redpanda` (or mzcompose's `Redpanda` service) |
+| **Why** | Enables testing the Kafka source ingestion path, which is the most common production use case. Redpanda is lighter than Kafka+Zookeeper and includes a built-in Schema Registry. |
+| **Ports** | 9092 (Kafka API), 8081 (Schema Registry) |
+| **Health check** | `rpk cluster health` |
+| **Network connections** | materialized reads source data; workload-client may produce test data |
+| **Replicas** | 1 |
+
+### 4. materialized (Service — SUT)
+
+| | |
+|---|---|
+| **Role** | The system under test. Runs environmentd (coordinator) with embedded clusterd (compute/storage workers). |
+| **Image** | `materialized` (mzcompose's `Materialized` service, built via `mzbuild`) |
+| **Why** | This is the core SUT. The embedded clusterd mode runs everything in one process, simplifying the topology while still exercising all three layers (adapter, compute, storage). |
+| **Ports** | 6875 (pgwire), 6876-6878 (API/admin), 6879 (persist pubsub), 26257 (pg-compat) |
+| **Health check** | `curl -f localhost:6878/api/readyz` (interval 1s, start_period 600s) |
+| **Network connections** | postgres-metadata (consensus), minio (blob), redpanda (sources) |
+| **Replicas** | 1 |
+| **Key environment** | `MZ_NO_TELEMETRY=1`, `MZ_SOFT_ASSERTIONS=1`, `MZ_CATALOG_STORE=persist`, `MZ_BOOTSTRAP_ROLE=materialize`, `MZ_UNSAFE_MODE=1` |
+| **Key command args** | `--unsafe-mode`, `--persist-blob-url=s3://minioadmin:minioadmin@persist/persist?endpoint=http://minio:9000/&region=minio`, `--environment-id=...` |
+| **Depends on** | postgres-metadata, minio |
+
+**Design decision**: Use embedded clusterd (single process) rather than separate clusterd containers. This reduces state space while still exercising all code paths. Separate clusterd testing can be added as a second topology later.
+
+### 5. workload-client (Client — Test Driver)
+
+| | |
+|---|---|
+| **Role** | Runs Antithesis test commands. Emits `setup_complete`. Contains test templates. |
+| **Image** | Custom image built on top of testdrive or a Python-based client |
+| **Why** | Exercises the system via SQL (pgwire), produces Kafka messages, and asserts properties via the Antithesis SDK. |
+| **Ports** | None exposed |
+| **Network connections** | materialized (pgwire:6875), redpanda (Kafka:9092, SR:8081) |
+| **Replicas** | 1 |
+| **Test template mount** | `/opt/antithesis/test/v1/materialize/` |
+
+The workload client needs:
+1. PostgreSQL client library (psycopg2 or psql) to issue SQL
+2. Kafka producer library to push test data
+3. Antithesis Python SDK for assertions and lifecycle signals
+4. 
Test command scripts with appropriate prefixes (`first_`, `parallel_driver_`, `eventually_`, `finally_`) + +## SDK Selection + +| Component | Language | SDK Needed | +|-----------|----------|------------| +| workload-client | Python | `antithesis-sdk` Python package — for assertions, lifecycle signals | +| materialized (optional, future) | Rust | `antithesis-sdk` Rust crate — for SUT-side reachability/safety assertions | + +The workload client **must** have the SDK for emitting assertions. SUT-side Rust SDK instrumentation is optional but recommended for deeper coverage of internal invariants (persist CaS correctness, frontier monotonicity, catalog consistency). + +## mzcompose Integration Path + +### Option A: Static Docker Compose (Recommended for v1) + +1. Write an `mzcompose.py` that defines the topology above +2. Run `mzcompose --find antithesis gen-docker-compose` (or equivalent) to emit YAML +3. Add any Antithesis-specific adaptations as needed +4. Place the resulting `docker-compose.yml` in `guest/opt/materialize/` + +### Option B: Dynamic mzcompose (Future) + +1. Package the entire mzcompose framework into the workload-client image +2. Use a `first_` test command to generate and start the compose topology +3. More flexible but more complex; requires mzcompose to work inside Antithesis + +Option A is the pragmatic choice. It generates a compose file that Antithesis can directly manage. + +## Workload Design (High Level) + +Test commands in `/opt/antithesis/test/v1/materialize/`: + +| Command | Type | Purpose | +|---------|------|---------| +| `first_setup.sh` | first_ | Create sources, materialized views, tables. Establish baseline state. | +| `parallel_driver_sql_workload.py` | parallel_driver_ | Continuously run SQL operations: INSERTs, SELECTs, CREATE/DROP views. Assert consistency properties. | +| `parallel_driver_kafka_producer.py` | parallel_driver_ | Produce messages to Kafka topics. Verify they appear in materialized views. | +| `eventually_consistency_check.py` | eventually_ | Verify that all acknowledged writes are visible in materialized views. | +| `finally_invariant_check.py` | finally_ | Final consistency sweep: compare source data with MV contents. | +| `anytime_health_check.sh` | anytime_ | Verify system health endpoint and basic SQL connectivity. | + +## Assumptions + +- Embedded clusterd (single process) is sufficient for initial testing +- PostgreSQL is the preferred metadata store (simpler than CockroachDB) +- Redpanda is preferred over Kafka+Zookeeper (lighter, built-in schema registry) +- The workload client will be Python-based (leveraging existing testdrive patterns) +- Static Docker Compose generation (Option A) is the right starting point + +## Open Questions + +- Should we also test with external clusterd processes (separate compute replicas)? +- Should materialized be subject to fault injection, or only the network between it and dependencies? +- What is the best base image for the workload client — extend the existing testdrive image or build from scratch? +- Should the workload client use testdrive's `.td` format or raw SQL via psycopg? 
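+## Appendix: Illustrative Consistency-Check Sketch
+
+To make the workload design above concrete, the sketch below shows roughly what `eventually_consistency_check.py` could look like. It is illustrative only: the table and view names (`t_inserts`, `mv_inserts`) are hypothetical placeholders, the connection parameters reuse the environment variables the compose file already passes to the workload container, and the `always` import assumes the Antithesis Python SDK's assertion entry point; verify the exact module path against the SDK actually installed in the workload image. Reading both counts in a single statement keeps the comparison at one timestamp rather than racing two separate queries.
+
+```python
+#!/usr/bin/env python3
+"""Illustrative sketch of an eventually_ consistency check (not the real workload)."""
+
+import os
+
+import psycopg2
+
+# Assumption: the Antithesis Python SDK exposes its assertion helpers here.
+from antithesis.assertions import always
+
+
+def main() -> None:
+    conn = psycopg2.connect(
+        host=os.environ["PGHOST"],       # materialized, from the compose environment
+        port=int(os.environ["PGPORT"]),  # 6875 (pgwire)
+        user=os.environ["PGUSER"],       # materialize
+        dbname="materialize",
+    )
+    conn.autocommit = True
+    with conn.cursor() as cur:
+        # Hypothetical objects created by first_setup.sh: a plain table and a
+        # materialized view over it (e.g. SELECT count(*) AS n FROM t_inserts).
+        # A single statement is evaluated at one timestamp, so the two counts
+        # are read consistently.
+        cur.execute(
+            "SELECT (SELECT count(*) FROM t_inserts) AS source_count, "
+            "(SELECT n FROM mv_inserts) AS mv_count"
+        )
+        source_count, mv_count = cur.fetchone()
+    # Acknowledged writes must be reflected in the MV once faults quiesce; an
+    # eventually_ command runs at a quiescent point, so equality is expected.
+    always(
+        mv_count == source_count,
+        "mv-reflects-source-updates: MV count matches source table count",
+        {"source_count": source_count, "mv_count": mv_count},
+    )
+
+
+if __name__ == "__main__":
+    main()
+```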
diff --git a/test/antithesis/scratchbook/evaluation/synthesis.md b/test/antithesis/scratchbook/evaluation/synthesis.md new file mode 100644 index 0000000000000..fff919f61edf2 --- /dev/null +++ b/test/antithesis/scratchbook/evaluation/synthesis.md @@ -0,0 +1,81 @@ +# Property Catalog Evaluation — Kafka Source Additions + +**Scope**: The 16 properties added to Category 7 in `property-catalog.md` on 2026-05-11 targeting the Kafka source ingestion pipeline (NONE + UPSERT envelopes), and the assertion sites in `existing-assertions.md`. Pre-existing properties in Categories 1-6 are *not* re-evaluated here — they passed evaluation on 2026-05-06 and nothing has changed in their code paths. The 16 are: 5 user-visible Kafka source properties (`kafka-source-no-data-loss`, `-no-data-duplication`, `-frontier-monotonic`, `-survives-broker-fault`, `-survives-clusterd-restart`), 4 UPSERT envelope properties (`upsert-key-reflects-latest-value`, `-tombstone-removes-key`, `-state-rehydrates-correctly`, `-decode-error-retractable`), 3 UPSERT operator-internal properties (`upsert-no-internal-panic`, `-state-consolidation-wellformed`, `-ensure-decoded-called-before-access`), and 4 reclock / source-reader operator-internal properties (`kafka-source-no-internal-panic`, `remap-shard-antichain-wellformed`, `reclock-mint-eventually-succeeds`, `offset-known-not-below-committed`). + +This evaluation was performed in single-agent mode across the four lenses, written as a single synthesis. Per-lens evidence files are inline below; spawning four parallel ensemble agents for a 16-property targeted addition would have been over-engineering given that one human's worth of catalog review is the better fit. + +## Lens 1 — Antithesis Fit + +**Passes**: + +- All 16 properties target timing-sensitive, concurrency-sensitive, or partial-failure scenarios. None can be fully verified by a deterministic unit test. +- Mix of assertion types is healthy: 7 Safety (`Always`), 3 Liveness (`Sometimes`), 3 Reachability (`Unreachable`), 2 properties combine multiple assertion families internally. +- Several properties (`kafka-source-survives-clusterd-restart`, `upsert-state-rehydrates-correctly`, `reclock-mint-eventually-succeeds`) explicitly need fault injection that deterministic tests can't sequence — strong Antithesis fit. +- The SUT-side instrumentation properties (`upsert-no-internal-panic`, `upsert-state-consolidation-wellformed`, `upsert-ensure-decoded-called-before-access`, `kafka-source-no-internal-panic`) wrap *existing* asserts/panics rather than adding new logic; this is the cheapest possible instrumentation cost. + +**Refinements**: + +- `offset-known-not-below-committed` is borderline unit-test material — the invariant could be tested by mocking the statistics update path. Kept in the catalog because the *interesting* failure is the restart-window timing, which is genuinely Antithesis territory; lowered priority from P1 to P2 (already P2 in the catalog). +- `upsert-decode-error-retractable` could be tested as integration. It earns its catalog slot only if the test exercises crash recovery between the bad and good message; the evidence file already calls this out. No change needed. + +**Findings**: None. Antithesis fit is good across the addition. + +## Lens 2 — Coverage Balance + +**Passes**: + +- Both envelopes (NONE and UPSERT) get dedicated coverage. +- The SUT analysis's Appendix A failure-prone areas table has 9 rows; 8 of them are covered by at least one new property. 
The one uncovered row is "Flag flip mid-append on persist sink (commit 68e1dfd86d)" — see Gap below. +- Liveness, Safety, and Reachability are all represented. +- Both workload-observable and SUT-side properties exist; the workload-only properties form the user-visible contract (`kafka-source-no-data-loss`, etc.) and the SUT-side properties form the operator-internal correctness backbone. + +**Gaps identified** (addressed during this pass — see "Addressing findings" below): + +- **G1: Persist sink flag-flip TOCTOU** — commit `68e1dfd86d` (database-issues#9585) regression is not represented. The bug was a config flag re-evaluated multiple times during `append_batches`. Decision: **Acknowledged but not added**. This is a persist-sink generic correctness property, not Kafka-source-specific; it belongs in Category 1 (Persist Layer Safety), not in the Kafka section. Filing as a follow-up note in `property-relationships.md` would clutter the relationships; instead, called out here as a known omission for a future persist-focused research pass. + +- **G2: Partition reassignment correctness** — Kafka topic adding/removing partitions while the source is live is mentioned in the SUT analysis but not captured as a property. The closest is `kafka-source-no-internal-panic` which catches *panics* on the rebalance path but not *correctness* (no data loss, no duplicates, correct partition→worker assignment under rebalance). Decision: **Catalog as a future expansion item**, not added in this pass because it requires non-trivial workload support (the test driver must be able to dynamically add Kafka partitions, and the worker-hash assignment property requires multi-worker clusterd). + +- **G3: Schema Registry interaction** — Avro / Protobuf decoding via Schema Registry is a significant Kafka source code path that is unmentioned. Schema evolution mid-source is a known operational hazard. Decision: **Future expansion item**. The workload is realistically text/JSON for v1 of these properties; Schema Registry coverage is a v2 expansion. + +**Refinements**: + +- The pre-existing `source-ingestion-progress` property is now redundant with `kafka-source-no-data-loss` for Kafka specifically. The relationships file calls this out. Decision: **Keep both** — `source-ingestion-progress` remains valid for non-Kafka sources (Postgres CDC, MySQL, generators), so it doesn't go away. The new property is more specific. No catalog edit needed beyond the cross-reference in `property-relationships.md`. + +## Lens 3 — Implementability + +**Passes**: + +- All workload-level properties can be checked via standard SQL queries against `mz_internal.mz_source_statistics_per_worker` and direct `SELECT` from the source. The workload only needs a PostgreSQL client and a Kafka producer (both already required by the existing topology in `deployment-topology.md`). +- All SUT-side properties wrap *existing* code (panic / assert / unreachable sites). No new SUT instrumentation logic is required, only replacing the existing macro with the Antithesis SDK equivalent and giving each callsite a unique message. +- Deployment topology already provides Kafka (Redpanda) and `materialized` in separate containers; network partition between them is a supported fault. +- Multi-replica scenarios for `upsert-state-consolidation-wellformed` and the upsert internals require a topology variation (multiple compute replicas serving the same source). The existing topology is single-replica; this is flagged. 
+ +**Refinements**: + +- `kafka-source-survives-clusterd-restart` requires **node-termination faults**, which the `faults.md` reference says are disabled by default in Antithesis tenants. Flagged in the evidence file. The user should confirm this fault class is enabled. +- `upsert-state-consolidation-wellformed` (and `kafka-source-no-data-duplication` for the historical multi-replica regression) gain significant value from a multi-replica topology. Suggest adding a second topology variant to `deployment-topology.md` as a follow-up — single-replica is sufficient to start, but the multi-replica drain bug (commit `1accbe28b3`) requires multi-replica to reproduce. + +**Findings (refinements applied or noted in evidence files)**: + +- R1: Added a note to `properties/kafka-source-survives-clusterd-restart.md` calling out the node-termination-faults dependency. +- R2: Added a note to `properties/upsert-state-consolidation-wellformed.md` explaining the multi-replica relevance. + +## Lens 4 — Wildcard + +**Things the other lenses missed**: + +- **W1: Multi-topic / multi-source interaction.** The 16 properties all treat a single Kafka source as the unit of analysis. The real-world failure mode of "two Kafka sources on the same cluster, one is healthy, the other is partitioned" is unaddressed. The `materialized` container hosts both; partitioning one source from its broker should not affect the other. Decision: **Future expansion**. Adding this now would expand the workload significantly. + +- **W2: Clock-jump interaction with Kafka timestamps.** The SUT analysis flags `expect("kafka sources always have upstream_time")` at kafka.rs:1209 — this depends on the Kafka message timestamp being valid. Clock jumps on the *Kafka broker* could produce future or past message timestamps. The current property set doesn't address how Materialize handles a backward-clocked Kafka broker. Decision: **Acknowledged as a known gap**, similar to W1. + +- **W3: Reading the catalog as a whole, the SUT-side instrumentation properties feel like a single "wrap all the existing panics in Antithesis SDK" project rather than four separate properties.** Decision: **Keep the four-property structure** anyway, because the slugs give Antithesis distinct property tags and the per-site message uniqueness requirement makes them genuinely distinct invariants. But operationally, a single PR can implement all four. + +## Addressing Findings + +- **Refinements applied**: R1, R2 (noted in evidence files during this pass). +- **Gaps held as known omissions**: G1 (persist-sink flag flip — belongs in Category 1), G2 (partition reassignment — needs workload extension), G3 (schema registry — v2 expansion), W1 (multi-source interaction), W2 (clock jumps on broker). +- **Biases escalated to user**: None — the catalog framing matches the user's stated scope ("basic properties for Kafka sources, both normal and upsert workloads"). The "basic" qualifier explicitly suggests that some areas like partition reassignment, schema registry, and multi-source scenarios are intentionally deferred to future passes. + +## Conclusion + +The 16-property Kafka source addition is implementable, well-scoped to Antithesis's strengths, and covers both envelopes plus the shared reclock layer. Known gaps are documented above as follow-up candidates. No biases escalated; the user's "basic" framing aligns with the catalog scope. 
diff --git a/test/antithesis/scratchbook/existing-assertions.md b/test/antithesis/scratchbook/existing-assertions.md new file mode 100644 index 0000000000000..592d71d368c15 --- /dev/null +++ b/test/antithesis/scratchbook/existing-assertions.md @@ -0,0 +1,88 @@ +# Existing Antithesis SDK Assertions + +## Summary + +**No Antithesis SDK assertions exist in the Materialize source code.** + +A comprehensive search of the Rust codebase at `materialize/src/` found: + +- No `use antithesis` import statements +- No Cargo.toml dependencies on any antithesis crate +- No assertion macros: `assert_always!`, `assert_sometimes!`, `assert_reachable!`, `assert_unreachable!` +- No antithesis function calls in the Python test code within the materialize repository + +## Existing Antithesis Integration (Customer Level) + +Antithesis integration exists at the **customer-repo level** (outside the materialize source), using the legacy experiment-script approach: + +### Experiment Scripts (`guest/opt/antithesis/experiment/`) + +- **`materialize.py`**: Docker Compose-based experiment. Uses `antithesis.start_customer_containers()`, `antithesis.start_fault_injector()`, `antithesis.run_process()`, `antithesis.fuzz_msg()`, `antithesis.end_test()`. Orchestrates testdrive workloads with network chaos (latency, packet loss, partitions). +- **`testdrive.py`**: K8s-based variant. Sets up k3s cluster with minio, redpanda, postgres, environmentd. Runs testdrive via kubectl. +- **`materialize-k8s.sh`**: Bash setup for K8s resources. + +### Docker Compose Topology (`guest/opt/materialize/docker-compose.yml`) + +Uses custom Antithesis-instrumented images: +- `antithesis-cp-combined` (Kafka + Schema Registry) +- `antithesis-materialized` (Materialize) +- `antithesis-testdrive` (Test workload) + +### K8s Manifests (`guest/opt/materialize/k8s/antithesis/`) + +Full Kubernetes topology: environmentd StatefulSet, postgres StatefulSet, redpanda Deployment, testdrive Pod, with PVs and services. + +## Implications for New Work + +All property assertions will need to be added fresh. The existing integration provides a starting point for topology but uses an older approach (experiment scripts, custom instrumented images). The new approach should leverage mzcompose for compose generation and add Antithesis SDK assertions either in the workload client or (for deeper coverage) in the Materialize Rust source. + +## Storage/Kafka/UPSERT Path — Candidate Instrumentation Sites + +Added 2026-05-11 during Kafka-source property discovery. These are existing `panic!`/`assert!`/`unreachable!` sites in the storage code that are direct candidates for being wrapped with the Antithesis SDK so that violations surface as reportable property failures rather than process aborts. Confirmed by grepping the source at commit `007c7af9d9970fb2030c7212368b232e0fbc363e`. + +### `src/storage/src/source/kafka.rs` + +- `:158` — `expect("positive pid")` +- `:265` — `expect("all source exports must be present in source resume uppers")` +- `:276` — `panic!("unexpected source export details: {:?}", details)` +- `:282` — `expect("statistics have been initialized")` +- `:345` — `expect("restored kafka offsets must fit into i64")` +- `:606, :853, :855, :891, :894, :897, :903, :907, :997` — various `expect()` and `assert!()` on reader state +- `:1142-1147` — `assert!(self.last_offsets[output_index].contains_key(&partition))` +- `:1193-1197` — `panic!("got negative offset (...) 
from otherwise non-error'd kafka message")` +- `:1209` — `expect("kafka sources always have upstream_time")` +- `:1457` — `assert!(…)` on payload structure + +### `src/storage/src/source/reclock.rs` and `reclock/compat.rs` + +- `reclock.rs:124` — `assert!(!new_into_upper.less_equal(&binding_ts))` +- `reclock.rs:321` — `assert!(prev < RB::before(pid))` +- `reclock/compat.rs:144` — `assert!(…)` on persist handle state +- `reclock/compat.rs:306` — `panic!("compare_and_append failed: {invalid_use}")` + +### `src/storage/src/upsert.rs` + +- `:541` — `assert!(diff.is_positive(), "invalid upsert input")` +- `:636` — `panic!("key missing from commands_state")` +- `:1031` — `unreachable!("pending future never returns")` + +### `src/storage/src/upsert_continual_feedback.rs` + +- `:626` — `assert!(diff.is_positive(), "invalid upsert input")` +- `:800` — `panic!("key missing from commands_state")` + +### `src/storage/src/upsert_continual_feedback_v2.rs` + +- `:315` — `assert!(diff.is_positive(), "invalid upsert input")` +- `:483` — `unreachable!()` on `(None, None)` from joined prior/new state + +### `src/storage/src/upsert/types.rs` — `StateValue` and `ensure_decoded` + +- `:297, :369, :403, :416, :430, :440` — six `panic!("called \`\` without calling \`ensure_decoded\`")` sites (`into_decoded`, `into_provisional_value`, `into_provisional_tombstone`, `provisional_order`, `provisional_value_ref`, `into_finalized_value`) +- `:580` — `panic!("\`merge_update_state\` called with non-consolidating state")` +- `:621` — `assert_eq!(checksum_sum.0, seahash::hash(value) as i64, …)` inside `ensure_decoded` (diff_sum == 1) +- `:632, :637, :642` — three checks for `diff_sum == 0` (`len_sum`, `checksum_sum`, all-zero `value_xor`) +- `:672` — `panic!("invalid upsert state: non 0/1 diff_sum: …")` +- `:1062` — `panic!("attempted completion of already completed upsert snapshot")` + +Per the property catalog, each of these gets a *distinct, specific* Antithesis assertion message so a fired assertion names exactly the site reached. No site shares a message with another. See `properties/upsert-no-internal-panic.md`, `properties/upsert-state-consolidation-wellformed.md`, `properties/upsert-ensure-decoded-called-before-access.md`, and `properties/kafka-source-no-internal-panic.md` for the per-site rename table. diff --git a/test/antithesis/scratchbook/properties/catalog-recovery-consistency.md b/test/antithesis/scratchbook/properties/catalog-recovery-consistency.md new file mode 100644 index 0000000000000..8b581a99adf60 --- /dev/null +++ b/test/antithesis/scratchbook/properties/catalog-recovery-consistency.md @@ -0,0 +1,33 @@ +# catalog-recovery-consistency + +## Summary +After coordinator crash and restart, the catalog state is consistent: upper never decreases, snapshot is consolidated, all committed transactions visible. + +## Evidence + +### Code Paths +- `src/catalog/src/durable/persist.rs:536-539` — `sync_to_current_upper` +- `src/catalog/src/durable/persist.rs:575-577` — ListenEvent::Progress antichain logic +- `src/catalog/src/durable/persist.rs:706-724` — `consolidate` method +- `src/catalog/src/durable/persist.rs:593-612` — sync applies updates by timestamp, consolidates after each +- `src/catalog/src/durable/persist.rs:1092` — Assertion on snapshot consolidation +- `src/catalog/src/durable/persist.rs:1167-1170` — Fence token generation syncs to upper + +### How It Works +On startup, the coordinator reads the persist shard from the latest rollup + incremental diffs, reconstructing the full catalog state. 
`sync_to_current_upper()` applies all updates up to the current upper antichain and consolidates the snapshot. The existing code has a debug assertion at line 1092 checking consolidation. + +### What Goes Wrong on Violation +- Upper regression: coordinator sees older schema state than what was committed, losing recent DDL +- Unconsolidated snapshot: duplicate entries cause incorrect catalog lookups, potential panics +- Missing transactions: committed DDL not visible after restart, users lose tables/views + +### Key Subtlety +Crash during `maybe_consolidate()` (lines 596, 610) could leave the snapshot in an intermediate state. On restart, the next sync must handle this gracefully by reconsolidating from the durable upper. + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions (debug_assert exists at line 1092 but only in debug builds) +- Candidate: After `sync_to_current_upper()`, add `assert_always!` that upper >= previous upper +- Candidate: After consolidation, add `assert_always!` that no duplicate (kind, key) entries exist + +### Provenance +Surfaced by Failure Recovery focus (merged from catalog-upper-monotonicity and catalog-snapshot-consolidation). diff --git a/test/antithesis/scratchbook/properties/command-channel-ordering.md b/test/antithesis/scratchbook/properties/command-channel-ordering.md new file mode 100644 index 0000000000000..0f47965189999 --- /dev/null +++ b/test/antithesis/scratchbook/properties/command-channel-ordering.md @@ -0,0 +1,28 @@ +# command-channel-ordering + +## Summary +Timely workers must see CreateDataflow commands in identical order — code explicitly acknowledges this is not guaranteed by Timely. + +## Evidence + +### Code Paths +- `src/compute/src/command_channel.rs:88-90` — Comment: "relies on Timely channels preserving order of inputs, which is not something they guarantee" +- `src/compute/src/command_channel.rs:96-100` — Source operator activation sequence +- `src/compute/src/command_channel.rs:41-58` — Sender using `Arc` activator + +### How It Works +The command channel broadcasts commands from worker 0 to all other Timely workers via a Timely dataflow operator. Commands are fed in order, but the code explicitly notes that Timely does not guarantee preservation of input ordering. + +### What Goes Wrong on Violation +Workers execute dataflows in different orders, causing divergent state. Since all workers must agree on dataflow state for correct results, reordering leads to inconsistent query results or panics during distributed computation. + +### Why This Is an Antithesis Target +This is the kind of bug that almost never manifests in normal testing because thread scheduling is usually consistent. Antithesis's deterministic scheduling exploration can systematically vary worker activation timing to expose reordering. + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: On each worker, log the command sequence and add `assert_always!` that worker N's command sequence matches worker 0's +- This is a strong candidate for SUT-side instrumentation since the invariant is internal to the compute engine + +### Provenance +Surfaced by Concurrency focus. 
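+A minimal sketch of the candidate check, assuming the `antithesis_sdk` crate's `assert_always!(condition, message, &details)` form and a hypothetical per-command sequence number stamped by worker 0 at broadcast time (the current protocol carries no such field):
+
+```rust
+use antithesis_sdk::assert_always;
+use serde_json::json;
+
+/// Hypothetical receiver-side bookkeeping: the last broadcast sequence number
+/// this worker observed.
+struct CommandOrderCheck {
+    last_seq: u64,
+}
+
+impl CommandOrderCheck {
+    /// Called for every command a non-zero worker receives; `seq` is the
+    /// (hypothetical) order assigned by worker 0 when it broadcast the command.
+    fn observe(&mut self, worker_id: usize, seq: u64) {
+        // Safety: commands must arrive in exactly the order worker 0 sent them.
+        assert_always!(
+            seq == self.last_seq + 1,
+            "compute: command channel delivered commands out of order",
+            &json!({ "worker": worker_id, "expected": self.last_seq + 1, "observed": seq })
+        );
+        self.last_seq = seq;
+    }
+}
+```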
diff --git a/test/antithesis/scratchbook/properties/compute-replica-epoch-isolation.md b/test/antithesis/scratchbook/properties/compute-replica-epoch-isolation.md new file mode 100644 index 0000000000000..019445cc28632 --- /dev/null +++ b/test/antithesis/scratchbook/properties/compute-replica-epoch-isolation.md @@ -0,0 +1,25 @@ +# compute-replica-epoch-isolation + +## Summary +Compute replica incarnations are isolated by epoch — commands from old epochs cannot execute after a new epoch starts. + +## Evidence + +### Code Paths +- `src/compute-client/src/controller/replica.rs:70-107` — Epoch at line 93, ReplicaTask at line 146 +- `src/compute-client/src/protocol/command.rs:45-54` — Hello command with nonce for protocol iteration +- `src/compute-client/src/controller/replica.rs:142-144` — Task abortion on rehydration clears old commands + +### How It Works +Each replica incarnation gets a unique epoch (nonce + u64). On rehydration, the controller aborts the old ReplicaTask and creates a new one with an incremented epoch. The Hello command includes the new nonce, and the replica rejects commands with mismatched nonces. + +### What Goes Wrong on Violation +Stale commands from a previous incarnation execute on the new replica, causing it to diverge from the coordinator's expected state. Query results become inconsistent across replicas. + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: On command receipt, add `assert_always!(command.epoch >= current_epoch)` in the replica's command handler +- Candidate: After rehydration, add `assert_reachable!` that the new epoch is used for the first command + +### Provenance +Surfaced by Distributed Coordination focus. diff --git a/test/antithesis/scratchbook/properties/critical-reader-fence-linearization.md b/test/antithesis/scratchbook/properties/critical-reader-fence-linearization.md new file mode 100644 index 0000000000000..5da820a7d464c --- /dev/null +++ b/test/antithesis/scratchbook/properties/critical-reader-fence-linearization.md @@ -0,0 +1,24 @@ +# critical-reader-fence-linearization + +## Summary +Critical reader opaque token comparison linearizes correctly — concurrent readers cannot bypass the fencing mechanism. + +## Evidence + +### Code Paths +- `src/persist-client/src/internal/state.rs:1937-1979` — `compare_and_downgrade_since()` with opaque fencing +- `src/persist-client/src/critical.rs` — `CriticalReaderId` and `Opaque` definitions + +### How It Works +Critical readers hold a `since` frontier that prevents GC of data at held timestamps. The `compare_and_downgrade_since` operation uses an opaque token to fence: the caller provides `expected_opaque`, and if it doesn't match the current opaque in state, the operation fails (but still commits a SeqNo increment to prevent ABA). Only the caller with the correct opaque can advance the since. + +### What Goes Wrong on Violation +If fencing is bypassed, two readers could both think they hold the since, leading to premature GC. Data needed by active readers is deleted, causing read failures or panics. + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: After successful downgrade, add `assert_always!(state.opaque == my_opaque)` to confirm fencing +- Candidate: On mismatch, add `assert_always!(seqno_advanced)` to confirm ABA prevention + +### Provenance +Surfaced by Data Integrity focus. 
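+A sketch of the two candidate assertions above, using hypothetical result and opaque types rather than the real persist-client ones, and assuming the `antithesis_sdk` crate's `assert_always!(condition, message, &details)` form:
+
+```rust
+use antithesis_sdk::assert_always;
+use serde_json::json;
+
+// Hypothetical outcome shape; the real compare_and_downgrade_since types differ.
+enum DowngradeOutcome {
+    Applied { state_opaque: u64 },
+    Mismatched { seqno_before: u64, seqno_after: u64 },
+}
+
+fn check_downgrade(my_opaque: u64, outcome: &DowngradeOutcome) {
+    match outcome {
+        DowngradeOutcome::Applied { state_opaque } => {
+            // Only the holder of the matching opaque may have advanced the since.
+            assert_always!(
+                *state_opaque == my_opaque,
+                "persist: critical reader downgrade applied under foreign opaque",
+                &json!({ "mine": my_opaque, "state": state_opaque })
+            );
+        }
+        DowngradeOutcome::Mismatched { seqno_before, seqno_after } => {
+            // Even a rejected downgrade must advance SeqNo to prevent ABA.
+            assert_always!(
+                seqno_after > seqno_before,
+                "persist: rejected downgrade did not advance SeqNo",
+                &json!({ "before": seqno_before, "after": seqno_after })
+            );
+        }
+    }
+}
+```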
diff --git a/test/antithesis/scratchbook/properties/deployment-lag-detection.md b/test/antithesis/scratchbook/properties/deployment-lag-detection.md new file mode 100644 index 0000000000000..213c3dd2f904b --- /dev/null +++ b/test/antithesis/scratchbook/properties/deployment-lag-detection.md @@ -0,0 +1,26 @@ +# deployment-lag-detection + +## Summary +0DT caught-up check eventually detects lagging or crash-looping replicas and blocks promotion. + +## Evidence + +### Code Paths +- `src/adapter/src/coord/caught_up.rs:53-150` — `maybe_check_caught_up` with replica frontier snapshot +- `src/adapter/src/coord/caught_up.rs:127-136` — Lag comparison against allowed threshold +- `src/adapter/src/coord/caught_up.rs:145-149` — `problematic_replicas` detection +- Dynamic configs: `WITH_0DT_CAUGHT_UP_CHECK_ALLOWED_LAG`, `ENABLE_0DT_CAUGHT_UP_REPLICA_STATUS_CHECK` + +### How It Works +Periodically during catchup, the coordinator queries `MZ_CLUSTER_REPLICA_FRONTIERS` and compares each replica's frontier against the expected threshold. If any replica's frontier lags beyond `allowed_lag`, promotion is blocked. Additionally, `analyze_replica_looping()` checks `mz_cluster_replica_status_history` for crash patterns. + +### What Goes Wrong on Violation +If a stuck/crashing replica is not detected, promotion proceeds with an unhealthy replica. Post-promotion, queries routed to that replica fail or return stale results. + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: Add `assert_sometimes!(lagging_replica_blocked_promotion)` to confirm the detection path is exercised +- This is a liveness property — we want to confirm the system can detect the problem, not just that it doesn't happen + +### Provenance +Surfaced by Lifecycle focus. diff --git a/test/antithesis/scratchbook/properties/deployment-promotion-safety.md b/test/antithesis/scratchbook/properties/deployment-promotion-safety.md new file mode 100644 index 0000000000000..e6794631a0aec --- /dev/null +++ b/test/antithesis/scratchbook/properties/deployment-promotion-safety.md @@ -0,0 +1,26 @@ +# deployment-promotion-safety + +## Summary +0DT deployment promotion happens only after all replicas have caught up to required frontiers. + +## Evidence + +### Code Paths +- `src/environmentd/src/deployment/state.rs:92-108` — `set_ready_to_promote` transitions Initializing->CatchingUp->ReadyToPromote +- `src/environmentd/src/deployment/preflight.rs:57-120` — `preflight_0dt` with `caught_up_max_wait` and `caught_up_trigger` +- `src/adapter/src/coord/caught_up.rs:53-150` — Replica frontier checks via `MZ_CLUSTER_REPLICA_FRONTIERS` +- `src/catalog/src/durable/error.rs:115-124` — `FenceError::DeployGeneration` + +### How It Works +During 0DT deployment, the new coordinator boots in read-only mode. It runs preflight checks including `maybe_check_caught_up()` which compares replica frontiers against a cutoff threshold. Only after all replicas pass the check does the coordinator transition to ReadyToPromote. On promotion, the deployment generation is incremented, fencing out the old coordinator. + +### What Goes Wrong on Violation +Premature promotion causes the new coordinator to serve queries while replicas are still rehydrating from storage. Users see stale data or timeouts. In the worst case, the old coordinator continues writing with a lower generation, causing split-brain. 
+ +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: At promotion time, add `assert_always!` that all tracked replica frontiers >= cutoff +- Candidate: Add `assert_reachable!("0dt_promotion_completed")` to confirm the promotion path is exercised + +### Provenance +Surfaced by Lifecycle and Distributed Coordination focuses. diff --git a/test/antithesis/scratchbook/properties/epoch-fencing-prevents-split-brain.md b/test/antithesis/scratchbook/properties/epoch-fencing-prevents-split-brain.md new file mode 100644 index 0000000000000..3fb5167f9edf7 --- /dev/null +++ b/test/antithesis/scratchbook/properties/epoch-fencing-prevents-split-brain.md @@ -0,0 +1,35 @@ +# epoch-fencing-prevents-split-brain + +## Summary +Epoch-based leader fencing prevents two coordinators from concurrently writing to the catalog persist shard. + +## Evidence + +### Code Paths +- `src/catalog/src/durable/persist.rs:149-169` — `FenceableToken::validate()` and `maybe_fence()` check epoch on every write +- `src/catalog/src/durable/persist.rs:393-461` — `compare_and_append` with fence validation before consensus write +- `src/catalog/src/durable/error.rs:114-131` — `FenceError` enum: `DeployGeneration` and `Epoch` variants +- `src/catalog/src/durable/persist.rs:1166-1192` — Fence token generation during `open_inner` +- `src/environmentd/src/deployment/state.rs:24-123` — Deployment state machine transitions + +### How It Works +On startup, the coordinator reads the current fence token from consensus and increments the epoch. The new token is written via CaS. All subsequent writes include the token; if consensus contains a higher epoch, the write fails with `FenceError::Epoch`. + +### What Goes Wrong on Violation +Two coordinators with the same epoch could both write catalog mutations, leading to divergent schema state. Users would see inconsistent table definitions, lost DDL operations, or catalog corruption requiring manual intervention. + +### Failure Scenario +1. Coordinator A is running with epoch 10 +2. Coordinator A becomes partitioned from consensus +3. Coordinator B starts, reads epoch 10, increments to epoch 11 +4. Partition heals; A attempts to write with epoch 10 +5. **Expected**: A's write fails with FenceError +6. **Bug**: If A's CaS succeeds despite lower epoch (race in validation) + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions in codebase +- Candidate instrumentation point: `FenceableToken::validate()` — add `assert_always!` that validates token comparison result matches expected fencing behavior +- Candidate: `compare_and_append` success path — add `assert_always!` that current_epoch >= write_epoch + +### Provenance +Surfaced independently by Distributed Coordination and Failure Recovery focuses. diff --git a/test/antithesis/scratchbook/properties/fault-recovery-exercised.md b/test/antithesis/scratchbook/properties/fault-recovery-exercised.md new file mode 100644 index 0000000000000..d6499991da5a6 --- /dev/null +++ b/test/antithesis/scratchbook/properties/fault-recovery-exercised.md @@ -0,0 +1,28 @@ +# fault-recovery-exercised + +## Summary +After coordinator crash, the system eventually recovers and serves queries. 
+ +## Evidence + +### Code Paths +- `src/environmentd/src/environmentd/main.rs` — Main startup, catalog recovery +- `src/environmentd/src/http/probe.rs` — `/health/ready` endpoint +- `src/catalog/src/durable/persist.rs:1166-1192` — `open_inner` recovery path + +### How It Works +On restart, environmentd re-reads the catalog from persist, increments the epoch, rehydrates compute/storage clusters, and starts accepting connections. The readiness probe (`/health/ready`) returns 200 only after the adapter is fully initialized. + +### What Goes Wrong on Violation +The system fails to recover: it crashes on startup due to corrupt catalog state, enters an infinite restart loop, or becomes ready but cannot serve queries due to incomplete rehydration. + +### Why This Is a Property +This is the most fundamental liveness property. It doesn't test a specific invariant — it tests that the entire recovery pipeline works end-to-end under adversarial crash timing. + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Best verified at workload level: crash environmentd, wait for readiness, issue SELECT query, assert success +- Candidate: Add `assert_sometimes!(recovery_completed_successfully)` after catalog recovery succeeds + +### Provenance +Surfaced by Failure Recovery focus. diff --git a/test/antithesis/scratchbook/properties/group-commit-toctou-safety.md b/test/antithesis/scratchbook/properties/group-commit-toctou-safety.md new file mode 100644 index 0000000000000..bae54fcc085cc --- /dev/null +++ b/test/antithesis/scratchbook/properties/group-commit-toctou-safety.md @@ -0,0 +1,28 @@ +# group-commit-toctou-safety + +## Summary +No phantom writes to tables deleted between write deferral and group_commit execution. + +## Evidence + +### Code Paths +- `src/adapter/src/coord/appends.rs:479-486` — Explicit TOCTOU check: "If the table... has been deleted while the write was deferred" +- `src/adapter/src/coord/appends.rs:214-216` — `defer_op` enqueue point +- `src/adapter/src/coord/appends.rs:394-399` — JIT lock acquisition in group_commit + +### How It Works +When a write arrives and cannot immediately acquire the write lock, it is deferred. Later, group_commit processes deferred writes. Before applying each write, it checks `catalog().try_get_entry(table_id)`. If the table was dropped between deferral and execution, the write is silently dropped. + +### What Goes Wrong on Violation +Writes land in a shard for a table that no longer exists in the catalog. This causes inconsistency between the catalog (table doesn't exist) and persist (shard has data). Downstream queries may panic or return garbage. + +### The TOCTOU Window +The explicit comment at appends.rs:479 acknowledges the race. The window is between line 214 (write enqueued) and line 484 (catalog check during group_commit). Concurrent DDL (DROP TABLE) within this window is the trigger. + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: After group_commit drops a deferred write, add `assert_reachable!("group_commit_dropped_deferred_write_to_deleted_table")` to confirm this path is exercised +- Candidate: After group_commit succeeds, add `assert_always!` that all written table_ids still exist in catalog + +### Provenance +Surfaced by Concurrency focus. 
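+A sketch of the drop-path candidate above, assuming the `antithesis_sdk` crate's `assert_reachable!(message, &details)` form; `table_still_exists` stands in for the `catalog().try_get_entry(table_id)` check:
+
+```rust
+use antithesis_sdk::assert_reachable;
+use serde_json::json;
+
+/// Hypothetical extraction of the group_commit decision described above.
+fn maybe_apply_deferred_write(table_id: &str, table_still_exists: bool) -> bool {
+    if !table_still_exists {
+        // Reachability anchor: confirms the fuzzer actually lands a DROP TABLE
+        // inside the defer-to-group_commit window at least once per run.
+        assert_reachable!(
+            "group_commit_dropped_deferred_write_to_deleted_table",
+            &json!({ "table_id": table_id })
+        );
+        return false; // the deferred write is discarded and never reaches persist
+    }
+    true // caller proceeds to append the write
+}
+```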
diff --git a/test/antithesis/scratchbook/properties/idempotent-write-under-indeterminate.md b/test/antithesis/scratchbook/properties/idempotent-write-under-indeterminate.md new file mode 100644 index 0000000000000..0837770823d9a --- /dev/null +++ b/test/antithesis/scratchbook/properties/idempotent-write-under-indeterminate.md @@ -0,0 +1,28 @@ +# idempotent-write-under-indeterminate + +## Summary +Compare-and-append retries with the same idempotency token produce exactly one committed write — never duplicates, never loss. + +## Evidence + +### Code Paths +- `src/persist-client/src/internal/machine.rs:387-468` — Detailed comments on Indeterminate error handling and retry-with-idempotency-token +- `src/persist-client/src/internal/state.rs:1687` — `compare_and_append` function +- `src/persist-client/src/write.rs:409` — Retry wrapper with `IdempotencyToken` +- `src/persist-client/src/internal/state.rs:1715-1724` — Writer state and lease tracking + +### How It Works +Each writer holds an `IdempotencyToken`. On Indeterminate error, the retry includes the same token. The state machine checks if a write with that token already succeeded (checking writer state). If so, it returns `AlreadyCommitted`. If not, it proceeds normally. + +### What Goes Wrong on Violation +Duplicate writes: the shard contains two copies of the same batch, leading to double-counting in materialized views. Or lost writes: the batch is neither committed nor retried successfully, causing data loss. + +### Key Subtlety +The comments at machine.rs:387-468 describe subtle scenarios where the writer must distinguish between "my write succeeded but I didn't get the ack" vs "my write failed and I need to retry." The IdempotencyToken is the mechanism, but the window between consensus write and state observation is where bugs hide. + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: After Indeterminate retry, add `assert_always!` that shard trace contains exactly one instance of the batch + +### Provenance +Surfaced by Data Integrity focus. diff --git a/test/antithesis/scratchbook/properties/kafka-source-frontier-monotonic.md b/test/antithesis/scratchbook/properties/kafka-source-frontier-monotonic.md new file mode 100644 index 0000000000000..b22aa8d0e6852 --- /dev/null +++ b/test/antithesis/scratchbook/properties/kafka-source-frontier-monotonic.md @@ -0,0 +1,54 @@ +# kafka-source-frontier-monotonic + +## Summary + +The `upper` frontier of the source's data persist shard never regresses across the source's lifetime, including across clusterd restarts and `compare_and_append` retries. + +## Code paths + +- `src/storage/src/render/persist_sink.rs` — `append_batches` calls `WriteHandle::compare_and_append`. Cached upper is the failure-prone spot (commit `505dc96aaa`: cached upper went stale under concurrent writers; fix uses `fetch_recent_upper`). +- `src/storage/src/source/reclock.rs` — `ReclockOperator::sync`: must not let the operator's `upper` field regress across `compare_and_append` retries. +- `src/storage/src/source/reclock/compat.rs:306` — `panic!("compare_and_append failed: {invalid_use}")`: this is the assertion that catches genuinely invalid persist calls (vs. legitimate `UpperMismatch` which is retried). + +## How to check it + +- Workload polls `mz_internal.mz_source_statistics_per_worker.offset_committed` (or equivalent shard upper view) on a tight cadence and `assert_always!(upper_monotonic, "kafka: source shard upper non-monotonic")` whenever a new sample is `< previous sample`. 
+- SUT-side: in `append_batches`, immediately before `compare_and_append`, capture the previous upper from the local cached state and `assert_always!(new_upper >= prev_upper, "persist sink: upper regression on append")`. Distinct messages on the reclock side. + +## What goes wrong on violation + +Downstream operators panic when `as_of > upper` (the reclock-`as_of` race in commit `e3805ad790`, database-issues#8698, was exactly this shape). `AS OF` SQL queries return wrong results. + +## Antithesis angle + +- Kill clusterd mid-`compare_and_append`. On restart, the cached upper must be refreshed before the next append. +- Concurrent reclock writers (two storage workers racing during a transient split-brain): both attempt CaS; only one wins; the other's local upper must catch up before it tries again. +- Inject persist consensus latency to widen the cache-staleness window. + +## Open question (resolved) + +Q: Does the reclock retry loop in `ReclockOperator::mint` (reclock.rs:160-166) protect against this, or is the bug in code that doesn't go through `sync`? + +A: The retry loop does protect — but only if `sync()` is called *before* the local upper is used in subsequent code. The historical bug (`e3805ad790`) was in the `as_of` computation path which ran *outside* `mint` and used a cached upper from the read handle. Workload-level monotonicity assertion is sufficient to catch both paths. + +## Existing instrumentation + +None. The persist-side `panic!("compare_and_append failed: …")` in `reclock/compat.rs:306` is informational, not a property. Wrap with `assert_unreachable!` for the genuinely-invalid case and add an `assert_always!` for the workload-observable monotonicity. + +## Implementation status + +Implemented 2026-05-11 (workload-side) as `test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py`. The `anytime_` driver runs throughout the timeline alongside other drivers while faults are active. Each poll iteration: + +1. Lists every source in `SOURCES = ["upsert_text_src", "none_text_src"]` that currently exists in the catalog (so an early-timeline poll before sources are created doesn't fire false negatives). +2. For each source, calls `helper_source_stats.offset_committed()` (a `MAX(offset_committed)` over `mz_internal.mz_source_statistics` joined to `mz_sources` by name). +3. Compares against the previous observation for that source in `last_seen`. The assertion `always("kafka: source offset_committed non-monotonic", details)` fires only when both observations succeeded — partition/clusterd unavailable is expected under faults and not an assertion target. + +`details` carries `source`, `previous`, `observed`, and `regression` (`previous - observed`). + +The SUT-side `assert_always!` in `append_batches` and the `reclock/compat.rs` `compare_and_append` paths (commit `e3805ad790`'s and `505dc96aaa`'s code paths) are deferred — the workload signal is sufficient to catch any externally-visible regression. Add SUT instrumentation later if Antithesis surfaces failures that need internal localization. + +The complementary `offset-known-not-below-committed` property is similar shape and could be added to this same driver with minimal cost; that's deliberately deferred to keep this commit scoped to the user-requested three properties. + +## Provenance + +Surfaced by: Data Integrity, Distributed Coordination. Direct regression target for commits `e3805ad790` and `505dc96aaa`. 
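+A sketch of one poll iteration of the driver described above; the helper call follows the description, but its exact signature and the connection handling are assumptions:
+
+```python
+# One poll iteration of the anytime_ monotonicity driver (sketch).
+from antithesis.assertions import always
+
+import helper_source_stats  # assumed workload helper described above
+
+SOURCES = ["upsert_text_src", "none_text_src"]
+last_seen: dict[str, int] = {}
+
+
+def poll_once(conn) -> None:
+    for source in SOURCES:
+        observed = helper_source_stats.offset_committed(conn, source)
+        if observed is None:
+            # Source not created yet, or statistics unavailable under faults:
+            # expected, not an assertion target.
+            continue
+        previous = last_seen.get(source)
+        if previous is not None:
+            always(
+                observed >= previous,
+                "kafka: source offset_committed non-monotonic",
+                {
+                    "source": source,
+                    "previous": previous,
+                    "observed": observed,
+                    "regression": previous - observed,
+                },
+            )
+        last_seen[source] = observed
+```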
diff --git a/test/antithesis/scratchbook/properties/kafka-source-no-data-duplication.md b/test/antithesis/scratchbook/properties/kafka-source-no-data-duplication.md new file mode 100644 index 0000000000000..21780e5d10211 --- /dev/null +++ b/test/antithesis/scratchbook/properties/kafka-source-no-data-duplication.md @@ -0,0 +1,53 @@ +# kafka-source-no-data-duplication + +## Summary + +After settling, the source contains no duplicates — at most one row per `(partition, offset)` for NONE-envelope and at most one row per key for UPSERT-envelope. + +## Why this property + +Duplication is the symmetric failure mode to `kafka-source-no-data-loss`. It is silent, propagates into every downstream aggregate, and historically arose in the upsert operator under multi-replica drain (commit `1accbe28b3`, database-issues#9160). It is the more dangerous of the two failure modes because it is harder to detect operationally — the workload sees "extra" rows that look plausible. + +## Code paths + +- `src/storage/src/source/kafka.rs:1158` — per-incarnation dedup against `last_offsets` (drops messages with offset `<= last_offset`). Per-incarnation only; does not survive restart. +- `src/storage/src/render/persist_sink.rs` — the persist sink is responsible for ensuring writes are idempotent across restarts. Compare-and-append with idempotency tokens on retry handles the indeterminate-error case (compare with `idempotent-write-under-indeterminate`). +- `src/storage/src/upsert_continual_feedback.rs` — `drain_staged_input`: the regression target for commit `1accbe28b3`. Single-replica clusters masked the bug because capabilities were always singletons; multi-replica drained the same staged input twice. +- `src/storage/src/upsert.rs:541`, `upsert_continual_feedback*.rs` — `assert!(diff.is_positive(), "invalid upsert input")`. Retractions on the input would be the canonical "duplicate retraction" symptom. + +## How to check it + +Workload-level: +- NONE envelope: `SELECT partition, "offset", COUNT(*) FROM source GROUP BY 1, 2 HAVING COUNT(*) > 1` returns 0 rows. Assert with `assert_always!(no_dupes, "kafka source: no duplicate (partition, offset)")`. +- UPSERT envelope: `SELECT key, COUNT(*) FROM source GROUP BY 1 HAVING COUNT(*) > 1` returns 0 rows. Same assertion shape with a unique message. + +These run on every check fire, ideally on a polling cadence, not just at end-of-test. + +SUT-side: convert the existing `assert!(diff.is_positive(), "invalid upsert input")` into `assert_always!(diff.is_positive(), "upsert: input diff positive")` so a duplicate retraction surfaces as a property failure rather than a process abort. Distinct messages at each of the three callsites. + +## What goes wrong on violation + +Aggregates over the source double-count. Joins fan out. Downstream MVs become wrong in ways that are hard to attribute to ingestion. + +## Antithesis angle + +- Crash storage worker between `write_batches` and `append_batches`. Restart and verify that no `(partition, offset)` appears twice in the resulting persist shard. +- For UPSERT: multi-replica cluster topology (the historical bug requires it). Run two replicas on the same source and observe the persisted output for duplicate retractions. +- Race the upsert feedback-driven snapshot replay against new input. + +## Existing instrumentation + +The runtime `assert!` in upsert.rs already aborts on negative input diffs — it just doesn't surface as an Antithesis property. 
Wrapping each callsite with `assert_always!` (per-site unique message) gives Antithesis the signal it needs without changing semantics outside Antithesis (the underlying `assert!` already aborts on violation). + +## Implementation status + +Implemented 2026-05-11 in two halves: + +- **NONE envelope, workload-side**: `test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py` runs `SELECT partition, "offset", COUNT(*) FROM none_text_src WHERE text LIKE prefix:% GROUP BY 1,2 HAVING COUNT(*) > 1` after each catchup and asserts the result is empty via `always("kafka source: no duplicate (partition, offset)", details)`. Up to five offending rows are carried in `details` for triage. +- **UPSERT envelope, SUT-side**: the `assert_always!(diff.is_positive(), ...)` family added by `upsert-no-internal-panic` covers the "duplicate retraction on input" symptom directly inside the operator at the three call sites in `upsert.rs`, `upsert_continual_feedback.rs`, `upsert_continual_feedback_v2.rs`. The workload-side per-key dedup check is part of `upsert-key-reflects-latest-value`. + +Per-payload visibility (the inverse-pair `kafka-source-no-data-loss` check) shares the same driver — both run on the same produce + catchup cycle to maximize signal per invocation. + +## Provenance + +Surfaced by: Data Integrity, Concurrency, Failure Recovery. Direct regression target for database-issues#9160. diff --git a/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md b/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md new file mode 100644 index 0000000000000..e999c42b76083 --- /dev/null +++ b/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md @@ -0,0 +1,57 @@ +# kafka-source-no-data-loss + +## Summary + +Every Kafka message produced by the workload is eventually visible in the source — either as a row (NONE envelope) or as the latest value for its key (UPSERT envelope). + +## Why this property + +This is the headline guarantee of a streaming database. The previous catalog entry `source-ingestion-progress` covered the generic "frontier advances" liveness signal; this property is the Kafka-specific, workload-checkable version that compares produced records against `SELECT` output. + +## Code paths + +- `src/storage/src/source/kafka.rs` — `render_reader`: the reader loop that drains `PartitionQueue`s, deduplicates against `last_offsets`, and emits `(SourceMessage, KafkaTimestamp, +1)` triples. +- `src/storage/src/source/source_reader_pipeline.rs` — `create_raw_source`: assembles reader, remap, reclock. +- `src/storage/src/source/reclock.rs` — `ReclockOperator::mint`: binds source timestamps to Materialize timestamps and persists the binding via `compare_and_append` on the remap shard. +- `src/storage/src/render/persist_sink.rs` — `mint_batch_descriptions` → `write_batches` → `append_batches`: the path that actually puts rows into the source's data persist shard. +- For UPSERT: `src/storage/src/upsert.rs` (`upsert_classic`) and the continual-feedback variants in `upsert_continual_feedback*.rs`. + +## How to check it + +Workload-level: +1. The workload tracks every `(topic, partition, offset, key, value)` it produces. +2. After produce settles, the workload calls `ANTITHESIS_STOP_FAULTS` and waits for `mz_internal.mz_source_statistics_per_worker` to report `offset_committed >= max_produced_offset`. +3. 
The workload asserts via `assert_sometimes!("kafka source caught up to produced offsets", expected_rowcount_visible)` that `COUNT(*) FROM source >= produced_count` (NONE) or that the per-key latest-value model matches the source (UPSERT). + +SUT-side anchor: `assert_sometimes!(persist_sink_appended_batch)` inside `append_batches` after the first successful `compare_and_append` for this source. + +## What goes wrong on violation + +Silent data loss: the source ingests fewer rows than were produced; the workload sees a stall that doesn't resolve even with faults paused. Downstream MVs see incomplete data. + +## Antithesis angle + +The interesting window is mid-batch crash: a clusterd kill between the persist sink's `write_batches` (which uploads parts) and `append_batches` (which compare-and-appends). The resume frontier on restart determines what gets re-read. Bugs here look like: wrong resume offset (commit history: kafka.rs:1158 dedup is per-incarnation only — across restart, idempotency depends on persist-sink correctness). + +## Existing instrumentation + +None. No `assert_sometimes!` in the source path today (verified against `existing-assertions.md`). To implement: add an `assert_sometimes!` in the persist sink's `append_batches` after a successful append, plus a workload-side `assert_sometimes!` after the quiet-period catch-up check. + +## Implementation status + +Implemented 2026-05-11 (NONE envelope, workload-side) as `test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py`. The driver shares a flight with `kafka-source-no-data-duplication` because both check the same dataflow: + +| Message | Type | Fires when | +|---------|------|------------| +| `"kafka source caught up to produced offsets after quiet period (none envelope)"` | `sometimes` | Once per invocation after `wait_for_catchup`; the liveness anchor | +| `"kafka source: every produced payload is visible exactly once"` | `always` | Per produced payload, after catchup; carries `payload`, `present`, `observed_count` in details | + +The UPSERT-envelope arm of this property is covered by `upsert-key-reflects-latest-value`. + +The SUT-side `assert_sometimes!(persist_sink_appended_batch, ...)` anchor in `append_batches` is **deferred** — it would tighten replay anchoring but the workload check above is already specific enough that triage can localize a failure without it. + +New helper: `helper_none_source.py` — idempotent `CREATE SOURCE ... FORMAT TEXT INCLUDE PARTITION, OFFSET ENVELOPE NONE`, reusing the shared `antithesis_kafka_conn` connection from `helper_upsert_source.py`. + +## Provenance + +Surfaced by: Data Integrity, Failure Recovery, Product Context. diff --git a/test/antithesis/scratchbook/properties/kafka-source-no-internal-panic.md b/test/antithesis/scratchbook/properties/kafka-source-no-internal-panic.md new file mode 100644 index 0000000000000..6f6106aedbcce --- /dev/null +++ b/test/antithesis/scratchbook/properties/kafka-source-no-internal-panic.md @@ -0,0 +1,44 @@ +# kafka-source-no-internal-panic + +## Summary + +The explicit panics and `assert!`s in the Kafka source reader never fire under any Antithesis-injected fault sequence. Each site is converted to a uniquely-messaged Antithesis assertion so a firing is a reportable property failure rather than a clusterd crash. 
+ +## Targeted sites + +`src/storage/src/source/kafka.rs`: + +| Line | Site | Antithesis form | +|------|------|------------------| +| 276 | `panic!("unexpected source export details: {:?}", details)` | `assert_unreachable!("kafka: unexpected source export details")` | +| 891 | `assert!(reader.partition_consumers.is_empty())` | `assert_always!(reader.partition_consumers.is_empty(), "kafka: partition_consumers not drained at shutdown")` | +| 1142 | `assert!(self.last_offsets.get(output_index).unwrap().contains_key(&partition))` | `assert_always!(…, "kafka: partition missing from last_offsets")` | +| 1193 | `panic!("got negative offset ({}) from otherwise non-error'd kafka message", msg.offset())` | `assert_unreachable!("kafka: negative offset from non-error message")` | +| 1457 | `assert!(…)` (debug-mode payload validation) | `assert_always!(…, "kafka: payload check")` | + +Plus the cluster of `expect()` sites that are structurally similar — resume-upper missing (265), statistics not initialized (282), restored offset out of `i64` range (345), `position()` failure (606), `partition_known` lookup (853, 855), offset arithmetic (997, 1055, 1060, 1063, 1072, 1082), watermark not negative (1492). These are lower-priority but mass-conversion to `assert_always!(false, ...)` is cheap. + +## Why these sites matter + +- The "negative offset" panic at 1193 is the most interesting: rdkafka has shipped negative offsets in the past under certain protocol bugs, and an `i64` cast that wraps silently would be worse than the panic. Antithesis can reach this through manual broker-state manipulation in the workload. +- The capability-downgrade assertion family (relevant to commit `99ad668af5`'s topic-recreation panic) — currently that code path *logs and continues* rather than panicking, but if a future refactor reintroduces a `panic!` on offset regression, this property catches it. +- The `partition_consumers.is_empty()` assertion at 891 catches a shutdown-ordering bug that would manifest as a clusterd crash on source drop. + +## Antithesis angle + +- Topic deletion + recreation on the Kafka container. Specifically: drop a topic with offsets `[0..1000]`, recreate it with offsets `[0..100]` (lower watermark). The source's resume frontier sees `last_offset = 1000` and rdkafka delivers offset `100`. The dedup at kafka.rs:1158 handles this; the assertion at 1142 catches the case where the *partition itself* is missing from the dedup table. +- Partition rebalance: increase Kafka topic partition count from the broker side mid-run. The metadata fetcher must discover and assign the new partitions correctly. +- Manual offset reset: most relevant for the negative-offset panic at 1193. +- Clock jumps: Kafka's internal timestamp arithmetic uses millisecond offsets; clock jitter has historically interacted poorly with the `expect("kafka sources always have upstream_time")` at line 1209. + +## Existing instrumentation + +The panics and asserts already exist. They currently abort clusterd. The work is wrapping each site with the Antithesis SDK so the abort becomes a reportable, replayable property failure. Each site uses a distinct message naming exactly the invariant violated. + +## Relationship to other properties + +This is the SUT-side counterpart to the workload-level `kafka-source-no-data-loss` and `kafka-source-no-data-duplication`. A workload-level row-count mismatch tells you data is wrong; a fired SUT-side assertion tells you *where* it went wrong. 
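+A minimal sketch of the conversion pattern for one of the sites above (the negative-offset panic at kafka.rs:1193), assuming the `antithesis_sdk` crate's `assert_unreachable!(message, &details)` form; the original panic stays in place so behavior outside Antithesis is unchanged:
+
+```rust
+use antithesis_sdk::assert_unreachable;
+use serde_json::json;
+
+// Sketch only: `offset` and `partition` stand in for the values at the call site.
+fn check_offset(offset: i64, partition: i32) {
+    if offset < 0 {
+        // Reportable, replayable property failure that names exactly this site.
+        assert_unreachable!(
+            "kafka: negative offset from non-error message",
+            &json!({ "partition": partition, "offset": offset })
+        );
+        // Preserve the existing abort so semantics outside Antithesis do not change.
+        panic!("got negative offset ({offset}) from otherwise non-error'd kafka message");
+    }
+}
+```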
+ +## Provenance + +Surfaced by: Failure Recovery, External Dependencies. Regression targets: commits `99ad668af5`, `3e32df1f69`. diff --git a/test/antithesis/scratchbook/properties/kafka-source-survives-broker-fault.md b/test/antithesis/scratchbook/properties/kafka-source-survives-broker-fault.md new file mode 100644 index 0000000000000..fd05df6b47e70 --- /dev/null +++ b/test/antithesis/scratchbook/properties/kafka-source-survives-broker-fault.md @@ -0,0 +1,40 @@ +# kafka-source-survives-broker-fault + +## Summary + +After a network partition or Kafka outage that prevents the source from making progress, once connectivity is restored the source resumes ingestion and eventually visits every message produced during the outage. + +## Code paths + +- `src/storage/src/source/kafka.rs` — `render_reader` polls per-partition `PartitionQueue`s. rdkafka's internal reconnect logic handles broker reconnect; the storage reader must not enter a permanent stall state when the consumer errors out. +- `src/storage/src/healthcheck.rs` — the source's `HealthStatusUpdate` transitions: `Running` → `Stalled { hint }` during the outage → back to `Running` after recovery. `Ceased` would be a violation (terminal failure for a transient fault). +- `src/storage/src/statistics.rs` — `offset_known` and `offset_committed` resume advancing post-recovery. The rehydration-latency reset (commit `0a34b6c79d`) is relevant if the reconnect goes through a dataflow restart. + +## How to check it + +Workload procedure: +1. Produce N messages. +2. Inject a network partition between the `materialized` container and the Kafka container. The partition isolates only that pair; persist/metadata remain reachable. +3. Produce N more messages while the partition is active. +4. Heal the partition (Antithesis fault scheduler) and call `ANTITHESIS_STOP_FAULTS`. +5. Poll `mz_internal.mz_source_statistics_per_worker.offset_committed` until it advances past `max_produced_offset`. Bound the poll loop with a generous timeout. +6. `assert_sometimes!(source_resumed_after_broker_fault, "kafka source resumed after Kafka container partition")`. + +## What goes wrong on violation + +The source enters a permanent stall: rdkafka thinks it's reconnected but the reader never re-reads; or the operator transitions to `Ceased` and the source must be manually dropped/recreated. + +## Antithesis angle + +- Bidirectional network partition: `materialized` ↔ Kafka. +- Asymmetric partition: outbound packets to Kafka dropped but inbound responses allowed (or vice versa). rdkafka may not detect this and may sit waiting for a response forever. +- Repeated short partitions: stress reconnect cadence. +- Kafka container hang (CPU throttling to zero rather than network partition). + +## Existing instrumentation + +None. Workload-level `assert_sometimes!` is the entry point. Optional SUT-side: `assert_sometimes!(kafka_consumer_reconnected, ...)` inside the reader after rdkafka reports a successful reconnect. + +## Provenance + +Surfaced by: Failure Recovery, External Dependencies. 
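+A sketch of steps 5-6 of the procedure above, assuming the Python SDK's `sometimes(condition, message, details)` form; the source name, poll cadence, and timeout are illustrative:
+
+```python
+# Poll offset_committed until it passes the produced high-water mark, then fire
+# the liveness anchor (sketch).
+import time
+
+from antithesis.assertions import sometimes
+
+STATS_QUERY = """
+    SELECT max(s.offset_committed)
+    FROM mz_internal.mz_source_statistics AS s
+    JOIN mz_sources AS so ON s.id = so.id
+    WHERE so.name = %s
+"""
+
+
+def assert_resumed_after_partition(conn, max_produced_offset: int, timeout_s: int = 600) -> None:
+    committed = -1
+    deadline = time.monotonic() + timeout_s
+    while time.monotonic() < deadline:
+        with conn.cursor() as cur:
+            cur.execute(STATS_QUERY, ("none_text_src",))
+            row = cur.fetchone()
+        committed = row[0] if row and row[0] is not None else -1
+        if committed >= max_produced_offset:
+            break
+        time.sleep(5)
+    # Liveness anchor: some run must observe the source catching up once the
+    # partition heals; a permanent stall never satisfies this.
+    sometimes(
+        committed >= max_produced_offset,
+        "kafka source resumed after Kafka container partition",
+        {"committed": committed, "max_produced_offset": max_produced_offset},
+    )
+```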
diff --git a/test/antithesis/scratchbook/properties/kafka-source-survives-clusterd-restart.md b/test/antithesis/scratchbook/properties/kafka-source-survives-clusterd-restart.md new file mode 100644 index 0000000000000..072f374048ae6 --- /dev/null +++ b/test/antithesis/scratchbook/properties/kafka-source-survives-clusterd-restart.md @@ -0,0 +1,47 @@ +# kafka-source-survives-clusterd-restart + +## Summary + +After clusterd is killed and restarted, the Kafka source recovers its state, computes the correct resume offsets, and ingests messages produced before, during, and after the restart. + +## Code paths + +- `src/storage-client/src/controller.rs` — the storage controller's command-replay logic; this is the entry point for the `storage-command-replay-idempotent` property cluster. +- `src/storage/src/storage_state.rs` — `RunIngestionCommand` handling. The async storage worker serializes ingestion vs. compaction (commit `3e5259782c`). +- `src/storage/src/source/source_reader_pipeline.rs:481-493` — remap operator bootstraps by loading the entire initial batch from the remap shard before resuming new mints. +- `src/storage/src/source/kafka.rs:346-349` — `start_offsets` derived from persisted resume frontier. +- For UPSERT: `src/storage/src/upsert.rs` and `upsert_continual_feedback*.rs` — state reconstruction via the feedback stream (drain all values at or below resume frontier, then transition to normal mint mode). + +## How to check it + +Workload procedure: +1. Produce N messages; wait for source to ingest them. +2. Kill clusterd via Antithesis node-termination fault. +3. Produce M more messages while clusterd is down. +4. Wait for restart, call `ANTITHESIS_STOP_FAULTS`. +5. Poll until `offset_committed >= max_produced_offset`. +6. `assert_sometimes!(clusterd_restart_recovered, "kafka source recovered after clusterd kill")`. Combine with `kafka-source-no-data-duplication` to rule out double-counting; combine with `kafka-source-no-data-loss` to rule out gaps. + +## What goes wrong on violation + +- Resume offset is wrong (too low → duplicates; too high → gap). +- UPSERT state is wrong (stale value per key, or missing keys). +- Source never recovers because remap-shard bootstrap fails. + +## Antithesis angle + +The most interesting timing is a kill *between* the persist sink's `compare_and_append` returning success and the controller's frontier-report channel actually delivering the new frontier upstream. The source on restart must compute its resume frontier from the durably-recorded shard upper, not from any cached or in-flight state. + +For UPSERT specifically: kill during the snapshot phase. The feedback-driven snapshot must restart cleanly and complete with the same final state. + +## Dependency + +Requires **node-termination faults** to be enabled in the Antithesis tenant. Confirm with the user. Without this fault, the property is vacuous. + +## Existing instrumentation + +None. Workload-level assertion only, until SUT-side rehydration anchors are added. Candidate SUT anchors: `assert_sometimes!(snapshot_phase_completed, …)` in the upsert operator's snapshot-completion path, and `assert_sometimes!(remap_bootstrap_complete, …)` in `source_reader_pipeline.rs:481`. + +## Provenance + +Surfaced by: Failure Recovery. Builds on `storage-command-replay-idempotent` and `fault-recovery-exercised`. 
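+
+A sketch of the two candidate SUT anchors named under "Existing instrumentation", assuming the `antithesis_sdk` macro form `(condition, message, details)` used elsewhere in this scratchbook. The function shells and parameters are placeholders for the real call sites in the upsert operator and in `source_reader_pipeline.rs`.
+
+```rust
+use antithesis_sdk::assert_sometimes;
+use serde_json::json;
+
+// Candidate anchor in the upsert operator's snapshot-completion path.
+fn record_snapshot_phase(source_id: &str, snapshot_phase_completed: bool) {
+    assert_sometimes!(
+        snapshot_phase_completed,
+        "upsert: snapshot phase completed",
+        &json!({ "source_id": source_id })
+    );
+}
+
+// Candidate anchor after the remap-shard bootstrap finishes loading the
+// initial batch (source_reader_pipeline.rs:481).
+fn record_remap_bootstrap(source_id: &str, remap_bootstrap_complete: bool) {
+    assert_sometimes!(
+        remap_bootstrap_complete,
+        "source: remap bootstrap complete",
+        &json!({ "source_id": source_id })
+    );
+}
+```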
diff --git a/test/antithesis/scratchbook/properties/mv-reflects-source-updates.md b/test/antithesis/scratchbook/properties/mv-reflects-source-updates.md new file mode 100644 index 0000000000000..a500f32fb1b0a --- /dev/null +++ b/test/antithesis/scratchbook/properties/mv-reflects-source-updates.md @@ -0,0 +1,32 @@ +# mv-reflects-source-updates + +## Summary +Materialized views eventually reflect changes to their source data. + +## Evidence + +### Code Paths +- `src/compute/src/render/` — Dataflow rendering for materialized views +- `src/compute/src/server.rs` — Compute server receives commands and renders dataflows +- `src/adapter/src/coord/sequencer/` — CREATE MATERIALIZED VIEW sequencing + +### How It Works +When source data changes, differential dataflow operators in the compute layer process the deltas and update the materialized view's persist shard. The MV's frontier advances as updates are committed. + +### What Goes Wrong on Violation +MVs show stale data permanently despite source updates. Users query a materialized view expecting fresh data and get results that never update. This is the core value proposition failure. + +### Why This Is an End-to-End Property +Unlike internal properties (epoch fencing, CaS monotonicity), this property is directly observable by users. It combines source ingestion, compute processing, and persist writes into a single check. + +### Workload Verification +1. INSERT INTO table1 VALUES (1, 'test') +2. Wait for MV that SELECTs from table1 +3. SELECT * FROM mv1 — must eventually contain (1, 'test') + +### SUT-Side Instrumentation Notes +- Best verified at workload level via SQL assertions +- Candidate: Add `assert_sometimes!(mv_frontier_advanced)` in the compute persist sink + +### Provenance +Surfaced by Product Context focus. diff --git a/test/antithesis/scratchbook/properties/mysql-source-no-data-loss.md b/test/antithesis/scratchbook/properties/mysql-source-no-data-loss.md new file mode 100644 index 0000000000000..19f6d02d68974 --- /dev/null +++ b/test/antithesis/scratchbook/properties/mysql-source-no-data-loss.md @@ -0,0 +1,120 @@ +# mysql-source-no-data-loss — Every Row Written to MySQL Primary Is Eventually Visible in Materialize + +## Summary + +Every row inserted to the MySQL primary must eventually appear — with the +correct value — in the Materialize CDC source that reads from the +multithreaded MySQL replica. The pipeline is: + +``` +MySQL primary --GTID binlog--> MySQL replica (4 parallel workers) + | + Materialize CDC source + (antithesis_cluster) + | + antithesis_cdc table +``` + +## Instrumentation + +**Workload-side** — `test/antithesis/workload/test/parallel_driver_mysql_cdc.py`. + +Each `parallel_driver_` invocation: +1. Assigns a per-invocation `batch_id` prefix (Antithesis-seeded RNG). +2. Inserts `ROWS_PER_INVOCATION` (20) rows to `antithesis.cdc_test` on the + MySQL primary, recording the expected `{id → value}` map locally. +3. Requests an Antithesis quiet period (25 s) and polls `antithesis_cdc` in + Materialize until all expected rows appear or the 90 s budget expires. +4. Fires: + - `sometimes("mysql: CDC source caught up to all primary inserts after quiet period", …)` + — liveness anchor; confirms at least one invocation reaches full catchup. + - `always("mysql: CDC source row has correct value after catchup", …)` — safety; + fired once per row, catches wrong-value corruption. 
+ - `always("mysql: CDC source row count matches inserted count after catchup", …)` + — safety; catches extra phantom rows (count > expected) or missing rows + (count < expected) at the batch level. + +**First-run setup** — `test/antithesis/workload/test/first_mysql_replica_setup.py`. + +Runs once per Antithesis timeline before any parallel drivers start: +- Creates `antithesis.cdc_test` on the primary. +- Configures the replica channel (`CHANGE REPLICATION SOURCE TO … SOURCE_AUTO_POSITION=1`). +- Sets `replica_parallel_workers = 4`, `replica_preserve_commit_order = ON`. +- Starts the replica. +- Creates the Materialize connection (`antithesis_mysql_conn`), source + (`mysql_cdc_source`), and table (`antithesis_cdc`). +- Fires `reachable("mysql: first-run setup complete …")` so Antithesis can + confirm the setup path is exercised in every timeline. +- Fires `sometimes("mysql replica: antithesis.cdc_test replicated from primary within 90s", …)` + to confirm initial replication is flowing before the source is created. + +## Why This Property Matters + +MySQL CDC via a multithreaded replica is a distinct and failure-prone code +path compared to the Kafka/upsert path that the existing drivers exercise. +Key fault scenarios exposed: + +- **Replica lag under faults** — if Antithesis kills the MySQL replica + container, the replica restarts from its persisted GTID position (the + replica data volume is persistent). The Materialize source must reconnect + and resume without dropping rows. + +- **Parallel replication ordering** — with 4 parallel workers and + `replica_preserve_commit_order=ON`, the replica applies transactions + concurrently but in primary commit order. Antithesis can inject scheduling + jitter that stresses the ordering protocol. + +- **Primary kills** — if Antithesis kills the MySQL primary, the replica + loses its upstream. Materialize's CDC source must handle the replica going + silent gracefully (not panic, not report wrong data). + +- **Materialize clusterd restarts** — the MySQL CDC source resumes from the + last committed GTID in the persist shard, similar to the Kafka source + resume-offset logic. Existing `storage-command-replay-idempotent` property + is stressed through the MySQL code path. + +## Assertion Types Chosen + +- `sometimes(…)` for liveness (catchup): the system must make progress at + least once per run. Under heavy fault injection catchup may not complete + every invocation; that's expected. We care that it succeeds at least once. + +- `always(…)` for safety (per-row value, batch count): once we've confirmed + catchup, every observable row must be correct. This is a hard safety + invariant. + +- `reachable(…)` for setup completion: ensures Antithesis counts the + first-run setup as an exercised path across the run. + +## Related Properties + +- `storage-command-replay-idempotent` — MySQL CDC resume on clusterd restart + exercises the same command-history replay path as Kafka sources. +- `fault-recovery-exercised` — the `sometimes(…)` recovery probe also fires + after MySQL-induced coordinator failures. +- `kafka-source-survives-clusterd-restart` — shares the "source resumes after + storage worker kill" structure; MySQL adds the replica-replication dimension. 
+ +## Schema + +```sql +-- MySQL (primary and replica via replication): +CREATE TABLE antithesis.cdc_test ( + id VARCHAR(64) NOT NULL PRIMARY KEY, + batch_id VARCHAR(64) NOT NULL, + value TEXT NOT NULL, + updated_at TIMESTAMP(6) DEFAULT CURRENT_TIMESTAMP(6) + ON UPDATE CURRENT_TIMESTAMP(6) +); + +-- Materialize: +CREATE SECRET antithesis_mysql_password AS '…'; +CREATE CONNECTION antithesis_mysql_conn TO MYSQL ( + HOST 'mysql-replica', USER 'root', + PASSWORD SECRET antithesis_mysql_password +); +CREATE SOURCE mysql_cdc_source IN CLUSTER antithesis_cluster + FROM MYSQL CONNECTION antithesis_mysql_conn; +CREATE TABLE antithesis_cdc + FROM SOURCE mysql_cdc_source (REFERENCE antithesis.cdc_test); +``` diff --git a/test/antithesis/scratchbook/properties/offset-known-not-below-committed.md b/test/antithesis/scratchbook/properties/offset-known-not-below-committed.md new file mode 100644 index 0000000000000..7b1d830ee91d6 --- /dev/null +++ b/test/antithesis/scratchbook/properties/offset-known-not-below-committed.md @@ -0,0 +1,39 @@ +# offset-known-not-below-committed + +## Summary + +For every Kafka source, the statistics view always reports `offset_known >= offset_committed`. Causally, what the broker has told us is available cannot lag what Materialize has durably ingested. + +## Code + +- `src/storage/src/statistics.rs` (around line 56-71) — the statistics update path that previously allowed regression. Commit `3e32df1f69` introduced clamping so that on a restart where `offset_known` would be loaded from the broker watermark while `offset_committed` is restored from persist, the metric does not flip into the wrong order. + +## How to check it + +Workload-side polling: + +```sql +SELECT id, offset_known, offset_committed +FROM mz_internal.mz_source_statistics_per_worker +WHERE id = ? +``` + +`assert_always!(offset_known >= offset_committed, "kafka source statistics: offset_known < offset_committed")`. + +SUT-side: mirror as an `assert_always!` inside the statistics update path itself, immediately after both fields are computed but before the value is published. + +## What goes wrong on violation + +The lag metric `offset_known - offset_committed` becomes a small negative number that wraps to a huge positive number in dashboards (commonly displayed as `u64` or with `MAX(0, …)` clamping that hides the actual bug). Operational tooling that drives autoscaling or alerting off lag becomes unreliable. + +## Antithesis angle + +The most interesting timing is the very first sample after a clusterd restart. The order in which the source restores `offset_committed` (from the persist shard upper) and learns `offset_known` (from rdkafka's first metadata response) determines whether the invariant holds during the window where one is set and the other is zero. The fix in commit `3e32df1f69` clamps; Antithesis should verify the clamp covers every interleaving. + +## Existing instrumentation + +None. Pure workload-side polling assertion, optionally mirrored SUT-side. + +## Provenance + +Surfaced by: Data Integrity (metrics correctness). Direct regression target for commit `3e32df1f69`. 
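+
+A sketch of the SUT-side mirror proposed under "How to check it", with a placeholder function shell standing in for the real statistics update path; the assertion message is the one given above.
+
+```rust
+use antithesis_sdk::assert_always;
+use serde_json::json;
+
+// Called immediately after both fields are computed, before publication.
+fn check_offsets_before_publish(source_id: &str, offset_known: u64, offset_committed: u64) {
+    assert_always!(
+        offset_known >= offset_committed,
+        "kafka source statistics: offset_known < offset_committed",
+        &json!({
+            "source_id": source_id,
+            "offset_known": offset_known,
+            "offset_committed": offset_committed
+        })
+    );
+}
+```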
diff --git a/test/antithesis/scratchbook/properties/peek-lifecycle-exactly-once.md b/test/antithesis/scratchbook/properties/peek-lifecycle-exactly-once.md new file mode 100644 index 0000000000000..e0a3c6c682336 --- /dev/null +++ b/test/antithesis/scratchbook/properties/peek-lifecycle-exactly-once.md @@ -0,0 +1,35 @@ +# peek-lifecycle-exactly-once + +## Summary +Each peek command produces exactly one response — no duplicates, no leaks, no orphaned state. + +## Evidence + +### Code Paths +- `src/adapter/src/coord/peek.rs:80-95` — Explicit "1:1 contract between Peek and PeekResponseUnary" comment +- `src/adapter/src/coord/peek.rs:873-920` — Response routing with UUID tracking +- `src/adapter/src/coord/peek.rs:1174-1209` — `cancel_pending_peeks`: removes from client_pending_peeks then pending_peeks +- `src/adapter/src/coord/peek.rs:1256-1268` — `remove_pending_peek`: consistency check between two maps +- `src/adapter/src/coord/peek.rs:1221-1227` — `handle_peek_notification` removes before response + +### How It Works +Peeks are tracked in two maps: `pending_peeks` (UUID -> PendingPeek) and `client_pending_peeks` (ConnectionId -> Set). On response or cancellation, the peek is removed from both maps. Each UUID is unique (generated per-peek). + +### What Goes Wrong on Violation +- Leaked peeks: UUID stays in pending_peeks forever, growing memory until OOM +- Duplicate responses: client receives two result sets for one query +- Missing responses: client hangs waiting for a peek that was silently dropped + +### The Race Condition +The two-map removal (client_pending_peeks + pending_peeks) at lines 1256-1268 is not atomic. If CancelPendingPeeks races with PeekNotification: +1. Cancel removes UUID from client_pending_peeks +2. Peek response arrives, finds UUID in pending_peeks but not in client_pending_peeks +3. Orphaned state or double-processing + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: At coordinator shutdown or periodically, add `assert_always!(pending_peeks.is_empty() || active_connections_exist)` to detect leaks +- Candidate: On peek response, add `assert_always!` that UUID existed in pending_peeks before removal + +### Provenance +Surfaced by Protocol Contracts and Concurrency focuses. diff --git a/test/antithesis/scratchbook/properties/persist-cas-monotonicity.md b/test/antithesis/scratchbook/properties/persist-cas-monotonicity.md new file mode 100644 index 0000000000000..46ab8e6dd7bfe --- /dev/null +++ b/test/antithesis/scratchbook/properties/persist-cas-monotonicity.md @@ -0,0 +1,34 @@ +# persist-cas-monotonicity + +## Summary +Persist shard state versions (SeqNo) must never decrease across any observation point. + +## Evidence + +### Code Paths +- `src/persist-client/src/internal/state_versions.rs:48-87` — State version invariants: `earliest <= current.seqno` +- `src/persist-client/src/internal/state.rs:84-95` — `ROLLUP_THRESHOLD` and seqno-based rollup logic +- `src/persist-client/src/internal/state.rs:1324` — Invariant comment on rollup seqno +- `src/persist-client/src/internal/gc.rs` — GC respects seqno ordering +- `src/persist-client/src/write.rs:70-123` — WriteHandle CaS loop context + +### How It Works +Every state mutation increments SeqNo. The CaS loop in Machine reads current state, computes new state with SeqNo+1, and atomically writes via consensus. If another writer interleaved, the CaS fails and the writer retries with the newer SeqNo. Rollups periodically snapshot state; rollup seqno must be <= current seqno. 
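+
+A minimal sketch of the seqno check this implies, using the `assert_always_greater_than!` form recorded for this property in the catalog summary later in this scratchbook. `SeqNo` here is a stand-in newtype and the surrounding function is a placeholder for the CaS apply path.
+
+```rust
+use antithesis_sdk::assert_always_greater_than;
+use serde_json::json;
+
+#[derive(Clone, Copy, Debug)]
+struct SeqNo(u64);
+
+// Called after a successful CaS: the applied state's seqno must strictly
+// exceed the seqno the writer read before computing the new state.
+fn check_seqno_advanced(shard_id: &str, read_seqno: SeqNo, applied_seqno: SeqNo) {
+    assert_always_greater_than!(
+        applied_seqno.0,
+        read_seqno.0,
+        "persist: state seqno did not strictly increase across CaS apply",
+        &json!({ "shard": shard_id, "read": read_seqno.0, "applied": applied_seqno.0 })
+    );
+}
+```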
+
+### What Goes Wrong on Violation
+SeqNo regression means state reconstruction from rollup + diffs produces wrong state. GC could delete diffs that are still needed. Writers could overwrite each other's changes. This is a data corruption scenario.
+
+### Failure Scenario
+1. Writer A reads state at SeqNo 100, begins computing new state
+2. Writer B reads state at SeqNo 100, writes SeqNo 101
+3. Writer A attempts to write SeqNo 101 — CaS should fail (current is now 101)
+4. **Expected**: A retries, reads SeqNo 101, writes SeqNo 102
+5. **Bug**: the CaS comparison is stale and A's write at 101 succeeds despite B's 101, so one of the two updates is silently lost
+
+### SUT-Side Instrumentation Notes
+- No existing Antithesis assertions
+- Candidate: `Machine::apply_unbatched_cmd` — add `assert_always!(new_seqno > old_seqno)` after every state transition
+- Candidate: State reconstruction from rollup + diffs — add `assert_always!` that reconstructed state matches expected
+
+### Provenance
+Surfaced by Data Integrity and Distributed Coordination focuses.
diff --git a/test/antithesis/scratchbook/properties/reclock-mint-eventually-succeeds.md b/test/antithesis/scratchbook/properties/reclock-mint-eventually-succeeds.md
new file mode 100644
index 0000000000000..ee2fb633240e4
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/reclock-mint-eventually-succeeds.md
@@ -0,0 +1,61 @@
+# reclock-mint-eventually-succeeds
+
+## Summary
+
+Under transient persist outages and competing writers, the reclock mint loop (`compare_and_append` with `UpperMismatch` retry, `src/storage/src/source/reclock.rs:160-166`) eventually completes for every source-frontier advance that has data to bind.
+
+## Code
+
+```rust
+// src/storage/src/source/reclock.rs (around line 150-170)
+loop {
+    match handle.compare_and_append(updates, prev_upper, new_into_upper).await {
+        Ok(()) => break,
+        Err(UpperMismatch { current, .. }) => {
+            self.sync(&current).await;
+            // recompute updates and retry
+        }
+    }
+}
+```
+
+There is no upper bound on this loop. It depends on the persist backend eventually being responsive and on competing writers not livelocking the source.
+
+## Why this is a liveness property
+
+Antithesis's job is to assert that the loop terminates in adversarial schedules. The catalog entry asserts both:
+
+1. The retry path is *exercised* (the loop runs more than once at least once during a run): `Sometimes(saw_cas_retry)`.
+2. The source frontier eventually advances past the contention point: a workload-observable liveness check.
+
+## How to check it
+
+SUT-side anchor:
+- Add an `assert_sometimes!(reclock_cas_retry_succeeded, "reclock: mint compare_and_append retry succeeded")` immediately after a successful `compare_and_append` that was preceded by at least one `UpperMismatch`. The local counter is reset on each `mint()` invocation.
+
+Workload-side liveness check:
+- After injecting persist consensus latency or a competing-writer scenario, observe the source's `offset_committed` advancing in `mz_internal.mz_source_statistics_per_worker`. `assert_sometimes!(source_advanced_post_contention, …)`.
+
+## What goes wrong on violation
+
+The source's frontier stops advancing without any external signal that something is wrong. Health reports `Running`. The reclock operator is in an infinite `compare_and_append` → `UpperMismatch` → `sync` → `compare_and_append` cycle. To an operator looking from the outside, it looks like Kafka is the problem.
+
+## Antithesis angle
+
+- Inject high persist consensus latency.
With many concurrent storage workers (or restart-induced competing writers), the CaS contention rate climbs and the retry loop runs many times. Antithesis tests that progress still happens. +- Race the metadata fetcher's partition-add against an in-flight mint. The mint is now reckoning with an extended `source_upper`; the CaS retry must recompute updates correctly. +- Concurrent kill+restart cycles that create competing-writer scenarios. + +## Open question (resolved) + +Q: Is there any input under which `compare_and_append` returns a non-retryable error and the loop should exit? + +A: Yes — `InvalidUsage` errors (handled by `panic!("compare_and_append failed: {invalid_use}")` at `reclock/compat.rs:306`). Those terminate the source. The retry loop only handles `UpperMismatch`. Antithesis fault injection should not produce `InvalidUsage` under correct code; if it does, that is a separate property (`reclock-cas-no-invalid-usage`) but it falls under the broader `kafka-source-no-internal-panic` property already cataloged. + +## Existing instrumentation + +None. The retry loop is silent. + +## Provenance + +Surfaced by: Failure Recovery, Distributed Coordination. diff --git a/test/antithesis/scratchbook/properties/remap-shard-antichain-wellformed.md b/test/antithesis/scratchbook/properties/remap-shard-antichain-wellformed.md new file mode 100644 index 0000000000000..75a17a7446664 --- /dev/null +++ b/test/antithesis/scratchbook/properties/remap-shard-antichain-wellformed.md @@ -0,0 +1,55 @@ +# remap-shard-antichain-wellformed + +## Summary + +At every Materialize timestamp `t`, the contents of the source's remap shard accumulated to `t` form a well-formed `Antichain`. Each source-time element has multiplicity exactly 1; for multi-partition Kafka sources, there is one element per partition range with no overlaps. + +## Origin + +This invariant is stated explicitly in the `ReclockOperator` doc comment (`src/storage/src/source/reclock.rs:31-34`): + +> "The `ReclockOperator` will always maintain the invariant that for any time `IntoTime` the remap collection accumulates into an Antichain where each `FromTime` timestamp has frequency `1`. In other words the remap collection describes a well formed `Antichain` as it is marching forwards." + +## Code paths + +- `src/storage/src/source/reclock.rs:118-169` — `ReclockOperator::mint`. Each call: + 1. Emits retractions (`-1`) of the prior `source_upper`. + 2. Emits insertions (`+1`) of the new `source_upper`. + 3. Calls `compare_and_append` on the remap shard. + 4. On `UpperMismatch`, `sync()` and retry. +- `src/storage/src/source/reclock.rs:124` — `assert!(!new_into_upper.less_equal(&binding_ts))` guards the mint precondition. +- `src/storage/src/source/reclock.rs:321` — `assert!(prev < RB::before(pid))` guards the partition-range ordering. +- `src/storage/src/source/reclock/compat.rs:144` — `assert!` on persist handle state. +- `src/storage/src/source/reclock/compat.rs:306` — `panic!("compare_and_append failed: {invalid_use}")` for genuinely invalid CaS calls. + +## Antithesis form + +Two complementary checks: + +1. **SUT-side** inside `ReclockOperator::sync` / `mint`, after every update: walk the local accumulated state and `assert_always!(antichain_wellformed, "reclock: remap shard accumulates to well-formed antichain")` — every source-time element has multiplicity 1. This is the tightest expression of the invariant. + +2. 
**Workload-side** as a periodic SQL probe: select the remap shard's contents (via `mz_internal` introspection views if available) and verify the well-formed property externally. This catches the case where the SUT-side check is correct but the durable persist state diverges.
+
+## What goes wrong on violation
+
+A malformed remap antichain corrupts every subsequent restart's resume frontier. The source either skips data (resume frontier too far ahead), re-reads data (too far back), or panics in downstream operators that depend on well-formed antichains (e.g., the as_of computation in commit `e3805ad790`).
+
+## Antithesis angle
+
+- Concurrent reclock writers across restart: kill the storage worker mid-mint and restart; the new worker must `sync()` the durable state and re-mint from there. If `sync()` is wrong, the new worker may insert without retracting, breaking multiplicity.
+- Partition adds/removes interleaved with mints: the partition-range encoding in `RangeBound` is the part that has to stay consistent across discovery and binding.
+- `compare_and_append` retry loop interactions: the historical bug at reclock.rs:160-166 was retried correctly, but the cached upper drift (commit `e3805ad790`) bypassed it.
+
+## Open question (resolved)
+
+Q: Can the in-memory `source_upper` and the persisted remap state ever diverge enough that the operator emits a malformed update batch?
+
+A: The `MutableAntichain` in `ReclockOperator::source_upper` is the source of truth for what *should* be persisted next. `mint()` constructs the update batch by diffing the new desired upper against the current `source_upper`. The retraction-insertion structure is what preserves the antichain-multiplicity invariant. The only divergence path is if `sync()` after `UpperMismatch` reads a state inconsistent with what `source_upper` thinks — i.e., a true persist corruption. The assertion at compat.rs:144 is meant to catch this.
+
+## Existing instrumentation
+
+The `assert!` and `panic!` calls at reclock.rs:124, :321 and compat.rs:144, :306 exist. None of them check the *accumulated antichain* property directly — they check local invariants. The recommended new assertion is an `assert_always!` over the in-memory accumulator that runs at every state transition.
+
+## Provenance
+
+Surfaced by: Data Integrity, Distributed Coordination. Foundational invariant for the entire reclocking subsystem.
diff --git a/test/antithesis/scratchbook/properties/source-ingestion-progress.md b/test/antithesis/scratchbook/properties/source-ingestion-progress.md
new file mode 100644
index 0000000000000..aa3b83c54f9cd
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/source-ingestion-progress.md
@@ -0,0 +1,27 @@
+# source-ingestion-progress
+
+## Summary
+Kafka source ingestion eventually makes progress — the source frontier advances.
+
+## Evidence
+
+### Code Paths
+- `src/storage/src/render/sources.rs` — Source operator assembly (Kafka, Postgres, MySQL connectors)
+- `src/storage/src/source/reclock.rs` — Timestamp reclocking from source timestamps to Materialize timeline
+- `src/storage/src/render/persist_sink.rs` — Writes ingested data to persist shards
+
+### How It Works
+Storage workers connect to external sources (Kafka brokers, Postgres replication slots), read data, reclock timestamps, and write to persist. The source's upper frontier advances as data is ingested and persisted.
+
+### What Goes Wrong on Violation
+Source stalls: materialized views stop updating, users see stale data indefinitely.
This is the most visible user-facing failure mode for a streaming database. + +### Why This Is a Liveness Property +We want to confirm the system reaches a state where source data is flowing. Under fault injection (network partitions to Kafka, storage worker crashes), the source should eventually resume and make progress. + +### SUT-Side Instrumentation Notes +- Best verified at workload level: produce N messages to Kafka, query the source table, assert row count eventually reaches N +- Candidate: Add `assert_sometimes!(source_frontier_advanced)` in the persist sink write path + +### Provenance +Surfaced by Product Context focus. diff --git a/test/antithesis/scratchbook/properties/storage-command-replay-idempotent.md b/test/antithesis/scratchbook/properties/storage-command-replay-idempotent.md new file mode 100644 index 0000000000000..8046c29c5612e --- /dev/null +++ b/test/antithesis/scratchbook/properties/storage-command-replay-idempotent.md @@ -0,0 +1,28 @@ +# storage-command-replay-idempotent + +## Summary +Replaying storage command history after reconnection is idempotent — no duplicate ingestion or state divergence. + +## Evidence + +### Code Paths +- `src/storage-controller/src/history.rs:20-80` — CommandHistory reduces and replays +- `src/storage-controller/src/instance.rs:46-80` — Replica rehydration via command history +- `src/storage-controller/src/persist_handles.rs:98-120` — Append retry semantics with Timestamp tracking + +### How It Works +The storage controller maintains a command history for each replica. On reconnection, it replays the reduced history. The history is compacted to remove superseded commands (e.g., only the latest configuration for each source). Sources resume from persisted offsets in persist, not from the beginning. + +### What Goes Wrong on Violation +Duplicate data appears in sources. Since materialized views are computed incrementally from sources, duplicates propagate to all downstream views. Users see incorrect aggregation results (double-counted rows). + +### Key Subtlety +Command history compaction assumes idempotency, but no explicit duplicate detection is observed in the code. If a RunIngestionCommand is partially executed (source starts but crashes before position is persisted), replay could re-ingest data from the last persisted offset, which may differ from the actual last-processed offset. + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: After replay, add `assert_always!` that source read position >= position before crash +- Candidate: After ingestion resumes, add `assert_always!` comparing row counts with expected deduplication + +### Provenance +Surfaced by Failure Recovery focus. diff --git a/test/antithesis/scratchbook/properties/strict-serializable-reads.md b/test/antithesis/scratchbook/properties/strict-serializable-reads.md new file mode 100644 index 0000000000000..450d623b4c6f3 --- /dev/null +++ b/test/antithesis/scratchbook/properties/strict-serializable-reads.md @@ -0,0 +1,34 @@ +# strict-serializable-reads + +## Summary +Reads respect the timestamp oracle's linearization point — later reads see all changes visible to earlier reads. 
+ +## Evidence + +### Code Paths +- `src/adapter/src/coord/timestamp_selection.rs:40-52` — When `chosen_ts` differs from `oracle_ts`, peek results must be delayed until oracle catches up +- `src/adapter/src/coord/sequencer/inner.rs:2097-2116` — Strict serializable reads tracked via `strict_serializable_reads_tx` +- `src/adapter/src/coord/timestamp_selection.rs:228-240` — `needs_linearized_read_ts` check +- `src/adapter/src/coord/in_memory_oracle.rs:92-101` — Oracle timestamp advancement + +### How It Works +The coordinator assigns every read a timestamp from the oracle. The oracle maintains a monotonically advancing timestamp. Strict serializable reads wait for the oracle to confirm their timestamp is linearized before returning results. This ensures no read can see a state "in the past" relative to another concurrent read. + +### What Goes Wrong on Violation +Users observe non-repeatable reads: query A at time T sees data that query B at time T+1 does not see. This violates the strict serializability contract that is Materialize's primary differentiator from other streaming systems. + +### Workload-Level Verification +This property is best verified at the workload level: +1. Client A writes row R and receives acknowledgment +2. Client B reads and must see R (or a later state including R) +3. Client C reads and must see at least what B saw + +The workload checks SQL results, not internal state. + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: `timestamp_selection.rs` oracle advancement — add `assert_always!` that oracle timestamp never decreases +- Candidate: After peek response, add workload-side `Always` assertion comparing read timestamp ordering with data ordering + +### Provenance +Surfaced by Protocol Contracts focus (merged from timestamp-oracle-linearization and strict-serializable-ordering). diff --git a/test/antithesis/scratchbook/properties/tombstone-sealing-finality.md b/test/antithesis/scratchbook/properties/tombstone-sealing-finality.md new file mode 100644 index 0000000000000..bc97da01197ae --- /dev/null +++ b/test/antithesis/scratchbook/properties/tombstone-sealing-finality.md @@ -0,0 +1,22 @@ +# tombstone-sealing-finality + +## Summary +Once a shard is tombstoned (upper and since both empty antichain), no further mutations are possible. + +## Evidence + +### Code Paths +- `src/persist-client/src/internal/state.rs:2128-2134` — `is_tombstone()` checks upper.is_empty() && since.is_empty() && writers.is_empty() && critical_readers.is_empty() +- `src/persist-client/src/internal/state.rs:1703-1712` — compare_and_append short-circuits on tombstone +- `src/persist-client/src/internal/state.rs:2146-2159` — `become_tombstone_and_shrink()` transition + +### What Goes Wrong on Violation +If a tombstoned shard accepts new writes, deleted tables/views could have data resurrected. This would confuse users and violate the contract that DROP TABLE removes data permanently. + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: After `is_tombstone()` returns true, add `assert_always!` that subsequent append attempts return error +- Candidate: `become_tombstone_and_shrink()` — add `assert_unreachable!` after the transition if any subsequent mutation succeeds + +### Provenance +Surfaced by Data Integrity focus. 
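+
+A sketch of the first candidate under "SUT-Side Instrumentation Notes": once `is_tombstone()` is true, any further append attempt must be rejected. The outcome enum and the function shell are placeholders for however the append result is represented at the real call site.
+
+```rust
+use antithesis_sdk::assert_always;
+use serde_json::json;
+
+// Placeholder for the append outcome at the call site.
+enum AppendOutcome {
+    Applied,
+    Rejected,
+}
+
+fn check_tombstone_rejects_append(shard_id: &str, is_tombstone: bool, outcome: &AppendOutcome) {
+    assert_always!(
+        !is_tombstone || matches!(outcome, AppendOutcome::Rejected),
+        "persist: tombstoned shard accepted a compare_and_append",
+        &json!({ "shard": shard_id })
+    );
+}
+```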
diff --git a/test/antithesis/scratchbook/properties/upsert-decode-error-retractable.md b/test/antithesis/scratchbook/properties/upsert-decode-error-retractable.md new file mode 100644 index 0000000000000..850914b374346 --- /dev/null +++ b/test/antithesis/scratchbook/properties/upsert-decode-error-retractable.md @@ -0,0 +1,42 @@ +# upsert-decode-error-retractable + +## Summary + +An `UpsertError` (key decode failure, null key, or value decode failure) for a key is retracted once a subsequent valid `(key, value)` message for the same key is ingested. After settling, the source reflects the corrected value and contains no remaining error row for that key. + +This is the upsert envelope's recovery contract for upstream schema mistakes — "fix the bad message and continue" without dropping the source. + +## Code paths + +- `src/storage/src/render/sources.rs` — `upsert_commands` (line ~509-560 and following): maps decode failures to `UpsertError::NullKey` / `KeyDecode` / `Value`. The result still flows through the upsert pipeline keyed by `UpsertKey::from_key(Err(&err))` so a future good value can retract it. +- `src/storage-types/src/errors.rs:161-199` — `EnvelopeError::Upsert(UpsertError)` is the *retractable* error variant. `EnvelopeError::Flat(text)` is explicitly *not retractable*. +- `src/storage/src/upsert.rs:748-750` — error emission paths. + +## How to check it + +Workload procedure: +1. Produce a malformed message for key `K` (e.g., invalid Avro under a schema-registry-backed source, or null key on a non-null-key source). +2. Verify the source contains an error row keyed by `K`. +3. Produce a valid `(K, value)` message. +4. After quiet period, `assert_always!(upsert_error_retracted, "upsert: bad value retracted by subsequent good value")` checking that `SELECT * FROM source WHERE key = K` returns exactly one row with `value`, no error row. + +## What goes wrong on violation + +If the error is not retractable, the source carries a stuck error row that nothing can clear — the only recovery is to drop and re-create the source. + +## Distinguishing retractable from non-retractable + +This property targets `EnvelopeError::Upsert(_)` only. `EnvelopeError::Flat(_)` is explicitly non-retractable and should not be tested with this property. Workloads must take care to produce errors that map to the Upsert variant — null key, malformed key/value under upsert mode — rather than envelope-fatal errors. + +## Antithesis angle + +- Race the bad and good messages closely. Verify ordering is preserved. +- Crash clusterd between the bad message ingesting and the good message ingesting. The error row must persist across the restart and the good message must retract it on resume. + +## Existing instrumentation + +None. Workload-side check. + +## Provenance + +Surfaced by: Protocol Contracts, Failure Recovery. diff --git a/test/antithesis/scratchbook/properties/upsert-ensure-decoded-called-before-access.md b/test/antithesis/scratchbook/properties/upsert-ensure-decoded-called-before-access.md new file mode 100644 index 0000000000000..244fb4a4ed01d --- /dev/null +++ b/test/antithesis/scratchbook/properties/upsert-ensure-decoded-called-before-access.md @@ -0,0 +1,43 @@ +# upsert-ensure-decoded-called-before-access + +## Summary + +The six `StateValue` accessors that require the cell to be in `Value` form are always called after `ensure_decoded` has been called on that cell — the panics that currently guard the type-state protocol never fire. 
+ +## Targeted sites + +`src/storage/src/upsert/types.rs`: + +| Line | Accessor | Message | +|------|----------|---------| +| 297 | `into_decoded` | `panic!("called \`into_decoded without calling \`ensure_decoded\`")` | +| 369 | `into_provisional_value` | `panic!("called \`into_provisional_value\` without calling \`ensure_decoded\`")` | +| 403 | `into_provisional_tombstone` | `panic!("called \`into_provisional_tombstone\` without calling \`ensure_decoded\`")` | +| 416 | `provisional_order` | `panic!("called \`provisional_order\` without calling \`ensure_decoded\`")` | +| 430 | `provisional_value_ref` | `panic!("called \`provisional_value_ref\` without calling \`ensure_decoded\`")` | +| 440 | `into_finalized_value` | `panic!("called \`into_finalized_value\` without calling \`ensure_decoded\`")` | + +Each becomes `assert_unreachable!("upsert: on Consolidating StateValue")` with a distinct, accessor-specific message. + +## Why this is a real property, not just dead code + +Two reasons. + +1. **Refactor net.** The upsert operator has been rewritten twice (`upsert_classic`, `upsert_continual_feedback`, `upsert_continual_feedback_v2`). Every rewrite added new call sites that touch `StateValue`. A future refactor that forgets to call `ensure_decoded` would today abort clusterd; with the Antithesis SDK in place, it surfaces as a property failure during the very first nightly run after the change. +2. **Replay anchors.** If Antithesis ever does trip one of these, the failure pinpoints the exact accessor and code path. That is materially more useful than a stack trace from a process abort, especially in a multi-replica scenario where the abort is invisible behind clusterd's auto-restart. + +## What this property does *not* catch + +This property only checks the type-state protocol — "ensure_decoded was called first." It does not check that the consolidating math itself is correct (that is `upsert-state-consolidation-wellformed`). The two are complementary. + +## Antithesis angle + +These panics are most likely to fire after a code change to the upsert operator's hot path. Antithesis exercises every operator branch with random fault injection — it should reach the rewrite-sensitive accessor sites if any exist. Cost of instrumenting is trivial (rename `panic!` to `assert_unreachable!`); the value is the regression net. + +## Existing instrumentation + +The `panic!`s already exist. They abort the process on misuse. The work is wrapping each with `assert_unreachable!` so the misuse is reported. + +## Provenance + +Surfaced by: Wildcard (this is the type-state guard family that doesn't fit a standard focus). diff --git a/test/antithesis/scratchbook/properties/upsert-key-reflects-latest-value.md b/test/antithesis/scratchbook/properties/upsert-key-reflects-latest-value.md new file mode 100644 index 0000000000000..90341358df926 --- /dev/null +++ b/test/antithesis/scratchbook/properties/upsert-key-reflects-latest-value.md @@ -0,0 +1,63 @@ +# upsert-key-reflects-latest-value + +## Summary + +At a settled timestamp, every key in an UPSERT-envelope source maps to the value from the last `(key, value)` message produced — or to no row if the last message for that key was a tombstone. + +## Code paths + +- `src/storage/src/render/sources.rs` — `upsert_commands` converts `DecodeResult` into `(UpsertKey, Option, FromTime)`. `UpsertKey` is a SHA-256 of the key bytes (collision probability `2^-128`). +- `src/storage/src/upsert.rs` — `upsert_classic`: the main operator. For each input update at `from_time`: + 1. 
`multi_get(key)` → returns prior value + prior order key. + 2. Skip if `from_time <= prior_order` (stale update). + 3. Emit retraction of prior value at the new timestamp. + 4. Emit insertion of new value at the new timestamp. + 5. `multi_put(key, new_value)` updates the state store. +- `src/storage/src/upsert_continual_feedback.rs` and `_v2.rs` — alternative implementations driven by persist feedback. Same contract, different consolidation strategy. +- `src/storage/src/upsert/types.rs` — `StateValue::ensure_decoded` (~line 589) finalizes the XOR-checksum consolidating state into either a `Value` or a `tombstone`. Critical for snapshot replay correctness. + +## How to check it + +Workload-level: +1. Workload tracks `expected_state: Map>` of what was last produced per key. +2. After fault quiet period, for a sampled set of keys: `SELECT value FROM source WHERE key = ?` and compare to `expected_state[key]`. +3. `assert_always!(upsert_value_matches_latest_produced, "upsert: key value matches latest produced")` — checked on every sample. If the workload notices a divergence, it logs the diff (expected vs. observed) for replay. + +## What goes wrong on violation + +The source returns a stale value for a key. The user's downstream MV uses it. The bug is invisible until someone manually compares the source to the upstream system. + +## Antithesis angle + +- Crash clusterd between `multi_get` and `multi_put`. The next incarnation must reconstruct state correctly from feedback. +- Race produce ordering: if Kafka delivers `(k, v1)` then `(k, v2)`, the source's order-key tracking must serialize them. Order-key regression caused a historical panic (commit `f177db8286`, materialize#26655). +- For RocksDB backend: race `multi_put` against the merge operator running async. +- For multi-replica: both replicas process the same key concurrently (commit `1accbe28b3`). + +## Open question (resolved) + +Q: Does the workload need to know about the per-source `order_key` to validate, or is `from_time` ordering sufficient? + +A: For correctness asserting at quiet periods, the workload only needs the *Kafka* produce order — the operator's job is to translate that into the correct visible value. Since Antithesis injects faults but doesn't reorder Kafka's per-partition delivery, the workload can rely on per-partition produce order to determine `expected_state`. Cross-partition reordering is not a concern because the workload assigns each key to a fixed partition. + +## Existing instrumentation + +None. Pure workload-side check. Optional SUT anchor: an `assert_sometimes!(upsert_emit_correct_retraction, …)` inside `upsert.rs` after a retraction is emitted whose prior value matched what was stored — this gives Antithesis a positive signal that the prior-value-lookup path is being exercised. + +## Implementation status + +Implemented 2026-05-11 as `test/antithesis/workload/test/parallel_driver_upsert_latest_value.py`. 
Three assertion messages, each unique: + +| Message | Type | When | +|---------|------|------| +| `"upsert: SELECT for key matches latest produced value"` | `always` | Per sampled live key after quiet-period catchup | +| `"upsert: tombstoned key has no row in source"` | `always` | Per sampled key whose last produced message was a tombstone | +| `"upsert: source caught up to produced offsets after quiet period"` | `sometimes` | Once per invocation; liveness anchor proving the safety assertions ran against settled data | + +Shared helpers introduced for this property and reusable by every subsequent Kafka source property: `helper_pg.py` (resilient pgwire), `helper_kafka.py` (producer + delivery tracker), `helper_quiet.py` (`ANTITHESIS_STOP_FAULTS` wrapper), `helper_random.py` (deterministic randomness with Antithesis SDK), `helper_source_stats.py` (catchup polling on `mz_internal.mz_source_statistics`), `helper_upsert_source.py` (idempotent `CREATE CONNECTION` + `CREATE SOURCE`). + +No SUT-side instrumentation added in this pass — that is the candidate work in `properties/upsert-no-internal-panic.md`, `properties/upsert-state-consolidation-wellformed.md`, and `properties/upsert-ensure-decoded-called-before-access.md`. + +## Provenance + +Surfaced by: Data Integrity, Concurrency. Direct regression target for materialize#26655. diff --git a/test/antithesis/scratchbook/properties/upsert-no-internal-panic.md b/test/antithesis/scratchbook/properties/upsert-no-internal-panic.md new file mode 100644 index 0000000000000..e9d097626e601 --- /dev/null +++ b/test/antithesis/scratchbook/properties/upsert-no-internal-panic.md @@ -0,0 +1,43 @@ +# upsert-no-internal-panic + +## Summary + +The upsert operator's explicit `assert!`s and `panic!`s — currently process-aborting guards — never fire under any Antithesis-injected fault sequence. Each site is converted to a uniquely-messaged `assert_always!` / `assert_unreachable!` so a firing surfaces as a reportable Antithesis property failure rather than a clusterd crash. 
+ +## Targeted assertion sites + +| File | Line | Site | Antithesis form | +|------|------|------|------------------| +| `src/storage/src/upsert.rs` | 541 | `assert!(diff.is_positive(), "invalid upsert input")` | `assert_always!(diff.is_positive(), "upsert: input diff positive (classic)")` | +| `src/storage/src/upsert.rs` | 636 | `panic!("key missing from commands_state")` | `assert_unreachable!("upsert: key missing from commands_state (classic)")` | +| `src/storage/src/upsert.rs` | 1031 | `unreachable!("pending future never returns")` | `assert_unreachable!("upsert: pending future returned (classic)")` | +| `src/storage/src/upsert_continual_feedback.rs` | 626 | `assert!(diff.is_positive(), "invalid upsert input")` | `assert_always!(diff.is_positive(), "upsert: input diff positive (cf v1)")` | +| `src/storage/src/upsert_continual_feedback.rs` | 800 | `panic!("key missing from commands_state")` | `assert_unreachable!("upsert: key missing from commands_state (cf v1)")` | +| `src/storage/src/upsert_continual_feedback_v2.rs` | 315 | `assert!(diff.is_positive(), "invalid upsert input")` | `assert_always!(diff.is_positive(), "upsert: input diff positive (cf v2)")` | +| `src/storage/src/upsert_continual_feedback_v2.rs` | 483 | `unreachable!()` on `(None, None)` from joined prior/new state | `assert_unreachable!("upsert: cf v2 join produced (None, None)")` | +| `src/storage/src/upsert/types.rs` | 580 | `panic!("merge_update_state called with non-consolidating state")` | `assert_unreachable!("upsert: merge_update_state on non-Consolidating state")` | +| `src/storage/src/upsert/types.rs` | 1062 | `panic!("attempted completion of already completed upsert snapshot")` | `assert_unreachable!("upsert: snapshot completion called twice")` | + +Each message is unique; an Antithesis failure report names exactly the site that was reached. + +## Why these sites + +These are structural invariants the operator's authors believed to be impossible. Bug history confirms several have fired in production (commits `f177db8286`, `1accbe28b3`). The cost of wrapping them with the Antithesis SDK is trivial; the upside is reportable, replayable property failures. + +## Antithesis angle + +- Multi-replica clusters: most relevant for `key missing from commands_state` and the `unreachable!` on `(None, None)`. +- Order-key edge cases: maps to the `assert!(diff.is_positive())` family. +- Snapshot completion: the `panic!("attempted completion of already completed upsert snapshot")` is reached if the snapshot-completion state machine is re-entered (rehydration after a crash that already completed snapshot). + +## Relationship to other properties + +This property is the *operator-internal* counterpart to `upsert-state-consolidation-wellformed` (which guards the math in `ensure_decoded`) and `upsert-ensure-decoded-called-before-access` (which guards the type-state protocol on `StateValue` accessors). Together they form the SUT-side instrumentation backbone for the UPSERT envelope. + +## Existing instrumentation + +The `assert!` / `panic!` calls already exist as process-aborting guards. They abort in test today; the work is converting them to `assert_always!`/`assert_unreachable!` so failures are *reported* rather than masked as "clusterd was restarted." Each site gets a distinct, specific message per the property-catalog requirement that assertion messages be unique. + +## Provenance + +Surfaced by: Concurrency, Failure Recovery. Regression targets: commits `f177db8286`, `1accbe28b3`, materialize#26655, database-issues#9160. 
diff --git a/test/antithesis/scratchbook/properties/upsert-state-consolidation-wellformed.md b/test/antithesis/scratchbook/properties/upsert-state-consolidation-wellformed.md new file mode 100644 index 0000000000000..d65161bba6766 --- /dev/null +++ b/test/antithesis/scratchbook/properties/upsert-state-consolidation-wellformed.md @@ -0,0 +1,75 @@ +# upsert-state-consolidation-wellformed + +## Summary + +`StateValue::ensure_decoded` always finalizes a `Consolidating` cell into either a `Value(value)` (when `diff_sum == 1` and the recovered bytes match the stored `len_sum` and seahash `checksum_sum`) or a `tombstone()` (when `diff_sum == 0` and the entire accumulator is zero). Any other state — non-{0,1} `diff_sum`, mismatched checksum, non-zero residue on a tombstone — is an XOR/accounting corruption and must never be observed. + +## Code + +`src/storage/src/upsert/types.rs:584-682`: + +```rust +pub fn ensure_decoded(&mut self, bincode_opts, source_id, key) { + match self { + StateValue::Consolidating(consolidating) => { + match consolidating.diff_sum.0 { + 1 => { + let len = usize::try_from(consolidating.len_sum.0)...expect(...); + let value = &consolidating.value_xor.get(..len)...expect(...); + assert_eq!(consolidating.checksum_sum.0, seahash::hash(value) as i64, ...); + *self = Self::finalized_value(bincode_opts.deserialize(value).unwrap()); + } + 0 => { + assert_eq!(consolidating.len_sum.0, 0, ...); + assert_eq!(consolidating.checksum_sum.0, 0, ...); + assert!(consolidating.value_xor.iter().all(|&x| x == 0), ...); + *self = Self::tombstone(); + } + other => panic!("invalid upsert state: non 0/1 diff_sum: {other}, ..."), + } + } + StateValue::Value(_) => {} + } +} +``` + +## Antithesis form + +Each of the four assertions in this function becomes a uniquely-messaged `assert_always!`: + +| Existing | Antithesis form | Message | +|---|---|---| +| `assert_eq!(checksum_sum, seahash::hash(value))` (621) | `assert_always!(checksum_sum == seahash::hash(value), …)` | `"upsert: consolidating checksum_sum mismatch (diff_sum=1)"` | +| `assert_eq!(len_sum, 0)` (632) | `assert_always!(len_sum == 0, …)` | `"upsert: consolidating len_sum nonzero (diff_sum=0)"` | +| `assert_eq!(checksum_sum, 0)` (637) | `assert_always!(checksum_sum == 0, …)` | `"upsert: consolidating checksum_sum nonzero (diff_sum=0)"` | +| `assert!(value_xor.iter().all(==0))` (642) | `assert_always!(value_xor.iter().all(==0), …)` | `"upsert: consolidating value_xor nonzero (diff_sum=0)"` | +| `panic!("invalid upsert state: non 0/1 diff_sum: {other}, …")` (672) | `assert_always!(false, …)` | `"upsert: consolidating diff_sum not in {0,1}"` | + +Plus the two `expect("invalid upsert state")` calls at 606 and 619 (slice-into-bytes failures); these should become `assert_always!(value_xor.len() >= len, …)` with a distinct message. + +## What goes wrong on violation + +The XOR-based consolidation collapses many `(diff, bytes)` updates per key into a single accumulator. The math only works if every retraction is exactly paired with its insertion. A trip into the non-{0,1} branch indicates one of: + +- A duplicate retraction (commit `1accbe28b3` style multi-replica double-drain). +- A retraction without a matching insertion in the replay stream (incomplete feedback delivery across crash). +- A `seahash` collision (negligible probability — if seen, it's a bug elsewhere, not the hash). +- A bug in the `merge_update_state` math (`upsert/types.rs:533+`). 
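+
+For concreteness, a sketch of the first and last rows of the conversion table above as standalone code. The field values are passed in directly instead of being read from `consolidating`, `seahash` is the hash already used in the excerpt, and the existing panics could stay in place after the SDK calls since the macros only report.
+
+```rust
+use antithesis_sdk::assert_always;
+use serde_json::json;
+
+fn check_consolidated(diff_sum: i64, checksum_sum: i64, value: &[u8]) {
+    match diff_sum {
+        1 => {
+            // Row 1: checksum of the recovered bytes must match checksum_sum.
+            assert_always!(
+                checksum_sum == seahash::hash(value) as i64,
+                "upsert: consolidating checksum_sum mismatch (diff_sum=1)",
+                &json!({ "checksum_sum": checksum_sum, "len": value.len() })
+            );
+        }
+        0 => {
+            // Rows 2-4 (len_sum, checksum_sum, value_xor all zero) go here.
+        }
+        other => {
+            // Row 5: any other diff_sum is an accounting corruption.
+            assert_always!(
+                false,
+                "upsert: consolidating diff_sum not in {0,1}",
+                &json!({ "diff_sum": other })
+            );
+        }
+    }
+}
+```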
+ +## Antithesis angle + +- Kill clusterd mid-feedback-replay; restart and assert that `ensure_decoded` always completes cleanly. +- Multi-replica with concurrent drains feeding the same RocksDB backend. +- Race RocksDB's async merge operator against `multi_put`. + +## Why this is the deepest signal + +The XOR/checksum consolidation is the *math*: if this assertion ever trips, something upstream — feedback delivery, retraction emission, or order-key tracking — produced an inconsistent update sequence. The signal is high because the assertion is at the *bottom* of the pipeline; everything else has had a chance to introduce the bug, but only this site can detect it. + +## Existing instrumentation + +The runtime `panic!` and `assert!`s already exist and would abort clusterd on violation. Today, an abort in test looks like "the storage worker crashed" — possibly retried, possibly noticed only via a log scrape. Wrapping them with Antithesis assertions turns each into a reportable, replay-anchored property failure with a unique signature. + +## Provenance + +Surfaced by: Data Integrity, Concurrency (via the multi-replica drain bug history). diff --git a/test/antithesis/scratchbook/properties/upsert-state-rehydrates-correctly.md b/test/antithesis/scratchbook/properties/upsert-state-rehydrates-correctly.md new file mode 100644 index 0000000000000..287d967d02c47 --- /dev/null +++ b/test/antithesis/scratchbook/properties/upsert-state-rehydrates-correctly.md @@ -0,0 +1,63 @@ +# upsert-state-rehydrates-correctly + +## Summary + +After a clusterd restart, the rehydrated upsert state — observed via `SELECT * FROM source` — equals the state at the most recent durable timestamp before the restart, for every key produced so far. + +## Code paths + +- `src/storage/src/upsert.rs:791-799` — snapshot phase: drain input at `resume_upper` boundary, all snapshot values marked with `provisional_order = None` (sorts lowest). +- `src/storage/src/upsert/types.rs:1062` — `panic!("attempted completion of already completed upsert snapshot")` is the guard for the snapshot-completion state machine. +- `src/storage/src/upsert/types.rs:584-682` — `StateValue::ensure_decoded` finalizes the consolidating state. The `diff_sum ∈ {0, 1}` invariant must hold at completion time. +- `src/storage/src/upsert_continual_feedback.rs` — the continual-feedback variant uses a persist `Listen` to receive feedback values; the same correctness contract applies. + +## How to check it + +Workload procedure: +1. Produce many `(key, value)` and `(key, null)` messages; track `expected_state`. +2. Wait for `offset_committed` to advance past last produced offset. +3. Snapshot `expected_state` and the source's `SELECT * FROM source` content side-by-side; assert equality. +4. Kill clusterd; wait for restart and quiet period. +5. Re-run the comparison: `SELECT * FROM source` must equal the pre-kill snapshot. +6. `assert_always!(upsert_state_rehydrated_correctly, "upsert: rehydrated state equals pre-restart state")`. + +## What goes wrong on violation + +The source comes back with wrong values per key, missing keys, or keys that should be tombstoned but are present. The bug is silent — the source reports healthy and the workload sees plausible-but-wrong data. + +## Antithesis angle + +The interesting window is between the persist sink's `compare_and_append` succeeding for batch N and the upsert operator's *next* snapshot-completion. 
If a crash drops feedback delivery between those two points, the next incarnation's snapshot may see partial state and complete with the wrong tombstone/value mapping. + +Compounded by RocksDB merge operator behavior (commit `0d8d740b47`): if the merge operator interleaves with snapshot completion in a way that drops a tombstone, the rehydrated state diverges. + +## Dependencies + +- Requires node-termination faults enabled. +- Combine with `upsert-state-consolidation-wellformed` (the deeper `ensure_decoded` correctness check) for full coverage of the snapshot path. +- Combine with `kafka-source-no-data-duplication` to rule out the related failure mode where rehydration introduces duplicates rather than wrong values. + +## Existing instrumentation + +None. Candidate SUT anchors: an `assert_sometimes!(upsert_snapshot_completed, "upsert: snapshot phase completed")` at the snapshot-completion call site, and `assert_always!(diff_sum_in_range, …)` mirroring the existing `panic!` in `ensure_decoded`. + +## Implementation status + +Implemented 2026-05-11 (workload-side) as `test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py`. The `singleton_driver_` runs exactly once per timeline and lives across multiple produce/settle/assert cycles, holding `expected_state` in process memory across cycles: + +| Message | Type | Fires when | +|---------|------|------------| +| `"upsert: rehydrated state matches local model (live key)"` | `always` | Per live key, per cycle, after catchup. Cross-cycle stability of `expected` is the rehydration check. | +| `"upsert: rehydrated state matches local model (tombstoned key)"` | `always` | Per tombstoned key, per cycle, after catchup. | +| `"upsert: rehydration driver ran 2+ assertion cycles"` | `sometimes` | Once per invocation; confirms the safety check ran against multiple settle cycles (not just one early cycle that masks rehydration). | +| `"upsert: rehydration driver observed clusterd replica non-online"` | `sometimes` | Best-effort proxy: `mz_internal.mz_cluster_replica_statuses` showed an `antithesis_cluster` replica in a non-`online` status during the run. Not a guarantee that a restart happened, but a noisy yes-signal that something disturbed the cluster. | + +Knobs: `CYCLE_COUNT=8`, `PRODUCES_PER_CYCLE=30`, `DISTINCT_KEYS=6` (small enough that keys are revisited within and across cycles), `TOMBSTONE_PROB=0.20`, `QUIET_PERIOD_S=25`, `CATCHUP_TIMEOUT_S=120`, `INTER_CYCLE_SLEEP_S=2`. + +**Requires node-termination faults enabled** in the Antithesis tenant for the property to be exercised at full strength. Without restarts, the cross-cycle stability check still catches divergence from the operator processing a sequence of upserts/tombstones (i.e., it falls back to a slower version of `upsert-key-reflects-latest-value`). + +SUT-side anchors at the upsert snapshot-completion call sites are deferred and would tighten replay anchoring. + +## Provenance + +Surfaced by: Failure Recovery, Data Integrity. diff --git a/test/antithesis/scratchbook/properties/upsert-tombstone-removes-key.md b/test/antithesis/scratchbook/properties/upsert-tombstone-removes-key.md new file mode 100644 index 0000000000000..50ee185c746f1 --- /dev/null +++ b/test/antithesis/scratchbook/properties/upsert-tombstone-removes-key.md @@ -0,0 +1,47 @@ +# upsert-tombstone-removes-key + +## Summary + +A `(key, null)` tombstone message eventually removes the key from the UPSERT source, and the key stays absent until a non-null value is produced for it. 
+ +## Code paths + +- `src/storage/src/render/sources.rs` — `upsert_commands` maps `None` value → tombstone signal: `(UpsertKey, None, from_time)`. +- `src/storage/src/upsert.rs` — `upsert_classic`: on `None` value with existing prior value, emit retraction at new timestamp and `multi_put(key, tombstone)`. +- `src/storage/src/upsert/types.rs` — `StateValue::tombstone()` constructor; `ensure_decoded` with `diff_sum == 0` produces this state. + +## How to check it + +Workload procedure: +1. Produce `(key, v)` to topic. +2. Wait for source to ingest it; verify row visible. +3. Produce `(key, null)`. +4. After quiet period, `assert_always!(tombstoned_key_absent, "upsert: tombstoned key has no row")` checking that `SELECT count(*) FROM source WHERE key = ?` returns 0. +5. Bonus: kill clusterd, restart, assert the row is still absent (no resurrection). + +## What goes wrong on violation + +A deleted row reappears after restart. Compliance and correctness hazard. The likely cause is the snapshot replay misinterpreting a tombstone consolidating state — the `diff_sum == 0` branch of `ensure_decoded` is what guards this. + +## Antithesis angle + +- Crash between tombstone retraction emit and `multi_put(tombstone)`. The state store is now ahead/behind the persisted output; the snapshot replay on restart is what reconciles. +- Race `(k, v)`, `(k, null)`, `(k, v')` deliveries: every interleaving must end with `v'` visible. +- For the no-resurrection half: produce tombstone, wait for `offset_committed` to advance past its offset, then kill clusterd. On restart, the key must not reappear. + +## Existing instrumentation + +None. Workload-side check. The `StateValue::tombstone` construction path and the `ensure_decoded` tombstone branch are the relevant code; adding `assert_sometimes!(tombstone_emitted, ...)` inside the tombstone-emit path gives a coverage signal. + +## Implementation status + +Implemented 2026-05-11 (workload-side) inside the existing `parallel_driver_upsert_latest_value.py`: + +- Safety half: `always("upsert: tombstoned key has no row in source", ...)` (already existed for `upsert-key-reflects-latest-value`) — fires per key whose latest produced message was a tombstone. +- Path-exercise anchor: new `sometimes("upsert: tombstone overwrote a live value at least once this invocation", ...)`. The driver counts `tombstoned_after_value` — the number of tombstone produces where the immediately-prior produced value for that key was a live value. Without this anchor, the `always` could be vacuously satisfied by tombstones against never-written keys. + +The "no resurrection across restart" half is covered structurally by `upsert-state-rehydrates-correctly`'s cross-cycle stability check, which includes tombstoned keys in its per-key assertion loop (`"upsert: rehydrated state matches local model (tombstoned key)"`). + +## Provenance + +Surfaced by: Data Integrity, Lifecycle Transitions (delete operations). diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md new file mode 100644 index 0000000000000..2c308cf3e2e2b --- /dev/null +++ b/test/antithesis/scratchbook/property-catalog.md @@ -0,0 +1,437 @@ +--- +commit: 007c7af9d9970fb2030c7212368b232e0fbc363e +updated: 2026-05-12 +--- + + +# Property Catalog: Materialize + +## Category 1: Data Integrity Under Faults + +Properties that verify data correctness when crashes, network partitions, and concurrent access interact with the persist layer and catalog.
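+
+The Status cells throughout this catalog reference SUT-side anchors built from the `antithesis_sdk` macros. The following is a minimal, standalone sketch of that pattern (deps: `antithesis_sdk`, `serde_json`), using the fencing invariant of the first entry below; the function, its variable names, and the by-reference `details` argument are illustrative assumptions rather than the actual code in `src/catalog/src/durable/persist.rs`.
+
+```rust
+use antithesis_sdk::{antithesis_init, assert_always_greater_than};
+use serde_json::json;
+
+// Illustrative stand-in for the post-fence epoch bookkeeping. The pattern used
+// throughout this catalog: the Antithesis assertion reports a property failure
+// with structured details, while the pre-existing hard check is retained so
+// non-Antithesis runs still abort loudly.
+fn record_fence_transition(prior_durable_epoch: u64, new_epoch: u64) {
+    let details = json!({
+        "prior_durable_epoch": prior_durable_epoch,
+        "new_epoch": new_epoch,
+    });
+    assert_always_greater_than!(
+        new_epoch,
+        prior_durable_epoch,
+        "catalog fencing: new durable epoch did not strictly increase after fence-token CaS",
+        &details
+    );
+    assert!(
+        new_epoch > prior_durable_epoch,
+        "fencing violated: {new_epoch} <= {prior_durable_epoch}"
+    );
+}
+
+fn main() {
+    antithesis_init();
+    record_fence_transition(6, 7);
+}
+```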
+ +### epoch-fencing-prevents-split-brain — Epoch-Based Fencing Prevents Split-Brain Writes + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P0 — fundamental split-brain prevention; failure here corrupts all state | +| **Status** | **Partially implemented (SUT-side, single-coordinator scope)** — `src/catalog/src/durable/persist.rs`: an `assert_always_greater_than!(new_epoch, prior_durable_epoch, "catalog fencing: new durable epoch did not strictly increase after fence-token CaS", …)` fires after each successful fence-token CaS in `open_inner`. Every environmentd restart in the Antithesis topology exercises this path. **The cross-coordinator half of the property (a `Fenced` writer being correctly rejected at validate time) is NOT exercised today and is not planned.** Materialize does not run multiple concurrent environmentd processes against the same catalog shard in any supported topology, so the `FenceableToken::Fenced` state is unreachable here. The two `assert!` panics in `FenceableToken::validate` would be the natural Antithesis anchor for that half; they are intentionally left as bare panics with an in-source comment pointing back to this entry, to be promoted to `assert_always!` if a 0DT-preflight-style multi-environmentd topology is ever added. | +| **Property** | After a coordinator restart with a higher epoch, the old coordinator (lower epoch) cannot successfully write to the catalog persist shard. | +| **Invariant** | `Always`: once a higher epoch is written to consensus, any compare_and_append from a lower epoch must fail with FenceError. This is a strict safety invariant — every check must hold. | +| **Antithesis Angle** | Network partition separates old coordinator from consensus while new coordinator starts with higher epoch. When partition heals, old coordinator's in-flight writes must be rejected. Antithesis explores the timing window between old coordinator's last successful write and new coordinator's first write. | +| **Why It Matters** | Split-brain writes corrupt the catalog, potentially causing data loss or inconsistent schema state. This is the fundamental distributed safety mechanism. Surfaced by: Distributed Coordination, Failure Recovery. | + +### persist-cas-monotonicity — Persist SeqNo Never Decreases + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P0 — backbone of persist consistency; all other persist properties depend on this | +| **Status** | **Implemented (SUT-side)** — `src/persist-client/src/internal/apply.rs`: alongside the existing `assert_eq!(expected.next(), new_state.seqno(), …)` strict-increment check in `compute_next_state_locked`, an `assert_always_greater_than!(new_state.seqno().0, expected.0, "persist: state seqno did not strictly increase across CaS apply", …)` makes the broader monotonicity invariant a reportable Antithesis property rather than only a process panic. The strict-equality `assert_eq!` is retained so the narrower invariant (next == seqno) still surfaces. The companion rollup-seqno invariant (`state.rs:1324` doc comment) is deferred. | +| **Property** | Persist shard state versions (SeqNo) form a strictly increasing sequence. No writer can observe or apply a lower SeqNo after observing a higher one. | +| **Invariant** | `Always`: for any shard, if SeqNo N is observed, no subsequent observation returns SeqNo < N. Rollups maintain seqno <= seqno_since. This must hold on every check — a single violation means state corruption. | +| **Antithesis Angle** | Partition storage from persist backend mid-write. 
One writer races to increment SeqNo while another caches an old value and retries. Crash during GC/rollup operations. Antithesis explores interleaving of concurrent CaS loops. | +| **Why It Matters** | SeqNo monotonicity is the backbone of persist's consistency model. Violations cause state reconstruction failures and data loss. Surfaced by: Data Integrity, Distributed Coordination. | + +### tombstone-sealing-finality — Tombstoned Shards Are Immutable + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — prevents zombie writes to dropped collections | +| **Property** | Once a shard's upper and since both advance to the empty antichain (tombstone), no new writes, reader registrations, or writer registrations can succeed. The transition is irreversible. | +| **Invariant** | `Always`: after `is_tombstone()` returns true, any append, downgrade_since, or registration attempt must fail. The state machine must never revert from tombstone. | +| **Antithesis Angle** | Crash and restart after tombstone. Fire concurrent write/read attempts while state is being replayed from consensus. Antithesis explores whether recovery code can accidentally un-tombstone a shard. | +| **Why It Matters** | Tombstone finality prevents zombie writes to dropped collections. Violation could resurface deleted data. Surfaced by: Data Integrity. | + +### idempotent-write-under-indeterminate — Compare-and-Append Idempotency on Retry + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — indeterminate errors are the hardest distributed systems edge case | +| **Property** | When compare_and_append receives an Indeterminate error from consensus and retries with the same idempotency token, the shard contains exactly one copy of the write — never zero, never two. | +| **Invariant** | `Always`: after retry with identical IdempotencyToken, the shard's upper reflects exactly one successful write. Duplicate data must never appear in the shard trace. | +| **Antithesis Angle** | Inject network failures on consensus calls mid-flight. Kill writer after batch is queued but before state is committed. Antithesis explores the window between consensus write and acknowledgment. | +| **Why It Matters** | Indeterminate errors are the hardest to handle correctly in distributed systems. Duplication or loss here silently corrupts downstream materialized views. Surfaced by: Data Integrity. | + +### critical-reader-fence-linearization — Critical Reader Opaque Token Linearizes + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — incorrect fencing allows premature GC causing data loss | +| **Property** | When two concurrent critical readers attempt compare_and_downgrade_since with mismatched opaque tokens, exactly one succeeds in updating the shard's since. No reader can re-observe an old opaque value after a SeqNo increment. | +| **Invariant** | `Always`: concurrent compare_and_downgrade_since operations with different opaques result in exactly one mutation. The winner's opaque is durably recorded; the loser gets a mismatch. | +| **Antithesis Angle** | Inject network delays between state check and state commit. Fail CaS operations after token comparison but before state write. Antithesis explores concurrent reader contention. | +| **Why It Matters** | Critical readers control garbage collection boundaries. Incorrect fencing allows premature GC, which deletes data needed by active readers. Surfaced by: Data Integrity. 
| + +## Category 2: Consistency Model Enforcement + +Properties that verify Materialize's strict serializability guarantee and timestamp oracle correctness. + +### strict-serializable-reads — Reads Respect Timestamp Oracle Linearization + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P0 — Materialize's core advertised guarantee; user-visible | +| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py`. Inserts one row per step into `mv_input_table` and, between steps, opens a *fresh* psycopg connection (explicit `SET transaction_isolation TO 'strict serializable'`) to SELECT the rolling-count MV's row for the invocation's prefix. After a quiet-period closing observation, asserts (a) `always("…fresh-connection read regressed across adjacent observations", …)` for every adjacent pair, and (b) `always("…closing fresh-connection read regressed below earlier maximum", …)` for the closing read versus the historical max. One `sometimes("…final fresh-connection read reached inserted count", …)` liveness anchor. The SUT-side oracle-timestamp-non-decreasing mirror in `src/adapter/src/coord/in_memory_oracle.rs` is deferred. | +| **Property** | Two reads on the same collection at timestamps t1 < t2 (assigned by the oracle) must observe consistent ordering: if t1 sees state S, t2 cannot observe a state prior to S. | +| **Invariant** | `Always`: for any two reads where oracle assigns t1 < t2, the result at t2 must include all changes visible at t1. The oracle read timestamp must advance monotonically. | +| **Antithesis Angle** | Run parallel transactions in StrictSerializable mode. One writes, another reads concurrently. Inject delays in oracle timestamp advancement. Antithesis explores whether reads can bypass the linearization point. | +| **Why It Matters** | Strict serializability is Materialize's core advertised guarantee. Users explicitly choose it over eventual consistency. Violation is a correctness bug visible to end users. Surfaced by: Protocol Contracts. | + +### catalog-recovery-consistency — Catalog State Consistent After Crash Recovery + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — catalog corruption on recovery prevents system from starting | +| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py`. Long-running singleton driver holds an in-process `expected_tables` set across cycles. Each cycle runs one CREATE TABLE or DROP TABLE under `execute_retry`, then opens a *fresh* psycopg connection and SELECTs `mz_tables` filtered to the driver's namespace, asserting `always("catalog recovery: live catalog table set matches in-process expected model", …)`. Cross-cycle stability is exactly the recovery check: if an environmentd restart lands between cycles, the next cycle's read is the post-recovery snapshot. Two `sometimes(...)` anchors record (a) "2+ assertion cycles ran" so the post-restart half is exercised, and (b) "observed environmentd connect failure during run" as a corroborating signal that a fault actually landed. The SUT-side upper-non-regression mirror in `sync_to_current_upper` and the consolidation `assert_always!` are deferred. | +| **Property** | After coordinator crash and restart, the recovered catalog state is equivalent to the pre-crash state: upper never decreases, snapshot is consolidated, and all committed transactions are visible. | +| **Invariant** | `Always`: upper(post_restart) >= upper(pre_crash). 
After sync_to_current_upper(), the snapshot contains no unconsolidated entries (all diffs resolved). | +| **Antithesis Angle** | Crash coordinator during catalog_transact (after some updates persist but before upper advances). Crash during consolidation. Antithesis explores the timing of crashes within the catalog write path. | +| **Why It Matters** | Catalog inconsistency after recovery can cause schema corruption, lost DDL, or inability to restart. Surfaced by: Failure Recovery. | + +## Category 3: Compute and Storage Recovery + +Properties that verify correct behavior during and after process crashes in the compute and storage layers. + +### compute-replica-epoch-isolation — Stale Replica Commands Rejected After Rehydration + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — stale commands cause compute divergence and wrong query results | +| **Property** | Each compute replica incarnation has a unique epoch (nonce + u64). After rehydration with epoch N+1, no commands from epoch N can execute or affect dataflow state. | +| **Invariant** | `Always`: once a command with epoch N+1 is processed, all epoch N commands are dropped. The epoch forms a strict ordering on replica incarnations. | +| **Antithesis Angle** | Kill compute replica mid-dataflow. Controller rehydrates with new epoch. In-flight commands from the old epoch leak back due to network buffering. Antithesis explores whether stale commands can sneak past the epoch check. | +| **Why It Matters** | Stale command execution causes compute replicas to diverge from the coordinator's expected state, potentially returning wrong query results. Surfaced by: Distributed Coordination. | + +### storage-command-replay-idempotent — Storage Command History Replay Is Idempotent + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — non-idempotent replay causes data duplication in all downstream MVs | +| **Property** | When a storage replica reconnects, the controller replays command history from the last frontier. Replaying the same commands twice yields identical state — no duplicated ingestion or state divergence. | +| **Invariant** | `Always`: apply(history[0:i]) + apply(history[0:i]) == apply(history[0:i]). Source ingestion positions must resume from persisted offsets, not restart from zero. | +| **Antithesis Angle** | Crash storage controller mid-send of RunIngestionCommand. Restart and replay history. Antithesis explores whether partial command delivery causes duplicate ingestion. | +| **Why It Matters** | Non-idempotent replay causes duplicate data in sources, which propagates to all downstream materialized views. Surfaced by: Failure Recovery. | + +## Category 4: Concurrency and Race Conditions + +Properties that verify correctness under concurrent access patterns within the coordinator. + +### group-commit-toctou-safety — No Phantom Writes to Deleted Tables + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — phantom writes corrupt catalog; TOCTOU explicitly acknowledged in code | +| **Property** | If a table is deleted between when a write is deferred and when group_commit executes, the write is silently dropped — not persisted. No phantom writes to non-existent tables. | +| **Invariant** | `Always`: if catalog.try_get_entry(table_id) returns None at group_commit time, the write's updates are not included in the committed batch. | +| **Antithesis Angle** | Concurrent table deletion + write operations. 
Antithesis delays between deferred write queuing and group_commit catalog check, exposing the TOCTOU window where the table ceases to exist between validation and execution. | +| **Why It Matters** | Phantom writes to deleted tables corrupt the catalog or cause panics during downstream processing. The explicit TOCTOU check in appends.rs:479-486 acknowledges this risk. Surfaced by: Concurrency. | + +### peek-lifecycle-exactly-once — Each Peek Gets Exactly One Response + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — leaked peeks cause OOM; explicit 1:1 contract documented | +| **Property** | For each peek command sent to compute, exactly one PeekResponse is delivered to the client — no duplicates, no missing responses, no orphaned pending_peeks entries. | +| **Invariant** | `Always`: count(peek_commands) == count(peek_responses) with bijective UUID mapping. When CancelPendingPeeks races with PeekNotification, exactly one of (canceled, completed) occurs — never both, never neither. | +| **Antithesis Angle** | Trigger replica failures mid-peek. Race cancel requests with response delivery. Antithesis explores the two-map removal sequence (client_pending_peeks + pending_peeks) that is not atomic. | +| **Why It Matters** | Leaked peeks cause memory growth and eventually OOM. Duplicate responses confuse clients. The 1:1 contract is explicitly documented in peek.rs:80-95. Surfaced by: Protocol Contracts, Concurrency. | + +### command-channel-ordering — Timely Workers See Commands in Identical Order + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P2 — code explicitly acknowledges ordering is unguaranteed; hard to trigger | +| **Property** | CreateDataflow commands broadcast through the command channel execute in identical order across all Timely workers — no reordering. | +| **Invariant** | `Always`: for any two workers W1 and W2, if W1 sees command A before B, W2 also sees A before B. Code comment at command_channel.rs:88-90 explicitly notes this relies on "Timely channels preserving order of inputs, which is not something they guarantee." | +| **Antithesis Angle** | Inject timing delays in the source operator between command channel invocations. Stress the sync_activator bridge between sync and async contexts. Antithesis explores whether worker scheduling variations cause reordering. | +| **Why It Matters** | Command reordering causes workers to diverge, producing inconsistent dataflow results. The code explicitly acknowledges this is unguaranteed. Surfaced by: Concurrency. | + +## Category 5: Lifecycle Transitions + +Properties about 0DT deployment, startup, and shutdown correctness. + +### deployment-promotion-safety — 0DT Promotion Only After Full Catchup + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P2 — relevant for cloud deployments; requires multi-coordinator setup | +| **Property** | During 0DT deployment, the new coordinator transitions to ReadyToPromote only after catalog is loaded, caught-up checks pass, and all replica frontiers have advanced past the required threshold. Promotion with stale replicas is prevented. | +| **Invariant** | `Always`: at the moment set_ready_to_promote() is called, all collections tracked by caught_up checks have frontiers >= the cutoff threshold. The deployment generation fence prevents the old coordinator from writing after promotion. | +| **Antithesis Angle** | Trigger preflight concurrently with replica startup. Crash replicas during catchup. 
Antithesis explores whether the caught_up check can pass while a replica is still lagging or crash-looping. | +| **Why It Matters** | Premature promotion causes the new coordinator to serve stale data or fail to serve at all. This is the primary risk in zero-downtime deployments. Surfaced by: Lifecycle, Distributed Coordination. | + +### deployment-lag-detection — Caught-Up Check Detects Stuck Replicas + +| | | +|---|---| +| **Type** | Liveness | +| **Priority** | P2 — companion to deployment-promotion-safety; requires 0DT setup | +| **Property** | During 0DT catchup, maybe_check_caught_up() eventually detects replicas that are lagging beyond configured thresholds or crash-looping, and prevents promotion until resolved. | +| **Invariant** | `Sometimes(lagging_replica_detected)`: Antithesis should observe at least one scenario where a lagging/crashing replica is detected and promotion is blocked. This is a liveness property — the detection must eventually happen. | +| **Antithesis Angle** | Inject replica crashes during catchup phase. Verify the analyze_replica_looping() function identifies the problem via mz_cluster_replica_status_history. | +| **Why It Matters** | Undetected stuck replicas during 0DT deployment lead to silent data staleness in production. Surfaced by: Lifecycle. | + +## Category 6: Reachability and Coverage + +Properties that verify the system reaches interesting states under fault injection. + +### fault-recovery-exercised — System Recovers from Coordinator Crash + +| | | +|---|---| +| **Type** | Liveness | +| **Priority** | P0 — most fundamental operational property; prerequisite for all others | +| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/anytime_fault_recovery_exercised.py`. Anytime driver probes `SELECT 1` with a short connect timeout (bypassing helper_pg's retry budget so the fault-active window is observable) and records `sometimes("...succeeded after a previously-observed connect failure", …)` for the recovery transition, plus corroborating `sometimes` anchors for "observed replica non-online" and "at least one probe succeeded this invocation". | +| **Property** | After the coordinator (environmentd) crashes and restarts, the system eventually becomes healthy (readiness endpoint returns 200) and can serve SQL queries. | +| **Invariant** | `Sometimes(healthy_after_crash)`: the system must reach a state where it can serve queries after a crash. This confirms recovery works end-to-end, not just in unit tests. | +| **Antithesis Angle** | Kill environmentd at various points during operation. Verify it restarts, reconnects to persist, recovers catalog, and serves queries. Antithesis explores crash timing — during DDL, during peek, during group_commit. | +| **Why It Matters** | Recovery is the most critical operational property. If it doesn't work, nothing else matters. Surfaced by: Failure Recovery. | + +### source-ingestion-progress — Kafka Source Eventually Catches Up + +| | | +|---|---| +| **Type** | Liveness | +| **Priority** | P2 — important but requires Kafka/Redpanda in topology | +| **Property** | After creating a Kafka source, Materialize eventually ingests all available data and the source's write frontier advances past the data's timestamps. | +| **Invariant** | `Sometimes(source_frontier_advances)`: the source's upper antichain must advance at least once during the test run, confirming data is flowing through the ingestion pipeline. 
| +| **Antithesis Angle** | Create a Kafka source, produce messages, then inject network faults between Materialize and Redpanda. Verify the source eventually catches up when connectivity is restored. | +| **Why It Matters** | Source ingestion is the primary data path. If it stalls, all downstream materialized views stop updating. Surfaced by: Product Context. | + +### mv-reflects-source-updates — Materialized Views Eventually Reflect Source Changes + +| | | +|---|---| +| **Type** | Liveness | +| **Priority** | P1 — end-to-end user-visible correctness; Materialize's core value | +| **Status** | **Implemented (workload-side, table-backed)** — `test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py` + `helper_table_mv.py`. Each invocation inserts N rows tagged with a per-invocation prefix into `mv_input_table`, polls the rolling-count MV `mv_input_count` after a quiet period, and pairs `sometimes("mv: row_count caught up …", …)` (liveness anchor) with `always("mv: row_count equals inserted count …", …)` (safety on the settled count). Kafka-source-backed MV is covered indirectly by the Kafka-source drivers — direct MV-on-Kafka-source coverage is deferred. | +| **Property** | After data is written to a source, materialized views that depend on that source eventually reflect the new data. | +| **Invariant** | `Sometimes(mv_contains_new_data)`: after inserting data into a table or producing to a Kafka source, a SELECT on a dependent materialized view must eventually return the new data. | +| **Antithesis Angle** | Insert data, inject faults (compute replica crash, storage reconnection), then verify the MV eventually shows the data. Antithesis explores whether faults during the incremental update pipeline cause permanent stalls. | +| **Why It Matters** | This is the end-to-end user-visible correctness property. Materialize's value proposition is that MVs are always up-to-date. Surfaced by: Product Context. | + +## Category 7: Kafka Source Ingestion (Append-Only + UPSERT) + +Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader` → `ReclockOperator` → optional decode/UPSERT → `persist_sink`. Both envelopes are covered, with shared properties for reclocking and source-frontier behavior. Workload-level checks compare produced Kafka records against what a SQL `SELECT` over the source returns; SUT-side checks live in the source/upsert/reclock operators. + +### kafka-source-no-data-loss — Every Produced Record Is Eventually Visible + +| | | +|---|---| +| **Type** | Liveness | +| **Priority** | P0 — primary user-visible contract; "data is in Kafka but not in Materialize" is the worst possible streaming bug | +| **Status** | **Implemented (workload-side, NONE envelope)** — `test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py`. Per-payload `always("kafka source: every produced payload is visible exactly once", …)` joined to a quiet-period catchup wait. UPSERT-envelope version is covered by `upsert-key-reflects-latest-value`. The SUT-side `assert_sometimes!(persist_sink_appended_batch)` anchor in `append_batches` is deferred. | +| **Property** | After producing a message to a Kafka topic, the Materialize source over that topic eventually contains a row corresponding to that message (NONE envelope) or a row reflecting the latest value for that key (UPSERT envelope). 
| +| **Invariant** | `Sometimes(all_produced_records_visible)`: at least once during a run, after a quiet period, the workload observes `COUNT(*) FROM source` >= number of produced records (NONE) or every produced (key, value) pair is reflected in the source state (UPSERT). Liveness, so `Sometimes` on the catch-up event. | +| **Antithesis Angle** | Network partitions between Materialize and Kafka, clusterd kills mid-ingestion, persist write retries, and rebalances. The interesting timing is the *crash mid-batch* window: some offsets are in persist, some are not, and the resume frontier determines what we re-read. Antithesis explores whether the re-read covers exactly the missing offsets. | +| **Why It Matters** | This is the headline guarantee of a streaming database. A bug here is silent data loss visible to every user of the source. Supersedes the more generic `source-ingestion-progress` for Kafka specifically. | + +### kafka-source-no-data-duplication — No Record Appears Twice After Settling + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P0 — silent duplication corrupts every aggregate downstream MV | +| **Status** | **Implemented (workload-side, NONE envelope)** — `test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py`. `always("kafka source: no duplicate (partition, offset)", …)` over a `GROUP BY partition, "offset" HAVING COUNT(*) > 1` query scoped to the invocation's prefix; carries up to five offending rows in `details`. UPSERT-envelope version is covered indirectly by `upsert-key-reflects-latest-value` (per-key uniqueness assertion) and directly by the SUT-side `assert_always!(diff.is_positive(), …)` of `upsert-no-internal-panic`. | +| **Property** | After settling, the NONE-envelope source contains at most one row per `(partition, offset)` tuple; the UPSERT-envelope source contains at most one row per key. | +| **Invariant** | `Always`: `SELECT partition, "offset", COUNT(*) FROM source GROUP BY 1,2 HAVING COUNT(*) > 1` returns no rows for NONE; `SELECT key, COUNT(*) FROM source GROUP BY 1 HAVING COUNT(*) > 1` returns no rows for UPSERT. Checked on every assertion firing — must hold on every observation. | +| **Antithesis Angle** | Reader crashes between persist-sink batch write and `compare_and_append`; rehydration re-reads offsets we already wrote. The protection lives in `last_offsets` filtering (kafka.rs:1158) but only for the *current* incarnation — across restart, idempotency depends on the persist sink and (for UPSERT) the feedback-driven snapshot. Antithesis explores crash/restart timing across batch boundaries. Direct regression target for upsert double-retraction bug (commit 1accbe28b3, database-issues#9160). | +| **Why It Matters** | Duplicate rows in the source flow into every downstream materialized view's aggregates and joins. Silent and devastating. | + +### kafka-source-frontier-monotonic — Source Persist Shard Upper Never Regresses + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — frontier regression panics downstream operators and breaks `AS OF` queries | +| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py`. Continuous `anytime_` driver polls `mz_internal.mz_source_statistics.offset_committed` for every known Kafka source every 500ms and asserts `always("kafka: source offset_committed non-monotonic", details)` whenever a new sample is less than the previous one. Faults are active throughout. 
SUT-side `assert_always!(new_upper >= prev_upper, ...)` in `append_batches` is deferred. | +| **Property** | The `upper` frontier of the source's data persist shard never regresses across the lifetime of the source, including across clusterd restarts and `compare_and_append` retries. | +| **Invariant** | `Always`: observed `upper(t2) >= upper(t1)` for any observation order `t1 < t2`. Checked on every observation in a workload polling loop, and ideally also as a SUT-side `assert_always!` next to the persist sink's `compare_and_append`. | +| **Antithesis Angle** | Kill clusterd mid-`compare_and_append`; resume the source with a stale cached upper; concurrent reclock and persist-sink writers. Direct regression target for the `as_of`/reclock-upper race (commit e3805ad790, database-issues#8698) and the persist-sink cached upper bug (commit 505dc96aaa). | +| **Why It Matters** | Frontier regression manifests as panics (`as_of > upper`) or as observably incorrect AS OF queries. Documented invariant for persist. | + +### kafka-source-survives-broker-fault — Source Resumes After Broker Connectivity Restored + +| | | +|---|---| +| **Type** | Liveness | +| **Priority** | P1 — operational expectation; broker faults are a routine condition | +| **Status** | **Implemented (workload-side, shared driver)** — `test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py`. Continuous polling state machine per Kafka source: `OBSERVING` -> `STALLED` after N consecutive identical `offset_committed` samples, then `Reachable("...resumed advancing after a sustained stall", …)` on the first strictly-greater sample. The driver tags each recovery with `saw_kafka_metadata_failure` (broker-fault signal) and `saw_replica_non_online` (clusterd-restart signal) so triage can distinguish the two fault classes. | +| **Property** | After a transient network partition or Kafka broker outage that prevents the source from making progress, once connectivity is restored, the source eventually ingests all messages that were produced during the outage. | +| **Invariant** | `Sometimes(source_resumes_after_broker_fault)`: at least once per run, after injecting a network fault between materialized and Kafka and then calling `ANTITHESIS_STOP_FAULTS`, the workload observes the source's `COUNT(*)` advance past its pre-fault value. | +| **Antithesis Angle** | Network partition between the `materialized` container and the Kafka container; persist+metadata stay reachable. Tests rdkafka reconnect, snapshot statistics restoration (commit 0a34b6c79d), and that no permanent stall mode is entered. | +| **Why It Matters** | Cloud streaming setups routinely see transient Kafka unavailability. A source that gets stuck and never recovers is an outage. | + +### kafka-source-survives-clusterd-restart — Source Resumes After clusterd Crash + +| | | +|---|---| +| **Type** | Liveness | +| **Priority** | P1 — recovery from clusterd kill is the most common operational fault path | +| **Status** | **Implemented (workload-side, shared driver)** — same `test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py` as `kafka-source-survives-broker-fault`. The stall-then-advance transition is fault-kind-agnostic; `saw_replica_non_online` corroborates that the source recovered specifically from a clusterd kill. Combines with the existing `kafka-source-no-data-duplication` and `kafka-source-no-data-loss` assertions to also rule out double-counting and gaps on the rehydrated path. 
Requires node-termination faults to be enabled in the Antithesis tenant. | +| **Property** | After clusterd (storage worker) is killed and restarted, the Kafka source recovers, replays the right resume offsets, and ingests messages produced before, during, and after the restart. | +| **Invariant** | `Sometimes(source_recovered_after_clusterd_restart)`: after a kill+restart, eventually `COUNT(*) FROM source >= produced_count`. Combined with `kafka-source-no-data-duplication` to also rule out double-counting. | +| **Antithesis Angle** | Direct test of the `storage-command-replay-idempotent` mechanism end-to-end through Kafka. Antithesis explores crash timing across the reclock mint, persist-sink append, and upsert snapshot-completion windows. Requires node-termination faults to be enabled. | +| **Why It Matters** | This is the recovery contract the storage controller is built around. Failure here makes every higher-level property meaningless. | + +### upsert-key-reflects-latest-value — UPSERT Source Reflects Latest Value Per Key + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P0 — the entire user-visible promise of the UPSERT envelope | +| **Status** | **Implemented** (workload-side) — `test/antithesis/workload/test/parallel_driver_upsert_latest_value.py`. Two `always()` assertions ("upsert: SELECT for key matches latest produced value", "upsert: tombstoned key has no row in source") plus one `sometimes()` liveness anchor ("upsert: source caught up to produced offsets after quiet period"). | +| **Property** | At a settled timestamp, for each key produced by the workload, the UPSERT source contains exactly the value from the last `(key, value)` message produced — or no row if the last message for that key was a tombstone. | +| **Invariant** | `Always`: for every workload-tracked key, `SELECT value FROM source WHERE key = ?` returns the expected value (or empty for tombstoned keys), as determined by the workload's local model of what it produced. Checked after `ANTITHESIS_STOP_FAULTS` quiet periods. | +| **Antithesis Angle** | Reorder produce timing, kill clusterd between the prior-value lookup (`multi_get`) and the new-value write (`multi_put`), inject delays in the feedback-driven snapshot phase. Tests order-key monotonicity (commit f177db8286), state-backend consistency, and snapshot-completion correctness. | +| **Why It Matters** | UPSERT semantics — "the source mirrors the upstream key/value store" — is the reason customers pick this envelope. Wrong value per key is silent corruption that flows into all downstream MVs. | + +### upsert-tombstone-removes-key — Tombstone Eventually Removes the Key + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — delete semantics are routinely relied on for GDPR/correctness | +| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/parallel_driver_upsert_latest_value.py`. The existing `always("upsert: tombstoned key has no row in source", ...)` covers the safety half; a new `sometimes("upsert: tombstone overwrote a live value at least once this invocation", ...)` confirms the *interesting* tombstone path (tombstone replacing a live value) is exercised rather than the trivial "tombstone a never-written key" case. | +| **Property** | After producing a `(key, null)` tombstone message to the Kafka topic, the UPSERT source eventually contains no row for that key, and the row stays absent until a new non-null value is produced. 
| +| **Invariant** | `Always`: at any settled observation after the tombstone has been ingested (resume_upper > tombstone offset), `SELECT * FROM source WHERE key = ?` returns 0 rows. The "no resurrection" half is also `Always`: a key that has been tombstoned and not re-inserted must not reappear after a clusterd restart or rehydration cycle. | +| **Antithesis Angle** | Race the tombstone against a state-store snapshot completion. Crash clusterd between persist sink writing the retraction and the upsert state recording the tombstone. The `StateValue::Value` -> tombstone path in `upsert/types.rs` is the relevant code; bugs here look like resurrected rows. | +| **Why It Matters** | A "deleted" row reappearing is both a correctness bug and a compliance hazard. | + +### upsert-state-rehydrates-correctly — UPSERT State Reconstructs Exactly After Restart + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — incorrect rehydration produces wrong-but-plausible-looking output | +| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py`. Long-running `singleton_driver_` runs N produce→settle→assert cycles holding `expected_state` in process memory. Cross-cycle stability is the rehydration check: if a clusterd restart lands between cycles, the next cycle's `always("upsert: rehydrated state matches local model (live key|tombstoned key)", ...)` verifies the rebuilt source matches the pre-restart model. Requires node-termination faults enabled. | +| **Property** | After a clusterd restart, the rehydrated upsert state, as observed via `SELECT * FROM source`, equals the state at the most recent durable timestamp before the restart, for every key produced so far. | +| **Invariant** | `Always`: after a kill+restart quiet period, the workload's local key/value model matches the source's contents for every key whose latest message has `offset <= resume_upper`. Combines with `kafka-source-no-data-duplication` (no double inserts on rehydration) and `upsert-key-reflects-latest-value` (correct value per key). | +| **Antithesis Angle** | The interesting window is between `compare_and_append` of the persist sink and the upsert operator's feedback-driven snapshot completion. If the feedback replay deduplication is wrong, rehydrated state diverges from durable state. Direct regression target for the upsert snapshot-completion logic in `upsert/types.rs` and `upsert_continual_feedback*`. | +| **Why It Matters** | Wrong rehydration is silent — the source comes up "healthy" and serves bad data. Hardest class of bug to detect in production. | + +### upsert-decode-error-retractable — Bad Value Errors Are Retracted By Subsequent Good Value + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P2 — documented contract; supports operational "fix the bad message and continue" recovery | +| **Property** | When a Kafka message decoding produces an `UpsertError::Value` (or `UpsertError::KeyDecode` or `UpsertError::NullKey`) for a key, and a subsequent message produces a valid `(key, value)` pair for the same key, the source state for that key transitions from "row containing error" to "row containing the new value" — i.e. the error is retracted. | +| **Invariant** | `Always`: at a settled timestamp after the corrective message has been ingested, `SELECT * FROM source WHERE key = ?` returns the corrected value with no remaining error row. 
Note this is the *upsert*-specific retractability (`EnvelopeError::Upsert(..)`); `EnvelopeError::Flat(..)` is explicitly non-retractable. | +| **Antithesis Angle** | Produce an undecodable value, then a good value for the same key, while injecting delays between the two. Race against snapshot completion (errored value during snapshot vs. corrected value post-snapshot). | +| **Why It Matters** | Encoded as the operational contract by which users recover from upstream schema mistakes without dropping the source. Code in `upsert_commands` (render/sources.rs) and `upsert.rs` is the relevant path. | + +### upsert-no-internal-panic — Upsert Operator's Internal Asserts Never Fire + +| | | +|---|---| +| **Type** | Reachability (Unreachable) | +| **Priority** | P1 — these panics are explicit "should-never-happen" guards that bug history has hit | +| **Status** | **Implemented (SUT-side, reachable sites only)** — every targeted *reachable* site has a uniquely-messaged `assert_always!`/`assert_unreachable!` paired with the original `panic!`/`assert!`: `upsert_continual_feedback.rs` (stash diff-positive, `commands_state` missing key), `upsert_continual_feedback_v2.rs` (input diff-positive, `(None, None)` join), and `upsert/types.rs` (`merge_update_state` non-Consolidating, double snapshot completion). The mirror sites in `src/storage/src/upsert.rs` (classic) were dropped: `upsert_operator` hard-codes `use_continual_feedback_upsert = true` (commit a63d1763e5, Feb 2025), so the classic-upsert code is provably unreachable in supported configurations and Antithesis-instrumenting it added dead-weight assertions. Panics still terminate the process; Antithesis receives a reportable property failure with rich details for every reachable site. | +| **Property** | The explicit panics and `assert!`s in the upsert operator never fire under any Antithesis-injected fault sequence. Specifically (reachable sites): `assert!(diff.is_positive(), "invalid upsert input")` (upsert_continual_feedback.rs:626, v2:315); `panic!("key missing from commands_state")` (upsert_continual_feedback.rs:800); `unreachable!()` for `(None, None)` in continual-feedback v2 (v2:483); the order-key panic that used to live in `drain_staged_input` (now a skip; commit f177db8286). | +| **Invariant** | `Unreachable`: each of these sites is converted to an Antithesis `assert_unreachable!("…")` (or `assert_always!(false, …)`) so that any firing produces an explicit Antithesis property failure rather than a process crash. Distinct, unique message per site. | +| **Antithesis Angle** | These are the high-signal SUT-side anchors. They catch the same family of bugs that historically reached production: order-key regression, missing dedup entry, retraction-on-input. Adding them costs almost nothing in the SUT and gives Antithesis precise replay anchors. | +| **Why It Matters** | These panics indicate the operator entered an internal state its author thought was impossible. Past bugs (commits f177db8286, 1accbe28b3) reached production exactly through these paths. The asserts already exist; we just need to wrap them with the Antithesis SDK so the failures become reportable properties rather than process kills. | + +### upsert-state-consolidation-wellformed — `ensure_decoded` Resolves To `diff_sum ∈ {0, 1}` With Matching Checksums + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P0 — directly guards upsert state-store data integrity; catches XOR/checksum corruption | +| **Status** | **Implemented (SUT-side)** — `src/storage/src/upsert/types.rs`. 
Five `assert_always!` calls inside `ensure_decoded` covering the `diff_sum == 1` checksum match, the three `diff_sum == 0` zero-residue checks, and the `diff_sum ∉ {0,1}` impossible-state path. Each carries the consolidating state's diagnostic in `details`. | +| **Property** | When the upsert state backend's `StateValue::ensure_decoded` finalizes a `Consolidating` cell into either a live `Value` or a `tombstone`, the consolidating accumulator is well-formed: `diff_sum ∈ {0, 1}`; if `diff_sum == 1` the recovered bytes match the recorded `len_sum` and `checksum_sum` (seahash of `value_xor[..len_sum]`); if `diff_sum == 0` then `len_sum == 0`, `checksum_sum == 0`, and every byte of `value_xor` is zero. | +| **Invariant** | `Always`: the `panic!("invalid upsert state: non 0/1 diff_sum: …")` at `upsert/types.rs:672` becomes an `assert_always!(false, "upsert: non 0/1 diff_sum")` with a unique message. The intermediate `assert_eq!`s at :621, :632, :637 and the `assert!` at :642 are likewise upgraded to `assert_always!` so they report rather than crash. Each site gets a distinct, specific message. | +| **Antithesis Angle** | The consolidating state collapses many `(diff, bytes)` updates per key into running `diff_sum`, `len_sum`, `checksum_sum`, and an XOR-merged `value_xor` blob. The invariant relies on (a) every retraction being paired with an identical insertion in the snapshot stream, and (b) the snapshot completion contract delivering exactly the durable state at the resume frontier. Antithesis explores: crash mid-snapshot-replay, RocksDB merge operator interleaved with multi_put, partial feedback delivery across restart, and (most subtly) duplicated retractions from multi-replica drain (commit 1accbe28b3). Any of these can break the XOR cancellation and trip a non-{0,1} diff_sum. | +| **Why It Matters** | This is the deepest "the math broke" guard in the upsert pipeline. A trip here means either the feedback stream replayed wrong contents or a duplicate retraction snuck through. The existing panic already dumps a rich diagnostic — wrapping it as an Antithesis assertion turns it into a reportable, replayable property failure rather than a process abort. | + +### upsert-ensure-decoded-called-before-access — Consolidating State Is Always Decoded Before Use + +| | | +|---|---| +| **Type** | Reachability (Unreachable) | +| **Priority** | P2 — type-state protocol invariant; high-signal as a replay anchor | +| **Status** | **Implemented (SUT-side)** — `src/storage/src/upsert/types.rs`. Six `assert_unreachable!` calls, one per accessor (`into_decoded`, `into_provisional_value`, `into_provisional_tombstone`, `provisional_order`, `provisional_value_ref`, `into_finalized_value`), each with a distinct message naming the accessor. Original `panic!` preserved after the assertion. | +| **Property** | Every accessor on `StateValue` that requires the cell to be in `Value` form is preceded by a call to `ensure_decoded` for that cell. The six accessor panics — `into_decoded` (297), `into_provisional_value` (369), `into_provisional_tombstone` (403), `provisional_order` (416), `provisional_value_ref` (430), `into_finalized_value` (440) — never fire. | +| **Invariant** | `Unreachable`: each `panic!("called \`...\` without calling \`ensure_decoded\`")` site is converted to a distinct `assert_unreachable!("upsert: on Consolidating")`. Six unique assertion messages, one per accessor, so an Antithesis report distinguishes which contract was violated. 
These are pure protocol-misuse guards — they cannot fire in valid execution. | +| **Antithesis Angle** | These panics are most likely to fire after a code change to the upsert operator (e.g. a new code path that forgets `ensure_decoded` before reading `provisional_value`). Antithesis exercises every operator branch under fault injection; turning these into reachability assertions gives a cheap regression-detection net for future refactors of `upsert.rs` / `upsert_continual_feedback*.rs`. They are also useful replay anchors — if Antithesis ever does reach them, the bug is reproducible. | +| **Why It Matters** | These guard a type-state contract that is currently enforced only at runtime. The cost of instrumenting them is essentially zero (rename `panic!` to `assert_unreachable!`), and the upside is that any future violation surfaces as a property failure that can be replayed deterministically. | + +### kafka-source-no-internal-panic — Kafka Source Reader's Explicit Panics Never Fire + +| | | +|---|---| +| **Type** | Reachability (Unreachable) | +| **Priority** | P1 — direct regression target for topic-recreation and offset-handling bugs | +| **Status** | **Implemented (SUT-side, production sites)** — `src/storage/src/source/kafka.rs` covers the four production panic/assert sites (`unexpected source export details`, `partition_consumers not drained at shutdown`, `partition missing from last_offsets`, `negative offset from non-error message`); `src/storage/src/source/reclock/compat.rs` covers `compare_and_append InvalidUsage`. The remaining `expect()` sites on resume-upper / statistics / offset arithmetic are deferred to a follow-up; they would be a wide mechanical conversion to soft assertions rather than reportable properties. | +| **Property** | The explicit panics in `kafka.rs` never fire: `panic!("got negative offset (...)")` (kafka.rs:1193); `panic!("unexpected source export details: ...")` (kafka.rs:276); the `assert!(self.last_offsets[output][partition])` (kafka.rs:1142); plus the `expect()` sites on resume-upper / statistics / offset arithmetic. | +| **Invariant** | `Unreachable`: each site converted to a unique `assert_unreachable!("kafka: ")`. The "negative offset" panic in particular is a known structural-invariant violation that has fired before. | +| **Antithesis Angle** | Topic deletion + recreation, partition rebalancing, manual offset reset on the Kafka broker, clock jumps that interact with Kafka's internal offset arithmetic. Direct regression target for commit 99ad668af5 (capability downgrade on topic recreation). | +| **Why It Matters** | A panic in the source reader takes down the storage worker. Replacing the panic with an Antithesis assertion gives a *reportable* failure rather than a crash that masks itself as "clusterd was restarted." | + +### remap-shard-antichain-wellformed — Remap Shard Accumulates To Well-Formed Antichain + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — load-bearing invariant for reclock correctness; explicitly stated in source doc comment | +| **Property** | At every Materialize timestamp `t`, the remap shard's contents accumulated to `t` form a well-formed `Antichain`: each source-time element has frequency exactly 1, the antichain is not empty if any source data has been bound, and (under multi-partition source) there is one element per partition range with no overlaps. 
| +| **Invariant** | `Always`: enforced as an `assert_always!` inside `ReclockOperator::mint`/`sync` after every state update — that's where the doc comment promises the invariant (reclock.rs:31-34). Workload-level approximation: a periodic SQL query that joins source/remap progress with computed offsets and verifies one-to-one. | +| **Antithesis Angle** | Concurrent reclock writers (across restart), partition adds/removes between mints, `compare_and_append` retries that interleave with metadata refresh. The remap shard is the only place where source-time → into-time is durably recorded; a malformed antichain corrupts every subsequent restart's resume frontier. | +| **Why It Matters** | This is the foundational reclock invariant. Violation here breaks recovery (resume_upper computed wrong), `AS OF` semantics, and the upsert operator's snapshot phase. | + +### reclock-mint-eventually-succeeds — Reclock Mint Completes Despite CaS Retries + +| | | +|---|---| +| **Type** | Liveness | +| **Priority** | P2 — pre-existing concern under persist instability | +| **Status** | **Implemented (SUT-side anchor)** — `src/storage/src/source/reclock.rs`: `ReclockOperator::mint` carries a local `cas_retry_count` and fires `assert_reachable!("reclock: mint completed after at least one compare_and_append UpperMismatch", …)` after the while-loop terminates when at least one `UpperMismatch` was observed. The reachability anchor covers the "retry path was exercised AND mint terminated" half of the property. The workload-side "source frontier advanced past the contention point" liveness check is approximated by the existing `anytime_kafka_frontier_monotonic.py` + `anytime_kafka_source_resumes_after_fault.py` drivers and is not duplicated here. | +| **Property** | Under transient persist outages or competing writers, the reclock mint loop (`compare_and_append` with `UpperMismatch` retry, reclock.rs:160-166) eventually completes for every source-frontier advance that has data to bind. | +| **Invariant** | `Sometimes(mint_completed_after_cas_retry)`: at least once per run, Antithesis observes a reclock mint that took >1 CaS attempt and then completed (i.e. a successful retry path was exercised). Critically, the workload should also observe that the source frontier eventually advances past the value of `source_upper` captured at the time of the contention — i.e. the loop is not livelocked. | +| **Antithesis Angle** | Inject persist consensus latency, kill+restart concurrently to create a competing writer, race the metadata fetcher's partition-add against a mint that is already in flight. The retry loop in `mint()` has no upper bound; this property confirms it is not livelocked even under adversarial schedules. | +| **Why It Matters** | A livelocked mint loop manifests as a source that never advances its frontier — externally indistinguishable from a stalled Kafka consumer, but caused inside Materialize. | + +## Category 8: MySQL CDC Source + +Properties specific to Materialize's MySQL CDC source pipeline, which reads +from a multithreaded MySQL replica. The topology adds a MySQL primary (GTID + +WRITESET dependency tracking) and a MySQL replica (4 parallel workers, +commit-order preservation) to the Antithesis environment. 
+ +### mysql-source-no-data-loss — Every Row Written to MySQL Primary Is Eventually Visible + +| | | +|---|---| +| **Type** | Liveness + Safety | +| **Priority** | P1 — end-to-end correctness of the MySQL CDC pipeline; tests a distinct code path from Kafka | +| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/parallel_driver_mysql_cdc.py` + `first_mysql_replica_setup.py`. Each `parallel_driver_` invocation inserts 20 rows to MySQL primary, waits for a quiet period, then polls `antithesis_cdc` until all rows appear (or 90 s budget expires). `always("mysql: CDC source row has correct value after catchup", …)` and `always("mysql: CDC source row count matches inserted count after catchup", …)` fire per-row and per-batch after confirmed catchup; `sometimes("mysql: CDC source caught up to all primary inserts after quiet period", …)` is the liveness anchor. The `first_mysql_replica_setup.py` creates the MySQL schema, configures multithreaded replication (4 workers, `replica_preserve_commit_order=ON`), and creates the Materialize connection/source/table, firing `reachable("mysql: first-run setup complete …")` as a coverage anchor. | +| **Property** | After inserting a row to the MySQL primary (via the binlog + GTID-based multithreaded replica), the Materialize CDC source eventually contains that row with the correct value. | +| **Invariant** | `Always`: after catchup, for every row inserted to `antithesis.cdc_test` on the primary, `SELECT value FROM antithesis_cdc WHERE id = ?` returns the expected value. `Sometimes`: catchup completes within the quiet-period budget at least once per run. | +| **Antithesis Angle** | Kills to the MySQL replica container (replica restarts from persisted GTID position); kills to the MySQL primary (replica and Materialize source must handle upstream silence gracefully); clusterd restarts (MySQL CDC resume exercises the same `storage-command-replay-idempotent` path as Kafka); parallel worker scheduling jitter that stresses the `replica_preserve_commit_order` protocol. | +| **Why It Matters** | MySQL CDC is a distinct ingestion code path from Kafka. Wrong behavior here — dropped rows, wrong values after restart, duplicate rows after resume — is not caught by the Kafka-source drivers. | + +### offset-known-not-below-committed — Source Statistics Causality + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P2 — observable statistics correctness; regression target for commit 3e32df1f69 | +| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py`. Continuous polling driver queries every Kafka source's `mz_source_statistics_per_worker` row and fires `always("kafka: source offset_known < offset_committed", …)` whenever a single per-worker row has `offset_known < offset_committed`. Both fields are read from the same row of the same query so the comparison cannot cross a metric-update boundary. The SUT-side mirror in `src/storage/src/statistics.rs` is deferred. | +| **Property** | For every Kafka source, the source-statistics view always reports `offset_known >= offset_committed`. The metric `offset_known` reflects what the broker has told us is available; `offset_committed` reflects what Materialize has durably ingested. Causally, `offset_known` cannot lag `offset_committed`. 
| +| **Invariant** | `Always`: a polling assertion in the workload — `SELECT offset_known, offset_committed FROM mz_internal.mz_source_statistics_per_worker WHERE id = ?` — invariant `offset_known >= offset_committed`. Mirror as an `assert_always!` inside the statistics update path in `src/storage/src/statistics.rs`. | +| **Antithesis Angle** | Clusterd restart resets `offset_known` to broker-reported watermark while `offset_committed` is restored from persist. If the restoration order is wrong, the invariant flips. Direct regression target for commit 3e32df1f69. | +| **Why It Matters** | The statistics view is consumed by users and by operational tooling to compute lag. A regression in causality makes lag metrics meaningless and is the kind of bug that survives unit tests but fails under adversarial timing. | diff --git a/test/antithesis/scratchbook/property-relationships.md b/test/antithesis/scratchbook/property-relationships.md new file mode 100644 index 0000000000000..b8b250fc37233 --- /dev/null +++ b/test/antithesis/scratchbook/property-relationships.md @@ -0,0 +1,95 @@ +# Property Relationships + +## Cluster 1: Persist Layer Safety + +**Properties**: `persist-cas-monotonicity`, `tombstone-sealing-finality`, `idempotent-write-under-indeterminate`, `critical-reader-fence-linearization` + +These properties share the persist state machine code in `src/persist-client/src/internal/`. They all exercise the compare-and-swap loop in `Machine` and the `State` transitions. A bug in the CaS loop or state validation could violate multiple properties simultaneously. + +**Suspected dominance**: `persist-cas-monotonicity` is foundational — if SeqNo monotonicity breaks, all other persist properties likely break too. It dominates `tombstone-sealing-finality` and `critical-reader-fence-linearization`. + +## Cluster 2: Fencing and Split-Brain Prevention + +**Properties**: `epoch-fencing-prevents-split-brain`, `compute-replica-epoch-isolation`, `deployment-promotion-safety` + +These properties all use epoch-based fencing to prevent stale actors from mutating state. They share the pattern of "increment epoch on new incarnation, reject operations from old epoch." The catalog fencing (`epoch-fencing-prevents-split-brain`) and deployment fencing (`deployment-promotion-safety`) share code paths in `src/catalog/src/durable/persist.rs`. + +**Suspected dominance**: `epoch-fencing-prevents-split-brain` is the most fundamental — it protects the catalog. `deployment-promotion-safety` builds on it by also requiring caught-up checks before promotion. `compute-replica-epoch-isolation` is independent (different epoch mechanism for compute). + +## Cluster 3: Crash Recovery Pipeline + +**Properties**: `catalog-recovery-consistency`, `storage-command-replay-idempotent`, `fault-recovery-exercised` + +These properties test the recovery path after process crashes. `fault-recovery-exercised` is the end-to-end liveness check; `catalog-recovery-consistency` and `storage-command-replay-idempotent` test specific subsystems within recovery. + +**Suspected dominance**: `fault-recovery-exercised` is the weakest check (just "system comes back"). `catalog-recovery-consistency` is strictly stronger (catalog state is correct after recovery). If catalog recovery fails, the end-to-end recovery also fails. + +## Cluster 4: Consistency Model + +**Properties**: `strict-serializable-reads`, `mv-reflects-source-updates`, `source-ingestion-progress` + +These properties form a chain: source ingestion feeds materialized views, which serve reads. 
`strict-serializable-reads` depends on correct timestamp oracle behavior and frontier management. If `source-ingestion-progress` fails (data doesn't flow), `mv-reflects-source-updates` also fails, but `strict-serializable-reads` could still pass on stale but consistent data. + +**Suspected dominance**: `strict-serializable-reads` is independent of the liveness properties. `mv-reflects-source-updates` implies `source-ingestion-progress` (if MVs update, sources must have made progress). + +## Cluster 5: Coordinator Concurrency + +**Properties**: `group-commit-toctou-safety`, `peek-lifecycle-exactly-once`, `command-channel-ordering` + +These properties target different concurrency mechanisms within the coordinator and compute engine. They share the coordinator's event loop as the execution context but test independent subsystems (write path, read path, command dispatch). + +**No dominance**: These properties are independent of each other. A bug in peek handling doesn't imply a bug in group_commit or command channels. + +## Cluster 6: Deployment Lifecycle + +**Properties**: `deployment-promotion-safety`, `deployment-lag-detection` + +Both test the 0DT deployment pipeline. `deployment-lag-detection` is a prerequisite for `deployment-promotion-safety` — if lag detection fails, promotion may proceed unsafely. + +**Suspected dominance**: `deployment-promotion-safety` is stronger — it requires both lag detection and correct fencing. `deployment-lag-detection` is a liveness check on a subsystem of the promotion pipeline. + +## Cluster 7: Kafka Source — User-Visible Ingestion Correctness + +**Properties**: `kafka-source-no-data-loss`, `kafka-source-no-data-duplication`, `kafka-source-frontier-monotonic`, `kafka-source-survives-broker-fault`, `kafka-source-survives-clusterd-restart` + +End-to-end Kafka source ingestion contract observable from the workload side. `kafka-source-no-data-loss` and `kafka-source-no-data-duplication` are the inverse-pair safety/liveness checks: every produced message must show up *exactly once*. The two recovery properties (`survives-broker-fault`, `survives-clusterd-restart`) exercise the same contract under different fault classes. `kafka-source-frontier-monotonic` is the lower-level safety property that both no-loss and no-duplication depend on. + +**Suspected dominance**: `kafka-source-frontier-monotonic` underpins both `no-data-loss` and `no-data-duplication` — if the persist shard upper goes backwards, both higher-level properties fail. `survives-clusterd-restart` strictly implies `survives-broker-fault` for the recovery code path (clusterd restart triggers all the same rehydration logic plus more), but the two stress different fault classes. + +## Cluster 8: UPSERT Envelope — Per-Key Semantics + +**Properties**: `upsert-key-reflects-latest-value`, `upsert-tombstone-removes-key`, `upsert-state-rehydrates-correctly`, `upsert-decode-error-retractable` + +The user-visible UPSERT contract. `upsert-key-reflects-latest-value` is the headline: latest produced value per key wins. `upsert-tombstone-removes-key` is the special-case for `None` values. `upsert-state-rehydrates-correctly` is the post-crash version of `latest-value`. `upsert-decode-error-retractable` is the error-recovery half of the contract — bad messages can be retracted. + +**Suspected dominance**: `upsert-state-rehydrates-correctly` implies `upsert-key-reflects-latest-value` in steady state (rehydration produces the right state, and that state is what subsequent operations operate on). 
`upsert-tombstone-removes-key` is a special case of `upsert-key-reflects-latest-value` (the "last produced was null" case). `upsert-decode-error-retractable` is independent. + +## Cluster 9: UPSERT Operator Internals — SUT-Side Asserts + +**Properties**: `upsert-no-internal-panic`, `upsert-state-consolidation-wellformed`, `upsert-ensure-decoded-called-before-access` + +Operator-internal correctness backbone for the UPSERT envelope. All three properties are about converting existing `panic!`/`assert!` sites in the upsert code into Antithesis-reportable assertions. `upsert-state-consolidation-wellformed` is the math-correctness check (XOR/checksum invariants in `ensure_decoded`); `upsert-ensure-decoded-called-before-access` is the type-state protocol check on `StateValue` accessors; `upsert-no-internal-panic` is the broader umbrella covering the diff-positive / commands-state / snapshot-completion guards. + +**Suspected dominance**: `upsert-state-consolidation-wellformed` is the deepest signal — a trip there indicates upstream code already failed to preserve some invariant. `upsert-no-internal-panic`'s `assert!(diff.is_positive())` family catches a similar class of upstream-bug-evidence higher up the stack. + +## Cluster 10: Kafka Source Internals — SUT-Side Asserts + +**Properties**: `kafka-source-no-internal-panic`, `remap-shard-antichain-wellformed`, `reclock-mint-eventually-succeeds`, `offset-known-not-below-committed` + +Reclock and source-reader operator-internal correctness. `remap-shard-antichain-wellformed` is the load-bearing invariant for the entire reclocking subsystem; `reclock-mint-eventually-succeeds` is its liveness companion. `kafka-source-no-internal-panic` is the umbrella for the explicit reader asserts. `offset-known-not-below-committed` is a much narrower statistics-causality check. + +**Suspected dominance**: `remap-shard-antichain-wellformed` underpins everything in Cluster 7 — a malformed remap antichain corrupts the resume frontier, which breaks both data-loss and data-duplication properties at the next restart. 
+ +## Cross-Cluster Connections + +- `epoch-fencing-prevents-split-brain` (Cluster 2) protects `catalog-recovery-consistency` (Cluster 3) — fencing ensures only one writer during recovery +- `persist-cas-monotonicity` (Cluster 1) underpins `catalog-recovery-consistency` (Cluster 3) — catalog is stored in persist, so CaS correctness is a prerequisite +- `strict-serializable-reads` (Cluster 4) depends on `epoch-fencing-prevents-split-brain` (Cluster 2) — split-brain would allow inconsistent timestamp assignments +- `idempotent-write-under-indeterminate` (Cluster 1) protects `storage-command-replay-idempotent` (Cluster 3) — storage ingestion uses persist writes, so idempotency matters for both +- `persist-cas-monotonicity` (Cluster 1) underpins `kafka-source-frontier-monotonic` (Cluster 7) — frontier monotonicity at the source level is a direct consequence of CaS monotonicity at the persist level +- `storage-command-replay-idempotent` (Cluster 3) supports `kafka-source-survives-clusterd-restart` (Cluster 7) — correct command replay is required for source recovery to be idempotent +- `idempotent-write-under-indeterminate` (Cluster 1) supports `kafka-source-no-data-duplication` (Cluster 7) — the no-duplicate-write guarantee at the persist level is what makes no-data-duplication observable at the source level +- `remap-shard-antichain-wellformed` (Cluster 10) underpins `kafka-source-no-data-loss` and `kafka-source-no-data-duplication` (Cluster 7) — a malformed remap antichain breaks the resume frontier across restart +- `upsert-state-consolidation-wellformed` (Cluster 9) underpins `upsert-state-rehydrates-correctly` (Cluster 8) — if the consolidating math is wrong, rehydration is wrong +- `source-ingestion-progress` (Cluster 4, pre-existing) is now subsumed by `kafka-source-no-data-loss` (Cluster 7) for Kafka specifically; `source-ingestion-progress` remains relevant for non-Kafka sources (Postgres CDC, MySQL CDC, generators) +- `mv-reflects-source-updates` (Cluster 4) depends on every Cluster 7 and Cluster 8 property — MVs over Kafka sources inherit those sources' correctness diff --git a/test/antithesis/scratchbook/sut-analysis.md b/test/antithesis/scratchbook/sut-analysis.md new file mode 100644 index 0000000000000..c38442d9d96c0 --- /dev/null +++ b/test/antithesis/scratchbook/sut-analysis.md @@ -0,0 +1,298 @@ +# SUT Analysis: Materialize + +## System Overview + +Materialize is a real-time data integration platform and streaming SQL database written primarily in Rust. It reads change data from PostgreSQL (logical replication), MySQL, Kafka/Redpanda, and webhooks, then maintains materialized views incrementally using differential dataflow. It speaks the PostgreSQL wire protocol, so any psql client or Postgres driver can connect. + +The system claims **strict serializability** for interactive queries and provides **incremental, consistent, low-latency** results over streaming data. It does not offer approximate answers or eventual consistency. + +## Architecture + +### Three-Layer Design + +Materialize is organized into three logical layers that run as separate processes: + +**1. 
Adapter Layer (environmentd)** +- Main coordinator process (`src/environmentd/`) +- Hosts pgwire server (port 6875), HTTP API (6878), and internal coordination endpoints +- Parses SQL, plans queries, manages sessions, enforces consistency +- Contains the Catalog (schema metadata) in memory, persisted to durable storage +- Runs a **single-threaded async event loop** on a Tokio runtime for coordination +- Multiplexes ComputeController and StorageController to manage downstream clusters + +**2. Compute Layer (clusterd - compute)** +- Worker processes running Timely Dataflow engines (`src/compute*/`, `src/clusterd/`) +- Executes views, maintains materialized views, performs joins +- Stateless — can be rehydrated from storage on crash +- Multiple replicas provide active replication for HA +- Workers parallelize via native OS threads (one per Timely worker) + +**3. Storage Layer (clusterd - storage)** +- Worker processes for data ingestion (`src/storage*/`) +- Reads from external sources (Kafka, Postgres CDC, MySQL, webhooks) +- Reclocks source timestamps to Materialize's internal timeline +- Writes to Persist (blob storage + consensus) for durability +- Manages sinks (Kafka sinks with exactly-once semantics) + +### Communication Protocols + +| Path | Protocol | Details | +|------|----------|---------| +| Client -> Balancerd -> Environmentd | pgwire (PostgreSQL wire protocol) | TLS, port 6875 | +| Environmentd -> Clusterd | CTP (Cluster Transport Protocol) | Length-prefixed bincode over TCP/UDS, ports 2100-2101 | +| Clusterd workers <-> workers | Timely mesh | Generation-epoch protocol, ports 2102-2103 | +| Clusterd -> Persist | HTTP/S3 API | Blob storage writes + consensus CaS | +| Environmentd -> Persist | Direct | Catalog stored in persist shard | +| Clusterd -> Environmentd | Persist PubSub | HTTP on port 6879, state change subscriptions | + +### Key Entrypoints + +- `src/environmentd/src/environmentd/main.rs` — main server startup +- `src/clusterd/src/bin/clusterd.rs` — compute/storage worker startup +- `src/balancerd/` — stateless connection router +- `src/pgwire/` — PostgreSQL wire protocol implementation +- `src/adapter/` — SQL planning, coordination, session management + +## State Management + +### Five Tiers of State + +1. **Catalog metadata** — table/view/source/sink definitions, roles, clusters + - Stored in a persist shard (blob + consensus) + - Reconstructed into `CatalogState` in-memory on startup + - Mutated via `catalog_transact()` with atomic `TransactionBatch` writes + +2. **Source/ingestion data** — rows from Kafka, Postgres CDC, MySQL, webhooks + - Written to persist shards by storage workers + - Keyed by Materialize-assigned timestamps (reclocked from source timestamps) + +3. **Materialized view data** — output of incrementally-maintained computations + - Written to persist shards by compute workers + - Stored as columnar batches in blob storage + +4. **Timestamps/frontiers** — read/write boundaries tracking collection completeness + - `since` (read frontier): minimum time a collection can be read + - `upper` (write frontier): maximum time written + - Tracked as `Antichain` lattice values + - Global timestamp oracle provides causally-consistent read times + +5. 
**In-flight state** — active dataflow computations, pending peeks, session state +- Held in memory by compute/storage workers and the coordinator +- Lost on crash, recovered via replay from persist + +### Persistence Architecture + +**Blob Storage (S3/MinIO/Azure/Postgres-backed):** +- Immutable data batches (columnar Parquet/Arrow format) +- Rollups (periodic snapshots of shard state for fast recovery) + +**Consensus (CockroachDB/PostgreSQL/FoundationDB):** +- Shard metadata: `since`, `upper`, spine structure +- Writer/reader leases with heartbeats +- Sequence numbers (`SeqNo`) for version linearity +- Catalog mutations as `StateUpdate` events + +**Atomic Writes:** +- Compare-and-append via `Machine`: writers must match expected `upper` antichain +- Idempotency tokens prevent duplicates on retries +- Fencing via `FenceToken` (deploy generation + epoch) prevents split-brain + +## Concurrency Model + +### Coordinator (environmentd) +- **Single-threaded event loop** on Tokio runtime +- Processes commands via `tokio::select!` from multiple MPSC channels +- Per-object write locks (`Arc`-wrapped async mutexes) serialize DDL to the same object +- Catalog shared via `Arc` for read-only off-thread access; mutations are serialized through the event loop +- Timeline state (`global_timelines`) accessed serially within event loop + +### Compute/Storage Workers (clusterd) +- One native OS thread per Timely worker (configurable count) +- Workers coordinate via Timely's internal barriers and distributed snapshot semantics +- Commands received via MPSC channels from controllers +- Worker 0 broadcasts commands to other workers per Timely conventions + +### Synchronization Primitives +- `Arc`-wrapped mutexes for per-object write locks +- `mpsc::UnboundedSender/Receiver` for coordinator internal messaging +- `watch::Sender/Receiver` for per-connection cancellation +- `Arc` around `std::sync` locks for low-contention shared state (metrics, log writers) +- Timely's own worker-to-worker channels for dataflow coordination + +## Safety and Liveness Guarantees + +### Claimed Safety Guarantees + +1. **Strict Serializability** (design doc 20220516): "Transactions in Materialize are strictly serializable with respect to operations inside of Materialize" (SELECT, INSERT, UPDATE, DELETE). All timestamp transitions made durable before response issued. + +2. **Definiteness** (design doc 20210831): Collections are "definite" — all uses yield exactly the same time-varying data at each logical time. Data definite for times in range `[since, upper)`. + +3. **Exactly-Once Kafka Sinks** (design doc 20200520): Transactional consistency for Kafka sink output with consistency topic. + +4. **Acknowledged Writes Survive Failures**: All data written to persist (blob + consensus) before acknowledgment. Catalog mutations durable before response. A workload-side sketch of checking this claim follows the liveness list below. + +5. **Epoch-Based Leader Fencing**: New coordinators increment epoch on startup; old coordinators' transactions fail. Prevents split-brain after coordinator crash. + +### Claimed Liveness Guarantees + +1. **Persist Reader/Writer Liveness**: "At least one reader/writer can always make progress" even when peers are paused or restarted. + +2. **Collection Progress**: "The collection upper advances so long as one writer can make progress." + +3. **Active Replication Recovery**: "Masking of recovery delay can only be guaranteed when compute controller can reach at least one non-faulty replica." + +4. **Automatic Failover**: Compute replicas automatically rehydrate from storage on crash. Multiple replicas mask recovery latency.
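+
+A workload-side sketch of safety claim 4 ("acknowledged writes survive
+failures"), referenced above: once an INSERT has been acknowledged over
+pgwire, the row must stay visible no matter which faults Antithesis injects
+afterwards. The table and connection names below are illustrative; the real
+drivers use the shared `helper_pg` connection helpers instead of a raw DSN.
+
+```python
+# Sketch only: `durability_probe` is a hypothetical table created by the
+# workload; the DSN mirrors the defaults used elsewhere in this workload.
+import uuid
+
+import psycopg
+from antithesis.assertions import always
+
+DSN = "host=materialized port=6875 user=materialize dbname=materialize"
+
+
+def write_acknowledged_row() -> str:
+    """Insert one row; returning means Materialize acknowledged the write."""
+    key = str(uuid.uuid4())
+    with psycopg.connect(DSN, autocommit=True) as conn, conn.cursor() as cur:
+        cur.execute(
+            "CREATE TABLE IF NOT EXISTS durability_probe (id text, v int)"
+        )
+        cur.execute("INSERT INTO durability_probe VALUES (%s, 1)", (key,))
+    return key
+
+
+def check_acknowledged_row_survived(key: str) -> None:
+    """Run later in the timeline, after faults have been injected."""
+    with psycopg.connect(DSN, autocommit=True) as conn, conn.cursor() as cur:
+        cur.execute(
+            "SELECT count(*) FROM durability_probe WHERE id = %s", (key,)
+        )
+        (count,) = cur.fetchone()
+    always(
+        count == 1,
+        "adapter: acknowledged INSERT still visible after fault injection",
+        {"key": key, "count": count},
+    )
+```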
+ +### Limitations +- HA (multi-active replication) is cloud-only; self-managed has single coordinator +- SUBSCRIBE, sinks, and `AS OF` queries may circumvent strict serializability +- No byzantine fault tolerance; system assumes honest coordinator +- Single coordinator bottleneck for timestamp oracle + +## Failure and Degradation Modes + +### Failure-Prone Areas + +1. **Startup/Configuration**: Many `expect()`/`unwrap()` calls in startup path — misconfiguration causes immediate crash rather than degraded operation. + +2. **Replica Reconnection**: Infinite retry with exponential backoff (capped at 1s). Can cause minutes-long recovery latency during transient failures. No circuit breakers. + +3. **Persist Layer Failures**: No circuit breaker for blob/consensus unavailability. System retries with backoff, creating backpressure rather than failing fast. Bounded retry loops (3-5 attempts) for some storage management operations. + +4. **0DT Deployment**: Preflight checks with configurable timeout. Can either panic or proceed degraded if standby doesn't catch up. Read-only promotion before full read-write. + +### Health Checking +- `/health/liveness` — always returns 200 (process is alive) +- `/health/ready` — returns 503 until adapter client available; optional `wait=true` blocks +- `curl localhost:6878/api/readyz` used in Docker healthchecks + +### Graceful Degradation +- Compute replicas: partial replica failure tolerated; system serves from remaining replicas +- 0DT standby boots read-only, promotes after catching up +- Feature flags return 503 rather than crashing when disabled +- No graceful degradation for metadata store (CRDB/PG) unavailability — system halts + +## External Dependencies + +| Dependency | Role | Criticality | +|-----------|------|-------------| +| CockroachDB / PostgreSQL / FoundationDB | Consensus for persist + catalog | CRITICAL — system halts without it | +| S3 / MinIO / Azure Blob | Blob storage for persist data | CRITICAL — writes fail without it | +| Kafka / Redpanda | Stream source ingestion | CRITICAL for streaming workflows | +| PostgreSQL (source) | CDC replication source | CRITICAL for CDC workflows | +| MySQL (source) | CDC replication source | Optional | +| Schema Registry | Avro/Protobuf schema management | Required for typed Kafka sources | +| Balancerd | pgwire connection routing | CRITICAL for multi-tenant | + +## Existing Test Strategy + +### mzcompose Framework (`misc/python/materialize/mzcompose/`) +- Meta-test framework generating Docker Compose files dynamically +- `Composition` class loads `mzcompose.py` files, discovers `workflow_*()` functions +- Pre-built service classes: `Materialized`, `Clusterd`, `Kafka`, `Redpanda`, `Postgres`, `CockroachOrPostgresMetadata`, `Minio`, `Toxiproxy`, etc. +- Granular lifecycle control: `c.up()`, `c.kill()`, `c.stop()`, `c.pause()`, `c.override()` +- Generates YAML on-demand, passes to `docker compose` via file descriptors +- Health-check driven startup with configurable intervals + +### Test Frameworks +1. **testdrive (.td)** — declarative SQL test language with timeout assertions and version-conditional tests +2. **sqllogictest (.slt)** — standard SQL logic test format for correctness +3. **Platform Checks** — "write once, run everywhere" tests across upgrade/restart/failure scenarios +4. 
**parallel-workload** — random concurrent SQL operations stress testing + +### Failure Testing Coverage +**Tested**: clusterd crashes/recovery, CockroachDB restarts, network faults (Toxiproxy), failpoint injection, statement timeouts, source/sink resilience, 0DT deployments + +**Not tested at scale**: coordinated multi-node cascading failures, deterministic replay of timing-sensitive bugs, property-based invariant testing under adversarial fault injection — this is where Antithesis adds value + +## Assumptions +- The mzcompose-based Docker Compose approach is the right integration path (vs. K8s) +- The existing Antithesis K8s-based experiment scripts represent an older approach to be superseded +- Materialize's self-managed/community edition (single-node) is the target, not the cloud multi-tenant version + +## Open Questions +- Which mzcompose test suite(s) provide the best starting workload? (platform-checks, parallel-workload, or custom) +- What is the preferred metadata store for Antithesis testing — CockroachDB or PostgreSQL? +- Should we test with multiple compute replicas or single replica? +- Are there specific failure scenarios the Materialize team wants prioritized? + +## Appendix A: Kafka Source Ingestion (Detail) + +Added 2026-05-11 in response to scoping toward Kafka source properties (append-only + UPSERT envelope). + +### Pipeline shape + +`KafkaSourceReader` → `ReclockOperator` → (optional `decode`) → (optional `upsert` operator) → `persist_sink`. + +The dataflow is rendered in `src/storage/src/render/sources.rs`. The reader and metadata-fetcher are constructed by `SourceRender for KafkaSourceConnection` in `src/storage/src/source/kafka.rs`. Reclocking is in `src/storage/src/source/reclock.rs` plus `reclock/compat.rs` (the persist-backed remap handle). UPSERT logic is in `src/storage/src/upsert.rs` (classic) and `src/storage/src/upsert_continual_feedback.rs` / `upsert_continual_feedback_v2.rs` (continual-feedback variants). + +### Source-time vs into-time + +* **Source time** for Kafka is `Partitioned<…, MzOffset>` (`mz_storage_types::sources::kafka`). The frontier is a multi-partition antichain. +* **Into time** is Materialize's `mz_repr::Timestamp` (ms since epoch). The mapping from source time → into time is the *remap shard*: a persist shard whose contents accumulate to a well-formed `Antichain` at every into-time. See `ReclockOperator` doc comment: "for any time `IntoTime` the remap collection accumulates into an Antichain where each `FromTime` timestamp has frequency `1`." +* On startup the remap operator loads existing bindings, downgrades to the recovered upper, then mints new bindings when `mint()` receives a probe. + +### Partition handling + +* Partition → worker assignment is round-robin by hash: `((source_id + partition_id) % worker_count) == worker_id` (`kafka.rs`). +* New partitions are picked up by the metadata fetcher and routed through reclocking. +* Per-partition offsets are tracked in `last_offsets`. Code-stated invariant: "if we see offset x, we have seen all offsets [0, x-1] that we are ever going to see" (kafka.rs near line 1005). +* Offsets that arrive `<=` `last_offset` are silently dropped (kafka.rs ~1158). This is the path that protects against rdkafka redelivery on reconnect. +* Negative offsets from an otherwise non-errored message cause `panic!` in `construct_source_message` (kafka.rs ~1193). + +### Append-only (NONE envelope) workload shape + +Decoded rows flow directly into `persist_sink` keyed by Materialize timestamp.
Each `(partition, offset)` produces exactly one row (plus metadata columns if requested). There is no retraction unless an upstream EvalError occurs in a downstream operator. + +### UPSERT envelope + +`upsert_commands` (render/sources.rs) maps each `DecodeResult` into `(UpsertKey, Option<…>, FromTime)`: + +* `UpsertKey` is a 32-byte SHA-256 digest of the key bytes; collisions are treated as impossible (probabilistic). +* `Some(value)` is an insert/update for `key`; `None` is a tombstone (delete). +* Key decode failures produce `UpsertError::KeyDecode`; null keys produce `UpsertError::NullKey`; value decode failures produce `UpsertError::Value`. These flow as `Err` values keyed by the (errored) key and can be *retracted* by a subsequent good `(key, value)` for the same key — this is the contract that makes "fix the bad message" recovery possible without dropping the source. + +The upsert operator (`upsert_classic` in `upsert.rs`) consults a state store (`UpsertStateBackend`) for the prior value before emitting updates. Two backends ship: + +* `InMemoryHashMap` — an in-process `BTreeMap`. Lost on restart. +* `RocksDB` — persistent, with a merge operator. Bug history shows the merge operator must always return `Some` or RocksDB aborts the process (commit 0d8d740b47). + +State is reconstructed on restart by replaying the persist *feedback* stream (the output of the upsert operator's previous incarnation) up to the resume frontier. The operator passes through a *snapshot* phase that drains all feedback values for keys at or below the resume frontier, then transitions to normal mint-on-input mode. + +Key invariants stated in code: + +* `assert!(diff.is_positive(), "invalid upsert input")` (upsert.rs:541; mirrored in `upsert_continual_feedback*.rs`) — the upsert operator never sees retractions on its input; only inserts/tombstones. +* `panic!("key missing from commands_state")` (upsert.rs:636) — the operator's internal dedup table must always contain a key it is about to emit for; missing key is a structural invariant violation. +* Order-key monotonicity within a key is enforced by `consolidate_snapshot_chunk` / `drain_staged_input`. A regression here previously caused a panic that was "as close to data loss as possible" (commit f177db8286, issue materialize#26655). The fix skips violating updates rather than panicking. +* In continual-feedback v2: `assert!(diff.is_positive())` again (v2:315) plus `unreachable!()` on `(None, None)` from joined prior/new state (v2:483) and an empty-output assertion in tests (v2:957). + +### Reclock invariants and failure modes + +* `compare_and_append` on the remap shard can return `UpperMismatch` if a racing writer (e.g. across restart) has advanced the shard. `ReclockOperator::mint` retries by `sync()`-ing and re-minting (reclock.rs:160-166). +* `panic!("compare_and_append failed: {invalid_use}")` in `reclock/compat.rs:306` catches genuinely invalid persist calls (vs. retryable upper mismatch). +* Reclock's cached `upper` has a known staleness pitfall (commit e3805ad790, issue database-issues#8698) — fixed by always fetching the recent upper for `as_of` calculation. + +### Statistics and progress signals + +`statistics.rs` reports per-source counters that have correctness invariants of their own: + +* `offset_known >= offset_committed` (commit 3e32df1f69 enforces clamping after a regression bug). +* `snapshot_records_known >= snapshot_records_staged`; both are cleared (drop to zero) at the end of the snapshot.
+ +These are user-visible numbers and form weak but easily-checkable correctness signals from the workload side. + +### Failure-prone areas relevant to Antithesis + +| Area | Risk | Code | +|------|------|------| +| Negative offset from rdkafka | hard panic | kafka.rs:1193 | +| Late offset on reconnect | silent drop (correct behavior, but check via `assert_sometimes!(saw_late_offset)`) | kafka.rs:1158 | +| Topic recreated with fewer offsets | previously panicked on capability downgrade (commit 99ad668af5) | source_reader_pipeline / kafka.rs | +| Upsert key with timestamp regression | previously panicked (commit f177db8286) | upsert.rs:475-487 | +| RocksDB merge returning `None` | SIGABRT (commit 0d8d740b47) | upsert/rocksdb.rs | +| Reclock `compare_and_append` UpperMismatch retry loop | unbounded retry, can block forever under persist outage | reclock.rs:160 | +| Multi-replica `drain_staged_input` double-pass | duplicate retractions (commit 1accbe28b3) | upsert_continual_feedback.rs | +| Persist sink cached upper across concurrent sinks | stale read leads to false errors (commit 505dc96aaa) | render/persist_sink.rs | +| Flag flip mid-append on persist sink | spurious `InvalidBatchBounds` (commit 68e1dfd86d) | render/persist_sink.rs | + +These are the seeds for the Kafka-specific property catalog in Category 7 of `property-catalog.md`. diff --git a/test/antithesis/workload/Dockerfile b/test/antithesis/workload/Dockerfile new file mode 100644 index 0000000000000..5cca619ed8234 --- /dev/null +++ b/test/antithesis/workload/Dockerfile @@ -0,0 +1,44 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# Antithesis workload client for Materialize. +# +# Python-based test driver that connects to materialized via pgwire, +# produces Kafka messages, and emits Antithesis assertions. + +FROM python:3.12-slim-bookworm + +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + postgresql-client \ + && rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir \ + psycopg[binary]==3.2.9 \ + confluent-kafka==2.8.0 \ + antithesis==0.2.0 \ + PyMySQL==1.1.1 + +# setup-complete script +COPY setup-complete.sh /usr/local/bin/setup-complete.sh +RUN chmod +x /usr/local/bin/setup-complete.sh + +# Test template directory — populated by antithesis-workload skill later +RUN mkdir -p /opt/antithesis/test/v1/materialize + +# Catalog directory for Python assertion cataloging +RUN mkdir -p /opt/antithesis/catalog + +# Copy test templates and entrypoint +COPY test/ /opt/antithesis/test/v1/materialize/ +COPY workload-entrypoint.sh /usr/local/bin/workload-entrypoint.sh +RUN chmod +x /usr/local/bin/workload-entrypoint.sh +RUN chmod +x /opt/antithesis/test/v1/materialize/* 2>/dev/null || true + +ENTRYPOINT ["/usr/local/bin/workload-entrypoint.sh"] diff --git a/test/antithesis/workload/mzbuild.yml b/test/antithesis/workload/mzbuild.yml new file mode 100644 index 0000000000000..f62b4c073bb00 --- /dev/null +++ b/test/antithesis/workload/mzbuild.yml @@ -0,0 +1,10 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. 
+# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +name: antithesis-workload diff --git a/test/antithesis/workload/setup-complete.sh b/test/antithesis/workload/setup-complete.sh new file mode 100755 index 0000000000000..ecae58fa23e44 --- /dev/null +++ b/test/antithesis/workload/setup-complete.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +set -euo pipefail + +# Run this script to inform Antithesis that it can start running Test Composer +# Commands. You can also use the Antithesis SDK to emit setup-complete from your +# system if that is easier. +# +# Antithesis sets the `ANTITHESIS_OUTPUT_DIR` environment variable +# automatically. This script is setup to emit `setup_complete` to the +# `sdk.jsonl` file in that directory. + +OUTPUT_PATH="/tmp/antithesis_sdk.jsonl" +if [[ -n "${ANTITHESIS_OUTPUT_DIR:-}" ]]; then + OUTPUT_PATH="${ANTITHESIS_OUTPUT_DIR}/sdk.jsonl" + echo "Running in Antithesis, emitting setup_complete to ${OUTPUT_PATH}" +elif [[ -n "${ANTITHESIS_SDK_LOCAL_OUTPUT:-}" ]]; then + OUTPUT_PATH="${ANTITHESIS_SDK_LOCAL_OUTPUT}" + echo "Antithesis SDK local output override detected, emitting setup_complete to ${OUTPUT_PATH}" +fi + +mkdir -p $(dirname "$OUTPUT_PATH") +echo '{"antithesis_setup":{"status":"complete","details":{"message":"ready to go"}}}' >> "${OUTPUT_PATH}" diff --git a/test/antithesis/workload/test/anytime_fault_recovery_exercised.py b/test/antithesis/workload/test/anytime_fault_recovery_exercised.py new file mode 100755 index 0000000000000..143dd8c103dce --- /dev/null +++ b/test/antithesis/workload/test/anytime_fault_recovery_exercised.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for `fault-recovery-exercised`. + +The most fundamental liveness property in the catalog: after the system +takes a hit from Antithesis fault injection, it must eventually come back +and serve SQL again. The catalog frames this in terms of the `/health/ready` +endpoint returning 200; this workload uses `SELECT 1` (the cheapest +end-to-end pgwire round trip) as the proxy, and observes the cluster +replica status as a corroborating signal. + +Approach: + - Probe `materialized` with a *short-budget* psycopg connect on every + tick. Long retry budgets in `helper_pg` would mask the actual + fault-active periods we want to detect — here we want to observe the + transitions. + - Track per-tick state: was this probe a success or a connect/query + failure? + - If we observe a failure at tick T and a success at tick T+k (any k>=1) + within this invocation, that is the recovery transition we care about, + and we fire `sometimes("...query succeeded after observed fault")`. 
+ + - Separately, fire `sometimes("...observed cluster replica non-online")` + when `mz_cluster_replica_statuses` reports any antithesis replica + `offline`. This is a corroborating signal so triage can distinguish + "no fault ever landed" from "faults landed but no recovery observed." + +This is an `anytime_` driver — Antithesis launches it many times, each +short-lived. Recovery transitions accumulate across invocations. +""" + +from __future__ import annotations + +import logging +import os +import sys +import time + +import psycopg +from helper_pg import ( + PGDATABASE, + PGHOST, + PGPORT, + PGUSER, + query_one_retry, +) + +from antithesis.assertions import sometimes + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.fault_recovery_exercised") + +POLL_INTERVAL_S = 0.5 +RUN_BUDGET_S = 30.0 +PROBE_CONNECT_TIMEOUT_S = 2.0 + +ANTITHESIS_CLUSTER = "antithesis_cluster" + + +def _probe_select_one() -> bool: + """Run `SELECT 1` with a short connect timeout. Return True on success. + + Distinct from the resilient `helper_pg.query_*` paths because we *want* + to observe transient failures here — they are the fault-active half of + the recovery transition we are looking for. + """ + try: + with psycopg.connect( + host=PGHOST, + port=PGPORT, + user=PGUSER, + dbname=PGDATABASE, + connect_timeout=int(PROBE_CONNECT_TIMEOUT_S), + autocommit=True, + ) as conn, conn.cursor() as cur: + cur.execute("SELECT 1") + row = cur.fetchone() + return row is not None and row[0] == 1 + except Exception: # noqa: BLE001 + return False + + +def _replica_non_online() -> bool: + """Best-effort: is any antithesis-cluster replica reporting non-online? + + Uses the retry-budgeted query helper because we want a clear yes/no, not + a probe outcome — if the helper can't get an answer we conservatively + return False so the corroborating signal stays silent rather than + accidentally firing on a probe-side failure. + """ + try: + row = query_one_retry( + """ + SELECT EXISTS ( + SELECT 1 + FROM mz_internal.mz_cluster_replica_statuses s + JOIN mz_cluster_replicas r ON r.id = s.replica_id + JOIN mz_clusters c ON c.id = r.cluster_id + WHERE c.name = %s AND s.status != 'online' + ) + """, + (ANTITHESIS_CLUSTER,), + ) + except Exception: # noqa: BLE001 + return False + return bool(row and row[0]) + + +def main() -> int: + deadline = time.monotonic() + RUN_BUDGET_S + + # Per-invocation state. The driver is short-lived; Antithesis covers the + # full timeline by launching many invocations. + saw_failure = False + saw_recovery_after_failure = False + saw_replica_non_online = False + successes = 0 + failures = 0 + + while time.monotonic() < deadline: + ok = _probe_select_one() + if ok: + successes += 1 + if saw_failure: + saw_recovery_after_failure = True + else: + failures += 1 + saw_failure = True + + if _replica_non_online(): + saw_replica_non_online = True + + time.sleep(POLL_INTERVAL_S) + + sometimes( + saw_recovery_after_failure, + "fault recovery: SELECT 1 succeeded after a previously-observed connect failure", + { + "successes": successes, + "failures": failures, + "saw_replica_non_online": saw_replica_non_online, + }, + ) + sometimes( + saw_replica_non_online, + "fault recovery: observed antithesis_cluster replica non-online at least once", + {"successes": successes, "failures": failures}, + ) + # Bare-minimum healthy-coverage signal: at least one successful probe in + # the invocation. 
If this ever goes 0/N across a run, no driver was + # ever able to talk to Materialize and the entire test is suspect — + # downstream property assertions would be vacuous. + sometimes( + successes > 0, + "fault recovery: at least one SELECT 1 succeeded this invocation", + {"successes": successes, "failures": failures}, + ) + + LOG.info( + "fault-recovery probe done; successes=%d failures=%d recovery=%s replica_offline=%s", + successes, + failures, + saw_recovery_after_failure, + saw_replica_non_online, + ) + return 0 + + +if __name__ == "__main__": + # Reference PGUSER/PGPORT/PGHOST/PGDATABASE so static analysis sees them + # used through helper_pg's re-export rather than as dead imports. + _ = (PGHOST, PGPORT, PGUSER, PGDATABASE, os) + sys.exit(main()) diff --git a/test/antithesis/workload/test/anytime_health_check.sh b/test/antithesis/workload/test/anytime_health_check.sh new file mode 100755 index 0000000000000..641aed971be93 --- /dev/null +++ b/test/antithesis/workload/test/anytime_health_check.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +set -euo pipefail + +# Basic health check — verifies materialized is responding to SQL. +# This is a minimal placeholder; the antithesis-workload skill will add +# real test commands with property assertions. + +PGHOST="${PGHOST:-materialized}" +PGPORT="${PGPORT:-6875}" +PGUSER="${PGUSER:-materialize}" + +result=$(psql -h "$PGHOST" -p "$PGPORT" -U "$PGUSER" -tAc "SELECT 1" 2>/dev/null) +if [ "$result" = "1" ]; then + echo "Health check passed" + exit 0 +else + echo "Health check failed: $result" + exit 1 +fi diff --git a/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py b/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py new file mode 100755 index 0000000000000..faee0fd0c680e --- /dev/null +++ b/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for `kafka-source-frontier-monotonic`. + +The `upper` of a Kafka source's persist data shard must never regress across +its lifetime, including across clusterd restarts and `compare_and_append` +retries. Approximated via the workload-visible `offset_committed` reported +in `mz_internal.mz_source_statistics`, which is the durably-ingested +upstream offset for the source. + +This is an `anytime_` driver — it runs continuously throughout the timeline, +polling all of this workload's Kafka sources and asserting that each one's +`offset_committed` never decreases between successive observations. Faults +are active while it runs, which is the right shape for a continuous safety +invariant: Antithesis can crash clusterd between two of our polls and the +next poll must still report a value >= the previous one. 
+ +The driver exits after a bounded budget so Antithesis can re-launch it +freely without one instance pinning resources. Cross-invocation: each +instance reads the state from before-restart only via `offset_committed` +itself (no in-process memory carries across) — `last_seen` is reset on each +launch, but Antithesis runs many instances in parallel and the union of +their observations covers the regression window. + +Errors during polling (network partitions, clusterd unavailable) are +*expected* under fault injection and must not produce false-positive +failures. We only assert when we have two successive successful reads for +the same source. +""" + +from __future__ import annotations + +import logging +import sys +import time + +from helper_pg import query_retry +from helper_source_stats import offset_committed + +from antithesis.assertions import always + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.kafka_frontier_monotonic") + +# Knobs. +POLL_INTERVAL_S = 0.5 +RUN_BUDGET_S = 30.0 + +# The Antithesis cluster every driver in this workload provisions sources into. +# Discovering sources dynamically (rather than hardcoding names) means new +# drivers that introduce new Kafka sources get monotonicity coverage for free. +ANTITHESIS_CLUSTER = "antithesis_cluster" + + +def _sources_present() -> list[str]: + """Return every Kafka source currently owned by `antithesis_cluster`.""" + rows = query_retry( + """ + SELECT s.name + FROM mz_sources s + JOIN mz_clusters c ON c.id = s.cluster_id + WHERE c.name = %s AND s.type = 'kafka' + """, + (ANTITHESIS_CLUSTER,), + ) + return [r[0] for r in rows] + + +def main() -> int: + deadline = time.monotonic() + RUN_BUDGET_S + # Per-source highest committed offset observed across this invocation's + # polls. Each successful new read for a source must be >= last_seen. + last_seen: dict[str, int] = {} + polled = 0 + + while time.monotonic() < deadline: + try: + sources = _sources_present() + except Exception as exc: # noqa: BLE001 + LOG.info("source list query failed: %s; sleeping and retrying", exc) + time.sleep(POLL_INTERVAL_S) + continue + + for source in sources: + try: + observed = offset_committed(source) + except Exception as exc: # noqa: BLE001 + LOG.info("offset_committed query failed for %s: %s", source, exc) + continue + if observed is None: + # Statistics row not initialized yet (very early in source + # lifetime, or post-restart before stats first reported). + # Not an assertion target. + continue + + prev = last_seen.get(source) + if prev is not None: + always( + observed >= prev, + "kafka: source offset_committed non-monotonic", + { + "source": source, + "previous": prev, + "observed": observed, + "regression": prev - observed, + }, + ) + + # Always update last_seen, even on regression — we want to keep + # asserting against the most recent observation so a regression + # surfaces once per discrete drop, not on every subsequent poll. 
+ last_seen[source] = observed + polled += 1 + + time.sleep(POLL_INTERVAL_S) + + LOG.info( + "frontier monotonic check done; %d samples across %d sources", + polled, + len(last_seen), + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py b/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py new file mode 100755 index 0000000000000..9801c4dfa65b7 --- /dev/null +++ b/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for `offset-known-not-below-committed`. + +For every Kafka source, `mz_internal.mz_source_statistics_per_worker` must +always report `offset_known >= offset_committed`. `offset_known` reflects +what the broker has told us is available; `offset_committed` reflects what +Materialize has durably ingested. Causally, the broker's idea of "this +offset exists" cannot lag what we've already durably read past it. Direct +regression target for commit 3e32df1f69, which clamped the metric to +prevent this flip on the first sample after a clusterd restart. + +This is an `anytime_` driver — it runs continuously throughout the timeline +under active fault injection. The interesting timing per the catalog is the +very first sample after a clusterd restart, where `offset_known` is +restored from the broker watermark while `offset_committed` is restored +from persist; we want Antithesis to drop a poll into that window. + +Both fields are read in the same row of the same SELECT so the comparison +never crosses a metric-update boundary. The per-worker view is queried +(not the rolled-up `mz_source_statistics`) because the invariant must hold +per worker — averaging would mask a single worker that crossed the line. + +Errors during polling (clusterd down, network partitioned) are *expected* +under fault injection and must not produce false-positive failures; we +just skip the sample. +""" + +from __future__ import annotations + +import logging +import sys +import time + +from helper_pg import query_retry + +from antithesis.assertions import always + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.kafka_offset_known_not_below_committed") + +POLL_INTERVAL_S = 0.5 +RUN_BUDGET_S = 30.0 + +ANTITHESIS_CLUSTER = "antithesis_cluster" + + +def _samples() -> list[tuple[str, int, int, int]]: + """Return (source_name, worker_id, offset_known, offset_committed) per worker. + + Joins `mz_source_statistics_per_worker` to `mz_sources` so the assertion + `details` can name the source by name rather than by opaque id. Filters + to Kafka sources owned by the antithesis cluster so the assertion does + not fire against the introspection cluster's bookkeeping sources. + + Rows with NULL `offset_known` or `offset_committed` are dropped — those + are early-lifetime samples that have not been populated yet. 
+ """ + rows = query_retry( + """ + SELECT + s.name, + ss.worker_id::bigint, + ss.offset_known::bigint, + ss.offset_committed::bigint + FROM mz_internal.mz_source_statistics_per_worker ss + JOIN mz_sources s ON s.id = ss.id + JOIN mz_clusters c ON c.id = s.cluster_id + WHERE c.name = %s + AND s.type = 'kafka' + AND ss.offset_known IS NOT NULL + AND ss.offset_committed IS NOT NULL + """, + (ANTITHESIS_CLUSTER,), + ) + return [(str(n), int(w), int(k), int(o)) for (n, w, k, o) in rows] + + +def main() -> int: + deadline = time.monotonic() + RUN_BUDGET_S + polled = 0 + + while time.monotonic() < deadline: + try: + samples = _samples() + except Exception as exc: # noqa: BLE001 + LOG.info("source stats query failed: %s; sleeping and retrying", exc) + time.sleep(POLL_INTERVAL_S) + continue + + for source, worker, known, committed in samples: + always( + known >= committed, + "kafka: source offset_known < offset_committed", + { + "source": source, + "worker_id": worker, + "offset_known": known, + "offset_committed": committed, + "deficit": committed - known, + }, + ) + polled += 1 + + time.sleep(POLL_INTERVAL_S) + + LOG.info("offset_known-not-below-committed check done; %d samples", polled) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py b/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py new file mode 100755 index 0000000000000..85042a317d7cb --- /dev/null +++ b/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py @@ -0,0 +1,245 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for `kafka-source-survives-broker-fault` and +`kafka-source-survives-clusterd-restart` (combined liveness signal). + +Both catalog properties amount to: after a transient fault that prevents +the source from making progress, once the fault is over the source must +ingest the messages it was unable to read during the outage. Externally +this looks identical for either fault kind — `offset_committed` stalls +during the outage and resumes advancing afterward — so one anytime driver +records the stall-then-advance transition and we tag the corroborating +fault signal (kafka broker reachable / replica online) in `details` so +triage can distinguish the two cases on a hit. + +Per-invocation state machine, per source: + - `IDLE` (initial). On a successful sample, store the offset and move + to `OBSERVING`. + - `OBSERVING`. If the sample equals the stored value for STALL_TICKS + consecutive ticks, move to `STALLED` (the source has stopped + progressing — most likely fault-induced). Otherwise, refresh the + stored value. + - `STALLED`. On any sample strictly greater than the stalled value, fire + the `sometimes(...)` recovery anchor and return to `OBSERVING` with + the new value. Otherwise stay stalled. + +Failed samples (clusterd unavailable, network partition) do not transition +the state machine — they are the fault-active condition we want to bridge +over. They are counted only so the `details` payload can corroborate the +recovery transition. 
+ +The driver also records two corroborating `sometimes(...)` signals so +triage can confirm Antithesis actually hit each of the two fault classes +this property cluster cares about: + - replica went non-online (clusterd-restart signal) + - direct Kafka admin metadata fetch failed (broker-fault signal) +""" + +from __future__ import annotations + +import logging +import os +import sys +import time + +from helper_pg import query_one_retry, query_retry + +from antithesis.assertions import reachable, sometimes + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.kafka_source_resumes_after_fault") + +POLL_INTERVAL_S = 1.0 +RUN_BUDGET_S = 45.0 +# Number of consecutive identical samples after which we consider the source +# "stalled" rather than just briefly idle. Five seconds (5 ticks * 1s) +# comfortably exceeds the natural quiet-period between produces but is well +# below the fault-injection windows Antithesis schedules. +STALL_TICKS = 5 + +ANTITHESIS_CLUSTER = "antithesis_cluster" +KAFKA_BROKER = os.environ.get("KAFKA_BROKER", "kafka:9092") + + +def _kafka_sources() -> list[str]: + rows = query_retry( + """ + SELECT s.name + FROM mz_sources s + JOIN mz_clusters c ON c.id = s.cluster_id + WHERE c.name = %s AND s.type = 'kafka' + """, + (ANTITHESIS_CLUSTER,), + ) + return [r[0] for r in rows] + + +def _offset_committed(source_name: str) -> int | None: + """Aggregated offset_committed across workers for `source_name`.""" + row = query_one_retry( + """ + SELECT MAX(ss.offset_committed)::bigint + FROM mz_internal.mz_source_statistics ss + JOIN mz_sources s ON s.id = ss.id + WHERE s.name = %s + """, + (source_name,), + ) + if row is None or row[0] is None: + return None + return int(row[0]) + + +def _replica_non_online() -> bool: + try: + row = query_one_retry( + """ + SELECT EXISTS ( + SELECT 1 + FROM mz_internal.mz_cluster_replica_statuses s + JOIN mz_cluster_replicas r ON r.id = s.replica_id + JOIN mz_clusters c ON c.id = r.cluster_id + WHERE c.name = %s AND s.status != 'online' + ) + """, + (ANTITHESIS_CLUSTER,), + ) + except Exception: # noqa: BLE001 + return False + return bool(row and row[0]) + + +def _kafka_metadata_failed() -> bool: + """Best-effort: did a direct Kafka metadata fetch fail? + + A successful Materialize-side ingestion still goes through the broker, + so a metadata fetch failure here is a strong signal that the + `materialized <-> kafka` channel was partitioned even though the + `materialized <-> postgres-metadata` channel still works (the + `kafka-source-survives-broker-fault` shape). + + Defensive imports because the kafka admin client only runs cleanly with + a reachable broker. We avoid raising into the polling loop. + """ + try: + from confluent_kafka.admin import AdminClient + except Exception: # noqa: BLE001 + return False + try: + AdminClient({"bootstrap.servers": KAFKA_BROKER}).list_topics(timeout=2) + return False + except Exception: # noqa: BLE001 + return True + + +def main() -> int: + deadline = time.monotonic() + RUN_BUDGET_S + + # Per-source state machine. + # state: "OBSERVING" or "STALLED" + # last_value: most recent committed offset observed + # stall_streak: consecutive ticks at last_value + states: dict[str, dict] = {} + + # Cross-source corroborating signals collected throughout this run. + saw_replica_non_online = False + saw_kafka_metadata_failure = False + # Per-source: did we observe stall->advance at least once. 
+ resumed_after_stall: dict[str, bool] = {} + + while time.monotonic() < deadline: + if _replica_non_online(): + saw_replica_non_online = True + if _kafka_metadata_failed(): + saw_kafka_metadata_failure = True + + try: + sources = _kafka_sources() + except Exception as exc: # noqa: BLE001 + LOG.info("source list query failed: %s; sleeping", exc) + time.sleep(POLL_INTERVAL_S) + continue + + for source in sources: + try: + observed = _offset_committed(source) + except Exception as exc: # noqa: BLE001 + LOG.info("offset_committed query failed for %s: %s", source, exc) + continue + if observed is None: + continue + + st = states.setdefault( + source, + {"state": "OBSERVING", "last_value": observed, "stall_streak": 0}, + ) + + if st["state"] == "OBSERVING": + if observed == st["last_value"]: + st["stall_streak"] += 1 + if st["stall_streak"] >= STALL_TICKS: + st["state"] = "STALLED" + else: + # Progress: reset. + st["last_value"] = observed + st["stall_streak"] = 0 + else: # STALLED + if observed > st["last_value"]: + # Recovery transition: fire the per-source signal once + # per invocation (we still update state so we can detect + # additional stalls and resumes). + if not resumed_after_stall.get(source, False): + resumed_after_stall[source] = True + # Reaching here is the property: a source was stalled, + # then advanced. Use `reachable(...)` rather than + # `sometimes(True, ...)` per the SDK assertion-type + # guidance. + reachable( + "kafka source: offset_committed resumed advancing after a sustained stall", + { + "source": source, + "stalled_at": st["last_value"], + "observed_after_recovery": observed, + "stall_ticks_required": STALL_TICKS, + "saw_replica_non_online": saw_replica_non_online, + "saw_kafka_metadata_failure": saw_kafka_metadata_failure, + }, + ) + st["state"] = "OBSERVING" + st["last_value"] = observed + st["stall_streak"] = 0 + + time.sleep(POLL_INTERVAL_S) + + sometimes( + saw_replica_non_online, + "kafka source resumes: observed antithesis_cluster replica non-online", + {"resumed_sources": sorted(resumed_after_stall.keys())}, + ) + sometimes( + saw_kafka_metadata_failure, + "kafka source resumes: observed direct Kafka metadata fetch failure", + {"resumed_sources": sorted(resumed_after_stall.keys())}, + ) + + LOG.info( + "kafka-source-resumes-after-fault done; sources_resumed=%d replica_offline=%s metadata_failed=%s", + sum(1 for v in resumed_after_stall.values() if v), + saw_replica_non_online, + saw_kafka_metadata_failure, + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/antithesis/workload/test/first_mysql_replica_setup.py b/test/antithesis/workload/test/first_mysql_replica_setup.py new file mode 100644 index 0000000000000..4380b5f4bd40d --- /dev/null +++ b/test/antithesis/workload/test/first_mysql_replica_setup.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis first_ command: configure MySQL multithreaded replica replication +and create the Materialize MySQL CDC source. + +Runs once per Antithesis timeline before any parallel/singleton drivers start. +Steps: + 1. Wait for both MySQL containers to accept connections. + 2. 
Create the `antithesis` database and `cdc_test` table on the primary. + 3. Configure the replica to replicate from the primary via GTID with 4 + parallel worker threads (multithreaded replication). + 4. Start the replica. + 5. Wait for `antithesis.cdc_test` to appear on the replica (confirms + replication is flowing). + 6. Create the Materialize connection, source, and table from the replica. +""" + +from __future__ import annotations + +import logging +import sys +import time + +import helper_mysql +from helper_mysql_source import ensure_mysql_cdc_source + +from antithesis.assertions import reachable, sometimes + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("first.mysql_replica_setup") + + +def setup_primary() -> None: + """Create the antithesis schema and cdc_test table on the MySQL primary.""" + LOG.info("creating antithesis database and cdc_test table on primary") + helper_mysql.execute_primary("CREATE DATABASE IF NOT EXISTS antithesis") + helper_mysql.execute_primary( + """ + CREATE TABLE IF NOT EXISTS antithesis.cdc_test ( + id VARCHAR(64) NOT NULL PRIMARY KEY, + batch_id VARCHAR(64) NOT NULL, + value TEXT NOT NULL, + updated_at TIMESTAMP(6) DEFAULT CURRENT_TIMESTAMP(6) + ON UPDATE CURRENT_TIMESTAMP(6) + ) + """, + database="antithesis", + ) + LOG.info("antithesis.cdc_test ready on primary") + + +def configure_replica() -> None: + """Configure the MySQL replica to replicate from the primary. + + Uses GTID auto-positioning with 4 parallel workers. The replica starts + with --skip-replica-start so we configure the channel before starting. + Idempotent: stops and resets any existing channel first. + """ + LOG.info( + "configuring replica to replicate from %s with 4 parallel workers", + helper_mysql.MYSQL_HOST, + ) + # Stop and reset any existing channel (no-op on a fresh container). + try: + helper_mysql.execute_replica("STOP REPLICA") + except Exception: # noqa: BLE001 + pass + try: + helper_mysql.execute_replica("RESET REPLICA ALL") + except Exception: # noqa: BLE001 + pass + + helper_mysql.execute_replica( + f"CHANGE REPLICATION SOURCE TO " + f"SOURCE_HOST='{helper_mysql.MYSQL_HOST}', " + f"SOURCE_USER='root', " + f"SOURCE_PASSWORD='{helper_mysql.MYSQL_PASSWORD}', " + f"SOURCE_AUTO_POSITION=1, " + f"GET_SOURCE_PUBLIC_KEY=1" + ) + # Set parallel replication parameters before starting. + helper_mysql.execute_replica("SET GLOBAL replica_parallel_workers = 4") + helper_mysql.execute_replica("SET GLOBAL replica_preserve_commit_order = ON") + helper_mysql.execute_replica("START REPLICA") + LOG.info("MySQL replica started") + + +def wait_for_replica_table(timeout_s: float = 90.0) -> bool: + """Wait until antithesis.cdc_test is visible on the replica. + + Returns True when the table appears (replication is flowing), False on + timeout. 
+ """ + deadline = time.monotonic() + timeout_s + while time.monotonic() < deadline: + try: + rows = helper_mysql.query_replica( + "SELECT 1 FROM information_schema.tables " + "WHERE table_schema = 'antithesis' AND table_name = 'cdc_test'", + ) + if rows: + LOG.info("antithesis.cdc_test visible on replica — replication flowing") + return True + except Exception as exc: # noqa: BLE001 + LOG.info("waiting for replica table: %s", exc) + time.sleep(2) + LOG.warning("timed out waiting for antithesis.cdc_test on replica") + return False + + +def main() -> int: + LOG.info("waiting for MySQL primary (%s)...", helper_mysql.MYSQL_HOST) + helper_mysql.wait_for_primary() + + LOG.info("waiting for MySQL replica (%s)...", helper_mysql.MYSQL_REPLICA_HOST) + helper_mysql.wait_for_replica() + + setup_primary() + configure_replica() + + replica_ready = wait_for_replica_table() + sometimes( + replica_ready, + "mysql replica: antithesis.cdc_test replicated from primary within 90s", + { + "primary": helper_mysql.MYSQL_HOST, + "replica": helper_mysql.MYSQL_REPLICA_HOST, + }, + ) + if not replica_ready: + # Proceed anyway — replication may catch up before Materialize tries to + # validate the source, but log a warning so triage can correlate. + LOG.warning("replica table not yet visible; proceeding with source creation") + + ensure_mysql_cdc_source() + + reachable( + "mysql: first-run setup complete — replica configured, Materialize source created", + { + "primary": helper_mysql.MYSQL_HOST, + "replica": helper_mysql.MYSQL_REPLICA_HOST, + }, + ) + LOG.info("MySQL CDC setup complete") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/antithesis/workload/test/first_select_upsert_implementation.py b/test/antithesis/workload/test/first_select_upsert_implementation.py new file mode 100755 index 0000000000000..03394a1ebd7f7 --- /dev/null +++ b/test/antithesis/workload/test/first_select_upsert_implementation.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis test command: pick v1 or v2 of the upsert continual feedback +operator at the start of each timeline. + +The selection is made via `helper_random.random_u64()` (routes through the +Antithesis SDK for deterministic replay) and applied via `ALTER SYSTEM SET +enable_upsert_v2 = ...` against the `mz_system` internal port. Because this +script is a `first_*` Test Composer action it runs after `setup-complete` +but before any `parallel_driver_*` / `singleton_driver_*` creates a source, +so every source rendered in this timeline reads the chosen value. + +Each branch records a `sometimes` assertion so Antithesis surfaces "v1 +covered" and "v2 covered" as separate dashboard signals — if either ever +goes 0/N across the run, we've lost that arm of coverage. 
+""" + +from __future__ import annotations + +import logging +import sys + +import helper_random +from helper_pg import execute_internal_retry + +from antithesis.assertions import sometimes + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("first.select_upsert_implementation") + + +def main() -> int: + # Low bit of a SDK-sourced u64 — under Antithesis this routes through the + # SDK so timeline replay picks the same arm; outside Antithesis it falls + # back to a stdlib-seeded RNG (see helper_random). + enable_v2 = (helper_random.random_u64() & 1) == 1 + LOG.info("rolled enable_upsert_v2=%s for this timeline", enable_v2) + + # Set explicitly in both branches so the chosen value is part of the + # timeline's recorded state, not implicit in the bootstrap default. + if enable_v2: + execute_internal_retry("ALTER SYSTEM SET enable_upsert_v2 = true") + sometimes(True, "upsert continual feedback v2 enabled for timeline", {}) + else: + execute_internal_retry("ALTER SYSTEM SET enable_upsert_v2 = false") + sometimes(True, "upsert continual feedback v1 enabled for timeline", {}) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/antithesis/workload/test/helper_kafka.py b/test/antithesis/workload/test/helper_kafka.py new file mode 100644 index 0000000000000..a9bf2eac600a1 --- /dev/null +++ b/test/antithesis/workload/test/helper_kafka.py @@ -0,0 +1,90 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Thin confluent-kafka producer wrapper for Antithesis drivers. + +Tracks the highest delivered offset per topic so drivers can poll Materialize +statistics for catchup. Retries delivery failures on partition; surfaces +permanent errors. 
+""" + +from __future__ import annotations + +import logging +import os +import threading +from dataclasses import dataclass, field + +from confluent_kafka import KafkaException, Producer +from confluent_kafka.admin import AdminClient, NewTopic + +LOG = logging.getLogger("antithesis.helper_kafka") + +BROKER = os.environ.get("KAFKA_BROKER", "kafka:9092") + + +@dataclass +class DeliveryTracker: + """Records highest delivered offset per (topic, partition) and any error.""" + + max_offset: dict[tuple[str, int], int] = field(default_factory=dict) + last_error: KafkaException | None = None + _lock: threading.Lock = field(default_factory=threading.Lock) + + def callback(self, err, msg): + if err is not None: + with self._lock: + self.last_error = KafkaException(err) + LOG.warning("kafka delivery error: %s", err) + return + key = (msg.topic(), msg.partition()) + with self._lock: + existing = self.max_offset.get(key, -1) + if msg.offset() > existing: + self.max_offset[key] = msg.offset() + + def topic_max_offset(self, topic: str) -> int: + with self._lock: + offsets = [o for (t, _), o in self.max_offset.items() if t == topic] + return max(offsets) if offsets else -1 + + +def make_producer(client_id: str | None = None) -> tuple[Producer, DeliveryTracker]: + """Construct a Producer with a fresh DeliveryTracker.""" + config: dict[str, object] = { + "bootstrap.servers": BROKER, + "linger.ms": 5, + "enable.idempotence": True, + "acks": "all", + } + if client_id: + config["client.id"] = client_id + return Producer(config), DeliveryTracker() + + +def ensure_topic(topic: str, num_partitions: int = 1) -> None: + """Create the topic if it doesn't already exist. No-op on race with auto-create.""" + admin = AdminClient({"bootstrap.servers": BROKER}) + existing = admin.list_topics(timeout=10).topics + if topic in existing: + return + LOG.info("creating kafka topic %s with %d partition(s)", topic, num_partitions) + futures = admin.create_topics( + [NewTopic(topic, num_partitions=num_partitions, replication_factor=1)] + ) + for t, fut in futures.items(): + try: + fut.result(timeout=30) + except KafkaException as exc: + # TOPIC_ALREADY_EXISTS = 36 + err = exc.args[0] if exc.args else None + if err is not None and getattr(err, "code", lambda: None)() == 36: + LOG.info("kafka topic %s raced with auto-create; continuing", t) + continue + raise diff --git a/test/antithesis/workload/test/helper_mysql.py b/test/antithesis/workload/test/helper_mysql.py new file mode 100644 index 0000000000000..e99b3656cb4dd --- /dev/null +++ b/test/antithesis/workload/test/helper_mysql.py @@ -0,0 +1,159 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""MySQL connection helpers for Antithesis drivers. + +Connects to the MySQL primary and replica via PyMySQL. All calls retry +transient network and operational errors up to a fixed budget so the +workload keeps progressing under fault injection. 
+""" + +from __future__ import annotations + +import logging +import os +import time + +import pymysql +import pymysql.cursors + +LOG = logging.getLogger("antithesis.helper_mysql") + +MYSQL_HOST = os.environ.get("MYSQL_HOST", "mysql") +MYSQL_REPLICA_HOST = os.environ.get("MYSQL_REPLICA_HOST", "mysql-replica") +MYSQL_PORT = int(os.environ.get("MYSQL_PORT", "3306")) +MYSQL_PASSWORD = os.environ.get("MYSQL_PASSWORD", "p@ssw0rd") + +_RETRY_BUDGET_S = 120 +_RETRY_INITIAL_S = 0.5 +_RETRY_MAX_S = 4.0 + + +def _retryable(exc: BaseException) -> bool: + return isinstance(exc, (pymysql.OperationalError, pymysql.InterfaceError)) + + +def _open(host: str, database: str) -> pymysql.connections.Connection: + """Open a single MySQL connection with retries on transient errors.""" + deadline = time.monotonic() + _RETRY_BUDGET_S + backoff = _RETRY_INITIAL_S + while True: + try: + return pymysql.connect( + host=host, + port=MYSQL_PORT, + user="root", + password=MYSQL_PASSWORD, + database=database, + connect_timeout=15, + autocommit=True, + ) + except Exception as exc: # noqa: BLE001 + if not _retryable(exc) or time.monotonic() > deadline: + raise + LOG.info( + "mysql connect to %s retrying after %s; backoff=%.2fs", + host, + exc, + backoff, + ) + time.sleep(backoff) + backoff = min(backoff * 2, _RETRY_MAX_S) + + +def _execute(host: str, sql: str, params: tuple = (), database: str = "mysql") -> None: + deadline = time.monotonic() + _RETRY_BUDGET_S + backoff = _RETRY_INITIAL_S + while True: + try: + conn = _open(host, database) + with conn.cursor() as cur: + cur.execute(sql, params) + conn.close() + return + except Exception as exc: # noqa: BLE001 + if not _retryable(exc) or time.monotonic() > deadline: + raise + LOG.info("mysql execute on %s retrying after %s", host, exc) + time.sleep(backoff) + backoff = min(backoff * 2, _RETRY_MAX_S) + + +def _query( + host: str, sql: str, params: tuple = (), database: str = "mysql" +) -> list[tuple]: + deadline = time.monotonic() + _RETRY_BUDGET_S + backoff = _RETRY_INITIAL_S + while True: + try: + conn = _open(host, database) + with conn.cursor() as cur: + cur.execute(sql, params) + result = list(cur.fetchall()) + conn.close() + return result + except Exception as exc: # noqa: BLE001 + if not _retryable(exc) or time.monotonic() > deadline: + raise + LOG.info("mysql query on %s retrying after %s", host, exc) + time.sleep(backoff) + backoff = min(backoff * 2, _RETRY_MAX_S) + + +def execute_primary(sql: str, params: tuple = (), database: str = "mysql") -> None: + """Execute a statement on the MySQL primary.""" + _execute(MYSQL_HOST, sql, params, database) + + +def execute_replica(sql: str, params: tuple = (), database: str = "mysql") -> None: + """Execute a statement on the MySQL replica.""" + _execute(MYSQL_REPLICA_HOST, sql, params, database) + + +def query_primary( + sql: str, params: tuple = (), database: str = "mysql" +) -> list[tuple]: + """Run a query on the MySQL primary and return all rows.""" + return _query(MYSQL_HOST, sql, params, database) + + +def query_replica( + sql: str, params: tuple = (), database: str = "mysql" +) -> list[tuple]: + """Run a query on the MySQL replica and return all rows.""" + return _query(MYSQL_REPLICA_HOST, sql, params, database) + + +def wait_for_host(host: str, timeout_s: float = 180.0) -> None: + """Block until MySQL on `host` accepts connections.""" + deadline = time.monotonic() + timeout_s + while time.monotonic() < deadline: + try: + conn = pymysql.connect( + host=host, + port=MYSQL_PORT, + user="root", + password=MYSQL_PASSWORD, + 
connect_timeout=5, + ) + conn.close() + LOG.info("mysql %s is ready", host) + return + except Exception as exc: # noqa: BLE001 + LOG.info("waiting for mysql %s: %s", host, exc) + time.sleep(2) + raise TimeoutError(f"MySQL at {host} not ready after {timeout_s}s") + + +def wait_for_primary(timeout_s: float = 180.0) -> None: + wait_for_host(MYSQL_HOST, timeout_s) + + +def wait_for_replica(timeout_s: float = 180.0) -> None: + wait_for_host(MYSQL_REPLICA_HOST, timeout_s) diff --git a/test/antithesis/workload/test/helper_mysql_source.py b/test/antithesis/workload/test/helper_mysql_source.py new file mode 100644 index 0000000000000..6572eddc9c7e4 --- /dev/null +++ b/test/antithesis/workload/test/helper_mysql_source.py @@ -0,0 +1,97 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Idempotent setup for the Antithesis MySQL CDC source in Materialize. + +The MySQL CDC pipeline: + mysql (primary) --binlog--> mysql-replica --CDC--> Materialize + +Materialize reads from the replica so that faults to the replica exercise +the Materialize source recovery path independently of faults to the primary. + +Objects created in Materialize: + - SECRET antithesis_mysql_password + - CONNECTION antithesis_mysql_conn -> mysql-replica + - SOURCE mysql_cdc_source (IN CLUSTER antithesis_cluster) + - TABLE antithesis_cdc (REFERENCE antithesis.cdc_test) +""" + +from __future__ import annotations + +import logging +import os + +import psycopg + +from helper_pg import create_source_idempotent, execute_retry, query_retry + +LOG = logging.getLogger("antithesis.helper_mysql_source") + +CLUSTER = os.environ.get("MZ_ANTITHESIS_CLUSTER", "antithesis_cluster") +MYSQL_REPLICA_HOST = os.environ.get("MYSQL_REPLICA_HOST", "mysql-replica") +MYSQL_PASSWORD = os.environ.get("MYSQL_PASSWORD", "p@ssw0rd") + +MYSQL_DATABASE = "antithesis" +MYSQL_TABLE = "cdc_test" + +SECRET_NAME = "antithesis_mysql_password" +CONNECTION_NAME = "antithesis_mysql_conn" +SOURCE_NAME = "mysql_cdc_source" +TABLE_NAME = "antithesis_cdc" + + +def ensure_mysql_connection() -> None: + """Create the MySQL secret and connection in Materialize (idempotent).""" + execute_retry( + f"CREATE SECRET IF NOT EXISTS {SECRET_NAME} AS '{MYSQL_PASSWORD}'" + ) + execute_retry( + f"CREATE CONNECTION IF NOT EXISTS {CONNECTION_NAME} TO MYSQL (" + f"HOST '{MYSQL_REPLICA_HOST}', " + f"USER 'root', " + f"PASSWORD SECRET {SECRET_NAME}" + f")" + ) + LOG.info("mysql connection %s ready (replica=%s)", CONNECTION_NAME, MYSQL_REPLICA_HOST) + + +def ensure_mysql_cdc_table() -> None: + """Create the Materialize table from the MySQL CDC source (idempotent).""" + try: + execute_retry( + f"CREATE TABLE IF NOT EXISTS {TABLE_NAME} " + f"FROM SOURCE {SOURCE_NAME} " + f"(REFERENCE {MYSQL_DATABASE}.{MYSQL_TABLE})" + ) + except psycopg.errors.InternalError as exc: + if "already exists" not in str(exc): + raise + rows = query_retry("SELECT 1 FROM mz_tables WHERE name = %s", (TABLE_NAME,)) + if rows: + LOG.info("table %s landed concurrently; tolerating collision", TABLE_NAME) + return + raise + LOG.info("mysql cdc table %s ready", TABLE_NAME) + + +def ensure_mysql_cdc_source() -> None: + """Create the full MySQL CDC pipeline in Materialize (idempotent). 
+ + Requires antithesis.cdc_test to already exist on the MySQL replica. + Call first_mysql_replica_setup.py before this in any standalone use. + """ + ensure_mysql_connection() + create_source_idempotent( + f"CREATE SOURCE IF NOT EXISTS {SOURCE_NAME} " + f"IN CLUSTER {CLUSTER} " + f"FROM MYSQL CONNECTION {CONNECTION_NAME}", + SOURCE_NAME, + ) + LOG.info("mysql cdc source %s ready", SOURCE_NAME) + ensure_mysql_cdc_table() diff --git a/test/antithesis/workload/test/helper_none_source.py b/test/antithesis/workload/test/helper_none_source.py new file mode 100644 index 0000000000000..87a90b1ac6087 --- /dev/null +++ b/test/antithesis/workload/test/helper_none_source.py @@ -0,0 +1,60 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Idempotent setup for the Antithesis NONE-envelope (append-only) Kafka source. + +Used by drivers that exercise the append-only contract. The source has columns +`text TEXT, partition INTEGER, offset BIGINT` — `partition` and `offset` are +the Kafka metadata projected via `INCLUDE PARTITION, OFFSET`, which give us +the per-`(partition, offset)` uniqueness check called out in +`kafka-source-no-data-duplication.md`. +""" + +from __future__ import annotations + +import logging +import os + +from helper_kafka import ensure_topic +from helper_pg import create_source_idempotent +from helper_upsert_source import ensure_kafka_connection + +LOG = logging.getLogger("antithesis.helper_none_source") + +CLUSTER = os.environ.get("MZ_ANTITHESIS_CLUSTER", "antithesis_cluster") + +TOPIC_NONE_TEXT = "antithesis-none-text" +SOURCE_NONE_TEXT = "none_text_src" + + +def ensure_none_text_source() -> None: + """Create the append-only source over a text-valued Kafka topic. + + Resulting columns: `text TEXT NOT NULL, partition INTEGER, offset BIGINT`. + Reuses the shared `antithesis_kafka_conn` Kafka connection so multiple + drivers don't proliferate connections. + """ + ensure_kafka_connection() + # CREATE SOURCE issues a Kafka metadata fetch that fails fast if the topic + # is missing; broker auto-create only fires on a producer write, which + # comes later in the driver. Pre-create via admin client so the metadata + # fetch succeeds on the first run. + ensure_topic(TOPIC_NONE_TEXT) + create_source_idempotent( + f"CREATE SOURCE IF NOT EXISTS {SOURCE_NONE_TEXT} " + f"IN CLUSTER {CLUSTER} " + f"FROM KAFKA CONNECTION antithesis_kafka_conn (TOPIC '{TOPIC_NONE_TEXT}') " + f"FORMAT TEXT " + f"INCLUDE PARTITION, OFFSET " + f"ENVELOPE NONE", + SOURCE_NONE_TEXT, + ) + LOG.info( + "none-envelope source %s ready (topic=%s)", SOURCE_NONE_TEXT, TOPIC_NONE_TEXT + ) diff --git a/test/antithesis/workload/test/helper_pg.py b/test/antithesis/workload/test/helper_pg.py new file mode 100644 index 0000000000000..5c74276fe5f90 --- /dev/null +++ b/test/antithesis/workload/test/helper_pg.py @@ -0,0 +1,187 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. 
+# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Resilient Materialize/pgwire connection helpers for Antithesis drivers. + +The workload runs under active fault injection. Every call retries network and +admission errors transparently; everything else propagates. +""" + +from __future__ import annotations + +import logging +import os +import time +from collections.abc import Iterator, Sequence +from contextlib import contextmanager +from typing import Any + +import psycopg + +LOG = logging.getLogger("antithesis.helper_pg") + +PGHOST = os.environ.get("PGHOST", "materialized") +PGPORT = int(os.environ.get("PGPORT", "6875")) +PGUSER = os.environ.get("PGUSER", "materialize") +PGDATABASE = os.environ.get("PGDATABASE", "materialize") + +# Internal pgwire endpoint for system-privileged operations (ALTER SYSTEM SET). +PGPORT_INTERNAL = int(os.environ.get("PGPORT_INTERNAL", "6877")) +PGUSER_INTERNAL = os.environ.get("PGUSER_INTERNAL", "mz_system") + +# Retry tuning. Antithesis injects partitions and node hangs; conservative bounds +# keep drivers progressing without masking real correctness signals. +# +# These need to absorb a full Antithesis quiet period plus restart time for the +# system to come back. Quiet-period requests in the workload are typically +# 20-25s; the container then takes a few seconds to become responsive, so the +# overall budget must comfortably exceed ~30s. The per-attempt connect timeout +# also has to be long enough to actually complete a TCP+TLS handshake against +# a hung but recovering materialized — too short and every attempt fails fast +# and the budget is burned without giving the system a chance to answer. +_CONNECT_TIMEOUT_S = 15 +_RETRY_BUDGET_S = 120 +_RETRY_INITIAL_S = 0.1 +_RETRY_MAX_S = 2.0 + + +def _retryable(exc: BaseException) -> bool: + if isinstance(exc, psycopg.OperationalError): + return True + # psycopg wraps server-side admin shutdowns as InterfaceError on next op. + if isinstance(exc, psycopg.InterfaceError): + return True + return False + + +@contextmanager +def connect(autocommit: bool = True) -> Iterator[psycopg.Connection]: + """Yield a connection, retrying transient failures up to RETRY_BUDGET_S.""" + deadline = time.monotonic() + _RETRY_BUDGET_S + backoff = _RETRY_INITIAL_S + while True: + try: + conn = psycopg.connect( + host=PGHOST, + port=PGPORT, + user=PGUSER, + dbname=PGDATABASE, + connect_timeout=_CONNECT_TIMEOUT_S, + autocommit=autocommit, + ) + break + except Exception as exc: # noqa: BLE001 + if not _retryable(exc) or time.monotonic() > deadline: + raise + LOG.info("pg connect retrying after %s; backoff=%.2fs", exc, backoff) + time.sleep(backoff) + backoff = min(backoff * 2, _RETRY_MAX_S) + try: + yield conn + finally: + try: + conn.close() + except Exception: # noqa: BLE001 + pass + + +def execute_retry(sql: str, params: Sequence[Any] | None = None) -> None: + """Execute a statement, retrying transient errors. 
No result returned.""" + deadline = time.monotonic() + _RETRY_BUDGET_S + backoff = _RETRY_INITIAL_S + while True: + try: + with connect() as conn, conn.cursor() as cur: + cur.execute(sql, params or ()) + return + except Exception as exc: # noqa: BLE001 + if not _retryable(exc) or time.monotonic() > deadline: + raise + LOG.info("pg execute retrying after %s", exc) + time.sleep(backoff) + backoff = min(backoff * 2, _RETRY_MAX_S) + + +def query_retry(sql: str, params: Sequence[Any] | None = None) -> list[tuple[Any, ...]]: + """Run a query and return all rows, retrying transient errors.""" + deadline = time.monotonic() + _RETRY_BUDGET_S + backoff = _RETRY_INITIAL_S + while True: + try: + with connect() as conn, conn.cursor() as cur: + cur.execute(sql, params or ()) + return list(cur.fetchall()) + except Exception as exc: # noqa: BLE001 + if not _retryable(exc) or time.monotonic() > deadline: + raise + LOG.info("pg query retrying after %s", exc) + time.sleep(backoff) + backoff = min(backoff * 2, _RETRY_MAX_S) + + +def query_one_retry( + sql: str, params: Sequence[Any] | None = None +) -> tuple[Any, ...] | None: + rows = query_retry(sql, params) + return rows[0] if rows else None + + +def execute_internal_retry(sql: str, params: Sequence[Any] | None = None) -> None: + """Execute a system-privileged statement on the internal port (mz_system). + + Used for ALTER SYSTEM SET and other operations the regular `materialize` + role cannot perform. Retries the same transient errors as `execute_retry`. + """ + deadline = time.monotonic() + _RETRY_BUDGET_S + backoff = _RETRY_INITIAL_S + while True: + try: + with psycopg.connect( + host=PGHOST, + port=PGPORT_INTERNAL, + user=PGUSER_INTERNAL, + dbname=PGDATABASE, + connect_timeout=_CONNECT_TIMEOUT_S, + autocommit=True, + ) as conn, conn.cursor() as cur: + cur.execute(sql, params or ()) + return + except Exception as exc: # noqa: BLE001 + if not _retryable(exc) or time.monotonic() > deadline: + raise + LOG.info("pg internal execute retrying after %s", exc) + time.sleep(backoff) + backoff = min(backoff * 2, _RETRY_MAX_S) + + +def create_source_idempotent(create_sql: str, source_name: str) -> None: + """Run a CREATE SOURCE statement, tolerating IF-NOT-EXISTS race gaps. + + `CREATE SOURCE IF NOT EXISTS` only short-circuits on the primary source + name. When two driver invocations race past the existence check, or when + a fault-injected crash mid-DDL leaves an orphan `_progress` + subsource in the catalog, the primary create errors with "catalog item + ... already exists" despite `IF NOT EXISTS`. Re-check `mz_sources` after + such an error; if the source landed concurrently, treat as success. + Otherwise re-raise so a true orphan still surfaces. + """ + try: + execute_retry(create_sql) + return + except psycopg.errors.InternalError as exc: + if "already exists" not in str(exc): + raise + rows = query_retry( + "SELECT 1 FROM mz_sources WHERE name = %s", + (source_name,), + ) + if rows: + LOG.info("source %s landed concurrently; tolerating collision", source_name) + return + raise diff --git a/test/antithesis/workload/test/helper_quiet.py b/test/antithesis/workload/test/helper_quiet.py new file mode 100644 index 0000000000000..adb4f9ead3e6d --- /dev/null +++ b/test/antithesis/workload/test/helper_quiet.py @@ -0,0 +1,38 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. 
+# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Wrapper around the Antithesis ANTITHESIS_STOP_FAULTS binary. + +Outside Antithesis (e.g. snouty local validate), the env var is unset and this +becomes a no-op so the workload still runs end-to-end. +""" + +from __future__ import annotations + +import logging +import os +import subprocess + +LOG = logging.getLogger("antithesis.helper_quiet") + + +def request_quiet_period(seconds: int) -> bool: + """Request that Antithesis pause all faults for `seconds`. + + Returns True if the request was issued, False if not in Antithesis. Either + way callers must still poll for the system to stabilize — the binary + returns immediately and the actual quiet window unfolds asynchronously. + """ + binary = os.environ.get("ANTITHESIS_STOP_FAULTS") + if not binary: + LOG.info("ANTITHESIS_STOP_FAULTS not set; skipping quiet-period request") + return False + LOG.info("requesting %ds quiet period via %s", seconds, binary) + subprocess.run([binary, str(seconds)], check=False) + return True diff --git a/test/antithesis/workload/test/helper_random.py b/test/antithesis/workload/test/helper_random.py new file mode 100644 index 0000000000000..cb749227d6f17 --- /dev/null +++ b/test/antithesis/workload/test/helper_random.py @@ -0,0 +1,64 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Deterministic randomness for Antithesis drivers. + +All driver randomness must go through the Antithesis SDK so timelines replay +deterministically. Outside Antithesis we fall back to the stdlib `random` with a +fixed-but-arbitrary seed per process so local runs are not flaky. +""" + +from __future__ import annotations + +import os +import random as _stdlib_random +from collections.abc import Sequence +from typing import TypeVar + +try: + from antithesis import random as _ar + + _ANTITHESIS = True +except ImportError: + _ANTITHESIS = False + +T = TypeVar("T") + +# A stable per-process seed so local snouty validate runs are deterministic +# within one process but pick a different sequence per process invocation. +_FALLBACK = _stdlib_random.Random(int.from_bytes(os.urandom(8), "little")) + + +def random_u64() -> int: + if _ANTITHESIS: + return _ar.get_random() + return _FALLBACK.getrandbits(64) + + +def random_choice(seq: Sequence[T]) -> T: + if not seq: + raise ValueError("random_choice on empty sequence") + if _ANTITHESIS: + return _ar.random_choice(list(seq)) + return _FALLBACK.choice(seq) + + +def random_int(low: int, high: int) -> int: + """Inclusive on both ends.""" + if low > high: + raise ValueError("low > high") + span = high - low + 1 + return low + (random_u64() % span) + + +def random_bool(true_prob: float) -> bool: + if not 0.0 <= true_prob <= 1.0: + raise ValueError("true_prob out of range") + # Use 16 bits of entropy to avoid floating-point quirks under replay. 
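+    # For example, true_prob=0.25 becomes a threshold of 0x4000: the masked
+    # draw is uniform over 0..65535, so the comparison passes for 16384 of
+    # the 65536 possible values, i.e. exactly 25%.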
+ return (random_u64() & 0xFFFF) < int(true_prob * 0x10000) diff --git a/test/antithesis/workload/test/helper_source_stats.py b/test/antithesis/workload/test/helper_source_stats.py new file mode 100644 index 0000000000000..54af7f0e29866 --- /dev/null +++ b/test/antithesis/workload/test/helper_source_stats.py @@ -0,0 +1,86 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Catchup polling against `mz_internal.mz_source_statistics`. + +Used by drivers to wait until a Kafka source has durably ingested at least +some target offset (typically the maximum produced offset). All durations are +budgeted; callers handle timeouts. +""" + +from __future__ import annotations + +import logging +import time + +from helper_pg import query_one_retry + +LOG = logging.getLogger("antithesis.helper_source_stats") + + +def offset_committed(source_name: str) -> int | None: + """Return the maximum offset_committed for `source_name`, or None. + + `mz_source_statistics.offset_committed` is the durably-ingested upstream + offset, aggregated across replicas in the view. Returns None if the + statistics row does not exist yet (very early in source lifetime) so + callers can distinguish "not initialized" from "still behind." + """ + row = query_one_retry( + """ + SELECT MAX(ss.offset_committed)::bigint + FROM mz_internal.mz_source_statistics ss + JOIN mz_sources s ON s.id = ss.id + WHERE s.name = %s + """, + (source_name,), + ) + if row is None or row[0] is None: + return None + return int(row[0]) + + +def wait_for_catchup( + source_name: str, + target_offset: int, + timeout_s: float = 60.0, + poll_interval_s: float = 0.5, +) -> bool: + """Wait until offset_committed for `source_name` reaches `target_offset`. + + Returns True if catchup completed within `timeout_s`, False on timeout. + """ + deadline = time.monotonic() + timeout_s + last_seen: int | None = None + while time.monotonic() < deadline: + observed = offset_committed(source_name) + if observed is not None and observed >= target_offset: + LOG.info( + "source %s caught up: observed=%d target=%d", + source_name, + observed, + target_offset, + ) + return True + if observed != last_seen: + LOG.info( + "source %s waiting for catchup: observed=%s target=%d", + source_name, + observed, + target_offset, + ) + last_seen = observed + time.sleep(poll_interval_s) + LOG.warning( + "source %s catchup timeout: observed=%s target=%d", + source_name, + last_seen, + target_offset, + ) + return False diff --git a/test/antithesis/workload/test/helper_table_mv.py b/test/antithesis/workload/test/helper_table_mv.py new file mode 100644 index 0000000000000..e865f3f2f5e89 --- /dev/null +++ b/test/antithesis/workload/test/helper_table_mv.py @@ -0,0 +1,64 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Idempotent setup for the Antithesis table + materialized view scaffolding. 
+ +Used by the `mv-reflects-source-updates` driver. The table holds rows with a +per-invocation `prefix` so concurrent driver instances scope to disjoint +groups, and the materialized view rolls those rows up by prefix: + + CREATE TABLE mv_input_table (id BIGINT NOT NULL, prefix TEXT NOT NULL); + CREATE MATERIALIZED VIEW mv_input_count AS + SELECT prefix, COUNT(*)::BIGINT AS row_count + FROM mv_input_table + GROUP BY prefix; + +Defining the MV on the local coordinator's table (rather than a Kafka +source) deliberately tests the end-to-end path independent of source +ingestion: dataflow rendering, persist write of the MV output, and +frontier advancement through compute. Source-side faults are still +exercised because the workload runs under the same fault-injection regime +as everything else. +""" + +from __future__ import annotations + +import logging +import os + +from helper_pg import execute_retry + +LOG = logging.getLogger("antithesis.helper_table_mv") + +CLUSTER = os.environ.get("MZ_ANTITHESIS_CLUSTER", "antithesis_cluster") + +TABLE_MV_INPUT = "mv_input_table" +MV_NAME = "mv_input_count" + + +def ensure_table_and_mv() -> None: + """Create the input table and the materialized view if absent. + + Both DDLs use IF NOT EXISTS so concurrent driver instances racing + through setup do not collide. The MV is created in the antithesis + cluster so dataflow execution is colocated with the rest of the + workload's compute. + """ + execute_retry( + f"CREATE TABLE IF NOT EXISTS {TABLE_MV_INPUT} " + f"(id BIGINT NOT NULL, prefix TEXT NOT NULL)" + ) + execute_retry( + f"CREATE MATERIALIZED VIEW IF NOT EXISTS {MV_NAME} " + f"IN CLUSTER {CLUSTER} AS " + f"SELECT prefix, COUNT(*)::BIGINT AS row_count " + f"FROM {TABLE_MV_INPUT} " + f"GROUP BY prefix" + ) + LOG.info("table %s and MV %s ready", TABLE_MV_INPUT, MV_NAME) diff --git a/test/antithesis/workload/test/helper_upsert_source.py b/test/antithesis/workload/test/helper_upsert_source.py new file mode 100644 index 0000000000000..6fac93cdd4f24 --- /dev/null +++ b/test/antithesis/workload/test/helper_upsert_source.py @@ -0,0 +1,59 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Idempotent setup for the Antithesis UPSERT-envelope Kafka source. + +Used by all drivers that exercise UPSERT semantics. The topic is pre-created +via the Kafka admin client (broker auto-create only triggers on producer +write, but CREATE SOURCE does a metadata fetch that fails fast otherwise). +The source/connection are created at most once across all drivers +(CREATE ... IF NOT EXISTS). 
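+
+A sketch of the intended driver pattern (key and value are illustrative;
+helper_kafka and helper_source_stats are assumed to be imported):
+
+    ensure_upsert_text_source()
+    producer, tracker = helper_kafka.make_producer()
+    producer.produce(TOPIC_UPSERT_TEXT, key=b"k1", value=b"v1",
+                     on_delivery=tracker.callback)
+    producer.flush(timeout=30)
+    helper_source_stats.wait_for_catchup(
+        SOURCE_UPSERT_TEXT, tracker.topic_max_offset(TOPIC_UPSERT_TEXT))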
+""" + +from __future__ import annotations + +import logging +import os + +from helper_kafka import ensure_topic +from helper_pg import create_source_idempotent, execute_retry + +LOG = logging.getLogger("antithesis.helper_upsert_source") + +KAFKA_BROKER = os.environ.get("KAFKA_BROKER", "kafka:9092") +CLUSTER = os.environ.get("MZ_ANTITHESIS_CLUSTER", "antithesis_cluster") + +CONNECTION_NAME = "antithesis_kafka_conn" +TOPIC_UPSERT_TEXT = "antithesis-upsert-text" +SOURCE_UPSERT_TEXT = "upsert_text_src" + + +def ensure_kafka_connection() -> None: + execute_retry( + f"CREATE CONNECTION IF NOT EXISTS {CONNECTION_NAME} " + f"TO KAFKA (BROKER '{KAFKA_BROKER}', SECURITY PROTOCOL = 'PLAINTEXT')" + ) + + +def ensure_upsert_text_source() -> None: + """Create the upsert-envelope source over a text key/value Kafka topic. + + The resulting source has columns `key TEXT NOT NULL` and `text TEXT`. + """ + ensure_kafka_connection() + ensure_topic(TOPIC_UPSERT_TEXT) + create_source_idempotent( + f"CREATE SOURCE IF NOT EXISTS {SOURCE_UPSERT_TEXT} " + f"IN CLUSTER {CLUSTER} " + f"FROM KAFKA CONNECTION {CONNECTION_NAME} (TOPIC '{TOPIC_UPSERT_TEXT}') " + f"KEY FORMAT TEXT VALUE FORMAT TEXT " + f"ENVELOPE UPSERT", + SOURCE_UPSERT_TEXT, + ) + LOG.info("upsert source %s ready (topic=%s)", SOURCE_UPSERT_TEXT, TOPIC_UPSERT_TEXT) diff --git a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py new file mode 100755 index 0000000000000..9c3c0e2461cbe --- /dev/null +++ b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for the inverse-pair NONE-envelope properties: + - `kafka-source-no-data-loss` — every produced (partition, offset) is visible + - `kafka-source-no-data-duplication` — no (partition, offset) appears twice + +The two run on the same dataflow because they are the symmetric failure modes +of the same contract: one says "no row gone missing," the other says "no row +duplicated." Settling once and asserting both halves catches both bugs from +the same produce pass. + +Each invocation: + 1. Ensures the NONE-envelope source exists. + 2. Picks a per-invocation prefix so concurrent driver instances scope to + disjoint payloads. Every produced message has a `:` prefix so the + workload can filter the source down to its own rows when asserting. + 3. Produces N distinct payloads, recording the broker-assigned `(partition, + offset)` for each via the delivery callback. + 4. Requests an Antithesis quiet period and waits for `offset_committed` + to reach the highest produced offset. + 5. Runs two `assert_always` checks: + - "kafka source: no duplicate (partition, offset)" — `GROUP BY 1, 2 HAVING COUNT(*) > 1` is empty + - "kafka source: every produced payload is visible exactly once" — + fires per produced payload; payload, presence, and observed count + go into `details` so triage can localize which payloads went missing + or duplicated + 6. Records one `assert_sometimes` liveness anchor confirming the safety + checks ran against settled data. 
+ +This is a `parallel_driver_` — many concurrent instances exercise the source +without colliding because each invocation owns its prefix range. +""" + +from __future__ import annotations + +import logging +import sys + +import helper_random +from helper_kafka import make_producer +from helper_none_source import ( + SOURCE_NONE_TEXT, + TOPIC_NONE_TEXT, + ensure_none_text_source, +) +from helper_pg import query_retry +from helper_quiet import request_quiet_period +from helper_source_stats import wait_for_catchup + +from antithesis.assertions import always, sometimes + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.kafka_none_envelope") + +# Knobs. Tuned so each invocation is a small, self-contained unit of work +# — Antithesis launches the driver many times and accumulates coverage +# across invocations, not within one giant batch. +PRODUCES_PER_INVOCATION = 50 +QUIET_PERIOD_S = 20 +CATCHUP_TIMEOUT_S = 60.0 + + +def main() -> int: + ensure_none_text_source() + + prefix = f"p{helper_random.random_u64():016x}" + LOG.info("driver starting; prefix=%s", prefix) + + producer, tracker = make_producer(client_id=f"antithesis-none-{prefix}") + + # The set of payloads we attempted to produce. Each is unique to + # (prefix, index) so we can filter the source on `text LIKE prefix:%` + # and join payloads back to (partition, offset) without tracking them + # at produce time. + expected_payloads: set[str] = set() + for i in range(PRODUCES_PER_INVOCATION): + payload = f"{prefix}:{i:06d}" + producer.produce( + topic=TOPIC_NONE_TEXT, + value=payload.encode("utf-8"), + on_delivery=tracker.callback, + ) + expected_payloads.add(payload) + producer.poll(0) + + pending = producer.flush(timeout=30) + if pending > 0 or tracker.last_error is not None: + # Same fail-closed pattern as the upsert driver: under sustained + # fault injection we cannot prove which messages Kafka accepted, so + # the expected set may name payloads the source never saw. Bail + # before running safety assertions. + LOG.info( + "skipping assertions: producer.flush pending=%d last_error=%s", + pending, + tracker.last_error, + ) + return 0 + + max_produced = tracker.topic_max_offset(TOPIC_NONE_TEXT) + if max_produced < 0: + LOG.info("no messages confirmed delivered this invocation; exiting cleanly") + return 0 + + # Each payload is unique to this invocation (prefix:NNNNNN), so the + # source query below joins payloads back to (partition, offset) + # assignments without us needing to track them at produce time. + + request_quiet_period(QUIET_PERIOD_S) + caught_up = wait_for_catchup( + SOURCE_NONE_TEXT, max_produced, timeout_s=CATCHUP_TIMEOUT_S + ) + + sometimes( + caught_up, + "kafka source caught up to produced offsets after quiet period (none envelope)", + {"source": SOURCE_NONE_TEXT, "target_offset": max_produced}, + ) + + if not caught_up: + LOG.info("catchup did not complete in budget; skipping per-payload assertions") + return 0 + + # ----- no-data-duplication ----- + # `GROUP BY partition, "offset" HAVING COUNT(*) > 1` filtered to this + # invocation's payloads. The catalog's `kafka-source-no-data-duplication` + # property names this exact query shape. 
+ dup_rows = query_retry( + f""" + SELECT partition, "offset", COUNT(*)::bigint + FROM {SOURCE_NONE_TEXT} + WHERE text LIKE %s + GROUP BY 1, 2 + HAVING COUNT(*) > 1 + """, + (f"{prefix}:%",), + ) + always( + len(dup_rows) == 0, + "kafka source: no duplicate (partition, offset)", + { + "source": SOURCE_NONE_TEXT, + "prefix": prefix, + "dupe_count": len(dup_rows), + # Carry up to a handful of offending rows for triage. + "examples": [ + {"partition": int(p), "offset": int(o), "count": int(c)} + for (p, o, c) in dup_rows[:5] + ], + }, + ) + + # ----- no-data-loss ----- + # Confirm every payload we produced is visible *exactly once*. We do this + # via a left-join: enumerate produced payloads, ask the source for each. + # An always-pass requires every produced payload to map to exactly one + # source row whose `text` matches. + # + # We batch all payloads into one query rather than one round-trip per + # payload, so the assertion fires once per payload but the SQL cost + # stays bounded. + rows = query_retry( + f""" + SELECT text, partition, "offset", COUNT(*)::bigint + FROM {SOURCE_NONE_TEXT} + WHERE text LIKE %s + GROUP BY 1, 2, 3 + """, + (f"{prefix}:%",), + ) + by_payload: dict[str, tuple[int, int, int]] = {} + for text, partition, offset, count in rows: + by_payload[text] = (int(partition), int(offset), int(count)) + + for payload in expected_payloads: + info = by_payload.get(payload) + present = info is not None + count = info[2] if info else 0 + always( + present and count == 1, + "kafka source: every produced payload is visible exactly once", + { + "source": SOURCE_NONE_TEXT, + "prefix": prefix, + "payload": payload, + "present": present, + "observed_count": count, + }, + ) + + LOG.info( + "driver done; asserted no-dupe + per-payload visibility on %d produced payloads", + len(expected_payloads), + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py b/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py new file mode 100755 index 0000000000000..c026be09ea522 --- /dev/null +++ b/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for `mv-reflects-source-updates`. + +End-to-end user-visible property: after data is written to an upstream +collection, materialized views that depend on that collection eventually +reflect the new data. Materialize's headline value proposition. + +This driver uses a TABLE (not a Kafka source) so the property is exercised +independent of source ingestion: the test path is INSERT -> coordinator +group_commit -> persist write of the table -> MV's compute dataflow -> +persist write of the MV output -> SELECT. Kafka-source-specific liveness +is covered by the other Kafka-source drivers. + +Each invocation: + 1. Ensures `mv_input_table` + materialized view `mv_input_count` exist. + 2. Picks a per-invocation prefix so concurrent driver instances scope to + disjoint MV rows. + 3. INSERTs N rows tagged with the prefix. + 4. 
Requests an Antithesis quiet period and polls the MV until the count + for the prefix equals N. + 5. Asserts: + - `always(...)` the MV count matches what was inserted (no over- or + under-counting after settle). + - `sometimes(...)` the catchup completed within the budget (the + liveness anchor — without this, the always check could be vacuous + on a slow-catchup invocation). + +This is a `parallel_driver_` — many concurrent instances exercise the MV +without colliding because each invocation owns its prefix range. +""" + +from __future__ import annotations + +import logging +import sys +import time + +import helper_random +from helper_pg import execute_retry, query_one_retry +from helper_quiet import request_quiet_period +from helper_table_mv import MV_NAME, TABLE_MV_INPUT, ensure_table_and_mv + +from antithesis.assertions import always, sometimes + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.mv_reflects_table_updates") + +INSERTS_PER_INVOCATION = 40 +QUIET_PERIOD_S = 20 +CATCHUP_TIMEOUT_S = 60.0 +CATCHUP_POLL_INTERVAL_S = 0.5 + + +def _mv_count_for_prefix(prefix: str) -> int | None: + """Return the row_count the MV currently reports for `prefix`, or None. + + None means "no row exists for that prefix yet" — distinct from zero, + which the MV would not produce for the `count(*)`+`group by` shape (a + fully-deleted prefix would not appear at all). + """ + row = query_one_retry( + f"SELECT row_count::bigint FROM {MV_NAME} WHERE prefix = %s", + (prefix,), + ) + if row is None: + return None + return int(row[0]) + + +def main() -> int: + ensure_table_and_mv() + + prefix = f"p{helper_random.random_u64():016x}" + LOG.info("mv driver starting; prefix=%s", prefix) + + # Insert N rows tagged with the prefix. We batch into a single statement + # so the coordinator processes them as one group_commit, which keeps the + # workload-visible target offset for catchup well-defined (otherwise a + # mid-insert crash would split the row count and the MV would catch up + # to "some" count rather than exactly N). + placeholders = ", ".join(["(%s, %s)"] * INSERTS_PER_INVOCATION) + params: list[object] = [] + for i in range(INSERTS_PER_INVOCATION): + params.extend([i, prefix]) + execute_retry( + f"INSERT INTO {TABLE_MV_INPUT} (id, prefix) VALUES {placeholders}", + params, + ) + + request_quiet_period(QUIET_PERIOD_S) + + # Poll the MV until the row_count for this prefix reaches N. The MV's + # `COUNT(*) GROUP BY prefix` shape means the row for this prefix may + # appear partially populated during the catchup window. + deadline = time.monotonic() + CATCHUP_TIMEOUT_S + observed = _mv_count_for_prefix(prefix) + while observed != INSERTS_PER_INVOCATION and time.monotonic() < deadline: + time.sleep(CATCHUP_POLL_INTERVAL_S) + observed = _mv_count_for_prefix(prefix) + + caught_up = observed == INSERTS_PER_INVOCATION + + sometimes( + caught_up, + "mv: row_count caught up to inserted count after quiet period", + { + "mv": MV_NAME, + "table": TABLE_MV_INPUT, + "prefix": prefix, + "expected": INSERTS_PER_INVOCATION, + "observed": observed, + }, + ) + + if not caught_up: + LOG.info( + "catchup did not complete in budget; skipping safety assertion " + "(observed=%s expected=%d)", + observed, + INSERTS_PER_INVOCATION, + ) + return 0 + + # Safety check: the MV must report exactly the inserted count. 
A + # higher count would be double-counting (corruption); a lower count + # at this point would mean the catchup poll above gave us a stale + # read between observations, which is itself a correctness bug worth + # surfacing. + always( + observed == INSERTS_PER_INVOCATION, + "mv: row_count equals inserted count for prefix after settle", + { + "mv": MV_NAME, + "table": TABLE_MV_INPUT, + "prefix": prefix, + "expected": INSERTS_PER_INVOCATION, + "observed": observed, + }, + ) + + LOG.info( + "mv driver done; inserted=%d mv_count=%s prefix=%s", + INSERTS_PER_INVOCATION, + observed, + prefix, + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/antithesis/workload/test/parallel_driver_mysql_cdc.py b/test/antithesis/workload/test/parallel_driver_mysql_cdc.py new file mode 100644 index 0000000000000..67a9627e1e386 --- /dev/null +++ b/test/antithesis/workload/test/parallel_driver_mysql_cdc.py @@ -0,0 +1,223 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for property `mysql-source-no-data-loss`. + +Every row inserted to the MySQL primary must eventually appear — with the +correct value — in the Materialize CDC source that reads from the +multithreaded replica. + +Each invocation: + 1. Checks the MySQL CDC source exists (created by first_mysql_replica_setup). + 2. Picks a per-invocation `batch_id` prefix so concurrent drivers don't + collide. + 3. Inserts ROWS_PER_INVOCATION rows to the MySQL primary, recording the + expected {id → value} map locally. + 4. Requests an Antithesis quiet period and polls the Materialize source + table until all expected rows appear (or the budget expires). + 5. Asserts correctness via `always(...)` on count and per-row values. + A `sometimes(...)` liveness anchor fires on successful catchup. + +This is a `parallel_driver_` — Antithesis runs many concurrent instances. +Each assigns itself a fresh prefix from the Antithesis-seeded RNG so +parallel drivers exercise the MySQL CDC path simultaneously without +interfering with each other's expected-state model. +""" + +from __future__ import annotations + +import logging +import sys +import time + +import helper_mysql +import helper_random +from helper_mysql_source import SOURCE_NAME, TABLE_NAME +from helper_pg import query_retry +from helper_quiet import request_quiet_period + +from antithesis.assertions import always, sometimes + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.mysql_cdc") + +ROWS_PER_INVOCATION = 20 +QUIET_PERIOD_S = 25 +CATCHUP_TIMEOUT_S = 90.0 +POLL_INTERVAL_S = 1.0 + + +def _source_exists() -> bool: + rows = query_retry("SELECT 1 FROM mz_sources WHERE name = %s", (SOURCE_NAME,)) + return bool(rows) + + +def _insert_rows(batch_id: str) -> dict[str, str]: + """Insert ROWS_PER_INVOCATION rows to the MySQL primary. + + Returns {id → value} for every successfully inserted row. 
+ """ + expected: dict[str, str] = {} + for i in range(ROWS_PER_INVOCATION): + row_id = f"{batch_id}:{i}" + value = f"v{helper_random.random_int(0, 9999):04d}" + try: + helper_mysql.execute_primary( + "INSERT INTO antithesis.cdc_test (id, batch_id, value) " + "VALUES (%s, %s, %s) " + "ON DUPLICATE KEY UPDATE value = VALUES(value), batch_id = VALUES(batch_id)", + (row_id, batch_id, value), + database="antithesis", + ) + expected[row_id] = value + except Exception as exc: # noqa: BLE001 + # Under fault injection a write to the primary may fail. Skip the + # row rather than crashing so the driver keeps inserting others. + LOG.info("insert failed for row %s: %s; skipping", row_id, exc) + return expected + + +def _wait_for_catchup(batch_id: str, expected_count: int) -> bool: + """Poll Materialize until all expected rows for `batch_id` appear. + + Returns True when `COUNT(*) WHERE batch_id = ?` reaches expected_count, + False on timeout. + """ + deadline = time.monotonic() + CATCHUP_TIMEOUT_S + last_seen = -1 + while time.monotonic() < deadline: + try: + rows = query_retry( + f"SELECT COUNT(*)::bigint FROM {TABLE_NAME} WHERE batch_id = %s", + (batch_id,), + ) + count = int(rows[0][0]) if rows and rows[0][0] is not None else 0 + except Exception as exc: # noqa: BLE001 + LOG.info("catchup poll failed: %s; retrying", exc) + time.sleep(POLL_INTERVAL_S) + continue + + if count != last_seen: + LOG.info( + "mysql cdc catchup: batch=%s observed=%d target=%d", + batch_id, + count, + expected_count, + ) + last_seen = count + + if count >= expected_count: + return True + time.sleep(POLL_INTERVAL_S) + + LOG.warning( + "mysql cdc catchup timeout: batch=%s last_seen=%d target=%d", + batch_id, + last_seen, + expected_count, + ) + return False + + +def _check_rows(expected: dict[str, str]) -> None: + """Assert every expected row has the correct value in the Materialize source.""" + for row_id, want in expected.items(): + rows = query_retry( + f"SELECT value FROM {TABLE_NAME} WHERE id = %s", + (row_id,), + ) + found = bool(rows) + observed = rows[0][0] if found else None + always( + found and observed == want, + "mysql: CDC source row has correct value after catchup", + { + "source": TABLE_NAME, + "id": row_id, + "expected_value": want, + "observed_present": found, + "observed_value": observed, + }, + ) + + +def main() -> int: + if not _source_exists(): + # first_mysql_replica_setup must run before this driver. Outside + # Antithesis (e.g. snouty validate) the source may not exist yet — + # exit cleanly rather than erroring so validate can still proceed. + LOG.warning( + "mysql cdc source %s not found; skipping " + "(first_mysql_replica_setup must run first)", + SOURCE_NAME, + ) + return 0 + + batch_id = f"p{helper_random.random_u64():016x}" + LOG.info("driver starting; batch_id=%s", batch_id) + + expected = _insert_rows(batch_id) + if not expected: + LOG.info("no rows inserted successfully this invocation; exiting cleanly") + return 0 + + LOG.info("inserted %d rows; requesting quiet period", len(expected)) + request_quiet_period(QUIET_PERIOD_S) + + caught_up = _wait_for_catchup(batch_id, len(expected)) + + # Liveness anchor: at least one invocation should fully catch up. If this + # never fires across an entire run the safety assertions below are vacuous. 
+ sometimes( + caught_up, + "mysql: CDC source caught up to all primary inserts after quiet period", + { + "source": TABLE_NAME, + "batch_id": batch_id, + "rows_inserted": len(expected), + }, + ) + + if not caught_up: + # Don't run per-row safety assertions on stale data — a slow catchup + # is a separate concern from row-level correctness. + LOG.info("catchup did not complete in budget; skipping per-row assertions") + return 0 + + # Safety: every row we inserted must be present with the correct value. + _check_rows(expected) + + # Count-level safety check: no extra rows for our batch_id should exist. + rows = query_retry( + f"SELECT COUNT(*)::bigint FROM {TABLE_NAME} WHERE batch_id = %s", + (batch_id,), + ) + count_in_mz = int(rows[0][0]) if rows and rows[0][0] is not None else 0 + always( + count_in_mz == len(expected), + "mysql: CDC source row count matches inserted count after catchup", + { + "source": TABLE_NAME, + "batch_id": batch_id, + "expected_count": len(expected), + "observed_count": count_in_mz, + }, + ) + + LOG.info( + "driver done; asserted on %d rows for batch_id=%s", len(expected), batch_id + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py b/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py new file mode 100755 index 0000000000000..c4af73b434635 --- /dev/null +++ b/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for `strict-serializable-reads`. + +Materialize's headline consistency guarantee: two reads on the same +collection at oracle-assigned timestamps t1 < t2 must observe consistent +ordering — anything visible at t1 must remain visible at t2. This driver +exercises the cross-read half of that property: a sequence of fresh- +connection reads against a materialized view, interleaved with writes, +must yield a non-decreasing count. + +Approach: + 1. Reuse `helper_table_mv` (table `mv_input_table` + MV `mv_input_count`) + so this driver does not introduce new schema. Each invocation owns a + fresh prefix so concurrent driver instances scope to disjoint rows. + 2. For each step k = 1..N: + - INSERT one row tagged with the prefix in autocommit mode (each + insert is its own oracle-timestamped write). + - Open a *fresh* psycopg connection, set `transaction_isolation` + to `strict serializable` explicitly, and SELECT the MV's row + count for the prefix. Record (k, observed_count). + - Fresh connections are deliberate: a single long-lived connection + could mask a read-regression bug behind connection-local caching. + 3. After all steps, run one more fresh-connection SELECT as the final + observation. + 4. Assertions: + - `always(count[k+1] >= count[k], …)` between every adjacent pair + of recorded reads — the core strict-serializable read ordering + invariant. + - `always(final >= max(count), …)` for the closing observation. + - `sometimes(...)` liveness anchor confirming the closing + observation reached the inserted count after the quiet period. 
+ +Read failures (connect timeout, server unavailable mid-fault) are skipped +rather than recorded — they are not regression evidence, and a False +positive on transient unavailability would obscure real bugs. + +This is a `parallel_driver_` — many concurrent instances run because the +property is about read monotonicity *within* each client's observation +stream, and prefix-scoping isolates each instance's expected count. +""" + +from __future__ import annotations + +import logging +import os +import sys +import time + +import helper_random +import psycopg +from helper_pg import ( + PGDATABASE, + PGHOST, + PGPORT, + PGUSER, + execute_retry, +) +from helper_quiet import request_quiet_period +from helper_table_mv import MV_NAME, TABLE_MV_INPUT, ensure_table_and_mv + +from antithesis.assertions import always, sometimes + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.strict_serializable_reads") + +STEPS_PER_INVOCATION = 12 +QUIET_PERIOD_S = 15 +FINAL_READ_TIMEOUT_S = 30.0 +FINAL_READ_POLL_S = 0.5 +PROBE_CONNECT_TIMEOUT_S = 5 + + +def _fresh_select_count(prefix: str) -> int | None: + """Open a *new* connection, force strict serializable, and SELECT the + MV's row_count for `prefix`. Returns None on any connect/query failure + so the caller can skip the observation without conflating fault-induced + unavailability with a read regression. + + Setting `transaction_isolation` explicitly costs one extra round trip + but defends against future changes to the system default. + """ + try: + with psycopg.connect( + host=PGHOST, + port=PGPORT, + user=PGUSER, + dbname=PGDATABASE, + connect_timeout=PROBE_CONNECT_TIMEOUT_S, + autocommit=True, + ) as conn, conn.cursor() as cur: + cur.execute("SET transaction_isolation TO 'strict serializable'") + cur.execute( + f"SELECT row_count::bigint FROM {MV_NAME} WHERE prefix = %s", + (prefix,), + ) + row = cur.fetchone() + except Exception: # noqa: BLE001 + return None + if row is None: + return 0 # MV has no row for this prefix yet + return int(row[0]) + + +def main() -> int: + ensure_table_and_mv() + + prefix = f"p{helper_random.random_u64():016x}" + LOG.info("strict-serializable driver starting; prefix=%s", prefix) + + # Sequence of (step_index, observed_count). Reads that failed are + # represented as None and dropped before assertions. + observations: list[tuple[int, int]] = [] + + for step in range(1, STEPS_PER_INVOCATION + 1): + # Each INSERT is one autocommit write; the coordinator stamps it + # with an oracle timestamp. We INSERT before the read so the + # *expected* monotone behaviour is that every read is >= the + # previous one and the final read equals the total insert count + # (modulo catchup; covered by the liveness anchor below). + try: + execute_retry( + f"INSERT INTO {TABLE_MV_INPUT} (id, prefix) VALUES (%s, %s)", + (step, prefix), + ) + except Exception as exc: # noqa: BLE001 + # Persistent insert failure under sustained fault — bail. + # Already-recorded observations are still valid evidence for + # the monotonicity assertion below. + LOG.info("step %d: insert failed (%s); ending step loop", step, exc) + break + + observed = _fresh_select_count(prefix) + if observed is None: + # Fault-window read; skip. We do NOT record it so the + # adjacent-pair assertion below doesn't see a spurious zero. + continue + observations.append((step, observed)) + + # Settle and take the closing observation. 
The driver is short and the + # observations list is small, so a generous timeout here is fine. + request_quiet_period(QUIET_PERIOD_S) + expected_final = len(observations) and observations[-1][0] + # `expected_final` is the largest step that was actually INSERTed (we + # may have bailed early). It's an *upper bound* on the count — the + # final count may equal it (fully caught up) or be slightly less + # (catchup still in flight). The monotonicity assertion only cares + # that final >= every earlier observation. + + deadline = time.monotonic() + FINAL_READ_TIMEOUT_S + final: int | None = _fresh_select_count(prefix) + while final is None and time.monotonic() < deadline: + time.sleep(FINAL_READ_POLL_S) + final = _fresh_select_count(prefix) + + sometimes( + final is not None and final == expected_final, + "strict-serializable reads: final fresh-connection read reached inserted count", + { + "prefix": prefix, + "expected_final": expected_final, + "final_observed": final, + "observations": len(observations), + }, + ) + + # ----- monotonicity: adjacent-pair assertion ----- + # Across the recorded fresh-connection reads, no read may regress. + # This is the strict-serializable read-ordering property. + for i in range(1, len(observations)): + prev_step, prev_count = observations[i - 1] + curr_step, curr_count = observations[i] + always( + curr_count >= prev_count, + "strict-serializable reads: fresh-connection read regressed across adjacent observations", + { + "prefix": prefix, + "prev_step": prev_step, + "prev_count": prev_count, + "curr_step": curr_step, + "curr_count": curr_count, + }, + ) + + # ----- monotonicity: closing observation dominates the maximum ----- + # If the closing observation succeeded, it must be >= every earlier + # observation. (The final equality with `expected_final` is covered by + # the `sometimes` liveness anchor above and is not asserted here.) + if final is not None and observations: + max_observed = max(c for _, c in observations) + always( + final >= max_observed, + "strict-serializable reads: closing fresh-connection read regressed below earlier maximum", + { + "prefix": prefix, + "final": final, + "max_earlier": max_observed, + }, + ) + + LOG.info( + "strict-serializable driver done; observations=%d final=%s expected_final=%s", + len(observations), + final, + expected_final, + ) + return 0 + + +if __name__ == "__main__": + # Touch the imported env constants so static analysis treats them as + # used; helper_pg re-exports them for drivers that bypass its retry + # helpers (as this one does for fresh connections). + _ = (PGHOST, PGPORT, PGUSER, PGDATABASE, os) + sys.exit(main()) diff --git a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py new file mode 100755 index 0000000000000..066620aaf6ded --- /dev/null +++ b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for property `upsert-key-reflects-latest-value`. 
+ +For each key produced to a Kafka UPSERT-envelope source, after a quiet period +that lets Materialize catch up, the source's row for that key must reflect the +last value produced — or be absent if the last message was a tombstone. + +Each invocation: + 1. Ensures the upsert source exists (idempotent CREATE ... IF NOT EXISTS). + 2. Picks a per-invocation key prefix so concurrent driver instances don't + interfere with each other's expected-state model. + 3. Produces a deterministic mix of upserts and tombstones, tracking the + local "what should the source say" model. + 4. Requests an Antithesis quiet period and waits for offset_committed to + reach the highest produced offset. + 5. For every tracked key, asserts that what's in the source matches the + local model. Live keys use one assertion message, tombstoned keys use + another, so triage can distinguish the two failure modes. + +This is a `parallel_driver_` — Antithesis runs many concurrent instances and +each one assigns itself a fresh prefix from deterministic randomness, so +multiple drivers exercise the source without colliding. +""" + +from __future__ import annotations + +import logging +import sys + +import helper_random +from helper_kafka import make_producer +from helper_pg import query_one_retry +from helper_quiet import request_quiet_period +from helper_source_stats import wait_for_catchup +from helper_upsert_source import ( + SOURCE_UPSERT_TEXT, + TOPIC_UPSERT_TEXT, + ensure_upsert_text_source, +) + +from antithesis.assertions import always, sometimes + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.upsert_latest_value") + +# Knobs. Kept small per-invocation because Antithesis launches the driver many +# times; total coverage comes from re-invocations, not from one huge run. +PRODUCES_PER_INVOCATION = 40 +DISTINCT_KEYS = 8 # small key space so we re-write the same key often +DISTINCT_VALUES = 16 +TOMBSTONE_PROB = 0.15 + +QUIET_PERIOD_S = 20 +CATCHUP_TIMEOUT_S = 60.0 + + +def _produce(producer, tracker, topic: str, key: str, value: str | None) -> None: + """Encode value=None as a Kafka tombstone (null payload).""" + payload = None if value is None else value.encode("utf-8") + producer.produce( + topic=topic, + key=key.encode("utf-8"), + value=payload, + on_delivery=tracker.callback, + ) + + +def _select_value_for_key(key: str) -> tuple[bool, str | None]: + """Return (found, value) for the single source row matching `key`. + + Returns (False, None) when no row exists (the tombstone case for an + UPSERT source). Returns (True, value) when exactly one row exists. + Raises if more than one row exists — that would mean the source is + multi-rowed per key and violates the UPSERT contract itself, which is + out of scope for this property and should be caught by + `kafka-source-no-data-duplication`. + """ + row = query_one_retry( + f"SELECT count(*)::bigint, max(text) FROM {SOURCE_UPSERT_TEXT} WHERE key = %s", + (key,), + ) + if row is None: + return False, None + count, value = row + if count == 0: + return False, None + if count != 1: + raise RuntimeError( + f"upsert source has {count} rows for key {key!r}; this driver assumes " + "the per-key uniqueness property holds" + ) + return True, value + + +def main() -> int: + ensure_upsert_text_source() + + # Per-invocation prefix isolates this driver's keys from other concurrent + # drivers and from previous invocations of this same driver. 
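+    # (Illustrative assumption, not a claim about the helper: helper_random
+    # is taken to wrap the Antithesis SDK's deterministic entropy, e.g.
+    # random_u64() delegating to something like antithesis.random.get_random(),
+    # so a replayed run reproduces the same prefixes and key choices.)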
+ prefix = f"p{helper_random.random_u64():016x}" + LOG.info("driver starting; prefix=%s", prefix) + + producer, tracker = make_producer(client_id=f"antithesis-{prefix}") + + # Local "what should the source say" model for this invocation's keys. + # Value of None means "the last message was a tombstone". + expected: dict[str, str | None] = {} + + # Count of times we tombstoned a key whose immediately-prior produced + # value was a live value (not absent, not already tombstoned). This is + # the exact `upsert-tombstone-removes-key` exercise pattern: the + # interesting case is "remove a row that was just there," not "tombstone + # a key we never wrote to." + tombstoned_after_value = 0 + + keys = [f"{prefix}-k{i}" for i in range(DISTINCT_KEYS)] + for _ in range(PRODUCES_PER_INVOCATION): + key = helper_random.random_choice(keys) + if helper_random.random_bool(TOMBSTONE_PROB): + if expected.get(key) is not None: + tombstoned_after_value += 1 + _produce(producer, tracker, TOPIC_UPSERT_TEXT, key, None) + expected[key] = None + else: + value = f"v{helper_random.random_int(0, DISTINCT_VALUES - 1):04d}" + _produce(producer, tracker, TOPIC_UPSERT_TEXT, key, value) + expected[key] = value + producer.poll(0) + + # Flush all pending deliveries. We poll callbacks while flushing so the + # tracker reflects the true max produced offset. + pending = producer.flush(timeout=30) + if pending > 0 or tracker.last_error is not None: + # Under sustained fault injection we cannot prove which of the just- + # produced messages Kafka actually accepted, so `expected` may name + # values the source never sees. Bail out before running safety + # assertions — fault-induced delivery loss is not what this property + # is testing. The catchup `sometimes()` is also skipped because we + # have no trustworthy target offset. + LOG.info( + "skipping assertions: producer.flush pending=%d last_error=%s", + pending, + tracker.last_error, + ) + return 0 + + max_produced = tracker.topic_max_offset(TOPIC_UPSERT_TEXT) + if max_produced < 0: + LOG.info("no messages confirmed delivered this invocation; exiting cleanly") + return 0 + + # Now ask Antithesis to pause faults and wait for Materialize to catch up. + request_quiet_period(QUIET_PERIOD_S) + caught_up = wait_for_catchup( + SOURCE_UPSERT_TEXT, max_produced, timeout_s=CATCHUP_TIMEOUT_S + ) + + # Liveness signal: at least one invocation should reach catchup. If this + # never fires across an entire run, the safety assertions below would be + # vacuous and the run is uninteresting. + sometimes( + caught_up, + "upsert: source caught up to produced offsets after quiet period", + {"source": SOURCE_UPSERT_TEXT, "target_offset": max_produced}, + ) + + if not caught_up: + # Don't run the per-key safety assertions on stale data — that would + # blame the property for a slow catchup that's a separate concern. + LOG.info("catchup did not complete in budget; skipping per-key assertions") + return 0 + + # Per-key safety assertions. Two distinct messages so triage reports tell + # us *which* invariant broke: a value mismatch or a tombstone resurrection. + for key, want in expected.items(): + found, observed = _select_value_for_key(key) + + if want is None: + # The last produced message for this key was a tombstone; the + # source must not contain a row for it. + always( + not found, + "upsert: tombstoned key has no row in source", + { + "source": SOURCE_UPSERT_TEXT, + "key": key, + "observed_value": observed, + }, + ) + else: + # Live key: there must be exactly one row, with the latest value. 
+ always( + found and observed == want, + "upsert: SELECT for key matches latest produced value", + { + "source": SOURCE_UPSERT_TEXT, + "key": key, + "expected_value": want, + "observed_present": found, + "observed_value": observed, + }, + ) + + # Liveness anchor for `upsert-tombstone-removes-key`: confirms the + # interesting tombstone path (tombstone replacing a live value) was + # exercised at least once during the run. Without this, the + # `always(not found, "upsert: tombstoned key has no row in source", ...)` + # check above might fire only against keys that were never live. + sometimes( + tombstoned_after_value > 0, + "upsert: tombstone overwrote a live value at least once this invocation", + { + "tombstoned_after_value": tombstoned_after_value, + "produces": PRODUCES_PER_INVOCATION, + }, + ) + + LOG.info("driver done; asserted on %d keys", len(expected)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py b/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py new file mode 100755 index 0000000000000..59385a59a7ac7 --- /dev/null +++ b/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for `catalog-recovery-consistency`. + +After environmentd crashes and restarts, the catalog state must be +consistent with what was committed pre-crash: every previously-acknowledged +DDL operation must remain visible, and the catalog upper must not regress. +The user-visible form of this property is: "if I created a table and +received an OK, the table is still there after a restart." + +Approach mirrors `singleton_driver_upsert_state_rehydration.py`: + - One `singleton_driver_` per timeline, long enough to span multiple + Antithesis-injected environmentd restarts. + - In-process `expected_tables: set[str]` model holds the authoritative + "what should be in the catalog right now" view. + - Per cycle, do some DDL (CREATE TABLE or DROP TABLE), then open a + *fresh* psycopg connection and SELECT from `mz_tables` scoped to the + driver's namespace, asserting the live catalog matches `expected`. + - Cross-cycle stability is the recovery check: if an environmentd + restart lands between cycle N and cycle N+1, cycle N+1's read is the + post-recovery snapshot and the assertion catches any lost or stuck + DDL. + +`helper_pg.execute_retry` retries OperationalError transparently, so when +environmentd is down mid-DDL the call will block-and-retry until the next +incarnation is reachable. That's exactly the timing we want: the DDL +either committed pre-crash (in which case it must reappear post-recovery) +or never committed (in which case we record it failed and update the +local model). When the retry budget elapses before recovery, we abandon +that cycle's DDL without updating the local model — fault windows +exceeding the budget are *not* property failures. 
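+
+A minimal sketch of what that retry wrapper might look like, for
+orientation only (the real implementation lives in `helper_pg` and may
+differ; the signature, budget, and sleep interval below are assumptions):
+
+    # Illustrative only: block-and-retry on OperationalError, re-raise
+    # once the budget elapses. Uses the module-level psycopg/time imports
+    # and the PG* connection constants this driver already pulls in.
+    def execute_retry(sql, params=None, budget_s=60.0):
+        deadline = time.monotonic() + budget_s
+        while True:
+            try:
+                with psycopg.connect(
+                    host=PGHOST, port=PGPORT, user=PGUSER,
+                    dbname=PGDATABASE, autocommit=True,
+                ) as conn:
+                    conn.execute(sql, params)
+                    return
+            except psycopg.OperationalError:
+                if time.monotonic() >= deadline:
+                    raise
+                time.sleep(1.0)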
+ +Two corroborating `sometimes(...)` anchors record (a) whether the driver +observed a coord-side connect failure during its run, and (b) whether at +least two assertion-bearing cycles ran (so the assertion at cycle N+1 +genuinely reads post-restart state, not just the same state as N). +""" + +from __future__ import annotations + +import logging +import os +import sys +import time + +import helper_random +import psycopg +from helper_pg import ( + PGDATABASE, + PGHOST, + PGPORT, + PGUSER, + execute_retry, + query_retry, +) + +from antithesis.assertions import always, sometimes + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.catalog_recovery_consistency") + +# Long-running knobs: the driver owns its timeline and the per-cycle budget +# has to comfortably exceed environmentd's restart time so a fault landing +# mid-DDL still resolves before the next cycle. CYCLE_COUNT high enough to +# give Antithesis multiple windows to land a restart between cycles. +CYCLE_COUNT = 10 +DROP_PROBABILITY = 0.20 +INTER_CYCLE_SLEEP_S = 2.0 + +PROBE_CONNECT_TIMEOUT_S = 2.0 + + +def _fresh_observed_tables(name_prefix: str) -> set[str] | None: + """Open a new connection and SELECT mz_tables filtered to `name_prefix`. + + Returns the set of observed table names on success, or `None` on any + connect/query failure. None lets the caller skip the cycle's assertion + rather than blaming the property for a fault-window read. + """ + try: + with psycopg.connect( + host=PGHOST, + port=PGPORT, + user=PGUSER, + dbname=PGDATABASE, + connect_timeout=int(PROBE_CONNECT_TIMEOUT_S), + autocommit=True, + ) as conn, conn.cursor() as cur: + cur.execute( + "SELECT name FROM mz_tables WHERE name LIKE %s", + (f"{name_prefix}%",), + ) + return {row[0] for row in cur.fetchall()} + except Exception: # noqa: BLE001 + return None + + +def _saw_coord_unavailable() -> bool: + """Best-effort one-shot probe with the same short connect timeout as + the assertion reads. A failure here means a coord-side connection was + refused or timed out within the last ~tick — a strong proxy for + "environmentd is down or just restarted." This is corroborating signal + only; it does not gate the safety assertion. + """ + try: + with psycopg.connect( + host=PGHOST, + port=PGPORT, + user=PGUSER, + dbname=PGDATABASE, + connect_timeout=int(PROBE_CONNECT_TIMEOUT_S), + autocommit=True, + ) as _conn: + pass + return False + except Exception: # noqa: BLE001 + return True + + +def _run_cycle( + expected: set[str], + name_prefix: str, + cycle_idx: int, + next_id: int, +) -> tuple[bool, int]: + """One create-or-drop + verify cycle. + + Returns (assertions_ran, next_id_after) where `assertions_ran` is True + iff this cycle landed a successful post-DDL read against a fresh + connection (i.e. the cycle contributes to the safety property). The + `next_id` counter is monotonic across cycles so table names are unique + even after drops. + + The DDL is run via `execute_retry`, which already retries transient + OperationalError until the retry budget. If it raises anyway the + cycle aborts and the local model is not updated — exactly the + semantics needed: a DDL we never acknowledged is allowed to be + missing from the post-recovery catalog. + """ + new_id = next_id + if expected and helper_random.random_bool(DROP_PROBABILITY): + # Drop a random existing table. Choosing from `expected` keeps the + # drop deterministic w.r.t. the local model. 
+ table = sorted(expected)[helper_random.random_int(0, len(expected) - 1)] + try: + execute_retry(f"DROP TABLE {table}") + except Exception as exc: # noqa: BLE001 + LOG.info("cycle %d: DROP %s failed (%s); not updating model", cycle_idx, table, exc) + return False, new_id + expected.discard(table) + else: + table = f"{name_prefix}_t{new_id:06d}" + try: + execute_retry(f"CREATE TABLE {table} (id BIGINT NOT NULL)") + except Exception as exc: # noqa: BLE001 + LOG.info("cycle %d: CREATE %s failed (%s); not updating model", cycle_idx, table, exc) + return False, new_id + expected.add(table) + new_id += 1 + + # Verify via a fresh connection. If this read fails, we skip the + # assertion — a fault-window read is not regression evidence. + observed = _fresh_observed_tables(name_prefix) + if observed is None: + LOG.info("cycle %d: fresh-connection read failed; skipping assertion", cycle_idx) + return False, new_id + + always( + observed == expected, + "catalog recovery: live catalog table set matches in-process expected model", + { + "cycle": cycle_idx, + "name_prefix": name_prefix, + "expected_count": len(expected), + "observed_count": len(observed), + # Cap the explicit diffs so the assertion details stay compact + # even on a large divergence. + "missing_from_catalog": sorted(expected - observed)[:5], + "unexpected_in_catalog": sorted(observed - expected)[:5], + }, + ) + return True, new_id + + +def main() -> int: + # Per-timeline namespace so concurrent timelines and any future + # parallel_driver_ instances do not collide on table names. + name_prefix = f"catrec_{helper_random.random_u64():016x}" + LOG.info("catalog recovery driver starting; name_prefix=%s", name_prefix) + + expected: set[str] = set() + next_id = 0 + cycles_ran = 0 + saw_coord_unavailable = False + + for cycle_idx in range(CYCLE_COUNT): + ran, next_id = _run_cycle(expected, name_prefix, cycle_idx, next_id) + if ran: + cycles_ran += 1 + if _saw_coord_unavailable(): + saw_coord_unavailable = True + time.sleep(INTER_CYCLE_SLEEP_S) + + sometimes( + cycles_ran >= 2, + "catalog recovery: 2+ assertion-bearing cycles ran in this timeline", + {"cycles_ran": cycles_ran, "cycles_planned": CYCLE_COUNT}, + ) + sometimes( + saw_coord_unavailable, + "catalog recovery: observed environmentd connect failure during run", + {"cycles_ran": cycles_ran, "saw_coord_unavailable": saw_coord_unavailable}, + ) + + LOG.info( + "catalog recovery driver done; cycles_ran=%d/%d expected_size=%d saw_coord_unavailable=%s", + cycles_ran, + CYCLE_COUNT, + len(expected), + saw_coord_unavailable, + ) + return 0 + + +if __name__ == "__main__": + # Touch helper_pg env constants so static analysis treats them as + # used; the helper module re-exports them for drivers (like this one) + # that open their own connections. + _ = (PGHOST, PGPORT, PGUSER, PGDATABASE, os, query_retry) + sys.exit(main()) diff --git a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py new file mode 100755 index 0000000000000..5f3c13bcdce57 --- /dev/null +++ b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. 
+# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for `upsert-state-rehydrates-correctly`. + +After a clusterd restart, the rehydrated upsert state — observed via +`SELECT * FROM source` — must equal the state at the most recent durable +timestamp before the restart, for every key produced so far. + +Implementation strategy: a `singleton_driver_` runs exactly once per +timeline and lives long enough to span multiple produce/settle/assert +cycles. Local memory holds the authoritative "what the source should say" +model across cycles. If Antithesis kills clusterd between two cycles, the +next cycle's `SELECT` is effectively a rehydration check — and because the +local model is unchanged across the restart, any divergence in the source +output is exactly the property's failure mode. + +Each cycle: + 1. Produce a batch of (key, value) and (key, null) messages, updating the + in-memory `expected_state` model. + 2. Request a quiet period and wait for `offset_committed` to reach the + highest produced offset. + 3. SELECT every tracked key's current source state and assert it matches + `expected_state` via `always("upsert: rehydrated state equals + local model", ...)`. Across-cycle stability is exactly what + rehydration correctness is. + +The driver also records one `sometimes` anchor confirming that at least +two assertion-bearing cycles ran (without this, the safety check could be +vacuously satisfied by a single early settle). + +A previous version of this driver also recorded a "clusterd observed +non-online" `sometimes` anchor via a once-per-cycle SELECT of +`mz_internal.mz_cluster_replica_statuses`. That assertion was structurally +unable to fire here: each cycle requests a 25-second Antithesis quiet +period before its assertions, the probe runs *after* the quiet period +(when faults are paused and killed containers have been restored), and +the introspection view itself lags clusterd death by the +orchestrator-process 5-second poll. The "did we see a replica go +offline" signal lives in `anytime_fault_recovery_exercised.py` instead, +which polls continuously and never requests a quiet period, so it has +the right shape to observe the offline window. + +Distinct prefix per timeline keeps multiple parallel timelines independent. +""" + +from __future__ import annotations + +import logging +import sys +import time + +import helper_random +from helper_kafka import make_producer +from helper_pg import query_one_retry +from helper_quiet import request_quiet_period +from helper_source_stats import wait_for_catchup +from helper_upsert_source import ( + SOURCE_UPSERT_TEXT, + TOPIC_UPSERT_TEXT, + ensure_upsert_text_source, +) + +from antithesis.assertions import always, sometimes + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.upsert_state_rehydration") + +# Long-running knobs — this driver owns its timeline alongside parallel +# drivers, so the per-cycle budget is generous and the cycle count high +# enough that a node-termination fault has a chance to land between cycles. 
+CYCLE_COUNT = 8 +PRODUCES_PER_CYCLE = 30 +DISTINCT_KEYS = 6 +DISTINCT_VALUES = 12 +TOMBSTONE_PROB = 0.20 + +QUIET_PERIOD_S = 25 +CATCHUP_TIMEOUT_S = 120.0 +INTER_CYCLE_SLEEP_S = 2.0 + + +def _select_value_for_key(key: str) -> tuple[bool, str | None]: + """Duplicate of `_select_value_for_key` in `parallel_driver_upsert_latest_value.py`. + Kept inline to avoid expanding helper surface for one shared private function.""" + row = query_one_retry( + f"SELECT count(*)::bigint, max(text) FROM {SOURCE_UPSERT_TEXT} WHERE key = %s", + (key,), + ) + if row is None: + return False, None + count, value = row + if count == 0: + return False, None + if count != 1: + raise RuntimeError( + f"upsert source has {count} rows for key {key!r}; this driver " + "assumes the per-key uniqueness property holds (see " + "`upsert-key-reflects-latest-value` and " + "`kafka-source-no-data-duplication`)" + ) + return True, value + + +def _run_cycle( + producer, tracker, expected: dict[str, str | None], cycle_idx: int +) -> bool: + """Produce one batch, settle, and assert state for every tracked key. + + Returns True if assertions ran (cycle settled), False if we bailed early. + """ + keys = [f"reh-k{i}" for i in range(DISTINCT_KEYS)] + for _ in range(PRODUCES_PER_CYCLE): + key = helper_random.random_choice(keys) + if helper_random.random_bool(TOMBSTONE_PROB): + producer.produce( + topic=TOPIC_UPSERT_TEXT, + key=key.encode("utf-8"), + value=None, + on_delivery=tracker.callback, + ) + expected[key] = None + else: + value = f"reh-v{cycle_idx:02d}-{helper_random.random_int(0, DISTINCT_VALUES - 1):04d}" + producer.produce( + topic=TOPIC_UPSERT_TEXT, + key=key.encode("utf-8"), + value=value.encode("utf-8"), + on_delivery=tracker.callback, + ) + expected[key] = value + producer.poll(0) + + pending = producer.flush(timeout=30) + if pending > 0 or tracker.last_error is not None: + LOG.info( + "cycle %d: skipping assertions; flush pending=%d last_error=%s", + cycle_idx, + pending, + tracker.last_error, + ) + return False + + max_produced = tracker.topic_max_offset(TOPIC_UPSERT_TEXT) + if max_produced < 0: + LOG.info("cycle %d: no messages confirmed delivered; skipping", cycle_idx) + return False + + request_quiet_period(QUIET_PERIOD_S) + caught_up = wait_for_catchup( + SOURCE_UPSERT_TEXT, max_produced, timeout_s=CATCHUP_TIMEOUT_S + ) + if not caught_up: + LOG.info( + "cycle %d: catchup did not complete in budget; skipping asserts", cycle_idx + ) + return False + + # Per-key assertion. The cross-cycle stability of `expected` is what + # makes this a rehydration check: if a clusterd restart happened + # between this cycle and the previous, the source has been rebuilt + # from feedback and must agree with `expected` again. 
+ for key, want in expected.items(): + found, observed = _select_value_for_key(key) + if want is None: + always( + not found, + "upsert: rehydrated state matches local model (tombstoned key)", + { + "source": SOURCE_UPSERT_TEXT, + "key": key, + "cycle": cycle_idx, + "observed_value": observed, + }, + ) + else: + always( + found and observed == want, + "upsert: rehydrated state matches local model (live key)", + { + "source": SOURCE_UPSERT_TEXT, + "key": key, + "cycle": cycle_idx, + "expected_value": want, + "observed_present": found, + "observed_value": observed, + }, + ) + return True + + +def main() -> int: + ensure_upsert_text_source() + LOG.info("rehydration driver starting; %d cycles planned", CYCLE_COUNT) + + producer, tracker = make_producer(client_id="antithesis-rehydration") + expected: dict[str, str | None] = {} + + cycles_run = 0 + + for cycle_idx in range(CYCLE_COUNT): + if _run_cycle(producer, tracker, expected, cycle_idx): + cycles_run += 1 + time.sleep(INTER_CYCLE_SLEEP_S) + + # The "did this run actually span a clusterd restart" anchor is + # deliberately not in this driver — see the module docstring. The + # `cycles_run >= 2` check below is the rehydration-coverage anchor: + # without two post-quiet-period reads, the safety assertions could + # be vacuously satisfied by a single early settle. + sometimes( + cycles_run >= 2, + "upsert: rehydration driver ran 2+ assertion cycles", + {"cycles_run": cycles_run, "cycles_planned": CYCLE_COUNT}, + ) + + LOG.info("rehydration driver done; %d/%d cycles ran", cycles_run, CYCLE_COUNT) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/antithesis/workload/workload-entrypoint.sh b/test/antithesis/workload/workload-entrypoint.sh new file mode 100755 index 0000000000000..1a8aab5234f51 --- /dev/null +++ b/test/antithesis/workload/workload-entrypoint.sh @@ -0,0 +1,72 @@ +#!/usr/bin/env bash + +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +set -euo pipefail + +PGHOST="${PGHOST:-materialized}" +PGPORT="${PGPORT:-6875}" +PGUSER="${PGUSER:-materialize}" +PGPORT_INTERNAL="${PGPORT_INTERNAL:-6877}" +PGUSER_INTERNAL="${PGUSER_INTERNAL:-mz_system}" +CLUSTER="${MZ_ANTITHESIS_CLUSTER:-antithesis_cluster}" + +# Wait for materialized to be ready. +echo "Waiting for materialized to become healthy..." +until curl -sf http://materialized:6878/api/readyz > /dev/null 2>&1; do + sleep 1 +done +echo "materialized is healthy." + +# Provision an unmanaged cluster with one replica per external clusterd +# process. Multi-replica gives Antithesis the option to kill one +# clusterd at a time without taking the workload offline, and exercises +# the multi-replica compute/storage code paths (notably +# `compute-replica-epoch-isolation`). +# +# This must run before setup-complete so Test Composer assertions can +# target the cluster from the start. Idempotent — `IF NOT EXISTS` is +# unsupported on `CREATE CLUSTER REPLICAS (...)`, so we query +# mz_clusters first. 
+existing=$( + psql -h "$PGHOST" -p "$PGPORT_INTERNAL" -U "$PGUSER_INTERNAL" -tAc \ + "SELECT 1 FROM mz_clusters WHERE name = '$CLUSTER'" +) +if [[ -z "$existing" ]]; then + echo "Provisioning cluster '$CLUSTER' with replicas on clusterd1 + clusterd2..." + psql -h "$PGHOST" -p "$PGPORT_INTERNAL" -U "$PGUSER_INTERNAL" <