diff --git a/.github/workflows/test-upgrade-and-rollback.yml b/.github/workflows/test-upgrade-and-rollback.yml index c2c8d9a7..d76bc840 100644 --- a/.github/workflows/test-upgrade-and-rollback.yml +++ b/.github/workflows/test-upgrade-and-rollback.yml @@ -623,13 +623,18 @@ jobs: fi echo "✓ Gateway image upgraded to $NEW_GATEWAY" - # Verify DocumentDB schema version updated + # Verify DocumentDB schema version unchanged (two-phase default: schema stays at old version) VERSION_AFTER=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.schemaVersion}') echo "DocumentDB schema version after upgrade: $VERSION_AFTER" if [[ -z "$VERSION_AFTER" ]]; then echo "❌ status.schemaVersion is empty after upgrade" exit 1 fi + if [[ "$VERSION_AFTER" != "$VERSION_BEFORE" ]]; then + echo "❌ Schema version changed from $VERSION_BEFORE to $VERSION_AFTER — expected unchanged (two-phase default)" + exit 1 + fi + echo "✓ Schema version unchanged after binary upgrade: $VERSION_AFTER (two-phase default validated)" # Verify status fields echo "" @@ -979,7 +984,263 @@ jobs: fi rm -f /tmp/pf_output.log - - name: Collect comprehensive logs on failure + # ============================================================ + # Steps 5-8: Two-phase schema upgrade and webhook validation + # ============================================================ + + - name: "Step 5: Re-upgrade binary (setup for schema tests)" + run: | + echo "=== Step 5: Re-upgrade Binary ===" + echo "Re-upgrading extension and gateway images to new version for schema tests..." + + NEW_EXTENSION="${{ env.DOCUMENTDB_IMAGE }}" + NEW_GATEWAY="${{ env.GATEWAY_IMAGE }}" + + # Patch both images back to new version + echo "Patching images to new versions..." + echo " Extension: → $NEW_EXTENSION" + echo " Gateway: → $NEW_GATEWAY" + kubectl patch documentdb $DB_NAME -n $DB_NS --type='merge' \ + -p "{\"spec\":{\"documentDBImage\":\"$NEW_EXTENSION\",\"gatewayImage\":\"$NEW_GATEWAY\"}}" + + echo "Waiting for cluster to be healthy with new images..." + timeout 600 bash -c ' + while true; do + DB_STATUS=$(kubectl get documentdb "$1" -n "$2" -o jsonpath="{.status.status}" 2>/dev/null) + CLUSTER_STATUS=$(kubectl get cluster "$1" -n "$2" -o jsonpath="{.status.phase}" 2>/dev/null) + echo "DocumentDB status: $DB_STATUS, CNPG phase: $CLUSTER_STATUS" + if [[ "$DB_STATUS" == "Cluster in healthy state" && "$CLUSTER_STATUS" == "Cluster in healthy state" ]]; then + HEALTHY_PODS=$(kubectl get cluster "$1" -n "$2" -o jsonpath="{.status.instancesStatus.healthy}" 2>/dev/null | jq length 2>/dev/null || echo "0") + if [[ "$HEALTHY_PODS" -ge "1" ]]; then + POD_IMAGES=$(kubectl get pods -n "$2" -l cnpg.io/cluster="$1" -o jsonpath="{.items[*].spec.volumes[*].image.reference}" 2>/dev/null) + if echo "$POD_IMAGES" | grep -q "$3"; then + echo "✓ Cluster healthy with $HEALTHY_PODS pods running new images" + break + else + echo "Pods not yet running new extension image, waiting..." + fi + fi + fi + sleep 10 + done + ' -- "$DB_NAME" "$DB_NS" "$NEW_EXTENSION" + + # Verify schema version is still at baseline + VERSION_CURRENT=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.schemaVersion}') + echo "Schema version after re-upgrade: $VERSION_CURRENT" + echo "SCHEMA_BASELINE=$VERSION_CURRENT" >> $GITHUB_ENV + + echo "" + echo "✅ Step 5 passed: Binary re-upgraded for schema tests" + echo " Extension: $NEW_EXTENSION" + echo " Gateway: $NEW_GATEWAY" + echo " Schema: $VERSION_CURRENT (baseline)" + + - name: Setup port forwarding for re-upgrade verification + uses: ./.github/actions/setup-port-forwarding + with: + namespace: ${{ env.DB_NS }} + cluster-name: ${{ env.DB_NAME }} + port: ${{ env.DB_PORT }} + architecture: ${{ matrix.architecture }} + test-type: 'comprehensive' + + - name: Verify data persistence after re-upgrade + run: | + echo "=== Data Persistence: Verifying after re-upgrade ===" + mongosh 127.0.0.1:$DB_PORT \ + -u $DB_USERNAME \ + -p $DB_PASSWORD \ + --authenticationMechanism SCRAM-SHA-256 \ + --tls \ + --tlsAllowInvalidCertificates \ + --eval ' + db = db.getSiblingDB("upgrade_test_db"); + var count = db.test_collection.countDocuments(); + assert(count === 2, "Expected 2 documents but found " + count + " after re-upgrade"); + print("✓ All " + count + " documents persisted through re-upgrade"); + ' + echo "✓ Data persistence verified after re-upgrade" + + - name: Cleanup port forwarding after re-upgrade verification + if: always() + run: | + if [ -f /tmp/pf_pid ]; then + PF_PID=$(cat /tmp/pf_pid) + kill $PF_PID 2>/dev/null || true + rm -f /tmp/pf_pid + fi + rm -f /tmp/pf_output.log + + - name: "Step 6: Schema Finalization (two-phase commit)" + run: | + echo "=== Step 6: Schema Finalization ===" + echo "Setting spec.schemaVersion to finalize the schema migration..." + + NEW_EXTENSION="${{ env.DOCUMENTDB_IMAGE }}" + SCHEMA_BASELINE="${{ env.SCHEMA_BASELINE }}" + + # Determine the new schema version from the new extension image tag + # Strip any architecture suffix (e.g., "0.112.0-amd64" → "0.112.0") + RAW_TAG=$(echo "$NEW_EXTENSION" | sed 's/.*://') + NEW_SCHEMA_VERSION=$(echo "$RAW_TAG" | grep -oP '^\d+\.\d+\.\d+') + if [[ -z "$NEW_SCHEMA_VERSION" ]]; then + echo "✗ Could not extract semver from image tag: $RAW_TAG" + exit 1 + fi + echo "Baseline schema version: $SCHEMA_BASELINE" + echo "Target schema version: $NEW_SCHEMA_VERSION" + + # Set spec.schemaVersion to trigger ALTER EXTENSION UPDATE + kubectl patch documentdb $DB_NAME -n $DB_NS --type='merge' \ + -p "{\"spec\":{\"schemaVersion\":\"$NEW_SCHEMA_VERSION\"}}" + + echo "Waiting for schema version to update..." + timeout 300 bash -c ' + while true; do + STATUS_SCHEMA=$(kubectl get documentdb "$1" -n "$2" -o jsonpath="{.status.schemaVersion}" 2>/dev/null) + DB_STATUS=$(kubectl get documentdb "$1" -n "$2" -o jsonpath="{.status.status}" 2>/dev/null) + echo "status.schemaVersion: $STATUS_SCHEMA, status: $DB_STATUS" + if [[ "$STATUS_SCHEMA" == "$3" && "$DB_STATUS" == "Cluster in healthy state" ]]; then + echo "✓ Schema version updated to $STATUS_SCHEMA" + break + fi + sleep 10 + done + ' -- "$DB_NAME" "$DB_NS" "$NEW_SCHEMA_VERSION" + + # Verify schema version changed + FINAL_SCHEMA=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.schemaVersion}') + if [[ "$FINAL_SCHEMA" == "$NEW_SCHEMA_VERSION" ]]; then + echo "✓ Schema finalized: $SCHEMA_BASELINE → $FINAL_SCHEMA" + else + echo "❌ Schema version should be $NEW_SCHEMA_VERSION but is $FINAL_SCHEMA" + exit 1 + fi + + echo "NEW_SCHEMA_VERSION=$NEW_SCHEMA_VERSION" >> $GITHUB_ENV + echo "" + echo "✅ Step 6 passed: Schema finalized to $NEW_SCHEMA_VERSION" + + - name: Setup port forwarding for schema finalization verification + uses: ./.github/actions/setup-port-forwarding + with: + namespace: ${{ env.DB_NS }} + cluster-name: ${{ env.DB_NAME }} + port: ${{ env.DB_PORT }} + architecture: ${{ matrix.architecture }} + test-type: 'comprehensive' + + - name: Verify data persistence after schema finalization + run: | + echo "=== Data Persistence: Verifying after schema finalization ===" + mongosh 127.0.0.1:$DB_PORT \ + -u $DB_USERNAME \ + -p $DB_PASSWORD \ + --authenticationMechanism SCRAM-SHA-256 \ + --tls \ + --tlsAllowInvalidCertificates \ + --eval ' + db = db.getSiblingDB("upgrade_test_db"); + var count = db.test_collection.countDocuments(); + assert(count === 2, "Expected 2 documents but found " + count + " after schema finalization"); + print("✓ All " + count + " documents persisted through schema finalization"); + ' + echo "✓ Data persistence verified after schema finalization" + + - name: Cleanup port forwarding after schema finalization verification + if: always() + run: | + if [ -f /tmp/pf_pid ]; then + PF_PID=$(cat /tmp/pf_pid) + kill $PF_PID 2>/dev/null || true + rm -f /tmp/pf_pid + fi + rm -f /tmp/pf_output.log + + - name: "Step 7: Webhook — Reject Rollback Below Schema" + run: | + echo "=== Step 7: Webhook — Reject Rollback Below Schema ===" + echo "Attempting to roll back documentDBImage below status.schemaVersion..." + + OLD_EXTENSION="${{ env.DOCUMENTDB_OLD_IMAGE }}" + CURRENT_SCHEMA="${{ env.NEW_SCHEMA_VERSION }}" + + echo "Current schema version: $CURRENT_SCHEMA" + echo "Attempting rollback to: $OLD_EXTENSION" + + # This SHOULD fail — the webhook must reject rollback below schema version + PATCH_OUTPUT=$(kubectl patch documentdb $DB_NAME -n $DB_NS --type='merge' \ + -p "{\"spec\":{\"documentDBImage\":\"$OLD_EXTENSION\"}}" 2>&1) && { + echo "❌ Webhook did NOT reject the rollback — patch succeeded unexpectedly" + echo "Output: $PATCH_OUTPUT" + exit 1 + } + + echo "Patch rejected (expected). Output:" + echo "$PATCH_OUTPUT" + + # Verify the error message mentions rollback blocking + if echo "$PATCH_OUTPUT" | grep -qi "rollback blocked\|older than installed schema"; then + echo "✓ Webhook correctly rejected rollback with expected error message" + else + echo "⚠️ Patch was rejected but error message doesn't match expected pattern" + echo " (Still passing — the important thing is the rejection)" + fi + + # Verify cluster state is unchanged + CURRENT_IMAGE=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.spec.documentDBImage}') + NEW_EXTENSION="${{ env.DOCUMENTDB_IMAGE }}" + if [[ "$CURRENT_IMAGE" == "$NEW_EXTENSION" ]]; then + echo "✓ Cluster state unchanged — documentDBImage still at $CURRENT_IMAGE" + else + echo "❌ documentDBImage changed unexpectedly to $CURRENT_IMAGE" + exit 1 + fi + + echo "" + echo "✅ Step 7 passed: Webhook correctly blocked rollback below schema version" + + - name: "Step 8: Webhook — Reject Schema Exceeds Binary" + run: | + echo "=== Step 8: Webhook — Reject Schema Exceeds Binary ===" + echo "Attempting to set schemaVersion higher than binary version..." + + # Use an artificially high version that exceeds any binary + INVALID_SCHEMA="99.999.0" + echo "Attempting schemaVersion: $INVALID_SCHEMA" + + # This SHOULD fail — the webhook must reject schema > binary + PATCH_OUTPUT=$(kubectl patch documentdb $DB_NAME -n $DB_NS --type='merge' \ + -p "{\"spec\":{\"schemaVersion\":\"$INVALID_SCHEMA\"}}" 2>&1) && { + echo "❌ Webhook did NOT reject the invalid schema version — patch succeeded unexpectedly" + echo "Output: $PATCH_OUTPUT" + exit 1 + } + + echo "Patch rejected (expected). Output:" + echo "$PATCH_OUTPUT" + + # Verify the error message mentions schema exceeding binary + if echo "$PATCH_OUTPUT" | grep -qi "exceeds.*binary"; then + echo "✓ Webhook correctly rejected schema version with expected error message" + else + echo "⚠️ Patch was rejected but error message doesn't match expected pattern" + echo " (Still passing — the important thing is the rejection)" + fi + + # Verify schema version is unchanged + CURRENT_SCHEMA=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.schemaVersion}') + EXPECTED_SCHEMA="${{ env.NEW_SCHEMA_VERSION }}" + if [[ "$CURRENT_SCHEMA" == "$EXPECTED_SCHEMA" ]]; then + echo "✓ Schema version unchanged: $CURRENT_SCHEMA" + else + echo "❌ Schema version changed unexpectedly to $CURRENT_SCHEMA" + exit 1 + fi + + echo "" + echo "✅ Step 8 passed: Webhook correctly blocked schema version exceeding binary" if: failure() uses: ./.github/actions/collect-logs with: @@ -1008,6 +1269,10 @@ jobs: echo "- Step 2: Upgrade both extension and gateway images" >> $GITHUB_STEP_SUMMARY echo "- Step 3: Rollback extension image" >> $GITHUB_STEP_SUMMARY echo "- Step 4: Rollback gateway image" >> $GITHUB_STEP_SUMMARY + echo "- Step 5: Re-upgrade binary (setup for schema tests)" >> $GITHUB_STEP_SUMMARY + echo "- Step 6: Schema finalization (two-phase commit)" >> $GITHUB_STEP_SUMMARY + echo "- Step 7: Webhook — reject rollback below schema" >> $GITHUB_STEP_SUMMARY + echo "- Step 8: Webhook — reject schema exceeds binary" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY if [[ "${{ job.status }}" == "success" ]]; then diff --git a/CHANGELOG.md b/CHANGELOG.md index 236e6159..b52b8f9f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## [Unreleased] + +### Major Features +- **Two-Phase Extension Upgrade**: New `spec.schemaVersion` field separates binary upgrades (`spec.documentDBVersion`) from irreversible schema migrations (`ALTER EXTENSION UPDATE`). The default behavior gives you a rollback-safe window — update the binary first, validate, then finalize the schema. Set `schemaVersion: "auto"` for single-step upgrades in development environments. See the [upgrade guide](docs/operator-public-documentation/preview/operations/upgrades.md) for details. + +### Breaking Changes +- **Validating webhook added**: A new `ValidatingWebhookConfiguration` enforces that `spec.schemaVersion` never exceeds the binary version and blocks `spec.documentDBVersion` rollbacks below the committed schema version. This requires [cert-manager](https://cert-manager.io/) to be installed in the cluster (it is already a prerequisite for the sidecar injector). Existing clusters upgrading to this release will have the webhook activated automatically via `helm upgrade`. + ## [0.2.0] - 2026-03-25 ### Major Features diff --git a/docs/operator-public-documentation/preview/operations/upgrades.md b/docs/operator-public-documentation/preview/operations/upgrades.md index f279c969..bbbd3a60 100644 --- a/docs/operator-public-documentation/preview/operations/upgrades.md +++ b/docs/operator-public-documentation/preview/operations/upgrades.md @@ -1,6 +1,6 @@ --- title: Upgrades -description: Upgrade the DocumentDB operator Helm chart and CRDs, update DocumentDB extension and gateway images per cluster, and roll back to a previous version. +description: Upgrade the DocumentDB operator and DocumentDB clusters, including version and schema management. tags: - operations - upgrades @@ -13,19 +13,21 @@ tags: Upgrades keep your DocumentDB deployment current with the latest features, security patches, and bug fixes. -There are two types of upgrades in a DocumentDB deployment: +A DocumentDB deployment has two independently upgradable components: -| Upgrade Type | What Changes | How to Trigger | -|-------------|-------------|----------------| -| **DocumentDB operator** | The Kubernetes operator + bundled CloudNative-PG | Helm chart upgrade | -| **DocumentDB components** | Extension + gateway (same version) | Update `spec.documentDBVersion` | +| Component | What Changes | How to Upgrade | +|-----------|-------------|----------------| +| **DocumentDB Operator** | Operator binary + bundled CloudNative-PG | `helm upgrade` | +| **DocumentDB Clusters** | Extension binary + gateway sidecar + database schema | Update `spec.documentDBVersion` and optionally `spec.schemaVersion` | -!!! info - The operator Helm chart bundles [CloudNative-PG](https://cloudnative-pg.io/) as a dependency. Upgrading the operator automatically upgrades the bundled CloudNative-PG version — this cannot be skipped separately. +--- -## DocumentDB Operator Upgrade +## Upgrading the Operator -The DocumentDB operator is deployed via Helm. Upgrade it by updating the Helm release. +The operator is deployed via Helm. Upgrading it does **not** restart your DocumentDB cluster pods or change any cluster components. + +!!! info + The operator Helm chart bundles [CloudNative-PG](https://cloudnative-pg.io/) as a dependency. Upgrading the operator automatically upgrades the bundled CloudNative-PG version. ### Step 1: Update the Helm Repository @@ -47,8 +49,8 @@ helm search repo documentdb/documentdb-operator --versions Helm only installs CRDs on initial `helm install` — it does **not** update them on `helm upgrade`. If the new operator version introduces CRD schema changes, you must apply them manually first: ```bash -# Set this to the release tag you are upgrading to (e.g., v0.2.0) -TARGET_VERSION=v0.2.0 +# Set this to the release tag you are upgrading to (e.g., 0.2.0) +TARGET_VERSION=0.2.0 kubectl apply --server-side --force-conflicts \ -f https://raw.githubusercontent.com/documentdb/documentdb-kubernetes-operator/${TARGET_VERSION}/operator/documentdb-helm-chart/crds/documentdb.io_dbs.yaml \ @@ -61,7 +63,7 @@ Server-side apply (`--server-side --force-conflicts`) is required because the Do !!! warning Always use CRDs from the **same version** as the Helm chart you are installing. Using CRDs from `main` or a different release may introduce schema mismatches. -### Step 4: Upgrade the DocumentDB Operator +### Step 4: Upgrade the Operator ```bash helm upgrade documentdb-operator documentdb/documentdb-operator \ @@ -88,7 +90,7 @@ kubectl get deployment -n documentdb-operator kubectl logs -n documentdb-operator deployment/documentdb-operator --tail=50 ``` -### Rollback +### Operator Rollback If the new operator version causes issues, roll back to the previous Helm release: @@ -101,43 +103,153 @@ helm rollback documentdb-operator -n documentdb-operator ``` !!! note - `helm rollback` reverts the operator deployment but does **not** revert CRDs. This is usually safe — CRD changes are additive, and the older operator ignores fields it does not recognize. Do **not** revert CRDs unless the release notes explicitly instruct you to, as removing fields from a CRD can invalidate existing resources. + `helm rollback` reverts the operator deployment but does **not** revert CRDs. This is usually safe — CRD changes are additive, and the older operator ignores fields it does not recognize. Do **not** revert CRDs unless the [release notes](https://github.com/documentdb/documentdb-kubernetes-operator/releases) explicitly instruct you to, as removing fields from a CRD can invalidate existing resources. + +--- + +## Upgrading DocumentDB Clusters + +!!! warning "Migration note for existing clusters" + If you are upgrading from operator version 0.2.0 or earlier, the new operator introduces a validating webhook and the `spec.schemaVersion` field. After upgrading the operator, your existing clusters will continue to work without changes — `schemaVersion` defaults to unset (binary-only upgrades). No action is needed unless you want to enable automatic schema upgrades via `schemaVersion: "auto"`. + +Upgrading a DocumentDB cluster has two dimensions: the **binary** (container images) and the **schema** (database catalog). You control each independently: + +| Field | What It Does | Reversible? | +|-------|-------------|-------------| +| `spec.documentDBVersion` | Updates the **binary** — the extension image and gateway sidecar are replaced via rolling restart. | ✅ Yes — revert the field to roll back. | +| `spec.schemaVersion` | Runs `ALTER EXTENSION UPDATE` to migrate the **database schema** to match the binary. | ❌ No — schema changes are permanent. | -### DocumentDB Operator Upgrade Notes +Think of it as: **`documentDBVersion` installs the software, `schemaVersion` applies the database migration.** -- The DocumentDB operator upgrade does **not** restart your DocumentDB cluster pods. -- After upgrading the operator, update individual DocumentDB clusters to the latest component version. See [Component Upgrades](#component-upgrades) below. +!!! info "Why two fields?" + The binary (container image) can be swapped freely — if something goes wrong, revert `documentDBVersion` and the pods roll back to the previous image. But `ALTER EXTENSION UPDATE` modifies database catalog tables and cannot be undone. Separating these two steps gives you a safe rollback window between deploying new code and committing the schema change. -## Component Upgrades +### Schema Version Modes -Updating `spec.documentDBVersion` upgrades **both** the DocumentDB extension and the gateway together, since they share the same version. +| `spec.schemaVersion` | Behavior | Recommended For | +|----------------------|----------|-----------------| +| *(not set)* — default | Only the binary upgrades. The schema stays at its current version until you explicitly set `schemaVersion`. | **Production** — gives you a rollback-safe window before committing the schema change. | +| `"auto"` | Schema updates automatically whenever the binary version changes. | **Development and testing** — simple, one-step upgrades. | +| Explicit version (e.g., `"0.112.0"`) | Schema updates to exactly that version. | **Controlled rollouts** — you choose when and what version to finalize. | ### Pre-Upgrade Checklist -1. **Check the CHANGELOG** — review release notes for breaking changes. -2. **Verify DocumentDB cluster health** — ensure all instances are running and healthy. +1. **Check the [DocumentDB release notes](https://github.com/documentdb/documentdb/releases)** — review for breaking changes or new features. +2. **Verify DocumentDB cluster health** — ensure all instances are running and healthy: + + ```bash + kubectl get documentdb my-cluster -n default + kubectl get pods -n default -l documentdb.io/cluster=my-cluster + ``` 3. **Back up the DocumentDB cluster** — create an on-demand [backup](backup-and-restore.md) before upgrading. -### Step 1: Update the DocumentDB Version +### Upgrade Walkthrough -!!! danger - **Downgrades are not supported.** If you set `documentDBVersion` to a version lower than the currently installed schema version, the operator will still update the container images but will **skip the schema migration** (`ALTER EXTENSION UPDATE`) and emit a warning event. This leaves the DocumentDB cluster running an older binary against a newer schema, which may cause unexpected behavior. Always move forward to a newer version or restore from backup. +Choose the approach that matches your use case: -```yaml title="documentdb.yaml" -apiVersion: documentdb.io/preview -kind: DocumentDB -metadata: - name: my-cluster - namespace: default -spec: - documentDBVersion: "" -``` +=== "Production (two-phase upgrade)" -```bash -kubectl apply -f documentdb.yaml -``` + **Step 1: Update the binary version.** The schema stays unchanged — this is safe to roll back. + + ```yaml title="documentdb.yaml" + apiVersion: documentdb.io/preview + kind: DocumentDB + metadata: + name: my-cluster + namespace: default + spec: + documentDBVersion: "" + # schemaVersion is not set — schema stays at current version + ``` + + ```bash + kubectl apply -f documentdb.yaml + ``` + + **Step 2: Validate.** Confirm the cluster is healthy and the new binary works as expected. + + ```bash + # Watch the rolling restart + kubectl get pods -n default -w + + # Check cluster status + kubectl get documentdb my-cluster -n default + + # Verify the schema version has NOT changed + kubectl get documentdb my-cluster -n default -o jsonpath='{.status.schemaVersion}' + ``` + + **Step 3: Finalize the schema.** Once you're confident the new binary is stable, commit the schema migration: + + ```bash + kubectl patch documentdb my-cluster -n default \ + --type merge -p '{"spec":{"schemaVersion":""}}' + ``` + + !!! tip + On subsequent upgrades, just update `documentDBVersion` again. The schema stays pinned at the previous `schemaVersion` value until you update it. + +=== "Production (rolling safety gap)" + + Keep the binary always one version ahead of the schema. This ensures you can roll back at any time because the running binary has already been validated with the current schema. + + **Example:** Your cluster is at binary `0.110.0` with schema `0.110.0`. A new version `0.111.0` is available. -### Step 2: Monitor the Upgrade + **Step 1: Upgrade the binary and finalize the *previous* schema together.** + + ```yaml title="documentdb.yaml" + apiVersion: documentdb.io/preview + kind: DocumentDB + metadata: + name: my-cluster + namespace: default + spec: + documentDBVersion: "0.111.0" # upgrade binary to new version + schemaVersion: "0.110.0" # finalize schema to current (previous) version + ``` + + ```bash + kubectl apply -f documentdb.yaml + ``` + + Now the binary is `0.111.0` and the schema is `0.110.0`. Each new binary version is backward-compatible with the previous schema version, so this is safe. + + **Step 2: Validate.** Run your tests. If something goes wrong, revert `documentDBVersion` to `0.110.0` — the schema is still at `0.110.0`, so rollback is safe. + + **On the next upgrade** (e.g., `0.112.0`), repeat the pattern: + + ```yaml + spec: + documentDBVersion: "0.112.0" # upgrade binary to next version + schemaVersion: "0.111.0" # finalize schema to previous binary version + ``` + + !!! info + This pattern keeps a permanent rollback window. The schema is always one version behind the binary, so you never commit a schema change until the *next* binary has proven stable with it. + +=== "Development (auto mode)" + + Update both the binary and schema in one step: + + ```yaml title="documentdb.yaml" + apiVersion: documentdb.io/preview + kind: DocumentDB + metadata: + name: my-cluster + namespace: default + spec: + documentDBVersion: "" + schemaVersion: "auto" + ``` + + ```bash + kubectl apply -f documentdb.yaml + ``` + + !!! warning + With `schemaVersion: "auto"`, the schema migration is irreversible once applied. You cannot roll back to the previous version — only restore from backup. + +### Monitoring the Upgrade ```bash # Watch the rolling restart @@ -146,45 +258,64 @@ kubectl get pods -n default -w # Check DocumentDB cluster status kubectl get documentdb my-cluster -n default -# Check the schema version after upgrade +# Check the current schema version kubectl get documentdb my-cluster -n default -o jsonpath='{.status.schemaVersion}' ``` ### Rollback and Recovery +Two rules govern rollback: + +1. **Schema cannot be rolled back.** `ALTER EXTENSION UPDATE` modifies database catalog tables permanently. There is no `ALTER EXTENSION DOWNGRADE`. +2. **`documentDBVersion` cannot be set below `status.schemaVersion`.** The operator blocks this because running an older binary against a newer schema is untested and may cause data corruption. + Whether you can roll back depends on whether the schema has been updated: -=== "Schema not yet updated" +=== "Roll back binary only (documentDBVersion ≥ schemaVersion)" - If `status.schemaVersion` still shows the **previous** version, the extension schema migration has not run yet. You can roll back by reverting `spec.documentDBVersion` to the previous value: + If `status.schemaVersion` still shows the **previous** version, the schema migration has not run yet. You can safely roll back by reverting `spec.documentDBVersion` to any version that is ≥ `status.schemaVersion`: ```bash - # Check the current schema version + # Verify the schema version is unchanged kubectl get documentdb my-cluster -n default -o jsonpath='{.status.schemaVersion}' ``` - If the schema version is unchanged, revert the `spec.documentDBVersion` field in your manifest and reapply: + If the schema version is unchanged, revert `spec.documentDBVersion` in your manifest and reapply: ```bash kubectl apply -f documentdb.yaml ``` -=== "Schema already updated" +=== "Roll back both binary and schema (restore from backup)" + + If `status.schemaVersion` shows the **new** version, the schema migration has already been applied. At this point: - If `status.schemaVersion` shows the **new** version, the schema migration has already been applied and **cannot be reversed**. To recover: Use the backup you created in the [Pre-Upgrade Checklist](#pre-upgrade-checklist) to restore the DocumentDB cluster to its pre-upgrade state. See [Backup and Restore](backup-and-restore.md) for instructions. + - You **cannot** revert `schemaVersion` — the database schema change is permanent. + - You **cannot** set `documentDBVersion` below `status.schemaVersion` — the operator rejects it. + + To recover: restore from the backup you created in the [Pre-Upgrade Checklist](#pre-upgrade-checklist). See [Backup and Restore](backup-and-restore.md) for instructions. !!! tip - This is why backing up before a component upgrade is critical. Once the schema is updated, there is no rollback — only restore. + This is why the default two-phase mode exists — it gives you a rollback-safe window before committing the schema change. Always back up before upgrading, and validate the new binary before setting `schemaVersion`. + +--- + +## Multi-Region Upgrades + +When running DocumentDB across multiple regions or clusters, use the two-phase upgrade pattern across all regions: -### How It Works +1. **Back up every region.** Create a backup in each region before starting. +2. **Upgrade the binary in all regions.** Update `spec.documentDBVersion` in every cluster. Validate that all regions are healthy and replication is working correctly with the new binary. +3. **Finalize the schema in all regions.** Once every region is running the new binary successfully, set `spec.schemaVersion` across all clusters. -1. You update the `spec.documentDBVersion` field. -2. The operator detects the version change and updates both the database image and the gateway sidecar image. -3. The underlying cluster manager performs a **rolling restart**: replicas are restarted first one at a time, then the **primary is restarted in place**. Expect a brief period of downtime while the primary pod restarts. -4. After the primary pod is healthy, the operator runs `ALTER EXTENSION documentdb UPDATE` to update the database schema. -5. The operator tracks the schema version in `status.schemaVersion`. +This keeps a rollback-safe window — if any region fails the binary upgrade, you can revert `documentDBVersion` everywhere before any schema change is committed. + +!!! note + Multi-region upgrade orchestration is performed manually — the operator manages individual clusters and does not coordinate across regions automatically. + +--- -### Advanced: Independent Image Overrides +## Advanced: Independent Image Overrides In most cases, use `spec.documentDBVersion` to upgrade both components together. For advanced scenarios, you can override individual images: diff --git a/operator/documentdb-helm-chart/crds/documentdb.io_dbs.yaml b/operator/documentdb-helm-chart/crds/documentdb.io_dbs.yaml index ee2827f5..36191073 100644 --- a/operator/documentdb-helm-chart/crds/documentdb.io_dbs.yaml +++ b/operator/documentdb-helm-chart/crds/documentdb.io_dbs.yaml @@ -1264,6 +1264,25 @@ spec: required: - storage type: object + schemaVersion: + description: |- + SchemaVersion controls the desired schema version for the DocumentDB extension. + + This field decouples the extension binary (image) update from the schema update + (ALTER EXTENSION documentdb UPDATE), providing a rollback-safe upgrade window. + + Values: + - "" (empty, default): Two-phase mode. Image upgrades happen automatically, + but ALTER EXTENSION UPDATE does NOT run. Users must explicitly set this + field to finalize the schema upgrade. This is the safest option for production + as it allows rollback by reverting the image before committing the schema change. + - "auto": Schema automatically updates to match the binary version whenever + the binary is upgraded. This is the simplest mode but provides no rollback + safety window. Recommended for development and testing environments. + - "" (e.g. "0.112.0"): Schema updates to exactly this version. + Must be <= the binary version. + pattern: ^(auto|[0-9]+\.[0-9]+\.[0-9]+)?$ + type: string sidecarInjectorPluginName: description: SidecarInjectorPluginName is the name of the sidecar injector plugin to use. diff --git a/operator/documentdb-helm-chart/templates/09_documentdb_operator.yaml b/operator/documentdb-helm-chart/templates/09_documentdb_operator.yaml index 59a24c66..6202d565 100644 --- a/operator/documentdb-helm-chart/templates/09_documentdb_operator.yaml +++ b/operator/documentdb-helm-chart/templates/09_documentdb_operator.yaml @@ -22,6 +22,39 @@ spec: - name: documentdb-operator image: "{{ .Values.image.documentdbk8soperator.repository }}:{{ .Values.image.documentdbk8soperator.tag | default .Chart.AppVersion }}" imagePullPolicy: "{{ .Values.image.documentdbk8soperator.pullPolicy }}" + args: + - --webhook-cert-path=/tmp/k8s-webhook-server/serving-certs + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP + - containerPort: 8081 + name: health + protocol: TCP + # Readiness probe gates the Service endpoint so the API server cannot + # route webhook requests until the TLS cert is loaded (CNPG pattern). + readinessProbe: + httpGet: + path: /readyz + port: health + initialDelaySeconds: 5 + periodSeconds: 10 + livenessProbe: + httpGet: + path: /healthz + port: health + initialDelaySeconds: 15 + periodSeconds: 20 + startupProbe: + httpGet: + path: /readyz + port: health + failureThreshold: 30 + periodSeconds: 2 + volumeMounts: + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: webhook-cert + readOnly: true env: - name: GATEWAY_PORT value: "10260" @@ -37,3 +70,12 @@ spec: - name: DOCUMENTDB_IMAGE_PULL_POLICY value: "{{ .Values.documentDbImagePullPolicy }}" {{- end }} + volumes: + - name: webhook-cert + secret: + secretName: documentdb-webhook-tls + defaultMode: 420 + # Allow the pod to start before cert-manager creates the Secret. + # The webhook server (and readiness probe) will stay unhealthy until + # the cert files appear, keeping the pod out of the Service endpoints. + optional: true diff --git a/operator/documentdb-helm-chart/templates/10_documentdb_webhook.yaml b/operator/documentdb-helm-chart/templates/10_documentdb_webhook.yaml new file mode 100644 index 00000000..254efd08 --- /dev/null +++ b/operator/documentdb-helm-chart/templates/10_documentdb_webhook.yaml @@ -0,0 +1,98 @@ +{{- $ns := .Values.namespace | default .Release.Namespace -}} +# Self-signed Issuer for webhook TLS certificates in the operator namespace. +# The sidecar injector uses a separate Issuer in cnpg-system; this one is for the operator. +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: documentdb-operator-selfsigned-issuer + namespace: {{ $ns }} + labels: + app.kubernetes.io/name: {{ include "documentdb-chart.name" . }} + app.kubernetes.io/managed-by: "Helm" +spec: + selfSigned: {} +--- +# TLS certificate for the validating webhook server. +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: documentdb-webhook-cert + namespace: {{ $ns }} + labels: + app.kubernetes.io/name: {{ include "documentdb-chart.name" . }} + app.kubernetes.io/managed-by: "Helm" +spec: + commonName: documentdb-webhook + dnsNames: + - documentdb-webhook-service.{{ $ns }}.svc + - documentdb-webhook-service.{{ $ns }}.svc.cluster.local + duration: 2160h # 90 days + renewBefore: 360h # 15 days + isCA: false + issuerRef: + group: cert-manager.io + kind: Issuer + name: documentdb-operator-selfsigned-issuer + secretName: documentdb-webhook-tls + usages: + - server auth +--- +# Service fronting the operator's webhook server on port 9443. +# NOTE: The service name is hardcoded because the operator currently assumes +# a single installation per cluster. If multi-instance support is needed, +# template the name using a fullname helper and update the +# ValidatingWebhookConfiguration's clientConfig.service.name to match. +apiVersion: v1 +kind: Service +metadata: + name: documentdb-webhook-service + namespace: {{ $ns }} + labels: + app.kubernetes.io/name: {{ include "documentdb-chart.name" . }} + app.kubernetes.io/managed-by: "Helm" +spec: + ports: + - port: 443 + protocol: TCP + targetPort: 9443 + selector: + app: {{ .Release.Name }} +--- +# ValidatingWebhookConfiguration for DocumentDB resources. +# cert-manager injects the CA bundle automatically via the annotation. +# NOTE: This is a cluster-scoped resource with a hardcoded name. Multiple +# Helm releases would overwrite each other. If multi-instance support is +# needed, template the name and the clientConfig.service references. +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + name: documentdb-validating-webhook + labels: + app.kubernetes.io/name: {{ include "documentdb-chart.name" . }} + app.kubernetes.io/managed-by: "Helm" + annotations: + cert-manager.io/inject-ca-from: {{ $ns }}/documentdb-webhook-cert +webhooks: + - name: vdocumentdb.kb.io + admissionReviewVersions: + - v1 + clientConfig: + service: + name: documentdb-webhook-service + namespace: {{ $ns }} + path: /validate-documentdb-io-preview-documentdb + # Fail-closed: reject requests when the webhook is unreachable. + # Safe because readiness/startup probes keep the pod out of the + # Service endpoints until the TLS cert is loaded (CNPG pattern). + failurePolicy: Fail + rules: + - apiGroups: + - documentdb.io + apiVersions: + - preview + operations: + - CREATE + - UPDATE + resources: + - dbs + sideEffects: None diff --git a/operator/src/api/preview/documentdb_types.go b/operator/src/api/preview/documentdb_types.go index 72185332..4d95ac9c 100644 --- a/operator/src/api/preview/documentdb_types.go +++ b/operator/src/api/preview/documentdb_types.go @@ -103,6 +103,31 @@ type DocumentDBSpec struct { // +kubebuilder:validation:XValidation:rule="self.all(key, key in ['ChangeStreams'])",message="unsupported feature gate key; allowed keys: ChangeStreams" FeatureGates map[string]bool `json:"featureGates,omitempty"` + // SchemaVersion controls the desired schema version for the DocumentDB extension. + // + // The operator never changes your database schema unless you ask: + // - Set documentDBVersion → updates the binary (safe to roll back) + // - Set schemaVersion → updates the database schema (irreversible) + // - Set schemaVersion: "auto" → schema auto-updates with binary + // + // Once the schema has been updated, the operator blocks image rollback below the + // installed schema version to prevent running an untested binary/schema combination. + // + // Values: + // - "" (empty, default): Two-phase mode. Image upgrades happen automatically, + // but ALTER EXTENSION UPDATE does NOT run. Users must explicitly set this + // field to finalize the schema upgrade. This is the safest option for production + // as it allows rollback by reverting the image before committing the schema change. + // - "auto": Schema automatically updates to match the binary version whenever + // the binary is upgraded. This is the simplest mode but provides no rollback + // safety window. + // - "" (e.g. "0.112.0"): Schema updates to exactly this version. + // Must be <= the binary version. + // + // +kubebuilder:validation:Pattern=`^(auto|[0-9]+\.[0-9]+\.[0-9]+)?$` + // +optional + SchemaVersion string `json:"schemaVersion,omitempty"` + // Affinity/Anti-affinity rules for Pods (cnpg passthrough) // +optional Affinity cnpgv1.AffinityConfiguration `json:"affinity,omitempty"` diff --git a/operator/src/cmd/main.go b/operator/src/cmd/main.go index bf5f2ddf..f7d9258b 100644 --- a/operator/src/cmd/main.go +++ b/operator/src/cmd/main.go @@ -29,6 +29,7 @@ import ( cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" dbpreview "github.com/documentdb/documentdb-operator/api/preview" "github.com/documentdb/documentdb-operator/internal/controller" + webhookhandler "github.com/documentdb/documentdb-operator/internal/webhook" fleetv1alpha1 "go.goms.io/fleet-networking/api/v1alpha1" // +kubebuilder:scaffold:imports ) @@ -248,6 +249,12 @@ func main() { // +kubebuilder:scaffold:builder + // Register the DocumentDB validating webhook + if err = (&webhookhandler.DocumentDBValidator{}).SetupWebhookWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create webhook", "webhook", "DocumentDB") + os.Exit(1) + } + if metricsCertWatcher != nil { setupLog.Info("Adding metrics certificate watcher to manager") if err := mgr.Add(metricsCertWatcher); err != nil { @@ -272,6 +279,14 @@ func main() { setupLog.Error(err, "unable to set up ready check") os.Exit(1) } + // Gate readiness on webhook server startup so the Service has no ready + // endpoints until the TLS cert is loaded. This prevents the API server + // from routing admission requests to an operator that cannot serve them + // (matching CNPG's readiness-probe pattern for failurePolicy: Fail). + if err := mgr.AddReadyzCheck("webhook", webhookServer.StartedChecker()); err != nil { + setupLog.Error(err, "unable to set up webhook ready check") + os.Exit(1) + } setupLog.Info("starting manager") if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { diff --git a/operator/src/config/crd/bases/documentdb.io_dbs.yaml b/operator/src/config/crd/bases/documentdb.io_dbs.yaml index ee2827f5..36191073 100644 --- a/operator/src/config/crd/bases/documentdb.io_dbs.yaml +++ b/operator/src/config/crd/bases/documentdb.io_dbs.yaml @@ -1264,6 +1264,25 @@ spec: required: - storage type: object + schemaVersion: + description: |- + SchemaVersion controls the desired schema version for the DocumentDB extension. + + This field decouples the extension binary (image) update from the schema update + (ALTER EXTENSION documentdb UPDATE), providing a rollback-safe upgrade window. + + Values: + - "" (empty, default): Two-phase mode. Image upgrades happen automatically, + but ALTER EXTENSION UPDATE does NOT run. Users must explicitly set this + field to finalize the schema upgrade. This is the safest option for production + as it allows rollback by reverting the image before committing the schema change. + - "auto": Schema automatically updates to match the binary version whenever + the binary is upgraded. This is the simplest mode but provides no rollback + safety window. Recommended for development and testing environments. + - "" (e.g. "0.112.0"): Schema updates to exactly this version. + Must be <= the binary version. + pattern: ^(auto|[0-9]+\.[0-9]+\.[0-9]+)?$ + type: string sidecarInjectorPluginName: description: SidecarInjectorPluginName is the name of the sidecar injector plugin to use. diff --git a/operator/src/internal/cnpg/cnpg_cluster.go b/operator/src/internal/cnpg/cnpg_cluster.go index 614469fc..6d7faa17 100644 --- a/operator/src/internal/cnpg/cnpg_cluster.go +++ b/operator/src/internal/cnpg/cnpg_cluster.go @@ -65,8 +65,9 @@ func GetCnpgClusterSpec(req ctrl.Request, documentdb *dbpreview.DocumentDB, docu }, Spec: func() cnpgv1.ClusterSpec { spec := cnpgv1.ClusterSpec{ - Instances: documentdb.Spec.InstancesPerNode, - ImageName: documentdb.Spec.PostgresImage, + Instances: documentdb.Spec.InstancesPerNode, + ImageName: documentdb.Spec.PostgresImage, + PrimaryUpdateMethod: cnpgv1.PrimaryUpdateMethodSwitchover, StorageConfiguration: cnpgv1.StorageConfiguration{ StorageClass: storageClassPointer, // Use configured storage class or default Size: documentdb.Spec.Resource.Storage.PvcSize, diff --git a/operator/src/internal/controller/documentdb_controller.go b/operator/src/internal/controller/documentdb_controller.go index 0b265ebc..2f2009f8 100644 --- a/operator/src/internal/controller/documentdb_controller.go +++ b/operator/src/internal/controller/documentdb_controller.go @@ -835,8 +835,9 @@ func parseExtensionVersionsFromOutput(output string) (defaultVersion, installedV // upgradeDocumentDBIfNeeded handles the complete DocumentDB image upgrade process: // 1. Checks if extension image and/or gateway image need updating (builds a single JSON Patch) +// 1b. Blocks extension image rollback below the installed schema version (irreversible) // 2. If images changed, applies the patch atomically (triggers one CNPG rolling restart) and returns -// 3. After rolling restart completes, runs ALTER EXTENSION documentdb UPDATE if needed +// 3. After rolling restart completes, checks spec.schemaVersion to decide whether to run ALTER EXTENSION // 4. Updates the DocumentDB status with the new extension version func (r *DocumentDBReconciler) upgradeDocumentDBIfNeeded(ctx context.Context, currentCluster, desiredCluster *cnpgv1.Cluster, documentdb *dbpreview.DocumentDB) error { logger := log.FromContext(ctx) @@ -852,6 +853,9 @@ func (r *DocumentDBReconciler) upgradeDocumentDBIfNeeded(ctx context.Context, cu return fmt.Errorf("failed to build image patch operations: %w", err) } + // Note: Image rollback below installed schema version is blocked by the validating + // webhook at admission time. The controller trusts that persisted specs are valid. + // Step 2: Apply patch if any images need updating if len(patchOps) > 0 { patchBytes, err := json.Marshal(patchOps) @@ -951,7 +955,10 @@ func (r *DocumentDBReconciler) upgradeDocumentDBIfNeeded(ctx context.Context, cu return nil } - // Step 5b: Rollback detection — check if the new binary is older than the installed schema + // Step 5b: Rollback detection — secondary safety net. + // The primary rollback block is in Step 1b (prevents image patch when new image < schema). + // This handles edge cases where the binary version is already running but older than schema + // (e.g., if Step 1b was added after an earlier rollback already took effect). cmp, err := util.CompareExtensionVersions(defaultVersion, installedVersion) if err != nil { logger.Error(err, "Failed to compare extension versions, skipping ALTER EXTENSION as a safety measure", @@ -976,27 +983,39 @@ func (r *DocumentDBReconciler) upgradeDocumentDBIfNeeded(ctx context.Context, cu return nil } - // Step 6: Run ALTER EXTENSION to upgrade (cmp > 0: defaultVersion > installedVersion) + // Step 6: Determine schema target based on spec.schemaVersion (two-phase upgrade logic) + // + // Three modes: + // - "" (empty): Two-phase mode — do NOT auto-update schema. User must explicitly + // set spec.schemaVersion to finalize. This provides a rollback-safe window. + // - "auto": Auto-finalize — update schema to match binary version automatically. + // - "": Explicit pin — update schema to exactly this version. + schemaTarget, updateSQL := r.determineSchemaTarget(ctx, documentdb, defaultVersion, installedVersion) + if schemaTarget == "" { + // Two-phase mode or validation failure — do not run ALTER EXTENSION + return nil + } + + // Step 7: Run ALTER EXTENSION to upgrade logger.Info("Upgrading DocumentDB extension", "fromVersion", installedVersion, - "toVersion", defaultVersion) + "toVersion", schemaTarget) - updateSQL := "ALTER EXTENSION documentdb UPDATE" if _, err := r.SQLExecutor(ctx, currentCluster, updateSQL); err != nil { return fmt.Errorf("failed to run ALTER EXTENSION documentdb UPDATE: %w", err) } logger.Info("Successfully upgraded DocumentDB extension", "fromVersion", installedVersion, - "toVersion", defaultVersion) + "toVersion", schemaTarget) - // Step 7: Update DocumentDB schema version in status after upgrade + // Step 8: Update DocumentDB schema version in status after upgrade // Re-fetch to get latest resourceVersion before status update if err := r.Get(ctx, types.NamespacedName{Name: documentdb.Name, Namespace: documentdb.Namespace}, documentdb); err != nil { return fmt.Errorf("failed to refetch DocumentDB after schema upgrade: %w", err) } // Convert from pg_available_extensions format ("0.110-0") to semver ("0.110.0") - documentdb.Status.SchemaVersion = util.ExtensionVersionToSemver(defaultVersion) + documentdb.Status.SchemaVersion = util.ExtensionVersionToSemver(schemaTarget) if err := r.Status().Update(ctx, documentdb); err != nil { logger.Error(err, "Failed to update DocumentDB status after schema upgrade") return fmt.Errorf("failed to update DocumentDB status after schema upgrade: %w", err) @@ -1005,6 +1024,89 @@ func (r *DocumentDBReconciler) upgradeDocumentDBIfNeeded(ctx context.Context, cu return nil } +// determineSchemaTarget decides the target schema version based on spec.schemaVersion. +// Returns the target version (in pg_available_extensions format) and the SQL to execute. +// Returns ("", "") if no schema update should run (two-phase mode or validation failure). +func (r *DocumentDBReconciler) determineSchemaTarget( + ctx context.Context, + documentdb *dbpreview.DocumentDB, + binaryVersion string, + installedVersion string, +) (string, string) { + logger := log.FromContext(ctx) + specSchemaVersion := documentdb.Spec.SchemaVersion + + switch { + case specSchemaVersion == "": + // Two-phase mode: schema stays at current version until user explicitly sets schemaVersion. + // Use V(1) to avoid log spam — this fires every reconcile while upgrade is pending. + logger.V(1).Info("Schema update available but not requested (two-phase mode). "+ + "Set spec.schemaVersion to finalize the upgrade.", + "binaryVersion", binaryVersion, + "installedVersion", installedVersion) + // Only emit the event when the binary is actually ahead of the installed schema + // (avoids firing on new clusters where status.schemaVersion is empty). + // Kubernetes deduplicates events with the same reason+message, but skipping + // entirely is cleaner. + if r.Recorder != nil && binaryVersion != installedVersion { + msg := fmt.Sprintf( + "Schema update available: binary version is %s, schema is at %s. "+ + "Set spec.schemaVersion to %q or \"auto\" to finalize the upgrade.", + util.ExtensionVersionToSemver(binaryVersion), + util.ExtensionVersionToSemver(installedVersion), + util.ExtensionVersionToSemver(binaryVersion)) + r.Recorder.Event(documentdb, corev1.EventTypeNormal, "SchemaUpdateAvailable", msg) + } + return "", "" + + case specSchemaVersion == "auto": + // Auto-finalize: update schema to match binary version + return binaryVersion, "ALTER EXTENSION documentdb UPDATE" + + default: + // Explicit version: update to that specific version. + // Note: schemaVersion <= binary version is enforced by the validating webhook at + // admission time. This is a defense-in-depth guard. + targetPgVersion := util.SemverToExtensionVersion(specSchemaVersion) + + // Guard: target must not exceed binary version + targetCmp, err := util.CompareExtensionVersions(targetPgVersion, binaryVersion) + if err != nil { + logger.Error(err, "Failed to compare target schema version with binary version", + "targetVersion", specSchemaVersion, + "binaryVersion", binaryVersion) + return "", "" + } + if targetCmp > 0 { + logger.Info("Skipping schema update: schemaVersion exceeds binary version (should have been rejected by webhook)", + "schemaVersion", specSchemaVersion, + "binaryVersion", util.ExtensionVersionToSemver(binaryVersion)) + return "", "" + } + + // Check: target must be > installed version (otherwise no-op) + installedCmp, err := util.CompareExtensionVersions(targetPgVersion, installedVersion) + if err != nil { + logger.Error(err, "Failed to compare target schema version with installed version", + "targetVersion", specSchemaVersion, + "installedVersion", installedVersion) + return "", "" + } + if installedCmp <= 0 { + logger.V(1).Info("Schema already at or beyond requested version", + "requestedVersion", specSchemaVersion, + "installedVersion", installedVersion) + return "", "" + } + + // Safety: targetPgVersion is derived from spec.schemaVersion via SemverToExtensionVersion, + // which produces a "Major.Minor-Patch" string (e.g., "0.112-0"). The input is validated + // by the CRD schema regex pattern and the validating webhook's version parsing, so it + // contains only digits, dots, and hyphens — no SQL injection risk. + return targetPgVersion, fmt.Sprintf("ALTER EXTENSION documentdb UPDATE TO '%s'", targetPgVersion) + } +} + // buildImagePatchOps compares the current and desired CNPG cluster specs and returns // JSON Patch operations for any image differences (extension image settings and/or gateway image). // This is a pure function with no API calls. Returns: diff --git a/operator/src/internal/controller/documentdb_controller_test.go b/operator/src/internal/controller/documentdb_controller_test.go index 59ef63b5..49394a41 100644 --- a/operator/src/internal/controller/documentdb_controller_test.go +++ b/operator/src/internal/controller/documentdb_controller_test.go @@ -1696,6 +1696,9 @@ var _ = Describe("DocumentDB Controller", func() { Name: "test-documentdb", Namespace: clusterNamespace, }, + Spec: dbpreview.DocumentDBSpec{ + SchemaVersion: "auto", + }, } fakeClient := fake.NewClientBuilder(). @@ -1767,6 +1770,9 @@ var _ = Describe("DocumentDB Controller", func() { Name: "test-documentdb", Namespace: clusterNamespace, }, + Spec: dbpreview.DocumentDBSpec{ + SchemaVersion: "auto", + }, } fakeClient := fake.NewClientBuilder(). @@ -2241,6 +2247,9 @@ var _ = Describe("DocumentDB Controller", func() { Name: "test-documentdb", Namespace: clusterNamespace, }, + Spec: dbpreview.DocumentDBSpec{ + SchemaVersion: "auto", + }, Status: dbpreview.DocumentDBStatus{ // Images match cluster so updateImageStatus is a no-op DocumentDBImage: "documentdb/documentdb:v1.0.0", @@ -2282,6 +2291,428 @@ var _ = Describe("DocumentDB Controller", func() { }) }) + Describe("two-phase upgrade (spec.schemaVersion)", func() { + It("should skip ALTER EXTENSION when schemaVersion is empty (two-phase mode)", func() { + cluster := &cnpgv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName, + Namespace: clusterNamespace, + }, + Spec: cnpgv1.ClusterSpec{ + PostgresConfiguration: cnpgv1.PostgresConfiguration{ + Extensions: []cnpgv1.ExtensionConfiguration{ + { + Name: "documentdb", + ImageVolumeSource: corev1.ImageVolumeSource{ + Reference: "documentdb/documentdb:v1.0.0", + }, + }, + }, + }, + }, + Status: cnpgv1.ClusterStatus{ + CurrentPrimary: "test-cluster-1", + InstancesStatus: map[cnpgv1.PodStatus][]string{ + cnpgv1.PodHealthy: {"test-cluster-1"}, + }, + }, + } + + desiredCluster := cluster.DeepCopy() + + // schemaVersion is empty (default) → two-phase mode + documentdb := &dbpreview.DocumentDB{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-documentdb", + Namespace: clusterNamespace, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(cluster, documentdb). + WithStatusSubresource(&dbpreview.DocumentDB{}). + Build() + + sqlCalls := []string{} + reconciler := &DocumentDBReconciler{ + Client: fakeClient, + Scheme: scheme, + Recorder: recorder, + SQLExecutor: func(_ context.Context, _ *cnpgv1.Cluster, sql string) (string, error) { + sqlCalls = append(sqlCalls, sql) + // Binary is 0.110-0, installed is 0.109-0 → upgrade available + return " default_version | installed_version \n-----------------+-------------------\n 0.110-0 | 0.109-0 \n", nil + }, + } + + err := reconciler.upgradeDocumentDBIfNeeded(ctx, cluster, desiredCluster, documentdb) + Expect(err).ToNot(HaveOccurred()) + + // Only version-check SQL should have been called (no ALTER EXTENSION) + Expect(sqlCalls).To(HaveLen(1)) + Expect(sqlCalls[0]).To(ContainSubstring("pg_available_extensions")) + + // Status should reflect installed version, NOT the binary version + updatedDB := &dbpreview.DocumentDB{} + Expect(fakeClient.Get(ctx, types.NamespacedName{Name: "test-documentdb", Namespace: clusterNamespace}, updatedDB)).To(Succeed()) + Expect(updatedDB.Status.SchemaVersion).To(Equal("0.109.0")) + + // SchemaUpdateAvailable event should have been emitted + Expect(recorder.Events).To(HaveLen(1)) + event := <-recorder.Events + Expect(event).To(ContainSubstring("SchemaUpdateAvailable")) + }) + + It("should run ALTER EXTENSION when schemaVersion is 'auto'", func() { + cluster := &cnpgv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName, + Namespace: clusterNamespace, + }, + Spec: cnpgv1.ClusterSpec{ + PostgresConfiguration: cnpgv1.PostgresConfiguration{ + Extensions: []cnpgv1.ExtensionConfiguration{ + { + Name: "documentdb", + ImageVolumeSource: corev1.ImageVolumeSource{ + Reference: "documentdb/documentdb:v1.0.0", + }, + }, + }, + }, + }, + Status: cnpgv1.ClusterStatus{ + CurrentPrimary: "test-cluster-1", + InstancesStatus: map[cnpgv1.PodStatus][]string{ + cnpgv1.PodHealthy: {"test-cluster-1"}, + }, + }, + } + + desiredCluster := cluster.DeepCopy() + + documentdb := &dbpreview.DocumentDB{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-documentdb", + Namespace: clusterNamespace, + }, + Spec: dbpreview.DocumentDBSpec{ + SchemaVersion: "auto", + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(cluster, documentdb). + WithStatusSubresource(&dbpreview.DocumentDB{}). + Build() + + sqlCalls := []string{} + reconciler := &DocumentDBReconciler{ + Client: fakeClient, + Scheme: scheme, + Recorder: recorder, + SQLExecutor: func(_ context.Context, _ *cnpgv1.Cluster, sql string) (string, error) { + sqlCalls = append(sqlCalls, sql) + if len(sqlCalls) == 1 { + return " default_version | installed_version \n-----------------+-------------------\n 0.110-0 | 0.109-0 \n", nil + } + return "ALTER EXTENSION", nil + }, + } + + err := reconciler.upgradeDocumentDBIfNeeded(ctx, cluster, desiredCluster, documentdb) + Expect(err).ToNot(HaveOccurred()) + + // Both version-check and ALTER EXTENSION should have been called + Expect(sqlCalls).To(HaveLen(2)) + Expect(sqlCalls[0]).To(ContainSubstring("pg_available_extensions")) + Expect(sqlCalls[1]).To(Equal("ALTER EXTENSION documentdb UPDATE")) + + // Status should reflect the upgraded version + updatedDB := &dbpreview.DocumentDB{} + Expect(fakeClient.Get(ctx, types.NamespacedName{Name: "test-documentdb", Namespace: clusterNamespace}, updatedDB)).To(Succeed()) + Expect(updatedDB.Status.SchemaVersion).To(Equal("0.110.0")) + }) + + It("should run ALTER EXTENSION UPDATE TO specific version when schemaVersion is explicit", func() { + cluster := &cnpgv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName, + Namespace: clusterNamespace, + }, + Spec: cnpgv1.ClusterSpec{ + PostgresConfiguration: cnpgv1.PostgresConfiguration{ + Extensions: []cnpgv1.ExtensionConfiguration{ + { + Name: "documentdb", + ImageVolumeSource: corev1.ImageVolumeSource{ + Reference: "documentdb/documentdb:v1.0.0", + }, + }, + }, + }, + }, + Status: cnpgv1.ClusterStatus{ + CurrentPrimary: "test-cluster-1", + InstancesStatus: map[cnpgv1.PodStatus][]string{ + cnpgv1.PodHealthy: {"test-cluster-1"}, + }, + }, + } + + desiredCluster := cluster.DeepCopy() + + // Explicit version: update to 0.110.0 (binary is also 0.110-0) + documentdb := &dbpreview.DocumentDB{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-documentdb", + Namespace: clusterNamespace, + }, + Spec: dbpreview.DocumentDBSpec{ + SchemaVersion: "0.110.0", + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(cluster, documentdb). + WithStatusSubresource(&dbpreview.DocumentDB{}). + Build() + + sqlCalls := []string{} + reconciler := &DocumentDBReconciler{ + Client: fakeClient, + Scheme: scheme, + Recorder: recorder, + SQLExecutor: func(_ context.Context, _ *cnpgv1.Cluster, sql string) (string, error) { + sqlCalls = append(sqlCalls, sql) + if len(sqlCalls) == 1 { + // Binary is 0.110-0, installed is 0.109-0 + return " default_version | installed_version \n-----------------+-------------------\n 0.110-0 | 0.109-0 \n", nil + } + return "ALTER EXTENSION", nil + }, + } + + err := reconciler.upgradeDocumentDBIfNeeded(ctx, cluster, desiredCluster, documentdb) + Expect(err).ToNot(HaveOccurred()) + + // Should run ALTER EXTENSION UPDATE TO specific version + Expect(sqlCalls).To(HaveLen(2)) + Expect(sqlCalls[0]).To(ContainSubstring("pg_available_extensions")) + Expect(sqlCalls[1]).To(Equal("ALTER EXTENSION documentdb UPDATE TO '0.110-0'")) + + // Status should reflect the explicit version + updatedDB := &dbpreview.DocumentDB{} + Expect(fakeClient.Get(ctx, types.NamespacedName{Name: "test-documentdb", Namespace: clusterNamespace}, updatedDB)).To(Succeed()) + Expect(updatedDB.Status.SchemaVersion).To(Equal("0.110.0")) + }) + + It("should skip when explicit schemaVersion exceeds binary version", func() { + // Note: In production, the validating webhook rejects schemaVersion > binary + // at admission time. This test verifies the controller gracefully handles the + // case as a no-op (defense-in-depth). + cluster := &cnpgv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName, + Namespace: clusterNamespace, + }, + Spec: cnpgv1.ClusterSpec{ + PostgresConfiguration: cnpgv1.PostgresConfiguration{ + Extensions: []cnpgv1.ExtensionConfiguration{ + { + Name: "documentdb", + ImageVolumeSource: corev1.ImageVolumeSource{ + Reference: "documentdb/documentdb:v1.0.0", + }, + }, + }, + }, + }, + Status: cnpgv1.ClusterStatus{ + CurrentPrimary: "test-cluster-1", + InstancesStatus: map[cnpgv1.PodStatus][]string{ + cnpgv1.PodHealthy: {"test-cluster-1"}, + }, + }, + } + + desiredCluster := cluster.DeepCopy() + + // Explicit version 0.112.0 exceeds binary version 0.110-0 + documentdb := &dbpreview.DocumentDB{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-documentdb", + Namespace: clusterNamespace, + }, + Spec: dbpreview.DocumentDBSpec{ + SchemaVersion: "0.112.0", + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(cluster, documentdb). + WithStatusSubresource(&dbpreview.DocumentDB{}). + Build() + + sqlCalls := []string{} + reconciler := &DocumentDBReconciler{ + Client: fakeClient, + Scheme: scheme, + Recorder: recorder, + SQLExecutor: func(_ context.Context, _ *cnpgv1.Cluster, sql string) (string, error) { + sqlCalls = append(sqlCalls, sql) + // Binary is 0.110-0, installed is 0.109-0 + return " default_version | installed_version \n-----------------+-------------------\n 0.110-0 | 0.109-0 \n", nil + }, + } + + err := reconciler.upgradeDocumentDBIfNeeded(ctx, cluster, desiredCluster, documentdb) + Expect(err).ToNot(HaveOccurred()) + + // Only version-check SQL should have been called (no ALTER EXTENSION) + Expect(sqlCalls).To(HaveLen(1)) + + // Status should reflect installed version (not upgraded) + updatedDB := &dbpreview.DocumentDB{} + Expect(fakeClient.Get(ctx, types.NamespacedName{Name: "test-documentdb", Namespace: clusterNamespace}, updatedDB)).To(Succeed()) + Expect(updatedDB.Status.SchemaVersion).To(Equal("0.109.0")) + }) + + It("should skip when explicit schemaVersion matches installed version", func() { + cluster := &cnpgv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName, + Namespace: clusterNamespace, + }, + Spec: cnpgv1.ClusterSpec{ + PostgresConfiguration: cnpgv1.PostgresConfiguration{ + Extensions: []cnpgv1.ExtensionConfiguration{ + { + Name: "documentdb", + ImageVolumeSource: corev1.ImageVolumeSource{ + Reference: "documentdb/documentdb:v1.0.0", + }, + }, + }, + }, + }, + Status: cnpgv1.ClusterStatus{ + CurrentPrimary: "test-cluster-1", + InstancesStatus: map[cnpgv1.PodStatus][]string{ + cnpgv1.PodHealthy: {"test-cluster-1"}, + }, + }, + } + + desiredCluster := cluster.DeepCopy() + + // Explicit version matches installed → no-op + documentdb := &dbpreview.DocumentDB{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-documentdb", + Namespace: clusterNamespace, + }, + Spec: dbpreview.DocumentDBSpec{ + SchemaVersion: "0.109.0", + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(cluster, documentdb). + WithStatusSubresource(&dbpreview.DocumentDB{}). + Build() + + sqlCalls := []string{} + reconciler := &DocumentDBReconciler{ + Client: fakeClient, + Scheme: scheme, + Recorder: recorder, + SQLExecutor: func(_ context.Context, _ *cnpgv1.Cluster, sql string) (string, error) { + sqlCalls = append(sqlCalls, sql) + // Binary is 0.110-0, installed is 0.109-0 + return " default_version | installed_version \n-----------------+-------------------\n 0.110-0 | 0.109-0 \n", nil + }, + } + + err := reconciler.upgradeDocumentDBIfNeeded(ctx, cluster, desiredCluster, documentdb) + Expect(err).ToNot(HaveOccurred()) + + // Only version-check SQL should have been called (no ALTER EXTENSION) + Expect(sqlCalls).To(HaveLen(1)) + }) + + It("should handle unparseable binary version gracefully", func() { + cluster := &cnpgv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName, + Namespace: clusterNamespace, + }, + Spec: cnpgv1.ClusterSpec{ + PostgresConfiguration: cnpgv1.PostgresConfiguration{ + Extensions: []cnpgv1.ExtensionConfiguration{ + { + Name: "documentdb", + ImageVolumeSource: corev1.ImageVolumeSource{ + Reference: "documentdb/documentdb:v1.0.0", + }, + }, + }, + }, + }, + Status: cnpgv1.ClusterStatus{ + CurrentPrimary: "test-cluster-1", + InstancesStatus: map[cnpgv1.PodStatus][]string{ + cnpgv1.PodHealthy: {"test-cluster-1"}, + }, + }, + } + + desiredCluster := cluster.DeepCopy() + + documentdb := &dbpreview.DocumentDB{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-documentdb", + Namespace: clusterNamespace, + }, + Spec: dbpreview.DocumentDBSpec{ + SchemaVersion: "0.110.0", + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(cluster, documentdb). + WithStatusSubresource(&dbpreview.DocumentDB{}). + Build() + + sqlCalls := []string{} + reconciler := &DocumentDBReconciler{ + Client: fakeClient, + Scheme: scheme, + Recorder: recorder, + SQLExecutor: func(_ context.Context, _ *cnpgv1.Cluster, sql string) (string, error) { + sqlCalls = append(sqlCalls, sql) + // Return unparseable binary version + return " default_version | installed_version \n-----------------+-------------------\n invalid | 0.109-0 \n", nil + }, + } + + err := reconciler.upgradeDocumentDBIfNeeded(ctx, cluster, desiredCluster, documentdb) + Expect(err).ToNot(HaveOccurred()) + + // Only version-check SQL called, no ALTER EXTENSION (graceful skip) + Expect(sqlCalls).To(HaveLen(1)) + }) + + // Note: Image rollback blocking is now enforced by the validating webhook at + // admission time. The controller no longer needs to check for this case. + }) + Describe("updateImageStatus", func() { It("should set DocumentDBImage and GatewayImage from cluster spec", func() { cluster := &cnpgv1.Cluster{ diff --git a/operator/src/internal/utils/util.go b/operator/src/internal/utils/util.go index fec8852f..751d2bce 100644 --- a/operator/src/internal/utils/util.go +++ b/operator/src/internal/utils/util.go @@ -510,6 +510,16 @@ func ExtensionVersionToSemver(v string) string { return v } +// SemverToExtensionVersion converts a semver string (e.g., "0.110.0") to the +// PostgreSQL extension version format (e.g., "0.110-0") used by pg_available_extensions. +// This is the inverse of ExtensionVersionToSemver. +func SemverToExtensionVersion(v string) string { + if idx := strings.LastIndex(v, "."); idx >= 0 { + return v[:idx] + "-" + v[idx+1:] + } + return v +} + // CompareExtensionVersions compares two DocumentDB extension version strings. // Format: "Major.Minor-Patch" (e.g., "0.110-0"). // Returns: -1 if v1 < v2, 0 if equal, +1 if v1 > v2. diff --git a/operator/src/internal/utils/util_test.go b/operator/src/internal/utils/util_test.go index c10f019b..ab0da533 100644 --- a/operator/src/internal/utils/util_test.go +++ b/operator/src/internal/utils/util_test.go @@ -1125,3 +1125,28 @@ func TestExtensionVersionToSemver(t *testing.T) { }) } } + +func TestSemverToExtensionVersion(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + {name: "standard semver", input: "0.110.0", expected: "0.110-0"}, + {name: "non-zero patch", input: "0.110.1", expected: "0.110-1"}, + {name: "major version", input: "1.0.0", expected: "1.0-0"}, + {name: "all non-zero", input: "2.15.3", expected: "2.15-3"}, + {name: "large numbers", input: "100.999.50", expected: "100.999-50"}, + {name: "already extension format (no dot)", input: "110", expected: "110"}, + {name: "empty string", input: "", expected: ""}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := SemverToExtensionVersion(tt.input) + if result != tt.expected { + t.Errorf("SemverToExtensionVersion(%q) = %q, want %q", tt.input, result, tt.expected) + } + }) + } +} diff --git a/operator/src/internal/webhook/documentdb_webhook.go b/operator/src/internal/webhook/documentdb_webhook.go new file mode 100644 index 00000000..01e26d18 --- /dev/null +++ b/operator/src/internal/webhook/documentdb_webhook.go @@ -0,0 +1,255 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package webhook + +import ( + "context" + "fmt" + "strconv" + "strings" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/validation/field" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/webhook" + "sigs.k8s.io/controller-runtime/pkg/webhook/admission" + + dbpreview "github.com/documentdb/documentdb-operator/api/preview" + util "github.com/documentdb/documentdb-operator/internal/utils" +) + +var documentdbLog = logf.Log.WithName("documentdb-webhook") + +// DocumentDBValidator validates DocumentDB resources on create and update. +type DocumentDBValidator struct { + client.Client +} + +var _ webhook.CustomValidator = &DocumentDBValidator{} + +// SetupWebhookWithManager registers the validating webhook with the manager. +func (v *DocumentDBValidator) SetupWebhookWithManager(mgr ctrl.Manager) error { + v.Client = mgr.GetClient() + return ctrl.NewWebhookManagedBy(mgr). + For(&dbpreview.DocumentDB{}). + WithValidator(v). + Complete() +} + +// NOTE: The kubebuilder marker below is used for local development with `make run`. +// For Helm-based deployments, the authoritative webhook configuration is in +// operator/documentdb-helm-chart/templates/10_documentdb_webhook.yaml. +// +kubebuilder:webhook:path=/validate-documentdb-io-preview-documentdb,mutating=false,failurePolicy=fail,sideEffects=None,groups=documentdb.io,resources=dbs,verbs=create;update,versions=preview,name=vdocumentdb.kb.io,admissionReviewVersions=v1 + +// ValidateCreate validates a DocumentDB resource on creation. +func (v *DocumentDBValidator) ValidateCreate(_ context.Context, obj runtime.Object) (admission.Warnings, error) { + documentdb, ok := obj.(*dbpreview.DocumentDB) + if !ok { + return nil, fmt.Errorf("expected DocumentDB but got %T", obj) + } + documentdbLog.Info("Validation for DocumentDB upon creation", "name", documentdb.Name, "namespace", documentdb.Namespace) + + allErrs := v.validate(documentdb) + if len(allErrs) == 0 { + return nil, nil + } + return nil, apierrors.NewInvalid( + schema.GroupKind{Group: "documentdb.io", Kind: "DocumentDB"}, + documentdb.Name, allErrs) +} + +// ValidateUpdate validates a DocumentDB resource on update. +func (v *DocumentDBValidator) ValidateUpdate(_ context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) { + newDB, ok := newObj.(*dbpreview.DocumentDB) + if !ok { + return nil, fmt.Errorf("expected DocumentDB but got %T", newObj) + } + oldDB, ok := oldObj.(*dbpreview.DocumentDB) + if !ok { + return nil, fmt.Errorf("expected DocumentDB but got %T", oldObj) + } + documentdbLog.Info("Validation for DocumentDB upon update", "name", newDB.Name, "namespace", newDB.Namespace) + + allErrs := append( + v.validate(newDB), + v.validateChanges(newDB, oldDB)..., + ) + if len(allErrs) == 0 { + return nil, nil + } + return nil, apierrors.NewInvalid( + schema.GroupKind{Group: "documentdb.io", Kind: "DocumentDB"}, + newDB.Name, allErrs) +} + +// ValidateDelete is a no-op for DocumentDB. +func (v *DocumentDBValidator) ValidateDelete(_ context.Context, _ runtime.Object) (admission.Warnings, error) { + return nil, nil +} + +// --------------------------------------------------------------------------- +// Spec-level validations (run on both create and update) +// --------------------------------------------------------------------------- + +// validate runs all spec-level validation rules, returning a combined error list. +func (v *DocumentDBValidator) validate(db *dbpreview.DocumentDB) (allErrs field.ErrorList) { + type validationFunc func(*dbpreview.DocumentDB) field.ErrorList + validations := []validationFunc{ + v.validateSchemaVersionNotExceedsBinary, + // Add new spec-level validations here. + } + for _, fn := range validations { + allErrs = append(allErrs, fn(db)...) + } + return allErrs +} + +// validateSchemaVersionNotExceedsBinary ensures spec.schemaVersion <= binary version. +func (v *DocumentDBValidator) validateSchemaVersionNotExceedsBinary(db *dbpreview.DocumentDB) field.ErrorList { + if db.Spec.SchemaVersion == "" || db.Spec.SchemaVersion == "auto" { + return nil + } + + binaryVersion := resolveBinaryVersion(db) + if binaryVersion == "" { + return field.ErrorList{field.Invalid( + field.NewPath("spec", "schemaVersion"), + db.Spec.SchemaVersion, + "cannot set an explicit schemaVersion without also setting spec.documentDBVersion or spec.documentDBImage; "+ + "the webhook needs a binary version to validate against", + )} + } + + schemaExtensionVersion := util.SemverToExtensionVersion(db.Spec.SchemaVersion) + binaryExtensionVersion := util.SemverToExtensionVersion(binaryVersion) + + cmp, err := util.CompareExtensionVersions(schemaExtensionVersion, binaryExtensionVersion) + if err != nil { + return field.ErrorList{field.Invalid( + field.NewPath("spec", "schemaVersion"), + db.Spec.SchemaVersion, + fmt.Sprintf("cannot validate schemaVersion: version comparison failed: %v", err), + )} + } + if cmp > 0 { + return field.ErrorList{field.Invalid( + field.NewPath("spec", "schemaVersion"), + db.Spec.SchemaVersion, + fmt.Sprintf("schemaVersion %s exceeds the binary version %s; schema version must be <= binary version", + db.Spec.SchemaVersion, binaryVersion), + )} + } + return nil +} + +// --------------------------------------------------------------------------- +// Update-only validations (compare old and new) +// --------------------------------------------------------------------------- + +// validateChanges runs all update-specific validation rules that compare old vs new state. +func (v *DocumentDBValidator) validateChanges(newDB, oldDB *dbpreview.DocumentDB) (allErrs field.ErrorList) { + type validationFunc func(newDB, oldDB *dbpreview.DocumentDB) field.ErrorList + validations := []validationFunc{ + v.validateImageRollback, + // Add new update-only validations here. + } + for _, fn := range validations { + allErrs = append(allErrs, fn(newDB, oldDB)...) + } + return allErrs +} + +// validateImageRollback blocks image downgrades below the installed schema version. +// Once ALTER EXTENSION UPDATE has run, the schema is irreversible. Running an older +// binary against a newer schema is untested and may cause data corruption. +func (v *DocumentDBValidator) validateImageRollback(newDB, oldDB *dbpreview.DocumentDB) field.ErrorList { + installedSchemaVersion := oldDB.Status.SchemaVersion + if installedSchemaVersion == "" { + return nil + } + + newBinaryVersion := resolveBinaryVersion(newDB) + if newBinaryVersion == "" { + return nil + } + + newBinaryExtensionVersion := util.SemverToExtensionVersion(newBinaryVersion) + schemaExtensionVersion := util.SemverToExtensionVersion(installedSchemaVersion) + + cmp, err := util.CompareExtensionVersions(newBinaryExtensionVersion, schemaExtensionVersion) + if err != nil { + return field.ErrorList{field.Forbidden( + field.NewPath("spec"), + fmt.Sprintf("cannot validate image rollback: version comparison failed: %v", err), + )} + } + if cmp < 0 { + return field.ErrorList{field.Forbidden( + field.NewPath("spec"), + fmt.Sprintf( + "image rollback blocked: requested version %s is older than installed schema version %s. "+ + "ALTER EXTENSION has no downgrade path — running an older binary with a newer schema may cause data corruption. "+ + "To recover, restore from backup or update to a version >= %s.", + newBinaryVersion, installedSchemaVersion, installedSchemaVersion), + )} + } + return nil +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +// resolveBinaryVersion extracts the effective binary version from a DocumentDB spec. +// Priority: documentDBImage tag > documentDBVersion > "" (unknown). +// Digest-only references (e.g., "image@sha256:...") are not parseable as versions +// and return "". +func resolveBinaryVersion(db *dbpreview.DocumentDB) string { + if db.Spec.DocumentDBImage != "" { + ref := db.Spec.DocumentDBImage + // Ignore digest-only references — they don't carry a version tag + if strings.Contains(ref, "@sha256:") { + return db.Spec.DocumentDBVersion + } + if tagIdx := strings.LastIndex(ref, ":"); tagIdx >= 0 { + tag := ref[tagIdx+1:] + // Extract leading semver (X.Y.Z) from tags like "0.112.0-amd64" + if semver := extractSemver(tag); semver != "" { + return semver + } + } + } + return db.Spec.DocumentDBVersion +} + +// extractSemver returns the leading "X.Y.Z" portion from a tag string, +// or "" if the tag doesn't start with a valid semver pattern. +func extractSemver(tag string) string { + // Match digits.digits.digits at start of string + parts := strings.SplitN(tag, ".", 3) + if len(parts) < 3 { + return "" + } + // Validate major and minor are numeric + if _, err := strconv.Atoi(parts[0]); err != nil { + return "" + } + if _, err := strconv.Atoi(parts[1]); err != nil { + return "" + } + // Third part may have a suffix (e.g., "0-amd64"), take only leading digits + thirdPart := parts[2] + i := 0 + for i < len(thirdPart) && thirdPart[i] >= '0' && thirdPart[i] <= '9' { + i++ + } + if i == 0 { + return "" + } + return parts[0] + "." + parts[1] + "." + thirdPart[:i] +} diff --git a/operator/src/internal/webhook/documentdb_webhook_test.go b/operator/src/internal/webhook/documentdb_webhook_test.go new file mode 100644 index 00000000..6b2484f1 --- /dev/null +++ b/operator/src/internal/webhook/documentdb_webhook_test.go @@ -0,0 +1,319 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package webhook + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + dbpreview "github.com/documentdb/documentdb-operator/api/preview" +) + +func newTestDocumentDB(version, schemaVersion, image string) *dbpreview.DocumentDB { + db := &dbpreview.DocumentDB{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-db", + Namespace: "default", + }, + Spec: dbpreview.DocumentDBSpec{ + NodeCount: 1, + InstancesPerNode: 1, + Resource: dbpreview.Resource{ + Storage: dbpreview.StorageConfiguration{PvcSize: "10Gi"}, + }, + }, + } + if version != "" { + db.Spec.DocumentDBVersion = version + } + if schemaVersion != "" { + db.Spec.SchemaVersion = schemaVersion + } + if image != "" { + db.Spec.DocumentDBImage = image + } + return db +} + +var _ = Describe("schema version validation", func() { + var v *DocumentDBValidator + + BeforeEach(func() { + v = &DocumentDBValidator{} + }) + + It("allows an empty schemaVersion", func() { + db := newTestDocumentDB("0.112.0", "", "") + result := v.validateSchemaVersionNotExceedsBinary(db) + Expect(result).To(BeEmpty()) + }) + + It("allows schemaVersion set to auto", func() { + db := newTestDocumentDB("0.112.0", "auto", "") + result := v.validateSchemaVersionNotExceedsBinary(db) + Expect(result).To(BeEmpty()) + }) + + It("allows schemaVersion equal to binary version", func() { + db := newTestDocumentDB("0.112.0", "0.112.0", "") + result := v.validateSchemaVersionNotExceedsBinary(db) + Expect(result).To(BeEmpty()) + }) + + It("allows schemaVersion below binary version", func() { + db := newTestDocumentDB("0.112.0", "0.110.0", "") + result := v.validateSchemaVersionNotExceedsBinary(db) + Expect(result).To(BeEmpty()) + }) + + It("rejects schemaVersion above binary version", func() { + db := newTestDocumentDB("0.110.0", "0.112.0", "") + result := v.validateSchemaVersionNotExceedsBinary(db) + Expect(result).To(HaveLen(1)) + Expect(result[0].Detail).To(ContainSubstring("exceeds the binary version")) + }) + + It("allows schemaVersion equal to image tag version", func() { + db := newTestDocumentDB("", "0.112.0", "ghcr.io/documentdb/documentdb:0.112.0") + result := v.validateSchemaVersionNotExceedsBinary(db) + Expect(result).To(BeEmpty()) + }) + + It("rejects schemaVersion above image tag version", func() { + db := newTestDocumentDB("", "0.115.0", "ghcr.io/documentdb/documentdb:0.112.0") + result := v.validateSchemaVersionNotExceedsBinary(db) + Expect(result).To(HaveLen(1)) + Expect(result[0].Detail).To(ContainSubstring("exceeds the binary version")) + }) + + It("rejects explicit schemaVersion when no binary version can be resolved", func() { + db := newTestDocumentDB("", "0.112.0", "") + result := v.validateSchemaVersionNotExceedsBinary(db) + Expect(result).To(HaveLen(1)) + Expect(result[0].Detail).To(ContainSubstring("cannot set an explicit schemaVersion without also setting")) + }) + + It("rejects when version comparison fails due to unparseable version", func() { + db := newTestDocumentDB("invalid", "0.112.0", "") + result := v.validateSchemaVersionNotExceedsBinary(db) + Expect(result).To(HaveLen(1)) + Expect(result[0].Detail).To(ContainSubstring("version comparison failed")) + }) +}) + +var _ = Describe("image rollback validation", func() { + var v *DocumentDBValidator + + BeforeEach(func() { + v = &DocumentDBValidator{} + }) + + It("allows upgrade above installed schema version", func() { + oldDB := newTestDocumentDB("0.110.0", "", "") + oldDB.Status.SchemaVersion = "0.110.0" + newDB := newTestDocumentDB("0.112.0", "", "") + result := v.validateImageRollback(newDB, oldDB) + Expect(result).To(BeEmpty()) + }) + + It("blocks image rollback below installed schema version", func() { + oldDB := newTestDocumentDB("0.112.0", "auto", "") + oldDB.Status.SchemaVersion = "0.112.0" + newDB := newTestDocumentDB("0.110.0", "auto", "") + result := v.validateImageRollback(newDB, oldDB) + Expect(result).To(HaveLen(1)) + Expect(result[0].Detail).To(ContainSubstring("image rollback blocked")) + }) + + It("allows rollback when no schema version is installed", func() { + oldDB := newTestDocumentDB("0.112.0", "", "") + newDB := newTestDocumentDB("0.110.0", "", "") + result := v.validateImageRollback(newDB, oldDB) + Expect(result).To(BeEmpty()) + }) + + It("allows same version on update", func() { + oldDB := newTestDocumentDB("0.112.0", "auto", "") + oldDB.Status.SchemaVersion = "0.112.0" + newDB := newTestDocumentDB("0.112.0", "auto", "") + result := v.validateImageRollback(newDB, oldDB) + Expect(result).To(BeEmpty()) + }) + + It("blocks image rollback via documentDBImage field", func() { + oldDB := newTestDocumentDB("", "", "ghcr.io/documentdb/documentdb:0.112.0") + oldDB.Status.SchemaVersion = "0.112.0" + newDB := newTestDocumentDB("", "", "ghcr.io/documentdb/documentdb:0.110.0") + result := v.validateImageRollback(newDB, oldDB) + Expect(result).To(HaveLen(1)) + Expect(result[0].Detail).To(ContainSubstring("image rollback blocked")) + }) + + It("skips validation when new binary version cannot be resolved", func() { + oldDB := newTestDocumentDB("", "", "") + oldDB.Status.SchemaVersion = "0.112.0" + newDB := newTestDocumentDB("", "", "") + result := v.validateImageRollback(newDB, oldDB) + Expect(result).To(BeEmpty()) + }) + + It("rejects when version comparison fails due to unparseable version", func() { + oldDB := newTestDocumentDB("invalid", "", "") + oldDB.Status.SchemaVersion = "invalid" + newDB := newTestDocumentDB("invalid", "", "") + result := v.validateImageRollback(newDB, oldDB) + Expect(result).To(HaveLen(1)) + Expect(result[0].Detail).To(ContainSubstring("version comparison failed")) + }) +}) + +var _ = Describe("ValidateCreate admission handler", func() { + var v *DocumentDBValidator + + BeforeEach(func() { + v = &DocumentDBValidator{} + }) + + It("allows a valid DocumentDB resource", func() { + db := newTestDocumentDB("0.112.0", "", "") + warnings, err := v.ValidateCreate(context.Background(), db) + Expect(err).ToNot(HaveOccurred()) + Expect(warnings).To(BeEmpty()) + }) + + It("rejects a resource with schemaVersion above binary", func() { + db := newTestDocumentDB("0.110.0", "0.112.0", "") + _, err := v.ValidateCreate(context.Background(), db) + Expect(err).To(HaveOccurred()) + }) + + It("returns error for non-DocumentDB object", func() { + _, err := v.ValidateCreate(context.Background(), &dbpreview.Backup{}) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("expected DocumentDB")) + }) +}) + +var _ = Describe("ValidateUpdate admission handler", func() { + var v *DocumentDBValidator + + BeforeEach(func() { + v = &DocumentDBValidator{} + }) + + It("allows a valid upgrade", func() { + oldDB := newTestDocumentDB("0.110.0", "", "") + oldDB.Status.SchemaVersion = "0.110.0" + newDB := newTestDocumentDB("0.112.0", "", "") + warnings, err := v.ValidateUpdate(context.Background(), oldDB, newDB) + Expect(err).ToNot(HaveOccurred()) + Expect(warnings).To(BeEmpty()) + }) + + It("rejects rollback below installed schema version", func() { + oldDB := newTestDocumentDB("0.112.0", "auto", "") + oldDB.Status.SchemaVersion = "0.112.0" + newDB := newTestDocumentDB("0.110.0", "auto", "") + _, err := v.ValidateUpdate(context.Background(), oldDB, newDB) + Expect(err).To(HaveOccurred()) + }) + + It("rejects schemaVersion above binary on update", func() { + oldDB := newTestDocumentDB("0.110.0", "", "") + oldDB.Status.SchemaVersion = "0.110.0" + newDB := newTestDocumentDB("0.110.0", "0.112.0", "") + _, err := v.ValidateUpdate(context.Background(), oldDB, newDB) + Expect(err).To(HaveOccurred()) + }) + + It("returns error when newObj is not a DocumentDB", func() { + oldDB := newTestDocumentDB("0.110.0", "", "") + _, err := v.ValidateUpdate(context.Background(), oldDB, &dbpreview.Backup{}) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("expected DocumentDB")) + }) + + It("returns error when oldObj is not a DocumentDB", func() { + newDB := newTestDocumentDB("0.112.0", "", "") + _, err := v.ValidateUpdate(context.Background(), &dbpreview.Backup{}, newDB) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("expected DocumentDB")) + }) +}) + +var _ = Describe("ValidateDelete admission handler", func() { + It("always allows deletion", func() { + v := &DocumentDBValidator{} + db := newTestDocumentDB("0.112.0", "auto", "") + warnings, err := v.ValidateDelete(context.Background(), db) + Expect(err).ToNot(HaveOccurred()) + Expect(warnings).To(BeEmpty()) + }) +}) + +var _ = Describe("resolveBinaryVersion helper", func() { + It("prefers the image tag over documentDBVersion", func() { + db := newTestDocumentDB("0.110.0", "", "ghcr.io/documentdb/documentdb:0.112.0") + Expect(resolveBinaryVersion(db)).To(Equal("0.112.0")) + }) + + It("falls back to documentDBVersion when no image is set", func() { + db := newTestDocumentDB("0.110.0", "", "") + Expect(resolveBinaryVersion(db)).To(Equal("0.110.0")) + }) + + It("returns empty when neither image nor version is set", func() { + db := newTestDocumentDB("", "", "") + Expect(resolveBinaryVersion(db)).To(BeEmpty()) + }) + + It("extracts semver from tag with architecture suffix", func() { + db := newTestDocumentDB("", "", "ghcr.io/documentdb/documentdb:0.112.0-amd64") + Expect(resolveBinaryVersion(db)).To(Equal("0.112.0")) + }) + + It("falls back to documentDBVersion for digest-only references", func() { + db := newTestDocumentDB("0.112.0", "", "ghcr.io/documentdb/documentdb@sha256:abc123") + Expect(resolveBinaryVersion(db)).To(Equal("0.112.0")) + }) + + It("returns empty for digest-only reference with no documentDBVersion", func() { + db := newTestDocumentDB("", "", "ghcr.io/documentdb/documentdb@sha256:abc123") + Expect(resolveBinaryVersion(db)).To(BeEmpty()) + }) + + It("handles image with port in registry and tag", func() { + db := newTestDocumentDB("", "", "localhost:5000/documentdb:0.112.0") + Expect(resolveBinaryVersion(db)).To(Equal("0.112.0")) + }) +}) + +var _ = Describe("extractSemver helper", func() { + It("extracts clean semver", func() { + Expect(extractSemver("0.112.0")).To(Equal("0.112.0")) + }) + + It("extracts semver from tag with suffix", func() { + Expect(extractSemver("0.112.0-amd64")).To(Equal("0.112.0")) + }) + + It("returns empty for non-semver tag", func() { + Expect(extractSemver("latest")).To(BeEmpty()) + }) + + It("returns empty for empty string", func() { + Expect(extractSemver("")).To(BeEmpty()) + }) + + It("returns empty for non-numeric major", func() { + Expect(extractSemver("abc.112.0")).To(BeEmpty()) + }) + + It("returns empty for non-numeric minor", func() { + Expect(extractSemver("0.abc.0")).To(BeEmpty()) + }) +}) diff --git a/operator/src/internal/webhook/suite_test.go b/operator/src/internal/webhook/suite_test.go new file mode 100644 index 00000000..b2c64e96 --- /dev/null +++ b/operator/src/internal/webhook/suite_test.go @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package webhook + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestWebhook(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Webhook Suite") +}