From af13ac341168483be56d7169bca6072df09fc007 Mon Sep 17 00:00:00 2001 From: Josh Purcell Date: Tue, 9 Sep 2025 08:17:19 -0500 Subject: [PATCH 01/25] add k8s prod readiness checklist --- .../kubernetes/k-production-checklist.adoc | 1312 +++++++++++++++++ .../kubernetes/k-production-deployment.adoc | 4 + .../kubernetes/k-production-workflow.adoc | 1 + .../redpanda/kubernetes/k-requirements.adoc | 5 +- .../deploy/partials/high-availability.adoc | 4 + 5 files changed, 1325 insertions(+), 1 deletion(-) create mode 100644 modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc new file mode 100644 index 0000000000..85dfe45570 --- /dev/null +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc @@ -0,0 +1,1312 @@ += Redpanda Kubernetes Production Readiness Checklist +:description: Comprehensive checklist for validating Redpanda deployments in Kubernetes against production readiness standards. +:page-context-links: [{"name": "Linux", "to": "deploy:redpanda/linux/index.adoc" },{"name": "Kubernetes", "to": "deploy:redpanda/kubernetes/index.adoc" } ] +:page-categories: Production, Deployment + +This checklist validates Redpanda deployments in Kubernetes against production readiness standards. Use this guide to ensure your cluster meets all critical requirements and follows recommended best practices for production deployments. + +NOTE: For non-Kubernetes deployments (Linux/VM-based), see the xref:deploy:redpanda/manual/production/production-readiness.adoc[Production Readiness Checklist for Linux]. + +== Critical Production Requirements + +These checks are essential for a stable, reliable production deployment. All critical requirements must pass before going live. 
+ +=== Authentication note + +The `rpk` commands throughout this checklist include SASL authentication flags (`-X user`, `-X pass`, `-X sasl.mechanism`). + +**If your cluster does not use SASL authentication**, you can omit these flags from all commands. For example: +[,bash] +---- +# With SASL authentication +kubectl exec -n <namespace> <pod-name> -c redpanda -- rpk cluster health -X user=<username> -X pass=<password> -X sasl.mechanism=<mechanism> + +# Without SASL authentication +kubectl exec -n <namespace> <pod-name> -c redpanda -- rpk cluster health +---- + +Replace `<namespace>`, `<pod-name>`, `<username>`, `<password>`, and `<mechanism>` with the values for your deployment. Common SASL mechanisms are `SCRAM-SHA-256` and `SCRAM-SHA-512`. + +=== Cluster health status + +Verify the cluster reports as healthy with no broker issues. + +[,bash] +---- +kubectl exec -n <namespace> <pod-name> -c redpanda -- rpk cluster health -X user=<username> -X pass=<password> -X sasl.mechanism=<mechanism> +---- + +All brokers must report `HEALTHY` status with zero leaderless partitions and zero under-replicated partitions. + +=== Minimum broker count (≥3) + +Ensure at least 3 brokers are running for production fault tolerance. + +Production clusters should have odd numbers of brokers (3, 5, 7, etc.) for optimal consensus behavior. 
+ +Verify the running broker count: + +[,bash] +---- +kubectl get pods -n -l app.kubernetes.io/component=redpanda-statefulset +---- + +Expected output showing 3 or more brokers: +[,bash,role=no-copy] +---- +NAME READY STATUS RESTARTS AGE +redpanda-0 2/2 Running 0 10d +redpanda-1 2/2 Running 0 10d +redpanda-2 2/2 Running 0 10d +---- + +Verify the configured replica count in your deployment: + +[tabs] +====== +Helm:: ++ +-- +[,bash] +---- +helm get values redpanda -n | grep -A 1 "statefulset:" +---- + +Expected output: +[,bash,role=no-copy] +---- +statefulset: + replicas: 3 +---- +-- + +Operator:: ++ +-- +[,bash] +---- +kubectl get redpanda redpanda -n -o jsonpath='{.spec.clusterSpec.statefulset.replicas}' +---- + +Expected output: +[,bash,role=no-copy] +---- +3 +---- +-- +====== + +=== All brokers active membership + +Verify all brokers are in active state and not being decommissioned. + +Decommissioning is used to permanently remove a broker from the cluster, such as during node pool migrations or cluster downsizing. Brokers in decommissioned state should not be present in production clusters unless actively performing a planned migration. + +[,bash] +---- +kubectl exec -n -c redpanda -- rpk redpanda admin brokers list -X user= -X pass= -X sasl.mechanism= +---- + +Expected output showing all brokers with `active` membership status: +[,bash,role=no-copy] +---- +NODE-ID NUM-CORES MEMBERSHIP-STATUS IS-ALIVE BROKER-VERSION +0 4 active true v24.2.4 +1 4 active true v24.2.4 +2 4 active true v24.2.4 +---- + +All brokers must show `active` status. If any broker shows `draining` or `decommissioned`, investigate immediately. + +See xref:manage:cluster-maintenance/decommission-brokers.adoc[Decommission Brokers] for more information. + +=== No brokers in maintenance mode + +Ensure no brokers are currently in maintenance mode during normal operations. 
+ +Maintenance mode is used when modifying brokers that will remain as members of the cluster, such as during rolling upgrades or hardware maintenance. While necessary during planned maintenance windows, brokers should not remain in maintenance mode during normal operations. + +[,bash] +---- +kubectl exec -n -c redpanda -- rpk cluster maintenance status -X user= -X pass= -X sasl.mechanism= +---- + +Expected output showing no brokers in maintenance mode: +[,bash,role=no-copy] +---- +NODE-ID ENABLED FINISHED ERRORS PARTITIONS ELIGIBLE TRANSFERRING FAILED +0 false - - - - - - +1 false - - - - - - +2 false - - - - - - +---- + +All brokers should show `ENABLED: false`. If any broker shows `ENABLED: true` outside of a planned maintenance window, investigate immediately. + +See xref:manage:node-management.adoc#place-a-broker-in-maintenance-mode[Maintenance Mode] for more information. + +=== Consistent Redpanda version + +Ensure all brokers run the same Redpanda version. + +[,bash] +---- +kubectl exec -n -c redpanda -- rpk redpanda admin brokers list -X user= -X pass= -X sasl.mechanism= +---- + +Version mismatches can cause compatibility issues and must be resolved. + +=== Version pinning + +**CRITICAL**: Pin specific versions for Redpanda and all related components (Console, Connectors) to prevent unexpected automatic upgrades that could introduce breaking changes or cause downtime during unplanned updates. 
+ +Verify that versions are explicitly pinned in your deployment configuration: + +[tabs] +====== +Helm:: ++ +-- +[,yaml] +---- +image: + tag: v24.2.4 # Pin specific Redpanda version + +console: + enabled: true + image: + tag: v2.4.5 # Pin specific Console version + +connectors: + enabled: true + image: + tag: v1.0.15 # Pin specific Connectors version +---- + +Verify pinned versions: +[,bash] +---- +helm get values redpanda -n +---- + +Expected output showing explicit version tags (not `latest` or version ranges): +[,bash,role=no-copy] +---- +image: + tag: v24.2.4 +console: + image: + tag: v2.4.5 +connectors: + image: + tag: v1.0.15 +---- +-- + +Operator:: ++ +-- +[,yaml] +---- +apiVersion: cluster.redpanda.com/v1alpha2 +kind: Redpanda +metadata: + name: redpanda +spec: + clusterSpec: + image: + tag: v24.2.4 # Pin specific Redpanda version + + console: + enabled: true + image: + tag: v2.4.5 # Pin specific Console version + + connectors: + enabled: true + image: + tag: v1.0.15 # Pin specific Connectors version +---- + +Verify pinned versions: +[,bash] +---- +kubectl get redpanda redpanda -n -o yaml | grep -A 1 "tag:" +---- +-- +====== + +**Why this matters**: + +* Prevents automatic upgrades at unintended times (e.g., during high-traffic periods) +* Ensures all environments (dev/staging/prod) run the same tested versions +* Allows controlled upgrade testing in non-production environments first +* Avoids compatibility issues between Redpanda and its components +* Provides rollback capability to known-good versions + +**Avoid using**: + +* `latest` tag - always pulls the newest version +* Version ranges (e.g., `v24.2.x`) - may auto-update to patch releases +* Unspecified tags - defaults to latest or chart-defined versions + +=== Default topic replication factor (≥3) + +Verify the default replication factor is set appropriately for production. 
+ +[,bash] +---- +kubectl exec -n -c redpanda -- rpk cluster config get default_topic_replications -X user= -X pass= -X sasl.mechanism= +---- + +Expected output: +[,bash,role=no-copy] +---- +3 +---- + +Setting `default_topic_replications` to 3 or greater ensures new topics are created with adequate fault tolerance. + +=== Existing topics replication factor (≥3) + +Check that all existing topics have adequate replication. + +[,bash] +---- +kubectl exec -n -c redpanda -- rpk topic list -X user= -X pass= -X sasl.mechanism= +---- + +Expected output showing replication factor ≥3 for all topics: +[,bash,role=no-copy] +---- +NAME PARTITIONS REPLICAS +_schemas 1 3 +orders 12 3 +payments 8 3 +user-events 16 3 +---- + +All production topics should have `REPLICAS` of 3 or greater. Topics with a replication factor of 1 have no redundancy and lose data if their broker fails; topics with a replication factor of 2 cannot tolerate a broker failure without losing redundancy. + +See xref:manage:cluster-maintenance/topic-property-configuration.adoc#change-topic-replication-factor[Change Topic Replication Factor] if remediation is needed. + +=== Persistent storage configuration + +Verify that the cluster uses persistent storage (not hostPath or emptyDir) for Redpanda data. 
+ +[,bash] +---- +kubectl get pvc -n +---- + +Expected output showing bound PersistentVolumeClaims: +[,bash,role=no-copy] +---- +NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE +datadir-redpanda-0 Bound pvc-a1b2c3d4-e5f6-7890-abcd-ef1234567890 100Gi RWO fast-ssd 10d +datadir-redpanda-1 Bound pvc-b2c3d4e5-f6g7-8901-bcde-fg2345678901 100Gi RWO fast-ssd 10d +datadir-redpanda-2 Bound pvc-c3d4e5f6-g7h8-9012-cdef-gh3456789012 100Gi RWO fast-ssd 10d +---- + +Verify the StatefulSet uses PersistentVolumeClaims: +[,bash] +---- +kubectl describe statefulset -n redpanda | grep -A 5 "Volume Claims" +---- + +Expected output: +[,bash,role=no-copy] +---- +Volume Claims: + Name: datadir + StorageClass: fast-ssd + Labels: + Annotations: + Capacity: 100Gi +---- + +HostPath and emptyDir storage are not suitable for production as they lack durability guarantees. + +=== RAID/LVM stripe configuration (multiple disks only) + +If using multiple physical disks, verify they are configured as RAID-0 or LVM stripe (not linear/concat). 
+ +[,bash] +---- +# Check block device configuration on nodes +kubectl debug node/ -it -- chroot /host /bin/bash +lsblk -o NAME,TYPE,SIZE,MOUNTPOINT,FSTYPE +lvs -o lv_name,stripes,stripe_size +mdadm --detail /dev/md* # if using software RAID +---- + +Expected output for properly configured LVM stripe: +[,bash,role=no-copy] +---- +# lsblk output +NAME TYPE SIZE MOUNTPOINT FSTYPE +nvme0n1 disk 1.8T +nvme1n1 disk 1.8T +vg0-data lvm 3.6T /var/lib/redpanda xfs + +# lvs output - note stripes > 1 indicates striping +LV #Stripes StripeSize +data 2 256.00k +---- + +Expected output for software RAID-0: +[,bash,role=no-copy] +---- +# mdadm output +/dev/md0: + Raid Level : raid0 + Array Size : 3515625472 (3.27 TiB) + Raid Devices : 2 + + Number Major Minor RaidDevice State + 0 259 0 0 active sync /dev/nvme0n1 + 1 259 1 1 active sync /dev/nvme1n1 +---- + +Using LVM linear/concat or JBOD instead of stripe/RAID-0 across multiple disks will severely degrade performance. Each disk must be striped for optimal I/O throughput. Single disk configurations do not require striping. + +=== Storage performance requirements + +Ensure storage classes provide adequate IOPS and throughput for your workload. + +**Performance specifications:** + +* Use NVMe-based storage classes for production deployments +* Minimum 16,000 IOPS (Input/Output Operations Per Second) required +* Consider provisioned IOPS where available to meet or exceed the minimum +* Enabling xref:develop:config-topics.adoc#configure-write-caching[write caching] can help Redpanda perform better in environments with disks that don't meet the recommended IOPS +* NFS (Network File System) is not supported +* Test storage performance under load + +**Multi-tenant disk warning:** + +WARNING: Avoid cloud instance types that use multi-tenant or shared disks, as these can lead to unpredictable performance due to noisy neighbor effects. 
Examples of instances with shared/multi-tenant storage include AWS is4gen.xlarge and similar instance types across cloud providers. Instead, use instances with dedicated local NVMe storage or provisioned IOPS volumes that guarantee consistent performance. + +Multi-tenant disks can experience: + +* Unpredictable latency spikes from other tenants' workloads +* Inconsistent throughput that varies based on neighbor activity +* IOPS throttling that impacts Redpanda's performance +* Difficulty troubleshooting performance issues due to external factors + +See xref:deploy:redpanda/kubernetes/k-requirements.adoc#storage[Storage requirements] for detailed specifications and xref:deploy:redpanda/kubernetes/k-requirements.adoc#cloud-instance-types[Cloud Instance Types] for recommended instance types across AWS, Azure, and Google Cloud. + +**Volume sizing**:: Plan storage capacity for data growth and retention requirements. ++ +* Account for replication overhead +* Include space for compaction operations +* Monitor disk usage trends + +=== CPU and memory resource limits + +Verify pods have resource requests and limits configured. + +[,bash] +---- +kubectl get pod -n -o jsonpath='{.spec.containers[?(@.name=="redpanda")].resources}' | jq +---- + +Expected output showing both requests and limits: +[,bash,role=no-copy] +---- +{ + "limits": { + "cpu": "4", + "memory": "8Gi" + }, + "requests": { + "cpu": "4", + "memory": "8Gi" + } +} +---- + +All Redpanda pods must have: + +* CPU requests and limits configured and **identical** (`requests.cpu == limits.cpu`) +* Memory requests and limits configured and **identical** (`requests.memory == limits.memory`) + +Setting requests equal to limits ensures the Pod receives the `Guaranteed` QoS class, which prevents CPU throttling and reduces the risk of Pod eviction. + +See xref:manage:kubernetes/k-manage-resources.adoc[Manage Pod Resources] for detailed configuration guidance. 
+ +=== CPU to memory ratio (1:2 minimum) + +Ensure adequate memory allocation relative to CPU for optimal performance. + +Production deployments should provision at least 2 GiB of memory per CPU core. + +Verify the CPU to memory ratio in your configuration: + +[tabs] +====== +Helm:: ++ +-- +[,bash] +---- +helm get values redpanda -n | grep -A 2 "resources:" +---- + +Expected output showing proper ratio: +[,bash,role=no-copy] +---- +resources: + cpu: + cores: 4 + memory: + container: + min: 8Gi + max: 8Gi +---- +-- + +Operator:: ++ +-- +[,bash] +---- +kubectl get redpanda redpanda -n -o jsonpath='{.spec.clusterSpec.resources}' | jq +---- + +Expected output showing proper ratio: +[,bash,role=no-copy] +---- +{ + "cpu": { + "cores": 4 + }, + "memory": { + "container": { + "min": "8Gi", + "max": "8Gi" + } + } +} +---- +-- +====== + +In the examples above, 4 CPU cores with 8 GiB memory provides a 1:2 ratio (2 GiB per core). + +=== No fractional CPU requests + +Ensure CPU requests use whole numbers for consistent performance. + +Fractional CPUs can lead to performance variability in production. Use whole integer values (4, 8, 16 - not 3.5, 7.5). + +Verify CPU configuration: + +[,bash] +---- +kubectl get pod -n -o jsonpath='{.spec.containers[?(@.name=="redpanda")].resources.requests.cpu}' +---- + +Expected output showing whole number: +[,bash,role=no-copy] +---- +4 +---- + +Avoid fractional values like `3500m` (3.5 cores) or `7500m` (7.5 cores). + +**Resource capacity planning**:: Ensure nodes have adequate resources for the configured limits. ++ +* Verify cluster has sufficient total resources +* Account for other workloads on shared nodes +* Plan for resource growth and burst capacity + +=== Authorization enabled + +Verify Kafka authorization is enabled for access control. 
+ +[,bash] +---- +kubectl exec -n -c redpanda -- rpk cluster config get kafka_enable_authorization -X user= -X pass= -X sasl.mechanism= +---- + +Expected output: +[,bash,role=no-copy] +---- +true +---- + +Without authorization enabled, ACLs are not enforced, so any client that can connect to the Kafka API can perform any operation on any resource. See xref:manage:security/authorization/index.adoc[Authorization] for configuration details. + +=== Developer mode disabled + +Ensure developer mode is disabled in production configuration. + +[,bash] +---- +kubectl exec -n -c redpanda -- rpk cluster config get developer_mode -X user= -X pass= -X sasl.mechanism= +---- + +Expected output: +[,bash,role=no-copy] +---- +false +---- + +Developer mode should never be enabled in production environments. Developer mode disables fsync and bypasses safety checks designed for production workloads. + +=== Overprovisioned disabled + +Ensure overprovisioned mode is disabled for production stability. + +[,bash] +---- +kubectl exec -n -c redpanda -- rpk cluster config get overprovisioned -X user= -X pass= -X sasl.mechanism= +---- + +Expected output: +[,bash,role=no-copy] +---- +false +---- + +Overprovisioned mode bypasses critical resource checks and should never be enabled in production. This mode is intended only for development environments with constrained resources. + +=== TLS configuration + +Configure TLS encryption for all client and inter-broker communication. TLS prevents eavesdropping and man-in-the-middle attacks on network traffic. 
+ +Verify TLS is enabled on all listeners: + +[,bash] +---- +kubectl exec -n -c redpanda -- rpk cluster config export -X user= -X pass= -X sasl.mechanism= | grep -A 10 "kafka_api:" +---- + +Expected output showing TLS configuration: +[,bash,role=no-copy] +---- +redpanda: + kafka_api: + - address: 0.0.0.0 + port: 9093 + name: internal + authentication_method: sasl + kafka_api_tls: + - name: internal + enabled: true + cert_file: /etc/tls/certs/tls.crt + key_file: /etc/tls/certs/tls.key +---- + +**Required TLS listeners:** + +* **kafka_api** - Client connections to Kafka API +* **admin_api** - Administrative REST API access +* **rpc_server** - Inter-broker communication +* **schema_registry** - Schema Registry API (if used) + +Verify certificates are properly mounted: + +[,bash] +---- +kubectl exec -n -c redpanda -- ls -la /etc/tls/certs/ +---- + +Expected output showing certificate files: +[,bash,role=no-copy] +---- +total 16 +-rw-r--r-- 1 redpanda redpanda 1234 Dec 15 10:00 ca.crt +-rw-r--r-- 1 redpanda redpanda 1675 Dec 15 10:00 tls.crt +-rw------- 1 redpanda redpanda 1704 Dec 15 10:00 tls.key +---- + +See xref:manage:security/encryption.adoc[TLS Encryption] for detailed configuration instructions. + +=== Authentication configuration + +Configure appropriate authentication mechanisms to control access to Redpanda resources. 
+ +Verify SASL users are configured: + +[,bash] +---- +kubectl exec -n -c redpanda -- rpk acl user list -X user= -X pass= -X sasl.mechanism= +---- + +Expected output showing configured users: +[,bash,role=no-copy] +---- +USERNAME +admin +app-producer +app-consumer +monitoring +---- + +**Authentication requirements:** + +* Set up SASL authentication for client connections +* Configure TLS certificates for encryption (see TLS configuration above) +* Implement proper user management with principle of least privilege +* Configure ACLs (Access Control Lists) for resource authorization + +Verify ACLs are configured: + +[,bash] +---- +kubectl exec -n -c redpanda -- rpk acl list -X user= -X pass= -X sasl.mechanism= +---- + +Expected output showing ACL rules: +[,bash,role=no-copy] +---- +PRINCIPAL HOST RESOURCE-TYPE RESOURCE-NAME OPERATION PERMISSION +User:app-producer * TOPIC orders.* WRITE ALLOW +User:app-consumer * TOPIC orders.* READ ALLOW +User:app-consumer * GROUP consumer-group-1 READ ALLOW +---- + +See xref:manage:security/authentication.adoc[Authentication] and xref:manage:security/authorization/index.adoc[Authorization] for configuration details. + +=== Network security + +Secure network access to the cluster using Kubernetes-native controls. 
+ +Verify NetworkPolicies are configured: + +[,bash] +---- +kubectl get networkpolicy -n +---- + +Expected output showing network policies: +[,bash,role=no-copy] +---- +NAME POD-SELECTOR AGE +redpanda-allow-internal app.kubernetes.io/name=redpanda 10d +redpanda-allow-clients app.kubernetes.io/name=redpanda 10d +redpanda-deny-all-ingress app.kubernetes.io/name=redpanda 10d +---- + +Check NetworkPolicy rules: + +[,bash] +---- +kubectl describe networkpolicy -n +---- + +**Network security requirements:** + +* Configure NetworkPolicies to restrict pod-to-pod communication +* Use TLS for all client connections (see TLS configuration) +* Secure admin API endpoints with authentication and authorization +* Limit ingress traffic to only necessary ports and sources +* Use Kubernetes Services to control external access + +Verify services and exposed ports: + +[,bash] +---- +kubectl get svc -n +---- + +Expected output showing service configuration: +[,bash,role=no-copy] +---- +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) +redpanda ClusterIP None 9093/TCP,9644/TCP,8082/TCP +redpanda-external LoadBalancer 10.100.200.50 9093:30001/TCP +---- + +See xref:manage:security/listener-configuration.adoc[Listener Configuration] for securing network endpoints. + +=== Pod Disruption Budget configured + +Set up PDBs to control voluntary disruptions during maintenance. + +[,bash] +---- +kubectl get pdb -n +---- + +Expected output: +[,bash,role=no-copy] +---- +NAME MIN AVAILABLE MAX UNAVAILABLE ALLOWED DISRUPTIONS AGE +redpanda N/A 1 1 10d +---- + +Production deployments must have a PodDisruptionBudget with `maxUnavailable: 1` to prevent simultaneous broker disruptions during voluntary operations like node drains, upgrades, or autoscaler actions. + +See xref:manage:kubernetes/k-pod-disruption-budgets.adoc[Pod Disruption Budgets] for configuration details. + +=== Rack awareness and topology spread + +Configure topology spread constraints to distribute brokers across availability zones. 
+ +Verify pod distribution across zones: + +[,bash] +---- +kubectl get pod -n -o wide +---- + +Expected output showing pods spread across different zones: +[,bash,role=no-copy] +---- +NAME READY STATUS NODE ZONE +redpanda-0 2/2 Running node-us-west-2a-1.internal us-west-2a +redpanda-1 2/2 Running node-us-west-2b-1.internal us-west-2b +redpanda-2 2/2 Running node-us-west-2c-1.internal us-west-2c +---- + +Check node availability zone labels: + +[,bash] +---- +kubectl get nodes --show-labels | grep topology.kubernetes.io/zone +---- + +**Configuration requirements:** + +* Configure `topologySpreadConstraints` to spread pods across zones +* Use node labels for availability zone awareness (typically `topology.kubernetes.io/zone`) +* Confirm that a single zone failure cannot affect multiple brokers + +See xref:manage:kubernetes/k-rack-awareness.adoc[Rack Awareness in Kubernetes] for configuration details. + +=== Redpanda license verification + +Validate the Enterprise license if you use Enterprise features. + +[,bash] +---- +kubectl exec -n -c redpanda -- rpk cluster license info -X user= -X pass= -X sasl.mechanism= +---- + +Expected output for valid license: +[,bash,role=no-copy] +---- +LICENSE INFORMATION +=================== +Organization: Your Company Name +Type: enterprise +Expires: Dec 31 2025 +---- + +Production deployments using Enterprise features (Tiered Storage, Schema Registry, Continuous Data Balancing, etc.) must have a valid Enterprise license whose expiration date is far enough in the future to allow time for renewal. + +See xref:get-started:licensing/index.adoc[Redpanda Licensing] for more information. + +=== Operator CRDs validation (Operator deployments only) + +**CRITICAL**: If your deployment uses the Redpanda Operator, all required Custom Resource Definitions (CRDs) must be installed with compatible versions. Without correct CRDs, the Operator cannot manage the cluster, leading to configuration drift, failed updates, and potential data loss. 
+ +Verify all required CRDs are installed: + +[,bash] +---- +kubectl get crd | grep redpanda.com +---- + +Expected output: +[,bash,role=no-copy] +---- +clusters.cluster.redpanda.com +topics.cluster.redpanda.com +users.cluster.redpanda.com +schemas.cluster.redpanda.com +---- + +Required CRDs: + +* `clusters.cluster.redpanda.com` - Manages Redpanda cluster configuration +* `topics.cluster.redpanda.com` - Manages topic lifecycle +* `users.cluster.redpanda.com` - Manages SASL users +* `schemas.cluster.redpanda.com` - Manages Schema Registry schemas + +If any CRDs are missing or incompatible with your Operator version, the Operator will fail to reconcile resources. + +== Recommended Production Enhancements + +These checks improve operational robustness and performance but are not critical for basic functionality. + +=== Deployment method detection + +Verify that the deployment method (Helm or Operator) is correctly identified for your cluster. Understanding your deployment method is important for troubleshooting, upgrades, and configuration management. + +[tabs] +====== +Helm:: ++ +-- +[,bash] +---- +helm list -n +---- + +Expected output: +[,bash,role=no-copy] +---- +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +redpanda redpanda 1 2024-01-15 10:30:00.123456 -0800 PST deployed redpanda-5.0.0 v24.1.1 +---- + +The presence of a Helm release indicates a Helm-managed deployment. +-- + +Operator:: ++ +-- +[,bash] +---- +kubectl get redpanda -n +---- + +Expected output: +[,bash,role=no-copy] +---- +NAME READY STATUS +redpanda True Redpanda reconciliation succeeded +---- + +The presence of a Redpanda custom resource indicates an Operator-managed deployment. +-- +====== + +**Why this matters**: Knowing your deployment method helps determine: + +* Which configuration approach to use (Helm values vs. 
Redpanda CR) +* How to perform upgrades and rollbacks +* Where to find deployment logs and troubleshooting information +* Which documentation sections apply to your environment + +=== XFS filesystem for data directory + +Verify data directories use XFS filesystem for optimal performance. + +[,bash] +---- +kubectl exec -n -c redpanda -- df -khT /var/lib/redpanda/data +---- + +Expected output showing XFS filesystem: +[,bash,role=no-copy] +---- +Filesystem Type Size Used Avail Use% Mounted on +/dev/nvme0n1 xfs 1.8T 14G 1.8T 1% /var/lib/redpanda/data +---- + +XFS provides better performance characteristics for Redpanda workloads compared to ext4. While ext4 is supported, XFS is strongly recommended for production deployments. + +**Storage performance tuning**:: Optimize storage configuration for production workloads. ++ +* Configure appropriate `vm.swappiness` settings +* Tune filesystem mount options +* Consider storage class performance characteristics + +=== Pod anti-affinity rules + +Configure pod anti-affinity to spread brokers across nodes. + +[,bash] +---- +kubectl get statefulset redpanda -n -o jsonpath='{.spec.template.spec.affinity}' | jq +---- + +Expected output showing pod anti-affinity rules: +[,bash,role=no-copy] +---- +{ + "podAntiAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": [ + { + "labelSelector": { + "matchLabels": { + "app.kubernetes.io/name": "redpanda" + } + }, + "topologyKey": "kubernetes.io/hostname" + } + ] + } +} +---- + +This prevents single node failures from affecting multiple brokers by ensuring each Redpanda pod runs on a different node. + +See xref:reference:k-redpanda-helm-spec.adoc#statefulset-podantiaffinity[Pod Anti-Affinity] configuration options. + +=== Node isolation configuration + +Configure taints/tolerations or nodeSelector for workload isolation. 
+ +[,bash] +---- +kubectl get statefulset redpanda -n -o jsonpath='{.spec.template.spec.nodeSelector}' | jq +---- + +Example output showing node isolation: +[,bash,role=no-copy] +---- +{ + "workload-type": "redpanda" +} +---- + +Isolating Redpanda workloads on dedicated nodes improves performance predictability by preventing resource contention with other applications. + +**CPU pinning and NUMA awareness**:: Configure CPU affinity for optimal performance on multi-core systems. + +**Memory allocation strategy**:: Optimize memory settings for your workload patterns. + +=== Continuous data balancing enabled + +xref:manage:cluster-maintenance/continuous-data-balancing.adoc[Continuous Data Balancing] is a major benefit of Redpanda for managing production deployments. It automatically rebalances partition replicas across brokers based on disk usage and node changes, eliminating manual intervention and preventing performance degradation. + +**This feature should be enabled for all licensed production clusters.** + +Verify that Continuous Data Balancing is configured: + +[,bash] +---- +kubectl exec -n -c redpanda -- rpk cluster config get partition_autobalancing_mode -X user= -X pass= -X sasl.mechanism= +---- + +Expected output: +[,bash,role=no-copy] +---- +continuous +---- + +Setting this to `continuous` enables automatic partition rebalancing based on: + +* Node additions or removals +* High disk usage conditions +* Broker availability changes + +Without Continuous Data Balancing, partition distribution becomes skewed over time, leading to hotspots and manual rebalancing operations. + +=== Core balancing enabled + +Configure core balancing for CPU core partition distribution. 
+ +[,bash] +---- +kubectl exec -n -c redpanda -- rpk cluster config get core_balancing_on_core_count_change -X user= -X pass= -X sasl.mechanism= +---- + +Expected output: +[,bash,role=no-copy] +---- +true +---- + +When enabled, Redpanda rebalances partitions across the CPU cores of a broker if the broker's core count has changed after a restart, which keeps partitions evenly distributed over the available cores. To rebalance partitions across cores at runtime, see the `core_balancing_continuous` property. + +=== System requirements validation + +Run system checks to validate optimal configuration. + +[,bash] +---- +kubectl exec -n -c redpanda -- rpk redpanda check -X user= -X pass= -X sasl.mechanism= +---- + +Expected output showing all checks passed: +[,bash,role=no-copy] +---- +CONDITION REQUIRED CURRENT SEVERITY PASSED +Data directory is writable true true Fatal true +Free memory per CPU [MB] >= 2048 8192 Warning true +NTP Synced true true Warning true +Swappiness 1 1 Warning true +---- + +Review any failed checks and remediate before proceeding to production. See xref:reference:rpk/rpk-redpanda/rpk-redpanda-check.adoc[rpk redpanda check] for details on each validation. + +=== Debug bundle generation + +Verify that you can successfully generate and collect a debug bundle from your cluster. This proactive check ensures that when an actual issue occurs and you need to contact Redpanda support, you won't face permission issues or silent collection failures that could delay troubleshooting. + +Generate a debug bundle: + +[,bash] +---- +kubectl exec -n -c redpanda -- rpk debug bundle -o /tmp/bundle.zip +---- + +For additional options and arguments, see xref:reference:rpk/rpk-debug/rpk-debug-bundle.adoc[rpk debug bundle]. + +Expected output: +[,bash,role=no-copy] +---- +Creating bundle file... +Collecting cluster info... +Collecting logs... +Collecting configuration... 
+Debug bundle saved to '/tmp/bundle.zip' +---- + +**Why this matters**: Debug bundles collect critical diagnostic information including: + +* Cluster configuration and metadata +* Redpanda logs from all brokers +* System resource usage and performance metrics +* Kubernetes resource definitions + +**Common issues to watch for**: + +* Permission errors preventing log collection +* Insufficient disk space for bundle creation +* Network policies blocking bundle transfer +* RBAC restrictions on accessing pod logs or exec + +Testing bundle generation early ensures this critical troubleshooting tool works when you need it most. Debug bundles are often required by Redpanda support to diagnose production issues efficiently. + +See xref:manage:kubernetes/troubleshooting/k-diagnostics-bundle.adoc[Diagnostics Bundles in Kubernetes] for detailed information about bundle contents and collection methods. + +=== Tiered Storage configuration + +Configure Tiered Storage for extended data retention using object storage. Tiered Storage automatically offloads older data to cloud storage (S3, GCS, Azure Blob), enabling extended retention without expanding local disk capacity. 
+ +Verify Tiered Storage configuration: + +[,bash] +---- +kubectl exec <pod-name> -n <namespace> -c redpanda -- rpk cluster config get cloud_storage_enabled -X user=<username> -X pass=<password> -X sasl.mechanism=<mechanism> +---- + +Expected output if Tiered Storage is enabled: +[,bash,role=no-copy] +---- +true +---- + +**Benefits of Tiered Storage:** + +* Reduces local storage costs by offloading cold data to cheaper object storage +* Enables longer data retention periods without provisioning additional disk +* Required for advanced features like Remote Read Replicas and Iceberg integration +* Provides disaster recovery capabilities through cloud-backed data + +**Verification steps:** + +[,bash] +---- +# Check bucket configuration +kubectl exec <pod-name> -n <namespace> -c redpanda -- rpk cluster config get cloud_storage_bucket -X user=<username> -X pass=<password> -X sasl.mechanism=<mechanism> + +# Check region/endpoint +kubectl exec <pod-name> -n <namespace> -c redpanda -- rpk cluster config get cloud_storage_region -X user=<username> -X pass=<password> -X sasl.mechanism=<mechanism> +---- + +See xref:manage:tiered-storage.adoc[Tiered Storage] for configuration details and best practices. + +=== Security scanning + +Regularly scan container images and configurations for vulnerabilities to maintain security posture.
+ +**Container image scanning:** + +Identify the container image in use so that you can confirm it was scanned before deployment: + +[,bash] +---- +# Check current image in use +kubectl get statefulset redpanda -n <namespace> -o jsonpath='{.spec.template.spec.containers[?(@.name=="redpanda")].image}' +---- + +Expected output: +[,bash,role=no-copy] +---- +docker.redpanda.com/redpandadata/redpanda:v24.2.4 +---- + +**Security scanning practices:** + +* Scan images using tools like Trivy, Snyk, or cloud-native scanners before deployment +* Set up automated scanning in CI/CD pipelines +* Monitor for CVE announcements and security advisories +* Keep Redpanda and related components up to date with security patches +* Review Kubernetes RBAC policies and ServiceAccount permissions + +**Configuration scanning:** + +[,bash] +---- +# Export Kubernetes manifests for scanning +kubectl get redpanda,statefulset,deployment -n <namespace> -o yaml > cluster-config.yaml +# Use kubesec, kube-bench, or similar tools to scan cluster-config.yaml +---- + +Establish a regular cadence for security scanning (e.g., weekly or with each deployment). + +=== Backup and recovery procedures + +Implement and test backup and recovery processes to ensure business continuity. + +**Backup strategy with Tiered Storage:** + +Tiered Storage provides built-in backup capabilities by storing data in object storage.
Verify Tiered Storage is configured: + +[,bash] +---- +kubectl exec <pod-name> -n <namespace> -c redpanda -- rpk cluster config get cloud_storage_enabled -X user=<username> -X pass=<password> -X sasl.mechanism=<mechanism> +---- + +**Recovery testing:** + +Regularly test recovery procedures to validate RTO/RPO targets: + +[,bash] +---- +# After a restore, confirm that the topic recovered from Tiered Storage is present +kubectl exec <pod-name> -n <namespace> -c redpanda -- rpk topic describe <topic-name> -X user=<username> -X pass=<password> -X sasl.mechanism=<mechanism> +---- + +**Backup and recovery checklist:** + +* Configure and validate Tiered Storage for automatic data backup +* Document recovery procedures for different failure scenarios +* Test cluster recovery procedures in non-production environments +* Establish Recovery Time Objective (RTO) and Recovery Point Objective (RPO) targets +* Maintain runbooks for disaster recovery scenarios +* Verify IAM roles/permissions for object storage access + +See xref:manage:whole-cluster-restore.adoc[Whole Cluster Restore] for detailed recovery procedures. + +=== Audit logging + +Enable and configure audit logging for compliance and security monitoring requirements.
+ +Verify audit logging configuration: + +[,bash] +---- +kubectl exec <pod-name> -n <namespace> -c redpanda -- rpk cluster config get audit_enabled -X user=<username> -X pass=<password> -X sasl.mechanism=<mechanism> +---- + +Expected output if audit logging is enabled: +[,bash,role=no-copy] +---- +true +---- + +**Audit log verification:** + +Confirm that the audit log topic exists: + +[,bash] +---- +# Check audit log topic +kubectl exec <pod-name> -n <namespace> -c redpanda -- rpk topic list -X user=<username> -X pass=<password> -X sasl.mechanism=<mechanism> | grep audit +---- + +Expected output: +[,bash,role=no-copy] +---- +_redpanda.audit_log 1 3 +---- + +**Audit logging best practices:** + +* Forward audit logs to SIEM (Security Information and Event Management) systems +* Configure retention policies appropriate for compliance requirements +* Monitor audit logs for suspicious activities +* Exclude high-volume, low-value events to reduce noise +* Ensure the audit log topic has adequate replication and retention + +**Compliance considerations:** + +Audit logging may be required for: + +* SOC 2 compliance +* HIPAA regulations +* PCI DSS requirements +* GDPR data access tracking +* Internal security policies + +See xref:manage:audit-logging.adoc[Audit Logging] for configuration details and event types. + +== Monitoring and Observability + +**Monitoring setup**:: Deploy comprehensive monitoring for cluster health and performance. ++ +* Set up Prometheus metrics collection +* Configure ServiceMonitor for automatic scraping +* Validate metrics endpoint accessibility ++ +[,bash] +---- +kubectl get servicemonitor -n <namespace> +---- + +**Grafana dashboards**:: Import and configure Redpanda monitoring dashboards. ++ +* Import official Redpanda dashboards +* Verify dashboards display data correctly +* Customize dashboards for your environment + +**Alerting rules**:: Implement alerting for critical metrics and conditions.
++ +* CPU and memory utilization alerts +* Disk space alerts +* Replication lag alerts +* Broker health alerts + +**Log aggregation**:: Configure centralized log collection and analysis. ++ +* Forward Redpanda logs to a central logging system +* Set up log retention policies +* Configure log-based alerting + +**Health checks**:: Implement application-level health checks. ++ +* Configure Kubernetes liveness and readiness probes +* Set up external health monitoring +* Define SLI/SLO metrics + +== Operational Readiness + +**Deployment automation**:: Implement Infrastructure as Code for reproducible deployments. ++ +* Use Helm charts or Kubernetes manifests in version control +* Implement GitOps workflows +* Automate testing and validation + +**Non-production environments**:: Maintain separate environments for testing and validation. ++ +* Set up dedicated dev/staging/production clusters +* Test changes in non-production first +* Mirror production configuration in staging + +**Upgrade procedures**:: Document and test cluster upgrade processes. ++ +* Plan for rolling upgrades with zero downtime +* Test upgrade procedures in staging environments +* Implement rollback capabilities +* Document upgrade policy and procedures + +**Incident response**:: Prepare for operational incidents and outages. ++ +* Document troubleshooting procedures +* Establish on-call processes +* Create incident response playbooks + +**Resource quotas**:: Configure namespace resource quotas to prevent resource exhaustion. ++ +[,bash] +---- +kubectl get resourcequota -n <namespace> +---- + +== Next Steps + +After completing this checklist: + +1. **Performance testing**: Conduct load testing to validate performance under expected traffic. +2. **Disaster recovery testing**: Test backup and recovery procedures. +3. **Security review**: Conduct a security assessment and penetration testing. +4. **Operational validation**: Verify monitoring, alerting, and incident response procedures. +5.
**Documentation**: Complete operational runbooks and troubleshooting guides. \ No newline at end of file diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-deployment.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-deployment.adoc index 393ac88948..1485890a48 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-deployment.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-deployment.adoc @@ -777,6 +777,10 @@ include::deploy:partial$kubernetes/guides/troubleshoot.adoc[leveloffset=+1] == Next steps +After deploying Redpanda, validate your production readiness: + +- xref:deploy:redpanda/kubernetes/k-production-checklist.adoc[Production readiness checklist] - Comprehensive validation of your deployment against production standards + See the xref:manage:kubernetes/index.adoc[Manage Kubernetes topics] to learn how to customize your deployment to meet your needs. include::shared:partial$suggested-reading.adoc[] diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-workflow.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-workflow.adoc index 814f4eaacd..ebf5576038 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-workflow.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-workflow.adoc @@ -10,3 +10,4 @@ The production deployment tasks involve Kubernetes administrators (admins) as we . All: xref:deploy:redpanda/kubernetes/k-requirements.adoc[Review the requirements and recommendations] to align on prerequisites. . Admin: xref:deploy:redpanda/kubernetes/k-tune-workers.adoc[Tune the worker nodes] for best performance. . User: xref:deploy:redpanda/kubernetes/k-production-deployment.adoc[Deploy Redpanda] using either the Redpanda Operator or the Redpanda Helm chart. +. All: xref:deploy:redpanda/kubernetes/k-production-checklist.adoc[Validate production readiness] using the comprehensive checklist to ensure your deployment meets production standards. 
diff --git a/modules/deploy/pages/redpanda/kubernetes/k-requirements.adoc b/modules/deploy/pages/redpanda/kubernetes/k-requirements.adoc index e287ed8d02..2f7ef9cfb5 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-requirements.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-requirements.adoc @@ -11,7 +11,10 @@ include::deploy:partial$requirements.adoc[] == Next steps -xref:deploy:redpanda/kubernetes/k-production-deployment.adoc[]. +After meeting these requirements, proceed to: + +- xref:deploy:redpanda/kubernetes/k-production-deployment.adoc[Deploy Redpanda for production] +- xref:deploy:redpanda/kubernetes/k-production-checklist.adoc[Validate production readiness] with the comprehensive checklist include::shared:partial$suggested-reading.adoc[] diff --git a/modules/deploy/partials/high-availability.adoc b/modules/deploy/partials/high-availability.adoc index 920a3200ef..4543e66c83 100644 --- a/modules/deploy/partials/high-availability.adoc +++ b/modules/deploy/partials/high-availability.adoc @@ -531,6 +531,10 @@ cat debug.log | grep -v ApiVersions | egrep 'opening|read' include::shared:partial$suggested-reading.adoc[] +ifdef::env-kubernetes[] +* xref:deploy:redpanda/kubernetes/k-production-checklist.adoc[Production readiness checklist] - Validate your Kubernetes deployment against production standards +endif::[] + * https://redpanda.com/blog/redpanda-official-jepsen-report-and-analysis?utm_assettype=report&utm_assetname=roi_report&utm_source=gated_content&utm_medium=content&utm_campaign=jepsen_blog[Redpanda's official Jepsen report^] * https://redpanda.com/blog/simplifying-raft-replication-in-redpanda[Simplifying Redpanda Raft implementation^] * https://redpanda.com/blog/kafka-redpanda-availability[An availability footprint of the Redpanda and Apache Kafka replication protocols^] From 5463696da7af23759d053d1d84f44b045944e951 Mon Sep 17 00:00:00 2001 From: Josh Purcell Date: Thu, 18 Dec 2025 08:21:40 -0600 Subject: [PATCH 02/25] add link to choosing 
replica factor in k8s prod checklist --- .../pages/redpanda/kubernetes/k-production-checklist.adoc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc index 85dfe45570..1809af303a 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc @@ -265,6 +265,8 @@ Expected output: Setting `default_topic_replications` to 3 or greater ensures new topics are created with adequate fault tolerance. +See xref:develop:config-topics.adoc#choose-the-replication-factor[Choose the Replication Factor] for guidance on selecting appropriate replication factors. + === Existing topics replication factor (≥3) Check that all existing topics have adequate replication. From 598f4d1cb46ae7ef5e7645070c3d4c96c1824b7b Mon Sep 17 00:00:00 2001 From: Josh Purcell Date: Mon, 5 Jan 2026 09:49:28 -0600 Subject: [PATCH 03/25] add numbering to k8s prod checklist entries --- .../kubernetes/k-production-checklist.adoc | 74 +++++++++---------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc index 1809af303a..c8eb8cc392 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc @@ -11,7 +11,7 @@ NOTE: For non-Kubernetes deployments (Linux/VM-based), see the xref:deploy:redpa These checks are essential for a stable, reliable production deployment. All critical requirements must pass before going live. -=== Authentication note +=== 1. Authentication note The `rpk` commands throughout this checklist include SASL authentication flags (`-X user`, `-X pass`, `-X sasl.mechanism`). 
@@ -27,7 +27,7 @@ kubectl exec -n -c redpanda -- rpk cluster health Common SASL mechanisms are `SCRAM-SHA-256` or `SCRAM-SHA-512`. Update these values as needed for your deployment. -=== Cluster health status +=== 2. Cluster health status Verify the cluster reports as healthy with no broker issues. @@ -38,7 +38,7 @@ kubectl exec -n -c redpanda -- rpk cluster health -X user All brokers must report `HEALTHY` status with zero leaderless partitions and zero under-replicated partitions. -=== Minimum broker count (≥3) +=== 3. Minimum broker count (≥3) Ensure at least 3 brokers are running for production fault tolerance. @@ -96,7 +96,7 @@ Expected output: -- ====== -=== All brokers active membership +=== 4. All brokers active membership Verify all brokers are in active state and not being decommissioned. @@ -120,7 +120,7 @@ All brokers must show `active` status. If any broker shows `draining` or `decomm See xref:manage:cluster-maintenance/decommission-brokers.adoc[Decommission Brokers] for more information. -=== No brokers in maintenance mode +=== 5. No brokers in maintenance mode Ensure no brokers are currently in maintenance mode during normal operations. @@ -144,7 +144,7 @@ All brokers should show `ENABLED: false`. If any broker shows `ENABLED: true` ou See xref:manage:node-management.adoc#place-a-broker-in-maintenance-mode[Maintenance Mode] for more information. -=== Consistent Redpanda version +=== 6. Consistent Redpanda version Ensure all brokers run the same Redpanda version. @@ -155,7 +155,7 @@ kubectl exec -n -c redpanda -- rpk redpanda admin brokers Version mismatches can cause compatibility issues and must be resolved. -=== Version pinning +=== 7. Version pinning **CRITICAL**: Pin specific versions for Redpanda and all related components (Console, Connectors) to prevent unexpected automatic upgrades that could introduce breaking changes or cause downtime during unplanned updates. 
@@ -248,7 +248,7 @@ kubectl get redpanda redpanda -n -o yaml | grep -A 1 "tag:" * Version ranges (e.g., `v24.2.x`) - may auto-update to patch releases * Unspecified tags - defaults to latest or chart-defined versions -=== Default topic replication factor (≥3) +=== 8. Default topic replication factor (≥3) Verify the default replication factor is set appropriately for production. @@ -267,7 +267,7 @@ Setting `default_topic_replications` to 3 or greater ensures new topics are crea See xref:develop:config-topics.adoc#choose-the-replication-factor[Choose the Replication Factor] for guidance on selecting appropriate replication factors. -=== Existing topics replication factor (≥3) +=== 9. Existing topics replication factor (≥3) Check that all existing topics have adequate replication. @@ -290,7 +290,7 @@ All production topics should have `REPLICAS` of 3 or greater. Topics with single See xref:manage:cluster-maintenance/topic-property-configuration.adoc#change-topic-replication-factor[Change Topic Replication Factor] if remediation is needed. -=== Persistent storage configuration +=== 10. Persistent storage configuration Verify using persistent storage (not hostPath or emptyDir) for data persistence. @@ -327,7 +327,7 @@ Volume Claims: HostPath and emptyDir storage are not suitable for production as they lack durability guarantees. -=== RAID/LVM stripe configuration (multiple disks only) +=== 11. RAID/LVM stripe configuration (multiple disks only) If using multiple physical disks, verify they are configured as RAID-0 or LVM stripe (not linear/concat). @@ -370,7 +370,7 @@ Expected output for software RAID-0: Using LVM linear/concat or JBOD instead of stripe/RAID-0 across multiple disks will severely degrade performance. Each disk must be striped for optimal I/O throughput. Single disk configurations do not require striping. -=== Storage performance requirements +=== 12. 
Storage performance requirements Ensure storage classes provide adequate IOPS and throughput for your workload. @@ -402,7 +402,7 @@ See xref:deploy:redpanda/kubernetes/k-requirements.adoc#storage[Storage requirem * Include space for compaction operations * Monitor disk usage trends -=== CPU and memory resource limits +=== 13. CPU and memory resource limits Verify pods have resource requests and limits configured. @@ -435,7 +435,7 @@ Setting requests equal to limits ensures the Pod receives the `Guaranteed` QoS c See xref:manage:kubernetes/k-manage-resources.adoc[Manage Pod Resources] for detailed configuration guidance. -=== CPU to memory ratio (1:2 minimum) +=== 14. CPU to memory ratio (1:2 minimum) Ensure adequate memory allocation relative to CPU for optimal performance. @@ -494,7 +494,7 @@ Expected output showing proper ratio: In the examples above, 4 CPU cores with 8 GiB memory provides a 1:2 ratio (2 GiB per core). -=== No fractional CPU requests +=== 15. No fractional CPU requests Ensure CPU requests use whole numbers for consistent performance. @@ -521,7 +521,7 @@ Avoid fractional values like `3500m` (3.5 cores) or `7500m` (7.5 cores). * Account for other workloads on shared nodes * Plan for resource growth and burst capacity -=== Authorization enabled +=== 16. Authorization enabled Verify Kafka authorization is enabled for access control. @@ -538,7 +538,7 @@ true Without authorization enabled, any client can access Kafka APIs without authentication. See xref:manage:security/authorization/index.adoc[Authorization] for configuration details. -=== Developer mode disabled +=== 17. Developer mode disabled Ensure developer mode is disabled in production configuration. @@ -555,7 +555,7 @@ false Developer mode should never be enabled in production environments. Developer mode disables fsync and bypasses safety checks designed for production workloads. -=== Overprovisioned disabled +=== 18. 
Overprovisioned disabled Ensure overprovisioned mode is disabled for production stability. @@ -572,7 +572,7 @@ false Overprovisioned mode bypasses critical resource checks and should never be enabled in production. This mode is intended only for development environments with constrained resources. -=== TLS configuration +=== 19. TLS configuration Configure TLS encryption for all client and inter-broker communication. TLS prevents eavesdropping and man-in-the-middle attacks on network traffic. @@ -624,7 +624,7 @@ total 16 See xref:manage:security/encryption.adoc[TLS Encryption] for detailed configuration instructions. -=== Authentication configuration +=== 20. Authentication configuration Configure appropriate authentication mechanisms to control access to Redpanda resources. @@ -670,7 +670,7 @@ User:app-consumer * GROUP consumer-group-1 READ ALLOW See xref:manage:security/authentication.adoc[Authentication] and xref:manage:security/authorization/index.adoc[Authorization] for configuration details. -=== Network security +=== 21. Network security Secure network access to the cluster using Kubernetes-native controls. @@ -722,7 +722,7 @@ redpanda-external LoadBalancer 10.100.200.50 9093:30001/TCP See xref:manage:security/listener-configuration.adoc[Listener Configuration] for securing network endpoints. -=== Pod Disruption Budget configured +=== 22. Pod Disruption Budget configured Set up PDBs to control voluntary disruptions during maintenance. @@ -742,7 +742,7 @@ Production deployments must have a PodDisruptionBudget with `maxUnavailable: 1` See xref:manage:kubernetes/k-pod-disruption-budgets.adoc[Pod Disruption Budgets] for configuration details. -=== Rack awareness and topology spread +=== 23. Rack awareness and topology spread Configure topology spread constraints to distribute brokers across availability zones. 
@@ -777,7 +777,7 @@ kubectl get nodes --show-labels | grep topology.kubernetes.io/zone See xref:manage:kubernetes/k-rack-awareness.adoc[Rack Awareness in Kubernetes] for configuration details. -=== Redpanda license verification +=== 24. Redpanda license verification Validate Enterprise license if using Enterprise features. @@ -800,7 +800,7 @@ Production deployments using Enterprise features (Tiered Storage, Schema Registr See xref:get-started:licensing/index.adoc[Redpanda Licensing] for more information. -=== Operator CRDs validation (Operator deployments only) +=== 25. Operator CRDs validation (Operator deployments only) **CRITICAL**: If your deployment uses the Redpanda Operator, all required Custom Resource Definitions (CRDs) must be installed with compatible versions. Without correct CRDs, the Operator cannot manage the cluster, leading to configuration drift, failed updates, and potential data loss. @@ -833,7 +833,7 @@ If any CRDs are missing or incompatible with your Operator version, the Operator These checks improve operational robustness and performance but are not critical for basic functionality. -=== Deployment method detection +=== 26. Deployment method detection Verify that the deployment method (Helm or Operator) is correctly identified for your cluster. Understanding your deployment method is important for troubleshooting, upgrades, and configuration management. @@ -883,7 +883,7 @@ The presence of a Redpanda custom resource indicates an Operator-managed deploym * Where to find deployment logs and troubleshooting information * Which documentation sections apply to your environment -=== XFS filesystem for data directory +=== 27. XFS filesystem for data directory Verify data directories use XFS filesystem for optimal performance. @@ -907,7 +907,7 @@ XFS provides better performance characteristics for Redpanda workloads compared * Tune filesystem mount options * Consider storage class performance characteristics -=== Pod anti-affinity rules +=== 28. 
Pod anti-affinity rules Configure pod anti-affinity to spread brokers across nodes. @@ -939,7 +939,7 @@ This prevents single node failures from affecting multiple brokers by ensuring e See xref:reference:k-redpanda-helm-spec.adoc#statefulset-podantiaffinity[Pod Anti-Affinity] configuration options. -=== Node isolation configuration +=== 29. Node isolation configuration Configure taints/tolerations or nodeSelector for workload isolation. @@ -962,7 +962,7 @@ Isolating Redpanda workloads on dedicated nodes improves performance predictabil **Memory allocation strategy**:: Optimize memory settings for your workload patterns. -=== Continuous data balancing enabled +=== 30. Continuous data balancing enabled xref:manage:cluster-maintenance/continuous-data-balancing.adoc[Continuous Data Balancing] is a major benefit of Redpanda for managing production deployments. It automatically rebalances partition replicas across brokers based on disk usage and node changes, eliminating manual intervention and preventing performance degradation. @@ -989,7 +989,7 @@ Setting this to `continuous` enables automatic partition rebalancing based on: Without Continuous Data Balancing, partition distribution becomes skewed over time, leading to hotspots and manual rebalancing operations. -=== Core balancing enabled +=== 31. Core balancing enabled Configure core balancing for CPU core partition distribution. @@ -1006,7 +1006,7 @@ true When enabled, Redpanda continuously rebalances partitions between CPU cores on a broker for optimal resource utilization, especially beneficial after broker restarts or configuration changes. -=== System requirements validation +=== 32. System requirements validation Run system checks to validate optimal configuration. @@ -1027,7 +1027,7 @@ Swappiness 1 1 Warning true Review any failed checks and remediate before proceeding to production. See xref:reference:rpk/rpk-redpanda/rpk-redpanda-check.adoc[rpk redpanda check] for details on each validation. 
-=== Debug bundle generation +=== 33. Debug bundle generation Verify that you can successfully generate and collect a debug bundle from your cluster. This proactive check ensures that when an actual issue occurs and you need to contact Redpanda support, you won't face permission issues or silent collection failures that could delay troubleshooting. @@ -1068,7 +1068,7 @@ Testing bundle generation early ensures this critical troubleshooting tool works See xref:manage:kubernetes/troubleshooting/k-diagnostics-bundle.adoc[Diagnostics Bundles in Kubernetes] for detailed information about bundle contents and collection methods. -=== Tiered Storage configuration +=== 34. Tiered Storage configuration Configure Tiered Storage for extended data retention using object storage. Tiered Storage automatically offloads older data to cloud storage (S3, GCS, Azure Blob), enabling extended retention without expanding local disk capacity. @@ -1105,7 +1105,7 @@ kubectl exec -n -c redpanda -- rpk cluster config get clo See xref:manage:tiered-storage.adoc[Tiered Storage] for configuration details and best practices. -=== Security scanning +=== 35. Security scanning Regularly scan container images and configurations for vulnerabilities to maintain security posture. @@ -1144,7 +1144,7 @@ kubectl get redpanda,statefulset,deployment -n -o yaml > cluster-con Establish a regular cadence for security scanning (e.g., weekly or with each deployment). -=== Backup and recovery procedures +=== 36. Backup and recovery procedures Implement and test backup and recovery processes to ensure business continuity. 
@@ -1178,7 +1178,7 @@ kubectl exec -n -c redpanda -- rpk topic describe Date: Tue, 6 Jan 2026 08:31:50 -0600 Subject: [PATCH 04/25] Update modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc Co-authored-by: David Yu --- .../pages/redpanda/kubernetes/k-production-checklist.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc index c8eb8cc392..4777764c88 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc @@ -11,7 +11,7 @@ NOTE: For non-Kubernetes deployments (Linux/VM-based), see the xref:deploy:redpa These checks are essential for a stable, reliable production deployment. All critical requirements must pass before going live. -=== 1. Authentication note +=== 1. Check connectivity from `rpk` to brokers with/without SASL The `rpk` commands throughout this checklist include SASL authentication flags (`-X user`, `-X pass`, `-X sasl.mechanism`). 
From 138677ff0fc7da62c97e2b7467820e1f22ccf71f Mon Sep 17 00:00:00 2001 From: Josh Purcell Date: Tue, 6 Jan 2026 08:41:09 -0600 Subject: [PATCH 05/25] move authentication note above critical section, update numbering --- .../kubernetes/k-production-checklist.adoc | 84 +++++++++---------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc index 4777764c88..e7e2b85155 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc @@ -7,11 +7,7 @@ This checklist validates Redpanda deployments in Kubernetes against production r NOTE: For non-Kubernetes deployments (Linux/VM-based), see the xref:deploy:redpanda/manual/production/production-readiness.adoc[Production Readiness Checklist for Linux]. -== Critical Production Requirements - -These checks are essential for a stable, reliable production deployment. All critical requirements must pass before going live. - -=== 1. Check connectivity from `rpk` to brokers with/without SASL +== Authentication note The `rpk` commands throughout this checklist include SASL authentication flags (`-X user`, `-X pass`, `-X sasl.mechanism`). @@ -27,7 +23,11 @@ kubectl exec -n -c redpanda -- rpk cluster health Common SASL mechanisms are `SCRAM-SHA-256` or `SCRAM-SHA-512`. Update these values as needed for your deployment. -=== 2. Cluster health status +== Critical Production Requirements + +These checks are essential for a stable, reliable production deployment. All critical requirements must pass before going live. + +=== 1. Cluster health status Verify the cluster reports as healthy with no broker issues. @@ -38,7 +38,7 @@ kubectl exec -n -c redpanda -- rpk cluster health -X user All brokers must report `HEALTHY` status with zero leaderless partitions and zero under-replicated partitions.
-=== 3. Minimum broker count (≥3) +=== 2. Minimum broker count (≥3) Ensure at least 3 brokers are running for production fault tolerance. @@ -96,7 +96,7 @@ Expected output: -- ====== -=== 4. All brokers active membership +=== 3. All brokers active membership Verify all brokers are in active state and not being decommissioned. @@ -120,7 +120,7 @@ All brokers must show `active` status. If any broker shows `draining` or `decomm See xref:manage:cluster-maintenance/decommission-brokers.adoc[Decommission Brokers] for more information. -=== 5. No brokers in maintenance mode +=== 4. No brokers in maintenance mode Ensure no brokers are currently in maintenance mode during normal operations. @@ -144,7 +144,7 @@ All brokers should show `ENABLED: false`. If any broker shows `ENABLED: true` ou See xref:manage:node-management.adoc#place-a-broker-in-maintenance-mode[Maintenance Mode] for more information. -=== 6. Consistent Redpanda version +=== 5. Consistent Redpanda version Ensure all brokers run the same Redpanda version. @@ -155,7 +155,7 @@ kubectl exec -n -c redpanda -- rpk redpanda admin brokers Version mismatches can cause compatibility issues and must be resolved. -=== 7. Version pinning +=== 6. Version pinning **CRITICAL**: Pin specific versions for Redpanda and all related components (Console, Connectors) to prevent unexpected automatic upgrades that could introduce breaking changes or cause downtime during unplanned updates. @@ -248,7 +248,7 @@ kubectl get redpanda redpanda -n -o yaml | grep -A 1 "tag:" * Version ranges (e.g., `v24.2.x`) - may auto-update to patch releases * Unspecified tags - defaults to latest or chart-defined versions -=== 8. Default topic replication factor (≥3) +=== 7. Default topic replication factor (≥3) Verify the default replication factor is set appropriately for production. 
@@ -267,7 +267,7 @@ Setting `default_topic_replications` to 3 or greater ensures new topics are crea See xref:develop:config-topics.adoc#choose-the-replication-factor[Choose the Replication Factor] for guidance on selecting appropriate replication factors. -=== 9. Existing topics replication factor (≥3) +=== 8. Existing topics replication factor (≥3) Check that all existing topics have adequate replication. @@ -290,7 +290,7 @@ All production topics should have `REPLICAS` of 3 or greater. Topics with single See xref:manage:cluster-maintenance/topic-property-configuration.adoc#change-topic-replication-factor[Change Topic Replication Factor] if remediation is needed. -=== 10. Persistent storage configuration +=== 9. Persistent storage configuration Verify using persistent storage (not hostPath or emptyDir) for data persistence. @@ -327,7 +327,7 @@ Volume Claims: HostPath and emptyDir storage are not suitable for production as they lack durability guarantees. -=== 11. RAID/LVM stripe configuration (multiple disks only) +=== 10. RAID/LVM stripe configuration (multiple disks only) If using multiple physical disks, verify they are configured as RAID-0 or LVM stripe (not linear/concat). @@ -370,7 +370,7 @@ Expected output for software RAID-0: Using LVM linear/concat or JBOD instead of stripe/RAID-0 across multiple disks will severely degrade performance. Each disk must be striped for optimal I/O throughput. Single disk configurations do not require striping. -=== 12. Storage performance requirements +=== 11. Storage performance requirements Ensure storage classes provide adequate IOPS and throughput for your workload. @@ -402,7 +402,7 @@ See xref:deploy:redpanda/kubernetes/k-requirements.adoc#storage[Storage requirem * Include space for compaction operations * Monitor disk usage trends -=== 13. CPU and memory resource limits +=== 12. CPU and memory resource limits Verify pods have resource requests and limits configured. 
@@ -435,7 +435,7 @@ Setting requests equal to limits ensures the Pod receives the `Guaranteed` QoS c See xref:manage:kubernetes/k-manage-resources.adoc[Manage Pod Resources] for detailed configuration guidance. -=== 14. CPU to memory ratio (1:2 minimum) +=== 13. CPU to memory ratio (1:2 minimum) Ensure adequate memory allocation relative to CPU for optimal performance. @@ -494,7 +494,7 @@ Expected output showing proper ratio: In the examples above, 4 CPU cores with 8 GiB memory provides a 1:2 ratio (2 GiB per core). -=== 15. No fractional CPU requests +=== 14. No fractional CPU requests Ensure CPU requests use whole numbers for consistent performance. @@ -521,7 +521,7 @@ Avoid fractional values like `3500m` (3.5 cores) or `7500m` (7.5 cores). * Account for other workloads on shared nodes * Plan for resource growth and burst capacity -=== 16. Authorization enabled +=== 15. Authorization enabled Verify Kafka authorization is enabled for access control. @@ -538,7 +538,7 @@ true Without authorization enabled, any client can access Kafka APIs without authentication. See xref:manage:security/authorization/index.adoc[Authorization] for configuration details. -=== 17. Developer mode disabled +=== 16. Developer mode disabled Ensure developer mode is disabled in production configuration. @@ -555,7 +555,7 @@ false Developer mode should never be enabled in production environments. Developer mode disables fsync and bypasses safety checks designed for production workloads. -=== 18. Overprovisioned disabled +=== 17. Overprovisioned disabled Ensure overprovisioned mode is disabled for production stability. @@ -572,7 +572,7 @@ false Overprovisioned mode bypasses critical resource checks and should never be enabled in production. This mode is intended only for development environments with constrained resources. -=== 19. TLS configuration +=== 18. TLS configuration Configure TLS encryption for all client and inter-broker communication. 
TLS prevents eavesdropping and man-in-the-middle attacks on network traffic. @@ -624,7 +624,7 @@ total 16 See xref:manage:security/encryption.adoc[TLS Encryption] for detailed configuration instructions. -=== 20. Authentication configuration +=== 19. Authentication configuration Configure appropriate authentication mechanisms to control access to Redpanda resources. @@ -670,7 +670,7 @@ User:app-consumer * GROUP consumer-group-1 READ ALLOW See xref:manage:security/authentication.adoc[Authentication] and xref:manage:security/authorization/index.adoc[Authorization] for configuration details. -=== 21. Network security +=== 20. Network security Secure network access to the cluster using Kubernetes-native controls. @@ -722,7 +722,7 @@ redpanda-external LoadBalancer 10.100.200.50 9093:30001/TCP See xref:manage:security/listener-configuration.adoc[Listener Configuration] for securing network endpoints. -=== 22. Pod Disruption Budget configured +=== 21. Pod Disruption Budget configured Set up PDBs to control voluntary disruptions during maintenance. @@ -742,7 +742,7 @@ Production deployments must have a PodDisruptionBudget with `maxUnavailable: 1` See xref:manage:kubernetes/k-pod-disruption-budgets.adoc[Pod Disruption Budgets] for configuration details. -=== 23. Rack awareness and topology spread +=== 22. Rack awareness and topology spread Configure topology spread constraints to distribute brokers across availability zones. @@ -777,7 +777,7 @@ kubectl get nodes --show-labels | grep topology.kubernetes.io/zone See xref:manage:kubernetes/k-rack-awareness.adoc[Rack Awareness in Kubernetes] for configuration details. -=== 24. Redpanda license verification +=== 23. Redpanda license verification Validate Enterprise license if using Enterprise features. @@ -800,7 +800,7 @@ Production deployments using Enterprise features (Tiered Storage, Schema Registr See xref:get-started:licensing/index.adoc[Redpanda Licensing] for more information. -=== 25. 
Operator CRDs validation (Operator deployments only) +=== 24. Operator CRDs validation (Operator deployments only) **CRITICAL**: If your deployment uses the Redpanda Operator, all required Custom Resource Definitions (CRDs) must be installed with compatible versions. Without correct CRDs, the Operator cannot manage the cluster, leading to configuration drift, failed updates, and potential data loss. @@ -833,7 +833,7 @@ If any CRDs are missing or incompatible with your Operator version, the Operator These checks improve operational robustness and performance but are not critical for basic functionality. -=== 26. Deployment method detection +=== 25. Deployment method detection Verify that the deployment method (Helm or Operator) is correctly identified for your cluster. Understanding your deployment method is important for troubleshooting, upgrades, and configuration management. @@ -883,7 +883,7 @@ The presence of a Redpanda custom resource indicates an Operator-managed deploym * Where to find deployment logs and troubleshooting information * Which documentation sections apply to your environment -=== 27. XFS filesystem for data directory +=== 26. XFS filesystem for data directory Verify data directories use XFS filesystem for optimal performance. @@ -907,7 +907,7 @@ XFS provides better performance characteristics for Redpanda workloads compared * Tune filesystem mount options * Consider storage class performance characteristics -=== 28. Pod anti-affinity rules +=== 27. Pod anti-affinity rules Configure pod anti-affinity to spread brokers across nodes. @@ -939,7 +939,7 @@ This prevents single node failures from affecting multiple brokers by ensuring e See xref:reference:k-redpanda-helm-spec.adoc#statefulset-podantiaffinity[Pod Anti-Affinity] configuration options. -=== 29. Node isolation configuration +=== 28. Node isolation configuration Configure taints/tolerations or nodeSelector for workload isolation. 
@@ -962,7 +962,7 @@ Isolating Redpanda workloads on dedicated nodes improves performance predictabil **Memory allocation strategy**:: Optimize memory settings for your workload patterns. -=== 30. Continuous data balancing enabled +=== 29. Continuous data balancing enabled xref:manage:cluster-maintenance/continuous-data-balancing.adoc[Continuous Data Balancing] is a major benefit of Redpanda for managing production deployments. It automatically rebalances partition replicas across brokers based on disk usage and node changes, eliminating manual intervention and preventing performance degradation. @@ -989,7 +989,7 @@ Setting this to `continuous` enables automatic partition rebalancing based on: Without Continuous Data Balancing, partition distribution becomes skewed over time, leading to hotspots and manual rebalancing operations. -=== 31. Core balancing enabled +=== 30. Core balancing enabled Configure core balancing for CPU core partition distribution. @@ -1006,7 +1006,7 @@ true When enabled, Redpanda continuously rebalances partitions between CPU cores on a broker for optimal resource utilization, especially beneficial after broker restarts or configuration changes. -=== 32. System requirements validation +=== 31. System requirements validation Run system checks to validate optimal configuration. @@ -1027,7 +1027,7 @@ Swappiness 1 1 Warning true Review any failed checks and remediate before proceeding to production. See xref:reference:rpk/rpk-redpanda/rpk-redpanda-check.adoc[rpk redpanda check] for details on each validation. -=== 33. Debug bundle generation +=== 32. Debug bundle generation Verify that you can successfully generate and collect a debug bundle from your cluster. This proactive check ensures that when an actual issue occurs and you need to contact Redpanda support, you won't face permission issues or silent collection failures that could delay troubleshooting. 
@@ -1068,7 +1068,7 @@ Testing bundle generation early ensures this critical troubleshooting tool works See xref:manage:kubernetes/troubleshooting/k-diagnostics-bundle.adoc[Diagnostics Bundles in Kubernetes] for detailed information about bundle contents and collection methods. -=== 34. Tiered Storage configuration +=== 33. Tiered Storage configuration Configure Tiered Storage for extended data retention using object storage. Tiered Storage automatically offloads older data to cloud storage (S3, GCS, Azure Blob), enabling extended retention without expanding local disk capacity. @@ -1105,7 +1105,7 @@ kubectl exec -n -c redpanda -- rpk cluster config get clo See xref:manage:tiered-storage.adoc[Tiered Storage] for configuration details and best practices. -=== 35. Security scanning +=== 34. Security scanning Regularly scan container images and configurations for vulnerabilities to maintain security posture. @@ -1144,7 +1144,7 @@ kubectl get redpanda,statefulset,deployment -n -o yaml > cluster-con Establish a regular cadence for security scanning (e.g., weekly or with each deployment). -=== 36. Backup and recovery procedures +=== 35. Backup and recovery procedures Implement and test backup and recovery processes to ensure business continuity. 
@@ -1178,7 +1178,7 @@ kubectl exec -n -c redpanda -- rpk topic describe Date: Thu, 8 Jan 2026 19:58:29 -0300 Subject: [PATCH 06/25] initial pass --- modules/ROOT/nav.adoc | 1 + .../kubernetes/k-production-checklist.adoc | 119 ++++++++++++------ 2 files changed, 81 insertions(+), 39 deletions(-) diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index 9c94fe464d..c95646225a 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -75,6 +75,7 @@ **** xref:deploy:redpanda/kubernetes/k-requirements.adoc[Requirements and Recommendations] **** xref:deploy:redpanda/kubernetes/k-tune-workers.adoc[Tune Worker Nodes] **** xref:deploy:redpanda/kubernetes/k-production-deployment.adoc[Deploy Redpanda] +**** xref:deploy:redpanda/kubernetes/k-production-checklist.adoc[] **** xref:deploy:redpanda/kubernetes/k-high-availability.adoc[High Availability] *** xref:deploy:redpanda/manual/index.adoc[Linux] **** xref:deploy:redpanda/manual/production/requirements.adoc[Hardware and Software Requirements] diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc index e7e2b85155..f672b71208 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc @@ -5,7 +5,7 @@ This checklist validates Redpanda deployments in Kubernetes against production readiness standards. Use this guide to ensure your cluster meets all critical requirements and follows recommended best practices for production deployments. -NOTE: For non-Kubernetes deployments (Linux/VM-based), see the xref:deploy:redpanda/manual/production/production-readiness.adoc[Production Readiness Checklist for Linux]. +NOTE: For Linux deployments, see the xref:deploy:redpanda/manual/production/production-readiness.adoc[Production Readiness Checklist for Linux]. 
=== Authentication note @@ -25,18 +25,39 @@ Common SASL mechanisms are `SCRAM-SHA-256` or `SCRAM-SHA-512`. Update these valu == Critical Production Requirements -These checks are essential for a stable, reliable production deployment. All critical requirements must pass before going live. +The Critical Production Requirements checklist helps you to confirm that: + +- All required defaults and configuration items are specified. +- You have the optimal hardware setup. +- Security is enabled. +- You are set up to run in production. === 1. Cluster health status -Verify the cluster reports as healthy with no broker issues. +Check that all brokers are connected and running. Run xref:reference:rpk/rpk-cluster/rpk-cluster-health.adoc[`rpk cluster health`] to check the health of the cluster. No nodes should be down, and there should be zero leaderless or under-replicated partitions. +[.side-by-side] +-- +.Input [,bash] ---- kubectl exec -n -c redpanda -- rpk cluster health -X user= -X pass= -X sasl.mechanism= ---- -All brokers must report `HEALTHY` status with zero leaderless partitions and zero under-replicated partitions. +.Output +[,bash,role=no-copy] +---- +CLUSTER HEALTH OVERVIEW +======================= +Healthy: true +Unhealthy reasons: [] +Controller ID: 0 +All nodes: [0 1 2] +Nodes down: [] +Leaderless partitions (0): [] +Under-replicated partitions (0): [] +---- +-- === 2. Minimum broker count (≥3) @@ -118,20 +139,23 @@ NODE-ID NUM-CORES MEMBERSHIP-STATUS IS-ALIVE BROKER-VERSION All brokers must show `active` status. If any broker shows `draining` or `decommissioned`, investigate immediately. -See xref:manage:cluster-maintenance/decommission-brokers.adoc[Decommission Brokers] for more information. +See also: xref:manage:cluster-maintenance/decommission-brokers.adoc[Decommission Brokers] === 4. No brokers in maintenance mode -Ensure no brokers are currently in maintenance mode during normal operations. 
+Check that no brokers are in maintenance mode during normal operations. Maintenance mode is used when modifying brokers that will remain as members of the cluster, such as during rolling upgrades or hardware maintenance. While necessary during planned maintenance windows, brokers should not remain in maintenance mode during normal operations. +[.side-by-side] +-- +.Input [,bash] ---- kubectl exec -n -c redpanda -- rpk cluster maintenance status -X user= -X pass= -X sasl.mechanism= ---- -Expected output showing no brokers in maintenance mode: +.Output [,bash,role=no-copy] ---- NODE-ID ENABLED FINISHED ERRORS PARTITIONS ELIGIBLE TRANSFERRING FAILED @@ -139,20 +163,24 @@ NODE-ID ENABLED FINISHED ERRORS PARTITIONS ELIGIBLE TRANSFERRING FAILED 1 false - - - - - - 2 false - - - - - - ---- +-- All brokers should show `ENABLED: false`. If any broker shows `ENABLED: true` outside of a planned maintenance window, investigate immediately. -See xref:manage:node-management.adoc#place-a-broker-in-maintenance-mode[Maintenance Mode] for more information. +See also: xref:manage:node-management.adoc#place-a-broker-in-maintenance-mode[Maintenance Mode] === 5. Consistent Redpanda version -Ensure all brokers run the same Redpanda version. +Check that Redpanda is running the https://github.com/redpanda-data/redpanda/releases[latest point release^] for the major version you're on and that all brokers run the same version. +.Input [,bash] ---- kubectl exec -n -c redpanda -- rpk redpanda admin brokers list -X user= -X pass= -X sasl.mechanism= ---- +// TODO: Add expected output example + Version mismatches can cause compatibility issues and must be resolved. === 6. Version pinning @@ -250,22 +278,26 @@ kubectl get redpanda redpanda -n -o yaml | grep -A 1 "tag:" === 7. Default topic replication factor (≥3) -Verify the default replication factor is set appropriately for production. +Check that the default replication factor is set appropriately for production. 
+[.side-by-side] +-- +.Input [,bash] ---- kubectl exec -n -c redpanda -- rpk cluster config get default_topic_replications -X user= -X pass= -X sasl.mechanism= ---- -Expected output: +.Output [,bash,role=no-copy] ---- 3 ---- +-- Setting `default_topic_replications` to 3 or greater ensures new topics are created with adequate fault tolerance. -See xref:develop:config-topics.adoc#choose-the-replication-factor[Choose the Replication Factor] for guidance on selecting appropriate replication factors. +See also: xref:develop:config-topics.adoc#choose-the-replication-factor[Choose the Replication Factor] === 8. Existing topics replication factor (≥3) @@ -288,7 +320,7 @@ user-events 16 3 All production topics should have `REPLICAS` of 3 or greater. Topics with a replication factor of 1 are at risk of data loss if a broker fails. -See xref:manage:cluster-maintenance/topic-property-configuration.adoc#change-topic-replication-factor[Change Topic Replication Factor] if remediation is needed. +See also: xref:manage:cluster-maintenance/topic-property-configuration.adoc#change-topic-replication-factor[Change Topic Replication Factor] === 9. Persistent storage configuration @@ -394,7 +426,10 @@ Multi-tenant disks can experience: * IOPS throttling that impacts Redpanda's performance * Difficulty troubleshooting performance issues due to external factors -See xref:deploy:redpanda/kubernetes/k-requirements.adoc#storage[Storage requirements] for detailed specifications and xref:deploy:redpanda/kubernetes/k-requirements.adoc#cloud-instance-types[Cloud Instance Types] for recommended instance types across AWS, Azure, and Google Cloud. +See also: + +* xref:deploy:redpanda/kubernetes/k-requirements.adoc#storage[Storage requirements] +* xref:deploy:redpanda/kubernetes/k-requirements.adoc#cloud-instance-types[Cloud Instance Types] **Volume sizing**:: Plan storage capacity for data growth and retention requirements. 
+ @@ -433,7 +468,7 @@ All Redpanda pods must have: Setting requests equal to limits ensures the Pod receives the `Guaranteed` QoS class, which prevents CPU throttling and reduces the risk of Pod eviction. -See xref:manage:kubernetes/k-manage-resources.adoc[Manage Pod Resources] for detailed configuration guidance. +See also: xref:manage:kubernetes/k-manage-resources.adoc[Manage Pod Resources] === 13. CPU to memory ratio (1:2 minimum) @@ -622,7 +657,7 @@ total 16 -rw------- 1 redpanda redpanda 1704 Dec 15 10:00 tls.key ---- -See xref:manage:security/encryption.adoc[TLS Encryption] for detailed configuration instructions. +See also: xref:manage:security/encryption.adoc[TLS Encryption] === 19. Authentication configuration @@ -668,7 +703,10 @@ User:app-consumer * TOPIC orders.* READ ALLOW User:app-consumer * GROUP consumer-group-1 READ ALLOW ---- -See xref:manage:security/authentication.adoc[Authentication] and xref:manage:security/authorization/index.adoc[Authorization] for configuration details. +See also: + +* xref:manage:security/authentication.adoc[Authentication] +* xref:manage:security/authorization/index.adoc[Authorization] === 20. Network security @@ -720,7 +758,7 @@ redpanda ClusterIP None 9093/TCP,9644/TC redpanda-external LoadBalancer 10.100.200.50 9093:30001/TCP ---- -See xref:manage:security/listener-configuration.adoc[Listener Configuration] for securing network endpoints. +See also: xref:manage:security/listener-configuration.adoc[Listener Configuration] === 21. Pod Disruption Budget configured @@ -740,7 +778,7 @@ redpanda N/A 1 1 10d Production deployments must have a PodDisruptionBudget with `maxUnavailable: 1` to prevent simultaneous broker disruptions during voluntary operations like node drains, upgrades, or autoscaler actions. -See xref:manage:kubernetes/k-pod-disruption-budgets.adoc[Pod Disruption Budgets] for configuration details. +See also: xref:manage:kubernetes/k-pod-disruption-budgets.adoc[Pod Disruption Budgets] === 22. 
Rack awareness and topology spread @@ -775,7 +813,7 @@ kubectl get nodes --show-labels | grep topology.kubernetes.io/zone * Use node labels for availability zone awareness (typically `topology.kubernetes.io/zone`) * Prevents single zone failures from affecting multiple brokers -See xref:manage:kubernetes/k-rack-awareness.adoc[Rack Awareness in Kubernetes] for configuration details. +See also: xref:manage:kubernetes/k-rack-awareness.adoc[Rack Awareness] === 23. Redpanda license verification @@ -798,7 +836,7 @@ Expires: Dec 31 2025 Production deployments using Enterprise features (Tiered Storage, Schema Registry, Continuous Data Balancing, etc.) must have a valid Enterprise license that is not expired or about to expire. -See xref:get-started:licensing/index.adoc[Redpanda Licensing] for more information. +See also: xref:get-started:licensing/index.adoc[Redpanda Licensing] === 24. Operator CRDs validation (Operator deployments only) @@ -831,7 +869,7 @@ If any CRDs are missing or incompatible with your Operator version, the Operator == Recommended Production Enhancements -These checks improve operational robustness and performance but are not critical for basic functionality. +The Recommended Production Enhancements checklist confirms that you have adhered to day-2 operations best practices and can diagnose and recover from issues or failures. === 25. Deployment method detection @@ -937,7 +975,7 @@ Expected output showing pod anti-affinity rules: This prevents single node failures from affecting multiple brokers by ensuring each Redpanda pod runs on a different node. -See xref:reference:k-redpanda-helm-spec.adoc#statefulset-podantiaffinity[Pod Anti-Affinity] configuration options. +See also: xref:reference:k-redpanda-helm-spec.adoc#statefulset-podantiaffinity[Pod Anti-Affinity] === 28. 
Node isolation configuration @@ -1066,7 +1104,7 @@ Debug bundle saved to '/tmp/bundle.zip' Testing bundle generation early ensures this critical troubleshooting tool works when you need it most. Debug bundles are often required by Redpanda support to diagnose production issues efficiently. -See xref:manage:kubernetes/troubleshooting/k-diagnostics-bundle.adoc[Diagnostics Bundles in Kubernetes] for detailed information about bundle contents and collection methods. +See also: xref:manage:kubernetes/troubleshooting/k-diagnostics-bundle.adoc[Diagnostics Bundles in Kubernetes] === 33. Tiered Storage configuration @@ -1103,7 +1141,7 @@ kubectl exec -n -c redpanda -- rpk cluster config get clo kubectl exec -n -c redpanda -- rpk cluster config get cloud_storage_region -X user= -X pass= -X sasl.mechanism= ---- -See xref:manage:tiered-storage.adoc[Tiered Storage] for configuration details and best practices. +See also: xref:manage:tiered-storage.adoc[Tiered Storage] === 34. Security scanning @@ -1176,7 +1214,7 @@ kubectl exec -n -c redpanda -- rpk topic describe * Set up external health monitoring * Define SLI/SLO metrics -== Operational Readiness +See also: xref:manage:monitoring.adoc[Monitor Redpanda] + +=== Operational readiness **Deployment automation**:: Implement Infrastructure as Code for reproducible deployments. + @@ -1277,19 +1321,21 @@ kubectl get servicemonitor -n * Implement GitOps workflows * Automate testing and validation -**Non-production environments**:: Maintain separate environments for testing and validation. +**Environment configuration**:: Maintain separate environments for testing and validation. + * Set up dedicated dev/staging/production clusters * Test changes in non-production first * Mirror production configuration in staging -**Upgrade procedures**:: Document and test cluster upgrade processes. +**Upgrade policy**:: Document and test cluster upgrade processes. 
+ * Plan for rolling upgrades with zero downtime * Test upgrade procedures in staging environments * Implement rollback capabilities * Document upgrade policy and procedures +See also: xref:upgrade:k-rolling-upgrade.adoc[Upgrade Redpanda in Kubernetes] + **Incident response**:: Prepare for operational incidents and outages. + * Document troubleshooting procedures @@ -1303,12 +1349,7 @@ kubectl get servicemonitor -n kubectl get resourcequota -n ---- -== Next Steps - -After completing this checklist: +== Suggested reading -1. **Performance testing**: Conduct load testing to validate performance under expected traffic. -2. **Disaster recovery testing**: Test backup and recovery procedures. -3. **Security review**: Conduct security assessment and penetration testing. -4. **Operational validation**: Verify monitoring, alerting, and incident response procedures. -5. **Documentation**: Complete operational runbooks and troubleshooting guides. +- xref:deploy:redpanda/kubernetes/k-production-deployment.adoc[Deploy for Production] +- xref:manage:kubernetes/k-configure-helm-chart.adoc[Customize the Helm Chart] From 374084ee751b8677931f33279119674bc6aed926 Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Mon, 12 Jan 2026 18:11:28 -0300 Subject: [PATCH 07/25] general reorganization --- .../kubernetes/k-production-checklist.adoc | 290 ++++++++++++------ .../production/production-readiness.adoc | 2 +- 2 files changed, 190 insertions(+), 102 deletions(-) diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc index f672b71208..f2aea42a27 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc @@ -1,17 +1,28 @@ -= Redpanda Kubernetes Production Readiness Checklist += Production Readiness Checklist :description: Comprehensive checklist for validating Redpanda deployments in Kubernetes 
against production readiness standards. :page-context-links: [{"name": "Linux", "to": "deploy:redpanda/linux/index.adoc" },{"name": "Kubernetes", "to": "deploy:redpanda/kubernetes/index.adoc" } ] :page-categories: Production, Deployment -This checklist validates Redpanda deployments in Kubernetes against production readiness standards. Use this guide to ensure your cluster meets all critical requirements and follows recommended best practices for production deployments. +Before running a production workload on Redpanda in Kubernetes, follow this readiness checklist. NOTE: For Linux deployments, see the xref:deploy:redpanda/manual/production/production-readiness.adoc[Production Readiness Checklist for Linux]. -=== Authentication note +== Critical requirements -The `rpk` commands throughout this checklist include SASL authentication flags (`-X user`, `-X pass`, `-X sasl.mechanism`). +The Critical requirements checklist helps you to confirm that: -**If your cluster does not use SASL authentication**, you can omit these flags from all commands. For example: +- All required defaults and configuration items are specified. +- You have the optimal hardware setup. +- Security is enabled. +- You are set up to run in production. + +[NOTE] +==== +**SASL authentication flags** + +The `rpk` commands throughout this checklist include SASL authentication flags (`-X user`, `-X pass`, `-X sasl.mechanism`). If your cluster does not use SASL authentication, you can omit these flags from all commands. For example: + +.Input [,bash] ---- # With SASL authentication @@ -22,17 +33,9 @@ kubectl exec -n -c redpanda -- rpk cluster health ---- Common SASL mechanisms are `SCRAM-SHA-256` or `SCRAM-SHA-512`. Update these values as needed for your deployment. +==== -== Critical Production Requirements - -The Critical Production Requirements checklist helps you to confirm that: - -- All required defaults and configuration items are specified. -- You have the optimal hardware setup. 
-- Security is enabled. -- You are set up to run in production. - -=== 1. Cluster health status +=== Cluster health status Check that all brokers are connected and running. Run xref:reference:rpk/rpk-cluster/rpk-cluster-health.adoc[`rpk cluster health`] to check the health of the cluster. No nodes should be down, and there should be zero leaderless or under-replicated partitions. @@ -59,7 +62,7 @@ Under-replicated partitions (0): [] ---- -- -=== 2. Minimum broker count (≥3) +=== Minimum broker count (≥3) Ensure at least 3 brokers are running for production fault tolerance. @@ -67,12 +70,13 @@ Production clusters should have odd numbers of brokers (3, 5, 7, etc.) for optim Verify the running broker count: +.Input [,bash] ---- kubectl get pods -n -l app.kubernetes.io/component=redpanda-statefulset ---- -Expected output showing 3 or more brokers: +.Output [,bash,role=no-copy] ---- NAME READY STATUS RESTARTS AGE @@ -88,12 +92,13 @@ Verify the configured replica count in your deployment: Helm:: + -- +.Input [,bash] ---- helm get values redpanda -n | grep -A 1 "statefulset:" ---- -Expected output: +.Output [,bash,role=no-copy] ---- statefulset: @@ -104,12 +109,13 @@ statefulset: Operator:: + -- +.Input [,bash] ---- kubectl get redpanda redpanda -n -o jsonpath='{.spec.clusterSpec.statefulset.replicas}' ---- -Expected output: +.Output [,bash,role=no-copy] ---- 3 @@ -117,18 +123,19 @@ Expected output: -- ====== -=== 3. All brokers active membership +=== All brokers active membership Verify all brokers are in active state and not being decommissioned. Decommissioning is used to permanently remove a broker from the cluster, such as during node pool migrations or cluster downsizing. Brokers in decommissioned state should not be present in production clusters unless actively performing a planned migration. 
+.Input [,bash] ---- kubectl exec -n -c redpanda -- rpk redpanda admin brokers list -X user= -X pass= -X sasl.mechanism= ---- -Expected output showing all brokers with `active` membership status: +.Output [,bash,role=no-copy] ---- NODE-ID NUM-CORES MEMBERSHIP-STATUS IS-ALIVE BROKER-VERSION @@ -141,14 +148,12 @@ All brokers must show `active` status. If any broker shows `draining` or `decomm See also: xref:manage:cluster-maintenance/decommission-brokers.adoc[Decommission Brokers] -=== 4. No brokers in maintenance mode +=== No brokers in maintenance mode Check that no brokers are in maintenance mode during normal operations. Maintenance mode is used when modifying brokers that will remain as members of the cluster, such as during rolling upgrades or hardware maintenance. While necessary during planned maintenance windows, brokers should not remain in maintenance mode during normal operations. -[.side-by-side] --- .Input [,bash] ---- @@ -163,13 +168,12 @@ NODE-ID ENABLED FINISHED ERRORS PARTITIONS ELIGIBLE TRANSFERRING FAILED 1 false - - - - - - 2 false - - - - - - ---- --- All brokers should show `ENABLED: false`. If any broker shows `ENABLED: true` outside of a planned maintenance window, investigate immediately. See also: xref:manage:node-management.adoc#place-a-broker-in-maintenance-mode[Maintenance Mode] -=== 5. Consistent Redpanda version +=== Consistent Redpanda version Check that Redpanda is running the https://github.com/redpanda-data/redpanda/releases[latest point release^] for the major version you're on and that all brokers run the same version. @@ -183,7 +187,7 @@ kubectl exec -n -c redpanda -- rpk redpanda admin brokers Version mismatches can cause compatibility issues and must be resolved. -=== 6. Version pinning +=== Version pinning **CRITICAL**: Pin specific versions for Redpanda and all related components (Console, Connectors) to prevent unexpected automatic upgrades that could introduce breaking changes or cause downtime during unplanned updates. 
@@ -211,12 +215,13 @@ connectors: ---- Verify pinned versions: +.Input [,bash] ---- helm get values redpanda -n ---- -Expected output showing explicit version tags (not `latest` or version ranges): +.Output [,bash,role=no-copy] ---- image: @@ -256,6 +261,7 @@ spec: ---- Verify pinned versions: +.Input [,bash] ---- kubectl get redpanda redpanda -n -o yaml | grep -A 1 "tag:" @@ -276,7 +282,7 @@ kubectl get redpanda redpanda -n -o yaml | grep -A 1 "tag:" * Version ranges (e.g., `v24.2.x`) - may auto-update to patch releases * Unspecified tags - defaults to latest or chart-defined versions -=== 7. Default topic replication factor (≥3) +=== Default topic replication factor (≥3) Check that the default replication factor is set appropriately for production. @@ -299,16 +305,19 @@ Setting `default_topic_replications` to 3 or greater ensures new topics are crea See also: xref:develop:config-topics.adoc#choose-the-replication-factor[Choose the Replication Factor] -=== 8. Existing topics replication factor (≥3) +=== Existing topics replication factor (≥3) Check that all existing topics have adequate replication. +[.side-by-side] +-- +.Input [,bash] ---- kubectl exec -n -c redpanda -- rpk topic list -X user= -X pass= -X sasl.mechanism= ---- -Expected output showing replication factor ≥3 for all topics: +.Output [,bash,role=no-copy] ---- NAME PARTITIONS REPLICAS @@ -317,21 +326,25 @@ orders 12 3 payments 8 3 user-events 16 3 ---- +-- All production topics should have `REPLICAS` of 3 or greater. Topics with fewer than 3 replicas are at risk of unavailability or data loss if a broker fails. See also: xref:manage:cluster-maintenance/topic-property-configuration.adoc#change-topic-replication-factor[Change Topic Replication Factor] -=== 9. Persistent storage configuration +=== Persistent storage configuration Verify using persistent storage (not hostPath or emptyDir) for data persistence.
+[.side-by-side] +-- +.Input [,bash] ---- kubectl get pvc -n ---- -Expected output showing bound PersistentVolumeClaims: +.Output [,bash,role=no-copy] ---- NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE @@ -339,14 +352,16 @@ datadir-redpanda-0 Bound pvc-a1b2c3d4-e5f6-7890-abcd-ef1234567890 100G datadir-redpanda-1 Bound pvc-b2c3d4e5-f6g7-8901-bcde-fg2345678901 100Gi RWO fast-ssd 10d datadir-redpanda-2 Bound pvc-c3d4e5f6-g7h8-9012-cdef-gh3456789012 100Gi RWO fast-ssd 10d ---- +-- Verify the StatefulSet uses PersistentVolumeClaims: +.Input [,bash] ---- kubectl describe statefulset -n redpanda | grep -A 5 "Volume Claims" ---- -Expected output: +.Output [,bash,role=no-copy] ---- Volume Claims: @@ -359,10 +374,11 @@ Volume Claims: HostPath and emptyDir storage are not suitable for production as they lack durability guarantees. -=== 10. RAID/LVM stripe configuration (multiple disks only) +=== RAID/LVM stripe configuration (multiple disks only) If using multiple physical disks, verify they are configured as RAID-0 or LVM stripe (not linear/concat). +.Input [,bash] ---- # Check block device configuration on nodes @@ -372,7 +388,7 @@ lvs -o lv_name,stripes,stripe_size mdadm --detail /dev/md* # if using software RAID ---- -Expected output for properly configured LVM stripe: +.Output [,bash,role=no-copy] ---- # lsblk output @@ -386,7 +402,7 @@ LV #Stripes StripeSize data 2 256.00k ---- -Expected output for software RAID-0: +.Output [,bash,role=no-copy] ---- # mdadm output @@ -402,7 +418,7 @@ Expected output for software RAID-0: Using LVM linear/concat or JBOD instead of stripe/RAID-0 across multiple disks will severely degrade performance. Each disk must be striped for optimal I/O throughput. Single disk configurations do not require striping. -=== 11. Storage performance requirements +=== Storage performance requirements Ensure storage classes provide adequate IOPS and throughput for your workload. 
@@ -437,16 +453,19 @@ See also: * Include space for compaction operations * Monitor disk usage trends -=== 12. CPU and memory resource limits +=== CPU and memory resource limits Verify pods have resource requests and limits configured. +[.side-by-side] +-- +.Input [,bash] ---- kubectl get pod -n -o jsonpath='{.spec.containers[?(@.name=="redpanda")].resources}' | jq ---- -Expected output showing both requests and limits: +.Output [,bash,role=no-copy] ---- { @@ -460,6 +479,7 @@ Expected output showing both requests and limits: } } ---- +-- All Redpanda pods must have: @@ -470,7 +490,7 @@ Setting requests equal to limits ensures the Pod receives the `Guaranteed` QoS c See also: xref:manage:kubernetes/k-manage-resources.adoc[Manage Pod Resources] -=== 13. CPU to memory ratio (1:2 minimum) +=== CPU to memory ratio (1:2 minimum) Ensure adequate memory allocation relative to CPU for optimal performance. @@ -483,12 +503,13 @@ Verify the CPU to memory ratio in your configuration: Helm:: + -- +.Input [,bash] ---- helm get values redpanda -n | grep -A 2 "resources:" ---- -Expected output showing proper ratio: +.Output [,bash,role=no-copy] ---- resources: @@ -504,12 +525,13 @@ resources: Operator:: + -- +.Input [,bash] ---- kubectl get redpanda redpanda -n -o jsonpath='{.spec.clusterSpec.resources}' | jq ---- -Expected output showing proper ratio: +.Output [,bash,role=no-copy] ---- { @@ -529,7 +551,7 @@ Expected output showing proper ratio: In the examples above, 4 CPU cores with 8 GiB memory provides a 1:2 ratio (2 GiB per core). -=== 14. No fractional CPU requests +=== No fractional CPU requests Ensure CPU requests use whole numbers for consistent performance. @@ -537,12 +559,13 @@ Fractional CPUs can lead to performance variability in production. 
Use whole int Verify CPU configuration: +.Input [,bash] ---- kubectl get pod -n -o jsonpath='{.spec.containers[?(@.name=="redpanda")].resources.requests.cpu}' ---- -Expected output showing whole number: +.Output [,bash,role=no-copy] ---- 4 @@ -556,33 +579,40 @@ Avoid fractional values like `3500m` (3.5 cores) or `7500m` (7.5 cores). * Account for other workloads on shared nodes * Plan for resource growth and burst capacity -=== 15. Authorization enabled +=== Authorization enabled Verify Kafka authorization is enabled for access control. +[.side-by-side] +-- +.Input [,bash] ---- kubectl exec -n -c redpanda -- rpk cluster config get kafka_enable_authorization -X user= -X pass= -X sasl.mechanism= ---- -Expected output: +.Output [,bash,role=no-copy] ---- true ---- +-- Without authorization enabled, any client can access Kafka APIs without authentication. See xref:manage:security/authorization/index.adoc[Authorization] for configuration details. -=== 16. Developer mode disabled +=== Production mode settings + +Verify that developer mode and overprovisioned mode are disabled for production stability. -Ensure developer mode is disabled in production configuration. +Check developer mode: +.Input [,bash] ---- kubectl exec -n -c redpanda -- rpk cluster config get developer_mode -X user= -X pass= -X sasl.mechanism= ---- -Expected output: +.Output [,bash,role=no-copy] ---- false @@ -590,16 +620,15 @@ false Developer mode should never be enabled in production environments. Developer mode disables fsync and bypasses safety checks designed for production workloads. -=== 17. Overprovisioned disabled - -Ensure overprovisioned mode is disabled for production stability. 
+Check overprovisioned mode: +.Input [,bash] ---- kubectl exec -n -c redpanda -- rpk cluster config get overprovisioned -X user= -X pass= -X sasl.mechanism= ---- -Expected output: +.Output [,bash,role=no-copy] ---- false @@ -607,18 +636,19 @@ false Overprovisioned mode bypasses critical resource checks and should never be enabled in production. This mode is intended only for development environments with constrained resources. -=== 18. TLS configuration +=== TLS configuration Configure TLS encryption for all client and inter-broker communication. TLS prevents eavesdropping and man-in-the-middle attacks on network traffic. Verify TLS is enabled on all listeners: +.Input [,bash] ---- kubectl exec -n -c redpanda -- rpk cluster config export -X user= -X pass= -X sasl.mechanism= | grep -A 10 "kafka_api:" ---- -Expected output showing TLS configuration: +.Output [,bash,role=no-copy] ---- redpanda: @@ -643,12 +673,13 @@ redpanda: Verify certificates are properly mounted: +.Input [,bash] ---- kubectl exec -n -c redpanda -- ls -la /etc/tls/certs/ ---- -Expected output showing certificate files: +.Output [,bash,role=no-copy] ---- total 16 @@ -659,18 +690,19 @@ total 16 See also: xref:manage:security/encryption.adoc[TLS Encryption] -=== 19. Authentication configuration +=== Authentication configuration Configure appropriate authentication mechanisms to control access to Redpanda resources. 
Verify SASL users are configured: +.Input [,bash] ---- kubectl exec -n -c redpanda -- rpk acl user list -X user= -X pass= -X sasl.mechanism= ---- -Expected output showing configured users: +.Output [,bash,role=no-copy] ---- USERNAME @@ -689,12 +721,13 @@ monitoring Verify ACLs are configured: +.Input [,bash] ---- kubectl exec -n -c redpanda -- rpk acl list -X user= -X pass= -X sasl.mechanism= ---- -Expected output showing ACL rules: +.Output [,bash,role=no-copy] ---- PRINCIPAL HOST RESOURCE-TYPE RESOURCE-NAME OPERATION PERMISSION @@ -708,18 +741,19 @@ See also: * xref:manage:security/authentication.adoc[Authentication] * xref:manage:security/authorization/index.adoc[Authorization] -=== 20. Network security +=== Network security Secure network access to the cluster using Kubernetes-native controls. Verify NetworkPolicies are configured: +.Input [,bash] ---- kubectl get networkpolicy -n ---- -Expected output showing network policies: +.Output [,bash,role=no-copy] ---- NAME POD-SELECTOR AGE @@ -730,6 +764,7 @@ redpanda-deny-all-ingress app.kubernetes.io/name=redpanda 10d Check NetworkPolicy rules: +.Input [,bash] ---- kubectl describe networkpolicy -n @@ -745,12 +780,13 @@ kubectl describe networkpolicy -n Verify services and exposed ports: +.Input [,bash] ---- kubectl get svc -n ---- -Expected output showing service configuration: +.Output [,bash,role=no-copy] ---- NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) @@ -760,38 +796,43 @@ redpanda-external LoadBalancer 10.100.200.50 9093:30001/TCP See also: xref:manage:security/listener-configuration.adoc[Listener Configuration] -=== 21. Pod Disruption Budget configured +=== Pod Disruption Budget configured Set up PDBs to control voluntary disruptions during maintenance. 
+[.side-by-side] +-- +.Input [,bash] ---- kubectl get pdb -n ---- -Expected output: +.Output [,bash,role=no-copy] ---- NAME MIN AVAILABLE MAX UNAVAILABLE ALLOWED DISRUPTIONS AGE redpanda N/A 1 1 10d ---- +-- Production deployments must have a PodDisruptionBudget with `maxUnavailable: 1` to prevent simultaneous broker disruptions during voluntary operations like node drains, upgrades, or autoscaler actions. See also: xref:manage:kubernetes/k-pod-disruption-budgets.adoc[Pod Disruption Budgets] -=== 22. Rack awareness and topology spread +=== Rack awareness and topology spread Configure topology spread constraints to distribute brokers across availability zones. Verify pod distribution across zones: +.Input [,bash] ---- kubectl get pod -n -o wide ---- -Expected output showing pods spread across different zones: +.Output [,bash,role=no-copy] ---- NAME READY STATUS NODE ZONE @@ -802,6 +843,7 @@ redpanda-2 2/2 Running node-us-west-2c-1.internal us-west-2c Check node availability zone labels: +.Input [,bash] ---- kubectl get nodes --show-labels | grep topology.kubernetes.io/zone @@ -815,16 +857,19 @@ kubectl get nodes --show-labels | grep topology.kubernetes.io/zone See also: xref:manage:kubernetes/k-rack-awareness.adoc[Rack Awareness] -=== 23. Redpanda license verification +=== Redpanda license verification Validate Enterprise license if using Enterprise features. +[.side-by-side] +-- +.Input [,bash] ---- kubectl exec -n -c redpanda -- rpk cluster license info -X user= -X pass= -X sasl.mechanism= ---- -Expected output for valid license: +.Output [,bash,role=no-copy] ---- LICENSE INFORMATION @@ -833,23 +878,25 @@ Organization: Your Company Name Type: enterprise Expires: Dec 31 2025 ---- +-- Production deployments using Enterprise features (Tiered Storage, Schema Registry, Continuous Data Balancing, etc.) must have a valid Enterprise license with sufficient expiration date. See also: xref:get-started:licensing/index.adoc[Redpanda Licensing] -=== 24. 
Operator CRDs validation (Operator deployments only) +=== Operator CRDs validation (Operator deployments only) **CRITICAL**: If your deployment uses the Redpanda Operator, all required Custom Resource Definitions (CRDs) must be installed with compatible versions. Without correct CRDs, the Operator cannot manage the cluster, leading to configuration drift, failed updates, and potential data loss. Verify all required CRDs are installed: +.Input [,bash] ---- kubectl get crd | grep redpanda.com ---- -Expected output: +.Output [,bash,role=no-copy] ---- clusters.cluster.redpanda.com @@ -867,11 +914,15 @@ Required CRDs: If any CRDs are missing or incompatible with your Operator version, the Operator will fail to reconcile resources. -== Recommended Production Enhancements +== Recommended requirements -The Recommended Production Enhancements checklist confirms that you have adhered to day-2 operations best practices and can diagnose and recover from issues or failures. +The Recommended requirements checklist confirms that you can monitor and support your environment on a sustained basis. It includes the following checks: -=== 25. Deployment method detection +- You have adhered to day-2 operations best practices. +- You can diagnose and recover from issues or failures. +- You have configured monitoring, backup, and security scanning. + +=== Deployment method detection Verify that the deployment method (Helm or Operator) is correctly identified for your cluster. Understanding your deployment method is important for troubleshooting, upgrades, and configuration management. @@ -880,12 +931,13 @@ Verify that the deployment method (Helm or Operator) is correctly identified for Helm:: + -- +.Input [,bash] ---- helm list -n ---- -Expected output: +.Output [,bash,role=no-copy] ---- NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION @@ -898,12 +950,13 @@ The presence of a Helm release indicates a Helm-managed deployment. 
Operator:: + -- +.Input [,bash] ---- kubectl get redpanda -n ---- -Expected output: +.Output [,bash,role=no-copy] ---- NAME READY STATUS @@ -921,21 +974,25 @@ The presence of a Redpanda custom resource indicates an Operator-managed deploym * Where to find deployment logs and troubleshooting information * Which documentation sections apply to your environment -=== 26. XFS filesystem for data directory +=== XFS filesystem for data directory Verify data directories use XFS filesystem for optimal performance. +[.side-by-side] +-- +.Input [,bash] ---- kubectl exec -n -c redpanda -- df -khT /var/lib/redpanda/data ---- -Expected output showing XFS filesystem: +.Output [,bash,role=no-copy] ---- Filesystem Type Size Used Avail Use% Mounted on /dev/nvme0n1 xfs 1.8T 14G 1.8T 1% /var/lib/redpanda/data ---- +-- XFS provides better performance characteristics for Redpanda workloads compared to ext4. While ext4 is supported, XFS is strongly recommended for production deployments. @@ -945,16 +1002,19 @@ XFS provides better performance characteristics for Redpanda workloads compared * Tune filesystem mount options * Consider storage class performance characteristics -=== 27. Pod anti-affinity rules +=== Pod anti-affinity rules Configure pod anti-affinity to spread brokers across nodes. +[.side-by-side] +-- +.Input [,bash] ---- kubectl get statefulset redpanda -n -o jsonpath='{.spec.template.spec.affinity}' | jq ---- -Expected output showing pod anti-affinity rules: +.Output [,bash,role=no-copy] ---- { @@ -972,15 +1032,17 @@ Expected output showing pod anti-affinity rules: } } ---- +-- This prevents single node failures from affecting multiple brokers by ensuring each Redpanda pod runs on a different node. See also: xref:reference:k-redpanda-helm-spec.adoc#statefulset-podantiaffinity[Pod Anti-Affinity] -=== 28. Node isolation configuration +=== Node isolation configuration Configure taints/tolerations or nodeSelector for workload isolation. 
+.Input [,bash] ---- kubectl get statefulset redpanda -n -o jsonpath='{.spec.template.spec.nodeSelector}' | jq @@ -1000,20 +1062,25 @@ Isolating Redpanda workloads on dedicated nodes improves performance predictabil **Memory allocation strategy**:: Optimize memory settings for your workload patterns. -=== 29. Continuous data balancing enabled +=== Partition balancing + +Configure automatic partition balancing across brokers and CPU cores. + +**Continuous Data Balancing** xref:manage:cluster-maintenance/continuous-data-balancing.adoc[Continuous Data Balancing] is a major benefit of Redpanda for managing production deployments. It automatically rebalances partition replicas across brokers based on disk usage and node changes, eliminating manual intervention and preventing performance degradation. **This feature should be enabled for all licensed production clusters.** -Verify that Continuous Data Balancing is configured: +Check continuous data balancing: +.Input [,bash] ---- kubectl exec -n -c redpanda -- rpk cluster config get partition_autobalancing_mode -X user= -X pass= -X sasl.mechanism= ---- -Expected output: +.Output [,bash,role=no-copy] ---- continuous @@ -1027,16 +1094,17 @@ Setting this to `continuous` enables automatic partition rebalancing based on: Without Continuous Data Balancing, partition distribution becomes skewed over time, leading to hotspots and manual rebalancing operations. -=== 30. Core balancing enabled +**Core Balancing** -Configure core balancing for CPU core partition distribution. 
+Check core balancing for CPU core partition distribution: +.Input [,bash] ---- kubectl exec -n -c redpanda -- rpk cluster config get core_balancing_on_core_count_change -X user= -X pass= -X sasl.mechanism= ---- -Expected output: +.Output [,bash,role=no-copy] ---- true @@ -1044,16 +1112,19 @@ true When enabled, Redpanda continuously rebalances partitions between CPU cores on a broker for optimal resource utilization, especially beneficial after broker restarts or configuration changes. -=== 31. System requirements validation +=== System requirements validation Run system checks to validate optimal configuration. +[.side-by-side] +-- +.Input [,bash] ---- kubectl exec -n -c redpanda -- rpk redpanda check -X user= -X pass= -X sasl.mechanism= ---- -Expected output showing all checks passed: +.Output [,bash,role=no-copy] ---- CONDITION REQUIRED CURRENT SEVERITY PASSED @@ -1062,15 +1133,17 @@ Free memory per CPU [MB] >= 2048 8192 Warning true NTP Synced true true Warning true Swappiness 1 1 Warning true ---- +-- Review any failed checks and remediate before proceeding to production. See xref:reference:rpk/rpk-redpanda/rpk-redpanda-check.adoc[rpk redpanda check] for details on each validation. -=== 32. Debug bundle generation +=== Debug bundle generation Verify that you can successfully generate and collect a debug bundle from your cluster. This proactive check ensures that when an actual issue occurs and you need to contact Redpanda support, you won't face permission issues or silent collection failures that could delay troubleshooting. Generate a debug bundle: +.Input [,bash] ---- kubectl exec -n -c redpanda -- rpk debug bundle -o /tmp/bundle.zip @@ -1078,7 +1151,7 @@ kubectl exec -n -c redpanda -- rpk debug bundle -o /tmp/b For additional options and arguments, see xref:reference:rpk/rpk-debug/rpk-debug-bundle.adoc[rpk debug bundle]. -Expected output: +.Output [,bash,role=no-copy] ---- Creating bundle file... 
@@ -1106,18 +1179,19 @@ Testing bundle generation early ensures this critical troubleshooting tool works See also: xref:manage:kubernetes/troubleshooting/k-diagnostics-bundle.adoc[Diagnostics Bundles in Kubernetes] -=== 33. Tiered Storage configuration +=== Tiered Storage configuration Configure Tiered Storage for extended data retention using object storage. Tiered Storage automatically offloads older data to cloud storage (S3, GCS, Azure Blob), enabling extended retention without expanding local disk capacity. Verify Tiered Storage configuration: +.Input [,bash] ---- kubectl exec -n -c redpanda -- rpk cluster config get cloud_storage_enabled -X user= -X pass= -X sasl.mechanism= ---- -Expected output if Tiered Storage is enabled: +.Output [,bash,role=no-copy] ---- true @@ -1132,6 +1206,7 @@ true **Verification steps:** +.Input [,bash] ---- # Check bucket configuration @@ -1143,7 +1218,7 @@ kubectl exec -n -c redpanda -- rpk cluster config get clo See also: xref:manage:tiered-storage.adoc[Tiered Storage] -=== 34. Security scanning +=== Security scanning Regularly scan container images and configurations for vulnerabilities to maintain security posture. @@ -1151,13 +1226,14 @@ Regularly scan container images and configurations for vulnerabilities to mainta Verify that container images are scanned before deployment: +.Input [,bash] ---- # Check current image in use kubectl get statefulset redpanda -n -o jsonpath='{.spec.template.spec.containers[?(@.name=="redpanda")].image}' ---- -Expected output: +.Output [,bash,role=no-copy] ---- docker.redpanda.com/redpandadata/redpanda:v24.2.4 @@ -1173,6 +1249,7 @@ docker.redpanda.com/redpandadata/redpanda:v24.2.4 **Configuration scanning:** +.Input [,bash] ---- # Scan Kubernetes manifests @@ -1182,7 +1259,7 @@ kubectl get redpanda,statefulset,deployment -n -o yaml > cluster-con Establish a regular cadence for security scanning (e.g., weekly or with each deployment). -=== 35. 
Backup and recovery procedures +=== Backup and recovery procedures Implement and test backup and recovery processes to ensure business continuity. @@ -1190,6 +1267,7 @@ Implement and test backup and recovery processes to ensure business continuity. Tiered Storage provides built-in backup capabilities by storing data in object storage. Verify Tiered Storage is configured: +.Input [,bash] ---- kubectl exec -n -c redpanda -- rpk cluster config get cloud_storage_enabled -X user= -X pass= -X sasl.mechanism= @@ -1199,6 +1277,7 @@ kubectl exec -n -c redpanda -- rpk cluster config get clo Regularly test recovery procedures to validate RTO/RPO targets: +.Input [,bash] ---- # Test topic restoration from Tiered Storage @@ -1216,18 +1295,19 @@ kubectl exec -n -c redpanda -- rpk topic describe -c redpanda -- rpk cluster config get audit_enabled -X user= -X pass= -X sasl.mechanism= ---- -Expected output if audit logging is enabled: +.Output [,bash,role=no-copy] ---- true @@ -1237,13 +1317,14 @@ true Check where audit logs are being written: +.Input [,bash] ---- # Check audit log topic kubectl exec -n -c redpanda -- rpk topic list -X user= -X pass= -X sasl.mechanism= | grep audit ---- -Expected output: +.Output [,bash,role=no-copy] ---- _redpanda.audit_log 1 3 @@ -1269,9 +1350,14 @@ Audit logging may be required for: See also: xref:manage:audit-logging.adoc[Audit Logging] -== Best Practices +== Advanced requirements + +The Advanced requirements checklist ensures full enterprise readiness. This indicates that your system is operating at the highest level of availability and can prevent or recover from the most serious incidents. The Advanced requirements checklist confirms the following: -The following best practices improve operational robustness but are not critical for basic functionality. +- You are proactively monitoring mission-critical workloads. +- You have business continuity solutions in place. +- You have integrated into enterprise security and operational systems. 
+- Your enterprise is ready to run mission-critical workloads. === Monitoring and observability @@ -1281,6 +1367,7 @@ The following best practices improve operational robustness but are not critical * Configure ServiceMonitor for automatic scraping * Validate metrics endpoint accessibility + +.Input [,bash] ---- kubectl get servicemonitor -n @@ -1344,6 +1431,7 @@ See also: xref:upgrade:k-rolling-upgrade.adoc[Upgrade Redpanda in Kubernetes] **Resource quotas**:: Configure namespace resource quotas to prevent resource exhaustion. + +.Input [,bash] ---- kubectl get resourcequota -n diff --git a/modules/deploy/pages/redpanda/manual/production/production-readiness.adoc b/modules/deploy/pages/redpanda/manual/production/production-readiness.adoc index 52da709883..dab60dbbb6 100644 --- a/modules/deploy/pages/redpanda/manual/production/production-readiness.adoc +++ b/modules/deploy/pages/redpanda/manual/production/production-readiness.adoc @@ -1,7 +1,7 @@ = Production Readiness Checklist :page-aliases: deploy:deployment-option/self-hosted/manual/production/production-readiness.adoc -Before running a production workload on Redpanda, follow this readiness checklist to ensure that you're set up for success. Redpanda Data recommends using the xref:deploy:redpanda/manual/production/production-deployment-automation.adoc[automated deployment instructions] with Ansible. If you cannot deploy with Ansible, use the xref:deploy:redpanda/manual/production/production-deployment.adoc[manual deployment instructions]. +Before running a production workload on Redpanda, follow this readiness checklist. Redpanda Data recommends using the xref:deploy:redpanda/manual/production/production-deployment-automation.adoc[automated deployment instructions] with Ansible. If you cannot deploy with Ansible, use the xref:deploy:redpanda/manual/production/production-deployment.adoc[manual deployment instructions]. 
== Level 1 production readiness From ca3051fc2f7a91c39a15d112e615118fd8b09b79 Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Mon, 12 Jan 2026 19:57:08 -0300 Subject: [PATCH 08/25] move verification and change broker setting check --- .../kubernetes/k-production-checklist.adoc | 60 ++++++++++--------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc index f2aea42a27..6d0308407e 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc @@ -16,6 +16,33 @@ The Critical requirements checklist helps you to confirm that: - Security is enabled. - You are set up to run in production. +=== Redpanda license verification + +Validate Enterprise license if using Enterprise features. + +[.side-by-side] +-- +.Input +[,bash] +---- +kubectl exec -n -c redpanda -- rpk cluster license info -X user= -X pass= -X sasl.mechanism= +---- + +.Output +[,bash,role=no-copy] +---- +LICENSE INFORMATION +=================== +Organization: Your Company Name +Type: enterprise +Expires: Dec 31 2025 +---- +-- + +Production deployments using Enterprise features (Tiered Storage, Schema Registry, Continuous Data Balancing, etc.) must have a valid Enterprise license with sufficient expiration date. + +See also: xref:get-started:licensing/index.adoc[Redpanda Licensing] + [NOTE] ==== **SASL authentication flags** @@ -609,13 +636,13 @@ Check developer mode: .Input [,bash] ---- -kubectl exec -n -c redpanda -- rpk cluster config get developer_mode -X user= -X pass= -X sasl.mechanism= +kubectl exec -n -c redpanda -- grep developer_mode /etc/redpanda/redpanda.yaml ---- .Output [,bash,role=no-copy] ---- -false +developer_mode: false ---- Developer mode should never be enabled in production environments. 
Developer mode disables fsync and bypasses safety checks designed for production workloads. @@ -625,13 +652,13 @@ Check overprovisioned mode: .Input [,bash] ---- -kubectl exec -n -c redpanda -- rpk cluster config get overprovisioned -X user= -X pass= -X sasl.mechanism= +kubectl exec -n -c redpanda -- grep overprovisioned /etc/redpanda/redpanda.yaml ---- .Output [,bash,role=no-copy] ---- -false +overprovisioned: false ---- Overprovisioned mode bypasses critical resource checks and should never be enabled in production. This mode is intended only for development environments with constrained resources. @@ -857,32 +884,7 @@ kubectl get nodes --show-labels | grep topology.kubernetes.io/zone See also: xref:manage:kubernetes/k-rack-awareness.adoc[Rack Awareness] -=== Redpanda license verification - -Validate Enterprise license if using Enterprise features. - -[.side-by-side] --- -.Input -[,bash] ----- -kubectl exec -n -c redpanda -- rpk cluster license info -X user= -X pass= -X sasl.mechanism= ----- - -.Output -[,bash,role=no-copy] ----- -LICENSE INFORMATION -=================== -Organization: Your Company Name -Type: enterprise -Expires: Dec 31 2025 ----- --- -Production deployments using Enterprise features (Tiered Storage, Schema Registry, Continuous Data Balancing, etc.) must have a valid Enterprise license with sufficient expiration date. 
- -See also: xref:get-started:licensing/index.adoc[Redpanda Licensing] === Operator CRDs validation (Operator deployments only) From e901c114b53cfb1c38b7bc5d777d0aded7afc107 Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Mon, 12 Jan 2026 23:21:21 -0300 Subject: [PATCH 09/25] enforce styleguide --- .../kubernetes/k-production-checklist.adoc | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc index 6d0308407e..fbc13be3d9 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc @@ -16,7 +16,7 @@ The Critical requirements checklist helps you to confirm that: - Security is enabled. - You are set up to run in production. -=== Redpanda license verification +=== Redpanda license Validate Enterprise license if using Enterprise features. @@ -62,7 +62,7 @@ kubectl exec -n -c redpanda -- rpk cluster health Common SASL mechanisms are `SCRAM-SHA-256` or `SCRAM-SHA-512`. Update these values as needed for your deployment. ==== -=== Cluster health status +=== Cluster health Check that all brokers are connected and running. Run xref:reference:rpk/rpk-cluster/rpk-cluster-health.adoc[`rpk cluster health`] to check the health of the cluster. No nodes should be down, and there should be zero leaderless or under-replicated partitions. @@ -89,7 +89,7 @@ Under-replicated partitions (0): [] ---- -- -=== Minimum broker count (≥3) +=== Minimum broker count Ensure at least 3 brokers are running for production fault tolerance. @@ -150,7 +150,7 @@ kubectl get redpanda redpanda -n -o jsonpath='{.spec.clusterSpec.sta -- ====== -=== All brokers active membership +=== Active broker membership Verify all brokers are in active state and not being decommissioned. 
@@ -627,7 +627,7 @@ true Without authorization enabled, any client can access Kafka APIs without authentication. See xref:manage:security/authorization/index.adoc[Authorization] for configuration details. -=== Production mode settings +=== Production mode enabled Verify that developer mode and overprovisioned mode are disabled for production stability. @@ -663,7 +663,7 @@ overprovisioned: false Overprovisioned mode bypasses critical resource checks and should never be enabled in production. This mode is intended only for development environments with constrained resources. -=== TLS configuration +=== TLS enabled Configure TLS encryption for all client and inter-broker communication. TLS prevents eavesdropping and man-in-the-middle attacks on network traffic. @@ -717,7 +717,7 @@ total 16 See also: xref:manage:security/encryption.adoc[TLS Encryption] -=== Authentication configuration +=== Authentication enabled Configure appropriate authentication mechanisms to control access to Redpanda resources. @@ -823,7 +823,7 @@ redpanda-external LoadBalancer 10.100.200.50 9093:30001/TCP See also: xref:manage:security/listener-configuration.adoc[Listener Configuration] -=== Pod Disruption Budget configured +=== Pod Disruption Budget Set up PDBs to control voluntary disruptions during maintenance. @@ -886,7 +886,7 @@ See also: xref:manage:kubernetes/k-rack-awareness.adoc[Rack Awareness] -=== Operator CRDs validation (Operator deployments only) +=== Operator CRDs (Operator deployments only) **CRITICAL**: If your deployment uses the Redpanda Operator, all required Custom Resource Definitions (CRDs) must be installed with compatible versions. Without correct CRDs, the Operator cannot manage the cluster, leading to configuration drift, failed updates, and potential data loss. @@ -924,7 +924,7 @@ The Recommended requirements checklist confirms that you can monitor and support - You can diagnose and recover from issues or failures. 
- You have configured monitoring, backup, and security scanning. -=== Deployment method detection +=== Deployment method Verify that the deployment method (Helm or Operator) is correctly identified for your cluster. Understanding your deployment method is important for troubleshooting, upgrades, and configuration management. @@ -976,7 +976,7 @@ The presence of a Redpanda custom resource indicates an Operator-managed deploym * Where to find deployment logs and troubleshooting information * Which documentation sections apply to your environment -=== XFS filesystem for data directory +=== XFS filesystem Verify data directories use XFS filesystem for optimal performance. @@ -1004,7 +1004,7 @@ XFS provides better performance characteristics for Redpanda workloads compared * Tune filesystem mount options * Consider storage class performance characteristics -=== Pod anti-affinity rules +=== Pod anti-affinity Configure pod anti-affinity to spread brokers across nodes. @@ -1040,7 +1040,7 @@ This prevents single node failures from affecting multiple brokers by ensuring e See also: xref:reference:k-redpanda-helm-spec.adoc#statefulset-podantiaffinity[Pod Anti-Affinity] -=== Node isolation configuration +=== Node isolation Configure taints/tolerations or nodeSelector for workload isolation. @@ -1114,7 +1114,7 @@ true When enabled, Redpanda continuously rebalances partitions between CPU cores on a broker for optimal resource utilization, especially beneficial after broker restarts or configuration changes. -=== System requirements validation +=== System requirements Run system checks to validate optimal configuration. @@ -1139,7 +1139,7 @@ Swappiness 1 1 Warning true Review any failed checks and remediate before proceeding to production. See xref:reference:rpk/rpk-redpanda/rpk-redpanda-check.adoc[rpk redpanda check] for details on each validation. 
-=== Debug bundle generation +=== Debug bundle Verify that you can successfully generate and collect a debug bundle from your cluster. This proactive check ensures that when an actual issue occurs and you need to contact Redpanda support, you won't face permission issues or silent collection failures that could delay troubleshooting. @@ -1181,7 +1181,7 @@ Testing bundle generation early ensures this critical troubleshooting tool works See also: xref:manage:kubernetes/troubleshooting/k-diagnostics-bundle.adoc[Diagnostics Bundles in Kubernetes] -=== Tiered Storage configuration +=== Tiered Storage Configure Tiered Storage for extended data retention using object storage. Tiered Storage automatically offloads older data to cloud storage (S3, GCS, Azure Blob), enabling extended retention without expanding local disk capacity. @@ -1261,7 +1261,7 @@ kubectl get redpanda,statefulset,deployment -n -o yaml > cluster-con Establish a regular cadence for security scanning (e.g., weekly or with each deployment). -=== Backup and recovery procedures +=== Backup and recovery Implement and test backup and recovery processes to ensure business continuity. 
From ab3394a82fadb2cac7d50a6465a514813ebe0ef5 Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Tue, 13 Jan 2026 12:00:11 -0300 Subject: [PATCH 10/25] rename to match linux --- modules/ROOT/nav.adoc | 4 ++-- ...-production-checklist.adoc => k-production-readiness.adoc} | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) rename modules/deploy/pages/redpanda/kubernetes/{k-production-checklist.adoc => k-production-readiness.adoc} (99%) diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index c95646225a..f20ebd5836 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -75,7 +75,7 @@ **** xref:deploy:redpanda/kubernetes/k-requirements.adoc[Requirements and Recommendations] **** xref:deploy:redpanda/kubernetes/k-tune-workers.adoc[Tune Worker Nodes] **** xref:deploy:redpanda/kubernetes/k-production-deployment.adoc[Deploy Redpanda] -**** xref:deploy:redpanda/kubernetes/k-production-checklist.adoc[] +**** xref:deploy:redpanda/kubernetes/k-production-readiness.adoc[] **** xref:deploy:redpanda/kubernetes/k-high-availability.adoc[High Availability] *** xref:deploy:redpanda/manual/index.adoc[Linux] **** xref:deploy:redpanda/manual/production/requirements.adoc[Hardware and Software Requirements] @@ -201,7 +201,7 @@ *** xref:console:ui/schema-reg.adoc[Manage in Redpanda Console] ** xref:manage:high-availability.adoc[High Availability] ** xref:manage:disaster-recovery/index.adoc[Disaster Recovery] -*** xref:manage:disaster-recovery/shadowing/index.adoc[Shadowing] +*** xref:manage:disaster-recovery/shadowing/index.adoc[Shadowing]f **** xref:manage:disaster-recovery/shadowing/overview.adoc[Overview] **** xref:manage:disaster-recovery/shadowing/setup.adoc[Configure Shadowing] **** xref:manage:disaster-recovery/shadowing/monitor.adoc[Monitor Shadowing] diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc similarity index 99% rename from 
modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc rename to modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc index fbc13be3d9..d32fc50203 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-checklist.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc @@ -35,7 +35,7 @@ LICENSE INFORMATION =================== Organization: Your Company Name Type: enterprise -Expires: Dec 31 2025 +Expires: Dec 31 2026 ---- -- From 1aaadda0bba2808b59a36e5680b0eb0ddc23aa3a Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Tue, 13 Jan 2026 15:05:00 -0300 Subject: [PATCH 11/25] match production readiness for linux --- .../manual/production/production-readiness.adoc | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/modules/deploy/pages/redpanda/manual/production/production-readiness.adoc b/modules/deploy/pages/redpanda/manual/production/production-readiness.adoc index dab60dbbb6..db0d4b5c3e 100644 --- a/modules/deploy/pages/redpanda/manual/production/production-readiness.adoc +++ b/modules/deploy/pages/redpanda/manual/production/production-readiness.adoc @@ -3,9 +3,11 @@ Before running a production workload on Redpanda, follow this readiness checklist. Redpanda Data recommends using the xref:deploy:redpanda/manual/production/production-deployment-automation.adoc[automated deployment instructions] with Ansible. If you cannot deploy with Ansible, use the xref:deploy:redpanda/manual/production/production-deployment.adoc[manual deployment instructions]. -== Level 1 production readiness +NOTE: For Kubernetes deployments, see the xref:deploy:redpanda/kubernetes/k-production-readiness.adoc[Production Readiness Checklist for Kubernetes]. -The Level 1 readiness checklist helps you to confirm that: +== Critical requirements + +The critical requirements checklist helps you to confirm that: - All required defaults and configuration items are specified. 
- You have the optimal hardware setup. @@ -550,9 +552,9 @@ NODE-ID NUM-CORES MEMBERSHIP-STATUS IS-ALIVE BROKER-VERSION See also: xref:manage:cluster-maintenance/decommission-brokers.adoc[Decommission Brokers] -== Level 2 production readiness +== Recommended requirements -The Level 2 readiness checklist confirms that you can monitor and support your environment on a sustained basis. It includes the following checks: +The recommended requirements checklist confirms that you can monitor and support your environment on a sustained basis. It includes the following checks: - You have adhered to day-2 operations best practices. - You can diagnose and recover from issues or failures. @@ -624,9 +626,9 @@ See also: * xref:deploy:redpanda/manual/high-availability.adoc#multi-az-deployments[Multi-AZ deployments] * xref:manage:kubernetes/k-rack-awareness.adoc#configure-rack-awareness[Configure rack awareness in Kubernetes] -== Level 3 production readiness +== Advanced requirements -The Level 3 readiness checklist ensures full enterprise readiness. This indicates that your system is operating at the highest level of availability and can prevent or recover from the most serious incidents. The Level 3 readiness confirms the following: +The advanced requirements checklist ensure full enterprise readiness. This indicates that your system is operating at the highest level of availability and can prevent or recover from the most serious incidents. The advanced requirements confirm the following: - You are proactively monitoring mission-critical workloads, business continuity solutions, and integration into enterprise security systems. - Your enterprise is ready to run mission-critical workloads. 
From c3c12ccbf32cbaec1ad03980b3d7f633dd16ebde Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Tue, 13 Jan 2026 15:10:59 -0300 Subject: [PATCH 12/25] add tuners --- .../redpanda/kubernetes/k-production-readiness.adoc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc index d32fc50203..def9f37507 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc @@ -517,11 +517,11 @@ Setting requests equal to limits ensures the Pod receives the `Guaranteed` QoS c See also: xref:manage:kubernetes/k-manage-resources.adoc[Manage Pod Resources] -=== CPU to memory ratio (1:2 minimum) +=== CPU to memory ratio Ensure adequate memory allocation relative to CPU for optimal performance. -Production deployments should provision at least 2 GiB of memory per CPU core. +Production deployments should provision at least 2 GiB of memory per CPU core. The ratio should be at least 1:2 (2 GiB per core). Verify the CPU to memory ratio in your configuration: @@ -663,6 +663,8 @@ overprovisioned: false Overprovisioned mode bypasses critical resource checks and should never be enabled in production. This mode is intended only for development environments with constrained resources. +Verify in Helm values that `resources.cpu.overprovisioned` is not explicitly set to `true` (it's automatically calculated based on CPU allocation). + === TLS enabled Configure TLS encryption for all client and inter-broker communication. TLS prevents eavesdropping and man-in-the-middle attacks on network traffic. @@ -916,6 +918,10 @@ Required CRDs: If any CRDs are missing or incompatible with your Operator version, the Operator will fail to reconcile resources. +=== Run Redpanda tuners + +Check that you have configured tuners for optimal performance. 
Tuners can significantly impact latency and throughput. In Kubernetes, tuners are configured through the Helm chart or may need to be run on worker nodes themselves. For details, see xref:deploy:redpanda/kubernetes/k-tune-workers.adoc[Tune Kubernetes Worker Nodes for Production]. + == Recommended requirements The Recommended requirements checklist confirms that you can monitor and support your environment on a sustained basis. It includes the following checks: @@ -1354,7 +1360,7 @@ See also: xref:manage:audit-logging.adoc[Audit Logging] == Advanced requirements -The Advanced requirements checklist ensures full enterprise readiness. This indicates that your system is operating at the highest level of availability and can prevent or recover from the most serious incidents. The Advanced requirements checklist confirms the following: +The Advanced requirements checklist ensure full enterprise readiness. This indicates that your system is operating at the highest level of availability and can prevent or recover from the most serious incidents. The Advanced requirements checklist confirms the following: - You are proactively monitoring mission-critical workloads. - You have business continuity solutions in place. From 5c6fe01155bfa6a87d76241b9278f48b030a5362 Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Tue, 13 Jan 2026 20:51:06 -0300 Subject: [PATCH 13/25] continue changing the doc to match linux. --- .../kubernetes/k-production-readiness.adoc | 116 +++++------------- 1 file changed, 31 insertions(+), 85 deletions(-) diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc index def9f37507..e6b87fa7a5 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc @@ -216,7 +216,7 @@ Version mismatches can cause compatibility issues and must be resolved. 
=== Version pinning -**CRITICAL**: Pin specific versions for Redpanda and all related components (Console, Connectors) to prevent unexpected automatic upgrades that could introduce breaking changes or cause downtime during unplanned updates. +WARNING: Pin specific versions for Redpanda and all related components (Console, Connectors) to prevent unexpected automatic upgrades that could introduce breaking changes or cause downtime during unplanned updates. Verify that versions are explicitly pinned in your deployment configuration: @@ -890,7 +890,7 @@ See also: xref:manage:kubernetes/k-rack-awareness.adoc[Rack Awareness] === Operator CRDs (Operator deployments only) -**CRITICAL**: If your deployment uses the Redpanda Operator, all required Custom Resource Definitions (CRDs) must be installed with compatible versions. Without correct CRDs, the Operator cannot manage the cluster, leading to configuration drift, failed updates, and potential data loss. +WARNING: If your deployment uses the Redpanda Operator, all required Custom Resource Definitions (CRDs) must be installed with compatible versions. Without correct CRDs, the Operator cannot manage the cluster, leading to configuration drift, failed updates, and potential data loss. Verify all required CRDs are installed: @@ -1338,112 +1338,58 @@ kubectl exec -n -c redpanda -- rpk topic list -X user= ---- -**Grafana dashboards**:: Import and configure Redpanda monitoring dashboards. -+ -* Import official Redpanda dashboards -* Verify dashboards display data correctly -* Customize dashboards for your environment +=== System log retention -**Alerting rules**:: Implement alerting for critical metrics and conditions. -+ -* CPU and memory utilization alerts -* Disk space alerts -* Replication lag alerts -* Broker health alerts +Check that system logs are being captured and stored for an appropriate period of time (minimally, 7 days). 
Configure log forwarding using tools like Fluentd or your cloud provider's logging solution to send logs to a central location. -**Log aggregation**:: Configure centralized log collection and analysis. -+ -* Forward Redpanda logs to central logging system -* Set up log retention policies -* Configure log-based alerting +=== Environment configuration -**Health checks**:: Implement application-level health checks. -+ -* Configure Kubernetes liveness and readiness probes -* Set up external health monitoring -* Define SLI/SLO metrics +Check that you have a development or test environment configured to evaluate upgrades and configuration changes before applying them to production. -See also: xref:manage:monitoring.adoc[Monitor Redpanda] +=== Upgrade policy -=== Operational readiness +Check that you have an upgrade policy defined and implemented. Redpanda supports xref:upgrade:k-rolling-upgrade.adoc[rolling upgrades], so upgrades do not require downtime. However, make sure that upgrades are scheduled on a regular basis, ideally using automation with xref:manage:kubernetes/k-configure-helm-chart.adoc[Helm] or GitOps workflows. -**Deployment automation**:: Implement Infrastructure as Code for reproducible deployments. -+ -* Use Helm charts or Kubernetes manifests in version control -* Implement GitOps workflows -* Automate testing and validation +== Advanced requirements -**Environment configuration**:: Maintain separate environments for testing and validation. -+ -* Set up dedicated dev/staging/production clusters -* Test changes in non-production first -* Mirror production configuration in staging +The Advanced requirements checklist ensures full enterprise readiness. This indicates that your system is operating at the highest level of availability and can prevent or recover from the most serious incidents. The Advanced requirements checklist confirms the following: -**Upgrade policy**:: Document and test cluster upgrade processes.
-+ -* Plan for rolling upgrades with zero downtime -* Test upgrade procedures in staging environments -* Implement rollback capabilities -* Document upgrade policy and procedures +- You are proactively monitoring mission-critical workloads. +- You have business continuity solutions in place. +- You have integrated into enterprise security and operational systems. +- Your enterprise is ready to run mission-critical workloads. -See also: xref:upgrade:k-rolling-upgrade.adoc[Upgrade Redpanda in Kubernetes] +=== Configure alerts -**Incident response**:: Prepare for operational incidents and outages. -+ -* Document troubleshooting procedures -* Establish on-call processes -* Create incident response playbooks +A standard set of alerts for xref:manage:monitoring.adoc#generate-grafana-dashboard[Grafana] or xref:manage:monitoring.adoc#configure-prometheus[Prometheus] is provided in the https://github.com/redpanda-data/observability[GitHub Redpanda observability repo^]. Customize these alerts for your specific needs. -**Resource quotas**:: Configure namespace resource quotas to prevent resource exhaustion. -+ -.Input -[,bash] ----- -kubectl get resourcequota -n ----- +See also: xref:reference:monitor-metrics.adoc[Monitoring Metrics] + +=== Deployment automation + +Review your deployment automation. Ensure that cluster configuration is managed using xref:manage:kubernetes/k-configure-helm-chart.adoc[Helm] or GitOps workflows, and that all configuration is saved in source control. + +=== Monitor security settings + +Regularly review your cluster's security settings using the link:/api/doc/admin/operation/operation-get_security_report[`/v1/security/report`] Admin API endpoint. Investigate and address any issues identified in the alerts section. 
+ +include::manage:partial$security-report.adoc[] == Suggested reading From b328eb3105a2f5cde09e888a99c2dffb5be33b1f Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Mon, 19 Jan 2026 12:20:48 -0300 Subject: [PATCH 14/25] Update modules/ROOT/nav.adoc --- modules/ROOT/nav.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index f20ebd5836..5eb34695dc 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -201,7 +201,7 @@ *** xref:console:ui/schema-reg.adoc[Manage in Redpanda Console] ** xref:manage:high-availability.adoc[High Availability] ** xref:manage:disaster-recovery/index.adoc[Disaster Recovery] -*** xref:manage:disaster-recovery/shadowing/index.adoc[Shadowing]f +*** xref:manage:disaster-recovery/shadowing/index.adoc[Shadowing] **** xref:manage:disaster-recovery/shadowing/overview.adoc[Overview] **** xref:manage:disaster-recovery/shadowing/setup.adoc[Configure Shadowing] **** xref:manage:disaster-recovery/shadowing/monitor.adoc[Monitor Shadowing] From 5a0fd160a54fc69a5752619953a9207f9d65a2e5 Mon Sep 17 00:00:00 2001 From: Josh Purcell Date: Mon, 19 Jan 2026 15:05:49 -0600 Subject: [PATCH 15/25] remove TODO, add command output example and info regarding verifying operator/helm version --- .../kubernetes/k-production-readiness.adoc | 74 ++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc index e6b87fa7a5..197e88fe53 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc @@ -204,15 +204,85 @@ See also: xref:manage:node-management.adoc#place-a-broker-in-maintenance-mode[Ma Check that Redpanda is running the https://github.com/redpanda-data/redpanda/releases[latest point release^] for the major version you're on and that 
all brokers run the same version. +**Verify Redpanda broker versions:** + .Input [,bash] ---- kubectl exec -n -c redpanda -- rpk redpanda admin brokers list -X user= -X pass= -X sasl.mechanism= ---- -// TODO: Add expected output example +.Output +[,bash,role=no-copy] +---- +NODE-ID NUM-CORES MEMBERSHIP-STATUS IS-ALIVE BROKER-VERSION +0 4 active true v25.2.4 +1 4 active true v25.2.4 +2 4 active true v25.2.4 +---- + +All brokers must show the same `BROKER-VERSION`. Version mismatches between brokers can cause compatibility issues and must be resolved. + +**Verify Helm Chart or Operator version compatibility:** + +For Kubernetes deployments, you must also verify that your deployment tool (Helm Chart or Operator) version is compatible with your Redpanda version. The Helm Chart or Operator version must be within one minor version of the Redpanda version. + +For example, if running Redpanda v25.2.x, the Helm Chart or Operator version must be v25.1.x, v25.2.x, or v25.3.x. + +[tabs] +====== +Helm:: ++ +-- +.Input +[,bash] +---- +helm list -n +---- + +.Output +[,bash,role=no-copy] +---- +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +redpanda redpanda 1 2024-01-15 10:30:00.123456 -0800 PST deployed redpanda-5.2.4 v25.2.4 +---- + +The `CHART` column shows the Helm Chart version (e.g., `redpanda-5.2.4`), which should be compatible with the `APP VERSION` (Redpanda version). +-- + +Operator:: ++ +-- +.Input +[,bash] +---- +kubectl get deployment redpanda-controller-manager -n -o jsonpath='{.spec.template.spec.containers[0].image}' +---- + +.Output +[,bash,role=no-copy] +---- +docker.redpanda.com/redpandadata/redpanda-operator:v25.2.4 +---- + +The Operator version is shown in the image tag (e.g., `v25.2.4`), which should be compatible with your Redpanda broker version. 
+ +You can also check the Operator version using: + +.Input +[,bash] +---- +kubectl get redpanda redpanda -n -o jsonpath='{.metadata.annotations.redpanda\.com/operator-version}' +---- +-- +====== + +**Version compatibility requirements:** -Version mismatches can cause compatibility issues and must be resolved. +* All Redpanda brokers must run the same version +* Helm Chart or Operator version must be within ±1 minor version of Redpanda version +* Example: Redpanda v25.2.x requires Helm/Operator v25.1.x, v25.2.x, or v25.3.x +* Running incompatible versions can lead to deployment failures or cluster instability === Version pinning From adff72467371b9ebcf9d4ecb40440513b66c0172 Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Fri, 23 Jan 2026 15:59:13 -0300 Subject: [PATCH 16/25] Apply suggestions from code review Co-authored-by: Joyce Fee <102751339+Feediver1@users.noreply.github.com> --- .../pages/redpanda/kubernetes/k-production-deployment.adoc | 2 +- .../pages/redpanda/kubernetes/k-production-workflow.adoc | 2 +- .../deploy/pages/redpanda/kubernetes/k-requirements.adoc | 2 +- .../redpanda/manual/production/production-readiness.adoc | 6 +++--- modules/deploy/partials/high-availability.adoc | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-deployment.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-deployment.adoc index 1485890a48..27faaef75b 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-deployment.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-deployment.adoc @@ -779,7 +779,7 @@ include::deploy:partial$kubernetes/guides/troubleshoot.adoc[leveloffset=+1] After deploying Redpanda, validate your production readiness: -- xref:deploy:redpanda/kubernetes/k-production-checklist.adoc[Production readiness checklist] - Comprehensive validation of your deployment against production standards +- 
xref:deploy:redpanda/kubernetes/k-production-readiness.adoc[Production readiness checklist] - Comprehensive validation of your deployment against production standards See the xref:manage:kubernetes/index.adoc[Manage Kubernetes topics] to learn how to customize your deployment to meet your needs. diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-workflow.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-workflow.adoc index ebf5576038..e7258c1a08 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-workflow.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-workflow.adoc @@ -10,4 +10,4 @@ The production deployment tasks involve Kubernetes administrators (admins) as we . All: xref:deploy:redpanda/kubernetes/k-requirements.adoc[Review the requirements and recommendations] to align on prerequisites. . Admin: xref:deploy:redpanda/kubernetes/k-tune-workers.adoc[Tune the worker nodes] for best performance. . User: xref:deploy:redpanda/kubernetes/k-production-deployment.adoc[Deploy Redpanda] using either the Redpanda Operator or the Redpanda Helm chart. -. All: xref:deploy:redpanda/kubernetes/k-production-checklist.adoc[Validate production readiness] using the comprehensive checklist to ensure your deployment meets production standards. +. All: xref:deploy:redpanda/kubernetes/k-production-readiness.adoc[Validate production readiness] using the comprehensive checklist to ensure your deployment meets production standards. 
diff --git a/modules/deploy/pages/redpanda/kubernetes/k-requirements.adoc b/modules/deploy/pages/redpanda/kubernetes/k-requirements.adoc index 2f7ef9cfb5..0f273af4d6 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-requirements.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-requirements.adoc @@ -14,7 +14,7 @@ include::deploy:partial$requirements.adoc[] After meeting these requirements, proceed to: - xref:deploy:redpanda/kubernetes/k-production-deployment.adoc[Deploy Redpanda for production] -- xref:deploy:redpanda/kubernetes/k-production-checklist.adoc[Validate production readiness] with the comprehensive checklist +- xref:deploy:redpanda/kubernetes/k-production-readiness.adoc[Validate production readiness] with the comprehensive checklist include::shared:partial$suggested-reading.adoc[] diff --git a/modules/deploy/pages/redpanda/manual/production/production-readiness.adoc b/modules/deploy/pages/redpanda/manual/production/production-readiness.adoc index db0d4b5c3e..552f47182f 100644 --- a/modules/deploy/pages/redpanda/manual/production/production-readiness.adoc +++ b/modules/deploy/pages/redpanda/manual/production/production-readiness.adoc @@ -7,7 +7,7 @@ NOTE: For Kubernetes deployments, see the xref:deploy:redpanda/kubernetes/k-prod == Critical requirements -The critical requirements checklist helps you to confirm that: +The Critical requirements checklist helps you to confirm that: - All required defaults and configuration items are specified. - You have the optimal hardware setup. @@ -554,7 +554,7 @@ See also: xref:manage:cluster-maintenance/decommission-brokers.adoc[Decommission == Recommended requirements -The recommended requirements checklist confirms that you can monitor and support your environment on a sustained basis. It includes the following checks: +The Recommended requirements checklist confirms that you can monitor and support your environment on a sustained basis. 
It includes the following checks: - You have adhered to day-2 operations best practices. - You can diagnose and recover from issues or failures. @@ -628,7 +628,7 @@ See also: == Advanced requirements -The advanced requirements checklist ensure full enterprise readiness. This indicates that your system is operating at the highest level of availability and can prevent or recover from the most serious incidents. The advanced requirements confirm the following: +The Advanced requirements checklist ensures full enterprise readiness. This indicates that your system is operating at the highest level of availability and can prevent or recover from the most serious incidents. The Advanced requirements checklist confirms the following: - You are proactively monitoring mission-critical workloads, business continuity solutions, and integration into enterprise security systems. - Your enterprise is ready to run mission-critical workloads. diff --git a/modules/deploy/partials/high-availability.adoc b/modules/deploy/partials/high-availability.adoc index 4543e66c83..5801bf4f76 100644 --- a/modules/deploy/partials/high-availability.adoc +++ b/modules/deploy/partials/high-availability.adoc @@ -532,7 +532,7 @@ cat debug.log | grep -v ApiVersions | egrep 'opening|read' include::shared:partial$suggested-reading.adoc[] ifdef::env-kubernetes[] -* xref:deploy:redpanda/kubernetes/k-production-checklist.adoc[Production readiness checklist] - Validate your Kubernetes deployment against production standards +* xref:deploy:redpanda/kubernetes/k-production-readiness.adoc[Production readiness checklist] - Validate your Kubernetes deployment against production standards endif::[] * https://redpanda.com/blog/redpanda-official-jepsen-report-and-analysis?utm_assettype=report&utm_assetname=roi_report&utm_source=gated_content&utm_medium=content&utm_campaign=jepsen_blog[Redpanda's official Jepsen report^] From 51e9f4163c45e55059de492f8a5e3a7821a09c98 Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Fri, 23 Jan 2026
19:57:13 -0300 Subject: [PATCH 17/25] Apply suggestions from code review Co-authored-by: Joyce Fee <102751339+Feediver1@users.noreply.github.com> --- .../kubernetes/k-production-readiness.adoc | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc index 197e88fe53..66a3ba1521 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc @@ -9,7 +9,7 @@ NOTE: For Linux deployments, see the xref:deploy:redpanda/manual/production/prod == Critical requirements -The Critical requirements checklist helps you to confirm that: +The Critical requirements checklist helps ensure that: - All required defaults and configuration items are specified. - You have the optimal hardware setup. @@ -18,7 +18,7 @@ The Critical requirements checklist helps you to confirm that: === Redpanda license -Validate Enterprise license if using Enterprise features. +If using Enterprise features, validate that you are using a valid Enterprise license: [.side-by-side] -- @@ -39,7 +39,7 @@ Expires: Dec 31 2026 ---- -- -Production deployments using Enterprise features (Tiered Storage, Schema Registry, Continuous Data Balancing, etc.) must have a valid Enterprise license with sufficient expiration date. +Production deployments using Enterprise features (such as Tiered Storage, Schema Registry, or Continuous Data Balancing) must have a valid Enterprise license with a sufficient expiration date. See also: xref:get-started:licensing/index.adoc[Redpanda Licensing] @@ -93,7 +93,7 @@ Under-replicated partitions (0): [] Ensure at least 3 brokers are running for production fault tolerance. -Production clusters should have odd numbers of brokers (3, 5, 7, etc.) for optimal consensus behavior. 
+Production clusters should have an odd number of brokers (3, 5, 7, etc.) for optimal consensus behavior. Verify the running broker count: @@ -152,9 +152,9 @@ kubectl get redpanda redpanda -n -o jsonpath='{.spec.clusterSpec.sta === Active broker membership -Verify all brokers are in active state and not being decommissioned. +Verify that all brokers are in active state and not being decommissioned. -Decommissioning is used to permanently remove a broker from the cluster, such as during node pool migrations or cluster downsizing. Brokers in decommissioned state should not be present in production clusters unless actively performing a planned migration. +Decommissioning is used to permanently remove a broker from the cluster, such as during node pool migrations or cluster downsizing. Brokers in a decommissioned state should not be present in production clusters unless actively performing a planned migration. .Input [,bash] @@ -171,7 +171,7 @@ NODE-ID NUM-CORES MEMBERSHIP-STATUS IS-ALIVE BROKER-VERSION 2 4 active true v24.2.4 ---- -All brokers must show `active` status. If any broker shows `draining` or `decommissioned`, investigate immediately. +All brokers must show `active` status. If any broker shows the status `draining` or `decommissioned`, investigate immediately. See also: xref:manage:cluster-maintenance/decommission-brokers.adoc[Decommission Brokers] @@ -221,7 +221,7 @@ NODE-ID NUM-CORES MEMBERSHIP-STATUS IS-ALIVE BROKER-VERSION 2 4 active true v25.2.4 ---- -All brokers must show the same `BROKER-VERSION`. Version mismatches between brokers can cause compatibility issues and must be resolved. +All brokers must show the same `BROKER-VERSION`. Version mismatches between brokers can cause compatibility issues and must be resolved before advancing to production. 
**Verify Helm Chart or Operator version compatibility:** @@ -247,7 +247,7 @@ NAME NAMESPACE REVISION UPDATED STATUS CH redpanda redpanda 1 2024-01-15 10:30:00.123456 -0800 PST deployed redpanda-5.2.4 v25.2.4 ---- -The `CHART` column shows the Helm Chart version (e.g., `redpanda-5.2.4`), which should be compatible with the `APP VERSION` (Redpanda version). +The `CHART` column shows the Helm Chart version (for example, `redpanda-5.2.4`), which should be compatible with the `APP VERSION` (Redpanda version). -- Operator:: @@ -265,7 +265,7 @@ kubectl get deployment redpanda-controller-manager -n -o jsonpath='{ docker.redpanda.com/redpandadata/redpanda-operator:v25.2.4 ---- -The Operator version is shown in the image tag (e.g., `v25.2.4`), which should be compatible with your Redpanda broker version. +The Operator version is shown in the image tag (for example, `v25.2.4`), which should be compatible with your Redpanda broker version. You can also check the Operator version using: @@ -280,7 +280,7 @@ kubectl get redpanda redpanda -n -o jsonpath='{.metadata.annotations **Version compatibility requirements:** * All Redpanda brokers must run the same version -* Helm Chart or Operator version must be within ±1 minor version of Redpanda version +* The Helm Chart or Operator version must be within ±1 minor version of Redpanda version * Example: Redpanda v25.2.x requires Helm/Operator v25.1.x, v25.2.x, or v25.3.x * Running incompatible versions can lead to deployment failures or cluster instability @@ -312,6 +312,7 @@ connectors: ---- Verify pinned versions: + .Input [,bash] ---- @@ -358,6 +359,7 @@ spec: ---- Verify pinned versions: + .Input [,bash] ---- @@ -368,20 +370,21 @@ kubectl get redpanda redpanda -n -o yaml | grep -A 1 "tag:" **Why this matters**: -* Prevents automatic upgrades during unintended times (e.g., during high-traffic periods) +* Prevents automatic upgrades during unintended times (for example, during high-traffic periods) * Ensures all environments 
(dev/staging/prod) run the same tested versions * Allows controlled upgrade testing in non-production environments first * Avoids compatibility issues between Redpanda and its components * Provides rollback capability to known-good versions **Avoid using**: -* `latest` tag - always pulls the newest version -* Version ranges (e.g., `v24.2.x`) - may auto-update to patch releases -* Unspecified tags - defaults to latest or chart-defined versions -=== Default topic replication factor (≥3) +* `latest` tag: always pulls the newest version +* Version ranges (for example, `v24.2.x`): may auto-update to patch releases +* Unspecified tags: defaults to latest or chart-defined versions + +=== Default topic replication factor -Check that the default replication factor is set appropriately for production. +Check that the default replication factor (≥3) is set appropriately for production. [.side-by-side] -- @@ -398,13 +401,13 @@ kubectl exec -n -c redpanda -- rpk cluster config get def ---- -- -Setting `default_topic_replications` to 3 or greater ensures new topics are created with adequate fault tolerance. +Setting `default_topic_replications` to `3` or greater ensures new topics are created with adequate fault tolerance. See also: xref:develop:config-topics.adoc#choose-the-replication-factor[Choose the Replication Factor] -=== Existing topics replication factor (≥3) +=== Existing topics replication factor -Check that all existing topics have adequate replication. +Check that all existing topics have adequate replication (default is `3`). [.side-by-side] -- @@ -431,7 +434,7 @@ See also: xref:manage:cluster-maintenance/topic-property-configuration.adoc#chan === Persistent storage configuration -Verify using persistent storage (not hostPath or emptyDir) for data persistence. +Verify that you have configured persistent storage (not hostPath or emptyDir) for data persistence. 
[.side-by-side] -- @@ -522,7 +525,7 @@ Ensure storage classes provide adequate IOPS and throughput for your workload. **Performance specifications:** * Use NVMe-based storage classes for production deployments -* Minimum 16,000 IOPS (Input/Output Operations Per Second) required +* Specify a minimum 16,000 IOPS (Input/Output Operations Per Second) * Consider provisioned IOPS where available to meet or exceed the minimum * Enabling xref:develop:config-topics.adoc#configure-write-caching[write caching] can help Redpanda perform better in environments with disks that don't meet the recommended IOPS * NFS (Network File System) is not supported From a1d20bb0422771e6844aa3fb42a6548afa7746d8 Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Fri, 23 Jan 2026 20:05:26 -0300 Subject: [PATCH 18/25] Apply suggestion from @paulohtb6 --- .../pages/redpanda/kubernetes/k-production-readiness.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc index 66a3ba1521..f590b55241 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc @@ -286,7 +286,7 @@ kubectl get redpanda redpanda -n -o jsonpath='{.metadata.annotations === Version pinning -WARNING: Pin specific versions for Redpanda and all related components (Console, Connectors) to prevent unexpected automatic upgrades that could introduce breaking changes or cause downtime during unplanned updates. +TIP: Pin specific versions for Redpanda and all related components (Console, Connectors) to prevent unexpected automatic upgrades that could introduce breaking changes or cause downtime during unplanned updates. 
Verify that versions are explicitly pinned in your deployment configuration: From 53d83381d29f879b826df58d7b5b6ebe339fca96 Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Fri, 23 Jan 2026 20:27:13 -0300 Subject: [PATCH 19/25] apply review points --- .../kubernetes/k-production-readiness.adoc | 80 +++++++------------ 1 file changed, 29 insertions(+), 51 deletions(-) diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc index f590b55241..aaf5fe7e49 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc @@ -2,9 +2,14 @@ :description: Comprehensive checklist for validating Redpanda deployments in Kubernetes against production readiness standards. :page-context-links: [{"name": "Linux", "to": "deploy:redpanda/linux/index.adoc" },{"name": "Kubernetes", "to": "deploy:redpanda/kubernetes/index.adoc" } ] :page-categories: Production, Deployment +:learning-objective-1: Validate a Kubernetes-deployed Redpanda cluster against production readiness standards Before running a production workload on Redpanda in Kubernetes, follow this readiness checklist. +By completing this checklist, you will be able to: + +* [ ] {learning-objective-1} + NOTE: For Linux deployments, see the xref:deploy:redpanda/manual/production/production-readiness.adoc[Production Readiness Checklist for Linux]. == Critical requirements @@ -91,7 +96,7 @@ Under-replicated partitions (0): [] === Minimum broker count -Ensure at least 3 brokers are running for production fault tolerance. +You must have at least three brokers running to ensure production-level fault tolerance. Production clusters should have an odd number of brokers (3, 5, 7, etc.) for optimal consensus behavior. @@ -173,7 +178,7 @@ NODE-ID NUM-CORES MEMBERSHIP-STATUS IS-ALIVE BROKER-VERSION All brokers must show `active` status. 
If any broker shows the status `draining` or `decommissioned`, investigate immediately. -See also: xref:manage:cluster-maintenance/decommission-brokers.adoc[Decommission Brokers] +See also: xref:manage:kubernetes/k-decommission-brokers.adoc[Decommission Brokers] === No brokers in maintenance mode @@ -198,7 +203,7 @@ NODE-ID ENABLED FINISHED ERRORS PARTITIONS ELIGIBLE TRANSFERRING FAILED All brokers should show `ENABLED: false`. If any broker shows `ENABLED: true` outside of a planned maintenance window, investigate immediately. -See also: xref:manage:node-management.adoc#place-a-broker-in-maintenance-mode[Maintenance Mode] +See also: xref:manage:kubernetes/k-rolling-restart.adoc[Maintenance Mode] === Consistent Redpanda version @@ -368,19 +373,9 @@ kubectl get redpanda redpanda -n -o yaml | grep -A 1 "tag:" -- ====== -**Why this matters**: - -* Prevents automatic upgrades during unintended times (for example, during high-traffic periods) -* Ensures all environments (dev/staging/prod) run the same tested versions -* Allows controlled upgrade testing in non-production environments first -* Avoids compatibility issues between Redpanda and its components -* Provides rollback capability to known-good versions - -**Avoid using**: +Version pinning prevents automatic upgrades during unintended times (for example, during high-traffic periods) and ensures all environments (dev/staging/prod) run the same tested versions. It allows controlled upgrade testing in non-production environments first, avoids compatibility issues between Redpanda and its components, and provides rollback capability to known-good versions. 
-* `latest` tag: always pulls the newest version -* Version ranges (for example, `v24.2.x`): may auto-update to patch releases -* Unspecified tags: defaults to latest or chart-defined versions +Avoid using the `latest` tag (which always pulls the newest version), version ranges (for example, `v24.2.x` may auto-update to patch releases), or unspecified tags (which default to latest or chart-defined versions). === Default topic replication factor @@ -428,7 +423,7 @@ user-events 16 3 ---- -- -All production topics should have `REPLICAS` of 3 or greater. Topics with single-digit replication are at risk of data loss if a broker fails. +All production topics should have `REPLICAS` of three or greater. Topics with a replication factor lower than three are at risk of data loss or unavailability if a broker fails. See also: xref:manage:cluster-maintenance/topic-property-configuration.adoc#change-topic-replication-factor[Change Topic Replication Factor] @@ -476,7 +471,7 @@ HostPath and emptyDir storage are not suitable for production as they lack durab === RAID/LVM stripe configuration (multiple disks only) -If using multiple physical disks, verify they are configured as RAID-0 or LVM stripe (not linear/concat). +If using multiple physical disks, verify they are configured to stripe data across the disks as RAID-0 or LVM stripe (not linear/concat). Striping distributes data across multiple disks in parallel for improved I/O performance. .Input [,bash] @@ -516,7 +511,7 @@ data 2 256.00k 1 259 1 1 active sync /dev/nvme1n1 ---- -Using LVM linear/concat or JBOD instead of stripe/RAID-0 across multiple disks will severely degrade performance. Each disk must be striped for optimal I/O throughput. Single disk configurations do not require striping. +Using LVM linear/concat or JBOD instead of stripe/RAID-0 across multiple disks will severely degrade performance because data writes are serialized rather than parallelized. 
For optimal I/O throughput, configure multiple disks in a striped array that writes data across all disks simultaneously. Single disk configurations do not require striping. === Storage performance requirements @@ -790,7 +785,7 @@ total 16 -rw------- 1 redpanda redpanda 1704 Dec 15 10:00 tls.key ---- -See also: xref:manage:security/encryption.adoc[TLS Encryption] +See also: xref:manage:kubernetes/security/tls/index.adoc[TLS Encryption] === Authentication enabled @@ -840,7 +835,7 @@ User:app-consumer * GROUP consumer-group-1 READ ALLOW See also: -* xref:manage:security/authentication.adoc[Authentication] +* xref:manage:kubernetes/security/authentication/k-authentication.adoc[Authentication] * xref:manage:security/authorization/index.adoc[Authorization] === Network security @@ -896,7 +891,7 @@ redpanda ClusterIP None 9093/TCP,9644/TC redpanda-external LoadBalancer 10.100.200.50 9093:30001/TCP ---- -See also: xref:manage:security/listener-configuration.adoc[Listener Configuration] +See also: xref:manage:kubernetes/networking/k-configure-listeners.adoc[Listener Configuration] === Pod Disruption Budget @@ -997,11 +992,11 @@ Check that you have configured tuners for optimal performance. Tuners can signif == Recommended requirements -The Recommended requirements checklist confirms that you can monitor and support your environment on a sustained basis. It includes the following checks: +The Recommended requirements checklist ensures that you can monitor and support your environment on a sustained basis, and is appropriate for instances when you need to: -- You have adhered to day-2 operations best practices. -- You can diagnose and recover from issues or failures. -- You have configured monitoring, backup, and security scanning. 
+- Ensure adherence to day-2 operations best practices +- Diagnose and recover from backup issues or failures +- Configure monitoring, backup, and security scanning === Deployment method @@ -1048,12 +1043,7 @@ The presence of a Redpanda custom resource indicates an Operator-managed deploym -- ====== -**Why this matters**: Knowing your deployment method helps determine: - -* Which configuration approach to use (Helm values vs. Redpanda CR) -* How to perform upgrades and rollbacks -* Where to find deployment logs and troubleshooting information -* Which documentation sections apply to your environment +Knowing your deployment method helps determine which configuration approach to use (Helm values vs. Redpanda CR), how to perform upgrades and rollbacks, where to find deployment logs and troubleshooting information, and which documentation sections apply to your environment. === XFS filesystem @@ -1242,21 +1232,9 @@ Collecting configuration... Debug bundle saved to '/tmp/bundle.zip' ---- -**Why this matters**: Debug bundles collect critical diagnostic information including: - -* Cluster configuration and metadata -* Redpanda logs from all brokers -* System resource usage and performance metrics -* Kubernetes resource definitions - -**Common issues to watch for**: - -* Permission errors preventing log collection -* Insufficient disk space for bundle creation -* Network policies blocking bundle transfer -* RBAC restrictions on accessing pod logs or exec +Debug bundles collect critical diagnostic information including cluster configuration and metadata, Redpanda logs from all brokers, system resource usage and performance metrics, and Kubernetes resource definitions. -Testing bundle generation early ensures this critical troubleshooting tool works when you need it most. Debug bundles are often required by Redpanda support to diagnose production issues efficiently. 
+When testing bundle generation, watch for permission errors preventing log collection, insufficient disk space for bundle creation, network policies blocking bundle transfer, or RBAC restrictions on accessing pod logs or exec. Testing bundle generation early ensures this critical troubleshooting tool works when you need it most. Debug bundles are often required by Redpanda support to diagnose production issues efficiently. See also: xref:manage:kubernetes/troubleshooting/k-diagnostics-bundle.adoc[Diagnostics Bundles in Kubernetes] @@ -1297,7 +1275,7 @@ kubectl exec -n -c redpanda -- rpk cluster config get clo kubectl exec -n -c redpanda -- rpk cluster config get cloud_storage_region -X user= -X pass= -X sasl.mechanism= ---- -See also: xref:manage:tiered-storage.adoc[Tiered Storage] +See also: xref:manage:kubernetes/tiered-storage/k-tiered-storage.adoc[Tiered Storage] === Security scanning @@ -1374,7 +1352,7 @@ kubectl exec -n -c redpanda -- rpk topic describe === System log retention -Check that system logs are being captured and stored for an appropriate period of time (minimally, 7 days). Configure log forwarding using tools like Fluentd or your cloud provider's logging solution to send logs to a central location. +Check that system logs are being captured and stored for an appropriate period of time (minimally, seven days). Configure log forwarding using tools like Fluentd or your cloud provider's logging solution to send logs to a central location. === Environment configuration @@ -1450,7 +1428,7 @@ The Advanced requirements checklist ensure full enterprise readiness. This indic === Configure alerts -A standard set of alerts for xref:manage:monitoring.adoc#generate-grafana-dashboard[Grafana] or xref:manage:monitoring.adoc#configure-prometheus[Prometheus] is provided in the https://github.com/redpanda-data/observability[GitHub Redpanda observability repo^]. Customize these alerts for your specific needs. 
+A standard set of alerts for xref:manage:kubernetes/monitoring/k-monitor-redpanda.adoc#generate-grafana-dashboard[Grafana] or xref:manage:kubernetes/monitoring/k-monitor-redpanda.adoc#configure-prometheus[Prometheus] is provided in the https://github.com/redpanda-data/observability[GitHub Redpanda observability repo^]. Customize these alerts for your specific needs. See also: xref:reference:monitor-metrics.adoc[Monitoring Metrics] From 6108c2b3a42600c274c8beeac7ebd22c81af3f59 Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Fri, 23 Jan 2026 21:24:34 -0300 Subject: [PATCH 20/25] apply code review --- .../kubernetes/k-production-readiness.adoc | 163 ++++++------------ 1 file changed, 57 insertions(+), 106 deletions(-) diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc index aaf5fe7e49..fd237222d9 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc @@ -14,11 +14,11 @@ NOTE: For Linux deployments, see the xref:deploy:redpanda/manual/production/prod == Critical requirements -The Critical requirements checklist helps ensure that: +The Critical requirements checklist helps you confirm that: -- All required defaults and configuration items are specified. +- You have specified all required defaults and configuration items. - You have the optimal hardware setup. -- Security is enabled. +- You have enabled security. - You are set up to run in production. === Redpanda license @@ -398,7 +398,7 @@ kubectl exec -n -c redpanda -- rpk cluster config get def Setting `default_topic_replications` to `3` or greater ensures new topics are created with adequate fault tolerance. 
-See also: xref:develop:config-topics.adoc#choose-the-replication-factor[Choose the Replication Factor] +See also: xref:manage:kubernetes/k-manage-topics.adoc#choose-the-replication-factor[Choose the Replication Factor] === Existing topics replication factor @@ -522,12 +522,10 @@ Ensure storage classes provide adequate IOPS and throughput for your workload. * Use NVMe-based storage classes for production deployments * Specify a minimum 16,000 IOPS (Input/Output Operations Per Second) * Consider provisioned IOPS where available to meet or exceed the minimum -* Enabling xref:develop:config-topics.adoc#configure-write-caching[write caching] can help Redpanda perform better in environments with disks that don't meet the recommended IOPS +* Enable xref:develop:config-topics.adoc#configure-write-caching[write caching] to help Redpanda perform better in environments with disks that don't meet the recommended IOPS * NFS (Network File System) is not supported * Test storage performance under load -**Multi-tenant disk warning:** - WARNING: Avoid cloud instance types that use multi-tenant or shared disks, as these can lead to unpredictable performance due to noisy neighbor effects. Examples of instances with shared/multi-tenant storage include AWS is4gen.xlarge and similar instance types across cloud providers. Instead, use instances with dedicated local NVMe storage or provisioned IOPS volumes that guarantee consistent performance. Multi-tenant disks can experience: @@ -542,15 +540,9 @@ See also: * xref:deploy:redpanda/kubernetes/k-requirements.adoc#storage[Storage requirements] * xref:deploy:redpanda/kubernetes/k-requirements.adoc#cloud-instance-types[Cloud Instance Types] -**Volume sizing**:: Plan storage capacity for data growth and retention requirements. -+ -* Account for replication overhead -* Include space for compaction operations -* Monitor disk usage trends - === CPU and memory resource limits -Verify pods have resource requests and limits configured. 
+Verify Pods have resource requests and limits configured. [.side-by-side] -- @@ -576,10 +568,10 @@ kubectl get pod -n -o jsonpath='{.spec.containers[?(@.nam ---- -- -All Redpanda pods must have: +All Redpanda Pods must have: -* CPU requests and limits configured and **identical** (`requests.cpu == limits.cpu`) -* Memory requests and limits configured and **identical** (`requests.memory == limits.memory`) +* **Identical** CPU requests and limits (`requests.cpu == limits.cpu`) +* **Identical** memory requests and limits (`requests.memory == limits.memory`) Setting requests equal to limits ensures the Pod receives the `Guaranteed` QoS class, which prevents CPU throttling and reduces the risk of Pod eviction. @@ -644,13 +636,13 @@ kubectl get redpanda redpanda -n -o jsonpath='{.spec.clusterSpec.res -- ====== -In the examples above, 4 CPU cores with 8 GiB memory provides a 1:2 ratio (2 GiB per core). +In the preceding examples, 4 CPU cores with 8 GiB memory provides a 1:2 ratio (2 GiB per core). === No fractional CPU requests Ensure CPU requests use whole numbers for consistent performance. -Fractional CPUs can lead to performance variability in production. Use whole integer values (4, 8, 16 - not 3.5, 7.5). +Fractional CPUs can lead to performance variability in production. Use whole integer values (`4`, `8`, or `16` are acceptable, while `3.5` or `7.5` are not). Verify CPU configuration: @@ -666,14 +658,6 @@ kubectl get pod -n -o jsonpath='{.spec.containers[?(@.nam 4 ---- -Avoid fractional values like `3500m` (3.5 cores) or `7500m` (7.5 cores). - -**Resource capacity planning**:: Ensure nodes have adequate resources for the configured limits. -+ -* Verify cluster has sufficient total resources -* Account for other workloads on shared nodes -* Plan for resource growth and burst capacity - === Authorization enabled Verify Kafka authorization is enabled for access control. 
@@ -814,7 +798,7 @@ monitoring * Set up SASL authentication for client connections * Configure TLS certificates for encryption (see TLS configuration above) * Implement proper user management with principle of least privilege -* Configure ACLs (Access Control Lists) for resource authorization +* Configure xref:manage:security/authorization/acl.adoc[ACLs (Access Control Lists)] for resource authorization Verify ACLs are configured: @@ -871,7 +855,7 @@ kubectl describe networkpolicy -n * Configure NetworkPolicies to restrict pod-to-pod communication * Use TLS for all client connections (see TLS configuration) -* Secure admin API endpoints with authentication and authorization +* Secure admin API endpoints with xref:manage:kubernetes/security/authentication/k-authentication.adoc[authentication] and xref:manage:security/authorization/index.adoc[authorization] * Limit ingress traffic to only necessary ports and sources * Use Kubernetes Services to control external access @@ -915,47 +899,22 @@ redpanda N/A 1 1 10d Production deployments must have a PodDisruptionBudget with `maxUnavailable: 1` to prevent simultaneous broker disruptions during voluntary operations like node drains, upgrades, or autoscaler actions. -See also: xref:manage:kubernetes/k-pod-disruption-budgets.adoc[Pod Disruption Budgets] +See also: https://kubernetes.io/docs/tasks/run-application/configure-pdb/[Kubernetes Pod Disruption Budgets^] === Rack awareness and topology spread -Configure topology spread constraints to distribute brokers across availability zones. +Configure topology spread constraints to distribute brokers across availability zones. For configuration instructions, see xref:deploy:redpanda/kubernetes/k-high-availability.adoc#multi-az-deployment[Multi-AZ deployment]. -Verify pod distribution across zones: +Production deployments require each Redpanda broker to run in a different availability zone to ensure that a single zone failure does not cause loss of quorum. 
For a three-broker cluster, brokers must be distributed across three separate zones. -.Input -[,bash] ----- -kubectl get pod -n -o wide ----- - -.Output -[,bash,role=no-copy] ----- -NAME READY STATUS NODE ZONE -redpanda-0 2/2 Running node-us-west-2a-1.internal us-west-2a -redpanda-1 2/2 Running node-us-west-2b-1.internal us-west-2b -redpanda-2 2/2 Running node-us-west-2c-1.internal us-west-2c ----- +To verify zone distribution, check your cluster configuration: -Check node availability zone labels: - -.Input -[,bash] ----- -kubectl get nodes --show-labels | grep topology.kubernetes.io/zone ----- - -**Configuration requirements:** - -* Configure `topologySpreadConstraints` to spread pods across zones -* Use node labels for availability zone awareness (typically `topology.kubernetes.io/zone`) -* Prevents single zone failures from affecting multiple brokers +* Verify `topologySpreadConstraints` are configured in your Helm values or Redpanda CR +* Confirm nodes have zone labels (typically `topology.kubernetes.io/zone`) +* Check that brokers are scheduled on nodes in different zones See also: xref:manage:kubernetes/k-rack-awareness.adoc[Rack Awareness] - - === Operator CRDs (Operator deployments only) WARNING: If your deployment uses the Redpanda Operator, all required Custom Resource Definitions (CRDs) must be installed with compatible versions. Without correct CRDs, the Operator cannot manage the cluster, leading to configuration drift, failed updates, and potential data loss. @@ -992,11 +951,11 @@ Check that you have configured tuners for optimal performance. Tuners can signif == Recommended requirements -The Recommended requirements checklist ensures that you can monitor and support your environment on a sustained basis, and is appropriate for instances when you need to: +The Recommended requirements checklist confirms that you can monitor and support your environment on a sustained basis. 
It includes the following checks: -- Ensure adherence to day-2 operations best practices -- Diagnose and recover from backup issues or failures -- Configure monitoring, backup, and security scanning +- You have adhered to day-2 operations best practices. +- You can diagnose and recover from backup issues or failures. +- You have configured monitoring, backup, and security scanning. === Deployment method @@ -1043,11 +1002,11 @@ The presence of a Redpanda custom resource indicates an Operator-managed deploym -- ====== -Knowing your deployment method helps determine which configuration approach to use (Helm values vs. Redpanda CR), how to perform upgrades and rollbacks, where to find deployment logs and troubleshooting information, and which documentation sections apply to your environment. +Knowing your deployment method helps determine which configuration approach to use (Helm values vs. Redpanda CR), how to perform upgrades and rollbacks, where to find deployment logs and troubleshooting information, and which documentation sections apply to your environment. See xref:deploy:redpanda/kubernetes/k-production-workflow.adoc[Production Deployment Workflow] for the complete deployment process. === XFS filesystem -Verify data directories use XFS filesystem for optimal performance. +Verify that data directories use XFS filesystem for optimal performance. [.side-by-side] -- @@ -1067,15 +1026,9 @@ Filesystem Type Size Used Avail Use% Mounted on XFS provides better performance characteristics for Redpanda workloads compared to ext4. While ext4 is supported, XFS is strongly recommended for production deployments. -**Storage performance tuning**:: Optimize storage configuration for production workloads. -+ -* Configure appropriate `vm.swappiness` settings -* Tune filesystem mount options -* Consider storage class performance characteristics - === Pod anti-affinity -Configure pod anti-affinity to spread brokers across nodes. 
+Configure Pod anti-affinity to spread brokers across nodes. [.side-by-side] -- @@ -1105,9 +1058,9 @@ kubectl get statefulset redpanda -n -o jsonpath='{.spec.template.spe ---- -- -This prevents single node failures from affecting multiple brokers by ensuring each Redpanda pod runs on a different node. +This prevents single node failures from affecting multiple brokers by ensuring each Redpanda Pod runs on a different node. -See also: xref:reference:k-redpanda-helm-spec.adoc#statefulset-podantiaffinity[Pod Anti-Affinity] +See also: xref:deploy:redpanda/kubernetes/k-production-deployment.adoc#affinity-rules[Pod Anti-Affinity] === Node isolation @@ -1119,7 +1072,7 @@ Configure taints/tolerations or nodeSelector for workload isolation. kubectl get statefulset redpanda -n -o jsonpath='{.spec.template.spec.nodeSelector}' | jq ---- -Example output showing node isolation: +.Output [,bash,role=no-copy] ---- { @@ -1129,21 +1082,17 @@ Example output showing node isolation: Isolating Redpanda workloads on dedicated nodes improves performance predictability by preventing resource contention with other applications. -**CPU pinning and NUMA awareness**:: Configure CPU affinity for optimal performance on multi-core systems. - -**Memory allocation strategy**:: Optimize memory settings for your workload patterns. - === Partition balancing Configure automatic partition balancing across brokers and CPU cores. **Continuous Data Balancing** -xref:manage:cluster-maintenance/continuous-data-balancing.adoc[Continuous Data Balancing] is a major benefit of Redpanda for managing production deployments. It automatically rebalances partition replicas across brokers based on disk usage and node changes, eliminating manual intervention and preventing performance degradation. 
+xref:manage:cluster-maintenance/continuous-data-balancing.adoc[Continuous Data Balancing] can help you manage production deployments by automatically rebalancing partition replicas across brokers based on disk usage and node changes. It also eliminates manual intervention and prevents performance degradation. -**This feature should be enabled for all licensed production clusters.** +IMPORTANT: You should enable Continuous Data Balancing for all licensed production clusters. -Check continuous data balancing: +Verify that Continuous Data Balancing is configured: .Input [,bash] @@ -1157,7 +1106,7 @@ kubectl exec -n -c redpanda -- rpk cluster config get par continuous ---- -Setting this to `continuous` enables automatic partition rebalancing based on: +The `continuous` setting enables automatic partition rebalancing based on: * Node additions or removals * High disk usage conditions @@ -1167,6 +1116,8 @@ Without Continuous Data Balancing, partition distribution becomes skewed over ti **Core Balancing** +xref:manage:cluster-maintenance/cluster-balancing.adoc#intra-broker-partition-balancing[Intra-broker partition balancing] distributes partition replicas across CPU cores within individual brokers. + Check core balancing for CPU core partition distribution: .Input @@ -1181,11 +1132,11 @@ kubectl exec -n -c redpanda -- rpk cluster config get cor true ---- -When enabled, Redpanda continuously rebalances partitions between CPU cores on a broker for optimal resource utilization, especially beneficial after broker restarts or configuration changes. +When enabled, Redpanda continuously rebalances partitions between CPU cores on a broker for optimal resource utilization, which is especially beneficial after broker restarts or configuration changes. === System requirements -Run system checks to validate optimal configuration. +Run system checks to validate that you are running an optimal configuration. 
[.side-by-side] -- @@ -1210,7 +1161,7 @@ Review any failed checks and remediate before proceeding to production. See xref === Debug bundle -Verify that you can successfully generate and collect a debug bundle from your cluster. This proactive check ensures that when an actual issue occurs and you need to contact Redpanda support, you won't face permission issues or silent collection failures that could delay troubleshooting. +Verify that you can successfully generate and collect a debug bundle from your cluster. This proactive check ensures that if an issue occurs and you need to contact Redpanda support, you won't face permission issues or silent collection failures that could delay troubleshooting. Generate a debug bundle: @@ -1234,13 +1185,13 @@ Debug bundle saved to '/tmp/bundle.zip' Debug bundles collect critical diagnostic information including cluster configuration and metadata, Redpanda logs from all brokers, system resource usage and performance metrics, and Kubernetes resource definitions. -When testing bundle generation, watch for permission errors preventing log collection, insufficient disk space for bundle creation, network policies blocking bundle transfer, or RBAC restrictions on accessing pod logs or exec. Testing bundle generation early ensures this critical troubleshooting tool works when you need it most. Debug bundles are often required by Redpanda support to diagnose production issues efficiently. +When testing bundle generation, watch for permission errors preventing log collection, insufficient disk space for bundle creation, network policies blocking bundle transfer, or RBAC restrictions on accessing Pod logs or exec. Testing bundle generation early ensures this critical troubleshooting tool works when you need it most. Debug bundles are often required by Redpanda support to diagnose production issues efficiently. 
See also: xref:manage:kubernetes/troubleshooting/k-diagnostics-bundle.adoc[Diagnostics Bundles in Kubernetes] === Tiered Storage -Configure Tiered Storage for extended data retention using object storage. Tiered Storage automatically offloads older data to cloud storage (S3, GCS, Azure Blob), enabling extended retention without expanding local disk capacity. +Configure xref:manage:kubernetes/tiered-storage/k-tiered-storage.adoc[Tiered Storage] for extended data retention using object storage. Tiered Storage automatically offloads older data to cloud storage (S3, GCS, Azure Blob), enabling extended retention without expanding local disk capacity. Verify Tiered Storage configuration: @@ -1258,12 +1209,12 @@ true **Benefits of Tiered Storage:** -* Reduces local storage costs by offloading cold data to cheaper object storage -* Enables longer data retention periods without provisioning additional disk +* Reduced local storage costs from offloading cold data to cheaper object storage +* Longer data retention periods without provisioning additional disk * Required for advanced features like Remote Read Replicas and Iceberg integration -* Provides disaster recovery capabilities through cloud-backed data +* Disaster recovery capabilities through cloud-backed data -**Verification steps:** +To verify your Tiered Storage configuration: .Input [,bash] @@ -1279,7 +1230,7 @@ See also: xref:manage:kubernetes/tiered-storage/k-tiered-storage.adoc[Tiered Sto === Security scanning -Regularly scan container images and configurations for vulnerabilities to maintain security posture. +Regularly scan container images and configurations for vulnerabilities to maintain security. 
**Container image scanning:** @@ -1298,13 +1249,13 @@ kubectl get statefulset redpanda -n -o jsonpath='{.spec.template.spe docker.redpanda.com/redpandadata/redpanda:v24.2.4 ---- -**Security scanning practices:** +**Security scanning best practices:** * Scan images using tools like Trivy, Snyk, or cloud-native scanners before deployment * Set up automated scanning in CI/CD pipelines * Monitor for CVE announcements and security advisories -* Keep Redpanda and related components up to date with security patches -* Review Kubernetes RBAC policies and ServiceAccount permissions +* Keep Redpanda and related components up-to-date with security patches (see xref:upgrade:k-rolling-upgrade.adoc[Rolling Upgrades]) +* Review Kubernetes RBAC policies and ServiceAccount permissions (see xref:manage:kubernetes/security/authorization/k-role-controller.adoc[Role Controller]) **Configuration scanning:** @@ -1316,7 +1267,7 @@ kubectl get redpanda,statefulset,deployment -n -o yaml > cluster-con # Use kubesec, kube-bench, or similar tools to scan cluster-config.yaml ---- -Establish a regular cadence for security scanning (e.g., weekly or with each deployment). +Establish a regular cadence for security scanning (for example, weekly or with each deployment). === Backup and recovery @@ -1358,7 +1309,7 @@ See also: xref:manage:kubernetes/tiered-storage/k-whole-cluster-restore.adoc[Who Enable and configure audit logging for compliance and security monitoring requirements. -Verify audit logging configuration: +Verify your audit log configuration: .Input [,bash] @@ -1372,9 +1323,7 @@ kubectl exec -n -c redpanda -- rpk cluster config get aud true ---- -**Audit log verification:** - -Check where audit logs are being written: +Check to ensure you know where audit logs are being written: .Input [,bash] @@ -1407,7 +1356,9 @@ kubectl get servicemonitor -n === System log retention -Check that system logs are being captured and stored for an appropriate period of time (minimally, seven days). 
Configure log forwarding using tools like Fluentd or your cloud provider's logging solution to send logs to a central location. +Check that Redpanda logs are being captured and stored for an appropriate period of time (minimally, seven days). Configure log forwarding using tools like Fluentd or your cloud provider's logging solution to send logs to a central location for troubleshooting and compliance purposes. + +See also: xref:manage:kubernetes/troubleshooting/k-diagnostics-bundle.adoc[Diagnostics Bundles in Kubernetes] === Environment configuration @@ -1419,7 +1370,7 @@ Check that you have an upgrade policy defined and implemented. Redpanda supports == Advanced requirements -The Advanced requirements checklist ensure full enterprise readiness. This indicates that your system is operating at the highest level of availability and can prevent or recover from the most serious incidents. The Advanced requirements checklist confirms the following: +The Advanced requirements checklist ensures full enterprise readiness. Passing it indicates that your system is operating at the highest level of availability and can prevent or recover from the most serious incidents. The Advanced requirements checklist confirms the following: - You are proactively monitoring mission-critical workloads. - You have business continuity solutions in place. @@ -1438,7 +1389,7 @@ Review your deployment automation. Ensure that cluster configuration is managed === Monitor security settings -Regularly review your cluster's security settings using the link:/api/doc/admin/operation/operation-get_security_report[`/v1/security/report`] Admin API endpoint. Investigate and address any issues identified in the alerts section. +Regularly review your cluster's security settings using the `/v1/security/report` link:/api/doc/admin/[Admin API] endpoint. Investigate and address any issues identified in the alerts section.
include::manage:partial$security-report.adoc[] From 48b995649ecd23a8c487f72e31ea49a562167dab Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Fri, 23 Jan 2026 21:55:45 -0300 Subject: [PATCH 21/25] add shadowing to the guide --- .../kubernetes/k-production-readiness.adoc | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc index fd237222d9..f12ec126b1 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc @@ -1294,16 +1294,15 @@ Regularly test recovery procedures to validate RTO/RPO targets: kubectl exec -n -c redpanda -- rpk topic describe -X user= -X pass= -X sasl.mechanism= ---- -**Backup and recovery checklist:** +For mission-critical workloads requiring active disaster recovery, consider implementing xref:manage:kubernetes/shadowing/k-shadow-linking.adoc[Shadowing] to asynchronously replicate data to a standby cluster. Shadowing provides offset-preserving replication that maintains consumer positions, enabling faster recovery with lower RTO compared to restoration from backups. This Enterprise feature (available in Redpanda v25.3 or later) supports cross-region or cross-cloud disaster recovery with automatic failover capabilities. -* Configure and validate Tiered Storage for automatic data backup -* Document recovery procedures for different failure scenarios -* Test cluster recovery procedures in non-production environments -* Establish Recovery Time Objective (RTO) and Recovery Point Objective (RPO) -* Maintain runbooks for disaster recovery scenarios -* Verify IAM roles/permissions for object storage access +Configure and validate Tiered Storage for automatic data backup to object storage. 
Document and regularly test recovery procedures for different failure scenarios in non-production environments. Establish clear Recovery Time Objective (RTO) and Recovery Point Objective (RPO) targets, and maintain runbooks for disaster recovery scenarios. For Shadowing deployments, use the xref:manage:kubernetes/shadowing/k-failover-runbook.adoc[Shadowing Failover Runbook] as a starting point. Verify that IAM roles and permissions for object storage access are correctly configured and tested. -See also: xref:manage:kubernetes/tiered-storage/k-whole-cluster-restore.adoc[Whole Cluster Restore] +See also: + +* xref:manage:kubernetes/tiered-storage/k-whole-cluster-restore.adoc[Whole Cluster Restore] +* xref:manage:kubernetes/shadowing/k-shadow-linking.adoc[Configure Shadowing] +* xref:manage:kubernetes/shadowing/k-failover-runbook.adoc[Shadowing Failover Runbook] === Audit logging From 43e5cb54eb94a949024871a011f01a1535e53937 Mon Sep 17 00:00:00 2001 From: Josh Purcell Date: Thu, 5 Feb 2026 10:26:47 -0600 Subject: [PATCH 22/25] Handle PR 1352 review #1 --- .../pages/redpanda/kubernetes/k-production-readiness.adoc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc index f12ec126b1..0200674ca0 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc @@ -14,7 +14,7 @@ NOTE: For Linux deployments, see the xref:deploy:redpanda/manual/production/prod == Critical requirements -The Critical requirements checklist helps you confirm that: +The Critical requirements checklist helps ensure that: - You have specified all required defaults and configuration items. - You have the optimal hardware setup. @@ -951,7 +951,7 @@ Check that you have configured tuners for optimal performance. 
Tuners can signif == Recommended requirements -The Recommended requirements checklist confirms that you can monitor and support your environment on a sustained basis. It includes the following checks: +The Recommended requirements checklist ensures that you can monitor and support your environment on a sustained basis. It includes the following checks: - You have adhered to day-2 operations best practices. - You can diagnose and recover from backup issues or failures. From 6afb8d9a3d4a72140f50262769da282268648005 Mon Sep 17 00:00:00 2001 From: Josh Purcell Date: Thu, 5 Feb 2026 10:57:36 -0600 Subject: [PATCH 23/25] Handle PR 1352 review #2 --- .../kubernetes/k-production-readiness.adoc | 68 ++++++++++--------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc index 0200674ca0..7506dc9ae7 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc @@ -287,11 +287,10 @@ kubectl get redpanda redpanda -n -o jsonpath='{.metadata.annotations * All Redpanda brokers must run the same version * The Helm Chart or Operator version must be within ±1 minor version of Redpanda version * Example: Redpanda v25.2.x requires Helm/Operator v25.1.x, v25.2.x, or v25.3.x -* Running incompatible versions can lead to deployment failures or cluster instability +* Running incompatible versions can lead to deployment failures or cluster instability. === Version pinning -TIP: Pin specific versions for Redpanda and all related components (Console, Connectors) to prevent unexpected automatic upgrades that could introduce breaking changes or cause downtime during unplanned updates. 
Verify that versions are explicitly pinned in your deployment configuration: @@ -373,9 +372,9 @@ kubectl get redpanda redpanda -n -o yaml | grep -A 1 "tag:" -- ====== -Version pinning prevents automatic upgrades during unintended times (for example, during high-traffic periods) and ensures all environments (dev/staging/prod) run the same tested versions. It allows controlled upgrade testing in non-production environments first, avoids compatibility issues between Redpanda and its components, and provides rollback capability to known-good versions. +Pin specific versions for Redpanda and all related components (Console, Connectors). This ensures all environments (dev/staging/prod) run the same tested versions, allows controlled upgrade testing before production rollout, and provides rollback capability to known-good versions. -Avoid using the `latest` tag (which always pulls the newest version), version ranges (for example, `v24.2.x` may auto-update to patch releases), or unspecified tags (which default to latest or chart-defined versions). +Avoid using the `latest` tag, version ranges (for example, `v24.2.x`), or unspecified tags, as these can result in unexpected upgrades that introduce breaking changes or cause downtime. === Default topic replication factor @@ -450,6 +449,7 @@ datadir-redpanda-2 Bound pvc-c3d4e5f6-g7h8-9012-cdef-gh3456789012 100G -- Verify the StatefulSet uses PersistentVolumeClaims: + .Input [,bash] ---- @@ -515,7 +515,7 @@ Using LVM linear/concat or JBOD instead of stripe/RAID-0 across multiple disks w === Storage performance requirements -Ensure storage classes provide adequate IOPS and throughput for your workload.
+Ensure storage classes provide adequate IOPS and throughput for your workload by using the following specifications when selecting a storage class: **Performance specifications:** @@ -568,10 +568,10 @@ kubectl get pod -n -o jsonpath='{.spec.containers[?(@.nam ---- -- -All Redpanda Pods must have: +All Redpanda Pods **must have**: -* **Identical** CPU requests and limits (`requests.cpu == limits.cpu`) -* **Identical** memory requests and limits (`requests.memory == limits.memory`) +* Identical CPU requests and limits (`requests.cpu == limits.cpu`) +* Identical memory requests and limits (`requests.memory == limits.memory`) Setting requests equal to limits ensures the Pod receives the `Guaranteed` QoS class, which prevents CPU throttling and reduces the risk of Pod eviction. @@ -677,7 +677,9 @@ true ---- -- -Without authorization enabled, any client can access Kafka APIs without authentication. See xref:manage:security/authorization/index.adoc[Authorization] for configuration details. +Without authorization enabled, any client can access Kafka APIs without authentication.
+ +See also: xref:manage:security/authorization/index.adoc[Authorization] === Production mode enabled @@ -745,7 +747,7 @@ redpanda: key_file: /etc/tls/certs/tls.key ---- -**Required TLS listeners:** +Required TLS listeners include: * **kafka_api** - Client connections to Kafka API * **admin_api** - Administrative REST API access @@ -793,10 +795,10 @@ app-consumer monitoring ---- -**Authentication requirements:** +Be sure to adhere to the following authentication requirements: * Set up SASL authentication for client connections -* Configure TLS certificates for encryption (see TLS configuration above) +* Configure TLS certificates for encryption (see preceding TLS configuration guidance) * Implement proper user management with principle of least privilege * Configure xref:manage:security/authorization/acl.adoc[ACLs (Access Control Lists)] for resource authorization @@ -851,7 +853,7 @@ Check NetworkPolicy rules: kubectl describe networkpolicy -n ---- -**Network security requirements:** +Be sure to satisfy the following network security requirements: * Configure NetworkPolicies to restrict pod-to-pod communication * Use TLS for all client connections (see TLS configuration) @@ -919,6 +921,15 @@ See also: xref:manage:kubernetes/k-rack-awareness.adoc[Rack Awareness] WARNING: If your deployment uses the Redpanda Operator, all required Custom Resource Definitions (CRDs) must be installed with compatible versions. Without correct CRDs, the Operator cannot manage the cluster, leading to configuration drift, failed updates, and potential data loss. +The required CRDs are below: + +* `clusters.cluster.redpanda.com` - Manages Redpanda cluster configuration +* `topics.cluster.redpanda.com` - Manages topic lifecycle +* `users.cluster.redpanda.com` - Manages SASL users +* `schemas.cluster.redpanda.com` - Manages Schema Registry schemas + +If any CRDs are missing or incompatible with your Operator version, the Operator will fail to reconcile resources. 
+ Verify all required CRDs are installed: .Input @@ -936,15 +947,6 @@ users.cluster.redpanda.com schemas.cluster.redpanda.com ---- -Required CRDs: - -* `clusters.cluster.redpanda.com` - Manages Redpanda cluster configuration -* `topics.cluster.redpanda.com` - Manages topic lifecycle -* `users.cluster.redpanda.com` - Manages SASL users -* `schemas.cluster.redpanda.com` - Manages Schema Registry schemas - -If any CRDs are missing or incompatible with your Operator version, the Operator will fail to reconcile resources. - === Run Redpanda tuners Check that you have configured tuners for optimal performance. Tuners can significantly impact latency and throughput. In Kubernetes, tuners are configured through the Helm chart or may need to be run on worker nodes themselves. For details, see xref:deploy:redpanda/kubernetes/k-tune-workers.adoc[Tune Kubernetes Worker Nodes for Production]. @@ -979,7 +981,7 @@ NAME NAMESPACE REVISION UPDATED STATUS CH redpanda redpanda 1 2024-01-15 10:30:00.123456 -0800 PST deployed redpanda-5.0.0 v24.1.1 ---- -The presence of a Helm release indicates a Helm-managed deployment. +The presence of a Helm release (`CHART` displays `redpanda-5.0.0`) indicates a Helm-managed deployment. -- Operator:: @@ -1086,7 +1088,7 @@ Isolating Redpanda workloads on dedicated nodes improves performance predictabil Configure automatic partition balancing across brokers and CPU cores. -**Continuous Data Balancing** +==== Continuous Data Balancing xref:manage:cluster-maintenance/continuous-data-balancing.adoc[Continuous Data Balancing] can help you manage production deployments by automatically rebalancing partition replicas across brokers based on disk usage and node changes. It also eliminates manual intervention and prevents performance degradation. 
@@ -1114,7 +1116,7 @@ The `continuous` setting enables automatic partition rebalancing based on: Without Continuous Data Balancing, partition distribution becomes skewed over time, leading to hotspots and manual rebalancing operations. -**Core Balancing** +==== Core Balancing xref:manage:cluster-maintenance/cluster-balancing.adoc#intra-broker-partition-balancing[Intra-broker partition balancing] distributes partition replicas across CPU cores within individual brokers. @@ -1136,7 +1138,7 @@ When enabled, Redpanda continuously rebalances partitions between CPU cores on a === System requirements -Run system checks to validate that you are running an optimal configuration. +Run system checks to get more details regarding your system configuration. [.side-by-side] -- @@ -1207,7 +1209,7 @@ kubectl exec -n -c redpanda -- rpk cluster config get clo true ---- -**Benefits of Tiered Storage:** +==== Benefits of Tiered Storage * Reduced local storage costs from offloading cold data to cheaper object storage * Longer data retention periods without provisioning additional disk @@ -1232,7 +1234,7 @@ See also: xref:manage:kubernetes/tiered-storage/k-tiered-storage.adoc[Tiered Sto Regularly scan container images and configurations for vulnerabilities to maintain security. 
-**Container image scanning:** +==== Container image scanning Verify that container images are scanned before deployment: @@ -1249,7 +1251,9 @@ kubectl get statefulset redpanda -n -o jsonpath='{.spec.template.spe docker.redpanda.com/redpandadata/redpanda:v24.2.4 ---- -**Security scanning best practices:** +==== Security scanning best practices + +Security scanning best practices include: * Scan images using tools like Trivy, Snyk, or cloud-native scanners before deployment * Set up automated scanning in CI/CD pipelines @@ -1257,7 +1261,7 @@ docker.redpanda.com/redpandadata/redpanda:v24.2.4 * Keep Redpanda and related components up-to-date with security patches (see xref:upgrade:k-rolling-upgrade.adoc[Rolling Upgrades]) * Review Kubernetes RBAC policies and ServiceAccount permissions (see xref:manage:kubernetes/security/authorization/k-role-controller.adoc[Role Controller]) -**Configuration scanning:** +==== Configuration scanning .Input [,bash] @@ -1273,7 +1277,7 @@ Establish a regular cadence for security scanning (for example, weekly or with e Implement and test backup and recovery processes to ensure business continuity. -**Backup strategy with Tiered Storage:** +==== Backup strategy with Tiered Storage Tiered Storage provides built-in backup capabilities by storing data in object storage. 
Verify Tiered Storage is configured: @@ -1283,7 +1287,7 @@ Tiered Storage provides built-in backup capabilities by storing data in object s kubectl exec -n -c redpanda -- rpk cluster config get cloud_storage_enabled -X user= -X pass= -X sasl.mechanism= ---- -**Recovery testing:** +==== Recovery testing Regularly test recovery procedures to validate RTO/RPO targets: From f40e935be182192064f946ef7355c69cabcb0df8 Mon Sep 17 00:00:00 2001 From: Josh Purcell Date: Thu, 5 Feb 2026 11:07:25 -0600 Subject: [PATCH 24/25] Handle PR 1352 review #3 --- .../redpanda/kubernetes/k-production-readiness.adoc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc index 7506dc9ae7..b76ea570cb 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc @@ -155,6 +155,8 @@ kubectl get redpanda redpanda -n -o jsonpath='{.spec.clusterSpec.sta -- ====== +See also: <> + === Active broker membership Verify that all brokers are in active state and not being decommissioned. @@ -432,6 +434,7 @@ Verify that you have configured persistent storage (not hostPath or emptyDir) fo [.side-by-side] -- + .Input [,bash] ---- @@ -469,6 +472,8 @@ Volume Claims: HostPath and emptyDir storage are not suitable for production as they lack durability guarantees. +See also: xref:manage:kubernetes/storage/k-persistent-storage.adoc[Persistent Storage] + === RAID/LVM stripe configuration (multiple disks only) If using multiple physical disks, verify they are configured to stripe data across the disks as RAID-0 or LVM stripe (not linear/concat). Striping distributes data across multiple disks in parallel for improved I/O performance. 
@@ -513,6 +518,8 @@ data 2 256.00k Using LVM linear/concat or JBOD instead of stripe/RAID-0 across multiple disks will severely degrade performance because data writes are serialized rather than parallelized. For optimal I/O throughput, configure multiple disks in a striped array that writes data across all disks simultaneously. Single disk configurations do not require striping. +See also: xref:deploy:redpanda/kubernetes/k-production-deployment.adoc#storage[Storage] + === Storage performance requirements Ensure storage classes provide adequate IOPS and throughput for your workload by using the following specifications when selecting a storage class: @@ -638,6 +645,8 @@ kubectl get redpanda redpanda -n -o jsonpath='{.spec.clusterSpec.res In the preceding examples, 4 CPU cores with 8 GiB memory provides a 1:2 ratio (2 GiB per core). +See also: xref:manage:kubernetes/k-manage-resources.adoc#memory[Memory] + === No fractional CPU requests Ensure CPU requests use whole numbers for consistent performance. @@ -1028,6 +1037,8 @@ Filesystem Type Size Used Avail Use% Mounted on XFS provides better performance characteristics for Redpanda workloads compared to ext4. While ext4 is supported, XFS is strongly recommended for production deployments. +See also: xref:deploy:redpanda/manual/production/requirements.adoc#storage[Storage Requirements] + === Pod anti-affinity Configure Pod anti-affinity to spread brokers across nodes.
@@ -1341,6 +1352,8 @@ kubectl exec -n -c redpanda -- rpk topic list -X user= Date: Thu, 5 Feb 2026 11:15:33 -0600 Subject: [PATCH 25/25] Handle PR 1352 build issue #1 --- .../pages/redpanda/kubernetes/k-production-readiness.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc index b76ea570cb..01ae3cd02a 100644 --- a/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc +++ b/modules/deploy/pages/redpanda/kubernetes/k-production-readiness.adoc @@ -1,7 +1,7 @@ = Production Readiness Checklist :description: Comprehensive checklist for validating Redpanda deployments in Kubernetes against production readiness standards. :page-context-links: [{"name": "Linux", "to": "deploy:redpanda/linux/index.adoc" },{"name": "Kubernetes", "to": "deploy:redpanda/kubernetes/index.adoc" } ] -:page-categories: Production, Deployment +:page-categories: Deployment :learning-objective-1: Validate a Kubernetes-deployed Redpanda cluster against production readiness standards Before running a production workload on Redpanda in Kubernetes, follow this readiness checklist.