From 51078142d62c1f746c611d73939ee1979ff813b6 Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Wed, 6 May 2026 12:43:59 +0200 Subject: [PATCH 01/11] feat(valkey): add Valkey cluster addon as a sibling to redis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stand up addons/valkey/ as a cluster-mode-only side-by-side addon, so our Valkey customizations live in their own file tree and never collide with upstream redis evolution. This retires the five post-install Helm hooks in stream-infra (patch-cache-config, patch-maxmemory, patch-prefer-ip, patch-reshard-cm, patch-valkey-image) by baking the equivalent behaviour into the addon at template-level. What's in the addon ------------------- - Single Valkey major (9.x) — no multi-version range loop, no sentinel, no twemproxy. cmpv-valkey-cluster.yaml ships docker.io/valkey/valkey images. dbctl/agamotto stay on apecloud. - ShardingDefinition with `minShards: 1` (provisions 1, 2, 3+ shards, matching how AWS ElastiCache exposes the same engine). - redis.conf tuned for a cache workload at template-level: appendonly no, save "" (no scheduled BGSAVE), io-threads 1 (avoids CFS throttling at our pod CPU limit), latency-monitor-threshold 25 (observability), maxmemory-policy allkeys-lru, maxmemory at 85% of pod memory limit. - valkey-cluster-server-start.sh: emits `cluster-preferred-endpoint-type ip` on the default-network branch (was `hostname`), so CLUSTER SLOTS announces VPC-routable IPs for chat-api and other external clients. - valkey-cluster-manage.sh: skips the legacy `redis-cli --cluster reshard` call on shard scale-out — slot migration is driven by ASM (CLUSTER MIGRATESLOTS via ape-dts) through the OpsDefinition in stream-infra. - valkey-cluster-common.sh: branches `create_redis_cluster` on a single primary to use `CLUSTER ADDSLOTSRANGE 0 16383` (mirroring ElastiCache), bypassing `redis-cli --cluster create` which rejects fewer than 3 masters. 
Lifts the matching guard in initialize_redis_cluster. Function names inside the scripts intentionally keep their `redis_*` identifiers to minimise the diff vs. upstream redis scripts and ease future bug-porting. Settings are global for now — no per-cluster Helm knobs. Add ParametersDefinition / values overrides later if cluster-specific tunings are needed. Verification ------------ - `helm template addons/valkey` renders 5 resources cleanly: ShardingDefinition, ComponentDefinition, ComponentVersion, plus the config + scripts ConfigMap templates. All 9 script files mount. - shellspec for `build_single_shard_addslots_command` and `create_redis_cluster` branch logic: 4 examples, 0 failures. --- addons/valkey/Chart.yaml | 34 + .../valkey/config/valkey-cluster-config.tpl | 123 ++ addons/valkey/scripts-ut-spec/utils.sh | 32 + .../valkey_cluster_common_spec.sh | 114 ++ addons/valkey/templates/_helpers.tpl | 98 ++ .../valkey/templates/cmpd-valkey-cluster.yaml | 555 +++++++++ .../valkey/templates/cmpv-valkey-cluster.yaml | 36 + .../valkey/templates/shardingdefinition.yaml | 30 + .../valkey-cluster-config-template.yaml | 11 + .../valkey-cluster-scripts-template.yaml | 25 + .../reload-parameter.sh | 30 + .../valkey/valkey-cluster-scripts/sync-acl.sh | 52 + .../valkey-cluster-common.sh | 787 ++++++++++++ .../valkey-cluster-manage.sh | 1051 +++++++++++++++++ .../valkey-cluster-replica-member-leave.sh | 111 ++ .../valkey-cluster-replica-pre-stop.sh | 43 + .../valkey-cluster-server-start.sh | 776 ++++++++++++ .../valkey-cluster-switchover.sh | 255 ++++ .../valkey-cluster-scripts/valkey-ping.sh | 69 ++ addons/valkey/values.yaml | 57 + 20 files changed, 4289 insertions(+) create mode 100644 addons/valkey/Chart.yaml create mode 100644 addons/valkey/config/valkey-cluster-config.tpl create mode 100644 addons/valkey/scripts-ut-spec/utils.sh create mode 100644 addons/valkey/scripts-ut-spec/valkey_cluster_common_spec.sh create mode 100644 addons/valkey/templates/_helpers.tpl create mode 
100644 addons/valkey/templates/cmpd-valkey-cluster.yaml create mode 100644 addons/valkey/templates/cmpv-valkey-cluster.yaml create mode 100644 addons/valkey/templates/shardingdefinition.yaml create mode 100644 addons/valkey/templates/valkey-cluster-config-template.yaml create mode 100644 addons/valkey/templates/valkey-cluster-scripts-template.yaml create mode 100644 addons/valkey/valkey-cluster-scripts/reload-parameter.sh create mode 100644 addons/valkey/valkey-cluster-scripts/sync-acl.sh create mode 100644 addons/valkey/valkey-cluster-scripts/valkey-cluster-common.sh create mode 100644 addons/valkey/valkey-cluster-scripts/valkey-cluster-manage.sh create mode 100755 addons/valkey/valkey-cluster-scripts/valkey-cluster-replica-member-leave.sh create mode 100644 addons/valkey/valkey-cluster-scripts/valkey-cluster-replica-pre-stop.sh create mode 100755 addons/valkey/valkey-cluster-scripts/valkey-cluster-server-start.sh create mode 100644 addons/valkey/valkey-cluster-scripts/valkey-cluster-switchover.sh create mode 100755 addons/valkey/valkey-cluster-scripts/valkey-ping.sh create mode 100644 addons/valkey/values.yaml diff --git a/addons/valkey/Chart.yaml b/addons/valkey/Chart.yaml new file mode 100644 index 000000000..93321aa6c --- /dev/null +++ b/addons/valkey/Chart.yaml @@ -0,0 +1,34 @@ +apiVersion: v2 +name: valkey +description: "Valkey is an open-source, high-performance key/value store. This addon provisions Valkey Cluster topologies on KubeBlocks. Valkey speaks the Redis protocol and uses Redis-compatible cluster bootstrap; this is a sibling addon to `redis` so upstream redis evolution does not conflict with our Valkey customizations." + +type: application + +version: 0.1.0 + +appVersion: "9.0.3" + +# Add a dependency to the kubeblocks definition library chart, same as the redis addon. 
+dependencies: + - name: kblib + version: 0.1.0 + repository: file://../kblib + alias: extra + +home: https://valkey.io/ +icon: https://valkey.io/img/Valkey_Logo_Color.svg +keywords: + - valkey + - redis + - database + - nosql + - cluster + +maintainers: + - name: GetStream + url: https://github.com/GetStream/kubeblocks-addons/ + +annotations: + addon.kubeblocks.io/kubeblocks-version: ">=1.0.0" + addon.kubeblocks.io/model: "key-value" + addon.kubeblocks.io/provider: "community" diff --git a/addons/valkey/config/valkey-cluster-config.tpl b/addons/valkey/config/valkey-cluster-config.tpl new file mode 100644 index 000000000..e896c7c8f --- /dev/null +++ b/addons/valkey/config/valkey-cluster-config.tpl @@ -0,0 +1,123 @@ +bind * -::* +tcp-backlog 511 +timeout 0 +ignore-warnings ARM64-COW-BUG +tcp-keepalive 300 +daemonize no +pidfile /var/run/redis_6379.pid +{{ block "logsBlock" . }} +loglevel notice +logfile "/data/running.log" +{{ end }} +databases 16 +always-show-logo no +set-proc-title yes +proc-title-template "{title} {listen-addr} {server-mode}" +stop-writes-on-bgsave-error yes +rdbcompression yes +rdbchecksum yes +dbfilename dump.rdb +rdb-del-sync-files no +dir /data +replica-serve-stale-data yes +replica-read-only yes +repl-diskless-sync yes +repl-diskless-sync-delay 5 +repl-diskless-sync-max-replicas 0 +repl-diskless-load disabled +repl-disable-tcp-nodelay no +replica-priority 100 +acllog-max-len 128 +lazyfree-lazy-eviction no +lazyfree-lazy-expire no +lazyfree-lazy-server-del no +replica-lazy-flush no +lazyfree-lazy-user-del no +lazyfree-lazy-user-flush no +oom-score-adj no +oom-score-adj-values 0 200 800 +disable-thp yes + +# AOF off: fsync on EBS gp3 caused 30-40ms event-loop stalls (LATENCY DOCTOR +# confirmed). Replicas + EBS-mounted nodes.conf give us cluster-topology +# durability, which is all we need for a cache. 
+appendonly no +appendfilename "appendonly.aof" +appenddirname "appendonlydir" +appendfsync everysec +no-appendfsync-on-rewrite no +auto-aof-rewrite-percentage 100 +auto-aof-rewrite-min-size 64mb +aof-load-truncated yes +aof-use-rdb-preamble yes +aof-timestamp-enabled no + +# Disable scheduled BGSAVE forks (default rules tripped every ~90s under our +# load; each fork briefly stalls the event loop). +save "" + +slowlog-log-slower-than 10000 +slowlog-max-len 128 + +# Observability: log event-loop stalls > 25ms. Negligible overhead, big +# diagnostic value (without it, LATENCY DOCTOR returns nothing). +latency-monitor-threshold 25 + +notify-keyspace-events "" +hash-max-listpack-entries 512 +hash-max-listpack-value 64 +list-max-listpack-size -2 +list-compress-depth 0 +set-max-intset-entries 512 +zset-max-listpack-entries 128 +zset-max-listpack-value 64 +hll-sparse-max-bytes 3000 +stream-node-max-bytes 4096 +stream-node-max-entries 100 +activerehashing yes +client-output-buffer-limit normal 0 0 0 +client-output-buffer-limit replica 256mb 64mb 60 +client-output-buffer-limit pubsub 32mb 8mb 60 +hz 10 +dynamic-hz yes +aof-rewrite-incremental-fsync yes +rdb-save-incremental-fsync yes +jemalloc-bg-thread yes +enable-debug-command yes +aclfile /etc/redis/users.acl + +# Single IO thread: at the pod CPU limit, 4 IO threads + main thread caused +# CFS throttling (~8% of periods at 1500m). Our workload is fine +# single-threaded. STARTUP-ONLY (CONFIG SET rejects io-threads). +io-threads 1 +io-threads-do-reads yes + +# configuration for valkey cluster (Redis-protocol compatible) +cluster-enabled yes +cluster-config-file /data/nodes.conf +cluster-allow-replica-migration no +cluster-node-timeout 5000 +cluster-replica-validity-factor 0 +cluster-require-full-coverage yes +cluster-allow-reads-when-down no + +# Eviction policy: allkeys-lru (cache mode — we want eviction across the +# whole keyspace, not just keys with TTLs). 
+maxmemory-policy allkeys-lru +# maxmemory: 85% of the pod memory limit, leaving ~15% headroom for +# connection / replication buffers. Persistence is disabled, so no RDB-fork +# memory doubling concern. +{{- $limit_memory := default 0 $.PHY_MEMORY | int }} +{{- if gt $limit_memory 0 }} +maxmemory {{ mulf $limit_memory 0.85 | int }} +{{- end }} + +{{- if eq (index $ "TLS_ENABLED") "true" }} +tls-cert-file {{ $.TLS_MOUNT_PATH }}/tls.crt +tls-key-file {{ $.TLS_MOUNT_PATH }}/tls.key +tls-ca-cert-file {{ $.TLS_MOUNT_PATH }}/ca.crt +tls-auth-clients no +tls-replication yes +tls-cluster yes +port 0 +{{- end -}} diff --git a/addons/valkey/scripts-ut-spec/utils.sh b/addons/valkey/scripts-ut-spec/utils.sh new file mode 100644 index 000000000..5b2506969 --- /dev/null +++ b/addons/valkey/scripts-ut-spec/utils.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# utils functions for shellspec unit tests + +convert_tpl_to_bash() { + local input_file="$1" + local output_file="$2" + + sed -e '/^{{\/\*$/,/^\*\/}}$/d' \ + -e '/^{{-.*}}/d' \ + -e 's/{{- define ".*" }}//' \ + -e 's/{{- end }}//' \ + "$input_file" >> "$output_file" +} + +generate_common_library() { + local library_file="$1" + + libcommons_tpl_file="../../kblib/templates/_libcommons.tpl" + libpods_tpl_file="../../kblib/templates/_libpods.tpl" + libstrings_tpl_file="../../kblib/templates/_libstrings.tpl" + libenvs_tpl_file="../../kblib/templates/_libenvs.tpl" + libcompvars_tpl_file="../../kblib/templates/_libcompvars.tpl" + libututils_tpl_file="../../kblib/templates/_libututils.tpl" + + convert_tpl_to_bash $libcommons_tpl_file "$library_file" + convert_tpl_to_bash $libpods_tpl_file "$library_file" + convert_tpl_to_bash $libstrings_tpl_file "$library_file" + convert_tpl_to_bash $libenvs_tpl_file "$library_file" + convert_tpl_to_bash $libcompvars_tpl_file "$library_file" + convert_tpl_to_bash $libututils_tpl_file "$library_file" +} \ No newline at end of file diff --git a/addons/valkey/scripts-ut-spec/valkey_cluster_common_spec.sh 
b/addons/valkey/scripts-ut-spec/valkey_cluster_common_spec.sh new file mode 100644 index 000000000..c95ce409f --- /dev/null +++ b/addons/valkey/scripts-ut-spec/valkey_cluster_common_spec.sh @@ -0,0 +1,114 @@ +# shellcheck shell=bash +# shellcheck disable=SC2034 + +# Tightly scoped spec for the Valkey-specific edits to the cluster bootstrap +# helpers. Full coverage of the upstream script behaviour lives in the redis +# addon's spec; here we cover only what the valkey addon adds: +# +# - build_single_shard_addslots_command (new helper for 1-shard provisioning) +# - create_redis_cluster branch on primary_count == 1 + +# validate_shell_type_and_version defined in shellspec/spec_helper.sh used to validate the expected shell type and version this script needs to run. +if ! validate_shell_type_and_version "bash" 4 &>/dev/null; then + echo "valkey_cluster_common_spec.sh skip cases because dependency bash version 4 or higher is not installed." + exit 0 +fi + +source ./utils.sh + +common_library_file="./common.sh" +generate_common_library $common_library_file + +Describe "Valkey Cluster Common Bash Script Tests" + Include $common_library_file + Include ../valkey-cluster-scripts/valkey-cluster-common.sh + + init() { + # ut_mode=true makes unset_xtrace_when_ut_mode_false / set_xtrace_when_ut_mode_false + # no-op so xtrace doesn't leak into stderr expectations. 
+ ut_mode="true" + } + BeforeAll "init" + + cleanup() { + rm -f $common_library_file + } + AfterAll 'cleanup' + + setup_redis_cli_env() { + REDIS_CLI_TLS_CMD="" + } + Before "setup_redis_cli_env" + + Describe "build_single_shard_addslots_command()" + Context "without password" + It "uses CLUSTER ADDSLOTSRANGE 0 16383" + node_endpoint="172.0.0.1:6379" + + When call build_single_shard_addslots_command "$node_endpoint" + The output should eq "redis-cli -h 172.0.0.1 -p 6379 cluster addslotsrange 0 16383" + The stderr should include "initialize single-shard cluster command: redis-cli -h 172.0.0.1 -p 6379 cluster addslotsrange 0 16383" + End + End + + Context "with password" + setup() { + export REDIS_DEFAULT_PASSWORD="password" + } + Before "setup" + + un_setup() { + unset REDIS_DEFAULT_PASSWORD + } + After "un_setup" + + It "passes auth via -a and masks password in log" + node_endpoint="172.0.0.1:6379" + + When call build_single_shard_addslots_command "$node_endpoint" + The output should eq "redis-cli -h 172.0.0.1 -p 6379 -a password cluster addslotsrange 0 16383" + The stderr should include "initialize single-shard cluster command: redis-cli -h 172.0.0.1 -p 6379 -a ******** cluster addslotsrange 0 16383" + End + End + End + + Describe "create_redis_cluster()" + Context "with a single primary" + build_single_shard_addslots_command() { + echo "ADDSLOTS_CMD" + } + build_redis_cluster_create_command() { + echo "MULTI_SHARD_CMD" + } + ADDSLOTS_CMD() { return 0; } + MULTI_SHARD_CMD() { echo "should not be called"; return 1; } + + It "uses the single-shard ADDSLOTS path and skips --cluster create" + primary_nodes="172.0.0.1:6379 " + + When call create_redis_cluster "$primary_nodes" + The status should be success + The stdout should not include "should not be called" + End + End + + Context "with multiple primaries" + build_single_shard_addslots_command() { + echo "ADDSLOTS_CMD" + } + build_redis_cluster_create_command() { + echo "MULTI_SHARD_CMD" + } + ADDSLOTS_CMD() { echo 
"should not be called"; return 1; } + MULTI_SHARD_CMD() { return 0; } + + It "uses the upstream --cluster create path" + primary_nodes="172.0.0.1:6379 172.0.0.2:6379 172.0.0.3:6379 " + + When call create_redis_cluster "$primary_nodes" + The status should be success + The stdout should not include "should not be called" + End + End + End +End diff --git a/addons/valkey/templates/_helpers.tpl b/addons/valkey/templates/_helpers.tpl new file mode 100644 index 000000000..d62d97bd4 --- /dev/null +++ b/addons/valkey/templates/_helpers.tpl @@ -0,0 +1,98 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "valkey.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "valkey.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "valkey.labels" -}} +helm.sh/chart: {{ include "valkey.chart" . }} +{{ include "valkey.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Common annotations +*/}} +{{- define "valkey.annotations" -}} +{{ include "kblib.helm.resourcePolicy" . }} +{{ include "valkey.apiVersion" . }} +apps.kubeblocks.io/skip-immutable-check: "true" +{{- end }} + +{{/* +API version annotation +*/}} +{{- define "valkey.apiVersion" -}} +kubeblocks.io/crd-api-version: apps.kubeblocks.io/v1 +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "valkey.selectorLabels" -}} +app.kubernetes.io/name: {{ include "valkey.name" . 
}} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Define valkey cluster component definition regular expression name prefix +*/}} +{{- define "valkeyCluster.cmpdRegexpPattern" -}} +^valkey-cluster-\d+ +{{- end -}} + +{{/* +Define valkey cluster component script template name +*/}} +{{- define "valkeyCluster.scriptsTemplate" -}} +valkey-cluster-scripts-template-{{ .Chart.Version }} +{{- end -}} + +{{- define "metrics.repository" -}} +{{ .Values.metrics.image.registry | default ( .Values.image.registry | default "docker.io" ) }}/{{ .Values.metrics.image.repository}} +{{- end }} + +{{- define "metrics.image" -}} +{{ .Values.metrics.image.registry | default ( .Values.image.registry | default "docker.io" ) }}/{{ .Values.metrics.image.repository}}:{{ .Values.metrics.image.tag }} +{{- end }} + +{{/* +Generate scripts configmap data block +*/}} +{{- define "valkey-cluster.extend.scripts" -}} +{{- range $path, $_ := $.Files.Glob "valkey-cluster-scripts/**" }} +{{ $path | base }}: |- +{{- $.Files.Get $path | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "valkey.config.reconfigureAction" -}} +reconfigure: + exec: + container: valkey-cluster + targetPodSelector: All + command: + - /bin/sh + - -c + - | + set -eu + + env | cut -d= -f1 | grep -E '^[a-z0-9_.-][a-z0-9_.-]*$' | sort -u | while IFS= read -r param; do + [ -n "${param}" ] || continue + /scripts/reload-parameter.sh "${param}" "$(printenv "${param}")" + done +{{- end -}} diff --git a/addons/valkey/templates/cmpd-valkey-cluster.yaml b/addons/valkey/templates/cmpd-valkey-cluster.yaml new file mode 100644 index 000000000..c9c39b46a --- /dev/null +++ b/addons/valkey/templates/cmpd-valkey-cluster.yaml @@ -0,0 +1,555 @@ +{{- range .Values.valkeyVersions }} +--- +apiVersion: apps.kubeblocks.io/v1 +kind: ComponentDefinition +metadata: + name: {{ printf "%s-%s" .componentDef $.Chart.Version }} + labels: + {{- include "valkey.labels" $ | nindent 4 }} + annotations: + {{- include "valkey.annotations" $ | 
nindent 4 }} +spec: + provider: GetStream + description: Valkey {{ .major }}.x cluster ComponentDefinition (sibling to redis-cluster, with stream-tuned defaults baked in). + serviceKind: valkey-cluster + serviceVersion: {{ .serviceVersion }} + minReadySeconds: 10 + podUpgradePolicy: ReCreate + tls: + volumeName: tls + mountPath: {{ $.Values.tlsMountPath }} + caFile: ca.crt + certFile: tls.crt + keyFile: tls.key + services: + - name: valkey-advertised + serviceName: valkey-advertised + spec: + type: NodePort + ports: + - name: valkey-advertised + port: 6379 + targetPort: valkey-cluster + - name: advertised-bus + port: 16379 + targetPort: cluster-bus + podService: true + disableAutoProvision: true + - name: valkey-lb-advertised + serviceName: valkey-lb-advertised + spec: + type: LoadBalancer + externalTrafficPolicy: Cluster + ports: + - name: valkey-advertised + port: 6379 + targetPort: valkey-cluster + - name: advertised-bus + port: 16379 + targetPort: cluster-bus + podService: true + disableAutoProvision: true + updateStrategy: BestEffortParallel + podManagementPolicy: OrderedReady + volumes: + - name: data + needSnapshot: true + roles: + - name: primary + updatePriority: 2 + participatesInQuorum: false + - name: secondary + updatePriority: 1 + participatesInQuorum: false + logConfigs: + {{- range $name,$pattern := $.Values.logConfigs }} + - name: {{ $name }} + filePathPattern: {{ $pattern }} + {{- end }} + exporter: + containerName: metrics + scrapePath: /metrics + scrapePort: http-metrics + configs: + - name: valkey-cluster-config + template: {{ printf "valkey-cluster-config-template-%s" $.Chart.Version }} + namespace: {{ $.Release.Namespace }} + volumeName: valkey-cluster-config + externalManaged: true + {{- include "valkey.config.reconfigureAction" $ | nindent 6 }} + scripts: + - name: valkey-cluster-scripts + template: {{ include "valkeyCluster.scriptsTemplate" $ }} + namespace: {{ $.Release.Namespace }} + volumeName: scripts + defaultMode: 0555 + {{- include 
"kblib.syncer.policyRules" $ | nindent 2 }} + systemAccounts: + - name: default + initAccount: true + passwordGenerationPolicy: + length: 10 + numDigits: 5 + numSymbols: 0 + letterCase: MixedCases + hostNetwork: + containerPorts: + - container: valkey-cluster + ports: + - valkey-cluster + - cluster-bus + {{- if $.Values.enableMetrics }} + - container: metrics + ports: + - http-metrics + - server-metrics + {{- end }} + vars: + - name: TLS_ENABLED + valueFrom: + tlsVarRef: + enabled: Optional + optional: true + - name: TLS_MOUNT_PATH + value: {{ $.Values.tlsMountPath }} + - name: CLUSTER_NAME + valueFrom: + clusterVarRef: + clusterName: Required + - name: CLUSTER_NAMESPACE + valueFrom: + clusterVarRef: + namespace: Required + - name: COMPONENT_REPLICAS + valueFrom: + componentVarRef: + optional: false + replicas: Required + - name: CLUSTER_DOMAIN + value: {{ $.Values.clusterDomain }} + ## the default username/password of valkey connection (uses Redis-protocol AUTH) + - name: REDIS_DEFAULT_USER + valueFrom: + credentialVarRef: + name: default + username: Required + - name: REDIS_DEFAULT_PASSWORD + valueFrom: + credentialVarRef: + name: default + password: Required + - name: REDIS_REPL_USER + value: "kbreplicator" + - name: REDIS_REPL_PASSWORD + valueFrom: + credentialVarRef: + name: default + password: Required + - name: CURRENT_SHARD_POD_NAME_LIST + valueFrom: + componentVarRef: + optional: false + podNames: Required + - name: CURRENT_SHARD_POD_FQDN_LIST + valueFrom: + componentVarRef: + optional: false + podFQDNs: Required + - name: CURRENT_SHARD_COMPONENT_NAME + valueFrom: + componentVarRef: + optional: false + componentName: Required + - name: CURRENT_SHARD_COMPONENT_SHORT_NAME + valueFrom: + componentVarRef: + optional: false + shortName: Required + - name: CURRENT_SHARD_ADVERTISED_PORT + valueFrom: + serviceVarRef: + name: valkey-advertised + optional: true + port: + name: valkey-advertised + option: Required + - name: CURRENT_SHARD_ADVERTISED_BUS_PORT + 
valueFrom: + serviceVarRef: + name: valkey-advertised + optional: true + port: + name: advertised-bus + option: Required + - name: CURRENT_SHARD_LB_ADVERTISED_HOST + valueFrom: + serviceVarRef: + name: valkey-lb-advertised + optional: true + loadBalancer: Required + host: Required + - name: CURRENT_SHARD_LB_ADVERTISED_PORT + valueFrom: + serviceVarRef: + name: valkey-lb-advertised + optional: true + port: + name: valkey-advertised + option: Required + - name: CURRENT_SHARD_LB_ADVERTISED_BUS_PORT + valueFrom: + serviceVarRef: + name: valkey-lb-advertised + optional: true + port: + name: advertised-bus + option: Required + - name: ALL_SHARDS_COMPONENT_SHORT_NAMES + valueFrom: + componentVarRef: + compDef: {{ printf "%s-%s" .componentDef $.Chart.Version }} + optional: false + shortName: Required + multipleClusterObjectOption: + strategy: combined + - name: ALL_SHARDS_POD_NAME_LIST + valueFrom: + componentVarRef: + compDef: {{ printf "%s-%s" .componentDef $.Chart.Version }} + optional: false + podNames: Required + multipleClusterObjectOption: + strategy: individual + - name: ALL_SHARDS_POD_FQDN_LIST + valueFrom: + componentVarRef: + compDef: {{ printf "%s-%s" .componentDef $.Chart.Version }} + optional: false + podFQDNs: Required + multipleClusterObjectOption: + strategy: individual + - name: ALL_SHARDS_ADVERTISED_PORT + valueFrom: + serviceVarRef: + compDef: {{ printf "%s-%s" .componentDef $.Chart.Version }} + name: valkey-advertised + optional: true + multipleClusterObjectOption: + strategy: combined + combinedOption: + flattenFormat: + delimiter: "." + keyValueDelimiter: "@" + port: + name: valkey-advertised + option: Required + - name: ALL_SHARDS_LB_ADVERTISED_PORT + valueFrom: + serviceVarRef: + compDef: {{ printf "%s-%s" .componentDef $.Chart.Version }} + name: valkey-lb-advertised + optional: true + multipleClusterObjectOption: + strategy: combined + combinedOption: + flattenFormat: + delimiter: "." 
+ keyValueDelimiter: "@" + port: + name: valkey-advertised + option: Required + - name: ALL_SHARDS_LB_ADVERTISED_BUS_PORT + valueFrom: + serviceVarRef: + compDef: {{ printf "%s-%s" .componentDef $.Chart.Version }} + name: valkey-lb-advertised + optional: true + multipleClusterObjectOption: + strategy: combined + combinedOption: + flattenFormat: + delimiter: "." + keyValueDelimiter: "@" + port: + name: advertised-bus + option: Required + - name: ALL_SHARDS_LB_ADVERTISED_HOST + valueFrom: + serviceVarRef: + compDef: {{ printf "%s-%s" .componentDef $.Chart.Version }} + name: valkey-lb-advertised + optional: true + multipleClusterObjectOption: + strategy: combined + combinedOption: + flattenFormat: + delimiter: "," + keyValueDelimiter: "@" + host: Required + loadBalancer: Required + - name: REDIS_CLUSTER_HOST_NETWORK_PORT + valueFrom: + hostNetworkVarRef: + optional: true + container: + name: valkey-cluster + port: + name: valkey-cluster + option: Required + - name: SERVICE_PORT + value: "6379" + expression: {{ `{{if index . "REDIS_CLUSTER_HOST_NETWORK_PORT"}}{{.REDIS_CLUSTER_HOST_NETWORK_PORT}}{{else}}{{.SERVICE_PORT}}{{end}}` | toYaml }} + - name: REDIS_METRICS_ADDR + value: "redis://localhost:$(SERVICE_PORT)" + expression: {{ `{{if eq (index . "TLS_ENABLED") "true"}}rediss://localhost: {{.SERVICE_PORT }}{{else}}redis://localhost:{{.SERVICE_PORT}}{{end}}` | toYaml }} + - name: REDIS_CLI_TLS_CMD + value: "" + expression: {{ `{{if eq (index . "TLS_ENABLED") "true"}}--tls --insecure{{else }}{{end}}` | toYaml }} + - name: REDIS_CLUSTER_HOST_NETWORK_BUS_PORT + valueFrom: + hostNetworkVarRef: + optional: true + container: + name: valkey-cluster + port: + name: cluster-bus + option: Required + - name: CLUSTER_BUS_PORT + value: "16379" + expression: {{ `{{if index . 
"REDIS_CLUSTER_HOST_NETWORK_BUS_PORT"}}{{.REDIS_CLUSTER_HOST_NETWORK_BUS_PORT}}{{else}}{{.CLUSTER_BUS_PORT}}{{end}}` | toYaml }} + {{- if $.Values.enableMetrics }} + - name: REDIS_METRICS_HOST_NETWORK_PORT + valueFrom: + hostNetworkVarRef: + optional: true + container: + name: metrics + port: + name: http-metrics + option: Required + - name: REDIS_METRICS_HTTP_PORT + value: "9121" + expression: {{ `{{if index . "REDIS_METRICS_HOST_NETWORK_PORT"}}{{.REDIS_METRICS_HOST_NETWORK_PORT}}{{else}}{{.REDIS_METRICS_HTTP_PORT}}{{end}}` | toYaml }} + - name: REDIS_METRICS_HOST_NETWORK_SERVER_PORT + valueFrom: + hostNetworkVarRef: + optional: true + container: + name: metrics + port: + name: server-metrics + option: Required + - name: REDIS_METRICS_SERVER_PORT + value: "8888" + expression: {{ `{{if index . "REDIS_METRICS_HOST_NETWORK_SERVER_PORT"}}{{.REDIS_METRICS_HOST_NETWORK_SERVER_PORT}}{{else}}{{.REDIS_METRICS_SERVER_PORT}}{{end}}` | toYaml }} + - name: REDIS_CLUSTER_ALL_SHARDS_HOST_NETWORK_PORT + valueFrom: + hostNetworkVarRef: + compDef: {{ printf "%s-%s" .componentDef $.Chart.Version }} + optional: true + container: + name: valkey-cluster + port: + name: valkey-cluster + option: Required + multipleClusterObjectOption: + strategy: combined + combinedOption: + flattenFormat: + delimiter: "," + keyValueDelimiter: ":" + {{- end }} + - name: REDIS_CLUSTER_ALL_SHARDS_HOST_NETWORK_BUS_PORT + valueFrom: + hostNetworkVarRef: + compDef: {{ printf "%s-%s" .componentDef $.Chart.Version }} + optional: true + container: + name: valkey-cluster + port: + name: cluster-bus + option: Required + multipleClusterObjectOption: + strategy: combined + combinedOption: + flattenFormat: + delimiter: "," + keyValueDelimiter: ":" + - name: PHY_MEMORY + valueFrom: + resourceVarRef: + memoryLimit: Required + lifecycleActions: + roleProbe: + periodSeconds: 1 + timeoutSeconds: 1 + exec: + container: valkey-cluster + env: + - name: CURRENT_POD_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: 
metadata.name + - name: KB_HOST_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.hostIP + - name: KB_POD_FQDN + value: "$(CURRENT_POD_NAME).$(CURRENT_SHARD_COMPONENT_NAME)-headless.$(CLUSTER_NAMESPACE).svc.{{ $.Values.clusterDomain }}" + - name: KB_CLUSTER_COMP_NAME + value: $(CURRENT_SHARD_COMPONENT_NAME) + - name: REDIS_LB_ADVERTISED_HOST + value: $(CURRENT_SHARD_LB_ADVERTISED_HOST) + - name: KB_SERVICE_PORT + value: "$(SERVICE_PORT)" + command: + - /tools/dbctl + - redis + - getrole + postProvision: + timeoutSeconds: 900 + exec: + container: valkey-cluster + command: + - /bin/bash + - -c + - /scripts/valkey-cluster-manage.sh --post-provision > /tmp/post-provision.log 2>&1 + env: + - name: CURRENT_POD_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.name + - name: CURRENT_POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + - name: CURRENT_POD_HOST_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.hostIP + preCondition: RuntimeReady + retryPolicy: + maxRetries: 10 + memberLeave: + exec: + container: valkey-cluster + command: + - /bin/bash + - -c + - /scripts/valkey-cluster-replica-member-leave.sh > /tmp/member-leave.log 2>&1 + retryPolicy: + maxRetries: 10 + switchover: + exec: + container: valkey-cluster + command: + - /bin/bash + - -c + - /scripts/valkey-cluster-switchover.sh > /tmp/switchover.log 2>&1 + memberJoin: + exec: + container: valkey-cluster + command: + - /bin/bash + - -c + - /scripts/sync-acl.sh + targetPodSelector: Any + runtime: + initContainers: + - name: init-dbctl + command: + - cp + - -r + - /bin/dbctl + - /tools/ + imagePullPolicy: {{ default "IfNotPresent" $.Values.dbctlImage.pullPolicy }} + volumeMounts: + - mountPath: /tools + name: tools + containers: + - name: valkey-cluster + imagePullPolicy: {{ default "IfNotPresent" $.Values.image.pullPolicy }} + ports: + - name: valkey-cluster + containerPort: 6379 + - name: cluster-bus + containerPort: 16379 + volumeMounts: + 
- name: data + mountPath: {{ $.Values.dataMountPath }} + - name: valkey-cluster-config + mountPath: /etc/conf + - name: scripts + mountPath: /scripts + - name: redis-conf + mountPath: /etc/redis + - mountPath: /tools + name: tools + env: + - name: CURRENT_POD_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.name + - name: CURRENT_POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + - name: CURRENT_POD_HOST_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.hostIP + - name: POD_FQDN + value: "$(CURRENT_POD_NAME).$(CURRENT_SHARD_COMPONENT_NAME)-headless.$(CLUSTER_NAMESPACE).svc.{{ $.Values.clusterDomain }}" + command: [ "/scripts/valkey-cluster-server-start.sh" ] + readinessProbe: + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 5 + exec: + command: + - sh + - -c + - /scripts/valkey-ping.sh + lifecycle: + preStop: + exec: + command: + - /bin/bash + - -c + - /scripts/valkey-cluster-replica-pre-stop.sh + {{- if $.Values.enableMetrics }} + - name: metrics + imagePullPolicy: {{ $.Values.metrics.image.pullPolicy | quote }} + securityContext: + runAsNonRoot: true + runAsUser: 1001 + env: + - name: REDIS_ADDR + value: "$(REDIS_METRICS_ADDR)" + - name: REDIS_EXPORTER_WEB_LISTEN_ADDRESS + value: "0.0.0.0:$(REDIS_METRICS_HTTP_PORT)" + - name: REDIS_USER + value: $(REDIS_DEFAULT_USER) + - name: REDIS_PASSWORD + value: $(REDIS_DEFAULT_PASSWORD) + - name: REDIS_EXPORTER_IS_CLUSTER + value: "true" + - name: REDIS_EXPORTER_SKIP_TLS_VERIFICATION + value: "true" + ports: + - name: http-metrics + containerPort: {{ $.Values.metrics.service.port }} + - name: server-metrics + containerPort: {{ $.Values.metrics.service.serverPort }} + {{- end }} +{{- end }} diff --git a/addons/valkey/templates/cmpv-valkey-cluster.yaml b/addons/valkey/templates/cmpv-valkey-cluster.yaml new file mode 100644 index 000000000..31ab4b38c --- /dev/null +++ 
b/addons/valkey/templates/cmpv-valkey-cluster.yaml @@ -0,0 +1,36 @@ +apiVersion: apps.kubeblocks.io/v1 +kind: ComponentVersion +metadata: + name: valkey-cluster + labels: + {{- include "valkey.labels" . | nindent 4 }} + annotations: + {{- include "valkey.apiVersion" . | nindent 4 }} +spec: + compatibilityRules: + {{- range .Values.valkeyVersions }} + - compDefs: + - {{ .componentDef }} + releases: + {{- range .mirrorVersions }} + - {{ .version }} + {{- end }} + {{- end }} + releases: + {{- $valkeyRepository := printf "%s/%s" ( .Values.image.registry | default "docker.io" ) .Values.image.repository }} + {{- range .Values.valkeyVersions }} + {{- range .mirrorVersions }} + - name: {{ .version }} + serviceVersion: {{ .version }} + images: + valkey-cluster: {{ $valkeyRepository }}:{{ .imageTag }} + postProvision: {{ $valkeyRepository }}:{{ .imageTag }} + accountProvision: {{ $valkeyRepository }}:{{ .imageTag }} + switchover: {{ $valkeyRepository }}:{{ .imageTag }} + preTerminate: {{ $valkeyRepository }}:{{ .imageTag }} + memberLeave: {{ $valkeyRepository }}:{{ .imageTag }} + memberJoin: {{ $valkeyRepository }}:{{ .imageTag }} + metrics: {{ include "metrics.repository" $ }}:{{ $.Values.metrics.image.tag }} + init-dbctl: {{ $.Values.dbctlImage.registry | default ( $.Values.image.registry | default "docker.io" ) }}/{{ $.Values.dbctlImage.repository }}:{{ $.Values.dbctlImage.tag }} + {{- end }} + {{- end }} diff --git a/addons/valkey/templates/shardingdefinition.yaml b/addons/valkey/templates/shardingdefinition.yaml new file mode 100644 index 000000000..ed1e71c8b --- /dev/null +++ b/addons/valkey/templates/shardingdefinition.yaml @@ -0,0 +1,30 @@ +apiVersion: apps.kubeblocks.io/v1 +kind: ShardingDefinition +metadata: + name: valkey-cluster + labels: + {{- include "valkey.labels" . | nindent 4 }} + annotations: + {{- include "valkey.apiVersion" . | nindent 4 }} +spec: + template: + compDef: {{ include "valkeyCluster.cmpdRegexpPattern" . 
}} + shardsLimit: + minShards: 1 + maxShards: 64 + provisionStrategy: Parallel + updateStrategy: Parallel + systemAccounts: + - name: default + shared: true + lifecycleActions: + shardRemove: + timeoutSeconds: 900 + exec: + container: valkey-cluster + command: + - /bin/bash + - -c + - /scripts/valkey-cluster-manage.sh --pre-terminate > /tmp/pre-terminate.log 2>&1 + retryPolicy: + maxRetries: 10 diff --git a/addons/valkey/templates/valkey-cluster-config-template.yaml b/addons/valkey/templates/valkey-cluster-config-template.yaml new file mode 100644 index 000000000..38ed4e8ff --- /dev/null +++ b/addons/valkey/templates/valkey-cluster-config-template.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ printf "valkey-cluster-config-template-%s" .Chart.Version }} + labels: + {{- include "valkey.labels" . | nindent 4 }} + annotations: + {{- include "valkey.annotations" . | nindent 4 }} +data: + redis.conf: |- + {{- .Files.Get "config/valkey-cluster-config.tpl" | nindent 4 }} diff --git a/addons/valkey/templates/valkey-cluster-scripts-template.yaml b/addons/valkey/templates/valkey-cluster-scripts-template.yaml new file mode 100644 index 000000000..21e0f3c89 --- /dev/null +++ b/addons/valkey/templates/valkey-cluster-scripts-template.yaml @@ -0,0 +1,25 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ printf "valkey-cluster-scripts-template-%s" .Chart.Version }} + labels: + {{- include "valkey.labels" . | nindent 4 }} + annotations: + {{- include "valkey.annotations" . 
| nindent 4 }} +data: + common.sh: |- + #!/bin/bash + {{- include "kblib.commons.call_func_with_retry" $ | nindent 4 }} + {{- include "kblib.commons.extract_obj_ordinal" $ | nindent 4 }} + {{- include "kblib.compvars.get_target_pod_fqdn_from_pod_fqdn_vars" $ | nindent 4 }} + {{- include "kblib.pods.min_lexicographical_order_pod" $ | nindent 4 }} + {{- include "kblib.ututils.set_xtrace_when_ut_mode_false" $ | nindent 4 }} + {{- include "kblib.ututils.unset_xtrace_when_ut_mode_false" $ | nindent 4 }} + {{- include "kblib.ututils.sleep_when_ut_mode_false" $ | nindent 4 }} + {{- include "kblib.strings.contains" $ | nindent 4 }} + {{- include "kblib.strings.is_empty" $ | nindent 4 }} + {{- include "kblib.strings.equals" $ | nindent 4 }} + {{- include "kblib.strings.split" $ | nindent 4 }} + {{- with include "valkey-cluster.extend.scripts" . }} + {{- . | nindent 2 }} + {{- end }} diff --git a/addons/valkey/valkey-cluster-scripts/reload-parameter.sh b/addons/valkey/valkey-cluster-scripts/reload-parameter.sh new file mode 100644 index 000000000..cf0fd7890 --- /dev/null +++ b/addons/valkey/valkey-cluster-scripts/reload-parameter.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e +paramName="" +paramValue="" +for val in $(echo "${1}" | tr ' ' '\n'); do + if [ -z "${paramName}" ]; then + paramName="${val}" + elif [ -z "${paramValue}" ]; then + paramValue="${val}" + else + paramValue="${paramValue} ${val}" + fi +done + +if [ -z "${paramValue}" ]; then + paramValue="${@:2}" +else + paramValue="${paramValue} ${@:2}" +fi + +if [ "$paramValue" = "\"\"" ]; then + paramValue="" +fi +service_port=${SERVICE_PORT:-6379} + +if [ -z $REDIS_DEFAULT_PASSWORD ]; then + redis-cli $REDIS_CLI_TLS_CMD -p $service_port CONFIG SET ${paramName} "${paramValue}" +else + redis-cli $REDIS_CLI_TLS_CMD -p $service_port -a ${REDIS_DEFAULT_PASSWORD} CONFIG SET ${paramName} "${paramValue}" +fi diff --git a/addons/valkey/valkey-cluster-scripts/sync-acl.sh b/addons/valkey/valkey-cluster-scripts/sync-acl.sh new file 
mode 100644 index 000000000..0b6bd0c31 --- /dev/null +++ b/addons/valkey/valkey-cluster-scripts/sync-acl.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +service_port=${SERVICE_PORT:-6379} +redis_base_cmd="redis-cli $REDIS_CLI_TLS_CMD -p $service_port -a $REDIS_DEFAULT_PASSWORD" +if [ -z "$REDIS_DEFAULT_PASSWORD" ]; then + redis_base_cmd="redis-cli $REDIS_CLI_TLS_CMD -p $service_port" +fi + +is_ok=false +acl_list="" +# 1. get acl list from other pods +for pod_fqdn in $(echo "$REDIS_POD_FQDN_LIST" | tr ',' '\n'); do + if [[ "$pod_fqdn" == "$KB_JOIN_MEMBER_POD_FQDN" ]]; then + continue + fi + acl_list=$($redis_base_cmd -h "$pod_fqdn" ACL LIST) + if [ $? -eq 0 ]; then + is_ok=true + break + fi +done + +if [ "$is_ok" = false ]; then + echo "Failed to get ACL LIST from other pods" >&2 + exit 1 +fi + +if [ -z "$acl_list" ]; then + echo "No ACL rules found in other pods, skip synchronization" >&2 + exit 0 +fi + +set -e +# 2. apply acl list to current pod +while IFS= read -r user_rule; do + [[ -z "$user_rule" ]] && continue + + if [[ "$user_rule" =~ ^user[[:space:]]+([^[:space:]]+) ]]; then + username="${BASH_REMATCH[1]}" + else + # skip invalid user rule + continue + fi + + if [[ "$username" == "default" ]]; then + continue + fi + rule_part="${user_rule#user $username }" + $redis_base_cmd -h $KB_JOIN_MEMBER_POD_FQDN ACL SETUSER "$username" $rule_part >&2 +done <<< "$acl_list" + +$redis_base_cmd -h $KB_JOIN_MEMBER_POD_FQDN ACL save >&2 \ No newline at end of file diff --git a/addons/valkey/valkey-cluster-scripts/valkey-cluster-common.sh b/addons/valkey/valkey-cluster-scripts/valkey-cluster-common.sh new file mode 100644 index 000000000..2f41f6388 --- /dev/null +++ b/addons/valkey/valkey-cluster-scripts/valkey-cluster-common.sh @@ -0,0 +1,787 @@ +#!/bin/bash + +# shellcheck disable=SC2153 +# shellcheck disable=SC2207 +# shellcheck disable=SC2034 + +# This is magic for shellspec ut framework. "test" is a `test [expression]` well known as a shell command. 
+# Normally test without [expression] returns false. It means that __() { :; } +# function is defined if this script runs directly. +# +# shellspec overrides the test command and returns true *once*. It means that +# __() function defined internally by shellspec is called. +# +# In other words. If not in test mode, __ is just a comment. If test mode, __ +# is a interception point. +# you should set ut_mode="true" when you want to run the script in shellspec file. +ut_mode="false" +test || __() { + # when running in non-unit test mode, set the options "set -ex". + set -ex; +} + +retry_times=3 +check_ready_times=30 +retry_delay_second=2 + +# usage: sleep_random_second_when_ut_mode_false +sleep_random_second_when_ut_mode_false() { + if [ "false" == "$ut_mode" ]; then + local max_time="$1" + local min_time="$2" + local random_time=$((RANDOM % (max_time - min_time + 1) + min_time)) + echo "Sleeping for $random_time seconds" + sleep "$random_time" + fi +} + +## the component names of all shard +## the value format of ALL_SHARDS_COMPONENT_SHORT_NAMES is like "shard-98x:shard-98x,shard-cq7:shard-cq7,shard-hy7:shard-hy7" +## return the component names of all shards with the format "shard-98x,shard-cq7,shard-hy7" +get_all_shards_components() { + local all_shards_components="" + if is_empty "$ALL_SHARDS_COMPONENT_SHORT_NAMES"; then + echo "Error: Required environment variable ALL_SHARDS_COMPONENT_SHORT_NAMES is not set." 
>&2 + return 1 + fi + IFS=',' read -ra all_shards_component_shortname_pairs <<< "$ALL_SHARDS_COMPONENT_SHORT_NAMES" + for pair in "${all_shards_component_shortname_pairs[@]}"; do + IFS=':' read -r shard_name _ <<< "$pair" + all_shards_components="${all_shards_components},${shard_name}" + done + all_shards_components="${all_shards_components#,}" + echo "$all_shards_components" + return 0 +} + +## the pod names of all shard, there are some environment variables name prefix with "ALL_SHARDS_POD_NAME_LIST" and +## suffix with the shard name, like "ALL_SHARDS_POD_NAME_LIST_SHARD_98X", "ALL_SHARDS_POD_NAME_LIST_SHARD_CQ7", "ALL_SHARDS_POD_NAME_LIST_SHARD_HY7" +## - ALL_SHARDS_POD_NAME_LIST_SHARD_98X="redis-shard-98x-0,redis-shard-98x-1" +## - ALL_SHARDS_POD_NAME_LIST_SHARD_CQ7="redis-shard-cq7-0,redis-shard-cq7-1" +## - ALL_SHARDS_POD_NAME_LIST_SHARD_HY7="redis-shard-hy7-0,redis-shard-hy7-1" +## return the pod names of all shards combined with "," +get_all_shards_pods() { + ## list all Envs name prefix with ALL_SHARDS_POD_NAME_LIST and get them value combined with "," + local envs + local all_shards_pods="" + envs=$(env | grep "^ALL_SHARDS_POD_NAME_LIST" | sort) + while IFS='=' read -r env_name env_value; do + if ! 
is_empty "$env_value"; then + if is_empty "$all_shards_pods"; then + all_shards_pods="$env_value" + else + all_shards_pods="$all_shards_pods,$env_value" + fi + fi + done <<< "$envs" + echo "$all_shards_pods" + return 0 +} + +## the pod fqdn list for all shard pod, it will generate a set of variables with the shard name suffix like: +## - ALL_SHARDS_POD_FQDN_LIST_SHARD_98X="redis-shard-98x-0.redis-shard-98x-headless.default.cluster.local,redis-shard-98x-1.redis-shard-98x-headless.default.cluster.local" +## - ALL_SHARDS_POD_FQDN_LIST_SHARD_CQ7="redis-shard-cq7-0.redis-shard-cq7-headless.default.cluster.local,redis-shard-cq7-1.redis-shard-cq7-headless.default.cluster.local" +## - ALL_SHARDS_POD_FQDN_LIST_SHARD_HY7="redis-shard-hy7-0.redis-shard-hy7-headless.default.cluster.local,redis-shard-hy7-1.redis-shard-hy7-headless.default.cluster.local" +## return the pod fqdn list for all shard pod combined with "," +get_all_shards_pod_fqdns() { + ## list all Envs name prefix with ALL_SHARDS_POD_FQDN_LIST and get them value combined with "," + local envs + local all_shards_pod_fqdns="" + envs=$(env | grep "^ALL_SHARDS_POD_FQDN_LIST" | sort) + while IFS='=' read -r env_name env_value; do + if [[ -n "$env_value" ]]; then + if [[ -z "$all_shards_pod_fqdns" ]]; then + all_shards_pod_fqdns="$env_value" + else + all_shards_pod_fqdns="$all_shards_pod_fqdns,$env_value" + fi + fi + done <<< "$envs" + echo "$all_shards_pod_fqdns" + return 0 +} + +shutdown_redis_server() { + local service_port="$1" + unset_xtrace_when_ut_mode_false + if ! is_empty "$REDIS_DEFAULT_PASSWORD"; then + redis-cli $REDIS_CLI_TLS_CMD -h 127.0.0.1 -p "$service_port" -a "$REDIS_DEFAULT_PASSWORD" shutdown + else + redis-cli $REDIS_CLI_TLS_CMD -h 127.0.0.1 -p "$service_port" shutdown + fi + set_xtrace_when_ut_mode_false + echo "shutdown redis server succeeded!" 
+} + +check_redis_server_ready() { + unset_xtrace_when_ut_mode_false + local host="$1" + local port="$2" + local max_retry=10 + local retry_interval=5 + check_ready_cmd="redis-cli $REDIS_CLI_TLS_CMD -h $host -p $port ping" + if ! is_empty "$REDIS_DEFAULT_PASSWORD"; then + check_ready_cmd="redis-cli $REDIS_CLI_TLS_CMD -h $host -p $port -a $REDIS_DEFAULT_PASSWORD ping" + fi + output=$($check_ready_cmd) + set_xtrace_when_ut_mode_false + status=$? + if [ $status -ne 0 ] || [ "$output" != "PONG" ] ; then + echo "Failed to execute the check ready command: $check_ready_cmd" >&2 + return 1 + fi +} + +parse_advertised_svc_and_port() { + local pod_name="$1" + local advertised_ports="$2" + local svc_and_port="$3" + local pod_name_ordinal + local found=false + + pod_name_ordinal=$(extract_obj_ordinal "$pod_name") + IFS=',' read -ra ports_array <<< "$advertised_ports" + for entry in "${ports_array[@]}"; do + IFS=':' read -ra parts <<< "$entry" + local svc_name="${parts[0]}" + local port="${parts[1]}" + local svc_name_ordinal + + svc_name_ordinal=$(extract_obj_ordinal "$svc_name") + if [[ "$svc_name_ordinal" == "$pod_name_ordinal" ]]; then + if [[ "${svc_and_port}" == "true" ]]; then + echo "$svc_name:$port" + else + echo "$port" + fi + found=true + return 0 + fi + done + + if [[ "$found" == false ]]; then + return 1 + fi +} + +get_pod_service_port_by_network_mode() { + local target_pod_name="$1" + local service_port=${SERVICE_PORT:-6379} + # if redis cluster is using host network, the service port should be the host network port + if ! 
is_empty "$REDIS_CLUSTER_ALL_SHARDS_HOST_NETWORK_PORT"; then + IFS=',' read -ra port_mappings <<< "$REDIS_CLUSTER_ALL_SHARDS_HOST_NETWORK_PORT" + for mapping in "${port_mappings[@]}"; do + shard_name=$(echo "$mapping" | cut -d':' -f1) + mapping_port=$(echo "$mapping" | cut -d':' -f2) + if echo "${target_pod_name}" | grep -q "$shard_name"; then + service_port=$mapping_port + break + fi + done + fi + echo "$service_port" +} + +send_cluster_meet() { + local primary_endpoint="$1" + local primary_port="$2" + local announce_ip="$3" + local announce_port="$4" + local announce_bus_port="$5" + + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + meet_command="redis-cli $REDIS_CLI_TLS_CMD -h $primary_endpoint -p $primary_port cluster meet $announce_ip $announce_port $announce_bus_port" + logging_mask_meet_command="$meet_command" + else + meet_command="redis-cli $REDIS_CLI_TLS_CMD -h $primary_endpoint -p $primary_port -a $REDIS_DEFAULT_PASSWORD cluster meet $announce_ip $announce_port $announce_bus_port" + logging_mask_meet_command="${meet_command/$REDIS_DEFAULT_PASSWORD/********}" + fi + echo "check and correct other primary nodes meet command: $logging_mask_meet_command" + if ! $meet_command + then + echo "Failed to meet the node $announce_ip:$announce_port in check_and_meet_other_primary_nodes" >&2 + return 1 + else + echo "Meet the node $announce_ip:$announce_port successfully with new announce ip $announce_ip..." >&2 + return 0 + fi + set_xtrace_when_ut_mode_false +} + +get_cluster_info() { + local cluster_node="$1" + local cluster_node_port="$2" + unset_xtrace_when_ut_mode_false + local command="redis-cli $REDIS_CLI_TLS_CMD -h $cluster_node -p $cluster_node_port cluster info" + if ! is_empty "$REDIS_DEFAULT_PASSWORD"; then + command="redis-cli $REDIS_CLI_TLS_CMD -h $cluster_node -p $cluster_node_port -a $REDIS_DEFAULT_PASSWORD cluster info" + fi + cluster_info=$($command) + set_xtrace_when_ut_mode_false + status=$? 
+ if [ $status -ne 0 ]; then + echo "Failed to execute the get cluster info command" >&2 + return 1 + fi + echo "$cluster_info" + return 0 +} + +get_cluster_nodes_info() { + local cluster_node="$1" + local cluster_node_port="$2" + unset_xtrace_when_ut_mode_false + local command="redis-cli $REDIS_CLI_TLS_CMD -h $cluster_node -p $cluster_node_port cluster nodes" + if ! is_empty "$REDIS_DEFAULT_PASSWORD"; then + command="redis-cli $REDIS_CLI_TLS_CMD -h $cluster_node -p $cluster_node_port -a $REDIS_DEFAULT_PASSWORD cluster nodes" + fi + cluster_nodes_info=$($command) + set_xtrace_when_ut_mode_false + status=$? + if [ $status -ne 0 ]; then + echo "Failed to execute the get cluster nodes info command" >&2 + return 1 + fi + echo "$cluster_nodes_info" + return 0 +} + +get_cluster_id() { + local cluster_node="$1" + local cluster_node_port="$2" + local pod_fqdn="$3" + cluster_nodes_info=$(get_cluster_nodes_info "$cluster_node" "$cluster_node_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get cluster nodes info in get_cluster_id" >&2 + return 1 + fi + if [ -n "${pod_fqdn}" ]; then + cluster_id=$(echo "$cluster_nodes_info" | grep "${pod_fqdn}" | awk '{print $1}') + else + cluster_id=$(echo "$cluster_nodes_info" | grep "myself" | awk '{print $1}') + fi + echo "$cluster_id" + return 0 +} + +get_cluster_announce_ip() { + local cluster_node="$1" + local cluster_node_port="$2" + cluster_nodes_info=$(get_cluster_nodes_info "$cluster_node" "$cluster_node_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get cluster nodes info in get_cluster_announce_ip" >&2 + return 1 + fi + cluster_announce_ip=$(echo "$cluster_nodes_info" | grep "myself" | awk '{print $2}' | awk -F ':' '{print $1}') + echo "$cluster_announce_ip" + return 0 +} + +check_node_in_cluster() { + local cluster_node="$1" + local cluster_node_port="$2" + local node_name="$3" + cluster_nodes_info=$(get_cluster_nodes_info "$cluster_node" "$cluster_node_port") + status=$? 
+ if [ $status -ne 0 ]; then + echo "Failed to get cluster nodes info in check_node_in_cluster" >&2 + return 1 + fi + # if the cluster_nodes_info contains multiple lines and the node_name is in the cluster_nodes_info, return true + if [ "$(echo "$cluster_nodes_info" | wc -l)" -gt 1 ] && echo "$cluster_nodes_info" | grep -q "$node_name"; then + return 0 + else + return 1 + fi +} + +send_cluster_meet_with_retry() { + local primary_endpoint="$1" + local primary_port="$2" + local announce_ip="$3" + local announce_port="$4" + local announce_bus_port="$5" + send_cluster_meet_result=$(call_func_with_retry $retry_times 10 send_cluster_meet "$primary_endpoint" "$primary_port" "$announce_ip" "$announce_port" "$announce_bus_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to meet the node $announce_ip:$announce_port in check_and_meet_other_primary_nodes after retry" >&2 + return 1 + fi + return 0 +} + +get_cluster_info_with_retry() { + local cluster_node="$1" + local cluster_node_port="$2" + # call the get_cluster_info function with call_func_with_retry function and get the output + cluster_info=$(call_func_with_retry $retry_times $retry_delay_second get_cluster_info "$cluster_node" "$cluster_node_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get the cluster info of the cluster node $cluster_node:$cluster_node_port after retry" >&2 + return 1 + fi + echo "$cluster_info" + return 0 +} + +get_cluster_nodes_info_with_retry() { + local cluster_node="$1" + local cluster_node_port="$2" + # call the get_cluster_nodes_info function with call_func_with_retry function and get the output + cluster_nodes_info=$(call_func_with_retry $retry_times $retry_delay_second get_cluster_nodes_info "$cluster_node" "$cluster_node_port") + status=$? 
+ if [ $status -ne 0 ]; then + echo "Failed to get the cluster nodes info of the cluster node $cluster_node:$cluster_node_port after retry" >&2 + return 1 + fi + echo "$cluster_nodes_info" + return 0 +} + +get_cluster_id_with_retry() { + local cluster_node="$1" + local cluster_node_port="$2" + local pod_fqdn="$3" + # call the execute_get_cluster_id_command function with call_func_with_retry function and get the output + cluster_id=$(call_func_with_retry $retry_times $retry_delay_second get_cluster_id "$cluster_node" "$cluster_node_port" "${pod_fqdn}") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get the cluster id of the cluster node $cluster_node:$cluster_node_port after retry" >&2 + return 1 + fi + echo "$cluster_id" + return 0 +} + +get_cluster_announce_ip_with_retry() { + local cluster_node="$1" + local cluster_node_port="$2" + # call the execute_get_cluster_announce_ip_command function with call_func_with_retry function and get the output + cluster_announce_ip=$(call_func_with_retry $retry_times $retry_delay_second get_cluster_announce_ip "$cluster_node" "$cluster_node_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get the cluster announce ip of the cluster node $cluster_node:$cluster_node_port after retry" >&2 + return 1 + fi + echo "$cluster_announce_ip" + return 0 +} + +check_node_in_cluster_with_retry() { + local cluster_node="$1" + local cluster_node_port="$2" + local node_name="$3" + # call the execute_check_node_in_cluster_command function with call_func_with_retry function and get the output + check_result=$(call_func_with_retry $retry_times $retry_delay_second check_node_in_cluster "$cluster_node" "$cluster_node_port" "$node_name") + status=$? 
+ if [ $status -ne 0 ]; then + echo "Failed to check the node $node_name in the cluster node $cluster_node:$cluster_node_port after retry" >&2 + return 1 + fi + return 0 +} + +check_redis_server_ready_with_retry() { + local host="$1" + local port="$2" + # call the execute_check_redis_server_ready_command function with call_func_with_retry function and get the output + check_result=$(call_func_with_retry $check_ready_times $retry_delay_second check_redis_server_ready "$host" "$port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to check the redis server ready after retry" >&2 + return 1 + fi + return 0 +} + +# check redis cluster all slots are covered +check_slots_covered() { + # cluster_node_endpoint_wth_port is the target node endpoint with port, for example 172.0.0.1:6379 + local node_endpoint_wth_port="$1" + local cluster_service_port="$2" + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + check=$(redis-cli $REDIS_CLI_TLS_CMD --cluster check "$node_endpoint_wth_port" -p "$cluster_service_port") + else + check=$(redis-cli $REDIS_CLI_TLS_CMD --cluster check "$node_endpoint_wth_port" -p "$cluster_service_port" -a "$REDIS_DEFAULT_PASSWORD") + fi + set_xtrace_when_ut_mode_false + if contains "$check" "All 16384 slots covered"; then + return 0 + else + return 1 + fi +} + +# check if the cluster has been initialized +check_cluster_initialized() { + local cluster_pod_fqdn_list="$1" + if is_empty "$cluster_pod_fqdn_list"; then + echo "Error: Required environment variable cluster_pod_fqdn_list is not set." >&2 + return 1 + fi + + local service_port + for pod_fqdn in $(echo "$cluster_pod_fqdn_list" | tr ',' ' '); do + pod_name=${pod_fqdn%%.*} + service_port=$(get_pod_service_port_by_network_mode "${pod_name}") + cluster_info=$(get_cluster_info_with_retry "$pod_fqdn" "$service_port") + status=$? 
+ if [ $status -ne 0 ]; then + echo "Failed to get cluster info in check_cluster_initialized" >&2 + return 1 + fi + cluster_state=$(echo "$cluster_info" | awk -F: '/cluster_state/{print $2}' | tr -d '[:space:]') + if is_empty "$cluster_state" || equals "$cluster_state" "ok"; then + echo "Redis Cluster already initialized" + return 0 + fi + done + echo "Redis Cluster not initialized" >&2 + return 1 +} + +build_redis_cluster_create_command() { + local primary_nodes="$1" + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + initialize_command="redis-cli $REDIS_CLI_TLS_CMD --cluster create $primary_nodes --cluster-yes" + logging_mask_initialize_command="$initialize_command" + else + initialize_command="redis-cli $REDIS_CLI_TLS_CMD --cluster create $primary_nodes -a $REDIS_DEFAULT_PASSWORD --cluster-yes" + logging_mask_initialize_command="${initialize_command/$REDIS_DEFAULT_PASSWORD/********}" + fi + echo "initialize cluster command: $logging_mask_initialize_command" >&2 + set_xtrace_when_ut_mode_false + echo "$initialize_command" +} + +# `redis-cli --cluster create` rejects clusters with fewer than 3 masters. +# For single-shard provisioning we bypass it and assign all 16384 slots to +# the lone primary directly via CLUSTER ADDSLOTSRANGE — same approach AWS +# ElastiCache uses for 1-node Valkey/Redis Cluster topologies. +build_single_shard_addslots_command() { + local node_endpoint="$1" + local host="${node_endpoint%:*}" + local port="${node_endpoint##*:}" + unset_xtrace_when_ut_mode_false + local auth="" + if ! 
is_empty "$REDIS_DEFAULT_PASSWORD"; then + auth="-a $REDIS_DEFAULT_PASSWORD" + fi + initialize_command="redis-cli $REDIS_CLI_TLS_CMD -h $host -p $port $auth cluster addslotsrange 0 16383" + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + logging_mask_initialize_command="$initialize_command" + else + logging_mask_initialize_command="${initialize_command/$REDIS_DEFAULT_PASSWORD/********}" + fi + echo "initialize single-shard cluster command: $logging_mask_initialize_command" >&2 + set_xtrace_when_ut_mode_false + echo "$initialize_command" +} + +build_secondary_replicated_command() { + local secondary_endpoint_with_port="$1" + local mapping_primary_endpoint_with_port="$2" + local mapping_primary_cluster_id="$3" + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + replicated_command="redis-cli $REDIS_CLI_TLS_CMD --cluster add-node $secondary_endpoint_with_port $mapping_primary_endpoint_with_port --cluster-slave --cluster-master-id $mapping_primary_cluster_id" + logging_mask_replicated_command="$replicated_command" + else + replicated_command="redis-cli $REDIS_CLI_TLS_CMD --cluster add-node $secondary_endpoint_with_port $mapping_primary_endpoint_with_port --cluster-slave --cluster-master-id $mapping_primary_cluster_id -a $REDIS_DEFAULT_PASSWORD" + logging_mask_replicated_command="${replicated_command/$REDIS_DEFAULT_PASSWORD/********}" + fi + echo "initialize cluster secondary add-node command: $logging_mask_replicated_command" >&2 + set_xtrace_when_ut_mode_false + echo "$replicated_command" +} + +build_scale_out_shard_primary_join_command() { + local scale_out_shard_default_primary_endpoint_with_port="$1" + local exist_available_node="$2" + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + add_node_command="redis-cli $REDIS_CLI_TLS_CMD --cluster add-node $scale_out_shard_default_primary_endpoint_with_port $exist_available_node" + logging_mask_add_node_command="$add_node_command" + else + 
add_node_command="redis-cli $REDIS_CLI_TLS_CMD --cluster add-node $scale_out_shard_default_primary_endpoint_with_port $exist_available_node -a $REDIS_DEFAULT_PASSWORD" + logging_mask_add_node_command="${add_node_command/$REDIS_DEFAULT_PASSWORD/********}" + fi + echo "scale out shard primary add-node command: $logging_mask_add_node_command" >&2 + set_xtrace_when_ut_mode_false + echo "$add_node_command" +} + +build_reshard_command() { + local primary_node_with_port="$1" + local mapping_primary_cluster_id="$2" + local slots_per_shard="$3" + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + reshard_command="redis-cli $REDIS_CLI_TLS_CMD --cluster reshard $primary_node_with_port --cluster-from all --cluster-to $mapping_primary_cluster_id --cluster-slots $slots_per_shard --cluster-yes" + logging_mask_reshard_command="$reshard_command" + else + reshard_command="redis-cli $REDIS_CLI_TLS_CMD --cluster reshard $primary_node_with_port --cluster-from all --cluster-to $mapping_primary_cluster_id --cluster-slots $slots_per_shard -a $REDIS_DEFAULT_PASSWORD --cluster-yes" + logging_mask_reshard_command="${reshard_command/$REDIS_DEFAULT_PASSWORD/********}" + fi + echo "scale out shard reshard command: $logging_mask_reshard_command" >&2 + set_xtrace_when_ut_mode_false + echo "$reshard_command" +} + +build_rebalance_to_zero_command() { + local node_with_port="$1" + local node_cluster_id="$2" + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + rebalance_command="redis-cli $REDIS_CLI_TLS_CMD --cluster rebalance $node_with_port --cluster-weight $node_cluster_id=0 --cluster-yes " + logging_mask_rebalance_command="$rebalance_command" + else + rebalance_command="redis-cli $REDIS_CLI_TLS_CMD --cluster rebalance $node_with_port --cluster-weight $node_cluster_id=0 --cluster-yes -a $REDIS_DEFAULT_PASSWORD" + logging_mask_rebalance_command="${rebalance_command/$REDIS_DEFAULT_PASSWORD/********}" + fi + echo "set current component slot to 
0 by rebalance command: $logging_mask_rebalance_command" >&2 + set_xtrace_when_ut_mode_false + echo "$rebalance_command" +} + +build_del_node_command() { + local available_node="$1" + local node_to_del_cluster_id="$2" + local do_forget_node="$3" + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + del_node_command="redis-cli $REDIS_CLI_TLS_CMD --cluster del-node $available_node $node_to_del_cluster_id -p $SERVICE_PORT" + if [[ "$do_forget_node" == "true" ]]; then + del_node_command="redis-cli $REDIS_CLI_TLS_CMD -p $SERVICE_PORT --cluster call $available_node cluster forget $node_to_del_cluster_id" + fi + logging_mask_del_node_command="$del_node_command" + else + del_node_command="redis-cli $REDIS_CLI_TLS_CMD --cluster del-node $available_node $node_to_del_cluster_id -p $SERVICE_PORT -a $REDIS_DEFAULT_PASSWORD" + if [[ "$do_forget_node" == "true" ]]; then + del_node_command="redis-cli $REDIS_CLI_TLS_CMD -p $SERVICE_PORT --cluster call $available_node cluster forget $node_to_del_cluster_id -a $REDIS_DEFAULT_PASSWORD" + fi + logging_mask_del_node_command="${del_node_command/$REDIS_DEFAULT_PASSWORD/********}" + fi + echo "del node command: $logging_mask_del_node_command" >&2 + set_xtrace_when_ut_mode_false + echo "$del_node_command" +} + +build_acl_save_command() { + local service_port="$1" + unset_xtrace_when_ut_mode_false + if ! 
is_empty "$REDIS_DEFAULT_PASSWORD"; then + acl_save_command="redis-cli $REDIS_CLI_TLS_CMD -h localhost -p $service_port -a $REDIS_DEFAULT_PASSWORD acl save" + logging_mask_acl_save_command="${acl_save_command/$REDIS_DEFAULT_PASSWORD/********}" + else + acl_save_command="redis-cli $REDIS_CLI_TLS_CMD -h localhost -p $service_port acl save" + logging_mask_acl_save_command="$acl_save_command" + fi + echo "acl save command: $logging_mask_acl_save_command" >&2 + set_xtrace_when_ut_mode_false + echo "$acl_save_command" +} + +create_redis_cluster() { + local primary_nodes="$1" + local primary_count + primary_count=$(echo "$primary_nodes" | wc -w | tr -d ' ') + if [ "$primary_count" -eq 1 ]; then + local single_node="${primary_nodes% }" + initialize_command=$(build_single_shard_addslots_command "$single_node") + else + initialize_command=$(build_redis_cluster_create_command "$primary_nodes") + fi + if ! $initialize_command; then + echo "Failed to create Valkey Cluster" >&2 + return 1 + fi + return 0 +} + +secondary_replicated_to_primary() { + local secondary_endpoint_with_port="$1" + local mapping_primary_endpoint_with_port="$2" + local mapping_primary_cluster_id="$3" + replicated_command=$(build_secondary_replicated_command "$secondary_endpoint_with_port" "$mapping_primary_endpoint_with_port" "$mapping_primary_cluster_id") + replicated_output=$($replicated_command) + replicated_exit_code=$? + if [ $replicated_exit_code -ne 0 ]; then + echo "Failed to replicate the secondary node $secondary_endpoint_with_port to the primary node $mapping_primary_endpoint_with_port" >&2 + return 1 + fi + echo "$replicated_output" + return 0 +} + +scale_out_shard_primary_join_cluster() { + local scale_out_shard_default_primary_endpoint_with_port="$1" + local exist_available_node="$2" + add_node_command=$(build_scale_out_shard_primary_join_command "$scale_out_shard_default_primary_endpoint_with_port" "$exist_available_node") + if ! 
$add_node_command; then + echo "Failed to add the node $scale_out_shard_default_primary_endpoint_with_port to the cluster when scale_out_shard_primary_join_cluster" >&2 + return 1 + fi + return 0 +} + +scale_out_shard_reshard() { + local primary_node_with_port="$1" + local mapping_primary_cluster_id="$2" + local slots_per_shard="$3" + reshard_command=$(build_reshard_command "$primary_node_with_port" "$mapping_primary_cluster_id" "$slots_per_shard") + if ! $reshard_command; then + echo "Failed to reshard the cluster when scale_out_shard_reshard" >&2 + return 1 + fi + return 0 +} + +scale_in_shard_rebalance_to_zero() { + local node_with_port="$1" + local node_cluster_id="$2" + rebalance_command=$(build_rebalance_to_zero_command "$node_with_port" "$node_cluster_id") + if ! $rebalance_command; then + echo "Failed to rebalance the cluster when scale_in_shard_rebalance_to_zero" >&2 + return 1 + fi + return 0 +} + +scale_in_shard_del_node() { + local available_node="$1" + local node_to_del_cluster_id="$2" + del_node_command=$(build_del_node_command "$available_node" "$node_to_del_cluster_id") + if ! $del_node_command; then + echo "Failed to delete the node $available_node from the cluster when scale_in_shard_del_node" >&2 + return 1 + fi + return 0 +} + +secondary_member_leave_del_node() { + local available_node="$1" + local node_to_del_cluster_id="$2" + local do_forget_node="$3" + del_node_command=$(build_del_node_command "$available_node" "$node_to_del_cluster_id" "$do_forget_node") + if ! $del_node_command; then + echo "Failed to delete the node $available_node from the cluster when secondary_member_leave_del_node" >&2 + return 1 + fi + return 0 +} + +secondary_member_leave_del_node_with_retry() { + local available_node="$1" + local node_to_del_cluster_id="$2" + local do_forget_node="$3" + check_result=$(call_func_with_retry $check_ready_times $retry_delay_second secondary_member_leave_del_node "$available_node" "$node_to_del_cluster_id" "$do_forget_node") + status=$? 
+ if [ $status -ne 0 ]; then + echo "Failed to remove replica when member leave after retry" >&2 + return 1 + fi + return 0 +} + +execute_acl_save() { + local service_port="$1" + acl_save_command=$(build_acl_save_command "$service_port") + if ! $acl_save_command; then + echo "Failed to execute acl save command" >&2 + return 1 + fi + return 0 +} + +execute_acl_save_with_retry() { + local service_port="$1" + check_result=$(call_func_with_retry $check_ready_times $retry_delay_second execute_acl_save $service_port) + status=$? + if [ $status -ne 0 ]; then + echo "Failed to execute acl save command after retry" >&2 + return 1 + fi + return 0 +} + +check_redis_role() { + local host=$1 + local port=$2 + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + role_info=$(redis-cli $REDIS_CLI_TLS_CMD -h $host -p $port info replication) + else + role_info=$(redis-cli $REDIS_CLI_TLS_CMD -h $host -p $port -a "$REDIS_DEFAULT_PASSWORD" info replication) + fi + set_xtrace_when_ut_mode_false + + if echo "$role_info" | grep -q "^role:master"; then + echo "primary" + elif echo "$role_info" | grep -q "^role:slave"; then + echo "secondary" + else + echo "unknown" + fi +} + +redis_config_get() { + local host=$1 + local port=$2 + local password=$3 + local command=$4 + + local output + unset_xtrace_when_ut_mode_false + if ! is_empty "$password"; then + output=$(redis-cli $REDIS_CLI_TLS_CMD -h "$host" -p "$port" -a "$password" $command) + else + output=$(redis-cli $REDIS_CLI_TLS_CMD -h "$host" -p "$port" $command) + fi + local status=$? + set_xtrace_when_ut_mode_false + + if [[ $status -ne 0 ]]; then + echo "Command failed with status $status." >&2 + return 1 + fi + + if [[ -z "$output" ]]; then + echo "Command returned no output." 
>&2 + return 1 + fi + + echo "$output" + return 0 +} + +forget_fail_node_when_cluster_is_ok() { + local host=$1 + local port=$2 + unset_xtrace_when_ut_mode_false + cluster_info=$(get_cluster_info_with_retry "$host" "$port") + cluster_state=$(echo "$cluster_info" | awk -F: '/cluster_state/{print $2}' | tr -d '[:space:]') + if [[ "$cluster_state" != "ok" ]]; then + echo "Cluster state is not ok, skip forget fail node" + set_xtrace_when_ut_mode_false + return 0 + fi + cluster_nodes_info=$(get_cluster_nodes_info "$host" "$port") + while read -r line; do + node_id=$(echo "$line" | awk '{print $1}') + node_role=$(echo "$line" | awk '{print $3}') + if [[ "$node_role" == "fail" ]]; then + if [ -z ${REDIS_DEFAULT_PASSWORD} ]; then + redis-cli -h $host -p $port --cluster call $host:$port cluster forget ${node_id} + else + redis-cli -h $host -p $port --cluster call $host:$port cluster forget ${node_id} -a ${REDIS_DEFAULT_PASSWORD} + fi + fi + done <<< "$cluster_nodes_info" + set_xtrace_when_ut_mode_false +} \ No newline at end of file diff --git a/addons/valkey/valkey-cluster-scripts/valkey-cluster-manage.sh b/addons/valkey/valkey-cluster-scripts/valkey-cluster-manage.sh new file mode 100644 index 000000000..b78d5aae6 --- /dev/null +++ b/addons/valkey/valkey-cluster-scripts/valkey-cluster-manage.sh @@ -0,0 +1,1051 @@ +#!/bin/bash + +# shellcheck disable=SC2128 +# shellcheck disable=SC2207 +# shellcheck disable=SC1090 + +# This is magic for shellspec ut framework. "test" is a `test [expression]` well known as a shell command. +# Normally test without [expression] returns false. It means that __() { :; } +# function is defined if this script runs directly. +# +# shellspec overrides the test command and returns true *once*. It means that +# __() function defined internally by shellspec is called. +# +# In other words. If not in test mode, __ is just a comment. If test mode, __ +# is a interception point. 
+# +# you should set ut_mode="true" when you want to run the script in shellspec file. +# +# shellcheck disable=SC2034 +ut_mode="false" +test || __() { + # when running in non-unit test mode, set the options "set -ex". + set -ex; +} + +# declare the global variables for initialize redis cluster +declare -gA initialize_redis_cluster_primary_nodes +declare -gA initialize_redis_cluster_secondary_nodes +declare -gA initialize_pod_name_to_advertise_host_port_map + +# declare the global variables for scale out redis cluster shard +declare -gA scale_out_shard_default_primary_node +declare -gA scale_out_shard_default_other_nodes +network_mode="default" + +init_environment(){ + if [[ -z "${CURRENT_SHARD_ADVERTISED_PORT}" ]]; then + CURRENT_SHARD_ADVERTISED_PORT="${CURRENT_SHARD_LB_ADVERTISED_PORT}" + fi + if [[ -z "${CURRENT_SHARD_ADVERTISED_BUS_PORT}" ]]; then + CURRENT_SHARD_ADVERTISED_BUS_PORT="${CURRENT_SHARD_LB_ADVERTISED_BUS_PORT}" + fi + if [[ -z "${ALL_SHARDS_ADVERTISED_PORT}" ]]; then + ALL_SHARDS_ADVERTISED_PORT="${ALL_SHARDS_LB_ADVERTISED_PORT}" + fi + if [[ -z "${ALL_SHARDS_ADVERTISED_BUS_PORT}" ]]; then + ALL_SHARDS_ADVERTISED_BUS_PORT="${ALL_SHARDS_LB_ADVERTISED_BUS_PORT}" + fi + # determine cluster network mode + if [[ -n "$ALL_SHARDS_ADVERTISED_PORT" ]]; then + network_mode="advertised_svc" + elif [[ -n "$REDIS_CLUSTER_ALL_SHARDS_HOST_NETWORK_PORT" ]]; then + network_mode="host_network" + fi + KB_CLUSTER_POD_NAME_LIST=$(get_all_shards_pods) + KB_CLUSTER_POD_FQDN_LIST=$(get_all_shards_pod_fqdns) + KB_CLUSTER_COMPONENT_LIST=$(get_all_shards_components) +} + +load_redis_cluster_common_utils() { + # the common.sh and redis-cluster-common.sh scripts are defined in the redis-cluster-scripts-template configmap + # and are mounted to the same path which defined in the cmpd.spec.scripts + kblib_common_library_file="/scripts/common.sh" + redis_cluster_common_library_file="/scripts/valkey-cluster-common.sh" + source "${kblib_common_library_file}" + source 
"${redis_cluster_common_library_file}" +} + +check_initialize_nodes_ready() { + local nodes=("$@") + for node in "${nodes[@]}"; do + local host port + host=$(echo "$node" | cut -d':' -f1) + port=$(echo "$node" | cut -d':' -f2) + if ! check_redis_server_ready_with_retry "$host" "$port"; then + return 1 + fi + done + return 0 +} + +# initialize the other component and pods info +init_other_components_and_pods_info() { + local current_component="$1" + local all_pod_fqdn_list="$2" + local all_component_list="$3" + + other_components=() + other_component_pod_names=() + other_component_nodes=() + echo "init other components and pods info, current component: $current_component" + # filter out the components of the given component + IFS=',' read -ra components <<< "$all_component_list" + for comp in "${components[@]}"; do + if contains "$comp" "$current_component"; then + echo "skip the component $comp as it is the current component" + continue + fi + other_components+=("$comp") + done + + # filter out the pods of the given component + for pod_fqdn in $(echo "$all_pod_fqdn_list" | tr ',' '\n'); do + pod_name=${pod_fqdn%%.*} + if echo "$pod_name" | grep "$current_component-"; then + echo "skip the pod $pod_name as it belongs the component $current_component" + continue + fi + + other_component_pod_names+=("$pod_name") + + local pod_service_port + pod_service_port=$(get_pod_service_port_by_network_mode "$pod_name") + other_component_nodes+=("$pod_fqdn:$pod_service_port") + done + + echo "other_components: ${other_components[*]}" + echo "other_component_pod_names: ${other_component_pod_names[*]}" + echo "other_component_nodes: ${other_component_nodes[*]}" +} + +find_exist_available_node() { + local node_ip + local node_port + for node in "${other_component_nodes[@]}"; do + # the $node is the headless address by default, we should get the real node address from cluster nodes + node_ip=$(echo "$node" | cut -d':' -f1) + node_port=$(echo "$node" | cut -d':' -f2) + if 
check_slots_covered "$node" "$node_port"; then + # the $node is the headless address by default, we should get the real node address from cluster nodes + cluster_nodes_info=$(get_cluster_nodes_info "$node_ip" "$node_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get cluster nodes info in find_exist_available_node" >&2 + exit 1 + fi + # grep my self node and return the nodeIp:port(it may be the announceIp and announcePort, for example when cluster enable NodePort/LoadBalancer service) + available_node_with_port=$(echo "$cluster_nodes_info" | grep "myself" | awk '{print $2}' | cut -d'@' -f1) + echo "$available_node_with_port" + return + fi + done + echo "" +} + +extract_pod_name_prefix() { + local pod_name="$1" + # shellcheck disable=SC2001 + prefix=$(echo "$pod_name" | sed 's/-[0-9]*$//') + echo "$prefix" +} + +extract_lb_host_by_svc_name() { + local svc_name="$1" + for lb_composed_name in $(echo "$ALL_SHARDS_LB_ADVERTISED_HOST" | tr ',' '\n' ); do + lb_composed_name=${lb_composed_name#*@} + if [[ ${lb_composed_name} == *":"* ]]; then + if [[ ${lb_composed_name%:*} == "$svc_name" ]]; then + echo "${lb_composed_name#*:}" + break + fi + else + break + fi + done +} + +# get the current component primary node and other nodes for scale in +get_current_comp_nodes_for_scale_in() { + + parse_node_line_info() { + local line="$1" + + local node_ip_port_fields + # 10.42.0.227:6379@16379,redis-shard-sxj-0.redis-shard-sxj-headless.default.svc.cluster.local + node_ip_port_fields=$(echo "$line" | awk '{print $2}') + + local node_ip_port + # ip:port without bus port + node_ip_port=$(echo "$node_ip_port_fields" | awk -F '@' '{print $1}') + + local node_ip + node_ip=$(echo "$node_ip_port" | cut -d':' -f1) + + local node_port + node_port=$(echo "$node_ip_port" | cut -d':' -f2) + + local node_fqdn + # redis-shard-sxj-0.redis-shard-sxj-headless.default.svc + node_fqdn=$(echo "$line" | awk '{print $2}' | awk -F ',' '{print $2}') + + local node_role + node_role=$(echo 
"$line" | awk '{print $3}') + + echo "$node_ip $node_port $node_role $node_fqdn" + } + + get_node_address_by_network_mode() { + local node_ip="$1" + local node_port="$2" + local node_fqdn="$3" + + case "$network_mode" in + "advertised_svc") + echo "$node_ip:$node_port" + ;; + "host_network") + echo "$node_ip:$REDIS_CLUSTER_HOST_NETWORK_PORT" + ;; + *) + # shellcheck disable=SC2153 + echo "$node_fqdn:$SERVICE_PORT" + ;; + esac + } + + categorize_node() { + local node_address="$1" + local node_role="$2" + local belong_current_comp="$3" + + if [[ "$belong_current_comp" == "true" ]]; then + if [[ "$node_role" =~ "master" && ! "$node_role" =~ "fail" ]]; then + current_comp_primary_node+=("$node_address") + else + current_comp_other_nodes+=("$node_address") + fi + fi + } + + local cluster_node="$1" + local cluster_node_port="$2" + cluster_nodes_info=$(get_cluster_nodes_info "$cluster_node" "$cluster_node_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get cluster nodes info in get_current_comp_nodes_for_scale_in" >&2 + return 1 + fi + + current_comp_primary_node=() + current_comp_other_nodes=() + + # if the cluster_nodes_info contains only one line, it means that the cluster not be initialized + if [ "$(echo "$cluster_nodes_info" | wc -l)" -eq 1 ]; then + echo "Cluster nodes info contains only one line, returning..." 
+ return + fi + + # prepare CURRENT_SHARD_HOST_OR_PORT_LIST for advertised_svc mode + CURRENT_SHARD_HOST_OR_PORT_LIST=() + if [ "$network_mode" == "advertised_svc" ]; then + IFS=',' read -ra CURRENT_POD_LIST <<< "$CURRENT_SHARD_POD_NAME_LIST" + for pod_name in "${CURRENT_POD_LIST[@]}"; do + svc_and_port=$(parse_advertised_svc_and_port "$pod_name" "$CURRENT_SHARD_ADVERTISED_PORT" "true") + svc_name=${svc_and_port%:*} + lb_host=$(extract_lb_host_by_svc_name "${svc_name}") + if [ -n "$lb_host" ]; then + CURRENT_SHARD_HOST_OR_PORT_LIST+=("${lb_host}:6379") + else + svc_port="${svc_and_port#*:}" + CURRENT_SHARD_HOST_OR_PORT_LIST+=(":${svc_port}") + fi + echo "pod_name: $pod_name, svc_and_port: $svc_and_port" + done + # check length of CURRENT_SHARD_ANNOUNCE_IP_LIST must equal to CURRENT_POD_LIST + if [ ${#CURRENT_SHARD_HOST_OR_PORT_LIST[@]} -ne ${#CURRENT_POD_LIST[@]} ]; then + echo "Error: failed to get the pod ip list from KB_POD_LIST" + return 1 + fi + fi + # the output of line is like: + # 1. using the pod fqdn as the nodeAddr + # 4958e6dca033cd1b321922508553fab869a29d 10.42.0.227:6379@16379,redis-shard-sxj-0.redis-shard-sxj-headless.default.svc.cluster.local master - 0 1711958289570 4 connected 0-1364 5461-6826 10923-12287 + # 2. using the nodeport or lb ip as the nodeAddr + # 4958e6dca033cd1b321922508553fab869a29d 172.10.0.1:31000@31888,redis-shard-sxj-0.redis-shard-sxj-headless.default.svc.cluster.local master master - 0 1711958289570 4 connected 0-1364 5461-6826 10923-12287 + # 3. 
using the host network ip as the nodeAddr + # 4958e6dca033cd1b321922508553fab869a29d 172.10.0.1:1050@1051,redis-shard-sxj-0.redis-shard-sxj-headless.default.svc.cluster.local master - 0 1711958289570 4 connected 0-1364 5461-6826 10923-12287 + while read -r line; do + local node_info + node_info=$(parse_node_line_info "$line") + read -r node_ip node_port node_role node_fqdn <<< "$node_info" + + belong_current_comp=false + if [ "$network_mode" == "advertised_svc" ]; then + for i in "${CURRENT_SHARD_HOST_OR_PORT_LIST[@]}"; do + node_announce_info=":$node_port" + if ! is_empty "$CURRENT_SHARD_LB_ADVERTISED_PORT"; then + node_announce_info="$node_ip:$node_port" + fi + if [[ "$i" == "$node_announce_info" ]]; then + belong_current_comp=true + break + fi + done + elif [ "$network_mode" == "host_network" ]; then + if contains "$node_port" "$SERVICE_PORT"; then + belong_current_comp=true + fi + elif contains "$node_fqdn" "$CURRENT_SHARD_COMPONENT_NAME"; then + belong_current_comp=true + fi + local node_address + node_address=$(get_node_address_by_network_mode "$node_ip" "$node_port" "$node_fqdn") + categorize_node "$node_address" "$node_role" "$belong_current_comp" + done <<< "$cluster_nodes_info" + + echo "current_comp_primary_node: ${current_comp_primary_node[*]}" + echo "current_comp_other_nodes: ${current_comp_other_nodes[*]}" +} + +# init the current shard component default primary and secondary nodes for scale out shard. +# TODO: if advertised address is enable and instanceTemplate is specified, the pod service could not be parsed from the pod ordinal. 
+init_current_comp_default_nodes_for_scale_out() { + # categorize the scale out node map + categorize_scale_out_node_map() { + local pod_name="$1" + local node_address="$2" + local pod_ordinal="$3" + + if equals "$pod_ordinal" "$min_lexicographical_pod_ordinal"; then + scale_out_shard_default_primary_node["$pod_name"]="$node_address" + else + scale_out_shard_default_other_nodes["$pod_name"]="$node_address" + fi + } + + # handle the advertised service network mode (currently only support NodePort service type + handle_advertised_svc_network_mode() { + local pod_fqdn="$1" + local pod_name_ordinal="$2" + local pod_name=${pod_fqdn%%.*} + local old_ifs="$IFS" + IFS=',' + set -f + read -ra advertised_infos <<< "$CURRENT_SHARD_ADVERTISED_PORT" + set +f + IFS="$old_ifs" + + local found_advertised_port=false + for advertised_info in "${advertised_infos[@]}"; do + local advertised_svc advertised_port advertised_svc_ordinal + advertised_svc=$(echo "$advertised_info" | cut -d':' -f1) + advertised_port=$(echo "$advertised_info" | cut -d':' -f2) + advertised_svc_ordinal=$(extract_obj_ordinal "$advertised_svc") + + if [ "$pod_name_ordinal" == "$advertised_svc_ordinal" ]; then + local pod_host_ip + lb_host=$(extract_lb_host_by_svc_name "${advertised_svc}") + if ! is_empty "$lb_host"; then + echo "Found load balancer host for svcName '$advertised_svc', value is '$lb_host'." + pod_host_ip="$lb_host" + advertised_port="6379" + else + pod_host_ip=$(redis_config_get "$pod_fqdn" "$SERVICE_PORT" "$REDIS_DEFAULT_PASSWORD" "config get cluster-announce-ip" | sed -n '2p') + fi + status=$? 
+ if is_empty "$pod_host_ip" || [ $status -ne 0 ]; then + echo "Failed to get host ip of pod $pod_name" >&2 + return 1 + fi + + categorize_scale_out_node_map "$pod_name" "$pod_host_ip:$advertised_port" "$pod_name_ordinal" + found_advertised_port=true + break + fi + done + + if [ "$found_advertised_port" = false ]; then + echo "Advertised port not found for pod $pod_name" >&2 + return 1 + fi + return 0 + } + + # handle the host network mode + handle_host_network_mode() { + local pod_fqdn="$1" + local pod_name_ordinal="$2" + local pod_name=${pod_fqdn%%.*} + local pod_host_ip + pod_host_ip=$(redis_config_get "$pod_fqdn" "$SERVICE_PORT" "$REDIS_DEFAULT_PASSWORD" "config get cluster-announce-ip" | sed -n '2p') + if is_empty "$pod_host_ip"; then + echo "Failed to get host ip of pod $pod_name in host network mode" >&2 + return 1 + fi + + categorize_scale_out_node_map "$pod_name" "$pod_host_ip:$REDIS_CLUSTER_HOST_NETWORK_PORT" "$pod_name_ordinal" + return 0 + } + + # handle the default network mode + handle_default_network_mode() { + local pod_fqdn="$1" + local pod_name_ordinal="$2" + local pod_name=${pod_fqdn%%.*} + categorize_scale_out_node_map "$pod_name" "$pod_fqdn:$SERVICE_PORT" "$pod_name_ordinal" + return 0 + } + + process_pod_by_network_mode() { + local pod_fqdn="$1" + local pod_name_ordinal="$2" + + case "$network_mode" in + "advertised_svc") + handle_advertised_svc_network_mode "$pod_fqdn" "$pod_name_ordinal" + ;; + "host_network") + handle_host_network_mode "$pod_fqdn" "$pod_name_ordinal" + ;; + *) + handle_default_network_mode "$pod_fqdn" "$pod_name_ordinal" + ;; + esac + return $? 
+ } + + local min_lexicographical_pod_name + local min_lexicographical_pod_ordinal + min_lexicographical_pod_name=$(min_lexicographical_order_pod "$CURRENT_SHARD_POD_NAME_LIST") + min_lexicographical_pod_ordinal=$(extract_obj_ordinal "$min_lexicographical_pod_name") + if is_empty "$min_lexicographical_pod_ordinal"; then + echo "Failed to get the ordinal of the min lexicographical pod $min_lexicographical_pod_name in init_current_comp_default_nodes_for_scale_out" >&2 + return 1 + fi + + for pod_fqdn in $(echo "$CURRENT_SHARD_POD_FQDN_LIST" | tr ',' ' '); do + local pod_name_ordinal + pod_name=${pod_fqdn%%.*} + pod_name_ordinal=$(extract_obj_ordinal "$pod_name") + process_pod_by_network_mode "$pod_fqdn" "$pod_name_ordinal" || return 1 + done + return 0 +} + +# initialize the redis cluster primary and secondary nodes, use the min lexicographical pod of each shard as the primary nodes by default. +gen_initialize_redis_cluster_node() { + local is_primary=$1 + + categorize_node_maps() { + local pod_name="$1" + local host="$2" + local port="$3" + local is_primary="$4" + + local node_addr="$host:$port" + + if equals "$is_primary" "true"; then + initialize_redis_cluster_primary_nodes["$pod_name"]="$node_addr" + else + initialize_redis_cluster_secondary_nodes["$pod_name"]="$node_addr" + fi + initialize_pod_name_to_advertise_host_port_map["$pod_name"]="$node_addr" + } + + # determine if pod should be processed based on primary/secondary role + should_process_pod() { + local is_primary="$1" + local pod_ordinal="$2" + local min_pod_ordinal="$3" + + if [ "$is_primary" = "true" ]; then + [ "$pod_ordinal" = "$min_pod_ordinal" ] + else + [ "$pod_ordinal" != "$min_pod_ordinal" ] + fi + } + + # Initialize node with advertised service configuration + initialize_advertised_svc_node() { + local pod_fqdn="$1" + local pod_name_ordinal="$2" + local is_primary="$3" + local pod_name=${pod_fqdn%%.*} + + local pod_host_ip + pod_service_port=$(get_pod_service_port_by_network_mode "${pod_name}") 
|| { + echo "Failed to get service port for pod: $pod_name" >&2 + return 1 + } + pod_host_ip=$(redis_config_get "$pod_fqdn" "$pod_service_port" "$REDIS_DEFAULT_PASSWORD" "config get cluster-announce-ip" | sed -n '2p') + if is_empty "$pod_host_ip"; then + echo "Failed to get host IP for pod: $pod_name" >&2 + return 1 + fi + ## the value format of ALL_SHARDS_ADVERTISED_PORT is "shard-98x@redis-shard-98x-redis-advertised-0:32024,redis-shard-98x-redis-advertised-1:31318.shard-cq7@redis-shard-cq7-redis-advertised-0:31828,redis-shard-cq7-redis-advertised-1:32000" + local old_ifs="$IFS" + IFS='.' + set -f + local shards + read -ra shards <<< "$ALL_SHARDS_ADVERTISED_PORT" + set +f + IFS="$old_ifs" + + local shard + for shard in "${shards[@]}"; do + local shard_name + shard_name=$(echo "$shard" | cut -d'@' -f1) + + # skip if pod doesn't belong to current shard + if ! echo "$pod_name" | grep -q "$shard_name"; then + continue + fi + + # shard_advertised_infos like "redis-shard-98x-redis-advertised-0:32024,redis-shard-98x-redis-advertised-1:31318" + local old_ifs="$IFS" + IFS=',' + set -f + local shard_advertised_infos + read -ra shard_advertised_infos <<< "$(echo "$shard" | cut -d'@' -f2)" + set +f + IFS="$old_ifs" + + local shard_advertised_info + for shard_advertised_info in "${shard_advertised_infos[@]}"; do + local shard_advertised_svc + local shard_advertised_port + local shard_advertised_svc_ordinal + + shard_advertised_svc=$(echo "$shard_advertised_info" | cut -d':' -f1) + shard_advertised_port=$(echo "$shard_advertised_info" | cut -d':' -f2) + shard_advertised_svc_ordinal=$(extract_obj_ordinal "$shard_advertised_svc") + + if [ "$pod_name_ordinal" = "$shard_advertised_svc_ordinal" ]; then + lb_host=$(extract_lb_host_by_svc_name "${shard_advertised_svc}") + if [ -n "$lb_host" ]; then + echo "Found load balancer host for svcName '$shard_advertised_svc', value is '$lb_host'." 
+ pod_host_ip="$lb_host" + shard_advertised_port="6379" + fi + categorize_node_maps "$pod_name" "$pod_host_ip" "$shard_advertised_port" "$is_primary" + return 0 + fi + done + done + return 0 + } + + # Initialize node with host network configuration + initialize_host_network_node() { + local pod_fqdn="$1" + local is_primary="$2" + local pod_name=${pod_fqdn%%.*} + + pod_service_port=$(get_pod_service_port_by_network_mode "${pod_name}") || { + echo "Failed to get service port for pod: $pod_name" >&2 + return 1 + } + pod_host_ip=$(redis_config_get "$pod_fqdn" "$pod_service_port" "$REDIS_DEFAULT_PASSWORD" "config get cluster-announce-ip" | sed -n '2p') + if is_empty "$pod_host_ip"; then + echo "Failed to get host ip of pod $pod_name in host network mode" >&2 + return 1 + fi + categorize_node_maps "$pod_name" "$pod_host_ip" "$pod_service_port" "$is_primary" + return 0 + } + + # Initialize node with default network configuration + initialize_default_network_node() { + local pod_fqdn="$1" + local is_primary="$2" + local pod_name=${pod_fqdn%%.*} + + local pod_service_port + pod_service_port=$(get_pod_service_port_by_network_mode "${pod_name}") || { + echo "Failed to get service_port for pod: $pod_name" >&2 + return 1 + } + categorize_node_maps "$pod_name" "$pod_fqdn" "$pod_service_port" "$is_primary" + return 0 + } + + # determine cluster network mode + local network_mode="default" + if ! is_empty "$ALL_SHARDS_ADVERTISED_PORT"; then + network_mode="advertised_svc" + elif ! 
is_empty "$REDIS_CLUSTER_ALL_SHARDS_HOST_NETWORK_PORT"; then + network_mode="host_network" + fi + + # get and validate the min lexicographical pod name and ordinal + local min_lexicographical_pod_name + local min_lexicographical_pod_ordinal + min_lexicographical_pod_name=$(min_lexicographical_order_pod "$KB_CLUSTER_POD_NAME_LIST") + min_lexicographical_pod_ordinal=$(extract_obj_ordinal "$min_lexicographical_pod_name") + if is_empty "$min_lexicographical_pod_ordinal"; then + echo "Failed to get the ordinal of the min lexicographical pod $min_lexicographical_pod_name in gen_initialize_redis_cluster_node" >&2 + return 1 + fi + + local pod_name + for pod_fqdn in $(echo "$KB_CLUSTER_POD_FQDN_LIST" | tr ',' ' '); do + local pod_name_ordinal + pod_name=${pod_fqdn%%.*} + pod_name_ordinal=$(extract_obj_ordinal "$pod_name") || continue + + # skip pods based on primary/secondary role + if ! should_process_pod "$is_primary" "$pod_name_ordinal" "$min_lexicographical_pod_ordinal"; then + continue + fi + # initialize pod based on network mode + case "$network_mode" in + "advertised_svc") + initialize_advertised_svc_node "$pod_fqdn" "$pod_name_ordinal" "$is_primary" || return 1 + ;; + "host_network") + initialize_host_network_node "$pod_fqdn" "$is_primary" || return 1 + ;; + "default") + initialize_default_network_node "$pod_fqdn" "$is_primary" || return 1 + ;; + esac + done + return 0 +} + +gen_initialize_redis_cluster_primary_node() { + gen_initialize_redis_cluster_node "true" +} + +gen_initialize_redis_cluster_secondary_nodes() { + gen_initialize_redis_cluster_node "false" +} + +initialize_redis_cluster() { + # generate primary and secondary nodes + gen_initialize_redis_cluster_primary_node + gen_initialize_redis_cluster_secondary_nodes + + if [ ${#initialize_redis_cluster_primary_nodes[@]} -eq 0 ]; then + echo "Failed to get primary nodes" >&2 + return 1 + fi + + # check all the primary nodes are ready + local primary_nodes="" + local primary_node_list=() + for pod_name in 
"${!initialize_redis_cluster_primary_nodes[@]}"; do + primary_nodes+="${initialize_redis_cluster_primary_nodes[$pod_name]} " + primary_node_list+=("${initialize_redis_cluster_primary_nodes[$pod_name]}") + done + if ! check_initialize_nodes_ready "${primary_node_list[@]}"; then + echo "Primary nodes health check failed" >&2 + return 1 + fi + + # check all the secondary nodes are ready + if [ ${#initialize_redis_cluster_secondary_nodes[@]} -gt 0 ]; then + secondary_node_list=() + for pod_name in "${!initialize_redis_cluster_secondary_nodes[@]}"; do + secondary_node_list+=("${initialize_redis_cluster_secondary_nodes[$pod_name]}") + done + if ! check_initialize_nodes_ready "${secondary_node_list[@]}"; then + echo "Secondary nodes health check failed" >&2 + return 1 + fi + fi + + # initialize all the primary nodes + if create_redis_cluster "$primary_nodes"; then + echo "Redis cluster initialized primary nodes successfully, cluster nodes: $primary_nodes" + else + echo "Failed to create redis cluster when initializing" >&2 + return 1 + fi + + # get the first primary node to check the cluster + first_primary_node=$(echo "$primary_nodes" | awk '{print $1}') + if check_slots_covered "$first_primary_node" "$SERVICE_PORT"; then + echo "Redis cluster check primary nodes slots covered successfully." 
+ else + echo "Failed to create redis cluster when checking slots covered" >&2 + return 1 + fi + + # initialize all the secondary nodes + if [ ${#initialize_redis_cluster_secondary_nodes[@]} -eq 0 ]; then + echo "No secondary nodes to initialize" + return 0 + fi + + all_secondaries_ready=true + for secondary_pod_name in "${!initialize_redis_cluster_secondary_nodes[@]}"; do + secondary_endpoint_with_port=${initialize_redis_cluster_secondary_nodes["$secondary_pod_name"]} + # shellcheck disable=SC2001 + mapping_primary_pod_name=$(echo "$secondary_pod_name" | sed 's/-[0-9]*$/-0/') + mapping_primary_endpoint_with_port=${initialize_pod_name_to_advertise_host_port_map["$mapping_primary_pod_name"]} + if is_empty "$mapping_primary_endpoint_with_port"; then + echo "Failed to find the mapping primary node for secondary node: $secondary_pod_name" >&2 + return 1 + fi + mapping_primary_endpoint=$(echo "$mapping_primary_endpoint_with_port" | cut -d':' -f1) + mapping_primary_port=$(echo "$mapping_primary_endpoint_with_port" | cut -d':' -f2) + mapping_primary_cluster_id=$(get_cluster_id "$mapping_primary_endpoint" "$mapping_primary_port") + echo "mapping_primary_fqdn: $mapping_primary_endpoint, mapping_primary_endpoint_with_port: $mapping_primary_endpoint_with_port, mapping_primary_cluster_id: $mapping_primary_cluster_id" + if is_empty "$mapping_primary_cluster_id"; then + echo "Failed to get the cluster id from cluster nodes of the mapping primary node: $mapping_primary_endpoint_with_port" >&2 + return 1 + fi + replicated_output=$(secondary_replicated_to_primary "$secondary_endpoint_with_port" "$mapping_primary_endpoint_with_port" "$mapping_primary_cluster_id") + status=$? 
+        if [ $status -ne 0 ] ; then
+            echo "Failed to initialize the secondary node $secondary_pod_name, secondary replicated output: $replicated_output" >&2
+            return 1
+        fi
+        echo "Redis cluster initialized secondary node $secondary_pod_name successfully"
+        # waiting for all nodes sync the information
+        sleep_when_ut_mode_false 5
+        secondary_node="$secondary_pod_name"
+        if [ "$network_mode" != "default" ]; then
+            secondary_node="${initialize_redis_cluster_secondary_nodes["$secondary_pod_name"]}"
+        fi
+        # verify secondary node is already in all primary nodes
+        if ! verify_secondary_in_all_primaries "$secondary_node" "${primary_node_list[@]}"; then
+            echo "Failed to verify secondary node $secondary_node in all primary nodes" >&2
+            all_secondaries_ready=false
+            continue
+        fi
+        echo "Secondary node $secondary_pod_name successfully joined the cluster and verified in all primaries"
+    done
+
+    if [ "$all_secondaries_ready" = false ]; then
+        echo "Failed to initialize all secondary nodes" >&2
+        return 1
+    fi
+    echo "Redis cluster initialized all secondary nodes successfully"
+    return 0
+}
+
+verify_secondary_in_all_primaries() {
+    local secondary_node="$1"
+    # drop the secondary from "$@" so only primary node addresses remain;
+    # the former primary_nodes=("$@") copy was dead code — the loop reads "$@"
+    shift
+    for primary_node in "$@"; do
+        local primary_host primary_port
+        primary_host=$(echo "$primary_node" | cut -d':' -f1)
+        primary_port=$(echo "$primary_node" | cut -d':' -f2)
+        retry_count=0
+        while !
check_node_in_cluster "$primary_host" "$primary_port" "$secondary_node" && [ $retry_count -lt 30 ]; do + sleep_when_ut_mode_false 3 + ((retry_count++)) + done + # shellcheck disable=SC2086 + if [ $retry_count -eq 30 ]; then + echo "Secondary node $secondary_node not found in primary $primary_node after retry" >&2 + return 1 + fi + done + return 0 +} + +check_current_shard_other_nodes_are_joined() { + local current_primary_host="$1" + local service_port="$2" + cluster_nodes_info=$(get_cluster_nodes_info "$current_primary_host" "$service_port") + for secondary_pod_name in "${!scale_out_shard_default_other_nodes[@]}"; do + secondary_node="$secondary_pod_name" + if [ "$network_mode" != "default" ]; then + secondary_node="${scale_out_shard_default_other_nodes["$secondary_pod_name"]}" + fi + if ! contains "$cluster_nodes_info" "$secondary_node"; then + echo "Secondary node $secondary_node not found in primary $current_primary_host, need to joined" >&2 + return 1 + fi + done + return 0 +} + +scale_out_redis_cluster_shard() { + if is_empty "$CURRENT_SHARD_COMPONENT_SHORT_NAME" || is_empty "$KB_CLUSTER_POD_FQDN_LIST"; then + echo "Error: Required environment variable CURRENT_SHARD_COMPONENT_SHORT_NAME, KB_CLUSTER_POD_FQDN_LIST are not set when scale out redis cluster shard" >&2 + return 1 + fi + + init_other_components_and_pods_info "$CURRENT_SHARD_COMPONENT_SHORT_NAME" "$KB_CLUSTER_POD_FQDN_LIST" "$KB_CLUSTER_COMPONENT_LIST" + if init_current_comp_default_nodes_for_scale_out; then + echo "Redis cluster scale out shard default primary and secondary nodes successfully" + else + echo "Failed to initialize the default primary and secondary nodes for scale out" >&2 + return 1 + fi + + # check the current component shard whether is already scaled out + if [ ${#scale_out_shard_default_primary_node[@]} -eq 0 ]; then + echo "Failed to generate primary nodes when scaling out" >&2 + return 1 + fi + primary_node_with_port=$(echo "${scale_out_shard_default_primary_node[*]}" | awk 
'{print $1}') + primary_node_fqdn=$(echo "$primary_node_with_port" | awk -F ':' '{print $1}') + primary_node_port=$(echo "$primary_node_with_port" | awk -F ':' '{print $2}') + mapping_primary_cluster_id=$(get_cluster_id "$primary_node_fqdn" "$primary_node_port") + current_primary_joined=false + if check_slots_covered "$primary_node_with_port" "$SERVICE_PORT"; then + if check_current_shard_other_nodes_are_joined "$primary_node_fqdn" "$primary_node_port"; then + echo "The current component shard is already scaled out, no need to scale out again." + return 0 + fi + current_primary_joined=true + fi + + # find the exist available node which is not in the current component + available_node=$(find_exist_available_node) + if is_empty "$available_node"; then + echo "No exist available node found or cluster status is not ok" >&2 + return 1 + fi + + # Forget fail node when cluster is ok + # forget_fail_node_when_cluster_is_ok "${available_node%%:*}" "${available_node##*:}" + + # add the primary node for the current shard + if [ "$current_primary_joined" = false ]; then + local scale_out_shard_default_primary + for primary_pod_name in "${!scale_out_shard_default_primary_node[@]}"; do + scale_out_shard_default_primary="${scale_out_shard_default_primary_node[$primary_pod_name]}" + if scale_out_shard_primary_join_cluster "$scale_out_shard_default_primary" "$available_node"; then + echo "Redis cluster scale out shard primary node $primary_pod_name successfully" + else + echo "Failed to scale out shard primary node $primary_pod_name" >&2 + return 1 + fi + done + fi + + # waiting for all nodes sync the information + sleep_when_ut_mode_false 5 + + # add the secondary nodes to replicate the primary node + local scale_out_shard_secondary_node + local scale_out_shard_secondary_node_with_port + for secondary_pod_name in "${!scale_out_shard_default_other_nodes[@]}"; do + scale_out_shard_secondary_node_with_port="${scale_out_shard_default_other_nodes[$secondary_pod_name]}" + 
scale_out_shard_secondary_node="${secondary_pod_name}" + if [ "$network_mode" != "default" ]; then + scale_out_shard_secondary_node=$scale_out_shard_secondary_node_with_port + fi + echo "primary_node_with_port: $primary_node_with_port, primary_node_fqdn: $primary_node_fqdn, mapping_primary_cluster_id: $mapping_primary_cluster_id" + if check_node_in_cluster "$primary_node_fqdn" "$primary_node_with_port" "$scale_out_shard_secondary_node"; then + echo "Secondary node $secondary_pod_name already joined the cluster, skip replicating to primary" + continue + fi + if secondary_replicated_to_primary "$scale_out_shard_secondary_node_with_port" "$primary_node_with_port" "$mapping_primary_cluster_id"; then + echo "Redis cluster scale out shard secondary node $secondary_pod_name successfully" + else + echo "Failed to scale out shard secondary node $secondary_pod_name" >&2 + return 1 + fi + done + + # do the reshard + # TODO: optimize the number of reshard slots according to the cluster status + local total_slots + local current_comp_pod_count + local all_comp_pod_count + local shard_count + local slots_per_shard + total_slots=16384 + current_comp_pod_count=$(echo "$CURRENT_SHARD_POD_NAME_LIST" | tr ',' '\n' | grep -c "^$CURRENT_SHARD_COMPONENT_NAME-") + all_comp_pod_count=$(echo "$KB_CLUSTER_POD_NAME_LIST" | tr ',' '\n' | grep -c ".*") + shard_count=$((all_comp_pod_count / current_comp_pod_count)) + slots_per_shard=$((total_slots / shard_count)) + # Stream-Valkey divergence: upstream calls `redis-cli --cluster reshard` here + # to migrate slots into the newly-joined primary. We do not — slot migration + # for our Valkey clusters is driven by the ASM (CLUSTER MIGRATESLOTS) path + # via OpsDefinition in stream-infra, which gives us live-migration with + # ape-dts and per-batch progress. The legacy reshard path uses + # MIGRATE COPY+DEL synchronously and stalls the source primary at high QPS. + # We keep the new shard joined with zero slots; the operator runs ASM next. 
+ echo "Skipping legacy reshard call; slot migration handled by ASM OpsRequest." >&2 + echo " (target primary: $primary_node_with_port, slots_per_shard would have been: $slots_per_shard)" >&2 + + return 0 +} + +sync_acl_for_redis_cluster_shard() { + echo "Sync ACL rules for redis cluster shard..." + set +ex + redis_base_cmd="redis-cli $REDIS_CLI_TLS_CMD -a $REDIS_DEFAULT_PASSWORD" + if [ -z "$REDIS_DEFAULT_PASSWORD" ]; then + redis_base_cmd="redis-cli $REDIS_CLI_TLS_CMD" + fi + is_ok=false + acl_list="" + # 1. get acl list from other pods + for pod_fqdn in $(echo "$KB_CLUSTER_POD_FQDN_LIST" | tr ',' ' '); do + pod_name=${pod_fqdn%%.*} + pod_service_port=$(get_pod_service_port_by_network_mode "$pod_name") + cluster_info=$(get_cluster_info_with_retry "$pod_fqdn" "$pod_service_port") + status=$? + if [ $status -ne 0 ]; then + continue + fi + cluster_state=$(echo "$cluster_info" | awk -F: '/cluster_state/{print $2}' | tr -d '[:space:]') + if is_empty "$cluster_state" || equals "$cluster_state" "ok"; then + acl_list=$($redis_base_cmd -p $pod_service_port -h "$pod_fqdn" ACL LIST) + is_ok=true + break + fi + done + + if [ "$is_ok" = false ]; then + echo "Failed to get ACL LIST from other shard pods" >&2 + exit 1 + fi + + if [ -z "$acl_list" ]; then + echo "No ACL rules found in other pods, skip synchronization" >&2 + return + fi + # 2. 
apply acl list to current shard pods + set -e + while IFS= read -r user_rule; do + [[ -z "$user_rule" ]] && continue + + if [[ "$user_rule" =~ ^user[[:space:]]+([^[:space:]]+) ]]; then + username="${BASH_REMATCH[1]}" + else + # skip invalid user rule + continue + fi + + if [[ "$username" == "default" ]]; then + continue + fi + rule_part="${user_rule#user $username }" + for pod_fqdn in $(echo "$CURRENT_SHARD_POD_FQDN_LIST" | tr ',' '\n'); do + $redis_base_cmd -h $pod_fqdn -p $SERVICE_PORT ACL SETUSER "$username" $rule_part >&2 + $redis_base_cmd -h $pod_fqdn -p $SERVICE_PORT ACL save >&2 + done + done <<< "$acl_list" + set_xtrace_when_ut_mode_false +} + +scale_in_redis_cluster_shard() { + + if is_empty "$CURRENT_SHARD_COMPONENT_SHORT_NAME" || is_empty "$KB_CLUSTER_POD_FQDN_LIST"; then + echo "Error: Required environment variable CURRENT_SHARD_COMPONENT_SHORT_NAME, KB_CLUSTER_POD_FQDN_LIST are not set when scale in redis cluster shard" >&2 + return 1 + fi + + # Forget fail node when cluster is ok + # forget_fail_node_when_cluster_is_ok "127.0.0.1" "$SERVICE_PORT" + + # init information for the other components and pods + init_other_components_and_pods_info "$CURRENT_SHARD_COMPONENT_SHORT_NAME" "$KB_CLUSTER_POD_FQDN_LIST" "$KB_CLUSTER_COMPONENT_LIST" + available_node=$(find_exist_available_node) + available_node_fqdn=$(echo "$available_node" | awk -F ':' '{print $1}') + available_node_port=$(echo "$available_node" | awk -F ':' '{print $2}') + get_current_comp_nodes_for_scale_in "$available_node_fqdn" "$available_node_port" + + # set the current shard component slot to 0 by rebalance command + for primary_node in "${current_comp_primary_node[@]}"; do + primary_node_fqdn=$(echo "$primary_node" | awk -F ':' '{print $1}') + primary_node_port=$(echo "$primary_node" | awk -F ':' '{print $2}') + primary_node_cluster_id=$(get_cluster_id "$primary_node_fqdn" "$primary_node_port") + if scale_in_shard_rebalance_to_zero "$primary_node" "$primary_node_cluster_id"; then + echo 
"Redis cluster scale in shard rebalance to zero successfully" + else + echo "Failed to rebalance the cluster for the current component when scaling in" >&2 + return 1 + fi + done + + sleep_when_ut_mode_false 5 + + # delete the current shard component nodes from the cluster + for node_to_del in "${current_comp_primary_node[@]}" "${current_comp_other_nodes[@]}"; do + node_to_del_fqdn=$(echo "$node_to_del" | awk -F ':' '{print $1}') + node_to_del_port=$(echo "$node_to_del" | awk -F ':' '{print $2}') + node_to_del_cluster_id=$(get_cluster_id "$node_to_del_fqdn" "$node_to_del_port") + if scale_in_shard_del_node "$available_node" "$node_to_del_cluster_id"; then + echo "Redis cluster scale in shard delete node $node_to_del successfully" + else + echo "Failed to delete the node $node_to_del from the cluster when scaling in" >&2 + return 1 + fi + done + return 0 +} + +initialize_or_scale_out_redis_cluster() { + # TODO: remove random sleep, it's a workaround for the multi components initialization parallelism issue + sleep_random_second_when_ut_mode_false 10 1 + + # if the cluster is not initialized, initialize it + if ! check_cluster_initialized "$KB_CLUSTER_POD_FQDN_LIST"; then + echo "Redis Cluster not initialized, initializing..." + if initialize_redis_cluster; then + echo "Redis Cluster initialized successfully" + else + echo "Failed to initialize Redis Cluster" >&2 + return 1 + fi + else + sync_acl_for_redis_cluster_shard + echo "Redis Cluster already initialized, scaling out the shard..." + if scale_out_redis_cluster_shard; then + echo "Redis Cluster scale out shard successfully" + else + echo "Failed to scale out Redis Cluster shard" >&2 + return 1 + fi + fi + return 0 +} + +# This is magic for shellspec ut framework. +# Sometime, functions are defined in a single shell script. +# You will want to test it. but you do not want to run the script. +# When included from shellspec, __SOURCED__ variable defined and script +# end here. 
The script path is assigned to the __SOURCED__ variable. +${__SOURCED__:+false} : || return 0 + +# main +if [ $# -eq 1 ]; then + load_redis_cluster_common_utils + init_environment + case $1 in + --help) + echo "Usage: $0 [options]" + echo "Options:" + echo " --help show help information" + echo " --post-provision initialize or scale out Redis Cluster Shard" + echo " --pre-terminate stop or scale in Redis Cluster Shard" + exit 0 + ;; + --post-provision) + if initialize_or_scale_out_redis_cluster; then + echo "Redis Cluster initialized or scale out shard successfully" + else + echo "Failed to initialize or scale out Redis Cluster shard" >&2 + exit 1 + fi + exit 0 + ;; + --pre-terminate) + if scale_in_redis_cluster_shard; then + echo "Redis Cluster scale in shard successfully" + else + echo "Failed to scale in Redis Cluster shard" >&2 + exit 1 + fi + exit 0 + ;; + *) + echo "Error: invalid option '$1'" + exit 1 + ;; + esac +fi diff --git a/addons/valkey/valkey-cluster-scripts/valkey-cluster-replica-member-leave.sh b/addons/valkey/valkey-cluster-scripts/valkey-cluster-replica-member-leave.sh new file mode 100755 index 000000000..1ef68f487 --- /dev/null +++ b/addons/valkey/valkey-cluster-scripts/valkey-cluster-replica-member-leave.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +# shellcheck disable=SC2034 +# shellcheck disable=SC1090 +# shellcheck disable=SC2153 + +# This is magic for shellspec ut framework. "test" is a `test [expression]` well known as a shell command. +# Normally test without [expression] returns false. It means that __() { :; } +# function is defined if this script runs directly. +# +# shellspec overrides the test command and returns true *once*. It means that +# __() function defined internally by shellspec is called. +# +# In other words. If not in test mode, __ is just a comment. If test mode, __ +# is a interception point. +# you should set ut_mode="true" when you want to run the script in shellspec file. 
+ut_mode="false" +test || __() { + # when running in non-unit test mode, set the options "set -ex". + set -ex; +} + +service_port=${SERVICE_PORT:-6379} +cluster_bus_port=${CLUSTER_BUS_PORT:-16379} + +load_redis_cluster_common_utils() { + # the common.sh and valkey-cluster-common.sh scripts are defined in the valkey cluster scripts template configmap + # and are mounted to the same path which is defined in the cmpd.spec.scripts + kblib_common_library_file="/scripts/common.sh" + redis_cluster_common_library_file="/scripts/valkey-cluster-common.sh" + source "${kblib_common_library_file}" + source "${redis_cluster_common_library_file}" +} + +# remove_replica_from_shard_if_need removes the current pod from the cluster if it is a replica
+# TODO: remove it from preStop hook and it should be implemented in memberLeave lifecycleAction in KubeBlocks
+remove_replica_from_shard_if_need() { + # get the cluster nodes info + cluster_nodes_info=$(get_cluster_nodes_info_with_retry "$KB_LEAVE_MEMBER_POD_FQDN" "$service_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get cluster nodes info in remove_replica_from_shard_if_need" >&2 + return 1 + fi + echo "Cluster nodes info: $cluster_nodes_info" + + # if the cluster_nodes_info contains only one line, it means that the cluster has not been initialized + if [ "$(echo "$cluster_nodes_info" | wc -l)" -le 1 ]; then + echo "Cluster nodes info contains only one line or is empty, returning..." + return 0 + fi + + # get the current node role, if the current node is a slave, remove it from the cluster + current_node_role=$(echo "$cluster_nodes_info" | grep "myself" | awk '{print $3}') + if contains "$current_node_role" "slave"; then + echo "Current node $KB_LEAVE_MEMBER_POD_NAME is a slave, removing it from the cluster..." 
+ current_node_cluster_id=$(echo "$cluster_nodes_info" | grep "myself" | awk '{print $1}') + current_node_ip_and_port="127.0.0.1:$service_port" + do_forget_node=false + if contains "$current_node_role" "fail"; then + do_forget_node=true + fi + echo "Current node id: $current_node_cluster_id" + if secondary_member_leave_del_node_with_retry "$current_node_ip_and_port" "$current_node_cluster_id" "$do_forget_node"; then + echo "Successfully removed replica from shard." + else + echo "Failed to remove replica from shard." >&2 + return 1 + fi + + # check if the current node is removed from the cluster + cluster_nodes_info=$(get_cluster_nodes_info "$KB_LEAVE_MEMBER_POD_FQDN" "$service_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get cluster nodes info in remove_replica_from_shard_if_need" >&2 + return 1 + fi + + if [ "$(echo "$cluster_nodes_info" | wc -l)" -le 1 ]; then + echo "successfully removed replica from shard." + return + else + echo "Failed to remove replica from shard." >&2 + return 1 + fi + else + echo "Current node $KB_LEAVE_MEMBER_POD_NAME is a master, no need to remove it from the cluster." + fi + return 0 +} + +# This is magic for shellspec ut framework. +# Sometime, functions are defined in a single shell script. +# You will want to test it. but you do not want to run the script. +# When included from shellspec, __SOURCED__ variable defined and script +# end here. The script path is assigned to the __SOURCED__ variable. +${__SOURCED__:+false} : || return 0 + +# main +load_redis_cluster_common_utils +if execute_acl_save_with_retry $service_port; then + echo "acl save command executed successfully." +else + echo "failed to execute acl save command." 
>&2 + return 1 +fi +if [ "$LEGACY_REDIS" = "true" ]; then + # Forget fail node when cluster is ok + forget_fail_node_when_cluster_is_ok "127.0.0.1" "$service_port" +fi +remove_replica_from_shard_if_need \ No newline at end of file diff --git a/addons/valkey/valkey-cluster-scripts/valkey-cluster-replica-pre-stop.sh b/addons/valkey/valkey-cluster-scripts/valkey-cluster-replica-pre-stop.sh new file mode 100644 index 000000000..1e85b287a --- /dev/null +++ b/addons/valkey/valkey-cluster-scripts/valkey-cluster-replica-pre-stop.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +# shellcheck disable=SC2034 +ut_mode="false" +test || __() { + # when running in non-unit test mode, set the options "set -ex". + set -ex; +} + +load_common_library() { + # the common.sh script is mounted to the same path which is defined in the cmpd.spec.scripts + common_library_file="/scripts/common.sh" + # shellcheck disable=SC1090 + source "${common_library_file}" +} + +acl_save_before_stop() { + if ! is_empty "$REDIS_DEFAULT_PASSWORD"; then + acl_save_command="redis-cli $REDIS_CLI_TLS_CMD -h localhost -p $SERVICE_PORT -a $REDIS_DEFAULT_PASSWORD acl save" + logging_mask_acl_save_command="${acl_save_command/$REDIS_DEFAULT_PASSWORD/********}" + else + acl_save_command="redis-cli $REDIS_CLI_TLS_CMD -h localhost -p $SERVICE_PORT acl save" + logging_mask_acl_save_command="$acl_save_command" + fi + echo "acl save command: $logging_mask_acl_save_command" + if output=$($acl_save_command 2>&1); then + echo "acl save command executed successfully: $output" + else + echo "failed to execute acl save command: $output" + exit 1 + fi +} + +# This is magic for shellspec ut framework. +# Sometimes, functions are defined in a single shell script. +# You will want to test it, but you do not want to run the script. +# When included from shellspec, the __SOURCED__ variable is defined and the script +# ends here. The script path is assigned to the __SOURCED__ variable. 
+${__SOURCED__:+false} : || return 0 + +# main +load_common_library +acl_save_before_stop \ No newline at end of file diff --git a/addons/valkey/valkey-cluster-scripts/valkey-cluster-server-start.sh b/addons/valkey/valkey-cluster-scripts/valkey-cluster-server-start.sh new file mode 100755 index 000000000..14a8d9527 --- /dev/null +++ b/addons/valkey/valkey-cluster-scripts/valkey-cluster-server-start.sh @@ -0,0 +1,776 @@ +#!/bin/bash + +# shellcheck disable=SC2153 +# shellcheck disable=SC2207 +# shellcheck disable=SC2034 +# shellcheck disable=SC1090 + +# This is magic for shellspec ut framework. "test" is a `test [expression]` well known as a shell command. +# Normally test without [expression] returns false. It means that __() { :; } +# function is defined if this script runs directly. +# +# shellspec overrides the test command and returns true *once*. It means that +# __() function defined internally by shellspec is called. +# +# In other words. If not in test mode, __ is just a comment. If test mode, __ +# is a interception point. +# you should set ut_mode="true" when you want to run the script in shellspec file. +ut_mode="false" +test || __() { + # when running in non-unit test mode, set the options "set -ex". 
+ set -ex; +} + +service_port=6379 +cluster_bus_port=16379 +redis_template_conf="/etc/conf/redis.conf" +redis_real_conf="/etc/redis/redis.conf" +redis_acl_file="/data/users.acl" +redis_acl_file_bak="/data/users.acl.bak" +retry_times=3 +check_ready_times=30 +retry_delay_second=2 + +# variables for scale out replica +current_comp_primary_node=() +current_comp_primary_fail_node=() +current_comp_other_nodes=() +other_comp_primary_nodes=() +other_comp_primary_fail_nodes=() +other_comp_other_nodes=() +network_mode="default" + + +init_environment(){ + if [[ -z "${CURRENT_SHARD_ADVERTISED_PORT}" ]]; then + CURRENT_SHARD_ADVERTISED_PORT="${CURRENT_SHARD_LB_ADVERTISED_PORT}" + fi + if [[ -z "${CURRENT_SHARD_ADVERTISED_BUS_PORT}" ]]; then + CURRENT_SHARD_ADVERTISED_BUS_PORT="${CURRENT_SHARD_LB_ADVERTISED_BUS_PORT}" + fi +} + +extract_lb_host_by_svc_name() { + local svc_name="$1" + for lb_composed_name in $(echo "$CURRENT_SHARD_LB_ADVERTISED_HOST" | tr ',' '\n' ); do + if [[ ${lb_composed_name} == *":"* ]]; then + if [[ ${lb_composed_name%:*} == "$svc_name" ]]; then + echo "${lb_composed_name#*:}" + break + fi + else + break + fi + done +} + +load_redis_cluster_common_utils() { + # the common.sh and redis-cluster-common.sh scripts are defined in the redis-cluster-scripts-template configmap + # and are mounted to the same path which defined in the cmpd.spec.scripts + kblib_common_library_file="/scripts/common.sh" + redis_cluster_common_library_file="/scripts/valkey-cluster-common.sh" + source "${kblib_common_library_file}" + source "${redis_cluster_common_library_file}" +} + +check_and_meet_node() { + local source_endpoint="$1" + local source_port="$2" + local target_endpoint="$3" + local target_port="$4" + local target_bus_port="$5" + + # Check for invalid port numbers and exit immediately if found + if [ "$target_port" -eq 0 ] || [ "$target_bus_port" -eq 0 ]; then + echo "Error: target_port ($target_port) or target_bus_port ($target_bus_port) is 0. Exiting..." 
+ shutdown_redis_server "$service_port" + exit 1 + fi + + while true; do + # Get current announce IP from the target node + current_announce_ip=$(get_cluster_announce_ip "$target_endpoint" "$target_port") + echo "target: $target_endpoint:$target_port, current_announce_ip: $current_announce_ip" + + # If current_announce_ip is empty, retry + if is_empty "$current_announce_ip"; then + echo "Error: current_announce_ip is empty" + sleep_when_ut_mode_false 3 + continue + fi + + # send cluster meet command to the primary node + if send_cluster_meet_with_retry "$source_endpoint" "$source_port" "$current_announce_ip" "$target_port" "$target_bus_port"; then + echo "Meet the node $target_endpoint successfully with new announce ip $current_announce_ip..." + break + else + echo "Failed to meet the node $target_endpoint" >&2 + shutdown_redis_server "$service_port" + exit 1 + fi + done +} + +check_and_meet_other_primary_nodes() { + local current_primary_endpoint="$1" + local current_primary_port="$2" + local meet_other_comp_primary_nodes=("${other_comp_primary_nodes[@]}" "${other_comp_primary_fail_nodes[@]}") + if [ ${#meet_other_comp_primary_nodes[@]} -eq 0 ]; then + echo "meet_other_comp_primary_nodes is empty, skip check_and_meet_other_primary_nodes" + return + fi + + # node_info value format: cluster_announce_ip#pod_fqdn#endpoint:port@bus_port + for node_info in "${meet_other_comp_primary_nodes[@]}"; do + node_endpoint_with_port=$(echo "$node_info" | awk -F '@' '{print $1}' | awk -F '#' '{print $3}') + node_endpoint=$(echo "$node_endpoint_with_port" | awk -F ':' '{print $1}') + node_port=$(echo "$node_endpoint_with_port" | awk -F ':' '{print $2}') + node_bus_port=$(echo "$node_info" | awk -F '@' '{print $2}') + node_fqdn=$(echo "$node_info" | awk -F '#' '{print $2}') + node_endpoint_for_meet="$node_endpoint" + if [ "$network_mode" == "default" ]; then + node_endpoint_for_meet="$node_fqdn" + fi + check_and_meet_node "$current_primary_endpoint" "$current_primary_port" 
"$node_endpoint_for_meet" "$node_port" "$node_bus_port" + sleep_when_ut_mode_false 3 + done +} + +check_and_meet_current_primary_node() { + local primary_node_endpoint="$1" + local primary_node_port="$2" + local primary_bus_port="$3" + + check_and_meet_node "127.0.0.1" "$service_port" "$primary_node_endpoint" "$primary_node_port" "$primary_bus_port" +} + +# get the current component nodes for scale out replica +get_current_comp_nodes_for_scale_out_replica() { + local cluster_node="$1" + local cluster_node_port="$2" + cluster_nodes_info=$(get_cluster_nodes_info "$cluster_node" "$cluster_node_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get cluster nodes info in get_current_comp_nodes_for_scale_out_replica: $cluster_nodes_info" >&2 + return 1 + fi + + # if the cluster_nodes_info contains only one line, it means that the cluster not be initialized + shard_count=$(echo "${ALL_SHARDS_COMPONENT_SHORT_NAMES}" | tr ',' '\n' | wc -l) + if [ "$(echo "$cluster_nodes_info" | wc -l)" -lt ${shard_count} ]; then + echo "Cluster nodes info contains less than ${shard_count} nodes, returning..." + return + fi + + # determine network mode + network_mode="default" + if ! is_empty "$CURRENT_SHARD_ADVERTISED_PORT"; then + network_mode="advertised_svc" + elif ! is_empty "$REDIS_CLUSTER_ALL_SHARDS_HOST_NETWORK_PORT"; then + network_mode="host_network" + fi + + parse_node_line_info() { + # the output of line is like: + # 1. using the pod fqdn as the nodeAddr + # 4958e6dca033cd1b321922508553fab869a29d 10.42.0.227:6379@16379,redis-shard-sxj-0.redis-shard-sxj-headless.default.svc.cluster.local master - 0 1711958289570 4 connected 0-1364 5461-6826 10923-12287 + # 2. using the nodeport or lb ip as the nodeAddr + # 4958e6dca033cd1b321922508553fab869a29d 172.10.0.1:31000@31888,redis-shard-sxj-0.redis-shard-sxj-headless.default.svc.cluster.local master master - 0 1711958289570 4 connected 0-1364 5461-6826 10923-12287 + # 3. 
using the host network ip as the nodeAddr + # 4958e6dca033cd1b321922508553fab869a29d 172.10.0.1:1050@1051,redis-shard-sxj-0.redis-shard-sxj-headless.default.svc.cluster.local master - 0 1711958289570 4 connected 0-1364 5461-6826 10923-12287 + local line="$1" + + local node_ip_port_fields + # 10.42.0.227:6379@16379,redis-shard-sxj-0.redis-shard-sxj-headless.default.svc + node_ip_port_fields=$(echo "$line" | awk '{print $2}') + + local node_announce_ip_port + # ip:port without bus port + node_announce_ip_port=$(echo "$node_ip_port_fields" | awk -F '@' '{print $1}') + + local node_announce_ip + node_announce_ip=$(echo "$node_announce_ip_port" | cut -d':' -f1) + + local node_port + node_port=$(echo "$node_announce_ip_port" | cut -d':' -f2) + + local node_bus_port + node_bus_port=$(echo "$node_ip_port_fields" | awk -F '@' '{print $2}' | awk -F ',' '{print $1}') + + local node_fqdn + # redis-shard-sxj-0.redis-shard-sxj-headless.default.svc.cluster.local + node_fqdn=$(echo "$line" | awk '{print $2}' | awk -F ',' '{print $2}') + + local node_role + node_role=$(echo "$line" | awk '{print $3}') + + printf "%s %s %s %s %s" "$node_announce_ip" "$node_port" "$node_bus_port" "$node_role" "$node_fqdn" + } + + build_node_entry() { + local mode="$1" + local announce_ip="$2" + local fqdn="$3" + local port="$4" + local bus_port="$5" + + case "$mode" in + "advertised_svc") + # example format using nodeport: 172.10.0.1#redis-shard-sxj-0.redis-shard-sxj-headless.default.svc#172.10.0.1:31000@31888 + echo "$announce_ip#$fqdn#$announce_ip:$port@$bus_port" + ;; + "host_network") + # example format using host network: 172.10.0.1#redis-shard-sxj-0.redis-shard-sxj-headless.default.svc#172.10.0.1:1050@1051 + echo "$announce_ip#$fqdn#$announce_ip:$port@$bus_port" + ;; + *) + # example format using pod fqdn: 10.42.0.227#redis-shard-sxj-0.redis-shard-sxj-headless.default.svc#redis-shard-sxj-0.redis-shard-sxj-headless.default.svc:6379@16379 + echo "$announce_ip#$fqdn#$fqdn:$port@$bus_port" + ;; + 
esac + } + + # categorize node into appropriate array + categorize_node() { + local node_entry="$1" + local node_role="$2" + local belong_current_comp="$3" + + if [[ "$belong_current_comp" == "true" ]]; then + if contains "$node_role" "master"; then + if contains "$node_role" "fail"; then + current_comp_primary_fail_node+=("$node_entry") + else + current_comp_primary_node+=("$node_entry") + fi + else + current_comp_other_nodes+=("$node_entry") + fi + else + if contains "$node_role" "master"; then + if contains "$node_role" "fail"; then + other_comp_primary_fail_nodes+=("$node_entry") + else + other_comp_primary_nodes+=("$node_entry") + fi + else + other_comp_other_nodes+=("$node_entry") + fi + fi + } + + # prepare CURRENT_SHARD_HOST_OR_PORT_LIST for advertised_svc mode + CURRENT_SHARD_HOST_OR_PORT_LIST=() + if [ "$network_mode" == "advertised_svc" ]; then + IFS=',' read -ra CURRENT_POD_LIST <<< "$CURRENT_SHARD_POD_NAME_LIST" + for pod_name in "${CURRENT_POD_LIST[@]}"; do + svc_and_port=$(parse_advertised_svc_and_port "$pod_name" "$CURRENT_SHARD_ADVERTISED_PORT" "true") + svc_name=${svc_and_port%:*} + lb_host=$(extract_lb_host_by_svc_name "${svc_name}") + if [ -n "$lb_host" ]; then + CURRENT_SHARD_HOST_OR_PORT_LIST+=("${lb_host}:6379") + else + svc_port="${svc_and_port#*:}" + CURRENT_SHARD_HOST_OR_PORT_LIST+=(":${svc_port}") + fi + echo "pod_name: $pod_name, svc_and_port: $svc_and_port" + done + # check length of CURRENT_SHARD_ANNOUNCE_IP_LIST must equal to CURRENT_POD_LIST + if [ ${#CURRENT_SHARD_HOST_OR_PORT_LIST[@]} -ne ${#CURRENT_POD_LIST[@]} ]; then + echo "Error: failed to get the pod ip list from KB_POD_LIST" + return 1 + fi + fi + + # process each node + while read -r line; do + local node_info + node_info=$(parse_node_line_info "$line") + local node_announce_ip node_fqdn node_port node_bus_port node_role + read -r node_announce_ip node_port node_bus_port node_role node_fqdn <<< "$node_info" + # determine if the node belongs to the current component + 
belong_current_comp=false + if [ "$network_mode" == "advertised_svc" ]; then + for i in "${CURRENT_SHARD_HOST_OR_PORT_LIST[@]}"; do + node_announce_info=":$node_port" + if ! is_empty "$CURRENT_SHARD_LB_ADVERTISED_PORT"; then + node_announce_info="$node_announce_ip:$node_port" + fi + if [[ "$i" == "$node_announce_info" ]]; then + belong_current_comp=true + break + fi + done + elif [ "$network_mode" == "host_network" ]; then + if contains "$node_port" "$SERVICE_PORT"; then + belong_current_comp=true + fi + elif contains "$node_fqdn" "$CURRENT_SHARD_COMPONENT_NAME"; then + belong_current_comp=true + fi + # build node entry based on network mode + local node_entry + node_entry=$(build_node_entry "$network_mode" "$node_announce_ip" "$node_fqdn" "$node_port" "$node_bus_port") + + # categorize nodes + categorize_node "$node_entry" "$node_role" "$belong_current_comp" + done <<< "$cluster_nodes_info" + + echo "current_comp_primary_node: ${current_comp_primary_node[*]}" + echo "current_comp_primary_fail_node: ${current_comp_primary_fail_node[*]}" + echo "current_comp_other_nodes: ${current_comp_other_nodes[*]}" + echo "other_comp_primary_nodes: ${other_comp_primary_nodes[*]}" + echo "other_comp_primary_fail_nodes: ${other_comp_primary_fail_nodes[*]}" + echo "other_comp_other_nodes: ${other_comp_other_nodes[*]}" +} + +# Note: During rebuild-instance, a new PVC is created without existing data and having the rebuild.flag file. +# Therefore, we must rejoin this instance to the cluster as a secondary node. +is_rebuild_instance() { + # Early return if rebuild flag doesn't exist + [[ ! -f /data/rebuild.flag ]] && return 1 + + # Check if nodes.conf exists + if [[ ! 
-f /data/nodes.conf ]]; then + echo "Rebuild instance detected: nodes.conf missing" + return 0 + fi + + # Check if nodes.conf contains only one node + if [[ $(grep -c ":" /data/nodes.conf) -eq 1 ]]; then + echo "Rebuild instance detected: single node configuration" + return 0 + fi + + return 1 +} + +remove_rebuild_instance_flag() { + if [ -f /data/rebuild.flag ]; then + rm -f /data/rebuild.flag + echo "remove rebuild.flag file succeeded!" + fi +} + +# scale out replica of redis cluster shard if needed +scale_redis_cluster_replica() { + # Waiting for redis-server to start + check_current_ready_ip="127.0.0.1" + if [ -n "$redis_announce_host_value" ]; then + check_current_ready_ip=$redis_announce_host_value + fi + if check_redis_server_ready_with_retry "127.0.0.1" "$service_port"; then + echo "Redis server is ready, continue to scale out replica..." + else + echo "Redis server is not ready, exit scale out replica..." >&2 + exit 1 + fi + + if [ -f /data/nodes.conf ]; then + echo "the nodes.conf file after redis server start:" + cat /data/nodes.conf + else + echo "the nodes.conf file after redis server start is not exist" + fi + + for target_node_name in $(echo "${CURRENT_SHARD_POD_NAME_LIST}" | tr ',' '\n'); do + if [ -f /data/rebuild.flag ] && [ "${CURRENT_POD_NAME}" == "${target_node_name}" ]; then + continue + fi + target_node_fqdn=$(get_target_pod_fqdn_from_pod_fqdn_vars "$CURRENT_SHARD_POD_FQDN_LIST" "$target_node_name") + if is_empty "$target_node_fqdn"; then + echo "Error: Failed to get target node fqdn from current shard pod fqdn list: $CURRENT_SHARD_POD_FQDN_LIST. Exiting." >&2 + exit 1 + fi + # get the current component nodes for scale out replica + get_current_comp_nodes_for_scale_out_replica "$target_node_fqdn" "$service_port" + if [ $? 
-eq 0 ]; then + break + fi + done + + # check current_comp_primary_node is empty or not + if [ ${#current_comp_primary_node[@]} -eq 0 ]; then + if is_rebuild_instance; then + echo "current instance is a rebuild-instance, the current shard primary cannot be empty, please check the cluster status" >&2 + shutdown_redis_server "$service_port" + exit 1 + fi + if [ ${#current_comp_primary_fail_node[@]} -eq 0 ]; then + echo "current_comp_primary_node is empty, skip scale out replica" + exit 0 + fi + # if current_comp_primary_node is empty, use current_comp_primary_fail_node instead + current_comp_primary_node=("${current_comp_primary_fail_node[@]}") + fi + + # primary_node_info value format: cluster_announce_ip#pod_fqdn#endpoint:port@bus_port + primary_node_info=${current_comp_primary_node[0]} + primary_node_endpoint_with_port=$(echo "$primary_node_info" | awk -F '@' '{print $1}' | awk -F '#' '{print $3}') + primary_node_endpoint=$(echo "$primary_node_endpoint_with_port" | awk -F ':' '{print $1}') + primary_node_port=$(echo "$primary_node_endpoint_with_port" | awk -F ':' '{print $2}') + primary_node_fqdn=$(echo "$primary_node_info" | awk -F '#' '{print $2}') + primary_node_bus_port=$(echo "$primary_node_info" | awk -F '@' '{print $2}') + primary_node_endpoint_for_meet="$primary_node_endpoint" + if [ "$network_mode" == "default" ]; then + primary_node_endpoint_for_meet="$primary_node_fqdn" + fi + if contains "$primary_node_fqdn" "$CURRENT_POD_NAME" || contains "$primary_node_info" "$current_node_host_info"; then + echo "Current pod $CURRENT_POD_NAME is primary node, check and correct other primary nodes..." + check_and_meet_other_primary_nodes "$primary_node_endpoint_for_meet" "$primary_node_port" + echo "Node $CURRENT_POD_NAME is already in the cluster, skipping scale out replica..." + exit 0 + fi + # if the current pod is not a rebuild-instance and is already in the cluster, skip scale out replica + if ! 
is_rebuild_instance && check_node_in_cluster_with_retry "$primary_node_endpoint_for_meet" "$primary_node_port" "$current_node_host_info"; then + # if current pod is primary node, check the others primary info, if the others primary node info is expired, send cluster meet command again + echo "Current pod $CURRENT_POD_NAME is a secondary node, check and meet current primary node..." + check_and_meet_current_primary_node "$primary_node_endpoint_for_meet" "$primary_node_port" "$primary_node_bus_port" + echo "Node $CURRENT_POD_NAME is already in the cluster, skipping scale out replica..." + exit 0 + fi + + # Forget fail node when cluster is ok + # forget_fail_node_when_cluster_is_ok "$primary_node_endpoint_for_meet" "$primary_node_port" + + # add the current node as a replica of the primary node + primary_node_cluster_id=$(get_cluster_id_with_retry "$primary_node_endpoint_for_meet" "$primary_node_port") + status=$? + if is_empty "$primary_node_cluster_id" || [ $status -ne 0 ]; then + echo "Failed to get the cluster id of the primary node $primary_node_endpoint_with_port, sleep 30s for waiting next pod to start" >&2 + sleep 30s + shutdown_redis_server "$service_port" + exit 1 + fi + # current_node_with_port do not use advertised svc and port, because advertised svc and port are not ready when Pod is not Ready. + current_pod_fqdn=$(get_target_pod_fqdn_from_pod_fqdn_vars "$CURRENT_SHARD_POD_FQDN_LIST" "$CURRENT_POD_NAME") + if is_rebuild_instance; then + echo "Current instance is a rebuild-instance, forget node id in the cluster firstly." 
+ node_id=$(get_cluster_id_with_retry "$primary_node_endpoint_for_meet" "$primary_node_port" "$current_node_host_info") + if [ -z ${REDIS_DEFAULT_PASSWORD} ]; then + redis-cli $REDIS_CLI_TLS_CMD -p $service_port --cluster call $primary_node_endpoint_with_port cluster forget ${node_id} + else + redis-cli $REDIS_CLI_TLS_CMD -p $service_port --cluster call $primary_node_endpoint_with_port cluster forget ${node_id} -a ${REDIS_DEFAULT_PASSWORD} + fi + fi + current_node_with_port="$current_pod_fqdn:$service_port" + replicated_output=$(secondary_replicated_to_primary "$current_node_with_port" "$primary_node_endpoint_with_port" "$primary_node_cluster_id") + status=$? + if [ $status -ne 0 ] ; then + if is_rebuild_instance && contains "$replicated_output" "is not empty"; then + echo "Current instance is a rebuild-instance, but the node already knows other nodes (check with CLUSTER NODES) or contains some key in database 0, shutdown redis server..." >&2 + shutdown_redis_server + exit 1 + elif contains "$replicated_output" "is not empty"; then + echo "Replica is not empty, Either the node already knows other nodes (check with CLUSTER NODES) or contains some key in database 0" + elif [[ $replicated_output == *"Not all 16384 slots are covered by nodes"* ]]; then + # shutdown the redis server if the cluster is not fully covered by nodes + echo "Not all 16384 slots are covered by nodes, shutdown redis server" >&2 + shutdown_redis_server + exit 1 + else + echo "Failed to add the node $current_pod_fqdn to the cluster in scale_redis_cluster_replica, Error message: $replicated_output, shutdown redis server" >&2 + shutdown_redis_server "$service_port" + exit 1 + fi + fi + + if is_rebuild_instance; then + echo "replicate the node $current_pod_fqdn to the primary node $primary_node_endpoint_with_port successfully in rebuild-instance, remove rebuild.flag file..." 
+ remove_rebuild_instance_flag + fi + + # Hacky: When the entire redis cluster is restarted, a hacky sleep is used to wait for all primaries to enter the restarting state + sleep_when_ut_mode_false 5 + + # cluster meet the primary node until the current node is successfully added to the cluster + current_primary_met=false + declare -A other_primary_met + for node_info in "${other_comp_primary_nodes[@]}"; do + other_primary_met["$node_info"]=false + done + while true; do + all_met=true + + # meet current component primary node if not met yet + if ! $current_primary_met; then + if scale_out_replica_send_meet "$primary_node_endpoint_for_meet" "$primary_node_port" "$primary_node_bus_port" "$current_node_host_info"; then + echo "Successfully meet the primary node $primary_node_endpoint_with_port in scale_redis_cluster_replica" + current_primary_met=true + else + echo "Failed to meet current primary node $primary_node_endpoint_with_port" + all_met=false + fi + fi + + # meet the other components primary nodes if not met yet + for node_info in "${other_comp_primary_nodes[@]}"; do + if [ "${other_primary_met[$node_info]}" = false ]; then + node_endpoint_with_port=$(echo "$node_info" | awk -F '@' '{print $1}' | awk -F '#' '{print $3}') + node_endpoint=$(echo "$node_endpoint_with_port" | awk -F ':' '{print $1}') + node_port=$(echo "$node_endpoint_with_port" | awk -F ':' '{print $2}') + node_bus_port=$(echo "$node_info" | awk -F '@' '{print $2}') + node_fqdn=$(echo "$node_info" | awk -F '#' '{print $2}') + node_endpoint_for_meet="$node_endpoint" + if [ "$network_mode" == "default" ]; then + node_endpoint_for_meet="$node_fqdn" + fi + if scale_out_replica_send_meet "$node_endpoint_for_meet" "$node_port" "$node_bus_port" "$current_node_host_info"; then + echo "Successfully meet the primary node $node_endpoint_with_port in scale_redis_cluster_replica" + other_primary_met["$node_info"]=true + else + echo "Failed to meet the other component primary node $node_endpoint_with_port in 
scale_redis_cluster_replica" >&2 + all_met=false + fi + fi + done + + # If all nodes are met successfully, break the loop + if $all_met && $current_primary_met; then + echo "All primary nodes have been successfully met" + break + fi + + sleep_when_ut_mode_false 3 + done +} + +scale_out_replica_send_meet() { + local node_endpoint_to_meet="$1" + local node_port_to_meet="$2" + local node_bus_port_to_meet="$3" + local node_to_join="$4" + + if check_node_in_cluster "$node_endpoint_to_meet" "$node_port_to_meet" "$node_to_join"; then + echo "Node $CURRENT_POD_NAME is successfully added to the cluster." + return 0 + fi + + node_cluster_announce_ip=$(get_cluster_announce_ip_with_retry "$node_endpoint_to_meet" "$node_port_to_meet") + # send cluster meet command to the target node + if send_cluster_meet_with_retry "127.0.0.1" "$service_port" "$node_cluster_announce_ip" "$node_port_to_meet" "$node_bus_port_to_meet"; then + echo "scale out replica meet the node $node_cluster_announce_ip successfully..." + else + echo "Failed to meet the node $node_endpoint_to_meet in scale_redis_cluster_replica, shutdown redis server" >&2 + return 1 + fi + + return 0 +} + +load_redis_template_conf() { + echo "include $redis_template_conf" >> $redis_real_conf +} + +build_redis_default_accounts() { + unset_xtrace_when_ut_mode_false + if ! is_empty "$REDIS_REPL_PASSWORD"; then + echo "masteruser $REDIS_REPL_USER" >> $redis_real_conf + echo "masterauth $REDIS_REPL_PASSWORD" >> $redis_real_conf + redis_repl_password_sha256=$(echo -n "$REDIS_REPL_PASSWORD" | sha256sum | cut -d' ' -f1) + echo "user $REDIS_REPL_USER on +psync +replconf +ping #$redis_repl_password_sha256" >> $redis_acl_file + fi + if ! 
is_empty "$REDIS_DEFAULT_PASSWORD"; then + echo "protected-mode yes" >> $redis_real_conf + redis_password_sha256=$(echo -n "$REDIS_DEFAULT_PASSWORD" | sha256sum | cut -d' ' -f1) + echo "user default on #$redis_password_sha256 ~* &* +@all " >> $redis_acl_file + else + echo "protected-mode no" >> $redis_real_conf + fi + set_xtrace_when_ut_mode_false + echo "aclfile /data/users.acl" >> $redis_real_conf + echo "build redis default accounts succeeded!" +} + +rebuild_redis_acl_file() { + if [ -f $redis_acl_file ]; then + sed "/user default on/d" $redis_acl_file > $redis_acl_file_bak && mv $redis_acl_file_bak $redis_acl_file + sed "/user $REDIS_REPL_USER on/d" $redis_acl_file > $redis_acl_file_bak && mv $redis_acl_file_bak $redis_acl_file + sed "/user $REDIS_SENTINEL_USER on/d" $redis_acl_file > $redis_acl_file_bak && mv $redis_acl_file_bak $redis_acl_file + else + touch $redis_acl_file + fi +} + +build_announce_ip_and_port() { + # build announce ip and port according to whether the advertised svc is enabled + if ! is_empty "$redis_announce_host_value" && ! is_empty "$redis_announce_port_value"; then + echo "redis use advertised svc $redis_announce_host_value:$redis_announce_port_value to announce" + { + echo "replica-announce-port $redis_announce_port_value" + echo "replica-announce-ip $redis_announce_host_value" + } >> $redis_real_conf + elif [ "$FIXED_POD_IP_ENABLED" == "true" ]; then + echo "redis use fixed pod ip: $CURRENT_POD_IP to announce" + echo "replica-announce-ip $CURRENT_POD_IP" >> $redis_real_conf + else + current_pod_fqdn=$(get_target_pod_fqdn_from_pod_fqdn_vars "$CURRENT_SHARD_POD_FQDN_LIST" "$CURRENT_POD_NAME") + if is_empty "$current_pod_fqdn"; then + echo "Error: Failed to get current pod: $CURRENT_POD_NAME fqdn from current shard pod fqdn list: $CURRENT_SHARD_POD_FQDN_LIST. Exiting." 
+ exit 1 + fi + echo "redis use kb pod fqdn $current_pod_fqdn to announce" + echo "replica-announce-ip $current_pod_fqdn" >> $redis_real_conf + fi +} + +build_cluster_announce_info() { + current_pod_fqdn=$(get_target_pod_fqdn_from_pod_fqdn_vars "$CURRENT_SHARD_POD_FQDN_LIST" "$CURRENT_POD_NAME") + if is_empty "$current_pod_fqdn"; then + echo "Error: Failed to get current pod: $CURRENT_POD_NAME fqdn from current shard pod fqdn list: $CURRENT_SHARD_POD_FQDN_LIST. Exiting." + exit 1 + fi + current_node_host_info="$current_pod_fqdn" + # build announce ip and port according to whether the advertised svc is enabled + if ! is_empty "$redis_announce_host_value" && ! is_empty "$redis_announce_port_value" && ! is_empty "$redis_announce_bus_port_value"; then + current_node_host_info="$redis_announce_host_value:$redis_announce_port_value" + echo "redis cluster use advertised svc $redis_announce_host_value:$redis_announce_port_value@$redis_announce_bus_port_value to announce" + { + echo "cluster-announce-ip $redis_announce_host_value" + echo "cluster-announce-bus-port $redis_announce_bus_port_value" + # echo "cluster-announce-hostname $current_pod_fqdn" + echo "cluster-preferred-endpoint-type ip" + if [ "$TLS_ENABLED" == "true" ]; then + echo "cluster-announce-tls-port $redis_announce_port_value" + echo "cluster-announce-port 0" + else + echo "cluster-announce-port $redis_announce_port_value" + fi + } >> $redis_real_conf + elif [ "$FIXED_POD_IP_ENABLED" == "true" ]; then + echo "redis cluster use fixed pod ip: $CURRENT_POD_IP to announce" + { + echo "cluster-announce-ip $CURRENT_POD_IP" + echo "cluster-announce-hostname $current_pod_fqdn" + echo "cluster-preferred-endpoint-type ip" + } >> $redis_real_conf + else + echo "valkey cluster use pod fqdn $current_pod_fqdn to announce (preferring ip endpoint type)" + # Stream-Valkey divergence vs. 
upstream redis-cluster-server-start.sh: + # upstream emits `cluster-preferred-endpoint-type hostname` here, which + # makes CLUSTER SLOTS announce *.svc.cluster.local FQDNs that external + # clients (e.g. chat-api on EC2) cannot resolve. Force `ip` so cluster + # topology stays VPC-routable, matching the other two branches above. + { + echo "cluster-announce-ip $CURRENT_POD_IP" + echo "cluster-announce-hostname $current_pod_fqdn" + echo "cluster-preferred-endpoint-type ip" + } >> $redis_real_conf + fi +} + +build_redis_cluster_service_port() { + if ! is_empty "$SERVICE_PORT"; then + service_port=$SERVICE_PORT + fi + if ! is_empty "$CLUSTER_BUS_PORT"; then + cluster_bus_port=$CLUSTER_BUS_PORT + fi + if [ "$TLS_ENABLED" == "true" ]; then + echo "tls-port $service_port" >> $redis_real_conf + else + echo "port $service_port" >> $redis_real_conf + fi + echo "cluster-port $cluster_bus_port" >> $redis_real_conf +} + +parse_redis_cluster_shard_announce_addr() { + # The value format of CURRENT_SHARD_ADVERTISED_PORT and CURRENT_SHARD_ADVERTISED_BUS_PORT are "pod1Svc:advertisedPort1,pod2Svc:advertisedPort2,..." + if is_empty "$CURRENT_SHARD_ADVERTISED_PORT" || is_empty "$CURRENT_SHARD_ADVERTISED_BUS_PORT"; then + echo "Environment variable CURRENT_SHARD_ADVERTISED_PORT and CURRENT_SHARD_ADVERTISED_BUS_PORT not found. Ignoring." + # if redis cluster is in host network mode, use the host ip and port as the announce ip and port + if ! is_empty "${REDIS_CLUSTER_HOST_NETWORK_PORT}" && ! is_empty "${REDIS_CLUSTER_HOST_NETWORK_BUS_PORT}"; then + echo "redis cluster server is in host network mode, use the host ip:$CURRENT_POD_HOST_IP and port:$REDIS_CLUSTER_HOST_NETWORK_PORT, bus port:$REDIS_CLUSTER_HOST_NETWORK_BUS_PORT as the announce ip and port." 
+ redis_announce_port_value="$REDIS_CLUSTER_HOST_NETWORK_PORT" + redis_announce_bus_port_value="$REDIS_CLUSTER_HOST_NETWORK_BUS_PORT" + redis_announce_host_value="$CURRENT_POD_HOST_IP" + fi + return 0 + fi + + local pod_name="$CURRENT_POD_NAME" + local port + local bus_port + svc_and_port=$(parse_advertised_svc_and_port "$pod_name" "$CURRENT_SHARD_ADVERTISED_PORT" "true") + status=$? + if [[ $status -ne 0 ]] || is_empty "$svc_and_port"; then + echo "Exiting due to error in CURRENT_SHARD_ADVERTISED_PORT." + exit 1 + fi + + bus_port=$(parse_advertised_svc_and_port "$pod_name" "$CURRENT_SHARD_ADVERTISED_BUS_PORT") + status=$? + if [[ $status -ne 0 ]] || is_empty "$bus_port"; then + echo "Exiting due to error in CURRENT_SHARD_ADVERTISED_BUS_PORT." + exit 1 + fi + redis_announce_port_value="${svc_and_port#*:}" + redis_announce_bus_port_value="$bus_port" + svc_name=${svc_and_port%:*} + lb_host=$(extract_lb_host_by_svc_name "${svc_name}") + if [ -n "$lb_host" ]; then + echo "Found load balancer host for svcName '$svc_name', value is '$lb_host'." 
+ redis_announce_host_value="$lb_host" + redis_announce_port_value="6379" + redis_announce_bus_port_value="16379" + else + redis_announce_host_value="$CURRENT_POD_HOST_IP" + fi +} + +start_redis_server() { + module_path="/opt/redis-stack/lib" + if [[ "$IS_REDIS8" == "true" ]]; then + module_path="/usr/local/lib/redis/modules" + fi + exec_cmd="exec redis-server /etc/redis/redis.conf" + if [ -f ${module_path}/redisearch.so ]; then + exec_cmd="$exec_cmd --loadmodule ${module_path}/redisearch.so ${REDISEARCH_ARGS}" + fi + if [ -f ${module_path}/redistimeseries.so ]; then + exec_cmd="$exec_cmd --loadmodule ${module_path}/redistimeseries.so ${REDISTIMESERIES_ARGS}" + fi + if [ -f ${module_path}/rejson.so ]; then + exec_cmd="$exec_cmd --loadmodule ${module_path}/rejson.so ${REDISJSON_ARGS}" + fi + if [ -f ${module_path}/redisbloom.so ]; then + exec_cmd="$exec_cmd --loadmodule ${module_path}/redisbloom.so ${REDISBLOOM_ARGS}" + fi + if [ -f ${module_path}/redisgraph.so ]; then + exec_cmd="$exec_cmd --loadmodule ${module_path}/redisgraph.so ${REDISGRAPH_ARGS}" + fi + if [ -f ${module_path}/rediscompat.so ]; then + exec_cmd="$exec_cmd --loadmodule ${module_path}/rediscompat.so" + fi + # NOTE: in replication mode, load this module will lead a memory leak for slave instance. + #if [ -f ${module_path}/redisgears.so ]; then + # exec_cmd="$exec_cmd --loadmodule ${module_path}/redisgears.so v8-plugin-path ${module_path}/libredisgears_v8_plugin.so ${REDISGEARS_ARGS}" + #fi + echo "Starting redis server cmd: $exec_cmd" + eval "$exec_cmd" +} + +# build redis cluster configuration redis.conf +build_redis_conf() { + load_redis_template_conf + build_redis_cluster_service_port + build_announce_ip_and_port + build_cluster_announce_info + rebuild_redis_acl_file + build_redis_default_accounts +} + +# This is magic for shellspec ut framework. +# Sometime, functions are defined in a single shell script. +# You will want to test it. but you do not want to run the script. 
+# When included from shellspec, __SOURCED__ variable defined and script +# end here. The script path is assigned to the __SOURCED__ variable. +${__SOURCED__:+false} : || return 0 + +init_environment +load_redis_cluster_common_utils +parse_redis_cluster_shard_announce_addr +build_redis_conf +# TODO: move to memberJoin action in the future +scale_redis_cluster_replica & +start_redis_server diff --git a/addons/valkey/valkey-cluster-scripts/valkey-cluster-switchover.sh b/addons/valkey/valkey-cluster-scripts/valkey-cluster-switchover.sh new file mode 100644 index 000000000..bb1bc9808 --- /dev/null +++ b/addons/valkey/valkey-cluster-scripts/valkey-cluster-switchover.sh @@ -0,0 +1,255 @@ +#!/bin/bash + +# This is magic for shellspec ut framework. "test" is a `test [expression]` well known as a shell command. +# Normally test without [expression] returns false. It means that __() { :; } +# function is defined if this script runs directly. +# +# shellspec overrides the test command and returns true *once*. It means that +# __() function defined internally by shellspec is called. +# +# In other words. If not in test mode, __ is just a comment. If test mode, __ +# is a interception point. +# +# you should set ut_mode="true" when you want to run the script in shellspec file. +# +# shellcheck disable=SC2034 +# shellcheck disable=SC2153 +# shellcheck disable=SC1090 +ut_mode="false" +test || __() { + # when running in non-unit test mode, set the options "set -ex". 
+ set -ex; +} + +load_redis_cluster_common_utils() { + # the common.sh and valkey-cluster-common.sh scripts are defined in the valkey-cluster-scripts-template configmap + # and are mounted to the same path which is defined in the cmpd.spec.scripts + kblib_common_library_file="/scripts/common.sh" + redis_cluster_common_library_file="/scripts/valkey-cluster-common.sh" + source "${kblib_common_library_file}" + source "${redis_cluster_common_library_file}" +} + +check_environment_exist() { + local required_vars=( + "CURRENT_SHARD_POD_NAME_LIST" + "CURRENT_SHARD_POD_FQDN_LIST" + ) + + if [[ ${COMPONENT_REPLICAS} -lt 2 ]]; then + exit 0 + fi + + for var in "${required_vars[@]}"; do + if is_empty "${!var}"; then + echo "Error: Required environment variable $var is not set." >&2 + return 1 + fi + done + + if [ "$KB_SWITCHOVER_ROLE" != "primary" ]; then + echo "switchover not triggered for primary, nothing to do, exit 0" + exit 0 + fi +} + +init_redis_cluster_service_port() { + service_port=6379 + if [ -n "$SERVICE_PORT" ]; then + service_port=$SERVICE_PORT + fi +} + +get_current_shard_primary() { + local host=$1 + local port=$2 + local master_info + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + master_info=$(redis-cli $REDIS_CLI_TLS_CMD -h $host -p $port info replication) + else + master_info=$(redis-cli $REDIS_CLI_TLS_CMD -h $host -p $port -a "$REDIS_DEFAULT_PASSWORD" info replication) + fi + set_xtrace_when_ut_mode_false + + local master_host + local master_port + + master_host=$(echo "$master_info" | grep "master_host:" | cut -d':' -f2 | tr -d '[:space:]') + master_port=$(echo "$master_info" | grep "master_port:" | cut -d':' -f2 | tr -d '[:space:]') + + if is_empty "$master_host"|| is_empty "$master_port"; then + return 1 + fi + + echo "$master_host:$master_port" +} + +get_all_shards_master() { + local host=$1 + local port=$2 + local cluster_nodes_info + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + 
cluster_nodes_info=$(redis-cli $REDIS_CLI_TLS_CMD -h $host -p $port cluster nodes) + else + cluster_nodes_info=$(redis-cli $REDIS_CLI_TLS_CMD -h $host -p $port -a "$REDIS_DEFAULT_PASSWORD" cluster nodes) + fi + set_xtrace_when_ut_mode_false + + echo "$cluster_nodes_info" | grep "master" | grep -v "fail" | while read -r line; do + node_addr=$(echo "$line" | cut -d' ' -f2 | cut -d'@' -f1) + echo "$node_addr" + done +} + +do_switchover() { + candidate_pod=$1 + candidate_pod_fqdn=$2 + need_check=$3 + + # check candidate pod is ready and has the role of secondary + role=$(check_redis_role "$candidate_pod_fqdn" $service_port) + if [ "$role" = "primary" ]; then + echo "Info: Candidate pod $candidate_pod is already a primary" + exit 0 + fi + if ! equals "$role" "secondary"; then + echo "Error: Candidate pod $candidate_pod is not a secondary" >&2 + return 1 + fi + + # get current shard primary + current_shard_primary=$(get_current_shard_primary "$candidate_pod_fqdn" $service_port) + if is_empty "$current_shard_primary"; then + echo "Error: Could not determine current shard primary for $candidate_pod" >&2 + return 1 + fi + + # check cluster health from current shard primary + if ! 
check_slots_covered "$current_shard_primary" $service_port; then + echo "Error: Cluster health check failed" >&2 + return 1 + fi + + # check if candidate is known by all the shards primary + current_shard_primary_host=$(echo "$current_shard_primary" | cut -d':' -f1) + current_shard_primary_port=$(echo "$current_shard_primary" | cut -d':' -f2) + if is_empty "$current_shard_primary_host" || is_empty "$current_shard_primary_port"; then + echo "Error: Could not determine current shard primary host and port" >&2 + return 1 + fi + primaries=$(get_all_shards_master "$current_shard_primary_host" $current_shard_primary_port) + candidate_node_id=$(get_cluster_id "$candidate_pod_fqdn" $service_port) + for primary in $primaries; do + primary_host=$(echo "$primary" | cut -d':' -f1) + primary_port=$(echo "$primary" | cut -d':' -f2) + if ! check_node_in_cluster_with_retry "$primary_host" $primary_port "$candidate_node_id"; then + echo "Error: Candidate $candidate_pod is not known by shard $primary" >&2 + return 1 + fi + done + + # do switchover + echo "Starting switchover to $candidate_pod" + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + result=$(redis-cli $REDIS_CLI_TLS_CMD -h "$candidate_pod_fqdn" -p $service_port cluster failover) + else + result=$(redis-cli $REDIS_CLI_TLS_CMD -h "$candidate_pod_fqdn" -p $service_port -a "$REDIS_DEFAULT_PASSWORD" cluster failover) + fi + if [ "$need_check" != "true" ]; then + return 0 + fi + set_xtrace_when_ut_mode_false + if [ "$result" != "OK" ]; then + echo "Error: Cluster Failover command failed with result: $result" >&2 + return 1 + fi + + # check switchover result + max_attempts=60 + attempt=0 + while [ $attempt -lt $max_attempts ]; do + role=$(check_redis_role "$candidate_pod_fqdn" $service_port) + if [ "$role" = "primary" ]; then + echo "Switchover successful: $candidate_pod is now primary" + return 0 + fi + sleep 2 + ((attempt++)) + done + + echo "Error: Switchover verification timeout" >&2 + return 1 
+} + +switchover_without_candidate() { + candidate_pod="" + candidate_pod_fqdn="" + # check if the current node is removed from the cluster or not + cluster_nodes_info=$(get_cluster_nodes_info "$CURRENT_POD_IP" "$service_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get cluster nodes info " >&2 + return 1 + fi + #if current pod has been removed from cluster by redis-cluster-replica-member-leave.sh, and become an primary by dbctl, cluster nodes command return one line + if [ "$(echo "$cluster_nodes_info" | wc -l)" -le 1 ]; then + echo "this pos has been successfully removed replica from shard,no need to perform switch over." + return + fi + + # get the one of secondary pod of current shard + # TODO: get the most suitable secondary pod which has the lowest latency + IFS=',' read -ra PODS <<< "$CURRENT_SHARD_POD_NAME_LIST" + for pod_name in "${PODS[@]}"; do + local pod_fqdn + pod_fqdn=$(get_target_pod_fqdn_from_pod_fqdn_vars "$CURRENT_SHARD_POD_FQDN_LIST" "$pod_name") || { + echo "Failed to get FQDN for pod: $pod_name" >&2 + return 1 + } + role=$(check_redis_role "$pod_fqdn" $service_port) + if [ "$role" = "secondary" ]; then + candidate_pod=$pod_name + candidate_pod_fqdn=$pod_fqdn + break + fi + done + + if is_empty "$candidate_pod"; then + echo "Error: No eligible secondary found in pod list: $CURRENT_SHARD_POD_NAME_LIST" >&2 + return 1 + fi + + # do switchover + do_switchover "$candidate_pod" "$candidate_pod_fqdn" "false" || return 1 +} + +switchover_with_candidate() { + # check KB_SWITCHOVER_CANDIDATE_FQDN and KB_SWITCHOVER_CANDIDATE_NAME are not empty + if is_empty "$KB_SWITCHOVER_CANDIDATE_FQDN" || is_empty "$KB_SWITCHOVER_CANDIDATE_NAME"; then + echo "KB_SWITCHOVER_CANDIDATE_NAME or KB_SWITCHOVER_CANDIDATE_FQDN is empty" >&2 + return 1 + fi + + # do switchover + do_switchover "$KB_SWITCHOVER_CANDIDATE_NAME" "$KB_SWITCHOVER_CANDIDATE_FQDN" "true" || return 1 +} + +# This is magic for shellspec ut framework. 
+# Sometime, functions are defined in a single shell script. +# You will want to test it. but you do not want to run the script. +# When included from shellspec, __SOURCED__ variable defined and script +# end here. The script path is assigned to the __SOURCED__ variable. +${__SOURCED__:+false} : || return 0 + +# main +load_redis_cluster_common_utils +check_environment_exist || exit 1 +init_redis_cluster_service_port +if is_empty "$KB_SWITCHOVER_CANDIDATE_FQDN"; then + switchover_without_candidate || exit 1 +else + switchover_with_candidate || exit 1 +fi diff --git a/addons/valkey/valkey-cluster-scripts/valkey-ping.sh b/addons/valkey/valkey-cluster-scripts/valkey-ping.sh new file mode 100755 index 000000000..811c496f9 --- /dev/null +++ b/addons/valkey/valkey-cluster-scripts/valkey-ping.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# This is magic for shellspec ut framework. "test" is a `test [expression]` well known as a shell command. +# Normally test without [expression] returns false. It means that __() { :; } +# function is defined if this script runs directly. +# +# shellspec overrides the test command and returns true *once*. It means that +# __() function defined internally by shellspec is called. +# +# In other words. If not in test mode, __ is just a comment. If test mode, __ +# is an interception point. +# +# you should set ut_mode="true" when you want to run the script in shellspec file. +# +# shellcheck disable=SC2034 +ut_mode="false" +test || __() { + # when running in non-unit test mode, set the options "set -ex". + set -ex; +} + +load_common_library() { + # the common.sh script is mounted to the same path which is defined in the cmpd.spec.scripts + common_library_file="/scripts/common.sh" + # shellcheck disable=SC1090 + source "${common_library_file}" +} + +check_redis_ok() { + unset_xtrace_when_ut_mode_false + service_port=${SERVICE_PORT:-6379} + if ! 
is_empty "$REDIS_DEFAULT_PASSWORD"; then + cmd="redis-cli $REDIS_CLI_TLS_CMD -h localhost -p $service_port -a $REDIS_DEFAULT_PASSWORD ping" + else + cmd="redis-cli $REDIS_CLI_TLS_CMD -h localhost -p $service_port ping" + fi + response=$($cmd) + status=$? + set_xtrace_when_ut_mode_false + if [ $status -eq 124 ]; then + echo "Timed out" >&2 + return 1 + fi + if [ "$response" != "PONG" ]; then + echo "redis ping failed, response: $response" >&2 + return 1 + fi + echo "Redis is ok" +} + +retry_check_redis_ok() { + if call_func_with_retry 5 3 check_redis_ok; then + return 0 + else + echo "Redis is not running." >&2 + return 1 + fi +} + +# This is magic for shellspec ut framework. +# Sometime, functions are defined in a single shell script. +# You will want to test it. but you do not want to run the script. +# When included from shellspec, __SOURCED__ variable defined and script +# end here. The script path is assigned to the __SOURCED__ variable. +${__SOURCED__:+false} : || return 0 + +# main +load_common_library +retry_check_redis_ok || exit 1 diff --git a/addons/valkey/values.yaml b/addons/valkey/values.yaml new file mode 100644 index 000000000..1720fa17b --- /dev/null +++ b/addons/valkey/values.yaml @@ -0,0 +1,57 @@ +# Default values for the valkey addon. +# Single Valkey major version (9.x); add new entries here when adopting newer +# patch / minor versions. Cluster topology only — no sentinel, no twemproxy. + +nameOverride: "" +fullnameOverride: "" + +# Valkey versions: each entry produces a release line in ComponentVersion. +# `serviceVersion` is what users select in their Cluster CR; `imageTag` is +# the docker.io/valkey/valkey tag. Add a new entry to support a new patch +# without changing existing clusters. 
+valkeyVersions: + - major: "9" + componentDef: "valkey-cluster-9" + serviceVersion: "9.0.3" + defaultImageTag: "9.0.3" + mirrorVersions: + - version: "9.0.3" + imageTag: "9.0.3" + - version: "9.1.0" + imageTag: "9.1.0" + +image: + registry: docker.io + repository: valkey/valkey + pullPolicy: IfNotPresent + +# dbctl + agamotto stay on apecloud images — they are KubeBlocks-side +# tooling, not the engine. +dbctlImage: + registry: "" + repository: apecloud/dbctl + pullPolicy: IfNotPresent + tag: 0.2.1 + +metrics: + image: + registry: "" + repository: oliver006/redis_exporter + tag: v1.80.1 + pullPolicy: IfNotPresent + service: + port: 9121 + serverPort: 8888 + +# defined the data volume mount path of valkey server +dataMountPath: /data + +logConfigs: + running: /data/running.log + +# cluster domain without . prefix +clusterDomain: "cluster.local" + +enableMetrics: true + +tlsMountPath: /etc/pki/tls From 3b894f7e5643bf0ba2e61105334fc80d6517c8dd Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Wed, 6 May 2026 14:17:46 +0200 Subject: [PATCH 02/11] chore(valkey): expand 9.0.x mirror versions; bump appVersion to 9.0.4 Add 9.0.0, 9.0.1, 9.0.2, 9.0.4 alongside existing 9.0.3 / 9.1.0 in ComponentVersion releases. 9.0.4 (released 2026-05-06) becomes the chart appVersion and the default `serviceVersion` on the ComponentDefinition. The full 9.0.x range gives operators a pinned set of options for OpsRequest type=Upgrade rollback / patch-version testing without needing to redeploy the addon. Same-image-tag mapping; no behavioural change. 
--- addons/valkey/Chart.yaml | 2 +- addons/valkey/values.yaml | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/addons/valkey/Chart.yaml b/addons/valkey/Chart.yaml index 93321aa6c..5db68fd16 100644 --- a/addons/valkey/Chart.yaml +++ b/addons/valkey/Chart.yaml @@ -6,7 +6,7 @@ type: application version: 0.1.0 -appVersion: "9.0.3" +appVersion: "9.0.4" # Add a dependency to the kubeblocks definition library chart, same as the redis addon. dependencies: diff --git a/addons/valkey/values.yaml b/addons/valkey/values.yaml index 1720fa17b..7540bb0c2 100644 --- a/addons/valkey/values.yaml +++ b/addons/valkey/values.yaml @@ -12,11 +12,19 @@ fullnameOverride: "" valkeyVersions: - major: "9" componentDef: "valkey-cluster-9" - serviceVersion: "9.0.3" - defaultImageTag: "9.0.3" + serviceVersion: "9.0.4" + defaultImageTag: "9.0.4" mirrorVersions: + - version: "9.0.0" + imageTag: "9.0.0" + - version: "9.0.1" + imageTag: "9.0.1" + - version: "9.0.2" + imageTag: "9.0.2" - version: "9.0.3" imageTag: "9.0.3" + - version: "9.0.4" + imageTag: "9.0.4" - version: "9.1.0" imageTag: "9.1.0" From 5f30aec1defec16cf67e399d7561abcb461308b3 Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Wed, 6 May 2026 14:26:27 +0200 Subject: [PATCH 03/11] chore(valkey): drop 9.1.0 from mirror versions 9.1.0 is still RC upstream and not yet a tagged release on docker.io/valkey/valkey. Keep ComponentVersion to the stable 9.0.x line (9.0.0 - 9.0.4) for now; re-add 9.1.0 once the GA tag ships. 
--- addons/valkey/values.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/addons/valkey/values.yaml b/addons/valkey/values.yaml index 7540bb0c2..0a020c950 100644 --- a/addons/valkey/values.yaml +++ b/addons/valkey/values.yaml @@ -25,8 +25,6 @@ valkeyVersions: imageTag: "9.0.3" - version: "9.0.4" imageTag: "9.0.4" - - version: "9.1.0" - imageTag: "9.1.0" image: registry: docker.io From 64159e4e86df02998f50458ef993e05d36dda234 Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Fri, 8 May 2026 13:18:28 +0200 Subject: [PATCH 04/11] dropped the 'reconfigure' field --- addons/valkey/templates/cmpd-valkey-cluster.yaml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/addons/valkey/templates/cmpd-valkey-cluster.yaml b/addons/valkey/templates/cmpd-valkey-cluster.yaml index c9c39b46a..f41745ade 100644 --- a/addons/valkey/templates/cmpd-valkey-cluster.yaml +++ b/addons/valkey/templates/cmpd-valkey-cluster.yaml @@ -76,7 +76,15 @@ spec: namespace: {{ $.Release.Namespace }} volumeName: valkey-cluster-config externalManaged: true - {{- include "valkey.config.reconfigureAction" $ | nindent 6 }} + # NOTE: dropped the `reconfigure` field (helper: + # valkey.config.reconfigureAction) because the field was added to the + # ComponentDefinition CRD in KubeBlocks 1.1.x and is not in 1.0.2 (current + # stable). When upgrading the operator past 1.1.x, re-add: + # {{ "{{- include \"valkey.config.reconfigureAction\" $ | nindent 6 }}" }} + # Trade-off: without `reconfigure`, ConfigMap changes don't hot-reload via + # operator exec. Config changes take effect on the next pod restart (helm + # upgrade with resource/version delta, or `kubectl create -f + # charts/valkey/ops/restart.yaml`). Adequate for our cache-mode workload. 
scripts: - name: valkey-cluster-scripts template: {{ include "valkeyCluster.scriptsTemplate" $ }} From ed31ccc5a34bb37a6e4f50112fa5a8eb340240a4 Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Fri, 8 May 2026 14:14:16 +0200 Subject: [PATCH 05/11] fix: remove externalManaged --- .../valkey/templates/cmpd-valkey-cluster.yaml | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/addons/valkey/templates/cmpd-valkey-cluster.yaml b/addons/valkey/templates/cmpd-valkey-cluster.yaml index f41745ade..4b78d0f3e 100644 --- a/addons/valkey/templates/cmpd-valkey-cluster.yaml +++ b/addons/valkey/templates/cmpd-valkey-cluster.yaml @@ -75,16 +75,21 @@ spec: template: {{ printf "valkey-cluster-config-template-%s" $.Chart.Version }} namespace: {{ $.Release.Namespace }} volumeName: valkey-cluster-config - externalManaged: true - # NOTE: dropped the `reconfigure` field (helper: + # NOTE: do NOT set `externalManaged: true`. In KubeBlocks 1.0.2 the + # synthesizer (pkg/controller/component/synthesize_component.go) wipes + # `template` to "" whenever externalManaged is true and the Cluster CR + # does not supply a `componentSpecs[].configs[].configMap` override. + # The downstream `transformer_component_template.precheck` then fails + # with `config/script template has no template specified: valkey-cluster-config`. + # We don't expose user-overridable configs, so leaving this off lets the + # operator manage the chart-provided ConfigMap directly. + # + # NOTE: also dropped the `reconfigure` field (helper: # valkey.config.reconfigureAction) because the field was added to the - # ComponentDefinition CRD in KubeBlocks 1.1.x and is not in 1.0.2 (current - # stable). When upgrading the operator past 1.1.x, re-add: - # {{ "{{- include \"valkey.config.reconfigureAction\" $ | nindent 6 }}" }} - # Trade-off: without `reconfigure`, ConfigMap changes don't hot-reload via - # operator exec. 
Config changes take effect on the next pod restart (helm - # upgrade with resource/version delta, or `kubectl create -f - # charts/valkey/ops/restart.yaml`). Adequate for our cache-mode workload. + # ComponentDefinition CRD in KubeBlocks 1.1.x and is not in 1.0.2. + # ConfigMap changes won't hot-reload via operator exec — they take effect + # on the next pod restart (helm upgrade with resource/version delta, or + # `kubectl create -f charts/valkey/ops/restart.yaml`). Adequate for cache. scripts: - name: valkey-cluster-scripts template: {{ include "valkeyCluster.scriptsTemplate" $ }} From a35926b205bcaa667996c6179116da020007c6df Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Mon, 11 May 2026 13:36:29 +0200 Subject: [PATCH 06/11] feat: avoid all replicas or primaries gone at once in rolling updates --- addons/valkey/templates/shardingdefinition.yaml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/addons/valkey/templates/shardingdefinition.yaml b/addons/valkey/templates/shardingdefinition.yaml index ed1e71c8b..882b3a259 100644 --- a/addons/valkey/templates/shardingdefinition.yaml +++ b/addons/valkey/templates/shardingdefinition.yaml @@ -12,8 +12,18 @@ spec: shardsLimit: minShards: 1 maxShards: 64 + # provisionStrategy: how shards are added during initial cluster create or + # scale-out. Parallel is safe here — the bootstrap script ensures all shards + # converge before slot assignment runs. provisionStrategy: Parallel - updateStrategy: Parallel + # updateStrategy: how shards are processed during Upgrade/Restart/VScale + # OpsRequests. MUST be Serial for a Redis Cluster: if multiple shards roll + # in parallel, you can simultaneously lose every replica (then every + # primary), which breaks cluster bus quorum and leaves orphan/ghost nodes + # the heal CronJob can't recover from (no anchor pod in cluster_state:ok). 
+ # Within a shard, podManagementPolicy=OrderedReady on the cmpd already + # serializes replica → primary with CLUSTER FAILOVER in between. + updateStrategy: Serial systemAccounts: - name: default shared: true From e3c104b83ba1ea8b10beb81ad29d6e0e813644ce Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Mon, 11 May 2026 13:55:17 +0200 Subject: [PATCH 07/11] revert: kb 1.0.2 doesn't consume update strategy --- .../valkey/templates/shardingdefinition.yaml | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/addons/valkey/templates/shardingdefinition.yaml b/addons/valkey/templates/shardingdefinition.yaml index 882b3a259..3f59f1d23 100644 --- a/addons/valkey/templates/shardingdefinition.yaml +++ b/addons/valkey/templates/shardingdefinition.yaml @@ -12,18 +12,16 @@ spec: shardsLimit: minShards: 1 maxShards: 64 - # provisionStrategy: how shards are added during initial cluster create or - # scale-out. Parallel is safe here — the bootstrap script ensures all shards - # converge before slot assignment runs. + # NOTE: provisionStrategy + updateStrategy are declared/validated by KB 1.0.2 + # (`controllers/apps/shardingdefinition_controller.go:validateProvisionNUpdateStrategy`) + # but NOT actually consumed when reconciling shard rollouts. The cluster + # controller updates all sharding components in parallel regardless of this + # value. Verified by `grep -r SerialStrategy pkg/controller/` returning no + # consumer references. Leaving as Parallel to match the field's actual + # semantics in this KB version — see charts/valkey/README.md for what + # actually controls upgrade safety (spoiler: nothing, currently). provisionStrategy: Parallel - # updateStrategy: how shards are processed during Upgrade/Restart/VScale - # OpsRequests. 
MUST be Serial for a Redis Cluster: if multiple shards roll - # in parallel, you can simultaneously lose every replica (then every - # primary), which breaks cluster bus quorum and leaves orphan/ghost nodes - # the heal CronJob can't recover from (no anchor pod in cluster_state:ok). - # Within a shard, podManagementPolicy=OrderedReady on the cmpd already - # serializes replica → primary with CLUSTER FAILOVER in between. - updateStrategy: Serial + updateStrategy: Parallel systemAccounts: - name: default shared: true From 438d05389d21af3f3e8f369a929f541348ac17f1 Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Tue, 12 May 2026 15:06:09 +0200 Subject: [PATCH 08/11] fix: use REDIS_POD_FQDN_LIST for memberJoin --- .../valkey/valkey-cluster-scripts/sync-acl.sh | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/addons/valkey/valkey-cluster-scripts/sync-acl.sh b/addons/valkey/valkey-cluster-scripts/sync-acl.sh index 0b6bd0c31..b9ad7e098 100644 --- a/addons/valkey/valkey-cluster-scripts/sync-acl.sh +++ b/addons/valkey/valkey-cluster-scripts/sync-acl.sh @@ -1,4 +1,16 @@ #!/bin/bash +# +# Sync ACL rules from existing shard peers onto a newly-joined pod. Invoked +# by KubeBlocks as the memberJoin lifecycle action. +# +# Env vars in scope (we tolerate either naming so the script works under +# both 1.0.x and 1.1.x KB versions): +# - KB_JOIN_MEMBER_POD_FQDN — the pod being joined (injected by KB) +# - REDIS_POD_FQDN_LIST — historical upstream name (often unset) +# - CURRENT_SHARD_POD_FQDN_LIST — name our cmpd actually exposes +# +# If neither list is populated, we have no peers to query and there's +# nothing to sync. Exit 0 in that case rather than failing the join. 
service_port=${SERVICE_PORT:-6379} redis_base_cmd="redis-cli $REDIS_CLI_TLS_CMD -p $service_port -a $REDIS_DEFAULT_PASSWORD" @@ -6,10 +18,17 @@ if [ -z "$REDIS_DEFAULT_PASSWORD" ]; then redis_base_cmd="redis-cli $REDIS_CLI_TLS_CMD -p $service_port" fi +# Pick whichever peer list is populated; tolerate either name. +peer_list="${REDIS_POD_FQDN_LIST:-$CURRENT_SHARD_POD_FQDN_LIST}" +if [ -z "$peer_list" ]; then + echo "No peer FQDN list available (REDIS_POD_FQDN_LIST and CURRENT_SHARD_POD_FQDN_LIST both empty); nothing to sync, exiting 0" >&2 + exit 0 +fi + is_ok=false acl_list="" # 1. get acl list from other pods -for pod_fqdn in $(echo "$REDIS_POD_FQDN_LIST" | tr ',' '\n'); do +for pod_fqdn in $(echo "$peer_list" | tr ',' '\n'); do if [[ "$pod_fqdn" == "$KB_JOIN_MEMBER_POD_FQDN" ]]; then continue fi @@ -21,7 +40,7 @@ for pod_fqdn in $(echo "$REDIS_POD_FQDN_LIST" | tr ',' '\n'); do done if [ "$is_ok" = false ]; then - echo "Failed to get ACL LIST from other pods" >&2 + echo "Failed to get ACL LIST from any peer in: $peer_list" >&2 exit 1 fi From dc79ce268afce779bd976459322587f54f9cf976 Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Tue, 12 May 2026 21:07:07 +0200 Subject: [PATCH 09/11] feat: distinguishing rejoining pod --- .../valkey-cluster-server-start.sh | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/addons/valkey/valkey-cluster-scripts/valkey-cluster-server-start.sh b/addons/valkey/valkey-cluster-scripts/valkey-cluster-server-start.sh index 14a8d9527..cffb881af 100755 --- a/addons/valkey/valkey-cluster-scripts/valkey-cluster-server-start.sh +++ b/addons/valkey/valkey-cluster-scripts/valkey-cluster-server-start.sh @@ -352,6 +352,30 @@ remove_rebuild_instance_flag() { fi } +# is_rejoining_pod returns 0 if this pod's PVC has preserved cluster state +# from a prior incarnation — i.e. nodes.conf shows we were part of a +# multi-node Redis cluster. 
When EBS persistence is enabled, this is the +# normal case after any pod recreation (Karpenter reschedule, AMI bump, +# machine type change, AZ failover). The right thing to do is let redis +# start with its preserved identity and let the cluster bus gossip update +# peer addresses; the start script should NOT try to add-node or replicate +# this pod (which fails with "Node ... is not empty" because the pod +# already has cluster state). +# +# Distinct from is_rebuild_instance, which looks for an explicit +# /data/rebuild.flag set by KubeBlocks during a planned wipe-and-rejoin. +# A rejoining pod has nodes.conf AND no rebuild.flag. +is_rejoining_pod() { + [[ ! -f /data/nodes.conf ]] && return 1 + # Need at least 2 lines (self + peers). A single-line nodes.conf is what + # a fresh redis writes on first start, before any cluster membership. + [[ $(grep -c ":" /data/nodes.conf) -le 1 ]] && return 1 + # Explicit KB-driven rebuild path takes precedence — fall through to the + # existing add-node/replicate logic, which knows how to handle that flag. + [[ -f /data/rebuild.flag ]] && return 1 + return 0 +} + # scale out replica of redis cluster shard if needed scale_redis_cluster_replica() { # Waiting for redis-server to start @@ -373,6 +397,23 @@ scale_redis_cluster_replica() { echo "the nodes.conf file after redis server start is not exist" fi + # EBS-rejoin short-circuit: when the PVC has preserved cluster state from a + # prior incarnation of this pod (typical after node replacement when + # persistence is enabled), the right thing is to let redis start with its + # preserved node identity and let cluster bus gossip update peer addresses. + # The add-node path below would fail here with "Node ... is not empty" + # because the pod already has cluster state. + # + # KubeBlocks-driven rebuild (via /data/rebuild.flag) still falls through + # to the existing logic, which knows how to handle that case. 
+ if is_rejoining_pod; then + nodes_count=$(grep -c ":" /data/nodes.conf) + echo "EBS rejoin detected: nodes.conf has ${nodes_count} entries from prior cluster membership." + echo "Skipping scale-out logic; redis cluster bus will re-converge via gossip." + echo "(FQDNs are stable across pod recreation; peers will update this pod's IP via gossip.)" + exit 0 + fi + for target_node_name in $(echo "${CURRENT_SHARD_POD_NAME_LIST}" | tr ',' '\n'); do if [ -f /data/rebuild.flag ] && [ "${CURRENT_POD_NAME}" == "${target_node_name}" ]; then continue From a2862ef29e20758369a153245e98711e5177e06c Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Wed, 13 May 2026 14:59:09 +0200 Subject: [PATCH 10/11] feat: enable AOF and grace for image-swap data survival MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch valkey-cluster from RDB-only to AOF for durability across pod restarts. The previous RDB-only config relied on shutdown-save which races with the 10s shutdown-timeout under concurrent writes — verified to flush 2 of 3 shards during multi-shard image upgrades. With AOF on, each pod independently recovers its full dataset from disk via AOF replay, so the cross-shard restart race becomes a non- issue. Tested across reshard, vscale, image-swap, and node-type cascade: 50/50 keys preserved end-to-end. 
Companion settings: - shutdown-timeout 25 (was default 10) for in-flight AOF rewrite - terminationGracePeriodSeconds 60 in the cmpd runtime spec --- addons/valkey/config/valkey-cluster-config.tpl | 18 ++++++++++++------ .../valkey/templates/cmpd-valkey-cluster.yaml | 4 ++++ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/addons/valkey/config/valkey-cluster-config.tpl b/addons/valkey/config/valkey-cluster-config.tpl index e896c7c8f..0885846c1 100644 --- a/addons/valkey/config/valkey-cluster-config.tpl +++ b/addons/valkey/config/valkey-cluster-config.tpl @@ -38,10 +38,11 @@ oom-score-adj no oom-score-adj-values 0 200 800 disable-thp yes -# AOF off: fsync on EBS gp3 caused 30-40ms event-loop stalls (LATENCY DOCTOR -# confirmed). Replicas + EBS-mounted nodes.conf give us cluster-topology -# durability, which is all we need for a cache. -appendonly no +# AOF on: required for data preservation across image-swap restarts. Without +# AOF, shutdown-save races with shutdown-timeout under concurrent writes, and +# a flushed dump.rdb can wipe a shard via the "empty master returns" cluster +# bus path. With AOF, every write is on disk within ~1s; restart replays AOF. +appendonly yes appendfilename "appendonly.aof" appenddirname "appendonlydir" appendfsync everysec @@ -52,10 +53,15 @@ aof-load-truncated yes aof-use-rdb-preamble yes aof-timestamp-enabled no -# Disable scheduled BGSAVE forks (default rules tripped every ~90s under our -# load; each fork briefly stalls the event loop). +# No scheduled BGSAVE: AOF gives us continuous durability; periodic BGSAVE +# forks add latency without adding safety. save "" +# Generous shutdown budget: AOF makes the final save cheap, but during AOF +# rewrite or under load there can still be I/O to drain. Paired with the +# cmpd's terminationGracePeriodSeconds (must be > this value). 
+shutdown-timeout 25 + slowlog-log-slower-than 10000 slowlog-max-len 128 diff --git a/addons/valkey/templates/cmpd-valkey-cluster.yaml b/addons/valkey/templates/cmpd-valkey-cluster.yaml index 4b78d0f3e..6daf8e8e5 100644 --- a/addons/valkey/templates/cmpd-valkey-cluster.yaml +++ b/addons/valkey/templates/cmpd-valkey-cluster.yaml @@ -473,6 +473,10 @@ spec: - /scripts/sync-acl.sh targetPodSelector: Any runtime: + # Generous grace so AOF rewrite or in-flight fsync can land before SIGKILL. + # Must exceed redis-conf `shutdown-timeout` (25s) by enough margin for + # preStop hook + clean exit. + terminationGracePeriodSeconds: 60 initContainers: - name: init-dbctl command: From f3279e5ffb0423f3e80f38a5b0c9021d9d7e7d14 Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Thu, 14 May 2026 15:57:51 +0200 Subject: [PATCH 11/11] feat (valkey): asm-reshard owns topology change + slot rebalance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the asm-reshard OpsDefinition from the stream-infra valkey chart into this addon — it's intrinsic to a sharded-Valkey deployment, not namespace-specific, so the addon is the right home. While moving, fold the Cluster CR patch into the OpsDef so a single OpsRequest drives the whole reshard. Adds two new actions around the existing slot-migration script: - prepare-topology (workload: Job, kubectl image): scale-out only. Patches Cluster.spec.shardings[0].shards up, waits for KB to bring the new pods to Running. No-op for scale-in. - asm-migrate (exec: valkey-cluster): existing direction-aware slot rebalance, untouched. - finalize-topology (workload: Job, kubectl image): scale-in only. After slots are drained, patches shards down and waits for KB to delete the extra pods. Each action's `failurePolicy: Fail` makes the OpsRequest fail loud if any step trips. KB tracks progress as N/3 in the OpsRequest status. 
KubeBlocks overrides workload.podSpec.serviceAccountName with the component's kb-managed SA, so the RBAC has to live on the consuming side (the chart binds a minimal Role to system:serviceaccounts: in templates/asm-rbac.yaml). Verified end-to-end with the stream-infra chart on KB 1.0.2 under writer-reporter load: 3→5 then 5→3, 50/50 keys preserved, 0 write failures. --- .../valkey/templates/opsdefinition-asm.yaml | 869 ++++++++++++++++++ 1 file changed, 869 insertions(+) create mode 100644 addons/valkey/templates/opsdefinition-asm.yaml diff --git a/addons/valkey/templates/opsdefinition-asm.yaml b/addons/valkey/templates/opsdefinition-asm.yaml new file mode 100644 index 000000000..5d084cd41 --- /dev/null +++ b/addons/valkey/templates/opsdefinition-asm.yaml @@ -0,0 +1,869 @@ +# ASM (Atomic Slot Migration) OpsDefinition for Valkey 9.0+ on KubeBlocks. +# +# Single-OpsRequest path for sharded Valkey resharding. Owns both the +# topology change (Cluster CR shards count) AND the slot rebalance. The +# user creates one OpsRequest with `targetShards: N` and KB runs three +# sequential actions: +# +# 1. prepare-topology — if scale-out, patch Cluster CR shards up, +# wait for new pods Running. No-op for scale-in. +# 2. asm-migrate — direction-aware slot rebalance (CLUSTER MIGRATESLOTS). +# Runs inside an existing valkey-cluster container; no kubectl needed. +# 3. finalize-topology — if scale-in, patch Cluster CR shards down, +# wait for old pods Terminated. No-op for scale-out. +# +# The asm-orchestrator ServiceAccount + RBAC (used by 1 and 3) is created +# per-namespace by the stream-infra valkey chart, not by this addon — the +# OpsDef references it by name and KB resolves it in the OpsRequest's +# namespace. +--- +apiVersion: operations.kubeblocks.io/v1alpha1 +kind: OpsDefinition +metadata: + name: asm-reshard + labels: + {{- include "valkey.labels" . | nindent 4 }} + annotations: + {{- include "valkey.apiVersion" . 
| nindent 4 }} +spec: + parametersSchema: + openAPIV3Schema: + properties: + targetShards: + type: integer + description: Desired shard count after resharding + clusterName: + type: string + description: Name of the Valkey Cluster CR + required: + - targetShards + - clusterName + podInfoExtractors: + - name: valkey-pod + podSelector: + multiPodSelectionPolicy: Any + env: + - name: CLUSTER_NAMESPACE + valueFrom: + envRef: + envName: CLUSTER_NAMESPACE + actions: + # ── Action 1: scale-out topology change ────────────────────────────── + # No-op when current_shards >= target. Otherwise patches Cluster.spec. + # shardings[0].shards to target and waits for KubeBlocks to bring the + # new pods to Running. + - name: prepare-topology + failurePolicy: Fail + parameters: + - targetShards + - clusterName + workload: + type: Job + podInfoExtractorName: valkey-pod + backoffLimit: 0 + podSpec: + # KB ignores serviceAccountName here; it injects the component's + # kb-managed SA. The chart binds the asm-orchestrator Role to + # the namespace's SA group, which covers it. See chart's + # templates/asm-rbac.yaml for the rationale. + restartPolicy: Never + containers: + - name: kubectl + image: {{ .Values.image.registry | default "docker.io" }}/apecloud/kubectl:1.29 + imagePullPolicy: IfNotPresent + command: + - bash + - -c + - | + set -euo pipefail + CLUSTER="${clusterName}" + TARGET="${targetShards}" + NS="${CLUSTER_NAMESPACE}" + log() { echo "[prepare-topology] $(date +%H:%M:%S) $*"; } + + CURRENT=$(kubectl get cluster -n "$NS" "$CLUSTER" \ + -o jsonpath='{.spec.shardings[0].shards}') + log "current_shards=${CURRENT} target_shards=${TARGET}" + + if [[ "$TARGET" -le "$CURRENT" ]]; then + log "Not scale-out; nothing to patch. 
(finalize-topology will handle scale-in.)" + exit 0 + fi + + log "Scale-out: patching Cluster ${CLUSTER} shardings[0].shards=${TARGET}" + kubectl patch cluster -n "$NS" "$CLUSTER" --type=json \ + -p "[{\"op\":\"replace\",\"path\":\"/spec/shardings/0/shards\",\"value\":${TARGET}}]" + + REPLICAS=$(kubectl get cluster -n "$NS" "$CLUSTER" \ + -o jsonpath='{.spec.shardings[0].template.replicas}') + EXPECTED=$((TARGET * REPLICAS)) + log "Waiting for ${EXPECTED} pods Running (replicas/shard=${REPLICAS})..." + + elapsed=0 + while [[ $elapsed -lt 600 ]]; do + READY=$(kubectl get pods -n "$NS" \ + -l "app.kubernetes.io/instance=${CLUSTER},apps.kubeblocks.io/sharding-name=shard" \ + --field-selector=status.phase=Running --no-headers 2>/dev/null \ + | wc -l | tr -d ' ') + [[ "$READY" -ge "$EXPECTED" ]] && break + [[ $((elapsed % 30)) -eq 0 ]] && log " ready=${READY}/${EXPECTED} (${elapsed}s)..." + sleep 10 + elapsed=$((elapsed + 10)) + done + if [[ "$READY" -lt "$EXPECTED" ]]; then + log "ERROR: only ${READY}/${EXPECTED} pods Running after 10m" + exit 1 + fi + + log "Waiting for Cluster phase Running..." + elapsed=0 + while [[ $elapsed -lt 300 ]]; do + PHASE=$(kubectl get cluster -n "$NS" "$CLUSTER" \ + -o jsonpath='{.status.phase}' 2>/dev/null) + [[ "$PHASE" == "Running" ]] && break + sleep 5 + elapsed=$((elapsed + 5)) + done + log "Cluster phase=${PHASE}. Topology ready for slot migration." + env: + - name: clusterName + value: $(clusterName) + - name: targetShards + value: $(targetShards) + - name: CLUSTER_NAMESPACE + value: $(CLUSTER_NAMESPACE) + + # ── Action 2: slot migration ───────────────────────────────────────── + # Direction-aware: detects scale-out vs scale-in from current vs target + # primary count, computes minimal slot moves, executes via CLUSTER + # MIGRATESLOTS. Runs inside an existing valkey-cluster container; uses + # CLUSTER NODES for discovery, redis-cli for the moves — no kubectl. 
+ - name: asm-migrate + failurePolicy: Fail + parameters: + - targetShards + - clusterName + exec: + backoffLimit: 0 + podInfoExtractorName: valkey-pod + containerName: valkey-cluster + command: + - bash + - -c + - | + set -euo pipefail + + # ── Configuration ────────────────────────────────────────────── + # Parameters injected by KubeBlocks via $(paramName) substitution + TARGET_SHARDS="$(targetShards)" + CLUSTER_NAME="$(clusterName)" + ASM_TIMEOUT="${ASM_TIMEOUT_SECONDS:-600}" + POLL_INTERVAL=2 + LOCAL_PORT=6379 + + # ── Logging ──────────────────────────────────────────────────── + info() { echo "[INFO] $(date +%H:%M:%S) $*"; } + warn() { echo "[WARN] $(date +%H:%M:%S) $*"; } + fail() { echo "[FAIL] $(date +%H:%M:%S) $*"; } + + # ── Auth ─────────────────────────────────────────────────────── + AUTH_ARGS="" + if [[ -n "${REDIS_DEFAULT_PASSWORD:-}" ]]; then + AUTH_ARGS="--no-auth-warning -a ${REDIS_DEFAULT_PASSWORD}" + fi + + vcli() { + local host="$1" port="$2"; shift 2 + if command -v valkey-cli >/dev/null 2>&1; then + valkey-cli $AUTH_ARGS -h "$host" -p "$port" "$@" + else + redis-cli $AUTH_ARGS -h "$host" -p "$port" "$@" + fi + } + + vcli_local() { + vcli 127.0.0.1 "$LOCAL_PORT" "$@" + } + + # ── Step 1: Validate parameters ──────────────────────────────── + if [[ -z "$TARGET_SHARDS" ]]; then + fail "TARGET_SHARDS is not set. Pass it via OpsRequest parameters." + exit 1 + fi + if ! [[ "$TARGET_SHARDS" =~ ^[0-9]+$ ]] || [[ "$TARGET_SHARDS" -lt 1 ]]; then + fail "TARGET_SHARDS must be a positive integer, got: '${TARGET_SHARDS}'" + exit 1 + fi + if [[ -z "$CLUSTER_NAME" ]]; then + fail "CLUSTER_NAME is not set. Pass it via OpsRequest parameters." + exit 1 + fi + + info "ASM resharding: cluster=${CLUSTER_NAME} target_shards=${TARGET_SHARDS} timeout=${ASM_TIMEOUT}s" + + # ── Step 2: Discover topology ────────────────────────────────── + CLUSTER_NODES_RAW="$(vcli_local CLUSTER NODES)" + if [[ -z "$CLUSTER_NODES_RAW" ]]; then + fail "CLUSTER NODES returned empty. 
Is the cluster running?" + exit 1 + fi + + declare -a PRIMARY_IDS=() PRIMARY_HOSTS=() PRIMARY_PORTS=() PRIMARY_SLOTS=() PRIMARY_COMP=() + + while IFS= read -r line; do + [[ -z "$line" ]] && continue + local_id="$(echo "$line" | awk '{print $1}')" + local_addr="$(echo "$line" | awk '{print $2}' | cut -d@ -f1)" + local_flags="$(echo "$line" | awk '{print $3}')" + local_host="$(echo "$local_addr" | rev | cut -d: -f2- | rev)" + local_port="$(echo "$local_addr" | rev | cut -d: -f1 | rev)" + if echo "$local_flags" | grep -q "master" && ! echo "$local_flags" | grep -q "fail\|handshake"; then + PRIMARY_IDS+=("$local_id") + PRIMARY_HOSTS+=("$local_host") + PRIMARY_PORTS+=("$local_port") + local_slots="$(echo "$line" | awk '{for(i=9;i<=NF;i++) printf "%s ", $i}')" + PRIMARY_SLOTS+=("$local_slots") + # Extract component short name from FQDN in CLUSTER NODES addr field. + # Format: ip:port@cport,podname.headless.ns.svc... + # Pod name: {cluster}-{comp}-{ordinal} -> component = {comp} + local_fqdn="$(echo "$line" | awk '{print $2}' | cut -d@ -f2 | cut -d, -f2 | cut -d. -f1)" + # Strip pod ordinal: valkey-poc-shard-mk4-0 -> valkey-poc-shard-mk4 + local_pod_base="${local_fqdn%-*}" + # Strip cluster prefix: valkey-poc-shard-mk4 -> shard-mk4 + local_comp="${local_pod_base#${CLUSTER_NAME}-}" + PRIMARY_COMP+=("$local_comp") + fi + done <<< "$CLUSTER_NODES_RAW" + + CURRENT_SHARDS="${#PRIMARY_IDS[@]}" + + # Sort primaries by component short name (ascending) to match KubeBlocks' + # shard removal rule: alphabetical sort, keep first N, drop the rest. + # This ensures indices >= TARGET_SHARDS correspond to the shards KB will remove. 
+ if [[ $CURRENT_SHARDS -gt 1 ]]; then + # Build sortable lines: "comp_name|index", sort by comp_name, extract reordered indices + declare -a SORT_ORDER=() + for i in "${!PRIMARY_COMP[@]}"; do + echo "${PRIMARY_COMP[$i]}|$i" + done | sort -t'|' -k1,1 | while IFS='|' read -r _ idx; do + SORT_ORDER+=("$idx") + done + # If subshell ate SORT_ORDER, rebuild via temp file + if [[ ${#SORT_ORDER[@]} -eq 0 ]]; then + SORT_TMP="$(mktemp)" + for i in "${!PRIMARY_COMP[@]}"; do + echo "${PRIMARY_COMP[$i]}|$i" + done | sort -t'|' -k1,1 | cut -d'|' -f2 > "$SORT_TMP" + SORT_ORDER=() + while IFS= read -r idx; do + SORT_ORDER+=("$idx") + done < "$SORT_TMP" + rm -f "$SORT_TMP" + fi + # Reorder all arrays + declare -a _IDS=() _HOSTS=() _PORTS=() _SLOTS=() _COMP=() + for idx in "${SORT_ORDER[@]}"; do + _IDS+=("${PRIMARY_IDS[$idx]}") + _HOSTS+=("${PRIMARY_HOSTS[$idx]}") + _PORTS+=("${PRIMARY_PORTS[$idx]}") + _SLOTS+=("${PRIMARY_SLOTS[$idx]}") + _COMP+=("${PRIMARY_COMP[$idx]}") + done + PRIMARY_IDS=("${_IDS[@]}") + PRIMARY_HOSTS=("${_HOSTS[@]}") + PRIMARY_PORTS=("${_PORTS[@]}") + PRIMARY_SLOTS=("${_SLOTS[@]}") + PRIMARY_COMP=("${_COMP[@]}") + fi + + # Count primaries that currently own slots vs. primaries that are empty. + # LOADED_SHARDS is the number of slot-owning shards, which is what we + # actually care about for direction detection (NOT CURRENT_SHARDS, which + # includes freshly-created empty primaries after a Cluster CR patch). + LOADED_SHARDS=0 + EMPTY_COUNT=0 + for i in "${!PRIMARY_IDS[@]}"; do + slots_trimmed="$(echo "${PRIMARY_SLOTS[$i]}" | xargs)" + if [[ -z "$slots_trimmed" ]]; then + EMPTY_COUNT=$((EMPTY_COUNT + 1)) + else + LOADED_SHARDS=$((LOADED_SHARDS + 1)) + fi + done + + info "Current topology: ${CURRENT_SHARDS} primaries (${LOADED_SHARDS} loaded, ${EMPTY_COUNT} empty), sorted by component name" + + for i in "${!PRIMARY_IDS[@]}"; do + info " shard ${i}: comp=${PRIMARY_COMP[$i]} id=${PRIMARY_IDS[$i]:0:8}... 
host=${PRIMARY_HOSTS[$i]}:${PRIMARY_PORTS[$i]} slots=[${PRIMARY_SLOTS[$i]}]" + done + + # Nothing to do only if both the primary count matches AND all primaries + # own slots (no empty primaries waiting to be populated). + if [[ "$TARGET_SHARDS" -eq "$LOADED_SHARDS" && "$EMPTY_COUNT" -eq 0 ]]; then + info "Already at ${TARGET_SHARDS} loaded shards with no empty primaries. Nothing to do." + exit 0 + fi + + if [[ "$TARGET_SHARDS" -gt "$LOADED_SHARDS" ]]; then + DIRECTION="scale-out" + info "Direction: scale-out (${LOADED_SHARDS} loaded -> ${TARGET_SHARDS} target)" + if [[ $EMPTY_COUNT -eq 0 ]]; then + fail "No empty shards found. Ensure new shard pods joined via CLUSTER MEET." + exit 1 + fi + if [[ $((LOADED_SHARDS + EMPTY_COUNT)) -lt "$TARGET_SHARDS" ]]; then + fail "Not enough primaries (${CURRENT_SHARDS}) to reach target ${TARGET_SHARDS}. Ensure Cluster CR shards=${TARGET_SHARDS} and all pods are Running." + exit 1 + fi + info "Will migrate slots into ${EMPTY_COUNT} empty shard(s)" + else + DIRECTION="scale-in" + info "Direction: scale-in (${LOADED_SHARDS} loaded -> ${TARGET_SHARDS} target)" + fi + + # ── Step 3: ACL patch ────────────────────────────────────────── + # MIGRATESLOTS authenticates to the target node as kbreplicator (via + # masterauth). The snapshot transfer replays the full RDB stream including + # SELECT, SET, HSET, and every other write command stored in the migrating + # slots. kbreplicator's default ACL (-@all +psync +replconf +ping) is far + # too restrictive. Grant +@all for the migration; the user is internal to + # the cluster (masterauth-only), so this does not widen the attack surface. + info "Patching ACL: granting kbreplicator +@all on all nodes for slot migration..." 
+ ALL_NODES_RAW="$(vcli_local CLUSTER NODES)" + ACL_FAIL=0 + while IFS= read -r line; do + [[ -z "$line" ]] && continue + node_addr="$(echo "$line" | awk '{print $2}' | cut -d@ -f1)" + node_flags="$(echo "$line" | awk '{print $3}')" + node_host="$(echo "$node_addr" | rev | cut -d: -f2- | rev)" + node_port="$(echo "$node_addr" | rev | cut -d: -f1 | rev)" + if echo "$node_flags" | grep -q "fail\|handshake"; then continue; fi + result="$(vcli "$node_host" "$node_port" ACL SETUSER kbreplicator "+@all" "~*" "&*" 2>&1 || echo 'ERROR')" + if [[ "$result" == *"OK"* ]]; then + info " ACL patched: ${node_host}:${node_port}" + elif [[ "$result" == *"ERR"*"user"*"not"* ]] || [[ "$result" == *"ERR"*"User"*"not"* ]]; then + warn " kbreplicator not found on ${node_host}:${node_port}" + else + warn " ACL patch failed on ${node_host}:${node_port}: ${result}" + ACL_FAIL=$((ACL_FAIL + 1)) + fi + done <<< "$ALL_NODES_RAW" + if [[ $ACL_FAIL -gt 0 ]]; then + warn "ACL patch failed on ${ACL_FAIL} node(s)." + fi + + # ── Step 4: Check for stuck migrations ───────────────────────── + info "Checking for stuck migrations from previous failures..." + STUCK_FOUND=false + while IFS= read -r line; do + [[ -z "$line" ]] && continue + node_flags="$(echo "$line" | awk '{print $3}')" + if echo "$node_flags" | grep -q "fail\|handshake"; then continue; fi + slot_fields="$(echo "$line" | awk '{for(i=9;i<=NF;i++) printf "%s ", $i}')" + if echo "$slot_fields" | grep -qE '\[.*->-\]|\[.*-<-\]'; then + STUCK_FOUND=true + break + fi + done <<< "$ALL_NODES_RAW" + + if [[ "$STUCK_FOUND" == "true" ]]; then + warn "Stuck MIGRATING/IMPORTING slots detected. Running cluster fix..." + fix_endpoint="127.0.0.1:${LOCAL_PORT}" + if command -v valkey-cli >/dev/null 2>&1; then + valkey-cli $AUTH_ARGS --cluster fix "$fix_endpoint" <<< "yes" 2>&1 || true + else + redis-cli $AUTH_ARGS --cluster fix "$fix_endpoint" <<< "yes" 2>&1 || true + fi + info "Cluster fix completed. Re-reading topology..." 
+ CLUSTER_NODES_RAW="$(vcli_local CLUSTER NODES)" + PRIMARY_IDS=() PRIMARY_HOSTS=() PRIMARY_PORTS=() PRIMARY_SLOTS=() + NEW_SHARD_IDS=() NEW_SHARD_HOSTS=() NEW_SHARD_PORTS=() + while IFS= read -r line; do + [[ -z "$line" ]] && continue + local_id="$(echo "$line" | awk '{print $1}')" + local_addr="$(echo "$line" | awk '{print $2}' | cut -d@ -f1)" + local_flags="$(echo "$line" | awk '{print $3}')" + local_host="$(echo "$local_addr" | rev | cut -d: -f2- | rev)" + local_port="$(echo "$local_addr" | rev | cut -d: -f1 | rev)" + if echo "$local_flags" | grep -q "master" && ! echo "$local_flags" | grep -q "fail\|handshake"; then + PRIMARY_IDS+=("$local_id") + PRIMARY_HOSTS+=("$local_host") + PRIMARY_PORTS+=("$local_port") + local_slots="$(echo "$line" | awk '{for(i=9;i<=NF;i++) printf "%s ", $i}')" + PRIMARY_SLOTS+=("$local_slots") + fi + done <<< "$CLUSTER_NODES_RAW" + for i in "${!PRIMARY_IDS[@]}"; do + slots_trimmed="$(echo "${PRIMARY_SLOTS[$i]}" | xargs)" + if [[ -z "$slots_trimmed" ]]; then + NEW_SHARD_IDS+=("${PRIMARY_IDS[$i]}") + NEW_SHARD_HOSTS+=("${PRIMARY_HOSTS[$i]}") + NEW_SHARD_PORTS+=("${PRIMARY_PORTS[$i]}") + fi + done + CURRENT_SHARDS="${#PRIMARY_IDS[@]}" + info "Topology after fix: ${CURRENT_SHARDS} primaries, ${#NEW_SHARD_IDS[@]} empty" + fi + + # ── Step 5: Compute slot plan ────────────────────────────────── + # Generalized: works for both scale-out and scale-in. + # Compute target slot count per primary, then pair donors with receivers. 
+ # ── Step 5 (cont.): even split of the 16384-slot keyspace.
+ # Each of TARGET_SHARDS primaries gets SLOTS_PER_SHARD slots; the first
+ # REMAINDER primaries absorb one leftover slot each.
+ TOTAL_SLOTS=16384
+ SLOTS_PER_SHARD=$((TOTAL_SLOTS / TARGET_SHARDS))
+ REMAINDER=$((TOTAL_SLOTS % TARGET_SHARDS))
+
+ info "Target distribution: ${SLOTS_PER_SHARD} slots/shard (+1 for first ${REMAINDER} shards)"
+
+ # Count current slots per primary
+ # PRIMARY_SLOTS[i] holds the slot fields of one CLUSTER NODES line:
+ # tokens are "N", "N-M", or bracketed in-flight markers like "[123-<-id]".
+ declare -a OWNED_SLOTS=()
+ for i in "${!PRIMARY_IDS[@]}"; do
+   count=0
+   for range in ${PRIMARY_SLOTS[$i]}; do
+     # Bracketed entries are importing/migrating markers, not owned slots.
+     [[ "$range" == *"["* ]] && continue
+     if [[ "$range" == *"-"* ]]; then
+       s="${range%-*}"; e="${range#*-}"
+       count=$((count + e - s + 1))
+     elif [[ "$range" =~ ^[0-9]+$ ]]; then
+       count=$((count + 1))
+     fi
+   done
+   OWNED_SLOTS+=("$count")
+ done
+
+ info "Current slot distribution:"
+ for i in "${!PRIMARY_IDS[@]}"; do
+   info " shard ${i} (${PRIMARY_IDS[$i]:0:8}...): ${OWNED_SLOTS[$i]} slots"
+ done
+
+ # Compute target count per primary:
+ # indices 0..TARGET_SHARDS-1: even share
+ # indices TARGET_SHARDS..CURRENT_SHARDS-1: 0 (being drained on scale-in)
+ declare -a TARGET_COUNTS=()
+ for i in "${!PRIMARY_IDS[@]}"; do
+   if [[ $i -lt $TARGET_SHARDS ]]; then
+     if [[ $i -lt $REMAINDER ]]; then
+       TARGET_COUNTS+=($((SLOTS_PER_SHARD + 1)))
+     else
+       TARGET_COUNTS+=("$SLOTS_PER_SHARD")
+     fi
+   else
+     TARGET_COUNTS+=(0)
+   fi
+ done
+
+ # Compute delta per primary: positive = donate, negative = receive
+ declare -a DELTAS=()
+ for i in "${!PRIMARY_IDS[@]}"; do
+   DELTAS+=($((OWNED_SLOTS[$i] - TARGET_COUNTS[$i])))
+ done
+
+ info "Slot movement plan:"
+ for i in "${!PRIMARY_IDS[@]}"; do
+   d="${DELTAS[$i]}"
+   if [[ $d -gt 0 ]]; then
+     info " shard ${i}: donate ${d} slots (${OWNED_SLOTS[$i]} -> ${TARGET_COUNTS[$i]})"
+   elif [[ $d -lt 0 ]]; then
+     info " shard ${i}: receive $((-d)) slots (${OWNED_SLOTS[$i]} -> ${TARGET_COUNTS[$i]})"
+   fi
+ done
+
+ # Build receiver list: shards that need more slots, with remaining capacity
+ declare -a RECV_IDX=() RECV_REMAINING=()
+ for i in "${!PRIMARY_IDS[@]}"; do
+   if [[ "${DELTAS[$i]}" -lt 0 ]]; then
+     RECV_IDX+=("$i")
+     RECV_REMAINING+=($(( -${DELTAS[$i]} )))
+   fi
+ done
+
+ # Build migration tuples: pair donors with receivers.
+ # Consumes from the end of each donor's ranges, splitting across
+ # receivers as needed. Tracks a cursor (cur_end) within each range
+ # to avoid overlapping migrations.
+ # Donors therefore shed their highest-numbered slots first; recv_ptr
+ # advances monotonically, so each receiver is filled before the next.
+ declare -a MIG_SOURCE_IDX=() MIG_TARGET_IDX=() MIG_START=() MIG_END=()
+ recv_ptr=0
+
+ for src_idx in "${!PRIMARY_IDS[@]}"; do
+   remaining_donate="${DELTAS[$src_idx]}"
+   [[ "$remaining_donate" -le 0 ]] && continue
+
+   # Parse this shard's slot ranges
+   declare -a SRC_RANGES_START=() SRC_RANGES_END=()
+   for range in ${PRIMARY_SLOTS[$src_idx]}; do
+     [[ "$range" == *"["* ]] && continue
+     if [[ "$range" == *"-"* ]]; then
+       SRC_RANGES_START+=("${range%-*}")
+       SRC_RANGES_END+=("${range#*-}")
+     elif [[ "$range" =~ ^[0-9]+$ ]]; then
+       SRC_RANGES_START+=("$range")
+       SRC_RANGES_END+=("$range")
+     fi
+   done
+
+   # Walk ranges from the end, tracking a cursor within each range
+   range_idx=$(( ${#SRC_RANGES_START[@]} - 1 ))
+   cur_end="${SRC_RANGES_END[$range_idx]}"
+
+   while [[ $remaining_donate -gt 0 && $range_idx -ge 0 ]]; do
+     r_start="${SRC_RANGES_START[$range_idx]}"
+     r_available=$((cur_end - r_start + 1))
+
+     # Find a receiver with remaining capacity
+     while [[ $recv_ptr -lt ${#RECV_IDX[@]} ]]; do
+       [[ "${RECV_REMAINING[$recv_ptr]}" -gt 0 ]] && break
+       recv_ptr=$((recv_ptr + 1))
+     done
+     if [[ $recv_ptr -ge ${#RECV_IDX[@]} ]]; then
+       warn "No more receivers for slots."; break
+     fi
+
+     # Take min(available_in_range, receiver_needs, remaining_donate)
+     recv_needs="${RECV_REMAINING[$recv_ptr]}"
+     take_size=$r_available
+     [[ $take_size -gt $recv_needs ]] && take_size=$recv_needs
+     [[ $take_size -gt $remaining_donate ]] && take_size=$remaining_donate
+
+     take_end="$cur_end"
+     take_start=$((cur_end - take_size + 1))
+
+     tgt_idx="${RECV_IDX[$recv_ptr]}"
+     MIG_SOURCE_IDX+=("$src_idx"); MIG_TARGET_IDX+=("$tgt_idx")
+     MIG_START+=("$take_start"); MIG_END+=("$take_end")
+
+     remaining_donate=$((remaining_donate - take_size))
+     RECV_REMAINING[$recv_ptr]=$((${RECV_REMAINING[$recv_ptr]} - take_size))
+     [[ "${RECV_REMAINING[$recv_ptr]}" -le 0 ]] && recv_ptr=$((recv_ptr + 1))
+
+     # Shrink the working range
+     cur_end=$((take_start - 1))
+     if [[ $cur_end -lt $r_start ]]; then
+       # Range fully consumed, move to previous range
+       range_idx=$((range_idx - 1))
+       [[ $range_idx -ge 0 ]] && cur_end="${SRC_RANGES_END[$range_idx]}"
+     fi
+   done
+   unset SRC_RANGES_START SRC_RANGES_END
+ done
+
+ if [[ "${#MIG_SOURCE_IDX[@]}" -eq 0 ]]; then
+   info "No migrations needed. Cluster is already balanced."; exit 0
+ fi
+
+ info "Planned ${#MIG_SOURCE_IDX[@]} migration(s):"
+ for i in "${!MIG_SOURCE_IDX[@]}"; do
+   src="${MIG_SOURCE_IDX[$i]}"; tgt="${MIG_TARGET_IDX[$i]}"
+   info " slots ${MIG_START[$i]}-${MIG_END[$i]} from shard ${src} (${PRIMARY_IDS[$src]:0:8}...) to shard ${tgt} (${PRIMARY_IDS[$tgt]:0:8}...)"
+ done
+
+ # ── Step 6: Execute migrations ─────────────────────────────────
+ # Best-effort rollback helper: ask every slot-owning primary to abort
+ # its in-flight migrations. Errors are ignored (|| true) — empty
+ # (slotless) primaries are skipped since they cannot be migrating.
+ cancel_all_migrations() {
+   warn "Cancelling all in-flight migrations..."
+   for i in "${!PRIMARY_IDS[@]}"; do
+     slots_trimmed="$(echo "${PRIMARY_SLOTS[$i]}" | xargs)"
+     [[ -z "$slots_trimmed" ]] && continue
+     vcli "${PRIMARY_HOSTS[$i]}" "${PRIMARY_PORTS[$i]}" CLUSTER CANCELSLOTMIGRATIONS 2>&1 || true
+   done
+ }
+
+ # Remediation helper: let the CLI repair slot coverage against the local
+ # node. "yes" on stdin auto-confirms the prompt; prefers valkey-cli and
+ # falls back to redis-cli when it is not on PATH.
+ run_cluster_fix() {
+   warn "Running cluster fix as remediation..."
+   fix_endpoint="127.0.0.1:${LOCAL_PORT}"
+   if command -v valkey-cli >/dev/null 2>&1; then
+     valkey-cli $AUTH_ARGS --cluster fix "$fix_endpoint" <<< "yes" 2>&1 || true
+   else
+     redis-cli $AUTH_ARGS --cluster fix "$fix_endpoint" <<< "yes" 2>&1 || true
+   fi
+ }
+
+ # Helper: extract state for a specific slot range from GETSLOTMIGRATIONS output.
+ # GETSLOTMIGRATIONS returns entries newest-first. Return the FIRST match
+ # for the given slot_range (= most recent migration for those slots).
+ get_mig_state() { + local output="$1" target_range="$2" + local found_range="" current_state="" + while IFS= read -r kv_line; do + case "$kv_line" in + slot_ranges) found_range="next" ;; + state) current_state="next" ;; + *) + if [[ "$found_range" == "next" ]]; then + found_range="$kv_line" + elif [[ "$current_state" == "next" ]]; then + current_state="$kv_line" + if [[ "$found_range" == "$target_range" ]]; then + echo "$current_state" + return 0 + fi + found_range=""; current_state="" + fi + ;; + esac + done <<< "$output" + } + + MIGRATION_FAILED=false + for mig_idx in "${!MIG_SOURCE_IDX[@]}"; do + src="${MIG_SOURCE_IDX[$mig_idx]}"; tgt="${MIG_TARGET_IDX[$mig_idx]}" + start="${MIG_START[$mig_idx]}"; end="${MIG_END[$mig_idx]}" + target_id="${PRIMARY_IDS[$tgt]}" + src_host="${PRIMARY_HOSTS[$src]}"; src_port="${PRIMARY_PORTS[$src]}" + slot_range="${start}-${end}" + + info "Migration ${mig_idx}: slots ${slot_range} from ${src_host}:${src_port} to ${target_id:0:8}..." + mig_result="$(vcli "$src_host" "$src_port" CLUSTER MIGRATESLOTS SLOTSRANGE "$start" "$end" NODE "$target_id" 2>&1)" + if [[ "$mig_result" != *"OK"* ]]; then + fail "CLUSTER MIGRATESLOTS failed: ${mig_result}" + MIGRATION_FAILED=true; break + fi + + info " Migration started (async). Polling for slot_range=${slot_range}..." 
+ elapsed=0; migration_done=false + while [[ $elapsed -lt $ASM_TIMEOUT ]]; do + sleep "$POLL_INTERVAL"; elapsed=$((elapsed + POLL_INTERVAL)) + src_status_raw="$(vcli "$src_host" "$src_port" CLUSTER GETSLOTMIGRATIONS 2>&1 || echo 'UNREACHABLE')" + tgt_host="${PRIMARY_HOSTS[$tgt]}"; tgt_port="${PRIMARY_PORTS[$tgt]}" + tgt_status_raw="$(vcli "$tgt_host" "$tgt_port" CLUSTER GETSLOTMIGRATIONS 2>&1 || echo 'UNREACHABLE')" + + # Check state of OUR migration only (by slot range), ignore stale history + src_state="$(get_mig_state "$src_status_raw" "$slot_range")" + tgt_state="$(get_mig_state "$tgt_status_raw" "$slot_range")" + + if [[ "$src_state" == "failed" || "$src_state" == "cancelled" ]]; then + fail " Migration failed on source (state=${src_state})" + MIGRATION_FAILED=true; break 2 + fi + if [[ "$tgt_state" == "failed" || "$tgt_state" == "cancelled" ]]; then + fail " Migration failed on target (state=${tgt_state})" + MIGRATION_FAILED=true; break 2 + fi + if [[ "$src_state" == "success" && "$tgt_state" == "success" ]]; then + migration_done=true; break + fi + # Also succeed if source says success (target may not track it the same way) + if [[ "$src_state" == "success" ]]; then + migration_done=true; break + fi + [[ $((elapsed % 10)) -eq 0 ]] && info " Polling... src=${src_state:-pending} tgt=${tgt_state:-pending} (${elapsed}s)" + done + [[ "$MIGRATION_FAILED" == "true" ]] && break + if [[ "$migration_done" != "true" ]]; then + fail " Migration timed out after ${ASM_TIMEOUT}s"; MIGRATION_FAILED=true; break + fi + info " Migration ${mig_idx} completed (${elapsed}s)" + done + + # ── Step 7: Error handling ───────────────────────────────────── + if [[ "$MIGRATION_FAILED" == "true" ]]; then + cancel_all_migrations + sleep 2 + REMNANTS="$(vcli_local CLUSTER NODES 2>/dev/null || echo '')" + if echo "$REMNANTS" | grep -qE '\[.*->-\]|\[.*-<-\]'; then + warn "MIGRATING/IMPORTING remnants detected after cancellation." + run_cluster_fix + fi + fail "ASM resharding FAILED. 
Cluster may need manual inspection." + exit 1 + fi + + # ── Step 8: Verify ───────────────────────────────────────────── + info "All migrations completed. Verifying cluster health..." + sleep 2 + CLUSTER_INFO="$(vcli_local CLUSTER INFO 2>/dev/null)" + SLOTS_OK="$(echo "$CLUSTER_INFO" | grep 'cluster_slots_ok' | cut -d: -f2 | tr -d '[:space:]')" + CLUSTER_STATE="$(echo "$CLUSTER_INFO" | grep 'cluster_state' | cut -d: -f2 | tr -d '[:space:]')" + info "cluster_state=${CLUSTER_STATE} cluster_slots_ok=${SLOTS_OK}" + + if [[ "$SLOTS_OK" != "16384" ]]; then + fail "VERIFICATION FAILED: cluster_slots_ok=${SLOTS_OK} (expected 16384)" + run_cluster_fix + sleep 2 + CLUSTER_INFO="$(vcli_local CLUSTER INFO 2>/dev/null)" + SLOTS_OK="$(echo "$CLUSTER_INFO" | grep 'cluster_slots_ok' | cut -d: -f2 | tr -d '[:space:]')" + if [[ "$SLOTS_OK" != "16384" ]]; then + fail "cluster_slots_ok still ${SLOTS_OK} after fix. Manual intervention required." + exit 1 + fi + info "Cluster fix restored all slots." + fi + if [[ "$CLUSTER_STATE" != "ok" ]]; then + fail "VERIFICATION FAILED: cluster_state=${CLUSTER_STATE} (expected ok)" + exit 1 + fi + + # ── Step 9: Clean up stale replica importing flags ──────────── + # Valkey 9 bug (valkey-io/valkey#998): when a replica does a full sync + # during slot migration, the RDB snapshot carries the in-progress + # importing state. After the primary completes the migration and clears + # its importing flag, the replica still has the stale flag from the RDB. + # Gossip won't clear it (cleanup is primary-only). SETSLOT STABLE is + # rejected on replicas from external clients. + # + # Fix: run CLUSTER SETSLOT STABLE on the PRIMARY of the affected + # replica. The primary propagates it to the replica via the replication + # stream (forceCommandPropagation). This clears the stale flag. + info "Checking for stale importing/migrating flags on all nodes..." 
+ ALL_NODES_CHECK="$(vcli_local CLUSTER NODES 2>/dev/null || echo '')" + STALE_FIXED=0 + STALE_FAILED=0 + while IFS= read -r line; do + [[ -z "$line" ]] && continue + node_flags="$(echo "$line" | awk '{print $3}')" + if echo "$node_flags" | grep -q "fail\|handshake"; then continue; fi + slot_fields="$(echo "$line" | awk '{for(i=9;i<=NF;i++) printf "%s ", $i}')" + # Extract slot numbers from [slot-<-nodeid] or [slot->-nodeid] markers + stale_slots="$(echo "$slot_fields" | grep -oE '\[[0-9]+-[<>]-' | grep -oE '[0-9]+' || true)" + if [[ -z "$stale_slots" ]]; then continue; fi + + node_id="$(echo "$line" | awk '{print $1}')" + node_fqdn="$(echo "$line" | awk '{print $2}' | cut -d@ -f2 | cut -d, -f2 | cut -d. -f1)" + + if echo "$node_flags" | grep -q "slave"; then + # Replica: find its primary and issue SETSLOT STABLE there + primary_id="$(echo "$line" | awk '{print $4}')" + primary_line="$(echo "$ALL_NODES_CHECK" | grep "^${primary_id} ")" + if [[ -z "$primary_line" ]]; then + warn "Cannot find primary ${primary_id:0:8}... for replica ${node_fqdn}. Manual fix needed." + STALE_FAILED=$((STALE_FAILED + 1)) + continue + fi + primary_addr="$(echo "$primary_line" | awk '{print $2}' | cut -d@ -f1)" + primary_host="$(echo "$primary_addr" | rev | cut -d: -f2- | rev)" + primary_port="$(echo "$primary_addr" | rev | cut -d: -f1 | rev)" + for slot in $stale_slots; do + info " Clearing stale slot ${slot} on replica ${node_fqdn} via primary ${primary_host}:${primary_port}..." 
+ result="$(vcli "$primary_host" "$primary_port" CLUSTER SETSLOT "$slot" STABLE 2>&1 || echo 'ERROR')" + if [[ "$result" == *"OK"* ]]; then + STALE_FIXED=$((STALE_FIXED + 1)) + else + warn " Failed to clear slot ${slot}: ${result}" + STALE_FAILED=$((STALE_FAILED + 1)) + fi + done + else + # Primary: issue SETSLOT STABLE directly + node_addr="$(echo "$line" | awk '{print $2}' | cut -d@ -f1)" + node_host="$(echo "$node_addr" | rev | cut -d: -f2- | rev)" + node_port="$(echo "$node_addr" | rev | cut -d: -f1 | rev)" + for slot in $stale_slots; do + info " Clearing stale slot ${slot} on primary ${node_fqdn}..." + result="$(vcli "$node_host" "$node_port" CLUSTER SETSLOT "$slot" STABLE 2>&1 || echo 'ERROR')" + if [[ "$result" == *"OK"* ]]; then + STALE_FIXED=$((STALE_FIXED + 1)) + else + warn " Failed to clear slot ${slot}: ${result}" + STALE_FAILED=$((STALE_FAILED + 1)) + fi + done + fi + done <<< "$ALL_NODES_CHECK" + + if [[ $STALE_FIXED -gt 0 ]]; then + info "Cleared ${STALE_FIXED} stale slot flag(s) (valkey-io/valkey#998 workaround)." + sleep 2 # let replication propagate SETSLOT STABLE to replicas + fi + if [[ $STALE_FAILED -gt 0 ]]; then + warn "${STALE_FAILED} stale flag(s) could not be cleared. These may block KubeBlocks preTerminate." + warn "Manual fix: delete the affected replica pod (KubeBlocks will recreate it)." + fi + if [[ $STALE_FIXED -eq 0 && $STALE_FAILED -eq 0 ]]; then + info "No stale importing/migrating flags found." + fi + + # Show final topology + info "Final topology:" + FINAL_NODES="$(vcli_local CLUSTER NODES)" + while IFS= read -r line; do + [[ -z "$line" ]] && continue + node_id="$(echo "$line" | awk '{print $1}')" + node_flags="$(echo "$line" | awk '{print $3}')" + if echo "$node_flags" | grep -q "master" && ! echo "$node_flags" | grep -q "fail\|handshake"; then + node_slots="$(echo "$line" | awk '{for(i=9;i<=NF;i++) printf "%s ", $i}')" + info " ${node_id:0:8}... 
[${node_slots}]" + fi + done <<< "$FINAL_NODES" + info "ASM resharding completed successfully: ${CURRENT_SHARDS} -> ${TARGET_SHARDS} shards" + exit 0 + + # ── Action 3: scale-in topology change ─────────────────────────────── + # No-op when current_shards <= target. Otherwise patches Cluster.spec. + # shardings[0].shards down (slots have already been drained by + # asm-migrate) and waits for KubeBlocks to delete the extra pods. + - name: finalize-topology + failurePolicy: Fail + parameters: + - targetShards + - clusterName + workload: + type: Job + podInfoExtractorName: valkey-pod + backoffLimit: 0 + podSpec: + # KB ignores serviceAccountName here; it injects the component's + # kb-managed SA. The chart binds the asm-orchestrator Role to + # the namespace's SA group, which covers it. See chart's + # templates/asm-rbac.yaml for the rationale. + restartPolicy: Never + containers: + - name: kubectl + image: {{ .Values.image.registry | default "docker.io" }}/apecloud/kubectl:1.29 + imagePullPolicy: IfNotPresent + command: + - bash + - -c + - | + set -euo pipefail + CLUSTER="${clusterName}" + TARGET="${targetShards}" + NS="${CLUSTER_NAMESPACE}" + log() { echo "[finalize-topology] $(date +%H:%M:%S) $*"; } + + CURRENT=$(kubectl get cluster -n "$NS" "$CLUSTER" \ + -o jsonpath='{.spec.shardings[0].shards}') + log "current_shards=${CURRENT} target_shards=${TARGET}" + + if [[ "$TARGET" -ge "$CURRENT" ]]; then + log "Not scale-in; nothing to patch." + exit 0 + fi + + log "Scale-in: patching Cluster ${CLUSTER} shardings[0].shards=${TARGET}" + kubectl patch cluster -n "$NS" "$CLUSTER" --type=json \ + -p "[{\"op\":\"replace\",\"path\":\"/spec/shardings/0/shards\",\"value\":${TARGET}}]" + + REPLICAS=$(kubectl get cluster -n "$NS" "$CLUSTER" \ + -o jsonpath='{.spec.shardings[0].template.replicas}') + EXPECTED=$((TARGET * REPLICAS)) + log "Waiting for pod count to drop to ${EXPECTED}..." 
+
+           # Poll up to 10 minutes (600s, 10s interval) for the drained
+           # shard pods to terminate; progress is logged every 30s.
+           elapsed=0
+           while [[ $elapsed -lt 600 ]]; do
+             COUNT=$(kubectl get pods -n "$NS" \
+               -l "app.kubernetes.io/instance=${CLUSTER},apps.kubeblocks.io/sharding-name=shard" \
+               --no-headers 2>/dev/null | wc -l | tr -d ' ')
+             [[ "$COUNT" -le "$EXPECTED" ]] && break
+             [[ $((elapsed % 30)) -eq 0 ]] && log " count=${COUNT} target=${EXPECTED} (${elapsed}s)..."
+             sleep 10
+             elapsed=$((elapsed + 10))
+           done
+           # Still over the target after the full wait → hard failure.
+           if [[ "$COUNT" -gt "$EXPECTED" ]]; then
+             log "ERROR: pod count still ${COUNT} > ${EXPECTED} after 10m"
+             exit 1
+           fi
+
+           # Best-effort wait (5 minutes) for the Cluster to settle back
+           # to Running; a timeout here is only logged, not fatal.
+           log "Waiting for Cluster phase Running..."
+           elapsed=0
+           while [[ $elapsed -lt 300 ]]; do
+             PHASE=$(kubectl get cluster -n "$NS" "$CLUSTER" \
+               -o jsonpath='{.status.phase}' 2>/dev/null)
+             [[ "$PHASE" == "Running" ]] && break
+             sleep 5
+             elapsed=$((elapsed + 5))
+           done
+           log "Cluster phase=${PHASE}. Scale-in finalized."
+         # $(param) placeholders below are substituted by KubeBlocks from
+         # the action's declared parameters / the ops environment.
+         env:
+         - name: clusterName
+           value: $(clusterName)
+         - name: targetShards
+           value: $(targetShards)
+         - name: CLUSTER_NAMESPACE
+           value: $(CLUSTER_NAMESPACE)