From 51078142d62c1f746c611d73939ee1979ff813b6 Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Wed, 6 May 2026 12:43:59 +0200 Subject: [PATCH 01/11] feat(valkey): add Valkey cluster addon as a sibling to redis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stand up addons/valkey/ as a cluster-mode-only side-by-side addon, so our Valkey customizations live in their own file tree and never collide with upstream redis evolution. This retires the five post-install Helm hooks in stream-infra (patch-cache-config, patch-maxmemory, patch-prefer-ip, patch-reshard-cm, patch-valkey-image) by baking the equivalent behaviour into the addon at template-level. What's in the addon ------------------- - Single Valkey major (9.x) — no multi-version range loop, no sentinel, no twemproxy. cmpv-valkey-cluster.yaml ships docker.io/valkey/valkey images. dbctl/agamotto stay on apecloud. - ShardingDefinition with `minShards: 1` (provisions 1, 2, 3+ shards, matching how AWS ElastiCache exposes the same engine). - redis.conf tuned for a cache workload at template-level: appendonly no, save "" (no scheduled BGSAVE), io-threads 1 (avoids CFS throttling at our pod CPU limit), latency-monitor-threshold 25 (observability), maxmemory-policy allkeys-lru, maxmemory at 85% of pod memory limit. - valkey-cluster-server-start.sh: emits `cluster-preferred-endpoint-type ip` on the default-network branch (was `hostname`), so CLUSTER SLOTS announces VPC-routable IPs for chat-api and other external clients. - valkey-cluster-manage.sh: skips the legacy `redis-cli --cluster reshard` call on shard scale-out — slot migration is driven by ASM (CLUSTER MIGRATESLOTS via ape-dts) through the OpsDefinition in stream-infra. - valkey-cluster-common.sh: branches `create_redis_cluster` on a single primary to use `CLUSTER ADDSLOTSRANGE 0 16383` (mirroring ElastiCache), bypassing `redis-cli --cluster create` which rejects fewer than 3 masters. 
Lifts the matching guard in initialize_redis_cluster. Function names inside the scripts intentionally keep their `redis_*` identifiers to minimise the diff vs. upstream redis scripts and ease future bug-porting. Settings are global for now — no per-cluster Helm knobs. Add ParametersDefinition / values overrides later if cluster-specific tunings are needed. Verification ------------ - `helm template addons/valkey` renders 5 resources cleanly: ShardingDefinition, ComponentDefinition, ComponentVersion, plus the config + scripts ConfigMap templates. All 9 script files mount. - shellspec for `build_single_shard_addslots_command` and `create_redis_cluster` branch logic: 4 examples, 0 failures. --- addons/valkey/Chart.yaml | 34 + .../valkey/config/valkey-cluster-config.tpl | 123 ++ addons/valkey/scripts-ut-spec/utils.sh | 32 + .../valkey_cluster_common_spec.sh | 114 ++ addons/valkey/templates/_helpers.tpl | 98 ++ .../valkey/templates/cmpd-valkey-cluster.yaml | 555 +++++++++ .../valkey/templates/cmpv-valkey-cluster.yaml | 36 + .../valkey/templates/shardingdefinition.yaml | 30 + .../valkey-cluster-config-template.yaml | 11 + .../valkey-cluster-scripts-template.yaml | 25 + .../reload-parameter.sh | 30 + .../valkey/valkey-cluster-scripts/sync-acl.sh | 52 + .../valkey-cluster-common.sh | 787 ++++++++++++ .../valkey-cluster-manage.sh | 1051 +++++++++++++++++ .../valkey-cluster-replica-member-leave.sh | 111 ++ .../valkey-cluster-replica-pre-stop.sh | 43 + .../valkey-cluster-server-start.sh | 776 ++++++++++++ .../valkey-cluster-switchover.sh | 255 ++++ .../valkey-cluster-scripts/valkey-ping.sh | 69 ++ addons/valkey/values.yaml | 57 + 20 files changed, 4289 insertions(+) create mode 100644 addons/valkey/Chart.yaml create mode 100644 addons/valkey/config/valkey-cluster-config.tpl create mode 100644 addons/valkey/scripts-ut-spec/utils.sh create mode 100644 addons/valkey/scripts-ut-spec/valkey_cluster_common_spec.sh create mode 100644 addons/valkey/templates/_helpers.tpl create mode 
100644 addons/valkey/templates/cmpd-valkey-cluster.yaml create mode 100644 addons/valkey/templates/cmpv-valkey-cluster.yaml create mode 100644 addons/valkey/templates/shardingdefinition.yaml create mode 100644 addons/valkey/templates/valkey-cluster-config-template.yaml create mode 100644 addons/valkey/templates/valkey-cluster-scripts-template.yaml create mode 100644 addons/valkey/valkey-cluster-scripts/reload-parameter.sh create mode 100644 addons/valkey/valkey-cluster-scripts/sync-acl.sh create mode 100644 addons/valkey/valkey-cluster-scripts/valkey-cluster-common.sh create mode 100644 addons/valkey/valkey-cluster-scripts/valkey-cluster-manage.sh create mode 100755 addons/valkey/valkey-cluster-scripts/valkey-cluster-replica-member-leave.sh create mode 100644 addons/valkey/valkey-cluster-scripts/valkey-cluster-replica-pre-stop.sh create mode 100755 addons/valkey/valkey-cluster-scripts/valkey-cluster-server-start.sh create mode 100644 addons/valkey/valkey-cluster-scripts/valkey-cluster-switchover.sh create mode 100755 addons/valkey/valkey-cluster-scripts/valkey-ping.sh create mode 100644 addons/valkey/values.yaml diff --git a/addons/valkey/Chart.yaml b/addons/valkey/Chart.yaml new file mode 100644 index 000000000..93321aa6c --- /dev/null +++ b/addons/valkey/Chart.yaml @@ -0,0 +1,34 @@ +apiVersion: v2 +name: valkey +description: "Valkey is an open-source, high-performance key/value store. This addon provisions Valkey Cluster topologies on KubeBlocks. Valkey speaks the Redis protocol and uses Redis-compatible cluster bootstrap; this is a sibling addon to `redis` so upstream redis evolution does not conflict with our Valkey customizations." + +type: application + +version: 0.1.0 + +appVersion: "9.0.3" + +# Add a dependency to the kubeblocks definition library chart, same as the redis addon. 
+dependencies: + - name: kblib + version: 0.1.0 + repository: file://../kblib + alias: extra + +home: https://valkey.io/ +icon: https://valkey.io/img/Valkey_Logo_Color.svg +keywords: + - valkey + - redis + - database + - nosql + - cluster + +maintainers: + - name: GetStream + url: https://github.com/GetStream/kubeblocks-addons/ + +annotations: + addon.kubeblocks.io/kubeblocks-version: ">=1.0.0" + addon.kubeblocks.io/model: "key-value" + addon.kubeblocks.io/provider: "community" diff --git a/addons/valkey/config/valkey-cluster-config.tpl b/addons/valkey/config/valkey-cluster-config.tpl new file mode 100644 index 000000000..e896c7c8f --- /dev/null +++ b/addons/valkey/config/valkey-cluster-config.tpl @@ -0,0 +1,123 @@ +bind * -::* +tcp-backlog 511 +timeout 0 +ignore-warnings ARM64-COW-BUG +tcp-keepalive 300 +daemonize no +pidfile /var/run/redis_6379.pid +{{ block "logsBlock" . }} +loglevel notice +logfile "/data/running.log" +{{ end }} +databases 16 +always-show-logo no +set-proc-title yes +proc-title-template "{title} {listen-addr} {server-mode}" +stop-writes-on-bgsave-error yes +rdbcompression yes +rdbchecksum yes +dbfilename dump.rdb +rdb-del-sync-files no +dir /data +replica-serve-stale-data yes +replica-read-only yes +repl-diskless-sync yes +repl-diskless-sync-delay 5 +repl-diskless-sync-max-replicas 0 +repl-diskless-load disabled +repl-disable-tcp-nodelay no +replica-priority 100 +acllog-max-len 128 +lazyfree-lazy-eviction no +lazyfree-lazy-expire no +lazyfree-lazy-server-del no +replica-lazy-flush no +lazyfree-lazy-user-del no +lazyfree-lazy-user-flush no +oom-score-adj no +oom-score-adj-values 0 200 800 +disable-thp yes + +# AOF off: fsync on EBS gp3 caused 30-40ms event-loop stalls (LATENCY DOCTOR +# confirmed). Replicas + EBS-mounted nodes.conf give us cluster-topology +# durability, which is all we need for a cache. 
+appendonly no +appendfilename "appendonly.aof" +appenddirname "appendonlydir" +appendfsync everysec +no-appendfsync-on-rewrite no +auto-aof-rewrite-percentage 100 +auto-aof-rewrite-min-size 64mb +aof-load-truncated yes +aof-use-rdb-preamble yes +aof-timestamp-enabled no + +# Disable scheduled BGSAVE forks (default rules tripped every ~90s under our +# load; each fork briefly stalls the event loop). +save "" + +slowlog-log-slower-than 10000 +slowlog-max-len 128 + +# Observability: log event-loop stalls > 25ms. Negligible overhead, big +# diagnostic value (without it, LATENCY DOCTOR returns nothing). +latency-monitor-threshold 25 + +notify-keyspace-events "" +hash-max-listpack-entries 512 +hash-max-listpack-value 64 +list-max-listpack-size -2 +list-compress-depth 0 +set-max-intset-entries 512 +zset-max-listpack-entries 128 +zset-max-listpack-value 64 +hll-sparse-max-bytes 3000 +stream-node-max-bytes 4096 +stream-node-max-entries 100 +activerehashing yes +client-output-buffer-limit normal 0 0 0 +client-output-buffer-limit replica 256mb 64mb 60 +client-output-buffer-limit pubsub 32mb 8mb 60 +hz 10 +dynamic-hz yes +aof-rewrite-incremental-fsync yes +rdb-save-incremental-fsync yes +jemalloc-bg-thread yes +enable-debug-command yes +aclfile /etc/redis/users.acl + +# Single IO thread: at the pod CPU limit, 4 IO threads + main thread caused +# CFS throttling (~8% of periods at 1500m). Our workload is fine +# single-threaded. STARTUP-ONLY (CONFIG SET rejects io-threads). +io-threads 1 +io-threads-do-reads yes + +# configuration for valkey cluster (Redis-protocol compatible) +cluster-enabled yes +cluster-config-file /data/nodes.conf +cluster-allow-replica-migration no +cluster-node-timeout 5000 +cluster-replica-validity-factor 0 +cluster-require-full-coverage yes +cluster-allow-reads-when-down no + +# Eviction policy: allkeys-lru (cache mode — we want eviction across the +# whole keyspace, not just keys with TTLs). 
+maxmemory-policy allkeys-lru +# maxmemory: 85% of the pod memory limit, leaving ~15% headroom for +# connection / replication buffers. Persistence is disabled, so no RDB-fork +# memory doubling concern. +{{- $limit_memory := default 0 $.PHY_MEMORY | int }} +{{- if gt $limit_memory 0 }} +maxmemory {{ mulf $limit_memory 0.85 | int }} +{{- end }} + +{{- if eq (index $ "TLS_ENABLED") "true" }} +tls-cert-file {{ $.TLS_MOUNT_PATH }}/tls.crt +tls-key-file {{ $.TLS_MOUNT_PATH }}/tls.key +tls-ca-cert-file {{ $.TLS_MOUNT_PATH }}/ca.crt +tls-auth-clients no +tls-replication yes +tls-cluster yes +port 0 +{{- end -}} diff --git a/addons/valkey/scripts-ut-spec/utils.sh b/addons/valkey/scripts-ut-spec/utils.sh new file mode 100644 index 000000000..5b2506969 --- /dev/null +++ b/addons/valkey/scripts-ut-spec/utils.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# utils functions for shellspec unit tests + +convert_tpl_to_bash() { + local input_file="$1" + local output_file="$2" + + sed -e '/^{{\/\*$/,/^\*\/}}$/d' \ + -e '/^{{-.*}}/d' \ + -e 's/{{- define ".*" }}//' \ + -e 's/{{- end }}//' \ + "$input_file" >> "$output_file" +} + +generate_common_library() { + local library_file="$1" + + libcommons_tpl_file="../../kblib/templates/_libcommons.tpl" + libpods_tpl_file="../../kblib/templates/_libpods.tpl" + libstrings_tpl_file="../../kblib/templates/_libstrings.tpl" + libenvs_tpl_file="../../kblib/templates/_libenvs.tpl" + libcompvars_tpl_file="../../kblib/templates/_libcompvars.tpl" + libututils_tpl_file="../../kblib/templates/_libututils.tpl" + + convert_tpl_to_bash $libcommons_tpl_file "$library_file" + convert_tpl_to_bash $libpods_tpl_file "$library_file" + convert_tpl_to_bash $libstrings_tpl_file "$library_file" + convert_tpl_to_bash $libenvs_tpl_file "$library_file" + convert_tpl_to_bash $libcompvars_tpl_file "$library_file" + convert_tpl_to_bash $libututils_tpl_file "$library_file" +} \ No newline at end of file diff --git a/addons/valkey/scripts-ut-spec/valkey_cluster_common_spec.sh 
b/addons/valkey/scripts-ut-spec/valkey_cluster_common_spec.sh new file mode 100644 index 000000000..c95ce409f --- /dev/null +++ b/addons/valkey/scripts-ut-spec/valkey_cluster_common_spec.sh @@ -0,0 +1,114 @@ +# shellcheck shell=bash +# shellcheck disable=SC2034 + +# Tightly scoped spec for the Valkey-specific edits to the cluster bootstrap +# helpers. Full coverage of the upstream script behaviour lives in the redis +# addon's spec; here we cover only what the valkey addon adds: +# +# - build_single_shard_addslots_command (new helper for 1-shard provisioning) +# - create_redis_cluster branch on primary_count == 1 + +# validate_shell_type_and_version defined in shellspec/spec_helper.sh used to validate the expected shell type and version this script needs to run. +if ! validate_shell_type_and_version "bash" 4 &>/dev/null; then + echo "valkey_cluster_common_spec.sh skip cases because dependency bash version 4 or higher is not installed." + exit 0 +fi + +source ./utils.sh + +common_library_file="./common.sh" +generate_common_library $common_library_file + +Describe "Valkey Cluster Common Bash Script Tests" + Include $common_library_file + Include ../valkey-cluster-scripts/valkey-cluster-common.sh + + init() { + # ut_mode=true makes unset_xtrace_when_ut_mode_false / set_xtrace_when_ut_mode_false + # no-op so xtrace doesn't leak into stderr expectations. 
+ ut_mode="true" + } + BeforeAll "init" + + cleanup() { + rm -f $common_library_file + } + AfterAll 'cleanup' + + setup_redis_cli_env() { + REDIS_CLI_TLS_CMD="" + } + Before "setup_redis_cli_env" + + Describe "build_single_shard_addslots_command()" + Context "without password" + It "uses CLUSTER ADDSLOTSRANGE 0 16383" + node_endpoint="172.0.0.1:6379" + + When call build_single_shard_addslots_command "$node_endpoint" + The output should eq "redis-cli -h 172.0.0.1 -p 6379 cluster addslotsrange 0 16383" + The stderr should include "initialize single-shard cluster command: redis-cli -h 172.0.0.1 -p 6379 cluster addslotsrange 0 16383" + End + End + + Context "with password" + setup() { + export REDIS_DEFAULT_PASSWORD="password" + } + Before "setup" + + un_setup() { + unset REDIS_DEFAULT_PASSWORD + } + After "un_setup" + + It "passes auth via -a and masks password in log" + node_endpoint="172.0.0.1:6379" + + When call build_single_shard_addslots_command "$node_endpoint" + The output should eq "redis-cli -h 172.0.0.1 -p 6379 -a password cluster addslotsrange 0 16383" + The stderr should include "initialize single-shard cluster command: redis-cli -h 172.0.0.1 -p 6379 -a ******** cluster addslotsrange 0 16383" + End + End + End + + Describe "create_redis_cluster()" + Context "with a single primary" + build_single_shard_addslots_command() { + echo "ADDSLOTS_CMD" + } + build_redis_cluster_create_command() { + echo "MULTI_SHARD_CMD" + } + ADDSLOTS_CMD() { return 0; } + MULTI_SHARD_CMD() { echo "should not be called"; return 1; } + + It "uses the single-shard ADDSLOTS path and skips --cluster create" + primary_nodes="172.0.0.1:6379 " + + When call create_redis_cluster "$primary_nodes" + The status should be success + The stdout should not include "should not be called" + End + End + + Context "with multiple primaries" + build_single_shard_addslots_command() { + echo "ADDSLOTS_CMD" + } + build_redis_cluster_create_command() { + echo "MULTI_SHARD_CMD" + } + ADDSLOTS_CMD() { echo 
"should not be called"; return 1; } + MULTI_SHARD_CMD() { return 0; } + + It "uses the upstream --cluster create path" + primary_nodes="172.0.0.1:6379 172.0.0.2:6379 172.0.0.3:6379 " + + When call create_redis_cluster "$primary_nodes" + The status should be success + The stdout should not include "should not be called" + End + End + End +End diff --git a/addons/valkey/templates/_helpers.tpl b/addons/valkey/templates/_helpers.tpl new file mode 100644 index 000000000..d62d97bd4 --- /dev/null +++ b/addons/valkey/templates/_helpers.tpl @@ -0,0 +1,98 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "valkey.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "valkey.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "valkey.labels" -}} +helm.sh/chart: {{ include "valkey.chart" . }} +{{ include "valkey.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Common annotations +*/}} +{{- define "valkey.annotations" -}} +{{ include "kblib.helm.resourcePolicy" . }} +{{ include "valkey.apiVersion" . }} +apps.kubeblocks.io/skip-immutable-check: "true" +{{- end }} + +{{/* +API version annotation +*/}} +{{- define "valkey.apiVersion" -}} +kubeblocks.io/crd-api-version: apps.kubeblocks.io/v1 +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "valkey.selectorLabels" -}} +app.kubernetes.io/name: {{ include "valkey.name" . 
}} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Define valkey cluster component definition regular expression name prefix +*/}} +{{- define "valkeyCluster.cmpdRegexpPattern" -}} +^valkey-cluster-\d+ +{{- end -}} + +{{/* +Define valkey cluster component script template name +*/}} +{{- define "valkeyCluster.scriptsTemplate" -}} +valkey-cluster-scripts-template-{{ .Chart.Version }} +{{- end -}} + +{{- define "metrics.repository" -}} +{{ .Values.metrics.image.registry | default ( .Values.image.registry | default "docker.io" ) }}/{{ .Values.metrics.image.repository}} +{{- end }} + +{{- define "metrics.image" -}} +{{ .Values.metrics.image.registry | default ( .Values.image.registry | default "docker.io" ) }}/{{ .Values.metrics.image.repository}}:{{ .Values.metrics.image.tag }} +{{- end }} + +{{/* +Generate scripts configmap data block +*/}} +{{- define "valkey-cluster.extend.scripts" -}} +{{- range $path, $_ := $.Files.Glob "valkey-cluster-scripts/**" }} +{{ $path | base }}: |- +{{- $.Files.Get $path | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "valkey.config.reconfigureAction" -}} +reconfigure: + exec: + container: valkey-cluster + targetPodSelector: All + command: + - /bin/sh + - -c + - | + set -eu + + env | cut -d= -f1 | grep -E '^[a-z0-9_.-][a-z0-9_.-]*$' | sort -u | while IFS= read -r param; do + [ -n "${param}" ] || continue + /scripts/reload-parameter.sh "${param}" "$(printenv "${param}")" + done +{{- end -}} diff --git a/addons/valkey/templates/cmpd-valkey-cluster.yaml b/addons/valkey/templates/cmpd-valkey-cluster.yaml new file mode 100644 index 000000000..c9c39b46a --- /dev/null +++ b/addons/valkey/templates/cmpd-valkey-cluster.yaml @@ -0,0 +1,555 @@ +{{- range .Values.valkeyVersions }} +--- +apiVersion: apps.kubeblocks.io/v1 +kind: ComponentDefinition +metadata: + name: {{ printf "%s-%s" .componentDef $.Chart.Version }} + labels: + {{- include "valkey.labels" $ | nindent 4 }} + annotations: + {{- include "valkey.annotations" $ | 
nindent 4 }} +spec: + provider: GetStream + description: Valkey {{ .major }}.x cluster ComponentDefinition (sibling to redis-cluster, with stream-tuned defaults baked in). + serviceKind: valkey-cluster + serviceVersion: {{ .serviceVersion }} + minReadySeconds: 10 + podUpgradePolicy: ReCreate + tls: + volumeName: tls + mountPath: {{ $.Values.tlsMountPath }} + caFile: ca.crt + certFile: tls.crt + keyFile: tls.key + services: + - name: valkey-advertised + serviceName: valkey-advertised + spec: + type: NodePort + ports: + - name: valkey-advertised + port: 6379 + targetPort: valkey-cluster + - name: advertised-bus + port: 16379 + targetPort: cluster-bus + podService: true + disableAutoProvision: true + - name: valkey-lb-advertised + serviceName: valkey-lb-advertised + spec: + type: LoadBalancer + externalTrafficPolicy: Cluster + ports: + - name: valkey-advertised + port: 6379 + targetPort: valkey-cluster + - name: advertised-bus + port: 16379 + targetPort: cluster-bus + podService: true + disableAutoProvision: true + updateStrategy: BestEffortParallel + podManagementPolicy: OrderedReady + volumes: + - name: data + needSnapshot: true + roles: + - name: primary + updatePriority: 2 + participatesInQuorum: false + - name: secondary + updatePriority: 1 + participatesInQuorum: false + logConfigs: + {{- range $name,$pattern := $.Values.logConfigs }} + - name: {{ $name }} + filePathPattern: {{ $pattern }} + {{- end }} + exporter: + containerName: metrics + scrapePath: /metrics + scrapePort: http-metrics + configs: + - name: valkey-cluster-config + template: {{ printf "valkey-cluster-config-template-%s" $.Chart.Version }} + namespace: {{ $.Release.Namespace }} + volumeName: valkey-cluster-config + externalManaged: true + {{- include "valkey.config.reconfigureAction" $ | nindent 6 }} + scripts: + - name: valkey-cluster-scripts + template: {{ include "valkeyCluster.scriptsTemplate" $ }} + namespace: {{ $.Release.Namespace }} + volumeName: scripts + defaultMode: 0555 + {{- include 
"kblib.syncer.policyRules" $ | nindent 2 }} + systemAccounts: + - name: default + initAccount: true + passwordGenerationPolicy: + length: 10 + numDigits: 5 + numSymbols: 0 + letterCase: MixedCases + hostNetwork: + containerPorts: + - container: valkey-cluster + ports: + - valkey-cluster + - cluster-bus + {{- if $.Values.enableMetrics }} + - container: metrics + ports: + - http-metrics + - server-metrics + {{- end }} + vars: + - name: TLS_ENABLED + valueFrom: + tlsVarRef: + enabled: Optional + optional: true + - name: TLS_MOUNT_PATH + value: {{ $.Values.tlsMountPath }} + - name: CLUSTER_NAME + valueFrom: + clusterVarRef: + clusterName: Required + - name: CLUSTER_NAMESPACE + valueFrom: + clusterVarRef: + namespace: Required + - name: COMPONENT_REPLICAS + valueFrom: + componentVarRef: + optional: false + replicas: Required + - name: CLUSTER_DOMAIN + value: {{ $.Values.clusterDomain }} + ## the default username/password of valkey connection (uses Redis-protocol AUTH) + - name: REDIS_DEFAULT_USER + valueFrom: + credentialVarRef: + name: default + username: Required + - name: REDIS_DEFAULT_PASSWORD + valueFrom: + credentialVarRef: + name: default + password: Required + - name: REDIS_REPL_USER + value: "kbreplicator" + - name: REDIS_REPL_PASSWORD + valueFrom: + credentialVarRef: + name: default + password: Required + - name: CURRENT_SHARD_POD_NAME_LIST + valueFrom: + componentVarRef: + optional: false + podNames: Required + - name: CURRENT_SHARD_POD_FQDN_LIST + valueFrom: + componentVarRef: + optional: false + podFQDNs: Required + - name: CURRENT_SHARD_COMPONENT_NAME + valueFrom: + componentVarRef: + optional: false + componentName: Required + - name: CURRENT_SHARD_COMPONENT_SHORT_NAME + valueFrom: + componentVarRef: + optional: false + shortName: Required + - name: CURRENT_SHARD_ADVERTISED_PORT + valueFrom: + serviceVarRef: + name: valkey-advertised + optional: true + port: + name: valkey-advertised + option: Required + - name: CURRENT_SHARD_ADVERTISED_BUS_PORT + 
valueFrom: + serviceVarRef: + name: valkey-advertised + optional: true + port: + name: advertised-bus + option: Required + - name: CURRENT_SHARD_LB_ADVERTISED_HOST + valueFrom: + serviceVarRef: + name: valkey-lb-advertised + optional: true + loadBalancer: Required + host: Required + - name: CURRENT_SHARD_LB_ADVERTISED_PORT + valueFrom: + serviceVarRef: + name: valkey-lb-advertised + optional: true + port: + name: valkey-advertised + option: Required + - name: CURRENT_SHARD_LB_ADVERTISED_BUS_PORT + valueFrom: + serviceVarRef: + name: valkey-lb-advertised + optional: true + port: + name: advertised-bus + option: Required + - name: ALL_SHARDS_COMPONENT_SHORT_NAMES + valueFrom: + componentVarRef: + compDef: {{ printf "%s-%s" .componentDef $.Chart.Version }} + optional: false + shortName: Required + multipleClusterObjectOption: + strategy: combined + - name: ALL_SHARDS_POD_NAME_LIST + valueFrom: + componentVarRef: + compDef: {{ printf "%s-%s" .componentDef $.Chart.Version }} + optional: false + podNames: Required + multipleClusterObjectOption: + strategy: individual + - name: ALL_SHARDS_POD_FQDN_LIST + valueFrom: + componentVarRef: + compDef: {{ printf "%s-%s" .componentDef $.Chart.Version }} + optional: false + podFQDNs: Required + multipleClusterObjectOption: + strategy: individual + - name: ALL_SHARDS_ADVERTISED_PORT + valueFrom: + serviceVarRef: + compDef: {{ printf "%s-%s" .componentDef $.Chart.Version }} + name: valkey-advertised + optional: true + multipleClusterObjectOption: + strategy: combined + combinedOption: + flattenFormat: + delimiter: "." + keyValueDelimiter: "@" + port: + name: valkey-advertised + option: Required + - name: ALL_SHARDS_LB_ADVERTISED_PORT + valueFrom: + serviceVarRef: + compDef: {{ printf "%s-%s" .componentDef $.Chart.Version }} + name: valkey-lb-advertised + optional: true + multipleClusterObjectOption: + strategy: combined + combinedOption: + flattenFormat: + delimiter: "." 
+ keyValueDelimiter: "@" + port: + name: valkey-advertised + option: Required + - name: ALL_SHARDS_LB_ADVERTISED_BUS_PORT + valueFrom: + serviceVarRef: + compDef: {{ printf "%s-%s" .componentDef $.Chart.Version }} + name: valkey-lb-advertised + optional: true + multipleClusterObjectOption: + strategy: combined + combinedOption: + flattenFormat: + delimiter: "." + keyValueDelimiter: "@" + port: + name: advertised-bus + option: Required + - name: ALL_SHARDS_LB_ADVERTISED_HOST + valueFrom: + serviceVarRef: + compDef: {{ printf "%s-%s" .componentDef $.Chart.Version }} + name: valkey-lb-advertised + optional: true + multipleClusterObjectOption: + strategy: combined + combinedOption: + flattenFormat: + delimiter: "," + keyValueDelimiter: "@" + host: Required + loadBalancer: Required + - name: REDIS_CLUSTER_HOST_NETWORK_PORT + valueFrom: + hostNetworkVarRef: + optional: true + container: + name: valkey-cluster + port: + name: valkey-cluster + option: Required + - name: SERVICE_PORT + value: "6379" + expression: {{ `{{if index . "REDIS_CLUSTER_HOST_NETWORK_PORT"}}{{.REDIS_CLUSTER_HOST_NETWORK_PORT}}{{else}}{{.SERVICE_PORT}}{{end}}` | toYaml }} + - name: REDIS_METRICS_ADDR + value: "redis://localhost:$(SERVICE_PORT)" + expression: {{ `{{if eq (index . "TLS_ENABLED") "true"}}rediss://localhost: {{.SERVICE_PORT }}{{else}}redis://localhost:{{.SERVICE_PORT}}{{end}}` | toYaml }} + - name: REDIS_CLI_TLS_CMD + value: "" + expression: {{ `{{if eq (index . "TLS_ENABLED") "true"}}--tls --insecure{{else }}{{end}}` | toYaml }} + - name: REDIS_CLUSTER_HOST_NETWORK_BUS_PORT + valueFrom: + hostNetworkVarRef: + optional: true + container: + name: valkey-cluster + port: + name: cluster-bus + option: Required + - name: CLUSTER_BUS_PORT + value: "16379" + expression: {{ `{{if index . 
"REDIS_CLUSTER_HOST_NETWORK_BUS_PORT"}}{{.REDIS_CLUSTER_HOST_NETWORK_BUS_PORT}}{{else}}{{.CLUSTER_BUS_PORT}}{{end}}` | toYaml }} + {{- if $.Values.enableMetrics }} + - name: REDIS_METRICS_HOST_NETWORK_PORT + valueFrom: + hostNetworkVarRef: + optional: true + container: + name: metrics + port: + name: http-metrics + option: Required + - name: REDIS_METRICS_HTTP_PORT + value: "9121" + expression: {{ `{{if index . "REDIS_METRICS_HOST_NETWORK_PORT"}}{{.REDIS_METRICS_HOST_NETWORK_PORT}}{{else}}{{.REDIS_METRICS_HTTP_PORT}}{{end}}` | toYaml }} + - name: REDIS_METRICS_HOST_NETWORK_SERVER_PORT + valueFrom: + hostNetworkVarRef: + optional: true + container: + name: metrics + port: + name: server-metrics + option: Required + - name: REDIS_METRICS_SERVER_PORT + value: "8888" + expression: {{ `{{if index . "REDIS_METRICS_HOST_NETWORK_SERVER_PORT"}}{{.REDIS_METRICS_HOST_NETWORK_SERVER_PORT}}{{else}}{{.REDIS_METRICS_SERVER_PORT}}{{end}}` | toYaml }} + - name: REDIS_CLUSTER_ALL_SHARDS_HOST_NETWORK_PORT + valueFrom: + hostNetworkVarRef: + compDef: {{ printf "%s-%s" .componentDef $.Chart.Version }} + optional: true + container: + name: valkey-cluster + port: + name: valkey-cluster + option: Required + multipleClusterObjectOption: + strategy: combined + combinedOption: + flattenFormat: + delimiter: "," + keyValueDelimiter: ":" + {{- end }} + - name: REDIS_CLUSTER_ALL_SHARDS_HOST_NETWORK_BUS_PORT + valueFrom: + hostNetworkVarRef: + compDef: {{ printf "%s-%s" .componentDef $.Chart.Version }} + optional: true + container: + name: valkey-cluster + port: + name: cluster-bus + option: Required + multipleClusterObjectOption: + strategy: combined + combinedOption: + flattenFormat: + delimiter: "," + keyValueDelimiter: ":" + - name: PHY_MEMORY + valueFrom: + resourceVarRef: + memoryLimit: Required + lifecycleActions: + roleProbe: + periodSeconds: 1 + timeoutSeconds: 1 + exec: + container: valkey-cluster + env: + - name: CURRENT_POD_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: 
metadata.name + - name: KB_HOST_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.hostIP + - name: KB_POD_FQDN + value: "$(CURRENT_POD_NAME).$(CURRENT_SHARD_COMPONENT_NAME)-headless.$(CLUSTER_NAMESPACE).svc.{{ $.Values.clusterDomain }}" + - name: KB_CLUSTER_COMP_NAME + value: $(CURRENT_SHARD_COMPONENT_NAME) + - name: REDIS_LB_ADVERTISED_HOST + value: $(CURRENT_SHARD_LB_ADVERTISED_HOST) + - name: KB_SERVICE_PORT + value: "$(SERVICE_PORT)" + command: + - /tools/dbctl + - redis + - getrole + postProvision: + timeoutSeconds: 900 + exec: + container: valkey-cluster + command: + - /bin/bash + - -c + - /scripts/valkey-cluster-manage.sh --post-provision > /tmp/post-provision.log 2>&1 + env: + - name: CURRENT_POD_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.name + - name: CURRENT_POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + - name: CURRENT_POD_HOST_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.hostIP + preCondition: RuntimeReady + retryPolicy: + maxRetries: 10 + memberLeave: + exec: + container: valkey-cluster + command: + - /bin/bash + - -c + - /scripts/valkey-cluster-replica-member-leave.sh > /tmp/member-leave.log 2>&1 + retryPolicy: + maxRetries: 10 + switchover: + exec: + container: valkey-cluster + command: + - /bin/bash + - -c + - /scripts/valkey-cluster-switchover.sh > /tmp/switchover.log 2>&1 + memberJoin: + exec: + container: valkey-cluster + command: + - /bin/bash + - -c + - /scripts/sync-acl.sh + targetPodSelector: Any + runtime: + initContainers: + - name: init-dbctl + command: + - cp + - -r + - /bin/dbctl + - /tools/ + imagePullPolicy: {{ default "IfNotPresent" $.Values.dbctlImage.pullPolicy }} + volumeMounts: + - mountPath: /tools + name: tools + containers: + - name: valkey-cluster + imagePullPolicy: {{ default "IfNotPresent" $.Values.image.pullPolicy }} + ports: + - name: valkey-cluster + containerPort: 6379 + - name: cluster-bus + containerPort: 16379 + volumeMounts: + 
- name: data + mountPath: {{ $.Values.dataMountPath }} + - name: valkey-cluster-config + mountPath: /etc/conf + - name: scripts + mountPath: /scripts + - name: redis-conf + mountPath: /etc/redis + - mountPath: /tools + name: tools + env: + - name: CURRENT_POD_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.name + - name: CURRENT_POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + - name: CURRENT_POD_HOST_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.hostIP + - name: POD_FQDN + value: "$(CURRENT_POD_NAME).$(CURRENT_SHARD_COMPONENT_NAME)-headless.$(CLUSTER_NAMESPACE).svc.{{ $.Values.clusterDomain }}" + command: [ "/scripts/valkey-cluster-server-start.sh" ] + readinessProbe: + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 5 + exec: + command: + - sh + - -c + - /scripts/valkey-ping.sh + lifecycle: + preStop: + exec: + command: + - /bin/bash + - -c + - /scripts/valkey-cluster-replica-pre-stop.sh + {{- if $.Values.enableMetrics }} + - name: metrics + imagePullPolicy: {{ $.Values.metrics.image.pullPolicy | quote }} + securityContext: + runAsNonRoot: true + runAsUser: 1001 + env: + - name: REDIS_ADDR + value: "$(REDIS_METRICS_ADDR)" + - name: REDIS_EXPORTER_WEB_LISTEN_ADDRESS + value: "0.0.0.0:$(REDIS_METRICS_HTTP_PORT)" + - name: REDIS_USER + value: $(REDIS_DEFAULT_USER) + - name: REDIS_PASSWORD + value: $(REDIS_DEFAULT_PASSWORD) + - name: REDIS_EXPORTER_IS_CLUSTER + value: "true" + - name: REDIS_EXPORTER_SKIP_TLS_VERIFICATION + value: "true" + ports: + - name: http-metrics + containerPort: {{ $.Values.metrics.service.port }} + - name: server-metrics + containerPort: {{ $.Values.metrics.service.serverPort }} + {{- end }} +{{- end }} diff --git a/addons/valkey/templates/cmpv-valkey-cluster.yaml b/addons/valkey/templates/cmpv-valkey-cluster.yaml new file mode 100644 index 000000000..31ab4b38c --- /dev/null +++ 
b/addons/valkey/templates/cmpv-valkey-cluster.yaml @@ -0,0 +1,36 @@ +apiVersion: apps.kubeblocks.io/v1 +kind: ComponentVersion +metadata: + name: valkey-cluster + labels: + {{- include "valkey.labels" . | nindent 4 }} + annotations: + {{- include "valkey.apiVersion" . | nindent 4 }} +spec: + compatibilityRules: + {{- range .Values.valkeyVersions }} + - compDefs: + - {{ .componentDef }} + releases: + {{- range .mirrorVersions }} + - {{ .version }} + {{- end }} + {{- end }} + releases: + {{- $valkeyRepository := printf "%s/%s" ( .Values.image.registry | default "docker.io" ) .Values.image.repository }} + {{- range .Values.valkeyVersions }} + {{- range .mirrorVersions }} + - name: {{ .version }} + serviceVersion: {{ .version }} + images: + valkey-cluster: {{ $valkeyRepository }}:{{ .imageTag }} + postProvision: {{ $valkeyRepository }}:{{ .imageTag }} + accountProvision: {{ $valkeyRepository }}:{{ .imageTag }} + switchover: {{ $valkeyRepository }}:{{ .imageTag }} + preTerminate: {{ $valkeyRepository }}:{{ .imageTag }} + memberLeave: {{ $valkeyRepository }}:{{ .imageTag }} + memberJoin: {{ $valkeyRepository }}:{{ .imageTag }} + metrics: {{ include "metrics.repository" $ }}:{{ $.Values.metrics.image.tag }} + init-dbctl: {{ $.Values.dbctlImage.registry | default ( $.Values.image.registry | default "docker.io" ) }}/{{ $.Values.dbctlImage.repository }}:{{ $.Values.dbctlImage.tag }} + {{- end }} + {{- end }} diff --git a/addons/valkey/templates/shardingdefinition.yaml b/addons/valkey/templates/shardingdefinition.yaml new file mode 100644 index 000000000..ed1e71c8b --- /dev/null +++ b/addons/valkey/templates/shardingdefinition.yaml @@ -0,0 +1,30 @@ +apiVersion: apps.kubeblocks.io/v1 +kind: ShardingDefinition +metadata: + name: valkey-cluster + labels: + {{- include "valkey.labels" . | nindent 4 }} + annotations: + {{- include "valkey.apiVersion" . | nindent 4 }} +spec: + template: + compDef: {{ include "valkeyCluster.cmpdRegexpPattern" . 
}} + shardsLimit: + minShards: 1 + maxShards: 64 + provisionStrategy: Parallel + updateStrategy: Parallel + systemAccounts: + - name: default + shared: true + lifecycleActions: + shardRemove: + timeoutSeconds: 900 + exec: + container: valkey-cluster + command: + - /bin/bash + - -c + - /scripts/valkey-cluster-manage.sh --pre-terminate > /tmp/pre-terminate.log 2>&1 + retryPolicy: + maxRetries: 10 diff --git a/addons/valkey/templates/valkey-cluster-config-template.yaml b/addons/valkey/templates/valkey-cluster-config-template.yaml new file mode 100644 index 000000000..38ed4e8ff --- /dev/null +++ b/addons/valkey/templates/valkey-cluster-config-template.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ printf "valkey-cluster-config-template-%s" .Chart.Version }} + labels: + {{- include "valkey.labels" . | nindent 4 }} + annotations: + {{- include "valkey.annotations" . | nindent 4 }} +data: + redis.conf: |- + {{- .Files.Get "config/valkey-cluster-config.tpl" | nindent 4 }} diff --git a/addons/valkey/templates/valkey-cluster-scripts-template.yaml b/addons/valkey/templates/valkey-cluster-scripts-template.yaml new file mode 100644 index 000000000..21e0f3c89 --- /dev/null +++ b/addons/valkey/templates/valkey-cluster-scripts-template.yaml @@ -0,0 +1,25 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ printf "valkey-cluster-scripts-template-%s" .Chart.Version }} + labels: + {{- include "valkey.labels" . | nindent 4 }} + annotations: + {{- include "valkey.annotations" . 
| nindent 4 }} +data: + common.sh: |- + #!/bin/bash + {{- include "kblib.commons.call_func_with_retry" $ | nindent 4 }} + {{- include "kblib.commons.extract_obj_ordinal" $ | nindent 4 }} + {{- include "kblib.compvars.get_target_pod_fqdn_from_pod_fqdn_vars" $ | nindent 4 }} + {{- include "kblib.pods.min_lexicographical_order_pod" $ | nindent 4 }} + {{- include "kblib.ututils.set_xtrace_when_ut_mode_false" $ | nindent 4 }} + {{- include "kblib.ututils.unset_xtrace_when_ut_mode_false" $ | nindent 4 }} + {{- include "kblib.ututils.sleep_when_ut_mode_false" $ | nindent 4 }} + {{- include "kblib.strings.contains" $ | nindent 4 }} + {{- include "kblib.strings.is_empty" $ | nindent 4 }} + {{- include "kblib.strings.equals" $ | nindent 4 }} + {{- include "kblib.strings.split" $ | nindent 4 }} + {{- with include "valkey-cluster.extend.scripts" . }} + {{- . | nindent 2 }} + {{- end }} diff --git a/addons/valkey/valkey-cluster-scripts/reload-parameter.sh b/addons/valkey/valkey-cluster-scripts/reload-parameter.sh new file mode 100644 index 000000000..cf0fd7890 --- /dev/null +++ b/addons/valkey/valkey-cluster-scripts/reload-parameter.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e +paramName="" +paramValue="" +for val in $(echo "${1}" | tr ' ' '\n'); do + if [ -z "${paramName}" ]; then + paramName="${val}" + elif [ -z "${paramValue}" ]; then + paramValue="${val}" + else + paramValue="${paramValue} ${val}" + fi +done + +if [ -z "${paramValue}" ]; then + paramValue="${@:2}" +else + paramValue="${paramValue} ${@:2}" +fi + +if [ "$paramValue" = "\"\"" ]; then + paramValue="" +fi +service_port=${SERVICE_PORT:-6379} + +if [ -z $REDIS_DEFAULT_PASSWORD ]; then + redis-cli $REDIS_CLI_TLS_CMD -p $service_port CONFIG SET ${paramName} "${paramValue}" +else + redis-cli $REDIS_CLI_TLS_CMD -p $service_port -a ${REDIS_DEFAULT_PASSWORD} CONFIG SET ${paramName} "${paramValue}" +fi diff --git a/addons/valkey/valkey-cluster-scripts/sync-acl.sh b/addons/valkey/valkey-cluster-scripts/sync-acl.sh new file 
mode 100644 index 000000000..0b6bd0c31 --- /dev/null +++ b/addons/valkey/valkey-cluster-scripts/sync-acl.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +service_port=${SERVICE_PORT:-6379} +redis_base_cmd="redis-cli $REDIS_CLI_TLS_CMD -p $service_port -a $REDIS_DEFAULT_PASSWORD" +if [ -z "$REDIS_DEFAULT_PASSWORD" ]; then + redis_base_cmd="redis-cli $REDIS_CLI_TLS_CMD -p $service_port" +fi + +is_ok=false +acl_list="" +# 1. get acl list from other pods +for pod_fqdn in $(echo "$REDIS_POD_FQDN_LIST" | tr ',' '\n'); do + if [[ "$pod_fqdn" == "$KB_JOIN_MEMBER_POD_FQDN" ]]; then + continue + fi + acl_list=$($redis_base_cmd -h "$pod_fqdn" ACL LIST) + if [ $? -eq 0 ]; then + is_ok=true + break + fi +done + +if [ "$is_ok" = false ]; then + echo "Failed to get ACL LIST from other pods" >&2 + exit 1 +fi + +if [ -z "$acl_list" ]; then + echo "No ACL rules found in other pods, skip synchronization" >&2 + exit 0 +fi + +set -e +# 2. apply acl list to current pod +while IFS= read -r user_rule; do + [[ -z "$user_rule" ]] && continue + + if [[ "$user_rule" =~ ^user[[:space:]]+([^[:space:]]+) ]]; then + username="${BASH_REMATCH[1]}" + else + # skip invalid user rule + continue + fi + + if [[ "$username" == "default" ]]; then + continue + fi + rule_part="${user_rule#user $username }" + $redis_base_cmd -h $KB_JOIN_MEMBER_POD_FQDN ACL SETUSER "$username" $rule_part >&2 +done <<< "$acl_list" + +$redis_base_cmd -h $KB_JOIN_MEMBER_POD_FQDN ACL save >&2 \ No newline at end of file diff --git a/addons/valkey/valkey-cluster-scripts/valkey-cluster-common.sh b/addons/valkey/valkey-cluster-scripts/valkey-cluster-common.sh new file mode 100644 index 000000000..2f41f6388 --- /dev/null +++ b/addons/valkey/valkey-cluster-scripts/valkey-cluster-common.sh @@ -0,0 +1,787 @@ +#!/bin/bash + +# shellcheck disable=SC2153 +# shellcheck disable=SC2207 +# shellcheck disable=SC2034 + +# This is magic for shellspec ut framework. "test" is a `test [expression]` well known as a shell command. 
+# Normally test without [expression] returns false. It means that __() { :; } +# function is defined if this script runs directly. +# +# shellspec overrides the test command and returns true *once*. It means that +# __() function defined internally by shellspec is called. +# +# In other words. If not in test mode, __ is just a comment. If test mode, __ +# is a interception point. +# you should set ut_mode="true" when you want to run the script in shellspec file. +ut_mode="false" +test || __() { + # when running in non-unit test mode, set the options "set -ex". + set -ex; +} + +retry_times=3 +check_ready_times=30 +retry_delay_second=2 + +# usage: sleep_random_second_when_ut_mode_false +sleep_random_second_when_ut_mode_false() { + if [ "false" == "$ut_mode" ]; then + local max_time="$1" + local min_time="$2" + local random_time=$((RANDOM % (max_time - min_time + 1) + min_time)) + echo "Sleeping for $random_time seconds" + sleep "$random_time" + fi +} + +## the component names of all shard +## the value format of ALL_SHARDS_COMPONENT_SHORT_NAMES is like "shard-98x:shard-98x,shard-cq7:shard-cq7,shard-hy7:shard-hy7" +## return the component names of all shards with the format "shard-98x,shard-cq7,shard-hy7" +get_all_shards_components() { + local all_shards_components="" + if is_empty "$ALL_SHARDS_COMPONENT_SHORT_NAMES"; then + echo "Error: Required environment variable ALL_SHARDS_COMPONENT_SHORT_NAMES is not set." 
>&2 + return 1 + fi + IFS=',' read -ra all_shards_component_shortname_pairs <<< "$ALL_SHARDS_COMPONENT_SHORT_NAMES" + for pair in "${all_shards_component_shortname_pairs[@]}"; do + IFS=':' read -r shard_name _ <<< "$pair" + all_shards_components="${all_shards_components},${shard_name}" + done + all_shards_components="${all_shards_components#,}" + echo "$all_shards_components" + return 0 +} + +## the pod names of all shard, there are some environment variables name prefix with "ALL_SHARDS_POD_NAME_LIST" and +## suffix with the shard name, like "ALL_SHARDS_POD_NAME_LIST_SHARD_98X", "ALL_SHARDS_POD_NAME_LIST_SHARD_CQ7", "ALL_SHARDS_POD_NAME_LIST_SHARD_HY7" +## - ALL_SHARDS_POD_NAME_LIST_SHARD_98X="redis-shard-98x-0,redis-shard-98x-1" +## - ALL_SHARDS_POD_NAME_LIST_SHARD_CQ7="redis-shard-cq7-0,redis-shard-cq7-1" +## - ALL_SHARDS_POD_NAME_LIST_SHARD_HY7="redis-shard-hy7-0,redis-shard-hy7-1" +## return the pod names of all shards combined with "," +get_all_shards_pods() { + ## list all Envs name prefix with ALL_SHARDS_POD_NAME_LIST and get them value combined with "," + local envs + local all_shards_pods="" + envs=$(env | grep "^ALL_SHARDS_POD_NAME_LIST" | sort) + while IFS='=' read -r env_name env_value; do + if ! 
is_empty "$env_value"; then + if is_empty "$all_shards_pods"; then + all_shards_pods="$env_value" + else + all_shards_pods="$all_shards_pods,$env_value" + fi + fi + done <<< "$envs" + echo "$all_shards_pods" + return 0 +} + +## the pod fqdn list for all shard pod, it will generate a set of variables with the shard name suffix like: +## - ALL_SHARDS_POD_FQDN_LIST_SHARD_98X="redis-shard-98x-0.redis-shard-98x-headless.default.cluster.local,redis-shard-98x-1.redis-shard-98x-headless.default.cluster.local" +## - ALL_SHARDS_POD_FQDN_LIST_SHARD_CQ7="redis-shard-cq7-0.redis-shard-cq7-headless.default.cluster.local,redis-shard-cq7-1.redis-shard-cq7-headless.default.cluster.local" +## - ALL_SHARDS_POD_FQDN_LIST_SHARD_HY7="redis-shard-hy7-0.redis-shard-hy7-headless.default.cluster.local,redis-shard-hy7-1.redis-shard-hy7-headless.default.cluster.local" +## return the pod fqdn list for all shard pod combined with "," +get_all_shards_pod_fqdns() { + ## list all Envs name prefix with ALL_SHARDS_POD_FQDN_LIST and get them value combined with "," + local envs + local all_shards_pod_fqdns="" + envs=$(env | grep "^ALL_SHARDS_POD_FQDN_LIST" | sort) + while IFS='=' read -r env_name env_value; do + if [[ -n "$env_value" ]]; then + if [[ -z "$all_shards_pod_fqdns" ]]; then + all_shards_pod_fqdns="$env_value" + else + all_shards_pod_fqdns="$all_shards_pod_fqdns,$env_value" + fi + fi + done <<< "$envs" + echo "$all_shards_pod_fqdns" + return 0 +} + +shutdown_redis_server() { + local service_port="$1" + unset_xtrace_when_ut_mode_false + if ! is_empty "$REDIS_DEFAULT_PASSWORD"; then + redis-cli $REDIS_CLI_TLS_CMD -h 127.0.0.1 -p "$service_port" -a "$REDIS_DEFAULT_PASSWORD" shutdown + else + redis-cli $REDIS_CLI_TLS_CMD -h 127.0.0.1 -p "$service_port" shutdown + fi + set_xtrace_when_ut_mode_false + echo "shutdown redis server succeeded!" 
+} + +check_redis_server_ready() { + unset_xtrace_when_ut_mode_false + local host="$1" + local port="$2" + local max_retry=10 + local retry_interval=5 + check_ready_cmd="redis-cli $REDIS_CLI_TLS_CMD -h $host -p $port ping" + if ! is_empty "$REDIS_DEFAULT_PASSWORD"; then + check_ready_cmd="redis-cli $REDIS_CLI_TLS_CMD -h $host -p $port -a $REDIS_DEFAULT_PASSWORD ping" + fi + output=$($check_ready_cmd) + set_xtrace_when_ut_mode_false + status=$? + if [ $status -ne 0 ] || [ "$output" != "PONG" ] ; then + echo "Failed to execute the check ready command: $check_ready_cmd" >&2 + return 1 + fi +} + +parse_advertised_svc_and_port() { + local pod_name="$1" + local advertised_ports="$2" + local svc_and_port="$3" + local pod_name_ordinal + local found=false + + pod_name_ordinal=$(extract_obj_ordinal "$pod_name") + IFS=',' read -ra ports_array <<< "$advertised_ports" + for entry in "${ports_array[@]}"; do + IFS=':' read -ra parts <<< "$entry" + local svc_name="${parts[0]}" + local port="${parts[1]}" + local svc_name_ordinal + + svc_name_ordinal=$(extract_obj_ordinal "$svc_name") + if [[ "$svc_name_ordinal" == "$pod_name_ordinal" ]]; then + if [[ "${svc_and_port}" == "true" ]]; then + echo "$svc_name:$port" + else + echo "$port" + fi + found=true + return 0 + fi + done + + if [[ "$found" == false ]]; then + return 1 + fi +} + +get_pod_service_port_by_network_mode() { + local target_pod_name="$1" + local service_port=${SERVICE_PORT:-6379} + # if redis cluster is using host network, the service port should be the host network port + if ! 
is_empty "$REDIS_CLUSTER_ALL_SHARDS_HOST_NETWORK_PORT"; then + IFS=',' read -ra port_mappings <<< "$REDIS_CLUSTER_ALL_SHARDS_HOST_NETWORK_PORT" + for mapping in "${port_mappings[@]}"; do + shard_name=$(echo "$mapping" | cut -d':' -f1) + mapping_port=$(echo "$mapping" | cut -d':' -f2) + if echo "${target_pod_name}" | grep -q "$shard_name"; then + service_port=$mapping_port + break + fi + done + fi + echo "$service_port" +} + +send_cluster_meet() { + local primary_endpoint="$1" + local primary_port="$2" + local announce_ip="$3" + local announce_port="$4" + local announce_bus_port="$5" + + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + meet_command="redis-cli $REDIS_CLI_TLS_CMD -h $primary_endpoint -p $primary_port cluster meet $announce_ip $announce_port $announce_bus_port" + logging_mask_meet_command="$meet_command" + else + meet_command="redis-cli $REDIS_CLI_TLS_CMD -h $primary_endpoint -p $primary_port -a $REDIS_DEFAULT_PASSWORD cluster meet $announce_ip $announce_port $announce_bus_port" + logging_mask_meet_command="${meet_command/$REDIS_DEFAULT_PASSWORD/********}" + fi + echo "check and correct other primary nodes meet command: $logging_mask_meet_command" + if ! $meet_command + then + echo "Failed to meet the node $announce_ip:$announce_port in check_and_meet_other_primary_nodes" >&2 + return 1 + else + echo "Meet the node $announce_ip:$announce_port successfully with new announce ip $announce_ip..." >&2 + return 0 + fi + set_xtrace_when_ut_mode_false +} + +get_cluster_info() { + local cluster_node="$1" + local cluster_node_port="$2" + unset_xtrace_when_ut_mode_false + local command="redis-cli $REDIS_CLI_TLS_CMD -h $cluster_node -p $cluster_node_port cluster info" + if ! is_empty "$REDIS_DEFAULT_PASSWORD"; then + command="redis-cli $REDIS_CLI_TLS_CMD -h $cluster_node -p $cluster_node_port -a $REDIS_DEFAULT_PASSWORD cluster info" + fi + cluster_info=$($command) + set_xtrace_when_ut_mode_false + status=$? 
+ if [ $status -ne 0 ]; then + echo "Failed to execute the get cluster info command" >&2 + return 1 + fi + echo "$cluster_info" + return 0 +} + +get_cluster_nodes_info() { + local cluster_node="$1" + local cluster_node_port="$2" + unset_xtrace_when_ut_mode_false + local command="redis-cli $REDIS_CLI_TLS_CMD -h $cluster_node -p $cluster_node_port cluster nodes" + if ! is_empty "$REDIS_DEFAULT_PASSWORD"; then + command="redis-cli $REDIS_CLI_TLS_CMD -h $cluster_node -p $cluster_node_port -a $REDIS_DEFAULT_PASSWORD cluster nodes" + fi + cluster_nodes_info=$($command) + set_xtrace_when_ut_mode_false + status=$? + if [ $status -ne 0 ]; then + echo "Failed to execute the get cluster nodes info command" >&2 + return 1 + fi + echo "$cluster_nodes_info" + return 0 +} + +get_cluster_id() { + local cluster_node="$1" + local cluster_node_port="$2" + local pod_fqdn="$3" + cluster_nodes_info=$(get_cluster_nodes_info "$cluster_node" "$cluster_node_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get cluster nodes info in get_cluster_id" >&2 + return 1 + fi + if [ -n "${pod_fqdn}" ]; then + cluster_id=$(echo "$cluster_nodes_info" | grep "${pod_fqdn}" | awk '{print $1}') + else + cluster_id=$(echo "$cluster_nodes_info" | grep "myself" | awk '{print $1}') + fi + echo "$cluster_id" + return 0 +} + +get_cluster_announce_ip() { + local cluster_node="$1" + local cluster_node_port="$2" + cluster_nodes_info=$(get_cluster_nodes_info "$cluster_node" "$cluster_node_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get cluster nodes info in get_cluster_announce_ip" >&2 + return 1 + fi + cluster_announce_ip=$(echo "$cluster_nodes_info" | grep "myself" | awk '{print $2}' | awk -F ':' '{print $1}') + echo "$cluster_announce_ip" + return 0 +} + +check_node_in_cluster() { + local cluster_node="$1" + local cluster_node_port="$2" + local node_name="$3" + cluster_nodes_info=$(get_cluster_nodes_info "$cluster_node" "$cluster_node_port") + status=$? 
+ if [ $status -ne 0 ]; then + echo "Failed to get cluster nodes info in check_node_in_cluster" >&2 + return 1 + fi + # if the cluster_nodes_info contains multiple lines and the node_name is in the cluster_nodes_info, return true + if [ "$(echo "$cluster_nodes_info" | wc -l)" -gt 1 ] && echo "$cluster_nodes_info" | grep -q "$node_name"; then + return 0 + else + return 1 + fi +} + +send_cluster_meet_with_retry() { + local primary_endpoint="$1" + local primary_port="$2" + local announce_ip="$3" + local announce_port="$4" + local announce_bus_port="$5" + send_cluster_meet_result=$(call_func_with_retry $retry_times 10 send_cluster_meet "$primary_endpoint" "$primary_port" "$announce_ip" "$announce_port" "$announce_bus_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to meet the node $announce_ip:$announce_port in check_and_meet_other_primary_nodes after retry" >&2 + return 1 + fi + return 0 +} + +get_cluster_info_with_retry() { + local cluster_node="$1" + local cluster_node_port="$2" + # call the get_cluster_info function with call_func_with_retry function and get the output + cluster_info=$(call_func_with_retry $retry_times $retry_delay_second get_cluster_info "$cluster_node" "$cluster_node_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get the cluster info of the cluster node $cluster_node:$cluster_node_port after retry" >&2 + return 1 + fi + echo "$cluster_info" + return 0 +} + +get_cluster_nodes_info_with_retry() { + local cluster_node="$1" + local cluster_node_port="$2" + # call the get_cluster_nodes_info function with call_func_with_retry function and get the output + cluster_nodes_info=$(call_func_with_retry $retry_times $retry_delay_second get_cluster_nodes_info "$cluster_node" "$cluster_node_port") + status=$? 
+ if [ $status -ne 0 ]; then + echo "Failed to get the cluster nodes info of the cluster node $cluster_node:$cluster_node_port after retry" >&2 + return 1 + fi + echo "$cluster_nodes_info" + return 0 +} + +get_cluster_id_with_retry() { + local cluster_node="$1" + local cluster_node_port="$2" + local pod_fqdn="$3" + # call the execute_get_cluster_id_command function with call_func_with_retry function and get the output + cluster_id=$(call_func_with_retry $retry_times $retry_delay_second get_cluster_id "$cluster_node" "$cluster_node_port" "${pod_fqdn}") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get the cluster id of the cluster node $cluster_node:$cluster_node_port after retry" >&2 + return 1 + fi + echo "$cluster_id" + return 0 +} + +get_cluster_announce_ip_with_retry() { + local cluster_node="$1" + local cluster_node_port="$2" + # call the execute_get_cluster_announce_ip_command function with call_func_with_retry function and get the output + cluster_announce_ip=$(call_func_with_retry $retry_times $retry_delay_second get_cluster_announce_ip "$cluster_node" "$cluster_node_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get the cluster announce ip of the cluster node $cluster_node:$cluster_node_port after retry" >&2 + return 1 + fi + echo "$cluster_announce_ip" + return 0 +} + +check_node_in_cluster_with_retry() { + local cluster_node="$1" + local cluster_node_port="$2" + local node_name="$3" + # call the execute_check_node_in_cluster_command function with call_func_with_retry function and get the output + check_result=$(call_func_with_retry $retry_times $retry_delay_second check_node_in_cluster "$cluster_node" "$cluster_node_port" "$node_name") + status=$? 
+ if [ $status -ne 0 ]; then + echo "Failed to check the node $node_name in the cluster node $cluster_node:$cluster_node_port after retry" >&2 + return 1 + fi + return 0 +} + +check_redis_server_ready_with_retry() { + local host="$1" + local port="$2" + # call the execute_check_redis_server_ready_command function with call_func_with_retry function and get the output + check_result=$(call_func_with_retry $check_ready_times $retry_delay_second check_redis_server_ready "$host" "$port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to check the redis server ready after retry" >&2 + return 1 + fi + return 0 +} + +# check redis cluster all slots are covered +check_slots_covered() { + # cluster_node_endpoint_wth_port is the target node endpoint with port, for example 172.0.0.1:6379 + local node_endpoint_wth_port="$1" + local cluster_service_port="$2" + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + check=$(redis-cli $REDIS_CLI_TLS_CMD --cluster check "$node_endpoint_wth_port" -p "$cluster_service_port") + else + check=$(redis-cli $REDIS_CLI_TLS_CMD --cluster check "$node_endpoint_wth_port" -p "$cluster_service_port" -a "$REDIS_DEFAULT_PASSWORD") + fi + set_xtrace_when_ut_mode_false + if contains "$check" "All 16384 slots covered"; then + return 0 + else + return 1 + fi +} + +# check if the cluster has been initialized +check_cluster_initialized() { + local cluster_pod_fqdn_list="$1" + if is_empty "$cluster_pod_fqdn_list"; then + echo "Error: Required environment variable cluster_pod_fqdn_list is not set." >&2 + return 1 + fi + + local service_port + for pod_fqdn in $(echo "$cluster_pod_fqdn_list" | tr ',' ' '); do + pod_name=${pod_fqdn%%.*} + service_port=$(get_pod_service_port_by_network_mode "${pod_name}") + cluster_info=$(get_cluster_info_with_retry "$pod_fqdn" "$service_port") + status=$? 
+ if [ $status -ne 0 ]; then + echo "Failed to get cluster info in check_cluster_initialized" >&2 + return 1 + fi + cluster_state=$(echo "$cluster_info" | awk -F: '/cluster_state/{print $2}' | tr -d '[:space:]') + if is_empty "$cluster_state" || equals "$cluster_state" "ok"; then + echo "Redis Cluster already initialized" + return 0 + fi + done + echo "Redis Cluster not initialized" >&2 + return 1 +} + +build_redis_cluster_create_command() { + local primary_nodes="$1" + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + initialize_command="redis-cli $REDIS_CLI_TLS_CMD --cluster create $primary_nodes --cluster-yes" + logging_mask_initialize_command="$initialize_command" + else + initialize_command="redis-cli $REDIS_CLI_TLS_CMD --cluster create $primary_nodes -a $REDIS_DEFAULT_PASSWORD --cluster-yes" + logging_mask_initialize_command="${initialize_command/$REDIS_DEFAULT_PASSWORD/********}" + fi + echo "initialize cluster command: $logging_mask_initialize_command" >&2 + set_xtrace_when_ut_mode_false + echo "$initialize_command" +} + +# `redis-cli --cluster create` rejects clusters with fewer than 3 masters. +# For single-shard provisioning we bypass it and assign all 16384 slots to +# the lone primary directly via CLUSTER ADDSLOTSRANGE — same approach AWS +# ElastiCache uses for 1-node Valkey/Redis Cluster topologies. +build_single_shard_addslots_command() { + local node_endpoint="$1" + local host="${node_endpoint%:*}" + local port="${node_endpoint##*:}" + unset_xtrace_when_ut_mode_false + local auth="" + if ! 
is_empty "$REDIS_DEFAULT_PASSWORD"; then + auth="-a $REDIS_DEFAULT_PASSWORD" + fi + initialize_command="redis-cli $REDIS_CLI_TLS_CMD -h $host -p $port $auth cluster addslotsrange 0 16383" + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + logging_mask_initialize_command="$initialize_command" + else + logging_mask_initialize_command="${initialize_command/$REDIS_DEFAULT_PASSWORD/********}" + fi + echo "initialize single-shard cluster command: $logging_mask_initialize_command" >&2 + set_xtrace_when_ut_mode_false + echo "$initialize_command" +} + +build_secondary_replicated_command() { + local secondary_endpoint_with_port="$1" + local mapping_primary_endpoint_with_port="$2" + local mapping_primary_cluster_id="$3" + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + replicated_command="redis-cli $REDIS_CLI_TLS_CMD --cluster add-node $secondary_endpoint_with_port $mapping_primary_endpoint_with_port --cluster-slave --cluster-master-id $mapping_primary_cluster_id" + logging_mask_replicated_command="$replicated_command" + else + replicated_command="redis-cli $REDIS_CLI_TLS_CMD --cluster add-node $secondary_endpoint_with_port $mapping_primary_endpoint_with_port --cluster-slave --cluster-master-id $mapping_primary_cluster_id -a $REDIS_DEFAULT_PASSWORD" + logging_mask_replicated_command="${replicated_command/$REDIS_DEFAULT_PASSWORD/********}" + fi + echo "initialize cluster secondary add-node command: $logging_mask_replicated_command" >&2 + set_xtrace_when_ut_mode_false + echo "$replicated_command" +} + +build_scale_out_shard_primary_join_command() { + local scale_out_shard_default_primary_endpoint_with_port="$1" + local exist_available_node="$2" + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + add_node_command="redis-cli $REDIS_CLI_TLS_CMD --cluster add-node $scale_out_shard_default_primary_endpoint_with_port $exist_available_node" + logging_mask_add_node_command="$add_node_command" + else + 
add_node_command="redis-cli $REDIS_CLI_TLS_CMD --cluster add-node $scale_out_shard_default_primary_endpoint_with_port $exist_available_node -a $REDIS_DEFAULT_PASSWORD" + logging_mask_add_node_command="${add_node_command/$REDIS_DEFAULT_PASSWORD/********}" + fi + echo "scale out shard primary add-node command: $logging_mask_add_node_command" >&2 + set_xtrace_when_ut_mode_false + echo "$add_node_command" +} + +build_reshard_command() { + local primary_node_with_port="$1" + local mapping_primary_cluster_id="$2" + local slots_per_shard="$3" + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + reshard_command="redis-cli $REDIS_CLI_TLS_CMD --cluster reshard $primary_node_with_port --cluster-from all --cluster-to $mapping_primary_cluster_id --cluster-slots $slots_per_shard --cluster-yes" + logging_mask_reshard_command="$reshard_command" + else + reshard_command="redis-cli $REDIS_CLI_TLS_CMD --cluster reshard $primary_node_with_port --cluster-from all --cluster-to $mapping_primary_cluster_id --cluster-slots $slots_per_shard -a $REDIS_DEFAULT_PASSWORD --cluster-yes" + logging_mask_reshard_command="${reshard_command/$REDIS_DEFAULT_PASSWORD/********}" + fi + echo "scale out shard reshard command: $logging_mask_reshard_command" >&2 + set_xtrace_when_ut_mode_false + echo "$reshard_command" +} + +build_rebalance_to_zero_command() { + local node_with_port="$1" + local node_cluster_id="$2" + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + rebalance_command="redis-cli $REDIS_CLI_TLS_CMD --cluster rebalance $node_with_port --cluster-weight $node_cluster_id=0 --cluster-yes " + logging_mask_rebalance_command="$rebalance_command" + else + rebalance_command="redis-cli $REDIS_CLI_TLS_CMD --cluster rebalance $node_with_port --cluster-weight $node_cluster_id=0 --cluster-yes -a $REDIS_DEFAULT_PASSWORD" + logging_mask_rebalance_command="${rebalance_command/$REDIS_DEFAULT_PASSWORD/********}" + fi + echo "set current component slot to 
0 by rebalance command: $logging_mask_rebalance_command" >&2 + set_xtrace_when_ut_mode_false + echo "$rebalance_command" +} + +build_del_node_command() { + local available_node="$1" + local node_to_del_cluster_id="$2" + local do_forget_node="$3" + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + del_node_command="redis-cli $REDIS_CLI_TLS_CMD --cluster del-node $available_node $node_to_del_cluster_id -p $SERVICE_PORT" + if [[ "$do_forget_node" == "true" ]]; then + del_node_command="redis-cli $REDIS_CLI_TLS_CMD -p $SERVICE_PORT --cluster call $available_node cluster forget $node_to_del_cluster_id" + fi + logging_mask_del_node_command="$del_node_command" + else + del_node_command="redis-cli $REDIS_CLI_TLS_CMD --cluster del-node $available_node $node_to_del_cluster_id -p $SERVICE_PORT -a $REDIS_DEFAULT_PASSWORD" + if [[ "$do_forget_node" == "true" ]]; then + del_node_command="redis-cli $REDIS_CLI_TLS_CMD -p $SERVICE_PORT --cluster call $available_node cluster forget $node_to_del_cluster_id -a $REDIS_DEFAULT_PASSWORD" + fi + logging_mask_del_node_command="${del_node_command/$REDIS_DEFAULT_PASSWORD/********}" + fi + echo "del node command: $logging_mask_del_node_command" >&2 + set_xtrace_when_ut_mode_false + echo "$del_node_command" +} + +build_acl_save_command() { + local service_port="$1" + unset_xtrace_when_ut_mode_false + if ! 
is_empty "$REDIS_DEFAULT_PASSWORD"; then + acl_save_command="redis-cli $REDIS_CLI_TLS_CMD -h localhost -p $service_port -a $REDIS_DEFAULT_PASSWORD acl save" + logging_mask_acl_save_command="${acl_save_command/$REDIS_DEFAULT_PASSWORD/********}" + else + acl_save_command="redis-cli $REDIS_CLI_TLS_CMD -h localhost -p $service_port acl save" + logging_mask_acl_save_command="$acl_save_command" + fi + echo "acl save command: $logging_mask_acl_save_command" >&2 + set_xtrace_when_ut_mode_false + echo "$acl_save_command" +} + +create_redis_cluster() { + local primary_nodes="$1" + local primary_count + primary_count=$(echo "$primary_nodes" | wc -w | tr -d ' ') + if [ "$primary_count" -eq 1 ]; then + local single_node="${primary_nodes% }" + initialize_command=$(build_single_shard_addslots_command "$single_node") + else + initialize_command=$(build_redis_cluster_create_command "$primary_nodes") + fi + if ! $initialize_command; then + echo "Failed to create Valkey Cluster" >&2 + return 1 + fi + return 0 +} + +secondary_replicated_to_primary() { + local secondary_endpoint_with_port="$1" + local mapping_primary_endpoint_with_port="$2" + local mapping_primary_cluster_id="$3" + replicated_command=$(build_secondary_replicated_command "$secondary_endpoint_with_port" "$mapping_primary_endpoint_with_port" "$mapping_primary_cluster_id") + replicated_output=$($replicated_command) + replicated_exit_code=$? + if [ $replicated_exit_code -ne 0 ]; then + echo "Failed to replicate the secondary node $secondary_endpoint_with_port to the primary node $mapping_primary_endpoint_with_port" >&2 + return 1 + fi + echo "$replicated_output" + return 0 +} + +scale_out_shard_primary_join_cluster() { + local scale_out_shard_default_primary_endpoint_with_port="$1" + local exist_available_node="$2" + add_node_command=$(build_scale_out_shard_primary_join_command "$scale_out_shard_default_primary_endpoint_with_port" "$exist_available_node") + if ! 
$add_node_command; then + echo "Failed to add the node $scale_out_shard_default_primary_endpoint_with_port to the cluster when scale_out_shard_primary_join_cluster" >&2 + return 1 + fi + return 0 +} + +scale_out_shard_reshard() { + local primary_node_with_port="$1" + local mapping_primary_cluster_id="$2" + local slots_per_shard="$3" + reshard_command=$(build_reshard_command "$primary_node_with_port" "$mapping_primary_cluster_id" "$slots_per_shard") + if ! $reshard_command; then + echo "Failed to reshard the cluster when scale_out_shard_reshard" >&2 + return 1 + fi + return 0 +} + +scale_in_shard_rebalance_to_zero() { + local node_with_port="$1" + local node_cluster_id="$2" + rebalance_command=$(build_rebalance_to_zero_command "$node_with_port" "$node_cluster_id") + if ! $rebalance_command; then + echo "Failed to rebalance the cluster when scale_in_shard_rebalance_to_zero" >&2 + return 1 + fi + return 0 +} + +scale_in_shard_del_node() { + local available_node="$1" + local node_to_del_cluster_id="$2" + del_node_command=$(build_del_node_command "$available_node" "$node_to_del_cluster_id") + if ! $del_node_command; then + echo "Failed to delete the node $available_node from the cluster when scale_in_shard_del_node" >&2 + return 1 + fi + return 0 +} + +secondary_member_leave_del_node() { + local available_node="$1" + local node_to_del_cluster_id="$2" + local do_forget_node="$3" + del_node_command=$(build_del_node_command "$available_node" "$node_to_del_cluster_id" "$do_forget_node") + if ! $del_node_command; then + echo "Failed to delete the node $available_node from the cluster when secondary_member_leave_del_node" >&2 + return 1 + fi + return 0 +} + +secondary_member_leave_del_node_with_retry() { + local available_node="$1" + local node_to_del_cluster_id="$2" + local do_forget_node="$3" + check_result=$(call_func_with_retry $check_ready_times $retry_delay_second secondary_member_leave_del_node "$available_node" "$node_to_del_cluster_id" "$do_forget_node") + status=$? 
+ if [ $status -ne 0 ]; then + echo "Failed to remove replica when member leave after retry" >&2 + return 1 + fi + return 0 +} + +execute_acl_save() { + local service_port="$1" + acl_save_command=$(build_acl_save_command "$service_port") + if ! $acl_save_command; then + echo "Failed to execute acl save command" >&2 + return 1 + fi + return 0 +} + +execute_acl_save_with_retry() { + local service_port="$1" + check_result=$(call_func_with_retry $check_ready_times $retry_delay_second execute_acl_save $service_port) + status=$? + if [ $status -ne 0 ]; then + echo "Failed to execute acl save command after retry" >&2 + return 1 + fi + return 0 +} + +check_redis_role() { + local host=$1 + local port=$2 + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + role_info=$(redis-cli $REDIS_CLI_TLS_CMD -h $host -p $port info replication) + else + role_info=$(redis-cli $REDIS_CLI_TLS_CMD -h $host -p $port -a "$REDIS_DEFAULT_PASSWORD" info replication) + fi + set_xtrace_when_ut_mode_false + + if echo "$role_info" | grep -q "^role:master"; then + echo "primary" + elif echo "$role_info" | grep -q "^role:slave"; then + echo "secondary" + else + echo "unknown" + fi +} + +redis_config_get() { + local host=$1 + local port=$2 + local password=$3 + local command=$4 + + local output + unset_xtrace_when_ut_mode_false + if ! is_empty "$password"; then + output=$(redis-cli $REDIS_CLI_TLS_CMD -h "$host" -p "$port" -a "$password" $command) + else + output=$(redis-cli $REDIS_CLI_TLS_CMD -h "$host" -p "$port" $command) + fi + local status=$? + set_xtrace_when_ut_mode_false + + if [[ $status -ne 0 ]]; then + echo "Command failed with status $status." >&2 + return 1 + fi + + if [[ -z "$output" ]]; then + echo "Command returned no output." 
>&2 + return 1 + fi + + echo "$output" + return 0 +} + +forget_fail_node_when_cluster_is_ok() { + local host=$1 + local port=$2 + unset_xtrace_when_ut_mode_false + cluster_info=$(get_cluster_info_with_retry "$host" "$port") + cluster_state=$(echo "$cluster_info" | awk -F: '/cluster_state/{print $2}' | tr -d '[:space:]') + if [[ "$cluster_state" != "ok" ]]; then + echo "Cluster state is not ok, skip forget fail node" + set_xtrace_when_ut_mode_false + return 0 + fi + cluster_nodes_info=$(get_cluster_nodes_info "$host" "$port") + while read -r line; do + node_id=$(echo "$line" | awk '{print $1}') + node_role=$(echo "$line" | awk '{print $3}') + if [[ "$node_role" == "fail" ]]; then + if [ -z ${REDIS_DEFAULT_PASSWORD} ]; then + redis-cli -h $host -p $port --cluster call $host:$port cluster forget ${node_id} + else + redis-cli -h $host -p $port --cluster call $host:$port cluster forget ${node_id} -a ${REDIS_DEFAULT_PASSWORD} + fi + fi + done <<< "$cluster_nodes_info" + set_xtrace_when_ut_mode_false +} \ No newline at end of file diff --git a/addons/valkey/valkey-cluster-scripts/valkey-cluster-manage.sh b/addons/valkey/valkey-cluster-scripts/valkey-cluster-manage.sh new file mode 100644 index 000000000..b78d5aae6 --- /dev/null +++ b/addons/valkey/valkey-cluster-scripts/valkey-cluster-manage.sh @@ -0,0 +1,1051 @@ +#!/bin/bash + +# shellcheck disable=SC2128 +# shellcheck disable=SC2207 +# shellcheck disable=SC1090 + +# This is magic for shellspec ut framework. "test" is a `test [expression]` well known as a shell command. +# Normally test without [expression] returns false. It means that __() { :; } +# function is defined if this script runs directly. +# +# shellspec overrides the test command and returns true *once*. It means that +# __() function defined internally by shellspec is called. +# +# In other words. If not in test mode, __ is just a comment. If test mode, __ +# is a interception point. 
+# +# you should set ut_mode="true" when you want to run the script in shellspec file. +# +# shellcheck disable=SC2034 +ut_mode="false" +test || __() { + # when running in non-unit test mode, set the options "set -ex". + set -ex; +} + +# declare the global variables for initialize redis cluster +declare -gA initialize_redis_cluster_primary_nodes +declare -gA initialize_redis_cluster_secondary_nodes +declare -gA initialize_pod_name_to_advertise_host_port_map + +# declare the global variables for scale out redis cluster shard +declare -gA scale_out_shard_default_primary_node +declare -gA scale_out_shard_default_other_nodes +network_mode="default" + +init_environment(){ + if [[ -z "${CURRENT_SHARD_ADVERTISED_PORT}" ]]; then + CURRENT_SHARD_ADVERTISED_PORT="${CURRENT_SHARD_LB_ADVERTISED_PORT}" + fi + if [[ -z "${CURRENT_SHARD_ADVERTISED_BUS_PORT}" ]]; then + CURRENT_SHARD_ADVERTISED_BUS_PORT="${CURRENT_SHARD_LB_ADVERTISED_BUS_PORT}" + fi + if [[ -z "${ALL_SHARDS_ADVERTISED_PORT}" ]]; then + ALL_SHARDS_ADVERTISED_PORT="${ALL_SHARDS_LB_ADVERTISED_PORT}" + fi + if [[ -z "${ALL_SHARDS_ADVERTISED_BUS_PORT}" ]]; then + ALL_SHARDS_ADVERTISED_BUS_PORT="${ALL_SHARDS_LB_ADVERTISED_BUS_PORT}" + fi + # determine cluster network mode + if [[ -n "$ALL_SHARDS_ADVERTISED_PORT" ]]; then + network_mode="advertised_svc" + elif [[ -n "$REDIS_CLUSTER_ALL_SHARDS_HOST_NETWORK_PORT" ]]; then + network_mode="host_network" + fi + KB_CLUSTER_POD_NAME_LIST=$(get_all_shards_pods) + KB_CLUSTER_POD_FQDN_LIST=$(get_all_shards_pod_fqdns) + KB_CLUSTER_COMPONENT_LIST=$(get_all_shards_components) +} + +load_redis_cluster_common_utils() { + # the common.sh and redis-cluster-common.sh scripts are defined in the redis-cluster-scripts-template configmap + # and are mounted to the same path which defined in the cmpd.spec.scripts + kblib_common_library_file="/scripts/common.sh" + redis_cluster_common_library_file="/scripts/valkey-cluster-common.sh" + source "${kblib_common_library_file}" + source 
"${redis_cluster_common_library_file}" +} + +check_initialize_nodes_ready() { + local nodes=("$@") + for node in "${nodes[@]}"; do + local host port + host=$(echo "$node" | cut -d':' -f1) + port=$(echo "$node" | cut -d':' -f2) + if ! check_redis_server_ready_with_retry "$host" "$port"; then + return 1 + fi + done + return 0 +} + +# initialize the other component and pods info +init_other_components_and_pods_info() { + local current_component="$1" + local all_pod_fqdn_list="$2" + local all_component_list="$3" + + other_components=() + other_component_pod_names=() + other_component_nodes=() + echo "init other components and pods info, current component: $current_component" + # filter out the components of the given component + IFS=',' read -ra components <<< "$all_component_list" + for comp in "${components[@]}"; do + if contains "$comp" "$current_component"; then + echo "skip the component $comp as it is the current component" + continue + fi + other_components+=("$comp") + done + + # filter out the pods of the given component + for pod_fqdn in $(echo "$all_pod_fqdn_list" | tr ',' '\n'); do + pod_name=${pod_fqdn%%.*} + if echo "$pod_name" | grep "$current_component-"; then + echo "skip the pod $pod_name as it belongs the component $current_component" + continue + fi + + other_component_pod_names+=("$pod_name") + + local pod_service_port + pod_service_port=$(get_pod_service_port_by_network_mode "$pod_name") + other_component_nodes+=("$pod_fqdn:$pod_service_port") + done + + echo "other_components: ${other_components[*]}" + echo "other_component_pod_names: ${other_component_pod_names[*]}" + echo "other_component_nodes: ${other_component_nodes[*]}" +} + +find_exist_available_node() { + local node_ip + local node_port + for node in "${other_component_nodes[@]}"; do + # the $node is the headless address by default, we should get the real node address from cluster nodes + node_ip=$(echo "$node" | cut -d':' -f1) + node_port=$(echo "$node" | cut -d':' -f2) + if 
check_slots_covered "$node" "$node_port"; then + # the $node is the headless address by default, we should get the real node address from cluster nodes + cluster_nodes_info=$(get_cluster_nodes_info "$node_ip" "$node_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get cluster nodes info in find_exist_available_node" >&2 + exit 1 + fi + # grep my self node and return the nodeIp:port(it may be the announceIp and announcePort, for example when cluster enable NodePort/LoadBalancer service) + available_node_with_port=$(echo "$cluster_nodes_info" | grep "myself" | awk '{print $2}' | cut -d'@' -f1) + echo "$available_node_with_port" + return + fi + done + echo "" +} + +extract_pod_name_prefix() { + local pod_name="$1" + # shellcheck disable=SC2001 + prefix=$(echo "$pod_name" | sed 's/-[0-9]*$//') + echo "$prefix" +} + +extract_lb_host_by_svc_name() { + local svc_name="$1" + for lb_composed_name in $(echo "$ALL_SHARDS_LB_ADVERTISED_HOST" | tr ',' '\n' ); do + lb_composed_name=${lb_composed_name#*@} + if [[ ${lb_composed_name} == *":"* ]]; then + if [[ ${lb_composed_name%:*} == "$svc_name" ]]; then + echo "${lb_composed_name#*:}" + break + fi + else + break + fi + done +} + +# get the current component primary node and other nodes for scale in +get_current_comp_nodes_for_scale_in() { + + parse_node_line_info() { + local line="$1" + + local node_ip_port_fields + # 10.42.0.227:6379@16379,redis-shard-sxj-0.redis-shard-sxj-headless.default.svc.cluster.local + node_ip_port_fields=$(echo "$line" | awk '{print $2}') + + local node_ip_port + # ip:port without bus port + node_ip_port=$(echo "$node_ip_port_fields" | awk -F '@' '{print $1}') + + local node_ip + node_ip=$(echo "$node_ip_port" | cut -d':' -f1) + + local node_port + node_port=$(echo "$node_ip_port" | cut -d':' -f2) + + local node_fqdn + # redis-shard-sxj-0.redis-shard-sxj-headless.default.svc + node_fqdn=$(echo "$line" | awk '{print $2}' | awk -F ',' '{print $2}') + + local node_role + node_role=$(echo 
"$line" | awk '{print $3}') + + echo "$node_ip $node_port $node_role $node_fqdn" + } + + get_node_address_by_network_mode() { + local node_ip="$1" + local node_port="$2" + local node_fqdn="$3" + + case "$network_mode" in + "advertised_svc") + echo "$node_ip:$node_port" + ;; + "host_network") + echo "$node_ip:$REDIS_CLUSTER_HOST_NETWORK_PORT" + ;; + *) + # shellcheck disable=SC2153 + echo "$node_fqdn:$SERVICE_PORT" + ;; + esac + } + + categorize_node() { + local node_address="$1" + local node_role="$2" + local belong_current_comp="$3" + + if [[ "$belong_current_comp" == "true" ]]; then + if [[ "$node_role" =~ "master" && ! "$node_role" =~ "fail" ]]; then + current_comp_primary_node+=("$node_address") + else + current_comp_other_nodes+=("$node_address") + fi + fi + } + + local cluster_node="$1" + local cluster_node_port="$2" + cluster_nodes_info=$(get_cluster_nodes_info "$cluster_node" "$cluster_node_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get cluster nodes info in get_current_comp_nodes_for_scale_in" >&2 + return 1 + fi + + current_comp_primary_node=() + current_comp_other_nodes=() + + # if the cluster_nodes_info contains only one line, it means that the cluster not be initialized + if [ "$(echo "$cluster_nodes_info" | wc -l)" -eq 1 ]; then + echo "Cluster nodes info contains only one line, returning..." 
+ return + fi + + # prepare CURRENT_SHARD_HOST_OR_PORT_LIST for advertised_svc mode + CURRENT_SHARD_HOST_OR_PORT_LIST=() + if [ "$network_mode" == "advertised_svc" ]; then + IFS=',' read -ra CURRENT_POD_LIST <<< "$CURRENT_SHARD_POD_NAME_LIST" + for pod_name in "${CURRENT_POD_LIST[@]}"; do + svc_and_port=$(parse_advertised_svc_and_port "$pod_name" "$CURRENT_SHARD_ADVERTISED_PORT" "true") + svc_name=${svc_and_port%:*} + lb_host=$(extract_lb_host_by_svc_name "${svc_name}") + if [ -n "$lb_host" ]; then + CURRENT_SHARD_HOST_OR_PORT_LIST+=("${lb_host}:6379") + else + svc_port="${svc_and_port#*:}" + CURRENT_SHARD_HOST_OR_PORT_LIST+=(":${svc_port}") + fi + echo "pod_name: $pod_name, svc_and_port: $svc_and_port" + done + # check length of CURRENT_SHARD_ANNOUNCE_IP_LIST must equal to CURRENT_POD_LIST + if [ ${#CURRENT_SHARD_HOST_OR_PORT_LIST[@]} -ne ${#CURRENT_POD_LIST[@]} ]; then + echo "Error: failed to get the pod ip list from KB_POD_LIST" + return 1 + fi + fi + # the output of line is like: + # 1. using the pod fqdn as the nodeAddr + # 4958e6dca033cd1b321922508553fab869a29d 10.42.0.227:6379@16379,redis-shard-sxj-0.redis-shard-sxj-headless.default.svc.cluster.local master - 0 1711958289570 4 connected 0-1364 5461-6826 10923-12287 + # 2. using the nodeport or lb ip as the nodeAddr + # 4958e6dca033cd1b321922508553fab869a29d 172.10.0.1:31000@31888,redis-shard-sxj-0.redis-shard-sxj-headless.default.svc.cluster.local master master - 0 1711958289570 4 connected 0-1364 5461-6826 10923-12287 + # 3. 
using the host network ip as the nodeAddr + # 4958e6dca033cd1b321922508553fab869a29d 172.10.0.1:1050@1051,redis-shard-sxj-0.redis-shard-sxj-headless.default.svc.cluster.local master - 0 1711958289570 4 connected 0-1364 5461-6826 10923-12287 + while read -r line; do + local node_info + node_info=$(parse_node_line_info "$line") + read -r node_ip node_port node_role node_fqdn <<< "$node_info" + + belong_current_comp=false + if [ "$network_mode" == "advertised_svc" ]; then + for i in "${CURRENT_SHARD_HOST_OR_PORT_LIST[@]}"; do + node_announce_info=":$node_port" + if ! is_empty "$CURRENT_SHARD_LB_ADVERTISED_PORT"; then + node_announce_info="$node_ip:$node_port" + fi + if [[ "$i" == "$node_announce_info" ]]; then + belong_current_comp=true + break + fi + done + elif [ "$network_mode" == "host_network" ]; then + if contains "$node_port" "$SERVICE_PORT"; then + belong_current_comp=true + fi + elif contains "$node_fqdn" "$CURRENT_SHARD_COMPONENT_NAME"; then + belong_current_comp=true + fi + local node_address + node_address=$(get_node_address_by_network_mode "$node_ip" "$node_port" "$node_fqdn") + categorize_node "$node_address" "$node_role" "$belong_current_comp" + done <<< "$cluster_nodes_info" + + echo "current_comp_primary_node: ${current_comp_primary_node[*]}" + echo "current_comp_other_nodes: ${current_comp_other_nodes[*]}" +} + +# init the current shard component default primary and secondary nodes for scale out shard. +# TODO: if advertised address is enable and instanceTemplate is specified, the pod service could not be parsed from the pod ordinal. 
+init_current_comp_default_nodes_for_scale_out() { + # categorize the scale out node map + categorize_scale_out_node_map() { + local pod_name="$1" + local node_address="$2" + local pod_ordinal="$3" + + if equals "$pod_ordinal" "$min_lexicographical_pod_ordinal"; then + scale_out_shard_default_primary_node["$pod_name"]="$node_address" + else + scale_out_shard_default_other_nodes["$pod_name"]="$node_address" + fi + } + + # handle the advertised service network mode (currently only support NodePort service type + handle_advertised_svc_network_mode() { + local pod_fqdn="$1" + local pod_name_ordinal="$2" + local pod_name=${pod_fqdn%%.*} + local old_ifs="$IFS" + IFS=',' + set -f + read -ra advertised_infos <<< "$CURRENT_SHARD_ADVERTISED_PORT" + set +f + IFS="$old_ifs" + + local found_advertised_port=false + for advertised_info in "${advertised_infos[@]}"; do + local advertised_svc advertised_port advertised_svc_ordinal + advertised_svc=$(echo "$advertised_info" | cut -d':' -f1) + advertised_port=$(echo "$advertised_info" | cut -d':' -f2) + advertised_svc_ordinal=$(extract_obj_ordinal "$advertised_svc") + + if [ "$pod_name_ordinal" == "$advertised_svc_ordinal" ]; then + local pod_host_ip + lb_host=$(extract_lb_host_by_svc_name "${advertised_svc}") + if ! is_empty "$lb_host"; then + echo "Found load balancer host for svcName '$advertised_svc', value is '$lb_host'." + pod_host_ip="$lb_host" + advertised_port="6379" + else + pod_host_ip=$(redis_config_get "$pod_fqdn" "$SERVICE_PORT" "$REDIS_DEFAULT_PASSWORD" "config get cluster-announce-ip" | sed -n '2p') + fi + status=$? 
+ if is_empty "$pod_host_ip" || [ $status -ne 0 ]; then + echo "Failed to get host ip of pod $pod_name" >&2 + return 1 + fi + + categorize_scale_out_node_map "$pod_name" "$pod_host_ip:$advertised_port" "$pod_name_ordinal" + found_advertised_port=true + break + fi + done + + if [ "$found_advertised_port" = false ]; then + echo "Advertised port not found for pod $pod_name" >&2 + return 1 + fi + return 0 + } + + # handle the host network mode + handle_host_network_mode() { + local pod_fqdn="$1" + local pod_name_ordinal="$2" + local pod_name=${pod_fqdn%%.*} + local pod_host_ip + pod_host_ip=$(redis_config_get "$pod_fqdn" "$SERVICE_PORT" "$REDIS_DEFAULT_PASSWORD" "config get cluster-announce-ip" | sed -n '2p') + if is_empty "$pod_host_ip"; then + echo "Failed to get host ip of pod $pod_name in host network mode" >&2 + return 1 + fi + + categorize_scale_out_node_map "$pod_name" "$pod_host_ip:$REDIS_CLUSTER_HOST_NETWORK_PORT" "$pod_name_ordinal" + return 0 + } + + # handle the default network mode + handle_default_network_mode() { + local pod_fqdn="$1" + local pod_name_ordinal="$2" + local pod_name=${pod_fqdn%%.*} + categorize_scale_out_node_map "$pod_name" "$pod_fqdn:$SERVICE_PORT" "$pod_name_ordinal" + return 0 + } + + process_pod_by_network_mode() { + local pod_fqdn="$1" + local pod_name_ordinal="$2" + + case "$network_mode" in + "advertised_svc") + handle_advertised_svc_network_mode "$pod_fqdn" "$pod_name_ordinal" + ;; + "host_network") + handle_host_network_mode "$pod_fqdn" "$pod_name_ordinal" + ;; + *) + handle_default_network_mode "$pod_fqdn" "$pod_name_ordinal" + ;; + esac + return $? 
+ } + + local min_lexicographical_pod_name + local min_lexicographical_pod_ordinal + min_lexicographical_pod_name=$(min_lexicographical_order_pod "$CURRENT_SHARD_POD_NAME_LIST") + min_lexicographical_pod_ordinal=$(extract_obj_ordinal "$min_lexicographical_pod_name") + if is_empty "$min_lexicographical_pod_ordinal"; then + echo "Failed to get the ordinal of the min lexicographical pod $min_lexicographical_pod_name in init_current_comp_default_nodes_for_scale_out" >&2 + return 1 + fi + + for pod_fqdn in $(echo "$CURRENT_SHARD_POD_FQDN_LIST" | tr ',' ' '); do + local pod_name_ordinal + pod_name=${pod_fqdn%%.*} + pod_name_ordinal=$(extract_obj_ordinal "$pod_name") + process_pod_by_network_mode "$pod_fqdn" "$pod_name_ordinal" || return 1 + done + return 0 +} + +# initialize the redis cluster primary and secondary nodes, use the min lexicographical pod of each shard as the primary nodes by default. +gen_initialize_redis_cluster_node() { + local is_primary=$1 + + categorize_node_maps() { + local pod_name="$1" + local host="$2" + local port="$3" + local is_primary="$4" + + local node_addr="$host:$port" + + if equals "$is_primary" "true"; then + initialize_redis_cluster_primary_nodes["$pod_name"]="$node_addr" + else + initialize_redis_cluster_secondary_nodes["$pod_name"]="$node_addr" + fi + initialize_pod_name_to_advertise_host_port_map["$pod_name"]="$node_addr" + } + + # determine if pod should be processed based on primary/secondary role + should_process_pod() { + local is_primary="$1" + local pod_ordinal="$2" + local min_pod_ordinal="$3" + + if [ "$is_primary" = "true" ]; then + [ "$pod_ordinal" = "$min_pod_ordinal" ] + else + [ "$pod_ordinal" != "$min_pod_ordinal" ] + fi + } + + # Initialize node with advertised service configuration + initialize_advertised_svc_node() { + local pod_fqdn="$1" + local pod_name_ordinal="$2" + local is_primary="$3" + local pod_name=${pod_fqdn%%.*} + + local pod_host_ip + pod_service_port=$(get_pod_service_port_by_network_mode "${pod_name}") 
|| { + echo "Failed to get service port for pod: $pod_name" >&2 + return 1 + } + pod_host_ip=$(redis_config_get "$pod_fqdn" "$pod_service_port" "$REDIS_DEFAULT_PASSWORD" "config get cluster-announce-ip" | sed -n '2p') + if is_empty "$pod_host_ip"; then + echo "Failed to get host IP for pod: $pod_name" >&2 + return 1 + fi + ## the value format of ALL_SHARDS_ADVERTISED_PORT is "shard-98x@redis-shard-98x-redis-advertised-0:32024,redis-shard-98x-redis-advertised-1:31318.shard-cq7@redis-shard-cq7-redis-advertised-0:31828,redis-shard-cq7-redis-advertised-1:32000" + local old_ifs="$IFS" + IFS='.' + set -f + local shards + read -ra shards <<< "$ALL_SHARDS_ADVERTISED_PORT" + set +f + IFS="$old_ifs" + + local shard + for shard in "${shards[@]}"; do + local shard_name + shard_name=$(echo "$shard" | cut -d'@' -f1) + + # skip if pod doesn't belong to current shard + if ! echo "$pod_name" | grep -q "$shard_name"; then + continue + fi + + # shard_advertised_infos like "redis-shard-98x-redis-advertised-0:32024,redis-shard-98x-redis-advertised-1:31318" + local old_ifs="$IFS" + IFS=',' + set -f + local shard_advertised_infos + read -ra shard_advertised_infos <<< "$(echo "$shard" | cut -d'@' -f2)" + set +f + IFS="$old_ifs" + + local shard_advertised_info + for shard_advertised_info in "${shard_advertised_infos[@]}"; do + local shard_advertised_svc + local shard_advertised_port + local shard_advertised_svc_ordinal + + shard_advertised_svc=$(echo "$shard_advertised_info" | cut -d':' -f1) + shard_advertised_port=$(echo "$shard_advertised_info" | cut -d':' -f2) + shard_advertised_svc_ordinal=$(extract_obj_ordinal "$shard_advertised_svc") + + if [ "$pod_name_ordinal" = "$shard_advertised_svc_ordinal" ]; then + lb_host=$(extract_lb_host_by_svc_name "${shard_advertised_svc}") + if [ -n "$lb_host" ]; then + echo "Found load balancer host for svcName '$shard_advertised_svc', value is '$lb_host'." 
+ pod_host_ip="$lb_host" + shard_advertised_port="6379" + fi + categorize_node_maps "$pod_name" "$pod_host_ip" "$shard_advertised_port" "$is_primary" + return 0 + fi + done + done + return 0 + } + + # Initialize node with host network configuration + initialize_host_network_node() { + local pod_fqdn="$1" + local is_primary="$2" + local pod_name=${pod_fqdn%%.*} + + pod_service_port=$(get_pod_service_port_by_network_mode "${pod_name}") || { + echo "Failed to get service port for pod: $pod_name" >&2 + return 1 + } + pod_host_ip=$(redis_config_get "$pod_fqdn" "$pod_service_port" "$REDIS_DEFAULT_PASSWORD" "config get cluster-announce-ip" | sed -n '2p') + if is_empty "$pod_host_ip"; then + echo "Failed to get host ip of pod $pod_name in host network mode" >&2 + return 1 + fi + categorize_node_maps "$pod_name" "$pod_host_ip" "$pod_service_port" "$is_primary" + return 0 + } + + # Initialize node with default network configuration + initialize_default_network_node() { + local pod_fqdn="$1" + local is_primary="$2" + local pod_name=${pod_fqdn%%.*} + + local pod_service_port + pod_service_port=$(get_pod_service_port_by_network_mode "${pod_name}") || { + echo "Failed to get service_port for pod: $pod_name" >&2 + return 1 + } + categorize_node_maps "$pod_name" "$pod_fqdn" "$pod_service_port" "$is_primary" + return 0 + } + + # determine cluster network mode + local network_mode="default" + if ! is_empty "$ALL_SHARDS_ADVERTISED_PORT"; then + network_mode="advertised_svc" + elif ! 
is_empty "$REDIS_CLUSTER_ALL_SHARDS_HOST_NETWORK_PORT"; then + network_mode="host_network" + fi + + # get and validate the min lexicographical pod name and ordinal + local min_lexicographical_pod_name + local min_lexicographical_pod_ordinal + min_lexicographical_pod_name=$(min_lexicographical_order_pod "$KB_CLUSTER_POD_NAME_LIST") + min_lexicographical_pod_ordinal=$(extract_obj_ordinal "$min_lexicographical_pod_name") + if is_empty "$min_lexicographical_pod_ordinal"; then + echo "Failed to get the ordinal of the min lexicographical pod $min_lexicographical_pod_name in gen_initialize_redis_cluster_node" >&2 + return 1 + fi + + local pod_name + for pod_fqdn in $(echo "$KB_CLUSTER_POD_FQDN_LIST" | tr ',' ' '); do + local pod_name_ordinal + pod_name=${pod_fqdn%%.*} + pod_name_ordinal=$(extract_obj_ordinal "$pod_name") || continue + + # skip pods based on primary/secondary role + if ! should_process_pod "$is_primary" "$pod_name_ordinal" "$min_lexicographical_pod_ordinal"; then + continue + fi + # initialize pod based on network mode + case "$network_mode" in + "advertised_svc") + initialize_advertised_svc_node "$pod_fqdn" "$pod_name_ordinal" "$is_primary" || return 1 + ;; + "host_network") + initialize_host_network_node "$pod_fqdn" "$is_primary" || return 1 + ;; + "default") + initialize_default_network_node "$pod_fqdn" "$is_primary" || return 1 + ;; + esac + done + return 0 +} + +gen_initialize_redis_cluster_primary_node() { + gen_initialize_redis_cluster_node "true" +} + +gen_initialize_redis_cluster_secondary_nodes() { + gen_initialize_redis_cluster_node "false" +} + +initialize_redis_cluster() { + # generate primary and secondary nodes + gen_initialize_redis_cluster_primary_node + gen_initialize_redis_cluster_secondary_nodes + + if [ ${#initialize_redis_cluster_primary_nodes[@]} -eq 0 ]; then + echo "Failed to get primary nodes" >&2 + return 1 + fi + + # check all the primary nodes are ready + local primary_nodes="" + local primary_node_list=() + for pod_name in 
"${!initialize_redis_cluster_primary_nodes[@]}"; do + primary_nodes+="${initialize_redis_cluster_primary_nodes[$pod_name]} " + primary_node_list+=("${initialize_redis_cluster_primary_nodes[$pod_name]}") + done + if ! check_initialize_nodes_ready "${primary_node_list[@]}"; then + echo "Primary nodes health check failed" >&2 + return 1 + fi + + # check all the secondary nodes are ready + if [ ${#initialize_redis_cluster_secondary_nodes[@]} -gt 0 ]; then + secondary_node_list=() + for pod_name in "${!initialize_redis_cluster_secondary_nodes[@]}"; do + secondary_node_list+=("${initialize_redis_cluster_secondary_nodes[$pod_name]}") + done + if ! check_initialize_nodes_ready "${secondary_node_list[@]}"; then + echo "Secondary nodes health check failed" >&2 + return 1 + fi + fi + + # initialize all the primary nodes + if create_redis_cluster "$primary_nodes"; then + echo "Redis cluster initialized primary nodes successfully, cluster nodes: $primary_nodes" + else + echo "Failed to create redis cluster when initializing" >&2 + return 1 + fi + + # get the first primary node to check the cluster + first_primary_node=$(echo "$primary_nodes" | awk '{print $1}') + if check_slots_covered "$first_primary_node" "$SERVICE_PORT"; then + echo "Redis cluster check primary nodes slots covered successfully." 
+ else + echo "Failed to create redis cluster when checking slots covered" >&2 + return 1 + fi + + # initialize all the secondary nodes + if [ ${#initialize_redis_cluster_secondary_nodes[@]} -eq 0 ]; then + echo "No secondary nodes to initialize" + return 0 + fi + + all_secondaries_ready=true + for secondary_pod_name in "${!initialize_redis_cluster_secondary_nodes[@]}"; do + secondary_endpoint_with_port=${initialize_redis_cluster_secondary_nodes["$secondary_pod_name"]} + # shellcheck disable=SC2001 + mapping_primary_pod_name=$(echo "$secondary_pod_name" | sed 's/-[0-9]*$/-0/') + mapping_primary_endpoint_with_port=${initialize_pod_name_to_advertise_host_port_map["$mapping_primary_pod_name"]} + if is_empty "$mapping_primary_endpoint_with_port"; then + echo "Failed to find the mapping primary node for secondary node: $secondary_pod_name" >&2 + return 1 + fi + mapping_primary_endpoint=$(echo "$mapping_primary_endpoint_with_port" | cut -d':' -f1) + mapping_primary_port=$(echo "$mapping_primary_endpoint_with_port" | cut -d':' -f2) + mapping_primary_cluster_id=$(get_cluster_id "$mapping_primary_endpoint" "$mapping_primary_port") + echo "mapping_primary_fqdn: $mapping_primary_endpoint, mapping_primary_endpoint_with_port: $mapping_primary_endpoint_with_port, mapping_primary_cluster_id: $mapping_primary_cluster_id" + if is_empty "$mapping_primary_cluster_id"; then + echo "Failed to get the cluster id from cluster nodes of the mapping primary node: $mapping_primary_endpoint_with_port" >&2 + return 1 + fi + replicated_output=$(secondary_replicated_to_primary "$secondary_endpoint_with_port" "$mapping_primary_endpoint_with_port" "$mapping_primary_cluster_id") + status=$? 
+        if [ $status -ne 0 ] ; then
+            echo "Failed to initialize the secondary node $secondary_pod_name, secondary replicated output: $replicated_output" >&2
+            return 1
+        fi
+        echo "Redis cluster initialized secondary node $secondary_pod_name successfully"
+        # waiting for all nodes sync the information
+        sleep_when_ut_mode_false 5
+        secondary_node="$secondary_pod_name"
+        if [ "$network_mode" != "default" ]; then
+            secondary_node="${initialize_redis_cluster_secondary_nodes["$secondary_pod_name"]}"
+        fi
+        # verify secondary node is already in all primary nodes
+        if ! verify_secondary_in_all_primaries "$secondary_node" "${primary_node_list[@]}"; then
+            echo "Failed to verify secondary node $secondary_node in all primary nodes" >&2
+            all_secondaries_ready=false
+            continue
+        fi
+        echo "Secondary node $secondary_pod_name successfully joined the cluster and verified in all primaries"
+    done
+
+    if [ "$all_secondaries_ready" = false ]; then
+        echo "Failed to initialize all secondary nodes" >&2
+        return 1
+    fi
+    echo "Redis cluster initialized all secondary nodes successfully"
+    return 0
+}
+
+verify_secondary_in_all_primaries() {
+    local secondary_node="$1"
+    # drop the secondary from "$@" so only primary node addresses remain;
+    # the former primary_nodes=("$@") copy was dead code — the loop reads "$@"
+    shift
+    for primary_node in "$@"; do
+        local primary_host primary_port
+        primary_host=$(echo "$primary_node" | cut -d':' -f1)
+        primary_port=$(echo "$primary_node" | cut -d':' -f2)
+        retry_count=0
+        while !
check_node_in_cluster "$primary_host" "$primary_port" "$secondary_node" && [ $retry_count -lt 30 ]; do + sleep_when_ut_mode_false 3 + ((retry_count++)) + done + # shellcheck disable=SC2086 + if [ $retry_count -eq 30 ]; then + echo "Secondary node $secondary_node not found in primary $primary_node after retry" >&2 + return 1 + fi + done + return 0 +} + +check_current_shard_other_nodes_are_joined() { + local current_primary_host="$1" + local service_port="$2" + cluster_nodes_info=$(get_cluster_nodes_info "$current_primary_host" "$service_port") + for secondary_pod_name in "${!scale_out_shard_default_other_nodes[@]}"; do + secondary_node="$secondary_pod_name" + if [ "$network_mode" != "default" ]; then + secondary_node="${scale_out_shard_default_other_nodes["$secondary_pod_name"]}" + fi + if ! contains "$cluster_nodes_info" "$secondary_node"; then + echo "Secondary node $secondary_node not found in primary $current_primary_host, need to joined" >&2 + return 1 + fi + done + return 0 +} + +scale_out_redis_cluster_shard() { + if is_empty "$CURRENT_SHARD_COMPONENT_SHORT_NAME" || is_empty "$KB_CLUSTER_POD_FQDN_LIST"; then + echo "Error: Required environment variable CURRENT_SHARD_COMPONENT_SHORT_NAME, KB_CLUSTER_POD_FQDN_LIST are not set when scale out redis cluster shard" >&2 + return 1 + fi + + init_other_components_and_pods_info "$CURRENT_SHARD_COMPONENT_SHORT_NAME" "$KB_CLUSTER_POD_FQDN_LIST" "$KB_CLUSTER_COMPONENT_LIST" + if init_current_comp_default_nodes_for_scale_out; then + echo "Redis cluster scale out shard default primary and secondary nodes successfully" + else + echo "Failed to initialize the default primary and secondary nodes for scale out" >&2 + return 1 + fi + + # check the current component shard whether is already scaled out + if [ ${#scale_out_shard_default_primary_node[@]} -eq 0 ]; then + echo "Failed to generate primary nodes when scaling out" >&2 + return 1 + fi + primary_node_with_port=$(echo "${scale_out_shard_default_primary_node[*]}" | awk 
'{print $1}') + primary_node_fqdn=$(echo "$primary_node_with_port" | awk -F ':' '{print $1}') + primary_node_port=$(echo "$primary_node_with_port" | awk -F ':' '{print $2}') + mapping_primary_cluster_id=$(get_cluster_id "$primary_node_fqdn" "$primary_node_port") + current_primary_joined=false + if check_slots_covered "$primary_node_with_port" "$SERVICE_PORT"; then + if check_current_shard_other_nodes_are_joined "$primary_node_fqdn" "$primary_node_port"; then + echo "The current component shard is already scaled out, no need to scale out again." + return 0 + fi + current_primary_joined=true + fi + + # find the exist available node which is not in the current component + available_node=$(find_exist_available_node) + if is_empty "$available_node"; then + echo "No exist available node found or cluster status is not ok" >&2 + return 1 + fi + + # Forget fail node when cluster is ok + # forget_fail_node_when_cluster_is_ok "${available_node%%:*}" "${available_node##*:}" + + # add the primary node for the current shard + if [ "$current_primary_joined" = false ]; then + local scale_out_shard_default_primary + for primary_pod_name in "${!scale_out_shard_default_primary_node[@]}"; do + scale_out_shard_default_primary="${scale_out_shard_default_primary_node[$primary_pod_name]}" + if scale_out_shard_primary_join_cluster "$scale_out_shard_default_primary" "$available_node"; then + echo "Redis cluster scale out shard primary node $primary_pod_name successfully" + else + echo "Failed to scale out shard primary node $primary_pod_name" >&2 + return 1 + fi + done + fi + + # waiting for all nodes sync the information + sleep_when_ut_mode_false 5 + + # add the secondary nodes to replicate the primary node + local scale_out_shard_secondary_node + local scale_out_shard_secondary_node_with_port + for secondary_pod_name in "${!scale_out_shard_default_other_nodes[@]}"; do + scale_out_shard_secondary_node_with_port="${scale_out_shard_default_other_nodes[$secondary_pod_name]}" + 
scale_out_shard_secondary_node="${secondary_pod_name}" + if [ "$network_mode" != "default" ]; then + scale_out_shard_secondary_node=$scale_out_shard_secondary_node_with_port + fi + echo "primary_node_with_port: $primary_node_with_port, primary_node_fqdn: $primary_node_fqdn, mapping_primary_cluster_id: $mapping_primary_cluster_id" + if check_node_in_cluster "$primary_node_fqdn" "$primary_node_with_port" "$scale_out_shard_secondary_node"; then + echo "Secondary node $secondary_pod_name already joined the cluster, skip replicating to primary" + continue + fi + if secondary_replicated_to_primary "$scale_out_shard_secondary_node_with_port" "$primary_node_with_port" "$mapping_primary_cluster_id"; then + echo "Redis cluster scale out shard secondary node $secondary_pod_name successfully" + else + echo "Failed to scale out shard secondary node $secondary_pod_name" >&2 + return 1 + fi + done + + # do the reshard + # TODO: optimize the number of reshard slots according to the cluster status + local total_slots + local current_comp_pod_count + local all_comp_pod_count + local shard_count + local slots_per_shard + total_slots=16384 + current_comp_pod_count=$(echo "$CURRENT_SHARD_POD_NAME_LIST" | tr ',' '\n' | grep -c "^$CURRENT_SHARD_COMPONENT_NAME-") + all_comp_pod_count=$(echo "$KB_CLUSTER_POD_NAME_LIST" | tr ',' '\n' | grep -c ".*") + shard_count=$((all_comp_pod_count / current_comp_pod_count)) + slots_per_shard=$((total_slots / shard_count)) + # Stream-Valkey divergence: upstream calls `redis-cli --cluster reshard` here + # to migrate slots into the newly-joined primary. We do not — slot migration + # for our Valkey clusters is driven by the ASM (CLUSTER MIGRATESLOTS) path + # via OpsDefinition in stream-infra, which gives us live-migration with + # ape-dts and per-batch progress. The legacy reshard path uses + # MIGRATE COPY+DEL synchronously and stalls the source primary at high QPS. + # We keep the new shard joined with zero slots; the operator runs ASM next. 
+ echo "Skipping legacy reshard call; slot migration handled by ASM OpsRequest." >&2 + echo " (target primary: $primary_node_with_port, slots_per_shard would have been: $slots_per_shard)" >&2 + + return 0 +} + +sync_acl_for_redis_cluster_shard() { + echo "Sync ACL rules for redis cluster shard..." + set +ex + redis_base_cmd="redis-cli $REDIS_CLI_TLS_CMD -a $REDIS_DEFAULT_PASSWORD" + if [ -z "$REDIS_DEFAULT_PASSWORD" ]; then + redis_base_cmd="redis-cli $REDIS_CLI_TLS_CMD" + fi + is_ok=false + acl_list="" + # 1. get acl list from other pods + for pod_fqdn in $(echo "$KB_CLUSTER_POD_FQDN_LIST" | tr ',' ' '); do + pod_name=${pod_fqdn%%.*} + pod_service_port=$(get_pod_service_port_by_network_mode "$pod_name") + cluster_info=$(get_cluster_info_with_retry "$pod_fqdn" "$pod_service_port") + status=$? + if [ $status -ne 0 ]; then + continue + fi + cluster_state=$(echo "$cluster_info" | awk -F: '/cluster_state/{print $2}' | tr -d '[:space:]') + if is_empty "$cluster_state" || equals "$cluster_state" "ok"; then + acl_list=$($redis_base_cmd -p $pod_service_port -h "$pod_fqdn" ACL LIST) + is_ok=true + break + fi + done + + if [ "$is_ok" = false ]; then + echo "Failed to get ACL LIST from other shard pods" >&2 + exit 1 + fi + + if [ -z "$acl_list" ]; then + echo "No ACL rules found in other pods, skip synchronization" >&2 + return + fi + # 2. 
apply acl list to current shard pods + set -e + while IFS= read -r user_rule; do + [[ -z "$user_rule" ]] && continue + + if [[ "$user_rule" =~ ^user[[:space:]]+([^[:space:]]+) ]]; then + username="${BASH_REMATCH[1]}" + else + # skip invalid user rule + continue + fi + + if [[ "$username" == "default" ]]; then + continue + fi + rule_part="${user_rule#user $username }" + for pod_fqdn in $(echo "$CURRENT_SHARD_POD_FQDN_LIST" | tr ',' '\n'); do + $redis_base_cmd -h $pod_fqdn -p $SERVICE_PORT ACL SETUSER "$username" $rule_part >&2 + $redis_base_cmd -h $pod_fqdn -p $SERVICE_PORT ACL save >&2 + done + done <<< "$acl_list" + set_xtrace_when_ut_mode_false +} + +scale_in_redis_cluster_shard() { + + if is_empty "$CURRENT_SHARD_COMPONENT_SHORT_NAME" || is_empty "$KB_CLUSTER_POD_FQDN_LIST"; then + echo "Error: Required environment variable CURRENT_SHARD_COMPONENT_SHORT_NAME, KB_CLUSTER_POD_FQDN_LIST are not set when scale in redis cluster shard" >&2 + return 1 + fi + + # Forget fail node when cluster is ok + # forget_fail_node_when_cluster_is_ok "127.0.0.1" "$SERVICE_PORT" + + # init information for the other components and pods + init_other_components_and_pods_info "$CURRENT_SHARD_COMPONENT_SHORT_NAME" "$KB_CLUSTER_POD_FQDN_LIST" "$KB_CLUSTER_COMPONENT_LIST" + available_node=$(find_exist_available_node) + available_node_fqdn=$(echo "$available_node" | awk -F ':' '{print $1}') + available_node_port=$(echo "$available_node" | awk -F ':' '{print $2}') + get_current_comp_nodes_for_scale_in "$available_node_fqdn" "$available_node_port" + + # set the current shard component slot to 0 by rebalance command + for primary_node in "${current_comp_primary_node[@]}"; do + primary_node_fqdn=$(echo "$primary_node" | awk -F ':' '{print $1}') + primary_node_port=$(echo "$primary_node" | awk -F ':' '{print $2}') + primary_node_cluster_id=$(get_cluster_id "$primary_node_fqdn" "$primary_node_port") + if scale_in_shard_rebalance_to_zero "$primary_node" "$primary_node_cluster_id"; then + echo 
"Redis cluster scale in shard rebalance to zero successfully" + else + echo "Failed to rebalance the cluster for the current component when scaling in" >&2 + return 1 + fi + done + + sleep_when_ut_mode_false 5 + + # delete the current shard component nodes from the cluster + for node_to_del in "${current_comp_primary_node[@]}" "${current_comp_other_nodes[@]}"; do + node_to_del_fqdn=$(echo "$node_to_del" | awk -F ':' '{print $1}') + node_to_del_port=$(echo "$node_to_del" | awk -F ':' '{print $2}') + node_to_del_cluster_id=$(get_cluster_id "$node_to_del_fqdn" "$node_to_del_port") + if scale_in_shard_del_node "$available_node" "$node_to_del_cluster_id"; then + echo "Redis cluster scale in shard delete node $node_to_del successfully" + else + echo "Failed to delete the node $node_to_del from the cluster when scaling in" >&2 + return 1 + fi + done + return 0 +} + +initialize_or_scale_out_redis_cluster() { + # TODO: remove random sleep, it's a workaround for the multi components initialization parallelism issue + sleep_random_second_when_ut_mode_false 10 1 + + # if the cluster is not initialized, initialize it + if ! check_cluster_initialized "$KB_CLUSTER_POD_FQDN_LIST"; then + echo "Redis Cluster not initialized, initializing..." + if initialize_redis_cluster; then + echo "Redis Cluster initialized successfully" + else + echo "Failed to initialize Redis Cluster" >&2 + return 1 + fi + else + sync_acl_for_redis_cluster_shard + echo "Redis Cluster already initialized, scaling out the shard..." + if scale_out_redis_cluster_shard; then + echo "Redis Cluster scale out shard successfully" + else + echo "Failed to scale out Redis Cluster shard" >&2 + return 1 + fi + fi + return 0 +} + +# This is magic for shellspec ut framework. +# Sometime, functions are defined in a single shell script. +# You will want to test it. but you do not want to run the script. +# When included from shellspec, __SOURCED__ variable defined and script +# end here. 
The script path is assigned to the __SOURCED__ variable. +${__SOURCED__:+false} : || return 0 + +# main +if [ $# -eq 1 ]; then + load_redis_cluster_common_utils + init_environment + case $1 in + --help) + echo "Usage: $0 [options]" + echo "Options:" + echo " --help show help information" + echo " --post-provision initialize or scale out Redis Cluster Shard" + echo " --pre-terminate stop or scale in Redis Cluster Shard" + exit 0 + ;; + --post-provision) + if initialize_or_scale_out_redis_cluster; then + echo "Redis Cluster initialized or scale out shard successfully" + else + echo "Failed to initialize or scale out Redis Cluster shard" >&2 + exit 1 + fi + exit 0 + ;; + --pre-terminate) + if scale_in_redis_cluster_shard; then + echo "Redis Cluster scale in shard successfully" + else + echo "Failed to scale in Redis Cluster shard" >&2 + exit 1 + fi + exit 0 + ;; + *) + echo "Error: invalid option '$1'" + exit 1 + ;; + esac +fi diff --git a/addons/valkey/valkey-cluster-scripts/valkey-cluster-replica-member-leave.sh b/addons/valkey/valkey-cluster-scripts/valkey-cluster-replica-member-leave.sh new file mode 100755 index 000000000..1ef68f487 --- /dev/null +++ b/addons/valkey/valkey-cluster-scripts/valkey-cluster-replica-member-leave.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +# shellcheck disable=SC2034 +# shellcheck disable=SC1090 +# shellcheck disable=SC2153 + +# This is magic for shellspec ut framework. "test" is a `test [expression]` well known as a shell command. +# Normally test without [expression] returns false. It means that __() { :; } +# function is defined if this script runs directly. +# +# shellspec overrides the test command and returns true *once*. It means that +# __() function defined internally by shellspec is called. +# +# In other words. If not in test mode, __ is just a comment. If test mode, __ +# is a interception point. +# you should set ut_mode="true" when you want to run the script in shellspec file. 
+ut_mode="false" +test || __() { + # when running in non-unit test mode, set the options "set -ex". + set -ex; +} + +service_port=${SERVICE_PORT:-6379} +cluster_bus_port=${CLUSTER_BUS_PORT:-16379} + +load_redis_cluster_common_utils() { + # the common.sh and valkey-cluster-common.sh scripts are defined in the valkey cluster scripts template configmap + # and are mounted to the same path which is defined in the cmpd.spec.scripts + kblib_common_library_file="/scripts/common.sh" + redis_cluster_common_library_file="/scripts/valkey-cluster-common.sh" + source "${kblib_common_library_file}" + source "${redis_cluster_common_library_file}" +} + +# remove_replica_from_shard_if_need removes the current pod from the cluster if it is a replica
+# TODO: remove it from preStop hook and it should be implemented in memberLeave lifecycleAction in KubeBlocks
+remove_replica_from_shard_if_need() { + # get the cluster nodes info + cluster_nodes_info=$(get_cluster_nodes_info_with_retry "$KB_LEAVE_MEMBER_POD_FQDN" "$service_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get cluster nodes info in remove_replica_from_shard_if_need" >&2 + return 1 + fi + echo "Cluster nodes info: $cluster_nodes_info" + + # if the cluster_nodes_info contains only one line, it means that the cluster has not been initialized + if [ "$(echo "$cluster_nodes_info" | wc -l)" -le 1 ]; then + echo "Cluster nodes info contains only one line or is empty, returning..." + return 0 + fi + + # get the current node role, if the current node is a slave, remove it from the cluster + current_node_role=$(echo "$cluster_nodes_info" | grep "myself" | awk '{print $3}') + if contains "$current_node_role" "slave"; then + echo "Current node $KB_LEAVE_MEMBER_POD_NAME is a slave, removing it from the cluster..." 
+ current_node_cluster_id=$(echo "$cluster_nodes_info" | grep "myself" | awk '{print $1}') + current_node_ip_and_port="127.0.0.1:$service_port" + do_forget_node=false + if contains "$current_node_role" "fail"; then + do_forget_node=true + fi + echo "Current node id: $current_node_cluster_id" + if secondary_member_leave_del_node_with_retry "$current_node_ip_and_port" "$current_node_cluster_id" "$do_forget_node"; then + echo "Successfully removed replica from shard." + else + echo "Failed to remove replica from shard." >&2 + return 1 + fi + + # check if the current node is removed from the cluster + cluster_nodes_info=$(get_cluster_nodes_info "$KB_LEAVE_MEMBER_POD_FQDN" "$service_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get cluster nodes info in remove_replica_from_shard_if_need" >&2 + return 1 + fi + + if [ "$(echo "$cluster_nodes_info" | wc -l)" -le 1 ]; then + echo "successfully removed replica from shard." + return + else + echo "Failed to remove replica from shard." >&2 + return 1 + fi + else + echo "Current node $KB_LEAVE_MEMBER_POD_NAME is a master, no need to remove it from the cluster." + fi + return 0 +} + +# This is magic for shellspec ut framework. +# Sometime, functions are defined in a single shell script. +# You will want to test it. but you do not want to run the script. +# When included from shellspec, __SOURCED__ variable defined and script +# end here. The script path is assigned to the __SOURCED__ variable. +${__SOURCED__:+false} : || return 0 + +# main +load_redis_cluster_common_utils +if execute_acl_save_with_retry $service_port; then + echo "acl save command executed successfully." +else + echo "failed to execute acl save command." 
>&2 + return 1 +fi +if [ "$LEGACY_REDIS" = "true" ]; then + # Forget fail node when cluster is ok + forget_fail_node_when_cluster_is_ok "127.0.0.1" "$service_port" +fi +remove_replica_from_shard_if_need \ No newline at end of file diff --git a/addons/valkey/valkey-cluster-scripts/valkey-cluster-replica-pre-stop.sh b/addons/valkey/valkey-cluster-scripts/valkey-cluster-replica-pre-stop.sh new file mode 100644 index 000000000..1e85b287a --- /dev/null +++ b/addons/valkey/valkey-cluster-scripts/valkey-cluster-replica-pre-stop.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +# shellcheck disable=SC2034 +ut_mode="false" +test || __() { + # when running in non-unit test mode, set the options "set -ex". + set -ex; +} + +load_common_library() { + # the common.sh script is mounted to the same path which is defined in the cmpd.spec.scripts + common_library_file="/scripts/common.sh" + # shellcheck disable=SC1090 + source "${common_library_file}" +} + +acl_save_before_stop() { + if ! is_empty "$REDIS_DEFAULT_PASSWORD"; then + acl_save_command="redis-cli $REDIS_CLI_TLS_CMD -h localhost -p $SERVICE_PORT -a $REDIS_DEFAULT_PASSWORD acl save" + logging_mask_acl_save_command="${acl_save_command/$REDIS_DEFAULT_PASSWORD/********}" + else + acl_save_command="redis-cli $REDIS_CLI_TLS_CMD -h localhost -p $SERVICE_PORT acl save" + logging_mask_acl_save_command="$acl_save_command" + fi + echo "acl save command: $logging_mask_acl_save_command" + if output=$($acl_save_command 2>&1); then + echo "acl save command executed successfully: $output" + else + echo "failed to execute acl save command: $output" + exit 1 + fi +} + +# This is magic for shellspec ut framework. +# Sometimes, functions are defined in a single shell script. +# You will want to test it, but you do not want to run the script. +# When included from shellspec, the __SOURCED__ variable is defined and the script +# ends here. The script path is assigned to the __SOURCED__ variable. 
+${__SOURCED__:+false} : || return 0 + +# main +load_common_library +acl_save_before_stop \ No newline at end of file diff --git a/addons/valkey/valkey-cluster-scripts/valkey-cluster-server-start.sh b/addons/valkey/valkey-cluster-scripts/valkey-cluster-server-start.sh new file mode 100755 index 000000000..14a8d9527 --- /dev/null +++ b/addons/valkey/valkey-cluster-scripts/valkey-cluster-server-start.sh @@ -0,0 +1,776 @@ +#!/bin/bash + +# shellcheck disable=SC2153 +# shellcheck disable=SC2207 +# shellcheck disable=SC2034 +# shellcheck disable=SC1090 + +# This is magic for shellspec ut framework. "test" is a `test [expression]` well known as a shell command. +# Normally test without [expression] returns false. It means that __() { :; } +# function is defined if this script runs directly. +# +# shellspec overrides the test command and returns true *once*. It means that +# __() function defined internally by shellspec is called. +# +# In other words. If not in test mode, __ is just a comment. If test mode, __ +# is a interception point. +# you should set ut_mode="true" when you want to run the script in shellspec file. +ut_mode="false" +test || __() { + # when running in non-unit test mode, set the options "set -ex". 
+ set -ex; +} + +service_port=6379 +cluster_bus_port=16379 +redis_template_conf="/etc/conf/redis.conf" +redis_real_conf="/etc/redis/redis.conf" +redis_acl_file="/data/users.acl" +redis_acl_file_bak="/data/users.acl.bak" +retry_times=3 +check_ready_times=30 +retry_delay_second=2 + +# variables for scale out replica +current_comp_primary_node=() +current_comp_primary_fail_node=() +current_comp_other_nodes=() +other_comp_primary_nodes=() +other_comp_primary_fail_nodes=() +other_comp_other_nodes=() +network_mode="default" + + +init_environment(){ + if [[ -z "${CURRENT_SHARD_ADVERTISED_PORT}" ]]; then + CURRENT_SHARD_ADVERTISED_PORT="${CURRENT_SHARD_LB_ADVERTISED_PORT}" + fi + if [[ -z "${CURRENT_SHARD_ADVERTISED_BUS_PORT}" ]]; then + CURRENT_SHARD_ADVERTISED_BUS_PORT="${CURRENT_SHARD_LB_ADVERTISED_BUS_PORT}" + fi +} + +extract_lb_host_by_svc_name() { + local svc_name="$1" + for lb_composed_name in $(echo "$CURRENT_SHARD_LB_ADVERTISED_HOST" | tr ',' '\n' ); do + if [[ ${lb_composed_name} == *":"* ]]; then + if [[ ${lb_composed_name%:*} == "$svc_name" ]]; then + echo "${lb_composed_name#*:}" + break + fi + else + break + fi + done +} + +load_redis_cluster_common_utils() { + # the common.sh and redis-cluster-common.sh scripts are defined in the redis-cluster-scripts-template configmap + # and are mounted to the same path which defined in the cmpd.spec.scripts + kblib_common_library_file="/scripts/common.sh" + redis_cluster_common_library_file="/scripts/valkey-cluster-common.sh" + source "${kblib_common_library_file}" + source "${redis_cluster_common_library_file}" +} + +check_and_meet_node() { + local source_endpoint="$1" + local source_port="$2" + local target_endpoint="$3" + local target_port="$4" + local target_bus_port="$5" + + # Check for invalid port numbers and exit immediately if found + if [ "$target_port" -eq 0 ] || [ "$target_bus_port" -eq 0 ]; then + echo "Error: target_port ($target_port) or target_bus_port ($target_bus_port) is 0. Exiting..." 
+ shutdown_redis_server "$service_port" + exit 1 + fi + + while true; do + # Get current announce IP from the target node + current_announce_ip=$(get_cluster_announce_ip "$target_endpoint" "$target_port") + echo "target: $target_endpoint:$target_port, current_announce_ip: $current_announce_ip" + + # If current_announce_ip is empty, retry + if is_empty "$current_announce_ip"; then + echo "Error: current_announce_ip is empty" + sleep_when_ut_mode_false 3 + continue + fi + + # send cluster meet command to the primary node + if send_cluster_meet_with_retry "$source_endpoint" "$source_port" "$current_announce_ip" "$target_port" "$target_bus_port"; then + echo "Meet the node $target_endpoint successfully with new announce ip $current_announce_ip..." + break + else + echo "Failed to meet the node $target_endpoint" >&2 + shutdown_redis_server "$service_port" + exit 1 + fi + done +} + +check_and_meet_other_primary_nodes() { + local current_primary_endpoint="$1" + local current_primary_port="$2" + local meet_other_comp_primary_nodes=("${other_comp_primary_nodes[@]}" "${other_comp_primary_fail_nodes[@]}") + if [ ${#meet_other_comp_primary_nodes[@]} -eq 0 ]; then + echo "meet_other_comp_primary_nodes is empty, skip check_and_meet_other_primary_nodes" + return + fi + + # node_info value format: cluster_announce_ip#pod_fqdn#endpoint:port@bus_port + for node_info in "${meet_other_comp_primary_nodes[@]}"; do + node_endpoint_with_port=$(echo "$node_info" | awk -F '@' '{print $1}' | awk -F '#' '{print $3}') + node_endpoint=$(echo "$node_endpoint_with_port" | awk -F ':' '{print $1}') + node_port=$(echo "$node_endpoint_with_port" | awk -F ':' '{print $2}') + node_bus_port=$(echo "$node_info" | awk -F '@' '{print $2}') + node_fqdn=$(echo "$node_info" | awk -F '#' '{print $2}') + node_endpoint_for_meet="$node_endpoint" + if [ "$network_mode" == "default" ]; then + node_endpoint_for_meet="$node_fqdn" + fi + check_and_meet_node "$current_primary_endpoint" "$current_primary_port" 
"$node_endpoint_for_meet" "$node_port" "$node_bus_port" + sleep_when_ut_mode_false 3 + done +} + +check_and_meet_current_primary_node() { + local primary_node_endpoint="$1" + local primary_node_port="$2" + local primary_bus_port="$3" + + check_and_meet_node "127.0.0.1" "$service_port" "$primary_node_endpoint" "$primary_node_port" "$primary_bus_port" +} + +# get the current component nodes for scale out replica +get_current_comp_nodes_for_scale_out_replica() { + local cluster_node="$1" + local cluster_node_port="$2" + cluster_nodes_info=$(get_cluster_nodes_info "$cluster_node" "$cluster_node_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get cluster nodes info in get_current_comp_nodes_for_scale_out_replica: $cluster_nodes_info" >&2 + return 1 + fi + + # if the cluster_nodes_info contains only one line, it means that the cluster not be initialized + shard_count=$(echo "${ALL_SHARDS_COMPONENT_SHORT_NAMES}" | tr ',' '\n' | wc -l) + if [ "$(echo "$cluster_nodes_info" | wc -l)" -lt ${shard_count} ]; then + echo "Cluster nodes info contains less than ${shard_count} nodes, returning..." + return + fi + + # determine network mode + network_mode="default" + if ! is_empty "$CURRENT_SHARD_ADVERTISED_PORT"; then + network_mode="advertised_svc" + elif ! is_empty "$REDIS_CLUSTER_ALL_SHARDS_HOST_NETWORK_PORT"; then + network_mode="host_network" + fi + + parse_node_line_info() { + # the output of line is like: + # 1. using the pod fqdn as the nodeAddr + # 4958e6dca033cd1b321922508553fab869a29d 10.42.0.227:6379@16379,redis-shard-sxj-0.redis-shard-sxj-headless.default.svc.cluster.local master - 0 1711958289570 4 connected 0-1364 5461-6826 10923-12287 + # 2. using the nodeport or lb ip as the nodeAddr + # 4958e6dca033cd1b321922508553fab869a29d 172.10.0.1:31000@31888,redis-shard-sxj-0.redis-shard-sxj-headless.default.svc.cluster.local master master - 0 1711958289570 4 connected 0-1364 5461-6826 10923-12287 + # 3. 
using the host network ip as the nodeAddr + # 4958e6dca033cd1b321922508553fab869a29d 172.10.0.1:1050@1051,redis-shard-sxj-0.redis-shard-sxj-headless.default.svc.cluster.local master - 0 1711958289570 4 connected 0-1364 5461-6826 10923-12287 + local line="$1" + + local node_ip_port_fields + # 10.42.0.227:6379@16379,redis-shard-sxj-0.redis-shard-sxj-headless.default.svc + node_ip_port_fields=$(echo "$line" | awk '{print $2}') + + local node_announce_ip_port + # ip:port without bus port + node_announce_ip_port=$(echo "$node_ip_port_fields" | awk -F '@' '{print $1}') + + local node_announce_ip + node_announce_ip=$(echo "$node_announce_ip_port" | cut -d':' -f1) + + local node_port + node_port=$(echo "$node_announce_ip_port" | cut -d':' -f2) + + local node_bus_port + node_bus_port=$(echo "$node_ip_port_fields" | awk -F '@' '{print $2}' | awk -F ',' '{print $1}') + + local node_fqdn + # redis-shard-sxj-0.redis-shard-sxj-headless.default.svc.cluster.local + node_fqdn=$(echo "$line" | awk '{print $2}' | awk -F ',' '{print $2}') + + local node_role + node_role=$(echo "$line" | awk '{print $3}') + + printf "%s %s %s %s %s" "$node_announce_ip" "$node_port" "$node_bus_port" "$node_role" "$node_fqdn" + } + + build_node_entry() { + local mode="$1" + local announce_ip="$2" + local fqdn="$3" + local port="$4" + local bus_port="$5" + + case "$mode" in + "advertised_svc") + # example format using nodeport: 172.10.0.1#redis-shard-sxj-0.redis-shard-sxj-headless.default.svc#172.10.0.1:31000@31888 + echo "$announce_ip#$fqdn#$announce_ip:$port@$bus_port" + ;; + "host_network") + # example format using host network: 172.10.0.1#redis-shard-sxj-0.redis-shard-sxj-headless.default.svc#172.10.0.1:1050@1051 + echo "$announce_ip#$fqdn#$announce_ip:$port@$bus_port" + ;; + *) + # example format using pod fqdn: 10.42.0.227#redis-shard-sxj-0.redis-shard-sxj-headless.default.svc#redis-shard-sxj-0.redis-shard-sxj-headless.default.svc:6379@16379 + echo "$announce_ip#$fqdn#$fqdn:$port@$bus_port" + ;; + 
esac + } + + # categorize node into appropriate array + categorize_node() { + local node_entry="$1" + local node_role="$2" + local belong_current_comp="$3" + + if [[ "$belong_current_comp" == "true" ]]; then + if contains "$node_role" "master"; then + if contains "$node_role" "fail"; then + current_comp_primary_fail_node+=("$node_entry") + else + current_comp_primary_node+=("$node_entry") + fi + else + current_comp_other_nodes+=("$node_entry") + fi + else + if contains "$node_role" "master"; then + if contains "$node_role" "fail"; then + other_comp_primary_fail_nodes+=("$node_entry") + else + other_comp_primary_nodes+=("$node_entry") + fi + else + other_comp_other_nodes+=("$node_entry") + fi + fi + } + + # prepare CURRENT_SHARD_HOST_OR_PORT_LIST for advertised_svc mode + CURRENT_SHARD_HOST_OR_PORT_LIST=() + if [ "$network_mode" == "advertised_svc" ]; then + IFS=',' read -ra CURRENT_POD_LIST <<< "$CURRENT_SHARD_POD_NAME_LIST" + for pod_name in "${CURRENT_POD_LIST[@]}"; do + svc_and_port=$(parse_advertised_svc_and_port "$pod_name" "$CURRENT_SHARD_ADVERTISED_PORT" "true") + svc_name=${svc_and_port%:*} + lb_host=$(extract_lb_host_by_svc_name "${svc_name}") + if [ -n "$lb_host" ]; then + CURRENT_SHARD_HOST_OR_PORT_LIST+=("${lb_host}:6379") + else + svc_port="${svc_and_port#*:}" + CURRENT_SHARD_HOST_OR_PORT_LIST+=(":${svc_port}") + fi + echo "pod_name: $pod_name, svc_and_port: $svc_and_port" + done + # check length of CURRENT_SHARD_ANNOUNCE_IP_LIST must equal to CURRENT_POD_LIST + if [ ${#CURRENT_SHARD_HOST_OR_PORT_LIST[@]} -ne ${#CURRENT_POD_LIST[@]} ]; then + echo "Error: failed to get the pod ip list from KB_POD_LIST" + return 1 + fi + fi + + # process each node + while read -r line; do + local node_info + node_info=$(parse_node_line_info "$line") + local node_announce_ip node_fqdn node_port node_bus_port node_role + read -r node_announce_ip node_port node_bus_port node_role node_fqdn <<< "$node_info" + # determine if the node belongs to the current component + 
belong_current_comp=false + if [ "$network_mode" == "advertised_svc" ]; then + for i in "${CURRENT_SHARD_HOST_OR_PORT_LIST[@]}"; do + node_announce_info=":$node_port" + if ! is_empty "$CURRENT_SHARD_LB_ADVERTISED_PORT"; then + node_announce_info="$node_announce_ip:$node_port" + fi + if [[ "$i" == "$node_announce_info" ]]; then + belong_current_comp=true + break + fi + done + elif [ "$network_mode" == "host_network" ]; then + if contains "$node_port" "$SERVICE_PORT"; then + belong_current_comp=true + fi + elif contains "$node_fqdn" "$CURRENT_SHARD_COMPONENT_NAME"; then + belong_current_comp=true + fi + # build node entry based on network mode + local node_entry + node_entry=$(build_node_entry "$network_mode" "$node_announce_ip" "$node_fqdn" "$node_port" "$node_bus_port") + + # categorize nodes + categorize_node "$node_entry" "$node_role" "$belong_current_comp" + done <<< "$cluster_nodes_info" + + echo "current_comp_primary_node: ${current_comp_primary_node[*]}" + echo "current_comp_primary_fail_node: ${current_comp_primary_fail_node[*]}" + echo "current_comp_other_nodes: ${current_comp_other_nodes[*]}" + echo "other_comp_primary_nodes: ${other_comp_primary_nodes[*]}" + echo "other_comp_primary_fail_nodes: ${other_comp_primary_fail_nodes[*]}" + echo "other_comp_other_nodes: ${other_comp_other_nodes[*]}" +} + +# Note: During rebuild-instance, a new PVC is created without existing data and having the rebuild.flag file. +# Therefore, we must rejoin this instance to the cluster as a secondary node. +is_rebuild_instance() { + # Early return if rebuild flag doesn't exist + [[ ! -f /data/rebuild.flag ]] && return 1 + + # Check if nodes.conf exists + if [[ ! 
-f /data/nodes.conf ]]; then + echo "Rebuild instance detected: nodes.conf missing" + return 0 + fi + + # Check if nodes.conf contains only one node + if [[ $(grep -c ":" /data/nodes.conf) -eq 1 ]]; then + echo "Rebuild instance detected: single node configuration" + return 0 + fi + + return 1 +} + +remove_rebuild_instance_flag() { + if [ -f /data/rebuild.flag ]; then + rm -f /data/rebuild.flag + echo "remove rebuild.flag file succeeded!" + fi +} + +# scale out replica of redis cluster shard if needed +scale_redis_cluster_replica() { + # Waiting for redis-server to start + check_current_ready_ip="127.0.0.1" + if [ -n "$redis_announce_host_value" ]; then + check_current_ready_ip=$redis_announce_host_value + fi + if check_redis_server_ready_with_retry "127.0.0.1" "$service_port"; then + echo "Redis server is ready, continue to scale out replica..." + else + echo "Redis server is not ready, exit scale out replica..." >&2 + exit 1 + fi + + if [ -f /data/nodes.conf ]; then + echo "the nodes.conf file after redis server start:" + cat /data/nodes.conf + else + echo "the nodes.conf file after redis server start is not exist" + fi + + for target_node_name in $(echo "${CURRENT_SHARD_POD_NAME_LIST}" | tr ',' '\n'); do + if [ -f /data/rebuild.flag ] && [ "${CURRENT_POD_NAME}" == "${target_node_name}" ]; then + continue + fi + target_node_fqdn=$(get_target_pod_fqdn_from_pod_fqdn_vars "$CURRENT_SHARD_POD_FQDN_LIST" "$target_node_name") + if is_empty "$target_node_fqdn"; then + echo "Error: Failed to get target node fqdn from current shard pod fqdn list: $CURRENT_SHARD_POD_FQDN_LIST. Exiting." >&2 + exit 1 + fi + # get the current component nodes for scale out replica + get_current_comp_nodes_for_scale_out_replica "$target_node_fqdn" "$service_port" + if [ $? 
-eq 0 ]; then + break + fi + done + + # check current_comp_primary_node is empty or not + if [ ${#current_comp_primary_node[@]} -eq 0 ]; then + if is_rebuild_instance; then + echo "current instance is a rebuild-instance, the current shard primary cannot be empty, please check the cluster status" >&2 + shutdown_redis_server "$service_port" + exit 1 + fi + if [ ${#current_comp_primary_fail_node[@]} -eq 0 ]; then + echo "current_comp_primary_node is empty, skip scale out replica" + exit 0 + fi + # if current_comp_primary_node is empty, use current_comp_primary_fail_node instead + current_comp_primary_node=("${current_comp_primary_fail_node[@]}") + fi + + # primary_node_info value format: cluster_announce_ip#pod_fqdn#endpoint:port@bus_port + primary_node_info=${current_comp_primary_node[0]} + primary_node_endpoint_with_port=$(echo "$primary_node_info" | awk -F '@' '{print $1}' | awk -F '#' '{print $3}') + primary_node_endpoint=$(echo "$primary_node_endpoint_with_port" | awk -F ':' '{print $1}') + primary_node_port=$(echo "$primary_node_endpoint_with_port" | awk -F ':' '{print $2}') + primary_node_fqdn=$(echo "$primary_node_info" | awk -F '#' '{print $2}') + primary_node_bus_port=$(echo "$primary_node_info" | awk -F '@' '{print $2}') + primary_node_endpoint_for_meet="$primary_node_endpoint" + if [ "$network_mode" == "default" ]; then + primary_node_endpoint_for_meet="$primary_node_fqdn" + fi + if contains "$primary_node_fqdn" "$CURRENT_POD_NAME" || contains "$primary_node_info" "$current_node_host_info"; then + echo "Current pod $CURRENT_POD_NAME is primary node, check and correct other primary nodes..." + check_and_meet_other_primary_nodes "$primary_node_endpoint_for_meet" "$primary_node_port" + echo "Node $CURRENT_POD_NAME is already in the cluster, skipping scale out replica..." + exit 0 + fi + # if the current pod is not a rebuild-instance and is already in the cluster, skip scale out replica + if ! 
is_rebuild_instance && check_node_in_cluster_with_retry "$primary_node_endpoint_for_meet" "$primary_node_port" "$current_node_host_info"; then + # if current pod is primary node, check the others primary info, if the others primary node info is expired, send cluster meet command again + echo "Current pod $CURRENT_POD_NAME is a secondary node, check and meet current primary node..." + check_and_meet_current_primary_node "$primary_node_endpoint_for_meet" "$primary_node_port" "$primary_node_bus_port" + echo "Node $CURRENT_POD_NAME is already in the cluster, skipping scale out replica..." + exit 0 + fi + + # Forget fail node when cluster is ok + # forget_fail_node_when_cluster_is_ok "$primary_node_endpoint_for_meet" "$primary_node_port" + + # add the current node as a replica of the primary node + primary_node_cluster_id=$(get_cluster_id_with_retry "$primary_node_endpoint_for_meet" "$primary_node_port") + status=$? + if is_empty "$primary_node_cluster_id" || [ $status -ne 0 ]; then + echo "Failed to get the cluster id of the primary node $primary_node_endpoint_with_port, sleep 30s for waiting next pod to start" >&2 + sleep 30s + shutdown_redis_server "$service_port" + exit 1 + fi + # current_node_with_port do not use advertised svc and port, because advertised svc and port are not ready when Pod is not Ready. + current_pod_fqdn=$(get_target_pod_fqdn_from_pod_fqdn_vars "$CURRENT_SHARD_POD_FQDN_LIST" "$CURRENT_POD_NAME") + if is_rebuild_instance; then + echo "Current instance is a rebuild-instance, forget node id in the cluster firstly." 
+ node_id=$(get_cluster_id_with_retry "$primary_node_endpoint_for_meet" "$primary_node_port" "$current_node_host_info") + if [ -z ${REDIS_DEFAULT_PASSWORD} ]; then + redis-cli $REDIS_CLI_TLS_CMD -p $service_port --cluster call $primary_node_endpoint_with_port cluster forget ${node_id} + else + redis-cli $REDIS_CLI_TLS_CMD -p $service_port --cluster call $primary_node_endpoint_with_port cluster forget ${node_id} -a ${REDIS_DEFAULT_PASSWORD} + fi + fi + current_node_with_port="$current_pod_fqdn:$service_port" + replicated_output=$(secondary_replicated_to_primary "$current_node_with_port" "$primary_node_endpoint_with_port" "$primary_node_cluster_id") + status=$? + if [ $status -ne 0 ] ; then + if is_rebuild_instance && contains "$replicated_output" "is not empty"; then + echo "Current instance is a rebuild-instance, but the node already knows other nodes (check with CLUSTER NODES) or contains some key in database 0, shutdown redis server..." >&2 + shutdown_redis_server + exit 1 + elif contains "$replicated_output" "is not empty"; then + echo "Replica is not empty, Either the node already knows other nodes (check with CLUSTER NODES) or contains some key in database 0" + elif [[ $replicated_output == *"Not all 16384 slots are covered by nodes"* ]]; then + # shutdown the redis server if the cluster is not fully covered by nodes + echo "Not all 16384 slots are covered by nodes, shutdown redis server" >&2 + shutdown_redis_server + exit 1 + else + echo "Failed to add the node $current_pod_fqdn to the cluster in scale_redis_cluster_replica, Error message: $replicated_output, shutdown redis server" >&2 + shutdown_redis_server "$service_port" + exit 1 + fi + fi + + if is_rebuild_instance; then + echo "replicate the node $current_pod_fqdn to the primary node $primary_node_endpoint_with_port successfully in rebuild-instance, remove rebuild.flag file..." 
+ remove_rebuild_instance_flag + fi + + # Hacky: When the entire redis cluster is restarted, a hacky sleep is used to wait for all primaries to enter the restarting state + sleep_when_ut_mode_false 5 + + # cluster meet the primary node until the current node is successfully added to the cluster + current_primary_met=false + declare -A other_primary_met + for node_info in "${other_comp_primary_nodes[@]}"; do + other_primary_met["$node_info"]=false + done + while true; do + all_met=true + + # meet current component primary node if not met yet + if ! $current_primary_met; then + if scale_out_replica_send_meet "$primary_node_endpoint_for_meet" "$primary_node_port" "$primary_node_bus_port" "$current_node_host_info"; then + echo "Successfully meet the primary node $primary_node_endpoint_with_port in scale_redis_cluster_replica" + current_primary_met=true + else + echo "Failed to meet current primary node $primary_node_endpoint_with_port" + all_met=false + fi + fi + + # meet the other components primary nodes if not met yet + for node_info in "${other_comp_primary_nodes[@]}"; do + if [ "${other_primary_met[$node_info]}" = false ]; then + node_endpoint_with_port=$(echo "$node_info" | awk -F '@' '{print $1}' | awk -F '#' '{print $3}') + node_endpoint=$(echo "$node_endpoint_with_port" | awk -F ':' '{print $1}') + node_port=$(echo "$node_endpoint_with_port" | awk -F ':' '{print $2}') + node_bus_port=$(echo "$node_info" | awk -F '@' '{print $2}') + node_fqdn=$(echo "$node_info" | awk -F '#' '{print $2}') + node_endpoint_for_meet="$node_endpoint" + if [ "$network_mode" == "default" ]; then + node_endpoint_for_meet="$node_fqdn" + fi + if scale_out_replica_send_meet "$node_endpoint_for_meet" "$node_port" "$node_bus_port" "$current_node_host_info"; then + echo "Successfully meet the primary node $node_endpoint_with_port in scale_redis_cluster_replica" + other_primary_met["$node_info"]=true + else + echo "Failed to meet the other component primary node $node_endpoint_with_port in 
scale_redis_cluster_replica" >&2 + all_met=false + fi + fi + done + + # If all nodes are met successfully, break the loop + if $all_met && $current_primary_met; then + echo "All primary nodes have been successfully met" + break + fi + + sleep_when_ut_mode_false 3 + done +} + +scale_out_replica_send_meet() { + local node_endpoint_to_meet="$1" + local node_port_to_meet="$2" + local node_bus_port_to_meet="$3" + local node_to_join="$4" + + if check_node_in_cluster "$node_endpoint_to_meet" "$node_port_to_meet" "$node_to_join"; then + echo "Node $CURRENT_POD_NAME is successfully added to the cluster." + return 0 + fi + + node_cluster_announce_ip=$(get_cluster_announce_ip_with_retry "$node_endpoint_to_meet" "$node_port_to_meet") + # send cluster meet command to the target node + if send_cluster_meet_with_retry "127.0.0.1" "$service_port" "$node_cluster_announce_ip" "$node_port_to_meet" "$node_bus_port_to_meet"; then + echo "scale out replica meet the node $node_cluster_announce_ip successfully..." + else + echo "Failed to meet the node $node_endpoint_to_meet in scale_redis_cluster_replica, shutdown redis server" >&2 + return 1 + fi + + return 0 +} + +load_redis_template_conf() { + echo "include $redis_template_conf" >> $redis_real_conf +} + +build_redis_default_accounts() { + unset_xtrace_when_ut_mode_false + if ! is_empty "$REDIS_REPL_PASSWORD"; then + echo "masteruser $REDIS_REPL_USER" >> $redis_real_conf + echo "masterauth $REDIS_REPL_PASSWORD" >> $redis_real_conf + redis_repl_password_sha256=$(echo -n "$REDIS_REPL_PASSWORD" | sha256sum | cut -d' ' -f1) + echo "user $REDIS_REPL_USER on +psync +replconf +ping #$redis_repl_password_sha256" >> $redis_acl_file + fi + if ! 
is_empty "$REDIS_DEFAULT_PASSWORD"; then + echo "protected-mode yes" >> $redis_real_conf + redis_password_sha256=$(echo -n "$REDIS_DEFAULT_PASSWORD" | sha256sum | cut -d' ' -f1) + echo "user default on #$redis_password_sha256 ~* &* +@all " >> $redis_acl_file + else + echo "protected-mode no" >> $redis_real_conf + fi + set_xtrace_when_ut_mode_false + echo "aclfile /data/users.acl" >> $redis_real_conf + echo "build redis default accounts succeeded!" +} + +rebuild_redis_acl_file() { + if [ -f $redis_acl_file ]; then + sed "/user default on/d" $redis_acl_file > $redis_acl_file_bak && mv $redis_acl_file_bak $redis_acl_file + sed "/user $REDIS_REPL_USER on/d" $redis_acl_file > $redis_acl_file_bak && mv $redis_acl_file_bak $redis_acl_file + sed "/user $REDIS_SENTINEL_USER on/d" $redis_acl_file > $redis_acl_file_bak && mv $redis_acl_file_bak $redis_acl_file + else + touch $redis_acl_file + fi +} + +build_announce_ip_and_port() { + # build announce ip and port according to whether the advertised svc is enabled + if ! is_empty "$redis_announce_host_value" && ! is_empty "$redis_announce_port_value"; then + echo "redis use advertised svc $redis_announce_host_value:$redis_announce_port_value to announce" + { + echo "replica-announce-port $redis_announce_port_value" + echo "replica-announce-ip $redis_announce_host_value" + } >> $redis_real_conf + elif [ "$FIXED_POD_IP_ENABLED" == "true" ]; then + echo "redis use fixed pod ip: $CURRENT_POD_IP to announce" + echo "replica-announce-ip $CURRENT_POD_IP" >> $redis_real_conf + else + current_pod_fqdn=$(get_target_pod_fqdn_from_pod_fqdn_vars "$CURRENT_SHARD_POD_FQDN_LIST" "$CURRENT_POD_NAME") + if is_empty "$current_pod_fqdn"; then + echo "Error: Failed to get current pod: $CURRENT_POD_NAME fqdn from current shard pod fqdn list: $CURRENT_SHARD_POD_FQDN_LIST. Exiting." 
+ exit 1 + fi + echo "redis use kb pod fqdn $current_pod_fqdn to announce" + echo "replica-announce-ip $current_pod_fqdn" >> $redis_real_conf + fi +} + +build_cluster_announce_info() { + current_pod_fqdn=$(get_target_pod_fqdn_from_pod_fqdn_vars "$CURRENT_SHARD_POD_FQDN_LIST" "$CURRENT_POD_NAME") + if is_empty "$current_pod_fqdn"; then + echo "Error: Failed to get current pod: $CURRENT_POD_NAME fqdn from current shard pod fqdn list: $CURRENT_SHARD_POD_FQDN_LIST. Exiting." + exit 1 + fi + current_node_host_info="$current_pod_fqdn" + # build announce ip and port according to whether the advertised svc is enabled + if ! is_empty "$redis_announce_host_value" && ! is_empty "$redis_announce_port_value" && ! is_empty "$redis_announce_bus_port_value"; then + current_node_host_info="$redis_announce_host_value:$redis_announce_port_value" + echo "redis cluster use advertised svc $redis_announce_host_value:$redis_announce_port_value@$redis_announce_bus_port_value to announce" + { + echo "cluster-announce-ip $redis_announce_host_value" + echo "cluster-announce-bus-port $redis_announce_bus_port_value" + # echo "cluster-announce-hostname $current_pod_fqdn" + echo "cluster-preferred-endpoint-type ip" + if [ "$TLS_ENABLED" == "true" ]; then + echo "cluster-announce-tls-port $redis_announce_port_value" + echo "cluster-announce-port 0" + else + echo "cluster-announce-port $redis_announce_port_value" + fi + } >> $redis_real_conf + elif [ "$FIXED_POD_IP_ENABLED" == "true" ]; then + echo "redis cluster use fixed pod ip: $CURRENT_POD_IP to announce" + { + echo "cluster-announce-ip $CURRENT_POD_IP" + echo "cluster-announce-hostname $current_pod_fqdn" + echo "cluster-preferred-endpoint-type ip" + } >> $redis_real_conf + else + echo "valkey cluster use pod fqdn $current_pod_fqdn to announce (preferring ip endpoint type)" + # Stream-Valkey divergence vs. 
upstream redis-cluster-server-start.sh: + # upstream emits `cluster-preferred-endpoint-type hostname` here, which + # makes CLUSTER SLOTS announce *.svc.cluster.local FQDNs that external + # clients (e.g. chat-api on EC2) cannot resolve. Force `ip` so cluster + # topology stays VPC-routable, matching the other two branches above. + { + echo "cluster-announce-ip $CURRENT_POD_IP" + echo "cluster-announce-hostname $current_pod_fqdn" + echo "cluster-preferred-endpoint-type ip" + } >> $redis_real_conf + fi +} + +build_redis_cluster_service_port() { + if ! is_empty "$SERVICE_PORT"; then + service_port=$SERVICE_PORT + fi + if ! is_empty "$CLUSTER_BUS_PORT"; then + cluster_bus_port=$CLUSTER_BUS_PORT + fi + if [ "$TLS_ENABLED" == "true" ]; then + echo "tls-port $service_port" >> $redis_real_conf + else + echo "port $service_port" >> $redis_real_conf + fi + echo "cluster-port $cluster_bus_port" >> $redis_real_conf +} + +parse_redis_cluster_shard_announce_addr() { + # The value format of CURRENT_SHARD_ADVERTISED_PORT and CURRENT_SHARD_ADVERTISED_BUS_PORT are "pod1Svc:advertisedPort1,pod2Svc:advertisedPort2,..." + if is_empty "$CURRENT_SHARD_ADVERTISED_PORT" || is_empty "$CURRENT_SHARD_ADVERTISED_BUS_PORT"; then + echo "Environment variable CURRENT_SHARD_ADVERTISED_PORT and CURRENT_SHARD_ADVERTISED_BUS_PORT not found. Ignoring." + # if redis cluster is in host network mode, use the host ip and port as the announce ip and port + if ! is_empty "${REDIS_CLUSTER_HOST_NETWORK_PORT}" && ! is_empty "${REDIS_CLUSTER_HOST_NETWORK_BUS_PORT}"; then + echo "redis cluster server is in host network mode, use the host ip:$CURRENT_POD_HOST_IP and port:$REDIS_CLUSTER_HOST_NETWORK_PORT, bus port:$REDIS_CLUSTER_HOST_NETWORK_BUS_PORT as the announce ip and port." 
+ redis_announce_port_value="$REDIS_CLUSTER_HOST_NETWORK_PORT" + redis_announce_bus_port_value="$REDIS_CLUSTER_HOST_NETWORK_BUS_PORT" + redis_announce_host_value="$CURRENT_POD_HOST_IP" + fi + return 0 + fi + + local pod_name="$CURRENT_POD_NAME" + local port + local bus_port + svc_and_port=$(parse_advertised_svc_and_port "$pod_name" "$CURRENT_SHARD_ADVERTISED_PORT" "true") + status=$? + if [[ $status -ne 0 ]] || is_empty "$svc_and_port"; then + echo "Exiting due to error in CURRENT_SHARD_ADVERTISED_PORT." + exit 1 + fi + + bus_port=$(parse_advertised_svc_and_port "$pod_name" "$CURRENT_SHARD_ADVERTISED_BUS_PORT") + status=$? + if [[ $status -ne 0 ]] || is_empty "$bus_port"; then + echo "Exiting due to error in CURRENT_SHARD_ADVERTISED_BUS_PORT." + exit 1 + fi + redis_announce_port_value="${svc_and_port#*:}" + redis_announce_bus_port_value="$bus_port" + svc_name=${svc_and_port%:*} + lb_host=$(extract_lb_host_by_svc_name "${svc_name}") + if [ -n "$lb_host" ]; then + echo "Found load balancer host for svcName '$svc_name', value is '$lb_host'." 
+ redis_announce_host_value="$lb_host" + redis_announce_port_value="6379" + redis_announce_bus_port_value="16379" + else + redis_announce_host_value="$CURRENT_POD_HOST_IP" + fi +} + +start_redis_server() { + module_path="/opt/redis-stack/lib" + if [[ "$IS_REDIS8" == "true" ]]; then + module_path="/usr/local/lib/redis/modules" + fi + exec_cmd="exec redis-server /etc/redis/redis.conf" + if [ -f ${module_path}/redisearch.so ]; then + exec_cmd="$exec_cmd --loadmodule ${module_path}/redisearch.so ${REDISEARCH_ARGS}" + fi + if [ -f ${module_path}/redistimeseries.so ]; then + exec_cmd="$exec_cmd --loadmodule ${module_path}/redistimeseries.so ${REDISTIMESERIES_ARGS}" + fi + if [ -f ${module_path}/rejson.so ]; then + exec_cmd="$exec_cmd --loadmodule ${module_path}/rejson.so ${REDISJSON_ARGS}" + fi + if [ -f ${module_path}/redisbloom.so ]; then + exec_cmd="$exec_cmd --loadmodule ${module_path}/redisbloom.so ${REDISBLOOM_ARGS}" + fi + if [ -f ${module_path}/redisgraph.so ]; then + exec_cmd="$exec_cmd --loadmodule ${module_path}/redisgraph.so ${REDISGRAPH_ARGS}" + fi + if [ -f ${module_path}/rediscompat.so ]; then + exec_cmd="$exec_cmd --loadmodule ${module_path}/rediscompat.so" + fi + # NOTE: in replication mode, load this module will lead a memory leak for slave instance. + #if [ -f ${module_path}/redisgears.so ]; then + # exec_cmd="$exec_cmd --loadmodule ${module_path}/redisgears.so v8-plugin-path ${module_path}/libredisgears_v8_plugin.so ${REDISGEARS_ARGS}" + #fi + echo "Starting redis server cmd: $exec_cmd" + eval "$exec_cmd" +} + +# build redis cluster configuration redis.conf +build_redis_conf() { + load_redis_template_conf + build_redis_cluster_service_port + build_announce_ip_and_port + build_cluster_announce_info + rebuild_redis_acl_file + build_redis_default_accounts +} + +# This is magic for shellspec ut framework. +# Sometime, functions are defined in a single shell script. +# You will want to test it. but you do not want to run the script. 
+# When included from shellspec, __SOURCED__ variable defined and script +# end here. The script path is assigned to the __SOURCED__ variable. +${__SOURCED__:+false} : || return 0 + +init_environment +load_redis_cluster_common_utils +parse_redis_cluster_shard_announce_addr +build_redis_conf +# TODO: move to memberJoin action in the future +scale_redis_cluster_replica & +start_redis_server diff --git a/addons/valkey/valkey-cluster-scripts/valkey-cluster-switchover.sh b/addons/valkey/valkey-cluster-scripts/valkey-cluster-switchover.sh new file mode 100644 index 000000000..bb1bc9808 --- /dev/null +++ b/addons/valkey/valkey-cluster-scripts/valkey-cluster-switchover.sh @@ -0,0 +1,255 @@ +#!/bin/bash + +# This is magic for shellspec ut framework. "test" is a `test [expression]` well known as a shell command. +# Normally test without [expression] returns false. It means that __() { :; } +# function is defined if this script runs directly. +# +# shellspec overrides the test command and returns true *once*. It means that +# __() function defined internally by shellspec is called. +# +# In other words. If not in test mode, __ is just a comment. If test mode, __ +# is a interception point. +# +# you should set ut_mode="true" when you want to run the script in shellspec file. +# +# shellcheck disable=SC2034 +# shellcheck disable=SC2153 +# shellcheck disable=SC1090 +ut_mode="false" +test || __() { + # when running in non-unit test mode, set the options "set -ex". 
+ set -ex; +} + +load_redis_cluster_common_utils() { + # the common.sh and valkey-cluster-common.sh scripts are defined in the valkey-cluster-scripts-template configmap + # and are mounted to the same path which is defined in the cmpd.spec.scripts + kblib_common_library_file="/scripts/common.sh" + redis_cluster_common_library_file="/scripts/valkey-cluster-common.sh" + source "${kblib_common_library_file}" + source "${redis_cluster_common_library_file}" +} + +check_environment_exist() { + local required_vars=( + "CURRENT_SHARD_POD_NAME_LIST" + "CURRENT_SHARD_POD_FQDN_LIST" + ) + + if [[ ${COMPONENT_REPLICAS} -lt 2 ]]; then + exit 0 + fi + + for var in "${required_vars[@]}"; do + if is_empty "${!var}"; then + echo "Error: Required environment variable $var is not set." >&2 + return 1 + fi + done + + if [ "$KB_SWITCHOVER_ROLE" != "primary" ]; then + echo "switchover not triggered for primary, nothing to do, exit 0" + exit 0 + fi +} + +init_redis_cluster_service_port() { + service_port=6379 + if [ -n "$SERVICE_PORT" ]; then + service_port=$SERVICE_PORT + fi +} + +get_current_shard_primary() { + local host=$1 + local port=$2 + local master_info + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + master_info=$(redis-cli $REDIS_CLI_TLS_CMD -h $host -p $port info replication) + else + master_info=$(redis-cli $REDIS_CLI_TLS_CMD -h $host -p $port -a "$REDIS_DEFAULT_PASSWORD" info replication) + fi + set_xtrace_when_ut_mode_false + + local master_host + local master_port + + master_host=$(echo "$master_info" | grep "master_host:" | cut -d':' -f2 | tr -d '[:space:]') + master_port=$(echo "$master_info" | grep "master_port:" | cut -d':' -f2 | tr -d '[:space:]') + + if is_empty "$master_host"|| is_empty "$master_port"; then + return 1 + fi + + echo "$master_host:$master_port" +} + +get_all_shards_master() { + local host=$1 + local port=$2 + local cluster_nodes_info + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + 
cluster_nodes_info=$(redis-cli $REDIS_CLI_TLS_CMD -h $host -p $port cluster nodes) + else + cluster_nodes_info=$(redis-cli $REDIS_CLI_TLS_CMD -h $host -p $port -a "$REDIS_DEFAULT_PASSWORD" cluster nodes) + fi + set_xtrace_when_ut_mode_false + + echo "$cluster_nodes_info" | grep "master" | grep -v "fail" | while read -r line; do + node_addr=$(echo "$line" | cut -d' ' -f2 | cut -d'@' -f1) + echo "$node_addr" + done +} + +do_switchover() { + candidate_pod=$1 + candidate_pod_fqdn=$2 + need_check=$3 + + # check candidate pod is ready and has the role of secondary + role=$(check_redis_role "$candidate_pod_fqdn" $service_port) + if [ "$role" = "primary" ]; then + echo "Info: Candidate pod $candidate_pod is already a primary" + exit 0 + fi + if ! equals "$role" "secondary"; then + echo "Error: Candidate pod $candidate_pod is not a secondary" >&2 + return 1 + fi + + # get current shard primary + current_shard_primary=$(get_current_shard_primary "$candidate_pod_fqdn" $service_port) + if is_empty "$current_shard_primary"; then + echo "Error: Could not determine current shard primary for $candidate_pod" >&2 + return 1 + fi + + # check cluster health from current shard primary + if ! 
check_slots_covered "$current_shard_primary" $service_port; then + echo "Error: Cluster health check failed" >&2 + return 1 + fi + + # check if candidate is known by all the shards primary + current_shard_primary_host=$(echo "$current_shard_primary" | cut -d':' -f1) + current_shard_primary_port=$(echo "$current_shard_primary" | cut -d':' -f2) + if is_empty "$current_shard_primary_host" || is_empty "$current_shard_primary_port"; then + echo "Error: Could not determine current shard primary host and port" >&2 + return 1 + fi + primaries=$(get_all_shards_master "$current_shard_primary_host" $current_shard_primary_port) + candidate_node_id=$(get_cluster_id "$candidate_pod_fqdn" $service_port) + for primary in $primaries; do + primary_host=$(echo "$primary" | cut -d':' -f1) + primary_port=$(echo "$primary" | cut -d':' -f2) + if ! check_node_in_cluster_with_retry "$primary_host" $primary_port "$candidate_node_id"; then + echo "Error: Candidate $candidate_pod is not known by shard $primary" >&2 + return 1 + fi + done + + # do switchover + echo "Starting switchover to $candidate_pod" + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + result=$(redis-cli $REDIS_CLI_TLS_CMD -h "$candidate_pod_fqdn" -p $service_port cluster failover) + else + result=$(redis-cli $REDIS_CLI_TLS_CMD -h "$candidate_pod_fqdn" -p $service_port -a "$REDIS_DEFAULT_PASSWORD" cluster failover) + fi + if [ "$need_check" != "true" ]; then + return 0 + fi + set_xtrace_when_ut_mode_false + if [ "$result" != "OK" ]; then + echo "Error: Cluster Failover command failed with result: $result" >&2 + return 1 + fi + + # check switchover result + max_attempts=60 + attempt=0 + while [ $attempt -lt $max_attempts ]; do + role=$(check_redis_role "$candidate_pod_fqdn" $service_port) + if [ "$role" = "primary" ]; then + echo "Switchover successful: $candidate_pod is now primary" + return 0 + fi + sleep 2 + ((attempt++)) + done + + echo "Error: Switchover verification timeout" >&2 + return 1 
+} + +switchover_without_candidate() { + candidate_pod="" + candidate_pod_fqdn="" + # check if the current node is removed from the cluster or not + cluster_nodes_info=$(get_cluster_nodes_info "$CURRENT_POD_IP" "$service_port") + status=$? + if [ $status -ne 0 ]; then + echo "Failed to get cluster nodes info " >&2 + return 1 + fi + #if current pod has been removed from cluster by redis-cluster-replica-member-leave.sh, and become an primary by dbctl, cluster nodes command return one line + if [ "$(echo "$cluster_nodes_info" | wc -l)" -le 1 ]; then + echo "this pos has been successfully removed replica from shard,no need to perform switch over." + return + fi + + # get the one of secondary pod of current shard + # TODO: get the most suitable secondary pod which has the lowest latency + IFS=',' read -ra PODS <<< "$CURRENT_SHARD_POD_NAME_LIST" + for pod_name in "${PODS[@]}"; do + local pod_fqdn + pod_fqdn=$(get_target_pod_fqdn_from_pod_fqdn_vars "$CURRENT_SHARD_POD_FQDN_LIST" "$pod_name") || { + echo "Failed to get FQDN for pod: $pod_name" >&2 + return 1 + } + role=$(check_redis_role "$pod_fqdn" $service_port) + if [ "$role" = "secondary" ]; then + candidate_pod=$pod_name + candidate_pod_fqdn=$pod_fqdn + break + fi + done + + if is_empty "$candidate_pod"; then + echo "Error: No eligible secondary found in pod list: $CURRENT_SHARD_POD_NAME_LIST" >&2 + return 1 + fi + + # do switchover + do_switchover "$candidate_pod" "$candidate_pod_fqdn" "false" || return 1 +} + +switchover_with_candidate() { + # check KB_SWITCHOVER_CANDIDATE_FQDN and KB_SWITCHOVER_CANDIDATE_NAME are not empty + if is_empty "$KB_SWITCHOVER_CANDIDATE_FQDN" || is_empty "$KB_SWITCHOVER_CANDIDATE_NAME"; then + echo "KB_SWITCHOVER_CANDIDATE_NAME or KB_SWITCHOVER_CANDIDATE_FQDN is empty" >&2 + return 1 + fi + + # do switchover + do_switchover "$KB_SWITCHOVER_CANDIDATE_NAME" "$KB_SWITCHOVER_CANDIDATE_FQDN" "true" || return 1 +} + +# This is magic for shellspec ut framework. 
+# Sometime, functions are defined in a single shell script. +# You will want to test it. but you do not want to run the script. +# When included from shellspec, __SOURCED__ variable defined and script +# end here. The script path is assigned to the __SOURCED__ variable. +${__SOURCED__:+false} : || return 0 + +# main +load_redis_cluster_common_utils +check_environment_exist || exit 1 +init_redis_cluster_service_port +if is_empty "$KB_SWITCHOVER_CANDIDATE_FQDN"; then + switchover_without_candidate || exit 1 +else + switchover_with_candidate || exit 1 +fi diff --git a/addons/valkey/valkey-cluster-scripts/valkey-ping.sh b/addons/valkey/valkey-cluster-scripts/valkey-ping.sh new file mode 100755 index 000000000..811c496f9 --- /dev/null +++ b/addons/valkey/valkey-cluster-scripts/valkey-ping.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# This is magic for shellspec ut framework. "test" is a `test [expression]` well known as a shell command. +# Normally test without [expression] returns false. It means that __() { :; } +# function is defined if this script runs directly. +# +# shellspec overrides the test command and returns true *once*. It means that +# __() function defined internally by shellspec is called. +# +# In other words. If not in test mode, __ is just a comment. If test mode, __ +# is an interception point. +# +# you should set ut_mode="true" when you want to run the script in shellspec file. +# +# shellcheck disable=SC2034 +ut_mode="false" +test || __() { + # when running in non-unit test mode, set the options "set -ex". + set -ex; +} + +load_common_library() { + # the common.sh script is mounted to the same path which is defined in the cmpd.spec.scripts + common_library_file="/scripts/common.sh" + # shellcheck disable=SC1090 + source "${common_library_file}" +} + +check_redis_ok() { + unset_xtrace_when_ut_mode_false + service_port=${SERVICE_PORT:-6379} + if ! 
is_empty "$REDIS_DEFAULT_PASSWORD"; then + cmd="redis-cli $REDIS_CLI_TLS_CMD -h localhost -p $service_port -a $REDIS_DEFAULT_PASSWORD ping" + else + cmd="redis-cli $REDIS_CLI_TLS_CMD -h localhost -p $service_port ping" + fi + response=$($cmd) + status=$? + set_xtrace_when_ut_mode_false + if [ $status -eq 124 ]; then + echo "Timed out" >&2 + return 1 + fi + if [ "$response" != "PONG" ]; then + echo "redis ping failed, response: $response" >&2 + return 1 + fi + echo "Redis is ok" +} + +retry_check_redis_ok() { + if call_func_with_retry 5 3 check_redis_ok; then + return 0 + else + echo "Redis is not running." >&2 + return 1 + fi +} + +# This is magic for shellspec ut framework. +# Sometime, functions are defined in a single shell script. +# You will want to test it. but you do not want to run the script. +# When included from shellspec, __SOURCED__ variable defined and script +# end here. The script path is assigned to the __SOURCED__ variable. +${__SOURCED__:+false} : || return 0 + +# main +load_common_library +retry_check_redis_ok || exit 1 diff --git a/addons/valkey/values.yaml b/addons/valkey/values.yaml new file mode 100644 index 000000000..1720fa17b --- /dev/null +++ b/addons/valkey/values.yaml @@ -0,0 +1,57 @@ +# Default values for the valkey addon. +# Single Valkey major version (9.x); add new entries here when adopting newer +# patch / minor versions. Cluster topology only — no sentinel, no twemproxy. + +nameOverride: "" +fullnameOverride: "" + +# Valkey versions: each entry produces a release line in ComponentVersion. +# `serviceVersion` is what users select in their Cluster CR; `imageTag` is +# the docker.io/valkey/valkey tag. Add a new entry to support a new patch +# without changing existing clusters. 
+valkeyVersions: + - major: "9" + componentDef: "valkey-cluster-9" + serviceVersion: "9.0.3" + defaultImageTag: "9.0.3" + mirrorVersions: + - version: "9.0.3" + imageTag: "9.0.3" + - version: "9.1.0" + imageTag: "9.1.0" + +image: + registry: docker.io + repository: valkey/valkey + pullPolicy: IfNotPresent + +# dbctl + agamotto stay on apecloud images — they are KubeBlocks-side +# tooling, not the engine. +dbctlImage: + registry: "" + repository: apecloud/dbctl + pullPolicy: IfNotPresent + tag: 0.2.1 + +metrics: + image: + registry: "" + repository: oliver006/redis_exporter + tag: v1.80.1 + pullPolicy: IfNotPresent + service: + port: 9121 + serverPort: 8888 + +# defined the data volume mount path of valkey server +dataMountPath: /data + +logConfigs: + running: /data/running.log + +# cluster domain without . prefix +clusterDomain: "cluster.local" + +enableMetrics: true + +tlsMountPath: /etc/pki/tls From 3b894f7e5643bf0ba2e61105334fc80d6517c8dd Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Wed, 6 May 2026 14:17:46 +0200 Subject: [PATCH 02/11] chore(valkey): expand 9.0.x mirror versions; bump appVersion to 9.0.4 Add 9.0.0, 9.0.1, 9.0.2, 9.0.4 alongside existing 9.0.3 / 9.1.0 in ComponentVersion releases. 9.0.4 (released 2026-05-06) becomes the chart appVersion and the default `serviceVersion` on the ComponentDefinition. The full 9.0.x range gives operators a pinned set of options for OpsRequest type=Upgrade rollback / patch-version testing without needing to redeploy the addon. Same-image-tag mapping; no behavioural change. 
--- addons/valkey/Chart.yaml | 2 +- addons/valkey/values.yaml | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/addons/valkey/Chart.yaml b/addons/valkey/Chart.yaml index 93321aa6c..5db68fd16 100644 --- a/addons/valkey/Chart.yaml +++ b/addons/valkey/Chart.yaml @@ -6,7 +6,7 @@ type: application version: 0.1.0 -appVersion: "9.0.3" +appVersion: "9.0.4" # Add a dependency to the kubeblocks definition library chart, same as the redis addon. dependencies: diff --git a/addons/valkey/values.yaml b/addons/valkey/values.yaml index 1720fa17b..7540bb0c2 100644 --- a/addons/valkey/values.yaml +++ b/addons/valkey/values.yaml @@ -12,11 +12,19 @@ fullnameOverride: "" valkeyVersions: - major: "9" componentDef: "valkey-cluster-9" - serviceVersion: "9.0.3" - defaultImageTag: "9.0.3" + serviceVersion: "9.0.4" + defaultImageTag: "9.0.4" mirrorVersions: + - version: "9.0.0" + imageTag: "9.0.0" + - version: "9.0.1" + imageTag: "9.0.1" + - version: "9.0.2" + imageTag: "9.0.2" - version: "9.0.3" imageTag: "9.0.3" + - version: "9.0.4" + imageTag: "9.0.4" - version: "9.1.0" imageTag: "9.1.0" From 5f30aec1defec16cf67e399d7561abcb461308b3 Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Wed, 6 May 2026 14:26:27 +0200 Subject: [PATCH 03/11] chore(valkey): drop 9.1.0 from mirror versions 9.1.0 is still RC upstream and not yet a tagged release on docker.io/valkey/valkey. Keep ComponentVersion to the stable 9.0.x line (9.0.0 - 9.0.4) for now; re-add 9.1.0 once the GA tag ships. 
--- addons/valkey/values.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/addons/valkey/values.yaml b/addons/valkey/values.yaml index 7540bb0c2..0a020c950 100644 --- a/addons/valkey/values.yaml +++ b/addons/valkey/values.yaml @@ -25,8 +25,6 @@ valkeyVersions: imageTag: "9.0.3" - version: "9.0.4" imageTag: "9.0.4" - - version: "9.1.0" - imageTag: "9.1.0" image: registry: docker.io From 64159e4e86df02998f50458ef993e05d36dda234 Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Fri, 8 May 2026 13:18:28 +0200 Subject: [PATCH 04/11] dropped the 'reconfigure' field --- addons/valkey/templates/cmpd-valkey-cluster.yaml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/addons/valkey/templates/cmpd-valkey-cluster.yaml b/addons/valkey/templates/cmpd-valkey-cluster.yaml index c9c39b46a..f41745ade 100644 --- a/addons/valkey/templates/cmpd-valkey-cluster.yaml +++ b/addons/valkey/templates/cmpd-valkey-cluster.yaml @@ -76,7 +76,15 @@ spec: namespace: {{ $.Release.Namespace }} volumeName: valkey-cluster-config externalManaged: true - {{- include "valkey.config.reconfigureAction" $ | nindent 6 }} + # NOTE: dropped the `reconfigure` field (helper: + # valkey.config.reconfigureAction) because the field was added to the + # ComponentDefinition CRD in KubeBlocks 1.1.x and is not in 1.0.2 (current + # stable). When upgrading the operator past 1.1.x, re-add: + # {{ "{{- include \"valkey.config.reconfigureAction\" $ | nindent 6 }}" }} + # Trade-off: without `reconfigure`, ConfigMap changes don't hot-reload via + # operator exec. Config changes take effect on the next pod restart (helm + # upgrade with resource/version delta, or `kubectl create -f + # charts/valkey/ops/restart.yaml`). Adequate for our cache-mode workload. 
scripts: - name: valkey-cluster-scripts template: {{ include "valkeyCluster.scriptsTemplate" $ }} From ed31ccc5a34bb37a6e4f50112fa5a8eb340240a4 Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Fri, 8 May 2026 14:14:16 +0200 Subject: [PATCH 05/11] fix: remove externalManaged --- .../valkey/templates/cmpd-valkey-cluster.yaml | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/addons/valkey/templates/cmpd-valkey-cluster.yaml b/addons/valkey/templates/cmpd-valkey-cluster.yaml index f41745ade..4b78d0f3e 100644 --- a/addons/valkey/templates/cmpd-valkey-cluster.yaml +++ b/addons/valkey/templates/cmpd-valkey-cluster.yaml @@ -75,16 +75,21 @@ spec: template: {{ printf "valkey-cluster-config-template-%s" $.Chart.Version }} namespace: {{ $.Release.Namespace }} volumeName: valkey-cluster-config - externalManaged: true - # NOTE: dropped the `reconfigure` field (helper: + # NOTE: do NOT set `externalManaged: true`. In KubeBlocks 1.0.2 the + # synthesizer (pkg/controller/component/synthesize_component.go) wipes + # `template` to "" whenever externalManaged is true and the Cluster CR + # does not supply a `componentSpecs[].configs[].configMap` override. + # The downstream `transformer_component_template.precheck` then fails + # with `config/script template has no template specified: valkey-cluster-config`. + # We don't expose user-overridable configs, so leaving this off lets the + # operator manage the chart-provided ConfigMap directly. + # + # NOTE: also dropped the `reconfigure` field (helper: # valkey.config.reconfigureAction) because the field was added to the - # ComponentDefinition CRD in KubeBlocks 1.1.x and is not in 1.0.2 (current - # stable). When upgrading the operator past 1.1.x, re-add: - # {{ "{{- include \"valkey.config.reconfigureAction\" $ | nindent 6 }}" }} - # Trade-off: without `reconfigure`, ConfigMap changes don't hot-reload via - # operator exec. 
Config changes take effect on the next pod restart (helm - # upgrade with resource/version delta, or `kubectl create -f - # charts/valkey/ops/restart.yaml`). Adequate for our cache-mode workload. + # ComponentDefinition CRD in KubeBlocks 1.1.x and is not in 1.0.2. + # ConfigMap changes won't hot-reload via operator exec — they take effect + # on the next pod restart (helm upgrade with resource/version delta, or + # `kubectl create -f charts/valkey/ops/restart.yaml`). Adequate for cache. scripts: - name: valkey-cluster-scripts template: {{ include "valkeyCluster.scriptsTemplate" $ }} From a35926b205bcaa667996c6179116da020007c6df Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Mon, 11 May 2026 13:36:29 +0200 Subject: [PATCH 06/11] feat: avoid all replicas or primaries gone at once in rolling updates --- addons/valkey/templates/shardingdefinition.yaml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/addons/valkey/templates/shardingdefinition.yaml b/addons/valkey/templates/shardingdefinition.yaml index ed1e71c8b..882b3a259 100644 --- a/addons/valkey/templates/shardingdefinition.yaml +++ b/addons/valkey/templates/shardingdefinition.yaml @@ -12,8 +12,18 @@ spec: shardsLimit: minShards: 1 maxShards: 64 + # provisionStrategy: how shards are added during initial cluster create or + # scale-out. Parallel is safe here — the bootstrap script ensures all shards + # converge before slot assignment runs. provisionStrategy: Parallel - updateStrategy: Parallel + # updateStrategy: how shards are processed during Upgrade/Restart/VScale + # OpsRequests. MUST be Serial for a Redis Cluster: if multiple shards roll + # in parallel, you can simultaneously lose every replica (then every + # primary), which breaks cluster bus quorum and leaves orphan/ghost nodes + # the heal CronJob can't recover from (no anchor pod in cluster_state:ok). 
+ # Within a shard, podManagementPolicy=OrderedReady on the cmpd already + # serializes replica → primary with CLUSTER FAILOVER in between. + updateStrategy: Serial systemAccounts: - name: default shared: true From e3c104b83ba1ea8b10beb81ad29d6e0e813644ce Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Mon, 11 May 2026 13:55:17 +0200 Subject: [PATCH 07/11] revert: kb 1.0.2 doesn't consume update strategy --- .../valkey/templates/shardingdefinition.yaml | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/addons/valkey/templates/shardingdefinition.yaml b/addons/valkey/templates/shardingdefinition.yaml index 882b3a259..3f59f1d23 100644 --- a/addons/valkey/templates/shardingdefinition.yaml +++ b/addons/valkey/templates/shardingdefinition.yaml @@ -12,18 +12,16 @@ spec: shardsLimit: minShards: 1 maxShards: 64 - # provisionStrategy: how shards are added during initial cluster create or - # scale-out. Parallel is safe here — the bootstrap script ensures all shards - # converge before slot assignment runs. + # NOTE: provisionStrategy + updateStrategy are declared/validated by KB 1.0.2 + # (`controllers/apps/shardingdefinition_controller.go:validateProvisionNUpdateStrategy`) + # but NOT actually consumed when reconciling shard rollouts. The cluster + # controller updates all sharding components in parallel regardless of this + # value. Verified by `grep -r SerialStrategy pkg/controller/` returning no + # consumer references. Leaving as Parallel to match the field's actual + # semantics in this KB version — see charts/valkey/README.md for what + # actually controls upgrade safety (spoiler: nothing, currently). provisionStrategy: Parallel - # updateStrategy: how shards are processed during Upgrade/Restart/VScale - # OpsRequests. 
MUST be Serial for a Redis Cluster: if multiple shards roll - # in parallel, you can simultaneously lose every replica (then every - # primary), which breaks cluster bus quorum and leaves orphan/ghost nodes - # the heal CronJob can't recover from (no anchor pod in cluster_state:ok). - # Within a shard, podManagementPolicy=OrderedReady on the cmpd already - # serializes replica → primary with CLUSTER FAILOVER in between. - updateStrategy: Serial + updateStrategy: Parallel systemAccounts: - name: default shared: true From 438d05389d21af3f3e8f369a929f541348ac17f1 Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Tue, 12 May 2026 15:06:09 +0200 Subject: [PATCH 08/11] fix: use REDIS_POD_FQDN_LIST for memberJoin --- .../valkey/valkey-cluster-scripts/sync-acl.sh | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/addons/valkey/valkey-cluster-scripts/sync-acl.sh b/addons/valkey/valkey-cluster-scripts/sync-acl.sh index 0b6bd0c31..b9ad7e098 100644 --- a/addons/valkey/valkey-cluster-scripts/sync-acl.sh +++ b/addons/valkey/valkey-cluster-scripts/sync-acl.sh @@ -1,4 +1,16 @@ #!/bin/bash +# +# Sync ACL rules from existing shard peers onto a newly-joined pod. Invoked +# by KubeBlocks as the memberJoin lifecycle action. +# +# Env vars in scope (we tolerate either naming so the script works under +# both 1.0.x and 1.1.x KB versions): +# - KB_JOIN_MEMBER_POD_FQDN — the pod being joined (injected by KB) +# - REDIS_POD_FQDN_LIST — historical upstream name (often unset) +# - CURRENT_SHARD_POD_FQDN_LIST — name our cmpd actually exposes +# +# If neither list is populated, we have no peers to query and there's +# nothing to sync. Exit 0 in that case rather than failing the join. 
service_port=${SERVICE_PORT:-6379} redis_base_cmd="redis-cli $REDIS_CLI_TLS_CMD -p $service_port -a $REDIS_DEFAULT_PASSWORD" @@ -6,10 +18,17 @@ if [ -z "$REDIS_DEFAULT_PASSWORD" ]; then redis_base_cmd="redis-cli $REDIS_CLI_TLS_CMD -p $service_port" fi +# Pick whichever peer list is populated; tolerate either name. +peer_list="${REDIS_POD_FQDN_LIST:-$CURRENT_SHARD_POD_FQDN_LIST}" +if [ -z "$peer_list" ]; then + echo "No peer FQDN list available (REDIS_POD_FQDN_LIST and CURRENT_SHARD_POD_FQDN_LIST both empty); nothing to sync, exiting 0" >&2 + exit 0 +fi + is_ok=false acl_list="" # 1. get acl list from other pods -for pod_fqdn in $(echo "$REDIS_POD_FQDN_LIST" | tr ',' '\n'); do +for pod_fqdn in $(echo "$peer_list" | tr ',' '\n'); do if [[ "$pod_fqdn" == "$KB_JOIN_MEMBER_POD_FQDN" ]]; then continue fi @@ -21,7 +40,7 @@ for pod_fqdn in $(echo "$REDIS_POD_FQDN_LIST" | tr ',' '\n'); do done if [ "$is_ok" = false ]; then - echo "Failed to get ACL LIST from other pods" >&2 + echo "Failed to get ACL LIST from any peer in: $peer_list" >&2 exit 1 fi From dc79ce268afce779bd976459322587f54f9cf976 Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Tue, 12 May 2026 21:07:07 +0200 Subject: [PATCH 09/11] feat: distinguishing rejoining pod --- .../valkey-cluster-server-start.sh | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/addons/valkey/valkey-cluster-scripts/valkey-cluster-server-start.sh b/addons/valkey/valkey-cluster-scripts/valkey-cluster-server-start.sh index 14a8d9527..cffb881af 100755 --- a/addons/valkey/valkey-cluster-scripts/valkey-cluster-server-start.sh +++ b/addons/valkey/valkey-cluster-scripts/valkey-cluster-server-start.sh @@ -352,6 +352,30 @@ remove_rebuild_instance_flag() { fi } +# is_rejoining_pod returns 0 if this pod's PVC has preserved cluster state +# from a prior incarnation — i.e. nodes.conf shows we were part of a +# multi-node Redis cluster. 
When EBS persistence is enabled, this is the +# normal case after any pod recreation (Karpenter reschedule, AMI bump, +# machine type change, AZ failover). The right thing to do is let redis +# start with its preserved identity and let the cluster bus gossip update +# peer addresses; the start script should NOT try to add-node or replicate +# this pod (which fails with "Node ... is not empty" because the pod +# already has cluster state). +# +# Distinct from is_rebuild_instance, which looks for an explicit +# /data/rebuild.flag set by KubeBlocks during a planned wipe-and-rejoin. +# A rejoining pod has nodes.conf AND no rebuild.flag. +is_rejoining_pod() { + [[ ! -f /data/nodes.conf ]] && return 1 + # Need at least 2 lines (self + peers). A single-line nodes.conf is what + # a fresh redis writes on first start, before any cluster membership. + [[ $(grep -c ":" /data/nodes.conf) -le 1 ]] && return 1 + # Explicit KB-driven rebuild path takes precedence — fall through to the + # existing add-node/replicate logic, which knows how to handle that flag. + [[ -f /data/rebuild.flag ]] && return 1 + return 0 +} + # scale out replica of redis cluster shard if needed scale_redis_cluster_replica() { # Waiting for redis-server to start @@ -373,6 +397,23 @@ scale_redis_cluster_replica() { echo "the nodes.conf file after redis server start is not exist" fi + # EBS-rejoin short-circuit: when the PVC has preserved cluster state from a + # prior incarnation of this pod (typical after node replacement when + # persistence is enabled), the right thing is to let redis start with its + # preserved node identity and let cluster bus gossip update peer addresses. + # The add-node path below would fail here with "Node ... is not empty" + # because the pod already has cluster state. + # + # KubeBlocks-driven rebuild (via /data/rebuild.flag) still falls through + # to the existing logic, which knows how to handle that case. 
+ if is_rejoining_pod; then + nodes_count=$(grep -c ":" /data/nodes.conf) + echo "EBS rejoin detected: nodes.conf has ${nodes_count} entries from prior cluster membership." + echo "Skipping scale-out logic; redis cluster bus will re-converge via gossip." + echo "(FQDNs are stable across pod recreation; peers will update this pod's IP via gossip.)" + exit 0 + fi + for target_node_name in $(echo "${CURRENT_SHARD_POD_NAME_LIST}" | tr ',' '\n'); do if [ -f /data/rebuild.flag ] && [ "${CURRENT_POD_NAME}" == "${target_node_name}" ]; then continue From a2862ef29e20758369a153245e98711e5177e06c Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Wed, 13 May 2026 14:59:09 +0200 Subject: [PATCH 10/11] feat: enable AOF and grace for image-swap data survival MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch valkey-cluster from RDB-only to AOF for durability across pod restarts. The previous RDB-only config relied on shutdown-save which races with the 10s shutdown-timeout under concurrent writes — verified to flush 2 of 3 shards during multi-shard image upgrades. With AOF on, each pod independently recovers its full dataset from disk via AOF replay, so the cross-shard restart race becomes a non- issue. Tested across reshard, vscale, image-swap, and node-type cascade: 50/50 keys preserved end-to-end. 
Companion settings: - shutdown-timeout 25 (was default 10) for in-flight AOF rewrite - terminationGracePeriodSeconds 60 in the cmpd runtime spec --- addons/valkey/config/valkey-cluster-config.tpl | 18 ++++++++++++------ .../valkey/templates/cmpd-valkey-cluster.yaml | 4 ++++ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/addons/valkey/config/valkey-cluster-config.tpl b/addons/valkey/config/valkey-cluster-config.tpl index e896c7c8f..0885846c1 100644 --- a/addons/valkey/config/valkey-cluster-config.tpl +++ b/addons/valkey/config/valkey-cluster-config.tpl @@ -38,10 +38,11 @@ oom-score-adj no oom-score-adj-values 0 200 800 disable-thp yes -# AOF off: fsync on EBS gp3 caused 30-40ms event-loop stalls (LATENCY DOCTOR -# confirmed). Replicas + EBS-mounted nodes.conf give us cluster-topology -# durability, which is all we need for a cache. -appendonly no +# AOF on: required for data preservation across image-swap restarts. Without +# AOF, shutdown-save races with shutdown-timeout under concurrent writes, and +# a flushed dump.rdb can wipe a shard via the "empty master returns" cluster +# bus path. With AOF, every write is on disk within ~1s; restart replays AOF. +appendonly yes appendfilename "appendonly.aof" appenddirname "appendonlydir" appendfsync everysec @@ -52,10 +53,15 @@ aof-load-truncated yes aof-use-rdb-preamble yes aof-timestamp-enabled no -# Disable scheduled BGSAVE forks (default rules tripped every ~90s under our -# load; each fork briefly stalls the event loop). +# No scheduled BGSAVE: AOF gives us continuous durability; periodic BGSAVE +# forks add latency without adding safety. save "" +# Generous shutdown budget: AOF makes the final save cheap, but during AOF +# rewrite or under load there can still be I/O to drain. Paired with the +# cmpd's terminationGracePeriodSeconds (must be > this value). 
+shutdown-timeout 25 + slowlog-log-slower-than 10000 slowlog-max-len 128 diff --git a/addons/valkey/templates/cmpd-valkey-cluster.yaml b/addons/valkey/templates/cmpd-valkey-cluster.yaml index 4b78d0f3e..6daf8e8e5 100644 --- a/addons/valkey/templates/cmpd-valkey-cluster.yaml +++ b/addons/valkey/templates/cmpd-valkey-cluster.yaml @@ -473,6 +473,10 @@ spec: - /scripts/sync-acl.sh targetPodSelector: Any runtime: + # Generous grace so AOF rewrite or in-flight fsync can land before SIGKILL. + # Must exceed redis-conf `shutdown-timeout` (25s) by enough margin for + # preStop hook + clean exit. + terminationGracePeriodSeconds: 60 initContainers: - name: init-dbctl command: From f3279e5ffb0423f3e80f38a5b0c9021d9d7e7d14 Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Thu, 14 May 2026 15:57:51 +0200 Subject: [PATCH 11/11] feat (valkey): asm-reshard owns topology change + slot rebalance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the asm-reshard OpsDefinition from the stream-infra valkey chart into this addon — it's intrinsic to a sharded-Valkey deployment, not namespace-specific, so the addon is the right home. While moving, fold the Cluster CR patch into the OpsDef so a single OpsRequest drives the whole reshard. Adds two new actions around the existing slot-migration script: - prepare-topology (workload: Job, kubectl image): scale-out only. Patches Cluster.spec.shardings[0].shards up, waits for KB to bring the new pods to Running. No-op for scale-in. - asm-migrate (exec: valkey-cluster): existing direction-aware slot rebalance, untouched. - finalize-topology (workload: Job, kubectl image): scale-in only. After slots are drained, patches shards down and waits for KB to delete the extra pods. Each action's `failurePolicy: Fail` makes the OpsRequest fail loud if any step trips. KB tracks progress as N/3 in the OpsRequest status. 
KubeBlocks overrides workload.podSpec.serviceAccountName with the component's kb-managed SA, so the RBAC has to live on the consuming side (the chart binds a minimal Role to system:serviceaccounts: in templates/asm-rbac.yaml). Verified end-to-end with the stream-infra chart on KB 1.0.2 under writer-reporter load: 3→5 then 5→3, 50/50 keys preserved, 0 write failures. --- .../valkey/templates/opsdefinition-asm.yaml | 869 ++++++++++++++++++ 1 file changed, 869 insertions(+) create mode 100644 addons/valkey/templates/opsdefinition-asm.yaml diff --git a/addons/valkey/templates/opsdefinition-asm.yaml b/addons/valkey/templates/opsdefinition-asm.yaml new file mode 100644 index 000000000..5d084cd41 --- /dev/null +++ b/addons/valkey/templates/opsdefinition-asm.yaml @@ -0,0 +1,869 @@ +# ASM (Atomic Slot Migration) OpsDefinition for Valkey 9.0+ on KubeBlocks. +# +# Single-OpsRequest path for sharded Valkey resharding. Owns both the +# topology change (Cluster CR shards count) AND the slot rebalance. The +# user creates one OpsRequest with `targetShards: N` and KB runs three +# sequential actions: +# +# 1. prepare-topology — if scale-out, patch Cluster CR shards up, +# wait for new pods Running. No-op for scale-in. +# 2. asm-migrate — direction-aware slot rebalance (CLUSTER MIGRATESLOTS). +# Runs inside an existing valkey-cluster container; no kubectl needed. +# 3. finalize-topology — if scale-in, patch Cluster CR shards down, +# wait for old pods Terminated. No-op for scale-out. +# +# The asm-orchestrator ServiceAccount + RBAC (used by 1 and 3) is created +# per-namespace by the stream-infra valkey chart, not by this addon — the +# OpsDef references it by name and KB resolves it in the OpsRequest's +# namespace. +--- +apiVersion: operations.kubeblocks.io/v1alpha1 +kind: OpsDefinition +metadata: + name: asm-reshard + labels: + {{- include "valkey.labels" . | nindent 4 }} + annotations: + {{- include "valkey.apiVersion" . 
| nindent 4 }} +spec: + parametersSchema: + openAPIV3Schema: + properties: + targetShards: + type: integer + description: Desired shard count after resharding + clusterName: + type: string + description: Name of the Valkey Cluster CR + required: + - targetShards + - clusterName + podInfoExtractors: + - name: valkey-pod + podSelector: + multiPodSelectionPolicy: Any + env: + - name: CLUSTER_NAMESPACE + valueFrom: + envRef: + envName: CLUSTER_NAMESPACE + actions: + # ── Action 1: scale-out topology change ────────────────────────────── + # No-op when current_shards >= target. Otherwise patches Cluster.spec. + # shardings[0].shards to target and waits for KubeBlocks to bring the + # new pods to Running. + - name: prepare-topology + failurePolicy: Fail + parameters: + - targetShards + - clusterName + workload: + type: Job + podInfoExtractorName: valkey-pod + backoffLimit: 0 + podSpec: + # KB ignores serviceAccountName here; it injects the component's + # kb-managed SA. The chart binds the asm-orchestrator Role to + # the namespace's SA group, which covers it. See chart's + # templates/asm-rbac.yaml for the rationale. + restartPolicy: Never + containers: + - name: kubectl + image: {{ .Values.image.registry | default "docker.io" }}/apecloud/kubectl:1.29 + imagePullPolicy: IfNotPresent + command: + - bash + - -c + - | + set -euo pipefail + CLUSTER="${clusterName}" + TARGET="${targetShards}" + NS="${CLUSTER_NAMESPACE}" + log() { echo "[prepare-topology] $(date +%H:%M:%S) $*"; } + + CURRENT=$(kubectl get cluster -n "$NS" "$CLUSTER" \ + -o jsonpath='{.spec.shardings[0].shards}') + log "current_shards=${CURRENT} target_shards=${TARGET}" + + if [[ "$TARGET" -le "$CURRENT" ]]; then + log "Not scale-out; nothing to patch. 
(finalize-topology will handle scale-in.)" + exit 0 + fi + + log "Scale-out: patching Cluster ${CLUSTER} shardings[0].shards=${TARGET}" + kubectl patch cluster -n "$NS" "$CLUSTER" --type=json \ + -p "[{\"op\":\"replace\",\"path\":\"/spec/shardings/0/shards\",\"value\":${TARGET}}]" + + REPLICAS=$(kubectl get cluster -n "$NS" "$CLUSTER" \ + -o jsonpath='{.spec.shardings[0].template.replicas}') + EXPECTED=$((TARGET * REPLICAS)) + log "Waiting for ${EXPECTED} pods Running (replicas/shard=${REPLICAS})..." + + elapsed=0 + while [[ $elapsed -lt 600 ]]; do + READY=$(kubectl get pods -n "$NS" \ + -l "app.kubernetes.io/instance=${CLUSTER},apps.kubeblocks.io/sharding-name=shard" \ + --field-selector=status.phase=Running --no-headers 2>/dev/null \ + | wc -l | tr -d ' ') + [[ "$READY" -ge "$EXPECTED" ]] && break + [[ $((elapsed % 30)) -eq 0 ]] && log " ready=${READY}/${EXPECTED} (${elapsed}s)..." + sleep 10 + elapsed=$((elapsed + 10)) + done + if [[ "$READY" -lt "$EXPECTED" ]]; then + log "ERROR: only ${READY}/${EXPECTED} pods Running after 10m" + exit 1 + fi + + log "Waiting for Cluster phase Running..." + elapsed=0 + while [[ $elapsed -lt 300 ]]; do + PHASE=$(kubectl get cluster -n "$NS" "$CLUSTER" \ + -o jsonpath='{.status.phase}' 2>/dev/null) + [[ "$PHASE" == "Running" ]] && break + sleep 5 + elapsed=$((elapsed + 5)) + done + log "Cluster phase=${PHASE}. Topology ready for slot migration." + env: + - name: clusterName + value: $(clusterName) + - name: targetShards + value: $(targetShards) + - name: CLUSTER_NAMESPACE + value: $(CLUSTER_NAMESPACE) + + # ── Action 2: slot migration ───────────────────────────────────────── + # Direction-aware: detects scale-out vs scale-in from current vs target + # primary count, computes minimal slot moves, executes via CLUSTER + # MIGRATESLOTS. Runs inside an existing valkey-cluster container; uses + # CLUSTER NODES for discovery, redis-cli for the moves — no kubectl. 
+ - name: asm-migrate + failurePolicy: Fail + parameters: + - targetShards + - clusterName + exec: + backoffLimit: 0 + podInfoExtractorName: valkey-pod + containerName: valkey-cluster + command: + - bash + - -c + - | + set -euo pipefail + + # ── Configuration ────────────────────────────────────────────── + # Parameters injected by KubeBlocks via $(paramName) substitution + TARGET_SHARDS="$(targetShards)" + CLUSTER_NAME="$(clusterName)" + ASM_TIMEOUT="${ASM_TIMEOUT_SECONDS:-600}" + POLL_INTERVAL=2 + LOCAL_PORT=6379 + + # ── Logging ──────────────────────────────────────────────────── + info() { echo "[INFO] $(date +%H:%M:%S) $*"; } + warn() { echo "[WARN] $(date +%H:%M:%S) $*"; } + fail() { echo "[FAIL] $(date +%H:%M:%S) $*"; } + + # ── Auth ─────────────────────────────────────────────────────── + AUTH_ARGS="" + if [[ -n "${REDIS_DEFAULT_PASSWORD:-}" ]]; then + AUTH_ARGS="--no-auth-warning -a ${REDIS_DEFAULT_PASSWORD}" + fi + + vcli() { + local host="$1" port="$2"; shift 2 + if command -v valkey-cli >/dev/null 2>&1; then + valkey-cli $AUTH_ARGS -h "$host" -p "$port" "$@" + else + redis-cli $AUTH_ARGS -h "$host" -p "$port" "$@" + fi + } + + vcli_local() { + vcli 127.0.0.1 "$LOCAL_PORT" "$@" + } + + # ── Step 1: Validate parameters ──────────────────────────────── + if [[ -z "$TARGET_SHARDS" ]]; then + fail "TARGET_SHARDS is not set. Pass it via OpsRequest parameters." + exit 1 + fi + if ! [[ "$TARGET_SHARDS" =~ ^[0-9]+$ ]] || [[ "$TARGET_SHARDS" -lt 1 ]]; then + fail "TARGET_SHARDS must be a positive integer, got: '${TARGET_SHARDS}'" + exit 1 + fi + if [[ -z "$CLUSTER_NAME" ]]; then + fail "CLUSTER_NAME is not set. Pass it via OpsRequest parameters." + exit 1 + fi + + info "ASM resharding: cluster=${CLUSTER_NAME} target_shards=${TARGET_SHARDS} timeout=${ASM_TIMEOUT}s" + + # ── Step 2: Discover topology ────────────────────────────────── + CLUSTER_NODES_RAW="$(vcli_local CLUSTER NODES)" + if [[ -z "$CLUSTER_NODES_RAW" ]]; then + fail "CLUSTER NODES returned empty. 
Is the cluster running?" + exit 1 + fi + + declare -a PRIMARY_IDS=() PRIMARY_HOSTS=() PRIMARY_PORTS=() PRIMARY_SLOTS=() PRIMARY_COMP=() + + while IFS= read -r line; do + [[ -z "$line" ]] && continue + local_id="$(echo "$line" | awk '{print $1}')" + local_addr="$(echo "$line" | awk '{print $2}' | cut -d@ -f1)" + local_flags="$(echo "$line" | awk '{print $3}')" + local_host="$(echo "$local_addr" | rev | cut -d: -f2- | rev)" + local_port="$(echo "$local_addr" | rev | cut -d: -f1 | rev)" + if echo "$local_flags" | grep -q "master" && ! echo "$local_flags" | grep -q "fail\|handshake"; then + PRIMARY_IDS+=("$local_id") + PRIMARY_HOSTS+=("$local_host") + PRIMARY_PORTS+=("$local_port") + local_slots="$(echo "$line" | awk '{for(i=9;i<=NF;i++) printf "%s ", $i}')" + PRIMARY_SLOTS+=("$local_slots") + # Extract component short name from FQDN in CLUSTER NODES addr field. + # Format: ip:port@cport,podname.headless.ns.svc... + # Pod name: {cluster}-{comp}-{ordinal} -> component = {comp} + local_fqdn="$(echo "$line" | awk '{print $2}' | cut -d@ -f2 | cut -d, -f2 | cut -d. -f1)" + # Strip pod ordinal: valkey-poc-shard-mk4-0 -> valkey-poc-shard-mk4 + local_pod_base="${local_fqdn%-*}" + # Strip cluster prefix: valkey-poc-shard-mk4 -> shard-mk4 + local_comp="${local_pod_base#${CLUSTER_NAME}-}" + PRIMARY_COMP+=("$local_comp") + fi + done <<< "$CLUSTER_NODES_RAW" + + CURRENT_SHARDS="${#PRIMARY_IDS[@]}" + + # Sort primaries by component short name (ascending) to match KubeBlocks' + # shard removal rule: alphabetical sort, keep first N, drop the rest. + # This ensures indices >= TARGET_SHARDS correspond to the shards KB will remove. 
+ if [[ $CURRENT_SHARDS -gt 1 ]]; then + # Build sortable lines: "comp_name|index", sort by comp_name, extract reordered indices + declare -a SORT_ORDER=() + for i in "${!PRIMARY_COMP[@]}"; do + echo "${PRIMARY_COMP[$i]}|$i" + done | sort -t'|' -k1,1 | while IFS='|' read -r _ idx; do + SORT_ORDER+=("$idx") + done + # If subshell ate SORT_ORDER, rebuild via temp file + if [[ ${#SORT_ORDER[@]} -eq 0 ]]; then + SORT_TMP="$(mktemp)" + for i in "${!PRIMARY_COMP[@]}"; do + echo "${PRIMARY_COMP[$i]}|$i" + done | sort -t'|' -k1,1 | cut -d'|' -f2 > "$SORT_TMP" + SORT_ORDER=() + while IFS= read -r idx; do + SORT_ORDER+=("$idx") + done < "$SORT_TMP" + rm -f "$SORT_TMP" + fi + # Reorder all arrays + declare -a _IDS=() _HOSTS=() _PORTS=() _SLOTS=() _COMP=() + for idx in "${SORT_ORDER[@]}"; do + _IDS+=("${PRIMARY_IDS[$idx]}") + _HOSTS+=("${PRIMARY_HOSTS[$idx]}") + _PORTS+=("${PRIMARY_PORTS[$idx]}") + _SLOTS+=("${PRIMARY_SLOTS[$idx]}") + _COMP+=("${PRIMARY_COMP[$idx]}") + done + PRIMARY_IDS=("${_IDS[@]}") + PRIMARY_HOSTS=("${_HOSTS[@]}") + PRIMARY_PORTS=("${_PORTS[@]}") + PRIMARY_SLOTS=("${_SLOTS[@]}") + PRIMARY_COMP=("${_COMP[@]}") + fi + + # Count primaries that currently own slots vs. primaries that are empty. + # LOADED_SHARDS is the number of slot-owning shards, which is what we + # actually care about for direction detection (NOT CURRENT_SHARDS, which + # includes freshly-created empty primaries after a Cluster CR patch). + LOADED_SHARDS=0 + EMPTY_COUNT=0 + for i in "${!PRIMARY_IDS[@]}"; do + slots_trimmed="$(echo "${PRIMARY_SLOTS[$i]}" | xargs)" + if [[ -z "$slots_trimmed" ]]; then + EMPTY_COUNT=$((EMPTY_COUNT + 1)) + else + LOADED_SHARDS=$((LOADED_SHARDS + 1)) + fi + done + + info "Current topology: ${CURRENT_SHARDS} primaries (${LOADED_SHARDS} loaded, ${EMPTY_COUNT} empty), sorted by component name" + + for i in "${!PRIMARY_IDS[@]}"; do + info " shard ${i}: comp=${PRIMARY_COMP[$i]} id=${PRIMARY_IDS[$i]:0:8}... 
host=${PRIMARY_HOSTS[$i]}:${PRIMARY_PORTS[$i]} slots=[${PRIMARY_SLOTS[$i]}]" + done + + # Nothing to do only if both the primary count matches AND all primaries + # own slots (no empty primaries waiting to be populated). + if [[ "$TARGET_SHARDS" -eq "$LOADED_SHARDS" && "$EMPTY_COUNT" -eq 0 ]]; then + info "Already at ${TARGET_SHARDS} loaded shards with no empty primaries. Nothing to do." + exit 0 + fi + + if [[ "$TARGET_SHARDS" -gt "$LOADED_SHARDS" ]]; then + DIRECTION="scale-out" + info "Direction: scale-out (${LOADED_SHARDS} loaded -> ${TARGET_SHARDS} target)" + if [[ $EMPTY_COUNT -eq 0 ]]; then + fail "No empty shards found. Ensure new shard pods joined via CLUSTER MEET." + exit 1 + fi + if [[ $((LOADED_SHARDS + EMPTY_COUNT)) -lt "$TARGET_SHARDS" ]]; then + fail "Not enough primaries (${CURRENT_SHARDS}) to reach target ${TARGET_SHARDS}. Ensure Cluster CR shards=${TARGET_SHARDS} and all pods are Running." + exit 1 + fi + info "Will migrate slots into ${EMPTY_COUNT} empty shard(s)" + else + DIRECTION="scale-in" + info "Direction: scale-in (${LOADED_SHARDS} loaded -> ${TARGET_SHARDS} target)" + fi + + # ── Step 3: ACL patch ────────────────────────────────────────── + # MIGRATESLOTS authenticates to the target node as kbreplicator (via + # masterauth). The snapshot transfer replays the full RDB stream including + # SELECT, SET, HSET, and every other write command stored in the migrating + # slots. kbreplicator's default ACL (-@all +psync +replconf +ping) is far + # too restrictive. Grant +@all for the migration; the user is internal to + # the cluster (masterauth-only), so this does not widen the attack surface. + info "Patching ACL: granting kbreplicator +@all on all nodes for slot migration..." 
+ ALL_NODES_RAW="$(vcli_local CLUSTER NODES)" + ACL_FAIL=0 + while IFS= read -r line; do + [[ -z "$line" ]] && continue + node_addr="$(echo "$line" | awk '{print $2}' | cut -d@ -f1)" + node_flags="$(echo "$line" | awk '{print $3}')" + node_host="$(echo "$node_addr" | rev | cut -d: -f2- | rev)" + node_port="$(echo "$node_addr" | rev | cut -d: -f1 | rev)" + if echo "$node_flags" | grep -q "fail\|handshake"; then continue; fi + result="$(vcli "$node_host" "$node_port" ACL SETUSER kbreplicator "+@all" "~*" "&*" 2>&1 || echo 'ERROR')" + if [[ "$result" == *"OK"* ]]; then + info " ACL patched: ${node_host}:${node_port}" + elif [[ "$result" == *"ERR"*"user"*"not"* ]] || [[ "$result" == *"ERR"*"User"*"not"* ]]; then + warn " kbreplicator not found on ${node_host}:${node_port}" + else + warn " ACL patch failed on ${node_host}:${node_port}: ${result}" + ACL_FAIL=$((ACL_FAIL + 1)) + fi + done <<< "$ALL_NODES_RAW" + if [[ $ACL_FAIL -gt 0 ]]; then + warn "ACL patch failed on ${ACL_FAIL} node(s)." + fi + + # ── Step 4: Check for stuck migrations ───────────────────────── + info "Checking for stuck migrations from previous failures..." + STUCK_FOUND=false + while IFS= read -r line; do + [[ -z "$line" ]] && continue + node_flags="$(echo "$line" | awk '{print $3}')" + if echo "$node_flags" | grep -q "fail\|handshake"; then continue; fi + slot_fields="$(echo "$line" | awk '{for(i=9;i<=NF;i++) printf "%s ", $i}')" + if echo "$slot_fields" | grep -qE '\[.*->-\]|\[.*-<-\]'; then + STUCK_FOUND=true + break + fi + done <<< "$ALL_NODES_RAW" + + if [[ "$STUCK_FOUND" == "true" ]]; then + warn "Stuck MIGRATING/IMPORTING slots detected. Running cluster fix..." + fix_endpoint="127.0.0.1:${LOCAL_PORT}" + if command -v valkey-cli >/dev/null 2>&1; then + valkey-cli $AUTH_ARGS --cluster fix "$fix_endpoint" <<< "yes" 2>&1 || true + else + redis-cli $AUTH_ARGS --cluster fix "$fix_endpoint" <<< "yes" 2>&1 || true + fi + info "Cluster fix completed. Re-reading topology..." 
+ CLUSTER_NODES_RAW="$(vcli_local CLUSTER NODES)" + PRIMARY_IDS=() PRIMARY_HOSTS=() PRIMARY_PORTS=() PRIMARY_SLOTS=() + NEW_SHARD_IDS=() NEW_SHARD_HOSTS=() NEW_SHARD_PORTS=() + while IFS= read -r line; do + [[ -z "$line" ]] && continue + local_id="$(echo "$line" | awk '{print $1}')" + local_addr="$(echo "$line" | awk '{print $2}' | cut -d@ -f1)" + local_flags="$(echo "$line" | awk '{print $3}')" + local_host="$(echo "$local_addr" | rev | cut -d: -f2- | rev)" + local_port="$(echo "$local_addr" | rev | cut -d: -f1 | rev)" + if echo "$local_flags" | grep -q "master" && ! echo "$local_flags" | grep -q "fail\|handshake"; then + PRIMARY_IDS+=("$local_id") + PRIMARY_HOSTS+=("$local_host") + PRIMARY_PORTS+=("$local_port") + local_slots="$(echo "$line" | awk '{for(i=9;i<=NF;i++) printf "%s ", $i}')" + PRIMARY_SLOTS+=("$local_slots") + fi + done <<< "$CLUSTER_NODES_RAW" + for i in "${!PRIMARY_IDS[@]}"; do + slots_trimmed="$(echo "${PRIMARY_SLOTS[$i]}" | xargs)" + if [[ -z "$slots_trimmed" ]]; then + NEW_SHARD_IDS+=("${PRIMARY_IDS[$i]}") + NEW_SHARD_HOSTS+=("${PRIMARY_HOSTS[$i]}") + NEW_SHARD_PORTS+=("${PRIMARY_PORTS[$i]}") + fi + done + CURRENT_SHARDS="${#PRIMARY_IDS[@]}" + info "Topology after fix: ${CURRENT_SHARDS} primaries, ${#NEW_SHARD_IDS[@]} empty" + fi + + # ── Step 5: Compute slot plan ────────────────────────────────── + # Generalized: works for both scale-out and scale-in. + # Compute target slot count per primary, then pair donors with receivers. 
+ # ── Step 5 (cont.): even split of the 16384-slot keyspace.
+ # Each of TARGET_SHARDS primaries gets SLOTS_PER_SHARD slots; the first
+ # REMAINDER primaries absorb one leftover slot each.
+ TOTAL_SLOTS=16384
+ SLOTS_PER_SHARD=$((TOTAL_SLOTS / TARGET_SHARDS))
+ REMAINDER=$((TOTAL_SLOTS % TARGET_SHARDS))
+
+ info "Target distribution: ${SLOTS_PER_SHARD} slots/shard (+1 for first ${REMAINDER} shards)"
+
+ # Count current slots per primary
+ # PRIMARY_SLOTS[i] holds the slot fields of one CLUSTER NODES line:
+ # tokens are "N", "N-M", or bracketed in-flight markers like "[123-<-id]".
+ declare -a OWNED_SLOTS=()
+ for i in "${!PRIMARY_IDS[@]}"; do
+   count=0
+   for range in ${PRIMARY_SLOTS[$i]}; do
+     # Bracketed entries are importing/migrating markers, not owned slots.
+     [[ "$range" == *"["* ]] && continue
+     if [[ "$range" == *"-"* ]]; then
+       s="${range%-*}"; e="${range#*-}"
+       count=$((count + e - s + 1))
+     elif [[ "$range" =~ ^[0-9]+$ ]]; then
+       count=$((count + 1))
+     fi
+   done
+   OWNED_SLOTS+=("$count")
+ done
+
+ info "Current slot distribution:"
+ for i in "${!PRIMARY_IDS[@]}"; do
+   info " shard ${i} (${PRIMARY_IDS[$i]:0:8}...): ${OWNED_SLOTS[$i]} slots"
+ done
+
+ # Compute target count per primary:
+ # indices 0..TARGET_SHARDS-1: even share
+ # indices TARGET_SHARDS..CURRENT_SHARDS-1: 0 (being drained on scale-in)
+ declare -a TARGET_COUNTS=()
+ for i in "${!PRIMARY_IDS[@]}"; do
+   if [[ $i -lt $TARGET_SHARDS ]]; then
+     if [[ $i -lt $REMAINDER ]]; then
+       TARGET_COUNTS+=($((SLOTS_PER_SHARD + 1)))
+     else
+       TARGET_COUNTS+=("$SLOTS_PER_SHARD")
+     fi
+   else
+     TARGET_COUNTS+=(0)
+   fi
+ done
+
+ # Compute delta per primary: positive = donate, negative = receive
+ declare -a DELTAS=()
+ for i in "${!PRIMARY_IDS[@]}"; do
+   DELTAS+=($((OWNED_SLOTS[$i] - TARGET_COUNTS[$i])))
+ done
+
+ info "Slot movement plan:"
+ for i in "${!PRIMARY_IDS[@]}"; do
+   d="${DELTAS[$i]}"
+   if [[ $d -gt 0 ]]; then
+     info " shard ${i}: donate ${d} slots (${OWNED_SLOTS[$i]} -> ${TARGET_COUNTS[$i]})"
+   elif [[ $d -lt 0 ]]; then
+     info " shard ${i}: receive $((-d)) slots (${OWNED_SLOTS[$i]} -> ${TARGET_COUNTS[$i]})"
+   fi
+ done
+
+ # Build receiver list: shards that need more slots, with remaining capacity
+ declare -a RECV_IDX=() RECV_REMAINING=()
+ for i in "${!PRIMARY_IDS[@]}"; do
+   if [[ "${DELTAS[$i]}" -lt 0 ]]; then
+     RECV_IDX+=("$i")
+     RECV_REMAINING+=($(( -${DELTAS[$i]} )))
+   fi
+ done
+
+ # Build migration tuples: pair donors with receivers.
+ # Consumes from the end of each donor's ranges, splitting across
+ # receivers as needed. Tracks a cursor (cur_end) within each range
+ # to avoid overlapping migrations.
+ # Donors therefore shed their highest-numbered slots first; recv_ptr
+ # advances monotonically, so each receiver is filled before the next.
+ declare -a MIG_SOURCE_IDX=() MIG_TARGET_IDX=() MIG_START=() MIG_END=()
+ recv_ptr=0
+
+ for src_idx in "${!PRIMARY_IDS[@]}"; do
+   remaining_donate="${DELTAS[$src_idx]}"
+   [[ "$remaining_donate" -le 0 ]] && continue
+
+   # Parse this shard's slot ranges
+   declare -a SRC_RANGES_START=() SRC_RANGES_END=()
+   for range in ${PRIMARY_SLOTS[$src_idx]}; do
+     [[ "$range" == *"["* ]] && continue
+     if [[ "$range" == *"-"* ]]; then
+       SRC_RANGES_START+=("${range%-*}")
+       SRC_RANGES_END+=("${range#*-}")
+     elif [[ "$range" =~ ^[0-9]+$ ]]; then
+       SRC_RANGES_START+=("$range")
+       SRC_RANGES_END+=("$range")
+     fi
+   done
+
+   # Walk ranges from the end, tracking a cursor within each range
+   range_idx=$(( ${#SRC_RANGES_START[@]} - 1 ))
+   cur_end="${SRC_RANGES_END[$range_idx]}"
+
+   while [[ $remaining_donate -gt 0 && $range_idx -ge 0 ]]; do
+     r_start="${SRC_RANGES_START[$range_idx]}"
+     r_available=$((cur_end - r_start + 1))
+
+     # Find a receiver with remaining capacity
+     while [[ $recv_ptr -lt ${#RECV_IDX[@]} ]]; do
+       [[ "${RECV_REMAINING[$recv_ptr]}" -gt 0 ]] && break
+       recv_ptr=$((recv_ptr + 1))
+     done
+     if [[ $recv_ptr -ge ${#RECV_IDX[@]} ]]; then
+       warn "No more receivers for slots."; break
+     fi
+
+     # Take min(available_in_range, receiver_needs, remaining_donate)
+     recv_needs="${RECV_REMAINING[$recv_ptr]}"
+     take_size=$r_available
+     [[ $take_size -gt $recv_needs ]] && take_size=$recv_needs
+     [[ $take_size -gt $remaining_donate ]] && take_size=$remaining_donate
+
+     take_end="$cur_end"
+     take_start=$((cur_end - take_size + 1))
+
+     tgt_idx="${RECV_IDX[$recv_ptr]}"
+     MIG_SOURCE_IDX+=("$src_idx"); MIG_TARGET_IDX+=("$tgt_idx")
+     MIG_START+=("$take_start"); MIG_END+=("$take_end")
+
+     remaining_donate=$((remaining_donate - take_size))
+     RECV_REMAINING[$recv_ptr]=$((${RECV_REMAINING[$recv_ptr]} - take_size))
+     [[ "${RECV_REMAINING[$recv_ptr]}" -le 0 ]] && recv_ptr=$((recv_ptr + 1))
+
+     # Shrink the working range
+     cur_end=$((take_start - 1))
+     if [[ $cur_end -lt $r_start ]]; then
+       # Range fully consumed, move to previous range
+       range_idx=$((range_idx - 1))
+       [[ $range_idx -ge 0 ]] && cur_end="${SRC_RANGES_END[$range_idx]}"
+     fi
+   done
+   unset SRC_RANGES_START SRC_RANGES_END
+ done
+
+ if [[ "${#MIG_SOURCE_IDX[@]}" -eq 0 ]]; then
+   info "No migrations needed. Cluster is already balanced."; exit 0
+ fi
+
+ info "Planned ${#MIG_SOURCE_IDX[@]} migration(s):"
+ for i in "${!MIG_SOURCE_IDX[@]}"; do
+   src="${MIG_SOURCE_IDX[$i]}"; tgt="${MIG_TARGET_IDX[$i]}"
+   info " slots ${MIG_START[$i]}-${MIG_END[$i]} from shard ${src} (${PRIMARY_IDS[$src]:0:8}...) to shard ${tgt} (${PRIMARY_IDS[$tgt]:0:8}...)"
+ done
+
+ # ── Step 6: Execute migrations ─────────────────────────────────
+ # Best-effort rollback helper: ask every slot-owning primary to abort
+ # its in-flight migrations. Errors are ignored (|| true) — empty
+ # (slotless) primaries are skipped since they cannot be migrating.
+ cancel_all_migrations() {
+   warn "Cancelling all in-flight migrations..."
+   for i in "${!PRIMARY_IDS[@]}"; do
+     slots_trimmed="$(echo "${PRIMARY_SLOTS[$i]}" | xargs)"
+     [[ -z "$slots_trimmed" ]] && continue
+     vcli "${PRIMARY_HOSTS[$i]}" "${PRIMARY_PORTS[$i]}" CLUSTER CANCELSLOTMIGRATIONS 2>&1 || true
+   done
+ }
+
+ # Remediation helper: let the CLI repair slot coverage against the local
+ # node. "yes" on stdin auto-confirms the prompt; prefers valkey-cli and
+ # falls back to redis-cli when it is not on PATH.
+ run_cluster_fix() {
+   warn "Running cluster fix as remediation..."
+   fix_endpoint="127.0.0.1:${LOCAL_PORT}"
+   if command -v valkey-cli >/dev/null 2>&1; then
+     valkey-cli $AUTH_ARGS --cluster fix "$fix_endpoint" <<< "yes" 2>&1 || true
+   else
+     redis-cli $AUTH_ARGS --cluster fix "$fix_endpoint" <<< "yes" 2>&1 || true
+   fi
+ }
+
+ # Helper: extract state for a specific slot range from GETSLOTMIGRATIONS output.
+ # GETSLOTMIGRATIONS returns entries newest-first. Return the FIRST match
+ # for the given slot_range (= most recent migration for those slots).
+ get_mig_state() { + local output="$1" target_range="$2" + local found_range="" current_state="" + while IFS= read -r kv_line; do + case "$kv_line" in + slot_ranges) found_range="next" ;; + state) current_state="next" ;; + *) + if [[ "$found_range" == "next" ]]; then + found_range="$kv_line" + elif [[ "$current_state" == "next" ]]; then + current_state="$kv_line" + if [[ "$found_range" == "$target_range" ]]; then + echo "$current_state" + return 0 + fi + found_range=""; current_state="" + fi + ;; + esac + done <<< "$output" + } + + MIGRATION_FAILED=false + for mig_idx in "${!MIG_SOURCE_IDX[@]}"; do + src="${MIG_SOURCE_IDX[$mig_idx]}"; tgt="${MIG_TARGET_IDX[$mig_idx]}" + start="${MIG_START[$mig_idx]}"; end="${MIG_END[$mig_idx]}" + target_id="${PRIMARY_IDS[$tgt]}" + src_host="${PRIMARY_HOSTS[$src]}"; src_port="${PRIMARY_PORTS[$src]}" + slot_range="${start}-${end}" + + info "Migration ${mig_idx}: slots ${slot_range} from ${src_host}:${src_port} to ${target_id:0:8}..." + mig_result="$(vcli "$src_host" "$src_port" CLUSTER MIGRATESLOTS SLOTSRANGE "$start" "$end" NODE "$target_id" 2>&1)" + if [[ "$mig_result" != *"OK"* ]]; then + fail "CLUSTER MIGRATESLOTS failed: ${mig_result}" + MIGRATION_FAILED=true; break + fi + + info " Migration started (async). Polling for slot_range=${slot_range}..." 
+ elapsed=0; migration_done=false + while [[ $elapsed -lt $ASM_TIMEOUT ]]; do + sleep "$POLL_INTERVAL"; elapsed=$((elapsed + POLL_INTERVAL)) + src_status_raw="$(vcli "$src_host" "$src_port" CLUSTER GETSLOTMIGRATIONS 2>&1 || echo 'UNREACHABLE')" + tgt_host="${PRIMARY_HOSTS[$tgt]}"; tgt_port="${PRIMARY_PORTS[$tgt]}" + tgt_status_raw="$(vcli "$tgt_host" "$tgt_port" CLUSTER GETSLOTMIGRATIONS 2>&1 || echo 'UNREACHABLE')" + + # Check state of OUR migration only (by slot range), ignore stale history + src_state="$(get_mig_state "$src_status_raw" "$slot_range")" + tgt_state="$(get_mig_state "$tgt_status_raw" "$slot_range")" + + if [[ "$src_state" == "failed" || "$src_state" == "cancelled" ]]; then + fail " Migration failed on source (state=${src_state})" + MIGRATION_FAILED=true; break 2 + fi + if [[ "$tgt_state" == "failed" || "$tgt_state" == "cancelled" ]]; then + fail " Migration failed on target (state=${tgt_state})" + MIGRATION_FAILED=true; break 2 + fi + if [[ "$src_state" == "success" && "$tgt_state" == "success" ]]; then + migration_done=true; break + fi + # Also succeed if source says success (target may not track it the same way) + if [[ "$src_state" == "success" ]]; then + migration_done=true; break + fi + [[ $((elapsed % 10)) -eq 0 ]] && info " Polling... src=${src_state:-pending} tgt=${tgt_state:-pending} (${elapsed}s)" + done + [[ "$MIGRATION_FAILED" == "true" ]] && break + if [[ "$migration_done" != "true" ]]; then + fail " Migration timed out after ${ASM_TIMEOUT}s"; MIGRATION_FAILED=true; break + fi + info " Migration ${mig_idx} completed (${elapsed}s)" + done + + # ── Step 7: Error handling ───────────────────────────────────── + if [[ "$MIGRATION_FAILED" == "true" ]]; then + cancel_all_migrations + sleep 2 + REMNANTS="$(vcli_local CLUSTER NODES 2>/dev/null || echo '')" + if echo "$REMNANTS" | grep -qE '\[.*->-\]|\[.*-<-\]'; then + warn "MIGRATING/IMPORTING remnants detected after cancellation." + run_cluster_fix + fi + fail "ASM resharding FAILED. 
Cluster may need manual inspection." + exit 1 + fi + + # ── Step 8: Verify ───────────────────────────────────────────── + info "All migrations completed. Verifying cluster health..." + sleep 2 + CLUSTER_INFO="$(vcli_local CLUSTER INFO 2>/dev/null)" + SLOTS_OK="$(echo "$CLUSTER_INFO" | grep 'cluster_slots_ok' | cut -d: -f2 | tr -d '[:space:]')" + CLUSTER_STATE="$(echo "$CLUSTER_INFO" | grep 'cluster_state' | cut -d: -f2 | tr -d '[:space:]')" + info "cluster_state=${CLUSTER_STATE} cluster_slots_ok=${SLOTS_OK}" + + if [[ "$SLOTS_OK" != "16384" ]]; then + fail "VERIFICATION FAILED: cluster_slots_ok=${SLOTS_OK} (expected 16384)" + run_cluster_fix + sleep 2 + CLUSTER_INFO="$(vcli_local CLUSTER INFO 2>/dev/null)" + SLOTS_OK="$(echo "$CLUSTER_INFO" | grep 'cluster_slots_ok' | cut -d: -f2 | tr -d '[:space:]')" + if [[ "$SLOTS_OK" != "16384" ]]; then + fail "cluster_slots_ok still ${SLOTS_OK} after fix. Manual intervention required." + exit 1 + fi + info "Cluster fix restored all slots." + fi + if [[ "$CLUSTER_STATE" != "ok" ]]; then + fail "VERIFICATION FAILED: cluster_state=${CLUSTER_STATE} (expected ok)" + exit 1 + fi + + # ── Step 9: Clean up stale replica importing flags ──────────── + # Valkey 9 bug (valkey-io/valkey#998): when a replica does a full sync + # during slot migration, the RDB snapshot carries the in-progress + # importing state. After the primary completes the migration and clears + # its importing flag, the replica still has the stale flag from the RDB. + # Gossip won't clear it (cleanup is primary-only). SETSLOT STABLE is + # rejected on replicas from external clients. + # + # Fix: run CLUSTER SETSLOT STABLE on the PRIMARY of the affected + # replica. The primary propagates it to the replica via the replication + # stream (forceCommandPropagation). This clears the stale flag. + info "Checking for stale importing/migrating flags on all nodes..." 
+ ALL_NODES_CHECK="$(vcli_local CLUSTER NODES 2>/dev/null || echo '')" + STALE_FIXED=0 + STALE_FAILED=0 + while IFS= read -r line; do + [[ -z "$line" ]] && continue + node_flags="$(echo "$line" | awk '{print $3}')" + if echo "$node_flags" | grep -q "fail\|handshake"; then continue; fi + slot_fields="$(echo "$line" | awk '{for(i=9;i<=NF;i++) printf "%s ", $i}')" + # Extract slot numbers from [slot-<-nodeid] or [slot->-nodeid] markers + stale_slots="$(echo "$slot_fields" | grep -oE '\[[0-9]+-[<>]-' | grep -oE '[0-9]+' || true)" + if [[ -z "$stale_slots" ]]; then continue; fi + + node_id="$(echo "$line" | awk '{print $1}')" + node_fqdn="$(echo "$line" | awk '{print $2}' | cut -d@ -f2 | cut -d, -f2 | cut -d. -f1)" + + if echo "$node_flags" | grep -q "slave"; then + # Replica: find its primary and issue SETSLOT STABLE there + primary_id="$(echo "$line" | awk '{print $4}')" + primary_line="$(echo "$ALL_NODES_CHECK" | grep "^${primary_id} ")" + if [[ -z "$primary_line" ]]; then + warn "Cannot find primary ${primary_id:0:8}... for replica ${node_fqdn}. Manual fix needed." + STALE_FAILED=$((STALE_FAILED + 1)) + continue + fi + primary_addr="$(echo "$primary_line" | awk '{print $2}' | cut -d@ -f1)" + primary_host="$(echo "$primary_addr" | rev | cut -d: -f2- | rev)" + primary_port="$(echo "$primary_addr" | rev | cut -d: -f1 | rev)" + for slot in $stale_slots; do + info " Clearing stale slot ${slot} on replica ${node_fqdn} via primary ${primary_host}:${primary_port}..." 
+ result="$(vcli "$primary_host" "$primary_port" CLUSTER SETSLOT "$slot" STABLE 2>&1 || echo 'ERROR')" + if [[ "$result" == *"OK"* ]]; then + STALE_FIXED=$((STALE_FIXED + 1)) + else + warn " Failed to clear slot ${slot}: ${result}" + STALE_FAILED=$((STALE_FAILED + 1)) + fi + done + else + # Primary: issue SETSLOT STABLE directly + node_addr="$(echo "$line" | awk '{print $2}' | cut -d@ -f1)" + node_host="$(echo "$node_addr" | rev | cut -d: -f2- | rev)" + node_port="$(echo "$node_addr" | rev | cut -d: -f1 | rev)" + for slot in $stale_slots; do + info " Clearing stale slot ${slot} on primary ${node_fqdn}..." + result="$(vcli "$node_host" "$node_port" CLUSTER SETSLOT "$slot" STABLE 2>&1 || echo 'ERROR')" + if [[ "$result" == *"OK"* ]]; then + STALE_FIXED=$((STALE_FIXED + 1)) + else + warn " Failed to clear slot ${slot}: ${result}" + STALE_FAILED=$((STALE_FAILED + 1)) + fi + done + fi + done <<< "$ALL_NODES_CHECK" + + if [[ $STALE_FIXED -gt 0 ]]; then + info "Cleared ${STALE_FIXED} stale slot flag(s) (valkey-io/valkey#998 workaround)." + sleep 2 # let replication propagate SETSLOT STABLE to replicas + fi + if [[ $STALE_FAILED -gt 0 ]]; then + warn "${STALE_FAILED} stale flag(s) could not be cleared. These may block KubeBlocks preTerminate." + warn "Manual fix: delete the affected replica pod (KubeBlocks will recreate it)." + fi + if [[ $STALE_FIXED -eq 0 && $STALE_FAILED -eq 0 ]]; then + info "No stale importing/migrating flags found." + fi + + # Show final topology + info "Final topology:" + FINAL_NODES="$(vcli_local CLUSTER NODES)" + while IFS= read -r line; do + [[ -z "$line" ]] && continue + node_id="$(echo "$line" | awk '{print $1}')" + node_flags="$(echo "$line" | awk '{print $3}')" + if echo "$node_flags" | grep -q "master" && ! echo "$node_flags" | grep -q "fail\|handshake"; then + node_slots="$(echo "$line" | awk '{for(i=9;i<=NF;i++) printf "%s ", $i}')" + info " ${node_id:0:8}... 
[${node_slots}]" + fi + done <<< "$FINAL_NODES" + info "ASM resharding completed successfully: ${CURRENT_SHARDS} -> ${TARGET_SHARDS} shards" + exit 0 + + # ── Action 3: scale-in topology change ─────────────────────────────── + # No-op when current_shards <= target. Otherwise patches Cluster.spec. + # shardings[0].shards down (slots have already been drained by + # asm-migrate) and waits for KubeBlocks to delete the extra pods. + - name: finalize-topology + failurePolicy: Fail + parameters: + - targetShards + - clusterName + workload: + type: Job + podInfoExtractorName: valkey-pod + backoffLimit: 0 + podSpec: + # KB ignores serviceAccountName here; it injects the component's + # kb-managed SA. The chart binds the asm-orchestrator Role to + # the namespace's SA group, which covers it. See chart's + # templates/asm-rbac.yaml for the rationale. + restartPolicy: Never + containers: + - name: kubectl + image: {{ .Values.image.registry | default "docker.io" }}/apecloud/kubectl:1.29 + imagePullPolicy: IfNotPresent + command: + - bash + - -c + - | + set -euo pipefail + CLUSTER="${clusterName}" + TARGET="${targetShards}" + NS="${CLUSTER_NAMESPACE}" + log() { echo "[finalize-topology] $(date +%H:%M:%S) $*"; } + + CURRENT=$(kubectl get cluster -n "$NS" "$CLUSTER" \ + -o jsonpath='{.spec.shardings[0].shards}') + log "current_shards=${CURRENT} target_shards=${TARGET}" + + if [[ "$TARGET" -ge "$CURRENT" ]]; then + log "Not scale-in; nothing to patch." + exit 0 + fi + + log "Scale-in: patching Cluster ${CLUSTER} shardings[0].shards=${TARGET}" + kubectl patch cluster -n "$NS" "$CLUSTER" --type=json \ + -p "[{\"op\":\"replace\",\"path\":\"/spec/shardings/0/shards\",\"value\":${TARGET}}]" + + REPLICAS=$(kubectl get cluster -n "$NS" "$CLUSTER" \ + -o jsonpath='{.spec.shardings[0].template.replicas}') + EXPECTED=$((TARGET * REPLICAS)) + log "Waiting for pod count to drop to ${EXPECTED}..." 
+
+           # Poll up to 10 minutes (600s, 10s interval) for the drained
+           # shard pods to terminate; progress is logged every 30s.
+           elapsed=0
+           while [[ $elapsed -lt 600 ]]; do
+             COUNT=$(kubectl get pods -n "$NS" \
+               -l "app.kubernetes.io/instance=${CLUSTER},apps.kubeblocks.io/sharding-name=shard" \
+               --no-headers 2>/dev/null | wc -l | tr -d ' ')
+             [[ "$COUNT" -le "$EXPECTED" ]] && break
+             [[ $((elapsed % 30)) -eq 0 ]] && log " count=${COUNT} target=${EXPECTED} (${elapsed}s)..."
+             sleep 10
+             elapsed=$((elapsed + 10))
+           done
+           # Still over the target after the full wait → hard failure.
+           if [[ "$COUNT" -gt "$EXPECTED" ]]; then
+             log "ERROR: pod count still ${COUNT} > ${EXPECTED} after 10m"
+             exit 1
+           fi
+
+           # Best-effort wait (5 minutes) for the Cluster to settle back
+           # to Running; a timeout here is only logged, not fatal.
+           log "Waiting for Cluster phase Running..."
+           elapsed=0
+           while [[ $elapsed -lt 300 ]]; do
+             PHASE=$(kubectl get cluster -n "$NS" "$CLUSTER" \
+               -o jsonpath='{.status.phase}' 2>/dev/null)
+             [[ "$PHASE" == "Running" ]] && break
+             sleep 5
+             elapsed=$((elapsed + 5))
+           done
+           log "Cluster phase=${PHASE}. Scale-in finalized."
+         # $(param) placeholders below are substituted by KubeBlocks from
+         # the action's declared parameters / the ops environment.
+         env:
+         - name: clusterName
+           value: $(clusterName)
+         - name: targetShards
+           value: $(targetShards)
+         - name: CLUSTER_NAMESPACE
+           value: $(CLUSTER_NAMESPACE)