From 6a6355a6c5dc502c3feef2d438ab3dd48d65068d Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Wed, 6 May 2026 13:30:38 +0100 Subject: [PATCH 01/31] fix: pass snapshot UUID instead of object to snapshot_controller.delete in delete_last_snapshot_if_needed --- simplyblock_core/services/snapshot_replication.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 2527853b0..2f5d01048 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -138,7 +138,7 @@ def delete_last_snapshot_if_needed(this_task, lvol): snaps = sorted(snaps, key=lambda x: x.created_at) snapshot = snaps[-1] logger.info("Deleting snapshot: %s", snapshot.get_id()) - ret = snapshot_controller.delete(snapshot) + ret = snapshot_controller.delete(snapshot.get_id()) logger.debug(ret) From c9e40ac365d3d7a09c4ab9dc515d34c288bcd85d Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Wed, 6 May 2026 20:41:21 +0100 Subject: [PATCH 02/31] added snaps to debug logs --- simplyblock_core/controllers/lvol_controller.py | 1 + 1 file changed, 1 insertion(+) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index ae1bf819c..a5557e53a 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2660,6 +2660,7 @@ def replicate_lvol_on_source_cluster(lvol_id, cluster_id=None, pool_uuid=None): snaps.append(snap) if snaps: + logger.debug(snaps) snaps = sorted(snaps, key=lambda x: x.created_at) snapshot = snaps[-1] snapshot = db_controller.get_snapshot_by_id(snapshot.target_replicated_snap_uuid) From 527a36c5bdd634c5e4f5aaba2d99a401d3cb9b03 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Thu, 7 May 2026 15:43:28 +0100 Subject: [PATCH 03/31] started spdk container with sudo --- simplyblock_web/templates/storage_deploy_spdk.yaml.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 index 6f6e58979..f2a4bc39b 100644 --- a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 +++ b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 @@ -80,7 +80,7 @@ spec: if [ ! -e /dev/fd ]; then ln -s /proc/self/fd /dev/fd fi - /root/scripts/run_distr_with_ssd.sh "{{ L_CORES }}" "{{ SPDK_MEM }}" + sudo -E /root/scripts/run_distr_with_ssd.sh "{{ L_CORES }}" "{{ SPDK_MEM }}" env: - name: SSD_PCIE value: "{{ SSD_PCIE }}" @@ -97,7 +97,7 @@ spec: lifecycle: postStart: exec: - command: ["/bin/sh", "-c", "sudo modprobe nbd || echo failed to modprobe nbd"] + command: ["/bin/sh", "-c", "sudo -E modprobe nbd || echo failed to modprobe nbd"] securityContext: privileged: true volumeMounts: From 7a94eb271ebcda33af351f53812691af7ef5be5b Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 7 May 2026 18:54:27 +0300 Subject: [PATCH 04/31] fix: add error handling for fetching storage node and cluster in lvol_event --- simplyblock_core/controllers/lvol_events.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/simplyblock_core/controllers/lvol_events.py b/simplyblock_core/controllers/lvol_events.py index 84bfe0b1e..af7119702 100644 --- a/simplyblock_core/controllers/lvol_events.py +++ b/simplyblock_core/controllers/lvol_events.py @@ -10,8 +10,14 @@ def _lvol_event(lvol, message, caused_by, event): db_controller = DBController() - snode = db_controller.get_storage_node_by_id(lvol.node_id) - cluster = db_controller.get_cluster_by_id(snode.cluster_id) + try: + snode = db_controller.get_storage_node_by_id(lvol.node_id) + cluster = db_controller.get_cluster_by_id(snode.cluster_id) + except Exception as e: + logger.error(e) + logger.error(f"Error fetching related objects for lvol event: {message}") + return + ec.log_event_cluster( cluster_id=snode.cluster_id, domain=ec.DOMAIN_CLUSTER, From 4b51172811e45188e049052f3d36907ec2428470 Mon Sep 17 00:00:00 2001 From: Max Schettler Date: Wed, 29 Apr 2026 15:37:24 +0200 Subject: [PATCH 05/31] Enable mTLS --- simplyblock_core/models/storage_node.py | 4 +- simplyblock_core/rpc_client.py | 8 +- .../services/spdk_http_proxy_server.py | 6 +- simplyblock_core/settings.py | 97 +++++++++++++++++-- simplyblock_core/snode_client.py | 8 +- .../api/internal/storage_node/kubernetes.py | 6 +- simplyblock_web/app.py | 6 +- simplyblock_web/node_webapp.py | 3 +- .../templates/storage_deploy_spdk.yaml.j2 | 17 +++- .../templates/storage_init_job.yaml.j2 | 22 ++++- tests/test_storage_deploy_template.py | 59 +++++++++++ 11 files changed, 207 insertions(+), 29 deletions(-) create mode 100644 tests/test_storage_deploy_template.py diff --git a/simplyblock_core/models/storage_node.py b/simplyblock_core/models/storage_node.py index 83ae1c1a2..331e21873 100644 --- a/simplyblock_core/models/storage_node.py +++ b/simplyblock_core/models/storage_node.py @@ -150,7 +150,7 @@ def client(self, **kwargs): """Return API client to this node """ host = self.api_endpoint - if Settings().tls_enabled: + if Settings().tls_connect != "disabled": port = self.api_endpoint.rsplit(":", 1)[1] host = f"{self._k8s_node_label()}.simplyblock-storage-node-api.{self.cr_namespace}.svc.cluster.local:{port}" return SNodeClient(host, **kwargs) @@ -159,7 +159,7 @@ def rpc_client(self, **kwargs): """Return rpc client to this node """ host = self.mgmt_ip - if Settings().tls_enabled: + if Settings().tls_connect != "disabled": host = f"{self._k8s_node_label()}.simplyblock-spdk-proxy.{self.cr_namespace}.svc.cluster.local" return RPCClient( host, self.rpc_port, diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py index 5dfd84062..0c4c771bc 100755 --- a/simplyblock_core/rpc_client.py +++ b/simplyblock_core/rpc_client.py @@ -99,19 +99,21 @@ def __init__(self, host, port, username, password, timeout=180, retry=3): self.host = host self.port = port settings = Settings() - scheme = "https" if settings.tls_enabled else "http" + scheme = "https" if settings.tls_connect != "disabled" else "http" self.url = '%s://%s:%s/' % (scheme, self.host, self.port) self.username = username self.password = password self.timeout = timeout self.session = requests.session() - if settings.tls_enabled: - self.session.verify = str(settings.certificate_authority) + if settings.tls_connect != "disabled": + self.session.verify = str(settings.tls_certificate_authority) self.session.auth = (self.username, self.password) retries = Retry(total=retry, backoff_factor=1, connect=retry, read=retry, allowed_methods=self.DEFAULT_ALLOWED_METHODS) self.session.mount("http://", HTTPAdapter(max_retries=retries)) self.session.mount("https://", HTTPAdapter(max_retries=retries)) + if settings.tls_connect == "authenticated": + self.session.cert = (str(settings.tls_certificate), str(settings.tls_key)) def _request_cached(self, method, params=None, cache_ttl=RPC_CACHE_TTL_SEC): """Like _request but returns a cached result if one exists within cache_ttl seconds.""" diff --git a/simplyblock_core/services/spdk_http_proxy_server.py b/simplyblock_core/services/spdk_http_proxy_server.py index 5e71742eb..11c632a67 100644 --- a/simplyblock_core/services/spdk_http_proxy_server.py +++ b/simplyblock_core/services/spdk_http_proxy_server.py @@ -5,7 +5,6 @@ import logging import os import socket -import ssl import sys import threading import time @@ -258,9 +257,8 @@ def run_server(host, port, user, password, is_threading_enabled=False): ServerHandler.key = key httpd = (ThreadingHTTPServer if is_threading_enabled else HTTPServer)((host, port), ServerHandler) settings = Settings() - if settings.tls_enabled: - context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) - context.load_cert_chain(settings.tls_certificate, settings.tls_key) + context = settings.make_server_ssl_context() + if context is not None: httpd.socket = context.wrap_socket(httpd.socket, server_side=True) httpd.timeout = TIMEOUT logger.info('Started RPC http proxy server') diff --git a/simplyblock_core/settings.py b/simplyblock_core/settings.py index eb2931ce0..f647f685c 100644 --- a/simplyblock_core/settings.py +++ b/simplyblock_core/settings.py @@ -1,19 +1,98 @@ +import ssl from pathlib import Path +from typing import Annotated, Any, Literal, Optional +from pydantic import BeforeValidator, Field, PlainSerializer, model_validator from pydantic_settings import BaseSettings, SettingsConfigDict +_VERIFY_MODE_TO_STR = { + ssl.CERT_NONE: "disabled", + ssl.CERT_OPTIONAL: "optional", + ssl.CERT_REQUIRED: "required", +} +_STR_TO_VERIFY_MODE = {v: k for k, v in _VERIFY_MODE_TO_STR.items()} + + +def _parse_verify_mode(x: Any) -> ssl.VerifyMode: + if isinstance(x, str): + if x not in _STR_TO_VERIFY_MODE: + raise ValueError("Invalid input") + return _STR_TO_VERIFY_MODE[x] + + return x + + class Settings(BaseSettings): model_config = SettingsConfigDict(env_prefix="sb_", case_sensitive=False) + tls_serve: Annotated[ + bool, + Field( + description="Run servers in TLS mode. Requires certificate and key to be present." + ), + ] = False + tls_client_auth: Annotated[ + ssl.VerifyMode, + Field( + description="Client authentication mode. May be 'disabled', 'optional', or 'required'" + ), + BeforeValidator(_parse_verify_mode), + PlainSerializer(_VERIFY_MODE_TO_STR.__getitem__, return_type=str), + ] = ssl.CERT_NONE + tls_connect: Annotated[ + Literal["disabled", "anonymous", "authenticated"], + Field(description="Connect to internal services via TLS."), + ] = "disabled" + tls_provider: Annotated[ + Optional[Literal["openshift", "cert-manager"]], + Field(description="Provider for TLS certificates in the cluster."), + ] = None tls_certificate: Path = Path("/etc/simplyblock/tls/tls.crt") tls_key: Path = Path("/etc/simplyblock/tls/tls.key") - certificate_authority: Path = Path("/etc/simplyblock/tls/ca.crt") - - @property - def tls_enabled(self) -> bool: - return all([ - self.tls_certificate.is_file(), - self.tls_key.is_file(), - self.certificate_authority.is_file(), - ]) + tls_certificate_authority: Path = Path("/etc/simplyblock/tls/ca.crt") + + @model_validator(mode="after") + def validate_tls_files(self): + if not self.tls_serve and self.tls_connect == "disabled": + return self + + if self.tls_serve and ( + missing := [ + name + for name in ["tls_certificate", "tls_key"] + if not getattr(self, name).is_file() + ] + ): + raise ValueError( + "SB_TLS_SERVE=true requires TLS files to exist: " + ", ".join(missing) + ) + + if ( + self.tls_connect != "disabled" + and not self.tls_certificate_authority.is_file() + ): + raise ValueError( + "SB_TLS_CONNECT != 'disabled' requires certificate authority to exist" + ) + + return self + + @model_validator(mode="after") + def validate_tls_provider(self): + if self.tls_connect != "disabled" and self.tls_provider is None: + raise ValueError( + "TLS provider needs to be configured for TLS connections to be used" + ) + return self + + def make_server_ssl_context(self): + """Return an SSLContext requiring client certificates, or None if TLS is not configured.""" + if self.tls_client_auth == "disabled": + return None + + ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) + ctx.load_cert_chain(self.tls_certificate, self.tls_key) + ctx.verify_mode = self.tls_client_auth + ctx.load_verify_locations(self.tls_certificate_authority) + return ctx diff --git a/simplyblock_core/snode_client.py b/simplyblock_core/snode_client.py index 6bbd5cb2b..2e4a7a220 100644 --- a/simplyblock_core/snode_client.py +++ b/simplyblock_core/snode_client.py @@ -20,16 +20,18 @@ class SNodeClient: def __init__(self, host, timeout=300, retry=5): settings = Settings() - scheme = "https" if settings.tls_enabled else "http" + scheme = "https" if settings.tls_connect != "disabled" else "http" self.url = f'{scheme}://{host}/snode/' self.timeout = timeout self.session = requests.session() - if settings.tls_enabled: - self.session.verify = str(settings.certificate_authority) + if settings.tls_connect != "disabled": + self.session.verify = str(settings.tls_certificate_authority) self.session.headers['Content-Type'] = "application/json" retries = Retry(total=retry, backoff_factor=1, connect=retry, read=retry) self.session.mount("http://", HTTPAdapter(max_retries=retries)) self.session.mount("https://", HTTPAdapter(max_retries=retries)) + if settings.tls_connect == "authenticated": + self.session.cert = (str(settings.tls_certificate), str(settings.tls_key)) def _request(self, method, path, payload=None): try: diff --git a/simplyblock_web/api/internal/storage_node/kubernetes.py b/simplyblock_web/api/internal/storage_node/kubernetes.py index 8194db391..671a8f03f 100644 --- a/simplyblock_web/api/internal/storage_node/kubernetes.py +++ b/simplyblock_web/api/internal/storage_node/kubernetes.py @@ -281,6 +281,7 @@ class SPDKParams(BaseModel): })}}}, }) def spdk_process_start(body: SPDKParams): + settings = Settings() ssd_pcie_params = " ".join(" -A " + addr for addr in body.ssd_pcie) if body.ssd_pcie else "none" ssd_pcie_list = " ".join(body.ssd_pcie) @@ -373,7 +374,10 @@ def spdk_process_start(body: SPDKParams): 'FW_PORT': body.firewall_port, 'CPU_TOPOLOGY_ENABLED': cpu_topology_enabled, 'RESERVED_SYSTEM_CPUS': reserved_system_cpus, - 'TLS_ENABLED': Settings().tls_enabled, + 'TLS_SERVE': settings.tls_serve, + 'TLS_CONNECT': settings.tls_connect, + 'TLS_CLIENT_AUTH': settings.model_dump()["tls_client_auth"], + 'TLS_PROVIDER': settings.tls_provider, } if ubuntu_host: diff --git a/simplyblock_web/app.py b/simplyblock_web/app.py index fe5b6c377..0080aa890 100644 --- a/simplyblock_web/app.py +++ b/simplyblock_web/app.py @@ -93,8 +93,10 @@ def main() -> None: access_log=False, proxy_headers=True, forwarded_allow_ips='192.168.1.0/24', - ssl_certfile=settings.tls_certificate if settings.tls_enabled else None, - ssl_keyfile=settings.tls_key if settings.tls_enabled else None, + ssl_certfile=settings.tls_certificate if settings.tls_serve else None, + ssl_keyfile=settings.tls_key if settings.tls_serve else None, + ssl_ca_certs=settings.tls_certificate_authority if settings.tls_client_auth != "disabled" else None, + ssl_cert_reqs=settings.tls_client_auth, ) server: uvicorn.Server = uvicorn.Server(config) server.run() diff --git a/simplyblock_web/node_webapp.py b/simplyblock_web/node_webapp.py index 772ed9f7d..d8faef9cf 100644 --- a/simplyblock_web/node_webapp.py +++ b/simplyblock_web/node_webapp.py @@ -44,5 +44,4 @@ def status(): app.register_api(internal_api.storage_node.kubernetes.api) settings = Settings() - ssl_ctx = (settings.tls_certificate, settings.tls_key) if settings.tls_enabled else None - app.run(host='0.0.0.0', debug=constants.LOG_WEB_DEBUG, ssl_context=ssl_ctx) + app.run(host='0.0.0.0', debug=constants.LOG_WEB_DEBUG, ssl_context=settings.make_server_ssl_context()) diff --git a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 index f2a4bc39b..ff0e34d47 100644 --- a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 +++ b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 @@ -56,8 +56,12 @@ spec: - name: dockercontainerlogdirectory hostPath: path: /var/log/pods - {% if TLS_ENABLED %} + {% if TLS_SERVE or TLS_CONNECT != "disabled" %} - name: tls + {% if TLS_PROVIDER == "cert-manager" %} + secret: + secretName: simplyblock-spdk-proxy-tls + {% else %} projected: sources: - secret: @@ -68,6 +72,7 @@ spec: - key: service-ca.crt path: ca.crt {% endif %} + {% endif %} containers: - name: spdk-container @@ -137,7 +142,7 @@ spec: volumeMounts: - name: socket-dir mountPath: /mnt/ramdisk - {% if TLS_ENABLED %} + {% if TLS_SERVE or TLS_CONNECT != "disabled" %} - name: tls mountPath: /etc/simplyblock/tls readOnly: true @@ -155,6 +160,14 @@ spec: value: "True" - name: TIMEOUT value: "300" + - name: SB_TLS_SERVE + value: "{{ TLS_SERVE }}" + - name: SB_TLS_CONNECT + value: "{{ TLS_CONNECT }}" + - name: SB_TLS_CLIENT_AUTH + value: "{{ TLS_CLIENT_AUTH }}" + - name: SB_TLS_PROVIDER + value: "{{ TLS_PROVIDER }}" {% if CPU_TOPOLOGY_ENABLED %} resources: limits: diff --git a/simplyblock_web/templates/storage_init_job.yaml.j2 b/simplyblock_web/templates/storage_init_job.yaml.j2 index d66f24f71..8d37f5cd8 100644 --- a/simplyblock_web/templates/storage_init_job.yaml.j2 +++ b/simplyblock_web/templates/storage_init_job.yaml.j2 @@ -21,6 +21,11 @@ spec: - name: host-proc hostPath: path: /proc + {% if TLS_CONNECT == "authenticated" %} + - name: tls + secret: + secretName: simplyblock-spdk-proxy-tls + {% endif %} containers: - name: init-setup image: simplyblock/alpine-tools:3.21.3 @@ -29,6 +34,11 @@ spec: volumeMounts: - name: host-proc mountPath: /proc + {% if TLS_CONNECT == "authenticated" %} + - name: tls + mountPath: /etc/simplyblock/tls + readOnly: true + {% endif %} command: ["/bin/sh", "-c"] args: - | @@ -47,7 +57,17 @@ spec: echo "--- Sending config to $NODE_IP ---" if [ "$OS_ID" != "talos" ]; then - RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -X POST {% if TLS_ENABLED %}-k https{% else %}http{% endif %}://$NODE_IP:5000/snode/apply_config -H "Content-Type: application/json" -d '{}') + {% if TLS_CONNECT == "disabled" %} + {% set SCHEME = "http" %} + {% set CURL_TLS_FLAGS = "" %} + {% elif TLS_CONNECT == "anonymous" %} + {% set SCHEME = "https" %} + {% set CURL_TLS_FLAGS = "-k" %} + {% else %} + {% set SCHEME = "https" %} + {% set CURL_TLS_FLAGS = "-k --cert /etc/simplyblock/tls/tls.crt --key /etc/simplyblock/tls/tls.key" %} + {% endif %} + RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -X POST {{ CURL_TLS_FLAGS }} {{ SCHEME }}://$NODE_IP:5000/snode/apply_config -H "Content-Type: application/json" -d '{}') echo "HTTP status: $RESPONSE" if [ "$RESPONSE" -lt 200 ] || [ "$RESPONSE" -ge 300 ]; then echo "Failed to apply config" diff --git a/tests/test_storage_deploy_template.py b/tests/test_storage_deploy_template.py new file mode 100644 index 000000000..b8c501543 --- /dev/null +++ b/tests/test_storage_deploy_template.py @@ -0,0 +1,59 @@ +import unittest +from pathlib import Path + +from jinja2 import Environment, FileSystemLoader + + +TEMPLATE_DIR = Path(__file__).resolve().parents[1] / "simplyblock_web" / "templates" + + +def _render_storage_deploy(tls_provider: str) -> str: + env = Environment( + loader=FileSystemLoader(str(TEMPLATE_DIR)), + trim_blocks=True, + lstrip_blocks=True, + ) + template = env.get_template("storage_deploy_spdk.yaml.j2") + return template.render( + SPDK_IMAGE="spdk:test", + L_CORES="0-1", + SPDK_MEM=1024, + CORES=2, + SERVER_IP="10.0.0.10", + RPC_PORT=8080, + RPC_USERNAME="admin", + RPC_PASSWORD="secret", + HOSTNAME="node-a", + NAMESPACE="simplyblock", + SIMPLYBLOCK_DOCKER_IMAGE="proxy:test", + GRAYLOG_SERVER_IP="10.0.0.20", + MODE="kubernetes", + CLUSTER_ID="cluster1", + SSD_PCIE="none", + PCI_ALLOWED="", + TOTAL_HP="", + NSOCKET=0, + FW_PORT=50001, + CPU_TOPOLOGY_ENABLED=False, + MEM_MEGA=1536, + MEM2_MEGA=1024, + TLS_ENABLED=True, + TLS_PROVIDER=tls_provider, + ) + + +class TestStorageDeployTemplate(unittest.TestCase): + + def test_openshift_uses_service_ca_key(self): + rendered = _render_storage_deploy("openshift") + self.assertIn("key: service-ca.crt", rendered) + self.assertIn('name: SB_TLS_PROVIDER', rendered) + self.assertIn('value: "openshift"', rendered) + + def test_cert_manager_mounts_secret_directly(self): + rendered = _render_storage_deploy("cert-manager") + self.assertIn("secretName: simplyblock-spdk-proxy-tls", rendered) + self.assertNotIn("simplyblock-certificate-authority", rendered) + self.assertNotIn("projected:", rendered) + self.assertIn('name: SB_TLS_PROVIDER', rendered) + self.assertIn('value: "cert-manager"', rendered) From 3e90082d5e2dbc4fc6be43d842fe1a8c13b7d6ab Mon Sep 17 00:00:00 2001 From: Max Schettler Date: Thu, 7 May 2026 17:42:02 +0200 Subject: [PATCH 06/31] Fix flask debug flag Passing the log-level to the debug flag is invalid, this likely simply enables debug mode.Running this in debug mode is a security risk. This can be overridden using the FLASK_DEBUG environment variable. --- simplyblock_web/node_webapp.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/simplyblock_web/node_webapp.py b/simplyblock_web/node_webapp.py index d8faef9cf..e9bec8f76 100644 --- a/simplyblock_web/node_webapp.py +++ b/simplyblock_web/node_webapp.py @@ -5,7 +5,6 @@ from flask_openapi3 import OpenAPI -from simplyblock_core import constants from simplyblock_core.settings import Settings from simplyblock_web import utils from simplyblock_web.api import internal as internal_api @@ -44,4 +43,4 @@ def status(): app.register_api(internal_api.storage_node.kubernetes.api) settings = Settings() - app.run(host='0.0.0.0', debug=constants.LOG_WEB_DEBUG, ssl_context=settings.make_server_ssl_context()) + app.run(host='0.0.0.0', debug=False, ssl_context=settings.make_server_ssl_context()) From 1506c4cfc85e61008d2b9ccebfefa6d291d17bc4 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Thu, 7 May 2026 19:04:38 +0100 Subject: [PATCH 07/31] added more debug logs --- simplyblock_core/controllers/lvol_controller.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index bf5a1a604..05046c363 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2658,7 +2658,6 @@ def replicate_lvol_on_source_cluster(lvol_id, cluster_id=None, pool_uuid=None): snaps.append(snap) if snaps: - logger.debug(snaps) snaps = sorted(snaps, key=lambda x: x.created_at) snapshot = snaps[-1] snapshot = db_controller.get_snapshot_by_id(snapshot.target_replicated_snap_uuid) @@ -2712,6 +2711,9 @@ def replicate_lvol_on_source_cluster(lvol_id, cluster_id=None, pool_uuid=None): logger.debug(f"new lvol from_source: {new_lvol.from_source}") + logger.debug(f"new_lvol: {new_lvol}") + logger.debug(f"source_node: {source_node}") + lvol_bdev, error = add_lvol_on_node(new_lvol, source_node) if error: logger.error(error) From 342e7c9cf0c6eda0a4d02624fc6e4a63ed130f51 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Fri, 8 May 2026 09:24:24 +0100 Subject: [PATCH 08/31] fix: compare tls_client_auth against ssl.CERT_NONE instead of string disabled to correctly skip SSL context when TLS is disabled --- simplyblock_core/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/settings.py b/simplyblock_core/settings.py index f647f685c..6d6a13945 100644 --- a/simplyblock_core/settings.py +++ b/simplyblock_core/settings.py @@ -88,7 +88,7 @@ def validate_tls_provider(self): def make_server_ssl_context(self): """Return an SSLContext requiring client certificates, or None if TLS is not configured.""" - if self.tls_client_auth == "disabled": + if not self.tls_serve and self.tls_client_auth == ssl.CERT_NONE: return None ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) From 13e385d7c9a4c4ebb31d7d3f96440b06614560e6 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Fri, 8 May 2026 11:42:19 +0100 Subject: [PATCH 09/31] fix: use snapshot's owning node for bdev_lvol_clone in replicate_lvol_on_source_cluster and fix lvol_events crash on missing node --- simplyblock_core/controllers/lvol_controller.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 34bf27a0a..f8bc1cbd2 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2654,6 +2654,17 @@ def replicate_lvol_on_source_cluster(lvol_id, cluster_id=None, pool_uuid=None): logger.error(f"Snapshot for replication not found for lvol: {lvol_id}") return False + # bdev_lvol_clone must run on the same SPDK that owns the snapshot's LVS + try: + snap_node = db_controller.get_storage_node_by_id(snapshot.lvol.node_id) + if snap_node.status == StorageNode.STATUS_ONLINE: + source_node = snap_node + else: + logger.error(f"Snapshot node {snapshot.lvol.node_id} is not online") + return False + except KeyError: + logger.warning(f"Could not find snapshot node {snapshot.lvol.node_id}, using current source_node") + # create lvol on target node new_lvol = copy.deepcopy(lvol) new_lvol.cloned_from_snap = snapshot.get_id() @@ -2726,7 +2737,7 @@ def replicate_lvol_on_source_cluster(lvol_id, cluster_id=None, pool_uuid=None): new_lvol.status = LVol.STATUS_ONLINE new_lvol.from_source = True new_lvol.write_to_db(db_controller.kv_store) - lvol_events.lvol_replicated(lvol, new_lvol) + lvol_events.lvol_replicated(new_lvol, new_lvol) logger.debug(f"new lvol from_source: {new_lvol.from_source}") return new_lvol.lvol_uuid From b6e2b79916ad20a94ef1f3f4b55194f8a4736d90 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Fri, 8 May 2026 12:38:40 +0100 Subject: [PATCH 10/31] fix snapshot replication crash loop: handle NVMe attach failures with graceful retry instead of RuntimeError --- .../services/snapshot_replication.py | 29 ++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 2f5d01048..49dc831d4 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -80,12 +80,20 @@ def process_snap_replicate_start(task, snapshot): if not ret: msg = "controller attach failed" logger.error(msg) - raise RuntimeError(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.retry += 1 + task.write_to_db() + return bdev_name = ret[0] if not bdev_name: msg = "Bdev name not returned from controller attach" logger.error(msg) - raise RuntimeError(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.retry += 1 + task.write_to_db() + return bdev_found = False for i in range(5): ret = snode.rpc_client().get_bdevs(bdev_name) @@ -96,8 +104,13 @@ def process_snap_replicate_start(task, snapshot): time.sleep(1) if not bdev_found: + msg = f"Failed to connect to lvol: {remote_lv.get_id()}" logger.error("lvol Bdev not found after 5 attempts") - raise RuntimeError(f"Failed to connect to lvol: {remote_lv.get_id()}") + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.retry += 1 + task.write_to_db() + return offset = 0 if "offset" in task.function_params and task.function_params["offset"]: @@ -360,7 +373,15 @@ def task_runner(task: JobSchedule): if task.status != JobSchedule.STATUS_DONE: # get new task object because it could be changed from cancel task task = db.get_task_by_id(task.uuid) - res = task_runner(task) + try: + res = task_runner(task) + except Exception as e: + logger.exception("task_runner raised for task %s: %s", task.uuid, e) + task.function_result = str(e) + task.status = JobSchedule.STATUS_SUSPENDED + task.retry += 1 + task.write_to_db(db.kv_store) + res = False if not res: time.sleep(3) From c703081c7bb477a1b7b4c82721ccca0ae7ab87b1 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Fri, 8 May 2026 15:41:30 +0100 Subject: [PATCH 11/31] fix: return HTTP 400 on replication endpoint failures instead of silent false with 200 --- simplyblock_web/api/v2/volume.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/simplyblock_web/api/v2/volume.py b/simplyblock_web/api/v2/volume.py index 10bc42cc7..76aa9c4b8 100644 --- a/simplyblock_web/api/v2/volume.py +++ b/simplyblock_web/api/v2/volume.py @@ -223,29 +223,26 @@ def inflate(cluster: Cluster, pool: StoragePool, volume: Volume) -> Response: @instance_api.post('/replication_trigger', name='clusters:storage-pools:volumes:replication_start', status_code=204, responses={204: {"content": None}}) def replication_trigger(cluster: Cluster, pool: StoragePool, volume: Volume) -> Response: if not lvol_controller.replication_trigger(volume.get_id()): - raise ValueError('Failed to start volume snapshot replication') - + raise HTTPException(400, 'Failed to trigger volume snapshot replication') return Response(status_code=204) @instance_api.post('/replication_start', name='clusters:storage-pools:volumes:replication_start', status_code=204, responses={204: {"content": None}}) def replication_start(cluster: Cluster, pool: StoragePool, volume: Volume) -> Response: if not lvol_controller.replication_start(volume.get_id(), cluster.get_id()): - raise ValueError('Failed to start volume snapshot replication') - + raise HTTPException(400, 'Failed to start volume snapshot replication') return Response(status_code=204) @instance_api.post('/replication_stop', name='clusters:storage-pools:volumes:replication_stop', status_code=204, responses={204: {"content": None}}) def replication_stop(cluster: Cluster, pool: StoragePool, volume: Volume) -> Response: if not lvol_controller.replication_stop(volume.get_id()): - raise ValueError('Failed to stop volume snapshot replication') - + raise HTTPException(400, 'Failed to stop volume snapshot replication') return Response(status_code=204) @instance_api.get('/connect', name='clusters:storage-pools:volumes:connect') def connect(cluster: Cluster, pool: StoragePool, volume: Volume): details, err = lvol_controller.connect_lvol(volume.get_id()) if err: - raise ValueError(err) + raise HTTPException(400, err) return details @@ -306,7 +303,10 @@ def create_snapshot( @instance_api.post('/replicate_lvol', name='clusters:storage-pools:volumes:replicate_lvol') def replicate_lvol_on_target_cluster(cluster: Cluster, pool: StoragePool, volume: Volume): - return lvol_controller.replicate_lvol_on_target_cluster(volume.get_id()) + result = lvol_controller.replicate_lvol_on_target_cluster(volume.get_id()) + if not result: + raise HTTPException(400, 'Failed to replicate lvol on target cluster — no replicated snapshot found') + return result class ReplicateLVolParams(BaseModel): @@ -315,7 +315,10 @@ class ReplicateLVolParams(BaseModel): @api.post('/replicate_lvol_on_source_cluster', name='clusters:storage-pools:replicate_lvol_on_source_cluster') def replicate_lvol_on_source_cluster(cluster: Cluster, pool: StoragePool, body: ReplicateLVolParams): - return lvol_controller.replicate_lvol_on_source_cluster(body.lvol_id, cluster.get_id(), pool.get_id()) + result = lvol_controller.replicate_lvol_on_source_cluster(body.lvol_id, cluster.get_id(), pool.get_id()) + if not result: + raise HTTPException(400, 'Failed to replicate lvol on source cluster') + return result @instance_api.get('/list_replication_tasks', name='clusters:storage-pools:volumes:list_replication_tasks') From 1a430729f8af85de0a0bbf07d9c94dee4d9978c1 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Mon, 11 May 2026 11:45:48 +0100 Subject: [PATCH 12/31] fix: match replicated lvol on target cluster by NQN lvol UUID suffix instead of full NQN --- .../controllers/lvol_controller.py | 85 ++++++++----------- 1 file changed, 37 insertions(+), 48 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index f8bc1cbd2..54fcc0a0c 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -347,7 +347,7 @@ def validate_aes_xts_keys(key1: str, key2: str) -> Tuple[bool, str]: def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp=False, use_crypto=False, distr_vuid=0, max_rw_iops=0, max_rw_mbytes=0, max_r_mbytes=0, max_w_mbytes=0, with_snapshot=False, max_size=0, crypto_key1=None, crypto_key2=None, lvol_priority_class=0, - uid=None, pvc_name=None, namespace=None, max_namespace_per_subsys=1, fabric="tcp", ndcs=0, npcs=0, + uid=None, pvc_name=None, namespaced=None, max_namespace_per_subsys=1, fabric="tcp", ndcs=0, npcs=0, allowed_hosts=None, do_replicate=False, replication_cluster_id=None): db_controller = DBController() @@ -365,25 +365,6 @@ def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp= if host_node.lvol_sync_del(): logger.info(f"LVol sync delete task on node: {host_node.get_id()}, proceeding anyway") - if namespace: - try: - master_lvol = db_controller.get_lvol_by_id(namespace) - except KeyError as e: - logger.error(e) - return False - - host_node = db_controller.get_storage_node_by_id(master_lvol.node_id) - - lvols_count = 0 - for lv in db_controller.get_lvols(host_node.cluster_id): - if lv.namespace == namespace: - lvols_count += 1 - - if lvols_count >= master_lvol.max_namespace_per_subsys: - msg = f"Max namespaces reached: {lvols_count}" - logger.error(msg) - return False, msg - pool = None for p in db_controller.get_pools(): if pool_id_or_name == p.get_id() or pool_id_or_name == p.pool_name: @@ -534,20 +515,23 @@ def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp= lvol.lvol_priority_class = 0 lvol.fabric = fabric - if namespace: - master_lvol = db_controller.get_lvol_by_id(namespace) - lvol.nqn = master_lvol.nqn - lvol.namespace = namespace or "" - else: - lvol.nqn = cl.nqn + ":lvol:" + lvol.uuid - if not host_node: nodes = _get_next_3_nodes(cl.get_id(), lvol.size) if not nodes: return False, "No nodes found with enough resources to create the LVol" host_node = nodes[0] - lvol.max_namespace_per_subsys = host_node.max_lvol + # Create a new subsystem by default unless namespaced is set + lvol.nqn = cl.nqn + ":lvol:" + lvol.uuid + lvol.max_namespace_per_subsys = max_namespace_per_subsys + namespace = None + + if namespaced: + result = get_next_available_subsystem_on_node(host_node.get_id()) + if result: + namespace, free_nqn = result + lvol.nqn = free_nqn + lvol.namespace = namespace s_node = db_controller.get_storage_node_by_id(host_node.secondary_node_id) attr_name = f"active_{fabric}" @@ -1665,7 +1649,7 @@ def connect_lvol(uuid, ctrl_loss_tmo=constants.LVOL_NVME_CONNECT_CTRL_LOSS_TMO, if cluster.status == Cluster.STATUS_SUSPENDED and cluster.snapshot_replication_target_cluster: logger.error("Cluster is suspended, looking for replicated lvol") for lv in db_controller.get_lvols(cluster.snapshot_replication_target_cluster): - if lv.nqn == lvol.nqn: + if lv.nqn.split(":lvol:")[-1] == lvol.nqn.split(":lvol:")[-1]: logger.info(f"LVol with same nqn already exists on target cluster: {lv.get_id()}") lvol = lv break @@ -2301,9 +2285,9 @@ def clone_lvol(lvol_id, clone_name, new_size=None, pvc_name=None): return False, str(e) host_node = db_controller.get_storage_node_by_id(lvol.node_id) - lvol_count = len(db_controller.get_lvols_by_node_id(lvol.node_id)) - if lvol_count >= host_node.max_lvol: - error = f"Too many lvols on node: {host_node.get_id()}, max lvols reached: {lvol_count}" + subsys_count = len(set(lv.nqn for lv in db_controller.get_lvols_by_node_id(lvol.node_id))) + if subsys_count >= host_node.max_lvol: + error = f"Too many subsystems on node: {host_node.get_id()}, max subsystems reached: {subsys_count}" logger.error(error) return False, error @@ -2320,7 +2304,7 @@ def clone_lvol(lvol_id, clone_name, new_size=None, pvc_name=None): logger.error(err) return False, str(err) new_lvol_uuid, err = snapshot_controller.clone( - snapshot_uuid, clone_name, new_size, pvc_name, delete_snap_on_lvol_delete=True, lock=False) + snapshot_uuid, clone_name, new_size, pvc_name, delete_snap_on_lvol_delete=True, lock=False, namespaced=True) if err: logger.error(err) if snapshot_uuid: @@ -2654,17 +2638,6 @@ def replicate_lvol_on_source_cluster(lvol_id, cluster_id=None, pool_uuid=None): logger.error(f"Snapshot for replication not found for lvol: {lvol_id}") return False - # bdev_lvol_clone must run on the same SPDK that owns the snapshot's LVS - try: - snap_node = db_controller.get_storage_node_by_id(snapshot.lvol.node_id) - if snap_node.status == StorageNode.STATUS_ONLINE: - source_node = snap_node - else: - logger.error(f"Snapshot node {snapshot.lvol.node_id} is not online") - return False - except KeyError: - logger.warning(f"Could not find snapshot node {snapshot.lvol.node_id}, using current source_node") - # create lvol on target node new_lvol = copy.deepcopy(lvol) new_lvol.cloned_from_snap = snapshot.get_id() @@ -2710,9 +2683,6 @@ def replicate_lvol_on_source_cluster(lvol_id, cluster_id=None, pool_uuid=None): logger.debug(f"new lvol from_source: {new_lvol.from_source}") - logger.debug(f"new_lvol: {new_lvol}") - logger.debug(f"source_node: {source_node}") - lvol_bdev, error = add_lvol_on_node(new_lvol, source_node) if error: logger.error(error) @@ -2737,7 +2707,7 @@ def replicate_lvol_on_source_cluster(lvol_id, cluster_id=None, pool_uuid=None): new_lvol.status = LVol.STATUS_ONLINE new_lvol.from_source = True new_lvol.write_to_db(db_controller.kv_store) - lvol_events.lvol_replicated(new_lvol, new_lvol) + lvol_events.lvol_replicated(lvol, new_lvol) logger.debug(f"new lvol from_source: {new_lvol.from_source}") return new_lvol.lvol_uuid @@ -2965,3 +2935,22 @@ def get_master_lvols_by_pool_uuid(pool_id, is_json=False): return json.dumps(data, indent=2) else: return utils.print_table(data) + + +def get_namespaces_per_lvol(lvol): + db_controller = DBController() + ns_count = 0 + for lv in db_controller.get_lvols_by_node_id(lvol.node_id): + if lv.nqn == lvol.nqn and lv.status not in [LVol.STATUS_IN_DELETION, LVol.STATUS_DELETED]: + ns_count += 1 + return ns_count + + +def get_next_available_subsystem_on_node(node_id): + db_controller = DBController() + for lvol in db_controller.get_lvols_by_node_id(node_id): + if not lvol.namespace: + ns_count = get_namespaces_per_lvol(lvol) + if ns_count < lvol.max_namespace_per_subsys: + return lvol.get_id(), lvol.nqn + return None \ No newline at end of file From 6ade9068dac73210a9c7054c273c0dd31d777177 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Mon, 11 May 2026 11:47:37 +0100 Subject: [PATCH 13/31] Revert "fix: match replicated lvol on target cluster by NQN lvol UUID suffix instead of full NQN" This reverts commit 1a430729f8af85de0a0bbf07d9c94dee4d9978c1. --- .../controllers/lvol_controller.py | 85 +++++++++++-------- 1 file changed, 48 insertions(+), 37 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 54fcc0a0c..f8bc1cbd2 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -347,7 +347,7 @@ def validate_aes_xts_keys(key1: str, key2: str) -> Tuple[bool, str]: def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp=False, use_crypto=False, distr_vuid=0, max_rw_iops=0, max_rw_mbytes=0, max_r_mbytes=0, max_w_mbytes=0, with_snapshot=False, max_size=0, crypto_key1=None, crypto_key2=None, lvol_priority_class=0, - uid=None, pvc_name=None, namespaced=None, max_namespace_per_subsys=1, fabric="tcp", ndcs=0, npcs=0, + uid=None, pvc_name=None, namespace=None, max_namespace_per_subsys=1, fabric="tcp", ndcs=0, npcs=0, allowed_hosts=None, do_replicate=False, replication_cluster_id=None): db_controller = DBController() @@ -365,6 +365,25 @@ def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp= if host_node.lvol_sync_del(): logger.info(f"LVol sync delete task on node: {host_node.get_id()}, proceeding anyway") + if namespace: + try: + master_lvol = db_controller.get_lvol_by_id(namespace) + except KeyError as e: + logger.error(e) + return False + + host_node = db_controller.get_storage_node_by_id(master_lvol.node_id) + + lvols_count = 0 + for lv in db_controller.get_lvols(host_node.cluster_id): + if lv.namespace == namespace: + lvols_count += 1 + + if lvols_count >= master_lvol.max_namespace_per_subsys: + msg = f"Max namespaces reached: {lvols_count}" + logger.error(msg) + return False, msg + pool = None for p in db_controller.get_pools(): if pool_id_or_name == p.get_id() or pool_id_or_name == p.pool_name: @@ -515,23 +534,20 @@ def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp= lvol.lvol_priority_class = 0 lvol.fabric = fabric + if namespace: + master_lvol = db_controller.get_lvol_by_id(namespace) + lvol.nqn = master_lvol.nqn + lvol.namespace = namespace or "" + else: + lvol.nqn = cl.nqn + ":lvol:" + lvol.uuid + if not host_node: nodes = _get_next_3_nodes(cl.get_id(), lvol.size) if not nodes: return False, "No nodes found with enough resources to create the LVol" host_node = nodes[0] - # Create a new subsystem by default unless namespaced is set - lvol.nqn = cl.nqn + ":lvol:" + lvol.uuid - lvol.max_namespace_per_subsys = max_namespace_per_subsys - namespace = None - - if namespaced: - result = get_next_available_subsystem_on_node(host_node.get_id()) - if result: - namespace, free_nqn = result - lvol.nqn = free_nqn - lvol.namespace = namespace + lvol.max_namespace_per_subsys = host_node.max_lvol s_node = db_controller.get_storage_node_by_id(host_node.secondary_node_id) attr_name = f"active_{fabric}" @@ -1649,7 +1665,7 @@ def connect_lvol(uuid, ctrl_loss_tmo=constants.LVOL_NVME_CONNECT_CTRL_LOSS_TMO, if cluster.status == Cluster.STATUS_SUSPENDED and cluster.snapshot_replication_target_cluster: logger.error("Cluster is suspended, looking for replicated lvol") for lv in db_controller.get_lvols(cluster.snapshot_replication_target_cluster): - if lv.nqn.split(":lvol:")[-1] == lvol.nqn.split(":lvol:")[-1]: + if lv.nqn == lvol.nqn: logger.info(f"LVol with same nqn already exists on target cluster: {lv.get_id()}") lvol = lv break @@ -2285,9 +2301,9 @@ def clone_lvol(lvol_id, clone_name, new_size=None, pvc_name=None): return False, str(e) host_node = db_controller.get_storage_node_by_id(lvol.node_id) - subsys_count = len(set(lv.nqn for lv in db_controller.get_lvols_by_node_id(lvol.node_id))) - if subsys_count >= host_node.max_lvol: - error = f"Too many subsystems on node: {host_node.get_id()}, max subsystems reached: {subsys_count}" + lvol_count = len(db_controller.get_lvols_by_node_id(lvol.node_id)) + if lvol_count >= host_node.max_lvol: + error = f"Too many lvols on node: {host_node.get_id()}, max lvols reached: {lvol_count}" logger.error(error) return False, error @@ -2304,7 +2320,7 @@ def clone_lvol(lvol_id, clone_name, new_size=None, pvc_name=None): logger.error(err) return False, str(err) new_lvol_uuid, err = snapshot_controller.clone( - snapshot_uuid, clone_name, new_size, pvc_name, delete_snap_on_lvol_delete=True, lock=False, namespaced=True) + snapshot_uuid, clone_name, new_size, pvc_name, delete_snap_on_lvol_delete=True, lock=False) if err: logger.error(err) if snapshot_uuid: @@ -2638,6 +2654,17 @@ def replicate_lvol_on_source_cluster(lvol_id, cluster_id=None, pool_uuid=None): logger.error(f"Snapshot for replication not found for lvol: {lvol_id}") return False + # bdev_lvol_clone must run on the same SPDK that owns the snapshot's LVS + try: + snap_node = db_controller.get_storage_node_by_id(snapshot.lvol.node_id) + if snap_node.status == StorageNode.STATUS_ONLINE: + source_node = snap_node + else: + logger.error(f"Snapshot node {snapshot.lvol.node_id} is not online") + return False + except KeyError: + logger.warning(f"Could not find snapshot node {snapshot.lvol.node_id}, using current source_node") + # create lvol on target node new_lvol = copy.deepcopy(lvol) new_lvol.cloned_from_snap = snapshot.get_id() @@ -2683,6 +2710,9 @@ def replicate_lvol_on_source_cluster(lvol_id, cluster_id=None, pool_uuid=None): logger.debug(f"new lvol from_source: {new_lvol.from_source}") + logger.debug(f"new_lvol: {new_lvol}") + logger.debug(f"source_node: {source_node}") + lvol_bdev, error = add_lvol_on_node(new_lvol, source_node) if error: logger.error(error) @@ -2707,7 +2737,7 @@ def replicate_lvol_on_source_cluster(lvol_id, cluster_id=None, pool_uuid=None): new_lvol.status = LVol.STATUS_ONLINE new_lvol.from_source = True new_lvol.write_to_db(db_controller.kv_store) - lvol_events.lvol_replicated(lvol, new_lvol) + lvol_events.lvol_replicated(new_lvol, new_lvol) logger.debug(f"new lvol from_source: {new_lvol.from_source}") return new_lvol.lvol_uuid @@ -2935,22 +2965,3 @@ def get_master_lvols_by_pool_uuid(pool_id, is_json=False): return json.dumps(data, indent=2) else: return utils.print_table(data) - - -def get_namespaces_per_lvol(lvol): - db_controller = DBController() - ns_count = 0 - for lv in db_controller.get_lvols_by_node_id(lvol.node_id): - if lv.nqn == lvol.nqn and lv.status not in [LVol.STATUS_IN_DELETION, LVol.STATUS_DELETED]: - ns_count += 1 - return ns_count - - -def get_next_available_subsystem_on_node(node_id): - db_controller = DBController() - for lvol in db_controller.get_lvols_by_node_id(node_id): - if not lvol.namespace: - ns_count = get_namespaces_per_lvol(lvol) - if ns_count < lvol.max_namespace_per_subsys: - return lvol.get_id(), lvol.nqn - return None \ No newline at end of file From 1993509192ded7f1e8d9c516e70da6b7822d8e6c Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Mon, 11 May 2026 11:48:44 +0100 Subject: [PATCH 14/31] fix: match replicated lvol on target cluster by NQN lvol UUID suffix instead of full NQN --- simplyblock_core/controllers/lvol_controller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index f8bc1cbd2..e2ecddc7f 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1665,7 +1665,7 @@ def connect_lvol(uuid, ctrl_loss_tmo=constants.LVOL_NVME_CONNECT_CTRL_LOSS_TMO, if cluster.status == Cluster.STATUS_SUSPENDED and cluster.snapshot_replication_target_cluster: logger.error("Cluster is suspended, looking for replicated lvol") for lv in db_controller.get_lvols(cluster.snapshot_replication_target_cluster): - if lv.nqn == lvol.nqn: + if lv.nqn.split(":lvol:")[-1] == lvol.nqn.split(":lvol:")[-1]: logger.info(f"LVol with same nqn already exists on target cluster: {lv.get_id()}") lvol = lv break From dc09031ce8f26a6267f7ea8dd63781e4f6c3a698 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Mon, 11 May 2026 12:53:15 +0100 Subject: [PATCH 15/31] feat: add model field to lvol connect response for device symlink resolution --- simplyblock_core/controllers/lvol_controller.py | 1 + 1 file changed, 1 insertion(+) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index e2ecddc7f..fe78db856 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1726,6 +1726,7 @@ def connect_lvol(uuid, ctrl_loss_tmo=constants.LVOL_NVME_CONNECT_CTRL_LOSS_TMO, entry = { "ns_id": lvol.ns_id, + "model": lvol.uuid, "transport": transport, "ip": ip, "port": port, From 99eeac1ca190efc044532882cf178899b2047538 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Mon, 11 May 2026 13:56:00 +0100 Subject: [PATCH 16/31] added more logs for sorted snapshot --- simplyblock_core/controllers/lvol_controller.py | 1 + 1 file changed, 1 insertion(+) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index fe78db856..f3b2df6a3 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2618,6 +2618,7 @@ def replicate_lvol_on_source_cluster(lvol_id, cluster_id=None, pool_uuid=None): if snaps: snaps = sorted(snaps, key=lambda x: x.created_at) + logger.debug(f"sorted snapshots: {snaps}") snapshot = snaps[-1] if not snapshot: From 00b6981d2ea27f5cab99de0a9e79e5c99769cd80 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Mon, 11 May 2026 14:49:12 +0100 Subject: [PATCH 17/31] fix: replicate_lvol_on_source_cluster to search target cluster tasks and use source_replicated_snap_uuid for failback snapshot recovery --- .../controllers/lvol_controller.py | 66 +++++++++---------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index f3b2df6a3..878f25e90 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2602,9 +2602,36 @@ def replicate_lvol_on_source_cluster(lvol_id, cluster_id=None, pool_uuid=None): return False + target_cluster_id = None + if lvol.replication_node_id: + try: + target_node = db_controller.get_storage_node_by_id(lvol.replication_node_id) + target_cluster_id = target_node.cluster_id + except KeyError: + pass + if not target_cluster_id: + _src_cl = db_controller.get_cluster_by_id(source_node.cluster_id) + target_cluster_id = _src_cl.snapshot_replication_target_cluster + + if not target_cluster_id: + logger.error(f"Target cluster not found for lvol: {lvol_id}") + return False + + target_lvol_id = None + lvol_id_in_nqn = lvol.nqn.split(":")[-1] + for lv in db_controller.get_lvols(target_cluster_id): + if lv.nqn.split(":")[-1] == lvol_id_in_nqn: + logger.info(f"LVol with same lvol nqn already exists on target cluster: {lv.get_id()}") + target_lvol_id = lv.get_id() + break + + if not target_lvol_id: + logger.error(f"LVol with same nqn does not exist on target cluster: {target_cluster_id}") + return False + snaps = [] snapshot = None - for task in db_controller.get_job_tasks(source_node.cluster_id): + for task in db_controller.get_job_tasks(target_cluster_id): if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: logger.debug(task) try: @@ -2612,45 +2639,14 @@ def replicate_lvol_on_source_cluster(lvol_id, cluster_id=None, pool_uuid=None): except KeyError: continue - if snap.lvol.get_id() != lvol_id: + if snap.lvol.get_id() != target_lvol_id: continue snaps.append(snap) if snaps: snaps = sorted(snaps, key=lambda x: x.created_at) - logger.debug(f"sorted snapshots: {snaps}") - snapshot = snaps[-1] - - if not snapshot: - target_node = db_controller.get_storage_node_by_id(lvol.replication_node_id) - logger.info(f"Looking for snapshot in target cluster: {target_node.cluster_id}") - target_lvol_id = None - lvol_id_in_nqn = lvol.nqn.split(":")[-1] - for lv in db_controller.get_lvols(target_node.cluster_id): - if lv.nqn.split(":")[-1] == lvol_id_in_nqn: - logger.info(f"LVol with same lvol nqn already exists on target cluster: {lv.get_id()}") - target_lvol_id = lv.get_id() - - if not target_lvol_id: - logger.error(f"LVol with same nqn does not exist on target cluster: {target_node.cluster_id}") - return False - - for task in db_controller.get_job_tasks(target_node.cluster_id): - if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: - logger.debug(task) - try: - snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) - except KeyError: - continue - - if snap.lvol.get_id() != target_lvol_id: - continue - snaps.append(snap) - - if snaps: - snaps = sorted(snaps, key=lambda x: x.created_at) - snapshot = snaps[-1] - snapshot = db_controller.get_snapshot_by_id(snapshot.target_replicated_snap_uuid) + last_snapshot = snaps[-1] + snapshot = db_controller.get_snapshot_by_id(last_snapshot.source_replicated_snap_uuid) if not snapshot: logger.error(f"Snapshot for replication not found for lvol: {lvol_id}") From a8d094e626d170e9d0e2462cf03cedc661b2a77b Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Mon, 11 May 2026 16:05:02 +0100 Subject: [PATCH 18/31] fix: mark snapshot replication task as done when snapshot is not found instead of retrying indefinitely --- simplyblock_core/services/snapshot_replication.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 49dc831d4..e8fbe8345 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -265,7 +265,13 @@ def process_snap_replicate_finish(task, snapshot): def task_runner(task: JobSchedule): - snapshot = db.get_snapshot_by_id(task.function_params["snapshot_id"]) + try: + snapshot = db.get_snapshot_by_id(task.function_params["snapshot_id"]) + except KeyError: + task.function_result = "snapshot not found" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return True if not snapshot: task.function_result = "snapshot not found" task.status = JobSchedule.STATUS_DONE From b0cd51cc6968b52709e0abe6a8b0c26f66758e97 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Mon, 11 May 2026 18:33:32 +0100 Subject: [PATCH 19/31] stop forward replication on source lvol after failover to prevent normal replication restart on cluster recovery --- simplyblock_core/controllers/lvol_controller.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 878f25e90..c40c623fd 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2477,7 +2477,9 @@ def replicate_lvol_on_target_cluster(lvol_id): new_lvol.write_to_db(db_controller.kv_store) lvol = db_controller.get_lvol_by_id(lvol_id) lvol.from_source = False + lvol.do_replicate = False lvol.write_to_db() + replication_stop(lvol_id) lvol_events.lvol_replicated(lvol, new_lvol) return new_lvol.lvol_uuid From 4f5a4c61a65bb9df7172eaf1523ef0eb1e315e8f Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Mon, 11 May 2026 18:43:13 +0100 Subject: [PATCH 20/31] start forward replication on source lvol after failback --- simplyblock_core/controllers/lvol_controller.py | 1 + 1 file changed, 1 insertion(+) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index c40c623fd..5a11a5194 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2736,6 +2736,7 @@ def replicate_lvol_on_source_cluster(lvol_id, cluster_id=None, pool_uuid=None): new_lvol.status = LVol.STATUS_ONLINE new_lvol.from_source = True + new_lvol.do_replicate = True new_lvol.write_to_db(db_controller.kv_store) lvol_events.lvol_replicated(new_lvol, new_lvol) logger.debug(f"new lvol from_source: {new_lvol.from_source}") From 23da54f8d8bf29ae5dcb65916bb721e39369d1fa Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Tue, 12 May 2026 13:46:19 +0100 Subject: [PATCH 21/31] removed setting replication to false on lvol --- simplyblock_core/controllers/lvol_controller.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 5a11a5194..8141e859f 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2477,7 +2477,6 @@ def replicate_lvol_on_target_cluster(lvol_id): new_lvol.write_to_db(db_controller.kv_store) lvol = db_controller.get_lvol_by_id(lvol_id) lvol.from_source = False - lvol.do_replicate = False lvol.write_to_db() replication_stop(lvol_id) lvol_events.lvol_replicated(lvol, new_lvol) @@ -2736,7 +2735,6 @@ def replicate_lvol_on_source_cluster(lvol_id, cluster_id=None, pool_uuid=None): new_lvol.status = LVol.STATUS_ONLINE new_lvol.from_source = True - new_lvol.do_replicate = True new_lvol.write_to_db(db_controller.kv_store) lvol_events.lvol_replicated(new_lvol, new_lvol) logger.debug(f"new lvol from_source: {new_lvol.from_source}") From 0885ba09c5da2806b0ee63cb7fb115b6895e176e Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Tue, 12 May 2026 15:38:53 +0100 Subject: [PATCH 22/31] updated endpoint replication_trigger --- simplyblock_web/api/v2/volume.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_web/api/v2/volume.py b/simplyblock_web/api/v2/volume.py index 76aa9c4b8..915245af8 100644 --- a/simplyblock_web/api/v2/volume.py +++ b/simplyblock_web/api/v2/volume.py @@ -220,7 +220,7 @@ def inflate(cluster: Cluster, pool: StoragePool, volume: Volume) -> Response: return Response(status_code=204) -@instance_api.post('/replication_trigger', name='clusters:storage-pools:volumes:replication_start', status_code=204, responses={204: {"content": None}}) +@instance_api.post('/replication_trigger', name='clusters:storage-pools:volumes:replication_trigger', status_code=204, responses={204: {"content": None}}) def replication_trigger(cluster: Cluster, pool: StoragePool, volume: Volume) -> Response: if not lvol_controller.replication_trigger(volume.get_id()): raise HTTPException(400, 'Failed to trigger volume snapshot replication') From 14b148bc71ea53e08a5701a4b6cbcb5f170e4648 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Tue, 12 May 2026 16:45:58 +0100 Subject: [PATCH 23/31] add more logs to list_replication_tasks --- simplyblock_core/controllers/lvol_controller.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 8141e859f..eefc95e23 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2488,17 +2488,25 @@ def list_replication_tasks(lvol_id): db_controller = DBController() lvol = db_controller.get_lvol_by_id(lvol_id) node = db_controller.get_storage_node_by_id(lvol.node_id) + logger.info("list_replication_tasks: lvol=%s node=%s cluster=%s do_replicate=%s", + lvol_id, node.get_id(), node.cluster_id, lvol.do_replicate) tasks = [] - for task in db_controller.get_job_tasks(node.cluster_id): + all_tasks = db_controller.get_job_tasks(node.cluster_id) + logger.info("list_replication_tasks: total cluster tasks=%d", len(all_tasks)) + for task in all_tasks: if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: try: snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) except KeyError: + logger.warning("list_replication_tasks: snapshot not found for task=%s", task.uuid) continue if snap.lvol.get_id() != lvol_id: continue + logger.info("list_replication_tasks: matched task=%s snap=%s status=%s", + task.uuid, snap.get_id(), task.status) tasks.append(task) + logger.info("list_replication_tasks: returning %d tasks for lvol=%s", len(tasks), lvol_id) return tasks From bfd05cf42703932e0727ab625ca8e8b1957775f2 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Tue, 12 May 2026 17:47:05 +0100 Subject: [PATCH 24/31] updated snapsh id reference name --- simplyblock_core/controllers/lvol_controller.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index eefc95e23..c6ef2e94f 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1572,7 +1572,7 @@ def get_replication_info(lvol_id_or_name): if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: logger.debug(task) try: - snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + snap = db_controller.get_snapshot_by_id(task.function_result) except KeyError: continue @@ -1588,7 +1588,7 @@ def get_replication_info(lvol_id_or_name): out["tasks"] = [t.to_dict() for t in tasks] out["replicated_count"] = len(snaps) last_task = tasks[-1] - last_snap = db_controller.get_snapshot_by_id(last_task.function_params["snapshot_id"]) + last_snap = db_controller.get_snapshot_by_id(last_task.function_result) out["last_snapshot_id"] = last_snap.get_id() out["last_replication_time"] = last_task.updated_at if "end_time" in last_task.function_params and "start_time" in last_task.function_params: @@ -2184,7 +2184,7 @@ def replication_trigger(lvol_id): if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: logger.debug(task) try: - snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + snap = db_controller.get_snapshot_by_id(task.function_result) except KeyError: continue @@ -2200,7 +2200,7 @@ def replication_trigger(lvol_id): out["tasks"] = tasks out["replicated_count"] = len(snaps) last_task = tasks[-1] - last_snap = db_controller.get_snapshot_by_id(last_task.function_params["snapshot_id"]) + last_snap = db_controller.get_snapshot_by_id(last_task.function_result) out["last_snapshot_id"] = last_snap.get_id() out["last_replication_time"] = last_task.updated_at duration = "" @@ -2350,7 +2350,7 @@ def replication_stop(lvol_id, delete=False): for task in tasks: if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION and task.status != JobSchedule.STATUS_DONE: - snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + snap = db_controller.get_snapshot_by_id(task.function_result) if snap.lvol.uuid == lvol.uuid: tasks_controller.cancel_task(task.uuid) @@ -2393,7 +2393,7 @@ def replicate_lvol_on_target_cluster(lvol_id): if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: logger.debug(task) try: - snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + snap = db_controller.get_snapshot_by_id(task.function_result) except KeyError: continue @@ -2496,7 +2496,7 @@ def list_replication_tasks(lvol_id): for task in all_tasks: if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: try: - snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + snap = db_controller.get_snapshot_by_id(task.function_result) except KeyError: logger.warning("list_replication_tasks: snapshot not found for task=%s", task.uuid) continue @@ -2644,7 +2644,7 @@ def replicate_lvol_on_source_cluster(lvol_id, cluster_id=None, pool_uuid=None): if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: logger.debug(task) try: - snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + snap = db_controller.get_snapshot_by_id(task.function_result) except KeyError: continue From 5ed24b0ba119bc99628aded4381adbe7accf6348 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Tue, 12 May 2026 17:48:08 +0100 Subject: [PATCH 25/31] set replicate_as_snap_instance to false --- simplyblock_core/controllers/tasks_controller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/controllers/tasks_controller.py b/simplyblock_core/controllers/tasks_controller.py index dadcd8247..c8e57d9b8 100644 --- a/simplyblock_core/controllers/tasks_controller.py +++ b/simplyblock_core/controllers/tasks_controller.py @@ -594,7 +594,7 @@ def _check_snap_instance_on_node(snapshot_id: str , node_id: str): _add_task(JobSchedule.FN_SNAPSHOT_REPLICATION, snapshot.cluster_id, node_id, "", function_params={"snapshot_id": snapshot.get_id(), "replicate_to_source": False, - "replicate_as_snap_instance": True}, + "replicate_as_snap_instance": False}, send_to_cluster_log=False) From 04be1b9b3973683749f3c5c0ee0008cbe5d5f1cb Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Tue, 12 May 2026 18:59:00 +0100 Subject: [PATCH 26/31] reverted: updated snapsh id reference name --- simplyblock_core/controllers/lvol_controller.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index c6ef2e94f..b18fb595c 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1572,7 +1572,7 @@ def get_replication_info(lvol_id_or_name): if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: logger.debug(task) try: - snap = db_controller.get_snapshot_by_id(task.function_result) + snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) except KeyError: continue @@ -1588,7 +1588,7 @@ def get_replication_info(lvol_id_or_name): out["tasks"] = [t.to_dict() for t in tasks] out["replicated_count"] = len(snaps) last_task = tasks[-1] - last_snap = db_controller.get_snapshot_by_id(last_task.function_result) + last_snap = db_controller.get_snapshot_by_id(last_task.function_params["snapshot_id"]) out["last_snapshot_id"] = last_snap.get_id() out["last_replication_time"] = last_task.updated_at if "end_time" in last_task.function_params and "start_time" in last_task.function_params: @@ -2184,7 +2184,7 @@ def replication_trigger(lvol_id): if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: logger.debug(task) try: - snap = db_controller.get_snapshot_by_id(task.function_result) + snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) except KeyError: continue @@ -2200,7 +2200,7 @@ def replication_trigger(lvol_id): out["tasks"] = tasks out["replicated_count"] = len(snaps) last_task = tasks[-1] - last_snap = db_controller.get_snapshot_by_id(last_task.function_result) + last_snap = db_controller.get_snapshot_by_id(last_task.function_params["snapshot_id"]) out["last_snapshot_id"] = last_snap.get_id() out["last_replication_time"] = last_task.updated_at duration = "" @@ -2350,7 +2350,7 @@ def replication_stop(lvol_id, delete=False): for task in tasks: if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION and task.status != JobSchedule.STATUS_DONE: - snap = db_controller.get_snapshot_by_id(task.function_result) + snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) if snap.lvol.uuid == lvol.uuid: tasks_controller.cancel_task(task.uuid) @@ -2393,7 +2393,7 @@ def replicate_lvol_on_target_cluster(lvol_id): if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: logger.debug(task) try: - snap = db_controller.get_snapshot_by_id(task.function_result) + snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) except KeyError: continue From 02530fc79dd8ffd102cc085becab96325938a2de Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Wed, 13 May 2026 00:47:55 +0100 Subject: [PATCH 27/31] fix replicate_lvol_on_source/target_cluster snapshot lookup --- simplyblock_core/controllers/lvol_controller.py | 15 +++++++++------ .../controllers/snapshot_controller.py | 1 + simplyblock_core/models/snapshot.py | 3 ++- simplyblock_core/services/snapshot_replication.py | 1 + 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index b18fb595c..de9821072 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2397,7 +2397,7 @@ def replicate_lvol_on_target_cluster(lvol_id): except KeyError: continue - if snap.lvol.get_id() != lvol_id: + if snap.source_lvol_id != lvol_id: continue snaps.append(snap) @@ -2496,11 +2496,11 @@ def list_replication_tasks(lvol_id): for task in all_tasks: if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: try: - snap = db_controller.get_snapshot_by_id(task.function_result) + snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) except KeyError: logger.warning("list_replication_tasks: snapshot not found for task=%s", task.uuid) continue - if snap.lvol.get_id() != lvol_id: + if snap.source_lvol_id != lvol_id: continue logger.info("list_replication_tasks: matched task=%s snap=%s status=%s", task.uuid, snap.get_id(), task.status) @@ -2643,19 +2643,22 @@ def replicate_lvol_on_source_cluster(lvol_id, cluster_id=None, pool_uuid=None): for task in db_controller.get_job_tasks(target_cluster_id): if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: logger.debug(task) + if not task.function_params.get("replicate_to_source"): + continue + if task.status != JobSchedule.STATUS_DONE: + continue try: snap = db_controller.get_snapshot_by_id(task.function_result) except KeyError: continue - if snap.lvol.get_id() != target_lvol_id: + if snap.source_lvol_id != target_lvol_id: continue snaps.append(snap) if snaps: snaps = sorted(snaps, key=lambda x: x.created_at) - last_snapshot = snaps[-1] - snapshot = db_controller.get_snapshot_by_id(last_snapshot.source_replicated_snap_uuid) + snapshot = snaps[-1] if not snapshot: logger.error(f"Snapshot for replication not found for lvol: {lvol_id}") diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index c50e77a4d..b8a450376 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -271,6 +271,7 @@ def add(lvol_id, snapshot_name, backup=False, lock=True): snap.snap_bdev = f"{lvol.lvs_name}/{snap_bdev_name}" snap.created_at = int(time.time()) snap.lvol = lvol + snap.source_lvol_id = lvol.get_id() snap.fabric = lvol.fabric snap.vuid = snap_vuid snap.status = SnapShot.STATUS_ONLINE diff --git a/simplyblock_core/models/snapshot.py b/simplyblock_core/models/snapshot.py index ab91a0087..d6f014247 100644 --- a/simplyblock_core/models/snapshot.py +++ b/simplyblock_core/models/snapshot.py @@ -34,4 +34,5 @@ class SnapShot(BaseModel): source_replicated_snap_uuid: str = "" next_snap_uuid: str = "" prev_snap_uuid: str = "" - instances: list = [] \ No newline at end of file + instances: list = [] + source_lvol_id: str = "" \ No newline at end of file diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index e8fbe8345..25b8e4fe5 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -247,6 +247,7 @@ def process_snap_replicate_finish(task, snapshot): except Exception as e: logger.error(e) + new_snapshot.source_lvol_id = snapshot.lvol.get_id() new_snapshot.write_to_db() if snapshot.status == SnapShot.STATUS_IN_REPLICATION: From 9f15841a0181148560eed33016e8894a4d3b8029 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Wed, 13 May 2026 11:39:10 +0100 Subject: [PATCH 28/31] use function_result to get snapshot id --- simplyblock_core/controllers/lvol_controller.py | 6 ++---- simplyblock_core/services/snapshot_replication.py | 2 ++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index de9821072..ff942eacf 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2393,7 +2393,7 @@ def replicate_lvol_on_target_cluster(lvol_id): if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: logger.debug(task) try: - snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + snap = db_controller.get_snapshot_by_id(task.function_result) except KeyError: continue @@ -2403,9 +2403,7 @@ def replicate_lvol_on_target_cluster(lvol_id): if snaps: snaps = sorted(snaps, key=lambda x: x.created_at) - last_snapshot = snaps[-1] - rep_snap = db_controller.get_snapshot_by_id(last_snapshot.target_replicated_snap_uuid) - snapshot = rep_snap + snapshot = snaps[-1] if not snapshot: logger.error(f"Snapshot for replication not found for lvol: {lvol_id}") diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 25b8e4fe5..f79994d40 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -138,6 +138,8 @@ def delete_last_snapshot_if_needed(this_task, lvol): if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: if task.get_id() == this_task.get_id(): continue + if task.status != JobSchedule.STATUS_DONE: + continue logger.debug(task) try: snap = db.get_snapshot_by_id(task.function_params["snapshot_id"]) From 16ebd260f47a462b4a6df473bd50c11d983fc619 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Wed, 13 May 2026 14:00:16 +0100 Subject: [PATCH 29/31] set do_replicate to true on replicate_lvol_on_source_cluster --- simplyblock_core/controllers/lvol_controller.py | 1 + 1 file changed, 1 insertion(+) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index ff942eacf..5601fc63c 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2744,6 +2744,7 @@ def replicate_lvol_on_source_cluster(lvol_id, cluster_id=None, pool_uuid=None): new_lvol.status = LVol.STATUS_ONLINE new_lvol.from_source = True + new_lvol.do_replicate = True new_lvol.write_to_db(db_controller.kv_store) lvol_events.lvol_replicated(new_lvol, new_lvol) logger.debug(f"new lvol from_source: {new_lvol.from_source}") From bc7c0b660213a316a54ce22ed2404ea737be3516 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Wed, 13 May 2026 15:37:56 +0100 Subject: [PATCH 30/31] fix replicate_lvol_on_source_cluster to match failback tasks using source_lvol_id without replicate_to_source filter --- simplyblock_core/controllers/lvol_controller.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 5601fc63c..6b8058814 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2641,8 +2641,6 @@ def replicate_lvol_on_source_cluster(lvol_id, cluster_id=None, pool_uuid=None): for task in db_controller.get_job_tasks(target_cluster_id): if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: logger.debug(task) - if not task.function_params.get("replicate_to_source"): - continue if task.status != JobSchedule.STATUS_DONE: continue try: From 9a57c73ed51695f46eec532235de761348b367c2 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Wed, 13 May 2026 17:03:06 +0100 Subject: [PATCH 31/31] fix: inherit source_lvol_id from snapshot instead of embedded lvol in replication --- simplyblock_core/services/snapshot_replication.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index f79994d40..a71caaaf4 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -249,7 +249,7 @@ def process_snap_replicate_finish(task, snapshot): except Exception as e: logger.error(e) - new_snapshot.source_lvol_id = snapshot.lvol.get_id() + new_snapshot.source_lvol_id = snapshot.source_lvol_id or snapshot.lvol.get_id() new_snapshot.write_to_db() if snapshot.status == SnapShot.STATUS_IN_REPLICATION: