From 55e909369a854f6dc86d2633e4917aa51e6cb249 Mon Sep 17 00:00:00 2001 From: Manohar Reddy Date: Wed, 27 May 2026 00:03:27 +0200 Subject: [PATCH] fix: atomic subsystem namespace slot check via FDB transaction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Parallel clone/create requests sharing a subsystem (namespaced=True) all read the same namespace count from DB and can each decide the slot is free, resulting in more lvols than max_namespace_per_subsys being written to one NQN. Fix: replace the bare lvol.write_to_db() call with a single FDB transactional function (write_lvol_with_ns_check) that re-counts active namespaces for the target NQN inside the transaction and writes the new lvol record only when the subsystem still has room. Because the range-read (b'object/LVol/') and the write share one FDB transaction, concurrent writers that race on the same NQN trigger an OCC conflict on commit. FDB automatically retries the loser with fresh data, serialising the slot allocation without any explicit lock — parallel creates on *different* subsystems continue to run without any contention. Affected callers: - snapshot_controller.clone() (namespaced clones) - lvol_controller.add_lvol_ha() (namespaced lvol creates) Both paths now return a retryable error instead of silently over- allocating when the slot is taken after the OCC conflict is resolved. Co-Authored-By: Claude Sonnet 4.6 --- .../controllers/lvol_controller.py | 11 +++++- .../controllers/snapshot_controller.py | 11 +++++- simplyblock_core/db_controller.py | 35 +++++++++++++++++++ 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 87fd2ad99..c8917abbb 100755 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -685,7 +685,16 @@ def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp= logger.exception(msg) return False, msg - lvol.write_to_db(db_controller.kv_store) + if lvol.namespace: + # Namespaced lvol: verify the subsystem slot count atomically before + # writing. FDB's OCC retries the losing transaction when two parallel + # creates race for the last free slot, so max_namespace_per_subsys is + # never exceeded. + if not db_controller.write_lvol_with_ns_check(lvol): + logger.error("Subsystem %s reached capacity during concurrent create", lvol.nqn) + return False, "Subsystem namespace limit reached concurrently; retry" + else: + lvol.write_to_db(db_controller.kv_store) if ha_type == "single": if host_node.status == StorageNode.STATUS_ONLINE: diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index e727e1838..01f5bd8b0 100755 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -749,7 +749,16 @@ def clone(snapshot_id, clone_name, new_size=0, pvc_name=None, pvc_namespace=None logger.exception(msg) return False, msg - lvol.write_to_db(db_controller.kv_store) + if lvol.namespace: + # Namespaced clone: verify the subsystem slot count atomically before + # writing. FDB's OCC retries the losing transaction when two parallel + # clones race for the last free slot, so max_namespace_per_subsys is + # never exceeded. + if not db_controller.write_lvol_with_ns_check(lvol): + logger.error("Subsystem %s reached capacity during concurrent clone", lvol.nqn) + return False, "Subsystem namespace limit reached concurrently; retry the clone" + else: + lvol.write_to_db(db_controller.kv_store) if lvol.ha_type == "single": lvol_bdev, error = lvol_controller.add_lvol_on_node(lvol, snode) diff --git a/simplyblock_core/db_controller.py b/simplyblock_core/db_controller.py index e3e27fc00..309bb65c9 100644 --- a/simplyblock_core/db_controller.py +++ b/simplyblock_core/db_controller.py @@ -427,6 +427,41 @@ def release_backup_chain_locks(self, snapshot_ids): transactional = fdb.transactional(DBController._release_backup_chain_locks_tx) transactional(self, self.kv_store, ordered_snapshot_ids) + # ---- Subsystem Namespace Slot Guard (Single FDB Transaction) ---- + + def _write_lvol_with_ns_check_tx(self, tr, node_id, nqn, max_ns, lvol_key, lvol_data): + """Count active namespaces for *nqn* on *node_id* inside a single FDB + transaction and write the new lvol record only if the subsystem still + has room. Because the range-read and the write share one transaction, + FDB's OCC serialises concurrent writers to the same subsystem + automatically — no explicit lock needed. + Returns False (without writing) when the subsystem is already full. + """ + live = 0 + for _, v in tr.get_range_startswith(b"object/LVol/"): + d = json.loads(v) + if (d.get("node_id") == node_id + and d.get("nqn") == nqn + and d.get("status") not in (LVol.STATUS_IN_DELETION, LVol.STATUS_DELETED)): + live += 1 + if live >= max_ns: + return False + tr[lvol_key] = lvol_data + return True + + def write_lvol_with_ns_check(self, lvol): + """Write a namespaced lvol to FDB while atomically verifying the + per-subsystem namespace limit is not exceeded. Use in place of + lvol.write_to_db() when lvol.namespace is set (namespaced placement). + """ + fn = fdb.transactional(DBController._write_lvol_with_ns_check_tx) + return fn( + self, self.kv_store, + lvol.node_id, lvol.nqn, lvol.max_namespace_per_subsys, + lvol.get_db_id().encode(), + json.dumps(lvol.to_dict()).encode(), + ) + # ---- Pre-Restart Guard (Single FDB Transaction) ---- def _try_set_node_restarting_tx(self, tr, cluster_id, node_id):