From 0d50c43ee59c9a3ccb94fefca87c52956973948e Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Mon, 12 Jan 2026 13:44:46 +0100 Subject: [PATCH 1/3] fix(drbd): prevent duplicate TCP ports after toggle-disk operations Remove redundant ensureStackDataExists() call with empty payload from resetStoragePools() method that was causing TCP port conflicts after toggle-disk operations. Root Cause: ----------- The resetStoragePools() method, introduced in 2019 (commit 95cc17d0b8), calls ensureStackDataExists() with an empty LayerPayload. This worked correctly when TCP ports were stored at RscDfn level. After the TCP port migration to per-node level (commit f754943463, May 2025), this empty payload results in DrbdRscData being created without TCP ports assigned. The controller then sends a Pojo with an empty port Set to satellites. On satellites, when DrbdRscData is initialized with an empty port list, initPorts() uses preferredNewPortsRef from peer resources. Since SatelliteDynamicNumberPool.tryAllocate() always returns true (no-op), any port from preferredNewPortsRef is accepted without conflict checking, leading to duplicate TCP port assignments. Impact: ------- This regression affects toggle-disk operations, particularly: - Snapshot creation/restore operations - Manual toggle-disk operations - Any operation calling resetStoragePools() Symptoms include: - DRBD resources failing to adjust with "port is also used" errors - Resources stuck in StandAlone or Connecting states - Multiple resources on the same node using identical TCP ports Solution: --------- Remove the ensureStackDataExists() call from resetStoragePools() as it is redundant. The calling code (e.g., CtrlRscToggleDiskApiCallHandler line 1071) already invokes ensureStackDataExists() with the correct payload immediately after resetStoragePools(). This fix ensures: 1. resetStoragePools() only resets storage pool assignments 2. Layer data creation with proper TCP ports happens via the caller's ensureStackDataExists() with correct payload 3. No DrbdRscData objects are created without TCP port assignments Related Issues: --------------- Fixes #454 - Duplicate TCP ports after backup/restore operations Related to user reports of resources stuck in StandAlone after node reboots when toggle-disk or backup operations were in progress. Testing: -------- Verified that: - Toggle-disk operations no longer create resources without TCP ports - Backup/restore operations complete without TCP port conflicts - Resources maintain unique TCP ports across toggle-disk cycles Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- .../linbit/linstor/layer/resource/CtrlRscLayerDataFactory.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/controller/src/main/java/com/linbit/linstor/layer/resource/CtrlRscLayerDataFactory.java b/controller/src/main/java/com/linbit/linstor/layer/resource/CtrlRscLayerDataFactory.java index 5abafef18..980432df5 100644 --- a/controller/src/main/java/com/linbit/linstor/layer/resource/CtrlRscLayerDataFactory.java +++ b/controller/src/main/java/com/linbit/linstor/layer/resource/CtrlRscLayerDataFactory.java @@ -277,8 +277,6 @@ public void resetStoragePools(Resource rscRef) rscDataToProcess.addAll(rscData.getChildren()); } - - ensureStackDataExists(rscRef, null, new LayerPayload()); } catch (AccessDeniedException exc) { From 79d6375c55d6181b35a7b7f0fe8dbdfb86e126cd Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sat, 28 Mar 2026 08:30:08 +0100 Subject: [PATCH 2/3] fix(drbd): preserve TCP ports during toggle-disk operations When removeLayerData() deletes DrbdRscData during toggle-disk, the TCP ports are freed from the number pool. ensureStackDataExists() then allocates new ports which may differ from the old ones if other resources claimed them in the meantime. The controller correctly avoids collisions in its own number pool, but if the satellite misses the update (e.g. due to controller restart or connectivity issues), it keeps the old ports while peers receive the new ones, causing DRBD connections to fail with StandAlone state. Add copyDrbdTcpPortsIfExists() to save existing TCP ports into the LayerPayload before removeLayerData() deletes them. Call it from copyDrbdNodeIdIfExists() (covers both toggle-disk paths) and from the needsDeactivate path (shared storage pool case). This ensures the same TCP ports are reused when DrbdRscData is recreated, eliminating the window for port mismatch between controller and satellites. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- .../CtrlRscToggleDiskApiCallHandler.java | 43 ++++++++++++++++++- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/controller/src/main/java/com/linbit/linstor/core/apicallhandler/controller/CtrlRscToggleDiskApiCallHandler.java b/controller/src/main/java/com/linbit/linstor/core/apicallhandler/controller/CtrlRscToggleDiskApiCallHandler.java index 614d460fa..f084c8998 100644 --- a/controller/src/main/java/com/linbit/linstor/core/apicallhandler/controller/CtrlRscToggleDiskApiCallHandler.java +++ b/controller/src/main/java/com/linbit/linstor/core/apicallhandler/controller/CtrlRscToggleDiskApiCallHandler.java @@ -54,6 +54,7 @@ import com.linbit.linstor.storage.StorageException; import com.linbit.linstor.storage.data.adapter.drbd.DrbdRscData; import com.linbit.linstor.storage.interfaces.categories.resource.AbsRscLayerObject; +import com.linbit.linstor.core.types.TcpPortNumber; import com.linbit.linstor.storage.kinds.DeviceLayerKind; import com.linbit.linstor.storage.utils.LayerUtils; import com.linbit.linstor.tasks.AutoDiskfulTask; @@ -81,6 +82,7 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Set; +import java.util.TreeSet; import org.reactivestreams.Publisher; import reactor.core.publisher.Flux; @@ -855,10 +857,16 @@ private Flux prepareDiskAddition( rsc.getNode().getName().displayValue, rscDfn.getName().displayValue ); + + /* + * We also have to remove the currently diskless DrbdRscData and free up the node-id as now we must + * use the shared resource's node-id. We still need to preserve TCP ports though. + */ + copyDrbdTcpPortsIfExists(rsc, payload); } else { - copyDrbdNodeIdIfExists(rsc, payload); + copyDrbdSettings(rsc, payload); } removeLayerData(rsc); @@ -1040,7 +1048,7 @@ private void ensureAllPeersHavePeerSlotLeft(ResourceDefinition rscDfnRef) /** * Although we need to rebuild the layerData as the layerList might have changed, if we do not * deactivate (i.e. down) the current resource, we need to make sure that deleting DrbdRscData - * and recreating a new DrbdRscData ends up with the same node-id as before. + * and recreating a new DrbdRscData ends up with the same node-id and TCP ports as before. */ private void copyDrbdNodeIdIfExists(Resource rsc, LayerPayload payload) throws ImplementationError { @@ -1058,6 +1066,37 @@ private void copyDrbdNodeIdIfExists(Resource rsc, LayerPayload payload) throws I payload.drbdRsc.replacingOldLayerRscId = drbdRscData.getRscLayerId(); payload.drbdRsc.nodeId = drbdRscData.getNodeId().value; } + copyDrbdTcpPortsIfExists(rsc, payload); + } + + /** + * Preserves existing TCP ports during toggle-disk operations. + * + * When removeLayerData() deletes DrbdRscData, the TCP ports are freed from the number pool. + * If ensureStackDataExists() then allocates different ports, and the satellite misses the update + * (e.g. due to controller restart or connectivity issues), the satellite keeps the old ports + * while peers get the new ones, causing DRBD connections to fail with StandAlone state. + */ + private void copyDrbdTcpPortsIfExists(Resource rsc, LayerPayload payload) throws ImplementationError + { + Set> drbdRscDataSet = LayerRscUtils.getRscDataByLayer( + getLayerData(apiCtx, rsc), + DeviceLayerKind.DRBD + ); + if (!drbdRscDataSet.isEmpty()) + { + DrbdRscData drbdRscData = (DrbdRscData) drbdRscDataSet.iterator().next(); + Collection tcpPorts = drbdRscData.getTcpPortList(); + if (tcpPorts != null && !tcpPorts.isEmpty()) + { + Set portInts = new TreeSet<>(); + for (TcpPortNumber port : tcpPorts) + { + portInts.add(port.value); + } + payload.drbdRsc.tcpPorts = portInts; + } + } } private List removeLayerData(Resource rscRef) From bcc89902f4f61ac1589dd07ebb7f5aae1935370d Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Mon, 30 Mar 2026 14:52:41 +0200 Subject: [PATCH 3/3] fix(drbd): address review feedback - Introduce copyDrbdSettings() wrapper instead of calling copyDrbdTcpPortsIfExists() from within copyDrbdNodeIdIfExists() - Replace calls to copyDrbdNodeIdIfExists() with copyDrbdSettings() in both toggle-disk paths - Make ensureStackDataExists() in resetStoragePools() optional via boolean parameter to prevent accidental omission in future callers while allowing CtrlRscToggleDiskApiCallHandler to skip the redundant call Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- .../CtrlRscToggleDiskApiCallHandler.java | 33 ++++++++++--------- .../resource/CtrlRscLayerDataFactory.java | 10 ++++++ 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/controller/src/main/java/com/linbit/linstor/core/apicallhandler/controller/CtrlRscToggleDiskApiCallHandler.java b/controller/src/main/java/com/linbit/linstor/core/apicallhandler/controller/CtrlRscToggleDiskApiCallHandler.java index f084c8998..aa6b15dc7 100644 --- a/controller/src/main/java/com/linbit/linstor/core/apicallhandler/controller/CtrlRscToggleDiskApiCallHandler.java +++ b/controller/src/main/java/com/linbit/linstor/core/apicallhandler/controller/CtrlRscToggleDiskApiCallHandler.java @@ -1046,10 +1046,20 @@ private void ensureAllPeersHavePeerSlotLeft(ResourceDefinition rscDfnRef) } /** - * Although we need to rebuild the layerData as the layerList might have changed, if we do not - * deactivate (i.e. down) the current resource, we need to make sure that deleting DrbdRscData - * and recreating a new DrbdRscData ends up with the same node-id and TCP ports as before. + * Copies DRBD settings (node-id and TCP ports) from the existing DrbdRscData into the payload + * before removeLayerData() deletes them. This ensures that recreated DrbdRscData ends up with + * the same node-id and TCP ports as before. + * + * TCP ports must be preserved because if the satellite misses the update (e.g. due to controller + * restart or connectivity issues), it will keep the old ports while peers receive the new ones, + * causing DRBD connections to fail with StandAlone state. */ + private void copyDrbdSettings(Resource rsc, LayerPayload payload) throws ImplementationError + { + copyDrbdNodeIdIfExists(rsc, payload); + copyDrbdTcpPortsIfExists(rsc, payload); + } + private void copyDrbdNodeIdIfExists(Resource rsc, LayerPayload payload) throws ImplementationError { Set> drbdRscDataSet = LayerRscUtils.getRscDataByLayer( @@ -1066,17 +1076,8 @@ private void copyDrbdNodeIdIfExists(Resource rsc, LayerPayload payload) throws I payload.drbdRsc.replacingOldLayerRscId = drbdRscData.getRscLayerId(); payload.drbdRsc.nodeId = drbdRscData.getNodeId().value; } - copyDrbdTcpPortsIfExists(rsc, payload); } - /** - * Preserves existing TCP ports during toggle-disk operations. - * - * When removeLayerData() deletes DrbdRscData, the TCP ports are freed from the number pool. - * If ensureStackDataExists() then allocates different ports, and the satellite misses the update - * (e.g. due to controller restart or connectivity issues), the satellite keeps the old ports - * while peers get the new ones, causing DRBD connections to fail with StandAlone state. - */ private void copyDrbdTcpPortsIfExists(Resource rsc, LayerPayload payload) throws ImplementationError { Set> drbdRscDataSet = LayerRscUtils.getRscDataByLayer( @@ -1353,15 +1354,17 @@ private Flux finishOperationInTransaction( /* * We also have to remove the possible meta-children of previous StorageRscData. * LayerData will be recreated with ensureStackDataExists. - * However, we still need to remember our node-id if we had / have DRBD in the list + * However, we still need to remember our DRBD settings if we had / have DRBD in the list */ - copyDrbdNodeIdIfExists(rsc, payload); + copyDrbdSettings(rsc, payload); layerList = removeLayerData(rsc); } else { markDiskAdded(rsc); - ctrlLayerStackHelper.resetStoragePools(rsc); + // Pass false to skip redundant ensureStackDataExists call within resetStoragePools, + // since we call it explicitly below with the correct payload + ctrlLayerStackHelper.resetStoragePools(rsc, false); } ctrlLayerStackHelper.ensureStackDataExists(rsc, layerList, payload); diff --git a/controller/src/main/java/com/linbit/linstor/layer/resource/CtrlRscLayerDataFactory.java b/controller/src/main/java/com/linbit/linstor/layer/resource/CtrlRscLayerDataFactory.java index 980432df5..fc1ae658a 100644 --- a/controller/src/main/java/com/linbit/linstor/layer/resource/CtrlRscLayerDataFactory.java +++ b/controller/src/main/java/com/linbit/linstor/layer/resource/CtrlRscLayerDataFactory.java @@ -264,6 +264,11 @@ rscRef, payload, layerList, new ChildResourceData(""), null } public void resetStoragePools(Resource rscRef) + { + resetStoragePools(rscRef, true); + } + + public void resetStoragePools(Resource rscRef, boolean callEnsureStackDataExistsRef) { try { @@ -277,6 +282,11 @@ public void resetStoragePools(Resource rscRef) rscDataToProcess.addAll(rscData.getChildren()); } + + if (callEnsureStackDataExistsRef) + { + ensureStackDataExists(rscRef, null, new LayerPayload()); + } } catch (AccessDeniedException exc) {