From 28853196b9dd6c7d60974523ed4b2c5bbf13ed3b Mon Sep 17 00:00:00 2001 From: Marius Leustean Date: Thu, 29 May 2025 13:35:44 +0300 Subject: [PATCH] [SAP] Improve parallel creation from snapshot/volume The source volume/snapshot was locked while the new volume was creating, preventing other volumes from being created in parallel from the same source. This was to prevent the source from being deleted while the volume is being created from it. We move that lock from the manager upper to the API layer until the volume reaches the 'creating' state, then the lock is released. The delete_snapshot / delete_volume will check the DB if there are volumes creating from it before proceeding with the deletion. The caveat is that, if a volume is stuck in 'creating', it will still prevent the source from being deleted. Change-Id: I8be26242474a2b4eba676978b533f8cad40e292b --- cinder/tests/unit/volume/test_volume.py | 16 ----------- cinder/volume/api.py | 38 +++++++++++++++++++++++++ cinder/volume/manager.py | 16 +---------- 3 files changed, 39 insertions(+), 31 deletions(-) diff --git a/cinder/tests/unit/volume/test_volume.py b/cinder/tests/unit/volume/test_volume.py index e97546bad2f..ce71d35ac84 100644 --- a/cinder/tests/unit/volume/test_volume.py +++ b/cinder/tests/unit/volume/test_volume.py @@ -1424,8 +1424,6 @@ def test_create_volume_from_snapshot_check_locks( orig_flow = engine.ActionEngine.run def mock_flow_run(*args, **kwargs): - # ensure the lock has been taken - mock_lock.assert_called_with('%s-delete_snapshot' % snap_id) # now proceed with the flow. ret = orig_flow(*args, **kwargs) return ret @@ -1450,26 +1448,20 @@ def mock_flow_run(*args, **kwargs): # mock the flow runner so we can do some checks self.mock_object(engine.ActionEngine, 'run', mock_flow_run) - # locked self.volume.create_volume(self.context, dst_vol, request_spec={'snapshot_id': snap_id}) - mock_lock.assert_called_with('%s-delete_snapshot' % snap_id) self.assertEqual(dst_vol.id, db.volume_get(admin_ctxt, dst_vol.id).id) self.assertEqual(snap_id, db.volume_get(admin_ctxt, dst_vol.id).snapshot_id) # locked self.volume.delete_volume(self.context, dst_vol) - mock_lock.assert_any_call('%s-delete_volume' % dst_vol.id) mock_lock.assert_any_call('volume-stats-%s' % self.volume.host) - # locked self.volume.delete_snapshot(self.context, snapshot_obj) - mock_lock.assert_called_with('%s-delete_snapshot' % snap_id) # locked self.volume.delete_volume(self.context, src_vol) - mock_lock.assert_any_call('%s-delete_volume' % src_vol.id) mock_lock.assert_any_call('volume-stats-%s' % self.volume.host) self.assertTrue(mock_lvm_create.called) @@ -1482,8 +1474,6 @@ def test_create_volume_from_volume_check_locks(self, mock_lock): orig_flow = engine.ActionEngine.run def mock_flow_run(*args, **kwargs): - # ensure the lock has been taken - mock_lock.assert_called_with('%s-delete_volume' % src_vol_id) # now proceed with the flow. ret = orig_flow(*args, **kwargs) return ret @@ -1505,22 +1495,16 @@ def mock_flow_run(*args, **kwargs): # mock the flow runner so we can do some checks self.mock_object(engine.ActionEngine, 'run', mock_flow_run) - # locked self.volume.create_volume(self.context, dst_vol, request_spec={'source_volid': src_vol_id}) - mock_lock.assert_called_with('%s-delete_volume' % src_vol_id) self.assertEqual(dst_vol_id, db.volume_get(admin_ctxt, dst_vol_id).id) self.assertEqual(src_vol_id, db.volume_get(admin_ctxt, dst_vol_id).source_volid) - # locked self.volume.delete_volume(self.context, dst_vol) - mock_lock.assert_any_call('%s-delete_volume' % dst_vol_id) mock_lock.assert_any_call('volume-stats-%s' % self.volume.host) - # locked self.volume.delete_volume(self.context, src_vol) - mock_lock.assert_any_call('%s-delete_volume' % src_vol_id) def _raise_metadata_copy_failure(self, method, dst_vol): # MetadataCopyFailure exception will be raised if DB service is Down diff --git a/cinder/volume/api.py b/cinder/volume/api.py index e72f4e88e03..3a8609fb91d 100644 --- a/cinder/volume/api.py +++ b/cinder/volume/api.py @@ -19,6 +19,7 @@ import ast import collections import datetime +import typing as ty from castellan import key_manager from oslo_config import cfg @@ -355,6 +356,23 @@ def create(self, context, size, name, description, snapshot=None, LOG.exception(msg) raise exception.CinderException(msg) + locked_action: ty.Optional[str] + if hasattr(snapshot, 'id'): + # Make sure the snapshot is not deleted until we are done with it. + locked_action = "%s-%s" % (snapshot.id, 'delete_snapshot') + elif hasattr(source_volume, 'id'): + # Make sure the volume is not deleted until we are done with it. + locked_action = "%s-%s" % (source_volume.id, 'delete_volume') + else: + locked_action = None + + if locked_action is None: + return self._run_create_flow(flow_engine) + else: + with coordination.COORDINATOR.get_lock(locked_action): + return self._run_create_flow(flow_engine) + + def _run_create_flow(self, flow_engine): # Attaching this listener will capture all of the notifications that # taskflow sends out and redirect them to a more useful log for # cinders debugging (or error reporting) usage. @@ -412,6 +430,16 @@ def delete(self, context, volume, else: project_id = context.project_id + vols = db.volume_get_all( + context.elevated(), + limit=1, + filters={'source_volid': volume.id, + 'status': 'creating'}) + if len(vols): + msg = _('Volume is busy being cloned to other volumes. ' + 'Please try again later.') + raise exception.InvalidVolume(reason=msg) + if not volume.host: volume_utils.notify_about_volume_usage(context, volume, "delete.start") @@ -1197,6 +1225,16 @@ def delete_snapshot(self, context, snapshot, force=False, if not unmanage_only: snapshot.assert_not_frozen() + vols = db.volume_get_all( + context.elevated(), + limit=1, + filters={'snapshot_id': snapshot.id, + 'status': 'creating'}) + if len(vols): + msg = _('Snapshot is in use by other volumes ' + 'that are being created. Please try again later.') + raise exception.InvalidSnapshot(reason=msg) + # Build required conditions for conditional update expected = {'cgsnapshot_id': None, 'group_snapshot_id': None} diff --git a/cinder/volume/manager.py b/cinder/volume/manager.py index 2e5a7e6091c..f567a13469e 100644 --- a/cinder/volume/manager.py +++ b/cinder/volume/manager.py @@ -900,16 +900,6 @@ def create_volume(self, context, volume, request_spec=None, snapshot_id = request_spec.get('snapshot_id') source_volid = request_spec.get('source_volid') - locked_action: ty.Optional[str] - if snapshot_id is not None: - # Make sure the snapshot is not deleted until we are done with it. - locked_action = "%s-%s" % (snapshot_id, 'delete_snapshot') - elif source_volid is not None: - # Make sure the volume is not deleted until we are done with it. - locked_action = "%s-%s" % (source_volid, 'delete_volume') - else: - locked_action = None - def _run_flow() -> None: # This code executes create volume flow. If something goes wrong, # flow reverts all job that was done and reraises an exception. @@ -923,11 +913,7 @@ def _run_flow() -> None: rescheduled = False try: - if locked_action is None: - _run_flow() - else: - with coordination.COORDINATOR.get_lock(locked_action): - _run_flow() + _run_flow() finally: try: flow_engine.storage.fetch('refreshed')