From 0cdfd172b479c80605c37e17a303c5e0b0b8f0c6 Mon Sep 17 00:00:00 2001
From: Fabian Wiesel
Date: Fri, 25 Jul 2025 12:46:22 +0200
Subject: [PATCH] Add command to error out migrations which are 'orphaned'

Orphaned means the VM has no task state, but the migration is still
marked as running. Since the VM has no task, we can't expect the
migration state to change otherwise.

Change-Id: I3a03f3ae17d4bdfd4df8c17ed07c79f22b54612c
---
 nova/cmd/manage.py  | 65 +++++++++++++++++++++++++++++++++++++++++++++
 nova/db/main/api.py | 32 ++++++++++++++++++++++++++
 2 files changed, 97 insertions(+)

diff --git a/nova/cmd/manage.py b/nova/cmd/manage.py
index cdd1c44e3fa..d8276a121f3 100644
--- a/nova/cmd/manage.py
+++ b/nova/cmd/manage.py
@@ -766,6 +766,71 @@ def soft_delete_excessive_instance_faults(
         # Return 1 if we soft-deleted something
         return int(bool(cell_to_rows_deleted))
 
+    @args('--verbose', action='store_true', dest='verbose', default=False,
+          help='Print how many rows were changed per cell.')
+    @args('--all-cells', action='store_true', dest='all_cells',
+          default=False, help='Run command across all cells.')
+    def error_out_orphaned_migrations(self, all_cells=False, verbose=False):
+        """Error-out orphaned migrations.
+
+        A migration counts as orphaned when it is still marked as
+        'running' although its instance has no task state.
+
+        :param all_cells: If True, run against every cell mapping found in
+            the API DB; otherwise run once with cell mapping None.
+        :param verbose: If True, print progress dots and a per-cell
+            summary of the changed rows.
+
+        Returns 0 if nothing was changed, 1 if at least one row was
+        changed, 3 if no connection could be established to the API DB.
+        """
+
+        ctxt = context.get_admin_context()
+        try:
+            cell_mappings = objects.CellMappingList.get_all(ctxt)
+        except db_exc.CantStartEngineError:
+            print(_('Failed to connect to API DB so aborting this '
+                    'change attempt. Please check your config file to '
+                    'make sure that [api_database]/connection is set and run '
+                    'this command again.'))
+            return 3
+
+        cell_to_rows_changed = {}
+        if not all_cells:
+            cell_mappings = [None]
+        for cell_mapping in cell_mappings:
+            with context.target_cell(ctxt, cell_mapping) as cctxt:
+                # If all_cells=False, cell_mapping is None
+                cell_name = cell_mapping.name if cell_mapping else ''
+                try:
+                    while True:
+                        rows_changed = \
+                            db.error_out_orphaned_migrations(cctxt)
+                        if not rows_changed:
+                            # Nothing (left) to change in this cell. Without
+                            # this exit the loop would never terminate.
+                            break
+                        cell_to_rows_changed.setdefault(cell_name, 0)
+                        cell_to_rows_changed[cell_name] += rows_changed
+                        if verbose:
+                            print('.', end='')
+                except KeyboardInterrupt:
+                    # Skip the remaining cells but still report below.
+                    break
+
+        if verbose:
+            if cell_to_rows_changed:
+                print(format_dict(
+                    cell_to_rows_changed,
+                    dict_property=_('Cell'),
+                    dict_value=_('Number of Changed Rows'),
+                ))
+            else:
+                print(_('No row was changed.'))
+
+        # Return 1 if we changed something
+        return int(bool(cell_to_rows_changed))
+
 
 class ApiDbCommands(object):
     """Class for managing the api database."""
diff --git a/nova/db/main/api.py b/nova/db/main/api.py
index 63201244202..00798a844bd 100644
--- a/nova/db/main/api.py
+++ b/nova/db/main/api.py
@@ -4708,6 +4708,38 @@ def soft_delete_excessive_instance_faults(context, max_rows, max_faults):
     return deleted
 
 
+@pick_context_manager_writer
+def error_out_orphaned_migrations(context):
+    """Error-out orphaned migrations.
+
+    We occasionally end up with instances in a running/stopped state without
+    any task, but the migrations are still "running".
+    As a workaround, we simply error them out.
+
+    Only migrations whose instance was last updated more than 10 seconds
+    ago are touched.
+
+    :param context: The request context providing the DB session.
+    :returns: The number of migrations that were set to 'error'.
+    """
+
+    when = timeutils.utcnow() - datetime.timedelta(seconds=10)
+    mig = models.Migration
+    migrations = mig.__table__
+    # NOTE(review): the subquery selects from the table being updated;
+    # some MySQL versions reject this form (error 1093) -- confirm on the
+    # target database.
+    subquery = sa.select(mig.id).join(mig.instance).where(
+        sa.and_(mig.status == 'running',
+                models.Instance.task_state == sa.null(),
+                models.Instance.updated_at < when)
+    )
+    query = sa.update(migrations).where(
+        migrations.c.id.in_(subquery)).values(status='error')
+    rowcount = context.session.execute(query).rowcount
+    return rowcount
+
+
 ####################
 
 