From a64b12da6cc361d6880b25fa5341f88e2d900748 Mon Sep 17 00:00:00 2001
From: Fabian Wiesel <fabian.wiesel@sap.com>
Date: Fri, 27 Aug 2021 09:37:44 +0200
Subject: [PATCH 1/4] VmWare: Expose the hypervisor

You can configure now
[vmware]
hypervisor_mode=<cluster,esxi_to_cluster,cluster_to_esxi>

to chose to expose the individual esxi hosts instead of the cluster.
It will only show the esxi-host, as hypervisor node, but will not
honour the node parameter for placeing the instance for instance-creation,
migration, etc...

Change-Id: I264fd242c0de01ae8442c03bc726a0abfbe176ef
---
 nova/compute/manager.py              | 92 +++++++++++++++++++++++++---
 nova/conductor/manager.py            |  3 +
 nova/conductor/tasks/live_migrate.py | 18 ++++--
 nova/conf/vmware.py                  | 13 ++++
 nova/virt/hardware.py                |  4 +-
 nova/virt/vmwareapi/driver.py        | 34 +++++++---
 nova/virt/vmwareapi/host.py          | 13 ++++
 nova/virt/vmwareapi/vm_util.py       |  1 +
 nova/virt/vmwareapi/vmops.py         | 15 ++++-
 9 files changed, 167 insertions(+), 26 deletions(-)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index e89199cd4bf..e3075e5c026 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -8000,9 +8000,13 @@ def _detach_interface(self, context, instance, port_id):
             action=fields.NotificationAction.INTERFACE_DETACH,
             phase=fields.NotificationPhase.END)
 
-    def _get_compute_info(self, context, host):
-        return objects.ComputeNode.get_first_node_by_host_for_old_compat(
-            context, host)
+    def _get_compute_info(self, host, nodename=None):
+        if not nodename:
+            return objects.ComputeNode.get_first_node_by_host_for_old_compat(
+                self.context, host)
+
+        return objects.ComputeNode.get_by_host_and_nodename(
+            self.context, host, nodename)
 
     @wrap_exception()
     def check_instance_shared_storage(self, ctxt, data):
@@ -8063,7 +8067,7 @@ def check_can_live_migrate_destination(self, ctxt, instance,
             raise exception.MigrationPreCheckError(reason=msg)
 
         src_compute_info = obj_base.obj_to_primitive(
-            self._get_compute_info(ctxt, instance.host))
+            self._get_compute_info(ctxt, instance.host, instance.node))
         dst_compute_info = obj_base.obj_to_primitive(
             self._get_compute_info(ctxt, self.host))
         dest_check_data = self.driver.check_can_live_migrate_destination(ctxt,
@@ -9019,12 +9023,19 @@ def post_live_migration_at_destination(self, context, instance,
                           'destination host.', instance=instance)
         finally:
             # Restore instance state and update host
-            current_power_state = self._get_power_state(instance)
-            node_name = None
             prev_host = instance.host
             try:
-                compute_node = self._get_compute_info(context, self.host)
-                node_name = compute_node.hypervisor_hostname
+                vm_info = self.driver.get_info(instance, use_cache=False)
+                current_power_state = vm_info.power_state
+                node_name = vm_info.node
+            except exception.InstanceNotFound:
+                current_power_state = power_state.NOSTATE
+                node_name = None
+
+            try:
+                if not node_name:
+                    compute_node = self._get_compute_info(context, self.host)
+                    node_name = compute_node.hypervisor_hostname
             except exception.ComputeHostNotFound:
                 LOG.exception('Failed to get compute_info for %s', self.host)
             finally:
@@ -9832,11 +9843,17 @@ def _query_driver_power_state_and_sync(self, context, db_instance):
         try:
             vm_instance = self.driver.get_info(db_instance)
             vm_power_state = vm_instance.state
+            vm_node = vm_instance.node
         except exception.InstanceNotFound:
             vm_power_state = power_state.NOSTATE
+            vm_node = None
         # Note(maoy): the above get_info call might take a long time,
         # for example, because of a broken libvirt driver.
         try:
+            self._sync_instance_node(context,
+                                     db_instance,
+                                     vm_node,
+                                     use_slave=True)
             self._sync_instance_power_state(context,
                                             db_instance,
                                             vm_power_state,
@@ -9884,6 +9901,65 @@ def _stop_unexpected_shutdown_instance(self, context, vm_state,
                 LOG.exception("error during stop() in sync_power_state.",
                               instance=db_instance)
 
+    def _sync_instance_node(self, context, instance, new_node,
+                            use_slave=False):
+        """Align the instance node between the database and hypervisor
+
+        If the instance is found on a different hypervisor the allocations
+        will be moved accordingly
+        """
+        if not new_node:
+            return
+
+        # We re-query the DB to get the latest instance info to minimize
+        # (not eliminate) race condition.
+        instance.refresh(use_slave=use_slave)
+
+        if self.host != instance.host:
+            return
+
+        source_node = instance.node
+        if new_node == instance.node:
+            return
+
+        rt = self._get_resource_tracker()
+        rc = self.scheduler_client.reportclient
+        try:
+            source_cn_uuid = rt.get_node_uuid(source_node)
+            cn_uuid = rt.get_node_uuid(new_node)
+
+            LOG.info("Moving instance from %s (%s) to %s (%s)",
+                        source_node, source_cn_uuid,
+                        new_node, cn_uuid,
+                        instance=instance)
+
+            allocs = rc.get_allocations_for_consumer(context, instance.uuid)
+            if not allocs:
+                LOG.warning("Failed to get existing resources")
+                return
+
+            if cn_uuid in allocs:
+                LOG.info("Instance has already allocations for %s",
+                        cn_uuid, instance=instance)
+            else:
+                LOG.debug(allocs, instance=instance)
+                resources = allocs.values()[0]['resources']
+                res = rc.set_and_clear_allocations(context, cn_uuid,
+                                         instance.uuid,
+                                         resources, instance.project_id,
+                                         instance.user_id)
+                if not res:
+                    LOG.warning("Failed to update allocations",
+                        instance=instance)
+                    return
+                LOG.debug("Updated allocations", instance=instance)
+
+            instance.node = new_node
+            instance.save()
+            self._update_scheduler_instance_info(context, instance)
+        except Exception:
+            LOG.exception("Failed to move instance to new node")
+
     def _sync_instance_power_state(self, context, db_instance, vm_power_state,
                                    use_slave=False):
         """Align instance power state between the database and hypervisor.
diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py
index ae8acd3d5c2..bce66c35ff1 100644
--- a/nova/conductor/manager.py
+++ b/nova/conductor/manager.py
@@ -458,6 +458,7 @@ def live_migrate_instance(self, context, instance, scheduler_hint,
     def _live_migrate(self, context, instance, scheduler_hint,
                       block_migration, disk_over_commit, request_spec):
         destination = scheduler_hint.get("host")
+        destination_node = scheduler_hint.get("node")
 
         def _set_vm_state(context, instance, ex, vm_state=None,
                           task_state=None):
@@ -474,10 +475,12 @@ def _set_vm_state(context, instance, ex, vm_state=None,
 
         migration = objects.Migration(context=context.elevated())
         migration.dest_compute = destination
+        migration.dest_node = destination_node
         migration.status = 'accepted'
         migration.instance_uuid = instance.uuid
         migration.source_compute = instance.host
         migration.migration_type = fields.MigrationType.LIVE_MIGRATION
+        migration.source_node = instance.node
         if instance.obj_attr_is_set('flavor'):
             migration.old_instance_type_id = instance.flavor.id
             migration.new_instance_type_id = instance.flavor.id
diff --git a/nova/conductor/tasks/live_migrate.py b/nova/conductor/tasks/live_migrate.py
index 5e9d8235609..c9a34f90280 100644
--- a/nova/conductor/tasks/live_migrate.py
+++ b/nova/conductor/tasks/live_migrate.py
@@ -62,10 +62,12 @@ def __init__(self, context, instance, destination,
                  request_spec=None):
         super(LiveMigrationTask, self).__init__(context, instance)
         self.destination = destination
+        self.dest_node = None
         self.block_migration = block_migration
         self.disk_over_commit = disk_over_commit
         self.migration = migration
         self.source = instance.host
+        self.source_node = instance.node
         self.migrate_data = None
         self.limits = None
 
@@ -97,7 +99,7 @@ def _execute(self):
             # wants the scheduler to pick a destination host, or a host was
             # specified but is not forcing it, so they want the scheduler
             # filters to run on the specified host, like a scheduler hint.
-            self.destination, dest_node, self.limits = self._find_destination()
+            self.destination, self.dest_node, self.limits = self._find_destination()
         else:
             # This is the case that the user specified the 'force' flag when
             # live migrating with a specific destination host so the scheduler
@@ -336,12 +338,18 @@ def _check_destination_has_enough_memory(self):
                     instance_uuid=instance_uuid, dest=dest, avail=avail,
                     mem_inst=mem_inst))
 
-    def _get_compute_info(self, host):
-        return objects.ComputeNode.get_first_node_by_host_for_old_compat(
-            self.context, host)
+    def _get_compute_info(self, host, nodename=None):
+        if not nodename:
+            return objects.ComputeNode.get_first_node_by_host_for_old_compat(
+                self.context, host)
+
+        return objects.ComputeNode.get_by_host_and_nodename(
+            self.context, host, nodename)
 
     def _check_compatible_with_source_hypervisor(self, destination):
-        source_info = self._get_compute_info(self.source)
+        migration = self.migration
+        source_info = self._get_compute_info(migration.source_compute,
+                                             migration.source_node)
         destination_info = self._get_compute_info(destination)
 
         source_type = source_info.hypervisor_type
diff --git a/nova/conf/vmware.py b/nova/conf/vmware.py
index ab87ab73fe6..1126cc13a48 100644
--- a/nova/conf/vmware.py
+++ b/nova/conf/vmware.py
@@ -451,6 +451,19 @@
 
 Example: "vmx-13" for vSphere 6.5.
 Versions can be looked up here: https://kb.vmware.com/s/article/1003746
+"""),
+    cfg.StrOpt('hypervisor_mode',
+               default="cluster",
+               help="""
+Defines how the vsphere host is exposed to nova.
+
+Can be one of the following:
+
+cluster: The host has one hypervisor for the whole cluster. (Original)
+esxi_to_cluster: Both will be exposed, but the ESXi resources are reserved, and
+                 all instances will be moved over to the cluster
+cluster_to_esxi: Both will be exposed, but the ESXi resources are reserved, and
+                 all instances will be from the cluster to the ESXi-hosts
 """),
 ]
 
diff --git a/nova/virt/hardware.py b/nova/virt/hardware.py
index 87b483bc8f0..60b0dfe611f 100644
--- a/nova/virt/hardware.py
+++ b/nova/virt/hardware.py
@@ -240,15 +240,17 @@ def get_number_of_serial_ports(flavor, image_meta):
 
 class InstanceInfo(object):
 
-    def __init__(self, state, internal_id=None):
+    def __init__(self, state, internal_id=None, node=None):
         """Create a new Instance Info object
 
         :param state: Required. The running state, one of the power_state codes
         :param internal_id: Optional. A unique ID for the instance. Need not be
                             related to the Instance.uuid.
+        :param node: Optional. The name of the compute node. See Instance.node
         """
         self.state = state
         self.internal_id = internal_id
+        self.node = node
 
     def __eq__(self, other):
         return (self.__class__ == other.__class__ and
diff --git a/nova/virt/vmwareapi/driver.py b/nova/virt/vmwareapi/driver.py
index b7ca867f50b..a9550f5bb50 100644
--- a/nova/virt/vmwareapi/driver.py
+++ b/nova/virt/vmwareapi/driver.py
@@ -71,6 +71,7 @@
 
 UUID_RE = re.compile(r'[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-'
                      r'[a-fA-F0-9]{4}-[a-fA-F0-9]{12}')
+HYPERVISOR_MODES = ('cluster', 'esxi_to_cluster', 'cluster_to_esxi')
 
 
 class VMwareVCDriver(driver.ComputeDriver):
@@ -118,6 +119,12 @@ def __init__(self, virtapi, scheme="https"):
             raise Exception(_("Must specify host_ip, host_username and "
                               "host_password to use vmwareapi.VMwareVCDriver"))
 
+        if CONF.vmware.hypervisor_mode not in HYPERVISOR_MODES:
+            raise Exception("Unknown hypervisor mode {}. Expecting {}".format(
+                CONF.vmware.hypervisor_mode,
+                HYPERVISOR_MODES,
+            ))
+
         self._datastore_regex = None
         if CONF.vmware.datastore_regex:
             try:
@@ -171,6 +178,7 @@ def __init__(self, virtapi, scheme="https"):
                                       self._nodename,
                                       self._cluster_ref,
                                       self._datastore_regex)
+                                        self._vc_state,
         self._vmops = vmops.VMwareVMOps(self._session,
                                         virtapi,
                                         self._volumeops,
@@ -477,8 +485,6 @@ def get_cluster_metrics(self):
 
     def get_available_nodes(self, refresh=False):
         """Returns nodenames of all nodes managed by the compute service.
-
-        This driver supports only one compute node.
         """
         # get_available_nodes is called at the beginning of polling
         # the resources of all the nodes via
@@ -486,10 +492,12 @@ def get_available_nodes(self, refresh=False):
         # We follow here the same pattern as in the ironic driver and use this
         # function call as an indicator of a new polling cycle and refresh
         # the host stats cached in _vc_state by calling...
-        self._vc_state.get_host_stats(refresh=True)
+        hosts = self._vc_state.get_host_stats(refresh=True)
         # In the following calls to get_inventory and get_available_resource
         # for each node, we then return the cached data
-        return [self._nodename]
+
+        if CONF.vmware.hypervisor_mode == 'cluster':
+            return [self._nodename]
 
     def update_provider_tree(self, provider_tree, nodename, allocations=None):
         """Update a ProviderTree object with current resource provider,
@@ -538,6 +546,13 @@ def update_provider_tree(self, provider_tree, nodename, allocations=None):
         :raises: ReshapeFailed if the requested tree reshape fails for
             whatever reason.
         """
+        if nodename == self._nodename:
+            # Either "cluster", or
+            normal = (CONF.vmware.hypervisor_mode in ('cluster',
+                                                      'esxi_to_cluster'))
+        else:
+            normal = (CONF.vmware.hypervisor_mode == 'cluster_to_esxi')
+
         stats = self.get_available_resource(nodename)
         result = {}
 
@@ -549,7 +564,7 @@ def update_provider_tree(self, provider_tree, nodename, allocations=None):
         ratios = self._get_allocation_ratios(inv)
 
         local_gb = stats["local_gb"]
-        local_gb_max_free = stats.get("local_gb_max_free", "local_gb")
+        local_gb_max_free = stats.get("local_gb_max_free", local_gb)
         local_gb_max_unit_limit = CONF.vmware.resource_disk_gb_max_unit_limit
         if local_gb_max_unit_limit != -1:
             local_gb_max_free = min(local_gb_max_free, local_gb_max_unit_limit)
@@ -558,7 +573,7 @@ def update_provider_tree(self, provider_tree, nodename, allocations=None):
                 CONF.reserved_host_disk_mb)
             result[orc.DISK_GB] = {
                 'total': local_gb,
-                'reserved': reserved_disk_gb,
+                'reserved': reserved_disk_gb if normal else local_gb,
                 'min_unit': 1,
                 'max_unit': local_gb_max_free,
                 'step_size': 1,
@@ -571,7 +586,7 @@ def update_provider_tree(self, provider_tree, nodename, allocations=None):
             reserved_vcpus = stats["vcpus_reserved"]
             result[orc.VCPU] = {
                 'total': vcpus,
-                'reserved': reserved_vcpus,
+                'reserved': reserved_vcpus if normal else vcpus,
                 'min_unit': 1,
                 'max_unit': max_vcpus,
                 'step_size': 1,
@@ -584,7 +599,7 @@ def update_provider_tree(self, provider_tree, nodename, allocations=None):
         if memory_mb > 0 and max_memory_mb > 0:
             result[orc.MEMORY_MB] = {
                 'total': memory_mb,
-                'reserved': reserved_memory_mb,
+                'reserved': reserved_memory_mb if normal else memory_mb,
                 'min_unit': 1,
                 'max_unit': max_memory_mb,
                 'step_size': 1,
@@ -597,7 +612,8 @@ def update_provider_tree(self, provider_tree, nodename, allocations=None):
             if available_memory_mb > 0:
                 result[utils.MEMORY_RESERVABLE_MB_RESOURCE] = {
                         'total': available_memory_mb,
-                        'reserved': reserved_reservable_memory,
+                        'reserved': (reserved_reservable_memory
+                                     if normal else available_memory_mb),
                         'min_unit': 1,
                         'max_unit': max_memory_mb,
                         'step_size': 1,
diff --git a/nova/virt/vmwareapi/host.py b/nova/virt/vmwareapi/host.py
index ed1c89faceb..966a87124fc 100644
--- a/nova/virt/vmwareapi/host.py
+++ b/nova/virt/vmwareapi/host.py
@@ -22,6 +22,7 @@
 from oslo_utils import units
 from oslo_utils import versionutils
 from oslo_vmware import exceptions as vexc
+from oslo_vmware import vim_util as v_util
 
 import nova.conf
 from nova import context
@@ -66,6 +67,7 @@ def __init__(self, session, cluster_node_name, cluster, datastore_regex):
         self._cluster = cluster
         self._datastore_regex = datastore_regex
         self._stats = {}
+        self._host_ref_to_name = {}
         ctx = context.get_admin_context()
         try:
             service = objects.Service.get_by_compute_host(ctx, CONF.host)
@@ -80,6 +82,10 @@ def __init__(self, session, cluster_node_name, cluster, datastore_regex):
                 str(about_info.version))
         self.update_status()
 
+    @property
+    def cluster_node_name(self):
+        return self._cluster_node_name
+
     def get_host_stats(self, refresh=False):
         """Return the current state of the cluster. If 'refresh' is
         True, run the update first.
@@ -88,6 +94,11 @@ def get_host_stats(self, refresh=False):
             self.update_status()
         return self._stats
 
+    def get_host_name(self, host_ref):
+        if not self._host_ref_to_name:
+            self.update_status()
+        return self._host_ref_to_name[v_util.get_moref_value(host_ref)]
+
     def update_status(self):
         """Update the current state of the cluster."""
         data = {}
@@ -125,6 +136,8 @@ def update_status(self):
         }
 
         for host, stats in per_host_stats.items():
+            obj = stats.pop("obj")
+            self._host_ref_to_name[v_util.get_moref_value(obj)] = host
             data[host] = self._merge_stats(host, stats, defaults)
 
         cluster_stats = vm_util.aggregate_stats_from_cluster(per_host_stats)
diff --git a/nova/virt/vmwareapi/vm_util.py b/nova/virt/vmwareapi/vm_util.py
index c805d5c6723..81a73baa2ec 100644
--- a/nova/virt/vmwareapi/vm_util.py
+++ b/nova/virt/vmwareapi/vm_util.py
@@ -1734,6 +1734,7 @@ def _process_host_stats(obj, host_reservations_map):
         "memory_mb": mem_mb,
         "memory_mb_used": getattr(stats_summary, "overallMemoryUsage", 0),
         "cpu_info": _host_props_to_cpu_info(host_props),
+        "obj": obj.obj,
     }
 
     _set_hypervisor_type_and_version(stats, host_props)
diff --git a/nova/virt/vmwareapi/vmops.py b/nova/virt/vmwareapi/vmops.py
index 23f9c1c546f..a9523ea6895 100644
--- a/nova/virt/vmwareapi/vmops.py
+++ b/nova/virt/vmwareapi/vmops.py
@@ -3320,14 +3320,22 @@ def poll_rebooting_instances(self, timeout, instances):
 
     def get_info(self, instance, use_cache=True):
         """Return data about the VM instance."""
-        powerstate_property = 'runtime.powerState'
+        lst_properties = ["runtime.powerState"]
+        if CONF.vmware.hypervisor_mode == 'cluster_to_esxi':
+            lst_properties.append("runtime.host")
         # if use_cache is true, then we are fine with possibly slightly
         # outdated values in favour of less api load, so no polling of updates
         # => skip_update=use_cache
-        vm_props = self._get_instance_props(instance, [powerstate_property],
+        vm_props = self._get_instance_props(instance, lst_properties,
                                             skip_update=use_cache)
+        if CONF.vmware.hypervisor_mode == 'cluster_to_esxi':
+            node = self._vc_state.get_host_name(vm_props["runtime.host"])
+        else:
+            node = self._vc_state.cluster_node_name
         return hardware.InstanceInfo(
-            state=constants.POWER_STATES[vm_props[powerstate_property]])
+            state=constants.POWER_STATES[vm_props["runtime.powerState"]],
+            node=node
+            )
 
     def _get_diagnostics(self, instance):
         """Return data about VM diagnostics."""
@@ -4131,6 +4139,7 @@ def _get_vm_monitor_spec(self, vim):
             ["config.instanceUuid",
              "config.managedBy",
              "runtime.powerState",
+             "runtime.host",
              "summary.guest.toolsStatus",
              "summary.guest.toolsRunningStatus",
             ])

From cbbbb74491bf5d6cabafa5f7d3d494841190865c Mon Sep 17 00:00:00 2001
From: Fabian Wiesel <fabian.wiesel@sap.com>
Date: Tue, 21 Sep 2021 09:25:31 +0200
Subject: [PATCH 2/4] Host-Api: Create summary over all nodes

The code assumes that there is a single compute-node per host,
which is incorrect for ironic. By summarising over all hosts,
we get a correct response also for compute-hosts with more than
one compute-node.

Change-Id: Iaf6e2a72f6649e234660de47bec8b1da1ea1571e
---
 nova/api/openstack/compute/hosts.py | 31 +++++++++++++++++------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/nova/api/openstack/compute/hosts.py b/nova/api/openstack/compute/hosts.py
index 14d1a2666a5..a1ad8f5d9bf 100644
--- a/nova/api/openstack/compute/hosts.py
+++ b/nova/api/openstack/compute/hosts.py
@@ -204,20 +204,26 @@ def reboot(self, req, id):
         return self._host_power_action(req, host_name=id, action="reboot")
 
     @staticmethod
-    def _get_total_resources(host_name, compute_node):
+    def _get_total_resources(host_name, compute_nodes):
         return {'resource': {'host': host_name,
                              'project': '(total)',
-                             'cpu': compute_node.vcpus,
-                             'memory_mb': compute_node.memory_mb,
-                             'disk_gb': compute_node.local_gb}}
+                             'cpu': sum(cn.vcpus
+                                        for cn in compute_nodes),
+                             'memory_mb': sum(cn.memory_mb
+                                              for cn in compute_nodes),
+                             'disk_gb': sum(cn.local_gb
+                                            for cn in compute_nodes)}}
 
     @staticmethod
-    def _get_used_now_resources(host_name, compute_node):
+    def _get_used_now_resources(host_name, compute_nodes):
         return {'resource': {'host': host_name,
                              'project': '(used_now)',
-                             'cpu': compute_node.vcpus_used,
-                             'memory_mb': compute_node.memory_mb_used,
-                             'disk_gb': compute_node.local_gb_used}}
+                             'cpu': sum(cn.vcpus_used
+                                        for cn in compute_nodes),
+                             'memory_mb': sum(cn.memory_mb_used
+                                              for cn in compute_nodes),
+                             'disk_gb': sum(cn.local_gb_used
+                                            for cn in compute_nodes)}}
 
     @staticmethod
     def _get_resource_totals_from_instances(host_name, instances):
@@ -272,16 +278,15 @@ def show(self, req, id):
         try:
             mapping = objects.HostMapping.get_by_host(context, host_name)
             nova_context.set_target_cell(context, mapping.cell_mapping)
-            compute_node = (
-                objects.ComputeNode.get_first_node_by_host_for_old_compat(
-                    context, host_name))
+            compute_nodes = objects.ComputeNodeList.get_all_by_host(
+                    context, host_name)
             instances = self.api.instance_get_all_by_host(context, host_name)
         except (exception.ComputeHostNotFound,
                 exception.HostMappingNotFound) as e:
             raise webob.exc.HTTPNotFound(explanation=e.format_message())
-        resources = [self._get_total_resources(host_name, compute_node)]
+        resources = [self._get_total_resources(host_name, compute_nodes)]
         resources.append(self._get_used_now_resources(host_name,
-                                                      compute_node))
+                                                      compute_nodes))
         resources.append(self._get_resource_totals_from_instances(host_name,
                                                                   instances))
         by_proj_resources = self._get_resources_by_project(host_name,

From d2433df6ab8da0cf78d0e9813f22bceb4a56281b Mon Sep 17 00:00:00 2001
From: Fabian Wiesel <fabian.wiesel@sap.com>
Date: Thu, 7 Sep 2023 11:38:50 +0200
Subject: [PATCH 3/4] WIP: Cold-Migrate/Resize with multiple nodes

The driver and scheduler support conceptually multiple nodes
per compute host, but the resize api assumes a single node
per host as the only case of multiple nodes per host is ironic,
which does not support resizing or migration for obvious reasons.

This removes the assumption to fix resize/cold-migration on
hypervisors, which may support it (eventually vmware)

Change-Id: I9200dfea623458865f289c25fa5fb06a2140b1cb
---
 nova/compute/api.py | 38 +++++++++++++++++---------------------
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/nova/compute/api.py b/nova/compute/api.py
index 368f04e8b35..12be05a719a 100644
--- a/nova/compute/api.py
+++ b/nova/compute/api.py
@@ -4015,14 +4015,13 @@ def _validate_host_for_cold_migrate(
                 raise exception.ComputeHostNotFound(host=host_name)
 
             with nova_context.target_cell(context, hm.cell_mapping) as cctxt:
-                node = objects.ComputeNode.\
-                    get_first_node_by_host_for_old_compat(
-                        cctxt, host_name, use_slave=True)
+                nodes = objects.ComputeNodeList.get_all_by_host(
+                    cctxt, host_name, use_slave=True)
         else:
-            node = objects.ComputeNode.get_first_node_by_host_for_old_compat(
+            nodes = objects.ComputeNodeList.get_all_by_host(
                 context, host_name, use_slave=True)
 
-        return node
+        return nodes
 
     # TODO(stephenfin): This logic would be so much easier to grok if we
     # finally split resize and cold migration into separate code paths
@@ -4050,8 +4049,10 @@ def resize(self, context, instance, flavor_id=None, clean_shutdown=True,
             context, instance)
 
         if host_name is not None:
-            node = self._validate_host_for_cold_migrate(
+            nodes = self._validate_host_for_cold_migrate(
                 context, instance, host_name, allow_cross_cell_resize)
+        else:
+            nodes = None
 
         self._check_auto_disk_config(
             instance, auto_disk_config=auto_disk_config)
@@ -4076,9 +4077,8 @@ def resize(self, context, instance, flavor_id=None, clean_shutdown=True,
             if CONF.always_resize_on_same_host:
                 LOG.info('Setting resize to the same host')
                 host_name = instance.host
-                node = (
-                    objects.ComputeNode.get_first_node_by_host_for_old_compat(
-                        context, host_name, use_slave=True))
+                nodes = objects.ComputeNodeList.get_all_by_host(
+                            context, host_name, use_slave=True)
             new_flavor = flavors.get_flavor_by_flavor_id(
                 flavor_id, read_deleted="no")
             # NOTE(wenping): We use this instead of the 'block_accelerator'
@@ -4189,20 +4189,16 @@ def resize(self, context, instance, flavor_id=None, clean_shutdown=True,
         # which takes has filter_properties which in turn has
         # scheduler_hints (plural).
 
-        if host_name is None:
-            # If 'host_name' is not specified,
-            # clear the 'requested_destination' field of the RequestSpec
-            # except set the allow_cross_cell_move flag since conductor uses
-            # it prior to scheduling.
-            request_spec.requested_destination = objects.Destination(
-                allow_cross_cell_move=allow_cross_cell_resize)
+        if nodes and len(nodes) == 1:
+            node_name = nodes[0].hypervisor_hostname
         else:
-            # Set the host and the node so that the scheduler will
-            # validate them.
-            request_spec.requested_destination = objects.Destination(
-                host=node.host, node=node.hypervisor_hostname,
-                allow_cross_cell_move=allow_cross_cell_resize)
+            node_name = None
 
+        # Set the host and the node so that the scheduler will either
+        # validate them, or select one matching host and potentially node
+        request_spec.requested_destination = objects.Destination(
+            host=host_name, node=node_name,
+            allow_cross_cell_move=allow_cross_cell_resize)
         # Asynchronously RPC cast to conductor so the response is not blocked
         # during scheduling. If something fails the user can find out via
         # instance actions.

From 2516f3c5a36c2481d5d379fb2888151c219b1868 Mon Sep 17 00:00:00 2001
From: Fabian Wiesel <fabian.wiesel@sap.com>
Date: Wed, 13 Sep 2023 13:18:45 +0200
Subject: [PATCH 4/4] WIP: Per node live-migration

Change-Id: I1f118e639e69d0556fbfb0980ff0340d7525a003
---
 .../common_payloads/RequestSpecPayload.json   |  1 +
 nova/compute/api.py                           | 17 ++++---
 nova/compute/manager.py                       |  8 ++--
 nova/conductor/tasks/live_migrate.py          | 44 ++++++++++---------
 nova/notifications/objects/request_spec.py    |  5 ++-
 nova/objects/request_spec.py                  | 12 ++++-
 nova/scheduler/host_manager.py                | 18 +++++++-
 .../objects/test_notification.py              |  2 +-
 nova/virt/vmwareapi/driver.py                 |  3 +-
 9 files changed, 71 insertions(+), 39 deletions(-)

diff --git a/doc/notification_samples/common_payloads/RequestSpecPayload.json b/doc/notification_samples/common_payloads/RequestSpecPayload.json
index 3301c18c589..090af96d3e3 100644
--- a/doc/notification_samples/common_payloads/RequestSpecPayload.json
+++ b/doc/notification_samples/common_payloads/RequestSpecPayload.json
@@ -4,6 +4,7 @@
         "availability_zone": null,
         "flavor": {"$ref": "FlavorPayload.json#"},
         "ignore_hosts": null,
+        "ignore_nodes": null,
         "image": {"$ref": "ImageMetaPayload.json#"},
         "instance_uuid": "d5e6a7b7-80e5-4166-85a3-cd6115201082",
         "num_instances": 1,
diff --git a/nova/compute/api.py b/nova/compute/api.py
index 12be05a719a..4c277986b62 100644
--- a/nova/compute/api.py
+++ b/nova/compute/api.py
@@ -5475,15 +5475,13 @@ def evacuate(self, context, instance, host, on_shared_storage,
         # the pre-v2.29 API microversion, which wouldn't set force
         if force is False and host:
             nodes = objects.ComputeNodeList.get_all_by_host(context, host)
-            # NOTE(sbauza): Unset the host to make sure we call the scheduler
-            host = None
-            # FIXME(sbauza): Since only Ironic driver uses more than one
-            # compute per service but doesn't support evacuations,
-            # let's provide the first one.
-            target = nodes[0]
+            if len(nodes) == 1:
+                node = nodes[0].hypervisor_hostname
+            else:
+                node = None
             destination = objects.Destination(
-                host=target.host,
-                node=target.hypervisor_hostname
+                host=host,
+                node=node
             )
             request_spec.requested_destination = destination
 
@@ -5497,7 +5495,8 @@ def evacuate(self, context, instance, host, on_shared_storage,
                        bdms=None,
                        recreate=True,
                        on_shared_storage=on_shared_storage,
-                       host=host,
+                       # NOTE(sbauza): To make sure we call the scheduler
+                       host=None,
                        request_spec=request_spec,
                        )
 
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index e3075e5c026..70ff99f1653 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -8002,8 +8002,10 @@ def _detach_interface(self, context, instance, port_id):
 
     def _get_compute_info(self, host, nodename=None):
         if not nodename:
-            return objects.ComputeNode.get_first_node_by_host_for_old_compat(
-                self.context, host)
+            nodes = objects.ComputeNodeList.get_all_by_host(self.context, host)
+            if len(nodes) != 1:
+                raise exception.ComputeHostNotFound(host=host)
+            return nodes[0]
 
         return objects.ComputeNode.get_by_host_and_nodename(
             self.context, host, nodename)
@@ -8069,7 +8071,7 @@ def check_can_live_migrate_destination(self, ctxt, instance,
         src_compute_info = obj_base.obj_to_primitive(
             self._get_compute_info(ctxt, instance.host, instance.node))
         dst_compute_info = obj_base.obj_to_primitive(
-            self._get_compute_info(ctxt, self.host))
+            self._get_compute_info(ctxt, self.host, migration.dest_node))
         dest_check_data = self.driver.check_can_live_migrate_destination(ctxt,
             instance, src_compute_info, dst_compute_info,
             block_migration, disk_over_commit)
diff --git a/nova/conductor/tasks/live_migrate.py b/nova/conductor/tasks/live_migrate.py
index c9a34f90280..7b68ab4bab2 100644
--- a/nova/conductor/tasks/live_migrate.py
+++ b/nova/conductor/tasks/live_migrate.py
@@ -99,7 +99,8 @@ def _execute(self):
             # wants the scheduler to pick a destination host, or a host was
             # specified but is not forcing it, so they want the scheduler
             # filters to run on the specified host, like a scheduler hint.
-            self.destination, self.dest_node, self.limits = self._find_destination()
+            self.destination, self.dest_node, self.limits = \
+                self._find_destination()
         else:
             # This is the case that the user specified the 'force' flag when
             # live migrating with a specific destination host so the scheduler
@@ -110,7 +111,7 @@ def _execute(self):
             self._check_destination_has_enough_memory()
             source_node, dest_node = (
                 self._check_compatible_with_source_hypervisor(
-                    self.destination))
+                    self.destination, self.dest_node))
             # TODO(mriedem): Call select_destinations() with a
             # skip_filters=True flag so the scheduler does the work of claiming
             # resources on the destination in Placement but still bypass the
@@ -317,7 +318,7 @@ def _check_destination_is_not_source(self):
                     instance_id=self.instance.uuid, host=self.destination)
 
     def _check_destination_has_enough_memory(self):
-        compute = self._get_compute_info(self.destination)
+        compute = self._get_compute_info(self.destination, self.dest_node)
         free_ram_mb = compute.free_ram_mb
         total_ram_mb = compute.memory_mb
         mem_inst = self.instance.memory_mb
@@ -340,17 +341,19 @@ def _check_destination_has_enough_memory(self):
 
     def _get_compute_info(self, host, nodename=None):
         if not nodename:
-            return objects.ComputeNode.get_first_node_by_host_for_old_compat(
-                self.context, host)
+            nodes = objects.ComputeNodeList.get_all_by_host(self.context, host)
+            if len(nodes) != 1:
+                raise exception.ComputeHostNotFound(host=host)
+            return nodes[0]
 
         return objects.ComputeNode.get_by_host_and_nodename(
             self.context, host, nodename)
 
-    def _check_compatible_with_source_hypervisor(self, destination):
+    def _check_compatible_with_source_hypervisor(self, dest_host, dest_node):
         migration = self.migration
         source_info = self._get_compute_info(migration.source_compute,
                                              migration.source_node)
-        destination_info = self._get_compute_info(destination)
+        destination_info = self._get_compute_info(dest_host, dest_node)
 
         source_type = source_info.hypervisor_type
         destination_type = destination_info.hypervisor_type
@@ -469,14 +472,12 @@ def _get_destination_cell_mapping(self):
                 reason=(_('Unable to determine in which cell '
                           'destination host %s lives.') % self.destination))
 
-    def _get_request_spec_for_select_destinations(self, attempted_hosts=None):
+    def _get_request_spec_for_select_destinations(self):
         """Builds a RequestSpec that can be passed to select_destinations
 
         Used when calling the scheduler to pick a destination host for live
         migrating the instance.
 
-        :param attempted_hosts: List of host names to ignore in the scheduler.
-            This is generally at least seeded with the source host.
         :returns: nova.objects.RequestSpec object
         """
         # NOTE(fwiesel): In order to check the compatibility
@@ -530,14 +531,13 @@ def _get_request_spec_for_select_destinations(self, attempted_hosts=None):
 
     def _find_destination(self):
         # TODO(johngarbutt) this retry loop should be shared
-        attempted_hosts = [self.source]
-        request_spec = self._get_request_spec_for_select_destinations(
-            attempted_hosts)
+        attempted_nodes = [self.source_node]
+        request_spec = self._get_request_spec_for_select_destinations()
 
         host = None
         while host is None:
-            self._check_not_over_max_retries(attempted_hosts)
-            request_spec.ignore_hosts = attempted_hosts
+            self._check_not_over_max_retries(attempted_nodes)
+            request_spec.ignore_nodes = attempted_nodes
             try:
                 selection_lists = self.query_client.select_destinations(
                         self.context, request_spec, [self.instance.uuid],
@@ -546,6 +546,7 @@ def _find_destination(self):
                 # only one instance, and we don't care about any alternates.
                 selection = selection_lists[0][0]
                 host = selection.service_host
+                node = selection.nodename
             except messaging.RemoteError as ex:
                 # TODO(ShaoHe Feng) There maybe multi-scheduler, and the
                 # scheduling algorithm is R-R, we can let other scheduler try.
@@ -568,17 +569,18 @@ def _find_destination(self):
                         self.context, self.report_client,
                         self.instance.pci_requests.requests, provider_mapping)
             try:
-                self._check_compatible_with_source_hypervisor(host)
+                self._check_compatible_with_source_hypervisor(host, node)
                 self._call_livem_checks_on_host(host, provider_mapping)
             except (exception.Invalid, exception.MigrationPreCheckError) as e:
-                LOG.debug("Skipping host: %(host)s because: %(e)s",
-                    {"host": host, "e": e})
-                attempted_hosts.append(host)
+                LOG.debug("Skipping node: %(host)s/%(node)s because: %(e)s",
+                    {"host": host, "node": node, "e": e})
+                attempted_nodes.append(node)
                 # The scheduler would have created allocations against the
                 # selected destination host in Placement, so we need to remove
                 # those before moving on.
                 self._remove_host_allocations(selection.compute_node_uuid)
                 host = None
+                node = None
         # TODO(artom) We should probably just return the whole selection object
         # at this point.
         return (selection.service_host, selection.nodename, selection.limits)
@@ -595,11 +597,11 @@ def _remove_host_allocations(self, compute_node_uuid):
         self.report_client.remove_provider_tree_from_instance_allocation(
             self.context, self.instance.uuid, compute_node_uuid)
 
-    def _check_not_over_max_retries(self, attempted_hosts):
+    def _check_not_over_max_retries(self, attempted_nodes):
         if CONF.migrate_max_retries == -1:
             return
 
-        retries = len(attempted_hosts) - 1
+        retries = len(attempted_nodes) - 1
         if retries > CONF.migrate_max_retries:
             if self.migration:
                 self.migration.status = 'failed'
diff --git a/nova/notifications/objects/request_spec.py b/nova/notifications/objects/request_spec.py
index a46a71e8da6..8a748e6f625 100644
--- a/nova/notifications/objects/request_spec.py
+++ b/nova/notifications/objects/request_spec.py
@@ -26,10 +26,12 @@ class RequestSpecPayload(base.NotificationPayloadBase):
     # Version 1.1: Add force_hosts, force_nodes, ignore_hosts, image_meta,
     #              instance_group, requested_destination, retry,
     #              scheduler_hints and security_groups fields
-    VERSION = '1.1'
+    # Version 1.2: Add ignore_nodes field
+    VERSION = '1.2'
 
     SCHEMA = {
         'ignore_hosts': ('request_spec', 'ignore_hosts'),
+        'ignore_nodes': ('request_spec', 'ignore_nodes'),
         'instance_uuid': ('request_spec', 'instance_uuid'),
         'project_id': ('request_spec', 'project_id'),
         'user_id': ('request_spec', 'user_id'),
@@ -47,6 +49,7 @@ class RequestSpecPayload(base.NotificationPayloadBase):
         'force_hosts': fields.StringField(nullable=True),
         'force_nodes': fields.StringField(nullable=True),
         'ignore_hosts': fields.ListOfStringsField(nullable=True),
+        'ignore_nodes': fields.ListOfStringsField(nullable=True),
         'image_meta': fields.ObjectField('ImageMetaPayload', nullable=True),
         'instance_group': fields.ObjectField('ServerGroupPayload',
             nullable=True),
diff --git a/nova/objects/request_spec.py b/nova/objects/request_spec.py
index 67c9684057a..74f91047246 100644
--- a/nova/objects/request_spec.py
+++ b/nova/objects/request_spec.py
@@ -73,6 +73,8 @@ class RequestSpec(base.NovaObject):
         'num_instances': fields.IntegerField(default=1),
         # NOTE(alex_xu): This field won't be persisted.
         'ignore_hosts': fields.ListOfStringsField(nullable=True),
+        # NOTE(fabianw): This field won't be persisted
+        'ignore_nodes': fields.ListOfStringsField(nullable=True),
         # NOTE(mriedem): In reality, you can only ever have one
         # host in the force_hosts list. The fact this is a list
         # is a mistake perpetuated over time.
@@ -347,6 +349,7 @@ def from_primitives(cls, context, request_spec, filter_properties):
         spec._from_flavor(flavor)
         # Hydrate now from filter_properties
         spec.ignore_hosts = filter_properties.get('ignore_hosts')
+        spec.ignore_nodes = filter_properties.get('ignore_nodes')
         spec.force_hosts = filter_properties.get('force_hosts')
         spec.force_nodes = filter_properties.get('force_nodes')
         retry = filter_properties.get('retry', {})
@@ -460,6 +463,8 @@ def to_legacy_filter_properties_dict(self):
         filt_props = {}
         if self.obj_attr_is_set('ignore_hosts') and self.ignore_hosts:
             filt_props['ignore_hosts'] = self.ignore_hosts
+        if self.obj_attr_is_set('ignore_nodes') and self.ignore_nodes:
+            filt_props['ignore_nodes'] = self.ignore_nodes
         if self.obj_attr_is_set('force_hosts') and self.force_hosts:
             filt_props['force_hosts'] = self.force_hosts
         if self.obj_attr_is_set('force_nodes') and self.force_nodes:
@@ -527,6 +532,7 @@ def from_components(
         spec_obj._from_instance_pci_requests(pci_requests)
         spec_obj._from_instance_numa_topology(numa_topology)
         spec_obj.ignore_hosts = filter_properties.get('ignore_hosts')
+        spec_obj.ignore_nodes = filter_properties.get('ignore_nodes')
         spec_obj.force_hosts = filter_properties.get('force_hosts')
         spec_obj.force_nodes = filter_properties.get('force_nodes')
         spec_obj._from_retry(filter_properties.get('retry', {}))
@@ -619,10 +625,11 @@ def _from_db_object(context, spec, db_spec):
                 # None and we'll lose what is set (but not persisted) on the
                 # object.
                 continue
-            elif key in ('retry', 'ignore_hosts'):
+            elif key in ('retry', 'ignore_hosts', 'ignore_nodes'):
                 # NOTE(takashin): Do not override the 'retry' or 'ignore_hosts'
                 # fields which are not persisted. They are not lazy-loadable
                 # fields. If they are not set, set None.
+                # NOTE(fabianw): Same with 'ignore_nodes'
                 if not spec.obj_attr_is_set(key):
                     setattr(spec, key, None)
             elif key == "numa_topology":
@@ -704,7 +711,8 @@ def _get_update_primitives(self):
                 spec.instance_group.hosts = None
             # NOTE(mriedem): Don't persist these since they are per-request
             for excluded in ('retry', 'requested_destination',
-                             'requested_resources', 'ignore_hosts'):
+                             'requested_resources', 'ignore_hosts',
+                             'ignore_nodes'):
                 if excluded in spec and getattr(spec, excluded):
                     setattr(spec, excluded, None)
             # NOTE(stephenfin): Don't persist network metadata since we have
diff --git a/nova/scheduler/host_manager.py b/nova/scheduler/host_manager.py
index 23d72140839..ef3027f8719 100644
--- a/nova/scheduler/host_manager.py
+++ b/nova/scheduler/host_manager.py
@@ -49,6 +49,7 @@
 
 class ReadOnlyDict(IterableUserDict):
     """A read-only dict."""
+
     def __init__(self, source=None):
         self.data = {}
         if source:
@@ -497,6 +498,16 @@ def _strip_ignore_hosts(host_map, hosts_to_ignore):
             ignored_hosts_str = ', '.join(ignored_hosts)
             LOG.info('Host filter ignoring hosts: %s', ignored_hosts_str)
 
+        def _strip_ignore_nodes(host_map, nodes_to_ignore):
+            ignored_nodes = []
+            for node in nodes_to_ignore:
+                for (hostname, nodename) in list(host_map.keys()):
+                    if node.lower() == nodename.lower():
+                        del host_map[(hostname, nodename)]
+                        ignored_nodes.append(node)
+            ignored_nodes_str = ', '.join(ignored_nodes)
+            LOG.info('Host filter ignoring nodes: %s', ignored_nodes_str)
+
         def _match_forced_hosts(host_map, hosts_to_force):
             forced_hosts = []
             lowered_hosts_to_force = [host.lower() for host in hosts_to_force]
@@ -567,6 +578,7 @@ def _get_hosts_matching_request(hosts, requested_destination):
             return iter(requested_nodes)
 
         ignore_hosts = spec_obj.ignore_hosts or []
+        ignore_nodes = spec_obj.ignore_nodes or []
         force_hosts = spec_obj.force_hosts or []
         force_nodes = spec_obj.force_nodes or []
         requested_node = spec_obj.requested_destination
@@ -576,7 +588,7 @@ def _get_hosts_matching_request(hosts, requested_destination):
             # possible to any requested destination nodes before passing the
             # list to the filters
             hosts = _get_hosts_matching_request(hosts, requested_node)
-        if ignore_hosts or force_hosts or force_nodes:
+        if ignore_hosts or ignore_nodes or force_hosts or force_nodes:
             # NOTE(deva): we can't assume "host" is unique because
             #             one host may have many nodes.
             name_to_cls_map = {(x.host, x.nodename): x for x in hosts}
@@ -584,6 +596,10 @@ def _get_hosts_matching_request(hosts, requested_destination):
                 _strip_ignore_hosts(name_to_cls_map, ignore_hosts)
                 if not name_to_cls_map:
                     return []
+            if ignore_nodes:
+                _strip_ignore_nodes(name_to_cls_map, ignore_nodes)
+                if not name_to_cls_map:
+                    return []
             # NOTE(deva): allow force_hosts and force_nodes independently
             if force_hosts:
                 _match_forced_hosts(name_to_cls_map, force_hosts)
diff --git a/nova/tests/unit/notifications/objects/test_notification.py b/nova/tests/unit/notifications/objects/test_notification.py
index b66d91283ad..6166fbe2a10 100644
--- a/nova/tests/unit/notifications/objects/test_notification.py
+++ b/nova/tests/unit/notifications/objects/test_notification.py
@@ -427,7 +427,7 @@ def test_payload_is_not_generated_if_notification_format_is_unversioned(
     'MetricsNotification': '1.0-a73147b93b520ff0061865849d3dfa56',
     'MetricsPayload': '1.0-65c69b15b4de5a8c01971cb5bb9ab650',
     'NotificationPublisher': '2.2-ff8ef16673817ca7a3ea69c689e260c6',
-    'RequestSpecPayload': '1.1-64d30723a2e381d0cd6a16a877002c64',
+    'RequestSpecPayload': '1.2-6e4978f842a19991871904f126b97ecf',
     'SchedulerRetriesPayload': '1.0-03a07d09575ef52cced5b1b24301d0b4',
     'SelectDestinationsNotification': '1.0-a73147b93b520ff0061865849d3dfa56',
     'ServerGroupNotification': '1.0-a73147b93b520ff0061865849d3dfa56',
diff --git a/nova/virt/vmwareapi/driver.py b/nova/virt/vmwareapi/driver.py
index a9550f5bb50..66001899697 100644
--- a/nova/virt/vmwareapi/driver.py
+++ b/nova/virt/vmwareapi/driver.py
@@ -178,7 +178,6 @@ def __init__(self, virtapi, scheme="https"):
                                       self._nodename,
                                       self._cluster_ref,
                                       self._datastore_regex)
-                                        self._vc_state,
         self._vmops = vmops.VMwareVMOps(self._session,
                                         virtapi,
                                         self._volumeops,
@@ -499,6 +498,8 @@ def get_available_nodes(self, refresh=False):
         if CONF.vmware.hypervisor_mode == 'cluster':
             return [self._nodename]
 
+        return hosts.keys()
+
     def update_provider_tree(self, provider_tree, nodename, allocations=None):
         """Update a ProviderTree object with current resource provider,
         inventory information and CPU traits.