From 8567b63a114d5d20b3294d5d2b71488256a2ab89 Mon Sep 17 00:00:00 2001
From: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
Date: Wed, 29 Apr 2026 15:11:17 +0530
Subject: [PATCH 01/11] feat: enhance microceph orchestrator

Signed-off-by: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
---
 .gitignore                                    |   2 +
 .../src/microceph/client/cluster.py           |  57 +-
 microceph-orch/src/microceph/module.py        | 361 ++++++++--
 microceph-orch/tests/conftest.py              | 148 ++++
 microceph-orch/tests/stubs.py                 | 135 ++++
 microceph-orch/tests/test_client.py           | 188 +++++
 microceph-orch/tests/test_module.py           | 671 ++++++++++++++++++
 7 files changed, 1517 insertions(+), 45 deletions(-)
 create mode 100644 microceph-orch/tests/conftest.py
 create mode 100644 microceph-orch/tests/stubs.py
 create mode 100644 microceph-orch/tests/test_client.py
 create mode 100644 microceph-orch/tests/test_module.py
diff --git a/.gitignore b/.gitignore
index c0dcfe174..ad99d0017 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,5 @@ _build
 .vscode
 microceph/coverage.out
 microceph/coverage.html
+__pycache__/
+*.pyc
diff --git a/microceph-orch/src/microceph/client/cluster.py b/microceph-orch/src/microceph/client/cluster.py
index 6601e999a..e0173eb11 100644
--- a/microceph-orch/src/microceph/client/cluster.py
+++ b/microceph-orch/src/microceph/client/cluster.py
@@ -18,7 +18,7 @@ def get_cluster_members(self) -> list:
         """
         result = []
         cluster = self._get("/core/1.0/cluster")
-        members = cluster.get("metadata", {})
+        members = cluster.get("metadata") or []
         keys = ["name", "address", "status"]
         for member in members:
             result.append({k: v for k, v in member.items() if k in keys})
@@ -55,22 +55,22 @@ class ExtendedAPIService(service.BaseService):
     def list_services(self) -> list[dict]:
         """List all services."""
         services = self._get("/1.0/services")
-        return services.get("metadata")
+        return services.get("metadata") or []
 
     def list_resources(self) -> list[dict]:
         """List all resources."""
         nodes = self._get("/1.0/resources")
-        return nodes.get("metadata")
+        return nodes.get("metadata") or []
 
     def list_disks(self) -> list[dict]:
         """List all disks"""
         disks = self._get("/1.0/disks")
-        return disks.get("metadata")
+        return disks.get("metadata") or []
 
     def get_status(self) -> dict[str, dict]:
         """Get status of the cluster."""
         cluster = self._get("/1.0/status")
-        members = cluster.get("metadata", {})
+        members = cluster.get("metadata") or []
         return {
             member["name"]: {
                 "status": member["status"],
@@ -78,3 +78,50 @@ def get_status(self) -> dict[str, dict]:
             }
             for member in members
         }
+
+    def enable_service(self, name: str, payload: str = "", wait: bool = True) -> None:
+        """Enable a service on the cluster.
+
+        Sends a PUT request to /1.0/services/<name> with an EnableService payload.
+        The Go API dispatches this to ServicePlacementHandler which runs the full
+        placement lifecycle: PopulateParams, HospitalityCheck, ServiceInit,
+        PostPlacementCheck, DbUpdate.
+
+        :param name: service name (e.g. 'rgw', 'nfs', 'rbd-mirror')
+        :param payload: JSON string with service-specific parameters
+        :param wait: if True, the server waits for the service to be fully enabled
+        """
+        # Note: The "bool" key maps to Go's EnableService.Wait field which has
+        # the struct tag `json:"bool"` (upstream naming quirk in MicroCeph).
+        self._put(f"/1.0/services/{name}", json={
+            "name": name,
+            "bool": wait,
+            "payload": payload,
+        })
+
+    def delete_service(self, name: str) -> None:
+        """Delete/disable a service on the cluster.
+
+        :param name: service name (e.g. 'rgw', 'nfs', 'rbd-mirror')
+        """
+        self._delete(f"/1.0/services/{name}")
+
+    def restart_services(self, services: list[str]) -> None:
+        """Restart one or more services on the cluster.
+
+        Sends a POST to /1.0/services/restart with a list of service objects.
+
+        :param services: list of service names (e.g. ['mon', 'rgw'])
+        """
+        payload = [{"service": svc} for svc in services]
+        self._post("/1.0/services/restart", json=payload)
+
+    def delete_nfs_service(self, cluster_id: str) -> None:
+        """Delete/disable an NFS service by cluster ID.
+
+        NFS deletion requires the cluster_id in the request body, unlike
+        other services which are identified by URL path alone.
+
+        :param cluster_id: NFS cluster identifier
+        """
+        self._delete("/1.0/services/nfs", json={"cluster_id": cluster_id})
diff --git a/microceph-orch/src/microceph/module.py b/microceph-orch/src/microceph/module.py
index 403c7f028..9391af7e3 100644
--- a/microceph-orch/src/microceph/module.py
+++ b/microceph-orch/src/microceph/module.py
@@ -109,6 +109,8 @@ def available(self) -> Tuple[bool, str, Dict[str, Any]]:
             self.microceph.status.is_available()
         except RemoteException as e:
             return False, f"Cannot reach the MicroCeph API: {e}", {}
+        except Exception as e:
+            return False, f"Unexpected error reaching MicroCeph API: {e}", {}
 
         return True, "", {}
 
@@ -130,8 +132,18 @@ def get_hosts(self) -> List[HostSpec]:
         """
         specs = []
         for m in self.microceph.cluster.get_cluster_members():
-            addr, _, _ = m['address'].rpartition(":")
-            specs.append(HostSpec(m['name'], addr, status=m['status']))
+            # Address from microcluster is in "host:port" format.
+            # rpartition splits on the last ":" to separate host from port.
+            address = m.get('address', '')
+            addr, _, _ = address.rpartition(":")
+            if not addr:
+                # No ":" found; use the raw address (may be hostname only).
+                addr = address
+            specs.append(HostSpec(
+                m.get('name', ''),
+                addr,
+                status=m.get('status', 'unknown'),
+            ))
 
         return specs
 
@@ -142,16 +154,20 @@ def _get_service_hostlist(self, recorded_services: list) -> dict:
             service_name = record['service'] if not record['group_id'] else f"{record['service']}.{record['group_id']}"
             service_host = record['location']
             service_hostlist[service_name].append(service_host)
-            logger.info(f"microcephs record service({service_name}) at ({service_host}) configured({record['info']})")
+            logger.debug(f"microcephs record service({service_name}) at ({service_host}) configured({record['info']})")
         return service_hostlist
 
-    def _elaborate_service(self, service: str):
-        """Elaborate a service into id and type"""
-        if '.' in service:
-            segments = service.split('.')
-            return segments[0], segments[1]
-        else:
-            return service, ""
+    @staticmethod
+    def _parse_service_name(service_name: str) -> Tuple[str, str]:
+        """Split a service name into (type, id).
+
+        Handles dotted names like 'nfs.my.cluster' correctly by splitting
+        on the first dot only.
+
+        :return: (service_type, service_id); service_id is '' if no dot.
+        """
+        svc_type, _, svc_id = service_name.partition('.')
+        return svc_type, svc_id
 
     @handle_orch_error
     def describe_service(self,
@@ -169,12 +185,14 @@ def describe_service(self,
         service_descs = []
         for svc_name, hostlist in service_hostlist.items():
             spec = None
-            svc_type, svc_id = self._elaborate_service(svc_name)
-            logger.info(f"{svc_name} under description for filter {service_type}")
+            svc_type, svc_id = self._parse_service_name(svc_name)
+            logger.debug(f"{svc_name} under description for filter {service_type}")
 
-            # skip unrelated services if a specific daemon type is requested.
+            # Apply filters
             if service_type and svc_type != service_type:
                 continue
+            if service_name and svc_name != service_name:
+                continue
 
             if svc_type in daemon_spec_map:
                 spec = daemon_spec_map[svc_type](
@@ -209,19 +227,31 @@ def list_daemons(self,
         for svc in services:
             svc_daemon_type = svc['service']
             svc_hostname = svc['location']
-            svc_group_ip = svc['group_id']
+            svc_group_id = svc['group_id']
             svc_ip = None
             svc_ports = None
-            svc_name = f"{svc_daemon_type}.{svc_group_ip}" if svc_group_ip else svc_daemon_type
-            if daemon_type:
-                if svc_daemon_type != daemon_type:
-                    continue
+            svc_name = f"{svc_daemon_type}.{svc_group_id}" if svc_group_id else svc_daemon_type
+
+            # Apply filters
+            if daemon_type and svc_daemon_type != daemon_type:
+                continue
+            if host and svc_hostname != host:
+                continue
+            if daemon_id and svc_hostname != daemon_id:
+                continue
+            if service_name and svc_name != service_name:
+                continue
+
+            # Extract NFS-specific info (bind address and port)
+            if svc_daemon_type == 'nfs' and svc.get('info'):
+                try:
+                    info = json.loads(svc['info'])
+                    bind_addr = info.get('bind_address', '')
+                    svc_ip = None if '0.0.0.0' in bind_addr else bind_addr or None
+                    svc_ports = [info['bind_port']] if 'bind_port' in info else None
+                except (json.JSONDecodeError, KeyError) as e:
+                    logger.warning(f"Failed to parse NFS service info for {svc_hostname}: {e}")
 
-            if svc_daemon_type == 'nfs':
-                info = json.loads(svc['info'])
-                svc_ip = None if "0.0.0.0" in info['bind_address'] else info['bind_address']
-                svc_ports = [info['bind_port']]
-            
             descriptions.append(DaemonDescription(
                 service_name=svc_name,
                 daemon_type=svc_daemon_type,
@@ -231,7 +261,7 @@ def list_daemons(self,
                 ports=svc_ports
             ))
 
-        logger.info(descriptions)
+        logger.debug(f"list_daemons returning {len(descriptions)} daemons")
         return descriptions
 
     @handle_orch_error
@@ -240,38 +270,289 @@ def get_inventory(self,
                 refresh: bool = False
             ) -> List[InventoryHost]:
 
-        disks = self.microceph.services.list_disks()
-        disks_by_host = defaultdict(list)
-        for d in disks:
-            disks_by_host[d['location']].append(
-                Device(path=d['path'])
+        # Resolve which hosts to include based on the filter.
+        filter_hosts = None
+        if host_filter and host_filter.hosts:
+            filter_hosts = set(host_filter.hosts)
+
+        osd_disks = self.microceph.services.list_disks()
+
+        # Build inventory from OSD disk list (cluster-wide source of truth
+        # for which hosts exist and what disks they have).
+        devices_by_host: Dict[str, List[Device]] = defaultdict(list)
+        for d in osd_disks:
+            host = d.get('location', '')
+            if filter_hosts and host not in filter_hosts:
+                continue
+            devices_by_host[host].append(
+                Device(path=d.get('path', ''), available=False)
             )
 
+        # Ensure all cluster members appear even if they have no OSD disks.
+        try:
+            for member in self.microceph.cluster.get_cluster_members():
+                host = member.get('name', '')
+                if filter_hosts and host not in filter_hosts:
+                    continue
+                if host not in devices_by_host:
+                    devices_by_host[host] = []
+        except RemoteException:
+            pass
+
         inventory = []
-        for host, diskettes in disks_by_host.items():
+        for host, devs in devices_by_host.items():
             inventory.append(InventoryHost(
                 name=host,
-                devices=Devices(diskettes)
+                devices=Devices(devs)
             ))
 
         return inventory
 
+    def _get_placement_hosts(self, spec: ServiceSpec) -> List[str]:
+        """Extract target hosts from a ServiceSpec's placement.
+
+        Returns the list of hostnames from the placement spec.
+        Raises ValueError if no hosts are specified; callers must
+        provide explicit placement.
+        """
+        hosts = []
+        if spec.placement and spec.placement.hosts:
+            hosts = [str(h) for h in spec.placement.hosts]
+
+        if not hosts:
+            raise ValueError(
+                f"No placement hosts specified for {spec.service_type}. "
+                "Explicit host placement is required."
+            )
+
+        return hosts
+
+    def _get_existing_service_hosts(self, service_type: str) -> set:
+        """Return the set of hostnames that already run the given service type."""
+        try:
+            services = self.microceph.services.list_services()
+        except RemoteException:
+            logger.warning(f"Failed to list services while checking existing {service_type} hosts")
+            return set()
+
+        return {
+            svc['location'] for svc in services
+            if svc['service'] == service_type
+        }
+
+    def _apply_service(self, service_name: str, spec: ServiceSpec, payload: str) -> str:
+        """Common logic for applying (enabling) a service on the local node.
+
+        Validates placement hosts and checks whether the service is already
+        running. Calls enable_service once; the request goes to the local
+        node via the unix socket.
+
+        Returns a summary string on success.
+        Raises an exception if placement is invalid or the API call fails.
+
+        NOTE: Per-host targeting is not yet supported. The Python client
+        connects via unix socket to the local node only. MicroCeph's Go
+        client uses UseTarget() to proxy to specific hosts, but this is
+        not exposed through the socket. See todo #6.
+        Placement hosts are validated and logged, but the enable call
+        always runs on the local node regardless.
+        """
+        hosts = self._get_placement_hosts(spec)
+        existing = self._get_existing_service_hosts(service_name)
+
+        # Check if the service is already active on the local node.
+        # Since we can only target the local node, we check if any
+        # of the requested hosts that are local already have the service.
+        all_existing = all(h in existing for h in hosts)
+        if all_existing:
+            skipped_str = ', '.join(hosts)
+            logger.info(f"Skipping {service_name}: already active on {skipped_str}")
+            return f"{service_name}: already active on {skipped_str}"
+
+        if existing:
+            logger.info(
+                f"{service_name} already active on {', '.join(existing & set(hosts))}; "
+                f"enabling for remaining hosts"
+            )
+
+        logger.info(f"Enabling {service_name} (requested hosts: {', '.join(hosts)})")
+        self.microceph.services.enable_service(
+            name=service_name,
+            payload=payload,
+            wait=True,
+        )
+
+        new_hosts = [h for h in hosts if h not in existing]
+        parts = [f"enabled on {', '.join(new_hosts)}"]
+        skipped_hosts = [h for h in hosts if h in existing]
+        if skipped_hosts:
+            parts.append(f"already active on {', '.join(skipped_hosts)}")
+
+        return f"{service_name}: {'; '.join(parts)}"
+
+    @handle_orch_error
     def apply_rbd_mirror(self, spec: ServiceSpec) -> OrchResult[str]:
-        logger.info(f"Received Apply Request for RBD Mirror: Spec: {vars(spec).items()}")
-        raise NotImplementedError() 
+        """Enable the rbd-mirror service on the target hosts.
 
-    def apply_rgw(self, spec: RGWSpec) -> OrchResult[str]:
+        rbd-mirror is a client-like service with no additional parameters.
+        The Go API's ClientServicePlacement handler takes no payload.
         """
+        logger.debug(f"Applying rbd-mirror service, spec: {vars(spec)}")
+        return self._apply_service("rbd-mirror", spec, "{}")
 
-        :param spec:
-        :return:
+    @handle_orch_error
+    def apply_rgw(self, spec: RGWSpec) -> OrchResult[str]:
+        """Enable the RGW service on the target hosts.
+
+        Extracts port and SSL configuration from the RGWSpec and passes
+        them as the JSON payload to the MicroCeph enable_service API.
+
+        Note on SSL: The Ceph RGWSpec provides rgw_frontend_ssl_certificate
+        (a list of PEM cert strings) but has no private key field. MicroCeph's
+        Go API (RgwServicePlacement) requires both SSLCertificate and
+        SSLPrivateKey to enable SSL. Until the spec is extended or a separate
+        key source is added, SSL cannot be fully configured via the
+        orchestrator interface.
         """
-        raise NotImplementedError()
+        logger.debug(f"Applying RGW service, spec: {vars(spec)}")
+
+        # Build RGW-specific payload from the spec.
+        # Go's RgwServicePlacement expects: Port, SSLPort, SSLCertificate, SSLPrivateKey
+        # Field names match Go exported field names (no json tags defined,
+        # so encoding/json matches case-insensitively).
+        rgw_params: Dict[str, Any] = {}
+        if spec.rgw_frontend_port:
+            rgw_params['Port'] = spec.rgw_frontend_port
+
+        # TODO: SSL support for RGW via orchestrator interface.
+        #
+        # SSL cannot be configured through this path. MicroCeph's Go API requires
+        # both SSLCertificate and SSLPrivateKey as raw PEM material; if either is
+        # missing, SSL is skipped and RGW falls back to plain HTTP (see rgw.go).
+        # The Ceph RGWSpec only carries the cert chain (rgw_frontend_ssl_certificate)
+        # with no private key field, so we cannot supply both.
+        if spec.rgw_frontend_ssl_certificate:
+            logger.warning(
+                "RGWSpec provides SSL certificate but the Ceph orchestrator spec "
+                "has no private key field. MicroCeph requires both certificate and "
+                "private key for SSL. SSL will be skipped; RGW will be deployed "
+                "in non-SSL mode."
+            )
+
+        payload = json.dumps(rgw_params) if rgw_params else "{}"
+        return self._apply_service("rgw", spec, payload)
 
+    @handle_orch_error
     def apply_nfs(self, spec: NFSServiceSpec) -> OrchResult[str]:
+        """Enable the NFS service on the target hosts.
+
+        Extracts the NFS cluster ID and optional port from the
+        NFSServiceSpec and passes them as the JSON payload.
         """
+        logger.debug(f"Applying NFS service, spec: {vars(spec)}")
 
-        :param spec:
-        :return:
+        # Go's NFSServicePlacement expects: cluster_id, v4_min_version, bind_address, bind_port
+        # The Ceph NFSServiceSpec uses service_id as the NFS cluster identifier.
+        if not spec.service_id:
+            raise ValueError("NFS service_id (cluster_id) is required")
+
+        nfs_params: Dict[str, Any] = {
+            'cluster_id': spec.service_id,
+        }
+        if spec.port:
+            nfs_params['bind_port'] = spec.port
+        if spec.virtual_ip:
+            nfs_params['bind_address'] = spec.virtual_ip
+
+        payload = json.dumps(nfs_params)
+        return self._apply_service("nfs", spec, payload)
+
+    @handle_orch_error
+    def apply_mon(self, spec: ServiceSpec) -> OrchResult[str]:
+        """Enable the MON service on the target hosts."""
+        logger.debug(f"Applying MON service, spec: {vars(spec)}")
+        return self._apply_service("mon", spec, "{}")
+
+    @handle_orch_error
+    def apply_mgr(self, spec: ServiceSpec) -> OrchResult[str]:
+        """Enable the MGR service on the target hosts."""
+        logger.debug(f"Applying MGR service, spec: {vars(spec)}")
+        return self._apply_service("mgr", spec, "{}")
+
+    @handle_orch_error
+    def apply_mds(self, spec: ServiceSpec) -> OrchResult[str]:
+        """Enable the MDS service on the target hosts."""
+        logger.debug(f"Applying MDS service, spec: {vars(spec)}")
+        return self._apply_service("mds", spec, "{}")
+
+    @handle_orch_error
+    def apply_cephfs_mirror(self, spec: ServiceSpec) -> OrchResult[str]:
+        """Enable the cephfs-mirror service on the target hosts.
+
+        cephfs-mirror is a client-like service (same as rbd-mirror)
+        with no additional parameters.
         """
-        raise NotImplementedError()
+        logger.debug(f"Applying cephfs-mirror service, spec: {vars(spec)}")
+        return self._apply_service("cephfs-mirror", spec, "{}")
+
+    @handle_orch_error
+    def remove_service(self, service_name: str, force: bool = False) -> OrchResult[str]:
+        """Remove a service from the local node.
+
+        Sends a DELETE request to the local MicroCeph API. This removes
+        the service from the node connected via the unix socket only;
+        it does not remove the service cluster-wide. Per-host targeting
+        requires UseTarget support (see todo #6).
+
+        :param service_name: service type or type.id (e.g. "rgw", "nfs.mycluster")
+        :param force: unused, kept for interface compatibility
+        """
+        logger.info(f"Removing service: {service_name}, force={force}")
+
+        svc_type, svc_id = self._parse_service_name(service_name)
+
+        # NFS requires the cluster_id in the delete body
+        if svc_type == 'nfs':
+            if not svc_id:
+                raise ValueError("NFS removal requires service name in 'nfs.<cluster_id>' format")
+            self.microceph.services.delete_nfs_service(svc_id)
+        else:
+            self.microceph.services.delete_service(svc_type)
+
+        return f"Removed service {service_name}"
+
+    @handle_orch_error
+    def remove_host(self, host: str, force: bool = False,
+                    offline: bool = False, rm_crush_entry: bool = False) -> OrchResult[str]:
+        """Remove a host from the MicroCeph cluster.
+
+        :param host: hostname to remove
+        :param force: unused, kept for interface compatibility
+        :param offline: unused, kept for interface compatibility
+        :param rm_crush_entry: unused, kept for interface compatibility
+        """
+        logger.info(f"Removing host: {host}, force={force}")
+        self.microceph.cluster.remove(host)
+        return f"Removed host {host}"
+
+    @handle_orch_error
+    def service_action(self, action: str, service_name: str) -> OrchResult[List[str]]:
+        """Perform an action (restart) on a service.
+
+        Currently only 'restart' is supported via the MicroCeph API.
+
+        :param action: one of "start", "stop", "restart", "redeploy", "reconfig"
+        :param service_name: service type (e.g. "mon", "rgw")
+        """
+        logger.info(f"Service action: {action} on {service_name}")
+
+        if action != "restart":
+            raise NotImplementedError(
+                f"Service action '{action}' is not supported by MicroCeph. "
+                "Only 'restart' is currently available."
+            )
+
+        svc_type, _ = self._parse_service_name(service_name)
+
+        self.microceph.services.restart_services([svc_type])
+        return [f"Restarted {service_name}"]
diff --git a/microceph-orch/tests/conftest.py b/microceph-orch/tests/conftest.py
new file mode 100644
index 000000000..f2cf1581f
--- /dev/null
+++ b/microceph-orch/tests/conftest.py
@@ -0,0 +1,148 @@
+"""
+Test fixtures and mock setup for microceph-orch tests.
+
+Installs mock modules into sys.modules so that Ceph-internal and
+snap-only imports work outside the snap environment.
+"""
+
+import sys
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from stubs import (
+    OrchResult,
+    handle_orch_error,
+    CLICommandMeta,
+    HostSpec,
+    PlacementSpec,
+    ServiceSpec,
+    RGWSpec,
+    MONSpec,
+    MDSSpec,
+    NFSServiceSpec,
+    Device,
+    Devices,
+    InventoryFilter,
+    InventoryHost,
+    ServiceDescription,
+    DaemonDescription,
+    Orchestrator,
+    MgrModule,
+    NotifyType,
+)
+
+
+# ---------------------------------------------------------------------------
+# Install mocks into sys.modules BEFORE any microceph imports
+# ---------------------------------------------------------------------------
+
+def _install_mocks():
+    """Inject mock modules so `from ceph.deployment...` etc. work.
+
+    Also mocks snap-specific dependencies (requests_unixsocket, snaphelpers)
+    that are only available inside the snap environment.
+    """
+
+    # ceph.deployment.inventory
+    inv_mod = MagicMock()
+    inv_mod.Device = Device
+    inv_mod.Devices = Devices
+
+    # ceph.deployment.service_spec
+    spec_mod = MagicMock()
+    spec_mod.ServiceSpec = ServiceSpec
+    spec_mod.PlacementSpec = PlacementSpec
+    spec_mod.RGWSpec = RGWSpec
+    spec_mod.MONSpec = MONSpec
+    spec_mod.MDSSpec = MDSSpec
+    spec_mod.NFSServiceSpec = NFSServiceSpec
+
+    # ceph.deployment
+    deployment_mod = MagicMock()
+    deployment_mod.inventory = inv_mod
+    deployment_mod.service_spec = spec_mod
+
+    # ceph
+    ceph_mod = MagicMock()
+    ceph_mod.deployment = deployment_mod
+
+    # mgr_module
+    mgr_mod = MagicMock()
+    mgr_mod.MgrModule = MgrModule
+    mgr_mod.NotifyType = NotifyType
+
+    # orchestrator
+    orch_mod = MagicMock()
+    orch_mod.Orchestrator = Orchestrator
+    orch_mod.HostSpec = HostSpec
+    orch_mod.InventoryFilter = InventoryFilter
+    orch_mod.InventoryHost = InventoryHost
+    orch_mod.ServiceDescription = ServiceDescription
+    orch_mod.DaemonDescription = DaemonDescription
+    orch_mod.CLICommandMeta = CLICommandMeta
+    orch_mod.handle_orch_error = handle_orch_error
+    orch_mod.OrchResult = OrchResult
+
+    # snap-only deps
+    requests_unixsocket_mod = MagicMock()
+    requests_unixsocket_mod.DEFAULT_SCHEME = "http+unix://"
+    snaphelpers_mod = MagicMock()
+
+    sys.modules.update({
+        "ceph": ceph_mod,
+        "ceph.deployment": deployment_mod,
+        "ceph.deployment.inventory": inv_mod,
+        "ceph.deployment.service_spec": spec_mod,
+        "mgr_module": mgr_mod,
+        "orchestrator": orch_mod,
+        "requests_unixsocket": requests_unixsocket_mod,
+        "snaphelpers": snaphelpers_mod,
+    })
+
+
+# Install mocks before pytest collects test modules
+_install_mocks()
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+@pytest.fixture
+def mock_client():
+    """Return a mock Client with pre-wired service stubs."""
+    client = MagicMock()
+
+    # Default: cluster with 3 members
+    client.cluster.get_cluster_members.return_value = [
+        {"name": "node1", "address": "10.0.0.1:7443", "status": "ONLINE"},
+        {"name": "node2", "address": "10.0.0.2:7443", "status": "ONLINE"},
+        {"name": "node3", "address": "10.0.0.3:7443", "status": "ONLINE"},
+    ]
+
+    # Default: no services running
+    client.services.list_services.return_value = []
+
+    # Default: no disks
+    client.services.list_disks.return_value = []
+
+    # Default: no resources
+    client.services.list_resources.return_value = []
+
+    # Default: status available
+    client.status.is_available.return_value = None
+
+    return client
+
+
+@pytest.fixture
+def orchestrator(mock_client):
+    """Return a MicroCephOrchestrator with a mocked client."""
+    from microceph.module import MicroCephOrchestrator
+
+    with patch.object(MicroCephOrchestrator, "__init__", lambda self, *a, **kw: None):
+        orch = MicroCephOrchestrator.__new__(MicroCephOrchestrator)
+        orch.microceph = mock_client
+        orch.run = True
+    return orch
diff --git a/microceph-orch/tests/stubs.py b/microceph-orch/tests/stubs.py
new file mode 100644
index 000000000..e9b6be2a9
--- /dev/null
+++ b/microceph-orch/tests/stubs.py
@@ -0,0 +1,135 @@
+"""
+Minimal stubs for Ceph types used by the orchestrator module.
+
+The real types live in ceph.deployment.*, mgr_module, and orchestrator;
+which are only importable inside the Ceph snap environment. These stubs
+replicate just enough behaviour to test our code outside the snap.
+"""
+
+from typing import Optional, List, Dict, Any, Generic, TypeVar
+from functools import wraps
+
+T = TypeVar("T")
+
+
+class OrchResult(Generic[T]):
+    """Minimal OrchResult stub."""
+
+    def __init__(self, result: Optional[T] = None, exception: Optional[Exception] = None):
+        self.result = result
+        self.exception = exception
+        self.exception_str = str(exception) if exception else ""
+
+
+def handle_orch_error(f):
+    """Stub decorator that wraps return value in OrchResult and catches exceptions."""
+    @wraps(f)
+    def wrapper(*args, **kwargs):
+        try:
+            return OrchResult(f(*args, **kwargs))
+        except Exception as e:
+            return OrchResult(None, exception=e)
+    return wrapper
+
+
+class CLICommandMeta(type):
+    """No-op metaclass stub."""
+    pass
+
+
+class HostSpec:
+    def __init__(self, hostname, addr=None, labels=None, status=None, **kwargs):
+        self.hostname = hostname
+        self.addr = addr or hostname
+        self.labels = labels or []
+        self.status = status or ""
+
+
+class PlacementSpec:
+    def __init__(self, hosts=None, count=None, **kwargs):
+        self.hosts = hosts or []
+        self.count = count
+
+
+class ServiceSpec:
+    def __init__(self, service_type="", service_id=None, placement=None, **kwargs):
+        self.service_type = service_type
+        self.service_id = service_id
+        self.placement = placement
+
+
+class RGWSpec(ServiceSpec):
+    def __init__(self, rgw_frontend_port=None, rgw_frontend_ssl_certificate=None,
+                 ssl=False, **kwargs):
+        super().__init__(**kwargs)
+        self.rgw_frontend_port = rgw_frontend_port
+        self.rgw_frontend_ssl_certificate = rgw_frontend_ssl_certificate
+        self.ssl = ssl
+
+
+class MONSpec(ServiceSpec):
+    pass
+
+
+class MDSSpec(ServiceSpec):
+    pass
+
+
+class NFSServiceSpec(ServiceSpec):
+    def __init__(self, port=None, virtual_ip=None, **kwargs):
+        super().__init__(**kwargs)
+        self.port = port
+        self.virtual_ip = virtual_ip
+
+
+class Device:
+    def __init__(self, path="", available=None, **kwargs):
+        self.path = path
+        self.available = available
+
+
+class Devices:
+    def __init__(self, devices=None):
+        self.devices = devices or []
+
+
+class InventoryFilter:
+    def __init__(self, labels=None, hosts=None):
+        self.labels = labels
+        self.hosts = hosts
+
+
+class InventoryHost:
+    def __init__(self, name="", devices=None):
+        self.name = name
+        self.devices = devices
+
+
+class ServiceDescription:
+    def __init__(self, spec=None, running=0, **kwargs):
+        self.spec = spec
+        self.running = running
+
+
+class DaemonDescription:
+    def __init__(self, service_name="", daemon_type="", daemon_id="",
+                 hostname="", ip=None, ports=None, **kwargs):
+        self.service_name = service_name
+        self.daemon_type = daemon_type
+        self.daemon_id = daemon_id
+        self.hostname = hostname
+        self.ip = ip
+        self.ports = ports
+
+
+class Orchestrator:
+    pass
+
+
+class MgrModule:
+    def __init__(self, *args, **kwargs):
+        pass
+
+
+class NotifyType:
+    pass
diff --git a/microceph-orch/tests/test_client.py b/microceph-orch/tests/test_client.py
new file mode 100644
index 000000000..145a325d8
--- /dev/null
+++ b/microceph-orch/tests/test_client.py
@@ -0,0 +1,188 @@
+"""
+Tests for the MicroCeph client layer (cluster.py).
+"""
+
+import json
+import pytest
+from unittest.mock import MagicMock, patch
+
+from microceph.client.cluster import (
+    MicroClusterService,
+    StatusService,
+    ExtendedAPIService,
+)
+from microceph.client.service import RemoteException
+
+
+@pytest.fixture
+def mock_session():
+    return MagicMock()
+
+
+@pytest.fixture
+def endpoint():
+    return "http+unix://%2Ftmp%2Ftest.socket"
+
+
+# ===================================================================
+# MicroClusterService
+# ===================================================================
+
+class TestMicroClusterService:
+    def test_get_cluster_members(self, mock_session, endpoint):
+        svc = MicroClusterService(mock_session, endpoint)
+        mock_session.request.return_value = MagicMock(
+            status_code=200,
+            text='{}',
+            json=lambda: {
+                "metadata": [
+                    {"name": "n1", "address": "10.0.0.1:7443", "status": "ONLINE", "extra": "ignored"},
+                    {"name": "n2", "address": "10.0.0.2:7443", "status": "ONLINE"},
+                ]
+            },
+        )
+        members = svc.get_cluster_members()
+        assert len(members) == 2
+        assert members[0] == {"name": "n1", "address": "10.0.0.1:7443", "status": "ONLINE"}
+        # "extra" key should be filtered out
+        assert "extra" not in members[0]
+
+    def test_get_cluster_members_null_metadata(self, mock_session, endpoint):
+        svc = MicroClusterService(mock_session, endpoint)
+        mock_session.request.return_value = MagicMock(
+            status_code=200,
+            text='{}',
+            json=lambda: {"metadata": None},
+        )
+        members = svc.get_cluster_members()
+        assert members == []
+
+    def test_get_cluster_members_missing_metadata(self, mock_session, endpoint):
+        svc = MicroClusterService(mock_session, endpoint)
+        mock_session.request.return_value = MagicMock(
+            status_code=200,
+            text='{}',
+            json=lambda: {},
+        )
+        members = svc.get_cluster_members()
+        assert members == []
+
+
+# ===================================================================
+# ExtendedAPIService
+# ===================================================================
+
+class TestExtendedAPIService:
+    def test_list_services_returns_list(self, mock_session, endpoint):
+        svc = ExtendedAPIService(mock_session, endpoint)
+        mock_session.request.return_value = MagicMock(
+            status_code=200,
+            text='{}',
+            json=lambda: {"metadata": [{"service": "mon", "location": "n1"}]},
+        )
+        result = svc.list_services()
+        assert isinstance(result, list)
+        assert len(result) == 1
+
+    def test_list_services_null_metadata(self, mock_session, endpoint):
+        svc = ExtendedAPIService(mock_session, endpoint)
+        mock_session.request.return_value = MagicMock(
+            status_code=200,
+            text='{}',
+            json=lambda: {"metadata": None},
+        )
+        result = svc.list_services()
+        assert result == []
+
+    def test_list_disks_null_metadata(self, mock_session, endpoint):
+        svc = ExtendedAPIService(mock_session, endpoint)
+        mock_session.request.return_value = MagicMock(
+            status_code=200,
+            text='{}',
+            json=lambda: {"metadata": None},
+        )
+        result = svc.list_disks()
+        assert result == []
+
+    def test_list_resources_null_metadata(self, mock_session, endpoint):
+        svc = ExtendedAPIService(mock_session, endpoint)
+        mock_session.request.return_value = MagicMock(
+            status_code=200,
+            text='{}',
+            json=lambda: {"metadata": None},
+        )
+        result = svc.list_resources()
+        assert result == []
+
+    def test_enable_service_payload(self, mock_session, endpoint):
+        svc = ExtendedAPIService(mock_session, endpoint)
+        mock_session.request.return_value = MagicMock(
+            status_code=200,
+            text='{}',
+            json=lambda: {"metadata": None},
+        )
+        svc.enable_service(name="rgw", payload='{"Port": 8080}', wait=True)
+
+        call_args = mock_session.request.call_args
+        body = call_args.kwargs.get("json") or call_args[1].get("json")
+        assert body["name"] == "rgw"
+        assert body["bool"] is True
+        assert body["payload"] == '{"Port": 8080}'
+
+    def test_enable_service_wait_false(self, mock_session, endpoint):
+        svc = ExtendedAPIService(mock_session, endpoint)
+        mock_session.request.return_value = MagicMock(
+            status_code=200,
+            text='{}',
+            json=lambda: {"metadata": None},
+        )
+        svc.enable_service(name="mon", wait=False)
+
+        call_args = mock_session.request.call_args
+        body = call_args.kwargs.get("json") or call_args[1].get("json")
+        assert body["bool"] is False
+
+    def test_delete_service(self, mock_session, endpoint):
+        svc = ExtendedAPIService(mock_session, endpoint)
+        mock_session.request.return_value = MagicMock(
+            status_code=200,
+            text='{}',
+            json=lambda: {},
+        )
+        svc.delete_service("rgw")
+        call_args = mock_session.request.call_args
+        assert call_args.kwargs.get("method") == "delete" or call_args[0][0] == "delete"
+
+    def test_delete_nfs_service(self, mock_session, endpoint):
+        svc = ExtendedAPIService(mock_session, endpoint)
+        mock_session.request.return_value = MagicMock(
+            status_code=200,
+            text='{}',
+            json=lambda: {},
+        )
+        svc.delete_nfs_service("mycluster")
+        call_args = mock_session.request.call_args
+        body = call_args.kwargs.get("json") or call_args[1].get("json")
+        assert body == {"cluster_id": "mycluster"}
+
+    def test_restart_services(self, mock_session, endpoint):
+        svc = ExtendedAPIService(mock_session, endpoint)
+        mock_session.request.return_value = MagicMock(
+            status_code=200,
+            text='{}',
+            json=lambda: {},
+        )
+        svc.restart_services(["mon", "rgw"])
+        call_args = mock_session.request.call_args
+        body = call_args.kwargs.get("json") or call_args[1].get("json")
+        assert body == [{"service": "mon"}, {"service": "rgw"}]
+
+    def test_get_status_null_metadata(self, mock_session, endpoint):
+        svc = ExtendedAPIService(mock_session, endpoint)
+        mock_session.request.return_value = MagicMock(
+            status_code=200,
+            text='{}',
+            json=lambda: {"metadata": None},
+        )
+        result = svc.get_status()
+        assert result == {}
diff --git a/microceph-orch/tests/test_module.py b/microceph-orch/tests/test_module.py
new file mode 100644
index 000000000..31f646753
--- /dev/null
+++ b/microceph-orch/tests/test_module.py
@@ -0,0 +1,671 @@
+"""
+Tests for MicroCephOrchestrator methods.
+"""
+
+import json
+import pytest
+from unittest.mock import MagicMock
+
+from stubs import (
+    HostSpec,
+    PlacementSpec,
+    ServiceSpec,
+    RGWSpec,
+    NFSServiceSpec,
+    InventoryFilter,
+)
+
+from microceph.client.service import RemoteException
+
+
+# ===================================================================
+# available()
+# ===================================================================
+
+class TestAvailable:
+    def test_available_success(self, orchestrator, mock_client):
+        ok, msg, info = orchestrator.available()
+        assert ok is True
+        assert msg == ""
+        mock_client.status.is_available.assert_called_once()
+
+    def test_available_remote_error(self, orchestrator, mock_client):
+        mock_client.status.is_available.side_effect = RemoteException("socket gone")
+        ok, msg, _ = orchestrator.available()
+        assert ok is False
+        assert "Cannot reach" in msg
+
+    def test_available_unexpected_error(self, orchestrator, mock_client):
+        mock_client.status.is_available.side_effect = OSError("permission denied")
+        ok, msg, _ = orchestrator.available()
+        assert ok is False
+        assert "Unexpected error" in msg
+
+
+# ===================================================================
+# get_hosts()
+# ===================================================================
+
+class TestGetHosts:
+    def test_get_hosts_basic(self, orchestrator, mock_client):
+        result = orchestrator.get_hosts()
+        hosts = result.result
+        assert len(hosts) == 3
+        assert hosts[0].hostname == "node1"
+        assert hosts[0].addr == "10.0.0.1"
+        assert hosts[0].status == "ONLINE"
+
+    def test_get_hosts_strips_port(self, orchestrator, mock_client):
+        mock_client.cluster.get_cluster_members.return_value = [
+            {"name": "h1", "address": "192.168.1.100:9443", "status": "ONLINE"},
+        ]
+        result = orchestrator.get_hosts()
+        assert result.result[0].addr == "192.168.1.100"
+
+    def test_get_hosts_no_port_in_address(self, orchestrator, mock_client):
+        mock_client.cluster.get_cluster_members.return_value = [
+            {"name": "h1", "address": "192.168.1.100", "status": "ONLINE"},
+        ]
+        result = orchestrator.get_hosts()
+        # Should fall back to raw address when no ":" present
+        assert result.result[0].addr == "192.168.1.100"
+
+    def test_get_hosts_missing_address(self, orchestrator, mock_client):
+        mock_client.cluster.get_cluster_members.return_value = [
+            {"name": "h1", "status": "ONLINE"},
+        ]
+        result = orchestrator.get_hosts()
+        # Missing address should not crash
+        assert result.result[0].hostname == "h1"
+
+    def test_get_hosts_empty_cluster(self, orchestrator, mock_client):
+        mock_client.cluster.get_cluster_members.return_value = []
+        result = orchestrator.get_hosts()
+        assert result.result == []
+
+    def test_get_hosts_api_error(self, orchestrator, mock_client):
+        mock_client.cluster.get_cluster_members.side_effect = RemoteException("fail")
+        result = orchestrator.get_hosts()
+        assert result.exception is not None
+
+
+# ===================================================================
+# describe_service()
+# ===================================================================
+
+class TestDescribeService:
+    def test_describe_service_all(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            {"service": "mon", "location": "node1", "group_id": "", "info": ""},
+            {"service": "mon", "location": "node2", "group_id": "", "info": ""},
+            {"service": "rgw", "location": "node1", "group_id": "", "info": ""},
+        ]
+        result = orchestrator.describe_service()
+        descs = result.result
+        assert len(descs) == 2  # mon and rgw grouped
+        types = {d.spec.service_type for d in descs}
+        assert types == {"mon", "rgw"}
+
+    def test_describe_service_filter_type(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            {"service": "mon", "location": "node1", "group_id": "", "info": ""},
+            {"service": "rgw", "location": "node1", "group_id": "", "info": ""},
+        ]
+        result = orchestrator.describe_service(service_type="rgw")
+        descs = result.result
+        assert len(descs) == 1
+        assert descs[0].spec.service_type == "rgw"
+
+    def test_describe_service_with_group_id(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            {"service": "nfs", "location": "node1", "group_id": "mycluster", "info": "{}"},
+        ]
+        result = orchestrator.describe_service()
+        descs = result.result
+        assert len(descs) == 1
+        assert descs[0].spec.service_id == "mycluster"
+
+    def test_describe_service_empty(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = []
+        result = orchestrator.describe_service()
+        assert result.result == []
+
+    def test_describe_service_running_count(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            {"service": "mon", "location": "node1", "group_id": "", "info": ""},
+            {"service": "mon", "location": "node2", "group_id": "", "info": ""},
+            {"service": "mon", "location": "node3", "group_id": "", "info": ""},
+        ]
+        result = orchestrator.describe_service()
+        assert result.result[0].running == 3
+
+
+# ===================================================================
+# list_daemons()
+# ===================================================================
+
+class TestListDaemons:
+    def test_list_daemons_basic(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            {"service": "mon", "location": "node1", "group_id": "", "info": ""},
+            {"service": "rgw", "location": "node2", "group_id": "", "info": ""},
+        ]
+        result = orchestrator.list_daemons()
+        daemons = result.result
+        assert len(daemons) == 2
+
+    def test_list_daemons_filter_daemon_type(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            {"service": "mon", "location": "node1", "group_id": "", "info": ""},
+            {"service": "rgw", "location": "node2", "group_id": "", "info": ""},
+        ]
+        result = orchestrator.list_daemons(daemon_type="mon")
+        assert len(result.result) == 1
+        assert result.result[0].daemon_type == "mon"
+
+    def test_list_daemons_filter_host(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            {"service": "mon", "location": "node1", "group_id": "", "info": ""},
+            {"service": "mon", "location": "node2", "group_id": "", "info": ""},
+        ]
+        result = orchestrator.list_daemons(host="node1")
+        assert len(result.result) == 1
+        assert result.result[0].hostname == "node1"
+
+    def test_list_daemons_filter_service_name(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            {"service": "nfs", "location": "node1", "group_id": "cluster1", "info": "{}"},
+            {"service": "nfs", "location": "node1", "group_id": "cluster2", "info": "{}"},
+        ]
+        result = orchestrator.list_daemons(service_name="nfs.cluster1")
+        assert len(result.result) == 1
+
+    def test_list_daemons_filter_daemon_id(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            {"service": "mon", "location": "node1", "group_id": "", "info": ""},
+            {"service": "mon", "location": "node2", "group_id": "", "info": ""},
+        ]
+        result = orchestrator.list_daemons(daemon_id="node2")
+        assert len(result.result) == 1
+        assert result.result[0].hostname == "node2"
+
+    def test_list_daemons_nfs_with_info(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            {
+                "service": "nfs",
+                "location": "node1",
+                "group_id": "mycluster",
+                "info": json.dumps({"bind_address": "10.0.0.5", "bind_port": 2049}),
+            },
+        ]
+        result = orchestrator.list_daemons()
+        daemon = result.result[0]
+        assert daemon.ip == "10.0.0.5"
+        assert daemon.ports == [2049]
+
+    def test_list_daemons_nfs_wildcard_address(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            {
+                "service": "nfs",
+                "location": "node1",
+                "group_id": "mycluster",
+                "info": json.dumps({"bind_address": "0.0.0.0", "bind_port": 2049}),
+            },
+        ]
+        result = orchestrator.list_daemons()
+        assert result.result[0].ip is None  # 0.0.0.0 should be None
+
+    def test_list_daemons_nfs_malformed_info(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            {"service": "nfs", "location": "node1", "group_id": "c1", "info": "not-json"},
+        ]
+        # Should not crash, just skip the NFS info parsing
+        result = orchestrator.list_daemons()
+        assert len(result.result) == 1
+        assert result.result[0].ip is None
+        assert result.result[0].ports is None
+
+    def test_list_daemons_nfs_empty_info(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            {"service": "nfs", "location": "node1", "group_id": "c1", "info": ""},
+        ]
+        result = orchestrator.list_daemons()
+        assert len(result.result) == 1
+
+    def test_list_daemons_nfs_null_info(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            {"service": "nfs", "location": "node1", "group_id": "c1", "info": None},
+        ]
+        result = orchestrator.list_daemons()
+        assert len(result.result) == 1
+
+
+# ===================================================================
+# get_inventory()
+# ===================================================================
+
+class TestGetInventory:
+    def test_get_inventory_shows_osd_disks(self, orchestrator, mock_client):
+        mock_client.services.list_disks.return_value = [
+            {"location": "node1", "path": "/dev/sdb"},
+            {"location": "node1", "path": "/dev/sdc"},
+        ]
+        mock_client.services.list_resources.return_value = {}
+        result = orchestrator.get_inventory()
+        inv = result.result
+        # 3 hosts (from cluster members), node1 has 2 OSD disks
+        hosts = {h.name for h in inv}
+        assert hosts == {"node1", "node2", "node3"}
+        node1 = [h for h in inv if h.name == "node1"][0]
+        assert len(node1.devices.devices) == 2
+        assert all(d.available is False for d in node1.devices.devices)
+
+    def test_get_inventory_multi_host(self, orchestrator, mock_client):
+        mock_client.services.list_disks.return_value = [
+            {"location": "node1", "path": "/dev/sda"},
+            {"location": "node2", "path": "/dev/sda"},
+        ]
+        mock_client.services.list_resources.return_value = {}
+        result = orchestrator.get_inventory()
+        hosts = {h.name for h in result.result}
+        assert hosts == {"node1", "node2", "node3"}
+
+    def test_get_inventory_with_host_filter(self, orchestrator, mock_client):
+        mock_client.services.list_disks.return_value = [
+            {"location": "node1", "path": "/dev/sda"},
+            {"location": "node2", "path": "/dev/sda"},
+            {"location": "node3", "path": "/dev/sda"},
+        ]
+        mock_client.services.list_resources.return_value = {}
+        filt = InventoryFilter(hosts=["node1", "node3"])
+        result = orchestrator.get_inventory(host_filter=filt)
+        hosts = {h.name for h in result.result}
+        assert hosts == {"node1", "node3"}
+
+    def test_get_inventory_empty(self, orchestrator, mock_client):
+        mock_client.services.list_disks.return_value = []
+        mock_client.services.list_resources.return_value = {}
+        mock_client.cluster.get_cluster_members.return_value = []
+        result = orchestrator.get_inventory()
+        assert result.result == []
+
+    def test_get_inventory_includes_members_without_disks(self, orchestrator, mock_client):
+        mock_client.services.list_disks.return_value = []
+        mock_client.services.list_resources.return_value = {}
+        # Default mock has 3 cluster members
+        result = orchestrator.get_inventory()
+        hosts = {h.name for h in result.result}
+        assert hosts == {"node1", "node2", "node3"}
+        # No devices on any host
+        for h in result.result:
+            assert len(h.devices.devices) == 0
+
+
+# ===================================================================
+# apply_rbd_mirror()
+# ===================================================================
+
+class TestApplyRbdMirror:
+    def test_apply_rbd_mirror_success(self, orchestrator, mock_client):
+        spec = ServiceSpec(
+            service_type="rbd-mirror",
+            placement=PlacementSpec(hosts=["node1", "node2"]),
+        )
+        result = orchestrator.apply_rbd_mirror(spec)
+        assert result.exception is None
+        assert "enabled on node1, node2" in result.result
+        # Single API call regardless of host count (local socket only)
+        mock_client.services.enable_service.assert_called_once()
+
+    def test_apply_rbd_mirror_no_placement(self, orchestrator, mock_client):
+        spec = ServiceSpec(service_type="rbd-mirror")
+        result = orchestrator.apply_rbd_mirror(spec)
+        assert result.exception is not None
+        assert "No placement hosts" in str(result.exception)
+
+    def test_apply_rbd_mirror_skips_existing(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            {"service": "rbd-mirror", "location": "node1", "group_id": "", "info": ""},
+        ]
+        spec = ServiceSpec(
+            service_type="rbd-mirror",
+            placement=PlacementSpec(hosts=["node1", "node2"]),
+        )
+        result = orchestrator.apply_rbd_mirror(spec)
+        assert result.exception is None
+        assert "already active on node1" in result.result
+        assert "enabled on node2" in result.result
+        mock_client.services.enable_service.assert_called_once()
+
+    def test_apply_rbd_mirror_all_existing(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            {"service": "rbd-mirror", "location": "node1", "group_id": "", "info": ""},
+        ]
+        spec = ServiceSpec(
+            service_type="rbd-mirror",
+            placement=PlacementSpec(hosts=["node1"]),
+        )
+        result = orchestrator.apply_rbd_mirror(spec)
+        assert result.exception is None
+        assert "already active on node1" in result.result
+        mock_client.services.enable_service.assert_not_called()
+
+    def test_apply_rbd_mirror_api_error(self, orchestrator, mock_client):
+        mock_client.services.enable_service.side_effect = RemoteException("fail")
+        spec = ServiceSpec(
+            service_type="rbd-mirror",
+            placement=PlacementSpec(hosts=["node1"]),
+        )
+        result = orchestrator.apply_rbd_mirror(spec)
+        assert result.exception is not None
+
+
+# ===================================================================
+# apply_rgw()
+# ===================================================================
+
+class TestApplyRgw:
+    def test_apply_rgw_basic(self, orchestrator, mock_client):
+        spec = RGWSpec(
+            service_type="rgw",
+            placement=PlacementSpec(hosts=["node1"]),
+        )
+        result = orchestrator.apply_rgw(spec)
+        assert result.exception is None
+        assert "enabled on node1" in result.result
+
+        call_kwargs = mock_client.services.enable_service.call_args
+        assert call_kwargs.kwargs["name"] == "rgw"
+
+    def test_apply_rgw_with_port(self, orchestrator, mock_client):
+        spec = RGWSpec(
+            service_type="rgw",
+            rgw_frontend_port=8080,
+            placement=PlacementSpec(hosts=["node1"]),
+        )
+        result = orchestrator.apply_rgw(spec)
+        assert result.exception is None
+
+        call_kwargs = mock_client.services.enable_service.call_args
+        payload = json.loads(call_kwargs.kwargs["payload"])
+        assert payload["Port"] == 8080
+
+    def test_apply_rgw_ssl_cert_without_key_warns(self, orchestrator, mock_client, caplog):
+        """SSL cert is present but no key; should warn and not send cert."""
+        spec = RGWSpec(
+            service_type="rgw",
+            rgw_frontend_ssl_certificate=["-----BEGIN CERT-----"],
+            placement=PlacementSpec(hosts=["node1"]),
+        )
+        import logging
+        with caplog.at_level(logging.WARNING):
+            result = orchestrator.apply_rgw(spec)
+
+        assert result.exception is None
+        # Cert should NOT be in payload (useless without key)
+        call_kwargs = mock_client.services.enable_service.call_args
+        payload = json.loads(call_kwargs.kwargs["payload"])
+        assert "SSLCertificate" not in payload
+        # Warning should be logged
+        assert any("private key" in r.message for r in caplog.records)
+
+    def test_apply_rgw_no_placement(self, orchestrator, mock_client):
+        spec = RGWSpec(service_type="rgw")
+        result = orchestrator.apply_rgw(spec)
+        assert result.exception is not None
+
+
+# ===================================================================
+# apply_nfs()
+# ===================================================================
+
+class TestApplyNfs:
+    def test_apply_nfs_basic(self, orchestrator, mock_client):
+        spec = NFSServiceSpec(
+            service_type="nfs",
+            service_id="mycluster",
+            placement=PlacementSpec(hosts=["node1"]),
+        )
+        result = orchestrator.apply_nfs(spec)
+        assert result.exception is None
+        assert "enabled on node1" in result.result
+
+        call_kwargs = mock_client.services.enable_service.call_args
+        payload = json.loads(call_kwargs.kwargs["payload"])
+        assert payload["cluster_id"] == "mycluster"
+
+    def test_apply_nfs_with_port(self, orchestrator, mock_client):
+        spec = NFSServiceSpec(
+            service_type="nfs",
+            service_id="mycluster",
+            port=12049,
+            placement=PlacementSpec(hosts=["node1"]),
+        )
+        result = orchestrator.apply_nfs(spec)
+        call_kwargs = mock_client.services.enable_service.call_args
+        payload = json.loads(call_kwargs.kwargs["payload"])
+        assert payload["bind_port"] == 12049
+
+    def test_apply_nfs_with_virtual_ip(self, orchestrator, mock_client):
+        spec = NFSServiceSpec(
+            service_type="nfs",
+            service_id="mycluster",
+            virtual_ip="10.0.0.100",
+            placement=PlacementSpec(hosts=["node1"]),
+        )
+        result = orchestrator.apply_nfs(spec)
+        call_kwargs = mock_client.services.enable_service.call_args
+        payload = json.loads(call_kwargs.kwargs["payload"])
+        assert payload["bind_address"] == "10.0.0.100"
+
+    def test_apply_nfs_missing_service_id(self, orchestrator, mock_client):
+        spec = NFSServiceSpec(
+            service_type="nfs",
+            placement=PlacementSpec(hosts=["node1"]),
+        )
+        result = orchestrator.apply_nfs(spec)
+        assert result.exception is not None
+        assert "cluster_id" in str(result.exception)
+
+    def test_apply_nfs_no_placement(self, orchestrator, mock_client):
+        spec = NFSServiceSpec(
+            service_type="nfs",
+            service_id="mycluster",
+        )
+        result = orchestrator.apply_nfs(spec)
+        assert result.exception is not None
+        assert "No placement hosts" in str(result.exception)
+
+    def test_apply_nfs_skips_existing(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            {"service": "nfs", "location": "node1", "group_id": "mycluster", "info": "{}"},
+        ]
+        spec = NFSServiceSpec(
+            service_type="nfs",
+            service_id="mycluster",
+            placement=PlacementSpec(hosts=["node1", "node2"]),
+        )
+        result = orchestrator.apply_nfs(spec)
+        assert "already active on node1" in result.result
+        assert "enabled on node2" in result.result
+        mock_client.services.enable_service.assert_called_once()
+
+
+# ===================================================================
+# apply_mon() / apply_mgr() / apply_mds()
+# ===================================================================
+
+# ===================================================================
+# apply_cephfs_mirror()
+# ===================================================================
+
+class TestApplyCephfsMirror:
+    def test_apply_cephfs_mirror_success(self, orchestrator, mock_client):
+        spec = ServiceSpec(
+            service_type="cephfs-mirror",
+            placement=PlacementSpec(hosts=["node1"]),
+        )
+        result = orchestrator.apply_cephfs_mirror(spec)
+        assert result.exception is None
+        assert "enabled on node1" in result.result
+        mock_client.services.enable_service.assert_called_once_with(
+            name="cephfs-mirror", payload="{}", wait=True,
+        )
+
+
+# ===================================================================
+# _parse_service_name()
+# ===================================================================
+
+class TestParseServiceName:
+    def test_simple_name(self, orchestrator):
+        assert orchestrator._parse_service_name("rgw") == ("rgw", "")
+
+    def test_dotted_name(self, orchestrator):
+        assert orchestrator._parse_service_name("nfs.mycluster") == ("nfs", "mycluster")
+
+    def test_multi_dot_name(self, orchestrator):
+        """Ensure dotted names like nfs.my.cluster split on first dot only."""
+        assert orchestrator._parse_service_name("nfs.my.cluster") == ("nfs", "my.cluster")
+
+    def test_empty_string(self, orchestrator):
+        assert orchestrator._parse_service_name("") == ("", "")
+
+
+# ===================================================================
+# describe_service() - service_name filter
+# ===================================================================
+
+class TestDescribeServiceNameFilter:
+    def test_filter_by_service_name(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            {"service": "nfs", "location": "node1", "group_id": "c1", "info": "{}"},
+            {"service": "nfs", "location": "node1", "group_id": "c2", "info": "{}"},
+        ]
+        result = orchestrator.describe_service(service_name="nfs.c1")
+        assert len(result.result) == 1
+        assert result.result[0].spec.service_id == "c1"
+
+
+# ===================================================================
+# remove_service() - dotted non-NFS
+# ===================================================================
+
+class TestRemoveServiceDotted:
+    def test_remove_dotted_non_nfs_service(self, orchestrator, mock_client):
+        """e.g. mds.myfs should call delete_service('mds')"""
+        result = orchestrator.remove_service("mds.myfs")
+        assert result.exception is None
+        mock_client.services.delete_service.assert_called_once_with("mds")
+
+
+# ===================================================================
+# apply_mon() / apply_mgr() / apply_mds() / apply_cephfs_mirror()
+# ===================================================================
+
+class TestApplyGenericServices:
+    def test_apply_mon(self, orchestrator, mock_client):
+        spec = ServiceSpec(
+            service_type="mon",
+            placement=PlacementSpec(hosts=["node1"]),
+        )
+        result = orchestrator.apply_mon(spec)
+        assert result.exception is None
+        assert "enabled on node1" in result.result
+        mock_client.services.enable_service.assert_called_once_with(
+            name="mon", payload="{}", wait=True,
+        )
+
+    def test_apply_mgr(self, orchestrator, mock_client):
+        spec = ServiceSpec(
+            service_type="mgr",
+            placement=PlacementSpec(hosts=["node1"]),
+        )
+        result = orchestrator.apply_mgr(spec)
+        assert result.exception is None
+        mock_client.services.enable_service.assert_called_once_with(
+            name="mgr", payload="{}", wait=True,
+        )
+
+    def test_apply_mds(self, orchestrator, mock_client):
+        spec = ServiceSpec(
+            service_type="mds",
+            placement=PlacementSpec(hosts=["node1"]),
+        )
+        result = orchestrator.apply_mds(spec)
+        assert result.exception is None
+        mock_client.services.enable_service.assert_called_once_with(
+            name="mds", payload="{}", wait=True,
+        )
+
+
+# ===================================================================
+# remove_service()
+# ===================================================================
+
+class TestRemoveService:
+    def test_remove_service_basic(self, orchestrator, mock_client):
+        result = orchestrator.remove_service("rgw")
+        assert result.exception is None
+        assert "Removed service rgw" in result.result
+        mock_client.services.delete_service.assert_called_once_with("rgw")
+
+    def test_remove_service_nfs_with_cluster_id(self, orchestrator, mock_client):
+        result = orchestrator.remove_service("nfs.mycluster")
+        assert result.exception is None
+        assert "Removed service nfs.mycluster" in result.result
+        mock_client.services.delete_nfs_service.assert_called_once_with("mycluster")
+
+    def test_remove_service_nfs_without_cluster_id(self, orchestrator, mock_client):
+        result = orchestrator.remove_service("nfs")
+        assert result.exception is not None
+        assert "cluster_id" in str(result.exception)
+
+    def test_remove_service_api_error(self, orchestrator, mock_client):
+        mock_client.services.delete_service.side_effect = RemoteException("fail")
+        result = orchestrator.remove_service("rgw")
+        assert result.exception is not None
+
+
+# ===================================================================
+# remove_host()
+# ===================================================================
+
+class TestRemoveHost:
+    def test_remove_host_success(self, orchestrator, mock_client):
+        result = orchestrator.remove_host("node2")
+        assert result.exception is None
+        assert "Removed host node2" in result.result
+        mock_client.cluster.remove.assert_called_once_with("node2")
+
+    def test_remove_host_api_error(self, orchestrator, mock_client):
+        mock_client.cluster.remove.side_effect = RemoteException("node not found")
+        result = orchestrator.remove_host("badhost")
+        assert result.exception is not None
+
+
+# ===================================================================
+# service_action()
+# ===================================================================
+
+class TestServiceAction:
+    def test_restart_service(self, orchestrator, mock_client):
+        result = orchestrator.service_action("restart", "mon")
+        assert result.exception is None
+        assert "Restarted mon" in result.result[0]
+        mock_client.services.restart_services.assert_called_once_with(["mon"])
+
+    def test_restart_service_with_id(self, orchestrator, mock_client):
+        result = orchestrator.service_action("restart", "nfs.mycluster")
+        assert result.exception is None
+        mock_client.services.restart_services.assert_called_once_with(["nfs"])
+
+    def test_unsupported_action(self, orchestrator, mock_client):
+        result = orchestrator.service_action("stop", "mon")
+        assert result.exception is not None
+        assert "not supported" in str(result.exception)
+
+    def test_restart_api_error(self, orchestrator, mock_client):
+        mock_client.services.restart_services.side_effect = RemoteException("fail")
+        result = orchestrator.service_action("restart", "mon")
+        assert result.exception is not None

From 0d4794912f93b51bfbd00688af519194470f064e Mon Sep 17 00:00:00 2001
From: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
Date: Wed, 29 Apr 2026 15:11:25 +0530
Subject: [PATCH 02/11] ci: add orchestrator test coverage

Signed-off-by: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
---
 .github/workflows/tests.yml |  25 ++++-
 tests/scripts/test-orch.sh  | 210 ++++++++++++++++++++++++++++++++++++
 2 files changed, 233 insertions(+), 2 deletions(-)
 create mode 100755 tests/scripts/test-orch.sh

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 07595a261..077277175 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -183,11 +183,17 @@ jobs:
         with:
           go-version: 1.25.x
 
-      - name: Run unit-tests
+      - name: Run Go unit-tests
         run: |
           cd microceph
           make check-unit
 
+      - name: Run orchestrator Python unit-tests
+        run: |
+          pip install pytest
+          cd microceph-orch
+          PYTHONPATH=src:tests python3 -m pytest tests/ -v
+
   single-system-tests:
     name: Single node with encryption
     runs-on: ubuntu-22.04
@@ -237,7 +243,7 @@ jobs:
           sudo microceph.ceph mgr module enable microceph
           sudo microceph.ceph orch set backend microceph
 
-      - name: Verify Orch module
+      - name: Verify Orch module (basic)
         run: |
           set -eux
           hn=$(hostname)
@@ -333,6 +339,11 @@ jobs:
       - name: Disable RGW
         run: ~/actionutils.sh disable_rgw
 
+      - name: Run orchestrator integration tests
+        run: |
+          set -eux
+          bash tests/scripts/test-orch.sh microceph.ceph
+
       - name: Enable RGW with SSL enabled
         run: ~/actionutils.sh enable_rgw_ssl
 
@@ -523,6 +534,16 @@ jobs:
       - name: Test RGW certificate rotation with --target (cross-node)
         run: ~/actionutils.sh headexec test_certificate_set_rgw_target node-wrk1
 
+      - name: Enable orchestrator and run integration tests
+        run: |
+          set -eux
+          lxc exec node-wrk0 -- sh -c "microceph.ceph mgr module enable microceph"
+          lxc exec node-wrk0 -- sh -c "microceph.ceph orch set backend microceph"
+          sleep 10
+          lxc exec node-wrk0 -- sh -c "microceph.ceph orch status"
+          lxc file push tests/scripts/test-orch.sh node-wrk0/tmp/test-orch.sh
+          lxc exec node-wrk0 -- bash /tmp/test-orch.sh microceph.ceph
+
       - name: Print logs for failure
         if: failure()
         run: |
diff --git a/tests/scripts/test-orch.sh b/tests/scripts/test-orch.sh
new file mode 100755
index 000000000..46423fd44
--- /dev/null
+++ b/tests/scripts/test-orch.sh
@@ -0,0 +1,210 @@
+#!/bin/bash
+# Integration tests for the MicroCeph orchestrator module.
+#
+# Can be run standalone against a pre-configured cluster:
+#   ./test-orch.sh [ceph-command-prefix]
+#
+# Or sourced from actionutils.sh for CI.
+#
+# Expects a bootstrapped MicroCeph cluster with the orchestrator enabled.
+# The first argument is the prefix for ceph commands (default: "microceph.ceph").
+
+set -uo pipefail
+
+CEPH="${1:-microceph.ceph}"
+PASS=0
+FAIL=0
+ERRORS=()
+
+run_ceph() { sudo $CEPH "$@" 2>&1; }
+
+# --- Test helpers ---
+
+assert_contains() {
+    local desc="$1" needle="$2" haystack="$3"
+    if echo "$haystack" | grep -qE "$needle"; then
+        echo "  PASS $desc"
+        PASS=$((PASS + 1))
+    else
+        echo "  FAIL $desc (expected to contain: '$needle')"
+        FAIL=$((FAIL + 1))
+        ERRORS+=("$desc")
+    fi
+}
+
+assert_not_contains() {
+    local desc="$1" needle="$2" haystack="$3"
+    if echo "$haystack" | grep -qE "$needle"; then
+        echo "  FAIL $desc (should NOT contain: '$needle')"
+        FAIL=$((FAIL + 1))
+        ERRORS+=("$desc")
+    else
+        echo "  PASS $desc"
+        PASS=$((PASS + 1))
+    fi
+}
+
+assert_exit_ok() {
+    local desc="$1"
+    shift
+    if "$@" >/dev/null 2>&1; then
+        echo "  PASS $desc"
+        PASS=$((PASS + 1))
+    else
+        echo "  FAIL $desc (non-zero exit)"
+        FAIL=$((FAIL + 1))
+        ERRORS+=("$desc")
+    fi
+}
+
+# ===================================================================
+echo "=== 1. Orchestrator status ==="
+# ===================================================================
+
+output=$(run_ceph orch status)
+assert_contains "backend is microceph" "microceph" "$output"
+assert_contains "available" "Available: Yes" "$output"
+
+# ===================================================================
+echo "=== 2. Host listing ==="
+# ===================================================================
+
+output=$(run_ceph orch host ls)
+host_count=$(echo "$output" | grep -c "hosts in cluster" || true)
+assert_contains "hosts listed" "hosts in cluster" "$output"
+
+# ===================================================================
+echo "=== 3. Service listing (baseline) ==="
+# ===================================================================
+
+output=$(run_ceph orch ls)
+assert_contains "mon service" "mon" "$output"
+assert_contains "mgr service" "mgr" "$output"
+
+# ===================================================================
+echo "=== 4. Daemon listing ==="
+# ===================================================================
+
+output=$(run_ceph orch ps)
+assert_contains "has daemons" "mon" "$output"
+
+# Get first hostname for filter tests
+first_host=$(run_ceph orch host ls | awk 'NR==2{print $1}')
+if [ -n "$first_host" ]; then
+    output=$(run_ceph orch ps --hostname "$first_host")
+    assert_contains "ps filtered to host" "$first_host" "$output"
+
+    output=$(run_ceph orch ps --daemon-type mon)
+    assert_contains "ps filtered to mon type" "mon" "$output"
+fi
+
+# ===================================================================
+echo "=== 5. Device listing ==="
+# ===================================================================
+
+output=$(run_ceph orch device ls)
+# Just verify it doesn't error out
+assert_exit_ok "device ls succeeds" run_ceph orch device ls
+
+# ===================================================================
+echo "=== 6. Apply RGW ==="
+# ===================================================================
+
+# Clean up RGW if it was already enabled by prior test steps
+run_ceph orch rm rgw >/dev/null 2>&1 || true
+sleep 3
+
+# Use first host for placement
+placement="${first_host:-$(hostname)}"
+output=$(run_ceph orch apply rgw default --placement="$placement")
+assert_contains "rgw applied" "enabled|already active" "$output"
+assert_contains "rgw applied" "enabled" "$output"
+sleep 5
+
+output=$(run_ceph orch ls)
+assert_contains "rgw in service list" "rgw" "$output"
+
+output=$(run_ceph orch ps --daemon-type rgw)
+assert_contains "rgw daemon visible" "rgw" "$output"
+
+# ===================================================================
+echo "=== 7. Apply NFS ==="
+# ===================================================================
+
+output=$(run_ceph orch apply nfs testcluster --placement="$placement")
+assert_contains "nfs applied" "enabled|already active" "$output"
+sleep 5
+
+output=$(run_ceph orch ls)
+# NFS may not appear in service list if the service failed to start
+# on the backend (e.g. missing kernel modules on CI runners).
+if echo "$output" | grep -q "nfs.testcluster"; then
+    echo "  PASS nfs in service list"
+    PASS=$((PASS + 1))
+    NFS_RUNNING=true
+else
+    echo "  WARN nfs not in service list (service may have failed to start; skipping daemon check)"
+    NFS_RUNNING=false
+fi
+
+if [ "$NFS_RUNNING" = true ]; then
+    output=$(run_ceph orch ps --daemon-type nfs)
+    assert_contains "nfs daemon visible" "nfs" "$output"
+fi
+
+# ===================================================================
+echo "=== 8. Restart service ==="
+# ===================================================================
+
+output=$(run_ceph orch restart mon)
+assert_contains "mon restarted" "Restarted" "$output"
+
+# ===================================================================
+echo "=== 9. Remove RGW ==="
+# ===================================================================
+
+output=$(run_ceph orch rm rgw)
+assert_contains "rgw removed" "Removed" "$output"
+sleep 3
+
+# Verify RGW is gone from the local node. In multi-node clusters,
+# RGW may still appear on other nodes if it was enabled independently
+# (orch rm only affects the local node; per-host targeting not yet
+# supported, see UseTarget limitation).
+local_host="${first_host:-$(hostname)}"
+output=$(run_ceph orch ps --hostname "$local_host" --daemon-type rgw)
+assert_not_contains "rgw gone from local node" "rgw" "$output"
+
+# ===================================================================
+echo "=== 10. Remove NFS ==="
+# ===================================================================
+
+output=$(run_ceph orch rm nfs.testcluster)
+# NFS removal may fail if the service never fully started
+if echo "$output" | grep -q "Removed"; then
+    echo "  PASS nfs removed"
+    PASS=$((PASS + 1))
+else
+    echo "  WARN nfs removal returned: $output (service may not have been running)"
+fi
+sleep 3
+
+output=$(run_ceph orch ls)
+assert_not_contains "nfs gone from list" "nfs" "$output"
+
+# ===================================================================
+echo ""
+echo "==========================================="
+echo " Results: $PASS passed, $FAIL failed"
+echo "==========================================="
+
+if [ ${#ERRORS[@]} -gt 0 ]; then
+    echo ""
+    echo "Failed tests:"
+    for e in "${ERRORS[@]}"; do
+        echo "  - $e"
+    done
+    exit 1
+fi
+
+exit 0

From 92c9eab73462359bf0b9afa4a33dffbe40c59850 Mon Sep 17 00:00:00 2001
From: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
Date: Mon, 4 May 2026 14:27:13 +0530
Subject: [PATCH 03/11] fix: avoid crash from f-string spec formatting in mgr
 context
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Python 3.12 enum machinery resolves str()/repr() on PlacementSpec and
HostPlacementSpec via Enum.__str__/__repr__ inside ceph-mgr, causing
AttributeError on _name_/_value_repr_. Two callsites trigger this:

- f"...{vars(spec)}" debug logs invoke dict.__repr__ which recurses into
  each value's __repr__, hitting PlacementSpec → HostPattern enum.
- str(h) in _get_placement_hosts falls through to Enum.__str__ on
  HostPlacementSpec (NamedTuple) instances.

Replace 7 vars(spec) f-strings with static debug strings. Use h.hostname
directly when extracting placement hosts.

Also surface raw output in test-orch.sh assert_contains/assert_not_contains
on FAIL to make future diagnosis cheaper.

Signed-off-by: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
---
 microceph-orch/src/microceph/module.py | 16 ++++++++--------
 tests/scripts/test-orch.sh             |  6 ++++++
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/microceph-orch/src/microceph/module.py b/microceph-orch/src/microceph/module.py
index 9391af7e3..a332e2130 100644
--- a/microceph-orch/src/microceph/module.py
+++ b/microceph-orch/src/microceph/module.py
@@ -317,7 +317,7 @@ def _get_placement_hosts(self, spec: ServiceSpec) -> List[str]:
         """
         hosts = []
         if spec.placement and spec.placement.hosts:
-            hosts = [str(h) for h in spec.placement.hosts]
+            hosts = [h.hostname if hasattr(h, 'hostname') else str(h) for h in spec.placement.hosts]
 
         if not hosts:
             raise ValueError(
@@ -397,7 +397,7 @@ def apply_rbd_mirror(self, spec: ServiceSpec) -> OrchResult[str]:
         rbd-mirror is a client-like service with no additional parameters.
         The Go API's ClientServicePlacement handler takes no payload.
         """
-        logger.debug(f"Applying rbd-mirror service, spec: {vars(spec)}")
+        logger.debug("Applying rbd-mirror service")
         return self._apply_service("rbd-mirror", spec, "{}")
 
     @handle_orch_error
@@ -414,7 +414,7 @@ def apply_rgw(self, spec: RGWSpec) -> OrchResult[str]:
         key source is added, SSL cannot be fully configured via the
         orchestrator interface.
         """
-        logger.debug(f"Applying RGW service, spec: {vars(spec)}")
+        logger.debug("Applying RGW service")
 
         # Build RGW-specific payload from the spec.
         # Go's RgwServicePlacement expects: Port, SSLPort, SSLCertificate, SSLPrivateKey
@@ -449,7 +449,7 @@ def apply_nfs(self, spec: NFSServiceSpec) -> OrchResult[str]:
         Extracts the NFS cluster ID and optional port from the
         NFSServiceSpec and passes them as the JSON payload.
         """
-        logger.debug(f"Applying NFS service, spec: {vars(spec)}")
+        logger.debug("Applying NFS service")
 
         # Go's NFSServicePlacement expects: cluster_id, v4_min_version, bind_address, bind_port
         # The Ceph NFSServiceSpec uses service_id as the NFS cluster identifier.
@@ -470,19 +470,19 @@ def apply_nfs(self, spec: NFSServiceSpec) -> OrchResult[str]:
     @handle_orch_error
     def apply_mon(self, spec: ServiceSpec) -> OrchResult[str]:
         """Enable the MON service on the target hosts."""
-        logger.debug(f"Applying MON service, spec: {vars(spec)}")
+        logger.debug("Applying MON service")
         return self._apply_service("mon", spec, "{}")
 
     @handle_orch_error
     def apply_mgr(self, spec: ServiceSpec) -> OrchResult[str]:
         """Enable the MGR service on the target hosts."""
-        logger.debug(f"Applying MGR service, spec: {vars(spec)}")
+        logger.debug("Applying MGR service")
         return self._apply_service("mgr", spec, "{}")
 
     @handle_orch_error
     def apply_mds(self, spec: ServiceSpec) -> OrchResult[str]:
         """Enable the MDS service on the target hosts."""
-        logger.debug(f"Applying MDS service, spec: {vars(spec)}")
+        logger.debug("Applying MDS service")
         return self._apply_service("mds", spec, "{}")
 
     @handle_orch_error
@@ -492,7 +492,7 @@ def apply_cephfs_mirror(self, spec: ServiceSpec) -> OrchResult[str]:
         cephfs-mirror is a client-like service (same as rbd-mirror)
         with no additional parameters.
         """
-        logger.debug(f"Applying cephfs-mirror service, spec: {vars(spec)}")
+        logger.debug("Applying cephfs-mirror service")
         return self._apply_service("cephfs-mirror", spec, "{}")
 
     @handle_orch_error
diff --git a/tests/scripts/test-orch.sh b/tests/scripts/test-orch.sh
index 46423fd44..a95f412dd 100755
--- a/tests/scripts/test-orch.sh
+++ b/tests/scripts/test-orch.sh
@@ -27,6 +27,9 @@ assert_contains() {
         PASS=$((PASS + 1))
     else
         echo "  FAIL $desc (expected to contain: '$needle')"
+        echo "  ----- actual output -----"
+        echo "$haystack" | sed 's/^/  | /'
+        echo "  -------------------------"
         FAIL=$((FAIL + 1))
         ERRORS+=("$desc")
     fi
@@ -36,6 +39,9 @@ assert_not_contains() {
     local desc="$1" needle="$2" haystack="$3"
     if echo "$haystack" | grep -qE "$needle"; then
         echo "  FAIL $desc (should NOT contain: '$needle')"
+        echo "  ----- actual output -----"
+        echo "$haystack" | sed 's/^/  | /'
+        echo "  -------------------------"
         FAIL=$((FAIL + 1))
         ERRORS+=("$desc")
     else

From d3202242a2c7ea56471896a18b9059d50575d672 Mon Sep 17 00:00:00 2001
From: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
Date: Mon, 25 May 2026 13:19:35 +0000
Subject: [PATCH 04/11] fix: address orchestrator review findings

- get_inventory: drop the dead list_resources() client method and its
  misleading test mocks; document that the inventory reports OSD-backed
  disks only, since /1.0/resources is local-node-only and the socket
  client cannot proxy to peers.
- _apply_service: report a race-free, honest summary
  "<svc>: enabled (requested placement: <hosts>)" instead of falsely
  claiming each placement host gained the service; per-host targeting is
  not supported via the unix socket so the actual placement cannot be
  confirmed.
- _get_existing_service_hosts: accept a group_id so distinct NFS clusters
  are not conflated when checking whether a service already runs.
- get_hosts: parse bracketed IPv6 "[addr]:port" addresses and leave bare
  IPv6 literals intact instead of truncating on the last colon.
- service.py: tolerate non-JSON / error-key-less HTTP error bodies when
  translating microcluster exceptions.
- _get_service_hostlist: use .get() for record fields, consistent with
  the rest of the module.
- remove_service: log when a non-NFS service id is ignored.
- test-orch.sh: quote $CEPH, drop the unused host_count assignment.
- pyproject.toml: declare pytest as a dev dependency group.

Signed-off-by: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
---
 microceph-orch/pyproject.toml                 |   5 +
 .../src/microceph/client/cluster.py           |   5 -
 .../src/microceph/client/service.py           |   9 +-
 microceph-orch/src/microceph/module.py        | 100 +++++++++++-------
 microceph-orch/tests/conftest.py              |   3 -
 microceph-orch/tests/stubs.py                 |   5 +
 microceph-orch/tests/test_client.py           |  10 --
 microceph-orch/tests/test_module.py           |  67 +++++++++---
 tests/scripts/test-orch.sh                    |   3 +-
 9 files changed, 133 insertions(+), 74 deletions(-)

diff --git a/microceph-orch/pyproject.toml b/microceph-orch/pyproject.toml
index 41e2e24bb..e0a9d0181 100644
--- a/microceph-orch/pyproject.toml
+++ b/microceph-orch/pyproject.toml
@@ -12,3 +12,8 @@ dependencies = [
 
 [tool.uv.sources]
 snap-helpers = { git = "https://github.com/albertodonato/snap-helpers" }
+
+[dependency-groups]
+dev = [
+    "pytest>=8.0",
+]
diff --git a/microceph-orch/src/microceph/client/cluster.py b/microceph-orch/src/microceph/client/cluster.py
index e0173eb11..a3e4311ce 100644
--- a/microceph-orch/src/microceph/client/cluster.py
+++ b/microceph-orch/src/microceph/client/cluster.py
@@ -57,11 +57,6 @@ def list_services(self) -> list[dict]:
         services = self._get("/1.0/services")
         return services.get("metadata") or []
 
-    def list_resources(self) -> list[dict]:
-        """List all resources."""
-        nodes = self._get("/1.0/resources")
-        return nodes.get("metadata") or []
-
     def list_disks(self) -> list[dict]:
         """List all disks"""
         disks = self._get("/1.0/disks")
diff --git a/microceph-orch/src/microceph/client/service.py b/microceph-orch/src/microceph/client/service.py
index 4eac0f4bc..e1965d841 100644
--- a/microceph-orch/src/microceph/client/service.py
+++ b/microceph-orch/src/microceph/client/service.py
@@ -145,8 +145,13 @@ def _request(self, method, path, **kwargs):  # noqa: C901 too complex
         try:
             response.raise_for_status()
         except HTTPError as e:
-            # Do some nice translating to microclusterdexceptions
-            error = response.json().get("error")
+            # Do some nice translating to microclusterdexceptions.
+            # The error body is normally JSON with an "error" key, but a
+            # non-JSON body or a missing key must not mask the real HTTPError.
+            try:
+                error = response.json().get("error") or ""
+            except ValueError:
+                error = ""
             if "remote with name" in error:
                 raise NodeAlreadyExistsException(
                     "Already node exists in the microcluster"
diff --git a/microceph-orch/src/microceph/module.py b/microceph-orch/src/microceph/module.py
index a332e2130..0554834dd 100644
--- a/microceph-orch/src/microceph/module.py
+++ b/microceph-orch/src/microceph/module.py
@@ -132,12 +132,16 @@ def get_hosts(self) -> List[HostSpec]:
         """
         specs = []
         for m in self.microceph.cluster.get_cluster_members():
-            # Address from microcluster is in "host:port" format.
-            # rpartition splits on the last ":" to separate host from port.
+            # microcluster addresses are "host:port"; IPv6 uses the
+            # bracketed "[addr]:port" form. Strip the port without
+            # corrupting a bare IPv6 literal (which also contains ":").
             address = m.get('address', '')
-            addr, _, _ = address.rpartition(":")
-            if not addr:
-                # No ":" found; use the raw address (may be hostname only).
+            if address.startswith('['):
+                addr = address[1:].partition(']')[0]
+            elif address.count(':') == 1:
+                addr = address.rsplit(':', 1)[0]
+            else:
+                # No port present (hostname-only or bare IPv6 literal).
                 addr = address
             specs.append(HostSpec(
                 m.get('name', ''),
@@ -151,10 +155,15 @@ def _get_service_hostlist(self, recorded_services: list) -> dict:
         """Get a dict describing the distribution of services"""
         service_hostlist = defaultdict(list)
         for record in recorded_services:
-            service_name = record['service'] if not record['group_id'] else f"{record['service']}.{record['group_id']}"
-            service_host = record['location']
+            service = record.get('service', '')
+            group_id = record.get('group_id', '')
+            service_name = service if not group_id else f"{service}.{group_id}"
+            service_host = record.get('location', '')
             service_hostlist[service_name].append(service_host)
-            logger.debug(f"microcephs record service({service_name}) at ({service_host}) configured({record['info']})")
+            logger.debug(
+                f"microceph record service({service_name}) at "
+                f"({service_host}) configured({record.get('info', '')})"
+            )
         return service_hostlist
 
     @staticmethod
@@ -269,6 +278,15 @@ def get_inventory(self,
                 host_filter: Optional[InventoryFilter] = None,
                 refresh: bool = False
             ) -> List[InventoryHost]:
+        """Report storage device inventory per host.
+
+        Devices are sourced from the cluster-wide OSD list (/1.0/disks), so
+        every reported device backs an existing OSD and is marked
+        unavailable. Discovery of free/unused disks is not exposed here:
+        /1.0/resources reports only the local node and the socket client
+        cannot proxy to peers (see the _apply_service note on targeting).
+        Hosts with no OSD disks are still listed so callers see every member.
+        """
 
         # Resolve which hosts to include based on the filter.
         filter_hosts = None
@@ -327,8 +345,14 @@ def _get_placement_hosts(self, spec: ServiceSpec) -> List[str]:
 
         return hosts
 
-    def _get_existing_service_hosts(self, service_type: str) -> set:
-        """Return the set of hostnames that already run the given service type."""
+    def _get_existing_service_hosts(self, service_type: str,
+                                    group_id: Optional[str] = None) -> set:
+        """Return the set of hostnames that already run the given service.
+
+        When group_id is provided (e.g. an NFS cluster id), only services
+        matching both the type and that group are counted, so distinct NFS
+        clusters are not conflated.
+        """
         try:
             services = self.microceph.services.list_services()
         except RemoteException:
@@ -336,16 +360,17 @@ def _get_existing_service_hosts(self, service_type: str) -> set:
             return set()
 
         return {
-            svc['location'] for svc in services
-            if svc['service'] == service_type
+            svc.get('location', '') for svc in services
+            if svc.get('service') == service_type
+            and (group_id is None or svc.get('group_id', '') == group_id)
         }
 
-    def _apply_service(self, service_name: str, spec: ServiceSpec, payload: str) -> str:
-        """Common logic for applying (enabling) a service on the local node.
+    def _apply_service(self, service_name: str, spec: ServiceSpec,
+                       payload: str, group_id: Optional[str] = None) -> str:
+        """Common logic for applying (enabling) a service.
 
         Validates placement hosts and checks whether the service is already
-        running. Calls enable_service once; the request goes to the local
-        node via the unix socket.
+        running, then calls enable_service once.
 
         Returns a summary string on success.
         Raises an exception if placement is invalid or the API call fails.
@@ -354,27 +379,20 @@ def _apply_service(self, service_name: str, spec: ServiceSpec, payload: str) ->
         connects via unix socket to the local node only. MicroCeph's Go
         client uses UseTarget() to proxy to specific hosts, but this is
         not exposed through the socket. See todo #6.
-        Placement hosts are validated and logged, but the enable call
-        always runs on the local node regardless.
+        Because the enable call cannot be confirmed per host, the summary
+        reports the requested placement rather than claiming each host was
+        individually provisioned.
         """
         hosts = self._get_placement_hosts(spec)
-        existing = self._get_existing_service_hosts(service_name)
+        existing = self._get_existing_service_hosts(service_name, group_id)
 
-        # Check if the service is already active on the local node.
-        # Since we can only target the local node, we check if any
-        # of the requested hosts that are local already have the service.
-        all_existing = all(h in existing for h in hosts)
-        if all_existing:
+        # If every requested host already runs the service, there is
+        # nothing to do.
+        if existing.issuperset(hosts):
             skipped_str = ', '.join(hosts)
             logger.info(f"Skipping {service_name}: already active on {skipped_str}")
             return f"{service_name}: already active on {skipped_str}"
 
-        if existing:
-            logger.info(
-                f"{service_name} already active on {', '.join(existing & set(hosts))}; "
-                f"enabling for remaining hosts"
-            )
-
         logger.info(f"Enabling {service_name} (requested hosts: {', '.join(hosts)})")
         self.microceph.services.enable_service(
             name=service_name,
@@ -382,13 +400,16 @@ def _apply_service(self, service_name: str, spec: ServiceSpec, payload: str) ->
             wait=True,
         )
 
-        new_hosts = [h for h in hosts if h not in existing]
-        parts = [f"enabled on {', '.join(new_hosts)}"]
-        skipped_hosts = [h for h in hosts if h in existing]
-        if skipped_hosts:
-            parts.append(f"already active on {', '.join(skipped_hosts)}")
+        # The enable call is served by the local node and cannot be
+        # confirmed per host (see note above), so report the request
+        # honestly rather than claiming every placement host gained the
+        # service.
+        summary = f"enabled (requested placement: {', '.join(hosts)})"
+        already_active = sorted(existing & set(hosts))
+        if already_active:
+            summary += f"; already active on {', '.join(already_active)}"
 
-        return f"{service_name}: {'; '.join(parts)}"
+        return f"{service_name}: {summary}"
 
     @handle_orch_error
     def apply_rbd_mirror(self, spec: ServiceSpec) -> OrchResult[str]:
@@ -465,7 +486,7 @@ def apply_nfs(self, spec: NFSServiceSpec) -> OrchResult[str]:
             nfs_params['bind_address'] = spec.virtual_ip
 
         payload = json.dumps(nfs_params)
-        return self._apply_service("nfs", spec, payload)
+        return self._apply_service("nfs", spec, payload, group_id=spec.service_id)
 
     @handle_orch_error
     def apply_mon(self, spec: ServiceSpec) -> OrchResult[str]:
@@ -517,6 +538,11 @@ def remove_service(self, service_name: str, force: bool = False) -> OrchResult[s
                 raise ValueError("NFS removal requires service name in 'nfs.<cluster_id>' format")
             self.microceph.services.delete_nfs_service(svc_id)
         else:
+            if svc_id:
+                logger.debug(
+                    f"Ignoring id '{svc_id}' for {svc_type}: MicroCeph manages "
+                    f"a single {svc_type} service per node"
+                )
             self.microceph.services.delete_service(svc_type)
 
         return f"Removed service {service_name}"
diff --git a/microceph-orch/tests/conftest.py b/microceph-orch/tests/conftest.py
index f2cf1581f..8d231248d 100644
--- a/microceph-orch/tests/conftest.py
+++ b/microceph-orch/tests/conftest.py
@@ -127,9 +127,6 @@ def mock_client():
     # Default: no disks
     client.services.list_disks.return_value = []
 
-    # Default: no resources
-    client.services.list_resources.return_value = []
-
     # Default: status available
     client.status.is_available.return_value = None
 
diff --git a/microceph-orch/tests/stubs.py b/microceph-orch/tests/stubs.py
index e9b6be2a9..1a950fd3a 100644
--- a/microceph-orch/tests/stubs.py
+++ b/microceph-orch/tests/stubs.py
@@ -4,6 +4,11 @@
 The real types live in ceph.deployment.*, mgr_module, and orchestrator;
 which are only importable inside the Ceph snap environment. These stubs
 replicate just enough behaviour to test our code outside the snap.
+
+NOTE: the stubs are intentionally plain classes. They do NOT reproduce
+the real Ceph enum / NamedTuple machinery (e.g. PlacementSpec with
+HostPlacementSpec entries), so bugs that depend on that behaviour are
+only caught by the integration suite (tests/scripts/test-orch.sh).
 """
 
 from typing import Optional, List, Dict, Any, Generic, TypeVar
diff --git a/microceph-orch/tests/test_client.py b/microceph-orch/tests/test_client.py
index 145a325d8..27b8514ab 100644
--- a/microceph-orch/tests/test_client.py
+++ b/microceph-orch/tests/test_client.py
@@ -104,16 +104,6 @@ def test_list_disks_null_metadata(self, mock_session, endpoint):
         result = svc.list_disks()
         assert result == []
 
-    def test_list_resources_null_metadata(self, mock_session, endpoint):
-        svc = ExtendedAPIService(mock_session, endpoint)
-        mock_session.request.return_value = MagicMock(
-            status_code=200,
-            text='{}',
-            json=lambda: {"metadata": None},
-        )
-        result = svc.list_resources()
-        assert result == []
-
     def test_enable_service_payload(self, mock_session, endpoint):
         svc = ExtendedAPIService(mock_session, endpoint)
         mock_session.request.return_value = MagicMock(
diff --git a/microceph-orch/tests/test_module.py b/microceph-orch/tests/test_module.py
index 31f646753..cbe6076c7 100644
--- a/microceph-orch/tests/test_module.py
+++ b/microceph-orch/tests/test_module.py
@@ -18,6 +18,12 @@
 from microceph.client.service import RemoteException
 
 
+def _svc(service, location, group_id="", info=""):
+    """Build a MicroCeph service record as returned by list_services()."""
+    return {"service": service, "location": location,
+            "group_id": group_id, "info": info}
+
+
 # ===================================================================
 # available()
 # ===================================================================
@@ -70,6 +76,17 @@ def test_get_hosts_no_port_in_address(self, orchestrator, mock_client):
         # Should fall back to raw address when no ":" present
         assert result.result[0].addr == "192.168.1.100"
 
+    def test_get_hosts_ipv6_address(self, orchestrator, mock_client):
+        mock_client.cluster.get_cluster_members.return_value = [
+            # bracketed IPv6 with port
+            {"name": "h1", "address": "[fe80::1]:7443", "status": "ONLINE"},
+            # bare IPv6 literal, no port
+            {"name": "h2", "address": "fe80::2", "status": "ONLINE"},
+        ]
+        result = orchestrator.get_hosts()
+        assert result.result[0].addr == "fe80::1"
+        assert result.result[1].addr == "fe80::2"
+
     def test_get_hosts_missing_address(self, orchestrator, mock_client):
         mock_client.cluster.get_cluster_members.return_value = [
             {"name": "h1", "status": "ONLINE"},
@@ -250,7 +267,6 @@ def test_get_inventory_shows_osd_disks(self, orchestrator, mock_client):
             {"location": "node1", "path": "/dev/sdb"},
             {"location": "node1", "path": "/dev/sdc"},
         ]
-        mock_client.services.list_resources.return_value = {}
         result = orchestrator.get_inventory()
         inv = result.result
         # 3 hosts (from cluster members), node1 has 2 OSD disks
@@ -265,7 +281,6 @@ def test_get_inventory_multi_host(self, orchestrator, mock_client):
             {"location": "node1", "path": "/dev/sda"},
             {"location": "node2", "path": "/dev/sda"},
         ]
-        mock_client.services.list_resources.return_value = {}
         result = orchestrator.get_inventory()
         hosts = {h.name for h in result.result}
         assert hosts == {"node1", "node2", "node3"}
@@ -276,7 +291,6 @@ def test_get_inventory_with_host_filter(self, orchestrator, mock_client):
             {"location": "node2", "path": "/dev/sda"},
             {"location": "node3", "path": "/dev/sda"},
         ]
-        mock_client.services.list_resources.return_value = {}
         filt = InventoryFilter(hosts=["node1", "node3"])
         result = orchestrator.get_inventory(host_filter=filt)
         hosts = {h.name for h in result.result}
@@ -284,14 +298,12 @@ def test_get_inventory_with_host_filter(self, orchestrator, mock_client):
 
     def test_get_inventory_empty(self, orchestrator, mock_client):
         mock_client.services.list_disks.return_value = []
-        mock_client.services.list_resources.return_value = {}
         mock_client.cluster.get_cluster_members.return_value = []
         result = orchestrator.get_inventory()
         assert result.result == []
 
     def test_get_inventory_includes_members_without_disks(self, orchestrator, mock_client):
         mock_client.services.list_disks.return_value = []
-        mock_client.services.list_resources.return_value = {}
         # Default mock has 3 cluster members
         result = orchestrator.get_inventory()
         hosts = {h.name for h in result.result}
@@ -313,7 +325,10 @@ def test_apply_rbd_mirror_success(self, orchestrator, mock_client):
         )
         result = orchestrator.apply_rbd_mirror(spec)
         assert result.exception is None
-        assert "enabled on node1, node2" in result.result
+        # Summary confirms enablement and echoes the requested placement
+        # without claiming each host was individually provisioned.
+        assert "enabled" in result.result
+        assert "requested placement: node1, node2" in result.result
         # Single API call regardless of host count (local socket only)
         mock_client.services.enable_service.assert_called_once()
 
@@ -324,8 +339,9 @@ def test_apply_rbd_mirror_no_placement(self, orchestrator, mock_client):
         assert "No placement hosts" in str(result.exception)
 
     def test_apply_rbd_mirror_skips_existing(self, orchestrator, mock_client):
+        # node1 already runs the service; node2 is also requested.
         mock_client.services.list_services.return_value = [
-            {"service": "rbd-mirror", "location": "node1", "group_id": "", "info": ""},
+            _svc("rbd-mirror", "node1"),
         ]
         spec = ServiceSpec(
             service_type="rbd-mirror",
@@ -334,12 +350,12 @@ def test_apply_rbd_mirror_skips_existing(self, orchestrator, mock_client):
         result = orchestrator.apply_rbd_mirror(spec)
         assert result.exception is None
         assert "already active on node1" in result.result
-        assert "enabled on node2" in result.result
+        assert "enabled" in result.result
         mock_client.services.enable_service.assert_called_once()
 
     def test_apply_rbd_mirror_all_existing(self, orchestrator, mock_client):
         mock_client.services.list_services.return_value = [
-            {"service": "rbd-mirror", "location": "node1", "group_id": "", "info": ""},
+            _svc("rbd-mirror", "node1"),
         ]
         spec = ServiceSpec(
             service_type="rbd-mirror",
@@ -372,7 +388,8 @@ def test_apply_rgw_basic(self, orchestrator, mock_client):
         )
         result = orchestrator.apply_rgw(spec)
         assert result.exception is None
-        assert "enabled on node1" in result.result
+        assert "enabled" in result.result
+        assert "node1" in result.result
 
         call_kwargs = mock_client.services.enable_service.call_args
         assert call_kwargs.kwargs["name"] == "rgw"
@@ -428,7 +445,8 @@ def test_apply_nfs_basic(self, orchestrator, mock_client):
         )
         result = orchestrator.apply_nfs(spec)
         assert result.exception is None
-        assert "enabled on node1" in result.result
+        assert "enabled" in result.result
+        assert "node1" in result.result
 
         call_kwargs = mock_client.services.enable_service.call_args
         payload = json.loads(call_kwargs.kwargs["payload"])
@@ -478,7 +496,7 @@ def test_apply_nfs_no_placement(self, orchestrator, mock_client):
 
     def test_apply_nfs_skips_existing(self, orchestrator, mock_client):
         mock_client.services.list_services.return_value = [
-            {"service": "nfs", "location": "node1", "group_id": "mycluster", "info": "{}"},
+            _svc("nfs", "node1", group_id="mycluster", info="{}"),
         ]
         spec = NFSServiceSpec(
             service_type="nfs",
@@ -487,7 +505,24 @@ def test_apply_nfs_skips_existing(self, orchestrator, mock_client):
         )
         result = orchestrator.apply_nfs(spec)
         assert "already active on node1" in result.result
-        assert "enabled on node2" in result.result
+        assert "enabled" in result.result
+        mock_client.services.enable_service.assert_called_once()
+
+    def test_apply_nfs_distinct_cluster_not_conflated(self, orchestrator, mock_client):
+        """A host running nfs.other must not suppress enabling nfs.mycluster."""
+        mock_client.services.list_services.return_value = [
+            _svc("nfs", "node1", group_id="other", info="{}"),
+        ]
+        spec = NFSServiceSpec(
+            service_type="nfs",
+            service_id="mycluster",
+            placement=PlacementSpec(hosts=["node1"]),
+        )
+        result = orchestrator.apply_nfs(spec)
+        assert result.exception is None
+        # nfs.other on node1 must not be mistaken for nfs.mycluster
+        assert "already active" not in result.result
+        assert "enabled" in result.result
         mock_client.services.enable_service.assert_called_once()
 
 
@@ -507,7 +542,8 @@ def test_apply_cephfs_mirror_success(self, orchestrator, mock_client):
         )
         result = orchestrator.apply_cephfs_mirror(spec)
         assert result.exception is None
-        assert "enabled on node1" in result.result
+        assert "enabled" in result.result
+        assert "node1" in result.result
         mock_client.services.enable_service.assert_called_once_with(
             name="cephfs-mirror", payload="{}", wait=True,
         )
@@ -571,7 +607,8 @@ def test_apply_mon(self, orchestrator, mock_client):
         )
         result = orchestrator.apply_mon(spec)
         assert result.exception is None
-        assert "enabled on node1" in result.result
+        assert "enabled" in result.result
+        assert "node1" in result.result
         mock_client.services.enable_service.assert_called_once_with(
             name="mon", payload="{}", wait=True,
         )
diff --git a/tests/scripts/test-orch.sh b/tests/scripts/test-orch.sh
index a95f412dd..969e2a43d 100755
--- a/tests/scripts/test-orch.sh
+++ b/tests/scripts/test-orch.sh
@@ -16,7 +16,7 @@ PASS=0
 FAIL=0
 ERRORS=()
 
-run_ceph() { sudo $CEPH "$@" 2>&1; }
+run_ceph() { sudo "$CEPH" "$@" 2>&1; }
 
 # --- Test helpers ---
 
@@ -76,7 +76,6 @@ echo "=== 2. Host listing ==="
 # ===================================================================
 
 output=$(run_ceph orch host ls)
-host_count=$(echo "$output" | grep -c "hosts in cluster" || true)
 assert_contains "hosts listed" "hosts in cluster" "$output"
 
 # ===================================================================

From 1dde94973de128df9cc190796fa85ee692c589dd Mon Sep 17 00:00:00 2001
From: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
Date: Tue, 26 May 2026 21:23:31 +0530
Subject: [PATCH 05/11] ci: install microceph-orch package before orch unit
 tests

`pip install pytest` alone left runtime deps like `requests` to runner
preinstall, which is brittle. Install the package itself so declared
dependencies resolve.

Signed-off-by: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
Assisted-by: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 077277175..f15a15ad8 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -190,7 +190,7 @@ jobs:
 
       - name: Run orchestrator Python unit-tests
         run: |
-          pip install pytest
+          python3 -m pip install ./microceph-orch pytest
           cd microceph-orch
           PYTHONPATH=src:tests python3 -m pytest tests/ -v
 

From f06e3b45d2c0ae83e51a5914408af095c2736e64 Mon Sep 17 00:00:00 2001
From: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
Date: Tue, 26 May 2026 21:23:36 +0530
Subject: [PATCH 06/11] feat(orch): reject InventoryFilter.labels in
 get_inventory

MicroCeph has no host-label concept. Previously labels were silently
ignored, returning unfiltered hosts/devices to callers that expected a
label filter. Raise NotImplementedError instead so the mismatch surfaces.

Signed-off-by: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
Assisted-by: Claude Opus 4.7 <noreply@anthropic.com>
---
 microceph-orch/src/microceph/module.py | 12 ++++++++++--
 microceph-orch/tests/test_module.py    |  6 ++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/microceph-orch/src/microceph/module.py b/microceph-orch/src/microceph/module.py
index 0554834dd..33741c900 100644
--- a/microceph-orch/src/microceph/module.py
+++ b/microceph-orch/src/microceph/module.py
@@ -286,12 +286,20 @@ def get_inventory(self,
         /1.0/resources reports only the local node and the socket client
         cannot proxy to peers (see the _apply_service note on targeting).
         Hosts with no OSD disks are still listed so callers see every member.
+
+        Only `host_filter.hosts` is honored; `host_filter.labels` raises
+        NotImplementedError because MicroCeph has no host-label concept.
         """
 
         # Resolve which hosts to include based on the filter.
         filter_hosts = None
-        if host_filter and host_filter.hosts:
-            filter_hosts = set(host_filter.hosts)
+        if host_filter:
+            if host_filter.labels:
+                raise NotImplementedError(
+                    "MicroCeph orchestrator does not support host labels"
+                )
+            if host_filter.hosts:
+                filter_hosts = set(host_filter.hosts)
 
         osd_disks = self.microceph.services.list_disks()
 
diff --git a/microceph-orch/tests/test_module.py b/microceph-orch/tests/test_module.py
index cbe6076c7..6c21087ad 100644
--- a/microceph-orch/tests/test_module.py
+++ b/microceph-orch/tests/test_module.py
@@ -296,6 +296,12 @@ def test_get_inventory_with_host_filter(self, orchestrator, mock_client):
         hosts = {h.name for h in result.result}
         assert hosts == {"node1", "node3"}
 
+    def test_get_inventory_label_filter_rejected(self, orchestrator, mock_client):
+        filt = InventoryFilter(labels=["role=osd"])
+        result = orchestrator.get_inventory(host_filter=filt)
+        assert isinstance(result.exception, NotImplementedError)
+        mock_client.services.list_disks.assert_not_called()
+
     def test_get_inventory_empty(self, orchestrator, mock_client):
         mock_client.services.list_disks.return_value = []
         mock_client.cluster.get_cluster_members.return_value = []

From 8491cbdcc834ad5ba47023616edc0191bb2ae9b0 Mon Sep 17 00:00:00 2001
From: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
Date: Tue, 26 May 2026 21:23:43 +0530
Subject: [PATCH 07/11] test(orch): drop redundant 'enabled' assert after RGW
 apply

The preceding assert already accepts "enabled|already active". The
second assert forced "enabled" only, which would false-negative when
apply legitimately reports "already active" (idempotent re-apply after
the test's earlier `orch rm rgw`).

Signed-off-by: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
Assisted-by: Claude Opus 4.7 <noreply@anthropic.com>
---
 tests/scripts/test-orch.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/scripts/test-orch.sh b/tests/scripts/test-orch.sh
index 969e2a43d..ccf36f817 100755
--- a/tests/scripts/test-orch.sh
+++ b/tests/scripts/test-orch.sh
@@ -123,7 +123,6 @@ sleep 3
 placement="${first_host:-$(hostname)}"
 output=$(run_ceph orch apply rgw default --placement="$placement")
 assert_contains "rgw applied" "enabled|already active" "$output"
-assert_contains "rgw applied" "enabled" "$output"
 sleep 5
 
 output=$(run_ceph orch ls)

From cab4e2535abe10546f6304e79edf95b77d8b7ace Mon Sep 17 00:00:00 2001
From: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
Date: Wed, 27 May 2026 13:39:46 +0530
Subject: [PATCH 08/11] fix(orch): make list_daemons resilient to missing
 service metadata keys

ExtendedAPIService.list_services() may return records without
group_id (and other optional keys). Direct dict indexing in
list_daemons raised KeyError and broke `ceph orch ps`. Switch to
.get() with empty-string defaults, consistent with how the same
fields are read in _get_service_hostlist() and the inventory path.

Signed-off-by: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
Assisted-by: Claude Opus 4.7 <noreply@anthropic.com>
---
 microceph-orch/src/microceph/module.py |  6 +++---
 microceph-orch/tests/test_module.py    | 14 ++++++++++++++
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/microceph-orch/src/microceph/module.py b/microceph-orch/src/microceph/module.py
index 33741c900..ac4d5d29d 100644
--- a/microceph-orch/src/microceph/module.py
+++ b/microceph-orch/src/microceph/module.py
@@ -234,9 +234,9 @@ def list_daemons(self,
         services = self.microceph.services.list_services()
         descriptions = []
         for svc in services:
-            svc_daemon_type = svc['service']
-            svc_hostname = svc['location']
-            svc_group_id = svc['group_id']
+            svc_daemon_type = svc.get('service', '')
+            svc_hostname = svc.get('location', '')
+            svc_group_id = svc.get('group_id', '')
             svc_ip = None
             svc_ports = None
             svc_name = f"{svc_daemon_type}.{svc_group_id}" if svc_group_id else svc_daemon_type
diff --git a/microceph-orch/tests/test_module.py b/microceph-orch/tests/test_module.py
index 6c21087ad..0e4aeec89 100644
--- a/microceph-orch/tests/test_module.py
+++ b/microceph-orch/tests/test_module.py
@@ -256,6 +256,20 @@ def test_list_daemons_nfs_null_info(self, orchestrator, mock_client):
         result = orchestrator.list_daemons()
         assert len(result.result) == 1
 
+    def test_list_daemons_metadata_missing_optional_keys(self, orchestrator, mock_client):
+        # ExtendedAPIService.list_services() may omit group_id (and other
+        # optional keys); list_daemons must not raise KeyError.
+        mock_client.services.list_services.return_value = [
+            {"service": "mon", "location": "n1"},
+        ]
+        result = orchestrator.list_daemons()
+        assert result.exception is None
+        assert len(result.result) == 1
+        daemon = result.result[0]
+        assert daemon.daemon_type == "mon"
+        assert daemon.hostname == "n1"
+        assert daemon.service_name == "mon"
+
 
 # ===================================================================
 # get_inventory()

From 4b5cf71915413ab3a9bcb558e194321380a2040b Mon Sep 17 00:00:00 2001
From: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
Date: Wed, 27 May 2026 15:44:10 +0530
Subject: [PATCH 09/11] orch: address review feedback on apply_rgw, apply_mds,
 service_action

Three issues raised in PR review:

1. apply_rgw silently drops spec.service_id. MicroCeph deploys a
   single bare "rgw" service per node and does not support multiple
   RGW realms with distinct service_ids on the same cluster. Log a
   warning when a service_id is supplied so users get a clear signal
   instead of confusing apparently-successful no-op behaviour, and
   document the limitation on the docstring.

2. apply_mds has the same shape: per-filesystem MDS placement is not
   supported and service_id is silently ignored. Same fix.

3. service_action("restart", ...) only the backend serviceWorkerTable
   in microceph/ceph/services.go has entries for osd, mon and rgw;
   any other service type would have surfaced an opaque
   `no handler defined for service X` RemoteException from the
   backend. Reject unsupported service types up front with a
   NotImplementedError that names the supported set, and add a
   RESTART_SUPPORTED_SERVICES constant so the boundary is explicit.

Tests:
- test_apply_rgw_service_id_warns
- test_apply_mds_service_id_warns
- test_restart_unsupported_service_type covers nfs/mds/mgr/rbd-mirror
- test_restart_service_with_id updated to use rgw.realm1 (was
  nfs.mycluster, which is now correctly rejected)

Signed-off-by: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
Assisted-by: Claude Opus 4.7 <noreply@anthropic.com>
---
 microceph-orch/src/microceph/module.py | 41 +++++++++++++++++-
 microceph-orch/tests/test_module.py    | 58 +++++++++++++++++++++++++-
 2 files changed, 96 insertions(+), 3 deletions(-)

diff --git a/microceph-orch/src/microceph/module.py b/microceph-orch/src/microceph/module.py
index ac4d5d29d..08dd53fdd 100644
--- a/microceph-orch/src/microceph/module.py
+++ b/microceph-orch/src/microceph/module.py
@@ -45,6 +45,12 @@
     'nfs': NFSServiceSpec,
 }
 
+# Services for which MicroCeph's backend supports restart. The Go
+# serviceWorkerTable (microceph/ceph/services.go) only registers handlers
+# for osd, mon and rgw; restarting any other service raises
+# "no handler defined for service X" at the backend.
+RESTART_SUPPORTED_SERVICES = frozenset({"osd", "mon", "rgw"})
+
 class MicroCephOrchestrator(Orchestrator,
                             MgrModule,
                             metaclass=CLICommandMeta):
@@ -442,9 +448,23 @@ def apply_rgw(self, spec: RGWSpec) -> OrchResult[str]:
         SSLPrivateKey to enable SSL. Until the spec is extended or a separate
         key source is added, SSL cannot be fully configured via the
         orchestrator interface.
+
+        Note on service_id (realms/zones): MicroCeph deploys a single bare
+        "rgw" service per node and does not support multiple RGW instances
+        with distinct service_ids on the same cluster. If a service_id is
+        provided in the spec, it is ignored and a warning is logged.
         """
         logger.debug("Applying RGW service")
 
+        # MicroCeph stores a single bare "rgw" service; per-realm service_id
+        # is not supported by the backend, so we drop it with a warning.
+        if spec.service_id:
+            logger.warning(
+                f"RGW service_id={spec.service_id!r} ignored: MicroCeph deploys "
+                "a single 'rgw' service; multiple RGW realms with distinct "
+                "service_ids are not supported."
+            )
+
         # Build RGW-specific payload from the spec.
         # Go's RgwServicePlacement expects: Port, SSLPort, SSLCertificate, SSLPrivateKey
         # Field names match Go exported field names (no json tags defined,
@@ -510,8 +530,20 @@ def apply_mgr(self, spec: ServiceSpec) -> OrchResult[str]:
 
     @handle_orch_error
     def apply_mds(self, spec: ServiceSpec) -> OrchResult[str]:
-        """Enable the MDS service on the target hosts."""
+        """Enable the MDS service on the target hosts.
+
+        Note on service_id (filesystem name): MicroCeph deploys a single
+        bare "mds" service and does not support per-filesystem MDS
+        placement. If a service_id is provided in the spec, it is
+        ignored and a warning is logged.
+        """
         logger.debug("Applying MDS service")
+        if spec.service_id:
+            logger.warning(
+                f"MDS service_id={spec.service_id!r} ignored: MicroCeph deploys "
+                "a single 'mds' service; per-filesystem MDS placement is not "
+                "supported."
+            )
         return self._apply_service("mds", spec, "{}")
 
     @handle_orch_error
@@ -588,5 +620,12 @@ def service_action(self, action: str, service_name: str) -> OrchResult[List[str]
 
         svc_type, _ = self._parse_service_name(service_name)
 
+        if svc_type not in RESTART_SUPPORTED_SERVICES:
+            raise NotImplementedError(
+                f"Restart of service type {svc_type!r} is not supported by "
+                f"MicroCeph. Supported services: "
+                f"{sorted(RESTART_SUPPORTED_SERVICES)}."
+            )
+
         self.microceph.services.restart_services([svc_type])
         return [f"Restarted {service_name}"]
diff --git a/microceph-orch/tests/test_module.py b/microceph-orch/tests/test_module.py
index 0e4aeec89..87dac2ab8 100644
--- a/microceph-orch/tests/test_module.py
+++ b/microceph-orch/tests/test_module.py
@@ -451,6 +451,28 @@ def test_apply_rgw_no_placement(self, orchestrator, mock_client):
         result = orchestrator.apply_rgw(spec)
         assert result.exception is not None
 
+    def test_apply_rgw_service_id_warns(self, orchestrator, mock_client, caplog):
+        # MicroCeph deploys a single bare 'rgw' service; per-realm
+        # service_id is silently dropped — log a warning instead of
+        # surprising the caller.
+        spec = RGWSpec(
+            service_type="rgw",
+            service_id="realm1",
+            placement=PlacementSpec(hosts=["node1"]),
+        )
+        import logging
+        with caplog.at_level(logging.WARNING):
+            result = orchestrator.apply_rgw(spec)
+
+        assert result.exception is None
+        # service_id is not forwarded to the backend.
+        call_kwargs = mock_client.services.enable_service.call_args
+        assert call_kwargs.kwargs["name"] == "rgw"
+        assert any(
+            "service_id" in r.message and "realm1" in r.message
+            for r in caplog.records
+        )
+
 
 # ===================================================================
 # apply_nfs()
@@ -655,6 +677,27 @@ def test_apply_mds(self, orchestrator, mock_client):
             name="mds", payload="{}", wait=True,
         )
 
+    def test_apply_mds_service_id_warns(self, orchestrator, mock_client, caplog):
+        # Per-filesystem MDS placement is not supported by MicroCeph;
+        # service_id (filesystem name) must be dropped with a warning.
+        spec = ServiceSpec(
+            service_type="mds",
+            service_id="fs1",
+            placement=PlacementSpec(hosts=["node1"]),
+        )
+        import logging
+        with caplog.at_level(logging.WARNING):
+            result = orchestrator.apply_mds(spec)
+
+        assert result.exception is None
+        mock_client.services.enable_service.assert_called_once_with(
+            name="mds", payload="{}", wait=True,
+        )
+        assert any(
+            "service_id" in r.message and "fs1" in r.message
+            for r in caplog.records
+        )
+
 
 # ===================================================================
 # remove_service()
@@ -713,9 +756,20 @@ def test_restart_service(self, orchestrator, mock_client):
         mock_client.services.restart_services.assert_called_once_with(["mon"])
 
     def test_restart_service_with_id(self, orchestrator, mock_client):
-        result = orchestrator.service_action("restart", "nfs.mycluster")
+        # Service identifiers are stripped before dispatching to the
+        # MicroCeph API; rgw is a supported restart target.
+        result = orchestrator.service_action("restart", "rgw.realm1")
         assert result.exception is None
-        mock_client.services.restart_services.assert_called_once_with(["nfs"])
+        mock_client.services.restart_services.assert_called_once_with(["rgw"])
+
+    def test_restart_unsupported_service_type(self, orchestrator, mock_client):
+        # MicroCeph's backend serviceWorkerTable only handles osd/mon/rgw.
+        # The orchestrator must reject restart for other service types up
+        # front rather than surfacing an opaque RemoteException.
+        for svc in ["nfs.mycluster", "mds", "mgr", "rbd-mirror"]:
+            result = orchestrator.service_action("restart", svc)
+            assert isinstance(result.exception, NotImplementedError), svc
+            mock_client.services.restart_services.assert_not_called()
 
     def test_unsupported_action(self, orchestrator, mock_client):
         result = orchestrator.service_action("stop", "mon")

From a350716f95a7924c3e6583a55c4fcedd0db9e71e Mon Sep 17 00:00:00 2001
From: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
Date: Thu, 28 May 2026 11:32:55 +0530
Subject: [PATCH 10/11] orch: per-host targeting via ?target= and stricter
 service-name handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds end-to-end per-host targeting for service enable / remove /
restart, plus a bare-name guard that aligns the orchestrator's
service-name semantics with MicroCeph's single-bare-instance-per-node
backend.

Transport: MicroCeph's HTTP API endpoints for service mutation all
declare `ProxyTarget: true` in the Go rest layer. The Python client
keeps using the local unix socket; the server-side proxyTarget
middleware in microcluster reads the new `?target=<member>` query
parameter and mTLS-forwards the request to that member's HTTPS
endpoint. No certs or extra config required in the mgr snap; ceph-mgr
and microcephd share `$SNAP_COMMON`.

Behavioural changes:

* `_apply_service`, `remove_service`, `service_action` fan out per
  host. Already-running hosts are skipped on apply; remove and
  restart discover hosts via `list_services()`. Any per-host failure
  raises `RuntimeError` with partial-success context, instead of
  silently mixing failure entries into a success result.
* `_get_existing_service_hosts` no longer swallows `RemoteException`
  on `list_services()` — a backend failure now surfaces rather than
  producing a false "no-op" success.
* New `_require_bare_service_name(svc_type, svc_id)` helper and a
  `DOTTED_NAME_SUPPORTED_SERVICES = {"nfs"}` allowlist. Dotted names
  on any other service (rgw.realm1, mds.fs1, mon.x, ...) are rejected
  up front with `ValueError`. Previously rgw/mds warned-and-dropped;
  mon/mgr/rbd-mirror/cephfs-mirror silently dropped. Same guard
  applied on `remove_service` and `service_action`.
* `_apply_service` parameter renamed `service_name` -> `service_type`
  to reflect that it always receives the bare type, never a dotted
  name.

Client (`microceph-orch/src/microceph/client/cluster.py`):

* `enable_service`, `delete_service`, `restart_services`,
  `delete_nfs_service` accept an optional `target: str | None`
  kwarg. When set, it is forwarded as a `?target=` query param to
  drive the server-side proxy. Guard uses `if target is not None`
  so an empty string is no longer silently equivalent to None.

Tests:

* 94 unit tests pass (was 79 before this branch). New coverage:
  per-host target assertion on every apply/remove/restart success
  path; `_apply_service` partial-failure raises with context;
  `_get_existing_service_hosts` exception propagation; dotted-name
  rejection on rgw/mds/mon/mgr/rbd-mirror/cephfs-mirror; NFS
  group_id no-match remove; `service_action` partial failure;
  `reset_mock` per loop iter in unsupported-restart test.
* `side_effect` lambdas use `(*args, **kwargs)` to remain robust to
  positional-vs-keyword call shapes.

Multi-node CI integration test
(`tests/scripts/test-orch.sh`):

* New section 6b applies rgw with `--placement="$first_host,$second_host"`
  and verifies the daemon appears on the *second* host via
  `orch ps --hostname` — exercising the unix-socket ->
  proxyTarget -> remote mTLS path end to end.
* Section 9 verifies cluster-wide RGW removal (every host) instead
  of just the local node.
* Section 8 asserts the new `Restarted mon on <host>` output shape
  and that the restart enumerates each host in a multi-host cluster.
* Adds a guard test that `orch restart rgw.realm1` fails with
  `does not support a service_id`.

Refs canonical/microceph#743

Signed-off-by: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
Assisted-by: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../src/microceph/client/cluster.py           |  33 +-
 microceph-orch/src/microceph/module.py        | 294 ++++++++++++-----
 microceph-orch/tests/test_module.py           | 303 ++++++++++++++----
 tests/scripts/test-orch.sh                    |  60 +++-
 4 files changed, 530 insertions(+), 160 deletions(-)

diff --git a/microceph-orch/src/microceph/client/cluster.py b/microceph-orch/src/microceph/client/cluster.py
index a3e4311ce..ed61cda85 100644
--- a/microceph-orch/src/microceph/client/cluster.py
+++ b/microceph-orch/src/microceph/client/cluster.py
@@ -74,7 +74,8 @@ def get_status(self) -> dict[str, dict]:
             for member in members
         }
 
-    def enable_service(self, name: str, payload: str = "", wait: bool = True) -> None:
+    def enable_service(self, name: str, payload: str = "", wait: bool = True,
+                       target: str | None = None) -> None:
         """Enable a service on the cluster.
 
         Sends a PUT request to /1.0/services/<name> with an EnableService payload.
@@ -85,38 +86,54 @@ def enable_service(self, name: str, payload: str = "", wait: bool = True) -> Non
         :param name: service name (e.g. 'rgw', 'nfs', 'rbd-mirror')
         :param payload: JSON string with service-specific parameters
         :param wait: if True, the server waits for the service to be fully enabled
+        :param target: optional cluster member name. When set, the
+            server-side proxyTarget middleware (microcluster) forwards
+            the request over mTLS to that node, allowing per-host
+            service enablement from the local unix socket. When None,
+            no target is forwarded and the server handles the request
+            on the node receiving the unix-socket connection.
         """
         # Note: The "bool" key maps to Go's EnableService.Wait field which has
         # the struct tag `json:"bool"` (upstream naming quirk in MicroCeph).
+        params = {"target": target} if target is not None else None
         self._put(f"/1.0/services/{name}", json={
             "name": name,
             "bool": wait,
             "payload": payload,
-        })
+        }, params=params)
 
-    def delete_service(self, name: str) -> None:
+    def delete_service(self, name: str, target: str | None = None) -> None:
         """Delete/disable a service on the cluster.
 
         :param name: service name (e.g. 'rgw', 'nfs', 'rbd-mirror')
+        :param target: optional cluster member name; see enable_service.
         """
-        self._delete(f"/1.0/services/{name}")
+        params = {"target": target} if target is not None else None
+        self._delete(f"/1.0/services/{name}", params=params)
 
-    def restart_services(self, services: list[str]) -> None:
+    def restart_services(self, services: list[str],
+                         target: str | None = None) -> None:
         """Restart one or more services on the cluster.
 
         Sends a POST to /1.0/services/restart with a list of service objects.
 
         :param services: list of service names (e.g. ['mon', 'rgw'])
+        :param target: optional cluster member name; see enable_service.
         """
+        params = {"target": target} if target is not None else None
         payload = [{"service": svc} for svc in services]
-        self._post("/1.0/services/restart", json=payload)
+        self._post("/1.0/services/restart", json=payload, params=params)
 
-    def delete_nfs_service(self, cluster_id: str) -> None:
+    def delete_nfs_service(self, cluster_id: str,
+                           target: str | None = None) -> None:
         """Delete/disable an NFS service by cluster ID.
 
         NFS deletion requires the cluster_id in the request body, unlike
         other services which are identified by URL path alone.
 
         :param cluster_id: NFS cluster identifier
+        :param target: optional cluster member name; see enable_service.
         """
-        self._delete("/1.0/services/nfs", json={"cluster_id": cluster_id})
+        params = {"target": target} if target is not None else None
+        self._delete("/1.0/services/nfs",
+                     json={"cluster_id": cluster_id}, params=params)
diff --git a/microceph-orch/src/microceph/module.py b/microceph-orch/src/microceph/module.py
index 08dd53fdd..57b75ec87 100644
--- a/microceph-orch/src/microceph/module.py
+++ b/microceph-orch/src/microceph/module.py
@@ -51,6 +51,31 @@
 # "no handler defined for service X" at the backend.
 RESTART_SUPPORTED_SERVICES = frozenset({"osd", "mon", "rgw"})
 
+# Services for which MicroCeph distinguishes individual instances via a
+# `service_id` (the `<type>.<id>` dotted form). Currently only NFS, where
+# the id is the NFS cluster_id. Any other service is deployed as a single
+# bare instance per node; using a dotted name on them is rejected up front
+# so operators get a clear error instead of an apparently successful
+# operation that silently targets the wrong scope.
+DOTTED_NAME_SUPPORTED_SERVICES = frozenset({"nfs"})
+
+
+def _require_bare_service_name(svc_type: str, svc_id: str) -> None:
+    """Reject dotted service names for services that don't support them.
+
+    Raises ValueError if `svc_id` is non-empty and `svc_type` is not in
+    DOTTED_NAME_SUPPORTED_SERVICES.
+    """
+    if svc_id and svc_type not in DOTTED_NAME_SUPPORTED_SERVICES:
+        supported = ", ".join(sorted(DOTTED_NAME_SUPPORTED_SERVICES))
+        raise ValueError(
+            f"Service type '{svc_type}' does not support a service_id; "
+            f"MicroCeph deploys a single bare '{svc_type}' per node. "
+            f"Use '{svc_type}' (without an id) instead of "
+            f"'{svc_type}.{svc_id}'. Dotted names are only valid for: "
+            f"{supported}."
+        )
+
 class MicroCephOrchestrator(Orchestrator,
                             MgrModule,
                             metaclass=CLICommandMeta):
@@ -366,64 +391,99 @@ def _get_existing_service_hosts(self, service_type: str,
         When group_id is provided (e.g. an NFS cluster id), only services
         matching both the type and that group are counted, so distinct NFS
         clusters are not conflated.
-        """
-        try:
-            services = self.microceph.services.list_services()
-        except RemoteException:
-            logger.warning(f"Failed to list services while checking existing {service_type} hosts")
-            return set()
 
+        A failure to list services from the backend is propagated to the
+        caller rather than swallowed: returning an empty set on error
+        would otherwise produce false "no-op" successes and re-apply
+        services that are in fact already running.
+        """
+        services = self.microceph.services.list_services()
         return {
             svc.get('location', '') for svc in services
             if svc.get('service') == service_type
             and (group_id is None or svc.get('group_id', '') == group_id)
         }
 
-    def _apply_service(self, service_name: str, spec: ServiceSpec,
+    def _apply_service(self, service_type: str, spec: ServiceSpec,
                        payload: str, group_id: Optional[str] = None) -> str:
         """Common logic for applying (enabling) a service.
 
-        Validates placement hosts and checks whether the service is already
-        running, then calls enable_service once.
+        Per-host targeting: MicroCeph's HTTP API endpoints for service
+        enable/delete/restart all declare ProxyTarget=true in the Go
+        rest layer. The microcluster proxyTarget middleware inspects the
+        `?target=<member>` query parameter and forwards the request over
+        mTLS to that member's HTTPS endpoint. The Python client uses the
+        local unix socket; the server transparently proxies per-host
+        calls from there, so no direct HTTPS connectivity from the orch
+        module is required.
+
+        We iterate requested placement hosts, skip any host that already
+        runs the service, and call enable_service once per remaining
+        host with `target=<host>`. Any per-host failure raises so the
+        operator gets a visible error rather than a partial-success
+        result that the Ceph orchestrator framework would render as
+        green.
+
+        Note: `service_type` is the bare service type (e.g. "rgw",
+        "nfs"), never the dotted form. Callers that accept dotted names
+        are expected to parse and pass `group_id` separately.
 
         Returns a summary string on success.
-        Raises an exception if placement is invalid or the API call fails.
-
-        NOTE: Per-host targeting is not yet supported. The Python client
-        connects via unix socket to the local node only. MicroCeph's Go
-        client uses UseTarget() to proxy to specific hosts, but this is
-        not exposed through the socket. See todo #6.
-        Because the enable call cannot be confirmed per host, the summary
-        reports the requested placement rather than claiming each host was
-        individually provisioned.
+        Raises RuntimeError if any host fails to enable.
         """
         hosts = self._get_placement_hosts(spec)
-        existing = self._get_existing_service_hosts(service_name, group_id)
+        existing = self._get_existing_service_hosts(service_type, group_id)
 
         # If every requested host already runs the service, there is
         # nothing to do.
         if existing.issuperset(hosts):
             skipped_str = ', '.join(hosts)
-            logger.info(f"Skipping {service_name}: already active on {skipped_str}")
-            return f"{service_name}: already active on {skipped_str}"
-
-        logger.info(f"Enabling {service_name} (requested hosts: {', '.join(hosts)})")
-        self.microceph.services.enable_service(
-            name=service_name,
-            payload=payload,
-            wait=True,
-        )
+            logger.info(f"Skipping {service_type}: already active on {skipped_str}")
+            return f"{service_type}: already active on {skipped_str}"
 
-        # The enable call is served by the local node and cannot be
-        # confirmed per host (see note above), so report the request
-        # honestly rather than claiming every placement host gained the
-        # service.
-        summary = f"enabled (requested placement: {', '.join(hosts)})"
+        to_enable = [h for h in hosts if h not in existing]
         already_active = sorted(existing & set(hosts))
+
+        enabled: List[str] = []
+        failures: List[str] = []
+        for host in to_enable:
+            logger.info(f"Enabling {service_type} on {host}")
+            try:
+                self.microceph.services.enable_service(
+                    name=service_type,
+                    payload=payload,
+                    wait=True,
+                    target=host,
+                )
+                enabled.append(host)
+            except Exception as e:
+                logger.error(f"Failed to enable {service_type} on {host}: {e}")
+                failures.append(f"{host}: {e}")
+
+        # Any failure is surfaced as an exception so the orchestrator
+        # framework reports the operation as failed. Partial-success
+        # context is included in the message to aid debugging.
+        if failures:
+            ctx = []
+            if enabled:
+                ctx.append(f"enabled on {', '.join(enabled)}")
+            if already_active:
+                ctx.append(f"already active on {', '.join(already_active)}")
+            ctx_str = f" ({'; '.join(ctx)})" if ctx else ""
+            raise RuntimeError(
+                f"Failed to enable {service_type}: "
+                + "; ".join(failures)
+                + ctx_str
+            )
+
+        parts = []
+        if enabled:
+            parts.append(f"enabled on {', '.join(enabled)}")
         if already_active:
-            summary += f"; already active on {', '.join(already_active)}"
+            parts.append(f"already active on {', '.join(already_active)}")
 
-        return f"{service_name}: {summary}"
+        summary = "; ".join(parts) if parts else "no-op"
+        return f"{service_type}: {summary}"
 
     @handle_orch_error
     def apply_rbd_mirror(self, spec: ServiceSpec) -> OrchResult[str]:
@@ -431,8 +491,10 @@ def apply_rbd_mirror(self, spec: ServiceSpec) -> OrchResult[str]:
 
         rbd-mirror is a client-like service with no additional parameters.
         The Go API's ClientServicePlacement handler takes no payload.
+        Dotted names are rejected (see `_require_bare_service_name`).
         """
         logger.debug("Applying rbd-mirror service")
+        _require_bare_service_name("rbd-mirror", spec.service_id or "")
         return self._apply_service("rbd-mirror", spec, "{}")
 
     @handle_orch_error
@@ -451,19 +513,12 @@ def apply_rgw(self, spec: RGWSpec) -> OrchResult[str]:
 
         Note on service_id (realms/zones): MicroCeph deploys a single bare
         "rgw" service per node and does not support multiple RGW instances
-        with distinct service_ids on the same cluster. If a service_id is
-        provided in the spec, it is ignored and a warning is logged.
+        with distinct service_ids on the same cluster. Supplying a
+        service_id raises ValueError so the operator is not surprised
+        by a silently-misscoped subsequent remove_service call.
         """
         logger.debug("Applying RGW service")
-
-        # MicroCeph stores a single bare "rgw" service; per-realm service_id
-        # is not supported by the backend, so we drop it with a warning.
-        if spec.service_id:
-            logger.warning(
-                f"RGW service_id={spec.service_id!r} ignored: MicroCeph deploys "
-                "a single 'rgw' service; multiple RGW realms with distinct "
-                "service_ids are not supported."
-            )
+        _require_bare_service_name("rgw", spec.service_id or "")
 
         # Build RGW-specific payload from the spec.
         # Go's RgwServicePlacement expects: Port, SSLPort, SSLCertificate, SSLPrivateKey
@@ -518,14 +573,22 @@ def apply_nfs(self, spec: NFSServiceSpec) -> OrchResult[str]:
 
     @handle_orch_error
     def apply_mon(self, spec: ServiceSpec) -> OrchResult[str]:
-        """Enable the MON service on the target hosts."""
+        """Enable the MON service on the target hosts.
+
+        Dotted names are rejected (see `_require_bare_service_name`).
+        """
         logger.debug("Applying MON service")
+        _require_bare_service_name("mon", spec.service_id or "")
         return self._apply_service("mon", spec, "{}")
 
     @handle_orch_error
     def apply_mgr(self, spec: ServiceSpec) -> OrchResult[str]:
-        """Enable the MGR service on the target hosts."""
+        """Enable the MGR service on the target hosts.
+
+        Dotted names are rejected (see `_require_bare_service_name`).
+        """
         logger.debug("Applying MGR service")
+        _require_bare_service_name("mgr", spec.service_id or "")
         return self._apply_service("mgr", spec, "{}")
 
     @handle_orch_error
@@ -534,16 +597,10 @@ def apply_mds(self, spec: ServiceSpec) -> OrchResult[str]:
 
         Note on service_id (filesystem name): MicroCeph deploys a single
         bare "mds" service and does not support per-filesystem MDS
-        placement. If a service_id is provided in the spec, it is
-        ignored and a warning is logged.
+        placement. Supplying a service_id raises ValueError.
         """
         logger.debug("Applying MDS service")
-        if spec.service_id:
-            logger.warning(
-                f"MDS service_id={spec.service_id!r} ignored: MicroCeph deploys "
-                "a single 'mds' service; per-filesystem MDS placement is not "
-                "supported."
-            )
+        _require_bare_service_name("mds", spec.service_id or "")
         return self._apply_service("mds", spec, "{}")
 
     @handle_orch_error
@@ -551,41 +608,80 @@ def apply_cephfs_mirror(self, spec: ServiceSpec) -> OrchResult[str]:
         """Enable the cephfs-mirror service on the target hosts.
 
         cephfs-mirror is a client-like service (same as rbd-mirror)
-        with no additional parameters.
+        with no additional parameters. Dotted names are rejected
+        (see `_require_bare_service_name`).
         """
         logger.debug("Applying cephfs-mirror service")
+        _require_bare_service_name("cephfs-mirror", spec.service_id or "")
         return self._apply_service("cephfs-mirror", spec, "{}")
 
     @handle_orch_error
     def remove_service(self, service_name: str, force: bool = False) -> OrchResult[str]:
-        """Remove a service from the local node.
+        """Remove a service cluster-wide.
 
-        Sends a DELETE request to the local MicroCeph API. This removes
-        the service from the node connected via the unix socket only;
-        it does not remove the service cluster-wide. Per-host targeting
-        requires UseTarget support (see todo #6).
+        Discovers all hosts currently running the requested service via
+        list_services() and issues a DELETE per host using the server-side
+        proxyTarget middleware (see _apply_service for transport notes).
+        Errors from individual hosts are collected so a partial failure
+        does not mask successful removals on other nodes.
 
-        :param service_name: service type or type.id (e.g. "rgw", "nfs.mycluster")
+        :param service_name: service type or type.id (e.g. "rgw",
+            "nfs.mycluster")
         :param force: unused, kept for interface compatibility
         """
         logger.info(f"Removing service: {service_name}, force={force}")
 
         svc_type, svc_id = self._parse_service_name(service_name)
 
-        # NFS requires the cluster_id in the delete body
-        if svc_type == 'nfs':
-            if not svc_id:
-                raise ValueError("NFS removal requires service name in 'nfs.<cluster_id>' format")
-            self.microceph.services.delete_nfs_service(svc_id)
-        else:
-            if svc_id:
-                logger.debug(
-                    f"Ignoring id '{svc_id}' for {svc_type}: MicroCeph manages "
-                    f"a single {svc_type} service per node"
-                )
-            self.microceph.services.delete_service(svc_type)
+        if svc_type == 'nfs' and not svc_id:
+            raise ValueError(
+                "NFS removal requires service name in 'nfs.<cluster_id>' format"
+            )
+        # For non-NFS services, a dotted name is rejected up front:
+        # MicroCeph deploys a single bare instance per node and silently
+        # dropping the id would let an operator believe they were
+        # removing a specific realm/filesystem while actually wiping the
+        # bare service from every host.
+        _require_bare_service_name(svc_type, svc_id)
+
+        # Discover hosts currently running this service. For non-nfs
+        # services group_id is unused; for nfs it filters to the specific
+        # cluster_id.
+        group_id = svc_id if svc_type == 'nfs' else None
+        hosts = sorted(self._get_existing_service_hosts(svc_type, group_id))
 
-        return f"Removed service {service_name}"
+        if not hosts:
+            logger.info(f"No hosts found running {service_name}; nothing to do")
+            return f"{service_name}: no hosts running this service"
+
+        removed: List[str] = []
+        failures: List[str] = []
+        for host in hosts:
+            try:
+                if svc_type == 'nfs':
+                    self.microceph.services.delete_nfs_service(
+                        svc_id, target=host,
+                    )
+                else:
+                    self.microceph.services.delete_service(
+                        svc_type, target=host,
+                    )
+                removed.append(host)
+            except Exception as e:
+                logger.error(f"Failed to remove {service_name} from {host}: {e}")
+                failures.append(f"{host}: {e}")
+
+        # Any failure raises so the operator gets a visible error;
+        # partial-success context is included in the message.
+        if failures:
+            ctx = f" (removed from {', '.join(removed)})" if removed else ""
+            raise RuntimeError(
+                f"Failed to remove {service_name}: "
+                + "; ".join(failures)
+                + ctx
+            )
+
+        return f"{service_name}: removed from {', '.join(removed)}"
 
     @handle_orch_error
     def remove_host(self, host: str, force: bool = False,
@@ -605,7 +701,9 @@ def remove_host(self, host: str, force: bool = False,
     def service_action(self, action: str, service_name: str) -> OrchResult[List[str]]:
         """Perform an action (restart) on a service.
 
-        Currently only 'restart' is supported via the MicroCeph API.
+        Currently only 'restart' is supported via the MicroCeph API. The
+        restart is fanned out per host running the service, using the
+        server-side proxyTarget middleware (see _apply_service).
 
         :param action: one of "start", "stop", "restart", "redeploy", "reconfig"
         :param service_name: service type (e.g. "mon", "rgw")
@@ -618,7 +716,12 @@ def service_action(self, action: str, service_name: str) -> OrchResult[List[str]
                 "Only 'restart' is currently available."
             )
 
-        svc_type, _ = self._parse_service_name(service_name)
+        svc_type, svc_id = self._parse_service_name(service_name)
+
+        # All currently-supported restart services are bare; reject
+        # dotted names so an operator does not silently restart all
+        # NFS clusters (etc.) when intending to target one.
+        _require_bare_service_name(svc_type, svc_id)
 
         if svc_type not in RESTART_SUPPORTED_SERVICES:
             raise NotImplementedError(
@@ -627,5 +730,36 @@ def service_action(self, action: str, service_name: str) -> OrchResult[List[str]
                 f"{sorted(RESTART_SUPPORTED_SERVICES)}."
             )
 
-        self.microceph.services.restart_services([svc_type])
-        return [f"Restarted {service_name}"]
+        # group_id is threaded through for future-proofing: if NFS
+        # were ever added to RESTART_SUPPORTED_SERVICES, _require_bare
+        # above would already have rejected the dotted-name path, so
+        # svc_id is always empty here; keeping the call shape future-
+        # compatible avoids a silent fan-out across all clusters.
+        group_id = svc_id if svc_type in DOTTED_NAME_SUPPORTED_SERVICES else None
+        hosts = sorted(self._get_existing_service_hosts(svc_type, group_id))
+        if not hosts:
+            return [f"No hosts running {svc_type}; nothing to restart"]
+
+        restarted: List[str] = []
+        failures: List[str] = []
+        for host in hosts:
+            try:
+                self.microceph.services.restart_services(
+                    [svc_type], target=host,
+                )
+                restarted.append(host)
+            except Exception as e:
+                logger.error(f"Failed to restart {svc_type} on {host}: {e}")
+                failures.append(f"{host}: {e}")
+
+        if failures:
+            ctx = (
+                f" (restarted on {', '.join(restarted)})" if restarted else ""
+            )
+            raise RuntimeError(
+                f"Failed to restart {svc_type}: "
+                + "; ".join(failures)
+                + ctx
+            )
+
+        return [f"Restarted {svc_type} on {h}" for h in restarted]
diff --git a/microceph-orch/tests/test_module.py b/microceph-orch/tests/test_module.py
index 87dac2ab8..cbd3631a3 100644
--- a/microceph-orch/tests/test_module.py
+++ b/microceph-orch/tests/test_module.py
@@ -345,12 +345,15 @@ def test_apply_rbd_mirror_success(self, orchestrator, mock_client):
         )
         result = orchestrator.apply_rbd_mirror(spec)
         assert result.exception is None
-        # Summary confirms enablement and echoes the requested placement
-        # without claiming each host was individually provisioned.
-        assert "enabled" in result.result
-        assert "requested placement: node1, node2" in result.result
-        # Single API call regardless of host count (local socket only)
-        mock_client.services.enable_service.assert_called_once()
+        # Per-host fan-out via the proxyTarget middleware: one call per
+        # requested host with target=<host>.
+        assert "enabled on node1, node2" in result.result
+        assert mock_client.services.enable_service.call_count == 2
+        targets = sorted(
+            c.kwargs["target"]
+            for c in mock_client.services.enable_service.call_args_list
+        )
+        assert targets == ["node1", "node2"]
 
     def test_apply_rbd_mirror_no_placement(self, orchestrator, mock_client):
         spec = ServiceSpec(service_type="rbd-mirror")
@@ -358,6 +361,17 @@ def test_apply_rbd_mirror_no_placement(self, orchestrator, mock_client):
         assert result.exception is not None
         assert "No placement hosts" in str(result.exception)
 
+    def test_apply_rbd_mirror_service_id_rejected(self, orchestrator, mock_client):
+        # Bare-name guard: rbd-mirror does not support dotted names.
+        spec = ServiceSpec(
+            service_type="rbd-mirror",
+            service_id="zone1",
+            placement=PlacementSpec(hosts=["node1"]),
+        )
+        result = orchestrator.apply_rbd_mirror(spec)
+        assert isinstance(result.exception, ValueError)
+        mock_client.services.enable_service.assert_not_called()
+
     def test_apply_rbd_mirror_skips_existing(self, orchestrator, mock_client):
         # node1 already runs the service; node2 is also requested.
         mock_client.services.list_services.return_value = [
@@ -370,8 +384,48 @@ def test_apply_rbd_mirror_skips_existing(self, orchestrator, mock_client):
         result = orchestrator.apply_rbd_mirror(spec)
         assert result.exception is None
         assert "already active on node1" in result.result
-        assert "enabled" in result.result
+        assert "enabled on node2" in result.result
+        # Only the not-yet-active host is targeted.
         mock_client.services.enable_service.assert_called_once()
+        assert (
+            mock_client.services.enable_service.call_args.kwargs["target"]
+            == "node2"
+        )
+
+    def test_apply_list_services_failure_propagates(self, orchestrator, mock_client):
+        # If list_services itself fails we must NOT silently treat the
+        # set of existing hosts as empty and re-apply everywhere; the
+        # error has to surface.
+        mock_client.services.list_services.side_effect = RemoteException(
+            "list failed"
+        )
+        spec = ServiceSpec(
+            service_type="rbd-mirror",
+            placement=PlacementSpec(hosts=["node1"]),
+        )
+        result = orchestrator.apply_rbd_mirror(spec)
+        assert isinstance(result.exception, RemoteException)
+        mock_client.services.enable_service.assert_not_called()
+
+    def test_apply_rbd_mirror_partial_failure(self, orchestrator, mock_client):
+        # node1 enables, node2 fails — any failure raises with
+        # partial-success context in the message.
+        spec = ServiceSpec(
+            service_type="rbd-mirror",
+            placement=PlacementSpec(hosts=["node1", "node2"]),
+        )
+
+        def side_effect(*args, **kwargs):
+            if kwargs.get("target") == "node2":
+                raise RemoteException("node2 boom")
+
+        mock_client.services.enable_service.side_effect = side_effect
+        result = orchestrator.apply_rbd_mirror(spec)
+        assert isinstance(result.exception, RuntimeError)
+        msg = str(result.exception)
+        assert "Failed to enable rbd-mirror" in msg
+        assert "node2: node2 boom" in msg
+        assert "enabled on node1" in msg
 
     def test_apply_rbd_mirror_all_existing(self, orchestrator, mock_client):
         mock_client.services.list_services.return_value = [
@@ -408,11 +462,11 @@ def test_apply_rgw_basic(self, orchestrator, mock_client):
         )
         result = orchestrator.apply_rgw(spec)
         assert result.exception is None
-        assert "enabled" in result.result
-        assert "node1" in result.result
+        assert "enabled on node1" in result.result
 
         call_kwargs = mock_client.services.enable_service.call_args
         assert call_kwargs.kwargs["name"] == "rgw"
+        assert call_kwargs.kwargs["target"] == "node1"
 
     def test_apply_rgw_with_port(self, orchestrator, mock_client):
         spec = RGWSpec(
@@ -426,6 +480,7 @@ def test_apply_rgw_with_port(self, orchestrator, mock_client):
         call_kwargs = mock_client.services.enable_service.call_args
         payload = json.loads(call_kwargs.kwargs["payload"])
         assert payload["Port"] == 8080
+        assert call_kwargs.kwargs["target"] == "node1"
 
     def test_apply_rgw_ssl_cert_without_key_warns(self, orchestrator, mock_client, caplog):
         """SSL cert is present but no key; should warn and not send cert."""
@@ -451,27 +506,19 @@ def test_apply_rgw_no_placement(self, orchestrator, mock_client):
         result = orchestrator.apply_rgw(spec)
         assert result.exception is not None
 
-    def test_apply_rgw_service_id_warns(self, orchestrator, mock_client, caplog):
+    def test_apply_rgw_service_id_rejected(self, orchestrator, mock_client):
         # MicroCeph deploys a single bare 'rgw' service; per-realm
-        # service_id is silently dropped — log a warning instead of
-        # surprising the caller.
+        # service_id is not supported and is rejected up front so the
+        # operator does not believe a realm was provisioned.
         spec = RGWSpec(
             service_type="rgw",
             service_id="realm1",
             placement=PlacementSpec(hosts=["node1"]),
         )
-        import logging
-        with caplog.at_level(logging.WARNING):
-            result = orchestrator.apply_rgw(spec)
-
-        assert result.exception is None
-        # service_id is not forwarded to the backend.
-        call_kwargs = mock_client.services.enable_service.call_args
-        assert call_kwargs.kwargs["name"] == "rgw"
-        assert any(
-            "service_id" in r.message and "realm1" in r.message
-            for r in caplog.records
-        )
+        result = orchestrator.apply_rgw(spec)
+        assert isinstance(result.exception, ValueError)
+        assert "does not support a service_id" in str(result.exception)
+        mock_client.services.enable_service.assert_not_called()
 
 
 # ===================================================================
@@ -487,12 +534,12 @@ def test_apply_nfs_basic(self, orchestrator, mock_client):
         )
         result = orchestrator.apply_nfs(spec)
         assert result.exception is None
-        assert "enabled" in result.result
-        assert "node1" in result.result
+        assert "enabled on node1" in result.result
 
         call_kwargs = mock_client.services.enable_service.call_args
         payload = json.loads(call_kwargs.kwargs["payload"])
         assert payload["cluster_id"] == "mycluster"
+        assert call_kwargs.kwargs["target"] == "node1"
 
     def test_apply_nfs_with_port(self, orchestrator, mock_client):
         spec = NFSServiceSpec(
@@ -547,8 +594,13 @@ def test_apply_nfs_skips_existing(self, orchestrator, mock_client):
         )
         result = orchestrator.apply_nfs(spec)
         assert "already active on node1" in result.result
-        assert "enabled" in result.result
+        assert "enabled on node2" in result.result
         mock_client.services.enable_service.assert_called_once()
+        # Only the host that did not yet run the cluster is targeted.
+        assert (
+            mock_client.services.enable_service.call_args.kwargs["target"]
+            == "node2"
+        )
 
     def test_apply_nfs_distinct_cluster_not_conflated(self, orchestrator, mock_client):
         """A host running nfs.other must not suppress enabling nfs.mycluster."""
@@ -584,12 +636,21 @@ def test_apply_cephfs_mirror_success(self, orchestrator, mock_client):
         )
         result = orchestrator.apply_cephfs_mirror(spec)
         assert result.exception is None
-        assert "enabled" in result.result
-        assert "node1" in result.result
+        assert "enabled on node1" in result.result
         mock_client.services.enable_service.assert_called_once_with(
-            name="cephfs-mirror", payload="{}", wait=True,
+            name="cephfs-mirror", payload="{}", wait=True, target="node1",
         )
 
+    def test_apply_cephfs_mirror_service_id_rejected(self, orchestrator, mock_client):
+        spec = ServiceSpec(
+            service_type="cephfs-mirror",
+            service_id="x",
+            placement=PlacementSpec(hosts=["node1"]),
+        )
+        result = orchestrator.apply_cephfs_mirror(spec)
+        assert isinstance(result.exception, ValueError)
+        mock_client.services.enable_service.assert_not_called()
+
 
 # ===================================================================
 # _parse_service_name()
@@ -630,11 +691,15 @@ def test_filter_by_service_name(self, orchestrator, mock_client):
 # ===================================================================
 
 class TestRemoveServiceDotted:
-    def test_remove_dotted_non_nfs_service(self, orchestrator, mock_client):
-        """e.g. mds.myfs should call delete_service('mds')"""
+    def test_remove_dotted_non_nfs_rejected(self, orchestrator, mock_client):
+        # Removing 'mds.myfs' (or any dotted non-NFS name) must be
+        # rejected: silently dropping the id would otherwise wipe the
+        # bare MDS service across every host while the operator
+        # believed only a specific filesystem was being removed.
         result = orchestrator.remove_service("mds.myfs")
-        assert result.exception is None
-        mock_client.services.delete_service.assert_called_once_with("mds")
+        assert isinstance(result.exception, ValueError)
+        assert "does not support a service_id" in str(result.exception)
+        mock_client.services.delete_service.assert_not_called()
 
 
 # ===================================================================
@@ -649,10 +714,9 @@ def test_apply_mon(self, orchestrator, mock_client):
         )
         result = orchestrator.apply_mon(spec)
         assert result.exception is None
-        assert "enabled" in result.result
-        assert "node1" in result.result
+        assert "enabled on node1" in result.result
         mock_client.services.enable_service.assert_called_once_with(
-            name="mon", payload="{}", wait=True,
+            name="mon", payload="{}", wait=True, target="node1",
         )
 
     def test_apply_mgr(self, orchestrator, mock_client):
@@ -663,8 +727,28 @@ def test_apply_mgr(self, orchestrator, mock_client):
         result = orchestrator.apply_mgr(spec)
         assert result.exception is None
         mock_client.services.enable_service.assert_called_once_with(
-            name="mgr", payload="{}", wait=True,
+            name="mgr", payload="{}", wait=True, target="node1",
+        )
+
+    def test_apply_mon_service_id_rejected(self, orchestrator, mock_client):
+        spec = ServiceSpec(
+            service_type="mon",
+            service_id="x",
+            placement=PlacementSpec(hosts=["node1"]),
+        )
+        result = orchestrator.apply_mon(spec)
+        assert isinstance(result.exception, ValueError)
+        mock_client.services.enable_service.assert_not_called()
+
+    def test_apply_mgr_service_id_rejected(self, orchestrator, mock_client):
+        spec = ServiceSpec(
+            service_type="mgr",
+            service_id="x",
+            placement=PlacementSpec(hosts=["node1"]),
         )
+        result = orchestrator.apply_mgr(spec)
+        assert isinstance(result.exception, ValueError)
+        mock_client.services.enable_service.assert_not_called()
 
     def test_apply_mds(self, orchestrator, mock_client):
         spec = ServiceSpec(
@@ -674,29 +758,21 @@ def test_apply_mds(self, orchestrator, mock_client):
         result = orchestrator.apply_mds(spec)
         assert result.exception is None
         mock_client.services.enable_service.assert_called_once_with(
-            name="mds", payload="{}", wait=True,
+            name="mds", payload="{}", wait=True, target="node1",
         )
 
-    def test_apply_mds_service_id_warns(self, orchestrator, mock_client, caplog):
+    def test_apply_mds_service_id_rejected(self, orchestrator, mock_client):
         # Per-filesystem MDS placement is not supported by MicroCeph;
-        # service_id (filesystem name) must be dropped with a warning.
+        # a service_id (filesystem name) is rejected up front.
         spec = ServiceSpec(
             service_type="mds",
             service_id="fs1",
             placement=PlacementSpec(hosts=["node1"]),
         )
-        import logging
-        with caplog.at_level(logging.WARNING):
-            result = orchestrator.apply_mds(spec)
-
-        assert result.exception is None
-        mock_client.services.enable_service.assert_called_once_with(
-            name="mds", payload="{}", wait=True,
-        )
-        assert any(
-            "service_id" in r.message and "fs1" in r.message
-            for r in caplog.records
-        )
+        result = orchestrator.apply_mds(spec)
+        assert isinstance(result.exception, ValueError)
+        assert "does not support a service_id" in str(result.exception)
+        mock_client.services.enable_service.assert_not_called()
 
 
 # ===================================================================
@@ -705,26 +781,80 @@ def test_apply_mds_service_id_warns(self, orchestrator, mock_client, caplog):
 
 class TestRemoveService:
     def test_remove_service_basic(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            _svc("rgw", "node1"),
+            _svc("rgw", "node2"),
+        ]
+        result = orchestrator.remove_service("rgw")
+        assert result.exception is None
+        assert "removed from node1, node2" in result.result
+        assert mock_client.services.delete_service.call_count == 2
+
+    def test_remove_service_no_hosts(self, orchestrator, mock_client):
+        # list_services returns [] (default) — nothing to remove.
         result = orchestrator.remove_service("rgw")
         assert result.exception is None
-        assert "Removed service rgw" in result.result
-        mock_client.services.delete_service.assert_called_once_with("rgw")
+        assert "no hosts" in result.result
+        mock_client.services.delete_service.assert_not_called()
 
     def test_remove_service_nfs_with_cluster_id(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            _svc("nfs", "node1", group_id="mycluster"),
+        ]
         result = orchestrator.remove_service("nfs.mycluster")
         assert result.exception is None
-        assert "Removed service nfs.mycluster" in result.result
-        mock_client.services.delete_nfs_service.assert_called_once_with("mycluster")
+        assert "removed from node1" in result.result
+        mock_client.services.delete_nfs_service.assert_called_once()
+        c = mock_client.services.delete_nfs_service.call_args
+        assert c.args == ("mycluster",)
+        assert c.kwargs["target"] == "node1"
 
     def test_remove_service_nfs_without_cluster_id(self, orchestrator, mock_client):
         result = orchestrator.remove_service("nfs")
         assert result.exception is not None
         assert "cluster_id" in str(result.exception)
 
+    def test_remove_service_nfs_group_id_no_match(self, orchestrator, mock_client):
+        # NFS cluster 'other' exists on node1, but we're removing
+        # 'mycluster' — there is nothing to do; the call must NOT
+        # cascade into deleting the unrelated cluster.
+        mock_client.services.list_services.return_value = [
+            _svc("nfs", "node1", group_id="other"),
+        ]
+        result = orchestrator.remove_service("nfs.mycluster")
+        assert result.exception is None
+        assert "no hosts" in result.result
+        mock_client.services.delete_nfs_service.assert_not_called()
+
     def test_remove_service_api_error(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            _svc("rgw", "node1"),
+        ]
         mock_client.services.delete_service.side_effect = RemoteException("fail")
         result = orchestrator.remove_service("rgw")
         assert result.exception is not None
+        assert "Failed to remove" in str(result.exception)
+
+    def test_remove_service_partial_failure(self, orchestrator, mock_client):
+        # node1 succeeds, node2 fails — any failure raises so the
+        # operator sees a visible error, and the partial-success
+        # context is included in the message.
+        mock_client.services.list_services.return_value = [
+            _svc("rgw", "node1"),
+            _svc("rgw", "node2"),
+        ]
+
+        def side_effect(*args, **kwargs):
+            if kwargs.get("target") == "node2":
+                raise RemoteException("node2 boom")
+
+        mock_client.services.delete_service.side_effect = side_effect
+        result = orchestrator.remove_service("rgw")
+        assert isinstance(result.exception, RuntimeError)
+        msg = str(result.exception)
+        assert "Failed to remove rgw" in msg
+        assert "node2: node2 boom" in msg
+        assert "removed from node1" in msg
 
 
 # ===================================================================
@@ -750,33 +880,78 @@ def test_remove_host_api_error(self, orchestrator, mock_client):
 
 class TestServiceAction:
     def test_restart_service(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            _svc("mon", "node1"),
+            _svc("mon", "node2"),
+        ]
         result = orchestrator.service_action("restart", "mon")
         assert result.exception is None
-        assert "Restarted mon" in result.result[0]
-        mock_client.services.restart_services.assert_called_once_with(["mon"])
+        # One restart call per host carrying target=<host>.
+        assert mock_client.services.restart_services.call_count == 2
+        targets = sorted(
+            c.kwargs["target"]
+            for c in mock_client.services.restart_services.call_args_list
+        )
+        assert targets == ["node1", "node2"]
+        assert any("Restarted mon on node1" in r for r in result.result)
+        assert any("Restarted mon on node2" in r for r in result.result)
 
-    def test_restart_service_with_id(self, orchestrator, mock_client):
-        # Service identifiers are stripped before dispatching to the
-        # MicroCeph API; rgw is a supported restart target.
-        result = orchestrator.service_action("restart", "rgw.realm1")
+    def test_restart_no_hosts(self, orchestrator, mock_client):
+        # No hosts running the service — no-op (still success).
+        result = orchestrator.service_action("restart", "mon")
         assert result.exception is None
-        mock_client.services.restart_services.assert_called_once_with(["rgw"])
+        assert any("nothing to restart" in r for r in result.result)
+        mock_client.services.restart_services.assert_not_called()
+
+    def test_restart_dotted_non_nfs_rejected(self, orchestrator, mock_client):
+        # Dotted non-NFS names are rejected: restarting 'rgw.realm1'
+        # would have to silently drop the id and restart every rgw,
+        # which is a surprising operation.
+        result = orchestrator.service_action("restart", "rgw.realm1")
+        assert isinstance(result.exception, ValueError)
+        assert "does not support a service_id" in str(result.exception)
+        mock_client.services.restart_services.assert_not_called()
 
     def test_restart_unsupported_service_type(self, orchestrator, mock_client):
         # MicroCeph's backend serviceWorkerTable only handles osd/mon/rgw.
         # The orchestrator must reject restart for other service types up
         # front rather than surfacing an opaque RemoteException.
         for svc in ["nfs.mycluster", "mds", "mgr", "rbd-mirror"]:
+            mock_client.services.restart_services.reset_mock()
             result = orchestrator.service_action("restart", svc)
             assert isinstance(result.exception, NotImplementedError), svc
             mock_client.services.restart_services.assert_not_called()
 
+    def test_restart_partial_failure(self, orchestrator, mock_client):
+        # node1 restarts, node2 fails — any failure raises with
+        # partial-success context in the message.
+        mock_client.services.list_services.return_value = [
+            _svc("mon", "node1"),
+            _svc("mon", "node2"),
+        ]
+
+        def side_effect(*args, **kwargs):
+            if kwargs.get("target") == "node2":
+                raise RemoteException("node2 boom")
+
+        mock_client.services.restart_services.side_effect = side_effect
+        result = orchestrator.service_action("restart", "mon")
+        assert isinstance(result.exception, RuntimeError)
+        msg = str(result.exception)
+        assert "Failed to restart mon" in msg
+        assert "node2: node2 boom" in msg
+        assert "restarted on node1" in msg
+
     def test_unsupported_action(self, orchestrator, mock_client):
         result = orchestrator.service_action("stop", "mon")
         assert result.exception is not None
         assert "not supported" in str(result.exception)
 
     def test_restart_api_error(self, orchestrator, mock_client):
+        mock_client.services.list_services.return_value = [
+            _svc("mon", "node1"),
+        ]
         mock_client.services.restart_services.side_effect = RemoteException("fail")
         result = orchestrator.service_action("restart", "mon")
         assert result.exception is not None
+        assert "Failed to restart" in str(result.exception)
diff --git a/tests/scripts/test-orch.sh b/tests/scripts/test-orch.sh
index ccf36f817..97a01b235 100755
--- a/tests/scripts/test-orch.sh
+++ b/tests/scripts/test-orch.sh
@@ -115,7 +115,8 @@ assert_exit_ok "device ls succeeds" run_ceph orch device ls
 echo "=== 6. Apply RGW ==="
 # ===================================================================
 
-# Clean up RGW if it was already enabled by prior test steps
+# Clean up RGW if it was already enabled by prior test steps.
+# `orch rm rgw` is now cluster-wide (fans out to every host running rgw).
 run_ceph orch rm rgw >/dev/null 2>&1 || true
 sleep 3
 
@@ -131,6 +132,34 @@ assert_contains "rgw in service list" "rgw" "$output"
 output=$(run_ceph orch ps --daemon-type rgw)
 assert_contains "rgw daemon visible" "rgw" "$output"
 
+# Per-host targeting check: the RGW daemon must end up on the
+# requested host (first_host), not on whichever node served the unix
+# socket. This is what the ?target= proxyTarget mechanism is for.
+output=$(run_ceph orch ps --hostname "$placement" --daemon-type rgw)
+assert_contains "rgw daemon on requested host $placement" "rgw" "$output"
+
+# ===================================================================
+echo "=== 6b. Apply RGW on a second host (per-host targeting) ==="
+# ===================================================================
+
+# Find a host that is NOT the first one to verify cross-node enablement
+# from the local socket. Only run this leg when the cluster has more
+# than one host (single-node deployments skip).
+second_host=$(run_ceph orch host ls | awk 'NR>1 && NF>0 && $1 != "'"$first_host"'" {print $1; exit}')
+if [ -n "$second_host" ] && [ "$second_host" != "$first_host" ]; then
+    output=$(run_ceph orch apply rgw default --placement="$first_host,$second_host")
+    assert_contains "rgw applied to two hosts" "enabled|already active" "$output"
+    sleep 5
+
+    # Verify both hosts now run rgw. The local node (running the orch
+    # client) is the unix-socket endpoint; the second_host enablement
+    # exercises the unix-socket -> proxyTarget -> remote mTLS path.
+    output=$(run_ceph orch ps --hostname "$second_host" --daemon-type rgw)
+    assert_contains "rgw daemon on $second_host (cross-node target)" "rgw" "$output"
+else
+    echo "  SKIP per-host targeting RGW test (only one host in cluster)"
+fi
+
 # ===================================================================
 echo "=== 7. Apply NFS ==="
 # ===================================================================
@@ -161,23 +190,38 @@ echo "=== 8. Restart service ==="
 # ===================================================================
 
 output=$(run_ceph orch restart mon)
-assert_contains "mon restarted" "Restarted" "$output"
+assert_contains "mon restarted" "Restarted mon on" "$output"
+
+# Per-host fan-out: in a multi-host cluster the restart output should
+# enumerate each host running mon, not a single "Restarted mon".
+if [ -n "${second_host:-}" ] && [ "$second_host" != "$first_host" ]; then
+    assert_contains "mon restart enumerates first host" "$first_host" "$output"
+fi
+
+# Dotted-name guard: orch restart rgw.realm1 must fail with a clear
+# "does not support a service_id" error rather than silently restart
+# every rgw daemon.
+output=$(run_ceph orch restart rgw.realm1 2>&1 || true)
+assert_contains "dotted restart rejected" "does not support a service_id" "$output"
 
 # ===================================================================
 echo "=== 9. Remove RGW ==="
 # ===================================================================
 
 output=$(run_ceph orch rm rgw)
-assert_contains "rgw removed" "Removed" "$output"
+assert_contains "rgw removed" "removed from" "$output"
 sleep 3
 
-# Verify RGW is gone from the local node. In multi-node clusters,
-# RGW may still appear on other nodes if it was enabled independently
-# (orch rm only affects the local node; per-host targeting not yet
-# supported, see UseTarget limitation).
+# remove_service fans out across every host running the service, so
+# RGW must be gone from BOTH first_host and second_host (when present).
 local_host="${first_host:-$(hostname)}"
 output=$(run_ceph orch ps --hostname "$local_host" --daemon-type rgw)
-assert_not_contains "rgw gone from local node" "rgw" "$output"
+assert_not_contains "rgw gone from first host" "rgw" "$output"
+
+if [ -n "${second_host:-}" ] && [ "$second_host" != "$first_host" ]; then
+    output=$(run_ceph orch ps --hostname "$second_host" --daemon-type rgw)
+    assert_not_contains "rgw gone from second host (cluster-wide remove)" "rgw" "$output"
+fi
 
 # ===================================================================
 echo "=== 10. Remove NFS ==="

From c9fc7c65a850e8fdf140fe45a6613ec113a06495 Mon Sep 17 00:00:00 2001
From: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
Date: Thu, 28 May 2026 15:35:25 +0530
Subject: [PATCH 11/11] orch: tighten error semantics and tolerate TOCTOU race
 on apply

Pre-merge polish after multi-agent review of the per-host targeting
work. No new feature surface; corrects how the orchestrator reports
failures and unsupported operations to the Ceph framework, and
hardens the apply path against a backend race.

Exception hierarchy
-------------------

Swap bare exception types for Ceph orchestrator types so the
framework renders them as proper operation errors rather than Python
tracebacks:

* `RuntimeError` (per-host fan-out failure) -> `OrchestratorError`
* `NotImplementedError` (unsupported action / unsupported service
  type / unsupported InventoryFilter labels) ->
  `OrchestratorValidationError`

`ValueError` is kept for input-shape errors (dotted-name guard, NFS
missing cluster_id) since those fire before any I/O.

Cephadm parity on "nothing to do"
---------------------------------

`remove_service` and `service_action("restart", ...)` now raise
`OrchestratorError("Service X is not running on any host")` when the
requested service has no live hosts, instead of returning a green
success summary. This matches cephadm: an operator asking to remove
or restart a service that does not exist should see an error so a
typo or stale assumption is caught, not a silent no-op.

TOCTOU tolerance on apply
-------------------------

`_apply_service` snapshots existing hosts once and then enables per
host. If the service races into existence between snapshot and
enable, the backend's `genericHospitalityCheck`
(`microceph/ceph/services_placement_generic.go`) returns
"<svc> service already active on host". This is the desired end
state, not a failure, so we now match that substring and treat the
host as enabled instead of appending to failures.

`describe_service` size
-----------------------

`ServiceDescription` now sets `size=len(hostlist)` alongside
`running`. MicroCeph has no separate spec store -- desired state is
the live running set -- so `ceph orch ls` was rendering RUNNING/-
(looks degraded). With `size` populated it renders RUNNING/SIZE
cleanly.

Integration test
----------------

* `tests/scripts/test-orch.sh` `second_host` awk now filters first
  column to a hostname shape (`/^[A-Za-z]/`) so the trailing
  "N hosts in cluster" footer row is no longer picked up as a
  hostname on single-node CI.
* Section 10 (Remove NFS) now accepts either the new
  "removed from <host>" success shape or the new
  "not running on any host" error shape (NFS may legitimately have
  failed to start on CI runners missing kernel modules).

Tests
-----

* 94 unit tests pass.
* Exception-class assertions updated:
  - `isinstance(..., RuntimeError)` -> `OrchestratorError`
  - `isinstance(..., NotImplementedError)` -> `OrchestratorValidationError`
* `test_remove_service_no_hosts`, `test_remove_service_nfs_group_id_no_match`,
  `test_restart_no_hosts` now assert the new "not running on any host"
  error shape instead of a string-summary success.
* `tests/stubs.py` and `tests/conftest.py` extended with
  `OrchestratorError` and `OrchestratorValidationError` stubs and
  bound into the injected `orchestrator` module so production-code
  imports resolve to the right types under test.
* `ServiceDescription` stub gains a `size` keyword.

Refs canonical/microceph#743

Signed-off-by: Utkarsh Bhatt <utkarsh.bhatt@canonical.com>
Assisted-by: Claude Opus 4.7 <noreply@anthropic.com>
---
 microceph-orch/src/microceph/module.py | 51 +++++++++++++++++++++-----
 microceph-orch/tests/conftest.py       |  4 ++
 microceph-orch/tests/stubs.py          | 11 +++++-
 microceph-orch/tests/test_module.py    | 37 +++++++++++--------
 tests/scripts/test-orch.sh             | 18 ++++++---
 5 files changed, 89 insertions(+), 32 deletions(-)

diff --git a/microceph-orch/src/microceph/module.py b/microceph-orch/src/microceph/module.py
index 57b75ec87..f2bc7babc 100644
--- a/microceph-orch/src/microceph/module.py
+++ b/microceph-orch/src/microceph/module.py
@@ -31,6 +31,8 @@
     CLICommandMeta,
     handle_orch_error,
     OrchResult,
+    OrchestratorError,
+    OrchestratorValidationError,
 )
 
 from .client.client import Client
@@ -243,9 +245,15 @@ def describe_service(self,
                     service_id=svc_id, service_type=svc_type, placement=PlacementSpec(hosts=hostlist, count=len(hostlist))
                 )
 
+            # `size` is the desired daemon count; `running` is the actual.
+            # MicroCeph's desired state IS the current set of hosts running
+            # the service (no separate spec store), so they match. Setting
+            # `size` explicitly avoids `ceph orch ls` rendering `RUNNING/-`,
+            # which looks like a degraded/incomplete deployment.
             service_descs.append(ServiceDescription(
                 spec=spec,
-                running=len(hostlist)
+                running=len(hostlist),
+                size=len(hostlist),
             ))
 
         return service_descs
@@ -326,7 +334,7 @@ def get_inventory(self,
         filter_hosts = None
         if host_filter:
             if host_filter.labels:
-                raise NotImplementedError(
+                raise OrchestratorValidationError(
                     "MicroCeph orchestrator does not support host labels"
                 )
             if host_filter.hosts:
@@ -457,6 +465,20 @@ def _apply_service(self, service_type: str, spec: ServiceSpec,
                 )
                 enabled.append(host)
             except Exception as e:
+                # Tolerate the TOCTOU window between the snapshot taken
+                # by _get_existing_service_hosts and the per-host enable
+                # call. The backend's genericHospitalityCheck
+                # (microceph/ceph/services_placement_generic.go) returns
+                # "<svc> service already active on host" if the service
+                # raced into existence between snapshot and enable; for
+                # an apply that is the desired end-state, not a failure.
+                if "already active on host" in str(e):
+                    logger.info(
+                        f"{service_type} became active on {host} after "
+                        "snapshot; treating as no-op."
+                    )
+                    enabled.append(host)
+                    continue
                 logger.error(f"Failed to enable {service_type} on {host}: {e}")
                 failures.append(f"{host}: {e}")
 
@@ -470,7 +492,7 @@ def _apply_service(self, service_type: str, spec: ServiceSpec,
             if already_active:
                 ctx.append(f"already active on {', '.join(already_active)}")
             ctx_str = f" ({'; '.join(ctx)})" if ctx else ""
-            raise RuntimeError(
+            raise OrchestratorError(
                 f"Failed to enable {service_type}: "
                 + "; ".join(failures)
                 + ctx_str
@@ -651,8 +673,12 @@ def remove_service(self, service_name: str, force: bool = False) -> OrchResult[s
         hosts = sorted(self._get_existing_service_hosts(svc_type, group_id))
 
         if not hosts:
-            logger.info(f"No hosts found running {service_name}; nothing to do")
-            return f"{service_name}: no hosts running this service"
+            # Match cephadm: removing a service that is not deployed is
+            # surfaced as an error so the operator notices a typo or
+            # stale state rather than seeing a green no-op.
+            raise OrchestratorError(
+                f"Service {service_name!r} is not running on any host"
+            )
 
         removed: List[str] = []
         failures: List[str] = []
@@ -675,7 +701,7 @@ def remove_service(self, service_name: str, force: bool = False) -> OrchResult[s
         # partial-success context is included in the message.
         if failures:
             ctx = f" (removed from {', '.join(removed)})" if removed else ""
-            raise RuntimeError(
+            raise OrchestratorError(
                 f"Failed to remove {service_name}: "
                 + "; ".join(failures)
                 + ctx
@@ -711,7 +737,7 @@ def service_action(self, action: str, service_name: str) -> OrchResult[List[str]
         logger.info(f"Service action: {action} on {service_name}")
 
         if action != "restart":
-            raise NotImplementedError(
+            raise OrchestratorValidationError(
                 f"Service action '{action}' is not supported by MicroCeph. "
                 "Only 'restart' is currently available."
             )
@@ -724,7 +750,7 @@ def service_action(self, action: str, service_name: str) -> OrchResult[List[str]
         _require_bare_service_name(svc_type, svc_id)
 
         if svc_type not in RESTART_SUPPORTED_SERVICES:
-            raise NotImplementedError(
+            raise OrchestratorValidationError(
                 f"Restart of service type {svc_type!r} is not supported by "
                 f"MicroCeph. Supported services: "
                 f"{sorted(RESTART_SUPPORTED_SERVICES)}."
@@ -738,7 +764,12 @@ def service_action(self, action: str, service_name: str) -> OrchResult[List[str]
         group_id = svc_id if svc_type in DOTTED_NAME_SUPPORTED_SERVICES else None
         hosts = sorted(self._get_existing_service_hosts(svc_type, group_id))
         if not hosts:
-            return [f"No hosts running {svc_type}; nothing to restart"]
+            # Restarting a service that is not deployed is an error: the
+            # operator either targeted the wrong name or expected the
+            # service to be running. Match cephadm semantics.
+            raise OrchestratorError(
+                f"Service {svc_type!r} is not running on any host"
+            )
 
         restarted: List[str] = []
         failures: List[str] = []
@@ -756,7 +787,7 @@ def service_action(self, action: str, service_name: str) -> OrchResult[List[str]
             ctx = (
                 f" (restarted on {', '.join(restarted)})" if restarted else ""
             )
-            raise RuntimeError(
+            raise OrchestratorError(
                 f"Failed to restart {svc_type}: "
                 + "; ".join(failures)
                 + ctx
diff --git a/microceph-orch/tests/conftest.py b/microceph-orch/tests/conftest.py
index 8d231248d..94b566fba 100644
--- a/microceph-orch/tests/conftest.py
+++ b/microceph-orch/tests/conftest.py
@@ -28,6 +28,8 @@
     ServiceDescription,
     DaemonDescription,
     Orchestrator,
+    OrchestratorError,
+    OrchestratorValidationError,
     MgrModule,
     NotifyType,
 )
@@ -83,6 +85,8 @@ def _install_mocks():
     orch_mod.CLICommandMeta = CLICommandMeta
     orch_mod.handle_orch_error = handle_orch_error
     orch_mod.OrchResult = OrchResult
+    orch_mod.OrchestratorError = OrchestratorError
+    orch_mod.OrchestratorValidationError = OrchestratorValidationError
 
     # snap-only deps
     requests_unixsocket_mod = MagicMock()
diff --git a/microceph-orch/tests/stubs.py b/microceph-orch/tests/stubs.py
index 1a950fd3a..b2c61b023 100644
--- a/microceph-orch/tests/stubs.py
+++ b/microceph-orch/tests/stubs.py
@@ -111,9 +111,10 @@ def __init__(self, name="", devices=None):
 
 
 class ServiceDescription:
-    def __init__(self, spec=None, running=0, **kwargs):
+    def __init__(self, spec=None, running=0, size=0, **kwargs):
         self.spec = spec
         self.running = running
+        self.size = size
 
 
 class DaemonDescription:
@@ -127,6 +128,14 @@ def __init__(self, service_name="", daemon_type="", daemon_id="",
         self.ports = ports
 
 
+class OrchestratorError(Exception):
+    pass
+
+
+class OrchestratorValidationError(OrchestratorError):
+    pass
+
+
 class Orchestrator:
     pass
 
diff --git a/microceph-orch/tests/test_module.py b/microceph-orch/tests/test_module.py
index cbd3631a3..84707611e 100644
--- a/microceph-orch/tests/test_module.py
+++ b/microceph-orch/tests/test_module.py
@@ -13,6 +13,8 @@
     RGWSpec,
     NFSServiceSpec,
     InventoryFilter,
+    OrchestratorError,
+    OrchestratorValidationError,
 )
 
 from microceph.client.service import RemoteException
@@ -313,7 +315,7 @@ def test_get_inventory_with_host_filter(self, orchestrator, mock_client):
     def test_get_inventory_label_filter_rejected(self, orchestrator, mock_client):
         filt = InventoryFilter(labels=["role=osd"])
         result = orchestrator.get_inventory(host_filter=filt)
-        assert isinstance(result.exception, NotImplementedError)
+        assert isinstance(result.exception, OrchestratorValidationError)
         mock_client.services.list_disks.assert_not_called()
 
     def test_get_inventory_empty(self, orchestrator, mock_client):
@@ -421,7 +423,7 @@ def side_effect(*args, **kwargs):
 
         mock_client.services.enable_service.side_effect = side_effect
         result = orchestrator.apply_rbd_mirror(spec)
-        assert isinstance(result.exception, RuntimeError)
+        assert isinstance(result.exception, OrchestratorError)
         msg = str(result.exception)
         assert "Failed to enable rbd-mirror" in msg
         assert "node2: node2 boom" in msg
@@ -791,10 +793,12 @@ def test_remove_service_basic(self, orchestrator, mock_client):
         assert mock_client.services.delete_service.call_count == 2
 
     def test_remove_service_no_hosts(self, orchestrator, mock_client):
-        # list_services returns [] (default) — nothing to remove.
+        # list_services returns [] (default) — removing a service that
+        # isn't deployed must surface as an error (cephadm parity), not
+        # a green no-op.
         result = orchestrator.remove_service("rgw")
-        assert result.exception is None
-        assert "no hosts" in result.result
+        assert isinstance(result.exception, OrchestratorError)
+        assert "not running on any host" in str(result.exception)
         mock_client.services.delete_service.assert_not_called()
 
     def test_remove_service_nfs_with_cluster_id(self, orchestrator, mock_client):
@@ -816,14 +820,15 @@ def test_remove_service_nfs_without_cluster_id(self, orchestrator, mock_client):
 
     def test_remove_service_nfs_group_id_no_match(self, orchestrator, mock_client):
         # NFS cluster 'other' exists on node1, but we're removing
-        # 'mycluster' — there is nothing to do; the call must NOT
-        # cascade into deleting the unrelated cluster.
+        # 'mycluster' — there is nothing to do for this cluster_id;
+        # surface as an error and do NOT cascade into deleting the
+        # unrelated cluster.
         mock_client.services.list_services.return_value = [
             _svc("nfs", "node1", group_id="other"),
         ]
         result = orchestrator.remove_service("nfs.mycluster")
-        assert result.exception is None
-        assert "no hosts" in result.result
+        assert isinstance(result.exception, OrchestratorError)
+        assert "not running on any host" in str(result.exception)
         mock_client.services.delete_nfs_service.assert_not_called()
 
     def test_remove_service_api_error(self, orchestrator, mock_client):
@@ -850,7 +855,7 @@ def side_effect(*args, **kwargs):
 
         mock_client.services.delete_service.side_effect = side_effect
         result = orchestrator.remove_service("rgw")
-        assert isinstance(result.exception, RuntimeError)
+        assert isinstance(result.exception, OrchestratorError)
         msg = str(result.exception)
         assert "Failed to remove rgw" in msg
         assert "node2: node2 boom" in msg
@@ -897,10 +902,12 @@ def test_restart_service(self, orchestrator, mock_client):
         assert any("Restarted mon on node2" in r for r in result.result)
 
     def test_restart_no_hosts(self, orchestrator, mock_client):
-        # No hosts running the service — no-op (still success).
+        # Restarting a service that isn't deployed surfaces as an
+        # error so the operator notices the typo or stale assumption
+        # (cephadm parity).
         result = orchestrator.service_action("restart", "mon")
-        assert result.exception is None
-        assert any("nothing to restart" in r for r in result.result)
+        assert isinstance(result.exception, OrchestratorError)
+        assert "not running on any host" in str(result.exception)
         mock_client.services.restart_services.assert_not_called()
 
     def test_restart_dotted_non_nfs_rejected(self, orchestrator, mock_client):
@@ -919,7 +926,7 @@ def test_restart_unsupported_service_type(self, orchestrator, mock_client):
         for svc in ["nfs.mycluster", "mds", "mgr", "rbd-mirror"]:
             mock_client.services.restart_services.reset_mock()
             result = orchestrator.service_action("restart", svc)
-            assert isinstance(result.exception, NotImplementedError), svc
+            assert isinstance(result.exception, OrchestratorValidationError), svc
             mock_client.services.restart_services.assert_not_called()
 
     def test_restart_partial_failure(self, orchestrator, mock_client):
@@ -936,7 +943,7 @@ def side_effect(*args, **kwargs):
 
         mock_client.services.restart_services.side_effect = side_effect
         result = orchestrator.service_action("restart", "mon")
-        assert isinstance(result.exception, RuntimeError)
+        assert isinstance(result.exception, OrchestratorError)
         msg = str(result.exception)
         assert "Failed to restart mon" in msg
         assert "node2: node2 boom" in msg
diff --git a/tests/scripts/test-orch.sh b/tests/scripts/test-orch.sh
index 97a01b235..a533fa7f0 100755
--- a/tests/scripts/test-orch.sh
+++ b/tests/scripts/test-orch.sh
@@ -145,7 +145,10 @@ echo "=== 6b. Apply RGW on a second host (per-host targeting) ==="
 # Find a host that is NOT the first one to verify cross-node enablement
 # from the local socket. Only run this leg when the cluster has more
 # than one host (single-node deployments skip).
-second_host=$(run_ceph orch host ls | awk 'NR>1 && NF>0 && $1 != "'"$first_host"'" {print $1; exit}')
+# Filter to rows whose first column looks like a hostname (starts with
+# a letter) so the trailing "N hosts in cluster" footer line is not
+# mistaken for a host name on single-node CI.
+second_host=$(run_ceph orch host ls | awk 'NR>1 && NF>0 && $1 ~ /^[A-Za-z]/ && $1 != "'"$first_host"'" {print $1; exit}')
 if [ -n "$second_host" ] && [ "$second_host" != "$first_host" ]; then
     output=$(run_ceph orch apply rgw default --placement="$first_host,$second_host")
     assert_contains "rgw applied to two hosts" "enabled|already active" "$output"
@@ -227,13 +230,16 @@ fi
 echo "=== 10. Remove NFS ==="
 # ===================================================================
 
-output=$(run_ceph orch rm nfs.testcluster)
-# NFS removal may fail if the service never fully started
-if echo "$output" | grep -q "Removed"; then
-    echo "  PASS nfs removed"
+output=$(run_ceph orch rm nfs.testcluster 2>&1 || true)
+# NFS removal may fail if the service never fully started. In that
+# case remove_service now raises OrchestratorError("not running on
+# any host"); accept either the success ("removed from") shape or
+# that specific error.
+if echo "$output" | grep -qE "removed from|not running on any host"; then
+    echo "  PASS nfs removal completed (either removed or never started)"
     PASS=$((PASS + 1))
 else
-    echo "  WARN nfs removal returned: $output (service may not have been running)"
+    echo "  WARN nfs removal returned unexpected output: $output"
 fi
 sleep 3