From 8567b63a114d5d20b3294d5d2b71488256a2ab89 Mon Sep 17 00:00:00 2001 From: Utkarsh Bhatt Date: Wed, 29 Apr 2026 15:11:17 +0530 Subject: [PATCH 01/11] feat: enhance microceph orchestrator Signed-off-by: Utkarsh Bhatt --- .gitignore | 2 + .../src/microceph/client/cluster.py | 57 +- microceph-orch/src/microceph/module.py | 361 ++++++++-- microceph-orch/tests/conftest.py | 148 ++++ microceph-orch/tests/stubs.py | 135 ++++ microceph-orch/tests/test_client.py | 188 +++++ microceph-orch/tests/test_module.py | 671 ++++++++++++++++++ 7 files changed, 1517 insertions(+), 45 deletions(-) create mode 100644 microceph-orch/tests/conftest.py create mode 100644 microceph-orch/tests/stubs.py create mode 100644 microceph-orch/tests/test_client.py create mode 100644 microceph-orch/tests/test_module.py diff --git a/.gitignore b/.gitignore index c0dcfe174..ad99d0017 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ _build .vscode microceph/coverage.out microceph/coverage.html +__pycache__/ +*.pyc diff --git a/microceph-orch/src/microceph/client/cluster.py b/microceph-orch/src/microceph/client/cluster.py index 6601e999a..e0173eb11 100644 --- a/microceph-orch/src/microceph/client/cluster.py +++ b/microceph-orch/src/microceph/client/cluster.py @@ -18,7 +18,7 @@ def get_cluster_members(self) -> list: """ result = [] cluster = self._get("/core/1.0/cluster") - members = cluster.get("metadata", {}) + members = cluster.get("metadata") or [] keys = ["name", "address", "status"] for member in members: result.append({k: v for k, v in member.items() if k in keys}) @@ -55,22 +55,22 @@ class ExtendedAPIService(service.BaseService): def list_services(self) -> list[dict]: """List all services.""" services = self._get("/1.0/services") - return services.get("metadata") + return services.get("metadata") or [] def list_resources(self) -> list[dict]: """List all resources.""" nodes = self._get("/1.0/resources") - return nodes.get("metadata") + return nodes.get("metadata") or [] def list_disks(self) -> list[dict]: """List all disks""" disks = self._get("/1.0/disks") - return disks.get("metadata") + return disks.get("metadata") or [] def get_status(self) -> dict[str, dict]: """Get status of the cluster.""" cluster = self._get("/1.0/status") - members = cluster.get("metadata", {}) + members = cluster.get("metadata") or [] return { member["name"]: { "status": member["status"], @@ -78,3 +78,50 @@ def get_status(self) -> dict[str, dict]: } for member in members } + + def enable_service(self, name: str, payload: str = "", wait: bool = True) -> None: + """Enable a service on the cluster. + + Sends a PUT request to /1.0/services/ with an EnableService payload. + The Go API dispatches this to ServicePlacementHandler which runs the full + placement lifecycle: PopulateParams, HospitalityCheck, ServiceInit, + PostPlacementCheck, DbUpdate. + + :param name: service name (e.g. 'rgw', 'nfs', 'rbd-mirror') + :param payload: JSON string with service-specific parameters + :param wait: if True, the server waits for the service to be fully enabled + """ + # Note: The "bool" key maps to Go's EnableService.Wait field which has + # the struct tag `json:"bool"` (upstream naming quirk in MicroCeph). + self._put(f"/1.0/services/{name}", json={ + "name": name, + "bool": wait, + "payload": payload, + }) + + def delete_service(self, name: str) -> None: + """Delete/disable a service on the cluster. + + :param name: service name (e.g. 'rgw', 'nfs', 'rbd-mirror') + """ + self._delete(f"/1.0/services/{name}") + + def restart_services(self, services: list[str]) -> None: + """Restart one or more services on the cluster. + + Sends a POST to /1.0/services/restart with a list of service objects. + + :param services: list of service names (e.g. ['mon', 'rgw']) + """ + payload = [{"service": svc} for svc in services] + self._post("/1.0/services/restart", json=payload) + + def delete_nfs_service(self, cluster_id: str) -> None: + """Delete/disable an NFS service by cluster ID. + + NFS deletion requires the cluster_id in the request body, unlike + other services which are identified by URL path alone. + + :param cluster_id: NFS cluster identifier + """ + self._delete("/1.0/services/nfs", json={"cluster_id": cluster_id}) diff --git a/microceph-orch/src/microceph/module.py b/microceph-orch/src/microceph/module.py index 403c7f028..9391af7e3 100644 --- a/microceph-orch/src/microceph/module.py +++ b/microceph-orch/src/microceph/module.py @@ -109,6 +109,8 @@ def available(self) -> Tuple[bool, str, Dict[str, Any]]: self.microceph.status.is_available() except RemoteException as e: return False, f"Cannot reach the MicroCeph API: {e}", {} + except Exception as e: + return False, f"Unexpected error reaching MicroCeph API: {e}", {} return True, "", {} @@ -130,8 +132,18 @@ def get_hosts(self) -> List[HostSpec]: """ specs = [] for m in self.microceph.cluster.get_cluster_members(): - addr, _, _ = m['address'].rpartition(":") - specs.append(HostSpec(m['name'], addr, status=m['status'])) + # Address from microcluster is in "host:port" format. + # rpartition splits on the last ":" to separate host from port. + address = m.get('address', '') + addr, _, _ = address.rpartition(":") + if not addr: + # No ":" found; use the raw address (may be hostname only). + addr = address + specs.append(HostSpec( + m.get('name', ''), + addr, + status=m.get('status', 'unknown'), + )) return specs @@ -142,16 +154,20 @@ def _get_service_hostlist(self, recorded_services: list) -> dict: service_name = record['service'] if not record['group_id'] else f"{record['service']}.{record['group_id']}" service_host = record['location'] service_hostlist[service_name].append(service_host) - logger.info(f"microcephs record service({service_name}) at ({service_host}) configured({record['info']})") + logger.debug(f"microcephs record service({service_name}) at ({service_host}) configured({record['info']})") return service_hostlist - def _elaborate_service(self, service: str): - """Elaborate a service into id and type""" - if '.' in service: - segments = service.split('.') - return segments[0], segments[1] - else: - return service, "" + @staticmethod + def _parse_service_name(service_name: str) -> Tuple[str, str]: + """Split a service name into (type, id). + + Handles dotted names like 'nfs.my.cluster' correctly by splitting + on the first dot only. + + :return: (service_type, service_id); service_id is '' if no dot. + """ + svc_type, _, svc_id = service_name.partition('.') + return svc_type, svc_id @handle_orch_error def describe_service(self, @@ -169,12 +185,14 @@ def describe_service(self, service_descs = [] for svc_name, hostlist in service_hostlist.items(): spec = None - svc_type, svc_id = self._elaborate_service(svc_name) - logger.info(f"{svc_name} under description for filter {service_type}") + svc_type, svc_id = self._parse_service_name(svc_name) + logger.debug(f"{svc_name} under description for filter {service_type}") - # skip unrelated services if a specific daemon type is requested. + # Apply filters if service_type and svc_type != service_type: continue + if service_name and svc_name != service_name: + continue if svc_type in daemon_spec_map: spec = daemon_spec_map[svc_type]( @@ -209,19 +227,31 @@ def list_daemons(self, for svc in services: svc_daemon_type = svc['service'] svc_hostname = svc['location'] - svc_group_ip = svc['group_id'] + svc_group_id = svc['group_id'] svc_ip = None svc_ports = None - svc_name = f"{svc_daemon_type}.{svc_group_ip}" if svc_group_ip else svc_daemon_type - if daemon_type: - if svc_daemon_type != daemon_type: - continue + svc_name = f"{svc_daemon_type}.{svc_group_id}" if svc_group_id else svc_daemon_type + + # Apply filters + if daemon_type and svc_daemon_type != daemon_type: + continue + if host and svc_hostname != host: + continue + if daemon_id and svc_hostname != daemon_id: + continue + if service_name and svc_name != service_name: + continue + + # Extract NFS-specific info (bind address and port) + if svc_daemon_type == 'nfs' and svc.get('info'): + try: + info = json.loads(svc['info']) + bind_addr = info.get('bind_address', '') + svc_ip = None if '0.0.0.0' in bind_addr else bind_addr or None + svc_ports = [info['bind_port']] if 'bind_port' in info else None + except (json.JSONDecodeError, KeyError) as e: + logger.warning(f"Failed to parse NFS service info for {svc_hostname}: {e}") - if svc_daemon_type == 'nfs': - info = json.loads(svc['info']) - svc_ip = None if "0.0.0.0" in info['bind_address'] else info['bind_address'] - svc_ports = [info['bind_port']] - descriptions.append(DaemonDescription( service_name=svc_name, daemon_type=svc_daemon_type, @@ -231,7 +261,7 @@ def list_daemons(self, ports=svc_ports )) - logger.info(descriptions) + logger.debug(f"list_daemons returning {len(descriptions)} daemons") return descriptions @handle_orch_error @@ -240,38 +270,289 @@ def get_inventory(self, refresh: bool = False ) -> List[InventoryHost]: - disks = self.microceph.services.list_disks() - disks_by_host = defaultdict(list) - for d in disks: - disks_by_host[d['location']].append( - Device(path=d['path']) + # Resolve which hosts to include based on the filter. + filter_hosts = None + if host_filter and host_filter.hosts: + filter_hosts = set(host_filter.hosts) + + osd_disks = self.microceph.services.list_disks() + + # Build inventory from OSD disk list (cluster-wide source of truth + # for which hosts exist and what disks they have). + devices_by_host: Dict[str, List[Device]] = defaultdict(list) + for d in osd_disks: + host = d.get('location', '') + if filter_hosts and host not in filter_hosts: + continue + devices_by_host[host].append( + Device(path=d.get('path', ''), available=False) ) + # Ensure all cluster members appear even if they have no OSD disks. + try: + for member in self.microceph.cluster.get_cluster_members(): + host = member.get('name', '') + if filter_hosts and host not in filter_hosts: + continue + if host not in devices_by_host: + devices_by_host[host] = [] + except RemoteException: + pass + inventory = [] - for host, diskettes in disks_by_host.items(): + for host, devs in devices_by_host.items(): inventory.append(InventoryHost( name=host, - devices=Devices(diskettes) + devices=Devices(devs) )) return inventory + def _get_placement_hosts(self, spec: ServiceSpec) -> List[str]: + """Extract target hosts from a ServiceSpec's placement. + + Returns the list of hostnames from the placement spec. + Raises ValueError if no hosts are specified; callers must + provide explicit placement. + """ + hosts = [] + if spec.placement and spec.placement.hosts: + hosts = [str(h) for h in spec.placement.hosts] + + if not hosts: + raise ValueError( + f"No placement hosts specified for {spec.service_type}. " + "Explicit host placement is required." + ) + + return hosts + + def _get_existing_service_hosts(self, service_type: str) -> set: + """Return the set of hostnames that already run the given service type.""" + try: + services = self.microceph.services.list_services() + except RemoteException: + logger.warning(f"Failed to list services while checking existing {service_type} hosts") + return set() + + return { + svc['location'] for svc in services + if svc['service'] == service_type + } + + def _apply_service(self, service_name: str, spec: ServiceSpec, payload: str) -> str: + """Common logic for applying (enabling) a service on the local node. + + Validates placement hosts and checks whether the service is already + running. Calls enable_service once; the request goes to the local + node via the unix socket. + + Returns a summary string on success. + Raises an exception if placement is invalid or the API call fails. + + NOTE: Per-host targeting is not yet supported. The Python client + connects via unix socket to the local node only. MicroCeph's Go + client uses UseTarget() to proxy to specific hosts, but this is + not exposed through the socket. See todo #6. + Placement hosts are validated and logged, but the enable call + always runs on the local node regardless. + """ + hosts = self._get_placement_hosts(spec) + existing = self._get_existing_service_hosts(service_name) + + # Check if the service is already active on the local node. + # Since we can only target the local node, we check if any + # of the requested hosts that are local already have the service. + all_existing = all(h in existing for h in hosts) + if all_existing: + skipped_str = ', '.join(hosts) + logger.info(f"Skipping {service_name}: already active on {skipped_str}") + return f"{service_name}: already active on {skipped_str}" + + if existing: + logger.info( + f"{service_name} already active on {', '.join(existing & set(hosts))}; " + f"enabling for remaining hosts" + ) + + logger.info(f"Enabling {service_name} (requested hosts: {', '.join(hosts)})") + self.microceph.services.enable_service( + name=service_name, + payload=payload, + wait=True, + ) + + new_hosts = [h for h in hosts if h not in existing] + parts = [f"enabled on {', '.join(new_hosts)}"] + skipped_hosts = [h for h in hosts if h in existing] + if skipped_hosts: + parts.append(f"already active on {', '.join(skipped_hosts)}") + + return f"{service_name}: {'; '.join(parts)}" + + @handle_orch_error def apply_rbd_mirror(self, spec: ServiceSpec) -> OrchResult[str]: - logger.info(f"Received Apply Request for RBD Mirror: Spec: {vars(spec).items()}") - raise NotImplementedError() + """Enable the rbd-mirror service on the target hosts. - def apply_rgw(self, spec: RGWSpec) -> OrchResult[str]: + rbd-mirror is a client-like service with no additional parameters. + The Go API's ClientServicePlacement handler takes no payload. """ + logger.debug(f"Applying rbd-mirror service, spec: {vars(spec)}") + return self._apply_service("rbd-mirror", spec, "{}") - :param spec: - :return: + @handle_orch_error + def apply_rgw(self, spec: RGWSpec) -> OrchResult[str]: + """Enable the RGW service on the target hosts. + + Extracts port and SSL configuration from the RGWSpec and passes + them as the JSON payload to the MicroCeph enable_service API. + + Note on SSL: The Ceph RGWSpec provides rgw_frontend_ssl_certificate + (a list of PEM cert strings) but has no private key field. MicroCeph's + Go API (RgwServicePlacement) requires both SSLCertificate and + SSLPrivateKey to enable SSL. Until the spec is extended or a separate + key source is added, SSL cannot be fully configured via the + orchestrator interface. """ - raise NotImplementedError() + logger.debug(f"Applying RGW service, spec: {vars(spec)}") + + # Build RGW-specific payload from the spec. + # Go's RgwServicePlacement expects: Port, SSLPort, SSLCertificate, SSLPrivateKey + # Field names match Go exported field names (no json tags defined, + # so encoding/json matches case-insensitively). + rgw_params: Dict[str, Any] = {} + if spec.rgw_frontend_port: + rgw_params['Port'] = spec.rgw_frontend_port + + # TODO: SSL support for RGW via orchestrator interface. + # + # SSL cannot be configured through this path. MicroCeph's Go API requires + # both SSLCertificate and SSLPrivateKey as raw PEM material; if either is + # missing, SSL is skipped and RGW falls back to plain HTTP (see rgw.go). + # The Ceph RGWSpec only carries the cert chain (rgw_frontend_ssl_certificate) + # with no private key field, so we cannot supply both. + if spec.rgw_frontend_ssl_certificate: + logger.warning( + "RGWSpec provides SSL certificate but the Ceph orchestrator spec " + "has no private key field. MicroCeph requires both certificate and " + "private key for SSL. SSL will be skipped; RGW will be deployed " + "in non-SSL mode." + ) + + payload = json.dumps(rgw_params) if rgw_params else "{}" + return self._apply_service("rgw", spec, payload) + @handle_orch_error def apply_nfs(self, spec: NFSServiceSpec) -> OrchResult[str]: + """Enable the NFS service on the target hosts. + + Extracts the NFS cluster ID and optional port from the + NFSServiceSpec and passes them as the JSON payload. """ + logger.debug(f"Applying NFS service, spec: {vars(spec)}") - :param spec: - :return: + # Go's NFSServicePlacement expects: cluster_id, v4_min_version, bind_address, bind_port + # The Ceph NFSServiceSpec uses service_id as the NFS cluster identifier. + if not spec.service_id: + raise ValueError("NFS service_id (cluster_id) is required") + + nfs_params: Dict[str, Any] = { + 'cluster_id': spec.service_id, + } + if spec.port: + nfs_params['bind_port'] = spec.port + if spec.virtual_ip: + nfs_params['bind_address'] = spec.virtual_ip + + payload = json.dumps(nfs_params) + return self._apply_service("nfs", spec, payload) + + @handle_orch_error + def apply_mon(self, spec: ServiceSpec) -> OrchResult[str]: + """Enable the MON service on the target hosts.""" + logger.debug(f"Applying MON service, spec: {vars(spec)}") + return self._apply_service("mon", spec, "{}") + + @handle_orch_error + def apply_mgr(self, spec: ServiceSpec) -> OrchResult[str]: + """Enable the MGR service on the target hosts.""" + logger.debug(f"Applying MGR service, spec: {vars(spec)}") + return self._apply_service("mgr", spec, "{}") + + @handle_orch_error + def apply_mds(self, spec: ServiceSpec) -> OrchResult[str]: + """Enable the MDS service on the target hosts.""" + logger.debug(f"Applying MDS service, spec: {vars(spec)}") + return self._apply_service("mds", spec, "{}") + + @handle_orch_error + def apply_cephfs_mirror(self, spec: ServiceSpec) -> OrchResult[str]: + """Enable the cephfs-mirror service on the target hosts. + + cephfs-mirror is a client-like service (same as rbd-mirror) + with no additional parameters. """ - raise NotImplementedError() + logger.debug(f"Applying cephfs-mirror service, spec: {vars(spec)}") + return self._apply_service("cephfs-mirror", spec, "{}") + + @handle_orch_error + def remove_service(self, service_name: str, force: bool = False) -> OrchResult[str]: + """Remove a service from the local node. + + Sends a DELETE request to the local MicroCeph API. This removes + the service from the node connected via the unix socket only; + it does not remove the service cluster-wide. Per-host targeting + requires UseTarget support (see todo #6). + + :param service_name: service type or type.id (e.g. "rgw", "nfs.mycluster") + :param force: unused, kept for interface compatibility + """ + logger.info(f"Removing service: {service_name}, force={force}") + + svc_type, svc_id = self._parse_service_name(service_name) + + # NFS requires the cluster_id in the delete body + if svc_type == 'nfs': + if not svc_id: + raise ValueError("NFS removal requires service name in 'nfs.' format") + self.microceph.services.delete_nfs_service(svc_id) + else: + self.microceph.services.delete_service(svc_type) + + return f"Removed service {service_name}" + + @handle_orch_error + def remove_host(self, host: str, force: bool = False, + offline: bool = False, rm_crush_entry: bool = False) -> OrchResult[str]: + """Remove a host from the MicroCeph cluster. + + :param host: hostname to remove + :param force: unused, kept for interface compatibility + :param offline: unused, kept for interface compatibility + :param rm_crush_entry: unused, kept for interface compatibility + """ + logger.info(f"Removing host: {host}, force={force}") + self.microceph.cluster.remove(host) + return f"Removed host {host}" + + @handle_orch_error + def service_action(self, action: str, service_name: str) -> OrchResult[List[str]]: + """Perform an action (restart) on a service. + + Currently only 'restart' is supported via the MicroCeph API. + + :param action: one of "start", "stop", "restart", "redeploy", "reconfig" + :param service_name: service type (e.g. "mon", "rgw") + """ + logger.info(f"Service action: {action} on {service_name}") + + if action != "restart": + raise NotImplementedError( + f"Service action '{action}' is not supported by MicroCeph. " + "Only 'restart' is currently available." + ) + + svc_type, _ = self._parse_service_name(service_name) + + self.microceph.services.restart_services([svc_type]) + return [f"Restarted {service_name}"] diff --git a/microceph-orch/tests/conftest.py b/microceph-orch/tests/conftest.py new file mode 100644 index 000000000..f2cf1581f --- /dev/null +++ b/microceph-orch/tests/conftest.py @@ -0,0 +1,148 @@ +""" +Test fixtures and mock setup for microceph-orch tests. + +Installs mock modules into sys.modules so that Ceph-internal and +snap-only imports work outside the snap environment. +""" + +import sys +from unittest.mock import MagicMock, patch + +import pytest + +from stubs import ( + OrchResult, + handle_orch_error, + CLICommandMeta, + HostSpec, + PlacementSpec, + ServiceSpec, + RGWSpec, + MONSpec, + MDSSpec, + NFSServiceSpec, + Device, + Devices, + InventoryFilter, + InventoryHost, + ServiceDescription, + DaemonDescription, + Orchestrator, + MgrModule, + NotifyType, +) + + +# --------------------------------------------------------------------------- +# Install mocks into sys.modules BEFORE any microceph imports +# --------------------------------------------------------------------------- + +def _install_mocks(): + """Inject mock modules so `from ceph.deployment...` etc. work. + + Also mocks snap-specific dependencies (requests_unixsocket, snaphelpers) + that are only available inside the snap environment. + """ + + # ceph.deployment.inventory + inv_mod = MagicMock() + inv_mod.Device = Device + inv_mod.Devices = Devices + + # ceph.deployment.service_spec + spec_mod = MagicMock() + spec_mod.ServiceSpec = ServiceSpec + spec_mod.PlacementSpec = PlacementSpec + spec_mod.RGWSpec = RGWSpec + spec_mod.MONSpec = MONSpec + spec_mod.MDSSpec = MDSSpec + spec_mod.NFSServiceSpec = NFSServiceSpec + + # ceph.deployment + deployment_mod = MagicMock() + deployment_mod.inventory = inv_mod + deployment_mod.service_spec = spec_mod + + # ceph + ceph_mod = MagicMock() + ceph_mod.deployment = deployment_mod + + # mgr_module + mgr_mod = MagicMock() + mgr_mod.MgrModule = MgrModule + mgr_mod.NotifyType = NotifyType + + # orchestrator + orch_mod = MagicMock() + orch_mod.Orchestrator = Orchestrator + orch_mod.HostSpec = HostSpec + orch_mod.InventoryFilter = InventoryFilter + orch_mod.InventoryHost = InventoryHost + orch_mod.ServiceDescription = ServiceDescription + orch_mod.DaemonDescription = DaemonDescription + orch_mod.CLICommandMeta = CLICommandMeta + orch_mod.handle_orch_error = handle_orch_error + orch_mod.OrchResult = OrchResult + + # snap-only deps + requests_unixsocket_mod = MagicMock() + requests_unixsocket_mod.DEFAULT_SCHEME = "http+unix://" + snaphelpers_mod = MagicMock() + + sys.modules.update({ + "ceph": ceph_mod, + "ceph.deployment": deployment_mod, + "ceph.deployment.inventory": inv_mod, + "ceph.deployment.service_spec": spec_mod, + "mgr_module": mgr_mod, + "orchestrator": orch_mod, + "requests_unixsocket": requests_unixsocket_mod, + "snaphelpers": snaphelpers_mod, + }) + + +# Install mocks before pytest collects test modules +_install_mocks() + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture +def mock_client(): + """Return a mock Client with pre-wired service stubs.""" + client = MagicMock() + + # Default: cluster with 3 members + client.cluster.get_cluster_members.return_value = [ + {"name": "node1", "address": "10.0.0.1:7443", "status": "ONLINE"}, + {"name": "node2", "address": "10.0.0.2:7443", "status": "ONLINE"}, + {"name": "node3", "address": "10.0.0.3:7443", "status": "ONLINE"}, + ] + + # Default: no services running + client.services.list_services.return_value = [] + + # Default: no disks + client.services.list_disks.return_value = [] + + # Default: no resources + client.services.list_resources.return_value = [] + + # Default: status available + client.status.is_available.return_value = None + + return client + + +@pytest.fixture +def orchestrator(mock_client): + """Return a MicroCephOrchestrator with a mocked client.""" + from microceph.module import MicroCephOrchestrator + + with patch.object(MicroCephOrchestrator, "__init__", lambda self, *a, **kw: None): + orch = MicroCephOrchestrator.__new__(MicroCephOrchestrator) + orch.microceph = mock_client + orch.run = True + return orch diff --git a/microceph-orch/tests/stubs.py b/microceph-orch/tests/stubs.py new file mode 100644 index 000000000..e9b6be2a9 --- /dev/null +++ b/microceph-orch/tests/stubs.py @@ -0,0 +1,135 @@ +""" +Minimal stubs for Ceph types used by the orchestrator module. + +The real types live in ceph.deployment.*, mgr_module, and orchestrator; +which are only importable inside the Ceph snap environment. These stubs +replicate just enough behaviour to test our code outside the snap. +""" + +from typing import Optional, List, Dict, Any, Generic, TypeVar +from functools import wraps + +T = TypeVar("T") + + +class OrchResult(Generic[T]): + """Minimal OrchResult stub.""" + + def __init__(self, result: Optional[T] = None, exception: Optional[Exception] = None): + self.result = result + self.exception = exception + self.exception_str = str(exception) if exception else "" + + +def handle_orch_error(f): + """Stub decorator that wraps return value in OrchResult and catches exceptions.""" + @wraps(f) + def wrapper(*args, **kwargs): + try: + return OrchResult(f(*args, **kwargs)) + except Exception as e: + return OrchResult(None, exception=e) + return wrapper + + +class CLICommandMeta(type): + """No-op metaclass stub.""" + pass + + +class HostSpec: + def __init__(self, hostname, addr=None, labels=None, status=None, **kwargs): + self.hostname = hostname + self.addr = addr or hostname + self.labels = labels or [] + self.status = status or "" + + +class PlacementSpec: + def __init__(self, hosts=None, count=None, **kwargs): + self.hosts = hosts or [] + self.count = count + + +class ServiceSpec: + def __init__(self, service_type="", service_id=None, placement=None, **kwargs): + self.service_type = service_type + self.service_id = service_id + self.placement = placement + + +class RGWSpec(ServiceSpec): + def __init__(self, rgw_frontend_port=None, rgw_frontend_ssl_certificate=None, + ssl=False, **kwargs): + super().__init__(**kwargs) + self.rgw_frontend_port = rgw_frontend_port + self.rgw_frontend_ssl_certificate = rgw_frontend_ssl_certificate + self.ssl = ssl + + +class MONSpec(ServiceSpec): + pass + + +class MDSSpec(ServiceSpec): + pass + + +class NFSServiceSpec(ServiceSpec): + def __init__(self, port=None, virtual_ip=None, **kwargs): + super().__init__(**kwargs) + self.port = port + self.virtual_ip = virtual_ip + + +class Device: + def __init__(self, path="", available=None, **kwargs): + self.path = path + self.available = available + + +class Devices: + def __init__(self, devices=None): + self.devices = devices or [] + + +class InventoryFilter: + def __init__(self, labels=None, hosts=None): + self.labels = labels + self.hosts = hosts + + +class InventoryHost: + def __init__(self, name="", devices=None): + self.name = name + self.devices = devices + + +class ServiceDescription: + def __init__(self, spec=None, running=0, **kwargs): + self.spec = spec + self.running = running + + +class DaemonDescription: + def __init__(self, service_name="", daemon_type="", daemon_id="", + hostname="", ip=None, ports=None, **kwargs): + self.service_name = service_name + self.daemon_type = daemon_type + self.daemon_id = daemon_id + self.hostname = hostname + self.ip = ip + self.ports = ports + + +class Orchestrator: + pass + + +class MgrModule: + def __init__(self, *args, **kwargs): + pass + + +class NotifyType: + pass diff --git a/microceph-orch/tests/test_client.py b/microceph-orch/tests/test_client.py new file mode 100644 index 000000000..145a325d8 --- /dev/null +++ b/microceph-orch/tests/test_client.py @@ -0,0 +1,188 @@ +""" +Tests for the MicroCeph client layer (cluster.py). +""" + +import json +import pytest +from unittest.mock import MagicMock, patch + +from microceph.client.cluster import ( + MicroClusterService, + StatusService, + ExtendedAPIService, +) +from microceph.client.service import RemoteException + + +@pytest.fixture +def mock_session(): + return MagicMock() + + +@pytest.fixture +def endpoint(): + return "http+unix://%2Ftmp%2Ftest.socket" + + +# =================================================================== +# MicroClusterService +# =================================================================== + +class TestMicroClusterService: + def test_get_cluster_members(self, mock_session, endpoint): + svc = MicroClusterService(mock_session, endpoint) + mock_session.request.return_value = MagicMock( + status_code=200, + text='{}', + json=lambda: { + "metadata": [ + {"name": "n1", "address": "10.0.0.1:7443", "status": "ONLINE", "extra": "ignored"}, + {"name": "n2", "address": "10.0.0.2:7443", "status": "ONLINE"}, + ] + }, + ) + members = svc.get_cluster_members() + assert len(members) == 2 + assert members[0] == {"name": "n1", "address": "10.0.0.1:7443", "status": "ONLINE"} + # "extra" key should be filtered out + assert "extra" not in members[0] + + def test_get_cluster_members_null_metadata(self, mock_session, endpoint): + svc = MicroClusterService(mock_session, endpoint) + mock_session.request.return_value = MagicMock( + status_code=200, + text='{}', + json=lambda: {"metadata": None}, + ) + members = svc.get_cluster_members() + assert members == [] + + def test_get_cluster_members_missing_metadata(self, mock_session, endpoint): + svc = MicroClusterService(mock_session, endpoint) + mock_session.request.return_value = MagicMock( + status_code=200, + text='{}', + json=lambda: {}, + ) + members = svc.get_cluster_members() + assert members == [] + + +# =================================================================== +# ExtendedAPIService +# =================================================================== + +class TestExtendedAPIService: + def test_list_services_returns_list(self, mock_session, endpoint): + svc = ExtendedAPIService(mock_session, endpoint) + mock_session.request.return_value = MagicMock( + status_code=200, + text='{}', + json=lambda: {"metadata": [{"service": "mon", "location": "n1"}]}, + ) + result = svc.list_services() + assert isinstance(result, list) + assert len(result) == 1 + + def test_list_services_null_metadata(self, mock_session, endpoint): + svc = ExtendedAPIService(mock_session, endpoint) + mock_session.request.return_value = MagicMock( + status_code=200, + text='{}', + json=lambda: {"metadata": None}, + ) + result = svc.list_services() + assert result == [] + + def test_list_disks_null_metadata(self, mock_session, endpoint): + svc = ExtendedAPIService(mock_session, endpoint) + mock_session.request.return_value = MagicMock( + status_code=200, + text='{}', + json=lambda: {"metadata": None}, + ) + result = svc.list_disks() + assert result == [] + + def test_list_resources_null_metadata(self, mock_session, endpoint): + svc = ExtendedAPIService(mock_session, endpoint) + mock_session.request.return_value = MagicMock( + status_code=200, + text='{}', + json=lambda: {"metadata": None}, + ) + result = svc.list_resources() + assert result == [] + + def test_enable_service_payload(self, mock_session, endpoint): + svc = ExtendedAPIService(mock_session, endpoint) + mock_session.request.return_value = MagicMock( + status_code=200, + text='{}', + json=lambda: {"metadata": None}, + ) + svc.enable_service(name="rgw", payload='{"Port": 8080}', wait=True) + + call_args = mock_session.request.call_args + body = call_args.kwargs.get("json") or call_args[1].get("json") + assert body["name"] == "rgw" + assert body["bool"] is True + assert body["payload"] == '{"Port": 8080}' + + def test_enable_service_wait_false(self, mock_session, endpoint): + svc = ExtendedAPIService(mock_session, endpoint) + mock_session.request.return_value = MagicMock( + status_code=200, + text='{}', + json=lambda: {"metadata": None}, + ) + svc.enable_service(name="mon", wait=False) + + call_args = mock_session.request.call_args + body = call_args.kwargs.get("json") or call_args[1].get("json") + assert body["bool"] is False + + def test_delete_service(self, mock_session, endpoint): + svc = ExtendedAPIService(mock_session, endpoint) + mock_session.request.return_value = MagicMock( + status_code=200, + text='{}', + json=lambda: {}, + ) + svc.delete_service("rgw") + call_args = mock_session.request.call_args + assert call_args.kwargs.get("method") == "delete" or call_args[0][0] == "delete" + + def test_delete_nfs_service(self, mock_session, endpoint): + svc = ExtendedAPIService(mock_session, endpoint) + mock_session.request.return_value = MagicMock( + status_code=200, + text='{}', + json=lambda: {}, + ) + svc.delete_nfs_service("mycluster") + call_args = mock_session.request.call_args + body = call_args.kwargs.get("json") or call_args[1].get("json") + assert body == {"cluster_id": "mycluster"} + + def test_restart_services(self, mock_session, endpoint): + svc = ExtendedAPIService(mock_session, endpoint) + mock_session.request.return_value = MagicMock( + status_code=200, + text='{}', + json=lambda: {}, + ) + svc.restart_services(["mon", "rgw"]) + call_args = mock_session.request.call_args + body = call_args.kwargs.get("json") or call_args[1].get("json") + assert body == [{"service": "mon"}, {"service": "rgw"}] + + def test_get_status_null_metadata(self, mock_session, endpoint): + svc = ExtendedAPIService(mock_session, endpoint) + mock_session.request.return_value = MagicMock( + status_code=200, + text='{}', + json=lambda: {"metadata": None}, + ) + result = svc.get_status() + assert result == {} diff --git a/microceph-orch/tests/test_module.py b/microceph-orch/tests/test_module.py new file mode 100644 index 000000000..31f646753 --- /dev/null +++ b/microceph-orch/tests/test_module.py @@ -0,0 +1,671 @@ +""" +Tests for MicroCephOrchestrator methods. +""" + +import json +import pytest +from unittest.mock import MagicMock + +from stubs import ( + HostSpec, + PlacementSpec, + ServiceSpec, + RGWSpec, + NFSServiceSpec, + InventoryFilter, +) + +from microceph.client.service import RemoteException + + +# =================================================================== +# available() +# =================================================================== + +class TestAvailable: + def test_available_success(self, orchestrator, mock_client): + ok, msg, info = orchestrator.available() + assert ok is True + assert msg == "" + mock_client.status.is_available.assert_called_once() + + def test_available_remote_error(self, orchestrator, mock_client): + mock_client.status.is_available.side_effect = RemoteException("socket gone") + ok, msg, _ = orchestrator.available() + assert ok is False + assert "Cannot reach" in msg + + def test_available_unexpected_error(self, orchestrator, mock_client): + mock_client.status.is_available.side_effect = OSError("permission denied") + ok, msg, _ = orchestrator.available() + assert ok is False + assert "Unexpected error" in msg + + +# =================================================================== +# get_hosts() +# =================================================================== + +class TestGetHosts: + def test_get_hosts_basic(self, orchestrator, mock_client): + result = orchestrator.get_hosts() + hosts = result.result + assert len(hosts) == 3 + assert hosts[0].hostname == "node1" + assert hosts[0].addr == "10.0.0.1" + assert hosts[0].status == "ONLINE" + + def test_get_hosts_strips_port(self, orchestrator, mock_client): + mock_client.cluster.get_cluster_members.return_value = [ + {"name": "h1", "address": "192.168.1.100:9443", "status": "ONLINE"}, + ] + result = orchestrator.get_hosts() + assert result.result[0].addr == "192.168.1.100" + + def test_get_hosts_no_port_in_address(self, orchestrator, mock_client): + mock_client.cluster.get_cluster_members.return_value = [ + {"name": "h1", "address": "192.168.1.100", "status": "ONLINE"}, + ] + result = orchestrator.get_hosts() + # Should fall back to raw address when no ":" present + assert result.result[0].addr == "192.168.1.100" + + def test_get_hosts_missing_address(self, orchestrator, mock_client): + mock_client.cluster.get_cluster_members.return_value = [ + {"name": "h1", "status": "ONLINE"}, + ] + result = orchestrator.get_hosts() + # Missing address should not crash + assert result.result[0].hostname == "h1" + + def test_get_hosts_empty_cluster(self, orchestrator, mock_client): + mock_client.cluster.get_cluster_members.return_value = [] + result = orchestrator.get_hosts() + assert result.result == [] + + def test_get_hosts_api_error(self, orchestrator, mock_client): + mock_client.cluster.get_cluster_members.side_effect = RemoteException("fail") + result = orchestrator.get_hosts() + assert result.exception is not None + + +# =================================================================== +# describe_service() +# =================================================================== + +class TestDescribeService: + def test_describe_service_all(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + {"service": "mon", "location": "node1", "group_id": "", "info": ""}, + {"service": "mon", "location": "node2", "group_id": "", "info": ""}, + {"service": "rgw", "location": "node1", "group_id": "", "info": ""}, + ] + result = orchestrator.describe_service() + descs = result.result + assert len(descs) == 2 # mon and rgw grouped + types = {d.spec.service_type for d in descs} + assert types == {"mon", "rgw"} + + def test_describe_service_filter_type(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + {"service": "mon", "location": "node1", "group_id": "", "info": ""}, + {"service": "rgw", "location": "node1", "group_id": "", "info": ""}, + ] + result = orchestrator.describe_service(service_type="rgw") + descs = result.result + assert len(descs) == 1 + assert descs[0].spec.service_type == "rgw" + + def test_describe_service_with_group_id(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + {"service": "nfs", "location": "node1", "group_id": "mycluster", "info": "{}"}, + ] + result = orchestrator.describe_service() + descs = result.result + assert len(descs) == 1 + assert descs[0].spec.service_id == "mycluster" + + def test_describe_service_empty(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [] + result = orchestrator.describe_service() + assert result.result == [] + + def test_describe_service_running_count(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + {"service": "mon", "location": "node1", "group_id": "", "info": ""}, + {"service": "mon", "location": "node2", "group_id": "", "info": ""}, + {"service": "mon", "location": "node3", "group_id": "", "info": ""}, + ] + result = orchestrator.describe_service() + assert result.result[0].running == 3 + + +# =================================================================== +# list_daemons() +# =================================================================== + +class TestListDaemons: + def test_list_daemons_basic(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + {"service": "mon", "location": "node1", "group_id": "", "info": ""}, + {"service": "rgw", "location": "node2", "group_id": "", "info": ""}, + ] + result = orchestrator.list_daemons() + daemons = result.result + assert len(daemons) == 2 + + def test_list_daemons_filter_daemon_type(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + {"service": "mon", "location": "node1", "group_id": "", "info": ""}, + {"service": "rgw", "location": "node2", "group_id": "", "info": ""}, + ] + result = orchestrator.list_daemons(daemon_type="mon") + assert len(result.result) == 1 + assert result.result[0].daemon_type == "mon" + + def test_list_daemons_filter_host(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + {"service": "mon", "location": "node1", "group_id": "", "info": ""}, + {"service": "mon", "location": "node2", "group_id": "", "info": ""}, + ] + result = orchestrator.list_daemons(host="node1") + assert len(result.result) == 1 + assert result.result[0].hostname == "node1" + + def test_list_daemons_filter_service_name(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + {"service": "nfs", "location": "node1", "group_id": "cluster1", "info": "{}"}, + {"service": "nfs", "location": "node1", "group_id": "cluster2", "info": "{}"}, + ] + result = orchestrator.list_daemons(service_name="nfs.cluster1") + assert len(result.result) == 1 + + def test_list_daemons_filter_daemon_id(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + {"service": "mon", "location": "node1", "group_id": "", "info": ""}, + {"service": "mon", "location": "node2", "group_id": "", "info": ""}, + ] + result = orchestrator.list_daemons(daemon_id="node2") + assert len(result.result) == 1 + assert result.result[0].hostname == "node2" + + def test_list_daemons_nfs_with_info(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + { + "service": "nfs", + "location": "node1", + "group_id": "mycluster", + "info": json.dumps({"bind_address": "10.0.0.5", "bind_port": 2049}), + }, + ] + result = orchestrator.list_daemons() + daemon = result.result[0] + assert daemon.ip == "10.0.0.5" + assert daemon.ports == [2049] + + def test_list_daemons_nfs_wildcard_address(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + { + "service": "nfs", + "location": "node1", + "group_id": "mycluster", + "info": json.dumps({"bind_address": "0.0.0.0", "bind_port": 2049}), + }, + ] + result = orchestrator.list_daemons() + assert result.result[0].ip is None # 0.0.0.0 should be None + + def test_list_daemons_nfs_malformed_info(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + {"service": "nfs", "location": "node1", "group_id": "c1", "info": "not-json"}, + ] + # Should not crash, just skip the NFS info parsing + result = orchestrator.list_daemons() + assert len(result.result) == 1 + assert result.result[0].ip is None + assert result.result[0].ports is None + + def test_list_daemons_nfs_empty_info(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + {"service": "nfs", "location": "node1", "group_id": "c1", "info": ""}, + ] + result = orchestrator.list_daemons() + assert len(result.result) == 1 + + def test_list_daemons_nfs_null_info(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + {"service": "nfs", "location": "node1", "group_id": "c1", "info": None}, + ] + result = orchestrator.list_daemons() + assert len(result.result) == 1 + + +# =================================================================== +# get_inventory() +# =================================================================== + +class TestGetInventory: + def test_get_inventory_shows_osd_disks(self, orchestrator, mock_client): + mock_client.services.list_disks.return_value = [ + {"location": "node1", "path": "/dev/sdb"}, + {"location": "node1", "path": "/dev/sdc"}, + ] + mock_client.services.list_resources.return_value = {} + result = orchestrator.get_inventory() + inv = result.result + # 3 hosts (from cluster members), node1 has 2 OSD disks + hosts = {h.name for h in inv} + assert hosts == {"node1", "node2", "node3"} + node1 = [h for h in inv if h.name == "node1"][0] + assert len(node1.devices.devices) == 2 + assert all(d.available is False for d in node1.devices.devices) + + def test_get_inventory_multi_host(self, orchestrator, mock_client): + mock_client.services.list_disks.return_value = [ + {"location": "node1", "path": "/dev/sda"}, + {"location": "node2", "path": "/dev/sda"}, + ] + mock_client.services.list_resources.return_value = {} + result = orchestrator.get_inventory() + hosts = {h.name for h in result.result} + assert hosts == {"node1", "node2", "node3"} + + def test_get_inventory_with_host_filter(self, orchestrator, mock_client): + mock_client.services.list_disks.return_value = [ + {"location": "node1", "path": "/dev/sda"}, + {"location": "node2", "path": "/dev/sda"}, + {"location": "node3", "path": "/dev/sda"}, + ] + mock_client.services.list_resources.return_value = {} + filt = InventoryFilter(hosts=["node1", "node3"]) + result = orchestrator.get_inventory(host_filter=filt) + hosts = {h.name for h in result.result} + assert hosts == {"node1", "node3"} + + def test_get_inventory_empty(self, orchestrator, mock_client): + mock_client.services.list_disks.return_value = [] + mock_client.services.list_resources.return_value = {} + mock_client.cluster.get_cluster_members.return_value = [] + result = orchestrator.get_inventory() + assert result.result == [] + + def test_get_inventory_includes_members_without_disks(self, orchestrator, mock_client): + mock_client.services.list_disks.return_value = [] + mock_client.services.list_resources.return_value = {} + # Default mock has 3 cluster members + result = orchestrator.get_inventory() + hosts = {h.name for h in result.result} + assert hosts == {"node1", "node2", "node3"} + # No devices on any host + for h in result.result: + assert len(h.devices.devices) == 0 + + +# =================================================================== +# apply_rbd_mirror() +# =================================================================== + +class TestApplyRbdMirror: + def test_apply_rbd_mirror_success(self, orchestrator, mock_client): + spec = ServiceSpec( + service_type="rbd-mirror", + placement=PlacementSpec(hosts=["node1", "node2"]), + ) + result = orchestrator.apply_rbd_mirror(spec) + assert result.exception is None + assert "enabled on node1, node2" in result.result + # Single API call regardless of host count (local socket only) + mock_client.services.enable_service.assert_called_once() + + def test_apply_rbd_mirror_no_placement(self, orchestrator, mock_client): + spec = ServiceSpec(service_type="rbd-mirror") + result = orchestrator.apply_rbd_mirror(spec) + assert result.exception is not None + assert "No placement hosts" in str(result.exception) + + def test_apply_rbd_mirror_skips_existing(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + {"service": "rbd-mirror", "location": "node1", "group_id": "", "info": ""}, + ] + spec = ServiceSpec( + service_type="rbd-mirror", + placement=PlacementSpec(hosts=["node1", "node2"]), + ) + result = orchestrator.apply_rbd_mirror(spec) + assert result.exception is None + assert "already active on node1" in result.result + assert "enabled on node2" in result.result + mock_client.services.enable_service.assert_called_once() + + def test_apply_rbd_mirror_all_existing(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + {"service": "rbd-mirror", "location": "node1", "group_id": "", "info": ""}, + ] + spec = ServiceSpec( + service_type="rbd-mirror", + placement=PlacementSpec(hosts=["node1"]), + ) + result = orchestrator.apply_rbd_mirror(spec) + assert result.exception is None + assert "already active on node1" in result.result + mock_client.services.enable_service.assert_not_called() + + def test_apply_rbd_mirror_api_error(self, orchestrator, mock_client): + mock_client.services.enable_service.side_effect = RemoteException("fail") + spec = ServiceSpec( + service_type="rbd-mirror", + placement=PlacementSpec(hosts=["node1"]), + ) + result = orchestrator.apply_rbd_mirror(spec) + assert result.exception is not None + + +# =================================================================== +# apply_rgw() +# =================================================================== + +class TestApplyRgw: + def test_apply_rgw_basic(self, orchestrator, mock_client): + spec = RGWSpec( + service_type="rgw", + placement=PlacementSpec(hosts=["node1"]), + ) + result = orchestrator.apply_rgw(spec) + assert result.exception is None + assert "enabled on node1" in result.result + + call_kwargs = mock_client.services.enable_service.call_args + assert call_kwargs.kwargs["name"] == "rgw" + + def test_apply_rgw_with_port(self, orchestrator, mock_client): + spec = RGWSpec( + service_type="rgw", + rgw_frontend_port=8080, + placement=PlacementSpec(hosts=["node1"]), + ) + result = orchestrator.apply_rgw(spec) + assert result.exception is None + + call_kwargs = mock_client.services.enable_service.call_args + payload = json.loads(call_kwargs.kwargs["payload"]) + assert payload["Port"] == 8080 + + def test_apply_rgw_ssl_cert_without_key_warns(self, orchestrator, mock_client, caplog): + """SSL cert is present but no key; should warn and not send cert.""" + spec = RGWSpec( + service_type="rgw", + rgw_frontend_ssl_certificate=["-----BEGIN CERT-----"], + placement=PlacementSpec(hosts=["node1"]), + ) + import logging + with caplog.at_level(logging.WARNING): + result = orchestrator.apply_rgw(spec) + + assert result.exception is None + # Cert should NOT be in payload (useless without key) + call_kwargs = mock_client.services.enable_service.call_args + payload = json.loads(call_kwargs.kwargs["payload"]) + assert "SSLCertificate" not in payload + # Warning should be logged + assert any("private key" in r.message for r in caplog.records) + + def test_apply_rgw_no_placement(self, orchestrator, mock_client): + spec = RGWSpec(service_type="rgw") + result = orchestrator.apply_rgw(spec) + assert result.exception is not None + + +# =================================================================== +# apply_nfs() +# =================================================================== + +class TestApplyNfs: + def test_apply_nfs_basic(self, orchestrator, mock_client): + spec = NFSServiceSpec( + service_type="nfs", + service_id="mycluster", + placement=PlacementSpec(hosts=["node1"]), + ) + result = orchestrator.apply_nfs(spec) + assert result.exception is None + assert "enabled on node1" in result.result + + call_kwargs = mock_client.services.enable_service.call_args + payload = json.loads(call_kwargs.kwargs["payload"]) + assert payload["cluster_id"] == "mycluster" + + def test_apply_nfs_with_port(self, orchestrator, mock_client): + spec = NFSServiceSpec( + service_type="nfs", + service_id="mycluster", + port=12049, + placement=PlacementSpec(hosts=["node1"]), + ) + result = orchestrator.apply_nfs(spec) + call_kwargs = mock_client.services.enable_service.call_args + payload = json.loads(call_kwargs.kwargs["payload"]) + assert payload["bind_port"] == 12049 + + def test_apply_nfs_with_virtual_ip(self, orchestrator, mock_client): + spec = NFSServiceSpec( + service_type="nfs", + service_id="mycluster", + virtual_ip="10.0.0.100", + placement=PlacementSpec(hosts=["node1"]), + ) + result = orchestrator.apply_nfs(spec) + call_kwargs = mock_client.services.enable_service.call_args + payload = json.loads(call_kwargs.kwargs["payload"]) + assert payload["bind_address"] == "10.0.0.100" + + def test_apply_nfs_missing_service_id(self, orchestrator, mock_client): + spec = NFSServiceSpec( + service_type="nfs", + placement=PlacementSpec(hosts=["node1"]), + ) + result = orchestrator.apply_nfs(spec) + assert result.exception is not None + assert "cluster_id" in str(result.exception) + + def test_apply_nfs_no_placement(self, orchestrator, mock_client): + spec = NFSServiceSpec( + service_type="nfs", + service_id="mycluster", + ) + result = orchestrator.apply_nfs(spec) + assert result.exception is not None + assert "No placement hosts" in str(result.exception) + + def test_apply_nfs_skips_existing(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + {"service": "nfs", "location": "node1", "group_id": "mycluster", "info": "{}"}, + ] + spec = NFSServiceSpec( + service_type="nfs", + service_id="mycluster", + placement=PlacementSpec(hosts=["node1", "node2"]), + ) + result = orchestrator.apply_nfs(spec) + assert "already active on node1" in result.result + assert "enabled on node2" in result.result + mock_client.services.enable_service.assert_called_once() + + +# =================================================================== +# apply_mon() / apply_mgr() / apply_mds() +# =================================================================== + +# =================================================================== +# apply_cephfs_mirror() +# =================================================================== + +class TestApplyCephfsMirror: + def test_apply_cephfs_mirror_success(self, orchestrator, mock_client): + spec = ServiceSpec( + service_type="cephfs-mirror", + placement=PlacementSpec(hosts=["node1"]), + ) + result = orchestrator.apply_cephfs_mirror(spec) + assert result.exception is None + assert "enabled on node1" in result.result + mock_client.services.enable_service.assert_called_once_with( + name="cephfs-mirror", payload="{}", wait=True, + ) + + +# =================================================================== +# _parse_service_name() +# =================================================================== + +class TestParseServiceName: + def test_simple_name(self, orchestrator): + assert orchestrator._parse_service_name("rgw") == ("rgw", "") + + def test_dotted_name(self, orchestrator): + assert orchestrator._parse_service_name("nfs.mycluster") == ("nfs", "mycluster") + + def test_multi_dot_name(self, orchestrator): + """Ensure dotted names like nfs.my.cluster split on first dot only.""" + assert orchestrator._parse_service_name("nfs.my.cluster") == ("nfs", "my.cluster") + + def test_empty_string(self, orchestrator): + assert orchestrator._parse_service_name("") == ("", "") + + +# =================================================================== +# describe_service() - service_name filter +# =================================================================== + +class TestDescribeServiceNameFilter: + def test_filter_by_service_name(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + {"service": "nfs", "location": "node1", "group_id": "c1", "info": "{}"}, + {"service": "nfs", "location": "node1", "group_id": "c2", "info": "{}"}, + ] + result = orchestrator.describe_service(service_name="nfs.c1") + assert len(result.result) == 1 + assert result.result[0].spec.service_id == "c1" + + +# =================================================================== +# remove_service() - dotted non-NFS +# =================================================================== + +class TestRemoveServiceDotted: + def test_remove_dotted_non_nfs_service(self, orchestrator, mock_client): + """e.g. mds.myfs should call delete_service('mds')""" + result = orchestrator.remove_service("mds.myfs") + assert result.exception is None + mock_client.services.delete_service.assert_called_once_with("mds") + + +# =================================================================== +# apply_mon() / apply_mgr() / apply_mds() / apply_cephfs_mirror() +# =================================================================== + +class TestApplyGenericServices: + def test_apply_mon(self, orchestrator, mock_client): + spec = ServiceSpec( + service_type="mon", + placement=PlacementSpec(hosts=["node1"]), + ) + result = orchestrator.apply_mon(spec) + assert result.exception is None + assert "enabled on node1" in result.result + mock_client.services.enable_service.assert_called_once_with( + name="mon", payload="{}", wait=True, + ) + + def test_apply_mgr(self, orchestrator, mock_client): + spec = ServiceSpec( + service_type="mgr", + placement=PlacementSpec(hosts=["node1"]), + ) + result = orchestrator.apply_mgr(spec) + assert result.exception is None + mock_client.services.enable_service.assert_called_once_with( + name="mgr", payload="{}", wait=True, + ) + + def test_apply_mds(self, orchestrator, mock_client): + spec = ServiceSpec( + service_type="mds", + placement=PlacementSpec(hosts=["node1"]), + ) + result = orchestrator.apply_mds(spec) + assert result.exception is None + mock_client.services.enable_service.assert_called_once_with( + name="mds", payload="{}", wait=True, + ) + + +# =================================================================== +# remove_service() +# =================================================================== + +class TestRemoveService: + def test_remove_service_basic(self, orchestrator, mock_client): + result = orchestrator.remove_service("rgw") + assert result.exception is None + assert "Removed service rgw" in result.result + mock_client.services.delete_service.assert_called_once_with("rgw") + + def test_remove_service_nfs_with_cluster_id(self, orchestrator, mock_client): + result = orchestrator.remove_service("nfs.mycluster") + assert result.exception is None + assert "Removed service nfs.mycluster" in result.result + mock_client.services.delete_nfs_service.assert_called_once_with("mycluster") + + def test_remove_service_nfs_without_cluster_id(self, orchestrator, mock_client): + result = orchestrator.remove_service("nfs") + assert result.exception is not None + assert "cluster_id" in str(result.exception) + + def test_remove_service_api_error(self, orchestrator, mock_client): + mock_client.services.delete_service.side_effect = RemoteException("fail") + result = orchestrator.remove_service("rgw") + assert result.exception is not None + + +# =================================================================== +# remove_host() +# =================================================================== + +class TestRemoveHost: + def test_remove_host_success(self, orchestrator, mock_client): + result = orchestrator.remove_host("node2") + assert result.exception is None + assert "Removed host node2" in result.result + mock_client.cluster.remove.assert_called_once_with("node2") + + def test_remove_host_api_error(self, orchestrator, mock_client): + mock_client.cluster.remove.side_effect = RemoteException("node not found") + result = orchestrator.remove_host("badhost") + assert result.exception is not None + + +# =================================================================== +# service_action() +# =================================================================== + +class TestServiceAction: + def test_restart_service(self, orchestrator, mock_client): + result = orchestrator.service_action("restart", "mon") + assert result.exception is None + assert "Restarted mon" in result.result[0] + mock_client.services.restart_services.assert_called_once_with(["mon"]) + + def test_restart_service_with_id(self, orchestrator, mock_client): + result = orchestrator.service_action("restart", "nfs.mycluster") + assert result.exception is None + mock_client.services.restart_services.assert_called_once_with(["nfs"]) + + def test_unsupported_action(self, orchestrator, mock_client): + result = orchestrator.service_action("stop", "mon") + assert result.exception is not None + assert "not supported" in str(result.exception) + + def test_restart_api_error(self, orchestrator, mock_client): + mock_client.services.restart_services.side_effect = RemoteException("fail") + result = orchestrator.service_action("restart", "mon") + assert result.exception is not None From 0d4794912f93b51bfbd00688af519194470f064e Mon Sep 17 00:00:00 2001 From: Utkarsh Bhatt Date: Wed, 29 Apr 2026 15:11:25 +0530 Subject: [PATCH 02/11] ci: add orchestrator test coverage Signed-off-by: Utkarsh Bhatt --- .github/workflows/tests.yml | 25 ++++- tests/scripts/test-orch.sh | 210 ++++++++++++++++++++++++++++++++++++ 2 files changed, 233 insertions(+), 2 deletions(-) create mode 100755 tests/scripts/test-orch.sh diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 07595a261..077277175 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -183,11 +183,17 @@ jobs: with: go-version: 1.25.x - - name: Run unit-tests + - name: Run Go unit-tests run: | cd microceph make check-unit + - name: Run orchestrator Python unit-tests + run: | + pip install pytest + cd microceph-orch + PYTHONPATH=src:tests python3 -m pytest tests/ -v + single-system-tests: name: Single node with encryption runs-on: ubuntu-22.04 @@ -237,7 +243,7 @@ jobs: sudo microceph.ceph mgr module enable microceph sudo microceph.ceph orch set backend microceph - - name: Verify Orch module + - name: Verify Orch module (basic) run: | set -eux hn=$(hostname) @@ -333,6 +339,11 @@ jobs: - name: Disable RGW run: ~/actionutils.sh disable_rgw + - name: Run orchestrator integration tests + run: | + set -eux + bash tests/scripts/test-orch.sh microceph.ceph + - name: Enable RGW with SSL enabled run: ~/actionutils.sh enable_rgw_ssl @@ -523,6 +534,16 @@ jobs: - name: Test RGW certificate rotation with --target (cross-node) run: ~/actionutils.sh headexec test_certificate_set_rgw_target node-wrk1 + - name: Enable orchestrator and run integration tests + run: | + set -eux + lxc exec node-wrk0 -- sh -c "microceph.ceph mgr module enable microceph" + lxc exec node-wrk0 -- sh -c "microceph.ceph orch set backend microceph" + sleep 10 + lxc exec node-wrk0 -- sh -c "microceph.ceph orch status" + lxc file push tests/scripts/test-orch.sh node-wrk0/tmp/test-orch.sh + lxc exec node-wrk0 -- bash /tmp/test-orch.sh microceph.ceph + - name: Print logs for failure if: failure() run: | diff --git a/tests/scripts/test-orch.sh b/tests/scripts/test-orch.sh new file mode 100755 index 000000000..46423fd44 --- /dev/null +++ b/tests/scripts/test-orch.sh @@ -0,0 +1,210 @@ +#!/bin/bash +# Integration tests for the MicroCeph orchestrator module. +# +# Can be run standalone against a pre-configured cluster: +# ./test-orch.sh [ceph-command-prefix] +# +# Or sourced from actionutils.sh for CI. +# +# Expects a bootstrapped MicroCeph cluster with the orchestrator enabled. +# The first argument is the prefix for ceph commands (default: "microceph.ceph"). + +set -uo pipefail + +CEPH="${1:-microceph.ceph}" +PASS=0 +FAIL=0 +ERRORS=() + +run_ceph() { sudo $CEPH "$@" 2>&1; } + +# --- Test helpers --- + +assert_contains() { + local desc="$1" needle="$2" haystack="$3" + if echo "$haystack" | grep -qE "$needle"; then + echo " PASS $desc" + PASS=$((PASS + 1)) + else + echo " FAIL $desc (expected to contain: '$needle')" + FAIL=$((FAIL + 1)) + ERRORS+=("$desc") + fi +} + +assert_not_contains() { + local desc="$1" needle="$2" haystack="$3" + if echo "$haystack" | grep -qE "$needle"; then + echo " FAIL $desc (should NOT contain: '$needle')" + FAIL=$((FAIL + 1)) + ERRORS+=("$desc") + else + echo " PASS $desc" + PASS=$((PASS + 1)) + fi +} + +assert_exit_ok() { + local desc="$1" + shift + if "$@" >/dev/null 2>&1; then + echo " PASS $desc" + PASS=$((PASS + 1)) + else + echo " FAIL $desc (non-zero exit)" + FAIL=$((FAIL + 1)) + ERRORS+=("$desc") + fi +} + +# =================================================================== +echo "=== 1. Orchestrator status ===" +# =================================================================== + +output=$(run_ceph orch status) +assert_contains "backend is microceph" "microceph" "$output" +assert_contains "available" "Available: Yes" "$output" + +# =================================================================== +echo "=== 2. Host listing ===" +# =================================================================== + +output=$(run_ceph orch host ls) +host_count=$(echo "$output" | grep -c "hosts in cluster" || true) +assert_contains "hosts listed" "hosts in cluster" "$output" + +# =================================================================== +echo "=== 3. Service listing (baseline) ===" +# =================================================================== + +output=$(run_ceph orch ls) +assert_contains "mon service" "mon" "$output" +assert_contains "mgr service" "mgr" "$output" + +# =================================================================== +echo "=== 4. Daemon listing ===" +# =================================================================== + +output=$(run_ceph orch ps) +assert_contains "has daemons" "mon" "$output" + +# Get first hostname for filter tests +first_host=$(run_ceph orch host ls | awk 'NR==2{print $1}') +if [ -n "$first_host" ]; then + output=$(run_ceph orch ps --hostname "$first_host") + assert_contains "ps filtered to host" "$first_host" "$output" + + output=$(run_ceph orch ps --daemon-type mon) + assert_contains "ps filtered to mon type" "mon" "$output" +fi + +# =================================================================== +echo "=== 5. Device listing ===" +# =================================================================== + +output=$(run_ceph orch device ls) +# Just verify it doesn't error out +assert_exit_ok "device ls succeeds" run_ceph orch device ls + +# =================================================================== +echo "=== 6. Apply RGW ===" +# =================================================================== + +# Clean up RGW if it was already enabled by prior test steps +run_ceph orch rm rgw >/dev/null 2>&1 || true +sleep 3 + +# Use first host for placement +placement="${first_host:-$(hostname)}" +output=$(run_ceph orch apply rgw default --placement="$placement") +assert_contains "rgw applied" "enabled|already active" "$output" +assert_contains "rgw applied" "enabled" "$output" +sleep 5 + +output=$(run_ceph orch ls) +assert_contains "rgw in service list" "rgw" "$output" + +output=$(run_ceph orch ps --daemon-type rgw) +assert_contains "rgw daemon visible" "rgw" "$output" + +# =================================================================== +echo "=== 7. Apply NFS ===" +# =================================================================== + +output=$(run_ceph orch apply nfs testcluster --placement="$placement") +assert_contains "nfs applied" "enabled|already active" "$output" +sleep 5 + +output=$(run_ceph orch ls) +# NFS may not appear in service list if the service failed to start +# on the backend (e.g. missing kernel modules on CI runners). +if echo "$output" | grep -q "nfs.testcluster"; then + echo " PASS nfs in service list" + PASS=$((PASS + 1)) + NFS_RUNNING=true +else + echo " WARN nfs not in service list (service may have failed to start; skipping daemon check)" + NFS_RUNNING=false +fi + +if [ "$NFS_RUNNING" = true ]; then + output=$(run_ceph orch ps --daemon-type nfs) + assert_contains "nfs daemon visible" "nfs" "$output" +fi + +# =================================================================== +echo "=== 8. Restart service ===" +# =================================================================== + +output=$(run_ceph orch restart mon) +assert_contains "mon restarted" "Restarted" "$output" + +# =================================================================== +echo "=== 9. Remove RGW ===" +# =================================================================== + +output=$(run_ceph orch rm rgw) +assert_contains "rgw removed" "Removed" "$output" +sleep 3 + +# Verify RGW is gone from the local node. In multi-node clusters, +# RGW may still appear on other nodes if it was enabled independently +# (orch rm only affects the local node; per-host targeting not yet +# supported, see UseTarget limitation). +local_host="${first_host:-$(hostname)}" +output=$(run_ceph orch ps --hostname "$local_host" --daemon-type rgw) +assert_not_contains "rgw gone from local node" "rgw" "$output" + +# =================================================================== +echo "=== 10. Remove NFS ===" +# =================================================================== + +output=$(run_ceph orch rm nfs.testcluster) +# NFS removal may fail if the service never fully started +if echo "$output" | grep -q "Removed"; then + echo " PASS nfs removed" + PASS=$((PASS + 1)) +else + echo " WARN nfs removal returned: $output (service may not have been running)" +fi +sleep 3 + +output=$(run_ceph orch ls) +assert_not_contains "nfs gone from list" "nfs" "$output" + +# =================================================================== +echo "" +echo "===========================================" +echo " Results: $PASS passed, $FAIL failed" +echo "===========================================" + +if [ ${#ERRORS[@]} -gt 0 ]; then + echo "" + echo "Failed tests:" + for e in "${ERRORS[@]}"; do + echo " - $e" + done + exit 1 +fi + +exit 0 From 92c9eab73462359bf0b9afa4a33dffbe40c59850 Mon Sep 17 00:00:00 2001 From: Utkarsh Bhatt Date: Mon, 4 May 2026 14:27:13 +0530 Subject: [PATCH 03/11] fix: avoid crash from f-string spec formatting in mgr context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Python 3.12 enum machinery resolves str()/repr() on PlacementSpec and HostPlacementSpec via Enum.__str__/__repr__ inside ceph-mgr, causing AttributeError on _name_/_value_repr_. Two callsites trigger this: - f"...{vars(spec)}" debug logs invoke dict.__repr__ which recurses into each value's __repr__, hitting PlacementSpec → HostPattern enum. - str(h) in _get_placement_hosts falls through to Enum.__str__ on HostPlacementSpec (NamedTuple) instances. Replace 7 vars(spec) f-strings with static debug strings. Use h.hostname directly when extracting placement hosts. Also surface raw output in test-orch.sh assert_contains/assert_not_contains on FAIL to make future diagnosis cheaper. Signed-off-by: Utkarsh Bhatt --- microceph-orch/src/microceph/module.py | 16 ++++++++-------- tests/scripts/test-orch.sh | 6 ++++++ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/microceph-orch/src/microceph/module.py b/microceph-orch/src/microceph/module.py index 9391af7e3..a332e2130 100644 --- a/microceph-orch/src/microceph/module.py +++ b/microceph-orch/src/microceph/module.py @@ -317,7 +317,7 @@ def _get_placement_hosts(self, spec: ServiceSpec) -> List[str]: """ hosts = [] if spec.placement and spec.placement.hosts: - hosts = [str(h) for h in spec.placement.hosts] + hosts = [h.hostname if hasattr(h, 'hostname') else str(h) for h in spec.placement.hosts] if not hosts: raise ValueError( @@ -397,7 +397,7 @@ def apply_rbd_mirror(self, spec: ServiceSpec) -> OrchResult[str]: rbd-mirror is a client-like service with no additional parameters. The Go API's ClientServicePlacement handler takes no payload. """ - logger.debug(f"Applying rbd-mirror service, spec: {vars(spec)}") + logger.debug("Applying rbd-mirror service") return self._apply_service("rbd-mirror", spec, "{}") @handle_orch_error @@ -414,7 +414,7 @@ def apply_rgw(self, spec: RGWSpec) -> OrchResult[str]: key source is added, SSL cannot be fully configured via the orchestrator interface. """ - logger.debug(f"Applying RGW service, spec: {vars(spec)}") + logger.debug("Applying RGW service") # Build RGW-specific payload from the spec. # Go's RgwServicePlacement expects: Port, SSLPort, SSLCertificate, SSLPrivateKey @@ -449,7 +449,7 @@ def apply_nfs(self, spec: NFSServiceSpec) -> OrchResult[str]: Extracts the NFS cluster ID and optional port from the NFSServiceSpec and passes them as the JSON payload. """ - logger.debug(f"Applying NFS service, spec: {vars(spec)}") + logger.debug("Applying NFS service") # Go's NFSServicePlacement expects: cluster_id, v4_min_version, bind_address, bind_port # The Ceph NFSServiceSpec uses service_id as the NFS cluster identifier. @@ -470,19 +470,19 @@ def apply_nfs(self, spec: NFSServiceSpec) -> OrchResult[str]: @handle_orch_error def apply_mon(self, spec: ServiceSpec) -> OrchResult[str]: """Enable the MON service on the target hosts.""" - logger.debug(f"Applying MON service, spec: {vars(spec)}") + logger.debug("Applying MON service") return self._apply_service("mon", spec, "{}") @handle_orch_error def apply_mgr(self, spec: ServiceSpec) -> OrchResult[str]: """Enable the MGR service on the target hosts.""" - logger.debug(f"Applying MGR service, spec: {vars(spec)}") + logger.debug("Applying MGR service") return self._apply_service("mgr", spec, "{}") @handle_orch_error def apply_mds(self, spec: ServiceSpec) -> OrchResult[str]: """Enable the MDS service on the target hosts.""" - logger.debug(f"Applying MDS service, spec: {vars(spec)}") + logger.debug("Applying MDS service") return self._apply_service("mds", spec, "{}") @handle_orch_error @@ -492,7 +492,7 @@ def apply_cephfs_mirror(self, spec: ServiceSpec) -> OrchResult[str]: cephfs-mirror is a client-like service (same as rbd-mirror) with no additional parameters. """ - logger.debug(f"Applying cephfs-mirror service, spec: {vars(spec)}") + logger.debug("Applying cephfs-mirror service") return self._apply_service("cephfs-mirror", spec, "{}") @handle_orch_error diff --git a/tests/scripts/test-orch.sh b/tests/scripts/test-orch.sh index 46423fd44..a95f412dd 100755 --- a/tests/scripts/test-orch.sh +++ b/tests/scripts/test-orch.sh @@ -27,6 +27,9 @@ assert_contains() { PASS=$((PASS + 1)) else echo " FAIL $desc (expected to contain: '$needle')" + echo " ----- actual output -----" + echo "$haystack" | sed 's/^/ | /' + echo " -------------------------" FAIL=$((FAIL + 1)) ERRORS+=("$desc") fi @@ -36,6 +39,9 @@ assert_not_contains() { local desc="$1" needle="$2" haystack="$3" if echo "$haystack" | grep -qE "$needle"; then echo " FAIL $desc (should NOT contain: '$needle')" + echo " ----- actual output -----" + echo "$haystack" | sed 's/^/ | /' + echo " -------------------------" FAIL=$((FAIL + 1)) ERRORS+=("$desc") else From d3202242a2c7ea56471896a18b9059d50575d672 Mon Sep 17 00:00:00 2001 From: Utkarsh Bhatt Date: Mon, 25 May 2026 13:19:35 +0000 Subject: [PATCH 04/11] fix: address orchestrator review findings - get_inventory: drop the dead list_resources() client method and its misleading test mocks; document that the inventory reports OSD-backed disks only, since /1.0/resources is local-node-only and the socket client cannot proxy to peers. - _apply_service: report a race-free, honest summary ": enabled (requested placement: )" instead of falsely claiming each placement host gained the service; per-host targeting is not supported via the unix socket so the actual placement cannot be confirmed. - _get_existing_service_hosts: accept a group_id so distinct NFS clusters are not conflated when checking whether a service already runs. - get_hosts: parse bracketed IPv6 "[addr]:port" addresses and leave bare IPv6 literals intact instead of truncating on the last colon. - service.py: tolerate non-JSON / error-key-less HTTP error bodies when translating microcluster exceptions. - _get_service_hostlist: use .get() for record fields, consistent with the rest of the module. - remove_service: log when a non-NFS service id is ignored. - test-orch.sh: quote $CEPH, drop the unused host_count assignment. - pyproject.toml: declare pytest as a dev dependency group. Signed-off-by: Utkarsh Bhatt --- microceph-orch/pyproject.toml | 5 + .../src/microceph/client/cluster.py | 5 - .../src/microceph/client/service.py | 9 +- microceph-orch/src/microceph/module.py | 100 +++++++++++------- microceph-orch/tests/conftest.py | 3 - microceph-orch/tests/stubs.py | 5 + microceph-orch/tests/test_client.py | 10 -- microceph-orch/tests/test_module.py | 67 +++++++++--- tests/scripts/test-orch.sh | 3 +- 9 files changed, 133 insertions(+), 74 deletions(-) diff --git a/microceph-orch/pyproject.toml b/microceph-orch/pyproject.toml index 41e2e24bb..e0a9d0181 100644 --- a/microceph-orch/pyproject.toml +++ b/microceph-orch/pyproject.toml @@ -12,3 +12,8 @@ dependencies = [ [tool.uv.sources] snap-helpers = { git = "https://github.com/albertodonato/snap-helpers" } + +[dependency-groups] +dev = [ + "pytest>=8.0", +] diff --git a/microceph-orch/src/microceph/client/cluster.py b/microceph-orch/src/microceph/client/cluster.py index e0173eb11..a3e4311ce 100644 --- a/microceph-orch/src/microceph/client/cluster.py +++ b/microceph-orch/src/microceph/client/cluster.py @@ -57,11 +57,6 @@ def list_services(self) -> list[dict]: services = self._get("/1.0/services") return services.get("metadata") or [] - def list_resources(self) -> list[dict]: - """List all resources.""" - nodes = self._get("/1.0/resources") - return nodes.get("metadata") or [] - def list_disks(self) -> list[dict]: """List all disks""" disks = self._get("/1.0/disks") diff --git a/microceph-orch/src/microceph/client/service.py b/microceph-orch/src/microceph/client/service.py index 4eac0f4bc..e1965d841 100644 --- a/microceph-orch/src/microceph/client/service.py +++ b/microceph-orch/src/microceph/client/service.py @@ -145,8 +145,13 @@ def _request(self, method, path, **kwargs): # noqa: C901 too complex try: response.raise_for_status() except HTTPError as e: - # Do some nice translating to microclusterdexceptions - error = response.json().get("error") + # Do some nice translating to microclusterdexceptions. + # The error body is normally JSON with an "error" key, but a + # non-JSON body or a missing key must not mask the real HTTPError. + try: + error = response.json().get("error") or "" + except ValueError: + error = "" if "remote with name" in error: raise NodeAlreadyExistsException( "Already node exists in the microcluster" diff --git a/microceph-orch/src/microceph/module.py b/microceph-orch/src/microceph/module.py index a332e2130..0554834dd 100644 --- a/microceph-orch/src/microceph/module.py +++ b/microceph-orch/src/microceph/module.py @@ -132,12 +132,16 @@ def get_hosts(self) -> List[HostSpec]: """ specs = [] for m in self.microceph.cluster.get_cluster_members(): - # Address from microcluster is in "host:port" format. - # rpartition splits on the last ":" to separate host from port. + # microcluster addresses are "host:port"; IPv6 uses the + # bracketed "[addr]:port" form. Strip the port without + # corrupting a bare IPv6 literal (which also contains ":"). address = m.get('address', '') - addr, _, _ = address.rpartition(":") - if not addr: - # No ":" found; use the raw address (may be hostname only). + if address.startswith('['): + addr = address[1:].partition(']')[0] + elif address.count(':') == 1: + addr = address.rsplit(':', 1)[0] + else: + # No port present (hostname-only or bare IPv6 literal). addr = address specs.append(HostSpec( m.get('name', ''), @@ -151,10 +155,15 @@ def _get_service_hostlist(self, recorded_services: list) -> dict: """Get a dict describing the distribution of services""" service_hostlist = defaultdict(list) for record in recorded_services: - service_name = record['service'] if not record['group_id'] else f"{record['service']}.{record['group_id']}" - service_host = record['location'] + service = record.get('service', '') + group_id = record.get('group_id', '') + service_name = service if not group_id else f"{service}.{group_id}" + service_host = record.get('location', '') service_hostlist[service_name].append(service_host) - logger.debug(f"microcephs record service({service_name}) at ({service_host}) configured({record['info']})") + logger.debug( + f"microceph record service({service_name}) at " + f"({service_host}) configured({record.get('info', '')})" + ) return service_hostlist @staticmethod @@ -269,6 +278,15 @@ def get_inventory(self, host_filter: Optional[InventoryFilter] = None, refresh: bool = False ) -> List[InventoryHost]: + """Report storage device inventory per host. + + Devices are sourced from the cluster-wide OSD list (/1.0/disks), so + every reported device backs an existing OSD and is marked + unavailable. Discovery of free/unused disks is not exposed here: + /1.0/resources reports only the local node and the socket client + cannot proxy to peers (see the _apply_service note on targeting). + Hosts with no OSD disks are still listed so callers see every member. + """ # Resolve which hosts to include based on the filter. filter_hosts = None @@ -327,8 +345,14 @@ def _get_placement_hosts(self, spec: ServiceSpec) -> List[str]: return hosts - def _get_existing_service_hosts(self, service_type: str) -> set: - """Return the set of hostnames that already run the given service type.""" + def _get_existing_service_hosts(self, service_type: str, + group_id: Optional[str] = None) -> set: + """Return the set of hostnames that already run the given service. + + When group_id is provided (e.g. an NFS cluster id), only services + matching both the type and that group are counted, so distinct NFS + clusters are not conflated. + """ try: services = self.microceph.services.list_services() except RemoteException: @@ -336,16 +360,17 @@ def _get_existing_service_hosts(self, service_type: str) -> set: return set() return { - svc['location'] for svc in services - if svc['service'] == service_type + svc.get('location', '') for svc in services + if svc.get('service') == service_type + and (group_id is None or svc.get('group_id', '') == group_id) } - def _apply_service(self, service_name: str, spec: ServiceSpec, payload: str) -> str: - """Common logic for applying (enabling) a service on the local node. + def _apply_service(self, service_name: str, spec: ServiceSpec, + payload: str, group_id: Optional[str] = None) -> str: + """Common logic for applying (enabling) a service. Validates placement hosts and checks whether the service is already - running. Calls enable_service once; the request goes to the local - node via the unix socket. + running, then calls enable_service once. Returns a summary string on success. Raises an exception if placement is invalid or the API call fails. @@ -354,27 +379,20 @@ def _apply_service(self, service_name: str, spec: ServiceSpec, payload: str) -> connects via unix socket to the local node only. MicroCeph's Go client uses UseTarget() to proxy to specific hosts, but this is not exposed through the socket. See todo #6. - Placement hosts are validated and logged, but the enable call - always runs on the local node regardless. + Because the enable call cannot be confirmed per host, the summary + reports the requested placement rather than claiming each host was + individually provisioned. """ hosts = self._get_placement_hosts(spec) - existing = self._get_existing_service_hosts(service_name) + existing = self._get_existing_service_hosts(service_name, group_id) - # Check if the service is already active on the local node. - # Since we can only target the local node, we check if any - # of the requested hosts that are local already have the service. - all_existing = all(h in existing for h in hosts) - if all_existing: + # If every requested host already runs the service, there is + # nothing to do. + if existing.issuperset(hosts): skipped_str = ', '.join(hosts) logger.info(f"Skipping {service_name}: already active on {skipped_str}") return f"{service_name}: already active on {skipped_str}" - if existing: - logger.info( - f"{service_name} already active on {', '.join(existing & set(hosts))}; " - f"enabling for remaining hosts" - ) - logger.info(f"Enabling {service_name} (requested hosts: {', '.join(hosts)})") self.microceph.services.enable_service( name=service_name, @@ -382,13 +400,16 @@ def _apply_service(self, service_name: str, spec: ServiceSpec, payload: str) -> wait=True, ) - new_hosts = [h for h in hosts if h not in existing] - parts = [f"enabled on {', '.join(new_hosts)}"] - skipped_hosts = [h for h in hosts if h in existing] - if skipped_hosts: - parts.append(f"already active on {', '.join(skipped_hosts)}") + # The enable call is served by the local node and cannot be + # confirmed per host (see note above), so report the request + # honestly rather than claiming every placement host gained the + # service. + summary = f"enabled (requested placement: {', '.join(hosts)})" + already_active = sorted(existing & set(hosts)) + if already_active: + summary += f"; already active on {', '.join(already_active)}" - return f"{service_name}: {'; '.join(parts)}" + return f"{service_name}: {summary}" @handle_orch_error def apply_rbd_mirror(self, spec: ServiceSpec) -> OrchResult[str]: @@ -465,7 +486,7 @@ def apply_nfs(self, spec: NFSServiceSpec) -> OrchResult[str]: nfs_params['bind_address'] = spec.virtual_ip payload = json.dumps(nfs_params) - return self._apply_service("nfs", spec, payload) + return self._apply_service("nfs", spec, payload, group_id=spec.service_id) @handle_orch_error def apply_mon(self, spec: ServiceSpec) -> OrchResult[str]: @@ -517,6 +538,11 @@ def remove_service(self, service_name: str, force: bool = False) -> OrchResult[s raise ValueError("NFS removal requires service name in 'nfs.' format") self.microceph.services.delete_nfs_service(svc_id) else: + if svc_id: + logger.debug( + f"Ignoring id '{svc_id}' for {svc_type}: MicroCeph manages " + f"a single {svc_type} service per node" + ) self.microceph.services.delete_service(svc_type) return f"Removed service {service_name}" diff --git a/microceph-orch/tests/conftest.py b/microceph-orch/tests/conftest.py index f2cf1581f..8d231248d 100644 --- a/microceph-orch/tests/conftest.py +++ b/microceph-orch/tests/conftest.py @@ -127,9 +127,6 @@ def mock_client(): # Default: no disks client.services.list_disks.return_value = [] - # Default: no resources - client.services.list_resources.return_value = [] - # Default: status available client.status.is_available.return_value = None diff --git a/microceph-orch/tests/stubs.py b/microceph-orch/tests/stubs.py index e9b6be2a9..1a950fd3a 100644 --- a/microceph-orch/tests/stubs.py +++ b/microceph-orch/tests/stubs.py @@ -4,6 +4,11 @@ The real types live in ceph.deployment.*, mgr_module, and orchestrator; which are only importable inside the Ceph snap environment. These stubs replicate just enough behaviour to test our code outside the snap. + +NOTE: the stubs are intentionally plain classes. They do NOT reproduce +the real Ceph enum / NamedTuple machinery (e.g. PlacementSpec with +HostPlacementSpec entries), so bugs that depend on that behaviour are +only caught by the integration suite (tests/scripts/test-orch.sh). """ from typing import Optional, List, Dict, Any, Generic, TypeVar diff --git a/microceph-orch/tests/test_client.py b/microceph-orch/tests/test_client.py index 145a325d8..27b8514ab 100644 --- a/microceph-orch/tests/test_client.py +++ b/microceph-orch/tests/test_client.py @@ -104,16 +104,6 @@ def test_list_disks_null_metadata(self, mock_session, endpoint): result = svc.list_disks() assert result == [] - def test_list_resources_null_metadata(self, mock_session, endpoint): - svc = ExtendedAPIService(mock_session, endpoint) - mock_session.request.return_value = MagicMock( - status_code=200, - text='{}', - json=lambda: {"metadata": None}, - ) - result = svc.list_resources() - assert result == [] - def test_enable_service_payload(self, mock_session, endpoint): svc = ExtendedAPIService(mock_session, endpoint) mock_session.request.return_value = MagicMock( diff --git a/microceph-orch/tests/test_module.py b/microceph-orch/tests/test_module.py index 31f646753..cbe6076c7 100644 --- a/microceph-orch/tests/test_module.py +++ b/microceph-orch/tests/test_module.py @@ -18,6 +18,12 @@ from microceph.client.service import RemoteException +def _svc(service, location, group_id="", info=""): + """Build a MicroCeph service record as returned by list_services().""" + return {"service": service, "location": location, + "group_id": group_id, "info": info} + + # =================================================================== # available() # =================================================================== @@ -70,6 +76,17 @@ def test_get_hosts_no_port_in_address(self, orchestrator, mock_client): # Should fall back to raw address when no ":" present assert result.result[0].addr == "192.168.1.100" + def test_get_hosts_ipv6_address(self, orchestrator, mock_client): + mock_client.cluster.get_cluster_members.return_value = [ + # bracketed IPv6 with port + {"name": "h1", "address": "[fe80::1]:7443", "status": "ONLINE"}, + # bare IPv6 literal, no port + {"name": "h2", "address": "fe80::2", "status": "ONLINE"}, + ] + result = orchestrator.get_hosts() + assert result.result[0].addr == "fe80::1" + assert result.result[1].addr == "fe80::2" + def test_get_hosts_missing_address(self, orchestrator, mock_client): mock_client.cluster.get_cluster_members.return_value = [ {"name": "h1", "status": "ONLINE"}, @@ -250,7 +267,6 @@ def test_get_inventory_shows_osd_disks(self, orchestrator, mock_client): {"location": "node1", "path": "/dev/sdb"}, {"location": "node1", "path": "/dev/sdc"}, ] - mock_client.services.list_resources.return_value = {} result = orchestrator.get_inventory() inv = result.result # 3 hosts (from cluster members), node1 has 2 OSD disks @@ -265,7 +281,6 @@ def test_get_inventory_multi_host(self, orchestrator, mock_client): {"location": "node1", "path": "/dev/sda"}, {"location": "node2", "path": "/dev/sda"}, ] - mock_client.services.list_resources.return_value = {} result = orchestrator.get_inventory() hosts = {h.name for h in result.result} assert hosts == {"node1", "node2", "node3"} @@ -276,7 +291,6 @@ def test_get_inventory_with_host_filter(self, orchestrator, mock_client): {"location": "node2", "path": "/dev/sda"}, {"location": "node3", "path": "/dev/sda"}, ] - mock_client.services.list_resources.return_value = {} filt = InventoryFilter(hosts=["node1", "node3"]) result = orchestrator.get_inventory(host_filter=filt) hosts = {h.name for h in result.result} @@ -284,14 +298,12 @@ def test_get_inventory_with_host_filter(self, orchestrator, mock_client): def test_get_inventory_empty(self, orchestrator, mock_client): mock_client.services.list_disks.return_value = [] - mock_client.services.list_resources.return_value = {} mock_client.cluster.get_cluster_members.return_value = [] result = orchestrator.get_inventory() assert result.result == [] def test_get_inventory_includes_members_without_disks(self, orchestrator, mock_client): mock_client.services.list_disks.return_value = [] - mock_client.services.list_resources.return_value = {} # Default mock has 3 cluster members result = orchestrator.get_inventory() hosts = {h.name for h in result.result} @@ -313,7 +325,10 @@ def test_apply_rbd_mirror_success(self, orchestrator, mock_client): ) result = orchestrator.apply_rbd_mirror(spec) assert result.exception is None - assert "enabled on node1, node2" in result.result + # Summary confirms enablement and echoes the requested placement + # without claiming each host was individually provisioned. + assert "enabled" in result.result + assert "requested placement: node1, node2" in result.result # Single API call regardless of host count (local socket only) mock_client.services.enable_service.assert_called_once() @@ -324,8 +339,9 @@ def test_apply_rbd_mirror_no_placement(self, orchestrator, mock_client): assert "No placement hosts" in str(result.exception) def test_apply_rbd_mirror_skips_existing(self, orchestrator, mock_client): + # node1 already runs the service; node2 is also requested. mock_client.services.list_services.return_value = [ - {"service": "rbd-mirror", "location": "node1", "group_id": "", "info": ""}, + _svc("rbd-mirror", "node1"), ] spec = ServiceSpec( service_type="rbd-mirror", @@ -334,12 +350,12 @@ def test_apply_rbd_mirror_skips_existing(self, orchestrator, mock_client): result = orchestrator.apply_rbd_mirror(spec) assert result.exception is None assert "already active on node1" in result.result - assert "enabled on node2" in result.result + assert "enabled" in result.result mock_client.services.enable_service.assert_called_once() def test_apply_rbd_mirror_all_existing(self, orchestrator, mock_client): mock_client.services.list_services.return_value = [ - {"service": "rbd-mirror", "location": "node1", "group_id": "", "info": ""}, + _svc("rbd-mirror", "node1"), ] spec = ServiceSpec( service_type="rbd-mirror", @@ -372,7 +388,8 @@ def test_apply_rgw_basic(self, orchestrator, mock_client): ) result = orchestrator.apply_rgw(spec) assert result.exception is None - assert "enabled on node1" in result.result + assert "enabled" in result.result + assert "node1" in result.result call_kwargs = mock_client.services.enable_service.call_args assert call_kwargs.kwargs["name"] == "rgw" @@ -428,7 +445,8 @@ def test_apply_nfs_basic(self, orchestrator, mock_client): ) result = orchestrator.apply_nfs(spec) assert result.exception is None - assert "enabled on node1" in result.result + assert "enabled" in result.result + assert "node1" in result.result call_kwargs = mock_client.services.enable_service.call_args payload = json.loads(call_kwargs.kwargs["payload"]) @@ -478,7 +496,7 @@ def test_apply_nfs_no_placement(self, orchestrator, mock_client): def test_apply_nfs_skips_existing(self, orchestrator, mock_client): mock_client.services.list_services.return_value = [ - {"service": "nfs", "location": "node1", "group_id": "mycluster", "info": "{}"}, + _svc("nfs", "node1", group_id="mycluster", info="{}"), ] spec = NFSServiceSpec( service_type="nfs", @@ -487,7 +505,24 @@ def test_apply_nfs_skips_existing(self, orchestrator, mock_client): ) result = orchestrator.apply_nfs(spec) assert "already active on node1" in result.result - assert "enabled on node2" in result.result + assert "enabled" in result.result + mock_client.services.enable_service.assert_called_once() + + def test_apply_nfs_distinct_cluster_not_conflated(self, orchestrator, mock_client): + """A host running nfs.other must not suppress enabling nfs.mycluster.""" + mock_client.services.list_services.return_value = [ + _svc("nfs", "node1", group_id="other", info="{}"), + ] + spec = NFSServiceSpec( + service_type="nfs", + service_id="mycluster", + placement=PlacementSpec(hosts=["node1"]), + ) + result = orchestrator.apply_nfs(spec) + assert result.exception is None + # nfs.other on node1 must not be mistaken for nfs.mycluster + assert "already active" not in result.result + assert "enabled" in result.result mock_client.services.enable_service.assert_called_once() @@ -507,7 +542,8 @@ def test_apply_cephfs_mirror_success(self, orchestrator, mock_client): ) result = orchestrator.apply_cephfs_mirror(spec) assert result.exception is None - assert "enabled on node1" in result.result + assert "enabled" in result.result + assert "node1" in result.result mock_client.services.enable_service.assert_called_once_with( name="cephfs-mirror", payload="{}", wait=True, ) @@ -571,7 +607,8 @@ def test_apply_mon(self, orchestrator, mock_client): ) result = orchestrator.apply_mon(spec) assert result.exception is None - assert "enabled on node1" in result.result + assert "enabled" in result.result + assert "node1" in result.result mock_client.services.enable_service.assert_called_once_with( name="mon", payload="{}", wait=True, ) diff --git a/tests/scripts/test-orch.sh b/tests/scripts/test-orch.sh index a95f412dd..969e2a43d 100755 --- a/tests/scripts/test-orch.sh +++ b/tests/scripts/test-orch.sh @@ -16,7 +16,7 @@ PASS=0 FAIL=0 ERRORS=() -run_ceph() { sudo $CEPH "$@" 2>&1; } +run_ceph() { sudo "$CEPH" "$@" 2>&1; } # --- Test helpers --- @@ -76,7 +76,6 @@ echo "=== 2. Host listing ===" # =================================================================== output=$(run_ceph orch host ls) -host_count=$(echo "$output" | grep -c "hosts in cluster" || true) assert_contains "hosts listed" "hosts in cluster" "$output" # =================================================================== From 1dde94973de128df9cc190796fa85ee692c589dd Mon Sep 17 00:00:00 2001 From: Utkarsh Bhatt Date: Tue, 26 May 2026 21:23:31 +0530 Subject: [PATCH 05/11] ci: install microceph-orch package before orch unit tests `pip install pytest` alone left runtime deps like `requests` to runner preinstall, which is brittle. Install the package itself so declared dependencies resolve. Signed-off-by: Utkarsh Bhatt Assisted-by: Claude Opus 4.7 --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 077277175..f15a15ad8 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -190,7 +190,7 @@ jobs: - name: Run orchestrator Python unit-tests run: | - pip install pytest + python3 -m pip install ./microceph-orch pytest cd microceph-orch PYTHONPATH=src:tests python3 -m pytest tests/ -v From f06e3b45d2c0ae83e51a5914408af095c2736e64 Mon Sep 17 00:00:00 2001 From: Utkarsh Bhatt Date: Tue, 26 May 2026 21:23:36 +0530 Subject: [PATCH 06/11] feat(orch): reject InventoryFilter.labels in get_inventory MicroCeph has no host-label concept. Previously labels were silently ignored, returning unfiltered hosts/devices to callers that expected a label filter. Raise NotImplementedError instead so the mismatch surfaces. Signed-off-by: Utkarsh Bhatt Assisted-by: Claude Opus 4.7 --- microceph-orch/src/microceph/module.py | 12 ++++++++++-- microceph-orch/tests/test_module.py | 6 ++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/microceph-orch/src/microceph/module.py b/microceph-orch/src/microceph/module.py index 0554834dd..33741c900 100644 --- a/microceph-orch/src/microceph/module.py +++ b/microceph-orch/src/microceph/module.py @@ -286,12 +286,20 @@ def get_inventory(self, /1.0/resources reports only the local node and the socket client cannot proxy to peers (see the _apply_service note on targeting). Hosts with no OSD disks are still listed so callers see every member. + + Only `host_filter.hosts` is honored; `host_filter.labels` raises + NotImplementedError because MicroCeph has no host-label concept. """ # Resolve which hosts to include based on the filter. filter_hosts = None - if host_filter and host_filter.hosts: - filter_hosts = set(host_filter.hosts) + if host_filter: + if host_filter.labels: + raise NotImplementedError( + "MicroCeph orchestrator does not support host labels" + ) + if host_filter.hosts: + filter_hosts = set(host_filter.hosts) osd_disks = self.microceph.services.list_disks() diff --git a/microceph-orch/tests/test_module.py b/microceph-orch/tests/test_module.py index cbe6076c7..6c21087ad 100644 --- a/microceph-orch/tests/test_module.py +++ b/microceph-orch/tests/test_module.py @@ -296,6 +296,12 @@ def test_get_inventory_with_host_filter(self, orchestrator, mock_client): hosts = {h.name for h in result.result} assert hosts == {"node1", "node3"} + def test_get_inventory_label_filter_rejected(self, orchestrator, mock_client): + filt = InventoryFilter(labels=["role=osd"]) + result = orchestrator.get_inventory(host_filter=filt) + assert isinstance(result.exception, NotImplementedError) + mock_client.services.list_disks.assert_not_called() + def test_get_inventory_empty(self, orchestrator, mock_client): mock_client.services.list_disks.return_value = [] mock_client.cluster.get_cluster_members.return_value = [] From 8491cbdcc834ad5ba47023616edc0191bb2ae9b0 Mon Sep 17 00:00:00 2001 From: Utkarsh Bhatt Date: Tue, 26 May 2026 21:23:43 +0530 Subject: [PATCH 07/11] test(orch): drop redundant 'enabled' assert after RGW apply The preceding assert already accepts "enabled|already active". The second assert forced "enabled" only, which would false-negative when apply legitimately reports "already active" (idempotent re-apply after the test's earlier `orch rm rgw`). Signed-off-by: Utkarsh Bhatt Assisted-by: Claude Opus 4.7 --- tests/scripts/test-orch.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/scripts/test-orch.sh b/tests/scripts/test-orch.sh index 969e2a43d..ccf36f817 100755 --- a/tests/scripts/test-orch.sh +++ b/tests/scripts/test-orch.sh @@ -123,7 +123,6 @@ sleep 3 placement="${first_host:-$(hostname)}" output=$(run_ceph orch apply rgw default --placement="$placement") assert_contains "rgw applied" "enabled|already active" "$output" -assert_contains "rgw applied" "enabled" "$output" sleep 5 output=$(run_ceph orch ls) From cab4e2535abe10546f6304e79edf95b77d8b7ace Mon Sep 17 00:00:00 2001 From: Utkarsh Bhatt Date: Wed, 27 May 2026 13:39:46 +0530 Subject: [PATCH 08/11] fix(orch): make list_daemons resilient to missing service metadata keys ExtendedAPIService.list_services() may return records without group_id (and other optional keys). Direct dict indexing in list_daemons raised KeyError and broke `ceph orch ps`. Switch to .get() with empty-string defaults, consistent with how the same fields are read in _get_service_hostlist() and the inventory path. Signed-off-by: Utkarsh Bhatt Assisted-by: Claude Opus 4.7 --- microceph-orch/src/microceph/module.py | 6 +++--- microceph-orch/tests/test_module.py | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/microceph-orch/src/microceph/module.py b/microceph-orch/src/microceph/module.py index 33741c900..ac4d5d29d 100644 --- a/microceph-orch/src/microceph/module.py +++ b/microceph-orch/src/microceph/module.py @@ -234,9 +234,9 @@ def list_daemons(self, services = self.microceph.services.list_services() descriptions = [] for svc in services: - svc_daemon_type = svc['service'] - svc_hostname = svc['location'] - svc_group_id = svc['group_id'] + svc_daemon_type = svc.get('service', '') + svc_hostname = svc.get('location', '') + svc_group_id = svc.get('group_id', '') svc_ip = None svc_ports = None svc_name = f"{svc_daemon_type}.{svc_group_id}" if svc_group_id else svc_daemon_type diff --git a/microceph-orch/tests/test_module.py b/microceph-orch/tests/test_module.py index 6c21087ad..0e4aeec89 100644 --- a/microceph-orch/tests/test_module.py +++ b/microceph-orch/tests/test_module.py @@ -256,6 +256,20 @@ def test_list_daemons_nfs_null_info(self, orchestrator, mock_client): result = orchestrator.list_daemons() assert len(result.result) == 1 + def test_list_daemons_metadata_missing_optional_keys(self, orchestrator, mock_client): + # ExtendedAPIService.list_services() may omit group_id (and other + # optional keys); list_daemons must not raise KeyError. + mock_client.services.list_services.return_value = [ + {"service": "mon", "location": "n1"}, + ] + result = orchestrator.list_daemons() + assert result.exception is None + assert len(result.result) == 1 + daemon = result.result[0] + assert daemon.daemon_type == "mon" + assert daemon.hostname == "n1" + assert daemon.service_name == "mon" + # =================================================================== # get_inventory() From 4b5cf71915413ab3a9bcb558e194321380a2040b Mon Sep 17 00:00:00 2001 From: Utkarsh Bhatt Date: Wed, 27 May 2026 15:44:10 +0530 Subject: [PATCH 09/11] orch: address review feedback on apply_rgw, apply_mds, service_action Three issues raised in PR review: 1. apply_rgw silently drops spec.service_id. MicroCeph deploys a single bare "rgw" service per node and does not support multiple RGW realms with distinct service_ids on the same cluster. Log a warning when a service_id is supplied so users get a clear signal instead of confusing apparently-successful no-op behaviour, and document the limitation on the docstring. 2. apply_mds has the same shape: per-filesystem MDS placement is not supported and service_id is silently ignored. Same fix. 3. service_action("restart", ...) only the backend serviceWorkerTable in microceph/ceph/services.go has entries for osd, mon and rgw; any other service type would have surfaced an opaque `no handler defined for service X` RemoteException from the backend. Reject unsupported service types up front with a NotImplementedError that names the supported set, and add a RESTART_SUPPORTED_SERVICES constant so the boundary is explicit. Tests: - test_apply_rgw_service_id_warns - test_apply_mds_service_id_warns - test_restart_unsupported_service_type covers nfs/mds/mgr/rbd-mirror - test_restart_service_with_id updated to use rgw.realm1 (was nfs.mycluster, which is now correctly rejected) Signed-off-by: Utkarsh Bhatt Assisted-by: Claude Opus 4.7 --- microceph-orch/src/microceph/module.py | 41 +++++++++++++++++- microceph-orch/tests/test_module.py | 58 +++++++++++++++++++++++++- 2 files changed, 96 insertions(+), 3 deletions(-) diff --git a/microceph-orch/src/microceph/module.py b/microceph-orch/src/microceph/module.py index ac4d5d29d..08dd53fdd 100644 --- a/microceph-orch/src/microceph/module.py +++ b/microceph-orch/src/microceph/module.py @@ -45,6 +45,12 @@ 'nfs': NFSServiceSpec, } +# Services for which MicroCeph's backend supports restart. The Go +# serviceWorkerTable (microceph/ceph/services.go) only registers handlers +# for osd, mon and rgw; restarting any other service raises +# "no handler defined for service X" at the backend. +RESTART_SUPPORTED_SERVICES = frozenset({"osd", "mon", "rgw"}) + class MicroCephOrchestrator(Orchestrator, MgrModule, metaclass=CLICommandMeta): @@ -442,9 +448,23 @@ def apply_rgw(self, spec: RGWSpec) -> OrchResult[str]: SSLPrivateKey to enable SSL. Until the spec is extended or a separate key source is added, SSL cannot be fully configured via the orchestrator interface. + + Note on service_id (realms/zones): MicroCeph deploys a single bare + "rgw" service per node and does not support multiple RGW instances + with distinct service_ids on the same cluster. If a service_id is + provided in the spec, it is ignored and a warning is logged. """ logger.debug("Applying RGW service") + # MicroCeph stores a single bare "rgw" service; per-realm service_id + # is not supported by the backend, so we drop it with a warning. + if spec.service_id: + logger.warning( + f"RGW service_id={spec.service_id!r} ignored: MicroCeph deploys " + "a single 'rgw' service; multiple RGW realms with distinct " + "service_ids are not supported." + ) + # Build RGW-specific payload from the spec. # Go's RgwServicePlacement expects: Port, SSLPort, SSLCertificate, SSLPrivateKey # Field names match Go exported field names (no json tags defined, @@ -510,8 +530,20 @@ def apply_mgr(self, spec: ServiceSpec) -> OrchResult[str]: @handle_orch_error def apply_mds(self, spec: ServiceSpec) -> OrchResult[str]: - """Enable the MDS service on the target hosts.""" + """Enable the MDS service on the target hosts. + + Note on service_id (filesystem name): MicroCeph deploys a single + bare "mds" service and does not support per-filesystem MDS + placement. If a service_id is provided in the spec, it is + ignored and a warning is logged. + """ logger.debug("Applying MDS service") + if spec.service_id: + logger.warning( + f"MDS service_id={spec.service_id!r} ignored: MicroCeph deploys " + "a single 'mds' service; per-filesystem MDS placement is not " + "supported." + ) return self._apply_service("mds", spec, "{}") @handle_orch_error @@ -588,5 +620,12 @@ def service_action(self, action: str, service_name: str) -> OrchResult[List[str] svc_type, _ = self._parse_service_name(service_name) + if svc_type not in RESTART_SUPPORTED_SERVICES: + raise NotImplementedError( + f"Restart of service type {svc_type!r} is not supported by " + f"MicroCeph. Supported services: " + f"{sorted(RESTART_SUPPORTED_SERVICES)}." + ) + self.microceph.services.restart_services([svc_type]) return [f"Restarted {service_name}"] diff --git a/microceph-orch/tests/test_module.py b/microceph-orch/tests/test_module.py index 0e4aeec89..87dac2ab8 100644 --- a/microceph-orch/tests/test_module.py +++ b/microceph-orch/tests/test_module.py @@ -451,6 +451,28 @@ def test_apply_rgw_no_placement(self, orchestrator, mock_client): result = orchestrator.apply_rgw(spec) assert result.exception is not None + def test_apply_rgw_service_id_warns(self, orchestrator, mock_client, caplog): + # MicroCeph deploys a single bare 'rgw' service; per-realm + # service_id is silently dropped — log a warning instead of + # surprising the caller. + spec = RGWSpec( + service_type="rgw", + service_id="realm1", + placement=PlacementSpec(hosts=["node1"]), + ) + import logging + with caplog.at_level(logging.WARNING): + result = orchestrator.apply_rgw(spec) + + assert result.exception is None + # service_id is not forwarded to the backend. + call_kwargs = mock_client.services.enable_service.call_args + assert call_kwargs.kwargs["name"] == "rgw" + assert any( + "service_id" in r.message and "realm1" in r.message + for r in caplog.records + ) + # =================================================================== # apply_nfs() @@ -655,6 +677,27 @@ def test_apply_mds(self, orchestrator, mock_client): name="mds", payload="{}", wait=True, ) + def test_apply_mds_service_id_warns(self, orchestrator, mock_client, caplog): + # Per-filesystem MDS placement is not supported by MicroCeph; + # service_id (filesystem name) must be dropped with a warning. + spec = ServiceSpec( + service_type="mds", + service_id="fs1", + placement=PlacementSpec(hosts=["node1"]), + ) + import logging + with caplog.at_level(logging.WARNING): + result = orchestrator.apply_mds(spec) + + assert result.exception is None + mock_client.services.enable_service.assert_called_once_with( + name="mds", payload="{}", wait=True, + ) + assert any( + "service_id" in r.message and "fs1" in r.message + for r in caplog.records + ) + # =================================================================== # remove_service() @@ -713,9 +756,20 @@ def test_restart_service(self, orchestrator, mock_client): mock_client.services.restart_services.assert_called_once_with(["mon"]) def test_restart_service_with_id(self, orchestrator, mock_client): - result = orchestrator.service_action("restart", "nfs.mycluster") + # Service identifiers are stripped before dispatching to the + # MicroCeph API; rgw is a supported restart target. + result = orchestrator.service_action("restart", "rgw.realm1") assert result.exception is None - mock_client.services.restart_services.assert_called_once_with(["nfs"]) + mock_client.services.restart_services.assert_called_once_with(["rgw"]) + + def test_restart_unsupported_service_type(self, orchestrator, mock_client): + # MicroCeph's backend serviceWorkerTable only handles osd/mon/rgw. + # The orchestrator must reject restart for other service types up + # front rather than surfacing an opaque RemoteException. + for svc in ["nfs.mycluster", "mds", "mgr", "rbd-mirror"]: + result = orchestrator.service_action("restart", svc) + assert isinstance(result.exception, NotImplementedError), svc + mock_client.services.restart_services.assert_not_called() def test_unsupported_action(self, orchestrator, mock_client): result = orchestrator.service_action("stop", "mon") From a350716f95a7924c3e6583a55c4fcedd0db9e71e Mon Sep 17 00:00:00 2001 From: Utkarsh Bhatt Date: Thu, 28 May 2026 11:32:55 +0530 Subject: [PATCH 10/11] orch: per-host targeting via ?target= and stricter service-name handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds end-to-end per-host targeting for service enable / remove / restart, plus a bare-name guard that aligns the orchestrator's service-name semantics with MicroCeph's single-bare-instance-per-node backend. Transport: MicroCeph's HTTP API endpoints for service mutation all declare `ProxyTarget: true` in the Go rest layer. The Python client keeps using the local unix socket; the server-side proxyTarget middleware in microcluster reads the new `?target=` query parameter and mTLS-forwards the request to that member's HTTPS endpoint. No certs or extra config required in the mgr snap; ceph-mgr and microcephd share `$SNAP_COMMON`. Behavioural changes: * `_apply_service`, `remove_service`, `service_action` fan out per host. Already-running hosts are skipped on apply; remove and restart discover hosts via `list_services()`. Any per-host failure raises `RuntimeError` with partial-success context, instead of silently mixing failure entries into a success result. * `_get_existing_service_hosts` no longer swallows `RemoteException` on `list_services()` — a backend failure now surfaces rather than producing a false "no-op" success. * New `_require_bare_service_name(svc_type, svc_id)` helper and a `DOTTED_NAME_SUPPORTED_SERVICES = {"nfs"}` allowlist. Dotted names on any other service (rgw.realm1, mds.fs1, mon.x, ...) are rejected up front with `ValueError`. Previously rgw/mds warned-and-dropped; mon/mgr/rbd-mirror/cephfs-mirror silently dropped. Same guard applied on `remove_service` and `service_action`. * `_apply_service` parameter renamed `service_name` -> `service_type` to reflect that it always receives the bare type, never a dotted name. Client (`microceph-orch/src/microceph/client/cluster.py`): * `enable_service`, `delete_service`, `restart_services`, `delete_nfs_service` accept an optional `target: str | None` kwarg. When set, it is forwarded as a `?target=` query param to drive the server-side proxy. Guard uses `if target is not None` so an empty string is no longer silently equivalent to None. Tests: * 94 unit tests pass (was 79 before this branch). New coverage: per-host target assertion on every apply/remove/restart success path; `_apply_service` partial-failure raises with context; `_get_existing_service_hosts` exception propagation; dotted-name rejection on rgw/mds/mon/mgr/rbd-mirror/cephfs-mirror; NFS group_id no-match remove; `service_action` partial failure; `reset_mock` per loop iter in unsupported-restart test. * `side_effect` lambdas use `(*args, **kwargs)` to remain robust to positional-vs-keyword call shapes. Multi-node CI integration test (`tests/scripts/test-orch.sh`): * New section 6b applies rgw with `--placement="$first_host,$second_host"` and verifies the daemon appears on the *second* host via `orch ps --hostname` — exercising the unix-socket -> proxyTarget -> remote mTLS path end to end. * Section 9 verifies cluster-wide RGW removal (every host) instead of just the local node. * Section 8 asserts the new `Restarted mon on ` output shape and that the restart enumerates each host in a multi-host cluster. * Adds a guard test that `orch restart rgw.realm1` fails with `does not support a service_id`. Refs canonical/microceph#743 Signed-off-by: Utkarsh Bhatt Assisted-by: Claude Opus 4.7 --- .../src/microceph/client/cluster.py | 33 +- microceph-orch/src/microceph/module.py | 294 ++++++++++++----- microceph-orch/tests/test_module.py | 303 ++++++++++++++---- tests/scripts/test-orch.sh | 60 +++- 4 files changed, 530 insertions(+), 160 deletions(-) diff --git a/microceph-orch/src/microceph/client/cluster.py b/microceph-orch/src/microceph/client/cluster.py index a3e4311ce..ed61cda85 100644 --- a/microceph-orch/src/microceph/client/cluster.py +++ b/microceph-orch/src/microceph/client/cluster.py @@ -74,7 +74,8 @@ def get_status(self) -> dict[str, dict]: for member in members } - def enable_service(self, name: str, payload: str = "", wait: bool = True) -> None: + def enable_service(self, name: str, payload: str = "", wait: bool = True, + target: str | None = None) -> None: """Enable a service on the cluster. Sends a PUT request to /1.0/services/ with an EnableService payload. @@ -85,38 +86,54 @@ def enable_service(self, name: str, payload: str = "", wait: bool = True) -> Non :param name: service name (e.g. 'rgw', 'nfs', 'rbd-mirror') :param payload: JSON string with service-specific parameters :param wait: if True, the server waits for the service to be fully enabled + :param target: optional cluster member name. When set, the + server-side proxyTarget middleware (microcluster) forwards + the request over mTLS to that node, allowing per-host + service enablement from the local unix socket. When None, + no target is forwarded and the server handles the request + on the node receiving the unix-socket connection. """ # Note: The "bool" key maps to Go's EnableService.Wait field which has # the struct tag `json:"bool"` (upstream naming quirk in MicroCeph). + params = {"target": target} if target is not None else None self._put(f"/1.0/services/{name}", json={ "name": name, "bool": wait, "payload": payload, - }) + }, params=params) - def delete_service(self, name: str) -> None: + def delete_service(self, name: str, target: str | None = None) -> None: """Delete/disable a service on the cluster. :param name: service name (e.g. 'rgw', 'nfs', 'rbd-mirror') + :param target: optional cluster member name; see enable_service. """ - self._delete(f"/1.0/services/{name}") + params = {"target": target} if target is not None else None + self._delete(f"/1.0/services/{name}", params=params) - def restart_services(self, services: list[str]) -> None: + def restart_services(self, services: list[str], + target: str | None = None) -> None: """Restart one or more services on the cluster. Sends a POST to /1.0/services/restart with a list of service objects. :param services: list of service names (e.g. ['mon', 'rgw']) + :param target: optional cluster member name; see enable_service. """ + params = {"target": target} if target is not None else None payload = [{"service": svc} for svc in services] - self._post("/1.0/services/restart", json=payload) + self._post("/1.0/services/restart", json=payload, params=params) - def delete_nfs_service(self, cluster_id: str) -> None: + def delete_nfs_service(self, cluster_id: str, + target: str | None = None) -> None: """Delete/disable an NFS service by cluster ID. NFS deletion requires the cluster_id in the request body, unlike other services which are identified by URL path alone. :param cluster_id: NFS cluster identifier + :param target: optional cluster member name; see enable_service. """ - self._delete("/1.0/services/nfs", json={"cluster_id": cluster_id}) + params = {"target": target} if target is not None else None + self._delete("/1.0/services/nfs", + json={"cluster_id": cluster_id}, params=params) diff --git a/microceph-orch/src/microceph/module.py b/microceph-orch/src/microceph/module.py index 08dd53fdd..57b75ec87 100644 --- a/microceph-orch/src/microceph/module.py +++ b/microceph-orch/src/microceph/module.py @@ -51,6 +51,31 @@ # "no handler defined for service X" at the backend. RESTART_SUPPORTED_SERVICES = frozenset({"osd", "mon", "rgw"}) +# Services for which MicroCeph distinguishes individual instances via a +# `service_id` (the `.` dotted form). Currently only NFS, where +# the id is the NFS cluster_id. Any other service is deployed as a single +# bare instance per node; using a dotted name on them is rejected up front +# so operators get a clear error instead of an apparently successful +# operation that silently targets the wrong scope. +DOTTED_NAME_SUPPORTED_SERVICES = frozenset({"nfs"}) + + +def _require_bare_service_name(svc_type: str, svc_id: str) -> None: + """Reject dotted service names for services that don't support them. + + Raises ValueError if `svc_id` is non-empty and `svc_type` is not in + DOTTED_NAME_SUPPORTED_SERVICES. + """ + if svc_id and svc_type not in DOTTED_NAME_SUPPORTED_SERVICES: + supported = ", ".join(sorted(DOTTED_NAME_SUPPORTED_SERVICES)) + raise ValueError( + f"Service type '{svc_type}' does not support a service_id; " + f"MicroCeph deploys a single bare '{svc_type}' per node. " + f"Use '{svc_type}' (without an id) instead of " + f"'{svc_type}.{svc_id}'. Dotted names are only valid for: " + f"{supported}." + ) + class MicroCephOrchestrator(Orchestrator, MgrModule, metaclass=CLICommandMeta): @@ -366,64 +391,99 @@ def _get_existing_service_hosts(self, service_type: str, When group_id is provided (e.g. an NFS cluster id), only services matching both the type and that group are counted, so distinct NFS clusters are not conflated. - """ - try: - services = self.microceph.services.list_services() - except RemoteException: - logger.warning(f"Failed to list services while checking existing {service_type} hosts") - return set() + A failure to list services from the backend is propagated to the + caller rather than swallowed: returning an empty set on error + would otherwise produce false "no-op" successes and re-apply + services that are in fact already running. + """ + services = self.microceph.services.list_services() return { svc.get('location', '') for svc in services if svc.get('service') == service_type and (group_id is None or svc.get('group_id', '') == group_id) } - def _apply_service(self, service_name: str, spec: ServiceSpec, + def _apply_service(self, service_type: str, spec: ServiceSpec, payload: str, group_id: Optional[str] = None) -> str: """Common logic for applying (enabling) a service. - Validates placement hosts and checks whether the service is already - running, then calls enable_service once. + Per-host targeting: MicroCeph's HTTP API endpoints for service + enable/delete/restart all declare ProxyTarget=true in the Go + rest layer. The microcluster proxyTarget middleware inspects the + `?target=` query parameter and forwards the request over + mTLS to that member's HTTPS endpoint. The Python client uses the + local unix socket; the server transparently proxies per-host + calls from there, so no direct HTTPS connectivity from the orch + module is required. + + We iterate requested placement hosts, skip any host that already + runs the service, and call enable_service once per remaining + host with `target=`. Any per-host failure raises so the + operator gets a visible error rather than a partial-success + result that the Ceph orchestrator framework would render as + green. + + Note: `service_type` is the bare service type (e.g. "rgw", + "nfs"), never the dotted form. Callers that accept dotted names + are expected to parse and pass `group_id` separately. Returns a summary string on success. - Raises an exception if placement is invalid or the API call fails. - - NOTE: Per-host targeting is not yet supported. The Python client - connects via unix socket to the local node only. MicroCeph's Go - client uses UseTarget() to proxy to specific hosts, but this is - not exposed through the socket. See todo #6. - Because the enable call cannot be confirmed per host, the summary - reports the requested placement rather than claiming each host was - individually provisioned. + Raises RuntimeError if any host fails to enable. """ hosts = self._get_placement_hosts(spec) - existing = self._get_existing_service_hosts(service_name, group_id) + existing = self._get_existing_service_hosts(service_type, group_id) # If every requested host already runs the service, there is # nothing to do. if existing.issuperset(hosts): skipped_str = ', '.join(hosts) - logger.info(f"Skipping {service_name}: already active on {skipped_str}") - return f"{service_name}: already active on {skipped_str}" - - logger.info(f"Enabling {service_name} (requested hosts: {', '.join(hosts)})") - self.microceph.services.enable_service( - name=service_name, - payload=payload, - wait=True, - ) + logger.info(f"Skipping {service_type}: already active on {skipped_str}") + return f"{service_type}: already active on {skipped_str}" - # The enable call is served by the local node and cannot be - # confirmed per host (see note above), so report the request - # honestly rather than claiming every placement host gained the - # service. - summary = f"enabled (requested placement: {', '.join(hosts)})" + to_enable = [h for h in hosts if h not in existing] already_active = sorted(existing & set(hosts)) + + enabled: List[str] = [] + failures: List[str] = [] + for host in to_enable: + logger.info(f"Enabling {service_type} on {host}") + try: + self.microceph.services.enable_service( + name=service_type, + payload=payload, + wait=True, + target=host, + ) + enabled.append(host) + except Exception as e: + logger.error(f"Failed to enable {service_type} on {host}: {e}") + failures.append(f"{host}: {e}") + + # Any failure is surfaced as an exception so the orchestrator + # framework reports the operation as failed. Partial-success + # context is included in the message to aid debugging. + if failures: + ctx = [] + if enabled: + ctx.append(f"enabled on {', '.join(enabled)}") + if already_active: + ctx.append(f"already active on {', '.join(already_active)}") + ctx_str = f" ({'; '.join(ctx)})" if ctx else "" + raise RuntimeError( + f"Failed to enable {service_type}: " + + "; ".join(failures) + + ctx_str + ) + + parts = [] + if enabled: + parts.append(f"enabled on {', '.join(enabled)}") if already_active: - summary += f"; already active on {', '.join(already_active)}" + parts.append(f"already active on {', '.join(already_active)}") - return f"{service_name}: {summary}" + summary = "; ".join(parts) if parts else "no-op" + return f"{service_type}: {summary}" @handle_orch_error def apply_rbd_mirror(self, spec: ServiceSpec) -> OrchResult[str]: @@ -431,8 +491,10 @@ def apply_rbd_mirror(self, spec: ServiceSpec) -> OrchResult[str]: rbd-mirror is a client-like service with no additional parameters. The Go API's ClientServicePlacement handler takes no payload. + Dotted names are rejected (see `_require_bare_service_name`). """ logger.debug("Applying rbd-mirror service") + _require_bare_service_name("rbd-mirror", spec.service_id or "") return self._apply_service("rbd-mirror", spec, "{}") @handle_orch_error @@ -451,19 +513,12 @@ def apply_rgw(self, spec: RGWSpec) -> OrchResult[str]: Note on service_id (realms/zones): MicroCeph deploys a single bare "rgw" service per node and does not support multiple RGW instances - with distinct service_ids on the same cluster. If a service_id is - provided in the spec, it is ignored and a warning is logged. + with distinct service_ids on the same cluster. Supplying a + service_id raises ValueError so the operator is not surprised + by a silently-misscoped subsequent remove_service call. """ logger.debug("Applying RGW service") - - # MicroCeph stores a single bare "rgw" service; per-realm service_id - # is not supported by the backend, so we drop it with a warning. - if spec.service_id: - logger.warning( - f"RGW service_id={spec.service_id!r} ignored: MicroCeph deploys " - "a single 'rgw' service; multiple RGW realms with distinct " - "service_ids are not supported." - ) + _require_bare_service_name("rgw", spec.service_id or "") # Build RGW-specific payload from the spec. # Go's RgwServicePlacement expects: Port, SSLPort, SSLCertificate, SSLPrivateKey @@ -518,14 +573,22 @@ def apply_nfs(self, spec: NFSServiceSpec) -> OrchResult[str]: @handle_orch_error def apply_mon(self, spec: ServiceSpec) -> OrchResult[str]: - """Enable the MON service on the target hosts.""" + """Enable the MON service on the target hosts. + + Dotted names are rejected (see `_require_bare_service_name`). + """ logger.debug("Applying MON service") + _require_bare_service_name("mon", spec.service_id or "") return self._apply_service("mon", spec, "{}") @handle_orch_error def apply_mgr(self, spec: ServiceSpec) -> OrchResult[str]: - """Enable the MGR service on the target hosts.""" + """Enable the MGR service on the target hosts. + + Dotted names are rejected (see `_require_bare_service_name`). + """ logger.debug("Applying MGR service") + _require_bare_service_name("mgr", spec.service_id or "") return self._apply_service("mgr", spec, "{}") @handle_orch_error @@ -534,16 +597,10 @@ def apply_mds(self, spec: ServiceSpec) -> OrchResult[str]: Note on service_id (filesystem name): MicroCeph deploys a single bare "mds" service and does not support per-filesystem MDS - placement. If a service_id is provided in the spec, it is - ignored and a warning is logged. + placement. Supplying a service_id raises ValueError. """ logger.debug("Applying MDS service") - if spec.service_id: - logger.warning( - f"MDS service_id={spec.service_id!r} ignored: MicroCeph deploys " - "a single 'mds' service; per-filesystem MDS placement is not " - "supported." - ) + _require_bare_service_name("mds", spec.service_id or "") return self._apply_service("mds", spec, "{}") @handle_orch_error @@ -551,41 +608,80 @@ def apply_cephfs_mirror(self, spec: ServiceSpec) -> OrchResult[str]: """Enable the cephfs-mirror service on the target hosts. cephfs-mirror is a client-like service (same as rbd-mirror) - with no additional parameters. + with no additional parameters. Dotted names are rejected + (see `_require_bare_service_name`). """ logger.debug("Applying cephfs-mirror service") + _require_bare_service_name("cephfs-mirror", spec.service_id or "") return self._apply_service("cephfs-mirror", spec, "{}") @handle_orch_error def remove_service(self, service_name: str, force: bool = False) -> OrchResult[str]: - """Remove a service from the local node. + """Remove a service cluster-wide. - Sends a DELETE request to the local MicroCeph API. This removes - the service from the node connected via the unix socket only; - it does not remove the service cluster-wide. Per-host targeting - requires UseTarget support (see todo #6). + Discovers all hosts currently running the requested service via + list_services() and issues a DELETE per host using the server-side + proxyTarget middleware (see _apply_service for transport notes). + Errors from individual hosts are collected so a partial failure + does not mask successful removals on other nodes. - :param service_name: service type or type.id (e.g. "rgw", "nfs.mycluster") + :param service_name: service type or type.id (e.g. "rgw", + "nfs.mycluster") :param force: unused, kept for interface compatibility """ logger.info(f"Removing service: {service_name}, force={force}") svc_type, svc_id = self._parse_service_name(service_name) - # NFS requires the cluster_id in the delete body - if svc_type == 'nfs': - if not svc_id: - raise ValueError("NFS removal requires service name in 'nfs.' format") - self.microceph.services.delete_nfs_service(svc_id) - else: - if svc_id: - logger.debug( - f"Ignoring id '{svc_id}' for {svc_type}: MicroCeph manages " - f"a single {svc_type} service per node" - ) - self.microceph.services.delete_service(svc_type) + if svc_type == 'nfs' and not svc_id: + raise ValueError( + "NFS removal requires service name in 'nfs.' format" + ) + # For non-NFS services, a dotted name is rejected up front: + # MicroCeph deploys a single bare instance per node and silently + # dropping the id would let an operator believe they were + # removing a specific realm/filesystem while actually wiping the + # bare service from every host. + _require_bare_service_name(svc_type, svc_id) + + # Discover hosts currently running this service. For non-nfs + # services group_id is unused; for nfs it filters to the specific + # cluster_id. + group_id = svc_id if svc_type == 'nfs' else None + hosts = sorted(self._get_existing_service_hosts(svc_type, group_id)) - return f"Removed service {service_name}" + if not hosts: + logger.info(f"No hosts found running {service_name}; nothing to do") + return f"{service_name}: no hosts running this service" + + removed: List[str] = [] + failures: List[str] = [] + for host in hosts: + try: + if svc_type == 'nfs': + self.microceph.services.delete_nfs_service( + svc_id, target=host, + ) + else: + self.microceph.services.delete_service( + svc_type, target=host, + ) + removed.append(host) + except Exception as e: + logger.error(f"Failed to remove {service_name} from {host}: {e}") + failures.append(f"{host}: {e}") + + # Any failure raises so the operator gets a visible error; + # partial-success context is included in the message. + if failures: + ctx = f" (removed from {', '.join(removed)})" if removed else "" + raise RuntimeError( + f"Failed to remove {service_name}: " + + "; ".join(failures) + + ctx + ) + + return f"{service_name}: removed from {', '.join(removed)}" @handle_orch_error def remove_host(self, host: str, force: bool = False, @@ -605,7 +701,9 @@ def remove_host(self, host: str, force: bool = False, def service_action(self, action: str, service_name: str) -> OrchResult[List[str]]: """Perform an action (restart) on a service. - Currently only 'restart' is supported via the MicroCeph API. + Currently only 'restart' is supported via the MicroCeph API. The + restart is fanned out per host running the service, using the + server-side proxyTarget middleware (see _apply_service). :param action: one of "start", "stop", "restart", "redeploy", "reconfig" :param service_name: service type (e.g. "mon", "rgw") @@ -618,7 +716,12 @@ def service_action(self, action: str, service_name: str) -> OrchResult[List[str] "Only 'restart' is currently available." ) - svc_type, _ = self._parse_service_name(service_name) + svc_type, svc_id = self._parse_service_name(service_name) + + # All currently-supported restart services are bare; reject + # dotted names so an operator does not silently restart all + # NFS clusters (etc.) when intending to target one. + _require_bare_service_name(svc_type, svc_id) if svc_type not in RESTART_SUPPORTED_SERVICES: raise NotImplementedError( @@ -627,5 +730,36 @@ def service_action(self, action: str, service_name: str) -> OrchResult[List[str] f"{sorted(RESTART_SUPPORTED_SERVICES)}." ) - self.microceph.services.restart_services([svc_type]) - return [f"Restarted {service_name}"] + # group_id is threaded through for future-proofing: if NFS + # were ever added to RESTART_SUPPORTED_SERVICES, _require_bare + # above would already have rejected the dotted-name path, so + # svc_id is always empty here; keeping the call shape future- + # compatible avoids a silent fan-out across all clusters. + group_id = svc_id if svc_type in DOTTED_NAME_SUPPORTED_SERVICES else None + hosts = sorted(self._get_existing_service_hosts(svc_type, group_id)) + if not hosts: + return [f"No hosts running {svc_type}; nothing to restart"] + + restarted: List[str] = [] + failures: List[str] = [] + for host in hosts: + try: + self.microceph.services.restart_services( + [svc_type], target=host, + ) + restarted.append(host) + except Exception as e: + logger.error(f"Failed to restart {svc_type} on {host}: {e}") + failures.append(f"{host}: {e}") + + if failures: + ctx = ( + f" (restarted on {', '.join(restarted)})" if restarted else "" + ) + raise RuntimeError( + f"Failed to restart {svc_type}: " + + "; ".join(failures) + + ctx + ) + + return [f"Restarted {svc_type} on {h}" for h in restarted] diff --git a/microceph-orch/tests/test_module.py b/microceph-orch/tests/test_module.py index 87dac2ab8..cbd3631a3 100644 --- a/microceph-orch/tests/test_module.py +++ b/microceph-orch/tests/test_module.py @@ -345,12 +345,15 @@ def test_apply_rbd_mirror_success(self, orchestrator, mock_client): ) result = orchestrator.apply_rbd_mirror(spec) assert result.exception is None - # Summary confirms enablement and echoes the requested placement - # without claiming each host was individually provisioned. - assert "enabled" in result.result - assert "requested placement: node1, node2" in result.result - # Single API call regardless of host count (local socket only) - mock_client.services.enable_service.assert_called_once() + # Per-host fan-out via the proxyTarget middleware: one call per + # requested host with target=. + assert "enabled on node1, node2" in result.result + assert mock_client.services.enable_service.call_count == 2 + targets = sorted( + c.kwargs["target"] + for c in mock_client.services.enable_service.call_args_list + ) + assert targets == ["node1", "node2"] def test_apply_rbd_mirror_no_placement(self, orchestrator, mock_client): spec = ServiceSpec(service_type="rbd-mirror") @@ -358,6 +361,17 @@ def test_apply_rbd_mirror_no_placement(self, orchestrator, mock_client): assert result.exception is not None assert "No placement hosts" in str(result.exception) + def test_apply_rbd_mirror_service_id_rejected(self, orchestrator, mock_client): + # Bare-name guard: rbd-mirror does not support dotted names. + spec = ServiceSpec( + service_type="rbd-mirror", + service_id="zone1", + placement=PlacementSpec(hosts=["node1"]), + ) + result = orchestrator.apply_rbd_mirror(spec) + assert isinstance(result.exception, ValueError) + mock_client.services.enable_service.assert_not_called() + def test_apply_rbd_mirror_skips_existing(self, orchestrator, mock_client): # node1 already runs the service; node2 is also requested. mock_client.services.list_services.return_value = [ @@ -370,8 +384,48 @@ def test_apply_rbd_mirror_skips_existing(self, orchestrator, mock_client): result = orchestrator.apply_rbd_mirror(spec) assert result.exception is None assert "already active on node1" in result.result - assert "enabled" in result.result + assert "enabled on node2" in result.result + # Only the not-yet-active host is targeted. mock_client.services.enable_service.assert_called_once() + assert ( + mock_client.services.enable_service.call_args.kwargs["target"] + == "node2" + ) + + def test_apply_list_services_failure_propagates(self, orchestrator, mock_client): + # If list_services itself fails we must NOT silently treat the + # set of existing hosts as empty and re-apply everywhere; the + # error has to surface. + mock_client.services.list_services.side_effect = RemoteException( + "list failed" + ) + spec = ServiceSpec( + service_type="rbd-mirror", + placement=PlacementSpec(hosts=["node1"]), + ) + result = orchestrator.apply_rbd_mirror(spec) + assert isinstance(result.exception, RemoteException) + mock_client.services.enable_service.assert_not_called() + + def test_apply_rbd_mirror_partial_failure(self, orchestrator, mock_client): + # node1 enables, node2 fails — any failure raises with + # partial-success context in the message. + spec = ServiceSpec( + service_type="rbd-mirror", + placement=PlacementSpec(hosts=["node1", "node2"]), + ) + + def side_effect(*args, **kwargs): + if kwargs.get("target") == "node2": + raise RemoteException("node2 boom") + + mock_client.services.enable_service.side_effect = side_effect + result = orchestrator.apply_rbd_mirror(spec) + assert isinstance(result.exception, RuntimeError) + msg = str(result.exception) + assert "Failed to enable rbd-mirror" in msg + assert "node2: node2 boom" in msg + assert "enabled on node1" in msg def test_apply_rbd_mirror_all_existing(self, orchestrator, mock_client): mock_client.services.list_services.return_value = [ @@ -408,11 +462,11 @@ def test_apply_rgw_basic(self, orchestrator, mock_client): ) result = orchestrator.apply_rgw(spec) assert result.exception is None - assert "enabled" in result.result - assert "node1" in result.result + assert "enabled on node1" in result.result call_kwargs = mock_client.services.enable_service.call_args assert call_kwargs.kwargs["name"] == "rgw" + assert call_kwargs.kwargs["target"] == "node1" def test_apply_rgw_with_port(self, orchestrator, mock_client): spec = RGWSpec( @@ -426,6 +480,7 @@ def test_apply_rgw_with_port(self, orchestrator, mock_client): call_kwargs = mock_client.services.enable_service.call_args payload = json.loads(call_kwargs.kwargs["payload"]) assert payload["Port"] == 8080 + assert call_kwargs.kwargs["target"] == "node1" def test_apply_rgw_ssl_cert_without_key_warns(self, orchestrator, mock_client, caplog): """SSL cert is present but no key; should warn and not send cert.""" @@ -451,27 +506,19 @@ def test_apply_rgw_no_placement(self, orchestrator, mock_client): result = orchestrator.apply_rgw(spec) assert result.exception is not None - def test_apply_rgw_service_id_warns(self, orchestrator, mock_client, caplog): + def test_apply_rgw_service_id_rejected(self, orchestrator, mock_client): # MicroCeph deploys a single bare 'rgw' service; per-realm - # service_id is silently dropped — log a warning instead of - # surprising the caller. + # service_id is not supported and is rejected up front so the + # operator does not believe a realm was provisioned. spec = RGWSpec( service_type="rgw", service_id="realm1", placement=PlacementSpec(hosts=["node1"]), ) - import logging - with caplog.at_level(logging.WARNING): - result = orchestrator.apply_rgw(spec) - - assert result.exception is None - # service_id is not forwarded to the backend. - call_kwargs = mock_client.services.enable_service.call_args - assert call_kwargs.kwargs["name"] == "rgw" - assert any( - "service_id" in r.message and "realm1" in r.message - for r in caplog.records - ) + result = orchestrator.apply_rgw(spec) + assert isinstance(result.exception, ValueError) + assert "does not support a service_id" in str(result.exception) + mock_client.services.enable_service.assert_not_called() # =================================================================== @@ -487,12 +534,12 @@ def test_apply_nfs_basic(self, orchestrator, mock_client): ) result = orchestrator.apply_nfs(spec) assert result.exception is None - assert "enabled" in result.result - assert "node1" in result.result + assert "enabled on node1" in result.result call_kwargs = mock_client.services.enable_service.call_args payload = json.loads(call_kwargs.kwargs["payload"]) assert payload["cluster_id"] == "mycluster" + assert call_kwargs.kwargs["target"] == "node1" def test_apply_nfs_with_port(self, orchestrator, mock_client): spec = NFSServiceSpec( @@ -547,8 +594,13 @@ def test_apply_nfs_skips_existing(self, orchestrator, mock_client): ) result = orchestrator.apply_nfs(spec) assert "already active on node1" in result.result - assert "enabled" in result.result + assert "enabled on node2" in result.result mock_client.services.enable_service.assert_called_once() + # Only the host that did not yet run the cluster is targeted. + assert ( + mock_client.services.enable_service.call_args.kwargs["target"] + == "node2" + ) def test_apply_nfs_distinct_cluster_not_conflated(self, orchestrator, mock_client): """A host running nfs.other must not suppress enabling nfs.mycluster.""" @@ -584,12 +636,21 @@ def test_apply_cephfs_mirror_success(self, orchestrator, mock_client): ) result = orchestrator.apply_cephfs_mirror(spec) assert result.exception is None - assert "enabled" in result.result - assert "node1" in result.result + assert "enabled on node1" in result.result mock_client.services.enable_service.assert_called_once_with( - name="cephfs-mirror", payload="{}", wait=True, + name="cephfs-mirror", payload="{}", wait=True, target="node1", ) + def test_apply_cephfs_mirror_service_id_rejected(self, orchestrator, mock_client): + spec = ServiceSpec( + service_type="cephfs-mirror", + service_id="x", + placement=PlacementSpec(hosts=["node1"]), + ) + result = orchestrator.apply_cephfs_mirror(spec) + assert isinstance(result.exception, ValueError) + mock_client.services.enable_service.assert_not_called() + # =================================================================== # _parse_service_name() @@ -630,11 +691,15 @@ def test_filter_by_service_name(self, orchestrator, mock_client): # =================================================================== class TestRemoveServiceDotted: - def test_remove_dotted_non_nfs_service(self, orchestrator, mock_client): - """e.g. mds.myfs should call delete_service('mds')""" + def test_remove_dotted_non_nfs_rejected(self, orchestrator, mock_client): + # Removing 'mds.myfs' (or any dotted non-NFS name) must be + # rejected: silently dropping the id would otherwise wipe the + # bare MDS service across every host while the operator + # believed only a specific filesystem was being removed. result = orchestrator.remove_service("mds.myfs") - assert result.exception is None - mock_client.services.delete_service.assert_called_once_with("mds") + assert isinstance(result.exception, ValueError) + assert "does not support a service_id" in str(result.exception) + mock_client.services.delete_service.assert_not_called() # =================================================================== @@ -649,10 +714,9 @@ def test_apply_mon(self, orchestrator, mock_client): ) result = orchestrator.apply_mon(spec) assert result.exception is None - assert "enabled" in result.result - assert "node1" in result.result + assert "enabled on node1" in result.result mock_client.services.enable_service.assert_called_once_with( - name="mon", payload="{}", wait=True, + name="mon", payload="{}", wait=True, target="node1", ) def test_apply_mgr(self, orchestrator, mock_client): @@ -663,8 +727,28 @@ def test_apply_mgr(self, orchestrator, mock_client): result = orchestrator.apply_mgr(spec) assert result.exception is None mock_client.services.enable_service.assert_called_once_with( - name="mgr", payload="{}", wait=True, + name="mgr", payload="{}", wait=True, target="node1", + ) + + def test_apply_mon_service_id_rejected(self, orchestrator, mock_client): + spec = ServiceSpec( + service_type="mon", + service_id="x", + placement=PlacementSpec(hosts=["node1"]), + ) + result = orchestrator.apply_mon(spec) + assert isinstance(result.exception, ValueError) + mock_client.services.enable_service.assert_not_called() + + def test_apply_mgr_service_id_rejected(self, orchestrator, mock_client): + spec = ServiceSpec( + service_type="mgr", + service_id="x", + placement=PlacementSpec(hosts=["node1"]), ) + result = orchestrator.apply_mgr(spec) + assert isinstance(result.exception, ValueError) + mock_client.services.enable_service.assert_not_called() def test_apply_mds(self, orchestrator, mock_client): spec = ServiceSpec( @@ -674,29 +758,21 @@ def test_apply_mds(self, orchestrator, mock_client): result = orchestrator.apply_mds(spec) assert result.exception is None mock_client.services.enable_service.assert_called_once_with( - name="mds", payload="{}", wait=True, + name="mds", payload="{}", wait=True, target="node1", ) - def test_apply_mds_service_id_warns(self, orchestrator, mock_client, caplog): + def test_apply_mds_service_id_rejected(self, orchestrator, mock_client): # Per-filesystem MDS placement is not supported by MicroCeph; - # service_id (filesystem name) must be dropped with a warning. + # a service_id (filesystem name) is rejected up front. spec = ServiceSpec( service_type="mds", service_id="fs1", placement=PlacementSpec(hosts=["node1"]), ) - import logging - with caplog.at_level(logging.WARNING): - result = orchestrator.apply_mds(spec) - - assert result.exception is None - mock_client.services.enable_service.assert_called_once_with( - name="mds", payload="{}", wait=True, - ) - assert any( - "service_id" in r.message and "fs1" in r.message - for r in caplog.records - ) + result = orchestrator.apply_mds(spec) + assert isinstance(result.exception, ValueError) + assert "does not support a service_id" in str(result.exception) + mock_client.services.enable_service.assert_not_called() # =================================================================== @@ -705,26 +781,80 @@ def test_apply_mds_service_id_warns(self, orchestrator, mock_client, caplog): class TestRemoveService: def test_remove_service_basic(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + _svc("rgw", "node1"), + _svc("rgw", "node2"), + ] + result = orchestrator.remove_service("rgw") + assert result.exception is None + assert "removed from node1, node2" in result.result + assert mock_client.services.delete_service.call_count == 2 + + def test_remove_service_no_hosts(self, orchestrator, mock_client): + # list_services returns [] (default) — nothing to remove. result = orchestrator.remove_service("rgw") assert result.exception is None - assert "Removed service rgw" in result.result - mock_client.services.delete_service.assert_called_once_with("rgw") + assert "no hosts" in result.result + mock_client.services.delete_service.assert_not_called() def test_remove_service_nfs_with_cluster_id(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + _svc("nfs", "node1", group_id="mycluster"), + ] result = orchestrator.remove_service("nfs.mycluster") assert result.exception is None - assert "Removed service nfs.mycluster" in result.result - mock_client.services.delete_nfs_service.assert_called_once_with("mycluster") + assert "removed from node1" in result.result + mock_client.services.delete_nfs_service.assert_called_once() + c = mock_client.services.delete_nfs_service.call_args + assert c.args == ("mycluster",) + assert c.kwargs["target"] == "node1" def test_remove_service_nfs_without_cluster_id(self, orchestrator, mock_client): result = orchestrator.remove_service("nfs") assert result.exception is not None assert "cluster_id" in str(result.exception) + def test_remove_service_nfs_group_id_no_match(self, orchestrator, mock_client): + # NFS cluster 'other' exists on node1, but we're removing + # 'mycluster' — there is nothing to do; the call must NOT + # cascade into deleting the unrelated cluster. + mock_client.services.list_services.return_value = [ + _svc("nfs", "node1", group_id="other"), + ] + result = orchestrator.remove_service("nfs.mycluster") + assert result.exception is None + assert "no hosts" in result.result + mock_client.services.delete_nfs_service.assert_not_called() + def test_remove_service_api_error(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + _svc("rgw", "node1"), + ] mock_client.services.delete_service.side_effect = RemoteException("fail") result = orchestrator.remove_service("rgw") assert result.exception is not None + assert "Failed to remove" in str(result.exception) + + def test_remove_service_partial_failure(self, orchestrator, mock_client): + # node1 succeeds, node2 fails — any failure raises so the + # operator sees a visible error, and the partial-success + # context is included in the message. + mock_client.services.list_services.return_value = [ + _svc("rgw", "node1"), + _svc("rgw", "node2"), + ] + + def side_effect(*args, **kwargs): + if kwargs.get("target") == "node2": + raise RemoteException("node2 boom") + + mock_client.services.delete_service.side_effect = side_effect + result = orchestrator.remove_service("rgw") + assert isinstance(result.exception, RuntimeError) + msg = str(result.exception) + assert "Failed to remove rgw" in msg + assert "node2: node2 boom" in msg + assert "removed from node1" in msg # =================================================================== @@ -750,33 +880,78 @@ def test_remove_host_api_error(self, orchestrator, mock_client): class TestServiceAction: def test_restart_service(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + _svc("mon", "node1"), + _svc("mon", "node2"), + ] result = orchestrator.service_action("restart", "mon") assert result.exception is None - assert "Restarted mon" in result.result[0] - mock_client.services.restart_services.assert_called_once_with(["mon"]) + # One restart call per host carrying target=. + assert mock_client.services.restart_services.call_count == 2 + targets = sorted( + c.kwargs["target"] + for c in mock_client.services.restart_services.call_args_list + ) + assert targets == ["node1", "node2"] + assert any("Restarted mon on node1" in r for r in result.result) + assert any("Restarted mon on node2" in r for r in result.result) - def test_restart_service_with_id(self, orchestrator, mock_client): - # Service identifiers are stripped before dispatching to the - # MicroCeph API; rgw is a supported restart target. - result = orchestrator.service_action("restart", "rgw.realm1") + def test_restart_no_hosts(self, orchestrator, mock_client): + # No hosts running the service — no-op (still success). + result = orchestrator.service_action("restart", "mon") assert result.exception is None - mock_client.services.restart_services.assert_called_once_with(["rgw"]) + assert any("nothing to restart" in r for r in result.result) + mock_client.services.restart_services.assert_not_called() + + def test_restart_dotted_non_nfs_rejected(self, orchestrator, mock_client): + # Dotted non-NFS names are rejected: restarting 'rgw.realm1' + # would have to silently drop the id and restart every rgw, + # which is a surprising operation. + result = orchestrator.service_action("restart", "rgw.realm1") + assert isinstance(result.exception, ValueError) + assert "does not support a service_id" in str(result.exception) + mock_client.services.restart_services.assert_not_called() def test_restart_unsupported_service_type(self, orchestrator, mock_client): # MicroCeph's backend serviceWorkerTable only handles osd/mon/rgw. # The orchestrator must reject restart for other service types up # front rather than surfacing an opaque RemoteException. for svc in ["nfs.mycluster", "mds", "mgr", "rbd-mirror"]: + mock_client.services.restart_services.reset_mock() result = orchestrator.service_action("restart", svc) assert isinstance(result.exception, NotImplementedError), svc mock_client.services.restart_services.assert_not_called() + def test_restart_partial_failure(self, orchestrator, mock_client): + # node1 restarts, node2 fails — any failure raises with + # partial-success context in the message. + mock_client.services.list_services.return_value = [ + _svc("mon", "node1"), + _svc("mon", "node2"), + ] + + def side_effect(*args, **kwargs): + if kwargs.get("target") == "node2": + raise RemoteException("node2 boom") + + mock_client.services.restart_services.side_effect = side_effect + result = orchestrator.service_action("restart", "mon") + assert isinstance(result.exception, RuntimeError) + msg = str(result.exception) + assert "Failed to restart mon" in msg + assert "node2: node2 boom" in msg + assert "restarted on node1" in msg + def test_unsupported_action(self, orchestrator, mock_client): result = orchestrator.service_action("stop", "mon") assert result.exception is not None assert "not supported" in str(result.exception) def test_restart_api_error(self, orchestrator, mock_client): + mock_client.services.list_services.return_value = [ + _svc("mon", "node1"), + ] mock_client.services.restart_services.side_effect = RemoteException("fail") result = orchestrator.service_action("restart", "mon") assert result.exception is not None + assert "Failed to restart" in str(result.exception) diff --git a/tests/scripts/test-orch.sh b/tests/scripts/test-orch.sh index ccf36f817..97a01b235 100755 --- a/tests/scripts/test-orch.sh +++ b/tests/scripts/test-orch.sh @@ -115,7 +115,8 @@ assert_exit_ok "device ls succeeds" run_ceph orch device ls echo "=== 6. Apply RGW ===" # =================================================================== -# Clean up RGW if it was already enabled by prior test steps +# Clean up RGW if it was already enabled by prior test steps. +# `orch rm rgw` is now cluster-wide (fans out to every host running rgw). run_ceph orch rm rgw >/dev/null 2>&1 || true sleep 3 @@ -131,6 +132,34 @@ assert_contains "rgw in service list" "rgw" "$output" output=$(run_ceph orch ps --daemon-type rgw) assert_contains "rgw daemon visible" "rgw" "$output" +# Per-host targeting check: the RGW daemon must end up on the +# requested host (first_host), not on whichever node served the unix +# socket. This is what the ?target= proxyTarget mechanism is for. +output=$(run_ceph orch ps --hostname "$placement" --daemon-type rgw) +assert_contains "rgw daemon on requested host $placement" "rgw" "$output" + +# =================================================================== +echo "=== 6b. Apply RGW on a second host (per-host targeting) ===" +# =================================================================== + +# Find a host that is NOT the first one to verify cross-node enablement +# from the local socket. Only run this leg when the cluster has more +# than one host (single-node deployments skip). +second_host=$(run_ceph orch host ls | awk 'NR>1 && NF>0 && $1 != "'"$first_host"'" {print $1; exit}') +if [ -n "$second_host" ] && [ "$second_host" != "$first_host" ]; then + output=$(run_ceph orch apply rgw default --placement="$first_host,$second_host") + assert_contains "rgw applied to two hosts" "enabled|already active" "$output" + sleep 5 + + # Verify both hosts now run rgw. The local node (running the orch + # client) is the unix-socket endpoint; the second_host enablement + # exercises the unix-socket -> proxyTarget -> remote mTLS path. + output=$(run_ceph orch ps --hostname "$second_host" --daemon-type rgw) + assert_contains "rgw daemon on $second_host (cross-node target)" "rgw" "$output" +else + echo " SKIP per-host targeting RGW test (only one host in cluster)" +fi + # =================================================================== echo "=== 7. Apply NFS ===" # =================================================================== @@ -161,23 +190,38 @@ echo "=== 8. Restart service ===" # =================================================================== output=$(run_ceph orch restart mon) -assert_contains "mon restarted" "Restarted" "$output" +assert_contains "mon restarted" "Restarted mon on" "$output" + +# Per-host fan-out: in a multi-host cluster the restart output should +# enumerate each host running mon, not a single "Restarted mon". +if [ -n "${second_host:-}" ] && [ "$second_host" != "$first_host" ]; then + assert_contains "mon restart enumerates first host" "$first_host" "$output" +fi + +# Dotted-name guard: orch restart rgw.realm1 must fail with a clear +# "does not support a service_id" error rather than silently restart +# every rgw daemon. +output=$(run_ceph orch restart rgw.realm1 2>&1 || true) +assert_contains "dotted restart rejected" "does not support a service_id" "$output" # =================================================================== echo "=== 9. Remove RGW ===" # =================================================================== output=$(run_ceph orch rm rgw) -assert_contains "rgw removed" "Removed" "$output" +assert_contains "rgw removed" "removed from" "$output" sleep 3 -# Verify RGW is gone from the local node. In multi-node clusters, -# RGW may still appear on other nodes if it was enabled independently -# (orch rm only affects the local node; per-host targeting not yet -# supported, see UseTarget limitation). +# remove_service fans out across every host running the service, so +# RGW must be gone from BOTH first_host and second_host (when present). local_host="${first_host:-$(hostname)}" output=$(run_ceph orch ps --hostname "$local_host" --daemon-type rgw) -assert_not_contains "rgw gone from local node" "rgw" "$output" +assert_not_contains "rgw gone from first host" "rgw" "$output" + +if [ -n "${second_host:-}" ] && [ "$second_host" != "$first_host" ]; then + output=$(run_ceph orch ps --hostname "$second_host" --daemon-type rgw) + assert_not_contains "rgw gone from second host (cluster-wide remove)" "rgw" "$output" +fi # =================================================================== echo "=== 10. Remove NFS ===" From c9fc7c65a850e8fdf140fe45a6613ec113a06495 Mon Sep 17 00:00:00 2001 From: Utkarsh Bhatt Date: Thu, 28 May 2026 15:35:25 +0530 Subject: [PATCH 11/11] orch: tighten error semantics and tolerate TOCTOU race on apply Pre-merge polish after multi-agent review of the per-host targeting work. No new feature surface; corrects how the orchestrator reports failures and unsupported operations to the Ceph framework, and hardens the apply path against a backend race. Exception hierarchy ------------------- Swap bare exception types for Ceph orchestrator types so the framework renders them as proper operation errors rather than Python tracebacks: * `RuntimeError` (per-host fan-out failure) -> `OrchestratorError` * `NotImplementedError` (unsupported action / unsupported service type / unsupported InventoryFilter labels) -> `OrchestratorValidationError` `ValueError` is kept for input-shape errors (dotted-name guard, NFS missing cluster_id) since those fire before any I/O. Cephadm parity on "nothing to do" --------------------------------- `remove_service` and `service_action("restart", ...)` now raise `OrchestratorError("Service X is not running on any host")` when the requested service has no live hosts, instead of returning a green success summary. This matches cephadm: an operator asking to remove or restart a service that does not exist should see an error so a typo or stale assumption is caught, not a silent no-op. TOCTOU tolerance on apply ------------------------- `_apply_service` snapshots existing hosts once and then enables per host. If the service races into existence between snapshot and enable, the backend's `genericHospitalityCheck` (`microceph/ceph/services_placement_generic.go`) returns " service already active on host". This is the desired end state, not a failure, so we now match that substring and treat the host as enabled instead of appending to failures. `describe_service` size ----------------------- `ServiceDescription` now sets `size=len(hostlist)` alongside `running`. MicroCeph has no separate spec store -- desired state is the live running set -- so `ceph orch ls` was rendering RUNNING/- (looks degraded). With `size` populated it renders RUNNING/SIZE cleanly. Integration test ---------------- * `tests/scripts/test-orch.sh` `second_host` awk now filters first column to a hostname shape (`/^[A-Za-z]/`) so the trailing "N hosts in cluster" footer row is no longer picked up as a hostname on single-node CI. * Section 10 (Remove NFS) now accepts either the new "removed from " success shape or the new "not running on any host" error shape (NFS may legitimately have failed to start on CI runners missing kernel modules). Tests ----- * 94 unit tests pass. * Exception-class assertions updated: - `isinstance(..., RuntimeError)` -> `OrchestratorError` - `isinstance(..., NotImplementedError)` -> `OrchestratorValidationError` * `test_remove_service_no_hosts`, `test_remove_service_nfs_group_id_no_match`, `test_restart_no_hosts` now assert the new "not running on any host" error shape instead of a string-summary success. * `tests/stubs.py` and `tests/conftest.py` extended with `OrchestratorError` and `OrchestratorValidationError` stubs and bound into the injected `orchestrator` module so production-code imports resolve to the right types under test. * `ServiceDescription` stub gains a `size` keyword. Refs canonical/microceph#743 Signed-off-by: Utkarsh Bhatt Assisted-by: Claude Opus 4.7 --- microceph-orch/src/microceph/module.py | 51 +++++++++++++++++++++----- microceph-orch/tests/conftest.py | 4 ++ microceph-orch/tests/stubs.py | 11 +++++- microceph-orch/tests/test_module.py | 37 +++++++++++-------- tests/scripts/test-orch.sh | 18 ++++++--- 5 files changed, 89 insertions(+), 32 deletions(-) diff --git a/microceph-orch/src/microceph/module.py b/microceph-orch/src/microceph/module.py index 57b75ec87..f2bc7babc 100644 --- a/microceph-orch/src/microceph/module.py +++ b/microceph-orch/src/microceph/module.py @@ -31,6 +31,8 @@ CLICommandMeta, handle_orch_error, OrchResult, + OrchestratorError, + OrchestratorValidationError, ) from .client.client import Client @@ -243,9 +245,15 @@ def describe_service(self, service_id=svc_id, service_type=svc_type, placement=PlacementSpec(hosts=hostlist, count=len(hostlist)) ) + # `size` is the desired daemon count; `running` is the actual. + # MicroCeph's desired state IS the current set of hosts running + # the service (no separate spec store), so they match. Setting + # `size` explicitly avoids `ceph orch ls` rendering `RUNNING/-`, + # which looks like a degraded/incomplete deployment. service_descs.append(ServiceDescription( spec=spec, - running=len(hostlist) + running=len(hostlist), + size=len(hostlist), )) return service_descs @@ -326,7 +334,7 @@ def get_inventory(self, filter_hosts = None if host_filter: if host_filter.labels: - raise NotImplementedError( + raise OrchestratorValidationError( "MicroCeph orchestrator does not support host labels" ) if host_filter.hosts: @@ -457,6 +465,20 @@ def _apply_service(self, service_type: str, spec: ServiceSpec, ) enabled.append(host) except Exception as e: + # Tolerate the TOCTOU window between the snapshot taken + # by _get_existing_service_hosts and the per-host enable + # call. The backend's genericHospitalityCheck + # (microceph/ceph/services_placement_generic.go) returns + # " service already active on host" if the service + # raced into existence between snapshot and enable; for + # an apply that is the desired end-state, not a failure. + if "already active on host" in str(e): + logger.info( + f"{service_type} became active on {host} after " + "snapshot; treating as no-op." + ) + enabled.append(host) + continue logger.error(f"Failed to enable {service_type} on {host}: {e}") failures.append(f"{host}: {e}") @@ -470,7 +492,7 @@ def _apply_service(self, service_type: str, spec: ServiceSpec, if already_active: ctx.append(f"already active on {', '.join(already_active)}") ctx_str = f" ({'; '.join(ctx)})" if ctx else "" - raise RuntimeError( + raise OrchestratorError( f"Failed to enable {service_type}: " + "; ".join(failures) + ctx_str @@ -651,8 +673,12 @@ def remove_service(self, service_name: str, force: bool = False) -> OrchResult[s hosts = sorted(self._get_existing_service_hosts(svc_type, group_id)) if not hosts: - logger.info(f"No hosts found running {service_name}; nothing to do") - return f"{service_name}: no hosts running this service" + # Match cephadm: removing a service that is not deployed is + # surfaced as an error so the operator notices a typo or + # stale state rather than seeing a green no-op. + raise OrchestratorError( + f"Service {service_name!r} is not running on any host" + ) removed: List[str] = [] failures: List[str] = [] @@ -675,7 +701,7 @@ def remove_service(self, service_name: str, force: bool = False) -> OrchResult[s # partial-success context is included in the message. if failures: ctx = f" (removed from {', '.join(removed)})" if removed else "" - raise RuntimeError( + raise OrchestratorError( f"Failed to remove {service_name}: " + "; ".join(failures) + ctx @@ -711,7 +737,7 @@ def service_action(self, action: str, service_name: str) -> OrchResult[List[str] logger.info(f"Service action: {action} on {service_name}") if action != "restart": - raise NotImplementedError( + raise OrchestratorValidationError( f"Service action '{action}' is not supported by MicroCeph. " "Only 'restart' is currently available." ) @@ -724,7 +750,7 @@ def service_action(self, action: str, service_name: str) -> OrchResult[List[str] _require_bare_service_name(svc_type, svc_id) if svc_type not in RESTART_SUPPORTED_SERVICES: - raise NotImplementedError( + raise OrchestratorValidationError( f"Restart of service type {svc_type!r} is not supported by " f"MicroCeph. Supported services: " f"{sorted(RESTART_SUPPORTED_SERVICES)}." @@ -738,7 +764,12 @@ def service_action(self, action: str, service_name: str) -> OrchResult[List[str] group_id = svc_id if svc_type in DOTTED_NAME_SUPPORTED_SERVICES else None hosts = sorted(self._get_existing_service_hosts(svc_type, group_id)) if not hosts: - return [f"No hosts running {svc_type}; nothing to restart"] + # Restarting a service that is not deployed is an error: the + # operator either targeted the wrong name or expected the + # service to be running. Match cephadm semantics. + raise OrchestratorError( + f"Service {svc_type!r} is not running on any host" + ) restarted: List[str] = [] failures: List[str] = [] @@ -756,7 +787,7 @@ def service_action(self, action: str, service_name: str) -> OrchResult[List[str] ctx = ( f" (restarted on {', '.join(restarted)})" if restarted else "" ) - raise RuntimeError( + raise OrchestratorError( f"Failed to restart {svc_type}: " + "; ".join(failures) + ctx diff --git a/microceph-orch/tests/conftest.py b/microceph-orch/tests/conftest.py index 8d231248d..94b566fba 100644 --- a/microceph-orch/tests/conftest.py +++ b/microceph-orch/tests/conftest.py @@ -28,6 +28,8 @@ ServiceDescription, DaemonDescription, Orchestrator, + OrchestratorError, + OrchestratorValidationError, MgrModule, NotifyType, ) @@ -83,6 +85,8 @@ def _install_mocks(): orch_mod.CLICommandMeta = CLICommandMeta orch_mod.handle_orch_error = handle_orch_error orch_mod.OrchResult = OrchResult + orch_mod.OrchestratorError = OrchestratorError + orch_mod.OrchestratorValidationError = OrchestratorValidationError # snap-only deps requests_unixsocket_mod = MagicMock() diff --git a/microceph-orch/tests/stubs.py b/microceph-orch/tests/stubs.py index 1a950fd3a..b2c61b023 100644 --- a/microceph-orch/tests/stubs.py +++ b/microceph-orch/tests/stubs.py @@ -111,9 +111,10 @@ def __init__(self, name="", devices=None): class ServiceDescription: - def __init__(self, spec=None, running=0, **kwargs): + def __init__(self, spec=None, running=0, size=0, **kwargs): self.spec = spec self.running = running + self.size = size class DaemonDescription: @@ -127,6 +128,14 @@ def __init__(self, service_name="", daemon_type="", daemon_id="", self.ports = ports +class OrchestratorError(Exception): + pass + + +class OrchestratorValidationError(OrchestratorError): + pass + + class Orchestrator: pass diff --git a/microceph-orch/tests/test_module.py b/microceph-orch/tests/test_module.py index cbd3631a3..84707611e 100644 --- a/microceph-orch/tests/test_module.py +++ b/microceph-orch/tests/test_module.py @@ -13,6 +13,8 @@ RGWSpec, NFSServiceSpec, InventoryFilter, + OrchestratorError, + OrchestratorValidationError, ) from microceph.client.service import RemoteException @@ -313,7 +315,7 @@ def test_get_inventory_with_host_filter(self, orchestrator, mock_client): def test_get_inventory_label_filter_rejected(self, orchestrator, mock_client): filt = InventoryFilter(labels=["role=osd"]) result = orchestrator.get_inventory(host_filter=filt) - assert isinstance(result.exception, NotImplementedError) + assert isinstance(result.exception, OrchestratorValidationError) mock_client.services.list_disks.assert_not_called() def test_get_inventory_empty(self, orchestrator, mock_client): @@ -421,7 +423,7 @@ def side_effect(*args, **kwargs): mock_client.services.enable_service.side_effect = side_effect result = orchestrator.apply_rbd_mirror(spec) - assert isinstance(result.exception, RuntimeError) + assert isinstance(result.exception, OrchestratorError) msg = str(result.exception) assert "Failed to enable rbd-mirror" in msg assert "node2: node2 boom" in msg @@ -791,10 +793,12 @@ def test_remove_service_basic(self, orchestrator, mock_client): assert mock_client.services.delete_service.call_count == 2 def test_remove_service_no_hosts(self, orchestrator, mock_client): - # list_services returns [] (default) — nothing to remove. + # list_services returns [] (default) — removing a service that + # isn't deployed must surface as an error (cephadm parity), not + # a green no-op. result = orchestrator.remove_service("rgw") - assert result.exception is None - assert "no hosts" in result.result + assert isinstance(result.exception, OrchestratorError) + assert "not running on any host" in str(result.exception) mock_client.services.delete_service.assert_not_called() def test_remove_service_nfs_with_cluster_id(self, orchestrator, mock_client): @@ -816,14 +820,15 @@ def test_remove_service_nfs_without_cluster_id(self, orchestrator, mock_client): def test_remove_service_nfs_group_id_no_match(self, orchestrator, mock_client): # NFS cluster 'other' exists on node1, but we're removing - # 'mycluster' — there is nothing to do; the call must NOT - # cascade into deleting the unrelated cluster. + # 'mycluster' — there is nothing to do for this cluster_id; + # surface as an error and do NOT cascade into deleting the + # unrelated cluster. mock_client.services.list_services.return_value = [ _svc("nfs", "node1", group_id="other"), ] result = orchestrator.remove_service("nfs.mycluster") - assert result.exception is None - assert "no hosts" in result.result + assert isinstance(result.exception, OrchestratorError) + assert "not running on any host" in str(result.exception) mock_client.services.delete_nfs_service.assert_not_called() def test_remove_service_api_error(self, orchestrator, mock_client): @@ -850,7 +855,7 @@ def side_effect(*args, **kwargs): mock_client.services.delete_service.side_effect = side_effect result = orchestrator.remove_service("rgw") - assert isinstance(result.exception, RuntimeError) + assert isinstance(result.exception, OrchestratorError) msg = str(result.exception) assert "Failed to remove rgw" in msg assert "node2: node2 boom" in msg @@ -897,10 +902,12 @@ def test_restart_service(self, orchestrator, mock_client): assert any("Restarted mon on node2" in r for r in result.result) def test_restart_no_hosts(self, orchestrator, mock_client): - # No hosts running the service — no-op (still success). + # Restarting a service that isn't deployed surfaces as an + # error so the operator notices the typo or stale assumption + # (cephadm parity). result = orchestrator.service_action("restart", "mon") - assert result.exception is None - assert any("nothing to restart" in r for r in result.result) + assert isinstance(result.exception, OrchestratorError) + assert "not running on any host" in str(result.exception) mock_client.services.restart_services.assert_not_called() def test_restart_dotted_non_nfs_rejected(self, orchestrator, mock_client): @@ -919,7 +926,7 @@ def test_restart_unsupported_service_type(self, orchestrator, mock_client): for svc in ["nfs.mycluster", "mds", "mgr", "rbd-mirror"]: mock_client.services.restart_services.reset_mock() result = orchestrator.service_action("restart", svc) - assert isinstance(result.exception, NotImplementedError), svc + assert isinstance(result.exception, OrchestratorValidationError), svc mock_client.services.restart_services.assert_not_called() def test_restart_partial_failure(self, orchestrator, mock_client): @@ -936,7 +943,7 @@ def side_effect(*args, **kwargs): mock_client.services.restart_services.side_effect = side_effect result = orchestrator.service_action("restart", "mon") - assert isinstance(result.exception, RuntimeError) + assert isinstance(result.exception, OrchestratorError) msg = str(result.exception) assert "Failed to restart mon" in msg assert "node2: node2 boom" in msg diff --git a/tests/scripts/test-orch.sh b/tests/scripts/test-orch.sh index 97a01b235..a533fa7f0 100755 --- a/tests/scripts/test-orch.sh +++ b/tests/scripts/test-orch.sh @@ -145,7 +145,10 @@ echo "=== 6b. Apply RGW on a second host (per-host targeting) ===" # Find a host that is NOT the first one to verify cross-node enablement # from the local socket. Only run this leg when the cluster has more # than one host (single-node deployments skip). -second_host=$(run_ceph orch host ls | awk 'NR>1 && NF>0 && $1 != "'"$first_host"'" {print $1; exit}') +# Filter to rows whose first column looks like a hostname (starts with +# a letter) so the trailing "N hosts in cluster" footer line is not +# mistaken for a host name on single-node CI. +second_host=$(run_ceph orch host ls | awk 'NR>1 && NF>0 && $1 ~ /^[A-Za-z]/ && $1 != "'"$first_host"'" {print $1; exit}') if [ -n "$second_host" ] && [ "$second_host" != "$first_host" ]; then output=$(run_ceph orch apply rgw default --placement="$first_host,$second_host") assert_contains "rgw applied to two hosts" "enabled|already active" "$output" @@ -227,13 +230,16 @@ fi echo "=== 10. Remove NFS ===" # =================================================================== -output=$(run_ceph orch rm nfs.testcluster) -# NFS removal may fail if the service never fully started -if echo "$output" | grep -q "Removed"; then - echo " PASS nfs removed" +output=$(run_ceph orch rm nfs.testcluster 2>&1 || true) +# NFS removal may fail if the service never fully started. In that +# case remove_service now raises OrchestratorError("not running on +# any host"); accept either the success ("removed from") shape or +# that specific error. +if echo "$output" | grep -qE "removed from|not running on any host"; then + echo " PASS nfs removal completed (either removed or never started)" PASS=$((PASS + 1)) else - echo " WARN nfs removal returned: $output (service may not have been running)" + echo " WARN nfs removal returned unexpected output: $output" fi sleep 3