def _get_status_error_messages(cclient, deployment):
    """Collect the distinct error messages reported for an active deployment.

    A deployment whose status is not ``DeploymentStatus.ACTIVE`` yields an empty
    list without any API call. Otherwise the v3 status call is tried first. If it
    raises ``ApiException`` with status 400 or 404, the legacy status call is used
    instead and its single ``error_message`` (when present and non-empty) is
    returned unlabelled. Any other ``ApiException`` propagates to the caller.

    Returns:
        A list of strings. Entries built from the v3 payload have the form
        ``"<label>: <message>"`` and appear in traversal order (each revision,
        then that revision's pods). A given message text is reported only once,
        under the first label it was seen with.
    """
    if deployment.status != DeploymentStatus.ACTIVE:
        return []

    try:
        v3_status = cclient.get_status_v3(deployment.id)
    except ApiException as exc:
        if exc.status in (400, 404):
            # The v3 call is not usable for this deployment; fall back to the legacy call.
            legacy_status = cclient.get_status(deployment.id)
            legacy_error = getattr(legacy_status, "error_message", None)
            if legacy_error:
                return [legacy_error]
            return []
        raise

    def labelled_errors():
        """Yield ``(label, message)`` pairs for every revision followed by its pods."""
        for revision in v3_status.revision_pod_details_list or []:
            if revision.revision_number is None:
                prefix = "revision"
            else:
                prefix = f"revision {revision.revision_number}"
            yield prefix, revision.error_message

            for pod in revision.pod_details_list or []:
                pod_label = pod.name or "pod"
                yield f"{prefix} / {pod_label}", pod.error_message

    reported = set()
    collected = []
    for label, text in labelled_errors():
        # Skip empty messages and texts already reported under an earlier label.
        if text and text not in reported:
            reported.add(text)
            collected.append(f"{label}: {text}")

    return collected
from types import SimpleNamespace

import pytest

from centml.cli.cluster import DeploymentStatus, _get_status_error_messages
from centml.sdk import ApiException


def _active_deployment():
    """Build a minimal stand-in for a deployment in the ACTIVE state."""
    return SimpleNamespace(id=123, status=DeploymentStatus.ACTIVE)


def _client_with_v3_status(revisions):
    """Build a fake client whose v3 status call returns the given revision entries."""
    payload = SimpleNamespace(revision_pod_details_list=revisions)
    return SimpleNamespace(get_status_v3=lambda _id: payload)


def _v3_failing_with(http_status):
    """Return a v3 status callable that always raises ``ApiException`` with *http_status*."""

    def get_status_v3(_id):
        raise ApiException(status=http_status)

    return get_status_v3


def test_status_error_messages_include_revision_and_pod_messages():
    """Revision-level and pod-level errors are both listed; pods without an error are skipped."""
    pods = [
        SimpleNamespace(name="pod-a", error_message="image pull failed"),
        SimpleNamespace(name="pod-b", error_message=None),
    ]
    revision = SimpleNamespace(revision_number=3, error_message="revision failed", pod_details_list=pods)
    cclient = _client_with_v3_status([revision])

    messages = _get_status_error_messages(cclient, _active_deployment())

    expected = ["revision 3: revision failed", "revision 3 / pod-a: image pull failed"]
    assert messages == expected


def test_status_error_messages_do_not_repeat_duplicate_messages():
    """A message repeated on a pod is reported once, under the revision label seen first."""
    duplicate_message = "one or more objects failed to apply"
    pod = SimpleNamespace(name=None, error_message=duplicate_message)
    revision = SimpleNamespace(revision_number=None, error_message=duplicate_message, pod_details_list=[pod])
    cclient = _client_with_v3_status([revision])

    messages = _get_status_error_messages(cclient, _active_deployment())

    assert messages == [f"revision: {duplicate_message}"]


def test_status_error_messages_fall_back_to_legacy_status_message():
    """A 404 from the v3 call makes the helper return the legacy status message instead."""
    legacy_status = SimpleNamespace(error_message="legacy service failure")
    cclient = SimpleNamespace(
        get_status_v3=_v3_failing_with(404),
        get_status=lambda _id: legacy_status,
    )

    messages = _get_status_error_messages(cclient, _active_deployment())

    assert messages == ["legacy service failure"]


def test_status_error_messages_reraises_unexpected_v3_errors():
    """A v3 failure other than 400/404 is propagated rather than swallowed."""
    cclient = SimpleNamespace(get_status_v3=_v3_failing_with(500))

    with pytest.raises(ApiException):
        _get_status_error_messages(cclient, _active_deployment())