Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 50 additions & 15 deletions centml/cli/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,41 @@ def _get_ready_status(cclient, deployment):
return click.style(style[0], fg=style[1], bg=style[2])


def _get_status_error_messages(cclient, deployment):
    """Gather the error messages reported for an active deployment.

    Returns an empty list when ``deployment.status`` is not
    ``DeploymentStatus.ACTIVE``. Otherwise the v3 status is requested; if
    that call raises ``ApiException`` with status 400 or 404, the legacy
    ``get_status`` call is used instead and its ``error_message`` (when
    present and non-empty) is returned unlabelled. Any other
    ``ApiException`` propagates to the caller.

    For a v3 status, each revision and each pod in it contributes at most one
    ``"<label>: <message>"`` entry. Empty messages are skipped, and a message
    text that was already reported is not repeated under a second label.
    """
    if deployment.status != DeploymentStatus.ACTIVE:
        return []

    try:
        v3_status = cclient.get_status_v3(deployment.id)
    except ApiException as exc:
        if exc.status in [400, 404]:
            # The v3 status call was rejected: use the legacy status call instead.
            legacy_status = cclient.get_status(deployment.id)
            legacy_error = getattr(legacy_status, "error_message", None)
            if legacy_error:
                return [legacy_error]
            return []
        raise

    def labelled_errors():
        # Yield (label, message) pairs in display order: a revision first,
        # then each of its pods.
        for revision in v3_status.revision_pod_details_list or []:
            if revision.revision_number is None:
                revision_label = "revision"
            else:
                revision_label = f"revision {revision.revision_number}"
            yield revision_label, revision.error_message

            for pod in revision.pod_details_list or []:
                yield f"{revision_label} / {pod.name or 'pod'}", pod.error_message

    # Keyed by the raw message text so the first label seen for a message
    # wins; dict insertion order preserves the display order.
    formatted_by_text = {}
    for label, text in labelled_errors():
        if text and text not in formatted_by_text:
            formatted_by_text[text] = f"{label}: {text}"

    return list(formatted_by_text.values())


@click.command(help="List all deployments")
@click.argument("type", type=click.Choice(list(depl_name_to_type_map.keys())), required=False, default=None)
def ls(type):
Expand Down Expand Up @@ -150,23 +185,23 @@ def get(type, id):
sys.exit("Please enter correct deployment type")

ready_status = _get_ready_status(cclient, deployment)
status_error_messages = _get_status_error_messages(cclient, deployment)
_, id_to_hw_map = _get_hw_to_id_map(cclient, deployment.cluster_id)
hw = id_to_hw_map[deployment.hardware_instance_id]

click.echo(
tabulate(
[
("Name", deployment.name),
("Status", ready_status),
("Endpoint", deployment.endpoint_url),
("Created at", deployment.created_at.strftime("%Y-%m-%d %H:%M:%S")),
("Hardware", f"{hw.name} ({hw.num_gpu}x {hw.gpu_type})"),
("Cost", f"{hw.cost_per_hr / 100} credits/hr"),
],
tablefmt="rounded_outline",
disable_numparse=True,
)
)
detail_rows = [
("Name", deployment.name),
("Status", ready_status),
("Endpoint", deployment.endpoint_url),
("Created at", deployment.created_at.strftime("%Y-%m-%d %H:%M:%S")),
("Hardware", f"{hw.name} ({hw.num_gpu}x {hw.gpu_type})"),
("Cost", f"{hw.cost_per_hr / 100} credits/hr"),
]

click.echo(tabulate(detail_rows, tablefmt="rounded_outline", disable_numparse=True))
if status_error_messages:
click.echo("\nStatus errors:")
for message in status_error_messages:
click.echo(f"- {message}")

click.echo("Additional deployment configurations:")
if depl_type in [DeploymentType.INFERENCE_V2, DeploymentType.INFERENCE_V3]:
Expand Down
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just removed this file.

Empty file.
69 changes: 69 additions & 0 deletions tests/test_cli_cluster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from types import SimpleNamespace

import pytest

from centml.cli.cluster import DeploymentStatus, _get_status_error_messages
from centml.sdk import ApiException


def test_status_error_messages_include_revision_and_pod_messages():
    """Revision-level and pod-level errors are both reported, each under its label."""
    pods = [
        SimpleNamespace(name="pod-a", error_message="image pull failed"),
        # A pod without an error message must not produce an entry.
        SimpleNamespace(name="pod-b", error_message=None),
    ]
    revision = SimpleNamespace(
        revision_number=3,
        error_message="revision failed",
        pod_details_list=pods,
    )
    v3_status = SimpleNamespace(revision_pod_details_list=[revision])

    def get_status_v3(_id):
        return v3_status

    cclient = SimpleNamespace(get_status_v3=get_status_v3)
    deployment = SimpleNamespace(id=123, status=DeploymentStatus.ACTIVE)

    expected = [
        "revision 3: revision failed",
        "revision 3 / pod-a: image pull failed",
    ]
    assert _get_status_error_messages(cclient, deployment) == expected


def test_status_error_messages_do_not_repeat_duplicate_messages():
    """A pod error identical to its revision's error is reported only once."""
    duplicate_message = "one or more objects failed to apply"

    # Revision and pod carry the same text; neither has a number or a name,
    # so the generic "revision" label is the one expected in the output.
    pod = SimpleNamespace(name=None, error_message=duplicate_message)
    revision = SimpleNamespace(
        revision_number=None,
        error_message=duplicate_message,
        pod_details_list=[pod],
    )
    v3_status = SimpleNamespace(revision_pod_details_list=[revision])

    def get_status_v3(_id):
        return v3_status

    cclient = SimpleNamespace(get_status_v3=get_status_v3)
    deployment = SimpleNamespace(id=123, status=DeploymentStatus.ACTIVE)

    reported = _get_status_error_messages(cclient, deployment)

    assert reported == [f"revision: {duplicate_message}"]


def test_status_error_messages_fall_back_to_legacy_status_message():
    """A 404 from the v3 status call makes the helper report the legacy error message."""

    def raise_not_found(_id):
        raise ApiException(status=404)

    def legacy_get_status(_id):
        return SimpleNamespace(error_message="legacy service failure")

    cclient = SimpleNamespace(get_status_v3=raise_not_found, get_status=legacy_get_status)
    deployment = SimpleNamespace(id=123, status=DeploymentStatus.ACTIVE)

    reported = _get_status_error_messages(cclient, deployment)

    assert reported == ["legacy service failure"]


def test_status_error_messages_reraises_unexpected_v3_errors():
    """A v3 failure other than 400/404 must propagate instead of falling back."""

    def raise_server_error(_id):
        raise ApiException(status=500)

    # No get_status attribute is provided: the legacy path must not be reached.
    cclient = SimpleNamespace(get_status_v3=raise_server_error)
    deployment = SimpleNamespace(id=123, status=DeploymentStatus.ACTIVE)

    with pytest.raises(ApiException):
        _get_status_error_messages(cclient, deployment)
Loading