Skip to content

Commit 1b20ae6

Browse files
committed
Reconsider termination reason naming
- Drop `starting_timeout` and use `provisioning_timeout` instead. These two termination reasons had similar semantics; the only difference was that `starting_timeout` was used for cloud instances and `provisioning_timeout` for SSH instances.
- Rename `termination_timeout` -> `unreachable`. The old name was based on an implementation detail (the instance's `termination_deadline`) rather than on semantics, so it was not very informative to the user.
- For `no_offers`, add a termination reason message to highlight the difference between failing to find any offers ("No offers found") and failing to provision the offers that were found ("All offers failed").
1 parent b8fe5be commit 1b20ae6

3 files changed

Lines changed: 16 additions & 10 deletions

File tree

src/dstack/_internal/core/models/instances.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -262,8 +262,7 @@ class InstanceTerminationReason(str, Enum):
262262
PROVISIONING_TIMEOUT = "provisioning_timeout"
263263
ERROR = "error"
264264
JOB_FINISHED = "job_finished"
265-
TERMINATION_TIMEOUT = "termination_timeout"
266-
STARTING_TIMEOUT = "starting_timeout"
265+
UNREACHABLE = "unreachable"
267266
NO_OFFERS = "no_offers"
268267
MASTER_FAILED = "master_failed"
269268
MAX_INSTANCES_LIMIT = "max_instances_limit"
@@ -281,6 +280,7 @@ def from_legacy_str(cls, v: str) -> "InstanceTerminationReason":
281280
if v == "Idle timeout":
282281
return cls.IDLE_TIMEOUT
283282
if v in (
283+
"Instance has not become running in time",
284284
"Provisioning timeout expired",
285285
"Proivisioning timeout expired", # typo is intentional
286286
"The proivisioning timeout expired", # typo is intentional
@@ -312,9 +312,7 @@ def from_legacy_str(cls, v: str) -> "InstanceTerminationReason":
312312
if v == "Instance job finished":
313313
return cls.JOB_FINISHED
314314
if v == "Termination deadline":
315-
return cls.TERMINATION_TIMEOUT
316-
if v == "Instance has not become running in time":
317-
return cls.STARTING_TIMEOUT
315+
return cls.UNREACHABLE
318316
if v == "Fleet has too many instances":
319317
return cls.MAX_INSTANCES_LIMIT
320318
if v == "Low account balance":

src/dstack/_internal/server/background/tasks/process_instances.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -679,7 +679,11 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
679679
)
680680
return
681681

682-
_mark_terminated(instance, InstanceTerminationReason.NO_OFFERS)
682+
_mark_terminated(
683+
instance,
684+
InstanceTerminationReason.NO_OFFERS,
685+
"All offers failed" if offers else "No offers found",
686+
)
683687
if instance.fleet and is_fleet_master_instance(instance) and is_cloud_cluster(instance.fleet):
684688
# Do not attempt to deploy other instances, as they won't determine the correct cluster
685689
# backend, region, and placement group without a successfully deployed master instance
@@ -690,10 +694,13 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
690694

691695

692696
def _mark_terminated(
693-
instance: InstanceModel, termination_reason: InstanceTerminationReason
697+
instance: InstanceModel,
698+
termination_reason: InstanceTerminationReason,
699+
termination_reason_message: Optional[str] = None,
694700
) -> None:
695701
instance.status = InstanceStatus.TERMINATED
696702
instance.termination_reason = termination_reason
703+
instance.termination_reason_message = termination_reason_message
697704
logger.info(
698705
"Terminated instance %s: %s",
699706
instance.name,
@@ -842,7 +849,7 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non
842849
deadline = instance.termination_deadline
843850
if get_current_datetime() > deadline:
844851
instance.status = InstanceStatus.TERMINATING
845-
instance.termination_reason = InstanceTerminationReason.TERMINATION_TIMEOUT
852+
instance.termination_reason = InstanceTerminationReason.UNREACHABLE
846853
logger.warning(
847854
"Instance %s shim waiting timeout. Marked as TERMINATING",
848855
instance.name,
@@ -871,7 +878,8 @@ async def _wait_for_instance_provisioning_data(
871878
"Instance %s failed because instance has not become running in time", instance.name
872879
)
873880
instance.status = InstanceStatus.TERMINATING
874-
instance.termination_reason = InstanceTerminationReason.STARTING_TIMEOUT
881+
instance.termination_reason = InstanceTerminationReason.PROVISIONING_TIMEOUT
882+
instance.termination_reason_message = "Backend did not complete provisioning in time"
875883
return
876884

877885
backend = await backends_services.get_project_backend_by_type(

src/tests/_internal/server/background/tasks/test_process_instances.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ async def test_check_shim_terminate_instance_by_deadline(self, test_db, session:
263263
assert instance is not None
264264
assert instance.status == InstanceStatus.TERMINATING
265265
assert instance.termination_deadline == termination_deadline_time
266-
assert instance.termination_reason == InstanceTerminationReason.TERMINATION_TIMEOUT
266+
assert instance.termination_reason == InstanceTerminationReason.UNREACHABLE
267267

268268
@pytest.mark.asyncio
269269
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)