diff --git a/app/backup.py b/app/backup.py
index 2d01a90c..49e6abe6 100644
--- a/app/backup.py
+++ b/app/backup.py
@@ -471,24 +471,30 @@ def _stop_container(self, c: Container, attempt=1) -> bool:
self._stop_container(c, attempt=attempt + 1)
return False
- def _start_container(self, c: Container, attempt=1) -> bool:
+ def _start_container(self, c: Container, attempt=1, max_attempts: Optional[int] = None) -> bool:
+ if max_attempts is None:
+ start_timeout = int(self.get_label(c, "start-timeout", str(self.env.START_TIMEOUT)))
+ max_attempts = max(1, start_timeout // 2)
+
c.reload() # Refresh the status for this container
status = c.status # Read once to avoid consuming multiple mock cycles
if status == "running":
if attempt == 1:
self.log_this(f"Container {c.name} was not stopped. No need to start.", "DEBUG")
+ else:
+ self.log_this(f"Container {c.name} is now running.", "INFO")
return True
if status != "exited":
# Transitional state: Docker statuses are created/restarting/running/paused/exited/dead
- if attempt <= 3:
+ if attempt <= max_attempts:
self.log_this(
- f"Container {c.name} is in '{status}' state, waiting for it to stabilize (Attempt {attempt}/3)",
+ f"Container {c.name} is in '{status}' state, waiting for it to stabilize (Attempt {attempt}/{max_attempts})",
"WARN",
)
time.sleep(2)
- return self._start_container(c, attempt=attempt + 1)
+ return self._start_container(c, attempt=attempt + 1, max_attempts=max_attempts)
return False
try:
@@ -505,10 +511,14 @@ def _start_container(self, c: Container, attempt=1) -> bool:
status = c.status # Read once to avoid consuming multiple mock cycles
if status == "running":
+ if attempt > 1:
+ self.log_this(f"Container {c.name} is now running.", "INFO")
return True
- elif attempt <= 3:
- self.log_this(f"Container {c.name} was not in running state. Trying again (Attempt {attempt}/3)", "ERROR")
- return self._start_container(c, attempt=attempt + 1)
+ elif attempt <= max_attempts:
+ self.log_this(
+ f"Container {c.name} was not in running state. Trying again (Attempt {attempt}/{max_attempts})", "WARN"
+ )
+ return self._start_container(c, attempt=attempt + 1, max_attempts=max_attempts)
return False
def _get_src_dir(self, c: Container, log=False) -> Tuple[Path, str]:
diff --git a/app/defaults.env b/app/defaults.env
index 52750a44..e7ad899e 100644
--- a/app/defaults.env
+++ b/app/defaults.env
@@ -51,6 +51,9 @@ LABEL_PREFIX=nautical-backup
# How long to wait for a container to stop before killing it
STOP_TIMEOUT=10
+# How long to wait for a container to reach running state after startup
+START_TIMEOUT=10
+
# Set the default log level to INFO
LOG_LEVEL=INFO
diff --git a/app/nautical_env.py b/app/nautical_env.py
index 0c2faef9..b1dc268d 100644
--- a/app/nautical_env.py
+++ b/app/nautical_env.py
@@ -69,6 +69,7 @@ def __init__(self) -> None:
self.REPORT_FILE = False
self.STOP_TIMEOUT = int(os.environ.get("STOP_TIMEOUT", 10))
+ self.START_TIMEOUT = int(os.environ.get("START_TIMEOUT", 10))
_keep = os.environ.get("NUMBER_OF_BACKUPS_TO_KEEP", "0")
self.NUMBER_OF_BACKUPS_TO_KEEP = int(_keep) if _keep.isdigit() else 0
diff --git a/docs/arguments.md b/docs/arguments.md
index f4a59d4a..eae8574a 100644
--- a/docs/arguments.md
+++ b/docs/arguments.md
@@ -569,6 +569,17 @@ STOP_TIMEOUT=10
🔄 This is the same action as the [Stop Timeout](./labels.md#stop-timeout) label, but applied globally.
+## Start Timeout
+Nautical will check every 2 seconds whether a container has reached a running state after startup, giving up after *x* total seconds.
+
+> **Default**: 10 (seconds)
+
+```properties
+START_TIMEOUT=10
+```
+
+🔄 This is the same action as the [Start Timeout](./labels.md#start-timeout) label, but applied globally.
+
## Backup on Start
Nautical will immediately perform a backup when the container is started in addition to the CRON scheduled backup.
diff --git a/docs/labels.md b/docs/labels.md
index 4e499d7d..d611bdd8 100644
--- a/docs/labels.md
+++ b/docs/labels.md
@@ -155,6 +155,17 @@ nautical-backup.stop-timeout=10
🔄 This is a similar action to the [Stop Timeout](./arguments.md#stop-timeout) variable, but applied only to this container.
+## Start Timeout
+Nautical will check every 2 seconds whether a container has reached a running state after startup, giving up after *x* total seconds.
+
+> **Default If Missing**: [Start Timeout](./arguments.md#start-timeout) Environment value (Defaults to 10 seconds)
+
+```properties
+nautical-backup.start-timeout=10
+```
+
+🔄 This is a similar action to the [Start Timeout](./arguments.md#start-timeout) variable, but applied only to this container.
+
## Groups
Use this label to have multiple containers stopped, backed up, and restarted at the same time.
diff --git a/pytest/test_backup.py b/pytest/test_backup.py
index f1fff563..50faec30 100644
--- a/pytest/test_backup.py
+++ b/pytest/test_backup.py
@@ -2948,6 +2948,83 @@ def test_start_container_transitional_state_eventually_running(
assert "container1" in nb.containers_completed
assert "container1" not in nb.containers_failed
+ @mock.patch("subprocess.run")
+ @pytest.mark.parametrize(
+ "mock_container1",
+ [
+ {
+ "name": "container1",
+ "id": "123456789",
+ # Reads: stop-check, after-stop, pre-rsync, start-attempt-1,
+ # after-c.start (restarting), retry-2 (restarting), retry-3 (restarting), retry-4 (running)
+ # Mirrors the exact sequence from issue #683 (2026-05-25 comment)
+ "status_side_effect": [
+ "running",
+ "exited",
+ "exited",
+ "exited",
+ "restarting",
+ "restarting",
+ "restarting",
+ "running",
+ ],
+ }
+ ],
+ indirect=True,
+ )
+ def test_start_container_three_restarting_states_eventually_running(
+ self,
+ mock_subprocess_run: MagicMock,
+ mock_docker_client: MagicMock,
+ mock_container1: MagicMock,
+ ):
+ """Container that stays in 'restarting' for 3 cycles before becoming running should still
+ complete successfully. Reproduces the scenario from issue #683 (2026-05-25)."""
+ mock_subprocess_run.return_value.returncode = 0
+
+ mock_docker_client.containers.list.return_value = [mock_container1]
+ nb = NauticalBackup(mock_docker_client)
+ nb.backup()
+
+ mock_container1.stop.assert_called()
+ mock_container1.start.assert_called()
+ assert "container1" in nb.containers_completed
+ assert "container1" not in nb.containers_failed
+
+ @mock.patch("subprocess.run")
+ @pytest.mark.parametrize(
+ "mock_container1",
+ [
+ {
+ "name": "container1",
+ "id": "123456789",
+ "labels": {"nautical-backup.start-timeout": "10"},
+ # Needs 2 retries after c.start() to reach running.
+ # With START_TIMEOUT=2 (max_attempts=1) this would fail;
+ # the label overrides to 10s (max_attempts=5) so it succeeds.
+ "status_side_effect": ["running", "exited", "exited", "exited", "restarting", "restarting", "running"],
+ }
+ ],
+ indirect=True,
+ )
+ def test_START_TIMEOUT_label_supersedes_env(
+ self,
+ mock_subprocess_run: MagicMock,
+ mock_docker_client: MagicMock,
+ mock_container1: MagicMock,
+ monkeypatch: pytest.MonkeyPatch,
+ ):
+ """Per-container start-timeout label should override the global START_TIMEOUT env var."""
+ mock_subprocess_run.return_value.returncode = 0
+ monkeypatch.setenv("START_TIMEOUT", "2") # max_attempts=1 — would fail without the label
+
+ mock_docker_client.containers.list.return_value = [mock_container1]
+ nb = NauticalBackup(mock_docker_client)
+ nb.backup()
+
+ assert "container1" in nb.containers_completed
+ assert "container1" not in nb.containers_failed
+
@mock.patch("subprocess.run")
@pytest.mark.parametrize(
"mock_container1",