From 31a592fa6a3e9957bc634682a85b050078bf85ce Mon Sep 17 00:00:00 2001 From: James Tufarelli Date: Mon, 25 May 2026 11:34:16 -0700 Subject: [PATCH] Add START_TIMEOUT --- app/backup.py | 24 ++++++++++---- app/defaults.env | 3 ++ app/nautical_env.py | 1 + docs/arguments.md | 11 +++++++ docs/labels.md | 11 +++++++ pytest/test_backup.py | 77 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 120 insertions(+), 7 deletions(-) diff --git a/app/backup.py b/app/backup.py index 2d01a90c..49e6abe6 100644 --- a/app/backup.py +++ b/app/backup.py @@ -471,24 +471,30 @@ def _stop_container(self, c: Container, attempt=1) -> bool: self._stop_container(c, attempt=attempt + 1) return False - def _start_container(self, c: Container, attempt=1) -> bool: + def _start_container(self, c: Container, attempt=1, max_attempts: Optional[int] = None) -> bool: + if max_attempts is None: + start_timeout = int(self.get_label(c, "start-timeout", str(self.env.START_TIMEOUT))) + max_attempts = max(1, start_timeout // 2) + c.reload() # Refresh the status for this container status = c.status # Read once to avoid consuming multiple mock cycles if status == "running": if attempt == 1: self.log_this(f"Container {c.name} was not stopped. No need to start.", "DEBUG") + else: + self.log_this(f"Container {c.name} is now running.", "INFO") return True if status != "exited": # Transitional state: Docker statuses are created/restarting/running/paused/exited/dead - if attempt <= 3: + if attempt <= max_attempts: self.log_this( - f"Container {c.name} is in '{status}' state, waiting for it to stabilize (Attempt {attempt}/3)", + f"Container {c.name} is in '{status}' state, waiting for it to stabilize (Attempt {attempt}/{max_attempts})", "WARN", ) time.sleep(2) - return self._start_container(c, attempt=attempt + 1) + return self._start_container(c, attempt=attempt + 1, max_attempts=max_attempts) return False try: @@ -505,10 +511,14 @@ def _start_container(self, c: Container, attempt=1) -> bool: status = c.status # Read once to avoid consuming multiple mock cycles if status == "running": + if attempt > 1: + self.log_this(f"Container {c.name} is now running.", "INFO") return True - elif attempt <= 3: - self.log_this(f"Container {c.name} was not in running state. Trying again (Attempt {attempt}/3)", "ERROR") - return self._start_container(c, attempt=attempt + 1) + elif attempt <= max_attempts: + self.log_this( + f"Container {c.name} was not in running state. Trying again (Attempt {attempt}/{max_attempts})", "WARN" + ) + return self._start_container(c, attempt=attempt + 1, max_attempts=max_attempts) return False def _get_src_dir(self, c: Container, log=False) -> Tuple[Path, str]: diff --git a/app/defaults.env b/app/defaults.env index 52750a44..e7ad899e 100644 --- a/app/defaults.env +++ b/app/defaults.env @@ -51,6 +51,9 @@ LABEL_PREFIX=nautical-backup # How long to wait for a container to stop before killing it STOP_TIMEOUT=10 +# How long to wait for a container to reach running state after startup +START_TIMEOUT=10 + # Set the default log level to INFO LOG_LEVEL=INFO diff --git a/app/nautical_env.py b/app/nautical_env.py index 0c2faef9..b1dc268d 100644 --- a/app/nautical_env.py +++ b/app/nautical_env.py @@ -69,6 +69,7 @@ def __init__(self) -> None: self.REPORT_FILE = False self.STOP_TIMEOUT = int(os.environ.get("STOP_TIMEOUT", 10)) + self.START_TIMEOUT = int(os.environ.get("START_TIMEOUT", 10)) _keep = os.environ.get("NUMBER_OF_BACKUPS_TO_KEEP", "0") self.NUMBER_OF_BACKUPS_TO_KEEP = int(_keep) if _keep.isdigit() else 0 diff --git a/docs/arguments.md b/docs/arguments.md index f4a59d4a..eae8574a 100644 --- a/docs/arguments.md +++ b/docs/arguments.md @@ -569,6 +569,17 @@ STOP_TIMEOUT=10 🔄 This is the same action as the [Stop Timeout](./labels.md#stop-timeout) label, but applied globally. +## Start Timeout +Nautical will check every 2 seconds whether a container has reached a running state after startup, giving up after *x* total seconds. + +> **Default**: 10 (seconds) + +```properties +START_TIMEOUT=10 +``` + +🔄 This is the same action as the [Start Timeout](./labels.md#start-timeout) label, but applied globally. + ## Backup on Start Nautical will immediately perform a backup when the container is started in addition to the CRON scheduled backup. diff --git a/docs/labels.md b/docs/labels.md index 4e499d7d..d611bdd8 100644 --- a/docs/labels.md +++ b/docs/labels.md @@ -155,6 +155,17 @@ nautical-backup.stop-timeout=10 🔄 This is a similar action to the [Stop Timeout](./arguments.md#stop-timeout) variable, but applied only to this container. +## Start Timeout +Nautical will check every 2 seconds whether a container has reached a running state after startup, giving up after *x* total seconds. + +> **Default If Missing**: [Start Timeout](./arguments.md#start-timeout) Environment value (Defaults to 10 seconds) + +```properties +nautical-backup.start-timeout=10 +``` + +🔄 This is a similar action to the [Start Timeout](./arguments.md#start-timeout) variable, but applied only to this container. + ## Groups Use this label to have multiple containers stopped, backed up, and restarted at the same time. diff --git a/pytest/test_backup.py b/pytest/test_backup.py index f1fff563..50faec30 100644 --- a/pytest/test_backup.py +++ b/pytest/test_backup.py @@ -2948,6 +2948,83 @@ def test_start_container_transitional_state_eventually_running( assert "container1" in nb.containers_completed assert "container1" not in nb.containers_failed + @mock.patch("subprocess.run") + @pytest.mark.parametrize( + "mock_container1", + [ + { + "name": "container1", + "id": "123456789", + # Reads: stop-check, after-stop, pre-rsync, start-attempt-1, + # after-c.start (restarting), retry-2 (restarting), retry-3 (restarting), retry-4 (running) + # Mirrors the exact sequence from issue #683 (2026-05-25 comment) + "status_side_effect": [ + "running", + "exited", + "exited", + "exited", + "restarting", + "restarting", + "restarting", + "running", + ], + } + ], + indirect=True, + ) + def test_start_container_three_restarting_states_eventually_running( + self, + mock_subprocess_run: MagicMock, + mock_docker_client: MagicMock, + mock_container1: MagicMock, + ): + """Container that stays in 'restarting' for 3 cycles before becoming running should still + complete successfully. Reproduces the scenario from issue #683 (2026-05-25).""" + mock_subprocess_run.return_value.returncode = 0 + + mock_docker_client.containers.list.return_value = [mock_container1] + nb = NauticalBackup(mock_docker_client) + nb.backup() + + mock_container1.stop.assert_called() + mock_container1.start.assert_called() + assert "container1" in nb.containers_completed + assert "container1" not in nb.containers_failed + + @mock.patch("subprocess.run") + @pytest.mark.parametrize( + "mock_container1", + [ + { + "name": "container1", + "id": "123456789", + "labels": {"nautical-backup.start-timeout": "10"}, + # Needs 2 retries after c.start() to reach running. + # With START_TIMEOUT=2 (max_attempts=1) this would fail; + # the label overrides to 10s (max_attempts=5) so it succeeds. + "status_side_effect": ["running", "exited", "exited", "exited", "restarting", "restarting", "running"], + } + ], + indirect=True, + ) + def test_START_TIMEOUT_label_supersedes_env( + self, + mock_subprocess_run: MagicMock, + mock_docker_client: MagicMock, + mock_container1: MagicMock, + monkeypatch: pytest.MonkeyPatch, + ): + """Per-container start-timeout label should override the global START_TIMEOUT env var.""" + mock_subprocess_run.return_value.returncode = 0 + monkeypatch.setenv("START_TIMEOUT", "2") # max_attempts=1 — would fail without the label + + mock_docker_client.containers.list.return_value = [mock_container1] + nb = NauticalBackup(mock_docker_client) + nb.backup() + + assert "container1" in nb.containers_completed + assert "container1" not in nb.containers_failed + @mock.patch("subprocess.run") @pytest.mark.parametrize( "mock_container1",