diff --git a/app/backup.py b/app/backup.py index f4a9c2a4..2d01a90c 100644 --- a/app/backup.py +++ b/app/backup.py @@ -473,14 +473,27 @@ def _stop_container(self, c: Container, attempt=1) -> bool: def _start_container(self, c: Container, attempt=1) -> bool: c.reload() # Refresh the status for this container + status = c.status # Read once to avoid consuming multiple mock cycles - if c.status != "exited": - self.log_this(f"Container {c.name} was not stopped. No need to start.", "DEBUG") + if status == "running": + if attempt == 1: + self.log_this(f"Container {c.name} was not stopped. No need to start.", "DEBUG") return True + if status != "exited": + # Transitional state: Docker statuses are created/restarting/running/paused/exited/dead + if attempt <= 3: + self.log_this( + f"Container {c.name} is in '{status}' state, waiting for it to stabilize (Attempt {attempt}/3)", + "WARN", + ) + time.sleep(2) + return self._start_container(c, attempt=attempt + 1) + return False + try: self.log_this(f"Starting {c.name}...") - c.start() # * Actually stop the container + c.start() # * Actually start the container except ReadTimeout: self.log_this(f"Timed out waiting for {c.name} to start. Checking container status...", "WARN") # Fall through to c.reload() — container may have started despite the timeout @@ -489,12 +502,13 @@ def _start_container(self, c: Container, attempt=1) -> bool: return False c.reload() # Refresh the status for this container + status = c.status # Read once to avoid consuming multiple mock cycles - if c.status == "running": + if status == "running": return True elif attempt <= 3: self.log_this(f"Container {c.name} was not in running state. Trying again (Attempt {attempt}/3)", "ERROR") - self._start_container(c, attempt=attempt + 1) + return self._start_container(c, attempt=attempt + 1) return False def _get_src_dir(self, c: Container, log=False) -> Tuple[Path, str]: @@ -735,6 +749,16 @@ def _run_rsync(self, c: Optional[Container], rsync_args: str, src_dir: Path, des if out.returncode != 0: name = c.name if c else "unknown" message = f"rsync exited with code {out.returncode} for {name}" + if out.returncode == 23: + # Exit code 23 = partial transfer; commonly caused by symlinks on filesystems + # that don't support them (e.g. SMB/FAT). Regular files are still backed up. + hint = " (symlinks may have been skipped — use RSYNC_CUSTOM_ARGS=--no-links to suppress)" + if c: + self._record_container_failed(c, "rsync_failed", message + hint) + else: + self._record_error(message + hint) + self.log_this(message + hint, "ERROR") + return False if c: self._record_container_failed(c, "rsync_failed", message) else: diff --git a/pytest/test_backup.py b/pytest/test_backup.py index 53b2291b..f1fff563 100644 --- a/pytest/test_backup.py +++ b/pytest/test_backup.py @@ -2275,7 +2275,10 @@ def test_post_backup_exec_variables_for_rsync_failure( assert os.environ.get("NB_EXEC_CONTAINERS_FAILED") == "container1" assert os.environ.get("NB_EXEC_CONTAINER_SKIP_REASONS") == "container1=rsync_failed" assert os.environ.get("NB_EXEC_CONTAINER_FAILURE_REASONS") == "container1=rsync_failed" - assert os.environ.get("NB_EXEC_ERROR_MESSAGES") == "rsync exited with code 23 for container1" + assert ( + os.environ.get("NB_EXEC_ERROR_MESSAGES") + == "rsync exited with code 23 for container1 (symlinks may have been skipped — use RSYNC_CUSTOM_ARGS=--no-links to suppress)" + ) @mock.patch("subprocess.run") @pytest.mark.parametrize( @@ -2912,6 +2915,94 @@ def test_start_container_read_timeout( mock_container1.stop.assert_called() mock_container1.start.assert_called() + @mock.patch("subprocess.run") + @pytest.mark.parametrize( + "mock_container1", + [ + { + "name": "container1", + "id": "123456789", + # Reads: stop-check, after-stop, pre-rsync, start-attempt-1, + # after-c.start (restarting), retry-2 (restarting), retry-3 (running) + "status_side_effect": ["running", "exited", "exited", "exited", "restarting", "restarting", "running"], + } + ], + indirect=True, + ) + def test_start_container_transitional_state_eventually_running( + self, + mock_subprocess_run: MagicMock, + mock_docker_client: MagicMock, + mock_container1: MagicMock, + ): + """Container in transitional state (e.g. 'restarting') after c.start() should be retried + until it reaches 'running', and the backup should be marked as completed — not failed.""" + mock_subprocess_run.return_value.returncode = 0 + + mock_docker_client.containers.list.return_value = [mock_container1] + nb = NauticalBackup(mock_docker_client) + nb.backup() + + mock_container1.stop.assert_called() + mock_container1.start.assert_called() + assert "container1" in nb.containers_completed + assert "container1" not in nb.containers_failed + + @mock.patch("subprocess.run") + @pytest.mark.parametrize( + "mock_container1", + [ + { + "name": "container1", + "id": "123456789", + # Reads: stop-check, after-stop, pre-rsync, start-attempt-1, + # after-c.start (not running yet), start-attempt-2, after-c.start-2 (running) + "status_side_effect": ["running", "exited", "exited", "exited", "exited", "exited", "running"], + } + ], + indirect=True, + ) + def test_start_container_retry_success_propagates( + self, + mock_subprocess_run: MagicMock, + mock_docker_client: MagicMock, + mock_container1: MagicMock, + ): + """When container start fails on attempt 1 but succeeds on attempt 2, the backup should be + marked completed (not failed). This tests the missing-return bug fix.""" + mock_subprocess_run.return_value.returncode = 0 + + mock_docker_client.containers.list.return_value = [mock_container1] + nb = NauticalBackup(mock_docker_client) + nb.backup() + + assert mock_container1.start.call_count == 2 + assert "container1" in nb.containers_completed + assert "container1" not in nb.containers_failed + + @mock.patch("subprocess.run") + @pytest.mark.parametrize("mock_container1", [{"name": "container1", "id": "123456789"}], indirect=True) + def test_rsync_exit_code_23_includes_symlink_hint( + self, + mock_subprocess_run: MagicMock, + mock_docker_client: MagicMock, + mock_container1: MagicMock, + ): + """rsync exit code 23 (partial transfer) should include a hint about symlinks and the + RSYNC_CUSTOM_ARGS=--no-links workaround in the error message.""" + rsync_result = MagicMock() + rsync_result.returncode = 23 + mock_subprocess_run.return_value = rsync_result + + mock_docker_client.containers.list.return_value = [mock_container1] + nb = NauticalBackup(mock_docker_client) + nb.backup() + + assert len(nb.error_messages) == 1 + assert "--no-links" in nb.error_messages[0] + assert "symlinks" in nb.error_messages[0] + assert "container1" in nb.containers_failed + # ------------------------------------------------------------------------- # Retention policy tests # -------------------------------------------------------------------------