Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 29 additions & 5 deletions app/backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,14 +473,27 @@ def _stop_container(self, c: Container, attempt=1) -> bool:

def _start_container(self, c: Container, attempt=1) -> bool:
c.reload() # Refresh the status for this container
status = c.status # Read once to avoid consuming multiple mock cycles

if c.status != "exited":
self.log_this(f"Container {c.name} was not stopped. No need to start.", "DEBUG")
if status == "running":
if attempt == 1:
self.log_this(f"Container {c.name} was not stopped. No need to start.", "DEBUG")
return True

if status != "exited":
# Transitional state: Docker statuses are created/restarting/running/paused/exited/dead
if attempt <= 3:
self.log_this(
f"Container {c.name} is in '{status}' state, waiting for it to stabilize (Attempt {attempt}/3)",
"WARN",
)
time.sleep(2)
return self._start_container(c, attempt=attempt + 1)
return False

try:
self.log_this(f"Starting {c.name}...")
c.start() # * Actually stop the container
c.start() # * Actually start the container
except ReadTimeout:
self.log_this(f"Timed out waiting for {c.name} to start. Checking container status...", "WARN")
# Fall through to c.reload() — container may have started despite the timeout
Expand All @@ -489,12 +502,13 @@ def _start_container(self, c: Container, attempt=1) -> bool:
return False

c.reload() # Refresh the status for this container
status = c.status # Read once to avoid consuming multiple mock cycles

if c.status == "running":
if status == "running":
return True
elif attempt <= 3:
self.log_this(f"Container {c.name} was not in running state. Trying again (Attempt {attempt}/3)", "ERROR")
self._start_container(c, attempt=attempt + 1)
return self._start_container(c, attempt=attempt + 1)
return False

def _get_src_dir(self, c: Container, log=False) -> Tuple[Path, str]:
Expand Down Expand Up @@ -735,6 +749,16 @@ def _run_rsync(self, c: Optional[Container], rsync_args: str, src_dir: Path, des
if out.returncode != 0:
name = c.name if c else "unknown"
message = f"rsync exited with code {out.returncode} for {name}"
if out.returncode == 23:
# Exit code 23 = partial transfer; commonly caused by symlinks on filesystems
# that don't support them (e.g. SMB/FAT). Regular files are still backed up.
hint = " (symlinks may have been skipped — use RSYNC_CUSTOM_ARGS=--no-links to suppress)"
if c:
self._record_container_failed(c, "rsync_failed", message + hint)
else:
self._record_error(message + hint)
self.log_this(message + hint, "ERROR")
return False
if c:
self._record_container_failed(c, "rsync_failed", message)
else:
Expand Down
93 changes: 92 additions & 1 deletion pytest/test_backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2275,7 +2275,10 @@ def test_post_backup_exec_variables_for_rsync_failure(
assert os.environ.get("NB_EXEC_CONTAINERS_FAILED") == "container1"
assert os.environ.get("NB_EXEC_CONTAINER_SKIP_REASONS") == "container1=rsync_failed"
assert os.environ.get("NB_EXEC_CONTAINER_FAILURE_REASONS") == "container1=rsync_failed"
assert os.environ.get("NB_EXEC_ERROR_MESSAGES") == "rsync exited with code 23 for container1"
assert (
os.environ.get("NB_EXEC_ERROR_MESSAGES")
== "rsync exited with code 23 for container1 (symlinks may have been skipped — use RSYNC_CUSTOM_ARGS=--no-links to suppress)"
)

@mock.patch("subprocess.run")
@pytest.mark.parametrize(
Expand Down Expand Up @@ -2912,6 +2915,94 @@ def test_start_container_read_timeout(
mock_container1.stop.assert_called()
mock_container1.start.assert_called()

@mock.patch("subprocess.run")
@pytest.mark.parametrize(
"mock_container1",
[
{
"name": "container1",
"id": "123456789",
# Reads: stop-check, after-stop, pre-rsync, start-attempt-1,
# after-c.start (restarting), retry-2 (restarting), retry-3 (running)
"status_side_effect": ["running", "exited", "exited", "exited", "restarting", "restarting", "running"],
}
],
indirect=True,
)
def test_start_container_transitional_state_eventually_running(
self,
mock_subprocess_run: MagicMock,
mock_docker_client: MagicMock,
mock_container1: MagicMock,
):
"""Container in transitional state (e.g. 'restarting') after c.start() should be retried
until it reaches 'running', and the backup should be marked as completed — not failed."""
mock_subprocess_run.return_value.returncode = 0

mock_docker_client.containers.list.return_value = [mock_container1]
nb = NauticalBackup(mock_docker_client)
nb.backup()

mock_container1.stop.assert_called()
mock_container1.start.assert_called()
assert "container1" in nb.containers_completed
assert "container1" not in nb.containers_failed

@mock.patch("subprocess.run")
@pytest.mark.parametrize(
"mock_container1",
[
{
"name": "container1",
"id": "123456789",
# Reads: stop-check, after-stop, pre-rsync, start-attempt-1,
# after-c.start (not running yet), start-attempt-2, after-c.start-2 (running)
"status_side_effect": ["running", "exited", "exited", "exited", "exited", "exited", "running"],
}
],
indirect=True,
)
def test_start_container_retry_success_propagates(
self,
mock_subprocess_run: MagicMock,
mock_docker_client: MagicMock,
mock_container1: MagicMock,
):
"""When container start fails on attempt 1 but succeeds on attempt 2, the backup should be
marked completed (not failed). This tests the missing-return bug fix."""
mock_subprocess_run.return_value.returncode = 0

mock_docker_client.containers.list.return_value = [mock_container1]
nb = NauticalBackup(mock_docker_client)
nb.backup()

assert mock_container1.start.call_count == 2
assert "container1" in nb.containers_completed
assert "container1" not in nb.containers_failed

@mock.patch("subprocess.run")
@pytest.mark.parametrize("mock_container1", [{"name": "container1", "id": "123456789"}], indirect=True)
def test_rsync_exit_code_23_includes_symlink_hint(
self,
mock_subprocess_run: MagicMock,
mock_docker_client: MagicMock,
mock_container1: MagicMock,
):
"""rsync exit code 23 (partial transfer) should include a hint about symlinks and the
RSYNC_CUSTOM_ARGS=--no-links workaround in the error message."""
rsync_result = MagicMock()
rsync_result.returncode = 23
mock_subprocess_run.return_value = rsync_result

mock_docker_client.containers.list.return_value = [mock_container1]
nb = NauticalBackup(mock_docker_client)
nb.backup()

assert len(nb.error_messages) == 1
assert "--no-links" in nb.error_messages[0]
assert "symlinks" in nb.error_messages[0]
assert "container1" in nb.containers_failed

# -------------------------------------------------------------------------
# Retention policy tests
# -------------------------------------------------------------------------
Expand Down
Loading