Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 17 additions & 7 deletions app/backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,24 +471,30 @@ def _stop_container(self, c: Container, attempt=1) -> bool:
self._stop_container(c, attempt=attempt + 1)
return False

def _start_container(self, c: Container, attempt=1) -> bool:
def _start_container(self, c: Container, attempt=1, max_attempts: Optional[int] = None) -> bool:
if max_attempts is None:
start_timeout = int(self.get_label(c, "start-timeout", str(self.env.START_TIMEOUT)))
max_attempts = max(1, start_timeout // 2)

c.reload() # Refresh the status for this container
status = c.status # Read once to avoid consuming multiple mock cycles

if status == "running":
if attempt == 1:
self.log_this(f"Container {c.name} was not stopped. No need to start.", "DEBUG")
else:
self.log_this(f"Container {c.name} is now running.", "INFO")
return True

if status != "exited":
# Transitional state: Docker statuses are created/restarting/running/paused/exited/dead
if attempt <= 3:
if attempt <= max_attempts:
self.log_this(
f"Container {c.name} is in '{status}' state, waiting for it to stabilize (Attempt {attempt}/3)",
f"Container {c.name} is in '{status}' state, waiting for it to stabilize (Attempt {attempt}/{max_attempts})",
"WARN",
)
time.sleep(2)
return self._start_container(c, attempt=attempt + 1)
return self._start_container(c, attempt=attempt + 1, max_attempts=max_attempts)
return False

try:
Expand All @@ -505,10 +511,14 @@ def _start_container(self, c: Container, attempt=1) -> bool:
status = c.status # Read once to avoid consuming multiple mock cycles

if status == "running":
if attempt > 1:
self.log_this(f"Container {c.name} is now running.", "INFO")
return True
elif attempt <= 3:
self.log_this(f"Container {c.name} was not in running state. Trying again (Attempt {attempt}/3)", "ERROR")
return self._start_container(c, attempt=attempt + 1)
elif attempt <= max_attempts:
self.log_this(
f"Container {c.name} was not in running state. Trying again (Attempt {attempt}/{max_attempts})", "WARN"
)
return self._start_container(c, attempt=attempt + 1, max_attempts=max_attempts)
return False

def _get_src_dir(self, c: Container, log=False) -> Tuple[Path, str]:
Expand Down
3 changes: 3 additions & 0 deletions app/defaults.env
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ LABEL_PREFIX=nautical-backup
# How long to wait for a container to stop before killing it
STOP_TIMEOUT=10

# How long to wait for a container to reach running state after startup
START_TIMEOUT=10

# Set the default log level to INFO
LOG_LEVEL=INFO

Expand Down
1 change: 1 addition & 0 deletions app/nautical_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def __init__(self) -> None:
self.REPORT_FILE = False

self.STOP_TIMEOUT = int(os.environ.get("STOP_TIMEOUT", 10))
self.START_TIMEOUT = int(os.environ.get("START_TIMEOUT", 10))

_keep = os.environ.get("NUMBER_OF_BACKUPS_TO_KEEP", "0")
self.NUMBER_OF_BACKUPS_TO_KEEP = int(_keep) if _keep.isdigit() else 0
Expand Down
11 changes: 11 additions & 0 deletions docs/arguments.md
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,17 @@ STOP_TIMEOUT=10

<small>🔄 This is the same action as the [Stop Timeout](./labels.md#stop-timeout) label, but applied globally.</small>

## Start Timeout
Nautical will check every 2 seconds whether a container has reached a running state after startup, giving up after *x* total seconds.

> **Default**: 10 <small>(seconds)</small>

```properties
START_TIMEOUT=10
```

<small>🔄 This is the same action as the [Start Timeout](./labels.md#start-timeout) label, but applied globally.</small>

## Backup on Start
Nautical will immediately perform a backup when the container is started in addition to the CRON scheduled backup.

Expand Down
11 changes: 11 additions & 0 deletions docs/labels.md
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,17 @@ nautical-backup.stop-timeout=10

<small>🔄 This is a similar action to the [Stop Timeout](./arguments.md#stop-timeout) variable, but applied only to this container.</small>

## Start Timeout
Nautical will check every 2 seconds whether a container has reached a running state after startup, giving up after *x* total seconds.

> **Default If Missing**: [Start Timeout](./arguments.md#start-timeout) Environment value<small> (Defaults to 10 seconds)</small>

```properties
nautical-backup.start-timeout=10
```

<small>🔄 This is a similar action to the [Start Timeout](./arguments.md#start-timeout) variable, but applied only to this container.</small>

## Groups
Use this label to have multiple containers stopped, backed up, and restarted at the same time.

Expand Down
77 changes: 77 additions & 0 deletions pytest/test_backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2948,6 +2948,83 @@ def test_start_container_transitional_state_eventually_running(
assert "container1" in nb.containers_completed
assert "container1" not in nb.containers_failed

@mock.patch("subprocess.run")
@pytest.mark.parametrize(
"mock_container1",
[
{
"name": "container1",
"id": "123456789",
# Reads: stop-check, after-stop, pre-rsync, start-attempt-1,
# after-c.start (restarting), retry-2 (restarting), retry-3 (restarting), retry-4 (running)
# Mirrors the exact sequence from issue #683 (2026-05-25 comment)
"status_side_effect": [
"running",
"exited",
"exited",
"exited",
"restarting",
"restarting",
"restarting",
"running",
],
}
],
indirect=True,
)
def test_start_container_three_restarting_states_eventually_running(
self,
mock_subprocess_run: MagicMock,
mock_docker_client: MagicMock,
mock_container1: MagicMock,
):
"""Container that stays in 'restarting' for 3 cycles before becoming running should still
complete successfully. Reproduces the scenario from issue #683 (2026-05-25)."""
mock_subprocess_run.return_value.returncode = 0

mock_docker_client.containers.list.return_value = [mock_container1]
nb = NauticalBackup(mock_docker_client)
nb.backup()

mock_container1.stop.assert_called()
mock_container1.start.assert_called()
assert "container1" in nb.containers_completed
assert "container1" not in nb.containers_failed

@mock.patch("subprocess.run")
@pytest.mark.parametrize(
"mock_container1",
[
{
"name": "container1",
"id": "123456789",
"labels": {"nautical-backup.start-timeout": "10"},
# Needs 2 retries after c.start() to reach running.
# With START_TIMEOUT=2 (max_attempts=1) this would fail;
# the label overrides to 10s (max_attempts=5) so it succeeds.
"status_side_effect": ["running", "exited", "exited", "exited", "restarting", "restarting", "running"],
}
],
indirect=True,
)
def test_START_TIMEOUT_label_supersedes_env(
self,
mock_subprocess_run: MagicMock,
mock_docker_client: MagicMock,
mock_container1: MagicMock,
monkeypatch: pytest.MonkeyPatch,
):
"""Per-container start-timeout label should override the global START_TIMEOUT env var."""
mock_subprocess_run.return_value.returncode = 0
monkeypatch.setenv("START_TIMEOUT", "2") # max_attempts=1 — would fail without the label

mock_docker_client.containers.list.return_value = [mock_container1]
nb = NauticalBackup(mock_docker_client)
nb.backup()

assert "container1" in nb.containers_completed
assert "container1" not in nb.containers_failed

@mock.patch("subprocess.run")
@pytest.mark.parametrize(
"mock_container1",
Expand Down