From be9088d1be01a10ef0b37460dbc813972416dd85 Mon Sep 17 00:00:00 2001 From: Halvor Reiten Date: Mon, 18 May 2026 15:29:58 +0200 Subject: [PATCH] fix: surface repo image build failures --- .../src/scheduler/image_builder.py | 96 ++++++++++- .../tests/test_image_builder_v2.py | 149 ++++++++++++++++-- .../src/sandbox_runtime/entrypoint.py | 2 +- .../tests/test_supervisor_monitor.py | 17 ++ 4 files changed, 243 insertions(+), 21 deletions(-) diff --git a/packages/modal-infra/src/scheduler/image_builder.py b/packages/modal-infra/src/scheduler/image_builder.py index 4d2651138..567ef643c 100644 --- a/packages/modal-infra/src/scheduler/image_builder.py +++ b/packages/modal-infra/src/scheduler/image_builder.py @@ -25,6 +25,7 @@ import os import subprocess import time +from collections.abc import Iterable import httpx import modal @@ -45,6 +46,22 @@ CALLBACK_MAX_RETRIES = 3 CALLBACK_BACKOFF_BASE = 2 # seconds: 2, 4, 8 +# Build log errors are surfaced through callbacks; keep them concise. +BUILD_FAILURE_MESSAGE_MAX_CHARS = 500 +BUILD_FAILURE_REDACT_MIN_CHARS = 4 + +_BUILD_LOG_EVENTS = ( + "git.sync_complete", + "image_build.complete", + "setup.failed", + "setup.timeout", + "setup.error", + "supervisor.error", + "supervisor.fatal", +) +_SETUP_FAILURE_EVENTS = {"setup.failed", "setup.timeout", "setup.error"} +_BUILD_FAILURE_EVENTS = _SETUP_FAILURE_EVENTS | {"supervisor.error", "supervisor.fatal"} + class BuildError(Exception): """Raised when a build sandbox fails.""" @@ -52,6 +69,47 @@ class BuildError(Exception): pass +def _format_build_failure_event(entry: dict, redact_values: Iterable[str] = ()) -> str | None: + """Return a concise build failure message from a structured log entry.""" + event = entry.get("event") + if not isinstance(event, str): + return None + if event not in _BUILD_FAILURE_EVENTS: + return None + + if event in {"setup.failed", "setup.timeout"}: + message = _trim_build_failure_message(entry.get("output_tail"), redact_values) + if not message and entry.get("exit_code") is not None: + message = f"exit_code={entry['exit_code']}" + else: + message = _trim_build_failure_message( + entry.get("error_message") or entry.get("error"), redact_values + ) + + if not message: + return event + return f"{event}: {message}" + + +def _trim_build_failure_message(value: object, redact_values: Iterable[str] = ()) -> str: + """Keep callback errors compact while preserving the most relevant tail.""" + if not isinstance(value, str): + return "" + message = value.strip() + redactions = sorted( + { + redact_value + for redact_value in redact_values + if len(redact_value) >= BUILD_FAILURE_REDACT_MIN_CHARS + }, + key=len, + reverse=True, + ) + for redact_value in redactions: + message = message.replace(redact_value, "***") + return message[-BUILD_FAILURE_MESSAGE_MAX_CHARS:] + + async def _terminate_build_sandbox(handle, build_id: str, reason: str) -> bool: """Terminate a build sandbox, logging but not failing the build on cleanup errors.""" try: @@ -155,37 +213,55 @@ def _generate_clone_token() -> str: return "" -async def _stream_build_logs(sandbox) -> tuple[str, bool]: +async def _stream_build_logs( + sandbox, redact_values: Iterable[str] = () +) -> tuple[str, bool, str | None]: """ Stream sandbox stdout and extract build results. The entrypoint logs structured JSON lines. We look for: - event="git.sync_complete" with "head_sha" field - event="image_build.complete" to know the build finished + - setup/supervisor errors to preserve the actual build failure The sandbox stays alive after logging image_build.complete (it awaits shutdown_event), so we can snapshot_filesystem() while it's still running. Returns: - (head_sha, build_complete) tuple. head_sha is empty string if not found. + (head_sha, build_complete, error_message) tuple. head_sha is empty string if not found. """ head_sha = "" + error_message: str | None = None + has_setup_failure = False + redact_values = tuple(redact_values) try: async for line in sandbox.stdout: - if "git.sync_complete" not in line and "image_build.complete" not in line: + if not any(event in line for event in _BUILD_LOG_EVENTS): continue try: entry = json.loads(line) - event = entry.get("event", "") + if not isinstance(entry, dict): + continue + event = entry.get("event") + if not isinstance(event, str): + continue if event == "git.sync_complete" and entry.get("head_sha"): head_sha = entry["head_sha"] elif event == "image_build.complete": - return head_sha, True + return head_sha, True, None + + failure_message = _format_build_failure_event(entry, redact_values) + is_setup_failure = event in _SETUP_FAILURE_EVENTS + if failure_message and ( + error_message is None or (is_setup_failure and not has_setup_failure) + ): + error_message = failure_message + has_setup_failure = is_setup_failure except json.JSONDecodeError: continue except Exception as e: log.warn("build.stream_error", error=str(e)) - return head_sha, False + return head_sha, False, error_message @app.function( @@ -248,9 +324,15 @@ async def build_repo_image( ) # 3. Stream stdout until build completes (sandbox stays alive for snapshotting) - base_sha, build_complete = await _stream_build_logs(handle.modal_sandbox) + redact_values = (clone_token, *((user_env_vars or {}).values())) + base_sha, build_complete, build_error = await _stream_build_logs( + handle.modal_sandbox, + redact_values=redact_values, + ) if not build_complete: exit_code = handle.modal_sandbox.returncode + if build_error: + raise BuildError(f"Build sandbox exited without completing: {build_error}") raise BuildError(f"Build sandbox exited without completing (exit_code={exit_code})") # 4. Snapshot the running sandbox's filesystem diff --git a/packages/modal-infra/tests/test_image_builder_v2.py b/packages/modal-infra/tests/test_image_builder_v2.py index 825a59777..4eb125278 100644 --- a/packages/modal-infra/tests/test_image_builder_v2.py +++ b/packages/modal-infra/tests/test_image_builder_v2.py @@ -197,9 +197,10 @@ async def test_returns_sha_and_complete(self): mock_sandbox = MagicMock() mock_sandbox.stdout = self._async_stdout(log_lines) - sha, complete = await _stream_build_logs(mock_sandbox) + sha, complete, error = await _stream_build_logs(mock_sandbox) assert sha == "abc123def456" assert complete is True + assert error is None @pytest.mark.asyncio async def test_complete_without_sha(self): @@ -211,9 +212,10 @@ async def test_complete_without_sha(self): mock_sandbox = MagicMock() mock_sandbox.stdout = self._async_stdout(log_lines) - sha, complete = await _stream_build_logs(mock_sandbox) + sha, complete, error = await _stream_build_logs(mock_sandbox) assert sha == "" assert complete is True + assert error is None @pytest.mark.asyncio async def test_incomplete_when_sandbox_exits(self): @@ -226,9 +228,78 @@ async def test_incomplete_when_sandbox_exits(self): mock_sandbox = MagicMock() mock_sandbox.stdout = self._async_stdout(log_lines) - sha, complete = await _stream_build_logs(mock_sandbox) + sha, complete, error = await _stream_build_logs(mock_sandbox) assert sha == "abc123" assert complete is False + assert error is None + + @pytest.mark.asyncio + async def test_captures_setup_failure_tail(self): + """Should preserve the setup failure that caused a build to exit.""" + log_lines = [ + json.dumps({"level": "info", "event": "git.sync_complete", "head_sha": "abc123"}), + json.dumps( + { + "level": "error", + "event": "supervisor.error", + "error_message": "setup hook failed in build mode", + } + ), + json.dumps( + { + "level": "error", + "event": "setup.failed", + "output_tail": "npm install\nmissing dependency", + } + ), + ] + mock_sandbox = MagicMock() + mock_sandbox.stdout = self._async_stdout(log_lines) + + sha, complete, error = await _stream_build_logs(mock_sandbox) + assert sha == "abc123" + assert complete is False + assert error == "setup.failed: npm install\nmissing dependency" + + @pytest.mark.asyncio + async def test_falls_back_to_supervisor_error(self): + """Should use supervisor errors when no setup failure was emitted.""" + log_lines = [ + json.dumps( + { + "level": "error", + "event": "supervisor.error", + "error_message": "unexpected startup failure", + } + ), + ] + mock_sandbox = MagicMock() + mock_sandbox.stdout = self._async_stdout(log_lines) + + sha, complete, error = await _stream_build_logs(mock_sandbox) + assert sha == "" + assert complete is False + assert error == "supervisor.error: unexpected startup failure" + + @pytest.mark.asyncio + async def test_falls_back_to_supervisor_fatal(self): + """Should use fatal supervisor errors when no setup failure was emitted.""" + log_lines = [ + json.dumps( + { + "level": "error", + "event": "supervisor.fatal", + "error_message": "unexpected startup failure", + } + ), + ] + mock_sandbox = MagicMock() + mock_sandbox.stdout = self._async_stdout(log_lines) + + sha, complete, error = await _stream_build_logs(mock_sandbox) + assert sha == "" + assert complete is False + assert error == "supervisor.fatal: unexpected startup failure" @pytest.mark.asyncio async def test_returns_incomplete_on_error(self): @@ -241,9 +312,10 @@ async def _raise(): mock_sandbox = MagicMock() mock_sandbox.stdout = _raise() - sha, complete = await _stream_build_logs(mock_sandbox) + sha, complete, error = await _stream_build_logs(mock_sandbox) assert sha == "" assert complete is False + assert error is None @pytest.mark.asyncio async def test_handles_malformed_json(self): @@ -256,9 +328,10 @@ async def test_handles_malformed_json(self): mock_sandbox = MagicMock() mock_sandbox.stdout = self._async_stdout(log_lines) - sha, complete = await _stream_build_logs(mock_sandbox) + sha, complete, error = await _stream_build_logs(mock_sandbox) assert sha == "abc123" assert complete is True + assert error is None class TestBuildError: @@ -281,7 +354,7 @@ async def _aiter(): return _aiter() - def _build_handle(self, *, snapshot_side_effect=None): + def _build_handle(self, *, snapshot_side_effect=None, stdout_lines=None, returncode=0): snapshot_aio = AsyncMock( side_effect=snapshot_side_effect, return_value=SimpleNamespace(object_id="im-test"), @@ -290,16 +363,16 @@ def _build_handle(self, *, snapshot_side_effect=None): snapshot_filesystem.aio = snapshot_aio terminate_aio = AsyncMock() terminate = SimpleNamespace(aio=terminate_aio) + if stdout_lines is None: + stdout_lines = [ + json.dumps({"event": "git.sync_complete", "head_sha": "abc123"}), + json.dumps({"event": "image_build.complete", "duration_ms": 5000}), + ] sandbox = SimpleNamespace( - stdout=self._async_stdout( - [ - json.dumps({"event": "git.sync_complete", "head_sha": "abc123"}), - json.dumps({"event": "image_build.complete", "duration_ms": 5000}), - ] - ), + stdout=self._async_stdout(stdout_lines), snapshot_filesystem=snapshot_filesystem, terminate=terminate, - returncode=0, + returncode=returncode, ) return SimpleNamespace(modal_sandbox=sandbox), snapshot_aio, terminate_aio @@ -366,3 +439,53 @@ async def test_terminates_and_reports_failure_when_snapshot_times_out(self): "build_id": "img-1", "error": "Timed out waiting for image to be created", } + + @pytest.mark.asyncio + async def test_reports_build_log_failure_when_stream_never_completes(self): + handle, snapshot_aio, terminate_aio = self._build_handle( + stdout_lines=[ + json.dumps({"event": "git.sync_complete", "head_sha": "abc123"}), + json.dumps( + { + "event": "setup.failed", + "output_tail": "npm install failed: abcd1234", + } + ), + json.dumps( + { + "event": "supervisor.error", + "error_message": "setup hook failed in build mode", + } + ), + ], + returncode=1, + ) + manager = SimpleNamespace(create_build_sandbox=AsyncMock(return_value=handle)) + + with ( + patch("src.scheduler.image_builder.validate_control_plane_url", return_value=True), + patch("src.scheduler.image_builder._generate_clone_token", return_value="gh-token"), + patch("src.sandbox.manager.SandboxManager", return_value=manager), + patch( + "src.scheduler.image_builder._callback_with_retry", + new_callable=AsyncMock, + return_value=True, + ) as callback, + ): + await build_repo_image.local( + repo_owner="acme", + repo_name="repo", + callback_url="https://cp.test/repo-images/build-complete", + build_id="img-1", + user_env_vars={"SHORT_TOKEN": "abcd", "API_TOKEN": "abcd1234"}, + ) + + snapshot_aio.assert_not_awaited() + terminate_aio.assert_awaited_once() + callback.assert_awaited_once() + failure_url, failure_payload = callback.await_args.args + assert failure_url == "https://cp.test/repo-images/build-failed" + assert failure_payload == { + "build_id": "img-1", + "error": "Build sandbox exited without completing: setup.failed: npm install failed: ***", + } diff --git a/packages/sandbox-runtime/src/sandbox_runtime/entrypoint.py b/packages/sandbox-runtime/src/sandbox_runtime/entrypoint.py index 63c1fcdfa..e1cc9a060 100644 --- a/packages/sandbox-runtime/src/sandbox_runtime/entrypoint.py +++ b/packages/sandbox-runtime/src/sandbox_runtime/entrypoint.py @@ -984,7 +984,7 @@ async def monitor_processes(self) -> None: async def _report_fatal_error(self, message: str) -> None: """Report a fatal error to the control plane.""" - self.log.error("supervisor.fatal", message=message) + self.log.error("supervisor.fatal", error_message=message) if not self.control_plane_url: return diff --git a/packages/sandbox-runtime/tests/test_supervisor_monitor.py b/packages/sandbox-runtime/tests/test_supervisor_monitor.py index dd7803a2e..ee69e17c9 100644 --- a/packages/sandbox-runtime/tests/test_supervisor_monitor.py +++ b/packages/sandbox-runtime/tests/test_supervisor_monitor.py @@ -159,3 +159,20 @@ async def capture_sleep(delay): # All delays should be <= BACKOFF_MAX for delay in sleep_delays: assert delay <= sup.BACKOFF_MAX + + +class TestFatalErrorReporting: + """Fatal supervisor errors should be loggable and reportable.""" + + async def test_report_fatal_error_logs_without_reserved_field_collision(self, caplog): + sup = _make_supervisor() + sup.control_plane_url = "" + + caplog.set_level("ERROR", logger="supervisor") + await sup._report_fatal_error("boom") + + fatal_records = [ + record for record in caplog.records if record.getMessage() == "supervisor.fatal" + ] + assert len(fatal_records) == 1 + assert fatal_records[0].error_message == "boom"