From be9088d1be01a10ef0b37460dbc813972416dd85 Mon Sep 17 00:00:00 2001
From: Halvor Reiten <ha.reiten@gmail.com>
Date: Mon, 18 May 2026 15:29:58 +0200
Subject: [PATCH] fix: surface repo image build failures

---
 .../src/scheduler/image_builder.py            |  96 ++++++++++-
 .../tests/test_image_builder_v2.py            | 149 ++++++++++++++++--
 .../src/sandbox_runtime/entrypoint.py         |   2 +-
 .../tests/test_supervisor_monitor.py          |  17 ++
 4 files changed, 243 insertions(+), 21 deletions(-)

diff --git a/packages/modal-infra/src/scheduler/image_builder.py b/packages/modal-infra/src/scheduler/image_builder.py
index 4d2651138..567ef643c 100644
--- a/packages/modal-infra/src/scheduler/image_builder.py
+++ b/packages/modal-infra/src/scheduler/image_builder.py
@@ -25,6 +25,7 @@
 import os
 import subprocess
 import time
+from collections.abc import Iterable
 
 import httpx
 import modal
@@ -45,6 +46,22 @@
 CALLBACK_MAX_RETRIES = 3
 CALLBACK_BACKOFF_BASE = 2  # seconds: 2, 4, 8
 
+# Build log errors are surfaced through callbacks; keep them concise.
+BUILD_FAILURE_MESSAGE_MAX_CHARS = 500
+BUILD_FAILURE_REDACT_MIN_CHARS = 4
+
+_BUILD_LOG_EVENTS = (
+    "git.sync_complete",
+    "image_build.complete",
+    "setup.failed",
+    "setup.timeout",
+    "setup.error",
+    "supervisor.error",
+    "supervisor.fatal",
+)
+_SETUP_FAILURE_EVENTS = {"setup.failed", "setup.timeout", "setup.error"}
+_BUILD_FAILURE_EVENTS = _SETUP_FAILURE_EVENTS | {"supervisor.error", "supervisor.fatal"}
+
 
 class BuildError(Exception):
     """Raised when a build sandbox fails."""
@@ -52,6 +69,47 @@ class BuildError(Exception):
     pass
 
 
+def _format_build_failure_event(entry: dict, redact_values: Iterable[str] = ()) -> str | None:
+    """Return a concise build failure message from a structured log entry."""
+    event = entry.get("event")
+    if not isinstance(event, str):
+        return None
+    if event not in _BUILD_FAILURE_EVENTS:
+        return None
+
+    if event in {"setup.failed", "setup.timeout"}:
+        message = _trim_build_failure_message(entry.get("output_tail"), redact_values)
+        if not message and entry.get("exit_code") is not None:
+            message = f"exit_code={entry['exit_code']}"
+    else:
+        message = _trim_build_failure_message(
+            entry.get("error_message") or entry.get("error"), redact_values
+        )
+
+    if not message:
+        return event
+    return f"{event}: {message}"
+
+
+def _trim_build_failure_message(value: object, redact_values: Iterable[str] = ()) -> str:
+    """Keep callback errors compact while preserving the most relevant tail."""
+    if not isinstance(value, str):
+        return ""
+    message = value.strip()
+    redactions = sorted(
+        {
+            redact_value
+            for redact_value in redact_values
+            if len(redact_value) >= BUILD_FAILURE_REDACT_MIN_CHARS
+        },
+        key=len,
+        reverse=True,
+    )
+    for redact_value in redactions:
+        message = message.replace(redact_value, "***")
+    return message[-BUILD_FAILURE_MESSAGE_MAX_CHARS:]
+
+
 async def _terminate_build_sandbox(handle, build_id: str, reason: str) -> bool:
     """Terminate a build sandbox, logging but not failing the build on cleanup errors."""
     try:
@@ -155,37 +213,55 @@ def _generate_clone_token() -> str:
     return ""
 
 
-async def _stream_build_logs(sandbox) -> tuple[str, bool]:
+async def _stream_build_logs(
+    sandbox, redact_values: Iterable[str] = ()
+) -> tuple[str, bool, str | None]:
     """
     Stream sandbox stdout and extract build results.
 
     The entrypoint logs structured JSON lines. We look for:
     - event="git.sync_complete" with "head_sha" field
     - event="image_build.complete" to know the build finished
+    - setup/supervisor errors to preserve the actual build failure
 
     The sandbox stays alive after logging image_build.complete (it awaits
     shutdown_event), so we can snapshot_filesystem() while it's still running.
 
     Returns:
-        (head_sha, build_complete) tuple. head_sha is empty string if not found.
+        (head_sha, build_complete, error_message) tuple. head_sha is empty string if not found.
     """
     head_sha = ""
+    error_message: str | None = None
+    has_setup_failure = False
+    redact_values = tuple(redact_values)
     try:
         async for line in sandbox.stdout:
-            if "git.sync_complete" not in line and "image_build.complete" not in line:
+            if not any(event in line for event in _BUILD_LOG_EVENTS):
                 continue
             try:
                 entry = json.loads(line)
-                event = entry.get("event", "")
+                if not isinstance(entry, dict):
+                    continue
+                event = entry.get("event")
+                if not isinstance(event, str):
+                    continue
                 if event == "git.sync_complete" and entry.get("head_sha"):
                     head_sha = entry["head_sha"]
                 elif event == "image_build.complete":
-                    return head_sha, True
+                    return head_sha, True, None
+
+                failure_message = _format_build_failure_event(entry, redact_values)
+                is_setup_failure = event in _SETUP_FAILURE_EVENTS
+                if failure_message and (
+                    error_message is None or (is_setup_failure and not has_setup_failure)
+                ):
+                    error_message = failure_message
+                    has_setup_failure = is_setup_failure
             except json.JSONDecodeError:
                 continue
     except Exception as e:
         log.warn("build.stream_error", error=str(e))
-    return head_sha, False
+    return head_sha, False, error_message
 
 
 @app.function(
@@ -248,9 +324,15 @@ async def build_repo_image(
         )
 
         # 3. Stream stdout until build completes (sandbox stays alive for snapshotting)
-        base_sha, build_complete = await _stream_build_logs(handle.modal_sandbox)
+        redact_values = (clone_token, *((user_env_vars or {}).values()))
+        base_sha, build_complete, build_error = await _stream_build_logs(
+            handle.modal_sandbox,
+            redact_values=redact_values,
+        )
         if not build_complete:
             exit_code = handle.modal_sandbox.returncode
+            if build_error:
+                raise BuildError(f"Build sandbox exited without completing: {build_error}")
             raise BuildError(f"Build sandbox exited without completing (exit_code={exit_code})")
 
         # 4. Snapshot the running sandbox's filesystem
diff --git a/packages/modal-infra/tests/test_image_builder_v2.py b/packages/modal-infra/tests/test_image_builder_v2.py
index 825a59777..4eb125278 100644
--- a/packages/modal-infra/tests/test_image_builder_v2.py
+++ b/packages/modal-infra/tests/test_image_builder_v2.py
@@ -197,9 +197,10 @@ async def test_returns_sha_and_complete(self):
         mock_sandbox = MagicMock()
         mock_sandbox.stdout = self._async_stdout(log_lines)
 
-        sha, complete = await _stream_build_logs(mock_sandbox)
+        sha, complete, error = await _stream_build_logs(mock_sandbox)
         assert sha == "abc123def456"
         assert complete is True
+        assert error is None
 
     @pytest.mark.asyncio
     async def test_complete_without_sha(self):
@@ -211,9 +212,10 @@ async def test_complete_without_sha(self):
         mock_sandbox = MagicMock()
         mock_sandbox.stdout = self._async_stdout(log_lines)
 
-        sha, complete = await _stream_build_logs(mock_sandbox)
+        sha, complete, error = await _stream_build_logs(mock_sandbox)
         assert sha == ""
         assert complete is True
+        assert error is None
 
     @pytest.mark.asyncio
     async def test_incomplete_when_sandbox_exits(self):
@@ -226,9 +228,78 @@ async def test_incomplete_when_sandbox_exits(self):
         mock_sandbox = MagicMock()
         mock_sandbox.stdout = self._async_stdout(log_lines)
 
-        sha, complete = await _stream_build_logs(mock_sandbox)
+        sha, complete, error = await _stream_build_logs(mock_sandbox)
         assert sha == "abc123"
         assert complete is False
+        assert error is None
+
+    @pytest.mark.asyncio
+    async def test_captures_setup_failure_tail(self):
+        """Should preserve the setup failure that caused a build to exit."""
+        log_lines = [
+            json.dumps({"level": "info", "event": "git.sync_complete", "head_sha": "abc123"}),
+            json.dumps(
+                {
+                    "level": "error",
+                    "event": "supervisor.error",
+                    "error_message": "setup hook failed in build mode",
+                }
+            ),
+            json.dumps(
+                {
+                    "level": "error",
+                    "event": "setup.failed",
+                    "output_tail": "npm install\nmissing dependency",
+                }
+            ),
+        ]
+        mock_sandbox = MagicMock()
+        mock_sandbox.stdout = self._async_stdout(log_lines)
+
+        sha, complete, error = await _stream_build_logs(mock_sandbox)
+        assert sha == "abc123"
+        assert complete is False
+        assert error == "setup.failed: npm install\nmissing dependency"
+
+    @pytest.mark.asyncio
+    async def test_falls_back_to_supervisor_error(self):
+        """Should use supervisor errors when no setup failure was emitted."""
+        log_lines = [
+            json.dumps(
+                {
+                    "level": "error",
+                    "event": "supervisor.error",
+                    "error_message": "unexpected startup failure",
+                }
+            ),
+        ]
+        mock_sandbox = MagicMock()
+        mock_sandbox.stdout = self._async_stdout(log_lines)
+
+        sha, complete, error = await _stream_build_logs(mock_sandbox)
+        assert sha == ""
+        assert complete is False
+        assert error == "supervisor.error: unexpected startup failure"
+
+    @pytest.mark.asyncio
+    async def test_falls_back_to_supervisor_fatal(self):
+        """Should use fatal supervisor errors when no setup failure was emitted."""
+        log_lines = [
+            json.dumps(
+                {
+                    "level": "error",
+                    "event": "supervisor.fatal",
+                    "error_message": "unexpected startup failure",
+                }
+            ),
+        ]
+        mock_sandbox = MagicMock()
+        mock_sandbox.stdout = self._async_stdout(log_lines)
+
+        sha, complete, error = await _stream_build_logs(mock_sandbox)
+        assert sha == ""
+        assert complete is False
+        assert error == "supervisor.fatal: unexpected startup failure"
 
     @pytest.mark.asyncio
     async def test_returns_incomplete_on_error(self):
@@ -241,9 +312,10 @@ async def _raise():
         mock_sandbox = MagicMock()
         mock_sandbox.stdout = _raise()
 
-        sha, complete = await _stream_build_logs(mock_sandbox)
+        sha, complete, error = await _stream_build_logs(mock_sandbox)
         assert sha == ""
         assert complete is False
+        assert error is None
 
     @pytest.mark.asyncio
     async def test_handles_malformed_json(self):
@@ -256,9 +328,10 @@ async def test_handles_malformed_json(self):
         mock_sandbox = MagicMock()
         mock_sandbox.stdout = self._async_stdout(log_lines)
 
-        sha, complete = await _stream_build_logs(mock_sandbox)
+        sha, complete, error = await _stream_build_logs(mock_sandbox)
         assert sha == "abc123"
         assert complete is True
+        assert error is None
 
 
 class TestBuildError:
@@ -281,7 +354,7 @@ async def _aiter():
 
         return _aiter()
 
-    def _build_handle(self, *, snapshot_side_effect=None):
+    def _build_handle(self, *, snapshot_side_effect=None, stdout_lines=None, returncode=0):
         snapshot_aio = AsyncMock(
             side_effect=snapshot_side_effect,
             return_value=SimpleNamespace(object_id="im-test"),
@@ -290,16 +363,16 @@ def _build_handle(self, *, snapshot_side_effect=None):
         snapshot_filesystem.aio = snapshot_aio
         terminate_aio = AsyncMock()
         terminate = SimpleNamespace(aio=terminate_aio)
+        if stdout_lines is None:
+            stdout_lines = [
+                json.dumps({"event": "git.sync_complete", "head_sha": "abc123"}),
+                json.dumps({"event": "image_build.complete", "duration_ms": 5000}),
+            ]
         sandbox = SimpleNamespace(
-            stdout=self._async_stdout(
-                [
-                    json.dumps({"event": "git.sync_complete", "head_sha": "abc123"}),
-                    json.dumps({"event": "image_build.complete", "duration_ms": 5000}),
-                ]
-            ),
+            stdout=self._async_stdout(stdout_lines),
             snapshot_filesystem=snapshot_filesystem,
             terminate=terminate,
-            returncode=0,
+            returncode=returncode,
         )
         return SimpleNamespace(modal_sandbox=sandbox), snapshot_aio, terminate_aio
 
@@ -366,3 +439,53 @@ async def test_terminates_and_reports_failure_when_snapshot_times_out(self):
             "build_id": "img-1",
             "error": "Timed out waiting for image to be created",
         }
+
+    @pytest.mark.asyncio
+    async def test_reports_build_log_failure_when_stream_never_completes(self):
+        handle, snapshot_aio, terminate_aio = self._build_handle(
+            stdout_lines=[
+                json.dumps({"event": "git.sync_complete", "head_sha": "abc123"}),
+                json.dumps(
+                    {
+                        "event": "setup.failed",
+                        "output_tail": "npm install failed: abcd1234",
+                    }
+                ),
+                json.dumps(
+                    {
+                        "event": "supervisor.error",
+                        "error_message": "setup hook failed in build mode",
+                    }
+                ),
+            ],
+            returncode=1,
+        )
+        manager = SimpleNamespace(create_build_sandbox=AsyncMock(return_value=handle))
+
+        with (
+            patch("src.scheduler.image_builder.validate_control_plane_url", return_value=True),
+            patch("src.scheduler.image_builder._generate_clone_token", return_value="gh-token"),
+            patch("src.sandbox.manager.SandboxManager", return_value=manager),
+            patch(
+                "src.scheduler.image_builder._callback_with_retry",
+                new_callable=AsyncMock,
+                return_value=True,
+            ) as callback,
+        ):
+            await build_repo_image.local(
+                repo_owner="acme",
+                repo_name="repo",
+                callback_url="https://cp.test/repo-images/build-complete",
+                build_id="img-1",
+                user_env_vars={"SHORT_TOKEN": "abcd", "API_TOKEN": "abcd1234"},
+            )
+
+        snapshot_aio.assert_not_awaited()
+        terminate_aio.assert_awaited_once()
+        callback.assert_awaited_once()
+        failure_url, failure_payload = callback.await_args.args
+        assert failure_url == "https://cp.test/repo-images/build-failed"
+        assert failure_payload == {
+            "build_id": "img-1",
+            "error": "Build sandbox exited without completing: setup.failed: npm install failed: ***",
+        }
diff --git a/packages/sandbox-runtime/src/sandbox_runtime/entrypoint.py b/packages/sandbox-runtime/src/sandbox_runtime/entrypoint.py
index 63c1fcdfa..e1cc9a060 100644
--- a/packages/sandbox-runtime/src/sandbox_runtime/entrypoint.py
+++ b/packages/sandbox-runtime/src/sandbox_runtime/entrypoint.py
@@ -984,7 +984,7 @@ async def monitor_processes(self) -> None:
 
     async def _report_fatal_error(self, message: str) -> None:
         """Report a fatal error to the control plane."""
-        self.log.error("supervisor.fatal", message=message)
+        self.log.error("supervisor.fatal", error_message=message)
 
         if not self.control_plane_url:
             return
diff --git a/packages/sandbox-runtime/tests/test_supervisor_monitor.py b/packages/sandbox-runtime/tests/test_supervisor_monitor.py
index dd7803a2e..ee69e17c9 100644
--- a/packages/sandbox-runtime/tests/test_supervisor_monitor.py
+++ b/packages/sandbox-runtime/tests/test_supervisor_monitor.py
@@ -159,3 +159,20 @@ async def capture_sleep(delay):
         # All delays should be <= BACKOFF_MAX
         for delay in sleep_delays:
             assert delay <= sup.BACKOFF_MAX
+
+
+class TestFatalErrorReporting:
+    """Fatal supervisor errors should be loggable and reportable."""
+
+    async def test_report_fatal_error_logs_without_reserved_field_collision(self, caplog):
+        sup = _make_supervisor()
+        sup.control_plane_url = ""
+
+        caplog.set_level("ERROR", logger="supervisor")
+        await sup._report_fatal_error("boom")
+
+        fatal_records = [
+            record for record in caplog.records if record.getMessage() == "supervisor.fatal"
+        ]
+        assert len(fatal_records) == 1
+        assert fatal_records[0].error_message == "boom"