Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 89 additions & 7 deletions packages/modal-infra/src/scheduler/image_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import os
import subprocess
import time
from collections.abc import Iterable

import httpx
import modal
Expand All @@ -45,13 +46,70 @@
CALLBACK_MAX_RETRIES = 3
CALLBACK_BACKOFF_BASE = 2 # seconds: 2, 4, 8

# Build log errors are surfaced through callbacks; keep them concise.
BUILD_FAILURE_MESSAGE_MAX_CHARS = 500
BUILD_FAILURE_REDACT_MIN_CHARS = 4

_BUILD_LOG_EVENTS = (
"git.sync_complete",
"image_build.complete",
"setup.failed",
"setup.timeout",
"setup.error",
"supervisor.error",
"supervisor.fatal",
)
_SETUP_FAILURE_EVENTS = {"setup.failed", "setup.timeout", "setup.error"}
_BUILD_FAILURE_EVENTS = _SETUP_FAILURE_EVENTS | {"supervisor.error", "supervisor.fatal"}

Comment thread
coderabbitai[bot] marked this conversation as resolved.

class BuildError(Exception):
"""Raised when a build sandbox fails."""

pass


def _format_build_failure_event(entry: dict, redact_values: Iterable[str] = ()) -> str | None:
"""Return a concise build failure message from a structured log entry."""
event = entry.get("event")
if not isinstance(event, str):
return None
if event not in _BUILD_FAILURE_EVENTS:
return None

if event in {"setup.failed", "setup.timeout"}:
message = _trim_build_failure_message(entry.get("output_tail"), redact_values)
if not message and entry.get("exit_code") is not None:
message = f"exit_code={entry['exit_code']}"
else:
message = _trim_build_failure_message(
entry.get("error_message") or entry.get("error"), redact_values
)

if not message:
return event
return f"{event}: {message}"


def _trim_build_failure_message(value: object, redact_values: Iterable[str] = ()) -> str:
"""Keep callback errors compact while preserving the most relevant tail."""
if not isinstance(value, str):
return ""
message = value.strip()
redactions = sorted(
{
redact_value
for redact_value in redact_values
if len(redact_value) >= BUILD_FAILURE_REDACT_MIN_CHARS
},
key=len,
reverse=True,
)
for redact_value in redactions:
message = message.replace(redact_value, "***")
return message[-BUILD_FAILURE_MESSAGE_MAX_CHARS:]


async def _terminate_build_sandbox(handle, build_id: str, reason: str) -> bool:
"""Terminate a build sandbox, logging but not failing the build on cleanup errors."""
try:
Expand Down Expand Up @@ -155,37 +213,55 @@ def _generate_clone_token() -> str:
return ""


async def _stream_build_logs(sandbox) -> tuple[str, bool]:
async def _stream_build_logs(
sandbox, redact_values: Iterable[str] = ()
) -> tuple[str, bool, str | None]:
"""
Stream sandbox stdout and extract build results.

The entrypoint logs structured JSON lines. We look for:
- event="git.sync_complete" with "head_sha" field
- event="image_build.complete" to know the build finished
- setup/supervisor errors to preserve the actual build failure

The sandbox stays alive after logging image_build.complete (it awaits
shutdown_event), so we can snapshot_filesystem() while it's still running.

Returns:
(head_sha, build_complete) tuple. head_sha is empty string if not found.
(head_sha, build_complete, error_message) tuple. head_sha is empty string if not found.
"""
head_sha = ""
error_message: str | None = None
has_setup_failure = False
redact_values = tuple(redact_values)
try:
async for line in sandbox.stdout:
if "git.sync_complete" not in line and "image_build.complete" not in line:
if not any(event in line for event in _BUILD_LOG_EVENTS):
continue
try:
entry = json.loads(line)
event = entry.get("event", "")
if not isinstance(entry, dict):
continue
event = entry.get("event")
if not isinstance(event, str):
continue
if event == "git.sync_complete" and entry.get("head_sha"):
head_sha = entry["head_sha"]
elif event == "image_build.complete":
return head_sha, True
return head_sha, True, None

failure_message = _format_build_failure_event(entry, redact_values)
is_setup_failure = event in _SETUP_FAILURE_EVENTS
if failure_message and (
error_message is None or (is_setup_failure and not has_setup_failure)
):
error_message = failure_message
has_setup_failure = is_setup_failure
except json.JSONDecodeError:
continue
except Exception as e:
log.warn("build.stream_error", error=str(e))
return head_sha, False
return head_sha, False, error_message


@app.function(
Expand Down Expand Up @@ -248,9 +324,15 @@ async def build_repo_image(
)

# 3. Stream stdout until build completes (sandbox stays alive for snapshotting)
base_sha, build_complete = await _stream_build_logs(handle.modal_sandbox)
redact_values = (clone_token, *((user_env_vars or {}).values()))
base_sha, build_complete, build_error = await _stream_build_logs(
handle.modal_sandbox,
redact_values=redact_values,
)
if not build_complete:
exit_code = handle.modal_sandbox.returncode
if build_error:
raise BuildError(f"Build sandbox exited without completing: {build_error}")
raise BuildError(f"Build sandbox exited without completing (exit_code={exit_code})")

# 4. Snapshot the running sandbox's filesystem
Expand Down
149 changes: 136 additions & 13 deletions packages/modal-infra/tests/test_image_builder_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,9 +197,10 @@ async def test_returns_sha_and_complete(self):
mock_sandbox = MagicMock()
mock_sandbox.stdout = self._async_stdout(log_lines)

sha, complete = await _stream_build_logs(mock_sandbox)
sha, complete, error = await _stream_build_logs(mock_sandbox)
assert sha == "abc123def456"
assert complete is True
assert error is None

@pytest.mark.asyncio
async def test_complete_without_sha(self):
Expand All @@ -211,9 +212,10 @@ async def test_complete_without_sha(self):
mock_sandbox = MagicMock()
mock_sandbox.stdout = self._async_stdout(log_lines)

sha, complete = await _stream_build_logs(mock_sandbox)
sha, complete, error = await _stream_build_logs(mock_sandbox)
assert sha == ""
assert complete is True
assert error is None

@pytest.mark.asyncio
async def test_incomplete_when_sandbox_exits(self):
Expand All @@ -226,9 +228,78 @@ async def test_incomplete_when_sandbox_exits(self):
mock_sandbox = MagicMock()
mock_sandbox.stdout = self._async_stdout(log_lines)

sha, complete = await _stream_build_logs(mock_sandbox)
sha, complete, error = await _stream_build_logs(mock_sandbox)
assert sha == "abc123"
assert complete is False
assert error is None

@pytest.mark.asyncio
async def test_captures_setup_failure_tail(self):
"""Should preserve the setup failure that caused a build to exit."""
log_lines = [
json.dumps({"level": "info", "event": "git.sync_complete", "head_sha": "abc123"}),
json.dumps(
{
"level": "error",
"event": "supervisor.error",
"error_message": "setup hook failed in build mode",
}
),
json.dumps(
{
"level": "error",
"event": "setup.failed",
"output_tail": "npm install\nmissing dependency",
}
),
]
mock_sandbox = MagicMock()
mock_sandbox.stdout = self._async_stdout(log_lines)

sha, complete, error = await _stream_build_logs(mock_sandbox)
assert sha == "abc123"
assert complete is False
assert error == "setup.failed: npm install\nmissing dependency"

@pytest.mark.asyncio
async def test_falls_back_to_supervisor_error(self):
"""Should use supervisor errors when no setup failure was emitted."""
log_lines = [
json.dumps(
{
"level": "error",
"event": "supervisor.error",
"error_message": "unexpected startup failure",
}
),
]
mock_sandbox = MagicMock()
mock_sandbox.stdout = self._async_stdout(log_lines)

sha, complete, error = await _stream_build_logs(mock_sandbox)
assert sha == ""
assert complete is False
assert error == "supervisor.error: unexpected startup failure"

@pytest.mark.asyncio
async def test_falls_back_to_supervisor_fatal(self):
"""Should use fatal supervisor errors when no setup failure was emitted."""
log_lines = [
json.dumps(
{
"level": "error",
"event": "supervisor.fatal",
"error_message": "unexpected startup failure",
}
),
]
mock_sandbox = MagicMock()
mock_sandbox.stdout = self._async_stdout(log_lines)

sha, complete, error = await _stream_build_logs(mock_sandbox)
assert sha == ""
assert complete is False
assert error == "supervisor.fatal: unexpected startup failure"

@pytest.mark.asyncio
async def test_returns_incomplete_on_error(self):
Expand All @@ -241,9 +312,10 @@ async def _raise():
mock_sandbox = MagicMock()
mock_sandbox.stdout = _raise()

sha, complete = await _stream_build_logs(mock_sandbox)
sha, complete, error = await _stream_build_logs(mock_sandbox)
assert sha == ""
assert complete is False
assert error is None

@pytest.mark.asyncio
async def test_handles_malformed_json(self):
Expand All @@ -256,9 +328,10 @@ async def test_handles_malformed_json(self):
mock_sandbox = MagicMock()
mock_sandbox.stdout = self._async_stdout(log_lines)

sha, complete = await _stream_build_logs(mock_sandbox)
sha, complete, error = await _stream_build_logs(mock_sandbox)
assert sha == "abc123"
assert complete is True
assert error is None


class TestBuildError:
Expand All @@ -281,7 +354,7 @@ async def _aiter():

return _aiter()

def _build_handle(self, *, snapshot_side_effect=None):
def _build_handle(self, *, snapshot_side_effect=None, stdout_lines=None, returncode=0):
snapshot_aio = AsyncMock(
side_effect=snapshot_side_effect,
return_value=SimpleNamespace(object_id="im-test"),
Expand All @@ -290,16 +363,16 @@ def _build_handle(self, *, snapshot_side_effect=None):
snapshot_filesystem.aio = snapshot_aio
terminate_aio = AsyncMock()
terminate = SimpleNamespace(aio=terminate_aio)
if stdout_lines is None:
stdout_lines = [
json.dumps({"event": "git.sync_complete", "head_sha": "abc123"}),
json.dumps({"event": "image_build.complete", "duration_ms": 5000}),
]
sandbox = SimpleNamespace(
stdout=self._async_stdout(
[
json.dumps({"event": "git.sync_complete", "head_sha": "abc123"}),
json.dumps({"event": "image_build.complete", "duration_ms": 5000}),
]
),
stdout=self._async_stdout(stdout_lines),
snapshot_filesystem=snapshot_filesystem,
terminate=terminate,
returncode=0,
returncode=returncode,
)
return SimpleNamespace(modal_sandbox=sandbox), snapshot_aio, terminate_aio

Expand Down Expand Up @@ -366,3 +439,53 @@ async def test_terminates_and_reports_failure_when_snapshot_times_out(self):
"build_id": "img-1",
"error": "Timed out waiting for image to be created",
}

@pytest.mark.asyncio
async def test_reports_build_log_failure_when_stream_never_completes(self):
handle, snapshot_aio, terminate_aio = self._build_handle(
stdout_lines=[
json.dumps({"event": "git.sync_complete", "head_sha": "abc123"}),
json.dumps(
{
"event": "setup.failed",
"output_tail": "npm install failed: abcd1234",
}
),
json.dumps(
{
"event": "supervisor.error",
"error_message": "setup hook failed in build mode",
}
),
],
returncode=1,
)
manager = SimpleNamespace(create_build_sandbox=AsyncMock(return_value=handle))

with (
patch("src.scheduler.image_builder.validate_control_plane_url", return_value=True),
patch("src.scheduler.image_builder._generate_clone_token", return_value="gh-token"),
patch("src.sandbox.manager.SandboxManager", return_value=manager),
patch(
"src.scheduler.image_builder._callback_with_retry",
new_callable=AsyncMock,
return_value=True,
) as callback,
):
await build_repo_image.local(
repo_owner="acme",
repo_name="repo",
callback_url="https://cp.test/repo-images/build-complete",
build_id="img-1",
user_env_vars={"SHORT_TOKEN": "abcd", "API_TOKEN": "abcd1234"},
)

snapshot_aio.assert_not_awaited()
terminate_aio.assert_awaited_once()
callback.assert_awaited_once()
failure_url, failure_payload = callback.await_args.args
assert failure_url == "https://cp.test/repo-images/build-failed"
assert failure_payload == {
"build_id": "img-1",
"error": "Build sandbox exited without completing: setup.failed: npm install failed: ***",
}
2 changes: 1 addition & 1 deletion packages/sandbox-runtime/src/sandbox_runtime/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -984,7 +984,7 @@ async def monitor_processes(self) -> None:

async def _report_fatal_error(self, message: str) -> None:
"""Report a fatal error to the control plane."""
self.log.error("supervisor.fatal", message=message)
self.log.error("supervisor.fatal", error_message=message)

if not self.control_plane_url:
return
Expand Down
Loading