From a8d05bb840d5329a568b79c102095c8b102e2561 Mon Sep 17 00:00:00 2001 From: Kevin Su Date: Tue, 21 Apr 2026 00:24:15 +0000 Subject: [PATCH 1/4] feat(cli): add --gpu flag to `flyte start demo` When set, passes `--gpus all` to the underlying `docker run` so the demo cluster can use host NVIDIA GPUs. Requires an NVIDIA-enabled host and a GPU-capable demo image (e.g. flyte-demo:gpu-latest, being added in flyteorg/flyte). Default off; existing non-GPU users are unaffected. Usage: flyte start demo --gpu --image ghcr.io/flyteorg/flyte-demo:gpu-latest Co-Authored-By: Claude Opus 4.7 (1M context) --- src/flyte/cli/_demo.py | 21 ++++++++++++--------- src/flyte/cli/_start.py | 11 +++++++++-- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/flyte/cli/_demo.py b/src/flyte/cli/_demo.py index 69455e2ad..ac56bcb68 100644 --- a/src/flyte/cli/_demo.py +++ b/src/flyte/cli/_demo.py @@ -84,6 +84,7 @@ def _run_container( flyte_demo_config_dir: Path, volume_name: str, ports: list[str], + gpu: bool = False, ) -> None: cmd = [ "docker", @@ -106,6 +107,8 @@ def _run_container( "--volume", f"{volume_name}:/var/lib/flyte/storage", ] + if gpu: + cmd.extend(["--gpus", "all"]) for port in ports: cmd.extend(["--publish", port]) cmd.append(image) @@ -222,7 +225,7 @@ def stop_demo() -> None: console.print("[green]Demo cluster stopped.[/green] Run [bold]flyte start demo[/bold] to resume.") -def launch_demo(image_name: str, is_dev_mode: bool, log_format: str = "console") -> None: +def launch_demo(image_name: str, is_dev_mode: bool, gpu: bool = False, log_format: str = "console") -> None: _ensure_volume(_VOLUME_NAME) if _container_is_paused(_CONTAINER_NAME): @@ -244,17 +247,17 @@ def launch_demo(image_name: str, is_dev_mode: bool, log_format: str = "console") steps = _STEPS_DEV if is_dev_mode else _STEPS if log_format == "json": - _launch_demo_plain(image_name, is_dev_mode, steps) + _launch_demo_plain(image_name, is_dev_mode, steps, gpu=gpu) else: - _launch_demo_rich(image_name, is_dev_mode, steps) + _launch_demo_rich(image_name, is_dev_mode, steps, gpu=gpu) -def _run_step(step_id: str, image_name: str, is_dev_mode: bool) -> None: +def _run_step(step_id: str, image_name: str, is_dev_mode: bool, gpu: bool = False) -> None: if step_id == "pull": _pull_image(image_name) elif step_id == "start": _run_container( - image_name, is_dev_mode, _CONTAINER_NAME, _KUBE_DIR, _FLYTE_DEMO_CONFIG_DIR, _VOLUME_NAME, _PORTS + image_name, is_dev_mode, _CONTAINER_NAME, _KUBE_DIR, _FLYTE_DEMO_CONFIG_DIR, _VOLUME_NAME, _PORTS, gpu=gpu ) elif step_id == "kubeconfig": _wait_for_kubeconfig(_KUBECONFIG_PATH) @@ -266,10 +269,10 @@ def _run_step(step_id: str, image_name: str, is_dev_mode: bool) -> None: _wait_for_demo_ready(is_dev_mode) -def _launch_demo_plain(image_name: str, is_dev_mode: bool, steps: list[tuple[str, str]]) -> None: +def _launch_demo_plain(image_name: str, is_dev_mode: bool, steps: list[tuple[str, str]], gpu: bool = False) -> None: for i, (description, step_id) in enumerate(steps, 1): click.echo(f"[{i}/{len(steps)}] {description}...") - _run_step(step_id, image_name, is_dev_mode) + _run_step(step_id, image_name, is_dev_mode, gpu=gpu) click.echo(f"[{i}/{len(steps)}] {description}... done") click.echo("") @@ -281,7 +284,7 @@ def _launch_demo_plain(image_name: str, is_dev_mode: bool, steps: list[tuple[str click.echo(" Image Registry: localhost:30000") -def _launch_demo_rich(image_name: str, is_dev_mode: bool, steps: list[tuple[str, str]]) -> None: +def _launch_demo_rich(image_name: str, is_dev_mode: bool, steps: list[tuple[str, str]], gpu: bool = False) -> None: with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), @@ -294,7 +297,7 @@ def _launch_demo_rich(image_name: str, is_dev_mode: bool, steps: list[tuple[str, for description, step_id in steps: progress.update(overall, description=f"[bold cyan]{description}") - _run_step(step_id, image_name, is_dev_mode) + _run_step(step_id, image_name, is_dev_mode, gpu=gpu) progress.advance(overall) if is_dev_mode: diff --git a/src/flyte/cli/_start.py b/src/flyte/cli/_start.py index 09cbd36b2..2ca5102b8 100644 --- a/src/flyte/cli/_start.py +++ b/src/flyte/cli/_start.py @@ -38,10 +38,17 @@ def tui(): default=False, help="Enable dev mode inside the demo cluster (sets FLYTE_DEV=True).", ) +@click.option( + "--gpu", + is_flag=True, + default=False, + help="Pass host GPUs into the demo container (adds --gpus all to docker run). " + "Requires an NVIDIA-enabled host and a GPU-capable image (e.g. flyte-demo:gpu-latest).", +) @click.pass_context -def demo(ctx: click.Context, image: str, dev: bool): +def demo(ctx: click.Context, image: str, dev: bool, gpu: bool): """Start a local Flyte demo cluster.""" from flyte.cli._demo import launch_demo log_format = getattr(ctx.obj, "log_format", "console") if ctx.obj else "console" - launch_demo(image, dev, log_format=log_format) + launch_demo(image, dev, gpu=gpu, log_format=log_format) From 0ece68fc2e74fbc3f5b832280298c063eff593c3 Mon Sep 17 00:00:00 2001 From: Kevin Su Date: Tue, 21 Apr 2026 07:33:53 +0000 Subject: [PATCH 2/4] fix(cli): retry kubeconfig chown on kubectl failure On Linux bind mounts, the in-container kubeconfig lands root-owned on the host and kubectl exits non-zero, which surfaces as CalledProcessError rather than PermissionError, so the existing chown-retry branch never fired. macOS avoided this because Docker Desktop remaps ownership. --- src/flyte/cli/_demo.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/flyte/cli/_demo.py b/src/flyte/cli/_demo.py index ac56bcb68..39291b59d 100644 --- a/src/flyte/cli/_demo.py +++ b/src/flyte/cli/_demo.py @@ -178,8 +178,10 @@ def _merge_kubeconfig(kubeconfig_path: Path, container_name: str) -> None: try: result = _flatten_kubeconfig(default_kubeconfig, kubeconfig_path) - except PermissionError: - # Handle the case that the user does not have permission to kubeconfig file + except (PermissionError, subprocess.CalledProcessError): + # On Linux bind mounts, the in-container kubeconfig lands root-owned on + # the host; kubectl then exits non-zero (CalledProcessError) rather than + # Python raising PermissionError on open. uid, gid = os.getuid(), os.getgid() subprocess.run( ["docker", "exec", container_name, "chown", f"{uid}:{gid}", "/.kube/kubeconfig"], From 28344f787ec1af0acded48158edd7df1bbc330dd Mon Sep 17 00:00:00 2001 From: Kevin Su Date: Tue, 21 Apr 2026 07:47:09 +0000 Subject: [PATCH 3/4] test(cli): cover --gpu flag and kubeconfig chown retry - _run_container: asserts --gpus all is appended only when gpu=True - _merge_kubeconfig: asserts chown retry fires on both PermissionError and CalledProcessError (the Linux bind-mount case this PR fixes), and that the second failure propagates - demo CLI: asserts --gpu is plumbed through to launch_demo --- tests/cli/test_demo.py | 155 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 tests/cli/test_demo.py diff --git a/tests/cli/test_demo.py b/tests/cli/test_demo.py new file mode 100644 index 000000000..e6e829110 --- /dev/null +++ b/tests/cli/test_demo.py @@ -0,0 +1,155 @@ +""" +Unit tests for flyte.cli._demo. + +Covers the `--gpu` plumbing on `flyte start demo` and the +kubeconfig chown-retry fallback when kubectl fails to read a root-owned +kubeconfig on Linux bind mounts. +""" + +import subprocess +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +from click.testing import CliRunner + +from flyte.cli._demo import _merge_kubeconfig, _run_container +from flyte.cli._start import demo + + +class TestRunContainerGpuFlag: + """Verify the --gpu flag appends `--gpus all` to the docker run command.""" + + @staticmethod + def _invoke(gpu: bool) -> list[str]: + with patch("flyte.cli._demo.subprocess.run") as mock_run: + mock_run.return_value = MagicMock(returncode=0, stderr="") + _run_container( + image="ghcr.io/flyteorg/flyte-demo:gpu-latest", + is_dev_mode=False, + container_name="flyte-demo", + kube_dir=Path("/tmp/.kube"), + flyte_demo_config_dir=Path("/tmp/.flyte/demo"), + volume_name="flyte-demo", + ports=["30080:30080"], + gpu=gpu, + ) + assert mock_run.call_count == 1 + return mock_run.call_args.args[0] + + def test_gpu_flag_appends_gpus_all(self): + cmd = self._invoke(gpu=True) + assert "--gpus" in cmd + assert cmd[cmd.index("--gpus") + 1] == "all" + + def test_gpu_disabled_does_not_set_gpus(self): + cmd = self._invoke(gpu=False) + assert "--gpus" not in cmd + + def test_gpu_flag_precedes_image(self): + # `docker run [options] ` — --gpus must come before the image arg. + cmd = self._invoke(gpu=True) + assert cmd.index("--gpus") < cmd.index("ghcr.io/flyteorg/flyte-demo:gpu-latest") + + +class TestMergeKubeconfigRetry: + """Verify the chown-retry fallback for a root-owned kubeconfig on Linux.""" + + def test_success_on_first_try_does_not_chown(self, tmp_path): + kubeconfig = tmp_path / "kubeconfig" + kubeconfig.write_text("") + + with ( + patch("flyte.cli._demo._flatten_kubeconfig") as mock_flatten, + patch("flyte.cli._demo.subprocess.run") as mock_run, + patch("flyte.cli._demo.shutil.move", side_effect=lambda src, dst: Path(dst).touch()), + patch("flyte.cli._demo.Path.home", return_value=tmp_path), + ): + mock_flatten.return_value = MagicMock(stdout="apiVersion: v1\n") + + _merge_kubeconfig(kubeconfig, "flyte-demo") + + assert mock_flatten.call_count == 1 + mock_run.assert_not_called() + + def test_called_process_error_triggers_chown_and_retry(self, tmp_path): + """This is the bug fix: on Linux, kubectl exits non-zero (CalledProcessError), + not PermissionError. The retry branch must fire.""" + kubeconfig = tmp_path / "kubeconfig" + kubeconfig.write_text("") + + with ( + patch("flyte.cli._demo._flatten_kubeconfig") as mock_flatten, + patch("flyte.cli._demo.subprocess.run") as mock_run, + patch("flyte.cli._demo.shutil.move", side_effect=lambda src, dst: Path(dst).touch()), + patch("flyte.cli._demo.Path.home", return_value=tmp_path), + ): + mock_flatten.side_effect = [ + subprocess.CalledProcessError(1, ["kubectl", "config", "view", "--flatten"]), + MagicMock(stdout="apiVersion: v1\n"), + ] + + _merge_kubeconfig(kubeconfig, "flyte-demo") + + assert mock_flatten.call_count == 2 + assert mock_run.call_count == 1 + docker_cmd = mock_run.call_args.args[0] + assert docker_cmd[:4] == ["docker", "exec", "flyte-demo", "chown"] + assert docker_cmd[-1] == "/.kube/kubeconfig" + + def test_permission_error_still_triggers_chown_and_retry(self, tmp_path): + """Legacy path — macOS users opening the file directly — should still work.""" + kubeconfig = tmp_path / "kubeconfig" + kubeconfig.write_text("") + + with ( + patch("flyte.cli._demo._flatten_kubeconfig") as mock_flatten, + patch("flyte.cli._demo.subprocess.run") as mock_run, + patch("flyte.cli._demo.shutil.move", side_effect=lambda src, dst: Path(dst).touch()), + patch("flyte.cli._demo.Path.home", return_value=tmp_path), + ): + mock_flatten.side_effect = [ + PermissionError("denied"), + MagicMock(stdout="apiVersion: v1\n"), + ] + + _merge_kubeconfig(kubeconfig, "flyte-demo") + + assert mock_flatten.call_count == 2 + assert mock_run.call_count == 1 + + def test_second_flatten_failure_propagates(self, tmp_path): + """If kubectl still fails after the chown, we should not swallow the error.""" + kubeconfig = tmp_path / "kubeconfig" + kubeconfig.write_text("") + + with ( + patch("flyte.cli._demo._flatten_kubeconfig") as mock_flatten, + patch("flyte.cli._demo.subprocess.run"), + patch("flyte.cli._demo.Path.home", return_value=tmp_path), + ): + err = subprocess.CalledProcessError(1, ["kubectl"]) + mock_flatten.side_effect = [err, err] + + with pytest.raises(subprocess.CalledProcessError): + _merge_kubeconfig(kubeconfig, "flyte-demo") + + +class TestDemoCliGpuFlag: + """Verify the --gpu Click option is plumbed to launch_demo.""" + + def test_gpu_flag_passed_through(self): + runner = CliRunner() + with patch("flyte.cli._demo.launch_demo") as mock_launch: + result = runner.invoke(demo, ["--gpu", "--image", "flyte-demo:gpu-latest"]) + assert result.exit_code == 0, result.output + mock_launch.assert_called_once() + assert mock_launch.call_args.kwargs["gpu"] is True + + def test_gpu_defaults_to_false(self): + runner = CliRunner() + with patch("flyte.cli._demo.launch_demo") as mock_launch: + result = runner.invoke(demo, ["--image", "flyte-demo:latest"]) + assert result.exit_code == 0, result.output + mock_launch.assert_called_once() + assert mock_launch.call_args.kwargs["gpu"] is False From d0734dc8ba37e792e50c96f768960b3b63aa76e6 Mon Sep 17 00:00:00 2001 From: Kevin Su Date: Tue, 21 Apr 2026 07:50:07 +0000 Subject: [PATCH 4/4] feat(cli): default --image to GPU image when --gpu is set If the user passes --gpu without --image, use ghcr.io/flyteorg/flyte-demo:gpu-latest rather than the CPU default. An explicit --image is still respected. --- src/flyte/cli/_start.py | 16 ++++++++++++---- tests/cli/test_demo.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/src/flyte/cli/_start.py b/src/flyte/cli/_start.py index 2ca5102b8..d1dc9c30d 100644 --- a/src/flyte/cli/_start.py +++ b/src/flyte/cli/_start.py @@ -25,11 +25,15 @@ def tui(): launch_tui_explore() +_DEFAULT_DEMO_IMAGE = "ghcr.io/flyteorg/flyte-sandbox-v2:nightly" +_DEFAULT_DEMO_GPU_IMAGE = "ghcr.io/flyteorg/flyte-demo:gpu-latest" + + @start.command() @click.option( "--image", - default="ghcr.io/flyteorg/flyte-sandbox-v2:nightly", - show_default=True, + default=None, + show_default=f"{_DEFAULT_DEMO_IMAGE} ({_DEFAULT_DEMO_GPU_IMAGE} when --gpu)", help="Docker image to use for the demo cluster.", ) @click.option( @@ -43,12 +47,16 @@ def tui(): is_flag=True, default=False, help="Pass host GPUs into the demo container (adds --gpus all to docker run). " - "Requires an NVIDIA-enabled host and a GPU-capable image (e.g. flyte-demo:gpu-latest).", + "Requires an NVIDIA-enabled host. Defaults --image to a GPU-capable image " + "if --image is not explicitly set.", ) @click.pass_context -def demo(ctx: click.Context, image: str, dev: bool, gpu: bool): +def demo(ctx: click.Context, image: str | None, dev: bool, gpu: bool): """Start a local Flyte demo cluster.""" from flyte.cli._demo import launch_demo + if image is None: + image = _DEFAULT_DEMO_GPU_IMAGE if gpu else _DEFAULT_DEMO_IMAGE + log_format = getattr(ctx.obj, "log_format", "console") if ctx.obj else "console" launch_demo(image, dev, gpu=gpu, log_format=log_format) diff --git a/tests/cli/test_demo.py b/tests/cli/test_demo.py index e6e829110..d5bc0c84e 100644 --- a/tests/cli/test_demo.py +++ b/tests/cli/test_demo.py @@ -153,3 +153,32 @@ def test_gpu_defaults_to_false(self): assert result.exit_code == 0, result.output mock_launch.assert_called_once() assert mock_launch.call_args.kwargs["gpu"] is False + + +class TestDemoCliDefaultImage: + """--gpu without --image should pick the GPU-capable default image.""" + + def test_gpu_without_image_uses_gpu_default(self): + from flyte.cli._start import _DEFAULT_DEMO_GPU_IMAGE + + runner = CliRunner() + with patch("flyte.cli._demo.launch_demo") as mock_launch: + result = runner.invoke(demo, ["--gpu"]) + assert result.exit_code == 0, result.output + assert mock_launch.call_args.args[0] == _DEFAULT_DEMO_GPU_IMAGE + + def test_no_flags_uses_cpu_default(self): + from flyte.cli._start import _DEFAULT_DEMO_IMAGE + + runner = CliRunner() + with patch("flyte.cli._demo.launch_demo") as mock_launch: + result = runner.invoke(demo, []) + assert result.exit_code == 0, result.output + assert mock_launch.call_args.args[0] == _DEFAULT_DEMO_IMAGE + + def test_explicit_image_with_gpu_is_respected(self): + runner = CliRunner() + with patch("flyte.cli._demo.launch_demo") as mock_launch: + result = runner.invoke(demo, ["--gpu", "--image", "myorg/custom:latest"]) + assert result.exit_code == 0, result.output + assert mock_launch.call_args.args[0] == "myorg/custom:latest"