From a8d05bb840d5329a568b79c102095c8b102e2561 Mon Sep 17 00:00:00 2001
From: Kevin Su <pingsutw@gmail.com>
Date: Tue, 21 Apr 2026 00:24:15 +0000
Subject: [PATCH 1/4] feat(cli): add --gpu flag to `flyte start demo`

When set, passes `--gpus all` to the underlying `docker run` so the demo
cluster can use host NVIDIA GPUs. Requires an NVIDIA-enabled host and a
GPU-capable demo image (e.g. flyte-demo:gpu-latest, being added in
flyteorg/flyte). Default off; existing non-GPU users are unaffected.

Usage:
    flyte start demo --gpu --image ghcr.io/flyteorg/flyte-demo:gpu-latest

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/flyte/cli/_demo.py  | 21 ++++++++++++---------
 src/flyte/cli/_start.py | 11 +++++++++--
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/src/flyte/cli/_demo.py b/src/flyte/cli/_demo.py
index 69455e2ad..ac56bcb68 100644
--- a/src/flyte/cli/_demo.py
+++ b/src/flyte/cli/_demo.py
@@ -84,6 +84,7 @@ def _run_container(
     flyte_demo_config_dir: Path,
     volume_name: str,
     ports: list[str],
+    gpu: bool = False,
 ) -> None:
     cmd = [
         "docker",
@@ -106,6 +107,8 @@ def _run_container(
         "--volume",
         f"{volume_name}:/var/lib/flyte/storage",
     ]
+    if gpu:
+        cmd.extend(["--gpus", "all"])
     for port in ports:
         cmd.extend(["--publish", port])
     cmd.append(image)
@@ -222,7 +225,7 @@ def stop_demo() -> None:
     console.print("[green]Demo cluster stopped.[/green] Run [bold]flyte start demo[/bold] to resume.")
 
 
-def launch_demo(image_name: str, is_dev_mode: bool, log_format: str = "console") -> None:
+def launch_demo(image_name: str, is_dev_mode: bool, gpu: bool = False, log_format: str = "console") -> None:
     _ensure_volume(_VOLUME_NAME)
 
     if _container_is_paused(_CONTAINER_NAME):
@@ -244,17 +247,17 @@ def launch_demo(image_name: str, is_dev_mode: bool, log_format: str = "console")
     steps = _STEPS_DEV if is_dev_mode else _STEPS
 
     if log_format == "json":
-        _launch_demo_plain(image_name, is_dev_mode, steps)
+        _launch_demo_plain(image_name, is_dev_mode, steps, gpu=gpu)
     else:
-        _launch_demo_rich(image_name, is_dev_mode, steps)
+        _launch_demo_rich(image_name, is_dev_mode, steps, gpu=gpu)
 
 
-def _run_step(step_id: str, image_name: str, is_dev_mode: bool) -> None:
+def _run_step(step_id: str, image_name: str, is_dev_mode: bool, gpu: bool = False) -> None:
     if step_id == "pull":
         _pull_image(image_name)
     elif step_id == "start":
         _run_container(
-            image_name, is_dev_mode, _CONTAINER_NAME, _KUBE_DIR, _FLYTE_DEMO_CONFIG_DIR, _VOLUME_NAME, _PORTS
+            image_name, is_dev_mode, _CONTAINER_NAME, _KUBE_DIR, _FLYTE_DEMO_CONFIG_DIR, _VOLUME_NAME, _PORTS, gpu=gpu
         )
     elif step_id == "kubeconfig":
         _wait_for_kubeconfig(_KUBECONFIG_PATH)
@@ -266,10 +269,10 @@ def _run_step(step_id: str, image_name: str, is_dev_mode: bool) -> None:
         _wait_for_demo_ready(is_dev_mode)
 
 
-def _launch_demo_plain(image_name: str, is_dev_mode: bool, steps: list[tuple[str, str]]) -> None:
+def _launch_demo_plain(image_name: str, is_dev_mode: bool, steps: list[tuple[str, str]], gpu: bool = False) -> None:
     for i, (description, step_id) in enumerate(steps, 1):
         click.echo(f"[{i}/{len(steps)}] {description}...")
-        _run_step(step_id, image_name, is_dev_mode)
+        _run_step(step_id, image_name, is_dev_mode, gpu=gpu)
         click.echo(f"[{i}/{len(steps)}] {description}... done")
 
     click.echo("")
@@ -281,7 +284,7 @@ def _launch_demo_plain(image_name: str, is_dev_mode: bool, steps: list[tuple[str
         click.echo("  Image Registry: localhost:30000")
 
 
-def _launch_demo_rich(image_name: str, is_dev_mode: bool, steps: list[tuple[str, str]]) -> None:
+def _launch_demo_rich(image_name: str, is_dev_mode: bool, steps: list[tuple[str, str]], gpu: bool = False) -> None:
     with Progress(
         SpinnerColumn(),
         TextColumn("[progress.description]{task.description}"),
@@ -294,7 +297,7 @@ def _launch_demo_rich(image_name: str, is_dev_mode: bool, steps: list[tuple[str,
 
         for description, step_id in steps:
             progress.update(overall, description=f"[bold cyan]{description}")
-            _run_step(step_id, image_name, is_dev_mode)
+            _run_step(step_id, image_name, is_dev_mode, gpu=gpu)
             progress.advance(overall)
 
     if is_dev_mode:
diff --git a/src/flyte/cli/_start.py b/src/flyte/cli/_start.py
index 09cbd36b2..2ca5102b8 100644
--- a/src/flyte/cli/_start.py
+++ b/src/flyte/cli/_start.py
@@ -38,10 +38,17 @@ def tui():
     default=False,
     help="Enable dev mode inside the demo cluster (sets FLYTE_DEV=True).",
 )
+@click.option(
+    "--gpu",
+    is_flag=True,
+    default=False,
+    help="Pass host GPUs into the demo container (adds --gpus all to docker run). "
+    "Requires an NVIDIA-enabled host and a GPU-capable image (e.g. flyte-demo:gpu-latest).",
+)
 @click.pass_context
-def demo(ctx: click.Context, image: str, dev: bool):
+def demo(ctx: click.Context, image: str, dev: bool, gpu: bool):
     """Start a local Flyte demo cluster."""
     from flyte.cli._demo import launch_demo
 
     log_format = getattr(ctx.obj, "log_format", "console") if ctx.obj else "console"
-    launch_demo(image, dev, log_format=log_format)
+    launch_demo(image, dev, gpu=gpu, log_format=log_format)

From 0ece68fc2e74fbc3f5b832280298c063eff593c3 Mon Sep 17 00:00:00 2001
From: Kevin Su <pingsutw@gmail.com>
Date: Tue, 21 Apr 2026 07:33:53 +0000
Subject: [PATCH 2/4] fix(cli): retry kubeconfig chown on kubectl failure

On Linux bind mounts, the in-container kubeconfig lands root-owned on
the host and kubectl exits non-zero, which surfaces as CalledProcessError
rather than PermissionError, so the existing chown-retry branch never
fired. macOS avoided this because Docker Desktop remaps ownership.
---
 src/flyte/cli/_demo.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/flyte/cli/_demo.py b/src/flyte/cli/_demo.py
index ac56bcb68..39291b59d 100644
--- a/src/flyte/cli/_demo.py
+++ b/src/flyte/cli/_demo.py
@@ -178,8 +178,10 @@ def _merge_kubeconfig(kubeconfig_path: Path, container_name: str) -> None:
 
     try:
         result = _flatten_kubeconfig(default_kubeconfig, kubeconfig_path)
-    except PermissionError:
-        # Handle the case that the user does not have permission to kubeconfig file
+    except (PermissionError, subprocess.CalledProcessError):
+        # On Linux bind mounts, the in-container kubeconfig lands root-owned on
+        # the host; kubectl then exits non-zero (CalledProcessError) rather than
+        # Python raising PermissionError on open.
         uid, gid = os.getuid(), os.getgid()
         subprocess.run(
             ["docker", "exec", container_name, "chown", f"{uid}:{gid}", "/.kube/kubeconfig"],

From 28344f787ec1af0acded48158edd7df1bbc330dd Mon Sep 17 00:00:00 2001
From: Kevin Su <pingsutw@gmail.com>
Date: Tue, 21 Apr 2026 07:47:09 +0000
Subject: [PATCH 3/4] test(cli): cover --gpu flag and kubeconfig chown retry

- _run_container: asserts --gpus all is appended only when gpu=True
- _merge_kubeconfig: asserts chown retry fires on both PermissionError
  and CalledProcessError (the Linux bind-mount case this PR fixes), and
  that the second failure propagates
- demo CLI: asserts --gpu is plumbed through to launch_demo
---
 tests/cli/test_demo.py | 155 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)
 create mode 100644 tests/cli/test_demo.py

diff --git a/tests/cli/test_demo.py b/tests/cli/test_demo.py
new file mode 100644
index 000000000..e6e829110
--- /dev/null
+++ b/tests/cli/test_demo.py
@@ -0,0 +1,155 @@
+"""
+Unit tests for flyte.cli._demo.
+
+Covers the `--gpu` plumbing on `flyte start demo` and the
+kubeconfig chown-retry fallback when kubectl fails to read a root-owned
+kubeconfig on Linux bind mounts.
+"""
+
+import subprocess
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+from click.testing import CliRunner
+
+from flyte.cli._demo import _merge_kubeconfig, _run_container
+from flyte.cli._start import demo
+
+
+class TestRunContainerGpuFlag:
+    """Verify the --gpu flag appends `--gpus all` to the docker run command."""
+
+    @staticmethod
+    def _invoke(gpu: bool) -> list[str]:
+        with patch("flyte.cli._demo.subprocess.run") as mock_run:
+            mock_run.return_value = MagicMock(returncode=0, stderr="")
+            _run_container(
+                image="ghcr.io/flyteorg/flyte-demo:gpu-latest",
+                is_dev_mode=False,
+                container_name="flyte-demo",
+                kube_dir=Path("/tmp/.kube"),
+                flyte_demo_config_dir=Path("/tmp/.flyte/demo"),
+                volume_name="flyte-demo",
+                ports=["30080:30080"],
+                gpu=gpu,
+            )
+            assert mock_run.call_count == 1
+            return mock_run.call_args.args[0]
+
+    def test_gpu_flag_appends_gpus_all(self):
+        cmd = self._invoke(gpu=True)
+        assert "--gpus" in cmd
+        assert cmd[cmd.index("--gpus") + 1] == "all"
+
+    def test_gpu_disabled_does_not_set_gpus(self):
+        cmd = self._invoke(gpu=False)
+        assert "--gpus" not in cmd
+
+    def test_gpu_flag_precedes_image(self):
+        # `docker run [options] <image>` — --gpus must come before the image arg.
+        cmd = self._invoke(gpu=True)
+        assert cmd.index("--gpus") < cmd.index("ghcr.io/flyteorg/flyte-demo:gpu-latest")
+
+
+class TestMergeKubeconfigRetry:
+    """Verify the chown-retry fallback for a root-owned kubeconfig on Linux."""
+
+    def test_success_on_first_try_does_not_chown(self, tmp_path):
+        kubeconfig = tmp_path / "kubeconfig"
+        kubeconfig.write_text("")
+
+        with (
+            patch("flyte.cli._demo._flatten_kubeconfig") as mock_flatten,
+            patch("flyte.cli._demo.subprocess.run") as mock_run,
+            patch("flyte.cli._demo.shutil.move", side_effect=lambda src, dst: Path(dst).touch()),
+            patch("flyte.cli._demo.Path.home", return_value=tmp_path),
+        ):
+            mock_flatten.return_value = MagicMock(stdout="apiVersion: v1\n")
+
+            _merge_kubeconfig(kubeconfig, "flyte-demo")
+
+            assert mock_flatten.call_count == 1
+            mock_run.assert_not_called()
+
+    def test_called_process_error_triggers_chown_and_retry(self, tmp_path):
+        """This is the bug fix: on Linux, kubectl exits non-zero (CalledProcessError),
+        not PermissionError. The retry branch must fire."""
+        kubeconfig = tmp_path / "kubeconfig"
+        kubeconfig.write_text("")
+
+        with (
+            patch("flyte.cli._demo._flatten_kubeconfig") as mock_flatten,
+            patch("flyte.cli._demo.subprocess.run") as mock_run,
+            patch("flyte.cli._demo.shutil.move", side_effect=lambda src, dst: Path(dst).touch()),
+            patch("flyte.cli._demo.Path.home", return_value=tmp_path),
+        ):
+            mock_flatten.side_effect = [
+                subprocess.CalledProcessError(1, ["kubectl", "config", "view", "--flatten"]),
+                MagicMock(stdout="apiVersion: v1\n"),
+            ]
+
+            _merge_kubeconfig(kubeconfig, "flyte-demo")
+
+            assert mock_flatten.call_count == 2
+            assert mock_run.call_count == 1
+            docker_cmd = mock_run.call_args.args[0]
+            assert docker_cmd[:4] == ["docker", "exec", "flyte-demo", "chown"]
+            assert docker_cmd[-1] == "/.kube/kubeconfig"
+
+    def test_permission_error_still_triggers_chown_and_retry(self, tmp_path):
+        """Legacy path — macOS users opening the file directly — should still work."""
+        kubeconfig = tmp_path / "kubeconfig"
+        kubeconfig.write_text("")
+
+        with (
+            patch("flyte.cli._demo._flatten_kubeconfig") as mock_flatten,
+            patch("flyte.cli._demo.subprocess.run") as mock_run,
+            patch("flyte.cli._demo.shutil.move", side_effect=lambda src, dst: Path(dst).touch()),
+            patch("flyte.cli._demo.Path.home", return_value=tmp_path),
+        ):
+            mock_flatten.side_effect = [
+                PermissionError("denied"),
+                MagicMock(stdout="apiVersion: v1\n"),
+            ]
+
+            _merge_kubeconfig(kubeconfig, "flyte-demo")
+
+            assert mock_flatten.call_count == 2
+            assert mock_run.call_count == 1
+
+    def test_second_flatten_failure_propagates(self, tmp_path):
+        """If kubectl still fails after the chown, we should not swallow the error."""
+        kubeconfig = tmp_path / "kubeconfig"
+        kubeconfig.write_text("")
+
+        with (
+            patch("flyte.cli._demo._flatten_kubeconfig") as mock_flatten,
+            patch("flyte.cli._demo.subprocess.run"),
+            patch("flyte.cli._demo.Path.home", return_value=tmp_path),
+        ):
+            err = subprocess.CalledProcessError(1, ["kubectl"])
+            mock_flatten.side_effect = [err, err]
+
+            with pytest.raises(subprocess.CalledProcessError):
+                _merge_kubeconfig(kubeconfig, "flyte-demo")
+
+
+class TestDemoCliGpuFlag:
+    """Verify the --gpu Click option is plumbed to launch_demo."""
+
+    def test_gpu_flag_passed_through(self):
+        runner = CliRunner()
+        with patch("flyte.cli._demo.launch_demo") as mock_launch:
+            result = runner.invoke(demo, ["--gpu", "--image", "flyte-demo:gpu-latest"])
+            assert result.exit_code == 0, result.output
+            mock_launch.assert_called_once()
+            assert mock_launch.call_args.kwargs["gpu"] is True
+
+    def test_gpu_defaults_to_false(self):
+        runner = CliRunner()
+        with patch("flyte.cli._demo.launch_demo") as mock_launch:
+            result = runner.invoke(demo, ["--image", "flyte-demo:latest"])
+            assert result.exit_code == 0, result.output
+            mock_launch.assert_called_once()
+            assert mock_launch.call_args.kwargs["gpu"] is False

From d0734dc8ba37e792e50c96f768960b3b63aa76e6 Mon Sep 17 00:00:00 2001
From: Kevin Su <pingsutw@gmail.com>
Date: Tue, 21 Apr 2026 07:50:07 +0000
Subject: [PATCH 4/4] feat(cli): default --image to GPU image when --gpu is set

If the user passes --gpu without --image, use
ghcr.io/flyteorg/flyte-demo:gpu-latest rather than the CPU default.
An explicit --image is still respected.
---
 src/flyte/cli/_start.py | 16 ++++++++++++----
 tests/cli/test_demo.py  | 29 +++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/src/flyte/cli/_start.py b/src/flyte/cli/_start.py
index 2ca5102b8..d1dc9c30d 100644
--- a/src/flyte/cli/_start.py
+++ b/src/flyte/cli/_start.py
@@ -25,11 +25,15 @@ def tui():
     launch_tui_explore()
 
 
+_DEFAULT_DEMO_IMAGE = "ghcr.io/flyteorg/flyte-sandbox-v2:nightly"
+_DEFAULT_DEMO_GPU_IMAGE = "ghcr.io/flyteorg/flyte-demo:gpu-latest"
+
+
 @start.command()
 @click.option(
     "--image",
-    default="ghcr.io/flyteorg/flyte-sandbox-v2:nightly",
-    show_default=True,
+    default=None,
+    show_default=f"{_DEFAULT_DEMO_IMAGE} ({_DEFAULT_DEMO_GPU_IMAGE} when --gpu)",
     help="Docker image to use for the demo cluster.",
 )
 @click.option(
@@ -43,12 +47,16 @@ def tui():
     is_flag=True,
     default=False,
     help="Pass host GPUs into the demo container (adds --gpus all to docker run). "
-    "Requires an NVIDIA-enabled host and a GPU-capable image (e.g. flyte-demo:gpu-latest).",
+    "Requires an NVIDIA-enabled host. Defaults --image to a GPU-capable image "
+    "if --image is not explicitly set.",
 )
 @click.pass_context
-def demo(ctx: click.Context, image: str, dev: bool, gpu: bool):
+def demo(ctx: click.Context, image: str | None, dev: bool, gpu: bool):
     """Start a local Flyte demo cluster."""
     from flyte.cli._demo import launch_demo
 
+    if image is None:
+        image = _DEFAULT_DEMO_GPU_IMAGE if gpu else _DEFAULT_DEMO_IMAGE
+
     log_format = getattr(ctx.obj, "log_format", "console") if ctx.obj else "console"
     launch_demo(image, dev, gpu=gpu, log_format=log_format)
diff --git a/tests/cli/test_demo.py b/tests/cli/test_demo.py
index e6e829110..d5bc0c84e 100644
--- a/tests/cli/test_demo.py
+++ b/tests/cli/test_demo.py
@@ -153,3 +153,32 @@ def test_gpu_defaults_to_false(self):
             assert result.exit_code == 0, result.output
             mock_launch.assert_called_once()
             assert mock_launch.call_args.kwargs["gpu"] is False
+
+
+class TestDemoCliDefaultImage:
+    """--gpu without --image should pick the GPU-capable default image."""
+
+    def test_gpu_without_image_uses_gpu_default(self):
+        from flyte.cli._start import _DEFAULT_DEMO_GPU_IMAGE
+
+        runner = CliRunner()
+        with patch("flyte.cli._demo.launch_demo") as mock_launch:
+            result = runner.invoke(demo, ["--gpu"])
+            assert result.exit_code == 0, result.output
+            assert mock_launch.call_args.args[0] == _DEFAULT_DEMO_GPU_IMAGE
+
+    def test_no_flags_uses_cpu_default(self):
+        from flyte.cli._start import _DEFAULT_DEMO_IMAGE
+
+        runner = CliRunner()
+        with patch("flyte.cli._demo.launch_demo") as mock_launch:
+            result = runner.invoke(demo, [])
+            assert result.exit_code == 0, result.output
+            assert mock_launch.call_args.args[0] == _DEFAULT_DEMO_IMAGE
+
+    def test_explicit_image_with_gpu_is_respected(self):
+        runner = CliRunner()
+        with patch("flyte.cli._demo.launch_demo") as mock_launch:
+            result = runner.invoke(demo, ["--gpu", "--image", "myorg/custom:latest"])
+            assert result.exit_code == 0, result.output
+            assert mock_launch.call_args.args[0] == "myorg/custom:latest"