From 161eb225936b00a82004bc9c2a7fd9a8bca558b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 21 Apr 2026 13:59:14 -0700 Subject: [PATCH 1/2] feat: multi-Python worker images with startup version check (AE-2827) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Python 3.10 and 3.11 support to GPU worker images via side-by-side torch install in the existing runpod/pytorch base. 3.12 keeps the fast path (torch pre-installed) to avoid the ~7 GB reinstall cost on hot deployments; 3.10/3.11 images pay that cost once per cold start per DC. Sibling to flash#322, which landed the SDK-level plumbing. Tags follow the same ``py${VERSION}-${TAG}`` scheme already in use for CPU images. - Dockerfile / Dockerfile-lb (GPU): accept PYTHON_VERSION build arg; install torch from download.pytorch.org/whl/cu128 and repoint /usr/local/bin/python for non-3.12 targets; validate interpreter matches the arg during build. - Dockerfile-cpu / Dockerfile-lb-cpu (CPU): surface PYTHON_VERSION at runtime via FLASH_PYTHON_VERSION env so the worker's startup check can read it. - src/version.py: new ``assert_python_version_matches_image`` — raises PythonVersionMismatchError at handler boot when ``sys.version_info`` disagrees with the image's stamped FLASH_PYTHON_VERSION. Caught before user code runs; skipped when the env var is unset (local dev). - src/handler.py / src/lb_handler.py: call the assertion immediately after logging setup, before ``maybe_unpack()`` and handler import. - tests/unit/test_version.py: 4 new cases covering env-unset skip, match, mismatch raise, and message contents. - tests/unit/test_lb_handler.py: extend the mocked ``version`` module with ``assert_python_version_matches_image`` so fresh-import tests don't break. - .github/workflows/ci.yml: expand CI to build GPU and LB images across {3.10, 3.11, 3.12} and run the CPU and LB-CPU test jobs over the same matrix (previously 3.11 only); align prod CPU and LB-CPU defaults to 3.12 (matches flash's DEFAULT_PYTHON_VERSION). 
--- .github/workflows/ci.yml | 131 +++++++++++++++++++++++++++++----- Dockerfile | 44 ++++++++---- Dockerfile-cpu | 5 ++ Dockerfile-lb | 42 +++++++---- Dockerfile-lb-cpu | 5 ++ src/handler.py | 6 +- src/lb_handler.py | 6 +- src/version.py | 31 ++++++++ tests/unit/test_lb_handler.py | 1 + tests/unit/test_version.py | 38 ++++++++++ 10 files changed, 263 insertions(+), 46 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 30975f8..307e552 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -73,6 +73,9 @@ jobs: docker-test: runs-on: ubuntu-latest if: github.event_name != 'pull_request' || github.head_ref != 'release-please--branches--main' + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12"] steps: - name: Checkout repository uses: actions/checkout@v4 @@ -89,19 +92,22 @@ jobs: push: false tags: flash-cpu:test build-args: | - PYTHON_VERSION=3.11 - cache-from: type=gha - cache-to: type=gha,mode=max + PYTHON_VERSION=${{ matrix.python-version }} + cache-from: type=gha,scope=cpu-test-py${{ matrix.python-version }} + cache-to: type=gha,mode=max,scope=cpu-test-py${{ matrix.python-version }} load: true - name: Test CPU handler execution in Docker environment run: | - echo "Testing CPU handler in Docker environment..." + echo "Testing CPU handler (Python ${{ matrix.python-version }})..." 
docker run --rm flash-cpu:test ./test-handler.sh docker-test-lb-cpu: runs-on: ubuntu-latest if: github.event_name != 'pull_request' || github.head_ref != 'release-please--branches--main' + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12"] steps: - name: Checkout repository uses: actions/checkout@v4 @@ -118,24 +124,97 @@ jobs: push: false tags: flash-lb-cpu:test build-args: | - PYTHON_VERSION=3.11 - cache-from: type=gha - cache-to: type=gha,mode=max + PYTHON_VERSION=${{ matrix.python-version }} + cache-from: type=gha,scope=lb-cpu-test-py${{ matrix.python-version }} + cache-to: type=gha,mode=max,scope=lb-cpu-test-py${{ matrix.python-version }} load: true - name: Test LB handler execution in Docker environment run: | - echo "Testing LB handler in Docker environment..." + echo "Testing LB handler (Python ${{ matrix.python-version }})..." docker run --rm flash-lb-cpu:test ./test-lb-handler.sh + docker-test-gpu: + runs-on: ubuntu-latest + if: github.event_name != 'pull_request' || github.head_ref != 'release-please--branches--main' + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.11", "3.12"] + steps: + - name: Clear space + run: | + rm -rf /usr/share/dotnet /opt/ghc /usr/local/share/boost "$AGENT_TOOLSDIRECTORY" + docker system prune -af + df -h + + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build GPU Docker image + uses: docker/build-push-action@v6 + with: + context: . 
+ file: ./Dockerfile + platforms: linux/amd64 + push: false + tags: flash-gpu:test + build-args: | + PYTHON_VERSION=${{ matrix.python-version }} + cache-from: type=gha,scope=gpu-test-py${{ matrix.python-version }} + cache-to: type=gha,mode=max,scope=gpu-test-py${{ matrix.python-version }} + + docker-test-lb: + runs-on: ubuntu-latest + if: github.event_name != 'pull_request' || github.head_ref != 'release-please--branches--main' + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.11", "3.12"] + steps: + - name: Clear space + run: | + rm -rf /usr/share/dotnet /opt/ghc /usr/local/share/boost "$AGENT_TOOLSDIRECTORY" + docker system prune -af + df -h + + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build GPU Load Balancer Docker image + uses: docker/build-push-action@v6 + with: + context: . + file: ./Dockerfile-lb + platforms: linux/amd64 + push: false + tags: flash-lb:test + build-args: | + PYTHON_VERSION=${{ matrix.python-version }} + cache-from: type=gha,scope=lb-test-py${{ matrix.python-version }} + cache-to: type=gha,mode=max,scope=lb-test-py${{ matrix.python-version }} + docker-validation: runs-on: ubuntu-latest - needs: [test, lint, docker-test, docker-test-lb-cpu] + needs: [test, lint, docker-test, docker-test-lb-cpu, docker-test-gpu, docker-test-lb] if: always() steps: - name: Check all jobs succeeded run: | - results=("${{ needs.test.result }}" "${{ needs.lint.result }}" "${{ needs.docker-test.result }}" "${{ needs.docker-test-lb-cpu.result }}") + results=( + "${{ needs.test.result }}" + "${{ needs.lint.result }}" + "${{ needs.docker-test.result }}" + "${{ needs.docker-test-lb-cpu.result }}" + "${{ needs.docker-test-gpu.result }}" + "${{ needs.docker-test-lb.result }}" + ) for result in "${results[@]}"; do if [[ "$result" != "success" && "$result" != "skipped" ]]; then echo "One or more quality checks failed (got: $result)" @@ -168,8 +247,13 @@ 
jobs: needs: [release] if: needs.release.outputs.release_created strategy: + fail-fast: false matrix: include: + - python-version: "3.10" + is-default: false + - python-version: "3.11" + is-default: false - python-version: "3.12" is-default: true steps: @@ -226,22 +310,25 @@ jobs: platforms: linux/amd64 push: true tags: ${{ steps.tags.outputs.tags }} - cache-from: type=gha,scope=gpu - cache-to: type=gha,mode=max,scope=gpu + build-args: | + PYTHON_VERSION=${{ matrix.python-version }} + cache-from: type=gha,scope=gpu-py${{ matrix.python-version }} + cache-to: type=gha,mode=max,scope=gpu-py${{ matrix.python-version }} docker-prod-cpu: runs-on: ubuntu-latest needs: [release] if: needs.release.outputs.release_created strategy: + fail-fast: false matrix: include: - python-version: "3.10" is-default: false - python-version: "3.11" - is-default: true - - python-version: "3.12" is-default: false + - python-version: "3.12" + is-default: true steps: - name: Clear Space run: | @@ -306,8 +393,13 @@ jobs: needs: [release] if: needs.release.outputs.release_created strategy: + fail-fast: false matrix: include: + - python-version: "3.10" + is-default: false + - python-version: "3.11" + is-default: false - python-version: "3.12" is-default: true steps: @@ -364,22 +456,25 @@ jobs: platforms: linux/amd64 push: true tags: ${{ steps.tags.outputs.tags }} - cache-from: type=gha,scope=lb - cache-to: type=gha,mode=max,scope=lb + build-args: | + PYTHON_VERSION=${{ matrix.python-version }} + cache-from: type=gha,scope=lb-py${{ matrix.python-version }} + cache-to: type=gha,mode=max,scope=lb-py${{ matrix.python-version }} docker-prod-lb-cpu: runs-on: ubuntu-latest needs: [release] if: needs.release.outputs.release_created strategy: + fail-fast: false matrix: include: - python-version: "3.10" is-default: false - python-version: "3.11" - is-default: true - - python-version: "3.12" is-default: false + - python-version: "3.12" + is-default: true steps: - name: Clear Space run: | diff --git 
a/Dockerfile b/Dockerfile index a0f220f..62504b7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,15 +1,30 @@ -# Base image provides Python 3.12 (from runpod/pytorch:1.0.3-cu1281-torch291-ubuntu2204) +# Base image provides Python 3.9-3.13 via deadsnakes; only 3.12 has torch +# pre-installed. For 3.10 and 3.11 we reinstall torch from the CUDA 12.8 +# wheel index (~7 GB overhead) and repoint /usr/local/bin/python so the +# worker CMD picks up the correct interpreter. FROM runpod/pytorch:1.0.3-cu1281-torch291-ubuntu2204 -# Use the base image's Python as-is to preserve pre-installed packages (torch, cuda libs). -# The pytorch base image provides its own Python with torch already installed. -# Symlinking to /usr/bin/python3.X would switch to a bare system Python without torch. -# Validate that the base image provides the expected Python version. -ARG EXPECTED_PYTHON_VERSION=3.12 -RUN python --version && \ - actual=$(python -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')") && \ - if [ "$actual" != "$EXPECTED_PYTHON_VERSION" ]; then \ - echo "ERROR: Expected Python $EXPECTED_PYTHON_VERSION but base image provides $actual" && exit 1; \ +# Target Python version for the worker runtime. +ARG PYTHON_VERSION=3.12 +ARG TORCH_VERSION=2.9.1+cu128 +ARG TORCH_INDEX_URL=https://download.pytorch.org/whl/cu128 + +# Expose the target version to the running worker for startup validation. +ENV FLASH_PYTHON_VERSION=${PYTHON_VERSION} + +# Validate the base image provides the requested interpreter and activate it. +# For non-3.12 targets, install torch for the selected Python and repoint +# /usr/local/bin/python and python3 so downstream `python` invocations use it. +# For 3.12 we keep the base image's python/torch untouched to avoid the +# ~7 GB reinstall cost. 
+RUN python${PYTHON_VERSION} --version \ + && if [ "${PYTHON_VERSION}" != "3.12" ]; then \ + python${PYTHON_VERSION} -m ensurepip --upgrade \ + && python${PYTHON_VERSION} -m pip install --no-cache-dir \ + --index-url ${TORCH_INDEX_URL} \ + "torch==${TORCH_VERSION}" \ + && ln -sf "$(which python${PYTHON_VERSION})" /usr/local/bin/python \ + && ln -sf "$(which python${PYTHON_VERSION})" /usr/local/bin/python3; \ fi WORKDIR /app @@ -41,20 +56,21 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y --no-ins && rm -rf /var/lib/apt/lists/* # Copy app code and install dependencies -# Use --python to target the base image's Python (preserves torch in its site-packages) +# Use --python to target the active interpreter (preserves torch in its site-packages) COPY README.md pyproject.toml uv.lock ./ COPY src/ ./ RUN uv export --format requirements-txt --no-dev --no-hashes > requirements.txt \ && uv pip install --python $(which python) --break-system-packages -r requirements.txt -# Install numpy for the base image's Python version. +# Install numpy for the active Python version. # The runpod/pytorch image ships torch but not numpy. Flash build excludes numpy # from tarballs (BASE_IMAGE_PACKAGES) to save tarball space (~30 MB), so numpy # must be provided here in the base image. RUN python -m pip install --no-cache-dir numpy -# Verify torch and numpy are available from the base image -RUN python -c "import torch; print(f'torch {torch.__version__} CUDA {torch.cuda.is_available()}')" \ +# Verify torch, numpy, and the expected Python version are available. 
+RUN python -c "import sys; actual = f'{sys.version_info.major}.{sys.version_info.minor}'; expected = '${PYTHON_VERSION}'; assert actual == expected, f'Expected Python {expected}, got {actual}'; print(f'Python {actual} OK')" \ + && python -c "import torch; print(f'torch {torch.__version__} CUDA {torch.cuda.is_available()}')" \ && python -c "import numpy; print(f'numpy {numpy.__version__}')" CMD ["python", "handler.py"] diff --git a/Dockerfile-cpu b/Dockerfile-cpu index ec9894b..48c174f 100644 --- a/Dockerfile-cpu +++ b/Dockerfile-cpu @@ -1,6 +1,11 @@ ARG PYTHON_VERSION=3.12 FROM python:${PYTHON_VERSION}-slim +# Re-declare after FROM so the value is visible in this build stage, and +# expose it at runtime for the worker's startup version check. +ARG PYTHON_VERSION +ENV FLASH_PYTHON_VERSION=${PYTHON_VERSION} + WORKDIR /app # Prevent interactive prompts during package installation diff --git a/Dockerfile-lb b/Dockerfile-lb index 3d0ed1c..29eb4f3 100644 --- a/Dockerfile-lb +++ b/Dockerfile-lb @@ -1,13 +1,30 @@ -# Base image provides Python 3.12 (from runpod/pytorch:1.0.3-cu1281-torch291-ubuntu2204) +# Base image provides Python 3.9-3.13 via deadsnakes; only 3.12 has torch +# pre-installed. For 3.10 and 3.11 we reinstall torch from the CUDA 12.8 +# wheel index (~7 GB overhead) and repoint /usr/local/bin/python so the +# worker CMD picks up the correct interpreter. FROM runpod/pytorch:1.0.3-cu1281-torch291-ubuntu2204 -# Use the base image's Python as-is to preserve pre-installed packages (torch, cuda libs). -# Validate that the base image provides the expected Python version. -ARG EXPECTED_PYTHON_VERSION=3.12 -RUN python --version && \ - actual=$(python -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')") && \ - if [ "$actual" != "$EXPECTED_PYTHON_VERSION" ]; then \ - echo "ERROR: Expected Python $EXPECTED_PYTHON_VERSION but base image provides $actual" && exit 1; \ +# Target Python version for the worker runtime. 
+ARG PYTHON_VERSION=3.12 +ARG TORCH_VERSION=2.9.1+cu128 +ARG TORCH_INDEX_URL=https://download.pytorch.org/whl/cu128 + +# Expose the target version to the running worker for startup validation. +ENV FLASH_PYTHON_VERSION=${PYTHON_VERSION} + +# Validate the base image provides the requested interpreter and activate it. +# For non-3.12 targets, install torch for the selected Python and repoint +# /usr/local/bin/python and python3 so downstream `python` invocations use it. +# For 3.12 we keep the base image's python/torch untouched to avoid the +# ~7 GB reinstall cost. +RUN python${PYTHON_VERSION} --version \ + && if [ "${PYTHON_VERSION}" != "3.12" ]; then \ + python${PYTHON_VERSION} -m ensurepip --upgrade \ + && python${PYTHON_VERSION} -m pip install --no-cache-dir \ + --index-url ${TORCH_INDEX_URL} \ + "torch==${TORCH_VERSION}" \ + && ln -sf "$(which python${PYTHON_VERSION})" /usr/local/bin/python \ + && ln -sf "$(which python${PYTHON_VERSION})" /usr/local/bin/python3; \ fi WORKDIR /app @@ -39,20 +56,21 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y --no-ins && rm -rf /var/lib/apt/lists/* # Copy app code and install dependencies -# Use --python to target the base image's Python (preserves torch in its site-packages) +# Use --python to target the active interpreter (preserves torch in its site-packages) COPY README.md pyproject.toml uv.lock ./ COPY src/ ./ RUN uv export --format requirements-txt --no-dev --no-hashes > requirements.txt \ && uv pip install --python $(which python) --break-system-packages -r requirements.txt -# Install numpy for the base image's Python version. +# Install numpy for the active Python version. # The runpod/pytorch image ships torch but not numpy. Flash build excludes numpy # from tarballs (BASE_IMAGE_PACKAGES) to save tarball space (~30 MB), so numpy # must be provided here in the base image. 
RUN python -m pip install --no-cache-dir numpy -# Verify torch and numpy are available from the base image -RUN python -c "import torch; print(f'torch {torch.__version__} CUDA {torch.cuda.is_available()}')" \ +# Verify torch, numpy, and the expected Python version are available. +RUN python -c "import sys; actual = f'{sys.version_info.major}.{sys.version_info.minor}'; expected = '${PYTHON_VERSION}'; assert actual == expected, f'Expected Python {expected}, got {actual}'; print(f'Python {actual} OK')" \ + && python -c "import torch; print(f'torch {torch.__version__} CUDA {torch.cuda.is_available()}')" \ && python -c "import numpy; print(f'numpy {numpy.__version__}')" EXPOSE 80 diff --git a/Dockerfile-lb-cpu b/Dockerfile-lb-cpu index e534147..f99459f 100644 --- a/Dockerfile-lb-cpu +++ b/Dockerfile-lb-cpu @@ -1,6 +1,11 @@ ARG PYTHON_VERSION=3.12 FROM python:${PYTHON_VERSION}-slim +# Re-declare after FROM so the value is visible in this build stage, and +# expose it at runtime for the worker's startup version check. +ARG PYTHON_VERSION +ENV FLASH_PYTHON_VERSION=${PYTHON_VERSION} + WORKDIR /app # Prevent interactive prompts during package installation diff --git a/src/handler.py b/src/handler.py index a1245e7..d629928 100644 --- a/src/handler.py +++ b/src/handler.py @@ -8,13 +8,17 @@ from constants import MAX_IMPORT_RECOVERY_ATTEMPTS from logger import setup_logging from unpack_volume import maybe_unpack -from version import format_version_banner +from version import assert_python_version_matches_image, format_version_banner # Initialize logging configuration setup_logging() logger = logging.getLogger(__name__) +# Fail fast if the running interpreter disagrees with the image's advertised +# version — catches mis-tagged images before user code runs. 
+assert_python_version_matches_image() + # Unpack Flash deployment artifacts if running in Flash mode # This is a no-op for Live Serverless and local development maybe_unpack() diff --git a/src/lb_handler.py b/src/lb_handler.py index 8fa662b..6000a9f 100644 --- a/src/lb_handler.py +++ b/src/lb_handler.py @@ -29,12 +29,16 @@ from logger import setup_logging from unpack_volume import maybe_unpack -from version import format_version_banner +from version import assert_python_version_matches_image, format_version_banner # Initialize logging configuration setup_logging() logger = logging.getLogger(__name__) +# Fail fast if the running interpreter disagrees with the image's advertised +# version — catches mis-tagged images before user code runs. +assert_python_version_matches_image() + # Unpack Flash deployment artifacts if running in Flash mode # This is a no-op for Live Serverless and local development maybe_unpack() diff --git a/src/version.py b/src/version.py index 3fb437f..4cafabf 100644 --- a/src/version.py +++ b/src/version.py @@ -1,11 +1,42 @@ """Version utilities for flash-worker boot logging.""" +import os import platform +import sys from importlib.metadata import PackageNotFoundError, version __version__ = "1.4.4" # x-release-please-version +class PythonVersionMismatchError(RuntimeError): + """Raised when the running interpreter does not match the image's declared version.""" + + +def assert_python_version_matches_image() -> None: + """Fail fast if ``sys.version_info`` disagrees with ``FLASH_PYTHON_VERSION``. + + The Dockerfiles stamp ``FLASH_PYTHON_VERSION`` with the image's target + Python (e.g. ``3.11``). If an image is mis-tagged, an apt upgrade + changes ``python`` symlinks, or the GPU side-by-side torch install fails + silently, this surfaces the skew immediately at worker boot instead of + letting user code fail later with a confusing ABI error. + + Skips the check when ``FLASH_PYTHON_VERSION`` is unset (local dev, + test harnesses). 
+ """ + declared = os.environ.get("FLASH_PYTHON_VERSION") + if not declared: + return + + actual = f"{sys.version_info.major}.{sys.version_info.minor}" + if actual != declared: + raise PythonVersionMismatchError( + f"Worker interpreter mismatch: image declares FLASH_PYTHON_VERSION=" + f"{declared!r} but sys.version_info reports {actual!r}. " + f"Rebuild the image with the correct PYTHON_VERSION build arg." + ) + + def _get_version(package_name: str) -> str: try: return version(package_name) diff --git a/tests/unit/test_lb_handler.py b/tests/unit/test_lb_handler.py index d3dac76..dc43e05 100644 --- a/tests/unit/test_lb_handler.py +++ b/tests/unit/test_lb_handler.py @@ -16,6 +16,7 @@ # Mock heavy dependencies before importing lb_handler to prevent side effects _mock_version = MagicMock() _mock_version.format_version_banner = MagicMock(return_value="Starting Flash Worker vtest") +_mock_version.assert_python_version_matches_image = MagicMock(return_value=None) _MOCK_MODULES = { "logger": MagicMock(), diff --git a/tests/unit/test_version.py b/tests/unit/test_version.py index 253dc28..91d047e 100644 --- a/tests/unit/test_version.py +++ b/tests/unit/test_version.py @@ -1,12 +1,17 @@ """Tests for version utilities.""" import platform +import sys from importlib.metadata import PackageNotFoundError from unittest.mock import patch +import pytest + from version import ( + PythonVersionMismatchError, __version__, _get_version, + assert_python_version_matches_image, format_version_banner, get_flash_version, get_runpod_version, @@ -76,3 +81,36 @@ def test_banner_handles_unknown_versions(self, mock_worker, mock_flash, mock_run result == f"Starting Flash Worker unknown | Python {py} | runpod-flash unknown | runpod unknown" ) + + +class TestAssertPythonVersionMatchesImage: + """Tests for the AE-2827 Python version assertion.""" + + @pytest.fixture(autouse=True) + def _clear_env(self, monkeypatch): + monkeypatch.delenv("FLASH_PYTHON_VERSION", raising=False) + + def 
test_noop_when_env_var_unset(self): + """Local dev and test harnesses don't set FLASH_PYTHON_VERSION; skip check.""" + assert_python_version_matches_image() + + def test_passes_when_declared_matches_interpreter(self, monkeypatch): + actual = f"{sys.version_info.major}.{sys.version_info.minor}" + monkeypatch.setenv("FLASH_PYTHON_VERSION", actual) + assert_python_version_matches_image() + + def test_raises_on_mismatch(self, monkeypatch): + monkeypatch.setenv("FLASH_PYTHON_VERSION", "3.99") + with pytest.raises(PythonVersionMismatchError, match="interpreter mismatch"): + assert_python_version_matches_image() + + def test_mismatch_error_message_includes_both_versions(self, monkeypatch): + declared = "3.99" + monkeypatch.setenv("FLASH_PYTHON_VERSION", declared) + with pytest.raises(PythonVersionMismatchError) as excinfo: + assert_python_version_matches_image() + + actual = f"{sys.version_info.major}.{sys.version_info.minor}" + message = str(excinfo.value) + assert declared in message + assert actual in message From 0dee75c9c27f39035b2ec50e0bd24d1cdfe9cfd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Thu, 23 Apr 2026 03:21:47 -0700 Subject: [PATCH 2/2] fix(dockerfile): bootstrap pip via get-pip.py for non-3.12 GPU builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ubuntu 22.04's system python3.10 has ensurepip disabled by Debian policy, which broke the side-by-side torch install for 3.10 GPU images (CI: docker-test-gpu (3.10), docker-test-lb (3.10)). python3.11 is a separate interpreter without the disable, so only 3.10 was affected. Use urllib+get-pip.py instead of ensurepip — works for any interpreter regardless of distro patching, and urllib is stdlib so no curl dep. Also corrects the outdated deadsnakes comment on both Dockerfiles: the runpod/pytorch base image layers alt-Python 3.11/3.12 on top of the system 3.10, not via deadsnakes. 
--- Dockerfile | 17 ++++++++++++----- Dockerfile-lb | 17 ++++++++++++----- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index 62504b7..01a49ca 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,8 @@ -# Base image provides Python 3.9-3.13 via deadsnakes; only 3.12 has torch -# pre-installed. For 3.10 and 3.11 we reinstall torch from the CUDA 12.8 -# wheel index (~7 GB overhead) and repoint /usr/local/bin/python so the -# worker CMD picks up the correct interpreter. +# Base image (runpod/pytorch:ubuntu2204) ships Ubuntu 22.04 system python3.10 +# plus alt-Python interpreters for 3.11/3.12 with torch pre-installed on 3.12. +# For non-3.12 targets we reinstall torch from the CUDA 12.8 wheel index +# (~7 GB overhead) and repoint /usr/local/bin/python so the worker CMD picks +# up the correct interpreter. FROM runpod/pytorch:1.0.3-cu1281-torch291-ubuntu2204 # Target Python version for the worker runtime. @@ -17,9 +18,15 @@ ENV FLASH_PYTHON_VERSION=${PYTHON_VERSION} # /usr/local/bin/python and python3 so downstream `python` invocations use it. # For 3.12 we keep the base image's python/torch untouched to avoid the # ~7 GB reinstall cost. +# +# pip bootstrap: Ubuntu 22.04's system python3.10 has ensurepip disabled by +# Debian policy, so we install pip via get-pip.py (works for any interpreter +# regardless of distro patching). urllib is stdlib, avoiding a curl dependency. 
RUN python${PYTHON_VERSION} --version \ && if [ "${PYTHON_VERSION}" != "3.12" ]; then \ - python${PYTHON_VERSION} -m ensurepip --upgrade \ + python${PYTHON_VERSION} -c "import urllib.request; urllib.request.urlretrieve('https://bootstrap.pypa.io/get-pip.py', '/tmp/get-pip.py')" \ + && python${PYTHON_VERSION} /tmp/get-pip.py --no-cache-dir \ + && rm -f /tmp/get-pip.py \ && python${PYTHON_VERSION} -m pip install --no-cache-dir \ --index-url ${TORCH_INDEX_URL} \ "torch==${TORCH_VERSION}" \ diff --git a/Dockerfile-lb b/Dockerfile-lb index 29eb4f3..ff927c6 100644 --- a/Dockerfile-lb +++ b/Dockerfile-lb @@ -1,7 +1,8 @@ -# Base image provides Python 3.9-3.13 via deadsnakes; only 3.12 has torch -# pre-installed. For 3.10 and 3.11 we reinstall torch from the CUDA 12.8 -# wheel index (~7 GB overhead) and repoint /usr/local/bin/python so the -# worker CMD picks up the correct interpreter. +# Base image (runpod/pytorch:ubuntu2204) ships Ubuntu 22.04 system python3.10 +# plus alt-Python interpreters for 3.11/3.12 with torch pre-installed on 3.12. +# For non-3.12 targets we reinstall torch from the CUDA 12.8 wheel index +# (~7 GB overhead) and repoint /usr/local/bin/python so the worker CMD picks +# up the correct interpreter. FROM runpod/pytorch:1.0.3-cu1281-torch291-ubuntu2204 # Target Python version for the worker runtime. @@ -17,9 +18,15 @@ ENV FLASH_PYTHON_VERSION=${PYTHON_VERSION} # /usr/local/bin/python and python3 so downstream `python` invocations use it. # For 3.12 we keep the base image's python/torch untouched to avoid the # ~7 GB reinstall cost. +# +# pip bootstrap: Ubuntu 22.04's system python3.10 has ensurepip disabled by +# Debian policy, so we install pip via get-pip.py (works for any interpreter +# regardless of distro patching). urllib is stdlib, avoiding a curl dependency. 
RUN python${PYTHON_VERSION} --version \ && if [ "${PYTHON_VERSION}" != "3.12" ]; then \ - python${PYTHON_VERSION} -m ensurepip --upgrade \ + python${PYTHON_VERSION} -c "import urllib.request; urllib.request.urlretrieve('https://bootstrap.pypa.io/get-pip.py', '/tmp/get-pip.py')" \ + && python${PYTHON_VERSION} /tmp/get-pip.py --no-cache-dir \ + && rm -f /tmp/get-pip.py \ && python${PYTHON_VERSION} -m pip install --no-cache-dir \ --index-url ${TORCH_INDEX_URL} \ "torch==${TORCH_VERSION}" \