From 7bca4cc57b43d4a0e3a75321ac385b433ee5ce0e Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 24 Apr 2026 23:07:14 +0000 Subject: [PATCH 1/3] feat(backends): add CUDA 13 + L4T arm64 CUDA 13 variants for vllm/vllm-omni/sglang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds new build profiles mirroring the diffusers/ace-step pattern so vLLM serving (and SGLang on arm64) can be deployed on CUDA 13 hosts and JetPack 7 boards: - vllm: cublas13 (PyPI cu130 channel) + l4t13 (jetson-ai-lab SBSA cu130 prebuilt vllm + flash-attn). - vllm-omni: cublas13 + l4t13. Floats vllm version on cu13 since vllm 0.19+ ships cu130 wheels by default and vllm-omni tracks vllm master; cu12 path keeps the 0.14.0 pin to avoid disturbing existing images. - sglang: l4t13 arm64 only — uses the prebuilt sglang wheel from the jetson-ai-lab SBSA cu130 index, so no source build is needed. Cublas13 sglang on x86_64 is intentionally deferred. CI matrix gains five new images (-gpu-nvidia-cuda-13-vllm{,-omni}, -nvidia-l4t-cuda-13-arm64-{vllm,vllm-omni,sglang}); backend/index.yaml gains the matching capability keys (nvidia-cuda-13, nvidia-l4t-cuda-13) and latest/development merge entries. 
Assisted-by: Claude:claude-opus-4-7 [Read] [Edit] [Write] [Bash] --- .github/workflows/backend.yml | 65 +++++++++++++++++++ backend/index.yaml | 62 ++++++++++++++++++ backend/python/sglang/install.sh | 9 +++ backend/python/sglang/requirements-l4t13.txt | 7 ++ backend/python/vllm-omni/install.sh | 25 ++++++- .../vllm-omni/requirements-cublas13.txt | 5 ++ .../python/vllm-omni/requirements-l4t13.txt | 13 ++++ backend/python/vllm/install.sh | 12 ++++ .../vllm/requirements-cublas13-after.txt | 2 + backend/python/vllm/requirements-cublas13.txt | 5 ++ .../python/vllm/requirements-l4t13-after.txt | 2 + backend/python/vllm/requirements-l4t13.txt | 8 +++ 12 files changed, 213 insertions(+), 2 deletions(-) create mode 100644 backend/python/sglang/requirements-l4t13.txt create mode 100644 backend/python/vllm-omni/requirements-cublas13.txt create mode 100644 backend/python/vllm-omni/requirements-l4t13.txt create mode 100644 backend/python/vllm/requirements-cublas13-after.txt create mode 100644 backend/python/vllm/requirements-cublas13.txt create mode 100644 backend/python/vllm/requirements-l4t13-after.txt create mode 100644 backend/python/vllm/requirements-l4t13.txt diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index 9ceb9d4f9a09..f9295af70c35 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -920,6 +920,32 @@ jobs: backend: "turboquant" dockerfile: "./backend/Dockerfile.turboquant" context: "./" + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-13-vllm' + runs-on: 'arc-runner-set' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "vllm" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-13-vllm-omni' + 
runs-on: 'arc-runner-set' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "vllm-omni" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "13" cuda-minor-version: "0" @@ -1076,6 +1102,45 @@ jobs: backend: "diffusers" dockerfile: "./backend/Dockerfile.python" context: "./" + - build-type: 'l4t' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/arm64' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-cuda-13-arm64-vllm' + runs-on: 'ubuntu-24.04-arm' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + ubuntu-version: '2404' + backend: "vllm" + dockerfile: "./backend/Dockerfile.python" + context: "./" + - build-type: 'l4t' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/arm64' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-cuda-13-arm64-vllm-omni' + runs-on: 'ubuntu-24.04-arm' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + ubuntu-version: '2404' + backend: "vllm-omni" + dockerfile: "./backend/Dockerfile.python" + context: "./" + - build-type: 'l4t' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/arm64' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-cuda-13-arm64-sglang' + runs-on: 'ubuntu-24.04-arm' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + ubuntu-version: '2404' + backend: "sglang" + dockerfile: "./backend/Dockerfile.python" + context: "./" - build-type: 'l4t' cuda-major-version: "13" cuda-minor-version: "0" diff --git a/backend/index.yaml b/backend/index.yaml index 63726c5b620e..38b8f798bea0 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -263,6 +263,8 @@ amd: "rocm-vllm" intel: "intel-vllm" nvidia-cuda-12: "cuda12-vllm" + nvidia-cuda-13: "cuda13-vllm" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm" cpu: "cpu-vllm" - &sglang name: "sglang" @@ -285,6 +287,7 @@ amd: "rocm-sglang" intel: "intel-sglang" nvidia-cuda-12: "cuda12-sglang" + nvidia-l4t-cuda-13: 
"cuda13-nvidia-l4t-arm64-sglang" cpu: "cpu-sglang" - &vllm-omni name: "vllm-omni" @@ -311,6 +314,8 @@ nvidia: "cuda12-vllm-omni" amd: "rocm-vllm-omni" nvidia-cuda-12: "cuda12-vllm-omni" + nvidia-cuda-13: "cuda13-vllm-omni" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm-omni" - &mlx name: "mlx" uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx" @@ -1814,12 +1819,25 @@ nvidia: "cuda12-vllm-development" amd: "rocm-vllm-development" intel: "intel-vllm-development" + nvidia-cuda-12: "cuda12-vllm-development" + nvidia-cuda-13: "cuda13-vllm-development" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm-development" cpu: "cpu-vllm-development" - !!merge <<: *vllm name: "cuda12-vllm" uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-vllm" mirrors: - localai/localai-backends:latest-gpu-nvidia-cuda-12-vllm +- !!merge <<: *vllm + name: "cuda13-vllm" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-vllm" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-13-vllm +- !!merge <<: *vllm + name: "cuda13-nvidia-l4t-arm64-vllm" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-vllm" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-vllm - !!merge <<: *vllm name: "rocm-vllm" uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-vllm" @@ -1840,6 +1858,16 @@ uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-vllm" mirrors: - localai/localai-backends:master-gpu-nvidia-cuda-12-vllm +- !!merge <<: *vllm + name: "cuda13-vllm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-vllm" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-13-vllm +- !!merge <<: *vllm + name: "cuda13-nvidia-l4t-arm64-vllm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-vllm" + mirrors: + - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-vllm - !!merge <<: *vllm name: 
"rocm-vllm-development" uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-vllm" @@ -1862,12 +1890,19 @@ nvidia: "cuda12-sglang-development" amd: "rocm-sglang-development" intel: "intel-sglang-development" + nvidia-cuda-12: "cuda12-sglang-development" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-sglang-development" cpu: "cpu-sglang-development" - !!merge <<: *sglang name: "cuda12-sglang" uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-sglang" mirrors: - localai/localai-backends:latest-gpu-nvidia-cuda-12-sglang +- !!merge <<: *sglang + name: "cuda13-nvidia-l4t-arm64-sglang" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-sglang" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-sglang - !!merge <<: *sglang name: "rocm-sglang" uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-sglang" @@ -1888,6 +1923,11 @@ uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-sglang" mirrors: - localai/localai-backends:master-gpu-nvidia-cuda-12-sglang +- !!merge <<: *sglang + name: "cuda13-nvidia-l4t-arm64-sglang-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-sglang" + mirrors: + - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-sglang - !!merge <<: *sglang name: "rocm-sglang-development" uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-sglang" @@ -1910,11 +1950,23 @@ nvidia: "cuda12-vllm-omni-development" amd: "rocm-vllm-omni-development" nvidia-cuda-12: "cuda12-vllm-omni-development" + nvidia-cuda-13: "cuda13-vllm-omni-development" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm-omni-development" - !!merge <<: *vllm-omni name: "cuda12-vllm-omni" uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-vllm-omni" mirrors: - localai/localai-backends:latest-gpu-nvidia-cuda-12-vllm-omni +- !!merge <<: *vllm-omni + name: "cuda13-vllm-omni" + uri: 
"quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-vllm-omni" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-13-vllm-omni +- !!merge <<: *vllm-omni + name: "cuda13-nvidia-l4t-arm64-vllm-omni" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-vllm-omni" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-vllm-omni - !!merge <<: *vllm-omni name: "rocm-vllm-omni" uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-vllm-omni" @@ -1925,6 +1977,16 @@ uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-vllm-omni" mirrors: - localai/localai-backends:master-gpu-nvidia-cuda-12-vllm-omni +- !!merge <<: *vllm-omni + name: "cuda13-vllm-omni-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-vllm-omni" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-13-vllm-omni +- !!merge <<: *vllm-omni + name: "cuda13-nvidia-l4t-arm64-vllm-omni-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-vllm-omni" + mirrors: + - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-vllm-omni - !!merge <<: *vllm-omni name: "rocm-vllm-omni-development" uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-vllm-omni" diff --git a/backend/python/sglang/install.sh b/backend/python/sglang/install.sh index 0f8f1f63456a..22539b5bcd20 100755 --- a/backend/python/sglang/install.sh +++ b/backend/python/sglang/install.sh @@ -23,6 +23,15 @@ if [ "x${BUILD_PROFILE}" == "xcpu" ]; then EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match" fi +# JetPack 7 / L4T arm64 wheels are built for cp312 and shipped via +# pypi.jetson-ai-lab.io. Bump the venv Python so the prebuilt sglang +# wheel resolves cleanly. 
+if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then + PYTHON_VERSION="3.12" + PYTHON_PATCH="12" + PY_STANDALONE_TAG="20251120" +fi + # sglang's CPU path has no prebuilt wheel on PyPI — upstream publishes # a separate pyproject_cpu.toml that must be swapped in before `pip install`. # Reference: docker/xeon.Dockerfile in the sglang upstream repo. diff --git a/backend/python/sglang/requirements-l4t13.txt b/backend/python/sglang/requirements-l4t13.txt new file mode 100644 index 000000000000..ad9908650963 --- /dev/null +++ b/backend/python/sglang/requirements-l4t13.txt @@ -0,0 +1,7 @@ +--extra-index-url https://pypi.jetson-ai-lab.io/sbsa/cu130 +accelerate +torch +torchvision +torchaudio +transformers +sglang[all] diff --git a/backend/python/vllm-omni/install.sh b/backend/python/vllm-omni/install.sh index 3aa6367d33ef..889a130c739b 100755 --- a/backend/python/vllm-omni/install.sh +++ b/backend/python/vllm-omni/install.sh @@ -26,7 +26,11 @@ fi # Install base requirements first installRequirements -# Install vllm based on build type +# Install vllm based on build type. vllm-omni tracks vllm master from +# source (cloned below) so we leave the upstream vllm dependency unpinned +# — vllm 0.19+ ships cu130 wheels by default, which is what we want for +# cublas13. Older cuda12/rocm/cpu paths still resolve a compatible wheel +# from the relevant channel. if [ "x${BUILD_TYPE}" == "xhipblas" ]; then # ROCm if [ "x${USE_PIP}" == "xtrue" ]; then @@ -34,8 +38,25 @@ if [ "x${BUILD_TYPE}" == "xhipblas" ]; then else uv pip install vllm==0.14.0 --extra-index-url https://wheels.vllm.ai/rocm/0.14.0/rocm700 fi +elif [ "x${BUILD_PROFILE}" == "xl4t13" ]; then + # JetPack 7 / L4T arm64 cu130 — vllm comes from the prebuilt SBSA wheel + # at jetson-ai-lab. Version is unpinned: the index ships whatever build + # matches the cu130/cp312 ABI. 
+ if [ "x${USE_PIP}" == "xtrue" ]; then + pip install vllm --extra-index-url https://pypi.jetson-ai-lab.io/sbsa/cu130 + else + uv pip install vllm --extra-index-url https://pypi.jetson-ai-lab.io/sbsa/cu130 + fi +elif [ "x${BUILD_PROFILE}" == "xcublas13" ]; then + # vllm 0.19+ defaults to cu130 wheels on PyPI, no extra index needed. + if [ "x${USE_PIP}" == "xtrue" ]; then + pip install vllm --torch-backend=auto + else + uv pip install vllm --torch-backend=auto + fi elif [ "x${BUILD_TYPE}" == "xcublas" ] || [ "x${BUILD_TYPE}" == "x" ]; then - # CUDA (default) or CPU + # cuda12 / CPU — keep the 0.14.0 pin for compatibility with the existing + # cuda12 vllm-omni image; bumping should be its own change. if [ "x${USE_PIP}" == "xtrue" ]; then pip install vllm==0.14.0 --torch-backend=auto else diff --git a/backend/python/vllm-omni/requirements-cublas13.txt b/backend/python/vllm-omni/requirements-cublas13.txt new file mode 100644 index 000000000000..4ef40a539eb5 --- /dev/null +++ b/backend/python/vllm-omni/requirements-cublas13.txt @@ -0,0 +1,5 @@ +--extra-index-url https://download.pytorch.org/whl/cu130 +accelerate +torch +transformers +bitsandbytes diff --git a/backend/python/vllm-omni/requirements-l4t13.txt b/backend/python/vllm-omni/requirements-l4t13.txt new file mode 100644 index 000000000000..ff6f8e5b7817 --- /dev/null +++ b/backend/python/vllm-omni/requirements-l4t13.txt @@ -0,0 +1,13 @@ +--extra-index-url https://pypi.jetson-ai-lab.io/sbsa/cu130 +accelerate +torch +torchvision +torchaudio +transformers +bitsandbytes +flash-attn +diffusers +librosa +soundfile +pillow +numpy diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh index cf6fa7efe1c3..6a6da6360394 100755 --- a/backend/python/vllm/install.sh +++ b/backend/python/vllm/install.sh @@ -32,6 +32,18 @@ if [ "x${BUILD_PROFILE}" == "xcpu" ]; then EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match" fi +# JetPack 7 / L4T arm64 wheels (torch, vllm, flash-attn) live on +# 
pypi.jetson-ai-lab.io and are built for cp312, so bump the venv Python +# accordingly. JetPack 6 keeps cp310 + USE_PIP=true. +if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then + USE_PIP=true +fi +if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then + PYTHON_VERSION="3.12" + PYTHON_PATCH="12" + PY_STANDALONE_TAG="20251120" +fi + # FROM_SOURCE=true on a CPU build skips the prebuilt vllm wheel in # requirements-cpu-after.txt and compiles vllm locally against the host's # actual CPU. Not used by default because it takes ~30-40 minutes, but diff --git a/backend/python/vllm/requirements-cublas13-after.txt b/backend/python/vllm/requirements-cublas13-after.txt new file mode 100644 index 000000000000..1644a5544581 --- /dev/null +++ b/backend/python/vllm/requirements-cublas13-after.txt @@ -0,0 +1,2 @@ +--extra-index-url https://download.pytorch.org/whl/cu130 +vllm diff --git a/backend/python/vllm/requirements-cublas13.txt b/backend/python/vllm/requirements-cublas13.txt new file mode 100644 index 000000000000..4ef40a539eb5 --- /dev/null +++ b/backend/python/vllm/requirements-cublas13.txt @@ -0,0 +1,5 @@ +--extra-index-url https://download.pytorch.org/whl/cu130 +accelerate +torch +transformers +bitsandbytes diff --git a/backend/python/vllm/requirements-l4t13-after.txt b/backend/python/vllm/requirements-l4t13-after.txt new file mode 100644 index 000000000000..01be2590bdc2 --- /dev/null +++ b/backend/python/vllm/requirements-l4t13-after.txt @@ -0,0 +1,2 @@ +--extra-index-url https://pypi.jetson-ai-lab.io/sbsa/cu130 +vllm diff --git a/backend/python/vllm/requirements-l4t13.txt b/backend/python/vllm/requirements-l4t13.txt new file mode 100644 index 000000000000..9afff7f6642d --- /dev/null +++ b/backend/python/vllm/requirements-l4t13.txt @@ -0,0 +1,8 @@ +--extra-index-url https://pypi.jetson-ai-lab.io/sbsa/cu130 +accelerate +torch +torchvision +torchaudio +transformers +bitsandbytes +flash-attn From 4a5010be814f9fda553e1a9ba42f6a2f4aa78e33 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: 
Sat, 25 Apr 2026 07:42:19 +0000 Subject: [PATCH 2/3] fix(backends): use unsafe-best-match index strategy on l4t13 builds The jetson-ai-lab SBSA cu130 index lists transitive deps (decord, etc.) at limited versions / older Python ABIs. uv defaults to the first index that contains a package and refuses to fall through to PyPI, so sglang l4t13 build fails resolving decord. Mirror the existing cpu sglang profile by setting --index-strategy=unsafe-best-match on l4t13 across the three backends, and apply it to the explicit vllm install line in vllm-omni's install.sh (which doesn't honor EXTRA_PIP_INSTALL_FLAGS). Assisted-by: Claude:claude-opus-4-7 [Read] [Edit] [Bash] --- backend/python/sglang/install.sh | 6 +++++- backend/python/vllm-omni/install.sh | 11 ++++++++--- backend/python/vllm/install.sh | 6 +++++- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/backend/python/sglang/install.sh b/backend/python/sglang/install.sh index 22539b5bcd20..f0acc08e64b7 100755 --- a/backend/python/sglang/install.sh +++ b/backend/python/sglang/install.sh @@ -25,11 +25,15 @@ fi # JetPack 7 / L4T arm64 wheels are built for cp312 and shipped via # pypi.jetson-ai-lab.io. Bump the venv Python so the prebuilt sglang -# wheel resolves cleanly. +# wheel resolves cleanly. unsafe-best-match is required because the +# jetson-ai-lab index lists transitive deps (e.g. decord) at older +# versions only — without it uv refuses to fall through to PyPI for a +# compatible wheel and resolution fails. 
if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then PYTHON_VERSION="3.12" PYTHON_PATCH="12" PY_STANDALONE_TAG="20251120" + EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match" fi # sglang's CPU path has no prebuilt wheel on PyPI — upstream publishes diff --git a/backend/python/vllm-omni/install.sh b/backend/python/vllm-omni/install.sh index 889a130c739b..9a42b472778c 100755 --- a/backend/python/vllm-omni/install.sh +++ b/backend/python/vllm-omni/install.sh @@ -12,11 +12,15 @@ else source $backend_dir/../common/libbackend.sh fi -# Handle l4t build profiles (Python 3.12, pip fallback) if needed +# Handle l4t build profiles (Python 3.12, pip fallback) if needed. +# unsafe-best-match is required on l4t13 because the jetson-ai-lab index +# lists transitive deps at limited versions — without it uv pins to the +# first matching index and fails to resolve a compatible wheel from PyPI. if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then PYTHON_VERSION="3.12" PYTHON_PATCH="12" PY_STANDALONE_TAG="20251120" + EXTRA_PIP_INSTALL_FLAGS="${EXTRA_PIP_INSTALL_FLAGS:-} --index-strategy=unsafe-best-match" fi if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then @@ -41,11 +45,12 @@ if [ "x${BUILD_TYPE}" == "xhipblas" ]; then elif [ "x${BUILD_PROFILE}" == "xl4t13" ]; then # JetPack 7 / L4T arm64 cu130 — vllm comes from the prebuilt SBSA wheel # at jetson-ai-lab. Version is unpinned: the index ships whatever build - # matches the cu130/cp312 ABI. + # matches the cu130/cp312 ABI. unsafe-best-match lets uv fall through + # to PyPI for transitive deps not present on the jetson-ai-lab index. 
if [ "x${USE_PIP}" == "xtrue" ]; then pip install vllm --extra-index-url https://pypi.jetson-ai-lab.io/sbsa/cu130 else - uv pip install vllm --extra-index-url https://pypi.jetson-ai-lab.io/sbsa/cu130 + uv pip install --index-strategy=unsafe-best-match vllm --extra-index-url https://pypi.jetson-ai-lab.io/sbsa/cu130 fi elif [ "x${BUILD_PROFILE}" == "xcublas13" ]; then # vllm 0.19+ defaults to cu130 wheels on PyPI, no extra index needed. diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh index 6a6da6360394..4660956340d3 100755 --- a/backend/python/vllm/install.sh +++ b/backend/python/vllm/install.sh @@ -34,7 +34,10 @@ fi # JetPack 7 / L4T arm64 wheels (torch, vllm, flash-attn) live on # pypi.jetson-ai-lab.io and are built for cp312, so bump the venv Python -# accordingly. JetPack 6 keeps cp310 + USE_PIP=true. +# accordingly. JetPack 6 keeps cp310 + USE_PIP=true. unsafe-best-match +# is required because the jetson-ai-lab index lists transitive deps at +# limited versions — without it uv pins to the first matching index and +# fails to resolve a compatible wheel from PyPI. if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then USE_PIP=true fi @@ -42,6 +45,7 @@ if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then PYTHON_VERSION="3.12" PYTHON_PATCH="12" PY_STANDALONE_TAG="20251120" + EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match" fi # FROM_SOURCE=true on a CPU build skips the prebuilt vllm wheel in From 6753a21198f29b691d410c42682aa22626b9f9cb Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 25 Apr 2026 08:54:07 +0000 Subject: [PATCH 3/3] fix(sglang): drop [all] extras on l4t13, floor version at 0.5.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The [all] extra brings in outlines→decord, and decord has no aarch64 cp312 wheel on PyPI nor the jetson-ai-lab index (only legacy cp35-cp37 tags). 
With unsafe-best-match enabled, uv backtracked through sglang versions trying to satisfy decord and silently landed on sglang==0.1.16, an ancient version with an entirely different dep tree (cloudpickle/outlines 0.0.44, etc.). Drop [all] so decord is no longer required, and floor sglang at 0.5.0 to prevent any future resolver misfire from degrading the version again. Assisted-by: Claude:claude-opus-4-7 [Read] [Edit] [Bash] Signed-off-by: Ettore Di Giacinto --- backend/python/sglang/requirements-l4t13.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/backend/python/sglang/requirements-l4t13.txt b/backend/python/sglang/requirements-l4t13.txt index ad9908650963..81de3f13d342 100644 --- a/backend/python/sglang/requirements-l4t13.txt +++ b/backend/python/sglang/requirements-l4t13.txt @@ -4,4 +4,9 @@ torch torchvision torchaudio transformers -sglang[all] +# Drop the [all] extra: it pulls outlines/decord, and decord has no +# aarch64 cp312 wheel on either PyPI or the jetson-ai-lab index (only +# legacy cp35-cp37 tags exist). With [all] uv backtracks through +# versions trying to satisfy decord and lands on sglang==0.1.16. Floor +# at 0.5.0 so uv can't silently downgrade if a future resolution misfires. +sglang>=0.5.0