diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml
index 9ceb9d4f9a09..f9295af70c35 100644
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -920,6 +920,32 @@ jobs:
             backend: "turboquant"
             dockerfile: "./backend/Dockerfile.turboquant"
             context: "./"
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-vllm'
+            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            backend: "vllm"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./"
+            ubuntu-version: '2404'
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-vllm-omni'
+            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            backend: "vllm-omni"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./"
+            ubuntu-version: '2404'
           - build-type: 'cublas'
             cuda-major-version: "13"
             cuda-minor-version: "0"
@@ -1076,6 +1102,45 @@
             backend: "diffusers"
             dockerfile: "./backend/Dockerfile.python"
             context: "./"
+          - build-type: 'l4t'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/arm64'
+            tag-latest: 'auto'
+            tag-suffix: '-nvidia-l4t-cuda-13-arm64-vllm'
+            runs-on: 'ubuntu-24.04-arm'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            ubuntu-version: '2404'
+            backend: "vllm"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./"
+          - build-type: 'l4t'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/arm64'
+            tag-latest: 'auto'
+            tag-suffix: '-nvidia-l4t-cuda-13-arm64-vllm-omni'
+            runs-on: 'ubuntu-24.04-arm'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            ubuntu-version: '2404'
+            backend: "vllm-omni"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./"
+          - build-type: 'l4t'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/arm64'
+            tag-latest: 'auto'
+            tag-suffix: '-nvidia-l4t-cuda-13-arm64-sglang'
+            runs-on: 'ubuntu-24.04-arm'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            ubuntu-version: '2404'
+            backend: "sglang"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./"
           - build-type: 'l4t'
             cuda-major-version: "13"
             cuda-minor-version: "0"
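For reference, the tag-suffix values added above surface as the image tags that the index.yaml entries below point at. A hedged smoke check once the workflow has published them, with the tag names taken from the index entries later in this diff (availability of the images is assumed):

    # Pull one amd64 and one arm64/L4T tag to confirm the new images exist.
    docker pull quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-vllm
    docker pull quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-vllm-omni
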
"quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-vllm" mirrors: - localai/localai-backends:latest-gpu-nvidia-cuda-12-vllm +- !!merge <<: *vllm + name: "cuda13-vllm" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-vllm" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-13-vllm +- !!merge <<: *vllm + name: "cuda13-nvidia-l4t-arm64-vllm" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-vllm" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-vllm - !!merge <<: *vllm name: "rocm-vllm" uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-vllm" @@ -1840,6 +1858,16 @@ uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-vllm" mirrors: - localai/localai-backends:master-gpu-nvidia-cuda-12-vllm +- !!merge <<: *vllm + name: "cuda13-vllm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-vllm" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-13-vllm +- !!merge <<: *vllm + name: "cuda13-nvidia-l4t-arm64-vllm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-vllm" + mirrors: + - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-vllm - !!merge <<: *vllm name: "rocm-vllm-development" uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-vllm" @@ -1862,12 +1890,19 @@ nvidia: "cuda12-sglang-development" amd: "rocm-sglang-development" intel: "intel-sglang-development" + nvidia-cuda-12: "cuda12-sglang-development" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-sglang-development" cpu: "cpu-sglang-development" - !!merge <<: *sglang name: "cuda12-sglang" uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-sglang" mirrors: - localai/localai-backends:latest-gpu-nvidia-cuda-12-sglang +- !!merge <<: *sglang + name: "cuda13-nvidia-l4t-arm64-sglang" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-sglang" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-sglang - !!merge <<: *sglang name: "rocm-sglang" uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-sglang" @@ -1888,6 +1923,11 @@ uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-sglang" mirrors: - localai/localai-backends:master-gpu-nvidia-cuda-12-sglang +- !!merge <<: *sglang + name: "cuda13-nvidia-l4t-arm64-sglang-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-sglang" + mirrors: + - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-sglang - !!merge <<: *sglang name: "rocm-sglang-development" uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-sglang" @@ -1910,11 +1950,23 @@ nvidia: "cuda12-vllm-omni-development" amd: "rocm-vllm-omni-development" nvidia-cuda-12: "cuda12-vllm-omni-development" + nvidia-cuda-13: "cuda13-vllm-omni-development" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm-omni-development" - !!merge <<: *vllm-omni name: "cuda12-vllm-omni" uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-vllm-omni" mirrors: - localai/localai-backends:latest-gpu-nvidia-cuda-12-vllm-omni +- !!merge <<: *vllm-omni + name: "cuda13-vllm-omni" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-vllm-omni" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-13-vllm-omni +- !!merge <<: *vllm-omni + name: "cuda13-nvidia-l4t-arm64-vllm-omni" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-vllm-omni" + mirrors: + - 
 - !!merge <<: *vllm-omni
   name: "rocm-vllm-omni"
   uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-vllm-omni"
@@ -1925,6 +1977,16 @@
   uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-vllm-omni"
   mirrors:
   - localai/localai-backends:master-gpu-nvidia-cuda-12-vllm-omni
+- !!merge <<: *vllm-omni
+  name: "cuda13-vllm-omni-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-vllm-omni"
+  mirrors:
+  - localai/localai-backends:master-gpu-nvidia-cuda-13-vllm-omni
+- !!merge <<: *vllm-omni
+  name: "cuda13-nvidia-l4t-arm64-vllm-omni-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-vllm-omni"
+  mirrors:
+  - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-vllm-omni
 - !!merge <<: *vllm-omni
   name: "rocm-vllm-omni-development"
   uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-vllm-omni"
diff --git a/backend/python/sglang/install.sh b/backend/python/sglang/install.sh
index 0f8f1f63456a..f0acc08e64b7 100755
--- a/backend/python/sglang/install.sh
+++ b/backend/python/sglang/install.sh
@@ -23,6 +23,19 @@ if [ "x${BUILD_PROFILE}" == "xcpu" ]; then
     EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
 fi
 
+# JetPack 7 / L4T arm64 wheels are built for cp312 and shipped via
+# pypi.jetson-ai-lab.io. Bump the venv Python so the prebuilt sglang
+# wheel resolves cleanly. unsafe-best-match is required because the
+# jetson-ai-lab index lists transitive deps (e.g. decord) at older
+# versions only — without it uv refuses to fall through to PyPI for a
+# compatible wheel and resolution fails.
+if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
+    PYTHON_VERSION="3.12"
+    PYTHON_PATCH="12"
+    PY_STANDALONE_TAG="20251120"
+    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
+fi
+
 # sglang's CPU path has no prebuilt wheel on PyPI — upstream publishes
 # a separate pyproject_cpu.toml that must be swapped in before `pip install`.
 # Reference: docker/xeon.Dockerfile in the sglang upstream repo.
diff --git a/backend/python/sglang/requirements-l4t13.txt b/backend/python/sglang/requirements-l4t13.txt
new file mode 100644
index 000000000000..81de3f13d342
--- /dev/null
+++ b/backend/python/sglang/requirements-l4t13.txt
@@ -0,0 +1,12 @@
+--extra-index-url https://pypi.jetson-ai-lab.io/sbsa/cu130
+accelerate
+torch
+torchvision
+torchaudio
+transformers
+# Drop the [all] extra: it pulls in outlines/decord, and decord has no
+# aarch64 cp312 wheel anywhere (PyPI has none; the jetson-ai-lab index
+# ships only legacy cp35-cp37 builds). With [all], uv backtracks through
+# versions trying to satisfy decord and lands on sglang==0.1.16. Floor at
+# 0.5.0 so uv can't silently downgrade if a future resolution misfires.
+sglang>=0.5.0
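The resolver behaviour the two sglang comments above lean on can be checked without touching a venv. This is a sketch, assuming uv is installed and the jetson-ai-lab index is reachable; the flags mirror install.sh and requirements-l4t13.txt:

    # Resolve only, no install. With unsafe-best-match uv considers PyPI and
    # the jetson-ai-lab index together instead of pinning to the first index
    # that carries any matching version, which is what keeps sglang>=0.5.0
    # resolvable once the [all] extra (and therefore decord) is dropped.
    uv pip install --dry-run \
        --index-strategy=unsafe-best-match \
        --extra-index-url https://pypi.jetson-ai-lab.io/sbsa/cu130 \
        "sglang>=0.5.0"
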
if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then PYTHON_VERSION="3.12" PYTHON_PATCH="12" PY_STANDALONE_TAG="20251120" + EXTRA_PIP_INSTALL_FLAGS="${EXTRA_PIP_INSTALL_FLAGS:-} --index-strategy=unsafe-best-match" fi if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then @@ -26,7 +30,11 @@ fi # Install base requirements first installRequirements -# Install vllm based on build type +# Install vllm based on build type. vllm-omni tracks vllm master from +# source (cloned below) so we leave the upstream vllm dependency unpinned +# — vllm 0.19+ ships cu130 wheels by default, which is what we want for +# cublas13. Older cuda12/rocm/cpu paths still resolve a compatible wheel +# from the relevant channel. if [ "x${BUILD_TYPE}" == "xhipblas" ]; then # ROCm if [ "x${USE_PIP}" == "xtrue" ]; then @@ -34,8 +42,26 @@ if [ "x${BUILD_TYPE}" == "xhipblas" ]; then else uv pip install vllm==0.14.0 --extra-index-url https://wheels.vllm.ai/rocm/0.14.0/rocm700 fi +elif [ "x${BUILD_PROFILE}" == "xl4t13" ]; then + # JetPack 7 / L4T arm64 cu130 — vllm comes from the prebuilt SBSA wheel + # at jetson-ai-lab. Version is unpinned: the index ships whatever build + # matches the cu130/cp312 ABI. unsafe-best-match lets uv fall through + # to PyPI for transitive deps not present on the jetson-ai-lab index. + if [ "x${USE_PIP}" == "xtrue" ]; then + pip install vllm --extra-index-url https://pypi.jetson-ai-lab.io/sbsa/cu130 + else + uv pip install --index-strategy=unsafe-best-match vllm --extra-index-url https://pypi.jetson-ai-lab.io/sbsa/cu130 + fi +elif [ "x${BUILD_PROFILE}" == "xcublas13" ]; then + # vllm 0.19+ defaults to cu130 wheels on PyPI, no extra index needed. + if [ "x${USE_PIP}" == "xtrue" ]; then + pip install vllm --torch-backend=auto + else + uv pip install vllm --torch-backend=auto + fi elif [ "x${BUILD_TYPE}" == "xcublas" ] || [ "x${BUILD_TYPE}" == "x" ]; then - # CUDA (default) or CPU + # cuda12 / CPU — keep the 0.14.0 pin for compatibility with the existing + # cuda12 vllm-omni image; bumping should be its own change. if [ "x${USE_PIP}" == "xtrue" ]; then pip install vllm==0.14.0 --torch-backend=auto else diff --git a/backend/python/vllm-omni/requirements-cublas13.txt b/backend/python/vllm-omni/requirements-cublas13.txt new file mode 100644 index 000000000000..4ef40a539eb5 --- /dev/null +++ b/backend/python/vllm-omni/requirements-cublas13.txt @@ -0,0 +1,5 @@ +--extra-index-url https://download.pytorch.org/whl/cu130 +accelerate +torch +transformers +bitsandbytes diff --git a/backend/python/vllm-omni/requirements-l4t13.txt b/backend/python/vllm-omni/requirements-l4t13.txt new file mode 100644 index 000000000000..ff6f8e5b7817 --- /dev/null +++ b/backend/python/vllm-omni/requirements-l4t13.txt @@ -0,0 +1,13 @@ +--extra-index-url https://pypi.jetson-ai-lab.io/sbsa/cu130 +accelerate +torch +torchvision +torchaudio +transformers +bitsandbytes +flash-attn +diffusers +librosa +soundfile +pillow +numpy diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh index cf6fa7efe1c3..4660956340d3 100755 --- a/backend/python/vllm/install.sh +++ b/backend/python/vllm/install.sh @@ -32,6 +32,22 @@ if [ "x${BUILD_PROFILE}" == "xcpu" ]; then EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match" fi +# JetPack 7 / L4T arm64 wheels (torch, vllm, flash-attn) live on +# pypi.jetson-ai-lab.io and are built for cp312, so bump the venv Python +# accordingly. JetPack 6 keeps cp310 + USE_PIP=true. 
diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh
index cf6fa7efe1c3..4660956340d3 100755
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -32,6 +32,22 @@ if [ "x${BUILD_PROFILE}" == "xcpu" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
 fi
 
+# JetPack 7 / L4T arm64 wheels (torch, vllm, flash-attn) live on
+# pypi.jetson-ai-lab.io and are built for cp312, so bump the venv Python
+# accordingly. JetPack 6 keeps cp310 + USE_PIP=true. unsafe-best-match
+# is required because the jetson-ai-lab index lists transitive deps at
+# limited versions — without it uv pins to the first matching index and
+# fails to resolve a compatible wheel from PyPI.
+if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then
+    USE_PIP=true
+fi
+if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
+    PYTHON_VERSION="3.12"
+    PYTHON_PATCH="12"
+    PY_STANDALONE_TAG="20251120"
+    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
+fi
+
 # FROM_SOURCE=true on a CPU build skips the prebuilt vllm wheel in
 # requirements-cpu-after.txt and compiles vllm locally against the host's
 # actual CPU. Not used by default because it takes ~30-40 minutes, but
diff --git a/backend/python/vllm/requirements-cublas13-after.txt b/backend/python/vllm/requirements-cublas13-after.txt
new file mode 100644
index 000000000000..1644a5544581
--- /dev/null
+++ b/backend/python/vllm/requirements-cublas13-after.txt
@@ -0,0 +1,2 @@
+--extra-index-url https://download.pytorch.org/whl/cu130
+vllm
diff --git a/backend/python/vllm/requirements-cublas13.txt b/backend/python/vllm/requirements-cublas13.txt
new file mode 100644
index 000000000000..4ef40a539eb5
--- /dev/null
+++ b/backend/python/vllm/requirements-cublas13.txt
@@ -0,0 +1,5 @@
+--extra-index-url https://download.pytorch.org/whl/cu130
+accelerate
+torch
+transformers
+bitsandbytes
diff --git a/backend/python/vllm/requirements-l4t13-after.txt b/backend/python/vllm/requirements-l4t13-after.txt
new file mode 100644
index 000000000000..01be2590bdc2
--- /dev/null
+++ b/backend/python/vllm/requirements-l4t13-after.txt
@@ -0,0 +1,2 @@
+--extra-index-url https://pypi.jetson-ai-lab.io/sbsa/cu130
+vllm
diff --git a/backend/python/vllm/requirements-l4t13.txt b/backend/python/vllm/requirements-l4t13.txt
new file mode 100644
index 000000000000..9afff7f6642d
--- /dev/null
+++ b/backend/python/vllm/requirements-l4t13.txt
@@ -0,0 +1,8 @@
+--extra-index-url https://pypi.jetson-ai-lab.io/sbsa/cu130
+accelerate
+torch
+torchvision
+torchaudio
+transformers
+bitsandbytes
+flash-attn
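The split between requirements-l4t13.txt and requirements-l4t13-after.txt appears to mirror the existing cpu/-after layout mentioned in install.sh: the base file pins the torch stack from the jetson index first, and the -after file then pulls vllm so it resolves against the torch that is already in the venv. A rough, hedged equivalent of what installRequirements is assumed to do for this profile (exact sequencing lives in libbackend.sh):

    # Two-phase install sketch: torch stack first, then vllm against it.
    uv pip install --index-strategy=unsafe-best-match -r backend/python/vllm/requirements-l4t13.txt
    uv pip install --index-strategy=unsafe-best-match -r backend/python/vllm/requirements-l4t13-after.txt
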