From c4a96f2ce47b00d2c916aff39bb76073174d8577 Mon Sep 17 00:00:00 2001
From: Kishor Prins
Date: Thu, 9 Oct 2025 07:56:24 -0700
Subject: [PATCH] Support older AMD GFX uarch

Switch the ROCm image to the "-complete" base, install the MIOpen kernel
database files through docker/rocm/kbd_install.sh, and replace the rocBLAS
kernel library found under torch/lib with the one from the Arch Linux
rocblas package.

---
 docker/rocm/Dockerfile     |  26 +++++-
 docker/rocm/kbd_install.sh | 134 ++++++++++++++++++++++++++++++++++
 2 files changed, 157 insertions(+), 3 deletions(-)
 create mode 100644 docker/rocm/kbd_install.sh

diff --git a/docker/rocm/Dockerfile b/docker/rocm/Dockerfile
index 21bfbd19..72fa68ee 100644
--- a/docker/rocm/Dockerfile
+++ b/docker/rocm/Dockerfile
@@ -1,4 +1,4 @@
-FROM rocm/dev-ubuntu-24.04:6.4.3
+FROM rocm/dev-ubuntu-24.04:6.4.3-complete
 ENV DEBIAN_FRONTEND=noninteractive \
     PHONEMIZER_ESPEAK_PATH=/usr/bin \
     PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
@@ -16,6 +16,7 @@ RUN apt-get update && apt upgrade -y && apt-get install -y --no-install-recommen
     wget \
     nano \
     g++ \
+    zstd \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/* \
     && mkdir -p /usr/share/espeak-ng-data \
@@ -32,7 +33,6 @@ RUN apt-get update && apt upgrade -y && apt-get install -y --no-install-recommen
     && chown -R appuser:appuser /app \
     # Models folder
     && mkdir -p /app/api/src/models/v1_0
-WORKDIR /app
 USER appuser
 WORKDIR /app
 
@@ -46,13 +46,33 @@ ENV PHONEMIZER_ESPEAK_PATH=/usr/bin \
 
 # Install dependencies with GPU extras (using cache mounts)
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv venv --python 3.10 && \
+    uv venv --python 3.12 && \
     uv sync --extra rocm
 
+# Install the MIOpen kernel database (kdb) files matching the ROCm release.
+# kbd_install.sh reads ROCM_VERSION (and optionally GFX_ARCH) from the environment.
+ENV ROCM_VERSION=6.4.3
+COPY --chown=appuser:appuser docker/rocm/kbd_install.sh /tmp/
+# The script is committed with mode 100644, so invoke it through bash
+# instead of relying on an executable bit.
+RUN bash /tmp/kbd_install.sh
+
+# Support older GFX Arch: replace the rocBLAS kernel library under torch/lib
+# with the one from the Arch Linux rocblas package.
+# Work in a scratch directory and delete it in the same layer so the rest of
+# the extracted package does not end up in the image.
+RUN mkdir -p /tmp/rocblas && cd /tmp/rocblas \
+    && wget https://archlinux.org/packages/extra/x86_64/rocblas/download -O rocblas.tar.zst \
+    && tar --zstd -xf rocblas.tar.zst \
+    && rm -rf /app/.venv/lib/python3.12/site-packages/torch/lib/rocblas/library/ \
+    && mv ./opt/rocm/lib/rocblas/library/ /app/.venv/lib/python3.12/site-packages/torch/lib/rocblas/ \
+    && cd /tmp && rm -rf /tmp/rocblas
+
 # Copy project files including models
 COPY --chown=appuser:appuser api ./api
 COPY --chown=appuser:appuser web ./web
 COPY --chown=appuser:appuser docker/scripts/ ./
+
 RUN chmod +x ./entrypoint.sh
 
 # Set all environment variables in one go
diff --git a/docker/rocm/kbd_install.sh b/docker/rocm/kbd_install.sh
new file mode 100644
index 00000000..29f72865
--- /dev/null
+++ b/docker/rocm/kbd_install.sh
@@ -0,0 +1,134 @@
+#!/bin/bash
+#
+# Installs the MIOpen kernel database (.kdb) files for the selected GPU
+# architectures into the PyTorch installation reported by `uv pip show torch`.
+#
+# Environment variables:
+#   GFX_ARCH      ';'-separated list of GFX architectures, e.g. "gfx906;gfx1030".
+#                 A built-in default list is used when unset.
+#   ROCM_VERSION  ROCm release whose repo.radeon.com repository is queried.
+#                 Falls back to "latest" when unset (not recommended).
+
+set -e
+
+# Encode a dotted version string as one fixed-width integer so that two
+# versions can be compared numerically.
+ver() {
+    printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' ');
+}
+
+# Sets GFX_ARCH to default if not set
+if [ -z "$GFX_ARCH" ]; then
+    echo "WARNING: missing env var GFX_ARCH, using default (this will take longer)"
+    GFX_ARCHS=("gfx900" "gfx906" "gfx908" "gfx90a" "gfx942" "gfx1030")
+else
+    # Convert ; separated string to array
+    IFS=';' read -ra GFX_ARCHS <<< "$GFX_ARCH"
+fi
+
+# Sets ROCM_VERSION to "latest" if not set
+if [ -z "$ROCM_VERSION" ]; then
+    echo "WARNING: missing env var ROCM_VERSION, using latest kdb repo (NOT RECOMMENDED)"
+    ROCM_VERSION="latest"
+fi
+
+# Locate the directory that contains the installed torch package
+TORCH_INSTALL_PATH=$(uv pip show torch | grep Location | cut -d" " -f 2)
+
+# Check if Torch installation path exists
+if [ ! -d "$TORCH_INSTALL_PATH" ]; then
+    echo "Error: Torch installation path '$TORCH_INSTALL_PATH' does not exist."
+    exit 1
+fi
+
+# Print variable overview
+echo "ROCM version: $ROCM_VERSION"
+echo "GFX architectures: ${GFX_ARCHS[*]}"
+echo "PyTorch installation path: $TORCH_INSTALL_PATH"
+
+# Create directory for extraction
+EXTRACT_DIR=extract_miopen_dbs
+rm -rf "$EXTRACT_DIR"
+mkdir -p "$EXTRACT_DIR" && cd "$EXTRACT_DIR"
+
+if [[ -f /etc/lsb-release ]]; then
+    # Only Ubuntu 20.04, 22.04 and 24.04 are supported, and 24.04 only with
+    # ROCm 6.2 or newer (or "latest").
+    source /etc/lsb-release
+    echo "DISTRIB_RELEASE: $DISTRIB_RELEASE"
+    case "$DISTRIB_RELEASE" in
+        20.04|22.04)
+            ;;
+        24.04)
+            if [[ "$ROCM_VERSION" != "latest" && $(ver "$ROCM_VERSION") -lt $(ver 6.2) ]]; then
+                echo "ERROR: Unsupported Ubuntu version."
+                exit 1
+            fi
+            ;;
+        *)
+            echo "ERROR: Unsupported Ubuntu version."
+            exit 1
+            ;;
+    esac
+
+    for arch in "${GFX_ARCHS[@]}"; do
+        # Download MIOpen .kdbs for ROCm version and GPU architecture on ubuntu
+        echo "Downloading .kdb files for rocm-$ROCM_VERSION ($arch arch) ..."
+        # Quote the accept pattern so wget, not the shell, expands the wildcards
+        wget -q -r -np -nd -A "miopen-hip-${arch}*kdb_*${DISTRIB_RELEASE}*deb" \
+            "https://repo.radeon.com/rocm/apt/$ROCM_VERSION/pool/main/m/"
+
+        # Check if files were downloaded. No KDB files in repo.radeon will result in error.
+        if ! ls miopen-hip-$arch*kdb_*$DISTRIB_RELEASE*deb 1> /dev/null 2>&1; then
+            echo -e "ERROR: No MIOpen kernel database files found for $arch\nPlease check https://repo.radeon.com/rocm/apt/$ROCM_VERSION/pool/main/m/ for supported architectures"
+            exit 1
+        fi
+    done
+
+    # Extract all .deb files to local directory
+    echo "Extracting deb packages for ${GFX_ARCHS[*]} ..."
+    for deb_file in *deb; do
+        echo "Extracting $deb_file..."
+        dpkg-deb -xv "$deb_file" . > /dev/null 2>&1
+    done
+
+elif [[ -f /etc/centos-release || -f /etc/redhat-release ]]; then
+    # Centos kdbs
+    source /etc/os-release && RHEL_VERSION="$VERSION_ID"
+    RHEL_MAJOR_VERSION=${RHEL_VERSION%%.*}
+    echo "RHEL_VERSION: $RHEL_VERSION; RHEL_MAJOR_VERSION: $RHEL_MAJOR_VERSION"
+    if [[ ! "$RHEL_VERSION" =~ ^(8|9) ]]; then
+        echo "ERROR: Unsupported CentOS/RHEL release"
+        exit 1
+    fi
+    for arch in "${GFX_ARCHS[@]}"; do
+        # Download MIOpen .kdbs for ROCm version and GPU architecture on centos
+        echo "Downloading .kdb files for rocm-$ROCM_VERSION ($arch arch) ..."
+        wget -q -r -np -nd -A "miopen-hip-${arch}*kdb-[0-9]*rpm" \
+            "https://repo.radeon.com/rocm/rhel${RHEL_MAJOR_VERSION}/$ROCM_VERSION/main"
+
+        # Check if files were downloaded. No KDB files in repo.radeon will result in error.
+        if ! ls miopen-hip-$arch*kdb-*rpm 1> /dev/null 2>&1; then
+            echo -e "ERROR: No MIOpen kernel database files found for $arch\nPlease check https://repo.radeon.com/rocm/rhel${RHEL_MAJOR_VERSION}/$ROCM_VERSION/main for supported architectures"
+            exit 1
+        fi
+    done
+
+    # Extract all RPM files to current directory
+    echo "Extracting rpm packages for ${GFX_ARCHS[*]} ..."
+    for rpm_file in *rpm; do
+        echo "Extracting $rpm_file..."
+        rpm2cpio "$rpm_file" | cpio -idmv 2> /dev/null
+    done
+else
+    echo "ERROR: Unsupported operating system."
+    exit 1
+fi
+
+# Copy miopen db files to PyTorch installation path
+echo "Copying kdb files to ${TORCH_INSTALL_PATH}/torch/share"
+cp -ra opt/rocm-*/share/miopen "$TORCH_INSTALL_PATH/torch/share"
+
+# Remove downloaded files and extract directory
+cd .. && rm -rf "$EXTRACT_DIR"
+echo "Successfully installed MIOpen kernel database files"