# ScalarLM/Dockerfile at main · tensorwavecloud/ScalarLM · GitHub

# Name of the base stage (nvidia, cpu or amd) that the ui_base stage builds
# FROM. Declared before the first FROM so it can be used in FROM lines.
# Override with: --build-arg BASE_NAME=<stage>
ARG BASE_NAME=cpu

###############################################################################
# NVIDIA BASE IMAGE
FROM nvcr.io/nvidia/pytorch:25.10-py3 AS nvidia

# venv support plus the Slurm client and development packages.
# The package index is removed in the same layer so it does not stay in the
# image; every later apt-get install in this file runs `apt-get update` first.
RUN apt-get update -y && \
    apt-get install -y python3-venv slurm-wlm libslurm-dev && \
    rm -rf /var/lib/apt/lists/*

# Create the virtualenv with access to the system site-packages.
# Putting $VIRTUAL_ENV/bin first on PATH is what activates the venv for every
# later instruction. Sourcing bin/activate inside a RUN only affects that one
# shell, so it is intentionally not done here.
ENV VIRTUAL_ENV=/app/.venv
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
RUN python -m venv $VIRTUAL_ENV --system-site-packages

# Put HPC-X MPI in the PATH, i.e. mpirun
ENV PATH=$PATH:/opt/hpcx/ompi/bin
ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:/opt/hpcx/ompi/lib

# Build arguments accepted by this stage. Neither is referenced by an
# instruction in this stage; they are kept so existing --build-arg callers
# keep working.
ARG TORCH_VERSION="2.9.1"
ARG TORCH_CUDA_ARCH_LIST="7.5"

# Build tooling: uv (used for most later installs), ninja, and a newer protobuf.
RUN pip install uv && \
    uv pip install ninja && \
    pip install --upgrade "protobuf>=6.30.0"

# Clear PIP_CONSTRAINT for all following pip invocations.
# NOTE(review): presumably the base image points this at a constraints file;
# confirm against the base image.
ENV PIP_CONSTRAINT=""

ARG INSTALL_ROOT=/app/cray
WORKDIR ${INSTALL_ROOT}

ENV BASE_NAME=nvidia

# Disable torch.compile and the inductor autotuning passes.
ENV TORCHINDUCTOR_MAX_AUTOTUNE=0
ENV TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=0
ENV TORCH_COMPILE_DISABLE=1

###############################################################################
# CPU BASE IMAGE
FROM ubuntu:24.04 AS cpu

# Python, OpenMPI/PMIx, Slurm and CMake.
# - DEBIAN_FRONTEND=noninteractive is set inline (not via ENV) so package
#   configuration never waits for input during the build and the setting does
#   not leak into the runtime environment.
# - The package index is deleted in the same layer (it is not covered by the
#   cache mount); every later apt-get install in this file runs
#   `apt-get update` first.
RUN --mount=type=cache,target=/var/cache/apt \
    apt-get update -y \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y python3 python3-pip python3-venv \
    openmpi-bin libopenmpi-dev libpmix-dev slurm-wlm libslurm-dev \
    cmake \
    && rm -rf /var/lib/apt/lists/*

# Create the virtualenv. Putting $VIRTUAL_ENV/bin first on PATH is what
# activates it for every later instruction. Sourcing bin/activate inside a RUN
# only affects that one shell, so it is intentionally not done here.
ENV VIRTUAL_ENV=/app/.venv
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
RUN python3 -m venv $VIRTUAL_ENV

ARG TORCH_VERSION="2.7.1"

# CPU-only torch wheel from the PyTorch CPU index, plus ninja.
RUN pip install uv && \
    uv pip install torch==${TORCH_VERSION}+cpu --index-url https://download.pytorch.org/whl/cpu && \
    uv pip install ninja

# Put torch on the LD_LIBRARY_PATH
# The path is hard-coded to the venv's python3.12 site-packages directory.
ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:/app/.venv/lib64/python3.12/site-packages/torch/lib

ARG INSTALL_ROOT=/app/cray
WORKDIR ${INSTALL_ROOT}

ENV BASE_NAME=cpu

###############################################################################
# AMD BASE IMAGE
FROM gdiamos/rocm-base-mi300:v0.9926 AS amd

# Stage marker and HIP runtime setting, declared together.
ENV BASE_NAME=amd \
    HIP_FORCE_DEV_KERNARG=1

# Disabled: optional pyhip install.
#RUN pip install pyhip>=1.1.0

ARG INSTALL_ROOT=/app/cray
WORKDIR ${INSTALL_ROOT}

# Append the torch shared libraries and /usr/local/rdma-lib to the loader path.
# NOTE(review): this uses /app/venv while the nvidia and cpu stages use
# /app/.venv - presumably this matches the layout of the base image; confirm.
ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:/app/venv/lib/python3.12/site-packages/torch/lib:/usr/local/rdma-lib

###############################################################################
# FRONTEND BUILD STAGE

# BASE_NAME (global ARG) selects which of the base stages this builds on.
FROM ${BASE_NAME} AS ui_base

# Base utilities, one package per line.
RUN apt-get update -y \
    && apt-get install -y \
        curl \
        dnsutils \
        git \
        libcurl4 \
        libgomp1 \
        nano

# Install Node.js 24.x via the NodeSource setup script, then print the
# installed node and npm versions into the build log.
RUN curl -fsSL https://deb.nodesource.com/setup_24.x | bash - \
    && apt-get install -y nodejs \
    && node --version \
    && npm --version

ARG INSTALL_ROOT=/app
WORKDIR /app

# Hugging Face Chat UI source selection: UI_SOURCE picks the mode handed to
# the helper script, UI_REPO / UI_BRANCH name the remote repository and branch.
ARG UI_SOURCE=remote
ARG UI_BRANCH=main
ARG UI_REPO=https://github.com/supermassive-intelligence/chat-ui-fork.git

# Helper script that places the Chat UI sources under ${INSTALL_ROOT}/chat-ui.
COPY scripts/build-copy-chat-ui.sh ${INSTALL_ROOT}/build-copy-chat-ui.sh

# Run the helper with the ./chat-ui directory of the build context
# bind-mounted at /workspace/chat-ui.
# For remote: clone from repository
# For local: mount and copy from ./chat-ui directory
RUN --mount=type=bind,source=./chat-ui,target=/workspace/chat-ui,rw \
    bash ${INSTALL_ROOT}/build-copy-chat-ui.sh \
        ${UI_SOURCE} \
        ${INSTALL_ROOT}/chat-ui \
        /workspace/chat-ui \
        ${UI_REPO} \
        ${UI_BRANCH}

# dotenv-cli, installed globally.
RUN npm install -g dotenv-cli

USER root

# Create /app/ui, add an empty .env.local, copy the listed Chat UI files into
# it, and make the entrypoint executable.
RUN mkdir -p /app/ui && \
    touch /app/ui/.env.local && \
    for f in .env entrypoint.sh package.json package-lock.json; do \
        cp ${INSTALL_ROOT}/chat-ui/$f /app/ui/$f || exit 1; \
    done && \
    chmod +x /app/ui/entrypoint.sh

# Node stage that installs the Chat UI dependencies and runs its build.
FROM node:24.2.0 AS ui_builder

WORKDIR /app
ARG INSTALL_ROOT=/temp

USER root
# git is installed for this stage.
RUN apt-get update -y && apt-get install -y git

# Hugging Face Chat UI source selection: UI_SOURCE picks the mode handed to
# the helper script, UI_REPO / UI_BRANCH name the remote repository and branch.
ARG UI_SOURCE=remote
ARG UI_BRANCH=main
ARG UI_REPO=https://github.com/supermassive-intelligence/chat-ui-fork.git

# Helper script that places the Chat UI sources under ${INSTALL_ROOT}/chat-ui.
COPY scripts/build-copy-chat-ui.sh ${INSTALL_ROOT}/build-copy-chat-ui.sh

# Run the helper with the ./chat-ui directory of the build context
# bind-mounted at /workspace/chat-ui.
# For remote: clone from repository
# For local: mount and copy from ./chat-ui directory
RUN --mount=type=bind,source=./chat-ui,target=/workspace/chat-ui,rw \
    bash ${INSTALL_ROOT}/build-copy-chat-ui.sh \
        ${UI_SOURCE} \
        ${INSTALL_ROOT}/chat-ui \
        /workspace/chat-ui \
        ${UI_REPO} \
        ${UI_BRANCH}

# Bring only the npm manifests into the working directory first, so the
# dependency install below runs before the rest of the sources are copied.
RUN cp ${INSTALL_ROOT}/chat-ui/package-lock.json ${INSTALL_ROOT}/chat-ui/package.json ./

ARG APP_BASE=/chat
ARG PUBLIC_APP_COLOR=
ENV BODY_SIZE_LIMIT=15728640

# Install the locked dependencies, with npm's cache on a BuildKit cache mount.
RUN --mount=type=cache,target=/app/.npm \
    npm config set cache /app/.npm \
    && npm ci

# Copy the full Chat UI sources into /app and add the static adapter as a
# dev dependency.
RUN cp -R ${INSTALL_ROOT}/chat-ui/. /app/ \
    && npm install --save-dev @sveltejs/adapter-static

# Mark /app as a safe git directory, then run the build.
RUN git config --global --add safe.directory /app \
    && npm run build

# Stage that only serves as the source of the MongoDB binaries.
FROM mongo:7 AS mongo

# ui_base extended with a local MongoDB (the image used when INCLUDE_DB is true).
FROM ui_base AS local_db

# Take every /usr/bin/mongo* file from the mongo stage.
COPY --from=mongo /usr/bin/mongo* /usr/bin/

# Create the /data/db directory as root.
USER root
RUN mkdir -p /data/db

# Connection string pointing at the database on localhost.
ENV MONGODB_URL=mongodb://localhost:27017

# Final UI image, built on top of the local_db stage.
FROM local_db AS ui_final

# Environment flag (an ENV, not a build arg) stating that the database is included.
ENV INCLUDE_DB=true

# svelte requires APP_BASE at build time so it must be passed as a build arg
ARG APP_BASE=/chat
ARG PUBLIC_APP_COLOR=
ARG PUBLIC_COMMIT_SHA=

# Expose the commit SHA build arg and the body size limit as environment variables.
ENV PUBLIC_COMMIT_SHA=${PUBLIC_COMMIT_SHA} \
    BODY_SIZE_LIMIT=15728640

# Build output and dependencies from the ui_builder stage.
COPY --from=ui_builder /app/build /app/build
COPY --from=ui_builder /app/node_modules /app/node_modules

# Entrypoint and local environment file from the build context; these replace
# the /app/ui/entrypoint.sh and /app/ui/.env.local created in ui_base.
COPY frontend/entrypoint.sh frontend/.env.local /app/ui/

#CMD ["/bin/bash", "-c", "/app/entrypoint.sh"]

###############################################################################
# VLLM BUILD STAGE
FROM ui_final AS vllm

# Copy all of the frontend libraries and code
# NOTE(review): this stage already starts FROM ui_final, so this duplicates
# the entire /app tree of that stage (including /app/ui itself) under /app/ui.
# Presumably the runtime expects the build output and node_modules below
# /app/ui; confirm whether a narrower copy would be enough.
COPY --from=ui_final --chown=1000 /app /app/ui

# The /usr/bin/mongo* binaries are inherited from ui_final (copied there in
# the local_db stage), so they are not copied again here: re-copying them
# would only add a duplicate layer of identical files.

# Set environment variables from ui
ENV MONGODB_URL=mongodb://localhost:27017
ENV INCLUDE_DB=true

# Toolchain and runtime libraries for the vLLM build, installed in two passes,
# then gcc-12/g++-12 registered as the gcc/g++ alternatives.
RUN --mount=type=cache,target=/var/cache/apt \
    apt-get update -y && \
    apt-get install -y \
        curl git ccache vim numactl \
        gcc-12 g++-12 \
        libomp-dev libnuma-dev && \
    apt-get install -y \
        ffmpeg libsm6 libxext6 libgl1 libdnnl-dev && \
    update-alternatives \
        --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 \
        --slave /usr/bin/g++ g++ /usr/bin/g++-12

# From here on the install root is /app/cray (ui_base declared it as /app).
ARG INSTALL_ROOT=/app/cray

WORKDIR ${INSTALL_ROOT}

# Build dependency installed before anything else in this stage.
RUN pip install setuptools-scm

# vLLM source selection: VLLM_SOURCE picks the mode handed to the helper
# script, VLLM_REPO / VLLM_BRANCH name the remote repository and branch.
ARG VLLM_SOURCE=remote
ARG VLLM_BRANCH=main
ARG VLLM_REPO=https://github.com/supermassive-intelligence/vllm-fork.git

# Helper script that places the vLLM sources under ${INSTALL_ROOT}/vllm.
COPY scripts/build-copy-vllm.sh ${INSTALL_ROOT}/build-copy-vllm.sh

# Run the helper with the ./vllm directory of the build context bind-mounted
# at /workspace/vllm.
# For remote: clone from repository
# For local: mount and copy from ./vllm directory
RUN --mount=type=bind,source=./vllm,target=/workspace/vllm,rw \
    bash ${INSTALL_ROOT}/build-copy-vllm.sh \
        ${VLLM_SOURCE} \
        ${INSTALL_ROOT}/vllm \
        /workspace/vllm \
        ${VLLM_REPO} \
        ${VLLM_BRANCH}

WORKDIR ${INSTALL_ROOT}/vllm

# Build configuration; the defaults select a CPU build.
ARG TORCH_CUDA_ARCH_LIST="7.5"
ARG VLLM_TARGET_DEVICE=cpu

# Persist the build arguments as environment variables (later instructions
# read VLLM_TARGET_DEVICE from the environment) and set a Release build type.
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} \
    VLLM_TARGET_DEVICE=${VLLM_TARGET_DEVICE} \
    CMAKE_BUILD_TYPE=Release

# vLLM dependencies: install the requirements file, then run the
# use_existing_torch.py script from the vLLM source tree.
COPY infra/requirements-vllm.txt ${INSTALL_ROOT}/requirements-vllm.txt
RUN uv pip install --no-compile --no-cache-dir -r ${INSTALL_ROOT}/requirements-vllm.txt \
    && python ${INSTALL_ROOT}/vllm/use_existing_torch.py

# Build and install vLLM in editable mode from the current WORKDIR, with the
# pip, ccache and .deps directories on BuildKit cache mounts.
# MAX_JOBS = min(CPU count, total memory in GiB / 4), clamped to at least 1:
# when `free -g` reports less than 4 GiB the memory term is 0, which would
# otherwise export MAX_JOBS=0.
RUN \
    --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/app/cray/vllm/.deps \
    cpu_jobs=$(nproc) && \
    mem_jobs=$(free -g | awk '/^Mem:/ {print int($2/4)}') && \
    MAX_JOBS=$(( $cpu_jobs < $mem_jobs ? $cpu_jobs : $mem_jobs )) && \
    if [ "$MAX_JOBS" -lt 1 ]; then MAX_JOBS=1; fi && \
    export MAX_JOBS && \
    pip install --no-build-isolation -e . --verbose

WORKDIR ${INSTALL_ROOT}

###############################################################################
# MAIN IMAGE
FROM vllm AS infra

# NOTE(review): INSTALL_ROOT is not re-declared with ARG in this stage; its
# last declaration is in the vllm stage. Confirm the builder in use carries
# it over into stages built FROM vllm.

# GPU-aware MPI: copy the sources, build a wheel into ./dist, install it.
COPY infra/cray_infra/training/gpu_aware_mpi ${INSTALL_ROOT}/infra/cray_infra/training/gpu_aware_mpi
RUN python3 ${INSTALL_ROOT}/infra/cray_infra/training/gpu_aware_mpi/setup.py bdist_wheel --dist-dir=dist \
    && pip install dist/*.whl

# Compiler toolchain, Python headers/debug build and command-line utilities.
# The apt package lists are removed in the same layer.
RUN apt-get update -y && \
    apt-get install -y \
        build-essential \
        less curl wget net-tools vim iputils-ping strace gdb \
        python3-dbg python3-dev && \
    rm -rf /var/lib/apt/lists/*

# Python path: append the infra, sdk, ml, test and vllm directories under
# ${INSTALL_ROOT}, in that order, to whatever PYTHONPATH already holds.
ENV PYTHONPATH="${PYTHONPATH:-}:${INSTALL_ROOT}/infra:${INSTALL_ROOT}/sdk:${INSTALL_ROOT}/ml:${INSTALL_ROOT}/test:${INSTALL_ROOT}/vllm"

# Megatron and platform requirements.
# note this has to happen after vllm because it overrides some packages installed by vllm
COPY infra/requirements-megatron.txt ${INSTALL_ROOT}/requirements-megatron.txt
COPY infra/requirements-megatron-cpu.txt ${INSTALL_ROOT}/requirements-megatron-cpu.txt
COPY requirements.txt ${INSTALL_ROOT}/requirements.txt

# Selection by VLLM_TARGET_DEVICE:
#   - requirements-megatron.txt (installed with --no-deps) unless the device is "cpu"
#   - requirements-megatron-cpu.txt unless the device is "cuda"
#   - requirements.txt always
# A device value other than "cpu" or "cuda" therefore installs both Megatron files.
RUN { [ "$VLLM_TARGET_DEVICE" = "cpu" ] || \
        uv pip install --no-deps --no-compile --no-cache-dir -r ${INSTALL_ROOT}/requirements-megatron.txt; } && \
    { [ "$VLLM_TARGET_DEVICE" = "cuda" ] || \
        uv pip install --no-compile --no-cache-dir -r ${INSTALL_ROOT}/requirements-megatron-cpu.txt; } && \
    uv pip install --no-compile --no-cache-dir -r ${INSTALL_ROOT}/requirements.txt

# Create the jobs and nfs directories.
RUN mkdir -p ${INSTALL_ROOT}/jobs ${INSTALL_ROOT}/nfs

# Platform code: each top-level source directory goes to the same name under
# ${INSTALL_ROOT}.
COPY infra ${INSTALL_ROOT}/infra
COPY sdk ${INSTALL_ROOT}/sdk
COPY test ${INSTALL_ROOT}/test
COPY ml ${INSTALL_ROOT}/ml
COPY scripts ${INSTALL_ROOT}/scripts

WORKDIR ${INSTALL_ROOT}

# Compile the SLURM plugin using the script shipped with the infra sources.
RUN /app/cray/infra/slurm_src/compile.sh

# Runtime environment:
#   - LD_LIBRARY_PATH gains the PYTHONPATH entries and /usr/local/lib/slurm
#     NOTE(review): appending PYTHONPATH to the loader path is unusual;
#     confirm it is intentional.
#   - SLURM_CONF points at slurm.conf in the nfs directory
#   - VLLM_CPU_MOE_PREPACK is set to 0
ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${PYTHONPATH:-}:/usr/local/lib/slurm \
    SLURM_CONF=${INSTALL_ROOT}/nfs/slurm.conf \
    VLLM_CPU_MOE_PREPACK=0