Created August 7, 2025 11:33
vLLM Docker build for GPT-OSS on Ampere
ARG CUDA_VERSION=12.8.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS vllm_gpt-oss

ARG CUDA_VERSION
ARG PYTHON_VERSION=3.12

### --- Stuff from default Dockerfile ---- ###

# The PyPA get-pip.py script is a self-contained script+zip file that provides
# both the installer script and the pip base85-encoded zip archive. This allows
# bootstrapping pip in environments where a distribution package does not exist.
#
# By parameterizing the URL of the get-pip.py installation script, we allow
# third parties to use their own copy of the script stored in a private mirror.
# We set the default value to the PyPA-owned get-pip.py script.
#
# Reference: https://pip.pypa.io/en/stable/installation/#get-pip-py
ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py"

# PIP supports fetching packages from custom indexes, allowing third parties
# to host packages in private mirrors. PIP_INDEX_URL and PIP_EXTRA_INDEX_URL
# are standard PIP environment variables for overriding the default indexes.
# By leaving them empty by default, PIP will use its default indexes unless
# the build process overrides them.
#
# uv uses different variables. We set them by default to the same values as
# PIP, but they can be overridden.
ARG PIP_INDEX_URL
ARG PIP_EXTRA_INDEX_URL
ARG UV_INDEX_URL=${PIP_INDEX_URL}
ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
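# Example (hypothetical mirror URL, not part of this gist): route pip and uv
# through a private index at build time with
#   docker build --build-arg PIP_INDEX_URL=https://mirror.example.com/simple .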
# PyTorch provides its own indexes for standard and nightly builds
ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly

# PIP supports multiple authentication schemes, including keyring.
# By parameterizing the PIP_KEYRING_PROVIDER variable and setting it to
# disabled by default, we allow third parties to use keyring authentication
# for their private Python indexes, while not changing the default behavior,
# which is no authentication.
#
# Reference: https://pip.pypa.io/en/stable/topics/authentication/#keyring-support
ARG PIP_KEYRING_PROVIDER=disabled
ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}

# This flag bakes the built-in KV-connector dependency libraries into the image
ARG INSTALL_KV_CONNECTORS=false

# prepare basic build environment
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL

# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl sudo \
    && add-apt-repository -y ppa:deadsnakes/ppa \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER

# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv

# This timeout (in seconds) is necessary when installing some dependencies via uv, since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy

# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF
gcc --version
EOF

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
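# With the default CUDA_VERSION=12.8.1 the line above expands to
#   ldconfig /usr/local/cuda-12.8/compat/
# ($(echo 12.8.1 | cut -d. -f1,2) yields "12.8"), registering the CUDA
# forward-compatibility libraries with the dynamic linker.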
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
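# These defaults are conservative; on a build machine with more cores and RAM
# they can be raised at build time, e.g. (illustrative values, not from the gist):
#   docker build --build-arg max_jobs=8 --build-arg nvcc_threads=4 .
# Higher parallelism speeds up compilation but increases peak memory usage.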
### ----------------------------------------------------- ###

### --- Build instructions for GPT-OSS on Ampere --- ###
### Translated from https://github.com/vllm-project/vllm/issues/22290#issuecomment-3162301278 ###
ARG CCACHE_NOHASHDIR="true"

# The build context must be a checkout of the vLLM source tree
# (use_existing_torch.py and requirements/build.txt below come from it)
COPY . /tmp
WORKDIR /tmp

# Install nightly PyTorch with CUDA 12.8 support, then transformers
RUN uv pip install --system --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu128
RUN uv pip install --system "transformers[torch]"
RUN python3 use_existing_torch.py
RUN uv pip install --system -r requirements/build.txt
RUN uv pip install --system --no-build-isolation -e . -v

# Swap the Triton bundled with the nightly torch wheel for Triton 3.4.0, and
# add the harmony/MCP dependencies needed for GPT-OSS serving
RUN uv pip uninstall --system triton pytorch-triton
RUN uv pip install --system triton==3.4.0 openai_harmony mcp

# triton_kernels (from the Triton repository) provides kernels used by GPT-OSS
RUN git clone https://github.com/openai/triton.git
RUN uv pip install --system -e triton/python/triton_kernels --no-deps
# Runtime configuration: select the Triton attention backend, since Ampere
# GPUs cannot use the FlashAttention 3 path that GPT-OSS defaults to on
# newer architectures
ENV VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1
ENTRYPOINT ["vllm"]
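Usage sketch, assuming the build context is a vLLM source checkout (the Dockerfile COPYs it and calls use_existing_torch.py); the Dockerfile path, image tag, port, cache mount, and gpt-oss-20b model are illustrative, not part of the gist:

  # Build (BuildKit is required for the RUN <<EOF heredoc and cache mounts)
  git clone https://github.com/vllm-project/vllm.git
  cd vllm
  DOCKER_BUILDKIT=1 docker build -f /path/to/this/Dockerfile -t vllm-gpt-oss-ampere .

  # Serve: the ENTRYPOINT is "vllm", so container arguments begin with "serve"
  docker run --gpus all -p 8000:8000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    vllm-gpt-oss-ampere serve openai/gpt-oss-20b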
Yes, the problem was solved after removing the "--enable-expert-parallel" parameter.
But now I have a new problem; maybe you know how to solve this one?
(VllmWorker TP0 pid=44) INFO 08-10 09:49:24 [multiproc_executor.py:520] Parent process exited, terminating worker
(VllmWorker TP1 pid=45) INFO 08-10 09:49:24 [multiproc_executor.py:520] Parent process exited, terminating worker
(VllmWorker TP2 pid=46) INFO 08-10 09:49:24 [multiproc_executor.py:520] Parent process exited, terminating worker
(VllmWorker TP3 pid=47) INFO 08-10 09:49:24 [multiproc_executor.py:520] Parent process exited, terminating worker
(APIServer pid=1) Traceback (most recent call last):
(APIServer pid=1) File "", line 198, in _run_module_as_main
(APIServer pid=1) File "", line 88, in _run_code
(APIServer pid=1) File "/tmp/vllm/entrypoints/openai/api_server.py", line 1895, in
(APIServer pid=1) uvloop.run(run_server(args))
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/init.py", line 109, in run
(APIServer pid=1) return __asyncio.run(
(APIServer pid=1) ^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
(APIServer pid=1) return runner.run(main)
(APIServer pid=1) ^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
(APIServer pid=1) return self._loop.run_until_complete(task)
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/init.py", line 61, in wrapper
(APIServer pid=1) return await main
(APIServer pid=1) ^^^^^^^^^^
(APIServer pid=1) File "/tmp/vllm/entrypoints/openai/api_server.py", line 1827, in run_server
(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
(APIServer pid=1) File "/tmp/vllm/entrypoints/openai/api_server.py", line 1855, in run_server_worker
(APIServer pid=1) await init_app_state(engine_client, vllm_config, app.state, args)
(APIServer pid=1) File "/tmp/vllm/entrypoints/openai/api_server.py", line 1657, in init_app_state
(APIServer pid=1) state.openai_serving_responses = OpenAIServingResponses(
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/tmp/vllm/entrypoints/openai/serving_responses.py", line 130, in init
(APIServer pid=1) get_stop_tokens_for_assistant_actions())
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/tmp/vllm/entrypoints/harmony_utils.py", line 187, in get_stop_tokens_for_assistant_actions
(APIServer pid=1) return get_encoding().stop_tokens_for_assistant_actions()
(APIServer pid=1) ^^^^^^^^^^^^^^
(APIServer pid=1) File "/tmp/vllm/entrypoints/harmony_utils.py", line 37, in get_encoding
(APIServer pid=1) _harmony_encoding = load_harmony_encoding(
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/openai_harmony/init.py", line 689, in load_harmony_encoding
(APIServer pid=1) inner: _PyHarmonyEncoding = _load_harmony_encoding(name)
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) openai_harmony.HarmonyError: error downloading or loading vocab file: failed to download or load vocab file
Solved:
vllm-project/vllm#22525 (comment)