Last active
December 20, 2025 17:17
-
-
Save cloudnull/58f05958cb96ff73647b1eda3b6bff0d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # This vLLM Dockerfile is used to build images that can run vLLM on both x86_64 and arm64 CPU platforms. | |
| # | |
| # Supported platforms: | |
| # - linux/amd64 (x86_64) | |
| # - linux/arm64 (aarch64) | |
| # | |
| # Use the `--platform` option with `docker buildx build` to specify the target architecture, e.g.: | |
| # docker buildx build --platform=linux/arm64 -f docker/Dockerfile.cpu . | |
| # | |
| # Build targets: | |
| # vllm-openai (default): used for serving deployment | |
| # vllm-test: used for CI tests | |
| # vllm-dev: used for development | |
| # | |
| # Build arguments: | |
| # PYTHON_VERSION=3.13|3.12 (default)|3.11|3.10 | |
| # VLLM_CPU_DISABLE_AVX512=false (default)|true | |
| # VLLM_CPU_AVX512BF16=false (default)|true | |
| # VLLM_CPU_AVX512VNNI=false (default)|true | |
| # VLLM_CPU_AMXBF16=false (default)|true | |
| # | |
| ######################### COMMON BASE IMAGE ######################### | |
| # Shared foundation for every later stage: OS packages, the gcc-12 toolchain, uv, | |
| # a Python virtualenv in /opt/venv and the packages listed in requirements/cpu.txt. | |
| FROM ubuntu:22.04 AS base-common | |
| WORKDIR /workspace/ | |
| ARG PYTHON_VERSION=3.12 | |
| # Extra package index for CPU-only PyTorch wheels; exported further down for both pip and uv. | |
| ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" | |
| # Install minimal dependencies and uv | |
| # The apt cache and list directories are cache mounts, so they are not written into the image layer. | |
| # NOTE(review): the uv installer is piped straight into sh with no pinned version or checksum. | |
| RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ | |
| --mount=type=cache,target=/var/lib/apt,sharing=locked \ | |
| apt-get update -y \ | |
| && apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \ | |
| gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \ | |
| && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \ | |
| && curl -LsSf https://astral.sh/uv/install.sh | sh | |
| ENV CC=/usr/bin/gcc-12 CXX=/usr/bin/g++-12 | |
| # ccache directory; the same path is used as a cache mount by the wheel build step. | |
| ENV CCACHE_DIR=/root/.cache/ccache | |
| ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache | |
| # Presumably where the uv installer above places its binary -- confirm against the installer. | |
| ENV PATH="/root/.local/bin:$PATH" | |
| ENV VIRTUAL_ENV="/opt/venv" | |
| ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python | |
| # Create the virtualenv with the requested interpreter version, then put it first on PATH. | |
| RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV} | |
| ENV PATH="$VIRTUAL_ENV/bin:$PATH" | |
| ENV UV_HTTP_TIMEOUT=500 | |
| # Install Python dependencies | |
| ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} | |
| ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} | |
| ENV UV_INDEX_STRATEGY="unsafe-best-match" | |
| ENV UV_LINK_MODE="copy" | |
| # The requirement files are bind-mounted from the build context instead of being copied into a layer. | |
| # common.txt is mounted too, presumably because cpu.txt includes it -- confirm against requirements/cpu.txt. | |
| RUN --mount=type=cache,target=/root/.cache/uv \ | |
| --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \ | |
| --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \ | |
| uv pip install --upgrade pip && \ | |
| uv pip install -r requirements/cpu.txt | |
| # Persist the TARGETARCH build argument as an environment variable in the image. | |
| ARG TARGETARCH | |
| ENV TARGETARCH=${TARGETARCH} | |
| ######################### x86_64 BASE IMAGE ######################### | |
| # x86_64 variant: preload tcmalloc plus libiomp5 from the virtualenv. | |
| # NOTE(review): libiomp5.so is presumably installed by one of the Python requirements -- confirm. | |
| FROM base-common AS base-amd64 | |
| ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so" | |
| ######################### arm64 BASE IMAGE ######################### | |
| # aarch64 variant: only tcmalloc is preloaded. | |
| FROM base-common AS base-arm64 | |
| ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4" | |
| ######################### BASE IMAGE ######################### | |
| # Select one of the two stages above by name, using the target architecture of the build. | |
| FROM base-${TARGETARCH} AS base | |
| # Set the core-file size limit to 0 for shells that read ~/.bashrc. | |
| RUN echo 'ulimit -c 0' >> ~/.bashrc | |
| ######################### BUILD IMAGE ######################### | |
| # Builds the vLLM CPU wheel; the result is written to /workspace/vllm/dist. | |
| FROM base AS vllm-build | |
| # Exported as MAX_JOBS; presumably caps parallel compile jobs -- confirm against setup.py. | |
| ARG max_jobs=32 | |
| ENV MAX_JOBS=${max_jobs} | |
| # Any value other than 0 makes the build run tools/check_repo.sh against the checkout (see below). | |
| ARG GIT_REPO_CHECK=0 | |
| # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... | |
| ARG VLLM_CPU_DISABLE_AVX512=0 | |
| ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} | |
| # Support for building with AVX512BF16 ISA: docker build --build-arg VLLM_CPU_AVX512BF16="true" ... | |
| ARG VLLM_CPU_AVX512BF16=0 | |
| ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16} | |
| # Support for building with AVX512VNNI ISA: docker build --build-arg VLLM_CPU_AVX512VNNI="true" ... | |
| ARG VLLM_CPU_AVX512VNNI=0 | |
| ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI} | |
| # Support for building with AMXBF16 ISA: docker build --build-arg VLLM_CPU_AMXBF16="true" ... | |
| ARG VLLM_CPU_AMXBF16=0 | |
| ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16} | |
| WORKDIR /workspace/vllm | |
| # Build-time requirements are installed before the source is copied, so this layer | |
| # is only rebuilt when requirements/cpu-build.txt changes. The file is mounted under | |
| # the name requirements/build.txt for this step only. | |
| RUN --mount=type=cache,target=/root/.cache/uv \ | |
| --mount=type=bind,src=requirements/cpu-build.txt,target=requirements/build.txt \ | |
| uv pip install -r requirements/build.txt | |
| COPY . . | |
| # Optional repository check, skipped while GIT_REPO_CHECK is 0. | |
| RUN --mount=type=bind,source=.git,target=.git \ | |
| if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi | |
| # Build the wheel into dist/. The uv, ccache and .deps directories are cache mounts reused | |
| # between builds; .git is bind-mounted (presumably so the version can be derived from git -- confirm). | |
| RUN --mount=type=cache,target=/root/.cache/uv \ | |
| --mount=type=cache,target=/root/.cache/ccache \ | |
| --mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \ | |
| --mount=type=bind,source=.git,target=.git \ | |
| VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 | |
| ######################### TEST DEPS ######################### | |
| # Derives a CPU-specific test requirements file from requirements/test.in and installs it. | |
| FROM base AS vllm-test-deps | |
| WORKDIR /workspace/vllm | |
| # TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version | |
| # NOTE(review): the pin applied below is already torch==2.9.1, so the TODO above looks stale -- confirm. | |
| # Steps, in order: copy test.in to cpu-test.in, drop mamba_ssm, drop decord and terratorch when | |
| # `uname -m` reports aarch64/arm64, pin torch, strip the version specifiers from torchaudio and | |
| # torchvision, then compile the result to cpu-test.txt with uv. | |
| RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ | |
| cp requirements/test.in requirements/cpu-test.in && \ | |
| sed -i '/mamba_ssm/d' requirements/cpu-test.in && \ | |
| remove_packages_not_supported_on_aarch64() { \ | |
| case "$(uname -m)" in \ | |
| aarch64|arm64) \ | |
| sed -i '/decord/d' requirements/cpu-test.in; \ | |
| sed -i '/terratorch/d' requirements/cpu-test.in; \ | |
| ;; \ | |
| esac; \ | |
| }; \ | |
| remove_packages_not_supported_on_aarch64 && \ | |
| sed -i 's/^torch==.*/torch==2.9.1/g' requirements/cpu-test.in && \ | |
| sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \ | |
| sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \ | |
| uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu | |
| # Install the compiled requirement set into the virtualenv. | |
| RUN --mount=type=cache,target=/root/.cache/uv \ | |
| uv pip install -r requirements/cpu-test.txt | |
| ######################### DEV IMAGE ######################### | |
| # Development image: the build stage plus editor/NUMA tools, an in-place vLLM install, | |
| # the development requirements and the pre-commit hooks. | |
| FROM vllm-build AS vllm-dev | |
| WORKDIR /workspace/vllm | |
| # The package lists fetched in the base stage live only in a cache mount, not in the image, | |
| # so refresh them here; otherwise the install fails on a builder whose apt cache is empty. | |
| RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ | |
| --mount=type=cache,target=/var/lib/apt,sharing=locked \ | |
| apt-get update -y \ | |
| && apt-get install -y --no-install-recommends vim numactl xz-utils | |
| # install development dependencies (for testing) | |
| RUN --mount=type=cache,target=/root/.cache/uv \ | |
| uv pip install -e tests/vllm_test_utils | |
| # Install vLLM itself in development mode from the copied source tree. | |
| RUN --mount=type=cache,target=/root/.cache/uv \ | |
| --mount=type=cache,target=/root/.cache/ccache \ | |
| --mount=type=bind,source=.git,target=.git \ | |
| VLLM_TARGET_DEVICE=cpu python3 setup.py develop | |
| # Replace requirements/test.txt with the CPU-specific set compiled in vllm-test-deps | |
| # (presumably because requirements/dev.txt includes test.txt -- confirm). | |
| COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt | |
| RUN --mount=type=cache,target=/root/.cache/uv \ | |
| uv pip install -r requirements/dev.txt && \ | |
| pre-commit install --hook-type pre-commit --hook-type commit-msg | |
| ENTRYPOINT ["bash"] | |
| ######################### TEST IMAGE ######################### | |
| # CI test image: the test dependencies plus the built vLLM wheel and the files the tests need. | |
| FROM vllm-test-deps AS vllm-test | |
| WORKDIR /workspace/ | |
| # Install the wheel straight from the build stage; dist/ is bind-mounted, not copied. | |
| RUN --mount=type=cache,target=/root/.cache/uv \ | |
| --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \ | |
| uv pip install dist/*.whl | |
| # COPY rather than ADD: these are plain local paths, so ADD's archive-extraction and | |
| # URL-fetching behaviour is not wanted here. | |
| COPY ./tests/ ./tests/ | |
| COPY ./examples/ ./examples/ | |
| COPY ./benchmarks/ ./benchmarks/ | |
| COPY ./vllm/collect_env.py . | |
| COPY ./.buildkite/ ./.buildkite/ | |
| # Create symlink for vllm-workspace to maintain CI compatibility | |
| RUN ln -sf /workspace /vllm-workspace | |
| # install development dependencies (for testing) | |
| RUN --mount=type=cache,target=/root/.cache/uv \ | |
| uv pip install -e tests/vllm_test_utils | |
| ######################### RELEASE IMAGE ######################### | |
| # Serving image: the vLLM wheel plus, on amd64, the ZenDNN PyTorch plugin built from source. | |
| FROM base AS vllm-openai | |
| WORKDIR /workspace/ | |
| # Install the wheel produced by the vllm-build stage; dist/ is bind-mounted, not copied. | |
| RUN --mount=type=cache,target=/root/.cache/uv \ | |
| --mount=type=cache,target=/root/.cache/ccache \ | |
| --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \ | |
| uv pip install dist/*.whl | |
| # Toolchain and libraries used to compile the plugin below. | |
| # The apt lists are removed in the same layer so they do not add to the image size. | |
| RUN apt-get update -y \ | |
| && apt-get install -y --no-install-recommends make cmake ccache git curl wget ca-certificates \ | |
| gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg \ | |
| libsm6 libxext6 libgl1 jq lsof libjemalloc2 gfortran \ | |
| && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \ | |
| && rm -rf /var/lib/apt/lists/* | |
| # ZenDNN targets x86_64 CPUs, so the plugin is built only when TARGETARCH (exported as ENV by | |
| # base-common) is amd64; arm64 builds skip this step instead of failing. | |
| # The uv cache is a cache mount and the source/build tree is deleted once the wheel is installed, | |
| # so neither ends up in the image layer. | |
| # NOTE(review): the clone is not pinned to a tag or commit, so builds are not reproducible. | |
| RUN --mount=type=cache,target=/root/.cache/uv \ | |
| if [ "${TARGETARCH}" = "amd64" ]; then \ | |
| git clone https://github.com/amd/ZenDNN-pytorch-plugin.git && \ | |
| cd ZenDNN-pytorch-plugin && \ | |
| uv pip install -r requirements.txt && \ | |
| CC=gcc CXX=g++ python3 setup.py bdist_wheel && \ | |
| uv pip install dist/*.whl && \ | |
| cd .. && \ | |
| rm -rf ZenDNN-pytorch-plugin ; \ | |
| fi | |
| ENTRYPOINT ["vllm", "serve"] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Results Qwen/Qwen3-8B | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 64.40 | |
| Total input tokens: 2560 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.31 | |
| Output token throughput (tok/s): 39.75 | |
| Peak output token throughput (tok/s): 52.00 | |
| Peak concurrent requests: 8.00 | |
| Total token throughput (tok/s): 79.50 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 1616.70 | |
| Median TTFT (ms): 1724.28 | |
| P99 TTFT (ms): 2761.99 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 88.65 | |
| Median TPOT (ms): 86.42 | |
| P99 TPOT (ms): 102.80 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 88.65 | |
| Median ITL (ms): 87.08 | |
| P99 ITL (ms): 101.28 | |
| ================================================== | |
| Results Qwen/Qwen3-4B | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 42.67 | |
| Total input tokens: 2560 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.47 | |
| Output token throughput (tok/s): 60.00 | |
| Peak output token throughput (tok/s): 76.00 | |
| Peak concurrent requests: 8.00 | |
| Total token throughput (tok/s): 120.00 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 1037.59 | |
| Median TTFT (ms): 1024.18 | |
| P99 TTFT (ms): 2069.61 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 59.00 | |
| Median TPOT (ms): 57.91 | |
| P99 TPOT (ms): 70.71 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 59.00 | |
| Median ITL (ms): 57.16 | |
| P99 ITL (ms): 66.71 | |
| ================================================== | |
| Results Qwen/Qwen3-1.7B | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 26.79 | |
| Total input tokens: 2560 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.75 | |
| Output token throughput (tok/s): 95.55 | |
| Peak output token throughput (tok/s): 112.00 | |
| Peak concurrent requests: 8.00 | |
| Total token throughput (tok/s): 191.10 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 560.05 | |
| Median TTFT (ms): 482.31 | |
| P99 TTFT (ms): 1492.30 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 37.77 | |
| Median TPOT (ms): 36.75 | |
| P99 TPOT (ms): 45.51 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 37.77 | |
| Median ITL (ms): 36.54 | |
| P99 ITL (ms): 45.62 | |
| ================================================== | |
| Results Qwen/Qwen3-0.6B | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 20.51 | |
| Total input tokens: 2560 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.98 | |
| Output token throughput (tok/s): 124.81 | |
| Peak output token throughput (tok/s): 148.00 | |
| Peak concurrent requests: 8.00 | |
| Total token throughput (tok/s): 249.62 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 495.14 | |
| Median TTFT (ms): 403.64 | |
| P99 TTFT (ms): 1452.09 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 28.39 | |
| Median TPOT (ms): 27.57 | |
| P99 TPOT (ms): 36.38 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 28.39 | |
| Median ITL (ms): 27.24 | |
| P99 ITL (ms): 31.03 | |
| ================================================== | |
| Results meta-llama/Llama-3.2-1B | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 20.65 | |
| Total input tokens: 2560 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.97 | |
| Output token throughput (tok/s): 123.95 | |
| Peak output token throughput (tok/s): 149.00 | |
| Peak concurrent requests: 8.00 | |
| Total token throughput (tok/s): 247.91 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 480.86 | |
| Median TTFT (ms): 389.88 | |
| P99 TTFT (ms): 1313.63 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 28.72 | |
| Median TPOT (ms): 27.99 | |
| P99 TPOT (ms): 35.85 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 28.72 | |
| Median ITL (ms): 27.64 | |
| P99 ITL (ms): 30.66 | |
| ================================================== | |
| Results meta-llama/Llama-3.2-3B | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 35.62 | |
| Total input tokens: 2560 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.56 | |
| Output token throughput (tok/s): 71.87 | |
| Peak output token throughput (tok/s): 92.00 | |
| Peak concurrent requests: 8.00 | |
| Total token throughput (tok/s): 143.74 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 979.22 | |
| Median TTFT (ms): 934.78 | |
| P99 TTFT (ms): 1795.37 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 48.35 | |
| Median TPOT (ms): 47.79 | |
| P99 TPOT (ms): 53.81 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 48.35 | |
| Median ITL (ms): 46.53 | |
| P99 ITL (ms): 52.83 | |
| ================================================== | |
| Results google/gemma-3-1b-it | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 26.30 | |
| Total input tokens: 2560 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.76 | |
| Output token throughput (tok/s): 97.35 | |
| Peak output token throughput (tok/s): 112.00 | |
| Peak concurrent requests: 8.00 | |
| Total token throughput (tok/s): 194.70 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 483.12 | |
| Median TTFT (ms): 374.32 | |
| P99 TTFT (ms): 1394.82 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 37.59 | |
| Median TPOT (ms): 37.05 | |
| P99 TPOT (ms): 44.19 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 37.59 | |
| Median ITL (ms): 36.55 | |
| P99 ITL (ms): 40.29 | |
| ================================================== | |
| Results google/gemma-3-4b-it | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 47.56 | |
| Total input tokens: 2560 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.42 | |
| Output token throughput (tok/s): 53.83 | |
| Peak output token throughput (tok/s): 68.00 | |
| Peak concurrent requests: 8.00 | |
| Total token throughput (tok/s): 107.65 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 1194.13 | |
| Median TTFT (ms): 1178.90 | |
| P99 TTFT (ms): 1899.93 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 65.44 | |
| Median TPOT (ms): 64.73 | |
| P99 TPOT (ms): 69.39 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 65.44 | |
| Median ITL (ms): 63.80 | |
| P99 ITL (ms): 73.35 | |
| ================================================== | |
| Results google/gemma-3-12b-it | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 100.96 | |
| Total input tokens: 2560 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.20 | |
| Output token throughput (tok/s): 25.36 | |
| Peak output token throughput (tok/s): 32.00 | |
| Peak concurrent requests: 8.00 | |
| Total token throughput (tok/s): 50.71 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 3094.45 | |
| Median TTFT (ms): 3508.26 | |
| P99 TTFT (ms): 4408.72 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 134.57 | |
| Median TPOT (ms): 129.60 | |
| P99 TPOT (ms): 156.76 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 134.57 | |
| Median ITL (ms): 126.41 | |
| P99 ITL (ms): 145.44 | |
| ================================================== | |
| Results microsoft/phi-4 | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 107.44 | |
| Total input tokens: 2560 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.19 | |
| Output token throughput (tok/s): 23.83 | |
| Peak output token throughput (tok/s): 36.00 | |
| Peak concurrent requests: 8.00 | |
| Total token throughput (tok/s): 47.65 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 4263.18 | |
| Median TTFT (ms): 5383.14 | |
| P99 TTFT (ms): 6157.70 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 135.58 | |
| Median TPOT (ms): 129.80 | |
| P99 TPOT (ms): 167.44 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 135.58 | |
| Median ITL (ms): 124.98 | |
| P99 ITL (ms): 150.06 | |
| ================================================== | |
| Results microsoft/Phi-4-mini-instruct | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 38.03 | |
| Total input tokens: 2560 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.53 | |
| Output token throughput (tok/s): 67.32 | |
| Peak output token throughput (tok/s): 92.00 | |
| Peak concurrent requests: 8.00 | |
| Total token throughput (tok/s): 134.64 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 1380.97 | |
| Median TTFT (ms): 1519.74 | |
| P99 TTFT (ms): 2179.02 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 48.99 | |
| Median TPOT (ms): 46.95 | |
| P99 TPOT (ms): 60.58 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 48.99 | |
| Median ITL (ms): 45.89 | |
| P99 ITL (ms): 51.02 | |
| ================================================== |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Results Qwen/Qwen3-8B | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 123.99 | |
| Total input tokens: 2560 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.16 | |
| Output token throughput (tok/s): 20.65 | |
| Peak output token throughput (tok/s): 24.00 | |
| Peak concurrent requests: 8.00 | |
| Total Token throughput (tok/s): 41.29 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 2473.31 | |
| Median TTFT (ms): 2156.38 | |
| P99 TTFT (ms): 3685.90 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 175.59 | |
| Median TPOT (ms): 176.40 | |
| P99 TPOT (ms): 178.31 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 175.59 | |
| Median ITL (ms): 171.24 | |
| P99 ITL (ms): 220.61 | |
| ================================================== | |
| Results Qwen/Qwen3-4B | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 71.57 | |
| Total input tokens: 2560 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.28 | |
| Output token throughput (tok/s): 35.77 | |
| Peak output token throughput (tok/s): 44.00 | |
| Peak concurrent requests: 8.00 | |
| Total Token throughput (tok/s): 71.54 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 1351.49 | |
| Median TTFT (ms): 1366.27 | |
| P99 TTFT (ms): 2523.72 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 102.02 | |
| Median TPOT (ms): 99.59 | |
| P99 TPOT (ms): 116.01 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 102.02 | |
| Median ITL (ms): 98.58 | |
| P99 ITL (ms): 145.66 | |
| ================================================== | |
| Results Qwen/Qwen3-1.7B | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 37.10 | |
| Total input tokens: 2560 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.54 | |
| Output token throughput (tok/s): 69.00 | |
| Peak output token throughput (tok/s): 80.00 | |
| Peak concurrent requests: 8.00 | |
| Total Token throughput (tok/s): 137.99 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 630.01 | |
| Median TTFT (ms): 542.26 | |
| P99 TTFT (ms): 1645.25 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 53.45 | |
| Median TPOT (ms): 52.55 | |
| P99 TPOT (ms): 62.05 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 53.45 | |
| Median ITL (ms): 52.17 | |
| P99 ITL (ms): 57.15 | |
| ================================================== | |
| Results Qwen/Qwen3-0.6B | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 21.13 | |
| Total input tokens: 2560 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.95 | |
| Output token throughput (tok/s): 121.17 | |
| Peak output token throughput (tok/s): 140.00 | |
| Peak concurrent requests: 8.00 | |
| Total Token throughput (tok/s): 242.33 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 436.76 | |
| Median TTFT (ms): 246.71 | |
| P99 TTFT (ms): 1347.68 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 29.80 | |
| Median TPOT (ms): 29.74 | |
| P99 TPOT (ms): 30.15 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 29.80 | |
| Median ITL (ms): 29.24 | |
| P99 ITL (ms): 34.54 | |
| ================================================== | |
| Results meta-llama/Llama-3.2-1B | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 27.27 | |
| Total input tokens: 2540 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.73 | |
| Output token throughput (tok/s): 93.89 | |
| Peak output token throughput (tok/s): 108.00 | |
| Peak concurrent requests: 8.00 | |
| Total Token throughput (tok/s): 187.04 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 491.38 | |
| Median TTFT (ms): 385.28 | |
| P99 TTFT (ms): 1484.12 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 39.06 | |
| Median TPOT (ms): 38.46 | |
| P99 TPOT (ms): 47.34 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 39.06 | |
| Median ITL (ms): 37.85 | |
| P99 ITL (ms): 48.89 | |
| ================================================== | |
| Results meta-llama/Llama-3.2-3B | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 58.71 | |
| Total input tokens: 2540 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.34 | |
| Output token throughput (tok/s): 43.61 | |
| Peak output token throughput (tok/s): 52.00 | |
| Peak concurrent requests: 8.00 | |
| Total Token throughput (tok/s): 86.87 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 1137.78 | |
| Median TTFT (ms): 934.36 | |
| P99 TTFT (ms): 2103.96 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 83.40 | |
| Median TPOT (ms): 83.52 | |
| P99 TPOT (ms): 84.41 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 83.40 | |
| Median ITL (ms): 81.68 | |
| P99 ITL (ms): 102.00 | |
| ================================================== | |
| Results google/gemma-3-1b-it | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 30.54 | |
| Total input tokens: 2540 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.65 | |
| Output token throughput (tok/s): 83.81 | |
| Peak output token throughput (tok/s): 96.00 | |
| Peak concurrent requests: 8.00 | |
| Total Token throughput (tok/s): 166.97 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 452.64 | |
| Median TTFT (ms): 336.75 | |
| P99 TTFT (ms): 1422.21 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 44.52 | |
| Median TPOT (ms): 43.66 | |
| P99 TPOT (ms): 53.19 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 44.52 | |
| Median ITL (ms): 43.00 | |
| P99 ITL (ms): 55.78 | |
| ================================================== | |
| Results google/gemma-3-4b-it | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 70.37 | |
| Total input tokens: 2540 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.28 | |
| Output token throughput (tok/s): 36.38 | |
| Peak output token throughput (tok/s): 40.00 | |
| Peak concurrent requests: 8.00 | |
| Total Token throughput (tok/s): 72.47 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 1044.36 | |
| Median TTFT (ms): 1050.06 | |
| P99 TTFT (ms): 2162.43 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 102.52 | |
| Median TPOT (ms): 102.40 | |
| P99 TPOT (ms): 104.82 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 102.52 | |
| Median ITL (ms): 100.12 | |
| P99 ITL (ms): 122.59 | |
| ================================================== | |
| Results google/gemma-3-12b-it | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 183.76 | |
| Total input tokens: 2540 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.11 | |
| Output token throughput (tok/s): 13.93 | |
| Peak output token throughput (tok/s): 20.00 | |
| Peak concurrent requests: 8.00 | |
| Total Token throughput (tok/s): 27.75 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 3657.92 | |
| Median TTFT (ms): 3872.83 | |
| P99 TTFT (ms): 5594.60 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 260.39 | |
| Median TPOT (ms): 260.42 | |
| P99 TPOT (ms): 268.68 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 260.39 | |
| Median ITL (ms): 251.44 | |
| P99 ITL (ms): 278.57 | |
| ================================================== | |
| Results microsoft/phi-4 | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 187.20 | |
| Total input tokens: 2560 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.11 | |
| Output token throughput (tok/s): 13.68 | |
| Peak output token throughput (tok/s): 16.00 | |
| Peak concurrent requests: 8.00 | |
| Total Token throughput (tok/s): 27.35 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 3416.20 | |
| Median TTFT (ms): 3838.29 | |
| P99 TTFT (ms): 4923.56 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 267.80 | |
| Median TPOT (ms): 263.83 | |
| P99 TPOT (ms): 287.36 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 267.80 | |
| Median ITL (ms): 260.23 | |
| P99 ITL (ms): 344.91 | |
| ================================================== | |
| Results microsoft/Phi-4-mini-instruct | |
| ============ Serving Benchmark Result ============ | |
| Successful requests: 20 | |
| Failed requests: 0 | |
| Maximum request concurrency: 4 | |
| Benchmark duration (s): 59.19 | |
| Total input tokens: 2560 | |
| Total generated tokens: 2560 | |
| Request throughput (req/s): 0.34 | |
| Output token throughput (tok/s): 43.25 | |
| Peak output token throughput (tok/s): 52.00 | |
| Peak concurrent requests: 8.00 | |
| Total Token throughput (tok/s): 86.50 | |
| ---------------Time to First Token---------------- | |
| Mean TTFT (ms): 1016.35 | |
| Median TTFT (ms): 999.32 | |
| P99 TTFT (ms): 2139.12 | |
| -----Time per Output Token (excl. 1st token)------ | |
| Mean TPOT (ms): 85.18 | |
| Median TPOT (ms): 83.85 | |
| P99 TPOT (ms): 94.76 | |
| ---------------Inter-token Latency---------------- | |
| Mean ITL (ms): 85.18 | |
| Median ITL (ms): 82.53 | |
| P99 ITL (ms): 108.94 | |
| ================================================== |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Install Docker Engine from Docker's apt repository. | |
| # The commands write under /etc/apt and do not use sudo, so they must be run as root. | |
| # `apt install` is called without -y and will therefore ask for confirmation. | |
| # Add Docker's official GPG key | |
| apt update | |
| apt install ca-certificates curl | |
| install -m 0755 -d /etc/apt/keyrings | |
| curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc | |
| chmod a+r /etc/apt/keyrings/docker.asc | |
| # Add the repository to Apt sources | |
| # The heredoc delimiter is unquoted, so the $(...) on the Suites line is expanded when the | |
| # file is written: it becomes the release codename read from /etc/os-release. | |
| tee /etc/apt/sources.list.d/docker.sources <<EOF | |
| Types: deb | |
| URIs: https://download.docker.com/linux/ubuntu | |
| Suites: $(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") | |
| Components: stable | |
| Signed-By: /etc/apt/keyrings/docker.asc | |
| EOF | |
| # Refresh the package lists so the new repository is visible, then install Docker. | |
| apt update | |
| apt install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin | |
| # Clone vLLM and build two images: vllm-cpu (stock CPU image) and vllm-cpu-zentorch | |
| # (vllm-cpu plus the ZenDNN PyTorch plugin). | |
| git clone https://github.com/vllm-project/vllm | |
| pushd vllm | |
| # Create docker file and build image | |
| # We are already inside the repository after pushd, so the file goes to docker/ relative to | |
| # the repository root -- the same path the second `docker build -f` below reads. | |
| # The quoted 'EOF' keeps the shell from expanding anything inside the heredoc. | |
| cat > docker/Dockerfile.cpu-amd << 'EOF' | |
| FROM vllm-cpu:latest | |
| RUN apt-get update -y \ | |
| && apt-get install -y --no-install-recommends make cmake ccache git curl wget ca-certificates \ | |
| gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg \ | |
| libsm6 libxext6 libgl1 jq lsof libjemalloc2 gfortran \ | |
| && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 | |
| RUN git clone https://github.com/amd/ZenDNN-pytorch-plugin.git && \ | |
| cd ZenDNN-pytorch-plugin && \ | |
| uv pip install -r requirements.txt && \ | |
| CC=gcc CXX=g++ python3 setup.py bdist_wheel && \ | |
| uv pip install dist/*.whl | |
| ENTRYPOINT ["vllm", "serve"] | |
| EOF | |
| # Base image: must be built first because Dockerfile.cpu-amd starts FROM vllm-cpu:latest. | |
| docker build -f docker/Dockerfile.cpu \ | |
| --build-arg VLLM_CPU_AVX512BF16=1 \ | |
| --build-arg VLLM_CPU_AVX512VNNI=1 \ | |
| --build-arg VLLM_CPU_DISABLE_AVX512=0 \ | |
| --tag vllm-cpu \ | |
| --target vllm-openai \ | |
| . | |
| # Dockerfile.cpu-amd has a single unnamed stage and declares no ARGs, so neither --target | |
| # (which would fail: there is no stage called vllm-openai in it) nor --build-arg is passed. | |
| docker build -f docker/Dockerfile.cpu-amd \ | |
| --tag vllm-cpu-zentorch \ | |
| . | |
| popd | |
| # Host-side virtualenv holding the vllm package; testVLLm runs the benchmark client from it. | |
| # NOTE(review): python3.12-venv assumes the host's python3 is 3.12 -- adjust for other releases. | |
| apt install python3.12-venv | |
| python3 -m venv ~/.venvs/vllm | |
| ~/.venvs/vllm/bin/pip install vllm ijson | |
| # Start (or restart) the detached `vllm-server` container serving MODEL. | |
| # Usage: runVLLmServer MODEL [HF_TOKEN] | |
| # VLLM_CPU_OMP_THREADS_BIND is derived from the host CPU count below; | |
| # adjust CORES if that does not match your CPU cores. | |
| function runVLLmServer() { | |
| # Refuse to run without a model before touching any existing container. | |
| if [ -z "${1:-}" ]; then | |
| echo "usage: runVLLmServer MODEL [HF_TOKEN]" >&2 | |
| return 1 | |
| fi | |
| docker kill vllm-server || echo "no existing container" && sleep 2 | |
| docker rm vllm-server || echo "no existing container to remove" && sleep 5 | |
| local MODEL="$1" | |
| # CPUs 0..(N-2), where N is the CPU count reported by lscpu. | |
| local CORES="0-$(($(lscpu | awk '/^CPU\(s\)/ {print $2}') - 2))" | |
| # Total memory in MiB (from free -m) minus 1024. | |
| local SHM_SIZE="$(($(free -m | awk '/Mem/ {print $2}') - 1024))" | |
| # Forward a Hugging Face token only when one was supplied. Passing a placeholder such as | |
| # "None" would be sent as a real credential, which the Hub is expected to reject. | |
| local -a HF_TOKEN_ENV=() | |
| if [ -n "${2:-}" ]; then | |
| HF_TOKEN_ENV=(--env "HUGGING_FACE_HUB_TOKEN=${2}") | |
| fi | |
| # NOTE(review): the container runs privileged with host networking and host IPC. | |
| docker run --net=host \ | |
| --ipc=host \ | |
| --shm-size="${SHM_SIZE}m" \ | |
| --privileged=true \ | |
| --detach \ | |
| --volume /var/lib/huggingface:/root/.cache/huggingface \ | |
| "${HF_TOKEN_ENV[@]}" \ | |
| --env VLLM_PLUGINS="zentorch" \ | |
| --env VLLM_CPU_KVCACHE_SPACE=50 \ | |
| --env VLLM_CPU_OMP_THREADS_BIND="${CORES}" \ | |
| --env VLLM_CPU_NUM_OF_RESERVED_CPU=1 \ | |
| --name vllm-server \ | |
| --rm \ | |
| vllm-cpu-zentorch:latest --dtype=bfloat16 \ | |
| --max-num-seqs=5 \ | |
| --model="${MODEL}" \ | |
| && echo "Monitor logging with \`docker logs -f vllm-server\`" | |
| } | |
| # Run the vLLM serving benchmark for MODEL against the server on http://localhost:8000. | |
| # Usage: testVLLm MODEL [HF_TOKEN] | |
| # Sends 20 prompts with at most 4 in flight, 128 random input and 128 output tokens each. | |
| function testVLLm() { | |
| if [ -z "${1:-}" ]; then | |
| echo "usage: testVLLm MODEL [HF_TOKEN]" >&2 | |
| return 1 | |
| fi | |
| local MODEL="$1" | |
| # Set the token only when one was supplied; otherwise the caller's environment is left | |
| # untouched instead of being overridden with the placeholder string "None". | |
| local -a HF_TOKEN_ENV=() | |
| if [ -n "${2:-}" ]; then | |
| HF_TOKEN_ENV=("HUGGING_FACE_HUB_TOKEN=${2}") | |
| fi | |
| env "${HF_TOKEN_ENV[@]}" ~/.venvs/vllm/bin/python3 \ | |
| -m vllm.entrypoints.cli.main bench serve --backend vllm \ | |
| --base-url http://localhost:8000 \ | |
| --model "${MODEL}" \ | |
| --tokenizer "${MODEL}" \ | |
| --random-input-len 128 \ | |
| --random-output-len 128 \ | |
| --num-prompts 20 \ | |
| --max-concurrency 4 \ | |
| --temperature 0.7 | |
| } | |
| # validate functionality | |
| # Queries the /v1/models endpoint on localhost:8000 and pretty-prints the JSON reply with jq. | |
| curl http://localhost:8000/v1/models | jq |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment