Skip to content

Instantly share code, notes, and snippets.

@cloudnull
Last active December 20, 2025 17:17
Show Gist options
  • Select an option

  • Save cloudnull/58f05958cb96ff73647b1eda3b6bff0d to your computer and use it in GitHub Desktop.

Select an option

Save cloudnull/58f05958cb96ff73647b1eda3b6bff0d to your computer and use it in GitHub Desktop.
# This vLLM Dockerfile is used to build images that can run vLLM on both x86_64 and arm64 CPU platforms.
#
# Supported platforms:
# - linux/amd64 (x86_64)
# - linux/arm64 (aarch64)
#
# Use the `--platform` option with `docker buildx build` to specify the target architecture, e.g.:
# docker buildx build --platform=linux/arm64 -f docker/Dockerfile.cpu .
#
# Build targets:
# vllm-openai (default): used for serving deployment
# vllm-test: used for CI tests
# vllm-dev: used for development
#
# Build arguments:
# PYTHON_VERSION=3.13|3.12 (default)|3.11|3.10
# VLLM_CPU_DISABLE_AVX512=false (default)|true
# VLLM_CPU_AVX512BF16=false (default)|true
# VLLM_CPU_AVX512VNNI=false (default)|true
# VLLM_CPU_AMXBF16=false (default)|true
#
######################### COMMON BASE IMAGE #########################
# Shared base for every target: compiler toolchain, uv, a seeded virtualenv,
# and the common CPU Python dependencies for vLLM.
FROM ubuntu:22.04 AS base-common
WORKDIR /workspace/
ARG PYTHON_VERSION=3.12
# Extra index so pip/uv resolve the CPU-only PyTorch wheels.
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
# Install minimal dependencies and uv
# Cache mounts keep the apt archives/lists on the build host (not in the
# image); sharing=locked serializes concurrent builds using the same cache.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get update -y \
&& apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
&& curl -LsSf https://astral.sh/uv/install.sh | sh
# Compile with gcc-12/g++-12 and route C++ compiles through ccache.
ENV CC=/usr/bin/gcc-12 CXX=/usr/bin/g++-12
ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
# The uv installer places the binary under /root/.local/bin.
ENV PATH="/root/.local/bin:$PATH"
ENV VIRTUAL_ENV="/opt/venv"
ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python
# Create the project virtualenv; --seed adds pip/setuptools/wheel to it.
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
ENV UV_HTTP_TIMEOUT=500
# Install Python dependencies
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
# Pick the best matching version across all configured indexes.
ENV UV_INDEX_STRATEGY="unsafe-best-match"
ENV UV_LINK_MODE="copy"
# Requirements files are bind-mounted from the build context so they never
# become image layers; the uv cache mount speeds up repeated installs.
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
uv pip install --upgrade pip && \
uv pip install -r requirements/cpu.txt
# TARGETARCH is the automatic BuildKit platform ARG (amd64 / arm64); it
# selects the per-architecture base stage below.
ARG TARGETARCH
ENV TARGETARCH=${TARGETARCH}
######################### x86_64 BASE IMAGE #########################
FROM base-common AS base-amd64
# Preload tcmalloc and Intel OpenMP (libiomp5 from the venv) for performance.
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so"
######################### arm64 BASE IMAGE #########################
FROM base-common AS base-arm64
# Preload tcmalloc only; no libiomp5 on aarch64.
ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
######################### BASE IMAGE #########################
# Resolves to base-amd64 or base-arm64 depending on --platform.
FROM base-${TARGETARCH} AS base
# Disable core dumps in interactive shells.
RUN echo 'ulimit -c 0' >> ~/.bashrc
######################### BUILD IMAGE #########################
# Wheel-building stage: compiles the CPU backend and emits dist/*.whl.
FROM base AS vllm-build
ARG max_jobs=32
ENV MAX_JOBS=${max_jobs}
ARG GIT_REPO_CHECK=0
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512=0
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
# Support for building with AVX512BF16 ISA: docker build --build-arg VLLM_CPU_AVX512BF16="true" ...
ARG VLLM_CPU_AVX512BF16=0
ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16}
# Support for building with AVX512VNNI ISA: docker build --build-arg VLLM_CPU_AVX512VNNI="true" ...
ARG VLLM_CPU_AVX512VNNI=0
ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI}
# Support for building with AMXBF16 ISA: docker build --build-arg VLLM_CPU_AMXBF16="true" ...
ARG VLLM_CPU_AMXBF16=0
ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16}
WORKDIR /workspace/vllm
# Build-only Python dependencies; bind-mounted so they create no layer.
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=requirements/cpu-build.txt,target=requirements/build.txt \
uv pip install -r requirements/build.txt
# Copy the whole build context. NOTE(review): relies on .dockerignore to keep
# the context small; any source change invalidates the layers below.
COPY . .
# Optional repository sanity check, enabled via --build-arg GIT_REPO_CHECK=1.
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# Build the wheel. ccache/uv caches and the .deps CMake dependency cache live
# on the build host, so incremental rebuilds are fast and the image stays slim.
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
######################### TEST DEPS #########################
# Resolves and installs the CPU test requirements. The sed pipeline strips
# packages that cannot build/run on CPU (mamba_ssm) or on aarch64
# (decord, terratorch), then re-pins torch and unpins torchaudio/torchvision
# before compiling a locked cpu-test.txt with uv.
FROM base AS vllm-test-deps
WORKDIR /workspace/vllm
# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
cp requirements/test.in requirements/cpu-test.in && \
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
remove_packages_not_supported_on_aarch64() { \
case "$(uname -m)" in \
aarch64|arm64) \
sed -i '/decord/d' requirements/cpu-test.in; \
sed -i '/terratorch/d' requirements/cpu-test.in; \
;; \
esac; \
}; \
remove_packages_not_supported_on_aarch64 && \
sed -i 's/^torch==.*/torch==2.9.1/g' requirements/cpu-test.in && \
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
# Install the resolved, pinned test requirements.
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/cpu-test.txt
######################### DEV IMAGE #########################
# Development image: editable vLLM checkout plus test and lint tooling.
FROM vllm-build AS vllm-dev
WORKDIR /workspace/vllm
# FIX: `apt-get install` previously ran without a matching `apt-get update`
# in the same RUN (hadolint DL3009); with a cold or pruned apt cache mount
# there are no package lists and the install fails. Update and install
# together in one layer, reusing the shared apt cache mounts.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get update -y \
&& apt-get install -y --no-install-recommends vim numactl xz-utils
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils
# Editable (develop-mode) build of vLLM itself.
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py develop
# Reuse the pinned CPU test requirements resolved in vllm-test-deps.
COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/dev.txt && \
pre-commit install --hook-type pre-commit --hook-type commit-msg
ENTRYPOINT ["bash"]
######################### TEST IMAGE #########################
# CI test image: installs the built wheel plus test sources and utilities.
FROM vllm-test-deps AS vllm-test
WORKDIR /workspace/
# Install the wheel produced by vllm-build (bind-mounted; no layer created).
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
uv pip install dist/*.whl
# FIX: use COPY instead of ADD for plain local files (hadolint DL3020);
# ADD's extra behaviors (tar auto-extraction, URL fetching) are not wanted.
COPY ./tests/ ./tests/
COPY ./examples/ ./examples/
COPY ./benchmarks/ ./benchmarks/
COPY ./vllm/collect_env.py .
COPY ./.buildkite/ ./.buildkite/
# Create symlink for vllm-workspace to maintain CI compatibility
RUN ln -sf /workspace /vllm-workspace
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils
######################### RELEASE IMAGE #########################
# Serving image: the built vLLM wheel plus the AMD ZenDNN (zentorch)
# PyTorch plugin, compiled in-place.
FROM base AS vllm-openai
WORKDIR /workspace/
# Install the wheel produced by vllm-build (bind-mounted; dist/ never enters
# this image's layers).
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
uv pip install dist/*.whl
# Build tooling needed to compile the ZenDNN plugin below.
# FIX: remove the apt package lists in the same layer that creates them;
# previously they were baked into the final image (needless bloat).
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends make cmake ccache git curl wget ca-certificates \
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg \
libsm6 libxext6 libgl1 jq lsof libjemalloc2 gfortran \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
&& rm -rf /var/lib/apt/lists/*
# Build and install the zentorch wheel, then delete the checkout in the same
# layer.
# FIX: shallow clone (--depth 1) and post-install cleanup; previously the
# full repository history and build tree remained in the release image.
# NOTE(review): the clone is unpinned (default branch HEAD) — consider
# pinning a release tag for reproducible builds.
RUN git clone --depth 1 https://github.com/amd/ZenDNN-pytorch-plugin.git && \
cd ZenDNN-pytorch-plugin && \
uv pip install -r requirements.txt && \
CC=gcc CXX=g++ python3 setup.py bdist_wheel && \
uv pip install dist/*.whl && \
cd /workspace && \
rm -rf ZenDNN-pytorch-plugin
ENTRYPOINT ["vllm", "serve"]
Results Qwen/Qwen3-8B
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 64.40
Total input tokens: 2560
Total generated tokens: 2560
Request throughput (req/s): 0.31
Output token throughput (tok/s): 39.75
Peak output token throughput (tok/s): 52.00
Peak concurrent requests: 8.00
Total token throughput (tok/s): 79.50
---------------Time to First Token----------------
Mean TTFT (ms): 1616.70
Median TTFT (ms): 1724.28
P99 TTFT (ms): 2761.99
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 88.65
Median TPOT (ms): 86.42
P99 TPOT (ms): 102.80
---------------Inter-token Latency----------------
Mean ITL (ms): 88.65
Median ITL (ms): 87.08
P99 ITL (ms): 101.28
==================================================
Results Qwen/Qwen3-4B
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 42.67
Total input tokens: 2560
Total generated tokens: 2560
Request throughput (req/s): 0.47
Output token throughput (tok/s): 60.00
Peak output token throughput (tok/s): 76.00
Peak concurrent requests: 8.00
Total token throughput (tok/s): 120.00
---------------Time to First Token----------------
Mean TTFT (ms): 1037.59
Median TTFT (ms): 1024.18
P99 TTFT (ms): 2069.61
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 59.00
Median TPOT (ms): 57.91
P99 TPOT (ms): 70.71
---------------Inter-token Latency----------------
Mean ITL (ms): 59.00
Median ITL (ms): 57.16
P99 ITL (ms): 66.71
==================================================
Results Qwen/Qwen3-1.7B
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 26.79
Total input tokens: 2560
Total generated tokens: 2560
Request throughput (req/s): 0.75
Output token throughput (tok/s): 95.55
Peak output token throughput (tok/s): 112.00
Peak concurrent requests: 8.00
Total token throughput (tok/s): 191.10
---------------Time to First Token----------------
Mean TTFT (ms): 560.05
Median TTFT (ms): 482.31
P99 TTFT (ms): 1492.30
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 37.77
Median TPOT (ms): 36.75
P99 TPOT (ms): 45.51
---------------Inter-token Latency----------------
Mean ITL (ms): 37.77
Median ITL (ms): 36.54
P99 ITL (ms): 45.62
==================================================
Results Qwen/Qwen3-0.6B
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 20.51
Total input tokens: 2560
Total generated tokens: 2560
Request throughput (req/s): 0.98
Output token throughput (tok/s): 124.81
Peak output token throughput (tok/s): 148.00
Peak concurrent requests: 8.00
Total token throughput (tok/s): 249.62
---------------Time to First Token----------------
Mean TTFT (ms): 495.14
Median TTFT (ms): 403.64
P99 TTFT (ms): 1452.09
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 28.39
Median TPOT (ms): 27.57
P99 TPOT (ms): 36.38
---------------Inter-token Latency----------------
Mean ITL (ms): 28.39
Median ITL (ms): 27.24
P99 ITL (ms): 31.03
==================================================
Results meta-llama/Llama-3.2-1B
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 20.65
Total input tokens: 2560
Total generated tokens: 2560
Request throughput (req/s): 0.97
Output token throughput (tok/s): 123.95
Peak output token throughput (tok/s): 149.00
Peak concurrent requests: 8.00
Total token throughput (tok/s): 247.91
---------------Time to First Token----------------
Mean TTFT (ms): 480.86
Median TTFT (ms): 389.88
P99 TTFT (ms): 1313.63
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 28.72
Median TPOT (ms): 27.99
P99 TPOT (ms): 35.85
---------------Inter-token Latency----------------
Mean ITL (ms): 28.72
Median ITL (ms): 27.64
P99 ITL (ms): 30.66
==================================================
Results meta-llama/Llama-3.2-3B
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 35.62
Total input tokens: 2560
Total generated tokens: 2560
Request throughput (req/s): 0.56
Output token throughput (tok/s): 71.87
Peak output token throughput (tok/s): 92.00
Peak concurrent requests: 8.00
Total token throughput (tok/s): 143.74
---------------Time to First Token----------------
Mean TTFT (ms): 979.22
Median TTFT (ms): 934.78
P99 TTFT (ms): 1795.37
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 48.35
Median TPOT (ms): 47.79
P99 TPOT (ms): 53.81
---------------Inter-token Latency----------------
Mean ITL (ms): 48.35
Median ITL (ms): 46.53
P99 ITL (ms): 52.83
==================================================
Results google/gemma-3-1b-it
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 26.30
Total input tokens: 2560
Total generated tokens: 2560
Request throughput (req/s): 0.76
Output token throughput (tok/s): 97.35
Peak output token throughput (tok/s): 112.00
Peak concurrent requests: 8.00
Total token throughput (tok/s): 194.70
---------------Time to First Token----------------
Mean TTFT (ms): 483.12
Median TTFT (ms): 374.32
P99 TTFT (ms): 1394.82
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 37.59
Median TPOT (ms): 37.05
P99 TPOT (ms): 44.19
---------------Inter-token Latency----------------
Mean ITL (ms): 37.59
Median ITL (ms): 36.55
P99 ITL (ms): 40.29
==================================================
Results google/gemma-3-4b-it
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 47.56
Total input tokens: 2560
Total generated tokens: 2560
Request throughput (req/s): 0.42
Output token throughput (tok/s): 53.83
Peak output token throughput (tok/s): 68.00
Peak concurrent requests: 8.00
Total token throughput (tok/s): 107.65
---------------Time to First Token----------------
Mean TTFT (ms): 1194.13
Median TTFT (ms): 1178.90
P99 TTFT (ms): 1899.93
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 65.44
Median TPOT (ms): 64.73
P99 TPOT (ms): 69.39
---------------Inter-token Latency----------------
Mean ITL (ms): 65.44
Median ITL (ms): 63.80
P99 ITL (ms): 73.35
==================================================
Results google/gemma-3-12b-it
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 100.96
Total input tokens: 2560
Total generated tokens: 2560
Request throughput (req/s): 0.20
Output token throughput (tok/s): 25.36
Peak output token throughput (tok/s): 32.00
Peak concurrent requests: 8.00
Total token throughput (tok/s): 50.71
---------------Time to First Token----------------
Mean TTFT (ms): 3094.45
Median TTFT (ms): 3508.26
P99 TTFT (ms): 4408.72
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 134.57
Median TPOT (ms): 129.60
P99 TPOT (ms): 156.76
---------------Inter-token Latency----------------
Mean ITL (ms): 134.57
Median ITL (ms): 126.41
P99 ITL (ms): 145.44
==================================================
Results microsoft/phi-4
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 107.44
Total input tokens: 2560
Total generated tokens: 2560
Request throughput (req/s): 0.19
Output token throughput (tok/s): 23.83
Peak output token throughput (tok/s): 36.00
Peak concurrent requests: 8.00
Total token throughput (tok/s): 47.65
---------------Time to First Token----------------
Mean TTFT (ms): 4263.18
Median TTFT (ms): 5383.14
P99 TTFT (ms): 6157.70
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 135.58
Median TPOT (ms): 129.80
P99 TPOT (ms): 167.44
---------------Inter-token Latency----------------
Mean ITL (ms): 135.58
Median ITL (ms): 124.98
P99 ITL (ms): 150.06
==================================================
Results microsoft/Phi-4-mini-instruct
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 38.03
Total input tokens: 2560
Total generated tokens: 2560
Request throughput (req/s): 0.53
Output token throughput (tok/s): 67.32
Peak output token throughput (tok/s): 92.00
Peak concurrent requests: 8.00
Total token throughput (tok/s): 134.64
---------------Time to First Token----------------
Mean TTFT (ms): 1380.97
Median TTFT (ms): 1519.74
P99 TTFT (ms): 2179.02
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 48.99
Median TPOT (ms): 46.95
P99 TPOT (ms): 60.58
---------------Inter-token Latency----------------
Mean ITL (ms): 48.99
Median ITL (ms): 45.89
P99 ITL (ms): 51.02
==================================================
Results Qwen/Qwen3-8B
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 123.99
Total input tokens: 2560
Total generated tokens: 2560
Request throughput (req/s): 0.16
Output token throughput (tok/s): 20.65
Peak output token throughput (tok/s): 24.00
Peak concurrent requests: 8.00
Total Token throughput (tok/s): 41.29
---------------Time to First Token----------------
Mean TTFT (ms): 2473.31
Median TTFT (ms): 2156.38
P99 TTFT (ms): 3685.90
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 175.59
Median TPOT (ms): 176.40
P99 TPOT (ms): 178.31
---------------Inter-token Latency----------------
Mean ITL (ms): 175.59
Median ITL (ms): 171.24
P99 ITL (ms): 220.61
==================================================
Results Qwen/Qwen3-4B
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 71.57
Total input tokens: 2560
Total generated tokens: 2560
Request throughput (req/s): 0.28
Output token throughput (tok/s): 35.77
Peak output token throughput (tok/s): 44.00
Peak concurrent requests: 8.00
Total Token throughput (tok/s): 71.54
---------------Time to First Token----------------
Mean TTFT (ms): 1351.49
Median TTFT (ms): 1366.27
P99 TTFT (ms): 2523.72
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 102.02
Median TPOT (ms): 99.59
P99 TPOT (ms): 116.01
---------------Inter-token Latency----------------
Mean ITL (ms): 102.02
Median ITL (ms): 98.58
P99 ITL (ms): 145.66
==================================================
Results Qwen/Qwen3-1.7B
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 37.10
Total input tokens: 2560
Total generated tokens: 2560
Request throughput (req/s): 0.54
Output token throughput (tok/s): 69.00
Peak output token throughput (tok/s): 80.00
Peak concurrent requests: 8.00
Total Token throughput (tok/s): 137.99
---------------Time to First Token----------------
Mean TTFT (ms): 630.01
Median TTFT (ms): 542.26
P99 TTFT (ms): 1645.25
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 53.45
Median TPOT (ms): 52.55
P99 TPOT (ms): 62.05
---------------Inter-token Latency----------------
Mean ITL (ms): 53.45
Median ITL (ms): 52.17
P99 ITL (ms): 57.15
==================================================
Results Qwen/Qwen3-0.6B
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 21.13
Total input tokens: 2560
Total generated tokens: 2560
Request throughput (req/s): 0.95
Output token throughput (tok/s): 121.17
Peak output token throughput (tok/s): 140.00
Peak concurrent requests: 8.00
Total Token throughput (tok/s): 242.33
---------------Time to First Token----------------
Mean TTFT (ms): 436.76
Median TTFT (ms): 246.71
P99 TTFT (ms): 1347.68
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 29.80
Median TPOT (ms): 29.74
P99 TPOT (ms): 30.15
---------------Inter-token Latency----------------
Mean ITL (ms): 29.80
Median ITL (ms): 29.24
P99 ITL (ms): 34.54
==================================================
Results meta-llama/Llama-3.2-1B
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 27.27
Total input tokens: 2540
Total generated tokens: 2560
Request throughput (req/s): 0.73
Output token throughput (tok/s): 93.89
Peak output token throughput (tok/s): 108.00
Peak concurrent requests: 8.00
Total Token throughput (tok/s): 187.04
---------------Time to First Token----------------
Mean TTFT (ms): 491.38
Median TTFT (ms): 385.28
P99 TTFT (ms): 1484.12
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 39.06
Median TPOT (ms): 38.46
P99 TPOT (ms): 47.34
---------------Inter-token Latency----------------
Mean ITL (ms): 39.06
Median ITL (ms): 37.85
P99 ITL (ms): 48.89
==================================================
Results meta-llama/Llama-3.2-3B
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 58.71
Total input tokens: 2540
Total generated tokens: 2560
Request throughput (req/s): 0.34
Output token throughput (tok/s): 43.61
Peak output token throughput (tok/s): 52.00
Peak concurrent requests: 8.00
Total Token throughput (tok/s): 86.87
---------------Time to First Token----------------
Mean TTFT (ms): 1137.78
Median TTFT (ms): 934.36
P99 TTFT (ms): 2103.96
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 83.40
Median TPOT (ms): 83.52
P99 TPOT (ms): 84.41
---------------Inter-token Latency----------------
Mean ITL (ms): 83.40
Median ITL (ms): 81.68
P99 ITL (ms): 102.00
==================================================
Results google/gemma-3-1b-it
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 30.54
Total input tokens: 2540
Total generated tokens: 2560
Request throughput (req/s): 0.65
Output token throughput (tok/s): 83.81
Peak output token throughput (tok/s): 96.00
Peak concurrent requests: 8.00
Total Token throughput (tok/s): 166.97
---------------Time to First Token----------------
Mean TTFT (ms): 452.64
Median TTFT (ms): 336.75
P99 TTFT (ms): 1422.21
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 44.52
Median TPOT (ms): 43.66
P99 TPOT (ms): 53.19
---------------Inter-token Latency----------------
Mean ITL (ms): 44.52
Median ITL (ms): 43.00
P99 ITL (ms): 55.78
==================================================
Results google/gemma-3-4b-it
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 70.37
Total input tokens: 2540
Total generated tokens: 2560
Request throughput (req/s): 0.28
Output token throughput (tok/s): 36.38
Peak output token throughput (tok/s): 40.00
Peak concurrent requests: 8.00
Total Token throughput (tok/s): 72.47
---------------Time to First Token----------------
Mean TTFT (ms): 1044.36
Median TTFT (ms): 1050.06
P99 TTFT (ms): 2162.43
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 102.52
Median TPOT (ms): 102.40
P99 TPOT (ms): 104.82
---------------Inter-token Latency----------------
Mean ITL (ms): 102.52
Median ITL (ms): 100.12
P99 ITL (ms): 122.59
==================================================
Results google/gemma-3-12b-it
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 183.76
Total input tokens: 2540
Total generated tokens: 2560
Request throughput (req/s): 0.11
Output token throughput (tok/s): 13.93
Peak output token throughput (tok/s): 20.00
Peak concurrent requests: 8.00
Total Token throughput (tok/s): 27.75
---------------Time to First Token----------------
Mean TTFT (ms): 3657.92
Median TTFT (ms): 3872.83
P99 TTFT (ms): 5594.60
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 260.39
Median TPOT (ms): 260.42
P99 TPOT (ms): 268.68
---------------Inter-token Latency----------------
Mean ITL (ms): 260.39
Median ITL (ms): 251.44
P99 ITL (ms): 278.57
==================================================
Results microsoft/phi-4
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 187.20
Total input tokens: 2560
Total generated tokens: 2560
Request throughput (req/s): 0.11
Output token throughput (tok/s): 13.68
Peak output token throughput (tok/s): 16.00
Peak concurrent requests: 8.00
Total Token throughput (tok/s): 27.35
---------------Time to First Token----------------
Mean TTFT (ms): 3416.20
Median TTFT (ms): 3838.29
P99 TTFT (ms): 4923.56
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 267.80
Median TPOT (ms): 263.83
P99 TPOT (ms): 287.36
---------------Inter-token Latency----------------
Mean ITL (ms): 267.80
Median ITL (ms): 260.23
P99 ITL (ms): 344.91
==================================================
Results microsoft/Phi-4-mini-instruct
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 59.19
Total input tokens: 2560
Total generated tokens: 2560
Request throughput (req/s): 0.34
Output token throughput (tok/s): 43.25
Peak output token throughput (tok/s): 52.00
Peak concurrent requests: 8.00
Total Token throughput (tok/s): 86.50
---------------Time to First Token----------------
Mean TTFT (ms): 1016.35
Median TTFT (ms): 999.32
P99 TTFT (ms): 2139.12
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 85.18
Median TPOT (ms): 83.85
P99 TPOT (ms): 94.76
---------------Inter-token Latency----------------
Mean ITL (ms): 85.18
Median ITL (ms): 82.53
P99 ITL (ms): 108.94
==================================================
# Add Docker's official GPG key
# Host setup (run as root on Ubuntu): install Docker Engine from Docker's
# official apt repository.
apt update
apt install ca-certificates curl
install -m 0755 -d /etc/apt/keyrings
curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
chmod a+r /etc/apt/keyrings/docker.asc
# Add the repository to Apt sources
# deb822-style source entry; Suites resolves to the running Ubuntu codename
# (UBUNTU_CODENAME, falling back to VERSION_CODENAME).
tee /etc/apt/sources.list.d/docker.sources <<EOF
Types: deb
URIs: https://download.docker.com/linux/ubuntu
Suites: $(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}")
Components: stable
Signed-By: /etc/apt/keyrings/docker.asc
EOF
apt update
apt install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
# Clone vLLM and build the CPU serving images.
git clone https://github.com/vllm-project/vllm
pushd vllm
# Create docker file and build image
# FIX: we are already inside the repository (pushd vllm above), so the
# derived Dockerfile must be written to docker/Dockerfile.cpu-amd. The
# previous path "vllm/docker/Dockerfile.cpu-amd" pointed inside the python
# package directory, which has no docker/ subdirectory, so the write failed
# and the second build below could not find its Dockerfile.
cat > docker/Dockerfile.cpu-amd << 'EOF'
FROM vllm-cpu:latest
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends make cmake ccache git curl wget ca-certificates \
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg \
libsm6 libxext6 libgl1 jq lsof libjemalloc2 gfortran \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
RUN git clone https://github.com/amd/ZenDNN-pytorch-plugin.git && \
cd ZenDNN-pytorch-plugin && \
uv pip install -r requirements.txt && \
CC=gcc CXX=g++ python3 setup.py bdist_wheel && \
uv pip install dist/*.whl
ENTRYPOINT ["vllm", "serve"]
EOF
# Build the base CPU image first; the derived build references vllm-cpu:latest.
docker build -f docker/Dockerfile.cpu \
--build-arg VLLM_CPU_AVX512BF16=1 \
--build-arg VLLM_CPU_AVX512VNNI=1 \
--build-arg VLLM_CPU_DISABLE_AVX512=0 \
--tag vllm-cpu \
--target vllm-openai \
.
# Derived image layering the ZenDNN (zentorch) plugin on top of vllm-cpu.
# FIX: dropped `--target vllm-openai` — Dockerfile.cpu-amd is a single
# unnamed stage, and targeting a non-existent stage makes the build fail.
# NOTE(review): the --build-arg flags are unused by this Dockerfile and only
# produce warnings; kept for symmetry with the base build.
docker build -f docker/Dockerfile.cpu-amd \
--build-arg VLLM_CPU_AVX512BF16=1 \
--build-arg VLLM_CPU_AVX512VNNI=1 \
--build-arg VLLM_CPU_DISABLE_AVX512=0 \
--tag vllm-cpu-zentorch \
.
popd
# Client-side virtualenv used only to run the vLLM benchmark CLI below.
apt install python3.12-venv
python3 -m venv ~/.venvs/vllm
~/.venvs/vllm/bin/pip install vllm ijson
# set VLLM_CPU_OMP_THREADS_BIND based on your CPU cores
# Start (or restart) the vLLM server container for a given model.
#   $1 = model id (e.g. Qwen/Qwen3-8B)
#   $2 = optional Hugging Face token (defaults to the literal string "None")
function runVLLmServer() {
# Tear down any previous instance. NOTE(review): `a || b && c` parses as
# ((a || b) && c), so the sleep runs whenever the kill/rm or the fallback
# echo succeeds — harmless here, but easy to misread.
docker kill vllm-server || echo "no existing container" && sleep 2
docker rm vllm-server || echo "no existing container to remove" && sleep 5
local MODEL="$1"
# OpenMP bind range: core 0 through (CPU count - 2), leaving one core free
# for the reserved CPU (see VLLM_CPU_NUM_OF_RESERVED_CPU below).
local CORES="0-$(($(lscpu | awk '/^CPU\(s\)/ {print $2}') - 2))"
# Shared-memory size: total RAM minus 1 GiB, in MiB.
local SHM_SIZE="$(($(free -m | awk '/Mem/ {print $2}') - 1024))"
local HF_TOKEN=${2:-"None"}
# Run detached on the host network; the HF cache is persisted on the host at
# /var/lib/huggingface. Everything after the image name is passed to
# `vllm serve` (the image ENTRYPOINT).
docker run --net=host \
--ipc=host \
--shm-size=${SHM_SIZE}m \
--privileged=true \
--detach \
--volume /var/lib/huggingface:/root/.cache/huggingface \
--env HUGGING_FACE_HUB_TOKEN="${HF_TOKEN}" \
--env VLLM_PLUGINS="zentorch" \
--env VLLM_CPU_KVCACHE_SPACE=50 \
--env VLLM_CPU_OMP_THREADS_BIND=${CORES} \
--env VLLM_CPU_NUM_OF_RESERVED_CPU=1 \
--name vllm-server \
--rm \
vllm-cpu-zentorch:latest --dtype=bfloat16 \
--max-num-seqs=5 \
--model=${MODEL} \
&& echo "Monitor logging with \`docker logs -f vllm-server\`"
}
# Run a short serving benchmark against the local vLLM server.
#   $1 = model id (used for both --model and --tokenizer)
#   $2 = optional Hugging Face token (defaults to the literal string "None")
function testVLLm() {
    local model_name="$1"
    local hf_token="${2:-None}"
    HUGGING_FACE_HUB_TOKEN="${hf_token}" ~/.venvs/vllm/bin/python3 \
        -m vllm.entrypoints.cli.main bench serve --backend vllm \
        --base-url http://localhost:8000 \
        --model ${model_name} \
        --tokenizer ${model_name} \
        --random-input-len 128 \
        --random-output-len 128 \
        --num-prompts 20 \
        --max-concurrency 4 \
        --temperature 0.7
}
# validate functionality
# Query the OpenAI-compatible models endpoint; expects the vllm-server
# container to be up and listening on host port 8000 (--net=host).
curl http://localhost:8000/v1/models | jq
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment