cloudnull · December 20, 2025 17:17
diff --git a/Dockerfile.cpu-amd b/Dockerfile.cpu-amd
 # This vLLM Dockerfile is used to build images that can run vLLM on both x86_64 and arm64 CPU platforms.
 #
 # Supported platforms:
 #   - linux/amd64 (x86_64)
 #   - linux/arm64 (aarch64)
 #
 # Use the `--platform` option with `docker buildx build` to specify the target architecture, e.g.:
 #   docker buildx build --platform=linux/arm64 -f Dockerfile.cpu-amd .
 #
 # Build targets:
 #   vllm-openai (default): used for serving deployment
 #   vllm-test: used for CI tests
 #   vllm-dev: used for development
 #
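 # Build arguments:
 #   PYTHON_VERSION=3.13|3.12 (default)|3.11|3.10
 #   PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu (default)
 #   max_jobs=32 (default)
 #   GIT_REPO_CHECK=0 (default); any other value runs tools/check_repo.sh during the build
 #   VLLM_CPU_DISABLE_AVX512=0 (default)|false|true
 #   VLLM_CPU_AVX512BF16=0 (default)|false|true
 #   VLLM_CPU_AVX512VNNI=0 (default)|false|true
 #   VLLM_CPU_AMXBF16=0 (default)|false|true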
 #

 ######################### COMMON BASE IMAGE #########################
 FROM ubuntu:22.04 AS base-common

 WORKDIR /workspace/

 # PYTHON_VERSION selects the interpreter for the virtual environment created below.
 # PIP_EXTRA_INDEX_URL is re-exported further down for both pip and uv.
 ARG PYTHON_VERSION=3.12
 ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"

 # Install minimal dependencies and uv.
 # The apt cache mounts keep package lists and downloaded archives out of the image layer.
 # The uv installer is saved to a file before it is executed: with `curl ... | sh`
 # the default /bin/sh only reports the exit status of `sh`, so a failed download
 # would not fail this step.
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    apt-get update -y \
    && apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
        gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
    && curl -LsSf https://astral.sh/uv/install.sh -o /tmp/uv-install.sh \
    && sh /tmp/uv-install.sh \
    && rm -f /tmp/uv-install.sh

 # Compile with gcc-12/g++-12 and route C++ compilation through ccache.
 ENV CC=/usr/bin/gcc-12 CXX=/usr/bin/g++-12
 ENV CCACHE_DIR=/root/.cache/ccache
 ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache

 # /root/.local/bin is presumably where the uv installer places the binary (confirm against the installer).
 ENV PATH="/root/.local/bin:$PATH"
 ENV VIRTUAL_ENV="/opt/venv"
 ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python
 # Create the virtual environment and put it first on PATH so later steps use its python/pip.
 RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
 ENV PATH="$VIRTUAL_ENV/bin:$PATH"

 ENV UV_HTTP_TIMEOUT=500

 # Install Python dependencies
 ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
 ENV UV_LINK_MODE="copy"
 # The requirement files are bind-mounted from the build context rather than copied into the image.
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
    uv pip install --upgrade pip && \
    uv pip install -r requirements/cpu.txt

 # Persist the target architecture as an environment variable for derived stages and containers.
 ARG TARGETARCH
 ENV TARGETARCH=${TARGETARCH}

 ######################### x86_64 BASE IMAGE #########################
 FROM base-common AS base-amd64

 # Preload tcmalloc together with libiomp5 from the virtual environment (VIRTUAL_ENV is /opt/venv).
 ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:${VIRTUAL_ENV}/lib/libiomp5.so"

 ######################### arm64 BASE IMAGE #########################
 FROM base-common AS base-arm64

 # Only tcmalloc is preloaded on arm64.
 ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"

 ######################### BASE IMAGE #########################
 # Resolve to base-amd64 or base-arm64 depending on the target architecture of the build.
 FROM base-${TARGETARCH} AS base

 # Append a core-dump size limit of 0 to root's bash startup file.
 RUN printf '%s\n' 'ulimit -c 0' >> "$HOME/.bashrc"

 ######################### BUILD IMAGE #########################
 FROM base AS vllm-build

 # Build-time knobs, all overridable with `--build-arg`:
 #   max_jobs                  exported as MAX_JOBS
 #   GIT_REPO_CHECK            any value other than 0 runs tools/check_repo.sh
 #   VLLM_CPU_DISABLE_AVX512   e.g. --build-arg VLLM_CPU_DISABLE_AVX512="true" for a non-AVX512 build
 #   VLLM_CPU_AVX512BF16       e.g. --build-arg VLLM_CPU_AVX512BF16="true" for the AVX512BF16 ISA
 #   VLLM_CPU_AVX512VNNI       e.g. --build-arg VLLM_CPU_AVX512VNNI="true" for the AVX512VNNI ISA
 #   VLLM_CPU_AMXBF16          e.g. --build-arg VLLM_CPU_AMXBF16="true" for the AMXBF16 ISA
 ARG max_jobs=32
 ARG GIT_REPO_CHECK=0
 ARG VLLM_CPU_DISABLE_AVX512=0
 ARG VLLM_CPU_AVX512BF16=0
 ARG VLLM_CPU_AVX512VNNI=0
 ARG VLLM_CPU_AMXBF16=0

 # Export the knobs as environment variables for the build steps below.
 ENV MAX_JOBS=${max_jobs} \
     VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} \
     VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16} \
     VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI} \
     VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16}

 WORKDIR /workspace/vllm

 # Build requirements: requirements/cpu-build.txt from the context is mounted as requirements/build.txt.
 RUN --mount=type=bind,src=requirements/cpu-build.txt,target=requirements/build.txt \
    --mount=type=cache,target=/root/.cache/uv \
    uv pip install -r requirements/build.txt

 COPY . .
 # Optional repository check; skipped while GIT_REPO_CHECK keeps its default of 0.
 RUN --mount=type=bind,source=.git,target=.git \
    [ "$GIT_REPO_CHECK" = 0 ] || bash tools/check_repo.sh

 # Build the CPU wheel into dist/; the uv, ccache and .deps directories are cache mounts.
 RUN --mount=type=bind,source=.git,target=.git \
    --mount=type=cache,target=/root/.cache/uv \
    --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38

 ######################### TEST DEPS #########################
 FROM base AS vllm-test-deps

 WORKDIR /workspace/vllm

 # Derive the CPU test requirements from requirements/test.in:
 #   - drop mamba_ssm on every platform, and decord/terratorch when `uname -m` reports aarch64/arm64
 #   - pin torch to 2.9.1 and strip the version constraints from torchaudio/torchvision
 #   - compile the result into requirements/cpu-test.txt
 # NOTE(review): the previous TODO here spoke of updating to torch 2.9.0 once
 # intel_extension_for_pytorch has a matching build, yet the pin is already 2.9.1 —
 # confirm whether any follow-up is still outstanding.
 # Every step, including the helper definition, is joined with `&&` so that a failure
 # anywhere in the chain fails the build instead of being masked by a `;`.
 # The uv cache is a cache mount so `uv pip compile` does not write it into the layer.
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
    cp requirements/test.in requirements/cpu-test.in && \
    sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
    remove_packages_not_supported_on_aarch64() { \
      case "$(uname -m)" in \
        aarch64|arm64) \
          sed -i '/decord/d' requirements/cpu-test.in && \
          sed -i '/terratorch/d' requirements/cpu-test.in \
          ;; \
      esac; \
    } && \
    remove_packages_not_supported_on_aarch64 && \
    sed -i 's/^torch==.*/torch==2.9.1/g' requirements/cpu-test.in && \
    sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
    sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
    uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu

 # Install the compiled CPU test requirements.
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install -r requirements/cpu-test.txt

 ######################### DEV IMAGE #########################
 FROM vllm-build AS vllm-dev

 WORKDIR /workspace/vllm

 # Extra tools for interactive development.
 # `apt-get update` runs in the same step as the install so it does not depend on
 # package lists that happen to be present (or stale) in the shared apt cache mount.
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    apt-get update -y \
    && apt-get install -y --no-install-recommends vim numactl xz-utils

 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install -e tests/vllm_test_utils

 # Install vLLM from the source tree in development mode for the CPU target.
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py develop

 # Overwrite requirements/test.txt with the CPU-specific file compiled in the vllm-test-deps stage.
 # Presumably requirements/dev.txt pulls in requirements/test.txt — verify.
 COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt

 # Install the dev requirements and register the pre-commit and commit-msg git hooks.
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install -r requirements/dev.txt && \
    pre-commit install --hook-type pre-commit --hook-type commit-msg

 ENTRYPOINT ["bash"]

 ######################### TEST IMAGE #########################
 FROM vllm-test-deps AS vllm-test

 WORKDIR /workspace/

 # Install the wheel built in the vllm-build stage; dist/ is bind-mounted, not copied into the image.
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
    uv pip install dist/*.whl

 # Plain local files and directories: COPY is used instead of ADD because none of
 # ADD's extra behaviors (archive extraction, remote URLs) are needed here.
 COPY ./tests/ ./tests/
 COPY ./examples/ ./examples/
 COPY ./benchmarks/ ./benchmarks/
 COPY ./vllm/collect_env.py .
 COPY ./.buildkite/ ./.buildkite/

 # Create symlink for vllm-workspace to maintain CI compatibility
 RUN ln -sf /workspace /vllm-workspace

 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install -e tests/vllm_test_utils

 ######################### RELEASE IMAGE #########################
 FROM base AS vllm-openai

 WORKDIR /workspace/

 # Install the vLLM wheel built in the vllm-build stage; dist/ is bind-mounted, not copied into the image.
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
    uv pip install dist/*.whl

 # Toolchain and libraries for the ZenDNN plugin build below.
 # The apt cache mounts (same ones the base stage uses) keep package lists and
 # downloaded archives out of the release image layer.
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    apt-get update -y \
    && apt-get install -y --no-install-recommends make cmake ccache git curl wget ca-certificates \
                                                  gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg \
                                                  libsm6 libxext6 libgl1 jq lsof libjemalloc2 gfortran \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

 # Build the ZenDNN PyTorch plugin from source and install the resulting wheel.
 # The checkout (with its build tree) is deleted in the same layer once the wheel is
 # installed, and the uv/ccache directories are cache mounts, so none of that ends up
 # in the release image.
 # NOTE(review): the clone tracks the repository's default branch; consider pinning a tag or commit
 # for reproducible builds.
 # NOTE(review): this stage derives from `base`, which resolves to the arm64 variant on arm64 builds —
 # confirm the plugin build is intended to run there.
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=cache,target=/root/.cache/ccache \
    git clone https://github.com/amd/ZenDNN-pytorch-plugin.git && \
    cd ZenDNN-pytorch-plugin && \
    uv pip install -r requirements.txt && \
    CC=gcc CXX=g++ python3 setup.py bdist_wheel && \
    uv pip install dist/*.whl && \
    cd .. && \
    rm -rf ZenDNN-pytorch-plugin

 ENTRYPOINT ["vllm", "serve"]
diff --git a/results.optimized.txt b/results.optimized.txt
 Results Qwen/Qwen3-8B 

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  64.40     
 Total input tokens:                      2560      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.31      
 Output token throughput (tok/s):         39.75     
 Peak output token throughput (tok/s):    52.00     
 Peak concurrent requests:                8.00      
 Total token throughput (tok/s):          79.50     
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          1616.70   
 Median TTFT (ms):                        1724.28   
 P99 TTFT (ms):                           2761.99   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          88.65     
 Median TPOT (ms):                        86.42     
 P99 TPOT (ms):                           102.80    
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           88.65     
 Median ITL (ms):                         87.08     
 P99 ITL (ms):                            101.28    
 ==================================================

 Results Qwen/Qwen3-4B

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  42.67     
 Total input tokens:                      2560      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.47      
 Output token throughput (tok/s):         60.00     
 Peak output token throughput (tok/s):    76.00     
 Peak concurrent requests:                8.00      
 Total token throughput (tok/s):          120.00    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          1037.59   
 Median TTFT (ms):                        1024.18   
 P99 TTFT (ms):                           2069.61   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          59.00     
 Median TPOT (ms):                        57.91     
 P99 TPOT (ms):                           70.71     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           59.00     
 Median ITL (ms):                         57.16     
 P99 ITL (ms):                            66.71     
 ==================================================

 Results Qwen/Qwen3-1.7B

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  26.79     
 Total input tokens:                      2560      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.75      
 Output token throughput (tok/s):         95.55     
 Peak output token throughput (tok/s):    112.00    
 Peak concurrent requests:                8.00      
 Total token throughput (tok/s):          191.10    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          560.05    
 Median TTFT (ms):                        482.31    
 P99 TTFT (ms):                           1492.30   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          37.77     
 Median TPOT (ms):                        36.75     
 P99 TPOT (ms):                           45.51     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           37.77     
 Median ITL (ms):                         36.54     
 P99 ITL (ms):                            45.62     
 ==================================================

 Results Qwen/Qwen3-0.6B

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  20.51     
 Total input tokens:                      2560      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.98      
 Output token throughput (tok/s):         124.81    
 Peak output token throughput (tok/s):    148.00    
 Peak concurrent requests:                8.00      
 Total token throughput (tok/s):          249.62    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          495.14    
 Median TTFT (ms):                        403.64    
 P99 TTFT (ms):                           1452.09   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          28.39     
 Median TPOT (ms):                        27.57     
 P99 TPOT (ms):                           36.38     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           28.39     
 Median ITL (ms):                         27.24     
 P99 ITL (ms):                            31.03     
 ==================================================

 Results meta-llama/Llama-3.2-1B

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  20.65     
 Total input tokens:                      2560      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.97      
 Output token throughput (tok/s):         123.95    
 Peak output token throughput (tok/s):    149.00    
 Peak concurrent requests:                8.00      
 Total token throughput (tok/s):          247.91    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          480.86    
 Median TTFT (ms):                        389.88    
 P99 TTFT (ms):                           1313.63   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          28.72     
 Median TPOT (ms):                        27.99     
 P99 TPOT (ms):                           35.85     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           28.72     
 Median ITL (ms):                         27.64     
 P99 ITL (ms):                            30.66     
 ==================================================

 Results meta-llama/Llama-3.2-3B

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  35.62     
 Total input tokens:                      2560      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.56      
 Output token throughput (tok/s):         71.87     
 Peak output token throughput (tok/s):    92.00     
 Peak concurrent requests:                8.00      
 Total token throughput (tok/s):          143.74    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          979.22    
 Median TTFT (ms):                        934.78    
 P99 TTFT (ms):                           1795.37   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          48.35     
 Median TPOT (ms):                        47.79     
 P99 TPOT (ms):                           53.81     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           48.35     
 Median ITL (ms):                         46.53     
 P99 ITL (ms):                            52.83     
 ==================================================

 Results google/gemma-3-1b-it

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  26.30     
 Total input tokens:                      2560      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.76      
 Output token throughput (tok/s):         97.35     
 Peak output token throughput (tok/s):    112.00    
 Peak concurrent requests:                8.00      
 Total token throughput (tok/s):          194.70    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          483.12    
 Median TTFT (ms):                        374.32    
 P99 TTFT (ms):                           1394.82   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          37.59     
 Median TPOT (ms):                        37.05     
 P99 TPOT (ms):                           44.19     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           37.59     
 Median ITL (ms):                         36.55     
 P99 ITL (ms):                            40.29     
 ==================================================

 Results google/gemma-3-4b-it

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  47.56     
 Total input tokens:                      2560      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.42      
 Output token throughput (tok/s):         53.83     
 Peak output token throughput (tok/s):    68.00     
 Peak concurrent requests:                8.00      
 Total token throughput (tok/s):          107.65    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          1194.13   
 Median TTFT (ms):                        1178.90   
 P99 TTFT (ms):                           1899.93   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          65.44     
 Median TPOT (ms):                        64.73     
 P99 TPOT (ms):                           69.39     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           65.44     
 Median ITL (ms):                         63.80     
 P99 ITL (ms):                            73.35     
 ==================================================

 Results google/gemma-3-12b-it

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  100.96    
 Total input tokens:                      2560      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.20      
 Output token throughput (tok/s):         25.36     
 Peak output token throughput (tok/s):    32.00     
 Peak concurrent requests:                8.00      
 Total token throughput (tok/s):          50.71     
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          3094.45   
 Median TTFT (ms):                        3508.26   
 P99 TTFT (ms):                           4408.72   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          134.57    
 Median TPOT (ms):                        129.60    
 P99 TPOT (ms):                           156.76    
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           134.57    
 Median ITL (ms):                         126.41    
 P99 ITL (ms):                            145.44    
 ==================================================

 Results microsoft/phi-4

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  107.44    
 Total input tokens:                      2560      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.19      
 Output token throughput (tok/s):         23.83     
 Peak output token throughput (tok/s):    36.00     
 Peak concurrent requests:                8.00      
 Total token throughput (tok/s):          47.65     
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          4263.18   
 Median TTFT (ms):                        5383.14   
 P99 TTFT (ms):                           6157.70   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          135.58    
 Median TPOT (ms):                        129.80    
 P99 TPOT (ms):                           167.44    
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           135.58    
 Median ITL (ms):                         124.98    
 P99 ITL (ms):                            150.06    
 ==================================================

 Results microsoft/Phi-4-mini-instruct

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  38.03     
 Total input tokens:                      2560      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.53      
 Output token throughput (tok/s):         67.32     
 Peak output token throughput (tok/s):    92.00     
 Peak concurrent requests:                8.00      
 Total token throughput (tok/s):          134.64    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          1380.97   
 Median TTFT (ms):                        1519.74   
 P99 TTFT (ms):                           2179.02   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          48.99     
 Median TPOT (ms):                        46.95     
 P99 TPOT (ms):                           60.58     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           48.99     
 Median ITL (ms):                         45.89     
 P99 ITL (ms):                            51.02     
 ==================================================
diff --git a/results.unoptimized.txt b/results.unoptimized.txt
 Results Qwen/Qwen3-8B 

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  123.99    
 Total input tokens:                      2560      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.16      
 Output token throughput (tok/s):         20.65     
 Peak output token throughput (tok/s):    24.00     
 Peak concurrent requests:                8.00      
 Total Token throughput (tok/s):          41.29     
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          2473.31   
 Median TTFT (ms):                        2156.38   
 P99 TTFT (ms):                           3685.90   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          175.59    
 Median TPOT (ms):                        176.40    
 P99 TPOT (ms):                           178.31    
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           175.59    
 Median ITL (ms):                         171.24    
 P99 ITL (ms):                            220.61    
 ==================================================

 Results Qwen/Qwen3-4B

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  71.57     
 Total input tokens:                      2560      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.28      
 Output token throughput (tok/s):         35.77     
 Peak output token throughput (tok/s):    44.00     
 Peak concurrent requests:                8.00      
 Total Token throughput (tok/s):          71.54     
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          1351.49   
 Median TTFT (ms):                        1366.27   
 P99 TTFT (ms):                           2523.72   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          102.02    
 Median TPOT (ms):                        99.59     
 P99 TPOT (ms):                           116.01    
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           102.02    
 Median ITL (ms):                         98.58     
 P99 ITL (ms):                            145.66    
 ==================================================

 Results Qwen/Qwen3-1.7B

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  37.10     
 Total input tokens:                      2560      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.54      
 Output token throughput (tok/s):         69.00     
 Peak output token throughput (tok/s):    80.00     
 Peak concurrent requests:                8.00      
 Total Token throughput (tok/s):          137.99    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          630.01    
 Median TTFT (ms):                        542.26    
 P99 TTFT (ms):                           1645.25   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          53.45     
 Median TPOT (ms):                        52.55     
 P99 TPOT (ms):                           62.05     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           53.45     
 Median ITL (ms):                         52.17     
 P99 ITL (ms):                            57.15     
 ==================================================

 Results Qwen/Qwen3-0.6B

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  21.13     
 Total input tokens:                      2560      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.95      
 Output token throughput (tok/s):         121.17    
 Peak output token throughput (tok/s):    140.00    
 Peak concurrent requests:                8.00      
 Total Token throughput (tok/s):          242.33    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          436.76    
 Median TTFT (ms):                        246.71    
 P99 TTFT (ms):                           1347.68   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          29.80     
 Median TPOT (ms):                        29.74     
 P99 TPOT (ms):                           30.15     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           29.80     
 Median ITL (ms):                         29.24     
 P99 ITL (ms):                            34.54     
 ==================================================

 Results meta-llama/Llama-3.2-1B

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  27.27     
 Total input tokens:                      2540      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.73      
 Output token throughput (tok/s):         93.89     
 Peak output token throughput (tok/s):    108.00    
 Peak concurrent requests:                8.00      
 Total Token throughput (tok/s):          187.04    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          491.38    
 Median TTFT (ms):                        385.28    
 P99 TTFT (ms):                           1484.12   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          39.06     
 Median TPOT (ms):                        38.46     
 P99 TPOT (ms):                           47.34     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           39.06     
 Median ITL (ms):                         37.85     
 P99 ITL (ms):                            48.89     
 ==================================================

 Results meta-llama/Llama-3.2-3B

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  58.71     
 Total input tokens:                      2540      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.34      
 Output token throughput (tok/s):         43.61     
 Peak output token throughput (tok/s):    52.00     
 Peak concurrent requests:                8.00      
 Total Token throughput (tok/s):          86.87     
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          1137.78   
 Median TTFT (ms):                        934.36    
 P99 TTFT (ms):                           2103.96   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          83.40     
 Median TPOT (ms):                        83.52     
 P99 TPOT (ms):                           84.41     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           83.40     
 Median ITL (ms):                         81.68     
 P99 ITL (ms):                            102.00    
 ==================================================

 Results google/gemma-3-1b-it

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  30.54     
 Total input tokens:                      2540      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.65      
 Output token throughput (tok/s):         83.81     
 Peak output token throughput (tok/s):    96.00     
 Peak concurrent requests:                8.00      
 Total Token throughput (tok/s):          166.97    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          452.64    
 Median TTFT (ms):                        336.75    
 P99 TTFT (ms):                           1422.21   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          44.52     
 Median TPOT (ms):                        43.66     
 P99 TPOT (ms):                           53.19     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           44.52     
 Median ITL (ms):                         43.00     
 P99 ITL (ms):                            55.78     
 ==================================================

 Results google/gemma-3-4b-it

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  70.37     
 Total input tokens:                      2540      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.28      
 Output token throughput (tok/s):         36.38     
 Peak output token throughput (tok/s):    40.00     
 Peak concurrent requests:                8.00      
 Total Token throughput (tok/s):          72.47     
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          1044.36   
 Median TTFT (ms):                        1050.06   
 P99 TTFT (ms):                           2162.43   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          102.52    
 Median TPOT (ms):                        102.40    
 P99 TPOT (ms):                           104.82    
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           102.52    
 Median ITL (ms):                         100.12    
 P99 ITL (ms):                            122.59    
 ==================================================

 Results google/gemma-3-12b-it

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  183.76    
 Total input tokens:                      2540      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.11      
 Output token throughput (tok/s):         13.93     
 Peak output token throughput (tok/s):    20.00     
 Peak concurrent requests:                8.00      
 Total Token throughput (tok/s):          27.75     
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          3657.92   
 Median TTFT (ms):                        3872.83   
 P99 TTFT (ms):                           5594.60   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          260.39    
 Median TPOT (ms):                        260.42    
 P99 TPOT (ms):                           268.68    
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           260.39    
 Median ITL (ms):                         251.44    
 P99 ITL (ms):                            278.57    
 ==================================================

 Results microsoft/phi-4

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  187.20    
 Total input tokens:                      2560      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.11      
 Output token throughput (tok/s):         13.68     
 Peak output token throughput (tok/s):    16.00     
 Peak concurrent requests:                8.00      
 Total Token throughput (tok/s):          27.35     
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          3416.20   
 Median TTFT (ms):                        3838.29   
 P99 TTFT (ms):                           4923.56   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          267.80    
 Median TPOT (ms):                        263.83    
 P99 TPOT (ms):                           287.36    
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           267.80    
 Median ITL (ms):                         260.23    
 P99 ITL (ms):                            344.91    
 ==================================================

 Results microsoft/Phi-4-mini-instruct

 ============ Serving Benchmark Result ============
 Successful requests:                     20        
 Failed requests:                         0         
 Maximum request concurrency:             4         
 Benchmark duration (s):                  59.19     
 Total input tokens:                      2560      
 Total generated tokens:                  2560      
 Request throughput (req/s):              0.34      
 Output token throughput (tok/s):         43.25     
 Peak output token throughput (tok/s):    52.00     
 Peak concurrent requests:                8.00      
 Total Token throughput (tok/s):          86.50     
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          1016.35   
 Median TTFT (ms):                        999.32    
 P99 TTFT (ms):                           2139.12   
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          85.18     
 Median TPOT (ms):                        83.85     
 P99 TPOT (ms):                           94.76     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           85.18     
 Median ITL (ms):                         82.53     
 P99 ITL (ms):                            108.94    
 ==================================================
diff --git a/zendnn-docker-setup-notes.sh b/zendnn-docker-setup-notes.sh
 # Install Docker Engine from Docker's official apt repository (run as root).
 # apt-get with -y is used instead of bare `apt` so no step stops at a prompt.

 # Add Docker's official GPG key
 apt-get update
 apt-get install -y ca-certificates curl
 install -m 0755 -d /etc/apt/keyrings
 curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
 chmod a+r /etc/apt/keyrings/docker.asc

 # Add the repository to Apt sources
 tee /etc/apt/sources.list.d/docker.sources <<EOF
 Types: deb
 URIs: https://download.docker.com/linux/ubuntu
 Suites: $(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}")
 Components: stable
 Signed-By: /etc/apt/keyrings/docker.asc
 EOF

 apt-get update

 apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin

 git clone https://github.com/vllm-project/vllm
 pushd vllm
    # Create docker file and build image.
    # The working directory is already the checkout root, so the overlay
    # Dockerfile is written to docker/ -- the same path the second build reads
    # via -f. Its single stage is named vllm-openai so that
    # `--target vllm-openai` below resolves to a real stage.
    cat > docker/Dockerfile.cpu-amd << 'EOF'
 FROM vllm-cpu:latest AS vllm-openai
 RUN apt-get update -y \
    && apt-get install -y --no-install-recommends make cmake ccache git curl wget ca-certificates \
                                                  gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg \
                                                  libsm6 libxext6 libgl1 jq lsof libjemalloc2 gfortran \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

 RUN git clone https://github.com/amd/ZenDNN-pytorch-plugin.git && \
    cd ZenDNN-pytorch-plugin && \
    uv pip install -r requirements.txt && \
    CC=gcc CXX=g++ python3 setup.py bdist_wheel && \
    uv pip install dist/*.whl

 ENTRYPOINT ["vllm", "serve"]
 EOF
    # Base CPU image, tagged vllm-cpu; the overlay's FROM line depends on it.
    docker build -f docker/Dockerfile.cpu \
                --build-arg VLLM_CPU_AVX512BF16=1 \
                --build-arg VLLM_CPU_AVX512VNNI=1 \
                --build-arg VLLM_CPU_DISABLE_AVX512=0 \
                --tag vllm-cpu \
                --target vllm-openai \
                .
    # ZenTorch overlay on top of vllm-cpu:latest. No --build-arg here: the
    # overlay Dockerfile declares no ARGs, so they would go unconsumed.
    docker build -f docker/Dockerfile.cpu-amd \
                --tag vllm-cpu-zentorch \
                --target vllm-openai \
                .
 popd

 # Host-side virtualenv; testVLLm runs the benchmark client from it.
 # python3.12 is invoked explicitly so the interpreter matches the
 # python3.12-venv package installed on the previous line.
 apt-get install -y python3.12-venv
 python3.12 -m venv ~/.venvs/vllm
 ~/.venvs/vllm/bin/pip install vllm ijson


 # set VLLM_CPU_OMP_THREADS_BIND based on your CPU cores

 # Start (or restart) the vllm-cpu-zentorch image as a detached container
 # named vllm-server.
 #   $1  model id, forwarded as --model to the image entrypoint (required)
 #   $2  optional Hugging Face token (defaults to the literal string "None")
 # Thread binding is derived from the host: all CPUs except the last one.
 function runVLLmServer() {
    # Refuse to tear down a running server when no model was given.
    if [ -z "${1:-}" ]; then
        echo "usage: runVLLmServer MODEL [HF_TOKEN]" >&2
        return 1
    fi

    # Best-effort cleanup of a previous run; the pauses give Docker time to settle.
    docker kill vllm-server || echo "no existing container"
    sleep 2
    docker rm vllm-server || echo "no existing container to remove"
    sleep 5

    local MODEL="$1"
    local HF_TOKEN="${2:-None}"

    # Anchor on "CPU(s):" so other lscpu rows that start with "CPU(s)"
    # (e.g. "CPU(s) scaling MHz:") cannot leak into the arithmetic below.
    local CPU_COUNT
    CPU_COUNT="$(lscpu | awk '/^CPU\(s\):/ {print $2}')"
    local CORES="0-$((CPU_COUNT - 2))"

    # Total RAM in MiB minus 1 GiB of headroom.
    local SHM_SIZE="$(($(free -m | awk '/Mem/ {print $2}') - 1024))"

    docker run --net=host \
            --ipc=host \
            --shm-size="${SHM_SIZE}m" \
            --privileged=true \
            --detach \
            --volume /var/lib/huggingface:/root/.cache/huggingface \
            --env HUGGING_FACE_HUB_TOKEN="${HF_TOKEN}" \
            --env VLLM_PLUGINS="zentorch" \
            --env VLLM_CPU_KVCACHE_SPACE=50 \
            --env VLLM_CPU_OMP_THREADS_BIND="${CORES}" \
            --env VLLM_CPU_NUM_OF_RESERVED_CPU=1 \
            --name vllm-server \
            --rm \
            vllm-cpu-zentorch:latest --dtype=bfloat16 \
                                     --max-num-seqs=5 \
                                     --model="${MODEL}" \
    && echo "Monitor logging with \`docker logs -f vllm-server\`"
 }

 # Run `vllm bench serve` from the host venv against http://localhost:8000.
 #   $1  model id, used for both --model and --tokenizer
 #   $2  optional Hugging Face token (defaults to the literal string "None")
 function testVLLm() {
    local MODEL="$1"
    # MODEL is quoted so an id with spaces or glob characters stays one argument.
    HUGGING_FACE_HUB_TOKEN="${2:-None}" ~/.venvs/vllm/bin/python3 \
        -m vllm.entrypoints.cli.main bench serve --backend vllm \
                                                 --base-url http://localhost:8000 \
                                                 --model "${MODEL}" \
                                                 --tokenizer "${MODEL}" \
                                                 --random-input-len 128 \
                                                 --random-output-len 128 \
                                                 --num-prompts 20 \
                                                 --max-concurrency 4 \
                                                 --temperature 0.7
 }


 # validate functionality
 # -f: fail on HTTP errors instead of piping an error page into jq;
 # -sS: hide the progress meter but still print real errors.
 curl -fsS http://localhost:8000/v1/models | jq .
	# This vLLM Dockerfile is used to build images that can run vLLM on both x86_64 and arm64 CPU platforms.
	#
	# Supported platforms:
	# - linux/amd64 (x86_64)
	# - linux/arm64 (aarch64)
	#
	# Use the `--platform` option with `docker buildx build` to specify the target architecture, e.g.:
	# docker buildx build --platform=linux/arm64 -f docker/Dockerfile.cpu .
	#
	# Build targets:
	# vllm-openai (default): used for serving deployment
	# vllm-test: used for CI tests
	# vllm-dev: used for development
	#
	# Build arguments:
	# PYTHON_VERSION=3.13|3.12 (default)|3.11|3.10
	# VLLM_CPU_DISABLE_AVX512=false (default)|true
	# VLLM_CPU_AVX512BF16=false (default)|true
	# VLLM_CPU_AVX512VNNI=false (default)|true
	# VLLM_CPU_AMXBF16=false (default)|true
	#

	######################### COMMON BASE IMAGE #########################
	# Shared foundation for every later stage: system toolchain, uv, a Python
	# virtualenv at /opt/venv, and the CPU runtime requirements.
	FROM ubuntu:22.04 AS base-common

	WORKDIR /workspace/

	ARG PYTHON_VERSION=3.12
	ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"

	# Install minimal dependencies and uv.
	# The uv installer script must be piped into sh with a real (unescaped) pipe;
	# an escaped pipe would be handed to curl as a literal argument.
	RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
	    --mount=type=cache,target=/var/lib/apt,sharing=locked \
	    apt-get update -y \
	    && apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
	        gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \
	    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
	    && curl -LsSf https://astral.sh/uv/install.sh | sh

	ENV CC=/usr/bin/gcc-12 CXX=/usr/bin/g++-12
	ENV CCACHE_DIR=/root/.cache/ccache
	ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache

	# /root/.local/bin is put on PATH so the `uv` command below resolves.
	ENV PATH="/root/.local/bin:$PATH"
	ENV VIRTUAL_ENV="/opt/venv"
	ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python
	RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
	ENV PATH="$VIRTUAL_ENV/bin:$PATH"

	ENV UV_HTTP_TIMEOUT=500

	# Install Python dependencies
	ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
	ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
	ENV UV_INDEX_STRATEGY="unsafe-best-match"
	ENV UV_LINK_MODE="copy"
	RUN --mount=type=cache,target=/root/.cache/uv \
	    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
	    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
	    uv pip install --upgrade pip && \
	    uv pip install -r requirements/cpu.txt

	# Persist the build platform's architecture into the image environment.
	ARG TARGETARCH
	ENV TARGETARCH=${TARGETARCH}

	######################### x86_64 BASE IMAGE #########################
	# Architecture-specific layer on top of base-common for amd64 builds.
	FROM base-common AS base-amd64

	# Preload tcmalloc and libiomp5.so (from /opt/venv/lib) into every process.
	ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so"

	######################### arm64 BASE IMAGE #########################
	# Architecture-specific layer on top of base-common for arm64 builds.
	FROM base-common AS base-arm64

	# Preload tcmalloc only; no libiomp5 entry is set for this architecture.
	ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"

	######################### BASE IMAGE #########################
	# Resolves to base-amd64 or base-arm64 depending on the value of TARGETARCH.
	FROM base-${TARGETARCH} AS base

	# Appends `ulimit -c 0` (core dump size limit of zero) to root's ~/.bashrc.
	RUN echo 'ulimit -c 0' >> ~/.bashrc

	######################### BUILD IMAGE #########################
	# Builds the vLLM CPU wheel into /workspace/vllm/dist; later stages bind-mount
	# that directory via from=vllm-build to install the wheel.
	FROM base AS vllm-build

	# Build parallelism, exported to the environment as MAX_JOBS.
	ARG max_jobs=32
	ENV MAX_JOBS=${max_jobs}

	# Any value other than 0 runs tools/check_repo.sh after the source is copied.
	ARG GIT_REPO_CHECK=0
	# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
	ARG VLLM_CPU_DISABLE_AVX512=0
	ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
	# Support for building with AVX512BF16 ISA: docker build --build-arg VLLM_CPU_AVX512BF16="true" ...
	ARG VLLM_CPU_AVX512BF16=0
	ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16}
	# Support for building with AVX512VNNI ISA: docker build --build-arg VLLM_CPU_AVX512VNNI="true" ...
	ARG VLLM_CPU_AVX512VNNI=0
	ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI}
	# Support for building with AMXBF16 ISA: docker build --build-arg VLLM_CPU_AMXBF16="true" ...
	ARG VLLM_CPU_AMXBF16=0
	ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16}

	WORKDIR /workspace/vllm

	# Build-time Python requirements: requirements/cpu-build.txt from the build
	# context is bind-mounted under the name requirements/build.txt.
	RUN --mount=type=cache,target=/root/.cache/uv \
	--mount=type=bind,src=requirements/cpu-build.txt,target=requirements/build.txt \
	uv pip install -r requirements/build.txt

	# Source is copied only after the dependency install so that layer stays cached
	# when only source files change.
	COPY . .
	RUN --mount=type=bind,source=.git,target=.git \
	if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

	# Build the wheel for the CPU target into dist/, with uv, ccache and .deps
	# kept in BuildKit cache mounts and .git bind-mounted from the context.
	RUN --mount=type=cache,target=/root/.cache/uv \
	--mount=type=cache,target=/root/.cache/ccache \
	--mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
	--mount=type=bind,source=.git,target=.git \
	VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38

	######################### TEST DEPS #########################
	# Derives a CPU-specific test requirements file from requirements/test.in,
	# compiles it with uv, and installs the result.
	FROM base AS vllm-test-deps

	WORKDIR /workspace/vllm

	# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
	# The case pattern needs a real `|` alternation: with an escaped pipe the
	# pattern is a single literal string and neither aarch64 nor arm64 matches.
	RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
	    cp requirements/test.in requirements/cpu-test.in && \
	    sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
	    remove_packages_not_supported_on_aarch64() { \
	        case "$(uname -m)" in \
	            aarch64|arm64) \
	                sed -i '/decord/d' requirements/cpu-test.in; \
	                sed -i '/terratorch/d' requirements/cpu-test.in; \
	                ;; \
	        esac; \
	    }; \
	    remove_packages_not_supported_on_aarch64 && \
	    sed -i 's/^torch==.*/torch==2.9.1/g' requirements/cpu-test.in && \
	    sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
	    sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
	    uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu

	RUN --mount=type=cache,target=/root/.cache/uv \
	    uv pip install -r requirements/cpu-test.txt

	######################### DEV IMAGE #########################
	# Interactive development image: build stage plus editor tools, an in-place
	# (develop) install of vLLM, and the dev/test requirements.
	FROM vllm-build AS vllm-dev

	WORKDIR /workspace/vllm

	# Refresh the package index in the same layer as the install so this step
	# does not depend on whatever lists happen to be in the apt cache mount.
	RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
	    --mount=type=cache,target=/var/lib/apt,sharing=locked \
	    apt-get update -y \
	    && apt-get install -y --no-install-recommends vim numactl xz-utils

	# install development dependencies (for testing)
	RUN --mount=type=cache,target=/root/.cache/uv \
	    uv pip install -e tests/vllm_test_utils

	RUN --mount=type=cache,target=/root/.cache/uv \
	    --mount=type=cache,target=/root/.cache/ccache \
	    --mount=type=bind,source=.git,target=.git \
	    VLLM_TARGET_DEVICE=cpu python3 setup.py develop

	# Reuse the test requirements compiled in vllm-test-deps under the name test.txt.
	COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt

	RUN --mount=type=cache,target=/root/.cache/uv \
	    uv pip install -r requirements/dev.txt && \
	    pre-commit install --hook-type pre-commit --hook-type commit-msg

	ENTRYPOINT ["bash"]

	######################### TEST IMAGE #########################
	# CI test image: test dependencies plus the wheel built in vllm-build and the
	# test, example, benchmark and Buildkite directories from the build context.
	FROM vllm-test-deps AS vllm-test

	WORKDIR /workspace/

	RUN --mount=type=cache,target=/root/.cache/uv \
	    --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
	    uv pip install dist/*.whl

	# COPY rather than ADD: these are plain local paths, so none of ADD's extra
	# behaviour (archive extraction, URL fetch) is wanted.
	COPY ./tests/ ./tests/
	COPY ./examples/ ./examples/
	COPY ./benchmarks/ ./benchmarks/
	COPY ./vllm/collect_env.py .
	COPY ./.buildkite/ ./.buildkite/

	# Create symlink for vllm-workspace to maintain CI compatibility
	RUN ln -sf /workspace /vllm-workspace

	# install development dependencies (for testing)
	RUN --mount=type=cache,target=/root/.cache/uv \
	    uv pip install -e tests/vllm_test_utils

	######################### RELEASE IMAGE #########################
	# Serving image: installs the vLLM wheel from vllm-build, then builds and
	# installs the AMD ZenDNN PyTorch plugin from source.
	FROM base AS vllm-openai

	WORKDIR /workspace/

	RUN --mount=type=cache,target=/root/.cache/uv \
	    --mount=type=cache,target=/root/.cache/ccache \
	    --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
	    uv pip install dist/*.whl

	# Toolchain for compiling the plugin. The apt package lists are removed in the
	# same layer so they do not end up in the release image.
	RUN apt-get update -y \
	    && apt-get install -y --no-install-recommends make cmake ccache git curl wget ca-certificates \
	        gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg \
	        libsm6 libxext6 libgl1 jq lsof libjemalloc2 gfortran \
	    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
	    && rm -rf /var/lib/apt/lists/*

	# Build the plugin wheel, install it, then delete the checkout in the same
	# layer; only the installed package is needed at runtime.
	RUN git clone https://github.com/amd/ZenDNN-pytorch-plugin.git && \
	    cd ZenDNN-pytorch-plugin && \
	    uv pip install -r requirements.txt && \
	    CC=gcc CXX=g++ python3 setup.py bdist_wheel && \
	    uv pip install dist/*.whl && \
	    cd .. && \
	    rm -rf ZenDNN-pytorch-plugin

	ENTRYPOINT ["vllm", "serve"]
	Results Qwen/Qwen3-8B

	============ Serving Benchmark Result ============
	Successful requests: 20
	Failed requests: 0
	Maximum request concurrency: 4
	Benchmark duration (s): 64.40
	Total input tokens: 2560
	Total generated tokens: 2560
	Request throughput (req/s): 0.31
	Output token throughput (tok/s): 39.75
	Peak output token throughput (tok/s): 52.00
	Peak concurrent requests: 8.00
	Total token throughput (tok/s): 79.50
	---------------Time to First Token----------------
	Mean TTFT (ms): 1616.70
	Median TTFT (ms): 1724.28
	P99 TTFT (ms): 2761.99
	-----Time per Output Token (excl. 1st token)------
	Mean TPOT (ms): 88.65
	Median TPOT (ms): 86.42
	P99 TPOT (ms): 102.80
	---------------Inter-token Latency----------------
	Mean ITL (ms): 88.65
	Median ITL (ms): 87.08
	P99 ITL (ms): 101.28
	==================================================

	Results Qwen/Qwen3-4B

	============ Serving Benchmark Result ============
	Successful requests: 20
	Failed requests: 0
	Maximum request concurrency: 4
	Benchmark duration (s): 42.67
	Total input tokens: 2560
	Total generated tokens: 2560
	Request throughput (req/s): 0.47
	Output token throughput (tok/s): 60.00
	Peak output token throughput (tok/s): 76.00
	Peak concurrent requests: 8.00
	Total token throughput (tok/s): 120.00
	---------------Time to First Token----------------
	Mean TTFT (ms): 1037.59
	Median TTFT (ms): 1024.18
	P99 TTFT (ms): 2069.61
	-----Time per Output Token (excl. 1st token)------
	Mean TPOT (ms): 59.00
	Median TPOT (ms): 57.91
	P99 TPOT (ms): 70.71
	---------------Inter-token Latency----------------
	Mean ITL (ms): 59.00
	Median ITL (ms): 57.16
	P99 ITL (ms): 66.71
	==================================================

	Results Qwen/Qwen3-1.7B

	============ Serving Benchmark Result ============
	Successful requests: 20
	Failed requests: 0
	Maximum request concurrency: 4
	Benchmark duration (s): 26.79
	Total input tokens: 2560
	Total generated tokens: 2560
	Request throughput (req/s): 0.75
	Output token throughput (tok/s): 95.55
	Peak output token throughput (tok/s): 112.00
	Peak concurrent requests: 8.00
	Total token throughput (tok/s): 191.10
	---------------Time to First Token----------------
	Mean TTFT (ms): 560.05
	Median TTFT (ms): 482.31
	P99 TTFT (ms): 1492.30
	-----Time per Output Token (excl. 1st token)------
	Mean TPOT (ms): 37.77
	Median TPOT (ms): 36.75
	P99 TPOT (ms): 45.51
	---------------Inter-token Latency----------------
	Mean ITL (ms): 37.77
	Median ITL (ms): 36.54
	P99 ITL (ms): 45.62
	==================================================

	Results Qwen/Qwen3-0.6B

	============ Serving Benchmark Result ============
	Successful requests: 20
	Failed requests: 0
	Maximum request concurrency: 4
	Benchmark duration (s): 20.51
	Total input tokens: 2560
	Total generated tokens: 2560
	Request throughput (req/s): 0.98
	Output token throughput (tok/s): 124.81
	Peak output token throughput (tok/s): 148.00
	Peak concurrent requests: 8.00
	Total token throughput (tok/s): 249.62
	---------------Time to First Token----------------
	Mean TTFT (ms): 495.14
	Median TTFT (ms): 403.64
	P99 TTFT (ms): 1452.09
	-----Time per Output Token (excl. 1st token)------
	Mean TPOT (ms): 28.39
	Median TPOT (ms): 27.57
	P99 TPOT (ms): 36.38
	---------------Inter-token Latency----------------
	Mean ITL (ms): 28.39
	Median ITL (ms): 27.24
	P99 ITL (ms): 31.03
	==================================================

	Results meta-llama/Llama-3.2-1B

	============ Serving Benchmark Result ============
	Successful requests: 20
	Failed requests: 0
	Maximum request concurrency: 4
	Benchmark duration (s): 20.65
	Total input tokens: 2560
	Total generated tokens: 2560
	Request throughput (req/s): 0.97
	Output token throughput (tok/s): 123.95
	Peak output token throughput (tok/s): 149.00
	Peak concurrent requests: 8.00
	Total token throughput (tok/s): 247.91
	---------------Time to First Token----------------
	Mean TTFT (ms): 480.86
	Median TTFT (ms): 389.88
	P99 TTFT (ms): 1313.63
	-----Time per Output Token (excl. 1st token)------
	Mean TPOT (ms): 28.72
	Median TPOT (ms): 27.99
	P99 TPOT (ms): 35.85
	---------------Inter-token Latency----------------
	Mean ITL (ms): 28.72
	Median ITL (ms): 27.64
	P99 ITL (ms): 30.66
	==================================================

	Results meta-llama/Llama-3.2-3B

	============ Serving Benchmark Result ============
	Successful requests: 20
	Failed requests: 0
	Maximum request concurrency: 4
	Benchmark duration (s): 35.62
	Total input tokens: 2560
	Total generated tokens: 2560
	Request throughput (req/s): 0.56
	Output token throughput (tok/s): 71.87
	Peak output token throughput (tok/s): 92.00
	Peak concurrent requests: 8.00
	Total token throughput (tok/s): 143.74
	---------------Time to First Token----------------
	Mean TTFT (ms): 979.22
	Median TTFT (ms): 934.78
	P99 TTFT (ms): 1795.37
	-----Time per Output Token (excl. 1st token)------
	Mean TPOT (ms): 48.35
	Median TPOT (ms): 47.79
	P99 TPOT (ms): 53.81
	---------------Inter-token Latency----------------
	Mean ITL (ms): 48.35
	Median ITL (ms): 46.53
	P99 ITL (ms): 52.83
	==================================================

	Results google/gemma-3-1b-it

	============ Serving Benchmark Result ============
	Successful requests: 20
	Failed requests: 0
	Maximum request concurrency: 4
	Benchmark duration (s): 26.30
	Total input tokens: 2560
	Total generated tokens: 2560
	Request throughput (req/s): 0.76
	Output token throughput (tok/s): 97.35
	Peak output token throughput (tok/s): 112.00
	Peak concurrent requests: 8.00
	Total token throughput (tok/s): 194.70
	---------------Time to First Token----------------
	Mean TTFT (ms): 483.12
	Median TTFT (ms): 374.32
	P99 TTFT (ms): 1394.82
	-----Time per Output Token (excl. 1st token)------
	Mean TPOT (ms): 37.59
	Median TPOT (ms): 37.05
	P99 TPOT (ms): 44.19
	---------------Inter-token Latency----------------
	Mean ITL (ms): 37.59
	Median ITL (ms): 36.55
	P99 ITL (ms): 40.29
	==================================================

	Results google/gemma-3-4b-it

	============ Serving Benchmark Result ============
	Successful requests: 20
	Failed requests: 0
	Maximum request concurrency: 4
	Benchmark duration (s): 47.56
	Total input tokens: 2560
	Total generated tokens: 2560
	Request throughput (req/s): 0.42
	Output token throughput (tok/s): 53.83
	Peak output token throughput (tok/s): 68.00
	Peak concurrent requests: 8.00
	Total token throughput (tok/s): 107.65
	---------------Time to First Token----------------
	Mean TTFT (ms): 1194.13
	Median TTFT (ms): 1178.90
	P99 TTFT (ms): 1899.93
	-----Time per Output Token (excl. 1st token)------
	Mean TPOT (ms): 65.44
	Median TPOT (ms): 64.73
	P99 TPOT (ms): 69.39
	---------------Inter-token Latency----------------
	Mean ITL (ms): 65.44
	Median ITL (ms): 63.80
	P99 ITL (ms): 73.35
	==================================================

	Results google/gemma-3-12b-it

	============ Serving Benchmark Result ============
	Successful requests: 20
	Failed requests: 0
	Maximum request concurrency: 4
	Benchmark duration (s): 100.96
	Total input tokens: 2560
	Total generated tokens: 2560
	Request throughput (req/s): 0.20
	Output token throughput (tok/s): 25.36
	Peak output token throughput (tok/s): 32.00
	Peak concurrent requests: 8.00
	Total token throughput (tok/s): 50.71
	---------------Time to First Token----------------
	Mean TTFT (ms): 3094.45
	Median TTFT (ms): 3508.26
	P99 TTFT (ms): 4408.72
	-----Time per Output Token (excl. 1st token)------
	Mean TPOT (ms): 134.57
	Median TPOT (ms): 129.60
	P99 TPOT (ms): 156.76
	---------------Inter-token Latency----------------
	Mean ITL (ms): 134.57
	Median ITL (ms): 126.41
	P99 ITL (ms): 145.44
	==================================================

	Results microsoft/phi-4

	============ Serving Benchmark Result ============
	Successful requests: 20
	Failed requests: 0
	Maximum request concurrency: 4
	Benchmark duration (s): 107.44
	Total input tokens: 2560
	Total generated tokens: 2560
	Request throughput (req/s): 0.19
	Output token throughput (tok/s): 23.83
	Peak output token throughput (tok/s): 36.00
	Peak concurrent requests: 8.00
	Total token throughput (tok/s): 47.65
	---------------Time to First Token----------------
	Mean TTFT (ms): 4263.18
	Median TTFT (ms): 5383.14
	P99 TTFT (ms): 6157.70
	-----Time per Output Token (excl. 1st token)------
	Mean TPOT (ms): 135.58
	Median TPOT (ms): 129.80
	P99 TPOT (ms): 167.44
	---------------Inter-token Latency----------------
	Mean ITL (ms): 135.58
	Median ITL (ms): 124.98
	P99 ITL (ms): 150.06
	==================================================

	Results microsoft/Phi-4-mini-instruct

	============ Serving Benchmark Result ============
	Successful requests: 20
	Failed requests: 0
	Maximum request concurrency: 4
	Benchmark duration (s): 38.03
	Total input tokens: 2560
	Total generated tokens: 2560
	Request throughput (req/s): 0.53
	Output token throughput (tok/s): 67.32
	Peak output token throughput (tok/s): 92.00
	Peak concurrent requests: 8.00
	Total token throughput (tok/s): 134.64
	---------------Time to First Token----------------
	Mean TTFT (ms): 1380.97
	Median TTFT (ms): 1519.74
	P99 TTFT (ms): 2179.02
	-----Time per Output Token (excl. 1st token)------
	Mean TPOT (ms): 48.99
	Median TPOT (ms): 46.95
	P99 TPOT (ms): 60.58
	---------------Inter-token Latency----------------
	Mean ITL (ms): 48.99
	Median ITL (ms): 45.89
	P99 ITL (ms): 51.02
	==================================================
	# Install Docker Engine from Docker's official apt repository (run as root).
	# apt-get with -y is used instead of bare `apt` so no step stops at a prompt.

	# Add Docker's official GPG key
	apt-get update
	apt-get install -y ca-certificates curl
	install -m 0755 -d /etc/apt/keyrings
	curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
	chmod a+r /etc/apt/keyrings/docker.asc

	# Add the repository to Apt sources.
	# `<<-` strips the leading tabs, so the indented body and the indented EOF
	# terminator are both handled correctly.
	tee /etc/apt/sources.list.d/docker.sources <<-EOF
	Types: deb
	URIs: https://download.docker.com/linux/ubuntu
	Suites: $(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}")
	Components: stable
	Signed-By: /etc/apt/keyrings/docker.asc
	EOF

	apt-get update

	apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin

	git clone https://github.com/vllm-project/vllm
	pushd vllm
	# Create docker file and build image.
	# The working directory is already the checkout root, so the overlay
	# Dockerfile is written to docker/ -- the same path the second build reads
	# via -f. Its single stage is named vllm-openai so that
	# `--target vllm-openai` below resolves to a real stage.
	# `<<-` strips leading tabs so the indented terminator is recognised.
	cat > docker/Dockerfile.cpu-amd <<-'EOF'
	FROM vllm-cpu:latest AS vllm-openai
	RUN apt-get update -y \
	&& apt-get install -y --no-install-recommends make cmake ccache git curl wget ca-certificates \
	gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg \
	libsm6 libxext6 libgl1 jq lsof libjemalloc2 gfortran \
	&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

	RUN git clone https://github.com/amd/ZenDNN-pytorch-plugin.git && \
	cd ZenDNN-pytorch-plugin && \
	uv pip install -r requirements.txt && \
	CC=gcc CXX=g++ python3 setup.py bdist_wheel && \
	uv pip install dist/*.whl

	ENTRYPOINT ["vllm", "serve"]
	EOF
	# Base CPU image, tagged vllm-cpu; the overlay's FROM line depends on it.
	docker build -f docker/Dockerfile.cpu \
	    --build-arg VLLM_CPU_AVX512BF16=1 \
	    --build-arg VLLM_CPU_AVX512VNNI=1 \
	    --build-arg VLLM_CPU_DISABLE_AVX512=0 \
	    --tag vllm-cpu \
	    --target vllm-openai \
	    .
	# ZenTorch overlay on top of vllm-cpu:latest. No --build-arg here: the
	# overlay Dockerfile declares no ARGs, so they would go unconsumed.
	docker build -f docker/Dockerfile.cpu-amd \
	    --tag vllm-cpu-zentorch \
	    --target vllm-openai \
	    .
	popd

	# Host-side virtualenv; testVLLm runs the benchmark client from it.
	# python3.12 is invoked explicitly so the interpreter matches the
	# python3.12-venv package installed on the previous line.
	apt-get install -y python3.12-venv
	python3.12 -m venv ~/.venvs/vllm
	~/.venvs/vllm/bin/pip install vllm ijson


	# set VLLM_CPU_OMP_THREADS_BIND based on your CPU cores

	# Start (or restart) the vllm-cpu-zentorch image as a detached container
	# named vllm-server.
	#   $1  model id, forwarded as --model to the image entrypoint (required)
	#   $2  optional Hugging Face token (defaults to the literal string "None")
	# Thread binding is derived from the host: all CPUs except the last one.
	function runVLLmServer() {
	    # Refuse to tear down a running server when no model was given.
	    if [ -z "${1:-}" ]; then
	        echo "usage: runVLLmServer MODEL [HF_TOKEN]" >&2
	        return 1
	    fi

	    # Best-effort cleanup of a previous run; the pauses give Docker time to settle.
	    # These must be real `||` operators: escaped, they become literal arguments.
	    docker kill vllm-server || echo "no existing container"
	    sleep 2
	    docker rm vllm-server || echo "no existing container to remove"
	    sleep 5

	    local MODEL="$1"
	    local HF_TOKEN="${2:-None}"

	    # Anchor on "CPU(s):" so other lscpu rows that start with "CPU(s)"
	    # (e.g. "CPU(s) scaling MHz:") cannot leak into the arithmetic below.
	    local CPU_COUNT
	    CPU_COUNT="$(lscpu | awk '/^CPU\(s\):/ {print $2}')"
	    local CORES="0-$((CPU_COUNT - 2))"

	    # Total RAM in MiB minus 1 GiB of headroom.
	    local SHM_SIZE="$(($(free -m | awk '/Mem/ {print $2}') - 1024))"

	    docker run --net=host \
	        --ipc=host \
	        --shm-size="${SHM_SIZE}m" \
	        --privileged=true \
	        --detach \
	        --volume /var/lib/huggingface:/root/.cache/huggingface \
	        --env HUGGING_FACE_HUB_TOKEN="${HF_TOKEN}" \
	        --env VLLM_PLUGINS="zentorch" \
	        --env VLLM_CPU_KVCACHE_SPACE=50 \
	        --env VLLM_CPU_OMP_THREADS_BIND="${CORES}" \
	        --env VLLM_CPU_NUM_OF_RESERVED_CPU=1 \
	        --name vllm-server \
	        --rm \
	        vllm-cpu-zentorch:latest --dtype=bfloat16 \
	        --max-num-seqs=5 \
	        --model="${MODEL}" \
	    && echo "Monitor logging with \`docker logs -f vllm-server\`"
	}

	# Run `vllm bench serve` from the host venv against http://localhost:8000.
	#   $1  model id, used for both --model and --tokenizer
	#   $2  optional Hugging Face token (defaults to the literal string "None")
	function testVLLm() {
	    local MODEL="$1"
	    # MODEL is quoted so an id with spaces or glob characters stays one argument.
	    HUGGING_FACE_HUB_TOKEN="${2:-None}" ~/.venvs/vllm/bin/python3 \
	        -m vllm.entrypoints.cli.main bench serve --backend vllm \
	        --base-url http://localhost:8000 \
	        --model "${MODEL}" \
	        --tokenizer "${MODEL}" \
	        --random-input-len 128 \
	        --random-output-len 128 \
	        --num-prompts 20 \
	        --max-concurrency 4 \
	        --temperature 0.7
	}


	# validate functionality
	# The pipe must be unescaped, otherwise `|` and `jq` are passed to curl as URLs.
	# -f: fail on HTTP errors instead of piping an error page into jq;
	# -sS: hide the progress meter but still print real errors.
	curl -fsS http://localhost:8000/v1/models | jq .