Tested Docker commands for deploying Hugging Face LLMs with vLLM on NVIDIA H100 GPUs. Each command starts vLLM's OpenAI-compatible server, mapped to host port 8002 and protected by the API key token-sse123; replace <token> with your Hugging Face access token.

# openai/gpt-oss-120b, served from the dedicated gptoss image tag
docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=<token>" \
-p 8002:8000 \
--ipc=host \
vllm/vllm-openai:gptoss \
--api-key token-sse123 \
--gpu-memory-utilization 0.9 \
--max-model-len 16384 \
--max-num-seqs 16 \
--model openai/gpt-oss-120b
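
Once any of these servers is up, it can be smoke-tested through the OpenAI-compatible chat completions endpoint. A minimal sketch, assuming the server is reachable on localhost:8002 and was started with the API key shown above; the prompt and max_tokens value are arbitrary:

curl http://localhost:8002/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer token-sse123" \
  -d '{
        "model": "openai/gpt-oss-120b",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 64
      }'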

# meta-llama/Llama-3.3-70B-Instruct, quantized in-flight with bitsandbytes
docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=<token>" \
-p 8002:8000 \
--ipc=host \
vllm/vllm-openai:latest \
--api-key token-sse123 \
--gpu-memory-utilization 0.92 \
--quantization bitsandbytes \
--load-format bitsandbytes \
--max-model-len 32768 \
--max-num-seqs 64 \
--enable-prefix-caching \
--model meta-llama/Llama-3.3-70B-Instruct
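
The bitsandbytes flag pair (--quantization bitsandbytes plus --load-format bitsandbytes) quantizes the full-precision checkpoint at load time, which is how the 70B weights fit in GPU memory; expect a noticeably longer startup. To confirm the exact model name the server registered (requests must reference it verbatim), query the models endpoint; a sketch, assuming the same port and key:

curl http://localhost:8002/v1/models \
  -H "Authorization: Bearer token-sse123"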

# Qwen/Qwen3-30B-A3B-Instruct-2507-FP8 (pre-quantized FP8 checkpoint)
docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=<token>" \
-p 8002:8000 \
--ipc=host \
vllm/vllm-openai:latest \
--api-key token-sse123 \
--gpu-memory-utilization 0.92 \
--max-model-len 16384 \
--max-num-seqs 64 \
--enable-prefix-caching \
--model Qwen/Qwen3-30B-A3B-Instruct-2507-FP8
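
For interactive use, the same endpoint supports server-sent events: pass "stream": true in the request body and curl's -N (no buffering) to see tokens as they are generated. A sketch against the Qwen3 instruct deployment above:

curl -N http://localhost:8002/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer token-sse123" \
  -d '{
        "model": "Qwen/Qwen3-30B-A3B-Instruct-2507-FP8",
        "messages": [{"role": "user", "content": "Count to five."}],
        "stream": true,
        "max_tokens": 64
      }'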

# Qwen/Qwen3-30B-A3B-Thinking-2507-FP8, with the qwen3 reasoning parser enabled
docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=<token>" \
-p 8002:8000 \
--ipc=host \
vllm/vllm-openai:latest \
--api-key token-sse123 \
--gpu-memory-utilization 0.92 \
--max-model-len 32768 \
--max-num-seqs 64 \
--enable-prefix-caching \
--reasoning-parser qwen3 \
--model Qwen/Qwen3-30B-A3B-Thinking-2507-FP8
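
With --reasoning-parser qwen3, vLLM separates the model's thinking tokens from the visible answer and returns them in a distinct reasoning_content field on the message. A sketch of reading both fields, assuming jq is installed:

curl -s http://localhost:8002/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer token-sse123" \
  -d '{
        "model": "Qwen/Qwen3-30B-A3B-Thinking-2507-FP8",
        "messages": [{"role": "user", "content": "What is 17 * 24?"}],
        "max_tokens": 2048
      }' \
  | jq '{reasoning: .choices[0].message.reasoning_content, answer: .choices[0].message.content}'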

# mistralai/Mixtral-8x7B-Instruct-v0.1, quantized in-flight with bitsandbytes
docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=<token>" \
-p 8002:8000 \
--ipc=host \
vllm/vllm-openai:latest \
--api-key token-sse123 \
--gpu-memory-utilization 0.90 \
--max-model-len 16384 \
--enable-prefix-caching \
--quantization bitsandbytes \
--load-format bitsandbytes \
--model mistralai/Mixtral-8x7B-Instruct-v0.1
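
Model download plus in-flight quantization means these containers can take several minutes before they accept traffic. vLLM also exposes a /health endpoint that returns 200 once the engine is ready, so a small wait loop works as a readiness check:

until curl -sf http://localhost:8002/health > /dev/null; do
  echo "waiting for vLLM..."
  sleep 10
done
echo "vLLM is ready"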

# deepseek-ai/DeepSeek-R1-Distill-Llama-70B, quantized in-flight with bitsandbytes
docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=<token>" \
-p 8002:8000 \
--ipc=host \
vllm/vllm-openai:latest \
--api-key token-sse123 \
--gpu-memory-utilization 0.92 \
--max-model-len 32768 \
--max-num-seqs 64 \
--enable-prefix-caching \
--quantization bitsandbytes \
--load-format bitsandbytes \
--model deepseek-ai/DeepSeek-R1-Distill-Llama-70B
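
All of the commands above run attached and stream the serving log to the terminal. Adding -d and --name to the docker flags runs a container in the background instead (a variation on the tested commands, not separately verified); it can then be monitored and stopped by name. The name vllm-server below is a hypothetical example:

docker logs -f vllm-server   # follow the serving log (assumes --name vllm-server was passed)
docker stop vllm-server      # stop the container gracefully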