@namgyu-youn · Last active December 30, 2025 11:12
[log] W8A8-INT benchmark in TorchAO
torch.__version__='2.9.0+cu128'
torch.cuda.get_device_name()='NVIDIA A100 80GB PCIe'
torchao.__version__='0.15.0'
vllm.__version__='0.13.0'
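
The banner above is the output of Python's f-string debug syntax; a minimal sketch that reproduces it:

import torch
import torchao
import vllm

# f"{expr=}" prints the expression together with its repr, matching the lines above.
print(f"{torch.__version__=}")
print(f"{torch.cuda.get_device_name()=}")
print(f"{torchao.__version__=}")
print(f"{vllm.__version__=}")
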
processing quant_recipe int8_rowwise
Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0 Please see https://github.com/pytorch/ao/issues/2919 for more info
Running model_id='meta-llama/Llama-3.1-8B' with quant_recipe_name='int8_rowwise'
Quantizing model with config: Int8DynamicActivationInt8WeightConfig(layout=PlainLayout(), act_mapping_type=<MappingType.SYMMETRIC: 1>, weight_only_decode=False, granularity=PerRow(dim=-1), set_inductor_config=True, version=1)
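
The printed config corresponds to torchao's quantize_ API. A minimal sketch of the quantization step, assuming a standard Hugging Face load (the loading details are assumptions; only the torchao config mirrors the log):

import torch
from transformers import AutoModelForCausalLM
from torchao.quantization import Int8DynamicActivationInt8WeightConfig, PerRow, quantize_

# Load the bf16 checkpoint, then swap Linear weights in place for
# dynamic INT8 activation / per-row INT8 weight tensor subclasses (W8A8-INT),
# per the Int8DynamicActivationInt8WeightConfig(granularity=PerRow(...)) printed above.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B", torch_dtype=torch.bfloat16, device_map="cuda"
)
quantize_(model, Int8DynamicActivationInt8WeightConfig(granularity=PerRow()))
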
/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.)
  _C._set_float32_matmul_precision(precision)
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00, 1.13s/it]
LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([4096, 4096]), block_size=(1, 4096), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([1024, 4096]), block_size=(1, 4096), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([1024, 4096]), block_size=(1, 4096), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([4096, 4096]), block_size=(1, 4096), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([14336, 4096]), block_size=(1, 4096), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([14336, 4096]), block_size=(1, 4096), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([4096, 14336]), block_size=(1, 14336), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_emb): LlamaRotaryEmbedding()
  )
  (lm_head): Linear(in_features=4096, out_features=128256, bias=False)
)
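
Continuing the sketch above, a quick way to confirm the swap took effect, matching the module dump:

# The q_proj weight should now be a torchao tensor subclass rather than a plain Tensor.
w = model.model.layers[0].self_attn.q_proj.weight
print(type(w).__name__)  # LinearActivationQuantizedTensor, as in the dump above
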
saved model_id='meta-llama/Llama-3.1-8B', quant_recipe_name='int8_rowwise' to model_output_dir='benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/'
checkpoint size: 9.101492908 GB
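
The save step presumably goes through save_pretrained; a sketch under that assumption. torchao tensor subclasses cannot be written as safetensors, which is consistent with the ".pt checkpoint shards" vLLM loads further down:

# safe_serialization=False forces pickle-based shards that can hold the tensor
# subclasses; the exact call used by the benchmark script is an assumption.
model.save_pretrained(
    "benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/",
    safe_serialization=False,
)
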
benchmarking vllm prefill performance with --num_prompts 32 --input_len 4096 --output_len 32 --max_model_len 4128
Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0 Please see https://github.com/pytorch/ao/issues/2919 for more info
The tokenizer you are loading from 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
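
Per the warning, the tokenizer can be reloaded with the suggested flag; a sketch in which the fix_mistral_regex kwarg is taken verbatim from the warning text (how it is plumbed through from_pretrained is an assumption):

from transformers import AutoTokenizer

# The warning above says to set fix_mistral_regex=True when loading this tokenizer.
tok = AutoTokenizer.from_pretrained(
    "benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/",
    fix_mistral_regex=True,
)
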
When dataset path is not set, it will default to random dataset
INFO 12-30 11:05:07 [datasets.py:612] Sampling input_len from [4095, 4095] and output_len from [32, 32]
INFO 12-30 11:05:08 [utils.py:253] non-default args: {'tokenizer': 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/', 'dtype': 'bfloat16', 'max_model_len': 4128, 'enable_lora': None, 'reasoning_parser_plugin': '', 'model': 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/'}
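
The non-default args above map onto a vLLM LLM construction like the following sketch (the benchmark CLI builds it internally via LLM(**dataclasses.asdict(engine_args)), per the traceback further down):

from vllm import LLM

# Mirrors the non-default args logged above; all other engine args stay at defaults.
llm = LLM(
    model="benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/",
    tokenizer="benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/",
    dtype="bfloat16",
    max_model_len=4128,
)
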
INFO 12-30 11:05:18 [model.py:514] Resolved architecture: LlamaForCausalLM
INFO 12-30 11:05:18 [model.py:1661] Using max model len 4128
INFO 12-30 11:05:19 [scheduler.py:230] Chunked prefill is enabled with max_num_batched_tokens=8192.
The tokenizer you are loading from 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0 Please see https://github.com/pytorch/ao/issues/2919 for more info
(EngineCore_DP0 pid=5072) INFO 12-30 11:05:30 [core.py:93] Initializing a V1 LLM engine (v0.13.0) with config: model='benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/', speculative_config=None, tokenizer='benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4128, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=torchao, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False), seed=0, served_model_name=benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::kda_attention', 'vllm::sparse_attn_indexer'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': True, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False}, 'local_cache_dir': None}
(EngineCore_DP0 pid=5072) INFO 12-30 11:05:30 [parallel_state.py:1203] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.0.2.100:43923 backend=nccl
(EngineCore_DP0 pid=5072) INFO 12-30 11:05:30 [parallel_state.py:1411] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0
(EngineCore_DP0 pid=5072) INFO 12-30 11:05:35 [gpu_model_runner.py:3562] Starting to load model benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/...
(EngineCore_DP0 pid=5072) /home/elicer/ao/.venv/lib/python3.10/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.)
(EngineCore_DP0 pid=5072) _C._set_float32_matmul_precision(precision)
(EngineCore_DP0 pid=5072) INFO 12-30 11:06:01 [cuda.py:351] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION')
Loading pt checkpoint shards: 100% Completed | 2/2 [00:06<00:00, 3.45s/it]
(EngineCore_DP0 pid=5072) INFO 12-30 11:06:08 [default_loader.py:308] Loading weights took 6.93 seconds
(EngineCore_DP0 pid=5072) INFO 12-30 11:06:09 [gpu_model_runner.py:3659] Model loading took 8.4914 GiB memory and 32.976080 seconds
(EngineCore_DP0 pid=5072) INFO 12-30 11:06:20 [backends.py:643] Using cache directory: /home/elicer/.cache/vllm/torch_compile_cache/95cf394f53/rank_0_0/backbone for vLLM's torch.compile
(EngineCore_DP0 pid=5072) INFO 12-30 11:06:20 [backends.py:703] Dynamo bytecode transform time: 10.92 s
(EngineCore_DP0 pid=5072) INFO 12-30 11:07:06 [backends.py:261] Cache the graph of compile range (1, 8192) for later use
(EngineCore_DP0 pid=5072) INFO 12-30 11:08:29 [backends.py:278] Compiling a graph for compile range (1, 8192) takes 121.43 s
(EngineCore_DP0 pid=5072) INFO 12-30 11:08:29 [monitor.py:34] torch.compile takes 132.35 s in total
(EngineCore_DP0 pid=5072) INFO 12-30 11:08:31 [gpu_worker.py:375] Available KV cache memory: 61.40 GiB
(EngineCore_DP0 pid=5072) INFO 12-30 11:08:32 [kv_cache_utils.py:1291] GPU KV cache size: 502,944 tokens
(EngineCore_DP0 pid=5072) INFO 12-30 11:08:32 [kv_cache_utils.py:1296] Maximum concurrency for 4,128 tokens per request: 121.84x
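
The concurrency figure is just KV-cache capacity divided by the per-request token budget:

# 502,944 cache tokens / 4,128 tokens per request ≈ 121.84x concurrency.
print(502_944 / 4_128)  # 121.8372...
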
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|          | 0/51 [00:00<?, ?it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  90%|█████████ | 46/51 [00:02<00:00, 18.38it/s]
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] EngineCore failed to start.
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] Traceback (most recent call last):
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 857, in run_engine_core
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 637, in __init__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] super().__init__(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 109, in __init__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 256, in _initialize_kv_caches
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] self.model_executor.initialize_from_config(kv_cache_configs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/executor/abstract.py", line 116, in initialize_from_config
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] self.collective_rpc("compile_or_warm_up_model")
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/executor/uniproc_executor.py", line 75, in collective_rpc
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] result = run_method(self.driver_worker, method, args, kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/serial_utils.py", line 461, in run_method
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return func(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 459, in compile_or_warm_up_model
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] cuda_graph_memory_bytes = self.model_runner.capture_model()
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4540, in capture_model
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] self._capture_cudagraphs(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4641, in _capture_cudagraphs
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] self._dummy_run(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return func(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4198, in _dummy_run
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] outputs = self.model(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/cuda_graph.py", line 220, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return self.runnable(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 623, in forward
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] model_output = self.model(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/decorators.py", line 439, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/wrapper.py", line 223, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return self._call_with_optional_nvtx_range(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/wrapper.py", line 109, in _call_with_optional_nvtx_range
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return callable_fn(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 412, in forward
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] def forward(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return fn(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/caching.py", line 54, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return self.optimized_call(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 837, in call_wrapped
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return self._wrapped_call(self, *args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 413, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] raise e
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 400, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc]
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "<eval_with_key>.66", line 202, in forward
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] submod_0 = self.submod_0(l_input_ids_, s72, l_self_modules_embed_tokens_parameters_weight_, l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); l_input_ids_ = l_self_modules_embed_tokens_parameters_weight_ = l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = None
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/cuda_graph.py", line 220, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return self.runnable(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/piecewise_backend.py", line 178, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return range_entry.runnable(*args)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/standalone_compile.py", line 63, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return self._compiled_fn(*args)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return fn(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 1130, in forward
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return compiled_fn(full_args)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 353, in runtime_wrapper
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] all_outs = call_func_at_runtime_with_args(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 129, in call_func_at_runtime_with_args
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] out = normalize_as_list(f(args))
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 690, in inner_fn
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] unwrapped_outs = compiled_fn(unwrapped_args)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 724, in inner_fn
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] outs = compiled_fn(args)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 526, in wrapper
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return compiled_fn(runtime_args)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/output_code.py", line 613, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return self.current_callable(inputs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/utils.py", line 2962, in run
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] out = model(new_inputs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/tmp/torchinductor_elicer/bs/cbskvs5py2ng3xnljyvn4sy6vjxpvbd7w7sch7mh7qpegxcvngn4.py", line 815, in call
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] extern_kernels._int_mm(buf5, reinterpret_tensor(arg4_1, (4096, 6144), (1, 4096), 0), out=buf6)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] RuntimeError: self.size(0) needs to be greater than 16, but got 16
(EngineCore_DP0 pid=5072) Process EngineCore_DP0:
(EngineCore_DP0 pid=5072) Traceback (most recent call last):
(EngineCore_DP0 pid=5072)   [identical traceback to the ERROR log above, re-raised on process exit]
(EngineCore_DP0 pid=5072) RuntimeError: self.size(0) needs to be greater than 16, but got 16
[rank0]:[W1230 11:08:37.382981035 ProcessGroupNCCL.cpp:1524] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
Traceback (most recent call last):
File "/home/elicer/ao/.venv/bin/vllm", line 10, in <module>
sys.exit(main())
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/entrypoints/cli/main.py", line 73, in main
args.dispatch_function(args)
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/entrypoints/cli/benchmark/throughput.py", line 21, in cmd
main(args)
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/benchmarks/throughput.py", line 730, in main
elapsed_time, request_outputs = run_vllm(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/benchmarks/throughput.py", line 51, in run_vllm
llm = LLM(**dataclasses.asdict(engine_args))
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/entrypoints/llm.py", line 351, in __init__
self.llm_engine = LLMEngine.from_engine_args(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/llm_engine.py", line 183, in from_engine_args
return cls(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/llm_engine.py", line 109, in __init__
self.engine_core = EngineCoreClient.make_client(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 93, in make_client
return SyncMPClient(vllm_config, executor_class, log_stats)
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 648, in __init__
super().__init__(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 477, in __init__
with launch_core_engines(vllm_config, executor_class, log_stats) as (
File "/usr/local/lib/python3.10/contextlib.py", line 142, in __exit__
next(self.gen)
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/utils.py", line 903, in launch_core_engines
wait_for_engine_startup(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/utils.py", line 960, in wait_for_engine_startup
raise RuntimeError(
RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
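
The root cause is the _int_mm call at the bottom of the engine traceback: CUDA torch._int_mm requires its first dimension to be strictly greater than 16, and the CUDA-graph warmup batch of 16 tokens hits that boundary exactly on the fused QKV projection (4096 -> 6144 = 4096 + 1024 + 1024). A minimal repro sketch:

import torch

# Same shapes as the failing Inductor kernel: a 16-token INT8 activation
# against the QKV weight. The size check fires before any matmul runs.
a = torch.randint(-128, 127, (16, 4096), dtype=torch.int8, device="cuda")
b = torch.randint(-128, 127, (4096, 6144), dtype=torch.int8, device="cuda")
out = torch._int_mm(a, b)
# RuntimeError: self.size(0) needs to be greater than 16, but got 16

This is presumably why capture succeeds for 46/51 graph sizes and fails only when it reaches the size-16 graph.
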
Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0 Please see https://github.com/pytorch/ao/issues/2919 for more info
The tokenizer you are loading from 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
When dataset path is not set, it will default to random dataset
INFO 12-30 11:08:53 [datasets.py:612] Sampling input_len from [31, 31] and output_len from [2048, 2048]
INFO 12-30 11:08:53 [utils.py:253] non-default args: {'tokenizer': 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/', 'dtype': 'bfloat16', 'max_model_len': 2080, 'enable_lora': None, 'reasoning_parser_plugin': '', 'model': 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/'}
INFO 12-30 11:08:53 [model.py:514] Resolved architecture: LlamaForCausalLM
INFO 12-30 11:08:53 [model.py:1661] Using max model len 2080
INFO 12-30 11:08:54 [scheduler.py:230] Chunked prefill is enabled with max_num_batched_tokens=8192.
The tokenizer you are loading from 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0 Please see https://github.com/pytorch/ao/issues/2919 for more info
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:05 [core.py:93] Initializing a V1 LLM engine (v0.13.0) with config: model='benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/', speculative_config=None, tokenizer='benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2080, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=torchao, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False), seed=0, served_model_name=benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::kda_attention', 'vllm::sparse_attn_indexer'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': True, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False}, 'local_cache_dir': None}
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:05 [parallel_state.py:1203] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.0.2.100:54775 backend=nccl
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:05 [parallel_state.py:1411] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:06 [gpu_model_runner.py:3562] Starting to load model benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/...
(EngineCore_DP0 pid=9421) /home/elicer/ao/.venv/lib/python3.10/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.)
(EngineCore_DP0 pid=9421) _C._set_float32_matmul_precision(precision)
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:06 [cuda.py:351] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION')
Loading pt checkpoint shards: 100% Completed | 2/2 [00:07<00:00, 3.63s/it]
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:14 [default_loader.py:308] Loading weights took 7.31 seconds
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:15 [gpu_model_runner.py:3659] Model loading took 8.4914 GiB memory and 8.213182 seconds
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:26 [backends.py:643] Using cache directory: /home/elicer/.cache/vllm/torch_compile_cache/626308a02e/rank_0_0/backbone for vLLM's torch.compile
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:26 [backends.py:703] Dynamo bytecode transform time: 10.79 s
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:35 [backends.py:261] Cache the graph of compile range (1, 8192) for later use
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:56 [backends.py:278] Compiling a graph for compile range (1, 8192) takes 22.31 s
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:56 [monitor.py:34] torch.compile takes 33.10 s in total
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:58 [gpu_worker.py:375] Available KV cache memory: 61.40 GiB
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:58 [kv_cache_utils.py:1291] GPU KV cache size: 502,976 tokens
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:58 [kv_cache_utils.py:1296] Maximum concurrency for 2,080 tokens per request: 241.82x
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|          | 0/51 [00:00<?, ?it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  90%|█████████ | 46/51 [00:02<00:00, 19.03it/s]
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] EngineCore failed to start.
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] Traceback (most recent call last):
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866]   [identical traceback to the first failure; the original log ends mid-traceback here]
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] out = normalize_as_list(f(args))
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 526, in wrapper
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return compiled_fn(runtime_args)
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 690, in inner_fn
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] unwrapped_outs = compiled_fn(unwrapped_args)
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/output_code.py", line 613, in __call__
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return self.current_callable(inputs)
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/utils.py", line 2962, in run
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] out = model(new_inputs)
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/tmp/torchinductor_elicer/bs/cbskvs5py2ng3xnljyvn4sy6vjxpvbd7w7sch7mh7qpegxcvngn4.py", line 815, in call
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] extern_kernels._int_mm(buf5, reinterpret_tensor(arg4_1, (4096, 6144), (1, 4096), 0), out=buf6)
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] RuntimeError: self.size(0) needs to be greater than 16, but got 16
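--- editor's note -------------------------------------------------------------
The RuntimeError above is the shape check in PyTorch's cuBLAS-backed int8 GEMM:
torch._int_mm rejects a left operand with exactly 16 rows, and vLLM's CUDA-graph
warm-up happens to replay a dummy batch of 16 tokens through the quantized
qkv_proj. A minimal repro sketch using the shapes from the failing call
(K=4096, N=6144); this snippet is an illustration added for this log, not part
of the original run:

    import torch

    k, n = 4096, 6144
    b = torch.randint(-128, 127, (k, n), dtype=torch.int8, device="cuda")

    # M == 16 trips the check on this build:
    # "self.size(0) needs to be greater than 16, but got 16"
    a_bad = torch.randint(-128, 127, (16, k), dtype=torch.int8, device="cuda")
    try:
        torch._int_mm(a_bad, b)
    except RuntimeError as e:
        print(e)

    # Any M strictly greater than 16 passes, returning an int32 (M, N) result
    a_ok = torch.randint(-128, 127, (32, k), dtype=torch.int8, device="cuda")
    print(torch._int_mm(a_ok, b).shape)

The process-level traceback below repeats the same failure.
--------------------------------------------------------------------------------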
(EngineCore_DP0 pid=9421) Process EngineCore_DP0:
(EngineCore_DP0 pid=9421) Traceback (most recent call last):
(EngineCore_DP0 pid=9421) File "/usr/local/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
(EngineCore_DP0 pid=9421) self.run()
(EngineCore_DP0 pid=9421) File "/usr/local/lib/python3.10/multiprocessing/process.py", line 108, in run
(EngineCore_DP0 pid=9421) self._target(*self._args, **self._kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 870, in run_engine_core
(EngineCore_DP0 pid=9421) raise e
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 857, in run_engine_core
(EngineCore_DP0 pid=9421) engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 637, in __init__
(EngineCore_DP0 pid=9421) super().__init__(
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 109, in __init__
(EngineCore_DP0 pid=9421) num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 256, in _initialize_kv_caches
(EngineCore_DP0 pid=9421) self.model_executor.initialize_from_config(kv_cache_configs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/executor/abstract.py", line 116, in initialize_from_config
(EngineCore_DP0 pid=9421) self.collective_rpc("compile_or_warm_up_model")
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/executor/uniproc_executor.py", line 75, in collective_rpc
(EngineCore_DP0 pid=9421) result = run_method(self.driver_worker, method, args, kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/serial_utils.py", line 461, in run_method
(EngineCore_DP0 pid=9421) return func(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 459, in compile_or_warm_up_model
(EngineCore_DP0 pid=9421) cuda_graph_memory_bytes = self.model_runner.capture_model()
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4540, in capture_model
(EngineCore_DP0 pid=9421) self._capture_cudagraphs(
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4641, in _capture_cudagraphs
(EngineCore_DP0 pid=9421) self._dummy_run(
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
(EngineCore_DP0 pid=9421) return func(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4198, in _dummy_run
(EngineCore_DP0 pid=9421) outputs = self.model(
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/cuda_graph.py", line 220, in __call__
(EngineCore_DP0 pid=9421) return self.runnable(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
(EngineCore_DP0 pid=9421) return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
(EngineCore_DP0 pid=9421) return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 623, in forward
(EngineCore_DP0 pid=9421) model_output = self.model(
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/decorators.py", line 439, in __call__
(EngineCore_DP0 pid=9421) return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/wrapper.py", line 223, in __call__
(EngineCore_DP0 pid=9421) return self._call_with_optional_nvtx_range(
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/wrapper.py", line 109, in _call_with_optional_nvtx_range
(EngineCore_DP0 pid=9421) return callable_fn(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 412, in forward
(EngineCore_DP0 pid=9421) def forward(
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn
(EngineCore_DP0 pid=9421) return fn(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/caching.py", line 54, in __call__
(EngineCore_DP0 pid=9421) return self.optimized_call(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 837, in call_wrapped
(EngineCore_DP0 pid=9421) return self._wrapped_call(self, *args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 413, in __call__
(EngineCore_DP0 pid=9421) raise e
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 400, in __call__
(EngineCore_DP0 pid=9421) return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc]
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
(EngineCore_DP0 pid=9421) return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
(EngineCore_DP0 pid=9421) return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "<eval_with_key>.66", line 202, in forward
(EngineCore_DP0 pid=9421) submod_0 = self.submod_0(l_input_ids_, s72, l_self_modules_embed_tokens_parameters_weight_, l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); l_input_ids_ = l_self_modules_embed_tokens_parameters_weight_ = l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = None
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/cuda_graph.py", line 220, in __call__
(EngineCore_DP0 pid=9421) return self.runnable(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/piecewise_backend.py", line 178, in __call__
(EngineCore_DP0 pid=9421) return range_entry.runnable(*args)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/standalone_compile.py", line 63, in __call__
(EngineCore_DP0 pid=9421) return self._compiled_fn(*args)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn
(EngineCore_DP0 pid=9421) return fn(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 1130, in forward
(EngineCore_DP0 pid=9421) return compiled_fn(full_args)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 353, in runtime_wrapper
(EngineCore_DP0 pid=9421) all_outs = call_func_at_runtime_with_args(
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 129, in call_func_at_runtime_with_args
(EngineCore_DP0 pid=9421) out = normalize_as_list(f(args))
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 526, in wrapper
(EngineCore_DP0 pid=9421) return compiled_fn(runtime_args)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 690, in inner_fn
(EngineCore_DP0 pid=9421) unwrapped_outs = compiled_fn(unwrapped_args)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/output_code.py", line 613, in __call__
(EngineCore_DP0 pid=9421) return self.current_callable(inputs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/utils.py", line 2962, in run
(EngineCore_DP0 pid=9421) out = model(new_inputs)
(EngineCore_DP0 pid=9421) File "/tmp/torchinductor_elicer/bs/cbskvs5py2ng3xnljyvn4sy6vjxpvbd7w7sch7mh7qpegxcvngn4.py", line 815, in call
(EngineCore_DP0 pid=9421) extern_kernels._int_mm(buf5, reinterpret_tensor(arg4_1, (4096, 6144), (1, 4096), 0), out=buf6)
(EngineCore_DP0 pid=9421) RuntimeError: self.size(0) needs to be greater than 16, but got 16
[rank0]:[W1230 11:10:03.448819061 ProcessGroupNCCL.cpp:1524] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
Traceback (most recent call last):
File "/home/elicer/ao/.venv/bin/vllm", line 10, in <module>
sys.exit(main())
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/entrypoints/cli/main.py", line 73, in main
args.dispatch_function(args)
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/entrypoints/cli/benchmark/throughput.py", line 21, in cmd
main(args)
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/benchmarks/throughput.py", line 730, in main
elapsed_time, request_outputs = run_vllm(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/benchmarks/throughput.py", line 51, in run_vllm
llm = LLM(**dataclasses.asdict(engine_args))
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/entrypoints/llm.py", line 351, in __init__
self.llm_engine = LLMEngine.from_engine_args(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/llm_engine.py", line 183, in from_engine_args
return cls(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/llm_engine.py", line 109, in __init__
self.engine_core = EngineCoreClient.make_client(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 93, in make_client
return SyncMPClient(vllm_config, executor_class, log_stats)
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 648, in __init__
super().__init__(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 477, in __init__
with launch_core_engines(vllm_config, executor_class, log_stats) as (
File "/usr/local/lib/python3.10/contextlib.py", line 142, in __exit__
next(self.gen)
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/utils.py", line 903, in launch_core_engines
wait_for_engine_startup(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/utils.py", line 960, in wait_for_engine_startup
raise RuntimeError(
RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
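--- editor's note -------------------------------------------------------------
The crash fires during CUDA-graph capture (capture_model -> _dummy_run with a
16-token dummy batch), before any real requests run, so the benchmark can
likely be unblocked by keeping vLLM from compiling that exact batch size. A
hedged sketch using the offline LLM API; the checkpoint path is a placeholder,
and enforce_eager=True disables compilation and CUDA-graph capture entirely
(at some decode-throughput cost):

    from vllm import LLM

    llm = LLM(
        model="/path/to/llama-3.1-8b-int8-rowwise",  # placeholder path
        enforce_eager=True,  # never replays the M==16 _int_mm capture shape
    )
    print(llm.generate("Hello")[0].outputs[0].text)

Alternatively, trimming 16 out of the compilation config's CUDA-graph capture
sizes should keep graphs for the remaining batch sizes, though the exact knob
name varies across vLLM versions.
--------------------------------------------------------------------------------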