[log] W8A8-INT benchmark in TorchAO
torch.__version__='2.9.0+cu128'
torch.cuda.get_device_name()='NVIDIA A100 80GB PCIe'
torchao.__version__='0.15.0'
vllm.__version__='0.13.0'
processing quant_recipe int8_rowwise
Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0 Please see https://github.com/pytorch/ao/issues/2919 for more info
Running model_id='meta-llama/Llama-3.1-8B' with quant_recipe_name='int8_rowwise'
Quantizing model with config: Int8DynamicActivationInt8WeightConfig(layout=PlainLayout(), act_mapping_type=<MappingType.SYMMETRIC: 1>, weight_only_decode=False, granularity=PerRow(dim=-1), set_inductor_config=True, version=1)
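The config printed above is what torchao's one-call quantization API produces for the int8_rowwise recipe. A minimal sketch of how such a config is applied (standard torchao/transformers usage; the benchmark's actual script may differ in details):

```python
import torch
from transformers import AutoModelForCausalLM
from torchao.quantization import Int8DynamicActivationInt8WeightConfig, PerRow, quantize_

# Load the bf16 checkpoint, then swap every nn.Linear weight to int8 with
# per-row scales plus dynamic per-token int8 activation quantization (W8A8-INT).
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B", torch_dtype=torch.bfloat16, device_map="cuda"
)
quantize_(model, Int8DynamicActivationInt8WeightConfig(granularity=PerRow()))
```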
Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s]/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.)
_C._set_float32_matmul_precision(precision)
Loading checkpoint shards: 25%|██▌ | 1/4 [00:01<00:04, 1.47s/it]
Loading checkpoint shards: 50%|█████ | 2/4 [00:02<00:02, 1.43s/it]
Loading checkpoint shards: 75%|███████▌ | 3/4 [00:04<00:01, 1.39s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00, 1.05it/s]
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00, 1.13s/it]
LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([4096, 4096]), block_size=(1, 4096), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([1024, 4096]), block_size=(1, 4096), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([1024, 4096]), block_size=(1, 4096), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([4096, 4096]), block_size=(1, 4096), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([14336, 4096]), block_size=(1, 4096), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([14336, 4096]), block_size=(1, 4096), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([4096, 14336]), block_size=(1, 14336), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_emb): LlamaRotaryEmbedding()
  )
  (lm_head): Linear(in_features=4096, out_features=128256, bias=False)
)
saved model_id='meta-llama/Llama-3.1-8B', quant_recipe_name='int8_rowwise' to model_output_dir='benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/'
checkpoint size: 9.101492908 GB
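That size is consistent with the module printout above: the 32 decoder layers hold roughly 218M linear-weight parameters each (≈ 6.98B total) stored as int8 at 1 byte per parameter ≈ 6.98 GB, while embed_tokens and lm_head stay unquantized in bf16 (2 × 128256 × 4096 × 2 bytes ≈ 2.10 GB); adding the small per-row scale tensors gives ≈ 9.1 GB.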
benchmarking vllm prefill performance with --num_prompts 32 --input_len 4096 --output_len 32 --max_model_len 4128
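The entrypoint here is vLLM's throughput benchmark CLI (vllm/benchmarks/throughput.py in the traceback below), which constructs an LLM engine from these flags. A rough Python equivalent of the engine being built, with arguments taken from the log (the SamplingParams values are an assumption matching --output_len 32):

```python
from vllm import LLM, SamplingParams

# Engine arguments as logged: quantized checkpoint dir, bf16 activations,
# max_model_len = input_len + output_len = 4096 + 32 = 4128.
llm = LLM(
    model="benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/",
    dtype="bfloat16",
    max_model_len=4128,
)
sampling = SamplingParams(max_tokens=32, ignore_eos=True)
# The benchmark then times llm.generate(...) over 32 random 4096-token prompts.
```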
Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0 Please see https://github.com/pytorch/ao/issues/2919 for more info
The tokenizer you are loading from 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
When dataset path is not set, it will default to random dataset
INFO 12-30 11:05:07 [datasets.py:612] Sampling input_len from [4095, 4095] and output_len from [32, 32]
INFO 12-30 11:05:08 [utils.py:253] non-default args: {'tokenizer': 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/', 'dtype': 'bfloat16', 'max_model_len': 4128, 'enable_lora': None, 'reasoning_parser_plugin': '', 'model': 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/'}
INFO 12-30 11:05:18 [model.py:514] Resolved architecture: LlamaForCausalLM
INFO 12-30 11:05:18 [model.py:1661] Using max model len 4128
INFO 12-30 11:05:19 [scheduler.py:230] Chunked prefill is enabled with max_num_batched_tokens=8192.
The tokenizer you are loading from 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0 Please see https://github.com/pytorch/ao/issues/2919 for more info
(EngineCore_DP0 pid=5072) INFO 12-30 11:05:30 [core.py:93] Initializing a V1 LLM engine (v0.13.0) with config: model='benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/', speculative_config=None, tokenizer='benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4128, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=torchao, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False), seed=0, served_model_name=benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::kda_attention', 'vllm::sparse_attn_indexer'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': True, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False}, 'local_cache_dir': None}
(EngineCore_DP0 pid=5072) INFO 12-30 11:05:30 [parallel_state.py:1203] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.0.2.100:43923 backend=nccl
(EngineCore_DP0 pid=5072) INFO 12-30 11:05:30 [parallel_state.py:1411] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0
(EngineCore_DP0 pid=5072) INFO 12-30 11:05:35 [gpu_model_runner.py:3562] Starting to load model benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/...
(EngineCore_DP0 pid=5072) /home/elicer/ao/.venv/lib/python3.10/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.)
(EngineCore_DP0 pid=5072) _C._set_float32_matmul_precision(precision)
(EngineCore_DP0 pid=5072) INFO 12-30 11:06:01 [cuda.py:351] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION')
(EngineCore_DP0 pid=5072)
Loading pt checkpoint shards: 0% Completed | 0/2 [00:00<?, ?it/s]
(EngineCore_DP0 pid=5072)
Loading pt checkpoint shards: 50% Completed | 1/2 [00:03<00:03, 3.86s/it]
(EngineCore_DP0 pid=5072)
Loading pt checkpoint shards: 100% Completed | 2/2 [00:06<00:00, 3.37s/it]
(EngineCore_DP0 pid=5072)
Loading pt checkpoint shards: 100% Completed | 2/2 [00:06<00:00, 3.45s/it]
(EngineCore_DP0 pid=5072)
(EngineCore_DP0 pid=5072) INFO 12-30 11:06:08 [default_loader.py:308] Loading weights took 6.93 seconds
(EngineCore_DP0 pid=5072) INFO 12-30 11:06:09 [gpu_model_runner.py:3659] Model loading took 8.4914 GiB memory and 32.976080 seconds
(EngineCore_DP0 pid=5072) INFO 12-30 11:06:20 [backends.py:643] Using cache directory: /home/elicer/.cache/vllm/torch_compile_cache/95cf394f53/rank_0_0/backbone for vLLM's torch.compile
(EngineCore_DP0 pid=5072) INFO 12-30 11:06:20 [backends.py:703] Dynamo bytecode transform time: 10.92 s
(EngineCore_DP0 pid=5072) INFO 12-30 11:07:06 [backends.py:261] Cache the graph of compile range (1, 8192) for later use
(EngineCore_DP0 pid=5072) INFO 12-30 11:08:29 [backends.py:278] Compiling a graph for compile range (1, 8192) takes 121.43 s
(EngineCore_DP0 pid=5072) INFO 12-30 11:08:29 [monitor.py:34] torch.compile takes 132.35 s in total
(EngineCore_DP0 pid=5072) INFO 12-30 11:08:31 [gpu_worker.py:375] Available KV cache memory: 61.40 GiB
(EngineCore_DP0 pid=5072) INFO 12-30 11:08:32 [kv_cache_utils.py:1291] GPU KV cache size: 502,944 tokens
(EngineCore_DP0 pid=5072) INFO 12-30 11:08:32 [kv_cache_utils.py:1296] Maximum concurrency for 4,128 tokens per request: 121.84x
(EngineCore_DP0 pid=5072)
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00<?, ?it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 4%|▍ | 2/51 [00:00<00:03, 15.38it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 8%|▊ | 4/51 [00:00<00:02, 15.96it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 12%|█▏ | 6/51 [00:00<00:02, 16.31it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 16%|█▌ | 8/51 [00:00<00:02, 16.27it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 20%|█▉ | 10/51 [00:00<00:02, 17.11it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 24%|██▎ | 12/51 [00:00<00:02, 17.70it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 27%|██▋ | 14/51 [00:00<00:02, 18.15it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 31%|███▏ | 16/51 [00:00<00:01, 18.43it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 35%|███▌ | 18/51 [00:01<00:01, 18.63it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 39%|███▉ | 20/51 [00:01<00:01, 18.82it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 43%|████▎ | 22/51 [00:01<00:01, 18.88it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 47%|████▋ | 24/51 [00:01<00:01, 18.93it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 51%|█████ | 26/51 [00:01<00:01, 19.05it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 55%|█████▍ | 28/51 [00:01<00:01, 19.00it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 59%|█████▉ | 30/51 [00:01<00:01, 18.97it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 63%|██████▎ | 32/51 [00:01<00:01, 18.98it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 67%|██████▋ | 34/51 [00:01<00:00, 18.91it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 71%|███████ | 36/51 [00:01<00:00, 18.96it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 75%|███████▍ | 38/51 [00:02<00:00, 18.90it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 78%|███████▊ | 40/51 [00:02<00:00, 18.83it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 82%|████████▏ | 42/51 [00:02<00:00, 18.87it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 86%|████████▋ | 44/51 [00:02<00:00, 18.77it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 90%|█████████ | 46/51 [00:02<00:00, 18.79it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 90%|█████████ | 46/51 [00:02<00:00, 18.38it/s]
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] EngineCore failed to start.
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] Traceback (most recent call last):
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 857, in run_engine_core
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 637, in __init__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     super().__init__(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 109, in __init__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 256, in _initialize_kv_caches
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     self.model_executor.initialize_from_config(kv_cache_configs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/executor/abstract.py", line 116, in initialize_from_config
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     self.collective_rpc("compile_or_warm_up_model")
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/executor/uniproc_executor.py", line 75, in collective_rpc
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     result = run_method(self.driver_worker, method, args, kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/serial_utils.py", line 461, in run_method
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return func(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 459, in compile_or_warm_up_model
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     cuda_graph_memory_bytes = self.model_runner.capture_model()
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4540, in capture_model
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     self._capture_cudagraphs(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4641, in _capture_cudagraphs
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     self._dummy_run(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return func(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4198, in _dummy_run
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     outputs = self.model(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/cuda_graph.py", line 220, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return self.runnable(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 623, in forward
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     model_output = self.model(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/decorators.py", line 439, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/wrapper.py", line 223, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return self._call_with_optional_nvtx_range(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/wrapper.py", line 109, in _call_with_optional_nvtx_range
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return callable_fn(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 412, in forward
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     def forward(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return fn(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/caching.py", line 54, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return self.optimized_call(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 837, in call_wrapped
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return self._wrapped_call(self, *args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 413, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     raise e
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 400, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return super(self.cls, obj).__call__(*args, **kwargs)  # type: ignore[misc]
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "<eval_with_key>.66", line 202, in forward
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     submod_0 = self.submod_0(l_input_ids_, s72, l_self_modules_embed_tokens_parameters_weight_, l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); l_input_ids_ = l_self_modules_embed_tokens_parameters_weight_ = l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = None
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/cuda_graph.py", line 220, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return self.runnable(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/piecewise_backend.py", line 178, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return range_entry.runnable(*args)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/standalone_compile.py", line 63, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return self._compiled_fn(*args)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return fn(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 1130, in forward
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return compiled_fn(full_args)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 353, in runtime_wrapper
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     all_outs = call_func_at_runtime_with_args(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 129, in call_func_at_runtime_with_args
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     out = normalize_as_list(f(args))
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 690, in inner_fn
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     unwrapped_outs = compiled_fn(unwrapped_args)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 724, in inner_fn
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     outs = compiled_fn(args)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 526, in wrapper
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return compiled_fn(runtime_args)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/output_code.py", line 613, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     return self.current_callable(inputs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/utils.py", line 2962, in run
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     out = model(new_inputs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]   File "/tmp/torchinductor_elicer/bs/cbskvs5py2ng3xnljyvn4sy6vjxpvbd7w7sch7mh7qpegxcvngn4.py", line 815, in call
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866]     extern_kernels._int_mm(buf5, reinterpret_tensor(arg4_1, (4096, 6144), (1, 4096), 0), out=buf6)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] RuntimeError: self.size(0) needs to be greater than 16, but got 16
(EngineCore_DP0 pid=5072) Process EngineCore_DP0:
(EngineCore_DP0 pid=5072) Traceback (most recent call last):
(EngineCore_DP0 pid=5072)   File "/usr/local/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
(EngineCore_DP0 pid=5072)     self.run()
(EngineCore_DP0 pid=5072)   File "/usr/local/lib/python3.10/multiprocessing/process.py", line 108, in run
(EngineCore_DP0 pid=5072)     self._target(*self._args, **self._kwargs)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 870, in run_engine_core
(EngineCore_DP0 pid=5072)     raise e
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 857, in run_engine_core
(EngineCore_DP0 pid=5072)     engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 637, in __init__
(EngineCore_DP0 pid=5072)     super().__init__(
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 109, in __init__
(EngineCore_DP0 pid=5072)     num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 256, in _initialize_kv_caches
(EngineCore_DP0 pid=5072)     self.model_executor.initialize_from_config(kv_cache_configs)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/executor/abstract.py", line 116, in initialize_from_config
(EngineCore_DP0 pid=5072)     self.collective_rpc("compile_or_warm_up_model")
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/executor/uniproc_executor.py", line 75, in collective_rpc
(EngineCore_DP0 pid=5072)     result = run_method(self.driver_worker, method, args, kwargs)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/serial_utils.py", line 461, in run_method
(EngineCore_DP0 pid=5072)     return func(*args, **kwargs)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 459, in compile_or_warm_up_model
(EngineCore_DP0 pid=5072)     cuda_graph_memory_bytes = self.model_runner.capture_model()
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4540, in capture_model
(EngineCore_DP0 pid=5072)     self._capture_cudagraphs(
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4641, in _capture_cudagraphs
(EngineCore_DP0 pid=5072)     self._dummy_run(
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
(EngineCore_DP0 pid=5072)     return func(*args, **kwargs)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4198, in _dummy_run
(EngineCore_DP0 pid=5072)     outputs = self.model(
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/cuda_graph.py", line 220, in __call__
(EngineCore_DP0 pid=5072)     return self.runnable(*args, **kwargs)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
(EngineCore_DP0 pid=5072)     return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
(EngineCore_DP0 pid=5072)     return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 623, in forward
(EngineCore_DP0 pid=5072)     model_output = self.model(
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/decorators.py", line 439, in __call__
(EngineCore_DP0 pid=5072)     return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/wrapper.py", line 223, in __call__
(EngineCore_DP0 pid=5072)     return self._call_with_optional_nvtx_range(
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/wrapper.py", line 109, in _call_with_optional_nvtx_range
(EngineCore_DP0 pid=5072)     return callable_fn(*args, **kwargs)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 412, in forward
(EngineCore_DP0 pid=5072)     def forward(
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn
(EngineCore_DP0 pid=5072)     return fn(*args, **kwargs)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/caching.py", line 54, in __call__
(EngineCore_DP0 pid=5072)     return self.optimized_call(*args, **kwargs)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 837, in call_wrapped
(EngineCore_DP0 pid=5072)     return self._wrapped_call(self, *args, **kwargs)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 413, in __call__
(EngineCore_DP0 pid=5072)     raise e
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 400, in __call__
(EngineCore_DP0 pid=5072)     return super(self.cls, obj).__call__(*args, **kwargs)  # type: ignore[misc]
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
(EngineCore_DP0 pid=5072)     return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
(EngineCore_DP0 pid=5072)     return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=5072)   File "<eval_with_key>.66", line 202, in forward
(EngineCore_DP0 pid=5072)     submod_0 = self.submod_0(l_input_ids_, s72, l_self_modules_embed_tokens_parameters_weight_, l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); l_input_ids_ = l_self_modules_embed_tokens_parameters_weight_ = l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = None
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/cuda_graph.py", line 220, in __call__
(EngineCore_DP0 pid=5072)     return self.runnable(*args, **kwargs)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/piecewise_backend.py", line 178, in __call__
(EngineCore_DP0 pid=5072)     return range_entry.runnable(*args)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/standalone_compile.py", line 63, in __call__
(EngineCore_DP0 pid=5072)     return self._compiled_fn(*args)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn
(EngineCore_DP0 pid=5072)     return fn(*args, **kwargs)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 1130, in forward
(EngineCore_DP0 pid=5072)     return compiled_fn(full_args)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 353, in runtime_wrapper
(EngineCore_DP0 pid=5072)     all_outs = call_func_at_runtime_with_args(
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 129, in call_func_at_runtime_with_args
(EngineCore_DP0 pid=5072)     out = normalize_as_list(f(args))
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 690, in inner_fn
(EngineCore_DP0 pid=5072)     unwrapped_outs = compiled_fn(unwrapped_args)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 724, in inner_fn
(EngineCore_DP0 pid=5072)     outs = compiled_fn(args)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 526, in wrapper
(EngineCore_DP0 pid=5072)     return compiled_fn(runtime_args)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/output_code.py", line 613, in __call__
(EngineCore_DP0 pid=5072)     return self.current_callable(inputs)
(EngineCore_DP0 pid=5072)   File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/utils.py", line 2962, in run
(EngineCore_DP0 pid=5072)     out = model(new_inputs)
(EngineCore_DP0 pid=5072)   File "/tmp/torchinductor_elicer/bs/cbskvs5py2ng3xnljyvn4sy6vjxpvbd7w7sch7mh7qpegxcvngn4.py", line 815, in call
(EngineCore_DP0 pid=5072)     extern_kernels._int_mm(buf5, reinterpret_tensor(arg4_1, (4096, 6144), (1, 4096), 0), out=buf6)
(EngineCore_DP0 pid=5072) RuntimeError: self.size(0) needs to be greater than 16, but got 16
[rank0]:[W1230 11:08:37.382981035 ProcessGroupNCCL.cpp:1524] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
Traceback (most recent call last):
  File "/home/elicer/ao/.venv/bin/vllm", line 10, in <module>
    sys.exit(main())
  File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/entrypoints/cli/main.py", line 73, in main
    args.dispatch_function(args)
  File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/entrypoints/cli/benchmark/throughput.py", line 21, in cmd
    main(args)
  File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/benchmarks/throughput.py", line 730, in main
    elapsed_time, request_outputs = run_vllm(
  File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/benchmarks/throughput.py", line 51, in run_vllm
    llm = LLM(**dataclasses.asdict(engine_args))
  File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/entrypoints/llm.py", line 351, in __init__
    self.llm_engine = LLMEngine.from_engine_args(
  File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/llm_engine.py", line 183, in from_engine_args
    return cls(
  File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/llm_engine.py", line 109, in __init__
    self.engine_core = EngineCoreClient.make_client(
  File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 93, in make_client
    return SyncMPClient(vllm_config, executor_class, log_stats)
  File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 648, in __init__
    super().__init__(
  File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 477, in __init__
    with launch_core_engines(vllm_config, executor_class, log_stats) as (
  File "/usr/local/lib/python3.10/contextlib.py", line 142, in __exit__
    next(self.gen)
  File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/utils.py", line 903, in launch_core_engines
    wait_for_engine_startup(
  File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/utils.py", line 960, in wait_for_engine_startup
    raise RuntimeError(
RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
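The root cause is a kernel-level shape constraint rather than an engine bug: CUDA graph capture appears to walk cudagraph_capture_sizes from largest to smallest, the bar stops at 46/51, and the 47th-largest size in the list above is batch size 16; torch._int_mm's cuBLAS int8 GEMM path rejects M == 16 on this build. A minimal sketch reproducing just that constraint, with shapes and layout mirroring the failing extern_kernels._int_mm call (reinterpret_tensor(arg4_1, (4096, 6144), (1, 4096))):

```python
import torch

# int8 @ int8 -> int32 matmul, dispatched to cuBLAS on CUDA.
# With M == 16 this raises the same error seen during graph capture:
# "RuntimeError: self.size(0) needs to be greater than 16, but got 16"
a = torch.randint(-128, 128, (16, 4096), dtype=torch.int8, device="cuda")
w = torch.randint(-128, 128, (6144, 4096), dtype=torch.int8, device="cuda")
out = torch._int_mm(a, w.t())  # w.t() has strides (1, 4096), like the logged call
```

A plausible workaround, under that reading, is to drop 16 from the capture sizes (or run with enforce_eager) so the int8 GEMM is never captured at M == 16.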
Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0 Please see https://github.com/pytorch/ao/issues/2919 for more info
The tokenizer you are loading from 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
When dataset path is not set, it will default to random dataset
INFO 12-30 11:08:53 [datasets.py:612] Sampling input_len from [31, 31] and output_len from [2048, 2048]
INFO 12-30 11:08:53 [utils.py:253] non-default args: {'tokenizer': 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/', 'dtype': 'bfloat16', 'max_model_len': 2080, 'enable_lora': None, 'reasoning_parser_plugin': '', 'model': 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/'}
INFO 12-30 11:08:53 [model.py:514] Resolved architecture: LlamaForCausalLM
INFO 12-30 11:08:53 [model.py:1661] Using max model len 2080
INFO 12-30 11:08:54 [scheduler.py:230] Chunked prefill is enabled with max_num_batched_tokens=8192.
The tokenizer you are loading from 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0 Please see https://github.com/pytorch/ao/issues/2919 for more info
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:05 [core.py:93] Initializing a V1 LLM engine (v0.13.0) with config: model='benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/', speculative_config=None, tokenizer='benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2080, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=torchao, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False), seed=0, served_model_name=benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::kda_attention', 'vllm::sparse_attn_indexer'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': True, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False}, 'local_cache_dir': None}
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:05 [parallel_state.py:1203] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.0.2.100:54775 backend=nccl
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:05 [parallel_state.py:1411] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:06 [gpu_model_runner.py:3562] Starting to load model benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/...
(EngineCore_DP0 pid=9421) /home/elicer/ao/.venv/lib/python3.10/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.)
(EngineCore_DP0 pid=9421) _C._set_float32_matmul_precision(precision)
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:06 [cuda.py:351] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION')
(EngineCore_DP0 pid=9421)
Loading pt checkpoint shards: 0% Completed | 0/2 [00:00<?, ?it/s]
(EngineCore_DP0 pid=9421)
Loading pt checkpoint shards: 50% Completed | 1/2 [00:04<00:04, 4.07s/it]
(EngineCore_DP0 pid=9421)
Loading pt checkpoint shards: 100% Completed | 2/2 [00:07<00:00, 3.56s/it]
(EngineCore_DP0 pid=9421)
Loading pt checkpoint shards: 100% Completed | 2/2 [00:07<00:00, 3.63s/it]
(EngineCore_DP0 pid=9421)
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:14 [default_loader.py:308] Loading weights took 7.31 seconds
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:15 [gpu_model_runner.py:3659] Model loading took 8.4914 GiB memory and 8.213182 seconds
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:26 [backends.py:643] Using cache directory: /home/elicer/.cache/vllm/torch_compile_cache/626308a02e/rank_0_0/backbone for vLLM's torch.compile
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:26 [backends.py:703] Dynamo bytecode transform time: 10.79 s
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:35 [backends.py:261] Cache the graph of compile range (1, 8192) for later use
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:56 [backends.py:278] Compiling a graph for compile range (1, 8192) takes 22.31 s
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:56 [monitor.py:34] torch.compile takes 33.10 s in total
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:58 [gpu_worker.py:375] Available KV cache memory: 61.40 GiB
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:58 [kv_cache_utils.py:1291] GPU KV cache size: 502,976 tokens
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:58 [kv_cache_utils.py:1296] Maximum concurrency for 2,080 tokens per request: 241.82x
(EngineCore_DP0 pid=9421)
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00<?, ?it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 4%|▍ | 2/51 [00:00<00:02, 16.45it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 8%|▊ | 4/51 [00:00<00:02, 17.04it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 12%|█▏ | 6/51 [00:00<00:02, 17.42it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 16%|█▌ | 8/51 [00:00<00:02, 17.79it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 20%|█▉ | 10/51 [00:00<00:02, 18.31it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 24%|██▎ | 12/51 [00:00<00:02, 18.63it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 27%|██▋ | 14/51 [00:00<00:01, 18.90it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 31%|███▏ | 16/51 [00:00<00:01, 19.06it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 35%|███▌ | 18/51 [00:00<00:01, 19.20it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 39%|███▉ | 20/51 [00:01<00:01, 19.33it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 43%|████▎ | 22/51 [00:01<00:01, 19.37it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 47%|████▋ | 24/51 [00:01<00:01, 19.39it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 51%|█████ | 26/51 [00:01<00:01, 19.51it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 55%|█████▍ | 28/51 [00:01<00:01, 19.46it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 59%|█████▉ | 30/51 [00:01<00:01, 19.44it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 63%|██████▎ | 32/51 [00:01<00:00, 19.47it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 67%|██████▋ | 34/51 [00:01<00:00, 19.41it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 71%|███████ | 36/51 [00:01<00:00, 19.46it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 75%|███████▍ | 38/51 [00:01<00:00, 19.39it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 78%|███████▊ | 40/51 [00:02<00:00, 19.34it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 82%|████████▏ | 42/51 [00:02<00:00, 19.34it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 86%|████████▋ | 44/51 [00:02<00:00, 19.30it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 90%|█████████ | 46/51 [00:02<00:00, 19.33it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 90%|█████████ | 46/51 [00:02<00:00, 19.03it/s]
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] EngineCore failed to start. | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] Traceback (most recent call last): | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 857, in run_engine_core | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] engine_core = EngineCoreProc(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 637, in __init__ | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] super().__init__( | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 109, in __init__ | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches( | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 256, in _initialize_kv_caches | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] self.model_executor.initialize_from_config(kv_cache_configs) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/executor/abstract.py", line 116, in initialize_from_config | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] self.collective_rpc("compile_or_warm_up_model") | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/executor/uniproc_executor.py", line 75, in collective_rpc | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] result = run_method(self.driver_worker, method, args, kwargs) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/serial_utils.py", line 461, in run_method | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return func(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 459, in compile_or_warm_up_model | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] cuda_graph_memory_bytes = self.model_runner.capture_model() | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4540, in capture_model | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] self._capture_cudagraphs( | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4641, in _capture_cudagraphs | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] self._dummy_run( | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return func(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4198, in _dummy_run | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] outputs = self.model( | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/cuda_graph.py", line 220, in __call__ | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return self.runnable(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return self._call_impl(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return forward_call(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 623, in forward | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] model_output = self.model( | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/decorators.py", line 439, in __call__ | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/wrapper.py", line 223, in __call__ | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return self._call_with_optional_nvtx_range( | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/wrapper.py", line 109, in _call_with_optional_nvtx_range | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return callable_fn(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 412, in forward | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] def forward( | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return fn(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/caching.py", line 54, in __call__ | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return self.optimized_call(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 837, in call_wrapped | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return self._wrapped_call(self, *args, **kwargs) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 413, in __call__ | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] raise e | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 400, in __call__ | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc] | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return self._call_impl(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return forward_call(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "<eval_with_key>.66", line 202, in forward | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] submod_0 = self.submod_0(l_input_ids_, s72, l_self_modules_embed_tokens_parameters_weight_, l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); l_input_ids_ = l_self_modules_embed_tokens_parameters_weight_ = l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = None | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/cuda_graph.py", line 220, in __call__ | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return self.runnable(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/piecewise_backend.py", line 178, in __call__ | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return range_entry.runnable(*args) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/standalone_compile.py", line 63, in __call__ | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return self._compiled_fn(*args) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return fn(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 1130, in forward | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return compiled_fn(full_args) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 353, in runtime_wrapper | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] all_outs = call_func_at_runtime_with_args( | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 129, in call_func_at_runtime_with_args | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] out = normalize_as_list(f(args)) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 526, in wrapper | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return compiled_fn(runtime_args) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 690, in inner_fn | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] unwrapped_outs = compiled_fn(unwrapped_args) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/output_code.py", line 613, in __call__ | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return self.current_callable(inputs) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/utils.py", line 2962, in run | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] out = model(new_inputs) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/tmp/torchinductor_elicer/bs/cbskvs5py2ng3xnljyvn4sy6vjxpvbd7w7sch7mh7qpegxcvngn4.py", line 815, in call | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] extern_kernels._int_mm(buf5, reinterpret_tensor(arg4_1, (4096, 6144), (1, 4096), 0), out=buf6) | |
| (EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] RuntimeError: self.size(0) needs to be greater than 16, but got 16 | |
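Note: the failure above is deterministic. Inductor lowers the quantized qkv_proj GEMM to `torch._int_mm`, whose shape check rejects a first operand with only 16 rows, and 16 is exactly the batch-size bucket being captured when the progress bar stops at 46/51. A minimal sketch that reproduces the same constraint in isolation (assumptions: a CUDA device is available, and `torch._int_mm` is a private PyTorch API whose checks may differ across releases):

```python
import torch

# Repro sketch of the shape check hit above. M=16 matches the failing
# CUDA-graph capture size; the compiled kernel in the log multiplies a
# (16, 4096) int8 activation by a (4096, 6144) int8 qkv_proj weight view.
a = torch.randint(-128, 128, (16, 4096), dtype=torch.int8, device="cuda")
b = torch.randint(-128, 128, (4096, 6144), dtype=torch.int8, device="cuda")
torch._int_mm(a, b)  # RuntimeError: self.size(0) needs to be greater than 16, but got 16
```

Sizes above 16 are expected to pass on this build, which would be consistent with the larger capture sizes completing earlier in the log.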
| (EngineCore_DP0 pid=9421) Process EngineCore_DP0: | |
| (EngineCore_DP0 pid=9421) Traceback (most recent call last): | |
| (EngineCore_DP0 pid=9421) File "/usr/local/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap | |
| (EngineCore_DP0 pid=9421) self.run() | |
| (EngineCore_DP0 pid=9421) File "/usr/local/lib/python3.10/multiprocessing/process.py", line 108, in run | |
| (EngineCore_DP0 pid=9421) self._target(*self._args, **self._kwargs) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 870, in run_engine_core | |
| (EngineCore_DP0 pid=9421) raise e | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 857, in run_engine_core | |
| (EngineCore_DP0 pid=9421) engine_core = EngineCoreProc(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 637, in __init__ | |
| (EngineCore_DP0 pid=9421) super().__init__( | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 109, in __init__ | |
| (EngineCore_DP0 pid=9421) num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches( | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 256, in _initialize_kv_caches | |
| (EngineCore_DP0 pid=9421) self.model_executor.initialize_from_config(kv_cache_configs) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/executor/abstract.py", line 116, in initialize_from_config | |
| (EngineCore_DP0 pid=9421) self.collective_rpc("compile_or_warm_up_model") | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/executor/uniproc_executor.py", line 75, in collective_rpc | |
| (EngineCore_DP0 pid=9421) result = run_method(self.driver_worker, method, args, kwargs) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/serial_utils.py", line 461, in run_method | |
| (EngineCore_DP0 pid=9421) return func(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 459, in compile_or_warm_up_model | |
| (EngineCore_DP0 pid=9421) cuda_graph_memory_bytes = self.model_runner.capture_model() | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4540, in capture_model | |
| (EngineCore_DP0 pid=9421) self._capture_cudagraphs( | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4641, in _capture_cudagraphs | |
| (EngineCore_DP0 pid=9421) self._dummy_run( | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context | |
| (EngineCore_DP0 pid=9421) return func(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4198, in _dummy_run | |
| (EngineCore_DP0 pid=9421) outputs = self.model( | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/cuda_graph.py", line 220, in __call__ | |
| (EngineCore_DP0 pid=9421) return self.runnable(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl | |
| (EngineCore_DP0 pid=9421) return self._call_impl(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl | |
| (EngineCore_DP0 pid=9421) return forward_call(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 623, in forward | |
| (EngineCore_DP0 pid=9421) model_output = self.model( | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/decorators.py", line 439, in __call__ | |
| (EngineCore_DP0 pid=9421) return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/wrapper.py", line 223, in __call__ | |
| (EngineCore_DP0 pid=9421) return self._call_with_optional_nvtx_range( | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/wrapper.py", line 109, in _call_with_optional_nvtx_range | |
| (EngineCore_DP0 pid=9421) return callable_fn(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 412, in forward | |
| (EngineCore_DP0 pid=9421) def forward( | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn | |
| (EngineCore_DP0 pid=9421) return fn(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/caching.py", line 54, in __call__ | |
| (EngineCore_DP0 pid=9421) return self.optimized_call(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 837, in call_wrapped | |
| (EngineCore_DP0 pid=9421) return self._wrapped_call(self, *args, **kwargs) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 413, in __call__ | |
| (EngineCore_DP0 pid=9421) raise e | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 400, in __call__ | |
| (EngineCore_DP0 pid=9421) return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc] | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl | |
| (EngineCore_DP0 pid=9421) return self._call_impl(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl | |
| (EngineCore_DP0 pid=9421) return forward_call(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) File "<eval_with_key>.66", line 202, in forward | |
| (EngineCore_DP0 pid=9421) submod_0 = self.submod_0(l_input_ids_, s72, l_self_modules_embed_tokens_parameters_weight_, l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); l_input_ids_ = l_self_modules_embed_tokens_parameters_weight_ = l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = None | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/cuda_graph.py", line 220, in __call__ | |
| (EngineCore_DP0 pid=9421) return self.runnable(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/piecewise_backend.py", line 178, in __call__ | |
| (EngineCore_DP0 pid=9421) return range_entry.runnable(*args) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/standalone_compile.py", line 63, in __call__ | |
| (EngineCore_DP0 pid=9421) return self._compiled_fn(*args) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn | |
| (EngineCore_DP0 pid=9421) return fn(*args, **kwargs) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 1130, in forward | |
| (EngineCore_DP0 pid=9421) return compiled_fn(full_args) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 353, in runtime_wrapper | |
| (EngineCore_DP0 pid=9421) all_outs = call_func_at_runtime_with_args( | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 129, in call_func_at_runtime_with_args | |
| (EngineCore_DP0 pid=9421) out = normalize_as_list(f(args)) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 526, in wrapper | |
| (EngineCore_DP0 pid=9421) return compiled_fn(runtime_args) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 690, in inner_fn | |
| (EngineCore_DP0 pid=9421) unwrapped_outs = compiled_fn(unwrapped_args) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/output_code.py", line 613, in __call__ | |
| (EngineCore_DP0 pid=9421) return self.current_callable(inputs) | |
| (EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/utils.py", line 2962, in run | |
| (EngineCore_DP0 pid=9421) out = model(new_inputs) | |
| (EngineCore_DP0 pid=9421) File "/tmp/torchinductor_elicer/bs/cbskvs5py2ng3xnljyvn4sy6vjxpvbd7w7sch7mh7qpegxcvngn4.py", line 815, in call | |
| (EngineCore_DP0 pid=9421) extern_kernels._int_mm(buf5, reinterpret_tensor(arg4_1, (4096, 6144), (1, 4096), 0), out=buf6) | |
| (EngineCore_DP0 pid=9421) RuntimeError: self.size(0) needs to be greater than 16, but got 16 | |
| [rank0]:[W1230 11:10:03.448819061 ProcessGroupNCCL.cpp:1524] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) | |
| Traceback (most recent call last): | |
| File "/home/elicer/ao/.venv/bin/vllm", line 10, in <module> | |
| sys.exit(main()) | |
| File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/entrypoints/cli/main.py", line 73, in main | |
| args.dispatch_function(args) | |
| File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/entrypoints/cli/benchmark/throughput.py", line 21, in cmd | |
| main(args) | |
| File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/benchmarks/throughput.py", line 730, in main | |
| elapsed_time, request_outputs = run_vllm( | |
| File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/benchmarks/throughput.py", line 51, in run_vllm | |
| llm = LLM(**dataclasses.asdict(engine_args)) | |
| File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/entrypoints/llm.py", line 351, in __init__ | |
| self.llm_engine = LLMEngine.from_engine_args( | |
| File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/llm_engine.py", line 183, in from_engine_args | |
| return cls( | |
| File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/llm_engine.py", line 109, in __init__ | |
| self.engine_core = EngineCoreClient.make_client( | |
| File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 93, in make_client | |
| return SyncMPClient(vllm_config, executor_class, log_stats) | |
| File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 648, in __init__ | |
| super().__init__( | |
| File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 477, in __init__ | |
| with launch_core_engines(vllm_config, executor_class, log_stats) as ( | |
| File "/usr/local/lib/python3.10/contextlib.py", line 142, in __exit__ | |
| next(self.gen) | |
| File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/utils.py", line 903, in launch_core_engines | |
| wait_for_engine_startup( | |
| File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/utils.py", line 960, in wait_for_engine_startup | |
| raise RuntimeError( | |
| RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {} |
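The benchmark itself never runs for this recipe: the engine dies during CUDA-graph warm-up, so `vllm bench throughput` only surfaces the startup failure. One workaround sketch (an assumption, not verified on this setup) is to keep the int8 weights but skip CUDA-graph capture, so the failing M=16 capture pass is never executed:

```python
from vllm import LLM

# Workaround sketch (assumption, untested here): enforce_eager disables
# CUDA-graph capture entirely, so the batch-size-16 _int_mm graph is
# never built; this trades away some decode latency. Restricting vLLM's
# cudagraph capture sizes to values above 16 in the compilation config
# should also sidestep the failing bucket.
llm = LLM(model="meta-llama/Llama-3.1-8B", enforce_eager=True)
```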