@namgyu-youn · Last active December 30, 2025 11:12
[log] W8A8-INT benchmark in TorchAO
torch.__version__='2.9.0+cu128'
torch.cuda.get_device_name()='NVIDIA A100 80GB PCIe'
torchao.__version__='0.15.0'
vllm.__version__='0.13.0'
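
The banner above is the output of Python's f-string debug syntax; a minimal sketch that reproduces it:

import torch
import torchao
import vllm

# f"{expr=}" prints the expression together with its repr, matching the lines above.
print(f"{torch.__version__=}")
print(f"{torch.cuda.get_device_name()=}")
print(f"{torchao.__version__=}")
print(f"{vllm.__version__=}")
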
processing quant_recipe int8_rowwise
Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0 Please see https://github.com/pytorch/ao/issues/2919 for more info
Running model_id='meta-llama/Llama-3.1-8B' with quant_recipe_name='int8_rowwise'
Quantizing model with config: Int8DynamicActivationInt8WeightConfig(layout=PlainLayout(), act_mapping_type=<MappingType.SYMMETRIC: 1>, weight_only_decode=False, granularity=PerRow(dim=-1), set_inductor_config=True, version=1)
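
The printed config corresponds to torchao's quantize_ API. A minimal sketch of the quantization step, assuming a standard Hugging Face load (the loading details are assumptions; only the torchao config mirrors the log):

import torch
from transformers import AutoModelForCausalLM
from torchao.quantization import Int8DynamicActivationInt8WeightConfig, PerRow, quantize_

# Load the bf16 checkpoint, then swap Linear weights in place for
# dynamic INT8 activation / per-row INT8 weight tensor subclasses (W8A8-INT),
# per the Int8DynamicActivationInt8WeightConfig(granularity=PerRow(...)) printed above.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B", torch_dtype=torch.bfloat16, device_map="cuda"
)
quantize_(model, Int8DynamicActivationInt8WeightConfig(granularity=PerRow()))
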
/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.)
  _C._set_float32_matmul_precision(precision)
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00, 1.13s/it]
LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([4096, 4096]), block_size=(1, 4096), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([1024, 4096]), block_size=(1, 4096), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([1024, 4096]), block_size=(1, 4096), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([4096, 4096]), block_size=(1, 4096), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([14336, 4096]), block_size=(1, 4096), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([14336, 4096]), block_size=(1, 4096), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7f326b23d1b0>, weight=AffineQuantizedTensor(shape=torch.Size([4096, 14336]), block_size=(1, 14336), device=cuda:0, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=None, quant_max=None)))
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_emb): LlamaRotaryEmbedding()
  )
  (lm_head): Linear(in_features=4096, out_features=128256, bias=False)
)
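
Continuing the sketch above, a quick way to confirm the swap took effect, matching the module dump:

# The q_proj weight should now be a torchao tensor subclass rather than a plain Tensor.
w = model.model.layers[0].self_attn.q_proj.weight
print(type(w).__name__)  # LinearActivationQuantizedTensor, as in the dump above
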
saved model_id='meta-llama/Llama-3.1-8B', quant_recipe_name='int8_rowwise' to model_output_dir='benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/'
checkpoint size: 9.101492908 GB
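
The save step presumably goes through save_pretrained; a sketch under that assumption. torchao tensor subclasses cannot be written as safetensors, which is consistent with the ".pt checkpoint shards" vLLM loads further down:

# safe_serialization=False forces pickle-based shards that can hold the tensor
# subclasses; the exact call used by the benchmark script is an assumption.
model.save_pretrained(
    "benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/",
    safe_serialization=False,
)
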
benchmarking vllm prefill performance with --num_prompts 32 --input_len 4096 --output_len 32 --max_model_len 4128
Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0 Please see https://github.com/pytorch/ao/issues/2919 for more info
The tokenizer you are loading from 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
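
Per the warning, the tokenizer can be reloaded with the suggested flag; a sketch in which the fix_mistral_regex kwarg is taken verbatim from the warning text (how it is plumbed through from_pretrained is an assumption):

from transformers import AutoTokenizer

# The warning above says to set fix_mistral_regex=True when loading this tokenizer.
tok = AutoTokenizer.from_pretrained(
    "benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/",
    fix_mistral_regex=True,
)
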
When dataset path is not set, it will default to random dataset
INFO 12-30 11:05:07 [datasets.py:612] Sampling input_len from [4095, 4095] and output_len from [32, 32]
INFO 12-30 11:05:08 [utils.py:253] non-default args: {'tokenizer': 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/', 'dtype': 'bfloat16', 'max_model_len': 4128, 'enable_lora': None, 'reasoning_parser_plugin': '', 'model': 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/'}
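
The non-default args above map onto a vLLM LLM construction like the following sketch (the benchmark CLI builds it internally via LLM(**dataclasses.asdict(engine_args)), per the traceback further down):

from vllm import LLM

# Mirrors the non-default args logged above; all other engine args stay at defaults.
llm = LLM(
    model="benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/",
    tokenizer="benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/",
    dtype="bfloat16",
    max_model_len=4128,
)
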
INFO 12-30 11:05:18 [model.py:514] Resolved architecture: LlamaForCausalLM
INFO 12-30 11:05:18 [model.py:1661] Using max model len 4128
INFO 12-30 11:05:19 [scheduler.py:230] Chunked prefill is enabled with max_num_batched_tokens=8192.
The tokenizer you are loading from 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0 Please see https://github.com/pytorch/ao/issues/2919 for more info
(EngineCore_DP0 pid=5072) INFO 12-30 11:05:30 [core.py:93] Initializing a V1 LLM engine (v0.13.0) with config: model='benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/', speculative_config=None, tokenizer='benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4128, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=torchao, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False), seed=0, served_model_name=benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::kda_attention', 'vllm::sparse_attn_indexer'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': True, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False}, 'local_cache_dir': None}
(EngineCore_DP0 pid=5072) INFO 12-30 11:05:30 [parallel_state.py:1203] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.0.2.100:43923 backend=nccl
(EngineCore_DP0 pid=5072) INFO 12-30 11:05:30 [parallel_state.py:1411] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0
(EngineCore_DP0 pid=5072) INFO 12-30 11:05:35 [gpu_model_runner.py:3562] Starting to load model benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/...
(EngineCore_DP0 pid=5072) /home/elicer/ao/.venv/lib/python3.10/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.)
(EngineCore_DP0 pid=5072) _C._set_float32_matmul_precision(precision)
(EngineCore_DP0 pid=5072) INFO 12-30 11:06:01 [cuda.py:351] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION')
Loading pt checkpoint shards: 100% Completed | 2/2 [00:06<00:00, 3.45s/it]
(EngineCore_DP0 pid=5072) INFO 12-30 11:06:08 [default_loader.py:308] Loading weights took 6.93 seconds
(EngineCore_DP0 pid=5072) INFO 12-30 11:06:09 [gpu_model_runner.py:3659] Model loading took 8.4914 GiB memory and 32.976080 seconds
(EngineCore_DP0 pid=5072) INFO 12-30 11:06:20 [backends.py:643] Using cache directory: /home/elicer/.cache/vllm/torch_compile_cache/95cf394f53/rank_0_0/backbone for vLLM's torch.compile
(EngineCore_DP0 pid=5072) INFO 12-30 11:06:20 [backends.py:703] Dynamo bytecode transform time: 10.92 s
(EngineCore_DP0 pid=5072) INFO 12-30 11:07:06 [backends.py:261] Cache the graph of compile range (1, 8192) for later use
(EngineCore_DP0 pid=5072) INFO 12-30 11:08:29 [backends.py:278] Compiling a graph for compile range (1, 8192) takes 121.43 s
(EngineCore_DP0 pid=5072) INFO 12-30 11:08:29 [monitor.py:34] torch.compile takes 132.35 s in total
(EngineCore_DP0 pid=5072) INFO 12-30 11:08:31 [gpu_worker.py:375] Available KV cache memory: 61.40 GiB
(EngineCore_DP0 pid=5072) INFO 12-30 11:08:32 [kv_cache_utils.py:1291] GPU KV cache size: 502,944 tokens
(EngineCore_DP0 pid=5072) INFO 12-30 11:08:32 [kv_cache_utils.py:1296] Maximum concurrency for 4,128 tokens per request: 121.84x
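
The concurrency figure is just KV-cache capacity divided by the per-request token budget:

# 502,944 cache tokens / 4,128 tokens per request ≈ 121.84x concurrency.
print(502_944 / 4_128)  # 121.8372...
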
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|          | 0/51 [00:00<?, ?it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  90%|█████████ | 46/51 [00:02<00:00, 18.38it/s]
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] EngineCore failed to start.
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] Traceback (most recent call last):
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 857, in run_engine_core
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 637, in __init__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] super().__init__(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 109, in __init__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 256, in _initialize_kv_caches
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] self.model_executor.initialize_from_config(kv_cache_configs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/executor/abstract.py", line 116, in initialize_from_config
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] self.collective_rpc("compile_or_warm_up_model")
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/executor/uniproc_executor.py", line 75, in collective_rpc
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] result = run_method(self.driver_worker, method, args, kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/serial_utils.py", line 461, in run_method
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return func(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 459, in compile_or_warm_up_model
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] cuda_graph_memory_bytes = self.model_runner.capture_model()
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4540, in capture_model
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] self._capture_cudagraphs(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4641, in _capture_cudagraphs
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] self._dummy_run(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return func(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4198, in _dummy_run
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] outputs = self.model(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/cuda_graph.py", line 220, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return self.runnable(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 623, in forward
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] model_output = self.model(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/decorators.py", line 439, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/wrapper.py", line 223, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return self._call_with_optional_nvtx_range(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/wrapper.py", line 109, in _call_with_optional_nvtx_range
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return callable_fn(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 412, in forward
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] def forward(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return fn(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/caching.py", line 54, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return self.optimized_call(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 837, in call_wrapped
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return self._wrapped_call(self, *args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 413, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] raise e
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 400, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc]
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "<eval_with_key>.66", line 202, in forward
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] submod_0 = self.submod_0(l_input_ids_, s72, l_self_modules_embed_tokens_parameters_weight_, l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); l_input_ids_ = l_self_modules_embed_tokens_parameters_weight_ = l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = None
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/cuda_graph.py", line 220, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return self.runnable(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/piecewise_backend.py", line 178, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return range_entry.runnable(*args)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/standalone_compile.py", line 63, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return self._compiled_fn(*args)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return fn(*args, **kwargs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 1130, in forward
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return compiled_fn(full_args)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 353, in runtime_wrapper
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] all_outs = call_func_at_runtime_with_args(
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 129, in call_func_at_runtime_with_args
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] out = normalize_as_list(f(args))
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 690, in inner_fn
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] unwrapped_outs = compiled_fn(unwrapped_args)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 724, in inner_fn
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] outs = compiled_fn(args)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 526, in wrapper
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return compiled_fn(runtime_args)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/output_code.py", line 613, in __call__
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] return self.current_callable(inputs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/utils.py", line 2962, in run
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] out = model(new_inputs)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] File "/tmp/torchinductor_elicer/bs/cbskvs5py2ng3xnljyvn4sy6vjxpvbd7w7sch7mh7qpegxcvngn4.py", line 815, in call
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] extern_kernels._int_mm(buf5, reinterpret_tensor(arg4_1, (4096, 6144), (1, 4096), 0), out=buf6)
(EngineCore_DP0 pid=5072) ERROR 12-30 11:08:35 [core.py:866] RuntimeError: self.size(0) needs to be greater than 16, but got 16
(EngineCore_DP0 pid=5072) Process EngineCore_DP0:
(EngineCore_DP0 pid=5072) Traceback (most recent call last):
(EngineCore_DP0 pid=5072)   [identical traceback to the ERROR log above, re-raised on process exit]
(EngineCore_DP0 pid=5072) RuntimeError: self.size(0) needs to be greater than 16, but got 16
[rank0]:[W1230 11:08:37.382981035 ProcessGroupNCCL.cpp:1524] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
Traceback (most recent call last):
File "/home/elicer/ao/.venv/bin/vllm", line 10, in <module>
sys.exit(main())
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/entrypoints/cli/main.py", line 73, in main
args.dispatch_function(args)
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/entrypoints/cli/benchmark/throughput.py", line 21, in cmd
main(args)
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/benchmarks/throughput.py", line 730, in main
elapsed_time, request_outputs = run_vllm(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/benchmarks/throughput.py", line 51, in run_vllm
llm = LLM(**dataclasses.asdict(engine_args))
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/entrypoints/llm.py", line 351, in __init__
self.llm_engine = LLMEngine.from_engine_args(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/llm_engine.py", line 183, in from_engine_args
return cls(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/llm_engine.py", line 109, in __init__
self.engine_core = EngineCoreClient.make_client(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 93, in make_client
return SyncMPClient(vllm_config, executor_class, log_stats)
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 648, in __init__
super().__init__(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 477, in __init__
with launch_core_engines(vllm_config, executor_class, log_stats) as (
File "/usr/local/lib/python3.10/contextlib.py", line 142, in __exit__
next(self.gen)
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/utils.py", line 903, in launch_core_engines
wait_for_engine_startup(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/utils.py", line 960, in wait_for_engine_startup
raise RuntimeError(
RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
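
The root cause is the _int_mm call at the bottom of the engine traceback: CUDA torch._int_mm requires its first dimension to be strictly greater than 16, and the CUDA-graph warmup batch of 16 tokens hits that boundary exactly on the fused QKV projection (4096 -> 6144 = 4096 + 1024 + 1024). A minimal repro sketch:

import torch

# Same shapes as the failing Inductor kernel: a 16-token INT8 activation
# against the QKV weight. The size check fires before any matmul runs.
a = torch.randint(-128, 127, (16, 4096), dtype=torch.int8, device="cuda")
b = torch.randint(-128, 127, (4096, 6144), dtype=torch.int8, device="cuda")
out = torch._int_mm(a, b)
# RuntimeError: self.size(0) needs to be greater than 16, but got 16

This is presumably why capture succeeds for 46/51 graph sizes and fails only when it reaches the size-16 graph.
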
Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0 Please see https://github.com/pytorch/ao/issues/2919 for more info
The tokenizer you are loading from 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
When dataset path is not set, it will default to random dataset
INFO 12-30 11:08:53 [datasets.py:612] Sampling input_len from [31, 31] and output_len from [2048, 2048]
INFO 12-30 11:08:53 [utils.py:253] non-default args: {'tokenizer': 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/', 'dtype': 'bfloat16', 'max_model_len': 2080, 'enable_lora': None, 'reasoning_parser_plugin': '', 'model': 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/'}
INFO 12-30 11:08:53 [model.py:514] Resolved architecture: LlamaForCausalLM
INFO 12-30 11:08:53 [model.py:1661] Using max model len 2080
INFO 12-30 11:08:54 [scheduler.py:230] Chunked prefill is enabled with max_num_batched_tokens=8192.
The tokenizer you are loading from 'benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0 Please see https://github.com/pytorch/ao/issues/2919 for more info
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:05 [core.py:93] Initializing a V1 LLM engine (v0.13.0) with config: model='benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/', speculative_config=None, tokenizer='benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2080, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=torchao, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False), seed=0, served_model_name=benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::kda_attention', 'vllm::sparse_attn_indexer'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': True, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False}, 'local_cache_dir': None}
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:05 [parallel_state.py:1203] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.0.2.100:54775 backend=nccl
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:05 [parallel_state.py:1411] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:06 [gpu_model_runner.py:3562] Starting to load model benchmarks/data/quantized_model/meta-llama/Llama-3.1-8B-int8_rowwise/...
(EngineCore_DP0 pid=9421) /home/elicer/ao/.venv/lib/python3.10/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.)
(EngineCore_DP0 pid=9421) _C._set_float32_matmul_precision(precision)
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:06 [cuda.py:351] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION')
Loading pt checkpoint shards: 100% Completed | 2/2 [00:07<00:00, 3.63s/it]
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:14 [default_loader.py:308] Loading weights took 7.31 seconds
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:15 [gpu_model_runner.py:3659] Model loading took 8.4914 GiB memory and 8.213182 seconds
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:26 [backends.py:643] Using cache directory: /home/elicer/.cache/vllm/torch_compile_cache/626308a02e/rank_0_0/backbone for vLLM's torch.compile
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:26 [backends.py:703] Dynamo bytecode transform time: 10.79 s
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:35 [backends.py:261] Cache the graph of compile range (1, 8192) for later use
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:56 [backends.py:278] Compiling a graph for compile range (1, 8192) takes 22.31 s
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:56 [monitor.py:34] torch.compile takes 33.10 s in total
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:58 [gpu_worker.py:375] Available KV cache memory: 61.40 GiB
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:58 [kv_cache_utils.py:1291] GPU KV cache size: 502,976 tokens
(EngineCore_DP0 pid=9421) INFO 12-30 11:09:58 [kv_cache_utils.py:1296] Maximum concurrency for 2,080 tokens per request: 241.82x
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|          | 0/51 [00:00<?, ?it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  90%|█████████ | 46/51 [00:02<00:00, 19.03it/s]
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] EngineCore failed to start.
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] Traceback (most recent call last):
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866]   [identical traceback to the first failure; the original log ends mid-traceback here]
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] out = normalize_as_list(f(args))
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 526, in wrapper
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return compiled_fn(runtime_args)
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 690, in inner_fn
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] unwrapped_outs = compiled_fn(unwrapped_args)
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/output_code.py", line 613, in __call__
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] return self.current_callable(inputs)
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/utils.py", line 2962, in run
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] out = model(new_inputs)
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] File "/tmp/torchinductor_elicer/bs/cbskvs5py2ng3xnljyvn4sy6vjxpvbd7w7sch7mh7qpegxcvngn4.py", line 815, in call
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] extern_kernels._int_mm(buf5, reinterpret_tensor(arg4_1, (4096, 6144), (1, 4096), 0), out=buf6)
(EngineCore_DP0 pid=9421) ERROR 12-30 11:10:02 [core.py:866] RuntimeError: self.size(0) needs to be greater than 16, but got 16
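--- editor's note -------------------------------------------------------------
The RuntimeError above is the shape check in PyTorch's cuBLAS-backed int8 GEMM:
torch._int_mm rejects a left operand with exactly 16 rows, and vLLM's CUDA-graph
warm-up happens to replay a dummy batch of 16 tokens through the quantized
qkv_proj. A minimal repro sketch using the shapes from the failing call
(K=4096, N=6144); this snippet is an illustration added for this log, not part
of the original run:

    import torch

    k, n = 4096, 6144
    b = torch.randint(-128, 127, (k, n), dtype=torch.int8, device="cuda")

    # M == 16 trips the check on this build:
    # "self.size(0) needs to be greater than 16, but got 16"
    a_bad = torch.randint(-128, 127, (16, k), dtype=torch.int8, device="cuda")
    try:
        torch._int_mm(a_bad, b)
    except RuntimeError as e:
        print(e)

    # Any M strictly greater than 16 passes, returning an int32 (M, N) result
    a_ok = torch.randint(-128, 127, (32, k), dtype=torch.int8, device="cuda")
    print(torch._int_mm(a_ok, b).shape)

The process-level traceback below repeats the same failure.
--------------------------------------------------------------------------------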
(EngineCore_DP0 pid=9421) Process EngineCore_DP0:
(EngineCore_DP0 pid=9421) Traceback (most recent call last):
(EngineCore_DP0 pid=9421) File "/usr/local/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
(EngineCore_DP0 pid=9421) self.run()
(EngineCore_DP0 pid=9421) File "/usr/local/lib/python3.10/multiprocessing/process.py", line 108, in run
(EngineCore_DP0 pid=9421) self._target(*self._args, **self._kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 870, in run_engine_core
(EngineCore_DP0 pid=9421) raise e
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 857, in run_engine_core
(EngineCore_DP0 pid=9421) engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 637, in __init__
(EngineCore_DP0 pid=9421) super().__init__(
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 109, in __init__
(EngineCore_DP0 pid=9421) num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 256, in _initialize_kv_caches
(EngineCore_DP0 pid=9421) self.model_executor.initialize_from_config(kv_cache_configs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/executor/abstract.py", line 116, in initialize_from_config
(EngineCore_DP0 pid=9421) self.collective_rpc("compile_or_warm_up_model")
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/executor/uniproc_executor.py", line 75, in collective_rpc
(EngineCore_DP0 pid=9421) result = run_method(self.driver_worker, method, args, kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/serial_utils.py", line 461, in run_method
(EngineCore_DP0 pid=9421) return func(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 459, in compile_or_warm_up_model
(EngineCore_DP0 pid=9421) cuda_graph_memory_bytes = self.model_runner.capture_model()
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4540, in capture_model
(EngineCore_DP0 pid=9421) self._capture_cudagraphs(
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4641, in _capture_cudagraphs
(EngineCore_DP0 pid=9421) self._dummy_run(
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
(EngineCore_DP0 pid=9421) return func(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4198, in _dummy_run
(EngineCore_DP0 pid=9421) outputs = self.model(
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/cuda_graph.py", line 220, in __call__
(EngineCore_DP0 pid=9421) return self.runnable(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
(EngineCore_DP0 pid=9421) return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
(EngineCore_DP0 pid=9421) return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 623, in forward
(EngineCore_DP0 pid=9421) model_output = self.model(
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/decorators.py", line 439, in __call__
(EngineCore_DP0 pid=9421) return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/wrapper.py", line 223, in __call__
(EngineCore_DP0 pid=9421) return self._call_with_optional_nvtx_range(
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/wrapper.py", line 109, in _call_with_optional_nvtx_range
(EngineCore_DP0 pid=9421) return callable_fn(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 412, in forward
(EngineCore_DP0 pid=9421) def forward(
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn
(EngineCore_DP0 pid=9421) return fn(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/caching.py", line 54, in __call__
(EngineCore_DP0 pid=9421) return self.optimized_call(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 837, in call_wrapped
(EngineCore_DP0 pid=9421) return self._wrapped_call(self, *args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 413, in __call__
(EngineCore_DP0 pid=9421) raise e
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 400, in __call__
(EngineCore_DP0 pid=9421) return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc]
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
(EngineCore_DP0 pid=9421) return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
(EngineCore_DP0 pid=9421) return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "<eval_with_key>.66", line 202, in forward
(EngineCore_DP0 pid=9421) submod_0 = self.submod_0(l_input_ids_, s72, l_self_modules_embed_tokens_parameters_weight_, l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); l_input_ids_ = l_self_modules_embed_tokens_parameters_weight_ = l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = None
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/cuda_graph.py", line 220, in __call__
(EngineCore_DP0 pid=9421) return self.runnable(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/compilation/piecewise_backend.py", line 178, in __call__
(EngineCore_DP0 pid=9421) return range_entry.runnable(*args)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/standalone_compile.py", line 63, in __call__
(EngineCore_DP0 pid=9421) return self._compiled_fn(*args)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn
(EngineCore_DP0 pid=9421) return fn(*args, **kwargs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 1130, in forward
(EngineCore_DP0 pid=9421) return compiled_fn(full_args)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 353, in runtime_wrapper
(EngineCore_DP0 pid=9421) all_outs = call_func_at_runtime_with_args(
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 129, in call_func_at_runtime_with_args
(EngineCore_DP0 pid=9421) out = normalize_as_list(f(args))
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 526, in wrapper
(EngineCore_DP0 pid=9421) return compiled_fn(runtime_args)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 690, in inner_fn
(EngineCore_DP0 pid=9421) unwrapped_outs = compiled_fn(unwrapped_args)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/output_code.py", line 613, in __call__
(EngineCore_DP0 pid=9421) return self.current_callable(inputs)
(EngineCore_DP0 pid=9421) File "/home/elicer/ao/.venv/lib/python3.10/site-packages/torch/_inductor/utils.py", line 2962, in run
(EngineCore_DP0 pid=9421) out = model(new_inputs)
(EngineCore_DP0 pid=9421) File "/tmp/torchinductor_elicer/bs/cbskvs5py2ng3xnljyvn4sy6vjxpvbd7w7sch7mh7qpegxcvngn4.py", line 815, in call
(EngineCore_DP0 pid=9421) extern_kernels._int_mm(buf5, reinterpret_tensor(arg4_1, (4096, 6144), (1, 4096), 0), out=buf6)
(EngineCore_DP0 pid=9421) RuntimeError: self.size(0) needs to be greater than 16, but got 16
[rank0]:[W1230 11:10:03.448819061 ProcessGroupNCCL.cpp:1524] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
Traceback (most recent call last):
File "/home/elicer/ao/.venv/bin/vllm", line 10, in <module>
sys.exit(main())
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/entrypoints/cli/main.py", line 73, in main
args.dispatch_function(args)
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/entrypoints/cli/benchmark/throughput.py", line 21, in cmd
main(args)
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/benchmarks/throughput.py", line 730, in main
elapsed_time, request_outputs = run_vllm(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/benchmarks/throughput.py", line 51, in run_vllm
llm = LLM(**dataclasses.asdict(engine_args))
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/entrypoints/llm.py", line 351, in __init__
self.llm_engine = LLMEngine.from_engine_args(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/llm_engine.py", line 183, in from_engine_args
return cls(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/llm_engine.py", line 109, in __init__
self.engine_core = EngineCoreClient.make_client(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 93, in make_client
return SyncMPClient(vllm_config, executor_class, log_stats)
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 648, in __init__
super().__init__(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 477, in __init__
with launch_core_engines(vllm_config, executor_class, log_stats) as (
File "/usr/local/lib/python3.10/contextlib.py", line 142, in __exit__
next(self.gen)
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/utils.py", line 903, in launch_core_engines
wait_for_engine_startup(
File "/home/elicer/ao/.venv/lib/python3.10/site-packages/vllm/v1/engine/utils.py", line 960, in wait_for_engine_startup
raise RuntimeError(
RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
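--- editor's note -------------------------------------------------------------
The crash fires during CUDA-graph capture (capture_model -> _dummy_run with a
16-token dummy batch), before any real requests run, so the benchmark can
likely be unblocked by keeping vLLM from compiling that exact batch size. A
hedged sketch using the offline LLM API; the checkpoint path is a placeholder,
and enforce_eager=True disables compilation and CUDA-graph capture entirely
(at some decode-throughput cost):

    from vllm import LLM

    llm = LLM(
        model="/path/to/llama-3.1-8b-int8-rowwise",  # placeholder path
        enforce_eager=True,  # never replays the M==16 _int_mm capture shape
    )
    print(llm.generate("Hello")[0].outputs[0].text)

Alternatively, trimming 16 out of the compilation config's CUDA-graph capture
sizes should keep graphs for the remaining batch sizes, though the exact knob
name varies across vLLM versions.
--------------------------------------------------------------------------------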