@malaiwah
Created January 5, 2026 03:17
GLM-4.7 FP8 on sglang with MTP speculative decoding and fp8_e4m3 KV cache, on 4x RTX PRO 6000 Blackwell
==========
== CUDA ==
==========
CUDA Version 13.0.1
Container image Copyright (c) 2016-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
This container image and its contents are governed by the NVIDIA Deep Learning Container License.
By pulling and using the container, you accept the terms and conditions of this license:
https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license
A copy of this license is made available in this container at /NGC-DL-CONTAINER-LICENSE for your convenience.
[2026-01-05 03:11:37] WARNING model_config.py:796: DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
[2026-01-05 03:11:37] WARNING server_args.py:1529: Attention backend not specified. Use flashinfer backend by default.
[2026-01-05 03:11:37] WARNING server_args.py:1975: Overlap scheduler is disabled because of using eagle3 or standalone speculative decoding. You can set env SGLANG_ENABLE_SPEC_V2=True to enable the experimental overlap scheduler.
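(Editorial note: as the warning says, the overlap scheduler is turned off whenever EAGLE-style speculative decoding is active. The opt-in it names can be set before launch, e.g.

  $ export SGLANG_ENABLE_SPEC_V2=True    # experimental; taken verbatim from the warning above

whether that path is stable on this stack was not tested in this run.)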
[2026-01-05 03:11:37] server_args=ServerArgs(model_path='zai-org/GLM-4.7-FP8', tokenizer_path='zai-org/GLM-4.7-FP8', tokenizer_mode='auto', tokenizer_worker_num=1, skip_tokenizer_init=False, load_format='auto', model_loader_extra_config='{"enable_multithread_load": true, "num_threads": 16}', trust_remote_code=True, context_length=131072, is_embedding=False, enable_multimodal=None, limit_mm_data_per_request=None, revision=None, model_impl='auto', host='0.0.0.0', port=8000, fastapi_root_path='', grpc_mode=False, skip_server_warmup=False, warmups=None, nccl_port=None, checkpoint_engine_wait_weights_before_ready=False, dtype='auto', quantization=None, quantization_param_path=None, kv_cache_dtype='fp8_e4m3', enable_fp32_lm_head=False, modelopt_quant=None, modelopt_checkpoint_restore_path=None, modelopt_checkpoint_save_path=None, modelopt_export_path=None, quantize_and_serve=False, rl_quant_profile=None, mem_fraction_static=0.9, max_running_requests=2, max_queued_requests=None, max_total_tokens=None, chunked_prefill_size=4096, enable_dynamic_chunking=False, max_prefill_tokens=16384, prefill_max_requests=None, schedule_policy='fcfs', enable_priority_scheduling=False, abort_on_priority_when_disabled=False, schedule_low_priority_values_first=False, priority_scheduling_preemption_threshold=10, schedule_conservativeness=1.0, page_size=1, hybrid_kvcache_ratio=None, swa_full_tokens_ratio=0.8, disable_hybrid_swa_memory=False, radix_eviction_policy='lru', device='cuda', tp_size=4, pp_size=1, pp_max_micro_batch_size=None, pp_async_batch_depth=0, stream_interval=1, stream_output=False, random_seed=1015957199, constrained_json_whitespace_pattern=None, constrained_json_disable_any_whitespace=False, watchdog_timeout=300, soft_watchdog_timeout=None, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, sleep_on_idle=False, custom_sigquit_handler=None, log_level='info', log_level_http=None, log_requests=False, log_requests_level=2, crash_dump_folder=None, show_time_cost=False, enable_metrics=True, enable_metrics_for_all_schedulers=False, tokenizer_metrics_custom_labels_header='x-custom-labels', tokenizer_metrics_allowed_custom_labels=None, bucket_time_to_first_token=None, bucket_inter_token_latency=None, bucket_e2e_request_latency=None, collect_tokens_histogram=False, prompt_tokens_buckets=None, generation_tokens_buckets=None, gc_warning_threshold_secs=0.0, decode_log_interval=40, enable_request_time_stats_logging=False, kv_events_config=None, enable_trace=False, otlp_traces_endpoint='localhost:4317', export_metrics_to_file=False, export_metrics_to_file_dir=None, api_key=None, served_model_name='glm-4.7-fp8', weight_version='default', chat_template=None, completion_template=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser='glm45', tool_call_parser='glm47', tool_server=None, sampling_defaults='model', dp_size=1, load_balance_method='round_robin', load_watch_interval=0.1, prefill_round_robin_balance=False, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', preferred_sampling_params=None, enable_lora=None, max_lora_rank=None, lora_target_modules=None, lora_paths=None, max_loaded_loras=None, max_loras_per_batch=8, lora_eviction_policy='lru', lora_backend='csgmv', max_lora_chunk_size=16, attention_backend='flashinfer', decode_attention_backend=None, prefill_attention_backend=None, sampling_backend='flashinfer', grammar_backend='xgrammar', mm_attention_backend=None, fp8_gemm_runner_backend='auto', nsa_prefill_backend='flashmla_sparse', nsa_decode_backend='fa3', disable_flashinfer_autotune=False, speculative_algorithm='EAGLE', speculative_draft_model_path='zai-org/GLM-4.7-FP8', speculative_draft_model_revision=None, speculative_draft_load_format=None, speculative_num_steps=3, speculative_eagle_topk=1, speculative_num_draft_tokens=4, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, speculative_attention_mode='prefill', speculative_draft_attention_backend=None, speculative_moe_runner_backend='auto', speculative_moe_a2a_backend=None, speculative_draft_model_quantization=None, speculative_ngram_min_match_window_size=1, speculative_ngram_max_match_window_size=12, speculative_ngram_min_bfs_breadth=1, speculative_ngram_max_bfs_breadth=10, speculative_ngram_match_type='BFS', speculative_ngram_branch_length=18, speculative_ngram_capacity=10000000, enable_multi_layer_eagle=False, ep_size=1, moe_a2a_backend='none', moe_runner_backend='auto', flashinfer_mxfp4_moe_precision='default', enable_flashinfer_allreduce_fusion=True, deepep_mode='auto', ep_num_redundant_experts=0, ep_dispatch_algorithm=None, init_expert_location='trivial', enable_eplb=False, eplb_algorithm='auto', eplb_rebalance_num_iterations=1000, eplb_rebalance_layers_per_chunk=None, eplb_min_rebalancing_utilization_threshold=1.0, expert_distribution_recorder_mode=None, expert_distribution_recorder_buffer_size=1000, enable_expert_distribution_metrics=False, deepep_config=None, moe_dense_tp_size=None, elastic_ep_backend=None, mooncake_ib_device=None, max_mamba_cache_size=None, mamba_ssm_dtype='float32', mamba_full_memory_ratio=0.9, mamba_scheduler_strategy='no_buffer', mamba_track_interval=256, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through', hicache_io_backend='kernel', hicache_mem_layout='layer_first', hicache_storage_backend=None, hicache_storage_prefetch_policy='best_effort', hicache_storage_backend_extra_config=None, enable_lmcache=False, kt_weight_path=None, kt_method='AMXINT4', kt_cpuinfer=None, kt_threadpool_count=2, kt_num_gpu_experts=None, kt_max_deferred_experts_per_token=None, dllm_algorithm=None, dllm_algorithm_config=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, cpu_offload_gb=0, offload_group_size=-1, offload_num_in_group=1, offload_prefetch_step=1, offload_mode='cpu', multi_item_scoring_delimiter=None, disable_radix_cache=False, cuda_graph_max_bs=512, cuda_graph_bs=[1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 40, 44, 48, 52, 56, 60, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], disable_cuda_graph=False, disable_cuda_graph_padding=False, enable_profile_cuda_graph=False, enable_cudagraph_gc=False, enable_layerwise_nvtx_marker=False, enable_nccl_nvls=False, enable_symm_mem=False, disable_flashinfer_cutlass_moe_fp4_allgather=False, enable_tokenizer_batch_encode=False, disable_tokenizer_batch_decode=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_mscclpp=False, enable_torch_symm_mem=False, disable_overlap_schedule=True, enable_mixed_chunk=False, enable_dp_attention=False, enable_dp_lm_head=False, enable_two_batch_overlap=False, enable_single_batch_overlap=False, tbo_token_distribution_threshold=0.48, enable_torch_compile=False, enable_piecewise_cuda_graph=False, enable_torch_compile_debug_mode=False, torch_compile_max_bs=32, piecewise_cuda_graph_max_tokens=4096, piecewise_cuda_graph_tokens=[4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 768, 896, 1024, 1152, 1280, 1408, 1536, 1664, 1792, 1920, 2048, 2176, 2304, 2432, 2560, 2688, 2816, 2944, 3072, 3200, 3328, 3456, 3584, 3712, 3840, 3968, 4096], piecewise_cuda_graph_compiler='eager', torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, triton_attention_split_tile_size=None, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, enable_weights_cpu_backup=False, enable_draft_weights_cpu_backup=False, allow_auto_truncate=False, enable_custom_logit_processor=False, flashinfer_mla_disable_ragged=False, disable_shared_experts_fusion=False, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, keep_mm_feature_on_device=False, enable_return_hidden_states=False, enable_return_routed_experts=False, scheduler_recv_interval=1, numa_node=None, enable_deterministic_inference=False, rl_on_policy_target=None, enable_attn_tp_input_scattered=False, enable_nsa_prefill_context_parallel=False, enable_fused_qk_norm_rope=False, enable_dynamic_batch_tokenizer=False, dynamic_batch_tokenizer_batch_size=32, dynamic_batch_tokenizer_batch_timeout=0.002, debug_tensor_dump_output_folder=None, debug_tensor_dump_layers=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, disaggregation_mode='null', disaggregation_transfer_backend='mooncake', disaggregation_bootstrap_port=8998, disaggregation_decode_tp=None, disaggregation_decode_dp=None, disaggregation_prefill_pp=1, disaggregation_ib_device=None, disaggregation_decode_enable_offload_kvcache=False, disaggregation_decode_enable_fake_auto=False, num_reserved_decode_tokens=512, disaggregation_decode_polling_interval=1, encoder_only=False, language_only=False, encoder_transfer_backend='zmq_to_scheduler', encoder_urls=[], custom_weight_loader=[], weight_loader_disable_mmap=False, remote_instance_weight_loader_seed_instance_ip=None, remote_instance_weight_loader_seed_instance_service_port=None, remote_instance_weight_loader_send_weights_group_ports=None, remote_instance_weight_loader_backend='nccl', remote_instance_weight_loader_start_seed_via_transfer_engine=False, enable_pdmux=False, pdmux_config_path=None, sm_group_num=8, mm_max_concurrent_calls=32, mm_per_request_timeout=10.0, enable_broadcast_mm_inputs_process=False, enable_prefix_mm_cache=False, mm_enable_dp_encoder=False, mm_process_config={}, decrypted_config_file=None, decrypted_draft_config_file=None, forward_hooks=None)
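(Editorial note: for reproducibility, here is a launch command that maps the non-default values in the server_args dump back onto sglang's CLI. This is a reconstruction, not the command actually used: flag spellings follow sglang's launch_server options and should be checked against your installed version; anything not listed falls back to the defaults shown above.

  $ python3 -m sglang.launch_server \
      --model-path zai-org/GLM-4.7-FP8 \
      --served-model-name glm-4.7-fp8 \
      --host 0.0.0.0 --port 8000 \
      --tp-size 4 \
      --context-length 131072 \
      --kv-cache-dtype fp8_e4m3 \
      --mem-fraction-static 0.9 \
      --max-running-requests 2 \
      --chunked-prefill-size 4096 \
      --speculative-algorithm EAGLE \
      --speculative-draft-model-path zai-org/GLM-4.7-FP8 \
      --speculative-num-steps 3 \
      --speculative-eagle-topk 1 \
      --speculative-num-draft-tokens 4 \
      --reasoning-parser glm45 \
      --tool-call-parser glm47 \
      --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 16}' \
      --enable-metrics \
      --trust-remote-code

Note that attention_backend='flashinfer' and the random seed were chosen automatically at startup, per the warnings above, so they are omitted here.)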
[2026-01-05 03:11:37] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
[2026-01-05 03:11:38] Using default HuggingFace chat template with detected content format: openai
[2026-01-05 03:11:43 TP0] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
[2026-01-05 03:11:43 TP1] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
[2026-01-05 03:11:43 TP3] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
[2026-01-05 03:11:43 TP2] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
[2026-01-05 03:11:43 TP0] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
[2026-01-05 03:11:43 TP1] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
[...]
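(Editorial note on reading the decode lines below: "accept len" is the average number of tokens committed per MTP verification step, and the numbers in these lines satisfy accept rate = accept len / speculative_num_draft_tokens, with the draft budget of 4 coming from the server_args above, e.g. 3.10 / 4 ≈ 0.78 in the first entry and 2.00 / 4 = 0.50 further down. Throughput tracks accept len, since each verification step costs roughly one target-model forward pass; at this context depth (~34k tokens) the log works out to roughly five steps per second, so gen throughput ≈ 5 × accept len token/s.)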
[2026-01-05 03:04:54 TP0] Decode batch, #running-req: 1, #token: 33598, token usage: 0.92, accept len: 3.10, accept rate: 0.78, cuda graph: True, gen throughput (token/s): 0.29, #queue-req: 0,
[2026-01-05 03:05:01 TP0] Decode batch, #running-req: 1, #token: 33727, token usage: 0.93, accept len: 3.23, accept rate: 0.81, cuda graph: True, gen throughput (token/s): 16.92, #queue-req: 0,
[2026-01-05 03:05:09 TP0] Decode batch, #running-req: 1, #token: 33839, token usage: 0.93, accept len: 2.80, accept rate: 0.70, cuda graph: True, gen throughput (token/s): 14.97, #queue-req: 0,
[2026-01-05 03:05:18 TP0] Decode batch, #running-req: 1, #token: 33919, token usage: 0.93, accept len: 2.00, accept rate: 0.50, cuda graph: True, gen throughput (token/s): 9.09, #queue-req: 0,
[2026-01-05 03:05:27 TP0] Decode batch, #running-req: 1, #token: 34010, token usage: 0.93, accept len: 2.27, accept rate: 0.57, cuda graph: True, gen throughput (token/s): 10.27, #queue-req: 0,
[2026-01-05 03:05:35 TP0] Decode batch, #running-req: 1, #token: 34099, token usage: 0.94, accept len: 2.23, accept rate: 0.56, cuda graph: True, gen throughput (token/s): 11.22, #queue-req: 0,
[2026-01-05 03:05:41 TP0] Decode batch, #running-req: 1, #token: 34180, token usage: 0.94, accept len: 2.02, accept rate: 0.51, cuda graph: True, gen throughput (token/s): 12.26, #queue-req: 0,
[2026-01-05 03:05:47 TP0] Decode batch, #running-req: 1, #token: 34305, token usage: 0.94, accept len: 3.12, accept rate: 0.78, cuda graph: True, gen throughput (token/s): 20.01, #queue-req: 0,
[2026-01-05 03:05:54 TP0] Decode batch, #running-req: 1, #token: 34447, token usage: 0.95, accept len: 3.55, accept rate: 0.89, cuda graph: True, gen throughput (token/s): 21.29, #queue-req: 0,
[2026-01-05 03:05:59 TP0] Decode batch, #running-req: 1, #token: 34562, token usage: 0.95, accept len: 2.88, accept rate: 0.72, cuda graph: True, gen throughput (token/s): 21.76, #queue-req: 0,
[2026-01-05 03:06:06 TP0] Decode batch, #running-req: 1, #token: 34683, token usage: 0.95, accept len: 3.02, accept rate: 0.76, cuda graph: True, gen throughput (token/s): 18.95, #queue-req: 0,
[2026-01-05 03:06:13 TP0] Decode batch, #running-req: 1, #token: 34837, token usage: 0.96, accept len: 3.85, accept rate: 0.96, cuda graph: True, gen throughput (token/s): 22.18, #queue-req: 0,
[2026-01-05 03:06:19 TP0] Decode batch, #running-req: 1, #token: 34964, token usage: 0.96, accept len: 3.17, accept rate: 0.79, cuda graph: True, gen throughput (token/s): 20.38, #queue-req: 0,
[2026-01-05 03:06:25 TP0] Decode batch, #running-req: 1, #token: 35085, token usage: 0.96, accept len: 3.02, accept rate: 0.76, cuda graph: True, gen throughput (token/s): 18.82, #queue-req: 0,
[2026-01-05 03:06:31 TP0] Decode batch, #running-req: 1, #token: 35173, token usage: 0.97, accept len: 2.20, accept rate: 0.55, cuda graph: True, gen throughput (token/s): 17.03, #queue-req: 0,
[2026-01-05 03:06:36 TP0] Decode batch, #running-req: 1, #token: 35302, token usage: 0.97, accept len: 3.23, accept rate: 0.81, cuda graph: True, gen throughput (token/s): 21.76, #queue-req: 0,
[2026-01-05 03:06:44 TP0] Decode batch, #running-req: 1, #token: 35443, token usage: 0.97, accept len: 3.52, accept rate: 0.88, cuda graph: True, gen throughput (token/s): 18.52, #queue-req: 0,
[2026-01-05 03:06:52 TP0] Decode batch, #running-req: 1, #token: 35534, token usage: 0.98, accept len: 2.27, accept rate: 0.57, cuda graph: True, gen throughput (token/s): 11.46, #queue-req: 0,
[2026-01-05 03:07:00 TP0] Decode batch, #running-req: 1, #token: 35631, token usage: 0.98, accept len: 2.42, accept rate: 0.61, cuda graph: True, gen throughput (token/s): 12.31, #queue-req: 0,
[2026-01-05 03:07:08 TP0] Decode batch, #running-req: 1, #token: 35710, token usage: 0.98, accept len: 1.98, accept rate: 0.49, cuda graph: True, gen throughput (token/s): 9.80, #queue-req: 0,
[2026-01-05 03:07:16 TP0] Decode batch, #running-req: 1, #token: 35806, token usage: 0.98, accept len: 2.40, accept rate: 0.60, cuda graph: True, gen throughput (token/s): 11.43, #queue-req: 0,
[2026-01-05 03:07:24 TP0] Decode batch, #running-req: 1, #token: 35910, token usage: 0.99, accept len: 2.60, accept rate: 0.65, cuda graph: True, gen throughput (token/s): 13.54, #queue-req: 0,
[2026-01-05 03:07:31 TP0] Decode batch, #running-req: 1, #token: 36006, token usage: 0.99, accept len: 2.40, accept rate: 0.60, cuda graph: True, gen throughput (token/s): 13.17, #queue-req: 0,
[2026-01-05 03:07:39 TP0] Decode batch, #running-req: 1, #token: 36111, token usage: 0.99, accept len: 2.62, accept rate: 0.66, cuda graph: True, gen throughput (token/s): 13.59, #queue-req: 0,
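(Editorial note: with served_model_name='glm-4.7-fp8' on port 8000, the server answers OpenAI-compatible requests, and enable_metrics=True exposes a Prometheus endpoint. A minimal smoke test; the prompt and max_tokens here are illustrative:

  $ curl -s http://localhost:8000/v1/chat/completions \
      -H 'Content-Type: application/json' \
      -d '{"model": "glm-4.7-fp8", "messages": [{"role": "user", "content": "Say hello."}], "max_tokens": 32}'

  $ curl -s http://localhost:8000/metrics | head    # Prometheus counters from --enable-metrics
)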