@hyunsik
Created February 2, 2026 06:24
FineGrainedFP8 example
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, FineGrainedFP8Config
# 1. Settings
model_id = "Qwen/Qwen3-32B"
save_path = "./Qwen3-32B-FP8"

# 2. Fine-grained FP8 configuration
# Block-wise (128x128) dynamic FP8 quantization, as used by DeepSeek-V3 and similar models
quantization_config = FineGrainedFP8Config(
    activation_scheme="dynamic",
    weight_block_size=(128, 128),
)

# 3. Load the model (its layers are converted to FP8 according to the config during loading)
print(f"Loading and Quantizing {model_id}...")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# 4. Save the quantized model
print(f"Saving to {save_path}...")
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print("Quantization complete.")