def _dist_comms_impl(
    keys,
    key_to_rank,
    get_data_fn=lambda key: None,
    comm_fn=lambda data, target_rank: None,
    store_data_fn=lambda key, data: None,
    should_store_data_fn=lambda target_rank: False,
    context_fn=None,
):
    if context_fn is None:
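        # --- everything below is a hypothetical continuation, not the gist's
        # actual body; it assumes `from contextlib import nullcontext` and that
        # each key's owner rank produces the data, comm_fn ships it to a target
        # rank, and ranks approved by should_store_data_fn keep a copy ---
        context_fn = nullcontext
    rank = dist.get_rank()
    with context_fn():
        for key in keys:
            owner = key_to_rank[key]
            # only the owning rank materializes the data for this key
            data = get_data_fn(key) if rank == owner else None
            for target_rank in range(dist.get_world_size()):
                # comm_fn is assumed to move data from the owner to target_rank
                data = comm_fn(data, target_rank)
                if rank == target_rank and should_store_data_fn(target_rank):
                    store_data_fn(key, data)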
class GPTQModifier ...
    def compress_modules(self):
        """
        Quantize modules which have been calibrated
        """
        ### Not Distributed
        if not (dist.is_initialized() and dist.get_world_size() > 1):
            self.compress_module_list(list(self._num_samples.keys()))
        ### Distributed
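        # --- hypothetical sketch of the distributed branch (not the gist's code):
        # partition the calibrated modules round-robin across ranks (assuming
        # every rank sees them in the same order), compress the local shard,
        # then synchronize before continuing ---
        else:
            keys = list(self._num_samples.keys())
            rank, world_size = dist.get_rank(), dist.get_world_size()
            local_keys = [key for i, key in enumerate(keys) if i % world_size == rank]
            self.compress_module_list(local_keys)
            dist.barrier()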
import inspect
import os
from contextlib import contextmanager

import torch
import torch.distributed as dist
from compressed_tensors.offload import offload_model
from compressed_tensors.offload.dispatch import remove_dispatch
from loguru import logger

#### THIS STUFF WILL GO IN CT
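The gist stops after the imports. A minimal sketch of the kind of helper that might move into compressed-tensors (CT) given these imports: a context manager that dispatches/offloads a model for the duration of a block. The name, signature, and the exact behavior of offload_model / remove_dispatch here are assumptions for illustration, not the author's actual code.

# Hypothetical helper; offload_model / remove_dispatch behavior is assumed.
@contextmanager
def dispatch_for_calibration(model: torch.nn.Module):
    offload_model(model)        # assumed: attach offload/dispatch hooks
    try:
        yield model
    finally:
        remove_dispatch(model)  # assumed: strip the dispatch hooks again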
# Current API
init_dist()
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
with ct_offload():  # <- context manager to wrap from_pretrained
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto", device_map="cpu")
# <load data and quantize model>
with ct_offload():
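    # --- hypothetical continuation (not the gist's code): the elided
    # "<load data and quantize model>" step above typically corresponds to an
    # llm-compressor oneshot(...) call, and this second ct_offload() block
    # plausibly wraps saving the compressed result; the directory name and the
    # save_compressed kwarg are assumptions ---
    model.save_pretrained("output-quantized-model", save_compressed=True)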
import math
import random
import time
from collections import deque

random.seed(0)

### SET UP PROBLEM
l, h = 100, 100000
delta = 10
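The gist is truncated right after these constants, so the actual problem is not shown. Purely as a hedged illustration of how such a setup is commonly continued, a random instance could be drawn from the range [l, h]:

# Hypothetical continuation only; not the gist's actual problem.
n_items = 1_000
sizes = [random.randint(l, h) for _ in range(n_items)]
start = time.time()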
@torch.no_grad
@pytest.mark.unit
@pytest.mark.parametrize(
    "n_balance_layers, group_size, n_input_features, strategy",
    [
        (5, -1, 32, QuantizationStrategy.CHANNEL),  # channel
        (4, 10, 40, QuantizationStrategy.GROUP),  # group
        (4, torch.inf, 40, QuantizationStrategy.TENSOR),  # tensor
        (3, 16, 64, QuantizationStrategy.TENSOR_GROUP),  # tensor_group
    ],
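)
# --- hypothetical skeleton of the test the decorators above would apply to;
# the function name and the utility under test are assumptions, not the gist's ---
def test_balance_layers(n_balance_layers, group_size, n_input_features, strategy):
    balance_layers = [
        torch.nn.Linear(n_input_features, 16) for _ in range(n_balance_layers)
    ]
    # build a quantization scheme from `strategy` / `group_size`, run the
    # balancing utility under test on `balance_layers`, and assert on the
    # resulting scale shapes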
import json

import torch
from tokenizers import Tokenizer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Qwen3MoeConfig,
    Qwen3MoeForCausalLM,
)
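The rest of this gist is not shown. Given the imports, one plausible use is building a small, randomly initialized Qwen3-MoE model for testing; the configuration values below are arbitrary small numbers, not the gist's actual settings.

# Hypothetical: tiny random Qwen3-MoE model for quick tests (all sizes made up).
tiny_config = Qwen3MoeConfig(
    vocab_size=1024,
    hidden_size=64,
    intermediate_size=128,
    moe_intermediate_size=64,
    num_hidden_layers=2,
    num_attention_heads=4,
    num_key_value_heads=2,
    num_experts=4,
    num_experts_per_tok=2,
)
tiny_model = Qwen3MoeForCausalLM(tiny_config)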
| """ | |
| Test to verify what happens when layers are in mappings but not in targets for AWQ. | |
| Adapted from https://github.com/vllm-project/llm-compressor/tree/main/examples/awq | |
| """ | |
| import torch | |
| from datasets import load_dataset | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from llmcompressor import oneshot |
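The gist ends after the imports. As a hedged sketch of the experiment the docstring describes, the recipe below lists the MLP projections in an AWQ mapping while deliberately leaving them out of `targets`; the model choice, regexes, dataset, and sample counts are assumptions modeled on the public AWQ example, not the gist's actual values.

from llmcompressor.modifiers.awq import AWQMapping, AWQModifier

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"  # assumed model
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Mapping references the MLP projections, but `targets` below only matches the
# attention projections, so the mapping entries have no quantized counterpart.
recipe = AWQModifier(
    mappings=[
        AWQMapping(
            smooth_layer="re:.*post_attention_layernorm",
            balance_layers=["re:.*gate_proj", "re:.*up_proj"],
        ),
    ],
    targets=["re:.*self_attn.*_proj"],  # MLP layers intentionally left out
    scheme="W4A16",
    ignore=["lm_head"],
)

oneshot(
    model=model,
    dataset="open_platypus",
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=256,
)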
################## RESULTS
# ######### GROUPED_MM #######
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant noquant --compile
# Average tokens/sec: 24.15
# Average tokens/sec including batches: 193.18
# Memory used: 95.25 GB
# model size: 93.62
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant noquant --compile --compile_mode "max-autotune"
# Average tokens/sec: 23.97
/data/users/hdcharles/pytorch/torch/backends/cuda/__init__.py:131: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /data/users/hdcharles/pytorch/aten/src/ATen/Context.cpp:80.)
  return torch._C._get_cublas_allow_tf32()
/tmp/torchinductor_hdcharles/ah/cahrqbokro3llqbhea5qjqikmqu6ncya53rrk5ia2mdrxlmeglxs.py:105: UserWarning: Logical operators 'and' and 'or' are deprecated for non-scalar tensors; please use '&' or '|' instead
  mask = offs_am[:, None] < m_size and offs_bn[None, :] < n_size
UserWarning: Enable tracemalloc to get the object allocation traceback
/tmp/torchind