def _dist_comms_impl(
    keys,
    key_to_rank,
    get_data_fn=lambda key: None,
    comm_fn=lambda data, target_rank: None,
    store_data_fn=lambda key, data: None,
    should_store_data_fn=lambda target_rank: False,
    context_fn=None,
):
    if context_fn is None:
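# --- Hypothetical usage sketch (my assumption, not part of the RFC): a driver that
# --- lets each key's owning rank produce the data, broadcasts it, and has the other
# --- ranks store it locally. The callback bodies and semantics are illustrative only.
def _example_dist_comms_usage(named_modules, module_to_rank):
    import torch.distributed as dist

    rank = dist.get_rank()
    _dist_comms_impl(
        keys=list(named_modules.keys()),
        key_to_rank=module_to_rank,                       # key -> rank that produces it
        get_data_fn=lambda key: named_modules[key].weight.data,
        comm_fn=lambda data, target_rank: dist.broadcast(data, src=target_rank),
        store_data_fn=lambda key, data: named_modules[key].weight.data.copy_(data),
        should_store_data_fn=lambda target_rank: target_rank != rank,
    )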
class GPTQModifier ...
    def compress_modules(self):
        """
        Quantize modules which have been calibrated
        """
        ### Not Distributed
        if not (dist.is_initialized() and dist.get_world_size() > 1):
            self.compress_module_list(list(self._num_samples.keys()))
        ### Distributed
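        ### (hedged sketch only -- the real distributed branch is truncated in this
        ### preview; the else-branch below illustrates one possibility, a round-robin
        ### split of calibrated modules across ranks, and is not the RFC's implementation)
        else:
            rank, world_size = dist.get_rank(), dist.get_world_size()
            all_keys = list(self._num_samples.keys())
            self.compress_module_list(all_keys[rank::world_size])  # this rank's share
            dist.barrier()  # wait for every rank to finish compressing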
from contextlib import contextmanager
import torch
from compressed_tensors.offload import offload_model
from compressed_tensors.offload.dispatch import remove_dispatch
from loguru import logger
import torch.distributed as dist
import inspect
import os
#### THIS STUFF WILL GO IN CT
HDCharles / rfc.py
Last active February 3, 2026 20:55
'Current API'
init_dist()
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
with ct_offload():  # <- context manager to wrap from_pretrained
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto", device_map="cpu")

<load data and quantize model>

with ct_offload():
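The preview cuts off at the second ct_offload block. For orientation only, here is a hedged sketch of what the elided "<load data and quantize model>" step could look like with llm-compressor's oneshot entrypoint; the dataset, scheme, and calibration settings are illustrative assumptions rather than the RFC's actual recipe.

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

# Illustrative recipe and calibration settings only (assumed, not from the RFC):
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
oneshot(
    model=model,                      # the model loaded under ct_offload() above
    dataset="open_platypus",          # assumed calibration dataset
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
)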
import math
import random
import time
from collections import deque

random.seed(0)

### SET UP PROBLEM
l, h = 100, 100000
delta = 10
@torch.no_grad
@pytest.mark.unit
@pytest.mark.parametrize(
    "n_balance_layers, group_size, n_input_features, strategy",
    [
        (5, -1, 32, QuantizationStrategy.CHANNEL),  # channel
        (4, 10, 40, QuantizationStrategy.GROUP),  # group
        (4, torch.inf, 40, QuantizationStrategy.TENSOR),  # tensor
        (3, 16, 64, QuantizationStrategy.TENSOR_GROUP),  # tensor_group
    ],
import json
import torch
from tokenizers import Tokenizer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Qwen3MoeConfig,
    Qwen3MoeForCausalLM,
)
HDCharles / test_awq_mapping_vs_targets.py
Last active November 19, 2025 16:13
Demo AWQ Mapping vs Targets
"""
Test to verify what happens when layers are in mappings but not in targets for AWQ.
Adapted from https://github.com/vllm-project/llm-compressor/tree/main/examples/awq
"""
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
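Since the docstring contrasts layers listed in mappings with layers listed in targets, a hedged sketch of a recipe that exercises that distinction follows; the regexes, the scheme, and the AWQMapping import are illustrative assumptions, not necessarily this gist's actual configuration.

from llmcompressor.modifiers.awq import AWQMapping, AWQModifier

# Illustrative only: v_proj appears in the mapping's balance_layers but is left
# out of `targets`, which is exactly the mismatch the test above probes.
recipe = AWQModifier(
    mappings=[
        AWQMapping(
            smooth_layer="re:.*input_layernorm",
            balance_layers=["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"],
        ),
    ],
    targets=["re:.*q_proj", "re:.*k_proj"],
    scheme="W4A16",
    ignore=["lm_head"],
)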
################## RESULTS
# ######### GROUPED_MM #######
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant noquant --compile
# Average tokens/sec: 24.15
# Average tokens/sec including batches 193.18
# Memory used: 95.25 GB
# model size: 93.62
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant noquant --compile --compile_mode "max-autotune"
# Average tokens/sec: 23.97
HDCharles / log.log
Created July 29, 2025 16:45
output from repro
/data/users/hdcharles/pytorch/torch/backends/cuda/__init__.py:131: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /data/users/hdcharles/pytorch/aten/src/ATen/Context.cpp:80.)
return torch._C._get_cublas_allow_tf32()
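For quick reference, the new-style settings this warning points to (a minimal sketch, assuming a PyTorch build recent enough to expose the fp32_precision attributes):

import torch

torch.backends.cuda.matmul.fp32_precision = "ieee"  # full-fp32 matmuls (replaces allow_tf32 = False)
torch.backends.cudnn.conv.fp32_precision = "tf32"   # TF32 convolutions (replaces allow_tf32 = True)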
/tmp/torchinductor_hdcharles/ah/cahrqbokro3llqbhea5qjqikmqu6ncya53rrk5ia2mdrxlmeglxs.py:105: UserWarning: Logical operators 'and' and 'or' are deprecated for non-scalar tensors; please use '&' or '|' instead
mask = offs_am[:, None] < m_size and offs_bn[None, :] < n_size
UserWarning: Enable tracemalloc to get the object allocation traceback
/tmp/torchind