A Coding Implementation to Compress and Benchmark Instruction-Tuned LLMs with FP8, GPTQ, and SmoothQuant Quantization using llmcompressor

import subprocess, sys
def pip(*pkgs):
    """Install packages into the environment of the running interpreter.

    Runs ``<sys.executable> -m pip install -q <pkgs...>`` so the install
    targets the same Python that is executing this script.

    Args:
        *pkgs: Requirement specifiers passed verbatim to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    if not pkgs:
        # ``pip install`` without any requirement exits non-zero; there is
        # nothing to install, so skip the subprocess entirely.
        return
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *pkgs])
# Install the quantization toolchain before the imports that follow need it.
pip("llmcompressor", "compressed-tensors",
    "transformers>=4.45", "accelerate", "datasets")
import os, gc, time, json, math
from pathlib import Path
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
# The helpers in this script call torch.cuda.* directly, so stop early when no
# CUDA device is visible. An explicit raise is used instead of ``assert``
# because assertions are stripped when Python runs with -O.
if not torch.cuda.is_available():
    raise RuntimeError("Enable a GPU: Runtime > Change runtime type > T4 GPU")
print("GPU:", torch.cuda.get_device_name(0),
      "| CUDA:", torch.version.cuda,
      "| torch:", torch.__version__)

# Hugging Face model id of the model used by this script.
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

# Working directory; the process chdirs into it so relative paths land here.
WORKDIR = Path("/content/quant_lab")
# parents=True so this also works where /content does not exist yet.
WORKDIR.mkdir(parents=True, exist_ok=True)
os.chdir(WORKDIR)
def free_mem():
    """Run the garbage collector, then empty PyTorch's CUDA cache."""
    # Collect first so objects that just became unreachable are gone before
    # the cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()
def dir_size_gb(path):
    """Return the combined size of every file under *path*, in units of 1e9 bytes.

    Args:
        path: Directory that is walked recursively with ``os.walk``.

    Returns:
        Sum of ``os.path.getsize`` over all files found, divided by 1e9.
        A directory without files yields ``0.0``.
    """
    total_bytes = sum(
        os.path.getsize(os.path.join(root, name))
        for root, _, files in os.walk(path)
        for name in files
    )
    return total_bytes / 1e9
def time_generation(model, tok, prompt, max_new_tokens=64):
    """Greedy decode; reports latency & tokens/sec after a brief warmup.

    Args:
        model: Object exposing ``generate(...)`` and a ``device`` attribute.
        tok: Tokenizer used to encode ``prompt`` and decode the output;
            its ``eos_token_id`` is used as the padding id.
        prompt: Text to condition the generation on.
        max_new_tokens: Upper bound on tokens generated in the timed run.

    Returns:
        Tuple ``(text, seconds, tokens_per_second)``: the decoded
        continuation without the prompt, the wall-clock time of the timed
        ``generate`` call, and the number of generated tokens divided by
        that time.
    """
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    gen_kwargs = {"do_sample": False, "pad_token_id": tok.eos_token_id}

    # Short untimed run so one-time setup cost stays out of the measurement.
    model.generate(**inputs, max_new_tokens=4, **gen_kwargs)
    torch.cuda.synchronize()

    start = time.perf_counter()
    out = model.generate(**inputs, max_new_tokens=max_new_tokens, **gen_kwargs)
    # Wait for outstanding GPU work before reading the clock.
    torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

    prompt_len = inputs["input_ids"].shape[1]
    new_ids = out[0][prompt_len:]
    # Generation can stop before max_new_tokens (e.g. at EOS), so throughput
    # is based on the tokens actually produced rather than the requested cap.
    produced = new_ids.shape[0]
    return tok.decode(new_ids, skip_special_tokens=True), elapsed, produced / elapsed
@torch.no_grad()
def wikitext_ppl(model, tok, seq_len=512, max_chunks=20, stride=512):
    """Light WikiText-2 perplexity probe (fast, indicative).

    Joins the non-blank entries among the first 400 rows of the WikiText-2
    raw test split, tokenizes the result once, and scores fixed-length
    windows of it with the model.

    Args:
        model: Callable as ``model(input_ids, labels=...)`` returning an
            object with a ``loss`` tensor, and exposing ``device``.
        tok: Tokenizer returning ``input_ids`` for the joined text.
        seq_len: Number of tokens per scored window.
        max_chunks: Stop after this many windows have been scored.
        stride: Offset between the starts of consecutive windows.

    Returns:
        ``exp`` of the average per-window loss.

    Raises:
        ValueError: If the tokenized text is shorter than ``seq_len`` so
            that no window can be scored.
    """
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    text = "\n\n".join(t for t in ds["text"][:400] if t.strip())
    enc = tok(text, return_tensors="pt").input_ids.to(model.device)

    nll_sum, tok_count = 0.0, 0
    # +1 so a window that ends exactly on the last token is still scored.
    last_begin = enc.size(1) - seq_len
    for n_chunks, begin in enumerate(range(0, last_begin + 1, stride), start=1):
        chunk = enc[:, begin:begin + seq_len]
        out = model(chunk, labels=chunk)
        # Every window holds exactly seq_len tokens, so weighting each loss
        # by seq_len makes the final ratio the plain mean of window losses.
        nll_sum += out.loss.float().item() * seq_len
        tok_count += seq_len
        if n_chunks >= max_chunks:
            break

    if tok_count == 0:
        raise ValueError(
            f"tokenized text has {enc.size(1)} tokens, fewer than seq_len={seq_len}"
        )
    return math.exp(nll_sum / tok_count)
# Metrics gathered by the benchmark runs, keyed by run label.
results = {}

# Prompt shared by all benchmark runs, written with the
# <|im_start|>/<|im_end|> chat markers and ending at the assistant turn.
PROMPT = ("<|im_start|>user\nIn two sentences, explain why post-training "
          "quantization works for large language models.<|im_end|>\n"
          "<|im_start|>assistant\n")
def benchmark(label, model_path_or_id):
    """Load a model, measure it, and record the metrics under ``results[label]``.

    The recorded entry holds the on-disk size (``None`` unless
    ``model_path_or_id`` is a local directory), the perplexity from
    ``wikitext_ppl``, and the latency, tokens/sec and a trimmed sample from
    ``time_generation``. The entry is also printed as JSON.

    Args:
        label: Key under which the metrics are stored in ``results``.
        model_path_or_id: Local directory or model id passed to
            ``from_pretrained`` for both tokenizer and model.
    """
    free_mem()
    print(f"\n──── benchmarking: {label} ────")
    tok = AutoTokenizer.from_pretrained(model_path_or_id)
    m = AutoModelForCausalLM.from_pretrained(
        model_path_or_id, torch_dtype="auto", device_map="cuda").eval()
    try:
        sample, dt, tps = time_generation(m, tok, PROMPT)
        ppl = wikitext_ppl(m, tok)
    finally:
        # Drop this function's reference to the model even when a measurement
        # raises, so a failed run does not keep it around for the next one.
        del m
        free_mem()

    size = dir_size_gb(model_path_or_id) if os.path.isdir(str(model_path_or_id)) else None
    results[label] = {"size_gb": size, "ppl": round(ppl, 3),
                      "latency_s": round(dt, 3), "tok_per_s": round(tps, 1),
                      # Single-line, length-capped sample keeps the JSON readable.
                      "sample": sample.strip().replace("\n", " ")[:180]}
    print(json.dumps(results[label], indent=2))

Source link

A Coding Implementation to Compress and Benchmark Instruction-Tuned LLMs with FP8, GPTQ, and SmoothQuant Quantization using llmcompressor

Two from MIT named 2026 Knight-Hennessy Scholars | MIT News

Claude’s next enterprise battle is not models: it’s the agent control plane

Physical AI moves closer to factory floors as companies test humanoid robots

Mira Murati’s Thinking Machines Lab Introduces Interaction Models: A Native Multimodal Architecture for Real-Time Human-AI Collaboration

Vitalik Buterin Moves $113K ETH via Privacy Pools

What the Average Canadian TFSA Balance Looks Like at Age 50

Strategy Plans Major Note Repurchase While Leaving Door Open to Bitcoin Sales

Two from MIT named 2026 Knight-Hennessy Scholars | MIT News

Bitcoin Short-Term Holder Basis Remains High Within Biggest Supply Cluster

Top Insights

Intesa Sanpaolo’s Crypto Portfolio Hits $235M as Italy’s Biggest Bank Goes Deeper Into Digital Assets

A Coding Implementation to Compress and Benchmark Instruction-Tuned LLMs with FP8, GPTQ, and SmoothQuant Quantization using llmcompressor

A Coding Implementation to Compress and Benchmark Instruction-Tuned LLMs with FP8, GPTQ, and SmoothQuant Quantization using llmcompressor

Related Posts