| |
| """ |
| Phase 6: INT8 Weight-Only Quantization for All Modules |
| ======================================================= |
| Applies torchao int8_weight_only quantization to each module, |
| re-exports to torch.export, and lowers to ExecuTorch .pte. |
| |
| int8_weight_only is INSTANT β no calibration data needed. |
| """ |
|
|
| import sys |
| import os |
| import copy |
| import time |
| import gc |
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
|
|
# --- Path configuration ------------------------------------------------------
# Source model checkpoint, the sibling project's venv/site-packages, its
# sources, and the directory the .pte artifacts are written to.
MODEL_PATH = os.path.expanduser("~/Documents/Qwen3-TTS/models/1.7B-Base")
VENV_SITE = os.path.expanduser("~/Documents/Qwen3-TTS/.venv/lib/python3.10/site-packages")
QWEN_TTS_SRC = os.path.expanduser("~/Documents/Qwen3-TTS")
OUTPUT_DIR = os.path.expanduser("~/Documents/Qwen3-TTS-ExecuTorch/exported")

# Make the Qwen3-TTS project's packages importable from this script.
if VENV_SITE not in sys.path:
    sys.path.insert(0, VENV_SITE)
if QWEN_TTS_SRC not in sys.path:
    sys.path.insert(0, QWEN_TTS_SRC)

os.makedirs(OUTPUT_DIR, exist_ok=True)
|
|
| from torchao.quantization import quantize_, int8_weight_only |
| from executorch.exir import to_edge_transform_and_lower, EdgeCompileConfig |
| from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner |
|
|
| print("=" * 70) |
| print("PHASE 6: INT8 Weight-Only Quantization") |
| print("=" * 70) |
|
|
|
|
def export_and_lower_int8(module, example_args, name, output_dir):
    """Quantize a module to INT8 weights, export it, and lower it to a .pte.

    Args:
        module: Eval-mode ``nn.Module``. NOTE: quantized **in place** by
            ``torchao.quantize_`` — the caller's copy is modified.
        example_args: Tuple of example tensors used to trace ``torch.export``.
        name: Basename of the output artifact (``<name>_int8.pte``).
        output_dir: Directory the .pte file is written into.

    Returns:
        float: size of the written .pte file in MB.

    Raises:
        Propagates whatever torch.export / ExecuTorch lowering raises;
        callers wrap this function in try/except per module.
    """
    # 1) Weight-only INT8 quantization: instant, no calibration data needed.
    #    (Removed needless f-prefixes on the constant status strings.)
    print("  Applying int8_weight_only quantization...")
    t0 = time.time()
    quantize_(module, int8_weight_only())
    print(f"  Quantized in {time.time() - t0:.1f}s")

    # 2) Capture the quantized module as an ExportedProgram.
    #    strict=False tolerates Python constructs the strict tracer rejects.
    print("  Running torch.export...")
    t0 = time.time()
    exported = torch.export.export(module, example_args, strict=False)
    print(f"  Exported in {time.time() - t0:.1f}s ({len(exported.graph.nodes)} nodes)")

    # 3) Lower to ExecuTorch with the XNNPACK CPU backend.
    #    _check_ir_validity=False: quantized graphs may not pass strict IR checks.
    print("  Lowering to ExecuTorch .pte...")
    t0 = time.time()
    edge = to_edge_transform_and_lower(
        exported,
        compile_config=EdgeCompileConfig(_check_ir_validity=False),
        partitioner=[XnnpackPartitioner()],
    )
    et_program = edge.to_executorch()

    pte_path = os.path.join(output_dir, f"{name}_int8.pte")
    with open(pte_path, "wb") as f:
        f.write(et_program.buffer)

    pte_size = os.path.getsize(pte_path) / 1e6
    print(f"  Saved: {pte_path} ({pte_size:.1f} MB)")
    print(f"  Lowered in {time.time() - t0:.1f}s")
    return pte_size
|
|
|
|
| |
|
|
| print("\n[0/4] Loading base model...") |
| from qwen_tts.core.models.configuration_qwen3_tts import Qwen3TTSConfig |
| from qwen_tts.core.models.modeling_qwen3_tts import Qwen3TTSForConditionalGeneration |
|
|
| config = Qwen3TTSConfig.from_pretrained(MODEL_PATH) |
| model = Qwen3TTSForConditionalGeneration.from_pretrained( |
| MODEL_PATH, config=config, dtype=torch.float32, |
| attn_implementation="sdpa", device_map="cpu", |
| ) |
| model.eval() |
| print(" Model loaded.") |
|
|
| results = {} |
|
|
| |
| |
| |
|
|
| print("\n[1/4] Speaker Encoder INT8") |
|
|
| |
class _ExplicitPadConv1d(nn.Module):
    """Conv1d with its padding applied explicitly via ``F.pad``.

    Rebuilds the original conv with ``padding=0`` and performs the (possibly
    asymmetric) padding in an explicit ``F.pad`` call before convolving, so
    ``padding='same'`` convs survive torch.export. Weight and bias Parameters
    are shared with (not copied from) the original conv.
    """

    def __init__(self, original_conv, pad_left, pad_right, pad_mode):
        super().__init__()
        self.conv = nn.Conv1d(
            in_channels=original_conv.in_channels, out_channels=original_conv.out_channels,
            kernel_size=original_conv.kernel_size[0], stride=original_conv.stride[0],
            padding=0, dilation=original_conv.dilation[0], groups=original_conv.groups,
            bias=original_conv.bias is not None)
        # Share the source conv's parameters so quantization sees one tensor.
        self.conv.weight = original_conv.weight
        if original_conv.bias is not None:
            self.conv.bias = original_conv.bias
        self.pad_left = pad_left
        self.pad_right = pad_right
        # BUG FIX: Conv1d's padding_mode 'zeros' is not a valid F.pad mode
        # (F.pad calls zero padding 'constant'). Without this mapping,
        # forward() raised NotImplementedError for the default padding mode.
        self.pad_mode = 'constant' if pad_mode == 'zeros' else pad_mode

    def forward(self, x):
        # Pad explicitly (left/right may differ for even kernels), then convolve.
        if self.pad_left > 0 or self.pad_right > 0:
            x = F.pad(x, (self.pad_left, self.pad_right), mode=self.pad_mode)
        return self.conv(x)
|
|
|
|
class SpeakerEncoderForExport_Q(nn.Module):
    """Export-friendly wrapper around the speaker encoder.

    Deep-copies the encoder and replaces every Conv1d that uses
    padding='same' with an _ExplicitPadConv1d, since 'same' padding does
    not survive torch.export / ExecuTorch lowering.
    """

    def __init__(self, original_encoder):
        super().__init__()
        self.encoder = copy.deepcopy(original_encoder)
        self._fix_conv_padding(self.encoder)

    def _fix_conv_padding(self, module):
        # Walk the module tree; rewrite 'same'-padded Conv1d leaves in place.
        for child_name, child in module.named_children():
            needs_rewrite = isinstance(child, nn.Conv1d) and child.padding == 'same'
            if not needs_rewrite:
                self._fix_conv_padding(child)
                continue
            kernel = child.kernel_size[0]
            dilation = child.dilation[0]
            total_pad = dilation * (kernel - 1)
            left = total_pad // 2
            replacement = _ExplicitPadConv1d(child, left, total_pad - left, child.padding_mode)
            setattr(module, child_name, replacement)

    def forward(self, mel_input):
        return self.encoder(mel_input)
|
|
|
|
# Static input shape: torch.export needs fixed dims.
FIXED_MEL_FRAMES = 469  # NOTE(review): presumably matches the FP32 export; confirm.
se = SpeakerEncoderForExport_Q(model.speaker_encoder)
se.eval()
se_args = (torch.randn(1, FIXED_MEL_FRAMES, 128),)
# Baseline: size of the previously exported FP32 .pte (must already exist).
fp32_size = os.path.getsize(os.path.join(OUTPUT_DIR, "speaker_encoder.pte")) / 1e6

try:
    int8_size = export_and_lower_int8(se, se_args, "speaker_encoder", OUTPUT_DIR)
    results["speaker_encoder"] = {"fp32": fp32_size, "int8": int8_size}
except Exception as e:
    # Best-effort pipeline: record the failure and continue with other modules.
    print(f"  FAILED: {e}")
    results["speaker_encoder"] = {"fp32": fp32_size, "int8": None, "error": str(e)}

del se; gc.collect()  # free the deep-copied encoder before the next module
|
|
| |
| |
| |
|
|
| print("\n[2/4] Talker INT8") |
|
|
| |
| |
| |
| import importlib.util |
| spec = importlib.util.spec_from_file_location( |
| "export_talker_mod", |
| os.path.join(os.path.dirname(os.path.abspath(__file__)), "export_talker.py") |
| ) |
| |
| |
| |
|
|
| |
| |
| |
|
|
| MAX_SEQ_LEN = 2048; NUM_LAYERS = 28; NUM_KV_HEADS = 8; HEAD_DIM = 128 |
| NUM_HEADS = 16; HIDDEN_SIZE = 2048; CODEC_VOCAB = 3072 |
|
|
class RMSNorm(nn.Module):
    """Root-mean-square layer norm (weight only, no bias), computed in fp32."""

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x):
        # Normalize in float32 for stability, cast back to the input dtype.
        orig_dtype = x.dtype
        xf = x.float()
        inv_rms = torch.rsqrt(xf.pow(2).mean(-1, keepdim=True) + self.eps)
        return (self.weight * (xf * inv_rms)).to(orig_dtype)
|
|
def rotate_half(x):
    """Rotate the last dim: (x1, x2) -> (-x2, x1), as used by RoPE."""
    half = x.shape[-1] // 2
    first, second = x[..., :half], x[..., half:]
    return torch.cat((-second, first), dim=-1)
|
|
class TalkerAttnQ(nn.Module):
    """Talker self-attention rewritten for static-shape export.

    GQA attention (NUM_HEADS query heads over NUM_KV_HEADS KV heads) with
    RoPE applied to q/k, per-head RMSNorm on q/k, and a fixed-size KV cache
    passed in and returned functionally (no in-place module state).
    """

    def __init__(self, orig, layer_idx):
        super().__init__()
        self.layer_idx = layer_idx; self.head_dim = HEAD_DIM
        self.num_heads = NUM_HEADS; self.num_kv_heads = NUM_KV_HEADS
        self.num_kv_groups = NUM_HEADS // NUM_KV_HEADS; self.scaling = HEAD_DIM**-0.5
        # Deep-copy projections so quantizing this module can't touch the
        # original model's weights.
        self.q_proj = copy.deepcopy(orig.q_proj); self.k_proj = copy.deepcopy(orig.k_proj)
        self.v_proj = copy.deepcopy(orig.v_proj); self.o_proj = copy.deepcopy(orig.o_proj)
        self.q_norm = RMSNorm(HEAD_DIM); self.q_norm.weight = copy.deepcopy(orig.q_norm.weight)
        self.k_norm = RMSNorm(HEAD_DIM); self.k_norm.weight = copy.deepcopy(orig.k_norm.weight)

    def forward(self, h, cos, sin, cp, kc, vc, am):
        # h: (B, S, hidden); cos/sin: RoPE tables; cp: cache positions (S,);
        # kc/vc: (B, KV_HEADS, MAX_SEQ_LEN, HEAD_DIM) caches; am: additive mask.
        B, S, _ = h.shape
        q = self.q_norm(self.q_proj(h).view(B,S,self.num_heads,HEAD_DIM)).transpose(1,2)
        k = self.k_norm(self.k_proj(h).view(B,S,self.num_kv_heads,HEAD_DIM)).transpose(1,2)
        v = self.v_proj(h).view(B,S,self.num_kv_heads,HEAD_DIM).transpose(1,2)
        q = q*cos + rotate_half(q)*sin; k = k*cos + rotate_half(k)*sin
        # Functional cache update: clone so export sees pure ops, then scatter
        # the new k/v rows into the cache at positions cp.
        kc = kc.clone(); vc = vc.clone()
        kc[:,:,cp,:] = k; vc[:,:,cp,:] = v
        # Repeat KV heads to match query heads (GQA expansion).
        ke = kc.unsqueeze(2).repeat(1,1,self.num_kv_groups,1,1).reshape(B,self.num_heads,MAX_SEQ_LEN,HEAD_DIM)
        ve = vc.unsqueeze(2).repeat(1,1,self.num_kv_groups,1,1).reshape(B,self.num_heads,MAX_SEQ_LEN,HEAD_DIM)
        o = F.scaled_dot_product_attention(q, ke, ve, attn_mask=am, scale=self.scaling)
        # Returns (attn output, updated k cache, updated v cache).
        return self.o_proj(o.transpose(1,2).reshape(B,S,-1)), kc, vc
|
|
class TalkerLayerQ(nn.Module):
    """One talker transformer layer: attention + SwiGLU MLP, pre-norm residuals."""

    def __init__(self, orig, i):
        super().__init__()
        self.attn = TalkerAttnQ(orig.self_attn, i)
        self.gate_proj = copy.deepcopy(orig.mlp.gate_proj)
        self.up_proj = copy.deepcopy(orig.mlp.up_proj)
        self.down_proj = copy.deepcopy(orig.mlp.down_proj)
        self.n1 = RMSNorm(HIDDEN_SIZE); self.n1.weight = copy.deepcopy(orig.input_layernorm.weight)
        self.n2 = RMSNorm(HIDDEN_SIZE); self.n2.weight = copy.deepcopy(orig.post_attention_layernorm.weight)

    def forward(self, h, cos, sin, cp, kc, vc, am):
        # Attention sub-block with residual connection.
        attn_out, kc, vc = self.attn(self.n1(h), cos, sin, cp, kc, vc, am)
        h = h + attn_out
        # SwiGLU feed-forward sub-block with residual connection.
        normed = self.n2(h)
        h = h + self.down_proj(F.silu(self.gate_proj(normed)) * self.up_proj(normed))
        return h, kc, vc
|
|
class TalkerQ(nn.Module):
    """Full talker stack for export: layers + final norm + codec head.

    Computes RoPE cos/sin from position ids internally and threads the
    per-layer KV caches through as explicit inputs/outputs (2 per layer).
    """

    def __init__(self, orig):
        super().__init__()
        self.layers = nn.ModuleList([TalkerLayerQ(l, i) for i, l in enumerate(orig.model.layers)])
        self.norm = RMSNorm(HIDDEN_SIZE); self.norm.weight = copy.deepcopy(orig.model.norm.weight)
        self.codec_head = copy.deepcopy(orig.codec_head)
        self.register_buffer("inv_freq", orig.model.rotary_emb.inv_freq.clone())
        self.rope_scaling = getattr(orig.model.rotary_emb, 'attention_scaling', 1.0)

    def forward(self, ie, pid, cp, am, *kv):
        # ie: input embeddings (B, S, hidden); pid: stacked position ids (only
        # plane 0 is used for RoPE); cp: cache positions; am: additive mask;
        # kv: flat list of (k_cache, v_cache) pairs, one pair per layer.
        pos = pid[0].float()
        freqs = pos.unsqueeze(-1) * self.inv_freq.float().unsqueeze(0).unsqueeze(0)
        emb = torch.cat([freqs, freqs], dim=-1)
        cos = (emb.cos() * self.rope_scaling).to(ie.dtype).unsqueeze(1)
        sin = (emb.sin() * self.rope_scaling).to(ie.dtype).unsqueeze(1)
        h = ie; ukv = []
        for i, layer in enumerate(self.layers):
            h, nk, nv = layer(h, cos, sin, cp, kv[i*2], kv[i*2+1], am)
            ukv.append(nk); ukv.append(nv)
        # Codec-vocabulary logits plus every layer's updated caches.
        return (self.codec_head(self.norm(h)), *ukv)
|
|
t_mod = TalkerQ(model.talker); t_mod.eval()
sl = 10  # example prefill sequence length for tracing
# Causal additive mask over the full MAX_SEQ_LEN cache: row i may attend to
# cache slots 0..i; everything else stays -inf.
cm = torch.full((1,1,sl,MAX_SEQ_LEN), float('-inf'))
for i in range(sl): cm[:,:,i,:i+1] = 0.0
t_args = (
    torch.randn(1,sl,HIDDEN_SIZE),
    torch.arange(sl).unsqueeze(0).unsqueeze(0).repeat(3,1,1),  # stacked position ids
    torch.arange(sl), cm,
    *[torch.zeros(1,NUM_KV_HEADS,MAX_SEQ_LEN,HEAD_DIM) for _ in range(NUM_LAYERS*2)]
)

# Baseline: the previously exported FP32 prefill .pte (must already exist).
fp32_size = os.path.getsize(os.path.join(OUTPUT_DIR, "talker_prefill.pte")) / 1e6
try:
    int8_size = export_and_lower_int8(t_mod, t_args, "talker", OUTPUT_DIR)
    results["talker"] = {"fp32": fp32_size, "int8": int8_size}
except Exception as e:
    print(f"  FAILED: {e}")
    results["talker"] = {"fp32": fp32_size, "int8": None, "error": str(e)}

del t_mod; gc.collect()  # free the talker copy before the next module
|
|
| |
| |
| |
|
|
| print("\n[3/4] Code Predictor INT8") |
|
|
| CP_MAX = 17; CPL = 5; CPKV = 8; CPHD = 128; CPH = 16; CPHS = 1024; THD = 2048 |
|
|
class CPAttnQ(nn.Module):
    """Code-predictor self-attention for export (GQA + RoPE + explicit cache).

    Same structure as TalkerAttnQ but sized by the code-predictor constants
    (CPH query heads, CPKV KV heads, head dim CPHD, cache length CP_MAX).
    """

    def __init__(self, orig, i):
        super().__init__()
        # Deep-copied projections so quantization leaves the source model intact.
        self.q_proj = copy.deepcopy(orig.q_proj); self.k_proj = copy.deepcopy(orig.k_proj)
        self.v_proj = copy.deepcopy(orig.v_proj); self.o_proj = copy.deepcopy(orig.o_proj)
        self.q_norm = RMSNorm(CPHD); self.q_norm.weight = copy.deepcopy(orig.q_norm.weight)
        self.k_norm = RMSNorm(CPHD); self.k_norm.weight = copy.deepcopy(orig.k_norm.weight)
        self.g = CPH // CPKV  # GQA expansion factor (query heads per KV head)

    def forward(self, h, cos, sin, cp, kc, vc, am):
        # h: (B, S, CPHS); kc/vc: (B, CPKV, CP_MAX, CPHD) caches; am: additive mask.
        B,S,_ = h.shape
        q = self.q_norm(self.q_proj(h).view(B,S,CPH,CPHD)).transpose(1,2)
        k = self.k_norm(self.k_proj(h).view(B,S,CPKV,CPHD)).transpose(1,2)
        v = self.v_proj(h).view(B,S,CPKV,CPHD).transpose(1,2)
        # RoPE on q/k, then a functional KV-cache scatter at positions cp.
        q = q*cos + rotate_half(q)*sin; k = k*cos + rotate_half(k)*sin
        kc = kc.clone(); vc = vc.clone(); kc[:,:,cp,:] = k; vc[:,:,cp,:] = v
        # Repeat KV heads to match query heads (GQA).
        ke = kc.unsqueeze(2).repeat(1,1,self.g,1,1).reshape(B,CPH,CP_MAX,CPHD)
        ve = vc.unsqueeze(2).repeat(1,1,self.g,1,1).reshape(B,CPH,CP_MAX,CPHD)
        o = F.scaled_dot_product_attention(q,ke,ve,attn_mask=am,scale=CPHD**-0.5)
        # Returns (attn output, updated k cache, updated v cache).
        return self.o_proj(o.transpose(1,2).reshape(B,S,-1)), kc, vc
|
|
class CPLayerQ(nn.Module):
    """One code-predictor layer: attention + SwiGLU MLP, pre-norm residuals."""

    def __init__(self, orig, i):
        super().__init__()
        self.attn = CPAttnQ(orig.self_attn, i)
        self.gp = copy.deepcopy(orig.mlp.gate_proj)
        self.up = copy.deepcopy(orig.mlp.up_proj)
        self.dp = copy.deepcopy(orig.mlp.down_proj)
        self.n1 = RMSNorm(CPHS); self.n1.weight = copy.deepcopy(orig.input_layernorm.weight)
        self.n2 = RMSNorm(CPHS); self.n2.weight = copy.deepcopy(orig.post_attention_layernorm.weight)

    def forward(self, h, cos, sin, cp, kc, vc, am):
        # Attention sub-block with residual connection.
        attn_out, kc, vc = self.attn(self.n1(h), cos, sin, cp, kc, vc, am)
        h = h + attn_out
        # SwiGLU feed-forward sub-block with residual connection.
        normed = self.n2(h)
        h = h + self.dp(F.silu(self.gp(normed)) * self.up(normed))
        return h, kc, vc
|
|
class CPQ(nn.Module):
    """Code-predictor stack for export: input projection + layers + final norm.

    Unlike TalkerQ there is no output head here: the normed hidden states are
    returned directly, along with every layer's updated KV caches.
    """

    def __init__(self, orig):
        super().__init__()
        self.layers = nn.ModuleList([CPLayerQ(l,i) for i,l in enumerate(orig.model.layers)])
        self.norm = RMSNorm(CPHS); self.norm.weight = copy.deepcopy(orig.model.norm.weight)
        # Projects talker-sized hidden states (THD) into the predictor's width.
        self.proj = copy.deepcopy(orig.small_to_mtp_projection)
        self.register_buffer("inv_freq", orig.model.rotary_emb.inv_freq.clone())
        self.rs = getattr(orig.model.rotary_emb, 'attention_scaling', 1.0)

    def forward(self, ie, pid, cp, am, *kv):
        # ie: (B, S, THD) talker hiddens; pid: (B, S) position ids; cp: cache
        # positions; am: additive mask; kv: flat (k, v) caches, one pair/layer.
        h = self.proj(ie)
        # RoPE tables computed on the fly from pid and inv_freq.
        pos = pid.float()
        freqs = pos.unsqueeze(-1)*self.inv_freq.float().unsqueeze(0).unsqueeze(0)
        emb = torch.cat([freqs,freqs],dim=-1)
        cos = (emb.cos()*self.rs).to(h.dtype).unsqueeze(1)
        sin = (emb.sin()*self.rs).to(h.dtype).unsqueeze(1)
        ukv = []
        for i, l in enumerate(self.layers):
            h,nk,nv = l(h,cos,sin,cp,kv[i*2],kv[i*2+1],am); ukv.append(nk); ukv.append(nv)
        return (self.norm(h), *ukv)
|
|
cp_mod = CPQ(model.talker.code_predictor); cp_mod.eval()
csl = 2  # example sequence length for tracing
# Causal additive mask over the CP_MAX-slot cache.
ccm = torch.full((1,1,csl,CP_MAX), float('-inf'))
for i in range(csl): ccm[:,:,i,:i+1] = 0.0
cp_args = (
    torch.randn(1,csl,THD), torch.arange(csl).unsqueeze(0), torch.arange(csl), ccm,
    *[torch.zeros(1,CPKV,CP_MAX,CPHD) for _ in range(CPL*2)]
)

# Baseline: the previously exported FP32 .pte (must already exist).
fp32_size = os.path.getsize(os.path.join(OUTPUT_DIR, "code_predictor.pte")) / 1e6
try:
    int8_size = export_and_lower_int8(cp_mod, cp_args, "code_predictor", OUTPUT_DIR)
    results["code_predictor"] = {"fp32": fp32_size, "int8": int8_size}
except Exception as e:
    print(f"  FAILED: {e}")
    results["code_predictor"] = {"fp32": fp32_size, "int8": None, "error": str(e)}

del cp_mod; gc.collect()  # free the predictor copy before the next module
|
|
| |
| |
| |
|
|
| print("\n[4/4] Vocoder INT8") |
|
|
class VocQ(nn.Module):
    """Thin export wrapper around the codec decoder (vocoder)."""

    def __init__(self, dec):
        super().__init__()
        # Deep-copy so quantization can't modify the source model's decoder.
        self.decoder = copy.deepcopy(dec)

    def forward(self, codes):
        # All logic lives in the copied decoder; this class only wraps it.
        return self.decoder(codes)
|
|
v_mod = VocQ(model.speech_tokenizer.model.decoder); v_mod.eval()
# Example input: integer codec indices — presumably 16 codebooks x 50 frames,
# values below 2048 (NOTE(review): CODEC_VOCAB above is 3072; confirm range).
v_args = (torch.randint(0, 2048, (1, 16, 50)),)

# Baseline: the previously exported FP32 .pte (must already exist).
fp32_size = os.path.getsize(os.path.join(OUTPUT_DIR, "vocoder.pte")) / 1e6
try:
    int8_size = export_and_lower_int8(v_mod, v_args, "vocoder", OUTPUT_DIR)
    results["vocoder"] = {"fp32": fp32_size, "int8": int8_size}
except Exception as e:
    print(f"  FAILED: {e}")
    results["vocoder"] = {"fp32": fp32_size, "int8": None, "error": str(e)}

del v_mod; gc.collect()  # free the decoder copy before the summary
|
|
| |
|
|
| print("\n" + "=" * 70) |
| print("QUANTIZATION SUMMARY") |
| print("=" * 70) |
| print(f"\n{'Module':25s} {'FP32 (MB)':>12s} {'INT8 (MB)':>12s} {'Reduction':>10s}") |
| print("-" * 60) |
|
|
| total_fp32 = 0; total_int8 = 0 |
| for name, r in results.items(): |
| fp32 = r.get("fp32", 0) or 0 |
| int8 = r.get("int8") |
| total_fp32 += fp32 |
| if int8 is not None: |
| total_int8 += int8 |
| red = f"{fp32/int8:.1f}x" if int8 > 0 else "β" |
| else: |
| red = f"FAILED: {r.get('error','')[:40]}" |
| int8 = 0 |
| print(f" {name:23s} {fp32:10.1f} {int8:10.1f} {red}") |
|
|
| print("-" * 60) |
| ovr = f"{total_fp32/total_int8:.1f}x" if total_int8 > 0 else "N/A" |
| print(f" {'TOTAL':23s} {total_fp32:10.1f} {total_int8:10.1f} {ovr}") |
| print("\nPhase 6 complete!") |
|
|