import gradio as gr
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Core quantization method (simulated / "pseudo" quantization).
def pseudo_quantize_tensor(w, n_bit=4, q_group_size=-1):
    org_w_shape = w.shape
    if q_group_size > 0:
        assert org_w_shape[-1] % q_group_size == 0
        w = w.reshape(-1, q_group_size)
    assert w.dim() == 2

    # Calculate the maximum (\alpha) and minimum (\beta) values per quantization group (row).
    max_val = w.amax(dim=1, keepdim=True)
    assert max_val.dim() == 2 and max_val.size(0) == w.size(0) and max_val.size(1) == 1
    min_val = w.amin(dim=1, keepdim=True)
    assert min_val.dim() == 2 and min_val.size(0) == w.size(0) and min_val.size(1) == 1

    # Calculate the scale factor and zero point. (Formula 1 & 2)
    max_int = 2 ** n_bit - 1
    scales = (max_val - min_val).clamp(min=1e-5) / max_int
    assert scales.shape == max_val.shape
    zeros = (-torch.round(min_val / scales)).clamp_(0, max_int)
    assert scales.shape == min_val.shape
    assert torch.isnan(scales).sum() == 0
    assert torch.isnan(w).sum() == 0

    # Quantize W: map values in the range [\beta, \alpha] to lie within [0, 2^b - 1]. (Formula 3)
    w = torch.clamp(torch.round(w / scales) + zeros, 0, max_int)
    assert w.dim() == 2 and w.size(0) == scales.size(0) and w.size(1) == q_group_size

    # Dequantize W (pseudo quantization, the inverse transformation of Formula 3).
    w = (w - zeros) * scales
    assert w.dim() == 2 and w.size(0) == scales.size(0) and w.size(1) == q_group_size
    assert torch.isnan(w).sum() == 0

    w = w.reshape(org_w_shape)
    return w
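
# --- Sketch: round-trip check of the pseudo-quantizer (illustrative only, not part of the app).
# For a random matrix whose last dimension is divisible by q_group_size, the dequantized
# weights should stay close to the originals. Uncomment to try:
#
# _w = torch.randn(256, 512)
# _w_q = pseudo_quantize_tensor(_w, n_bit=4, q_group_size=128)
# print("mean abs quantization error:", (_w - _w_q).abs().mean().item())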

@torch.no_grad()
def pseudo_quantize_model_weight(model, w_bit, q_group_size):
    # Apply simulated quantization to the weight of every nn.Linear layer in the model.
    for n, m in model.named_modules():
        if isinstance(m, nn.Linear):
            m.weight.data = pseudo_quantize_tensor(m.weight.data, n_bit=w_bit, q_group_size=q_group_size)
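
# --- Sketch: pseudo_quantize_model_weight rewrites only nn.Linear weights in place; embeddings,
# LayerNorms and biases keep full precision. A toy check (illustrative only, uncomment to try):
#
# _toy = nn.Sequential(nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 256))
# pseudo_quantize_model_weight(_toy, w_bit=4, q_group_size=128)
# # each 128-value group of _toy[0].weight now holds at most 2**4 distinct values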

# Load the tokenizer and two copies of the model: one kept in full precision and one that
# will be pseudo-quantized to 3-bit weights.
model_path = "facebook/opt-1.3b"
offload_folder = "offload"
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", offload_folder=offload_folder)
model_q = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", offload_folder=offload_folder)

# Simulated quantization: weights are rounded to 3-bit levels but stored back in the original
# dtype, so this mimics the accuracy of 3-bit weights without reducing memory use.
pseudo_quantize_model_weight(model_q, w_bit=3, q_group_size=128)

# Text-generation pipeline on the full-precision model (this loads the checkpoint again from the Hub).
generator = pipeline('text-generation', model="facebook/opt-1.3b")

def generate_text_pip(prompt):
    generated_text = generator(prompt, max_length=1000, num_return_sequences=1)[0]['generated_text']
    return generated_text

# Quick sanity check at startup.
print(generator("I went to boston and"))
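
# Note (sketch): instead of loading facebook/opt-1.3b a third time, the pipeline could reuse
# the objects already in memory, e.g.:
#
#   generator = pipeline('text-generation', model=model, tokenizer=tokenizer)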

def generate_text(prompt):
    # Single forward pass over the prompt, taking the argmax token at every position.
    # This is a rough next-token readout, not autoregressive generation.
    inputs = tokenizer(prompt, return_tensors="pt")
    output = model(**inputs)
    logits = output.logits
    predicted_ids = logits.argmax(-1)
    generated_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
    return generated_text

def generate_text_from_quantized(prompt):
    # Same argmax readout as generate_text, but through the 3-bit pseudo-quantized model.
    inputs = tokenizer(prompt, return_tensors="pt")
    output = model_q(**inputs)
    logits = output.logits
    predicted_ids = logits.argmax(-1)
    generated_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
    return generated_text
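
# --- Sketch: side-by-side comparison of full-precision vs 3-bit outputs (illustrative only,
# not wired into the UI; the helper name is hypothetical). Uncomment to try:
#
# def compare_outputs(prompt):
#     return {"full precision": generate_text(prompt),
#             "3-bit pseudo-quantized": generate_text_from_quantized(prompt)}
#
# print(compare_outputs("I went to boston and"))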

# Create the Gradio interfaces. Note that the "Normal" tab serves the full-precision pipeline
# (true autoregressive generation), while the "Quantized" tab serves the argmax readout of the
# 3-bit model, so the tabs differ in decoding method as well as in weight precision.
iface = gr.Interface(fn=generate_text_pip, inputs="text", outputs="text", live=True)
iface_2 = gr.Interface(fn=generate_text_from_quantized, inputs="text", outputs="text", live=True)
app = gr.TabbedInterface([iface, iface_2], ["Normal", "Quantized"])

# Launch the Gradio app
app.launch(server_name="0.0.0.0", share=True)