import gradio as gr
import tqdm
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from functools import partial
import gc
# core quantization method (simulated quantization)
def pseudo_quantize_tensor(w, n_bit=4, q_group_size=-1):
    org_w_shape = w.shape
    if q_group_size > 0:
        assert org_w_shape[-1] % q_group_size == 0
        w = w.reshape(-1, q_group_size)
    assert w.dim() == 2

    # Calculate the maximum (\alpha) and minimum (\beta) values in the tensor.
    max_val = w.amax(dim=1, keepdim=True)
    assert max_val.dim() == 2 and max_val.size(0) == w.size(0) and max_val.size(1) == 1
    min_val = w.amin(dim=1, keepdim=True)
    assert min_val.dim() == 2 and min_val.size(0) == w.size(0) and min_val.size(1) == 1

    # Calculate the scale factor and zero point. (Formula 1 & 2)
    max_int = 2 ** n_bit - 1
    scales = (max_val - min_val).clamp(min=1e-5) / max_int
    assert scales.shape == max_val.shape
    zeros = (-torch.round(min_val / scales)).clamp_(0, max_int)
    assert scales.shape == min_val.shape
    assert torch.isnan(scales).sum() == 0
    assert torch.isnan(w).sum() == 0

    # Quantize W: Map values in the range [\beta, \alpha] to lie within [0, 2^b - 1] (Formula 3)
    w = torch.clamp(torch.round(w / scales) + zeros, 0, max_int)
    assert w.dim() == 2 and w.size(0) == scales.size(0) and w.size(1) == q_group_size

    # Dequantize W (pseudo quantization, the inverse transformation of Formula 3)
    w = (w - zeros) * scales
    assert w.dim() == 2 and w.size(0) == scales.size(0) and w.size(1) == q_group_size
    assert torch.isnan(w).sum() == 0

    w = w.reshape(org_w_shape)
    return w
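
# Illustrative sanity check (an addition, not part of the original code): pseudo-quantizing
# a random weight matrix should return a tensor of the same shape whose values stay close
# to the input. The shape, bit width, and group size below are arbitrary choices.
_w_demo = torch.randn(8, 64)
_w_demo_q = pseudo_quantize_tensor(_w_demo, n_bit=4, q_group_size=64)
assert _w_demo_q.shape == _w_demo.shape
print("max abs pseudo-quantization error:", (_w_demo - _w_demo_q).abs().max().item())
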
def pseudo_quantize_model_weight(
    model, w_bit, q_group_size,
):
    # Replace the weights of every nn.Linear module with their pseudo-quantized version.
    for n, m in model.named_modules():
        if isinstance(m, nn.Linear):
            m.weight.data = pseudo_quantize_tensor(m.weight.data, n_bit=w_bit, q_group_size=q_group_size)
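
# Illustrative helper (an assumption, not part of the original code): the nominal size of
# the model's weights at a given bit width. Pseudo-quantization keeps the weights in their
# original floating-point dtype, so actual memory use does not shrink; this only estimates
# what real w_bit packing would occupy.
def nominal_weight_size_mib(model, data_width_bits):
    n_params = sum(p.numel() for p in model.parameters())
    return n_params * data_width_bits / 8 / (1024 ** 2)
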
# Load the tokenizer and model
model_path = "facebook/opt-1.3b"
offload_folder = "offload"
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", offload_folder=offload_folder)

# Load a second copy of the model and pseudo-quantize its linear-layer weights to 3 bits.
model_q = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", offload_folder=offload_folder)
pseudo_quantize_model_weight(model_q, w_bit=3, q_group_size=128)
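
# Illustrative only: compare the nominal footprint of the simulated 3-bit setting against
# 16-bit storage (the in-memory size of model_q itself is unchanged).
print(f"nominal 16-bit weights: {nominal_weight_size_mib(model_q, 16):.1f} MiB")
print(f"nominal  3-bit weights: {nominal_weight_size_mib(model_q, 3):.1f} MiB")
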
# Define text-generation helpers for model inference.
# Reuse the already-loaded model and tokenizer instead of downloading a third copy.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

def generate_text_pip(prompt):
    generated_text = generator(prompt, max_length=1000, num_return_sequences=1)[0]['generated_text']
    return generated_text

# Quick sanity check that the pipeline produces output.
print(generator("I went to boston and"))
def generate_text(prompt):
    # Autoregressive generation with the full-precision model.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=128)
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return generated_text
def generate_text_from_quantized(prompt):
    # Autoregressive generation with the pseudo-quantized model.
    inputs = tokenizer(prompt, return_tensors="pt").to(model_q.device)
    with torch.no_grad():
        output_ids = model_q.generate(**inputs, max_new_tokens=128)
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return generated_text
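
# Illustrative console comparison (an addition, not part of the original code): run the
# same prompt through both models to eyeball how 3-bit pseudo-quantization affects output
# quality.
_demo_prompt = "I went to boston and"
print("full-precision  :", generate_text(_demo_prompt))
print("pseudo-quantized:", generate_text_from_quantized(_demo_prompt))
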
# Create a Gradio interface with one tab per model.
iface = gr.Interface(fn=generate_text_pip, inputs="text", outputs="text", live=True)
iface_2 = gr.Interface(fn=generate_text_from_quantized, inputs="text", outputs="text", live=True)
app = gr.TabbedInterface([iface, iface_2], ["Normal", "Quantized"])

# Launch the Gradio app
app.launch(server_name="0.0.0.0", share=True)