{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "25d9b213-f625-4327-98b3-d9e67db11687",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "saving model\n",
      "Model Saved\n"
     ]
    }
   ],
   "source": [
    "import gradio as gr\n",
    "import torch\n",
    "from torch import nn\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n",
    "\n",
    "\n",
    "# Core quantization method (simulated quantization): weights are rounded to a\n",
    "# b-bit grid and immediately dequantized, so they stay in floating point but\n",
    "# carry the error a real b-bit kernel would introduce.\n",
    "def pseudo_quantize_tensor(w, n_bit=4, q_group_size=-1):\n",
    "    org_w_shape = w.shape\n",
    "    if q_group_size > 0:\n",
    "        assert org_w_shape[-1] % q_group_size == 0\n",
    "        w = w.reshape(-1, q_group_size)\n",
    "\n",
    "    assert w.dim() == 2\n",
    "\n",
    "    # Calculate the maximum (\\alpha) and minimum (\\beta) value of each group.\n",
    "    max_val = w.amax(dim=1, keepdim=True)\n",
    "    assert max_val.dim() == 2 and max_val.size(0) == w.size(0) and max_val.size(1) == 1\n",
    "    min_val = w.amin(dim=1, keepdim=True)\n",
    "    assert min_val.dim() == 2 and min_val.size(0) == w.size(0) and min_val.size(1) == 1\n",
    "\n",
    "    # Calculate the scale factor and zero point (Formula 1 & 2).\n",
    "    max_int = 2 ** n_bit - 1\n",
    "    scales = (max_val - min_val).clamp(min=1e-5) / max_int\n",
    "    assert scales.shape == max_val.shape\n",
    "    zeros = (-torch.round(min_val / scales)).clamp_(0, max_int)\n",
    "    assert zeros.shape == min_val.shape\n",
    "\n",
    "    assert torch.isnan(scales).sum() == 0\n",
    "    assert torch.isnan(w).sum() == 0\n",
    "\n",
    "    # Quantize W: map values in the range [\\beta, \\alpha] to lie within [0, 2^b - 1] (Formula 3).\n",
    "    w = torch.clamp(torch.round(w / scales) + zeros, 0, max_int)\n",
    "    assert w.dim() == 2 and w.size(0) == scales.size(0)\n",
    "\n",
    "    # Dequantize W (pseudo quantization, the inverse transformation of Formula 3).\n",
    "    w = (w - zeros) * scales\n",
    "    assert w.dim() == 2 and w.size(0) == scales.size(0)\n",
    "\n",
    "    assert torch.isnan(w).sum() == 0\n",
    "\n",
    "    w = w.reshape(org_w_shape)\n",
    "    return w\n",
    "\n",
    "\n",
    "@torch.no_grad()\n",
    "def pseudo_quantize_model_weight(model, w_bit, q_group_size):\n",
    "    # Replace the weights of every linear layer with their pseudo-quantized version.\n",
    "    for n, m in model.named_modules():\n",
    "        if isinstance(m, nn.Linear):\n",
    "            m.weight.data = pseudo_quantize_tensor(m.weight.data, n_bit=w_bit, q_group_size=q_group_size)\n",
    "\n",
    "\n",
    "# Load the tokenizer and two copies of the model: one is kept as-is, the other\n",
    "# is pseudo-quantized to 3-bit weights with a group size of 128.\n",
    "model_path = \"facebook/opt-125m\"\n",
    "offload_folder = \"offload\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)\n",
    "model = AutoModelForCausalLM.from_pretrained(model_path, device_map=\"auto\", offload_folder=offload_folder)\n",
    "model_q = AutoModelForCausalLM.from_pretrained(model_path, device_map=\"auto\", offload_folder=offload_folder)\n",
    "pseudo_quantize_model_weight(model_q, w_bit=3, q_group_size=128)\n",
    "\n",
    "# Save the pseudo-quantized model (the weights are still stored in floating\n",
    "# point; only their values have been snapped to the 3-bit grid).\n",
    "quantized_model_path = \"facebook/opt-125m_3bit\"\n",
    "print(\"saving model\")\n",
    "model_q.save_pretrained(quantized_model_path)\n",
    "tokenizer.save_pretrained(quantized_model_path)\n",
    "print(\"Model Saved\")\n",
    "\n",
    "# Optional Gradio demo comparing the original and the quantized model (disabled).\n",
    "'''\n",
    "generator = pipeline('text-generation', model=\"facebook/opt-1.3b\")\n",
    "\n",
    "def generate_text_pip(prompt):\n",
    "    generated_text = generator(prompt, max_length=1000, num_return_sequences=1)[0]['generated_text']\n",
    "    return generated_text\n",
    "print(generator(\"I went to boston and\"))\n",
    "\n",
    "def generate_text(prompt):\n",
    "    inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
    "    output_ids = model.generate(**inputs, max_new_tokens=50)\n",
    "    return tokenizer.decode(output_ids[0], skip_special_tokens=True)\n",
    "\n",
    "def generate_text_from_quantized(prompt):\n",
    "    inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
    "    output_ids = model_q.generate(**inputs, max_new_tokens=50)\n",
    "    return tokenizer.decode(output_ids[0], skip_special_tokens=True)\n",
    "\n",
    "# Create one Gradio interface per model and serve them as two tabs.\n",
    "iface = gr.Interface(fn=generate_text, inputs=\"text\", outputs=\"text\", live=True)\n",
    "iface_2 = gr.Interface(fn=generate_text_from_quantized, inputs=\"text\", outputs=\"text\", live=True)\n",
    "\n",
    "app = gr.TabbedInterface([iface, iface_2], [\"Normal\", \"Quantized\"])\n",
    "\n",
    "# Launch the Gradio app\n",
    "app.launch()\n",
    "'''"
   ]
  },
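  {
   "cell_type": "markdown",
   "id": "f0e1d2c3-added-quant-error-demo",
   "metadata": {},
   "source": [
    "**Added sanity check.** The next cell is an illustrative sketch, not part of the original run: it applies `pseudo_quantize_tensor` from the cell above to a small random matrix and prints the mean absolute round-trip error for a few bit widths. The shape `(8, 256)` and the group size `128` are arbitrary demo values, and the cell assumes the definitions above have already been executed in this session."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7b8c9d0-added-quant-error-demo",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sketch (assumes the cell above has run): measure the error\n",
    "# introduced by pseudo-quantization at several bit widths on a random matrix.\n",
    "x = torch.randn(8, 256)\n",
    "for bits in (2, 3, 4, 8):\n",
    "    x_q = pseudo_quantize_tensor(x, n_bit=bits, q_group_size=128)\n",
    "    print(f\"{bits}-bit: mean |x - x_q| = {(x - x_q).abs().mean().item():.4f}\")"
   ]
  },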
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "5b3171de-b63f-4f5e-8347-c4a5da79c397",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "quantized model [{'generated_text': 'I went to boston and was hoping for a good time for a good time for a good time'}]\n"
     ]
    }
   ],
   "source": [
    "# Reload the saved 3-bit pseudo-quantized checkpoint and generate with it.\n",
    "model_q_path = \"facebook/opt-125m_3bit\"\n",
    "\n",
    "generator_q = pipeline('text-generation', model=model_q_path)\n",
    "print(\"quantized model\", generator_q(\"I went to boston and\"))"
   ]
  },
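  {
   "cell_type": "markdown",
   "id": "b1c2d3e4-added-fp-vs-q-comparison",
   "metadata": {},
   "source": [
    "**Added side-by-side comparison.** A minimal sketch, assuming `model`, `model_q`, and `tokenizer` from the first cell are still in memory: it greedily generates the same continuation with the unmodified model and with the 3-bit pseudo-quantized copy. Note that pseudo-quantization keeps the weights in floating point, so the saved checkpoint is not smaller than the original; the point is to observe how much output quality the 3-bit rounding costs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5d6e7f8-added-fp-vs-q-comparison",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sketch (assumes the first cell has run in this session):\n",
    "# generate the same continuation with the original and the quantized model.\n",
    "prompt = \"I went to boston and\"\n",
    "inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
    "\n",
    "with torch.no_grad():\n",
    "    fp_ids = model.generate(**inputs, max_new_tokens=16, do_sample=False)\n",
    "    q_ids = model_q.generate(**inputs, max_new_tokens=16, do_sample=False)\n",
    "\n",
    "print(\"original:\", tokenizer.decode(fp_ids[0], skip_special_tokens=True))\n",
    "print(\"3-bit   :\", tokenizer.decode(q_ids[0], skip_special_tokens=True))"
   ]
  },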
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e969157-5261-4245-a6d0-d394e971b347",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}