rakesh9177 committed on
Commit 3021feb
1 Parent(s): b68784d
.ipynb_checkpoints/Untitled-checkpoint.ipynb ADDED
@@ -0,0 +1,178 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "25d9b213-f625-4327-98b3-d9e67db11687",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "saving model\n",
+ "Model Saved\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'\\ngenerator = pipeline(\\'text-generation\\', model=\"facebook/opt-1.3b\")\\n\\ndef generate_text_pip(prompt):\\n generated_text = generator(prompt, max_length=1000, num_return_sequences=1)[0][\\'generated_text\\']\\n return generated_text\\nprint(generator(\"I went to boston and\"))\\n\\ndef generate_text(prompt):\\n inputs = tokenizer(prompt, return_tensors=\"pt\")\\n output = model(**inputs)\\n logits = output.logits\\n predicted_ids = logits.argmax(-1)\\n generated_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)\\n return generated_text\\n\\ndef generate_text_from_quantized(prompt):\\n inputs = tokenizer(prompt, return_tensors=\"pt\")\\n output = model_q(**inputs)\\n logits = output.logits\\n predicted_ids = logits.argmax(-1)\\n generated_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)\\n return generated_text\\n\\n# Create a Gradio interface\\niface = gr.Interface(fn=generate_text_pip, inputs=\"text\", outputs=\"text\", live=True)\\n\\niface_2 = gr.Interface(fn=generate_text_from_quantized, inputs=\"text\", outputs=\"text\", live=True)\\n\\n\\napp = gr.TabbedInterface([iface, iface_2],[\"Normal\", \"Quantized\"])\\n\\n# Launch the Gradio app\\napp.launch()\\n'"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import gradio as gr\n",
+ "import tqdm\n",
+ "import torch\n",
+ "from torch import nn\n",
+ "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n",
+ "from functools import partial\n",
+ "import gc\n",
+ "\n",
+ "\n",
+ "# core quantization method (simulated quantization)\n",
+ "def pseudo_quantize_tensor(w, n_bit=4, q_group_size=-1):\n",
+ "    org_w_shape = w.shape\n",
+ "    if q_group_size > 0:\n",
+ "        assert org_w_shape[-1] % q_group_size == 0\n",
+ "        w = w.reshape(-1, q_group_size)\n",
+ "\n",
+ "    assert w.dim() == 2\n",
+ "\n",
+ "    # Calculate the maximum (\\alpha) and minimum values (\\beta) in the tensor.\n",
+ "    max_val = w.amax(dim=1, keepdim=True)\n",
+ "    assert max_val.dim() == 2 and max_val.size(0) == w.size(0) and max_val.size(1) == 1\n",
+ "    min_val = w.amin(dim=1, keepdim=True)\n",
+ "    assert min_val.dim() == 2 and min_val.size(0) == w.size(0) and min_val.size(1) == 1\n",
+ "\n",
+ "    # Calculate the scale factor and zero point. (Formula 1 & 2)\n",
+ "    max_int = 2 ** n_bit - 1\n",
+ "    scales = (max_val - min_val).clamp(min=1e-5) / max_int\n",
+ "    assert scales.shape == max_val.shape\n",
+ "    zeros = (-torch.round(min_val / scales)).clamp_(0, max_int)\n",
+ "    assert scales.shape == min_val.shape\n",
+ "\n",
+ "    assert torch.isnan(scales).sum() == 0\n",
+ "    assert torch.isnan(w).sum() == 0\n",
+ "\n",
+ "    # Quantize W: Map values in the range [\\beta, \\alpha] to lie within [0, 2^b - 1] (Formula 3)\n",
+ "    w = torch.clamp(torch.round(w / scales) + zeros, 0, max_int)\n",
+ "    assert w.dim() == 2 and w.size(0) == scales.size(0) and w.size(1) == q_group_size\n",
+ "\n",
+ "    # Dequantize W (pseudo quantization, the inverse transformation of Formula 3)\n",
+ "    w = (w - zeros) * scales\n",
+ "    assert w.dim() == 2 and w.size(0) == scales.size(0) and w.size(1) == q_group_size\n",
+ "\n",
+ "    assert torch.isnan(w).sum() == 0\n",
+ "\n",
+ "    w = w.reshape(org_w_shape)\n",
+ "    return w\n",
+ "\n",
+ "@torch.no_grad()\n",
+ "def pseudo_quantize_model_weight(\n",
+ "    model, w_bit, q_group_size,\n",
+ "):\n",
+ "    for n, m in model.named_modules():\n",
+ "        if isinstance(m, nn.Linear):\n",
+ "            m.weight.data = pseudo_quantize_tensor(m.weight.data, n_bit=w_bit, q_group_size=q_group_size)\n",
+ "    \n",
+ "    \n",
+ "    \n",
+ "    \n",
+ "# Load the tokenizer and model\n",
+ "model_path = \"facebook/opt-125m\"\n",
+ "offload_folder = \"offload\"\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)\n",
+ "model = AutoModelForCausalLM.from_pretrained(model_path, device_map=\"auto\", offload_folder=offload_folder)\n",
+ "model_q = AutoModelForCausalLM.from_pretrained(model_path, device_map=\"auto\",offload_folder=offload_folder)\n",
+ "pseudo_quantize_model_weight(model_q, w_bit=3, q_group_size=128)\n",
+ "# Define a function for model inference\n",
+ "\n",
+ "\n",
+ "quantized_model_path = \"facebook/opt-125m_3bit\"\n",
+ "print(\"saving model\")\n",
+ "model_q.save_pretrained(quantized_model_path)\n",
+ "tokenizer.save_pretrained(quantized_model_path)\n",
+ "print(\"Model Saved\")\n",
+ "'''\n",
+ "generator = pipeline('text-generation', model=\"facebook/opt-1.3b\")\n",
+ "\n",
+ "def generate_text_pip(prompt):\n",
+ "    generated_text = generator(prompt, max_length=1000, num_return_sequences=1)[0]['generated_text']\n",
+ "    return generated_text\n",
+ "print(generator(\"I went to boston and\"))\n",
+ "\n",
+ "def generate_text(prompt):\n",
+ "    inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
+ "    output = model(**inputs)\n",
+ "    logits = output.logits\n",
+ "    predicted_ids = logits.argmax(-1)\n",
+ "    generated_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)\n",
+ "    return generated_text\n",
+ "\n",
+ "def generate_text_from_quantized(prompt):\n",
+ "    inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
+ "    output = model_q(**inputs)\n",
+ "    logits = output.logits\n",
+ "    predicted_ids = logits.argmax(-1)\n",
+ "    generated_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)\n",
+ "    return generated_text\n",
+ "\n",
+ "# Create a Gradio interface\n",
+ "iface = gr.Interface(fn=generate_text_pip, inputs=\"text\", outputs=\"text\", live=True)\n",
+ "\n",
+ "iface_2 = gr.Interface(fn=generate_text_from_quantized, inputs=\"text\", outputs=\"text\", live=True)\n",
+ "\n",
+ "\n",
+ "app = gr.TabbedInterface([iface, iface_2],[\"Normal\", \"Quantized\"])\n",
+ "\n",
+ "# Launch the Gradio app\n",
+ "app.launch()\n",
+ "'''"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5b3171de-b63f-4f5e-8347-c4a5da79c397",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "saving model\n",
+ "Model Saved\n"
+ ]
+ }
+ ],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
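
Note on the quantization step in the cell above: it is group-wise asymmetric min-max quantization. For each group of q_group_size weights the scale is (max - min) / (2^b - 1) and the zero point is round(-min / scale) (Formulas 1 and 2), each weight is rounded into the integer range [0, 2^b - 1] (Formula 3), and the result is immediately dequantized, so the model keeps floating-point weights that merely lie on a 2^b-level grid per group. Below is a minimal standalone sketch of that round trip on a toy tensor, using the same 3-bit width and 128-element groups the notebook passes to pseudo_quantize_model_weight; the helper name quantize_dequantize is illustrative and not part of the notebook.

    import torch

    def quantize_dequantize(w, n_bit=3, group_size=128):
        # group the weights and take per-group min/max (the alpha and beta in the comments above)
        w_g = w.reshape(-1, group_size)
        max_val = w_g.amax(dim=1, keepdim=True)
        min_val = w_g.amin(dim=1, keepdim=True)
        # scale factor and zero point (Formulas 1 & 2)
        max_int = 2 ** n_bit - 1
        scales = (max_val - min_val).clamp(min=1e-5) / max_int
        zeros = (-torch.round(min_val / scales)).clamp_(0, max_int)
        # quantize into [0, 2^b - 1], then dequantize (Formula 3 and its inverse)
        q = torch.clamp(torch.round(w_g / scales) + zeros, 0, max_int)
        return ((q - zeros) * scales).reshape(w.shape)

    w = torch.randn(4, 256)
    w_q = quantize_dequantize(w)
    print("mean absolute quantization error:", (w - w_q).abs().mean().item())

With n_bit=3 each group is mapped onto only 8 levels, so the notebook's w_bit=3 run is expected to introduce noticeably more error than the function's 4-bit default.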
.ipynb_checkpoints/requirements-checkpoint.txt ADDED
@@ -0,0 +1,5 @@
+ torch
+ gradio
+ transformers
+ tqdm
+ accelerate
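
Assuming a standard pip environment, these dependencies would typically be installed with "pip install -r requirements.txt" before running the notebooks; accelerate is required because the models are loaded with device_map="auto".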
Untitled.ipynb ADDED
@@ -0,0 +1,191 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "25d9b213-f625-4327-98b3-d9e67db11687",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "saving model\n",
+ "Model Saved\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'\\ngenerator = pipeline(\\'text-generation\\', model=\"facebook/opt-1.3b\")\\n\\ndef generate_text_pip(prompt):\\n generated_text = generator(prompt, max_length=1000, num_return_sequences=1)[0][\\'generated_text\\']\\n return generated_text\\nprint(generator(\"I went to boston and\"))\\n\\ndef generate_text(prompt):\\n inputs = tokenizer(prompt, return_tensors=\"pt\")\\n output = model(**inputs)\\n logits = output.logits\\n predicted_ids = logits.argmax(-1)\\n generated_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)\\n return generated_text\\n\\ndef generate_text_from_quantized(prompt):\\n inputs = tokenizer(prompt, return_tensors=\"pt\")\\n output = model_q(**inputs)\\n logits = output.logits\\n predicted_ids = logits.argmax(-1)\\n generated_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)\\n return generated_text\\n\\n# Create a Gradio interface\\niface = gr.Interface(fn=generate_text_pip, inputs=\"text\", outputs=\"text\", live=True)\\n\\niface_2 = gr.Interface(fn=generate_text_from_quantized, inputs=\"text\", outputs=\"text\", live=True)\\n\\n\\napp = gr.TabbedInterface([iface, iface_2],[\"Normal\", \"Quantized\"])\\n\\n# Launch the Gradio app\\napp.launch()\\n'"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import gradio as gr\n",
+ "import tqdm\n",
+ "import torch\n",
+ "from torch import nn\n",
+ "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n",
+ "from functools import partial\n",
+ "import gc\n",
+ "\n",
+ "\n",
+ "# core quantization method (simulated quantization)\n",
+ "def pseudo_quantize_tensor(w, n_bit=4, q_group_size=-1):\n",
+ "    org_w_shape = w.shape\n",
+ "    if q_group_size > 0:\n",
+ "        assert org_w_shape[-1] % q_group_size == 0\n",
+ "        w = w.reshape(-1, q_group_size)\n",
+ "\n",
+ "    assert w.dim() == 2\n",
+ "\n",
+ "    # Calculate the maximum (\\alpha) and minimum values (\\beta) in the tensor.\n",
+ "    max_val = w.amax(dim=1, keepdim=True)\n",
+ "    assert max_val.dim() == 2 and max_val.size(0) == w.size(0) and max_val.size(1) == 1\n",
+ "    min_val = w.amin(dim=1, keepdim=True)\n",
+ "    assert min_val.dim() == 2 and min_val.size(0) == w.size(0) and min_val.size(1) == 1\n",
+ "\n",
+ "    # Calculate the scale factor and zero point. (Formula 1 & 2)\n",
+ "    max_int = 2 ** n_bit - 1\n",
+ "    scales = (max_val - min_val).clamp(min=1e-5) / max_int\n",
+ "    assert scales.shape == max_val.shape\n",
+ "    zeros = (-torch.round(min_val / scales)).clamp_(0, max_int)\n",
+ "    assert scales.shape == min_val.shape\n",
+ "\n",
+ "    assert torch.isnan(scales).sum() == 0\n",
+ "    assert torch.isnan(w).sum() == 0\n",
+ "\n",
+ "    # Quantize W: Map values in the range [\\beta, \\alpha] to lie within [0, 2^b - 1] (Formula 3)\n",
+ "    w = torch.clamp(torch.round(w / scales) + zeros, 0, max_int)\n",
+ "    assert w.dim() == 2 and w.size(0) == scales.size(0) and w.size(1) == q_group_size\n",
+ "\n",
+ "    # Dequantize W (pseudo quantization, the inverse transformation of Formula 3)\n",
+ "    w = (w - zeros) * scales\n",
+ "    assert w.dim() == 2 and w.size(0) == scales.size(0) and w.size(1) == q_group_size\n",
+ "\n",
+ "    assert torch.isnan(w).sum() == 0\n",
+ "\n",
+ "    w = w.reshape(org_w_shape)\n",
+ "    return w\n",
+ "\n",
+ "@torch.no_grad()\n",
+ "def pseudo_quantize_model_weight(\n",
+ "    model, w_bit, q_group_size,\n",
+ "):\n",
+ "    for n, m in model.named_modules():\n",
+ "        if isinstance(m, nn.Linear):\n",
+ "            m.weight.data = pseudo_quantize_tensor(m.weight.data, n_bit=w_bit, q_group_size=q_group_size)\n",
+ "    \n",
+ "    \n",
+ "    \n",
+ "    \n",
+ "# Load the tokenizer and model\n",
+ "model_path = \"facebook/opt-125m\"\n",
+ "offload_folder = \"offload\"\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)\n",
+ "model = AutoModelForCausalLM.from_pretrained(model_path, device_map=\"auto\", offload_folder=offload_folder)\n",
+ "model_q = AutoModelForCausalLM.from_pretrained(model_path, device_map=\"auto\",offload_folder=offload_folder)\n",
+ "pseudo_quantize_model_weight(model_q, w_bit=3, q_group_size=128)\n",
+ "# Define a function for model inference\n",
+ "\n",
+ "\n",
+ "quantized_model_path = \"facebook/opt-125m_3bit\"\n",
+ "print(\"saving model\")\n",
+ "model_q.save_pretrained(quantized_model_path)\n",
+ "tokenizer.save_pretrained(quantized_model_path)\n",
+ "print(\"Model Saved\")\n",
+ "'''\n",
+ "generator = pipeline('text-generation', model=\"facebook/opt-1.3b\")\n",
+ "\n",
+ "def generate_text_pip(prompt):\n",
+ "    generated_text = generator(prompt, max_length=1000, num_return_sequences=1)[0]['generated_text']\n",
+ "    return generated_text\n",
+ "print(generator(\"I went to boston and\"))\n",
+ "\n",
+ "def generate_text(prompt):\n",
+ "    inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
+ "    output = model(**inputs)\n",
+ "    logits = output.logits\n",
+ "    predicted_ids = logits.argmax(-1)\n",
+ "    generated_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)\n",
+ "    return generated_text\n",
+ "\n",
+ "def generate_text_from_quantized(prompt):\n",
+ "    inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
+ "    output = model_q(**inputs)\n",
+ "    logits = output.logits\n",
+ "    predicted_ids = logits.argmax(-1)\n",
+ "    generated_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)\n",
+ "    return generated_text\n",
+ "\n",
+ "# Create a Gradio interface\n",
+ "iface = gr.Interface(fn=generate_text_pip, inputs=\"text\", outputs=\"text\", live=True)\n",
+ "\n",
+ "iface_2 = gr.Interface(fn=generate_text_from_quantized, inputs=\"text\", outputs=\"text\", live=True)\n",
+ "\n",
+ "\n",
+ "app = gr.TabbedInterface([iface, iface_2],[\"Normal\", \"Quantized\"])\n",
+ "\n",
+ "# Launch the Gradio app\n",
+ "app.launch()\n",
+ "'''"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "5b3171de-b63f-4f5e-8347-c4a5da79c397",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "quantized model [{'generated_text': 'I went to boston and was hoping for a good time for a good time for a good time'}]\n"
+ ]
+ }
+ ],
+ "source": [
+ "model_q_path = \"facebook/opt-125m_3bit\"\n",
+ "\n",
+ "\n",
+ "generator_q = pipeline('text-generation', model=model_q_path)\n",
+ "print(\"quantized model\",generator_q(\"I went to boston and\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8e969157-5261-4245-a6d0-d394e971b347",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
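
Because the quantization is only simulated (weights are rounded and then dequantized back to floating point before saving), the checkpoint written to facebook/opt-125m_3bit loads like any ordinary Transformers model, which is why the pipeline call above works unchanged; the visible effect of the 3-bit, group-size-128 setting is that every 128-weight group of a Linear layer contains at most 2^3 = 8 distinct values. A small verification sketch, assuming the saving cell above has already been run so the local folder exists:

    import torch
    from torch import nn
    from transformers import AutoModelForCausalLM

    # load the pseudo-quantized checkpoint saved by the first cell
    model_q = AutoModelForCausalLM.from_pretrained("facebook/opt-125m_3bit")
    # take the first Linear layer and inspect one 128-weight quantization group
    linear = next(m for m in model_q.modules() if isinstance(m, nn.Linear))
    group = linear.weight.data.reshape(-1, 128)[0]
    print("distinct values in the first group:", torch.unique(group).numel())  # expected to be at most 8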