mesolitica
/

malaysian-tinyllama-1.1b-16k-instructions-rag-AWQ

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "19fe0df6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
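+    "# Uncomment to install the AutoAWQ v0.1.8 wheel (built for CUDA 11.8, CPython 3.10, Linux x86_64) used by this notebook:\n",
+    "# !pip3 install https://github.com/casper-hansen/AutoAWQ/releases/download/v0.1.8/autoawq-0.1.8+cu118-cp310-cp310-linux_x86_64.whl"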
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "20861f3e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from awq import AutoAWQForCausalLM\n",
+    "from huggingface_hub import HfApi\n",
+    "from transformers import AutoConfig, AwqConfig, AutoTokenizer, AutoModelForCausalLM\n",
+    "\n",
+    "# Hugging Face Hub id of the source checkpoint that the cells below load and quantize.\n",
+    "model_path = 'mesolitica/malaysian-tinyllama-1.1b-16k-instructions-rag'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "9939ad4e",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Load the source checkpoint with bfloat16 weights.\n",
+    "model = AutoModelForCausalLM.from_pretrained(\n",
+    "    model_path,\n",
+    "    torch_dtype=torch.bfloat16,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "fdb86f50",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Remove any previous local export so the next save writes into a fresh directory.\n",
+    "!rm -rf ./test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "72e76288",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Export the bfloat16 model to ./test; safe_serialization=False opts out of the safetensors format.\n",
+    "model.save_pretrained(\n",
+    "    './test',\n",
+    "    safe_serialization=False,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "aa245150",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reload the local export through AutoAWQ; from here on `model` is the AutoAWQ object, not the transformers one.\n",
+    "model = AutoAWQForCausalLM.from_pretrained(\n",
+    "    './test'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "d3949cf4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "cd5f09268d5848dfa914559c17517de4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer_config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ae1ea0a4878e4fffa3a7a79de8bab6cb",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2366fa88ea66490da15cb43a70958b87",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "AWQ: 100%|██████████| 22/22 [02:20<00:00,  6.37s/it]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Local output directory for the quantized model and tokenizer.\n",
+    "quant_path = 'malaysian-tinyllama-1.1b-16k-instructions-rag-awq'\n",
+    "# AutoAWQ settings: 4-bit weights, group size 128, zero-point enabled, GEMM version.\n",
+    "quant_config = {'zero_point': True, 'q_group_size': 128, 'w_bit': 4, 'version': 'GEMM'}\n",
+    "\n",
+    "# trust_remote_code is intentionally not passed: only standard JSON tokenizer files are\n",
+    "# fetched for this repository, so there is no reason to allow Hub-hosted code to execute.\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_path)\n",
+    "# Run AWQ calibration/quantization on `model` using the mesolitica/malaysian-calibration dataset.\n",
+    "model.quantize(tokenizer, quant_config=quant_config, calib_data='mesolitica/malaysian-calibration')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "ee290c1e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:root:`quant_config.json` is being deprecated in the future in favor of quantization_config in config.json.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "('malaysian-tinyllama-1.1b-16k-instructions-rag-awq/tokenizer_config.json',\n",
+       " 'malaysian-tinyllama-1.1b-16k-instructions-rag-awq/special_tokens_map.json',\n",
+       " 'malaysian-tinyllama-1.1b-16k-instructions-rag-awq/tokenizer.json')"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Write the quantized weights (safetensors disabled) into quant_path.\n",
+    "model.save_quantized(quant_path, safetensors=False)\n",
+    "# Store the tokenizer next to the weights; as the last expression this displays the written file paths.\n",
+    "tokenizer.save_pretrained(quant_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "737f2403",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-1.1b-16k-instructions-rag-AWQ/commit/6f906fb7dab784cdf9de4b625578fa9dd25d5c7d', commit_message='Upload tokenizer', commit_description='', oid='6f906fb7dab784cdf9de4b625578fa9dd25d5c7d', pr_url=None, pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Upload the tokenizer files to the Hub repository that hosts the quantized model.\n",
+    "tokenizer.push_to_hub(\n",
+    "    'mesolitica/malaysian-tinyllama-1.1b-16k-instructions-rag-AWQ'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "ed92c8ee",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-1.1b-16k-instructions-rag-AWQ/commit/41fbcd7dd92f45fe71a5bccc3a48cff27979b59d', commit_message='Upload config', commit_description='', oid='41fbcd7dd92f45fe71a5bccc3a48cff27979b59d', pr_url=None, pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Start from the source model's config so the architecture fields are unchanged.\n",
+    "config = AutoConfig.from_pretrained(model_path)\n",
+    "\n",
+    "# Express the AutoAWQ settings as a transformers AwqConfig (version string lower-cased).\n",
+    "quantization_config = AwqConfig(\n",
+    "    backend='autoawq',\n",
+    "    version=quant_config['version'].lower(),\n",
+    "    bits=quant_config['w_bit'],\n",
+    "    group_size=quant_config['q_group_size'],\n",
+    "    zero_point=quant_config['zero_point'],\n",
+    ")\n",
+    "config.quantization_config = quantization_config\n",
+    "\n",
+    "# Upload the config, now carrying the quantization settings, to the quantized model's repository.\n",
+    "config.push_to_hub(\n",
+    "    'mesolitica/malaysian-tinyllama-1.1b-16k-instructions-rag-AWQ'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "c74b2f45",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "config.json\t\tquant_config.json\t tokenizer_config.json\r\n",
+      "generation_config.json\tspecial_tokens_map.json\r\n",
+      "pytorch_model.bin\ttokenizer.json\r\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# List the files in the local quantized-model directory; {quant_path} is interpolated by IPython,\n",
+    "# so the listing always targets the directory that save_quantized wrote to.\n",
+    "!ls {quant_path}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "2e0fb591",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hub client used for the single-file uploads; HfApi is imported with the other imports in the first code cell.\n",
+    "api = HfApi()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "dd06cfa2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3d3700ff70fe467ea5dcf7e933c12d73",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "pytorch_model.bin:   0%|          | 0.00/766M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-1.1b-16k-instructions-rag-AWQ/commit/40801f6263ebf6127720f9d5bf6037557c565af2', commit_message='Upload pytorch_model.bin with huggingface_hub', commit_description='', oid='40801f6263ebf6127720f9d5bf6037557c565af2', pr_url=None, pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Upload the quantized weights. The local path is built from quant_path so it cannot\n",
+    "# drift from the directory that save_quantized actually wrote to.\n",
+    "api.upload_file(\n",
+    "    path_or_fileobj=f'{quant_path}/pytorch_model.bin',\n",
+    "    path_in_repo='pytorch_model.bin',\n",
+    "    repo_id='mesolitica/malaysian-tinyllama-1.1b-16k-instructions-rag-AWQ',\n",
+    "    repo_type='model',\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "1383ff2c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-1.1b-16k-instructions-rag-AWQ/commit/021f56454c537c0658279317a6157218a3d04479', commit_message='Upload quant_config.json with huggingface_hub', commit_description='', oid='021f56454c537c0658279317a6157218a3d04479', pr_url=None, pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Upload AutoAWQ's quant_config.json. The local path is built from quant_path so it cannot\n",
+    "# drift from the directory that save_quantized actually wrote to.\n",
+    "api.upload_file(\n",
+    "    path_or_fileobj=f'{quant_path}/quant_config.json',\n",
+    "    path_in_repo='quant_config.json',\n",
+    "    repo_id='mesolitica/malaysian-tinyllama-1.1b-16k-instructions-rag-AWQ',\n",
+    "    repo_type='model',\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "5852ec02",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a7b3555660f64d56857595133fe3550b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "config.json:   0%|          | 0.00/966 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "You have loaded an AWQ model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "dbcc6cdc52ac4c11b9facfc071dd0c40",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "pytorch_model.bin:   0%|          | 0.00/766M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Round-trip check: load the uploaded AWQ checkpoint from the Hub, then move it to the GPU.\n",
+    "quantized_model = AutoModelForCausalLM.from_pretrained(\n",
+    "    'mesolitica/malaysian-tinyllama-1.1b-16k-instructions-rag-AWQ'\n",
+    ").cuda()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "66895e20",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Single-turn chat used to smoke-test the quantized model.\n",
+    "messages = [{'role': 'user', 'content': 'KWSP tu apa'}]\n",
+    "\n",
+    "# Render the chat template to plain text, then tokenize it. add_special_tokens=False is used\n",
+    "# because the rendered prompt is expected to already carry the template's special tokens.\n",
+    "prompt = tokenizer.apply_chat_template(messages, tokenize=False)\n",
+    "inputs = tokenizer(\n",
+    "    [prompt],\n",
+    "    return_tensors='pt',\n",
+    "    add_special_tokens=False,\n",
+    ").to('cuda')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "4b320f33",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 5.65 s, sys: 0 ns, total: 5.65 s\n",
+      "Wall time: 5.65 s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'<s> [INST] KWSP tu apa [/INST]KWSP atau Skim Simpanan Pendidikan Nasional ialah skim yang ditubuhkan oleh kerajaan Malaysia untuk membantu ibu bapa membiayai pendidikan anak-anak mereka. Skim ini membolehkan individu menyumbang sejumlah wang daripada pendapatan mereka kepada akaun KWSP untuk digunakan sebagai dana pendidikan pada masa hadapan. Dengan menyumbang kepada KWSP, ibu bapa boleh menggunakan dana tersebut untuk membiayai yuran pendidikan, kos buku dan peralatan sekolah, yuran ujian, dan perbelanjaan lain yang berkaitan dengan pendidikan anak-anak mereka. Skim ini adalah inisiatif yang membantu ibu bapa untuk menyediakan persediaan kewangan yang diperlukan untuk pendidikan anak-anak mereka tanpa perlu mengeluarkan wang dari poket sendiri.</s>'"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "# Tokenized prompt tensors merged with the sampling settings for generate().\n",
+    "generate_kwargs = {\n",
+    "    **inputs,\n",
+    "    'max_new_tokens': 1024,\n",
+    "    'top_p': 0.95,\n",
+    "    'top_k': 50,\n",
+    "    'temperature': 0.9,\n",
+    "    'do_sample': True,\n",
+    "    'num_beams': 1,\n",
+    "}\n",
+    "r = quantized_model.generate(**generate_kwargs)\n",
+    "# Decode the first returned sequence, special tokens included.\n",
+    "tokenizer.decode(r[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a9a93555",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}