{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "da47e672",
"metadata": {},
"outputs": [],
"source": [
"# !pip3 install https://github.com/casper-hansen/AutoAWQ/releases/download/v0.1.6/autoawq-0.1.6+cu118-cp310-cp310-linux_x86_64.whl"
]
},
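  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f1a2b4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The wheel above targets CUDA 11.8 / Python 3.10 on Linux x86-64; on other\n",
    "# setups the PyPI package should work instead (an assumption, untested here):\n",
    "# !pip3 install autoawq"
   ]
  },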
{
"cell_type": "code",
"execution_count": 2,
"id": "27063032",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tue Nov 7 13:34:15 2023 \r\n",
"+-----------------------------------------------------------------------------+\r\n",
"| NVIDIA-SMI 525.85.12 Driver Version: 525.85.12 CUDA Version: 12.0 |\r\n",
"|-------------------------------+----------------------+----------------------+\r\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n",
"| | | MIG M. |\r\n",
"|===============================+======================+======================|\r\n",
"| 0 NVIDIA A100 80G... On | 00000001:00:00.0 Off | 0 |\r\n",
"| N/A 37C P0 65W / 300W | 5536MiB / 81920MiB | 0% Default |\r\n",
"| | | Disabled |\r\n",
"+-------------------------------+----------------------+----------------------+\r\n",
" \r\n",
"+-----------------------------------------------------------------------------+\r\n",
"| Processes: |\r\n",
"| GPU GI CI PID Type Process name GPU Memory |\r\n",
"| ID ID Usage |\r\n",
"|=============================================================================|\r\n",
"+-----------------------------------------------------------------------------+\r\n"
]
}
],
"source": [
"!nvidia-smi"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "1bde5916",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from awq import AutoAWQForCausalLM\n",
"from transformers import AutoConfig, AwqConfig, AutoTokenizer, AutoModelForCausalLM\n",
"import torch\n",
"\n",
"model_path = 'mesolitica/malaysian-mistral-7b-32k-instructions'"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c658280e",
"metadata": {},
"outputs": [],
"source": [
"# !pip3 install transformers==4.35.0"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "838ddb85",
"metadata": {},
"outputs": [],
"source": [
"# model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype = torch.bfloat16)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "637b41e1",
"metadata": {},
"outputs": [],
"source": [
"# model.save_pretrained('./test', safe_serialization = False)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "417dbbf5",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "61e83560a1344a4593dd8e2d806992c4",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/3 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"model = AutoAWQForCausalLM.from_pretrained('./test')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "212056b5",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d9fe43e4b9644a29ae5763a62f5cb1d3",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files: 0%| | 0/1 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "8950b7ab9bcf4119bb65f235a2bd0b63",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data: 0%| | 0.00/351M [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7406dbb6a9434d31bb0eb46f9c98ff58",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Extracting data files: 0%| | 0/1 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c3864d9070be43ecb470df927ed9bc30",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating train split: 0 examples [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"AWQ: 100%|██████████| 32/32 [09:45<00:00, 18.28s/it]\n"
]
}
],
"source": [
"quant_path = 'malaysian-mistral-7b-32k-instructions-awq'\n",
"quant_config = { \"zero_point\": True, \"q_group_size\": 128, \"w_bit\": 4, \"version\": \"GEMM\" }\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)\n",
"model.quantize(tokenizer, quant_config=quant_config, calib_data = 'mesolitica/malaysian-calibration')"
]
},
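  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c4e9a1d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Back-of-envelope sketch (not part of the original run): expected weight memory\n",
    "# under the quant_config above. Each group of 128 weights stores roughly one\n",
    "# fp16 scale and one zero-point on top of the packed 4-bit weights.\n",
    "params = 7.24e9  # approximate parameter count of Mistral-7B\n",
    "fp16_gb = params * 2 / 1024 ** 3    # 2 bytes per weight\n",
    "int4_gb = params * 0.5 / 1024 ** 3  # 4 bits per weight\n",
    "overhead_gb = params / 128 * 4 / 1024 ** 3  # ~2 fp16 values per 128-weight group\n",
    "print(f'fp16 weights : {fp16_gb:.2f} GB')\n",
    "print(f'4-bit weights: {int4_gb + overhead_gb:.2f} GB')"
   ]
  },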
{
"cell_type": "code",
"execution_count": 31,
"id": "77e03f18",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:`quant_config.json` is being deprecated in the future in favor of quantization_config in config.json.\n"
]
},
{
"data": {
"text/plain": [
"('malaysian-mistral-7b-32k-instructions-awq/tokenizer_config.json',\n",
" 'malaysian-mistral-7b-32k-instructions-awq/special_tokens_map.json',\n",
" 'malaysian-mistral-7b-32k-instructions-awq/tokenizer.model',\n",
" 'malaysian-mistral-7b-32k-instructions-awq/added_tokens.json',\n",
" 'malaysian-mistral-7b-32k-instructions-awq/tokenizer.json')"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.save_quantized(quant_path, safetensors = False)\n",
"tokenizer.save_pretrained(quant_path)"
]
},
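  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5d8f2e6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check (sketch): list what save_quantized and save_pretrained\n",
    "# wrote into quant_path before uploading anything.\n",
    "import os\n",
    "\n",
    "for name in sorted(os.listdir(quant_path)):\n",
    "    size_mb = os.path.getsize(os.path.join(quant_path, name)) / 1024 ** 2\n",
    "    print(f'{name:40s} {size_mb:10.1f} MB')"
   ]
  },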
{
"cell_type": "code",
"execution_count": 18,
"id": "fd35b057",
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c9e54720c2f44b8b8a769ddc69bb6c82",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer.model: 0%| | 0.00/493k [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions-AWQ/commit/f7c5657f4c023ba9ca5c04760856998e10f06875', commit_message='Upload tokenizer', commit_description='', oid='f7c5657f4c023ba9ca5c04760856998e10f06875', pr_url=None, pr_revision=None, pr_num=None)"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.push_to_hub('mesolitica/malaysian-mistral-7b-32k-instructions-AWQ')"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "c0d284f6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"transformers.utils.quantization_config.AwqConfig"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"AwqConfig()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "816dacc8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions-AWQ/commit/ef01ad16114f487387c426a41e172df2a2b94341', commit_message='Upload config', commit_description='', oid='ef01ad16114f487387c426a41e172df2a2b94341', pr_url=None, pr_revision=None, pr_num=None)"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"quantization_config = AwqConfig(\n",
" bits=quant_config['w_bit'],\n",
" group_size=quant_config['q_group_size'],\n",
" zero_point=quant_config['zero_point'],\n",
" backend='autoawq',\n",
" version=quant_config['version'].lower(),\n",
")\n",
"\n",
"config = AutoConfig.from_pretrained(model_path)\n",
"config.quantization_config = quantization_config\n",
"\n",
"config.push_to_hub('mesolitica/malaysian-mistral-7b-32k-instructions-AWQ')"
]
},
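  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b3c7f2a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: inspect the quantization_config section that the push above wrote\n",
    "# into config.json; to_dict() comes from transformers' QuantizationConfigMixin.\n",
    "config.quantization_config.to_dict()"
   ]
  },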
{
"cell_type": "code",
"execution_count": 19,
"id": "846835fa",
"metadata": {},
"outputs": [],
"source": [
"from huggingface_hub import HfApi\n",
"\n",
"api = HfApi()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "f8c2bef7",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9e48773d264248c589f9cae73965b579",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"pytorch_model.bin: 0%| | 0.00/4.15G [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions-AWQ/blob/main/pytorch_model.bin'"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"api.upload_file(\n",
" path_or_fileobj='malaysian-mistral-7b-32k-instructions-awq/pytorch_model.bin',\n",
" path_in_repo=\"pytorch_model.bin\",\n",
" repo_id='mesolitica/malaysian-mistral-7b-32k-instructions-AWQ',\n",
" repo_type=\"model\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "b6b0f30f",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "dca4e6f178ec4431b17c4924adb13563",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)lve/main/config.json: 0%| | 0.00/806 [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"You have loaded an AWQ model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model.\n"
]
}
],
"source": [
"quantized_model = AutoModelForCausalLM.from_pretrained('mesolitica/malaysian-mistral-7b-32k-instructions-AWQ')\n",
"_ = quantized_model.cuda()"
]
},
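  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e6a8d4f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternative load (sketch, not run here): passing device_map places the AWQ\n",
    "# weights straight on the GPU and avoids the CPU warning printed above.\n",
    "# quantized_model = AutoModelForCausalLM.from_pretrained(\n",
    "#     'mesolitica/malaysian-mistral-7b-32k-instructions-AWQ',\n",
    "#     device_map='cuda',\n",
    "# )"
   ]
  },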
{
"cell_type": "code",
"execution_count": 46,
"id": "698cd4c9",
"metadata": {},
"outputs": [],
"source": [
"def parse_mistral_chat(messages):\n",
"\n",
" user_query = messages[-1]['content']\n",
"\n",
" users, assistants = [], []\n",
" for q in messages[:-1]:\n",
" if q['role'] == 'user':\n",
" users.append(q['content'])\n",
" elif q['role'] == 'assistant':\n",
" assistants.append(q['content'])\n",
"\n",
" texts = ['']\n",
" for u, a in zip(users, assistants):\n",
" texts.append(f'[INST] {u.strip()} [/INST]{a.strip()} ')\n",
"\n",
" texts.append(f'[INST] {user_query.strip()} [/INST]')\n",
" prompt = ''.join(texts).strip()\n",
" return prompt"
]
},
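  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a7b1c9e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick check of the prompt builder on a made-up two-turn history: completed\n",
    "# turns are wrapped in [INST] ... [/INST], the pending query ends the prompt.\n",
    "example = [\n",
    "    {'role': 'user', 'content': 'hello'},\n",
    "    {'role': 'assistant', 'content': 'hai, apa khabar?'},\n",
    "    {'role': 'user', 'content': 'kwsp tu apa'},\n",
    "]\n",
    "print(parse_mistral_chat(example))"
   ]
  },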
{
"cell_type": "code",
"execution_count": 47,
"id": "63315893",
"metadata": {},
"outputs": [],
"source": [
"messages = [\n",
" {'role': 'user', 'content': 'kwsp tu apa'}\n",
"]\n",
"prompt = parse_mistral_chat(messages)\n",
"inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "8a3c15d8",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2.67 s, sys: 0 ns, total: 2.67 s\n",
"Wall time: 2.67 s\n"
]
},
{
"data": {
"text/plain": [
"' [INST] kwsp tu apa [/INST]kwsp merujuk kepada Kumpulan Wang Simpanan Pekerja, iaitu sebuah organisasi simpanan persaraan yang ditubuhkan oleh kerajaan Malaysia untuk melindungi dan menyediakan simpanan untuk pekerja-pekerja sektor swasta pada akhir penggajian mereka.'"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"\n",
"generate_kwargs = dict(\n",
" inputs,\n",
" max_new_tokens=1024,\n",
" top_p=0.95,\n",
" top_k=50,\n",
" temperature=0.9,\n",
" do_sample=True,\n",
" num_beams=1,\n",
")\n",
"r = quantized_model.generate(**generate_kwargs)\n",
"tokenizer.decode(r[0])"
]
},
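  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f5d3b7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Streaming variant (sketch): transformers' TextStreamer prints tokens as they\n",
    "# are generated, which is handy with large max_new_tokens.\n",
    "from transformers import TextStreamer\n",
    "\n",
    "streamer = TextStreamer(tokenizer, skip_prompt=True)\n",
    "_ = quantized_model.generate(**generate_kwargs, streamer=streamer)"
   ]
  }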
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}