In [1]:
# !pip3 install https://github.com/casper-hansen/AutoAWQ/releases/download/v0.1.8/autoawq-0.1.8+cu118-cp310-cp310-linux_x86_64.whl

In [2]:
from awq import AutoAWQForCausalLM
from transformers import AutoConfig, AwqConfig, AutoTokenizer, AutoModelForCausalLM
import torch

model_path = 'mesolitica/malaysian-tinyllama-1.1b-16k-instructions-rag'

In [3]:
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype = torch.bfloat16)

In [4]:
!rm -rf test

In [5]:
model.save_pretrained('./test', safe_serialization = False)

In [6]:
model = AutoAWQForCausalLM.from_pretrained('./test')

In [7]:
quant_path = 'malaysian-tinyllama-1.1b-16k-instructions-rag-awq'
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model.quantize(tokenizer, quant_config=quant_config, calib_data = 'mesolitica/malaysian-calibration')

tokenizer_config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

AWQ: 100%|██████████| 22/22 [02:20<00:00,  6.37s/it]


In [8]:
model.save_quantized(quant_path, safetensors = False)
tokenizer.save_pretrained(quant_path)



('malaysian-tinyllama-1.1b-16k-instructions-rag-awq/tokenizer_config.json',
 'malaysian-tinyllama-1.1b-16k-instructions-rag-awq/special_tokens_map.json',
 'malaysian-tinyllama-1.1b-16k-instructions-rag-awq/tokenizer.json')

In [9]:
tokenizer.push_to_hub('mesolitica/malaysian-tinyllama-1.1b-16k-instructions-rag-AWQ')

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-1.1b-16k-instructions-rag-AWQ/commit/6f906fb7dab784cdf9de4b625578fa9dd25d5c7d', commit_message='Upload tokenizer', commit_description='', oid='6f906fb7dab784cdf9de4b625578fa9dd25d5c7d', pr_url=None, pr_revision=None, pr_num=None)

In [10]:
quantization_config = AwqConfig(
    bits=quant_config['w_bit'],
    group_size=quant_config['q_group_size'],
    zero_point=quant_config['zero_point'],
    backend='autoawq',
    version=quant_config['version'].lower(),
)

config = AutoConfig.from_pretrained(model_path)
config.quantization_config = quantization_config

config.push_to_hub('mesolitica/malaysian-tinyllama-1.1b-16k-instructions-rag-AWQ')

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-1.1b-16k-instructions-rag-AWQ/commit/41fbcd7dd92f45fe71a5bccc3a48cff27979b59d', commit_message='Upload config', commit_description='', oid='41fbcd7dd92f45fe71a5bccc3a48cff27979b59d', pr_url=None, pr_revision=None, pr_num=None)

In [11]:
!ls malaysian-tinyllama-1.1b-16k-instructions-rag-awq

config.json		quant_config.json	 tokenizer_config.json
generation_config.json	special_tokens_map.json
pytorch_model.bin	tokenizer.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
from huggingface_hub import HfApi

api = HfApi()

In [13]:
api.upload_file(
    path_or_fileobj='malaysian-tinyllama-1.1b-16k-instructions-rag-awq/pytorch_model.bin',
    path_in_repo="pytorch_model.bin",
    repo_id='mesolitica/malaysian-tinyllama-1.1b-16k-instructions-rag-AWQ',
    repo_type="model",
)

pytorch_model.bin:   0%|          | 0.00/766M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-1.1b-16k-instructions-rag-AWQ/commit/40801f6263ebf6127720f9d5bf6037557c565af2', commit_message='Upload pytorch_model.bin with huggingface_hub', commit_description='', oid='40801f6263ebf6127720f9d5bf6037557c565af2', pr_url=None, pr_revision=None, pr_num=None)

In [14]:
api.upload_file(
    path_or_fileobj='malaysian-tinyllama-1.1b-16k-instructions-rag-awq/quant_config.json',
    path_in_repo="quant_config.json",
    repo_id='mesolitica/malaysian-tinyllama-1.1b-16k-instructions-rag-AWQ',
    repo_type="model",
)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-1.1b-16k-instructions-rag-AWQ/commit/021f56454c537c0658279317a6157218a3d04479', commit_message='Upload quant_config.json with huggingface_hub', commit_description='', oid='021f56454c537c0658279317a6157218a3d04479', pr_url=None, pr_revision=None, pr_num=None)

In [15]:
quantized_model = AutoModelForCausalLM.from_pretrained('mesolitica/malaysian-tinyllama-1.1b-16k-instructions-rag-AWQ')
_ = quantized_model.cuda()

config.json:   0%|          | 0.00/966 [00:00<?, ?B/s]

You have loaded an AWQ model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model.


pytorch_model.bin:   0%|          | 0.00/766M [00:00<?, ?B/s]

In [18]:
messages = [
    {'role': 'user', 'content': 'KWSP tu apa'}
]
prompt = tokenizer.apply_chat_template(messages, tokenize = False)
inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')

In [19]:
%%time

generate_kwargs = dict(
    inputs,
    max_new_tokens=1024,
    top_p=0.95,
    top_k=50,
    temperature=0.9,
    do_sample=True,
    num_beams=1,
)
r = quantized_model.generate(**generate_kwargs)
tokenizer.decode(r[0])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


CPU times: user 5.65 s, sys: 0 ns, total: 5.65 s
Wall time: 5.65 s


'<s> [INST] KWSP tu apa [/INST]KWSP atau Skim Simpanan Pendidikan Nasional ialah skim yang ditubuhkan oleh kerajaan Malaysia untuk membantu ibu bapa membiayai pendidikan anak-anak mereka. Skim ini membolehkan individu menyumbang sejumlah wang daripada pendapatan mereka kepada akaun KWSP untuk digunakan sebagai dana pendidikan pada masa hadapan. Dengan menyumbang kepada KWSP, ibu bapa boleh menggunakan dana tersebut untuk membiayai yuran pendidikan, kos buku dan peralatan sekolah, yuran ujian, dan perbelanjaan lain yang berkaitan dengan pendidikan anak-anak mereka. Skim ini adalah inisiatif yang membantu ibu bapa untuk menyediakan persediaan kewangan yang diperlukan untuk pendidikan anak-anak mereka tanpa perlu mengeluarkan wang dari poket sendiri.</s>'