|
---
license: apache-2.0
language:
- th
library_name: transformers
pipeline_tag: text-generation
---
|
|
|
# Summary |
|
|
|
A 4-bit quantization of [scb10x/typhoon-7b](https://huggingface.co/scb10x/typhoon-7b) that requires less than 8 GB of VRAM.
|
|
|
# Steps to reproduce |
|
```python
# init parameters
model_name: str = 'scb10x/typhoon-7b'
quantization_mode: str = 'q4-bnb_cuda'  # possible values = {'q4-bnb_cuda', 'q8-bnb_cuda', 'q4-torch_ptdq', 'q8-torch_ptdq'}

# load tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = tokenizer.eos_token_id  # no pad token is defined, so reuse EOS for padding
print(tokenizer)  # LlamaTokenizerFast

# load model
import torch
from transformers import AutoModelForCausalLM

if quantization_mode == 'q4-bnb_cuda':  # an Ampere GPU with 8 GB VRAM plus ~20 GB of CPU RAM is recommended
    print('4-bit bitsandbytes quantization with cuda')
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_4bit = True,
        device_map = 'auto',
        torch_dtype = torch.bfloat16)
elif quantization_mode == 'q8-bnb_cuda':  # an Ampere GPU with 12 GB VRAM plus ~20 GB of CPU RAM is recommended
    print('8-bit bitsandbytes quantization with cuda')
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_8bit = True,
        device_map = 'auto',
        torch_dtype = torch.bfloat16)
elif quantization_mode == 'q4-torch_ptdq':  # a CPU with 64 GB+ RAM is recommended
    print('4-bit x2 post-training dynamic quantization')
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype = torch.float32)
    # torch.quint4x2 packs two 4-bit values per byte; with this dtype the
    # default qconfig covers only embedding-type modules
    model = torch.quantization.quantize_dynamic(base_model, dtype = torch.quint4x2)
elif quantization_mode == 'q8-torch_ptdq':  # a CPU with 64 GB+ RAM is recommended
    print('8-bit post-training dynamic quantization')
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype = torch.float32)
    # note: with dtype=torch.quint8 the default qconfig also covers only
    # embedding modules; pass dtype=torch.qint8 to quantize the nn.Linear layers
    model = torch.quantization.quantize_dynamic(base_model, dtype = torch.quint8)
else:
    print('default model')
    model = AutoModelForCausalLM.from_pretrained(model_name)
print(model)  # MistralForCausalLM
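
# optional sanity check (not part of the original steps): report the in-memory
# model size via transformers' PreTrainedModel.get_memory_footprint; with the
# default 'q4-bnb_cuda' mode this should come out below the 8 GB stated above
print(f'memory footprint: {model.get_memory_footprint() / 1024 ** 3:.2f} GB')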
|
|
|
# text generator
from transformers import GenerationConfig, TextGenerationPipeline

config = GenerationConfig.from_pretrained(model_name)
config.num_return_sequences = 1
config.do_sample = True
config.max_new_tokens = 128
config.temperature = 0.7
config.top_p = 0.95
config.repetition_penalty = 1.3
generator = TextGenerationPipeline(
    model = model,
    tokenizer = tokenizer,
    return_full_text = True,
    generation_config = config)

# sample
sample: str = 'ความหมายของชีวิตคืออะไร?\n'  # 'What is the meaning of life?'
output = generator(sample, pad_token_id = tokenizer.eos_token_id)
print(output[0]['generated_text'])
```
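
Recent `transformers` releases prefer an explicit `BitsAndBytesConfig` over the bare `load_in_4bit`/`load_in_8bit` flags used above. Below is a minimal sketch of an equivalent 4-bit load (same `model_name` as above; assumes a CUDA build of `bitsandbytes` is installed):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# equivalent of the 'q4-bnb_cuda' branch, expressed as an explicit config
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_compute_dtype = torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    device_map = 'auto')
```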
|
|
|
# `requirements.txt`
|
```txt
torch==2.1.2
accelerate==0.25.0
bitsandbytes==0.41.3
#transformers==4.37.0.dev0
transformers @ git+https://github.com/huggingface/transformers
```
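
Install the dependencies with `pip install -r requirements.txt`. The `git+` line pins `transformers` to a development build; the commented line records the version it resolved to at the time of writing.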