---
license: apache-2.0
language:
- th
library_name: transformers
pipeline_tag: text-generation
---

# Summary

A 4-bit quantization of [scb10x/typhoon-7b](https://huggingface.co/scb10x/typhoon-7b) that requires less than 8 GB of VRAM.

# Steps to reproduce

```python
# init parameters
model_name: str = 'scb10x/typhoon-7b'
quantization_mode: str = 'q4-bnb_cuda'  # possible values = {'q4-bnb_cuda', 'q8-bnb_cuda', 'q4-torch_ptdq', 'q8-torch_ptdq'}

# load tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = tokenizer.eos_token_id
print(tokenizer)  # LlamaTokenizerFast

# load model
import torch
from transformers import AutoModelForCausalLM
if quantization_mode == 'q4-bnb_cuda':
    # an Ampere GPU with 8 GB VRAM plus ~20 GB system RAM is recommended
    print('4-bit bitsandbytes quantization with CUDA')
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_4bit=True,
        device_map='auto',
        torch_dtype=torch.bfloat16)
elif quantization_mode == 'q8-bnb_cuda':
    # an Ampere GPU with 12 GB VRAM plus ~20 GB system RAM is recommended
    print('8-bit bitsandbytes quantization with CUDA')
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_8bit=True,
        device_map='auto',
        torch_dtype=torch.bfloat16)
elif quantization_mode == 'q4-torch_ptdq':
    # a CPU with 64 GB+ RAM is recommended
    print('4-bit (quint4x2) post-training dynamic quantization')
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32)
    model = torch.quantization.quantize_dynamic(base_model, dtype=torch.quint4x2)
elif quantization_mode == 'q8-torch_ptdq':
    # a CPU with 64 GB+ RAM is recommended
    print('8-bit post-training dynamic quantization')
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32)
    model = torch.quantization.quantize_dynamic(base_model, dtype=torch.quint8)
else:
    print('default model')
    model = AutoModelForCausalLM.from_pretrained(model_name)
print(model)  # MistralForCausalLM

# text generator
from transformers import GenerationConfig, TextGenerationPipeline
config = GenerationConfig.from_pretrained(model_name)
config.num_return_sequences: int = 1
config.do_sample: bool = True
config.max_new_tokens: int = 128
config.temperature: float = 0.7
config.top_p: float = 0.95
config.repetition_penalty: float = 1.3
generator = TextGenerationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    generation_config=config)

# sample prompt (Thai: "What is the meaning of life?")
sample: str = 'ความหมายของชีวิตคืออะไร?\n'
output = generator(sample, pad_token_id=tokenizer.eos_token_id)
print(output[0]['generated_text'])
```

# `requirements.txt`

```txt
torch==2.1.2
accelerate==0.25.0
bitsandbytes==0.41.3
#transformers==4.37.0.dev0
transformers @ git+https://github.com/huggingface/transformers
```
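# Note on newer `transformers` versions

Newer `transformers` releases prefer passing a `BitsAndBytesConfig` to `from_pretrained` instead of the bare `load_in_4bit`/`load_in_8bit` keyword arguments used above. A minimal sketch of the equivalent 4-bit load, shown as an assumption about the newer API rather than a tested recipe for this model:

```python
# Minimal sketch (assumption): the same 4-bit bitsandbytes load expressed with
# BitsAndBytesConfig, which newer transformers releases prefer.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # 4-bit weights via bitsandbytes
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute in bf16, matching the example above
)
model = AutoModelForCausalLM.from_pretrained(
    'scb10x/typhoon-7b',
    quantization_config=bnb_config,
    device_map='auto',
)
```

The tokenizer and `TextGenerationPipeline` setup from the steps above should work unchanged with a model loaded this way.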