# Load a 4-bit quantized base causal-LM and attach a lazy-lora adapter to it,
# printing the parameter count before and after the adapter is added.

import os
import sys

# TODO: point this at the lazy-lora source tree, or install it from source
# instead:
#   git clone git@github.com:Xianchao-Wu/peft.git
#   cd peft
#   python setup.py install
# The path is registered before the `peft` import below so that import can
# find this source tree.
sys.path.insert(1, '/workspace/asr/peft/src')

import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# TODO: set this to the directory where the llama1-33b
# (huggyllama/llama-30b) weights are stored, or should be stored.
cache_dir = "/workspace/asr/peft/qlora"

# Directory expected to contain 'adapter_config.json' and 'adapter_model.bin';
# the script uses the current working directory.
lazylora_dir = os.getcwd()

# The adapter config supplies the name/path of the base model to load.
config = PeftConfig.from_pretrained(lazylora_dir)
base_model_id = config.base_model_name_or_path

# Keyword arguments passed to both the tokenizer and the model loader.
hub_kwargs = {
    "cache_dir": cache_dir,
    "use_auth_token": True,
}

tokenizer = AutoTokenizer.from_pretrained(base_model_id, **hub_kwargs)

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    **hub_kwargs,
)

# Parameter count of the quantized base model alone.
# NOTE: the original author recorded 16,477,866,496 here, described as
# roughly half of 33B because of 4-bit loading.
base_param_count = sum(p.numel() for p in model.parameters())
print(base_param_count)

# Wrap the base model with the lazy-lora adapter read from lazylora_dir.
model = PeftModel.from_pretrained(model, lazylora_dir)

print('after adding lazy lora parameters:')
# NOTE: output recorded by the original author:
# trainable params: 0 || all params: 16,965,645,824 || trainable%: 0.0
model.print_trainable_parameters()