import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "./llama2-hf"
adapters_name = "./checkpoint-760/adapter_model"

print(f"Starting to load the model {model_name} into memory")

# Load the base model in bfloat16. Uncomment the options below to load the
# weights in 4-bit and/or pin the model to GPU 0 instead.
m = AutoModelForCausalLM.from_pretrained(
    model_name,
    # load_in_4bit=True,
    torch_dtype=torch.bfloat16,
    # device_map={"": 0},
)

# Attach the fine-tuned PEFT adapter from the training checkpoint, then fold
# its weights into the base model so the result is a standalone checkpoint
# with no adapter dependency.
m = PeftModel.from_pretrained(m, adapters_name)
m = m.merge_and_unload()

print(f"Successfully loaded the model {model_name} into memory")

m.save_pretrained("nyc-savvy")
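
# Optional follow-up (a minimal sketch, assuming the base checkpoint in
# ./llama2-hf ships its tokenizer files): save the tokenizer alongside the
# merged weights so "nyc-savvy" is a self-contained checkpoint, then run a
# short generation to sanity-check the merge. The prompt is illustrative.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained("nyc-savvy")

inputs = tokenizer("What neighborhood is the High Line in?", return_tensors="pt")
with torch.no_grad():
    outputs = m.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))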