import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

orig_checkpoint = 'google/gemma-2b'
checkpoint = '.'  # directory holding the trained QLoRA adapter
HF_TOKEN = ''     # your Hugging Face access token
PROMPT = 'Salut, ca sa imi schimb buletinul pot sa'  # Romanian: "Hi, to change my ID card can I"
seq_len = 256

# load the original base model and tokenizer first
tokenizer = AutoTokenizer.from_pretrained(orig_checkpoint, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(orig_checkpoint, token=HF_TOKEN)

# then attach the trained QLoRA adapter and merge its weights into the base model;
# merge_and_unload() returns the merged model, so the result must be reassigned
model = PeftModel.from_pretrained(model, checkpoint)
model = model.merge_and_unload()
model = model.cuda()

# generate normally
inputs = tokenizer.encode(PROMPT, return_tensors="pt").cuda()
outputs = model.generate(inputs, max_new_tokens=seq_len)
print(tokenizer.decode(outputs[0]))
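
# --- Alternative: run the adapter on a 4-bit base model instead of merging ---
# A minimal sketch, assuming bitsandbytes and accelerate are installed.
# merge_and_unload() needs full-precision base weights; loading the base in
# 4 bits (the same quantization QLoRA trains against) and keeping the adapter
# attached skips the merge and cuts memory use, at a small inference cost.
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",             # NF4, the quantization used by QLoRA
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute dtype for the 4-bit layers
)
base_4bit = AutoModelForCausalLM.from_pretrained(
    orig_checkpoint,
    token=HF_TOKEN,
    quantization_config=bnb_config,
    device_map="auto",  # requires accelerate; places the model on the available GPU
)
model_4bit = PeftModel.from_pretrained(base_4bit, checkpoint)

inputs = tokenizer.encode(PROMPT, return_tensors="pt").to(base_4bit.device)
outputs = model_4bit.generate(inputs, max_new_tokens=seq_len)
print(tokenizer.decode(outputs[0]))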