import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline

# Fine-tuned adapter checkpoint and its matching tokenizer on the Hugging Face Hub.
model_name = "Ayush28/Llama-2-7b"
model_token = "Ayush28/Llama-2-tokenizer"

# Load the base model plus the trained adapter with 4-bit NF4 quantization,
# computing in bfloat16 and offloading any weights that do not fit on the GPU.
trained_model = AutoPeftModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    ),
    torch_dtype=torch.bfloat16,
    offload_folder="offload/",
)
tokenizer = AutoTokenizer.from_pretrained(model_token)

# Wrap the quantized model in a text-generation pipeline and format the
# prompt with the "###Instruction:" prefix before generating.
prompt = "I purchased a defective product from a store, and the store is refusing to replace or refund it. What do I do?"
pipe = pipeline(task="text-generation", model=trained_model, tokenizer=tokenizer, max_length=1024)
result = pipe(f"###Instruction:{prompt}")
print(result[0]["generated_text"])
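
# A minimal sketch (not part of the original script): the same generation can
# be run without the pipeline helper by calling generate() directly, which
# gives finer control over decoding. max_new_tokens=512 is an assumed value.
inputs = tokenizer(f"###Instruction:{prompt}", return_tensors="pt").to(trained_model.device)
with torch.no_grad():
    output_ids = trained_model.generate(**inputs, max_new_tokens=512)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))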