import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import gradio as gr

base_model_id = 'mistralai/Mistral-7B-Instruct-v0.1'
PEFT_MODEL = 'johnstrenio/mistral_ski'

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load the model in 4-bit precision
    bnb_4bit_quant_type="nf4",              # quantize weights in 4-bit NF4 format
    bnb_4bit_use_double_quant=True,         # double quantization, as in the QLoRA paper
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute in bfloat16 during the forward pass
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,                   # Mistral base model
    quantization_config=bnb_config,  # same quantization config used for fine-tuning
    device_map="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Attach the LoRA adapter on top of the quantized base model
model = PeftModel.from_pretrained(base_model, PEFT_MODEL)
model.eval()


def predict(text):
    # Wrap the user input in Mistral's instruction format
    prompt = "[INST] " + text + " [/INST]"
    model_input = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        output_ids = model.generate(
            **model_input,
            max_new_tokens=200,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.15,
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


demo = gr.Interface(
    fn=predict,
    inputs='text',
    outputs='text',
)
demo.launch()
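
# Optional sanity check (not part of the original script): call predict() directly
# to confirm the adapter loads and the model generates before relying on the Gradio UI.
# The prompt below is only an illustrative example; uncomment to run it, e.g. before
# demo.launch() so it executes without the server blocking.
# print(predict("What wax should I use for cold, dry snow?"))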