import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Note: the `peft` package must be installed for model.load_adapter() below.
# (Training-only imports — datasets, trl, BitsAndBytesConfig, LoraConfig, etc. —
# are not needed in this inference script and have been removed.)

max_length = 256  # maximum total length (prompt + generation) in tokens

# Load the base model
model_name = "microsoft/phi-2"
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
model.config.use_cache = True  # re-enable the KV cache for faster generation (disabling it is only needed during training)

# Load the tokenizer; Phi-2 has no pad token, so reuse the EOS token
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Load the fine-tuned QLoRA adapter weights on top of the base model
finetuned_model_path = './fine_tuned_model'
model.load_adapter(finetuned_model_path)

# Build the generation pipeline once at startup, not on every request
gen = pipeline('text-generation', model=model, tokenizer=tokenizer, max_length=max_length)

def run_inference_on_model(prompt):
    result = gen(prompt)
    # The pipeline echoes the prompt at the start of the output; strip it
    return result[0]['generated_text'].replace(prompt, '')

# Define the Gradio interface
description = (
    'An AI assistant that runs the Microsoft Phi-2 model fine-tuned on the '
    'OpenAssistant dataset using the QLoRA approach. '
    'Model: https://huggingface.co/microsoft/phi-2 '
    'Dataset: https://huggingface.co/datasets/OpenAssistant/oasst1'
)
title = 'AI chatbot fine-tuned from the Microsoft Phi-2 model'
demo = gr.Interface(
    run_inference_on_model,
    inputs=[gr.Textbox(placeholder='Enter your prompt here', label='Input prompt')],
    outputs=[gr.Textbox(label='AI response', scale=2)],
    title=title,
    description=description,
)
demo.launch(debug=False)
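
# A minimal sketch of querying the running app programmatically, assuming it
# is served on Gradio's default local address (http://127.0.0.1:7860) and that
# the `gradio_client` package is installed; "/predict" is the default endpoint
# name that gr.Interface exposes:
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860/")
#   print(client.predict("Explain QLoRA in one sentence.", api_name="/predict"))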