import os

import gradio as gr
import torch
from huggingface_hub import login
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Log in to the Hugging Face Hub with a token from the environment.
hf_token = os.environ.get('HUGGINGFACE_TOKEN')
login(hf_token)

# Define the device: use the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the base tokenizer and model, then attach the fine-tuned PEFT adapter.
tokenizer = AutoTokenizer.from_pretrained(
    "stabilityai/stablelm-3b-4e1t", token=hf_token, trust_remote_code=True
)
config = PeftConfig.from_pretrained("vaishakgkumar/stablemedv1")
model = AutoModelForCausalLM.from_pretrained(
    "stabilityai/stablelm-3b-4e1t", token=hf_token, trust_remote_code=True
)
model = PeftModel.from_pretrained(model, "vaishakgkumar/stablemedv1")
model.to(device)


class ChatBot:
    def __init__(self):
        self.history = []

    def predict(self, user_input, system_prompt="You are an expert analyst and provide assessment:"):
        # Build a single-turn chat prompt from the user input and system prompt.
        prompt = [{'role': 'user', 'content': user_input + "\n" + system_prompt}]
        inputs = tokenizer.apply_chat_template(
            prompt,
            add_generation_prompt=True,
            return_tensors='pt'
        )

        # Generate a response using the model. Sampling must be enabled for
        # the temperature setting to have any effect.
        tokens = model.generate(
            inputs.to(model.device),
            max_new_tokens=250,
            temperature=0.8,
            do_sample=True
        )

        # Decode only the newly generated tokens, dropping the echoed prompt
        # and any special tokens.
        response_text = tokenizer.decode(
            tokens[0][inputs.shape[-1]:], skip_special_tokens=True
        )

        # Free up memory
        del tokens
        torch.cuda.empty_cache()

        return response_text


bot = ChatBot()

title = "👋🏻Welcome to StableLM MED chat"
description = """ """
examples = [["What is the proper treatment for buccal herpes?", "Please provide information on the most effective antiviral medications and home remedies for treating buccal herpes."]]

iface = gr.Interface(
    fn=bot.predict,
    title=title,
    description=description,
    examples=examples,
    inputs=["text", "text"],
    outputs="text",
    theme="ParityError/Anime"
)

iface.launch()
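
# A minimal usage sketch (assumptions: this file is saved as app.py and a
# valid Hub token is exported first):
#
#   export HUGGINGFACE_TOKEN=hf_...
#   python app.py
#
# iface.launch() then serves the chat UI on a local URL (by default
# http://127.0.0.1:7860) printed to the console.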