import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Hugging Face repository details
MODEL_ID = "meta-llama/CodeLlama-7b-Instruct-hf"
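# Note: meta-llama repositories are gated on the Hugging Face Hub; you may need to
# request access and authenticate (e.g. via `huggingface-cli login`) before loading.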
def load_model():
    """Load the Hugging Face model and tokenizer."""
    try:
        st.write("Loading model and tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID, device_map="auto", torch_dtype=torch.float16
        )
        st.write("Model and tokenizer successfully loaded.")
        return tokenizer, model
    except Exception as e:
        st.error(f"Error loading model: {e}")
        return None, None
# Load the model and tokenizer once; st.cache_resource keeps them cached across Streamlit reruns
@st.cache_resource
def get_model():
    return load_model()

tokenizer, model = get_model()
# Streamlit UI
st.title("Medical Chatbot")
st.write("This chatbot provides medical assistance. Type your question below!")
if model is None or tokenizer is None:
    st.error("Model failed to load. Please check the Hugging Face model path or environment configuration.")
else:
    user_input = st.text_input("You:", placeholder="Enter your medical question here...", key="input_box")

    if st.button("Send"):
        if user_input.strip():
            # Construct the prompt
            SYSTEM_PROMPT = "You are a helpful medical assistant. Provide accurate and concise answers."
            full_prompt = f"{SYSTEM_PROMPT}\nUser: {user_input}\nAssistant:"

            # Tokenize the input and move it to the model's device
            # (avoids a hard-coded "cuda", which fails when no GPU is available)
            inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True).to(model.device)
            try:
                # Generate the response
                outputs = model.generate(
                    inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    max_new_tokens=200,        # Limit the length of the generated reply
                    do_sample=True,            # Sampling must be enabled for temperature/top_p to apply
                    temperature=0.7,           # Control randomness
                    top_p=0.9,                 # Top-p (nucleus) sampling
                    pad_token_id=tokenizer.eos_token_id,
                )

                # Decode and display only the assistant's part of the response
                response = tokenizer.decode(outputs[0], skip_special_tokens=True).split("Assistant:")[-1].strip()
                st.write(f"**Model:** {response}")
            except Exception as e:
                st.error(f"Error generating response: {e}")
        else:
            st.warning("Please enter a valid question.")