# CHAT_BOX / app.py
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
# Your adapter (LoRA fine-tuned model on Hugging Face)
ADAPTER_ID = "Anabury/My_Finetuned_Phi-4"
# Detect device
USE_GPU = torch.cuda.is_available()
# Pick base model depending on device
if USE_GPU:
    BASE_MODEL = "unsloth/phi-4-unsloth-bnb-4bit"  # fast + pre-quantized 4-bit
else:
    BASE_MODEL = "unsloth/phi-4"  # full precision for CPU (phi-4 is ~14B params, so expect heavy RAM use)
print(f"Loading base model: {BASE_MODEL} on {'GPU' if USE_GPU else 'CPU'}")
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Load base model
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto" if USE_GPU else None,
    torch_dtype=torch.float16 if USE_GPU else torch.float32,
    trust_remote_code=True,
)
# Attach your LoRA adapter
model = PeftModel.from_pretrained(base, ADAPTER_ID)
model.eval()
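# Optional, hedged: on the full-precision (CPU) path you could merge the LoRA
# weights into the base model with peft's merge_and_unload() for slightly
# faster inference. Merging into the 4-bit quantized GPU base may not be
# supported by every peft version, so it is left commented out here.
# if not USE_GPU:
#     model = model.merge_and_unload()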
# Chat function
def chat(message, history):
    # Simple raw prompt; you can swap in the tokenizer's chat template later
    # (see the sketch after this function).
    inputs = tokenizer(message, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Decode only the newly generated tokens, not the echoed prompt.
    reply = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )
    history.append((message, reply))
    # Clear the textbox and refresh the chat window.
    return "", history
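# A hedged sketch of the chat-template swap mentioned above: rebuild the
# running history as role/content messages and let the tokenizer's built-in
# chat template format the prompt. Not wired into the UI below; the name
# chat_templated is illustrative.
def chat_templated(message, history):
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
    reply = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)
    history.append((message, reply))
    return "", history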
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Phi-4 Chatbot (Fine-tuned)")
    chatbot = gr.Chatbot(height=420)
    msg = gr.Textbox(placeholder="Ask me anything…")
    clear = gr.Button("Clear")
    # chat returns ("", history): the empty string clears the textbox,
    # the history refreshes the chat window.
    msg.submit(chat, [msg, chatbot], [msg, chatbot])
    clear.click(lambda: [], None, chatbot, queue=False)
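# A hedged usage note: on a shared Space you may want to enable Gradio's
# request queue before launching, e.g. demo.queue().launch(), so that one
# user's long generation doesn't block everyone else.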
demo.launch()