import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline
import gradio as gr

peft_model_id = "Pr123/TinyLlama-EA-Chat"

# Load the base model with the PEFT adapter
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_id,
    torch_dtype=torch.bfloat16
).to(device)
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)

# Text-generation pipeline; max_length caps the combined prompt + response length
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=500)

def chat_with_tinyllm(prompt):
    instruction = (
        "Answer the following question: if you don't know the answer, "
        "just say that you don't know; don't try to make up an answer."
    )
    # Llama-style chat prompt: the system instruction is wrapped in <<SYS>> tags inside [INST] ... [/INST]
    prompt_content = f"[INST] <<SYS>>{instruction}<</SYS>>{prompt}[/INST]"
    result = pipe(prompt_content)
    # Keep only the text generated after the [/INST] marker (the model's answer)
    result = result[0]['generated_text'].split('[/INST]')[-1]
    return result

def chat_interface():
    iface = gr.Interface(
        fn=chat_with_tinyllm,
        inputs=gr.Textbox(lines=2, placeholder="Type your question here..."),
        outputs="text",
        title="Chat with TinyLlama",
        description=(
            "This is a simple chatbot powered by a fine-tuned model on Hugging Face. "
            "If it doesn't know the answer, it will say so."
        ),
    )
    return iface

iface = chat_interface()
iface.launch()
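
# Optional sanity check (a minimal sketch, not part of the original script): call the chat
# function directly before launching the Gradio UI to confirm the adapter loads and responds.
# The example question is an arbitrary placeholder.
#
# if __name__ == "__main__":
#     print(chat_with_tinyllm("What is this chatbot trained to answer?"))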