import os import subprocess import gradio as gr from huggingface_hub import hf_hub_download # Hugging Face repository IDs base_model_repo = "unsloth/Llama-3.2-3B-Instruct-GGUF" adapter_repo = "Mat17892/llama_lora_gguf" # Download the base model GGUF file print("Downloading base model...") base_model_path = hf_hub_download(repo_id=base_model_repo, filename="Llama-3.2-3B-Instruct-Q8_0.gguf") # Download the LoRA adapter GGUF file print("Downloading LoRA adapter...") lora_adapter_path = hf_hub_download(repo_id=adapter_repo, filename="llama_lora_adapter.gguf") # Function to run `llama-cli` with base model and adapter def run_llama_cli(prompt): print("Running inference with llama-cli...") cmd = [ "./llama-cli", "-c", "2048", # Context length "-cnv", # Enable conversational mode "-m", base_model_path, "--lora", lora_adapter_path, "--prompt", prompt, ] try: process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = process.communicate() if process.returncode != 0: print("Error during inference:") print(stderr.decode()) return "Error: Could not generate response." return stdout.decode().strip() except Exception as e: print(f"Exception occurred: {e}") return "Error: Could not generate response." # Gradio interface def chatbot_fn(user_input, chat_history): # Build the full chat history as the prompt prompt = "" for user, ai in chat_history: prompt += f"User: {user}\nAI: {ai}\n" prompt += f"User: {user_input}\nAI:" # Add latest user input # Generate response using llama-cli response = run_llama_cli(prompt) # Update chat history chat_history.append((user_input, response)) return chat_history, chat_history # Build the Gradio UI with gr.Blocks() as demo: gr.Markdown("# 🦙 LLaMA Chatbot with Base Model and LoRA Adapter") chatbot = gr.Chatbot(label="Chat with the Model") with gr.Row(): with gr.Column(scale=4): user_input = gr.Textbox(label="Your Message", placeholder="Type a message...") with gr.Column(scale=