File size: 2,523 Bytes
eb2e235
 
5fd0c28
2936c26
01945bd
eb2e235
 
 
932195b
 
eb2e235
932195b
 
eb2e235
 
 
932195b
eb2e235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01945bd
def541d
 
eb2e235
01945bd
eb2e235
 
01945bd
eb2e235
def541d
01945bd
 
eb2e235
01945bd
eb2e235
 
01945bd
 
 
 
b97d649
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import os
import subprocess
import gradio as gr
from huggingface_hub import hf_hub_download

# Hugging Face Hub repositories that host the model artifacts
base_model_repo = "unsloth/Llama-3.2-3B-Instruct-GGUF"
adapter_repo = "Mat17892/llama_lora_gguf"

# Fetch the base model GGUF file; the returned path is later handed to
# llama-cli through its -m option.
print("Downloading base model...")
base_model_path = hf_hub_download(
    repo_id=base_model_repo,
    filename="Llama-3.2-3B-Instruct-Q8_0.gguf",
)

# Fetch the LoRA adapter GGUF file; the returned path is later handed to
# llama-cli through its --lora option.
print("Downloading LoRA adapter...")
lora_adapter_path = hf_hub_download(
    repo_id=adapter_repo,
    filename="llama_lora_adapter.gguf",
)

def run_llama_cli(prompt):
    """Generate text for ``prompt`` by invoking the local ``./llama-cli`` binary.

    The command loads the base model at ``base_model_path`` together with the
    LoRA adapter at ``lora_adapter_path`` (both module-level globals), using a
    context length of 2048 and conversational mode.

    Args:
        prompt: Prompt text passed to ``llama-cli`` through ``--prompt``.

    Returns:
        The whitespace-stripped standard output of ``llama-cli`` on success,
        or the string ``"Error: Could not generate response."`` when the
        process cannot be started or exits with a non-zero status.
    """
    print("Running inference with llama-cli...")
    cmd = [
        "./llama-cli",
        "-c", "2048",  # Context length
        "-cnv",        # Enable conversational mode
        "-m", base_model_path,
        "--lora", lora_adapter_path,
        "--prompt", prompt,
    ]
    try:
        completed = subprocess.run(
            cmd,
            # Do not let the child inherit this process's stdin: with
            # conversational mode enabled it must never block waiting for
            # interactive input from the server's terminal.
            stdin=subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
    except Exception as e:
        # UI boundary: any failure to launch the binary (missing file,
        # permissions, invalid argument) is reported instead of propagating.
        print(f"Exception occurred: {e}")
        return "Error: Could not generate response."

    if completed.returncode != 0:
        print("Error during inference:")
        print(completed.stderr.decode("utf-8", errors="replace"))
        return "Error: Could not generate response."

    # errors="replace" keeps a successful generation even if the output
    # contains a malformed or truncated UTF-8 sequence.
    return completed.stdout.decode("utf-8", errors="replace").strip()

def chatbot_fn(user_input, chat_history):
    """Gradio callback: answer ``user_input`` in the context of ``chat_history``.

    The prompt is the concatenation of every previous turn formatted as
    ``"User: ...\\nAI: ...\\n"`` followed by the new message and a trailing
    ``"AI:"`` cue, and is sent to ``run_llama_cli``.

    Args:
        user_input: The message typed by the user. ``None``, empty, or
            whitespace-only input is ignored (no model call is made).
        chat_history: Sequence of ``(user, ai)`` pairs from earlier turns, or
            ``None`` for a fresh conversation. It is not modified.

    Returns:
        A 2-tuple ``(history, history)`` holding the updated list of
        ``(user, ai)`` pairs twice: once for the chat display and once for
        the stored state.
    """
    # Work on a copy so the caller's list is never mutated in place.
    history = list(chat_history) if chat_history else []

    # Nothing to answer: skip the (expensive) model invocation entirely.
    if not user_input or not user_input.strip():
        return history, history

    # Build the full chat history as the prompt, ending with the latest input.
    turns = [f"User: {user}\nAI: {ai}\n" for user, ai in history]
    turns.append(f"User: {user_input}\nAI:")
    prompt = "".join(turns)

    # Generate response using llama-cli
    response = run_llama_cli(prompt)

    history.append((user_input, response))
    return history, history

# Assemble the Gradio Blocks layout: title, chat window, and an input row
with gr.Blocks() as demo:
    gr.Markdown("# 🦙 LLaMA Chatbot with Base Model and LoRA Adapter")
    chatbot = gr.Chatbot(label="Chat with the Model")

    # Message box and send button side by side (column scales 4 and 1)
    with gr.Row():
        with gr.Column(scale=4):
            user_input = gr.Textbox(
                label="Your Message",
                placeholder="Type a message...",
            )
        with gr.Column(scale=1):
            submit_btn = gr.Button("Send")

    # Conversation state, initialised to an empty list
    chat_history = gr.State([])

    # Clicking "Send" calls chatbot_fn with the message and stored history;
    # its two return values update the chat window and the stored history.
    submit_btn.click(
        fn=chatbot_fn,
        inputs=[user_input, chat_history],
        outputs=[chatbot, chat_history],
        show_progress=True,
    )

# Start serving the app
demo.launch()