|
import gradio as gr |
|
from llama_cpp import Llama |
|
import requests |
|
|
|
|
|
# Catalogue of selectable models, keyed by the name shown in the UI dropdown.
# Each entry gives the Hugging Face repo to pull from, a glob that selects the
# GGUF file inside that repo, and the chat format handed to llama-cpp-python.
# The first entry is the one loaded at start-up.
#
# NOTE(review): every entry forces the "chatml" prompt format; confirm this
# matches the chat template each model was actually trained with.
MODELS = {
    "Llama-3.2-3B": {
        "repo_id": "lmstudio-community/Llama-3.2-3B-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml",
    },
    # Was labelled "Llama-3.2-5B" although the repo is the 1B model.
    "Llama-3.2-1B": {
        "repo_id": "lmstudio-community/Llama-3.2-1B-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml",
    },
    "Phi-3.5-mini": {
        "repo_id": "bartowski/Phi-3.5-mini-instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml",
    },
    "Granite-3B": {
        "repo_id": "lmstudio-community/granite-3.0-3b-a800m-instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml",
    },
    "Qwen2.5-3B": {
        "repo_id": "lmstudio-community/Qwen2.5-3B-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml",
    },
    "SmolLM2-1.7B": {
        "repo_id": "HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml",
    },
    "Qwen2.5-1.5B": {
        "repo_id": "lmstudio-community/Qwen2.5-1.5B-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml",
    },
    "Granite-1B": {
        "repo_id": "lmstudio-community/granite-3.0-1b-a400m-instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml",
    },
    "AMD-OLMo-1B": {
        "repo_id": "lmstudio-community/AMD-OLMo-1B-SFT-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml",
    },
}
|
|
|
|
|
# The Llama instance currently used to answer chat requests; replaced by
# load_model() whenever a different model is loaded.
current_model = None


def load_model(model_name, n_ctx=32768, n_threads=2):
    """Load the model registered under ``model_name`` and make it current.

    Args:
        model_name: Key into ``MODELS``.
        n_ctx: Context window size passed to ``Llama.from_pretrained``.
        n_threads: Thread count passed to ``Llama.from_pretrained``.

    Returns:
        The newly created ``Llama`` instance, which is also stored in the
        module-level ``current_model``.

    Raises:
        KeyError: If ``model_name`` is not a key of ``MODELS``.
    """
    global current_model

    # Look the entry up first so an unknown name fails before the working
    # model is discarded.
    model_info = MODELS[model_name]

    # Drop our reference to the previous model before loading the next one so
    # its memory can be reclaimed first (provided nothing else still holds
    # it). If loading fails, current_model stays None.
    current_model = None

    current_model = Llama.from_pretrained(
        repo_id=model_info["repo_id"],
        filename=model_info["filename"],
        verbose=True,
        n_ctx=n_ctx,
        n_threads=n_threads,
        chat_format=model_info["chat_format"],
    )
    return current_model
|
|
|
|
|
# Load the first catalogue entry at import time so a model is ready before
# the UI is built.
_initial_model_name = list(MODELS)[0]
current_model = load_model(_initial_model_name)
|
|
|
# Attribute respond() sets on a Llama instance it loads, recording which
# MODELS key that instance was loaded for.
_MODEL_KEY_ATTR = "_models_key"


def _is_current_model(model_name):
    """Return True when ``current_model`` already is the model ``model_name``."""
    if current_model is None:
        return False
    tagged_key = getattr(current_model, _MODEL_KEY_ATTR, None)
    if tagged_key is not None:
        return tagged_key == model_name
    # Untagged instance (loaded outside respond()): fall back to looking for
    # the key inside the model file path. This misses keys that do not appear
    # verbatim in the file path, which is why loaded instances are tagged.
    return model_name in str(current_model.model_path)


def _history_to_messages(history):
    """Convert chat history into a list of ``{"role", "content"}`` dicts.

    Accepts both ``(user, assistant)`` pairs (tuples or lists) and dict
    entries. Dict entries are reduced to their ``role`` and ``content`` keys;
    entries without a role or without non-empty text content are skipped.
    """
    messages = []
    for entry in history or []:
        if isinstance(entry, dict):
            role = entry.get("role")
            content = entry.get("content")
            if role and isinstance(content, str) and content:
                messages.append({"role": role, "content": content})
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            user_text, assistant_text = entry
            if user_text:
                messages.append({"role": "user", "content": user_text})
            if assistant_text:
                messages.append({"role": "assistant", "content": assistant_text})
    return messages


def respond(
    message,
    history,
    model_name,
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream the assistant's reply to ``message``.

    Args:
        message: The new user message.
        history: Earlier turns, as ``(user, assistant)`` pairs or as
            ``{"role", "content"}`` dicts.
        model_name: Key into ``MODELS`` selecting the model to answer with.
        system_message: Optional system prompt; ignored when blank.
        max_tokens: Forwarded to ``create_chat_completion``.
        temperature: Forwarded to ``create_chat_completion``.
        top_p: Forwarded to ``create_chat_completion``.

    Yields:
        The reply text accumulated so far, once per streamed chunk that
        carries new content.
    """
    global current_model

    if not _is_current_model(model_name):
        current_model = load_model(model_name)
        setattr(current_model, _MODEL_KEY_ATTR, model_name)
        # NOTE(review): the source's indentation was lost; the toast is
        # assumed to belong to the reload branch.
        gr.Info(get_chat_title(model_name))

    messages = []
    if system_message and system_message.strip():
        messages.append({"role": "system", "content": system_message})
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    response = current_model.create_chat_completion(
        messages=messages,
        stream=True,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )

    reply = ""
    for chunk in response:
        # Some chunks carry no "content" in their delta; .get() also guards
        # against a None value.
        piece = chunk["choices"][0]["delta"].get("content")
        if piece:
            reply += piece
            yield reply
|
|
|
def get_chat_title(model_name):
    """Return the heading text for ``model_name``: the name plus a usage hint."""
    hint = " < - Load different model in Additional Inputs"
    return "{}{}".format(model_name, hint)
|
|
|
with gr.Blocks() as demo:
    # First catalogue entry: used for the initial title and as the dropdown
    # default.
    default_model_name = list(MODELS)[0]

    with gr.Row():
        title = gr.HTML(value=f"<h1>{get_chat_title(default_model_name)}</h1>")

    with gr.Row():
        # type="messages": the value is a list of {"role", "content"} dicts.
        chatbot = gr.Chatbot(
            value=[],
            type="messages",
            label="Chat Messages",
        )

    with gr.Row():
        msg = gr.Textbox(
            label="Message",
            placeholder="Type your message here...",
            lines=1,
        )
        submit = gr.Button("Submit")

    with gr.Accordion("Additional Inputs", open=False):
        model_selector = gr.Dropdown(
            choices=list(MODELS),
            value=default_model_name,
            label="Select Model",
            interactive=True,
            allow_custom_value=False,
            elem_id="model_selector",
            show_label=True,
        )
        system_msg = gr.Textbox(value="You are a friendly Chatbot.", label="System message")
        max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
        temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
        top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")

    # NOTE(review): the iOS link below reuses the Google Play URL of the
    # Android link; it presumably should point at the App Store listing.
    gr.Markdown(
        "GGUF is popular model format, try HG models localy in: [LM Studio AI](https://lmstudio.ai) for PC | PocketPal AI ([Android](https://play.google.com/store/apps/details?id=com.pocketpalai) & [iOS](https://play.google.com/store/apps/details?id=com.pocketpalai)) on Tablet or Mobile"
    )

    def update_title(model_name):
        """Return the ``<h1>`` heading HTML for ``model_name``."""
        return f"<h1>{get_chat_title(model_name)}</h1>"

    model_selector.change(
        fn=update_title,
        inputs=[model_selector],
        outputs=[title],
    )

    def submit_message(message, chat_history, model_name, system_message, max_tokens, temperature, top_p):
        """Stream one chat turn into the chatbot.

        Yields ``(history, "")`` pairs: the message list to display in
        ``chatbot`` and an empty string that clears the ``msg`` textbox.
        """
        history = list(chat_history or [])

        if not message or not message.strip():
            # Nothing to send: keep the conversation as it is.
            yield history, ""
            return

        user_turn = {"role": "user", "content": message}
        for partial_reply in respond(message, history, model_name, system_message, max_tokens, temperature, top_p):
            # Rebuild from the prior history on every chunk so the streamed
            # reply replaces itself instead of piling up as extra turns.
            assistant_turn = {"role": "assistant", "content": partial_reply}
            yield history + [user_turn, assistant_turn], ""

    # The button click and pressing Enter in the textbox run the same handler.
    chat_inputs = [msg, chatbot, model_selector, system_msg, max_tokens, temperature, top_p]
    chat_outputs = [chatbot, msg]

    submit_event = submit.click(
        fn=submit_message,
        inputs=chat_inputs,
        outputs=chat_outputs,
        show_progress=True,
    )

    msg.submit(
        fn=submit_message,
        inputs=chat_inputs,
        outputs=chat_outputs,
        show_progress=True,
    )
|
|
|
# Soft theme with a blue primary and purple secondary hue.
# NOTE(review): the theme is assigned after the Blocks object is built rather
# than passed to gr.Blocks(theme=...); confirm it takes effect in the Gradio
# version in use.
demo.theme = gr.themes.Soft(primary_hue="blue", secondary_hue="purple")


# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()