import logging
import sys

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)


def download(model):
    # Look up the Hugging Face repo and file name for the requested model,
    # download it (resuming a partial download if one exists), and return the local path.
    repo_id = model_info[model]["repo_id"]
    filename = model_info[model]["filename"]

    model_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        resume_download=True,
        cache_dir=MODELS_PATH,
    )

    return model_path
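# Example (illustrative only, not part of the original flow): downloading a single
# entry returns the path of the cached GGUF file inside MODELS_PATH, e.g.
#   path = download("Mistral-7B-Instruct-v0.2")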

MODELS_PATH = "./models"

models = [
    "Llama-2-13B-chat",
    "Mistral-7B-Instruct-v0.2",
    "zephyr-7B-beta",
    "vicuna-7B-v1.5",
    "CodeLlama-7B",
    "Falcon-7B-Instruct",
]

model_info = {
    "Llama-2-13B-chat": {
        "repo_id": "TheBloke/Llama-2-13B-chat-GGUF",
        "filename": "llama-2-13b-chat.Q4_K_M.gguf",
    },
    "Mistral-7B-Instruct-v0.2": {
        "repo_id": "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
        "filename": "mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    },
    "zephyr-7B-beta": {
        "repo_id": "TheBloke/zephyr-7B-beta-GGUF",
        "filename": "zephyr-7b-beta.Q4_K_M.gguf",
    },
    "vicuna-7B-v1.5": {
        "repo_id": "TheBloke/vicuna-7B-v1.5-GGUF",
        "filename": "vicuna-7b-v1.5.Q4_K_M.gguf",
    },
    "CodeLlama-7B": {
        "repo_id": "TheBloke/CodeLlama-7B-GGUF",
        "filename": "codellama-7b.Q4_K_M.gguf",
    },
    # Note: this Falcon build is GGML (not GGUF); recent llama-cpp-python releases
    # only load GGUF files, so this entry may fail to load with LlamaCPP.
    "Falcon-7B-Instruct": {
        "repo_id": "TheBloke/Falcon-7B-Instruct-GGML",
        "filename": "falcon-7b-instruct.ggccv1.q4_1.bin",
    },
}

for model_name in models:
    download(model_name)

mistral_model_path = hf_hub_download(
    repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    resume_download=True,
    cache_dir=MODELS_PATH,
)

"""Step 3: if you are running on a GPU, make sure "n_gpu_layers" is at least 1;
increase or decrease it depending on how many layers fit in your GPU memory."""

llm = LlamaCPP(
    model_path=mistral_model_path,
    temperature=0.1,
    max_new_tokens=256,
    context_window=3900,
    generate_kwargs={},
    # -1 offloads all layers to the GPU (requires a GPU build of llama-cpp-python);
    # set this to 0 to run on CPU only.
    model_kwargs={"n_gpu_layers": -1},
    # Helpers that convert chat messages / plain completions into prompt strings.
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
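
# Optional sketch (an assumption, not part of the original script): instead of
# hard-coding n_gpu_layers, probe for a CUDA device and fall back to CPU. The
# helper name and the torch dependency are illustrative choices only.
def pick_n_gpu_layers(cpu_layers=0, gpu_layers=-1):
    try:
        import torch  # used only to check whether a CUDA device is visible
    except ImportError:
        return cpu_layers
    return gpu_layers if torch.cuda.is_available() else cpu_layers

# e.g. model_kwargs={"n_gpu_layers": pick_n_gpu_layers()}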


def model_initialization(model):
    # Rebind the module-level llm used by predict(); without this the newly
    # configured model would only exist as a local variable.
    global llm

    if model == "":
        gr.Warning("Please select a model")
        return "No model selected"

    gr.Info("The model download and configuration process has started, please wait...")
    MODELS_PATH = "./models"
    repo_id = ""
    filename = ""
    if model == "Llama-2-13B-chat":
        repo_id = "TheBloke/Llama-2-13B-chat-GGUF"
        filename = "llama-2-13b-chat.Q4_K_M.gguf"
    elif model == "Mistral-7B-Instruct-v0.2":
        repo_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
        filename = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"
    elif model == "zephyr-7B-beta":
        repo_id = "TheBloke/zephyr-7B-beta-GGUF"
        filename = "zephyr-7b-beta.Q4_K_M.gguf"
    elif model == "vicuna-7B-v1.5":
        repo_id = "TheBloke/vicuna-7B-v1.5-GGUF"
        filename = "vicuna-7b-v1.5.Q4_K_M.gguf"
    elif model == "Falcon-7B-Instruct":
        repo_id = "TheBloke/Falcon-7B-Instruct-GGML"
        filename = "falcon-7b-instruct.ggccv1.q4_1.bin"
    elif model == "CodeLlama-7B":
        repo_id = "TheBloke/CodeLlama-7B-GGUF"
        filename = "codellama-7b.Q4_K_M.gguf"
    else:
        gr.Warning("Please select a model")
        return "No model selected"

    model_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        resume_download=True,
        cache_dir=MODELS_PATH,
    )

    llm = LlamaCPP(
        model_path=model_path,
        temperature=0.1,
        max_new_tokens=256,
        context_window=3900,
        generate_kwargs={},
        model_kwargs={"n_gpu_layers": -1},
        messages_to_prompt=messages_to_prompt,
        completion_to_prompt=completion_to_prompt,
        verbose=True,
    )
    gr.Info("The model has been configured and is ready to chat")
    return "The model has been configured and is ready to chat; your current model is " + model


def predict(message, history):
    # Stream tokens from the active model and yield the accumulated text so the
    # Gradio chat window updates incrementally.
    answer = []
    response = llm.stream_complete(message)
    for bot_response in response:
        token = bot_response.delta
        answer.append(token)
        final_answer = "".join(answer)
        yield final_answer
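
# A non-streaming variant (a sketch under the same LlamaCPP setup, not part of the
# original app): complete() blocks until generation finishes and returns the whole
# reply at once via the response's .text attribute.
def predict_blocking(message, history):
    response = llm.complete(message)
    return response.text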


with gr.Blocks() as UI:
    model_dropdown = gr.Dropdown(
        ["CodeLlama-7B", "Llama-2-13B-chat", "Falcon-7B-Instruct",
         "Mistral-7B-Instruct-v0.2", "zephyr-7B-beta", "vicuna-7B-v1.5"],
        value="Mistral-7B-Instruct-v0.2",
        label="Select a model",
        info="The default model is Mistral-7B-Instruct-v0.2",
    )
    textInfo = gr.Textbox(value="Current model is Mistral-7B-Instruct-v0.2", label="Model Status")

    chatUI = gr.ChatInterface(
        predict,
        title="Open Source LLM ChatBot",
        description="Ask any question",
        theme="soft",
        examples=[
            "Hello",
            "Are you an LLM model?",
            "How can I fine-tune a pre-trained LLM model?",
            "How can I build a chatbot using a local open-source LLM?",
        ],
        cache_examples=False,
        submit_btn="Send Message",
        retry_btn=None,
        undo_btn="Delete Previous",
        clear_btn="Clear",
    )

    model_dropdown.change(fn=model_initialization, inputs=[model_dropdown], outputs=[textInfo])


if __name__ == "__main__":
    UI.launch(debug=True)