from huggingface_hub import hf_hub_download
import gradio as gr
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)

MODELS_PATH = "./models"

models = [
    "Llama-2-13B-chat",
    "Mistral-7B-Instruct-v0.2",
    "zephyr-7B-beta",
    "vicuna-7B-v1.5",
    "CodeLlama-7B",
]

model_info = {
    "Llama-2-13B-chat": {
        "repo_id": "TheBloke/Llama-2-13B-chat-GGUF",
        "filename": "llama-2-13b-chat.Q4_K_M.gguf",
    },
    "Mistral-7B-Instruct-v0.2": {
        "repo_id": "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
        "filename": "mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    },
    "zephyr-7B-beta": {
        "repo_id": "TheBloke/zephyr-7B-beta-GGUF",
        "filename": "zephyr-7b-beta.Q4_K_M.gguf",
    },
    "vicuna-7B-v1.5": {
        "repo_id": "TheBloke/vicuna-7B-v1.5-GGUF",
        "filename": "vicuna-7b-v1.5.Q4_K_M.gguf",
    },
    "CodeLlama-7B": {
        "repo_id": "TheBloke/CodeLlama-7B-GGUF",
        "filename": "codellama-7b.Q4_K_M.gguf",
    },
    # "Falcon-7B-Instruct": {
    #     "repo_id": "TheBloke/Falcon-7B-Instruct-GGML",
    #     "filename": "falcon-7b-instruct.ggccv1.q4_1.bin",
    # },
}


def download(model):
    """Download a model's GGUF file from the Hugging Face Hub into MODELS_PATH."""
    repo_id = model_info[model]["repo_id"]
    filename = model_info[model]["filename"]
    model_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        resume_download=True,
        cache_dir=MODELS_PATH,
    )
    return model_path


# Pre-download every model so switching models in the UI is fast.
for model_name in models:
    download(model_name)

# Mistral-7B-Instruct-v0.2 is the default model.
mistral_model_path = download("Mistral-7B-Instruct-v0.2")

# Step 3: if you run on a GPU, set n_gpu_layers to at least 1 (or -1 to
# offload all layers); tune it up or down based on your GPU's memory.
llm = LlamaCPP(
    # You can pass in the URL to a GGML model to download it automatically
    # model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=mistral_model_path,
    temperature=0.1,
    max_new_tokens=256,
    # llama2 has a context window of 4096 tokens, but we set it lower to
    # allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__(); set n_gpu_layers to at least 1 to use the GPU
    model_kwargs={"n_gpu_layers": -1},
    # transform inputs into Llama2 format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
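
# Optional smoke test (a minimal sketch, not part of the original flow):
# uncomment to confirm the default model generates text before launching the UI.
# print(llm.complete("Say hello in one sentence.").text)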


def model_initialization(model):
    # Rebind the module-level llm so predict() picks up the new model.
    global llm
    if model == "" or model not in model_info:
        gr.Warning("please select a model")
        return "no model selected"
    gr.Info("model downloading and configuration has started, please wait...")
    model_path = download(model)
    llm = LlamaCPP(
        model_path=model_path,
        temperature=0.1,
        max_new_tokens=256,
        # llama2 has a context window of 4096 tokens, but we set it lower to
        # allow for some wiggle room
        context_window=3900,
        # kwargs to pass to __call__()
        generate_kwargs={},
        # set n_gpu_layers to at least 1 to use the GPU (-1 offloads all layers)
        model_kwargs={"n_gpu_layers": -1},
        # transform inputs into Llama2 format
        messages_to_prompt=messages_to_prompt,
        completion_to_prompt=completion_to_prompt,
        verbose=True,
    )
    gr.Info("model has been configured and is ready to chat")
    return "model has been configured and is ready to chat; your current model is " + model


def predict(message, history):
    # history is required by gr.ChatInterface but unused here.
    # Stream tokens from llama.cpp and yield the growing partial answer.
    answer = []
    response = llm.stream_complete(message)
    for bot_response in response:
        token = bot_response.delta
        answer.append(token)
        yield "".join(answer)


with gr.Blocks() as UI:
    models_dropdown = gr.Dropdown(
        [
            "CodeLlama-7B",
            "Llama-2-13B-chat",
            "Mistral-7B-Instruct-v0.2",
            "zephyr-7B-beta",
            "vicuna-7B-v1.5",
        ],
        value="Mistral-7B-Instruct-v0.2",
        label="please select a model",
        info="default model is Mistral-7B-Instruct-v0.2",
    )
    textInfo = gr.Textbox(
        value="current model is Mistral-7B-Instruct-v0.2",
        label="Model Status",
    )
    # Chatbot interface
    chatUI = gr.ChatInterface(
        predict,
        title="Open Source LLM ChatBot",
        description="Ask any question",
        theme="soft",
        examples=[
            "Hello",
            "Are you an LLM model?",
            "How can I fine-tune a pre-trained LLM model?",
            "How can I build a chatbot using a local open-source LLM?",
        ],
        cache_examples=False,
        submit_btn="Send Message",
        retry_btn=None,
        undo_btn="Delete Previous",
        clear_btn="Clear",
    )

    models_dropdown.change(
        fn=model_initialization, inputs=[models_dropdown], outputs=[textInfo]
    )

if __name__ == "__main__":
    UI.launch(debug=True)
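
# Dependency sketch (versions are assumptions; the imports above match the
# pre-0.10 llama-index layout, where LlamaCPP lives in llama_index.llms):
#   pip install gradio huggingface_hub llama-cpp-python "llama-index<0.10"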