import logging
import sys

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)


def download(model):
    # Look up the Hugging Face repo and file name for the requested model,
    # download it (resuming a partial download if one exists), and return the local path.
    repo_id = model_info[model]["repo_id"]
    filename = model_info[model]["filename"]

    model_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        resume_download=True,
        cache_dir=MODELS_PATH,
    )

    return model_path
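# Example (illustrative only, not part of the original flow): downloading a single
# entry returns the path of the cached GGUF file inside MODELS_PATH, e.g.
#   path = download("Mistral-7B-Instruct-v0.2")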

MODELS_PATH = "./models"

models = [
    "Llama-2-13B-chat",
    "Mistral-7B-Instruct-v0.2",
    "zephyr-7B-beta",
    "vicuna-7B-v1.5",
    "CodeLlama-7B",
    "Falcon-7B-Instruct",
]

model_info = {
    "Llama-2-13B-chat": {
        "repo_id": "TheBloke/Llama-2-13B-chat-GGUF",
        "filename": "llama-2-13b-chat.Q4_K_M.gguf",
    },
    "Mistral-7B-Instruct-v0.2": {
        "repo_id": "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
        "filename": "mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    },
    "zephyr-7B-beta": {
        "repo_id": "TheBloke/zephyr-7B-beta-GGUF",
        "filename": "zephyr-7b-beta.Q4_K_M.gguf",
    },
    "vicuna-7B-v1.5": {
        "repo_id": "TheBloke/vicuna-7B-v1.5-GGUF",
        "filename": "vicuna-7b-v1.5.Q4_K_M.gguf",
    },
    "CodeLlama-7B": {
        "repo_id": "TheBloke/CodeLlama-7B-GGUF",
        "filename": "codellama-7b.Q4_K_M.gguf",
    },
    # Note: this Falcon build is GGML (not GGUF); recent llama-cpp-python releases
    # only load GGUF files, so this entry may fail to load with LlamaCPP.
    "Falcon-7B-Instruct": {
        "repo_id": "TheBloke/Falcon-7B-Instruct-GGML",
        "filename": "falcon-7b-instruct.ggccv1.q4_1.bin",
    },
}

for model_name in models:
    download(model_name)

mistral_model_path = hf_hub_download(
    repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    resume_download=True,
    cache_dir=MODELS_PATH,
)

"""Step 3: if you are running on a GPU, make sure "n_gpu_layers" is at least 1;
increase or decrease it depending on how many layers fit in your GPU memory."""

llm = LlamaCPP(
    model_path=mistral_model_path,
    temperature=0.1,
    max_new_tokens=256,
    context_window=3900,
    generate_kwargs={},
    # -1 offloads all layers to the GPU (requires a GPU build of llama-cpp-python);
    # set this to 0 to run on CPU only.
    model_kwargs={"n_gpu_layers": -1},
    # Helpers that convert chat messages / plain completions into prompt strings.
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
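
# Optional sketch (an assumption, not part of the original script): instead of
# hard-coding n_gpu_layers, probe for a CUDA device and fall back to CPU. The
# helper name and the torch dependency are illustrative choices only.
def pick_n_gpu_layers(cpu_layers=0, gpu_layers=-1):
    try:
        import torch  # used only to check whether a CUDA device is visible
    except ImportError:
        return cpu_layers
    return gpu_layers if torch.cuda.is_available() else cpu_layers

# e.g. model_kwargs={"n_gpu_layers": pick_n_gpu_layers()}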


def model_initialization(model):
    # Rebind the module-level llm used by predict(); without this the newly
    # configured model would only exist as a local variable.
    global llm

    if model == "":
        gr.Warning("Please select a model")
        return "No model selected"

    gr.Info("The model download and configuration process has started, please wait...")
    MODELS_PATH = "./models"
    repo_id = ""
    filename = ""
    if model == "Llama-2-13B-chat":
        repo_id = "TheBloke/Llama-2-13B-chat-GGUF"
        filename = "llama-2-13b-chat.Q4_K_M.gguf"
    elif model == "Mistral-7B-Instruct-v0.2":
        repo_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
        filename = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"
    elif model == "zephyr-7B-beta":
        repo_id = "TheBloke/zephyr-7B-beta-GGUF"
        filename = "zephyr-7b-beta.Q4_K_M.gguf"
    elif model == "vicuna-7B-v1.5":
        repo_id = "TheBloke/vicuna-7B-v1.5-GGUF"
        filename = "vicuna-7b-v1.5.Q4_K_M.gguf"
    elif model == "Falcon-7B-Instruct":
        repo_id = "TheBloke/Falcon-7B-Instruct-GGML"
        filename = "falcon-7b-instruct.ggccv1.q4_1.bin"
    elif model == "CodeLlama-7B":
        repo_id = "TheBloke/CodeLlama-7B-GGUF"
        filename = "codellama-7b.Q4_K_M.gguf"
    else:
        gr.Warning("Please select a model")
        return "No model selected"

    model_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        resume_download=True,
        cache_dir=MODELS_PATH,
    )

    llm = LlamaCPP(
        model_path=model_path,
        temperature=0.1,
        max_new_tokens=256,
        context_window=3900,
        generate_kwargs={},
        model_kwargs={"n_gpu_layers": -1},
        messages_to_prompt=messages_to_prompt,
        completion_to_prompt=completion_to_prompt,
        verbose=True,
    )
    gr.Info("The model has been configured and is ready to chat")
    return "The model has been configured and is ready to chat; your current model is " + model


def predict(message, history):
    # Stream tokens from the active model and yield the accumulated text so the
    # Gradio chat window updates incrementally.
    answer = []
    response = llm.stream_complete(message)
    for bot_response in response:
        token = bot_response.delta
        answer.append(token)
        final_answer = "".join(answer)
        yield final_answer
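
# A non-streaming variant (a sketch under the same LlamaCPP setup, not part of the
# original app): complete() blocks until generation finishes and returns the whole
# reply at once via the response's .text attribute.
def predict_blocking(message, history):
    response = llm.complete(message)
    return response.text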


with gr.Blocks() as UI:
    model_dropdown = gr.Dropdown(
        ["CodeLlama-7B", "Llama-2-13B-chat", "Falcon-7B-Instruct",
         "Mistral-7B-Instruct-v0.2", "zephyr-7B-beta", "vicuna-7B-v1.5"],
        value="Mistral-7B-Instruct-v0.2",
        label="Select a model",
        info="The default model is Mistral-7B-Instruct-v0.2",
    )
    textInfo = gr.Textbox(value="Current model is Mistral-7B-Instruct-v0.2", label="Model Status")

    chatUI = gr.ChatInterface(
        predict,
        title="Open Source LLM ChatBot",
        description="Ask any question",
        theme="soft",
        examples=[
            "Hello",
            "Are you an LLM model?",
            "How can I fine-tune a pre-trained LLM model?",
            "How can I build a chatbot using a local open-source LLM?",
        ],
        cache_examples=False,
        submit_btn="Send Message",
        retry_btn=None,
        undo_btn="Delete Previous",
        clear_btn="Clear",
    )

    model_dropdown.change(fn=model_initialization, inputs=[model_dropdown], outputs=[textInfo])


if __name__ == "__main__":
    UI.launch(debug=True)