import os
from enum import Enum
from threading import Thread

import fire
import gradio as gr
from auto_gptq import AutoGPTQForCausalLM
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

from llama_chat_format import format_to_llama_chat_style


class Model_Type(Enum):
    gptq = 1
    ggml = 2
    full_precision = 3


def get_model_type(model_name):
    if "gptq" in model_name.lower():
        return Model_Type.gptq
    elif "ggml" in model_name.lower():
        return Model_Type.ggml
    else:
        return Model_Type.full_precision


def create_folder_if_not_exists(folder_path):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)


# Running on GPU? Then you are using either the full-precision model or GPTQ quantization.
def initialize_gpu_model_and_tokenizer(model_name, model_type):
    if model_type == Model_Type.gptq:
        model = AutoGPTQForCausalLM.from_quantized(model_name, device_map="auto", use_safetensors=True, use_triton=False)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    else:
        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", token=True)
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
    return model, tokenizer


# Running on CPU? Then you are using the GGML model type, served via llama.cpp.
def init_auto_model_and_tokenizer(model_name, model_type, file_name=None):
    model_type = get_model_type(model_name)
    if Model_Type.ggml == model_type:
        models_folder = "./models"
        create_folder_if_not_exists(models_folder)
        file_path = hf_hub_download(repo_id=model_name, filename=file_name, local_dir=models_folder)
        model = Llama(file_path, n_ctx=4096)
        tokenizer = None
    else:
        model, tokenizer = initialize_gpu_model_and_tokenizer(model_name, model_type=model_type)
    return model, tokenizer


# Chatbot implementation based upon: https://www.gradio.app/guides/creating-a-custom-chatbot-with-blocks
def run_ui(model, tokenizer, is_chat_model, model_type):
    # Blocks are made with a `with` clause, and any component created inside this clause is
    # automatically added to the app.
    with gr.Blocks() as demo:
        # Gradio components created in the app: Chatbot, Textbox and Button.
        chatbot = gr.Chatbot()
        msg = gr.Textbox()
        clear = gr.Button("Clear")

        # This implementation offers chat streaming:
        # First, we stream responses so the user doesn't have to wait as long for a message to be
        # generated. Second, the user message appears immediately in the chat history while the
        # chatbot's response is being generated.
        #
        # The first function, user(), updates the chatbot with the user message and clears the
        # input field. Because we want this to happen instantly, we set queue=False, which skips
        # any queue had it been enabled. The chatbot's history is appended with
        # [user_message, None], the None signifying that the bot has not responded yet.
        def user(user_message, history):
            return "", history + [[user_message, None]]
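        # A minimal illustration (hypothetical values) of the history structure the two callbacks
        # share: after the user submits "Hello", user() returns ("", [["Hello", None]]); as bot()
        # streams tokens, the same entry is filled in, yielding [["Hello", "Hi"]],
        # then [["Hello", "Hi there"]], and so on.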
        # The second function, bot(), updates the chatbot history with the bot's response. Instead
        # of creating a new message, we replace the previously created None with the bot's
        # response. The message is built token by token, and the intermediate outputs are yielded
        # as they are constructed; Gradio automatically turns any function with the yield keyword
        # into a streaming output interface.
        def bot(history):
            # The llama implementation requires formatting the chat history before use (when
            # running the fine-tuned chat model).
            # See for details: https://github.com/facebookresearch/llama
            if is_chat_model:
                instruction = format_to_llama_chat_style(history)
            else:
                instruction = history[-1][0]
            print('instruction', instruction)

            # This slot will hold the model-generated text.
            history[-1][1] = ""
            kwargs = dict(temperature=0.6, top_p=0.9)
            if model_type == Model_Type.ggml:
                kwargs["max_tokens"] = 512
                for chunk in model(prompt=instruction, stream=True, **kwargs):
                    token = chunk["choices"][0]["text"]
                    history[-1][1] += token
                    yield history
            else:
                streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, timeout=5)
                inputs = tokenizer(instruction, return_tensors="pt").to(model.device)
                kwargs["max_new_tokens"] = 512
                kwargs["do_sample"] = True  # without sampling, generate() ignores temperature/top_p
                kwargs["input_ids"] = inputs["input_ids"]
                kwargs["streamer"] = streamer
                thread = Thread(target=model.generate, kwargs=kwargs)
                thread.start()
                for token in streamer:
                    history[-1][1] += token
                    yield history

        # When a user submits their message, we chain two events with .then(): user() updates the
        # history and clears the textbox immediately, then bot() streams the response.
        msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(bot, chatbot, chatbot)
        clear.click(lambda: None, None, chatbot, queue=False)

    demo.queue()
    demo.launch(debug=True)


def main(model_name="TheBloke/Llama-2-7B-Chat-GGML", file_name="llama-2-7b-chat.ggmlv3.q4_K_M.bin"):
    # For a GPU setup, try e.g. model_name="TheBloke/Llama-2-13B-GPTQ" (file_name is only used
    # when downloading a GGML file).
    is_chat_model = 'chat' in model_name.lower()
    model_type = get_model_type(model_name)
    model, tokenizer = init_auto_model_and_tokenizer(model_name, model_type, file_name)
    run_ui(model, tokenizer, is_chat_model, model_type)


if __name__ == '__main__':
    fire.Fire(main)
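# Example invocations (a sketch; "app.py" is a hypothetical filename for this script).
# python-fire exposes main()'s parameters as command-line flags:
#
#   python app.py                                           # CPU, default GGML chat model
#   python app.py --model_name "TheBloke/Llama-2-13B-GPTQ"  # GPU, GPTQ-quantized model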