import os
from enum import Enum
from threading import Thread

import fire
import gradio as gr
from auto_gptq import AutoGPTQForCausalLM
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

from llama_chat_format import format_to_llama_chat_style


class Model_Type(Enum):
    gptq = 1
    ggml = 2
    full_precision = 3


def get_model_type(model_name):
    if "gptq" in model_name.lower():
        return Model_Type.gptq
    elif "ggml" in model_name.lower():
        return Model_Type.ggml
    else:
        return Model_Type.full_precision


def create_folder_if_not_exists(folder_path):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)


# Running on GPU? Then you are using either the full-precision model or GPTQ quantization.
def initialize_gpu_model_and_tokenizer(model_name, model_type):
    if model_type == Model_Type.gptq:
        model = AutoGPTQForCausalLM.from_quantized(model_name, device_map="auto", use_safetensors=True, use_triton=False)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    else:
        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", token=True)
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
    return model, tokenizer


# Running on CPU? Then you are using the GGML model type, served via llama.cpp.
def init_auto_model_and_tokenizer(model_name, model_type, file_name=None):
    model_type = get_model_type(model_name)
    if Model_Type.ggml == model_type:
        models_folder = "./models"
        create_folder_if_not_exists(models_folder)
        file_path = hf_hub_download(repo_id=model_name, filename=file_name, local_dir=models_folder)
        model = Llama(file_path, n_ctx=4096)
        tokenizer = None
    else:
        model, tokenizer = initialize_gpu_model_and_tokenizer(model_name, model_type=model_type)
    return model, tokenizer


# Chatbot implementation based upon: https://www.gradio.app/guides/creating-a-custom-chatbot-with-blocks
def run_ui(model, tokenizer, is_chat_model, model_type):
    # Blocks are made with a `with` clause, and any component created inside this clause is
    # automatically added to the app.
    with gr.Blocks() as demo:
        # Gradio components created in the app: Chatbot, Textbox and Button.
        chatbot = gr.Chatbot()
        msg = gr.Textbox()
        clear = gr.Button("Clear")

        # This implementation offers chat streaming:
        # First, we stream responses so the user doesn't have to wait as long for a message to be
        # generated. Second, the user message appears immediately in the chat history while the
        # chatbot's response is being generated.
        #
        # The first function, user(), updates the chatbot with the user message and clears the
        # input field. Because we want this to happen instantly, we set queue=False, which skips
        # any queue had it been enabled. The chatbot's history is appended with
        # [user_message, None], the None signifying that the bot has not responded yet.
        def user(user_message, history):
            return "", history + [[user_message, None]]
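        # A minimal illustration (hypothetical values) of the history structure the two callbacks
        # share: after the user submits "Hello", user() returns ("", [["Hello", None]]); as bot()
        # streams tokens, the same entry is filled in, yielding [["Hello", "Hi"]],
        # then [["Hello", "Hi there"]], and so on.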
        # The second function, bot(), updates the chatbot history with the bot's response. Instead
        # of creating a new message, we replace the previously created None with the bot's
        # response. The message is built token by token, and the intermediate outputs are yielded
        # as they are constructed; Gradio automatically turns any function with the yield keyword
        # into a streaming output interface.
        def bot(history):
            # The llama implementation requires formatting the chat history before use (when
            # running the fine-tuned chat model).
            # See for details: https://github.com/facebookresearch/llama
            if is_chat_model:
                instruction = format_to_llama_chat_style(history)
            else:
                instruction = history[-1][0]
            print('instruction', instruction)

            # This slot will hold the model-generated text.
            history[-1][1] = ""
            kwargs = dict(temperature=0.6, top_p=0.9)
            if model_type == Model_Type.ggml:
                kwargs["max_tokens"] = 512
                for chunk in model(prompt=instruction, stream=True, **kwargs):
                    token = chunk["choices"][0]["text"]
                    history[-1][1] += token
                    yield history
            else:
                streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, timeout=5)
                inputs = tokenizer(instruction, return_tensors="pt").to(model.device)
                kwargs["max_new_tokens"] = 512
                kwargs["do_sample"] = True  # without sampling, generate() ignores temperature/top_p
                kwargs["input_ids"] = inputs["input_ids"]
                kwargs["streamer"] = streamer
                thread = Thread(target=model.generate, kwargs=kwargs)
                thread.start()
                for token in streamer:
                    history[-1][1] += token
                    yield history

        # When a user submits their message, we chain two events with .then(): user() updates the
        # history and clears the textbox immediately, then bot() streams the response.
        msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(bot, chatbot, chatbot)
        clear.click(lambda: None, None, chatbot, queue=False)

    demo.queue()
    demo.launch(debug=True)


def main(model_name="TheBloke/Llama-2-7B-Chat-GGML", file_name="llama-2-7b-chat.ggmlv3.q4_K_M.bin"):
    # For a GPU setup, try e.g. model_name="TheBloke/Llama-2-13B-GPTQ" (file_name is only used
    # when downloading a GGML file).
    is_chat_model = 'chat' in model_name.lower()
    model_type = get_model_type(model_name)
    model, tokenizer = init_auto_model_and_tokenizer(model_name, model_type, file_name)
    run_ui(model, tokenizer, is_chat_model, model_type)


if __name__ == '__main__':
    fire.Fire(main)
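# Example invocations (a sketch; "app.py" is a hypothetical filename for this script).
# python-fire exposes main()'s parameters as command-line flags:
#
#   python app.py                                           # CPU, default GGML chat model
#   python app.py --model_name "TheBloke/Llama-2-13B-GPTQ"  # GPU, GPTQ-quantized model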