import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from threading import Thread
import re
import time 
from PIL import Image
import torch
import spaces
import subprocess
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

tokenizer = AutoTokenizer.from_pretrained(
    'qnguyen3/nanoLLaVA',
    trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    'qnguyen3/nanoLLaVA',
    torch_dtype=torch.float16,
    device_map='auto',
    trust_remote_code=True)
model.to("cuda:0")

@spaces.GPU
def bot_streaming(message, history):
    chat_history = []
    if message["files"]:
      image = message["files"][-1]["path"]
    else:
      for i, hist in enumerate(history):
        if type(hist[0])==tuple:
          image = hist[0][0]
          image_turn = i
            
    if len(history) > 0 and image is not None:
        chat_history.append({"role": "user", "content": f'<image>\n{history[1][0]}'})
        chat_history.append({"role": "assistant", "content": history[1][1] })
        for human, assistant in history[2:]:
            chat_history.append({"role": "user", "content": human })
            chat_history.append({"role": "assistant", "content": assistant })
        chat_history.append({"role": "user", "content": message['text']})
    elif len(history) > 0 and image is None:
        for human, assistant in history:
            chat_history.append({"role": "user", "content": human })
            chat_history.append({"role": "assistant", "content": assistant })
        chat_history.append({"role": "user", "content": message['text']})
    elif len(history) == 0 and image is not None:
        chat_history.append({"role": "user", "content": f"<image>\n{message['text']}"})
    elif len(history) == 0 and image is None:
        chat_history.append({"role": "user", "content": message['text'] })

    # if image is None:
    #     gr.Error("You need to upload an image for LLaVA to work.")
    prompt=f"[INST] <image>\n{message['text']} [/INST]"
    image = Image.open(image).convert("RGB")
    text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True)
    text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
    input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
    streamer = TextIteratorStreamer(input_ids, **{"skip_special_tokens": True})
    image = Image.open(image)
    image_tensor = model.process_images([image], model.config).to(dtype=model.dtype)
    generation_kwargs = dict(inputs, images=image_tensor, streamer=streamer, max_new_tokens=100)
    generated_text = ""
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    text_prompt =f"<|im_start|>user\n{message['text']}<|im_end|>"
    
    buffer = ""
    for new_text in streamer:
      
      buffer += new_text
      
      generated_text_without_prompt = buffer[len(text_prompt):]
      time.sleep(0.04)
      yield generated_text_without_prompt


demo = gr.ChatInterface(fn=bot_streaming, title="LLaVA NeXT", examples=[{"text": "What is on the flower?", "files":["./bee.jpg"]},
                                                                      {"text": "How to make this pastry?", "files":["./baklava.png"]}], 
                        description="Try [LLaVA NeXT](https://huggingface.co/docs/transformers/main/en/model_doc/llava_next) in this demo (more specifically, the [Mistral-7B variant](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)). Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.",
                        stop_btn="Stop Generation", multimodal=True)
demo.launch(debug=True)