from argparse import ArgumentParser
import copy
import gradio as gr
from gradio.themes.utils import colors, fonts, sizes

from utils.easydict import EasyDict
from tasks.eval.model_utils import load_pllava
from tasks.eval.eval_utils import (
    ChatPllava,
    conv_plain_v1,
    Conversation,
    conv_templates
)
from tasks.eval.demo import pllava_theme

SYSTEM="""You are Pllava, a large vision-language assistant. 
You are able to understand the video content that the user provides, and assist the user with a variety of tasks using natural language.
Follow the instructions carefully and explain your answers in detail based on the provided video.
"""
INIT_CONVERSATION: Conversation = conv_plain_v1.copy()


# ========================================
#             Model Initialization
# ========================================
def init_model(args):

    print('Initializing PLLaVA')
    model, processor = load_pllava(
        args.pretrained_model_name_or_path, args.num_frames, 
        use_lora=args.use_lora, 
        weight_dir=args.weight_dir, 
        lora_alpha=args.lora_alpha, 
        use_multi_gpus=args.use_multi_gpus)
    if not args.use_multi_gpus:
        model = model.to('cuda')
    chat = ChatPllava(model, processor)
    return chat


# ========================================
#             Gradio Setting
# ========================================
def gradio_reset(chat_state, img_list):
    if chat_state is not None:
        chat_state = INIT_CONVERSATION.copy()
    if img_list is not None:
        img_list = []
    return (
        None,
        gr.update(value=None, interactive=True),
        gr.update(value=None, interactive=True),
        gr.update(placeholder='Please upload your video first', interactive=False),
        gr.update(value="Upload & Start Chat", interactive=True),
        chat_state,
        img_list
    )


def upload_img(gr_img, gr_video, chat_state=None, num_segments=None, img_list=None):
    print(gr_img, gr_video)
    chat_state = INIT_CONVERSATION.copy() if chat_state is None else chat_state
    img_list = [] if img_list is None else img_list
    
    if gr_img is None and gr_video is None:
        return None, None, gr.update(interactive=True),gr.update(interactive=True, placeholder='Please upload video/image first!'), chat_state, None
    if gr_video: 
        llm_message, img_list, chat_state = chat.upload_video(gr_video, chat_state, img_list, num_segments)
        return (
            gr.update(interactive=True),
            gr.update(interactive=True),
            gr.update(interactive=True, placeholder='Type and press Enter'),
            gr.update(value="Start Chatting", interactive=False),
            chat_state,
            img_list,
        )
    if gr_img:
        llm_message, img_list,chat_state = chat.upload_img(gr_img, chat_state, img_list)
        return (
            gr.update(interactive=True),
            gr.update(interactive=True),
            gr.update(interactive=True, placeholder='Type and press Enter'),
            gr.update(value="Start Chatting", interactive=False),
            chat_state,
            img_list
        )


def gradio_ask(user_message, chatbot, chat_state, system):
    if len(user_message) == 0:
        return gr.update(interactive=True, placeholder='Input should not be empty!'), chatbot, chat_state
    chat_state =  chat.ask(user_message, chat_state, system)
    chatbot = chatbot + [[user_message, None]]
    return '', chatbot, chat_state


def gradio_answer(chatbot, chat_state, img_list, num_beams, temperature):
    llm_message, llm_message_token, chat_state = chat.answer(conv=chat_state, img_list=img_list, max_new_tokens=200, num_beams=num_beams, temperature=temperature)
    llm_message = llm_message.replace("<s>", "") # handle <s>
    chatbot[-1][1] = llm_message
    print(chat_state)
    print(f"Answer: {llm_message}")
    return chatbot, chat_state, img_list


def parse_args():
    parser = ArgumentParser()
    parser.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
        required=True,
        default='llava-hf/llava-1.5-7b-hf'
    )
    parser.add_argument(
        "--num_frames",
        type=int,
        required=True,
        default=4,
    )
    parser.add_argument(
        "--use_lora",
        action='store_true'
    )
    parser.add_argument(
        "--use_multi_gpus",
        action='store_true'
    )
    parser.add_argument(
        "--weight_dir",
        type=str,
        required=False,
        default=None,
    )
    parser.add_argument(
        "--conv_mode",
        type=str,
        required=False,
        default=None,
    )
    parser.add_argument(
        "--lora_alpha",
        type=int,
        required=False,
        default=None,
    )
    parser.add_argument(
        "--server_port",
        type=int,
        required=False,
        default=7868,
    )
    args = parser.parse_args()
    return args


title = """<h1 align="center"><a href="https://github.com/magic-research/PLLaVA"><img src="https://raw.githubusercontent.com/magic-research/PLLaVA/main/assert/logo.png" alt="PLLAVA" border="0" style="margin: 0 auto; height: 100px;" /></a> </h1>"""
description = (
    """<br><p><a href='https://github.com/magic-research/PLLaVA'>
    # PLLAVA!
    <img src='https://img.shields.io/badge/Github-Code-blue'></a></p><p>
    - Upload A Video
    - Press Upload
    - Start Chatting
    """
)

args = parse_args()

model_description = f"""
    # MODEL INFO
    - pretrained_model_name_or_path:{args.pretrained_model_name_or_path}
    - use_lora:{args.use_lora}
    - weight_dir:{args.weight_dir}
"""

# with gr.Blocks(title="InternVideo-VideoChat!",theme=gvlabtheme,css="#chatbot {overflow:auto; height:500px;} #InputVideo {overflow:visible; height:320px;} footer {visibility: none}") as demo:
with gr.Blocks(title="PLLaVA",
               theme=pllava_theme,
               css="#chatbot {overflow:auto; height:500px;} #InputVideo {overflow:visible; height:320px;} footer {visibility: none}") as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.Markdown(model_description)
    with gr.Row():
        with gr.Column(scale=0.5, visible=True) as video_upload:
            # with gr.Column(elem_id="image", scale=0.5) as img_part:
            with gr.Tab("Video", elem_id='video_tab'):
                up_video = gr.Video(interactive=True, include_audio=True, elem_id="video_upload", height=360)
            with gr.Tab("Image", elem_id='image_tab'):
                up_image = gr.Image(type="pil", interactive=True, elem_id="image_upload", height=360)
            upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
            clear = gr.Button("Restart")
            
            # num_segments = gr.Slider(
            #     minimum=8,
            #     maximum=64,
            #     value=8,
            #     step=1,
            #     interactive=True,
            #     label="Video Segments",
            # )
        
        with gr.Column(visible=True)  as input_raws:
            system_string = gr.Textbox(SYSTEM, interactive=True, label='system')
            num_beams = gr.Slider(
                minimum=1,
                maximum=5,
                value=1,
                step=1,
                interactive=True,
                label="beam search numbers",
            )           
            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=1.0,
                step=0.1,
                interactive=True,
                label="Temperature",
            )
            
            chat_state = gr.State()
            img_list = gr.State()
            chatbot = gr.Chatbot(elem_id="chatbot",label='Conversation')
            with gr.Row():
                with gr.Column(scale=0.7):
                    text_input = gr.Textbox(show_label=False, placeholder='Please upload your video first', interactive=False, container=False)
                with gr.Column(scale=0.15, min_width=0):
                    run = gr.Button("💭Send")
                with gr.Column(scale=0.15, min_width=0):
                    clear = gr.Button("🔄Clear")     
    
    with gr.Row():
        examples = gr.Examples(
            examples=[
                ['example/jesse_dance.mp4', 'What is the man doing?'],
                ['example/yoga.mp4', 'What is the woman doing?'],
                ['example/cooking.mp4', 'Describe the background, characters and the actions in the provided video.'],
                # ['example/cooking.mp4', 'What is happening in the video?'],
                ['example/working.mp4', 'Describe the background, characters and the actions in the provided video.'],
                ['example/1917.mp4', 'Describe the background, characters and the actions in the provided video.'],
            ],
            inputs=[up_video, text_input],
            cache_examples=False
        )


    chat = init_model(args)
    INIT_CONVERSATION = conv_templates[args.conv_mode]
    upload_button.click(upload_img, [up_image, up_video, chat_state], [up_image, up_video, text_input, upload_button, chat_state, img_list])
    
    text_input.submit(gradio_ask, [text_input, chatbot, chat_state, system_string], [text_input, chatbot, chat_state]).then(
        gradio_answer, [chatbot, chat_state, img_list, num_beams, temperature], [chatbot, chat_state, img_list]
    )
    run.click(gradio_ask, [text_input, chatbot, chat_state, system_string], [text_input, chatbot, chat_state]).then(
        gradio_answer, [chatbot, chat_state, img_list, num_beams, temperature], [chatbot, chat_state, img_list]
    )
    run.click(lambda: "", None, text_input)  
    clear.click(gradio_reset, [chat_state, img_list], [chatbot, up_image, up_video, text_input, upload_button, chat_state, img_list], queue=False)

demo.queue(max_size=5)
demo.launch()
# demo.launch(server_name="0.0.0.0", server_port=10034, enable_queue=True)