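"""Gradio demo for VLog: turn a long video into a text document and chat about it.

Powered by BLIP2 (image captioning), GRIT (dense captioning), Whisper (audio
transcription), CLIP features + KTS (video segmentation), and ChatGPT for
question answering over the generated document.
"""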
import argparse

import gradio as gr

from models.vlog import Vlogger

parser = argparse.ArgumentParser()
parser.add_argument('--video_path', default='examples/huaqiang.mp4')
parser.add_argument('--alpha', default=10, type=int, help='Determines the maximum number of segments for the KTS algorithm; the larger the value, the fewer the segments.')
parser.add_argument('--beta', default=1, type=int, help='The smallest time gap between successive clips, in seconds.')
parser.add_argument('--data_dir', default='./examples', type=str, help='Directory for saving videos and logs.')
parser.add_argument('--tmp_dir', default='./tmp', type=str, help='Directory for saving intermediate files.')

# * Models settings *
parser.add_argument('--openai_api_key', default='xxx', type=str, help='OpenAI API key')
# Note: with action='store_true' and default=True, these two flags are effectively always on.
parser.add_argument('--image_caption', action='store_true', dest='image_caption', default=True, help='Enable BLIP image captioning')
parser.add_argument('--dense_caption', action='store_true', dest='dense_caption', default=True, help='Enable GRIT dense captioning')
parser.add_argument('--feature_extractor', default='openai/clip-vit-base-patch32', help='Select the feature extractor model for video segmentation')
parser.add_argument('--feature_extractor_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu')
parser.add_argument('--image_captioner', choices=['blip', 'blip2'], dest='captioner_base_model', default='blip2', help='blip2 requires ~15 GB of GPU memory, blip requires ~6 GB')
parser.add_argument('--image_captioner_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu; more than 14 GB of GPU memory is recommended')
parser.add_argument('--dense_captioner_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu; GPUs with less than 6 GB of memory are not recommended')
parser.add_argument('--audio_translator', default='large', help='Whisper model size used for audio transcription')
parser.add_argument('--audio_translator_device', choices=['cuda', 'cpu'], default='cuda')
parser.add_argument('--gpt_version', choices=['gpt-3.5-turbo'], default='gpt-3.5-turbo')

args = parser.parse_args()


def get_empty_state():
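    """Return a fresh chat state: running token count plus message history."""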
    return {"total_tokens": 0, "messages": []}


def submit_api_key_fn(api_key, vlogger):
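    """Bind the user-supplied OpenAI API key to the shared Vlogger instance."""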
    try:
        vlogger.init_llm_with_api_key(api_key)
        return gr.update(value="OpenAI key submitted successfully πŸŽ‰"), True, vlogger
    
    except Exception as e:
        return gr.update(value=f"Error: {e}"), False, vlogger
    

def submit_message(prompt, state, vlogger, api_key_submitted, vlog_loaded):
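    """Handle one chat turn: check readiness, then answer over the video document."""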
    if not api_key_submitted:
        return gr.update(value=''), [("πŸ‘€", "Please enter your OpenAI API key 😊"),], state, vlogger
    
    if not vlog_loaded:
        return gr.update(value=''), [("πŸ‘€", "Please follow the instruction to select a video and generate the document for chatting 😊"),], state, vlogger
    
    history = state['messages']

    if not prompt:
        return gr.update(value=''), [(history[i]['content'], history[i+1]['content']) for i in range(0, len(history)-1, 2)], state, vlogger

    prompt_msg = {"role": "user", "content": prompt}
    history.append(prompt_msg)

    try:
        answer = vlogger.chat2video(prompt)
        history.append({"role": "system", "content": answer})

    except Exception as e:
        # Surface the error in the chat window instead of crashing the UI.
        history.append({"role": "system", "content": f"Error: {e}"})

    # Pair (user, system) messages into tuples for gr.Chatbot rendering.
    chat_messages = [(history[i]['content'], history[i+1]['content']) for i in range(0, len(history)-1, 2)]
    return '', chat_messages, state, vlogger

def clear_conversation(vlogger):
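    """Reset the chat history, video input, and generated document."""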
    vlogger.clean_history()
    
    # return input_message, video_inp, chatbot, vlog_outp, state, vlogger, vlog_loaded
    return gr.update(value=None, visible=True), gr.update(value=None, interactive=False), None, gr.update(value=None, visible=True), get_empty_state(), vlogger, False

def vlog_fn(vid_path, vlogger, api_key_submitted):
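    """Convert the selected video into a text document via Vlogger.video2log."""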
    if not api_key_submitted:
        log_text = "====== Please enter your OpenAI API key first 😊 ====="
        return gr.update(value=log_text, visible=True), False, vlogger
    
    print(vid_path)
    if vid_path is None:
        log_text = "====== Please select a video from the examples first πŸ€” ====="
        vlog_loaded_flag = False
    else:
        log_list = vlogger.video2log(vid_path)
        log_text = "\n".join(log_list)
        vlog_loaded_flag = True
    return gr.update(value=log_text, visible=True), vlog_loaded_flag, vlogger

css = """
      #col-container {max-width: 90%; margin-left: auto; margin-right: auto;}
      #video_inp {min-height: 300px;}
      #chatbox {min-height: 100px;}
      #header {text-align: center;}
      #hint {font-size: 0.9em; padding: 0.5em; margin: 0;}
      .message {font-size: 1.2em;}
      """

with gr.Blocks(css=css) as demo:
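    # Session-scoped state: chat history, the heavy Vlogger pipeline, and
    # flags tracking whether a document and an API key are ready.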
    
    state = gr.State(get_empty_state())
    vlogger = gr.State(Vlogger(args))
    vlog_loaded = gr.State(False)
    api_key_submitted = gr.State(False)
    
    
    with gr.Column(elem_id="col-container"):
        gr.Markdown("""## 🎞️ VLog Demo
                    Powered by BLIP2, GRIT, Whisper, ChatGPT and LangChain
                    Github: [https://github.com/showlab/VLog](https://github.com/showlab/VLog)""",
                    elem_id="header")
        gr.Markdown("*Instruction*: For the current demo, please enter your OpenAI API key, select an example video, click the button to generate a document, and then chat about the video 😊", elem_id="hint")
        with gr.Row():
            with gr.Column(scale=6):
                video_inp = gr.Video(label="video_input", interactive=False)
                chatbot = gr.Chatbot(elem_id="chatbox")
                input_message = gr.Textbox(show_label=False, placeholder="Enter text and press enter", visible=True).style(container=False)
                btn_submit = gr.Button("Submit")
                btn_clear_conversation = gr.Button("πŸ”ƒ Start New Conversation")
        
            with gr.Column(scale=6):
                vlog_btn = gr.Button("Generate Video Document")
                vlog_outp = gr.Textbox(label="Document output", lines=30)
            
            with gr.Column(scale=1):
                openai_api_key = gr.Textbox(
                    placeholder="Input OpenAI API key and press Enter",
                    show_label=False,
                    label="OpenAI API Key",
                    lines=1,
                    type="password"
                )
                examples = gr.Examples(
                    examples=[
                        ["examples/basketball_vlog.mp4"],
                        ["examples/travel_in_roman.mp4"],
                        ["examples/C8lMW0MODFs.mp4"],
                        ["examples/outcGtbnMuQ.mp4"],
                        ["examples/huaqiang.mp4"],
                    ],
                    inputs=[video_inp],
                )               
                
    gr.HTML('''<br><br><br><center>You can duplicate this Space to skip the queue:<a href="https://huggingface.co/spaces/TencentARC/VLog?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a><br></center>''')

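    # Wire up UI events: chatting, clearing the conversation, generating the
    # video document, and registering the API key.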
    btn_submit.click(submit_message, [input_message, state, vlogger, api_key_submitted, vlog_loaded], [input_message, chatbot, state, vlogger])
    input_message.submit(submit_message, [input_message, state, vlogger, api_key_submitted, vlog_loaded], [input_message, chatbot, state, vlogger])
    btn_clear_conversation.click(clear_conversation, [vlogger], [input_message, video_inp, chatbot, vlog_outp, state, vlogger, vlog_loaded])
    vlog_btn.click(vlog_fn, [video_inp, vlogger, api_key_submitted], [vlog_outp, vlog_loaded, vlogger])
    openai_api_key.submit(submit_api_key_fn, [openai_api_key, vlogger], [vlog_outp, api_key_submitted, vlogger])
    demo.load(queue=False)

demo.queue(concurrency_count=5)
demo.launch(height='800px')