# VLog / app.py
# leiwx52's picture
# VLog app
# ca5cecc
# raw
# history blame contribute delete
# No virus
# 8.04 kB
import os
import gradio as gr
import openai
import requests
import csv
import argparse
from models.vlog import Vlogger
# Command-line configuration for the demo. The parsed namespace (`args`) is
# handed unchanged to Vlogger when the UI is built.
parser = argparse.ArgumentParser()
parser.add_argument('--video_path', default='examples/huaqiang.mp4')
parser.add_argument('--alpha', default=10, type=int, help='Determine the maximum segment number for KTS algorithm, the larger the value, the fewer segments.')
parser.add_argument('--beta', default=1, type=int, help='The smallest time gap between successive clips, in seconds.')
parser.add_argument('--data_dir', default='./examples', type=str, help='Directory for saving videos and logs.')
parser.add_argument('--tmp_dir', default='./tmp', type=str, help='Directory for saving intermediate files.')

# * Models settings *
parser.add_argument('--openai_api_key', default='xxx', type=str, help='OpenAI API key')
# Both caption switches default to True, so the original `store_true` flags
# could never turn them off. The `--no_*` companions write False to the same
# dest; the existing flags keep working exactly as before.
parser.add_argument('--image_caption', action='store_true', dest='image_caption', default=True, help='Set this flag to True if you want to use BLIP Image Caption')
parser.add_argument('--no_image_caption', action='store_false', dest='image_caption', help='Disable BLIP Image Caption')
parser.add_argument('--dense_caption', action='store_true', dest='dense_caption', default=True, help='Set this flag to True if you want to use Dense Caption')
parser.add_argument('--no_dense_caption', action='store_false', dest='dense_caption', help='Disable Dense Caption')
parser.add_argument('--feature_extractor', default='openai/clip-vit-base-patch32', help='Select the feature extractor model for video segmentation')
parser.add_argument('--feature_extractor_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu')
# The CLI name is --image_captioner, but the value is stored as args.captioner_base_model.
parser.add_argument('--image_captioner', choices=['blip', 'blip2'], dest='captioner_base_model', default='blip2', help='blip2 requires 15G GPU memory, blip requires 6G GPU memory')
parser.add_argument('--image_captioner_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended')
parser.add_argument('--dense_captioner_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu, < 6G GPU is not recommended>')
parser.add_argument('--audio_translator', default='large')
parser.add_argument('--audio_translator_device', choices=['cuda', 'cpu'], default='cuda')
parser.add_argument('--gpt_version', choices=['gpt-3.5-turbo'], default='gpt-3.5-turbo')

# Parsed at import time from sys.argv; unknown arguments abort the process.
args = parser.parse_args()
def get_empty_state():
    """Build a brand-new chat state.

    Returns:
        A dict with a zeroed ``total_tokens`` counter and an empty
        ``messages`` list. Every call creates new objects, so callers never
        share the list.
    """
    fresh_state = dict(total_tokens=0, messages=[])
    return fresh_state
def submit_api_key_fn(api_key, vlogger):
    """Hand the submitted OpenAI key to the vlogger and report the outcome.

    Args:
        api_key: Key text typed into the API-key textbox.
        vlogger: Object exposing ``init_llm_with_api_key``.

    Returns:
        A 3-tuple ``(status_update, accepted, vlogger)``: a ``gr.update``
        carrying the status text, a bool telling whether initialisation
        succeeded, and the same vlogger that was passed in.
    """
    try:
        vlogger.init_llm_with_api_key(api_key)
    except Exception as err:
        # Any failure is shown to the user instead of crashing the callback.
        status_text, accepted = f"Error {err}", False
    else:
        status_text, accepted = "OpenAI key submitted successful πŸŽ‰", True
    return gr.update(value=status_text), accepted, vlogger
def _as_chat_pairs(history):
    """Fold the flat message log into ``(user_text, reply_text)`` tuples.

    Messages are paired positionally (0 with 1, 2 with 3, ...); a trailing
    unpaired message is left out.
    """
    contents = [message['content'] for message in history]
    return list(zip(contents[::2], contents[1::2]))


def submit_message(prompt, state, vlogger, api_key_submitted, vlog_loaded):
    """Handle one chat turn and return the refreshed chat view.

    Args:
        prompt: Text entered by the user; may be empty.
        state: Chat state dict; its ``'messages'`` list is extended in place.
        vlogger: Object exposing ``chat2video(prompt)``.
        api_key_submitted: Truthy once an OpenAI key has been accepted.
        vlog_loaded: Truthy once a video document has been generated.

    Returns:
        A 4-tuple ``(input_box_value, chat_pairs, state, vlogger)``.
        ``chat_pairs`` is a list of ``(user_text, reply_text)`` tuples.
    """
    if not api_key_submitted:
        return gr.update(value=''), [("πŸ‘€", "Please enter your OpenAI API key 😊"),], state, vlogger

    if not vlog_loaded:
        return gr.update(value=''), [("πŸ‘€", "Please follow the instruction to select a video and generate the document for chatting 😊"),], state, vlogger

    history = state['messages']

    if not prompt:
        # Nothing to send: just redraw the existing conversation.
        return gr.update(value=''), _as_chat_pairs(history), state, vlogger

    # Record the user turn exactly once, whether or not the model call
    # succeeds, so the log keeps its strict user/reply alternation that
    # the pairing relies on.
    history.append({"role": "user", "content": prompt})
    try:
        reply = vlogger.chat2video(prompt)
    except Exception as e:
        # Surface the failure as the reply instead of breaking the callback.
        reply = f"Error: {e}"
    history.append({"role": "system", "content": reply})

    return '', _as_chat_pairs(history), state, vlogger
def clear_conversation(vlogger):
    """Reset the vlogger's history and blank the widgets for a new conversation.

    Args:
        vlogger: Object exposing ``clean_history()``.

    Returns:
        A 7-tuple in this order: input_message, video_inp, chatbot,
        vlog_outp, state, vlogger, vlog_loaded.
    """
    vlogger.clean_history()

    cleared_input = gr.update(value=None, visible=True)
    cleared_video = gr.update(value=None, interactive=False)
    cleared_document = gr.update(value=None, visible=True)

    # The chatbot is emptied with None, the state is rebuilt from scratch,
    # and the "video document loaded" flag goes back to False.
    return (
        cleared_input,
        cleared_video,
        None,
        cleared_document,
        get_empty_state(),
        vlogger,
        False,
    )
def vlog_fn(vid_path, vlogger, api_key_submitted):
    """Generate the text document for the selected video.

    Args:
        vid_path: Path of the selected video, or None when nothing is chosen.
        vlogger: Object exposing ``video2log(vid_path)``, whose result is
            joined with newlines here.
        api_key_submitted: Truthy once an OpenAI key has been accepted.

    Returns:
        A 3-tuple ``(document_update, loaded_flag, vlogger)``: a ``gr.update``
        with the text to show, a bool telling whether a document was
        generated, and the same vlogger that was passed in.
    """
    # Guard: no key yet, so nothing is generated.
    if not api_key_submitted:
        return gr.update(value="====== Please enter your OpenAI API key first 😊 =====", visible=True), False, vlogger

    print(vid_path)

    has_video = vid_path is not None
    if has_video:
        document = "\n".join(vlogger.video2log(vid_path))
    else:
        document = "====== Please select an video from examples first πŸ€” ====="

    return gr.update(value=document, visible=True), has_video, vlogger
# Stylesheet passed to gr.Blocks(css=...). The id selectors target the
# elem_id values given to components in the layout (col-container, header,
# hint, chatbox).
# NOTE(review): no component in this file sets elem_id="video_inp" - confirm
# whether that rule is still needed.
# Every rule must be closed with "}": an unclosed rule swallows the rule
# that follows it.
css = """
#col-container {max-width: 90%; margin-left: auto; margin-right: auto;}
#video_inp {min-height: 300px}
#chatbox {min-height: 100px;}
#header {text-align: center;}
#hint {font-size: 0.9em; padding: 0.5em; margin: 0;}
.message { font-size: 1.2em; }
"""
# Build the demo UI, wire the callbacks, and start the server.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a whitespace-stripped copy of this file - confirm that `examples` and
# the gr.HTML banner sit in the intended containers.
with gr.Blocks(css=css) as demo:
    # State holders passed to the callbacks as inputs and outputs.
    state = gr.State(get_empty_state())
    vlogger = gr.State(Vlogger(args))
    vlog_loaded = gr.State(False)
    api_key_submitted = gr.State(False)

    with gr.Column(elem_id="col-container"):
        gr.Markdown("""## 🎞️ VLog Demo
Powered by BLIP2, GRIT, Whisper, ChatGPT and LangChain
Github: [https://github.com/showlab/VLog](https://github.com/showlab/VLog)""",
                    elem_id="header")
        gr.Markdown("*Instruction*: For the current demo, please enter OpenAI api key, select an example video, click the button to generate a document and try chatting over the video 😊", elem_id="hint")

        with gr.Row():
            # Left column: video preview and chat widgets.
            with gr.Column(scale=6):
                video_inp = gr.Video(label="video_input", interactive=False)
                chatbot = gr.Chatbot(elem_id="chatbox")
                input_message = gr.Textbox(show_label=False, placeholder="Enter text and press enter", visible=True).style(container=False)
                btn_submit = gr.Button("Submit")
                btn_clear_conversation = gr.Button("πŸ”ƒ Start New Conversation")

            # Middle column: generated video document.
            with gr.Column(scale=6):
                vlog_btn = gr.Button("Generate Video Document")
                vlog_outp = gr.Textbox(label="Document output", lines=30)

            # Right column: API key entry and the example videos.
            with gr.Column(scale=1):
                openai_api_key = gr.Textbox(
                    placeholder="Input OpenAI API key and press Enter",
                    show_label=False,
                    label = "OpenAI API Key",
                    lines=1,
                    type="password"
                )
                # Clicking an example fills the (non-interactive) video input.
                examples = gr.Examples(
                    examples=[
                        ["examples/basketball_vlog.mp4"],
                        ["examples/travel_in_roman.mp4"],
                        ["examples/C8lMW0MODFs.mp4"],
                        ["examples/outcGtbnMuQ.mp4"],
                        ["examples/huaqiang.mp4"],
                    ],
                    inputs=[video_inp],
                )

        gr.HTML('''<br><br><br><center>You can duplicate this Space to skip the queue:<a href="https://huggingface.co/spaces/TencentARC/VLog?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a><br></center>''')

    # Sending a chat message: the button click and pressing Enter share one handler.
    btn_submit.click(submit_message, [input_message, state, vlogger, api_key_submitted, vlog_loaded], [input_message, chatbot, state, vlogger])
    input_message.submit(submit_message, [input_message, state, vlogger, api_key_submitted, vlog_loaded], [input_message, chatbot, state, vlogger])
    # The output order must match the tuple returned by clear_conversation.
    btn_clear_conversation.click(clear_conversation, [vlogger], [input_message, video_inp, chatbot, vlog_outp, state, vlogger, vlog_loaded])
    vlog_btn.click(vlog_fn, [video_inp, vlogger, api_key_submitted], [vlog_outp, vlog_loaded, vlogger])
    # The key-submission status message is shown in the document output box.
    openai_api_key.submit(submit_api_key_fn, [openai_api_key, vlogger], [vlog_outp, api_key_submitted, vlogger])

    # The keyword is `queue`; a misspelled keyword is not applied to the event.
    demo.load(queue=False)

demo.queue(concurrency_count=5)
demo.launch(height='800px')