import gradio as gr
import torch
from torchvision import transforms
from lavis.processors import transforms_video
from lavis.datasets.data_utils import load_video_demo
from lavis.processors.blip_processors import ToUint8, ToTHWC
from lavis.models.sevila_models.sevila import SeViLA

# model config
img_size = 224
num_query_token = 32
t5_model = 'google/flan-t5-xl'
drop_path_rate = 0
use_grad_checkpoint = False
vit_precision = "fp16"
freeze_vit = True
prompt = ''
max_txt_len = 77
answer_num = 5
apply_lemmatizer = False
task = 'freeze_loc_freeze_qa_vid'

# prompts for the localizer and the answerer
LOC_prompt = 'Does the information within the frame provide the necessary details to accurately answer the given question?'
QA_prompt = 'Considering the information presented in the frame, select the correct answer from the options.'

# processors config (CLIP normalization statistics)
mean = (0.48145466, 0.4578275, 0.40821073)
std = (0.26862954, 0.26130258, 0.27577711)
normalize = transforms.Normalize(mean, std)
image_size = img_size
transform = transforms.Compose([ToUint8(), ToTHWC(), transforms_video.ToTensorVideo(), normalize])

print('model loading')
sevila = SeViLA(
    img_size=img_size,
    drop_path_rate=drop_path_rate,
    use_grad_checkpoint=use_grad_checkpoint,
    vit_precision=vit_precision,
    freeze_vit=freeze_vit,
    num_query_token=num_query_token,
    t5_model=t5_model,
    prompt=prompt,
    max_txt_len=max_txt_len,
    apply_lemmatizer=apply_lemmatizer,
    frame_num=4,
    answer_num=answer_num,
    task=task,
)
sevila.load_checkpoint(url_or_filename='https://huggingface.co/Shoubin/SeViLA/resolve/main/sevila_pretrained.pth')
print('model loaded')

ANS_MAPPING = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E'}
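
# Illustration only (not executed): with the prompts above and the option
# formatting used in sevila_demo below, the text fed to the answerer for the
# bundled demo4 example would read:
#   Question: What kind of bird is it? Option A:chickadee. Option B:eagle.
#   Option C:sparrow. Considering the information presented in the frame,
#   select the correct answer from the options.
# The localizer receives the same question/options followed by LOC_prompt.
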
def sevila_demo(video, question, option1, option2, option3, video_frame_num, keyframe_num):
    global sevila
    # run on the first GPU if available, otherwise fall back to CPU
    if torch.cuda.is_available():
        device = 0
    else:
        device = 'cpu'
    if device == 'cpu':
        sevila = sevila.float()
    else:
        sevila = sevila.to(int(device))

    # uniformly sample video_frame_num frames from the uploaded video
    vpath = video
    raw_clip, indice, fps, vlen = load_video_demo(
        video_path=vpath,
        n_frms=int(video_frame_num),
        height=image_size,
        width=image_size,
        sampling="uniform",
        clip_proposal=None,
    )
    # reorder frame/channel dims for the video transforms, then add a batch dim
    clip = transform(raw_clip.permute(1, 0, 2, 3))
    if device == 'cpu':
        clip = clip.float()
    else:
        clip = clip.float().to(int(device))
    clip = clip.unsqueeze(0)

    # make sure each option ends with a period (and guard against empty inputs)
    if option1 and option1[-1] != '.':
        option1 += '.'
    if option2 and option2[-1] != '.':
        option2 += '.'
    if option3 and option3[-1] != '.':
        option3 += '.'
    option_dict = {0: option1, 1: option2, 2: option3}
    options = 'Option A:{} Option B:{} Option C:{}'.format(option1, option2, option3)
    text_input_qa = 'Question: ' + question + ' ' + options + ' ' + QA_prompt
    text_input_loc = 'Question: ' + question + ' ' + options + ' ' + LOC_prompt

    out = sevila.generate_demo(clip, text_input_qa, text_input_loc, int(keyframe_num))
    answer_id = out['output_text'][0]
    answer = option_dict[answer_id]
    select_index = out['frame_idx'][0]

    # collect the selected keyframes and map their indices back to timestamps
    keyframes = []
    timestamps = []
    video_len = vlen / fps  # video duration in seconds
    for i in select_index:
        # take frame i and convert it to HWC layout for display
        image = raw_clip[:, i, :, :].int()
        image = image.permute(1, 2, 0).numpy()
        keyframes.append(image)
        select_i = indice[i]
        # timestamp = (original frame index / total frames) * duration
        t_sec = round((select_i / vlen) * video_len, 2)
        timestamps.append(str(t_sec) + 's')

    timestamps_des = ''
    for i in range(len(select_index)):
        timestamps_des += 'Keyframe {}: {} \n'.format(str(i + 1), timestamps[i])
    return keyframes, timestamps_des, answer
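
# Minimal programmatic sanity check (commented out): assumes the bundled
# example clip exists locally and the checkpoint above loaded successfully.
# frames, stamps, ans = sevila_demo(
#     'videos/demo4.mp4', 'What kind of bird is it?',
#     'chickadee', 'eagle', 'sparrow', 32, 1)
# print(stamps)
# print(ans)
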
with gr.Blocks(title="SeViLA demo") as demo:
    description = """
Self-Chained Image-Language Model for Video Localization and Question Answering
Shoubin Yu, Jaemin Cho, Prateek Yadav, Mohit Bansal
[GitHub] [Paper]

To locate keyframes in a video and answer a question, please:
(1) upload your video; (2) write your question/options and set # Video Frame / # Keyframe; (3) click Locate and Answer!
Just a heads up: loading the SeViLA model can take a few minutes (typically 2-3), and running the examples requires about 12 GB of memory.
To help you get started, we've provided some example videos and questions below. Feel free to try SeViLA with them!

""" gr.HTML(description) with gr.Row(): with gr.Column(scale=1, min_width=600): video = gr.Video(label='Video') question = gr.Textbox(placeholder="Why did the two ladies put their hands above their eyes while staring out?", label='Question') with gr.Row(): option1 = gr.Textbox(placeholder="practicing cheer", label='Option 1') option2 = gr.Textbox(placeholder="posing for photo", label='Option 2') option3 = gr.Textbox(placeholder="to see better", label='Option 3') with gr.Row(): video_frame_num = gr.Textbox(placeholder=32, label='# Video Frame') keyframe_num = gr.Textbox(placeholder=4, label='# Keyframe') # device = gr.Textbox(placeholder=0, label='Device') gen_btn = gr.Button(value='Locate and Answer!') with gr.Column(scale=1, min_width=600): keyframes = gr.Gallery( label="Keyframes", show_label=False, elem_id="gallery", ).style(columns=[4], rows=[1], object_fit="contain", max_width=100, max_height=100) #keyframes = gr.Gallery(label='Keyframes') timestamps = gr.outputs.Textbox(label="Keyframe Timestamps") answer = gr.outputs.Textbox(label="Output Answer") gen_btn.click( sevila_demo, inputs=[video, question, option1, option2, option3, video_frame_num, keyframe_num], outputs=[keyframes, timestamps, answer], queue=True ) #demo = gr.Interface(sevila_demo, # inputs=[gr.Video(), question, option1, option2, option3, video_frame_num, keyframe_num, device], # outputs=['gallery', timestamps, answer], # examples=[['videos/demo1.mp4', 'Why did the two ladies put their hands above their eyes while staring out?', 'practicing cheer.', 'play ball.', 'to see better.', 32, 4, 0], # ['videos/demo2.mp4', 'What did both of them do after completing skiing?', 'jump and pose.' , 'bend down.','raised their hands.', 32, 4, 0], # ['videos/demo3.mp4', 'What room was Wilson breaking into when House found him?', 'the kitchen.' , 'the dining room.','the bathroom.', 32, 4, 0]] # ) with gr.Column(): gr.Examples( inputs=[video, question, option1, option2, option3, video_frame_num, keyframe_num], outputs=[keyframes, timestamps, answer], fn=sevila_demo, examples=[['videos/demo1.mp4', 'Why did the two ladies put their hands above their eyes while staring out?', 'practicing cheer', 'to place wreaths', 'to see better', 32, 4], ['videos/demo2.mp4', 'What did both of them do after completing skiing?', 'jump and pose' , 'bend down','raised their hands', 32, 4], ['videos/demo3.mp4', 'What room was Wilson breaking into when House found him?', 'the bedroom' , 'the bathroom','the kitchen', 32, 4], ['videos/demo4.mp4', 'What kind of bird is it?', 'chikadee' , 'eagle', 'sparrow', 32, 1]], cache_examples=False, ) demo.queue(concurrency_count=1, api_open=False) demo.launch(share=False)