import gradio as gr
import os
import torch
from torchvision import transforms
from lavis.processors import transforms_video
from lavis.datasets.data_utils import load_video_demo
from lavis.processors.blip_processors import ToUint8, ToTHWC
from lavis.models.sevila_models.sevila import SeViLA
from typing import Optional
import warnings
# model config
img_size = 224
num_query_token = 32
t5_model = 'google/flan-t5-xl'
drop_path_rate = 0
use_grad_checkpoint = False
vit_precision = "fp16"
freeze_vit = True
prompt = ''
max_txt_len = 77
answer_num = 5
apply_lemmatizer = False
task = 'freeze_loc_freeze_qa_vid'
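# 'freeze_loc_freeze_qa_vid': video inference with both the keyframe localizer and the answerer kept frozen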
# prompt
LOC_prompt = 'Does the information within the frame provide the necessary details to accurately answer the given question?'
QA_prompt = 'Considering the information presented in the frame, select the correct answer from the options.'
# processors config
mean = (0.48145466, 0.4578275, 0.40821073)
std = (0.26862954, 0.26130258, 0.27577711)
normalize = transforms.Normalize(mean, std)
image_size = img_size
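# frame preprocessing: cast to uint8, reorder channels, convert to a float video tensor, and normalize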
transform = transforms.Compose([ToUint8(), ToTHWC(), transforms_video.ToTensorVideo(), normalize])
print('model loading')
sevila = SeViLA(
img_size=img_size,
drop_path_rate=drop_path_rate,
use_grad_checkpoint=use_grad_checkpoint,
vit_precision=vit_precision,
freeze_vit=freeze_vit,
num_query_token=num_query_token,
t5_model=t5_model,
prompt=prompt,
max_txt_len=max_txt_len,
apply_lemmatizer=apply_lemmatizer,
frame_num=4,
answer_num=answer_num,
task=task,
)
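# load the pretrained SeViLA weights from the Hugging Face Hub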
sevila.load_checkpoint(url_or_filename='https://huggingface.co/Shoubin/SeViLA/resolve/main/sevila_pretrained.pth')
print('model loaded')
ANS_MAPPING = {0 : 'A', 1 : 'B', 2 : 'C', 3 : 'D', 4 : 'E'}
# os.mkdir('video')
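# Demo pipeline: sample frames from the uploaded video, let SeViLA's localizer pick keyframes,
# answer the multiple-choice question from those keyframes, and return the keyframe images,
# their timestamps, and the predicted answer.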
def sevila_demo(video,
question,
option1, option2, option3,
video_frame_num,
keyframe_num):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    global sevila
    if device == 'cpu':
        sevila = sevila.float()
    else:
        sevila = sevila.to(device)
vpath = video
raw_clip, indice, fps, vlen = load_video_demo(
video_path=vpath,
n_frms=int(video_frame_num),
height=image_size,
width=image_size,
sampling="uniform",
clip_proposal=None
)
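    # raw_clip: sampled frames as a (channels, frames, height, width) tensor; indice: their indices
    # in the full video; vlen is the total frame count and fps the frame rate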
clip = transform(raw_clip.permute(1,0,2,3))
    if device == 'cpu':
        clip = clip.float()
    else:
        clip = clip.float().to(device)
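    # add a batch dimension: the model expects a batch of clips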
clip = clip.unsqueeze(0)
# check
    # make sure every option ends with a period, tolerating empty inputs
    if option1 and not option1.endswith('.'):
        option1 += '.'
    if option2 and not option2.endswith('.'):
        option2 += '.'
    if option3 and not option3.endswith('.'):
        option3 += '.'
option_dict = {0:option1, 1:option2, 2:option3}
options = 'Option A:{} Option B:{} Option C:{}'.format(option1, option2, option3)
text_input_qa = 'Question: ' + question + ' ' + options + ' ' + QA_prompt
    text_input_loc = 'Question: ' + question + ' ' + options + ' ' + LOC_prompt
out = sevila.generate_demo(clip, text_input_qa, text_input_loc, int(keyframe_num))
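    # 'output_text' holds the predicted answer index, 'frame_idx' the selected keyframe indices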
# print(out)
answer_id = out['output_text'][0]
answer = option_dict[answer_id]
select_index = out['frame_idx'][0]
# images = []
keyframes = []
timestamps =[]
# print('raw_clip', len(raw_clip))
# for j in range(int(video_frame_num)):
# image = raw_clip[:, j, :, :].int()
# image = image.permute(1, 2, 0).numpy()
# images.append(image)
video_len = vlen/fps # seconds
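    # map each selected keyframe back to a timestamp (in seconds) in the source video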
for i in select_index:
image = raw_clip[:, i, :, :].int()
image = image.permute(1, 2, 0).numpy()
keyframes.append(image)
select_i = indice[i]
time = round((select_i / vlen) * video_len, 2)
timestamps.append(str(time)+'s')
timestamps_des = ''
for i in range(len(select_index)):
timestamps_des += 'Keyframe {}: {} \n'.format(str(i+1), timestamps[i])
return keyframes, timestamps_des, answer
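
# A minimal usage sketch, kept commented out so it does not run at import time: this is how one
# could call sevila_demo directly from Python, assuming the bundled example clip below exists
# locally. The question and options mirror the first entry in the Examples block further down.
# keyframes, timestamps, answer = sevila_demo(
#     'videos/demo1.mp4',
#     'Why did the two ladies put their hands above their eyes while staring out?',
#     'practicing cheer', 'to place wreaths', 'to see better',
#     32, 4)
# print(timestamps)
# print(answer)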
with gr.Blocks(title="SeViLA demo") as demo:
description = """<p style="text-align: center; font-weight: bold;">
<span style="font-size: 28px">Self-Chained Image-Language Model for Video Localization and Question Answering</span>
<br>
<span style="font-size: 18px" id="author-info">
<a href="https://yui010206.github.io/" target="_blank">Shoubin Yu</a>,
<a href="https://j-min.io/" target="_blank">Jaemin Cho</a>,
<a href="https://prateek-yadav.github.io/" target="_blank">Prateek Yadav</a>,
<a href="https://www.cs.unc.edu/~mbansal/" target="_blank">Mohit Bansal</a>
</span>
<br>
<span style="font-size: 18px" id="paper-info">
[<a href="https://github.com/Yui010206/SeViLA" target="_blank">GitHub</a>]
[<a href="https://arxiv.org/abs/2305.06988" target="_blank">Paper</a>]
</span>
</p>
<p>
To locate keyframes in a video and answer a question about it, please:
<br>
(1) upload your video; (2) write your question and answer options, and set the number of video frames and keyframes; (3) click Locate and Answer!
<br>
Just a heads up: loading the SeViLA model can take a few minutes (typically 2-3), and running the examples requires about 12GB of memory.
<br>
We've provided some example videos and questions below to help you get started; feel free to try SeViLA with them!
</p>
"""
gr.HTML(description)
with gr.Row():
with gr.Column(scale=1, min_width=600):
video = gr.Video(label='Video')
question = gr.Textbox(placeholder="Why did the two ladies put their hands above their eyes while staring out?", label='Question')
with gr.Row():
option1 = gr.Textbox(placeholder="practicing cheer", label='Option 1')
option2 = gr.Textbox(placeholder="posing for photo", label='Option 2')
option3 = gr.Textbox(placeholder="to see better", label='Option 3')
with gr.Row():
                video_frame_num = gr.Textbox(value='32', label='# Video Frames')
                keyframe_num = gr.Textbox(value='4', label='# Keyframes')
# device = gr.Textbox(placeholder=0, label='Device')
gen_btn = gr.Button(value='Locate and Answer!')
with gr.Column(scale=1, min_width=600):
keyframes = gr.Gallery(
label="Keyframes", show_label=False, elem_id="gallery",
).style(columns=[4], rows=[1], object_fit="contain", max_width=100, max_height=100)
#keyframes = gr.Gallery(label='Keyframes')
            timestamps = gr.Textbox(label="Keyframe Timestamps")
            answer = gr.Textbox(label="Output Answer")
gen_btn.click(
sevila_demo,
inputs=[video, question, option1, option2, option3, video_frame_num, keyframe_num],
outputs=[keyframes, timestamps, answer],
queue=True
)
#demo = gr.Interface(sevila_demo,
# inputs=[gr.Video(), question, option1, option2, option3, video_frame_num, keyframe_num, device],
# outputs=['gallery', timestamps, answer],
# examples=[['videos/demo1.mp4', 'Why did the two ladies put their hands above their eyes while staring out?', 'practicing cheer.', 'play ball.', 'to see better.', 32, 4, 0],
# ['videos/demo2.mp4', 'What did both of them do after completing skiing?', 'jump and pose.' , 'bend down.','raised their hands.', 32, 4, 0],
# ['videos/demo3.mp4', 'What room was Wilson breaking into when House found him?', 'the kitchen.' , 'the dining room.','the bathroom.', 32, 4, 0]]
# )
with gr.Column():
gr.Examples(
inputs=[video, question, option1, option2, option3, video_frame_num, keyframe_num],
outputs=[keyframes, timestamps, answer],
fn=sevila_demo,
examples=[['videos/demo1.mp4', 'Why did the two ladies put their hands above their eyes while staring out?', 'practicing cheer', 'to place wreaths', 'to see better', 32, 4],
['videos/demo2.mp4', 'What did both of them do after completing skiing?', 'jump and pose' , 'bend down','raised their hands', 32, 4],
['videos/demo3.mp4', 'What room was Wilson breaking into when House found him?', 'the bedroom' , 'the bathroom','the kitchen', 32, 4],
        ['videos/demo4.mp4', 'What kind of bird is it?', 'chickadee', 'eagle', 'sparrow', 32, 1]],
cache_examples=False,
)
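# serve one request at a time; api_open=False keeps direct API calls from bypassing the queue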
demo.queue(concurrency_count=1, api_open=False)
demo.launch(share=False)