Shoubin committed
Commit 0ea72e5
Parent: 3cd74d2

Update app.py

Files changed (1): app.py +25 -25
app.py CHANGED
@@ -34,23 +34,23 @@ image_size = img_size
 transform = transforms.Compose([ToUint8(), ToTHWC(), transforms_video.ToTensorVideo(), normalize])
 
 print('model loading')
-sevila = SeViLA(
-    img_size=img_size,
-    drop_path_rate=drop_path_rate,
-    use_grad_checkpoint=use_grad_checkpoint,
-    vit_precision=vit_precision,
-    freeze_vit=freeze_vit,
-    num_query_token=num_query_token,
-    t5_model=t5_model,
-    prompt=prompt,
-    max_txt_len=max_txt_len,
-    apply_lemmatizer=apply_lemmatizer,
-    frame_num=4,
-    answer_num=answer_num,
-    task=task,
-    )
+# sevila = SeViLA(
+#     img_size=img_size,
+#     drop_path_rate=drop_path_rate,
+#     use_grad_checkpoint=use_grad_checkpoint,
+#     vit_precision=vit_precision,
+#     freeze_vit=freeze_vit,
+#     num_query_token=num_query_token,
+#     t5_model=t5_model,
+#     prompt=prompt,
+#     max_txt_len=max_txt_len,
+#     apply_lemmatizer=apply_lemmatizer,
+#     frame_num=4,
+#     answer_num=answer_num,
+#     task=task,
+#     )
 
-sevila.load_checkpoint(url_or_filename='https://huggingface.co/Shoubin/SeViLA/resolve/main/sevila_pretrained.pth')
+# sevila.load_checkpoint(url_or_filename='https://huggingface.co/Shoubin/SeViLA/resolve/main/sevila_pretrained.pth')
 print('model loaded')
 
 ANS_MAPPING = {0 : 'A', 1 : 'B', 2 : 'C', 3 : 'D', 4 : 'E'}
@@ -68,11 +68,11 @@ def sevila_demo(video,
     else:
         device = 'cpu'
 
-    global sevila
-    if device == "cpu":
-        sevila = sevila.float()
-    else:
-        sevila = sevila.to(int(device))
+    # global sevila
+    # if device == "cpu":
+    #     sevila = sevila.float()
+    # else:
+    #     sevila = sevila.to(int(device))
 
     vpath = video
     raw_clip, indice, fps, vlen = load_video_demo(
@@ -98,11 +98,11 @@ def sevila_demo(video,
     text_input_qa = 'Question: ' + question + ' ' + options + ' ' + QA_prompt
     text_input_loc = 'Question: ' + question + ' ' + options + ' ' + LOC_propmpt
 
-    out = sevila.generate_demo(clip, text_input_qa, text_input_loc, int(keyframe_num))
+    # out = sevila.generate_demo(clip, text_input_qa, text_input_loc, int(keyframe_num))
     # print(out)
-    answer_id = out['output_text'][0]
+    answer_id = 0 #out['output_text'][0]
     answer = option_dict[answer_id]
-    select_index = out['frame_idx'][0]
+    select_index = [1,2,3,4]#out['frame_idx'][0]
     # images = []
     keyframes = []
     timestamps =[]
@@ -170,7 +170,7 @@ with gr.Blocks(title="SeViLA demo") as demo:
             keyframe_num = gr.Textbox(placeholder=4, label='# Keyframe')
             # device = gr.Textbox(placeholder=0, label='Device')
            gen_btn = gr.Button(value='Locate and Answer!')
-        with gr.Column(scale=1, min_width=100):
+        with gr.Column(scale=1, min_width=600):
             keyframes = gr.Gallery(
                 label="Keyframes", show_label=False, elem_id="gallery", max_width=100, max_height=100,
             ).style(columns=[4], rows=[1], object_fit="contain", height='auto')
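
With the model construction, checkpoint load, and generate_demo call commented out, sevila_demo falls back to fixed placeholder outputs. Below is a minimal runnable sketch of that stub path, assuming option_dict maps answer indices to option strings and that timestamps are derived by dividing frame indices by fps; stub_demo_outputs, the example option list, and the timestamp arithmetic are illustrative assumptions, not code from app.py.

ANS_MAPPING = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E'}  # copied from the diff context

def stub_demo_outputs(options, fps=30.0):
    # Placeholder values hard-coded by this commit in sevila_demo:
    answer_id = 0                # was: out['output_text'][0]
    select_index = [1, 2, 3, 4]  # was: out['frame_idx'][0]
    option_dict = {i: opt for i, opt in enumerate(options)}  # assumed shape of option_dict
    answer = option_dict[answer_id]
    letter = ANS_MAPPING[answer_id]
    timestamps = [round(i / fps, 2) for i in select_index]  # assumed frame -> second mapping
    return letter, answer, select_index, timestamps

# Whatever video and question are supplied, the UI now always reports
# option A and keyframes 1-4:
print(stub_demo_outputs(['a dog', 'a cat', 'a bird']))
# -> ('A', 'a dog', [1, 2, 3, 4], [0.03, 0.07, 0.1, 0.13])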