Shoubin committed on
Commit
530a71c
1 Parent(s): 4bdb13f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -24
app.py CHANGED
@@ -34,23 +34,23 @@ image_size = img_size
34
  transform = transforms.Compose([ToUint8(), ToTHWC(), transforms_video.ToTensorVideo(), normalize])
35
 
36
  print('model loading')
37
- # sevila = SeViLA(
38
- # img_size=img_size,
39
- # drop_path_rate=drop_path_rate,
40
- # use_grad_checkpoint=use_grad_checkpoint,
41
- # vit_precision=vit_precision,
42
- # freeze_vit=freeze_vit,
43
- # num_query_token=num_query_token,
44
- # t5_model=t5_model,
45
- # prompt=prompt,
46
- # max_txt_len=max_txt_len,
47
- # apply_lemmatizer=apply_lemmatizer,
48
- # frame_num=4,
49
- # answer_num=answer_num,
50
- # task=task,
51
- # )
52
 
53
- # sevila.load_checkpoint(url_or_filename='https://huggingface.co/Shoubin/SeViLA/resolve/main/sevila_pretrained.pth')
54
  print('model loaded')
55
 
56
  ANS_MAPPING = {0 : 'A', 1 : 'B', 2 : 'C', 3 : 'D', 4 : 'E'}
@@ -68,11 +68,11 @@ def sevila_demo(video,
68
  else:
69
  device = 'cpu'
70
 
71
- # global sevila
72
- # if device == "cpu":
73
- # sevila = sevila.float()
74
- # else:
75
- # sevila = sevila.to(int(device))
76
 
77
  vpath = video
78
  raw_clip, indice, fps, vlen = load_video_demo(
@@ -98,11 +98,11 @@ def sevila_demo(video,
98
  text_input_qa = 'Question: ' + question + ' ' + options + ' ' + QA_prompt
99
  text_input_loc = 'Question: ' + question + ' ' + options + ' ' + LOC_propmpt
100
 
101
- # out = sevila.generate_demo(clip, text_input_qa, text_input_loc, int(keyframe_num))
102
  # print(out)
103
- answer_id = 0 #out['output_text'][0]
104
  answer = option_dict[answer_id]
105
- select_index = [1,2,3,4]#out['frame_idx'][0]
106
  # images = []
107
  keyframes = []
108
  timestamps =[]
 
34
  transform = transforms.Compose([ToUint8(), ToTHWC(), transforms_video.ToTensorVideo(), normalize])
35
 
36
  print('model loading')
37
+ sevila = SeViLA(
38
+ img_size=img_size,
39
+ drop_path_rate=drop_path_rate,
40
+ use_grad_checkpoint=use_grad_checkpoint,
41
+ vit_precision=vit_precision,
42
+ freeze_vit=freeze_vit,
43
+ num_query_token=num_query_token,
44
+ t5_model=t5_model,
45
+ prompt=prompt,
46
+ max_txt_len=max_txt_len,
47
+ apply_lemmatizer=apply_lemmatizer,
48
+ frame_num=4,
49
+ answer_num=answer_num,
50
+ task=task,
51
+ )
52
 
53
+ sevila.load_checkpoint(url_or_filename='https://huggingface.co/Shoubin/SeViLA/resolve/main/sevila_pretrained.pth')
54
  print('model loaded')
55
 
56
  ANS_MAPPING = {0 : 'A', 1 : 'B', 2 : 'C', 3 : 'D', 4 : 'E'}
 
68
  else:
69
  device = 'cpu'
70
 
71
+ global sevila
72
+ if device == "cpu":
73
+ sevila = sevila.float()
74
+ else:
75
+ sevila = sevila.to(int(device))
76
 
77
  vpath = video
78
  raw_clip, indice, fps, vlen = load_video_demo(
 
98
  text_input_qa = 'Question: ' + question + ' ' + options + ' ' + QA_prompt
99
  text_input_loc = 'Question: ' + question + ' ' + options + ' ' + LOC_propmpt
100
 
101
+ out = sevila.generate_demo(clip, text_input_qa, text_input_loc, int(keyframe_num))
102
  # print(out)
103
+ answer_id = out['output_text'][0]
104
  answer = option_dict[answer_id]
105
+ select_index = out['frame_idx'][0]
106
  # images = []
107
  keyframes = []
108
  timestamps =[]