ClownRat committed
Commit 506722a • 1 Parent(s): d352d51

update inference model.

Files changed (2):
1. app.py +5 -3
2. videollama2/mm_utils.py +4 -1
app.py CHANGED
@@ -7,7 +7,7 @@ import gradio as gr
 
 import sys
 sys.path.append('./')
-from videollama2.constants import MMODAL_TOKEN_INDEX, DEFAULT_MMODAL_TOKEN
+from videollama2.constants import NUM_FRAMES, MMODAL_TOKEN_INDEX, DEFAULT_MMODAL_TOKEN
 from videollama2.conversation import conv_templates, SeparatorStyle, Conversation
 from videollama2.model.builder import load_pretrained_model
 from videollama2.mm_utils import KeywordsStoppingCriteria, tokenizer_MMODAL_token, get_model_name_from_path, process_image, process_video
@@ -155,12 +155,14 @@ def generate(image, video, state, state_, textbox_in, temperature, top_p, max_ou
 
     text_en_in = textbox_in.replace("picture", "image")
 
+    num_frames = handler.model.config.num_frames if hasattr(handler.model.config, "num_frames") else NUM_FRAMES
+
     processor = handler.processor
     if os.path.exists(image) and not os.path.exists(video):
         tensor.append(process_image(image, processor).to(handler.model.device, dtype=dtype))
         modals.append('IMAGE')
     if not os.path.exists(image) and os.path.exists(video):
-        tensor.append(process_video(video, processor).to(handler.model.device, dtype=dtype))
+        tensor.append(process_video(video, processor, num_frames=num_frames, sample_scheme='fps').to(handler.model.device, dtype=dtype))
         modals.append('VIDEO')
     if os.path.exists(image) and os.path.exists(video):
         raise NotImplementedError("Not support image and video at the same time")
@@ -222,7 +224,7 @@ def clear_history(state, state_):
 # 3. The function can't return tensor or other cuda objects.
 
 conv_mode = "llama_2"
-model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B'
+model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B-16F'
 
 device = torch.device("cuda")
 
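For context, a minimal sketch of what the updated video branch in app.py now does, written as a standalone helper rather than the app's actual code (the `handler`, `dtype`, and `NUM_FRAMES` names follow the diff above; the helper name `build_video_tensor` is made up for illustration, and `getattr` with a default stands in for the `hasattr` check):

```python
import os

from videollama2.constants import NUM_FRAMES
from videollama2.mm_utils import process_video


def build_video_tensor(video_path, handler, dtype):
    # Use the frame count stored in the checkpoint config when available
    # (16 for the -16F model); otherwise fall back to the library default.
    num_frames = getattr(handler.model.config, "num_frames", NUM_FRAMES)
    if not os.path.exists(video_path):
        raise FileNotFoundError(video_path)
    # 'fps' sampling draws roughly one frame per second of video; the
    # mm_utils change below tops this up to num_frames for short clips.
    video_tensor = process_video(video_path, handler.processor,
                                 num_frames=num_frames, sample_scheme='fps')
    return video_tensor.to(handler.model.device, dtype=dtype)
```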
videollama2/mm_utils.py CHANGED
@@ -381,7 +381,10 @@ def process_video(video_path, processor, aspect_ratio='pad', num_frames=NUM_FRAM
         elif mode == 'fps':
             assert local_fps is not None
             segment_len = min(local_fps // NUM_FRAMES_PER_SECOND, duration)
-            return np.arange(segment_len // 2, duration, segment_len, dtype=int)
+            frame_id_list = np.arange(segment_len // 2, duration, segment_len, dtype=int)
+            if len(frame_id_list) < num_frames:
+                frame_id_list = np.linspace(0, duration-1, num_frames, dtype=int)
+            return frame_id_list
         else:
             raise ImportError(f'Unsupported frame sampling mode: {mode}')
 
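To see why the fallback matters, here is a standalone re-implementation of the patched 'fps' branch (a sketch assuming `NUM_FRAMES_PER_SECOND = 1`, matching one-frame-per-second sampling; `duration` is the total number of frames in the clip):

```python
import numpy as np

NUM_FRAMES_PER_SECOND = 1  # assumed value; matches 1-frame-per-second sampling


def fps_frame_ids(duration, local_fps, num_frames):
    """Standalone sketch of the patched 'fps' sampling branch."""
    segment_len = min(local_fps // NUM_FRAMES_PER_SECOND, duration)
    frame_id_list = np.arange(segment_len // 2, duration, segment_len, dtype=int)
    # New fallback: short clips no longer yield fewer than num_frames indices,
    # which is what the 16-frame checkpoint expects as input.
    if len(frame_id_list) < num_frames:
        frame_id_list = np.linspace(0, duration - 1, num_frames, dtype=int)
    return frame_id_list


# A 3-second clip at 30 fps has duration = 90 frames. Plain 1-fps sampling
# yields only [15, 45, 75]; the fallback pads this out to 16 evenly spaced ids.
print(fps_frame_ids(duration=90, local_fps=30, num_frames=16))
```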