Chat-UniVi commited on
Commit
164f596
1 Parent(s): a70ee08

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +11 -19
README.md CHANGED
@@ -39,11 +39,6 @@ import numpy as np
39
 
40
  def _get_rawvideo_dec(video_path, image_processor, max_frames=MAX_IMAGE_LENGTH, image_resolution=224, video_framerate=1, s=None, e=None):
41
  # speed up video decode via decord.
42
- video_mask = np.zeros(max_frames, dtype=np.int64)
43
- max_video_length = 0
44
-
45
- # T x 3 x H x W
46
- video = np.zeros((max_frames, 3, image_resolution, image_resolution), dtype=np.float64)
47
 
48
  if s is None:
49
  start_time, end_time = None, None
@@ -83,25 +78,22 @@ def _get_rawvideo_dec(video_path, image_processor, max_frames=MAX_IMAGE_LENGTH,
83
  patch_images = torch.stack([image_processor.preprocess(img, return_tensors='pt')['pixel_values'][0] for img in patch_images])
84
  slice_len = patch_images.shape[0]
85
 
86
- max_video_length = max_video_length if max_video_length > slice_len else slice_len
87
- if slice_len < 1:
88
- pass
89
- else:
90
- video[:slice_len, ...] = patch_images
91
-
92
- return patch_images, video_mask
93
  else:
94
  print("video path: {} error.".format(video_path))
95
 
96
- video_mask[:max_video_length] = [1] * max_video_length
97
-
98
- return torch.from_numpy(video), video_mask
99
 
100
  if __name__ == '__main__':
101
  # Model Parameter
102
  model_path = "Chat-UniVi/Chat-UniVi" # or "Chat-UniVi/Chat-UniVi-13B"
103
  video_path = ${video_path}
104
- max_frames = ${max_frames}
 
 
 
 
 
 
105
 
106
  # Input Text
107
  qs = "Describe the video."
@@ -136,13 +128,13 @@ if __name__ == '__main__':
136
 
137
  # Check if the video exists
138
  if video_path is not None:
139
- video_frames, _ = _get_rawvideo_dec(video_path, image_processor, max_frames=max_frames)
140
 
141
  cur_prompt = qs
142
  if model.config.mm_use_im_start_end:
143
- qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN * MAX_IMAGE_LENGTH + DEFAULT_IM_END_TOKEN + '\n' + qs
144
  else:
145
- qs = DEFAULT_IMAGE_TOKEN * MAX_IMAGE_LENGTH + '\n' + qs
146
 
147
  conv = conv_templates[conv_mode].copy()
148
  conv.append_message(conv.roles[0], qs)
 
39
 
40
  def _get_rawvideo_dec(video_path, image_processor, max_frames=MAX_IMAGE_LENGTH, image_resolution=224, video_framerate=1, s=None, e=None):
41
  # speed up video decode via decord.
 
 
 
 
 
42
 
43
  if s is None:
44
  start_time, end_time = None, None
 
78
  patch_images = torch.stack([image_processor.preprocess(img, return_tensors='pt')['pixel_values'][0] for img in patch_images])
79
  slice_len = patch_images.shape[0]
80
 
81
+ return patch_images, slice_len
 
 
 
 
 
 
82
  else:
83
  print("video path: {} error.".format(video_path))
84
 
 
 
 
85
 
86
  if __name__ == '__main__':
87
  # Model Parameter
88
  model_path = "Chat-UniVi/Chat-UniVi" # or "Chat-UniVi/Chat-UniVi-13B"
89
  video_path = ${video_path}
90
+
91
+ # The number of visual tokens varies with the length of the video. "max_frames" is the maximum number of frames.
92
+ # When the video is long, we uniformly downsample it so that the number of sampled frames equals "max_frames".
93
+ max_frames = 100
94
+
95
+ # The number of frames retained per second in the video.
96
+ video_framerate = 1
97
 
98
  # Input Text
99
  qs = "Describe the video."
 
128
 
129
  # Check if the video exists
130
  if video_path is not None:
131
+ video_frames, slice_len = _get_rawvideo_dec(video_path, image_processor, max_frames=max_frames, video_framerate=video_framerate)
132
 
133
  cur_prompt = qs
134
  if model.config.mm_use_im_start_end:
135
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN * slice_len + DEFAULT_IM_END_TOKEN + '\n' + qs
136
  else:
137
+ qs = DEFAULT_IMAGE_TOKEN * slice_len + '\n' + qs
138
 
139
  conv = conv_templates[conv_mode].copy()
140
  conv.append_message(conv.roles[0], qs)