Chat-UniVi commited on
Commit
164f596
1 Parent(s): a70ee08

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +11 -19
README.md CHANGED
@@ -39,11 +39,6 @@ import numpy as np
39
 
40
  def _get_rawvideo_dec(video_path, image_processor, max_frames=MAX_IMAGE_LENGTH, image_resolution=224, video_framerate=1, s=None, e=None):
41
  # speed up video decode via decord.
42
- video_mask = np.zeros(max_frames, dtype=np.int64)
43
- max_video_length = 0
44
-
45
- # T x 3 x H x W
46
- video = np.zeros((max_frames, 3, image_resolution, image_resolution), dtype=np.float64)
47
 
48
  if s is None:
49
  start_time, end_time = None, None
@@ -83,25 +78,22 @@ def _get_rawvideo_dec(video_path, image_processor, max_frames=MAX_IMAGE_LENGTH,
83
  patch_images = torch.stack([image_processor.preprocess(img, return_tensors='pt')['pixel_values'][0] for img in patch_images])
84
  slice_len = patch_images.shape[0]
85
 
86
- max_video_length = max_video_length if max_video_length > slice_len else slice_len
87
- if slice_len < 1:
88
- pass
89
- else:
90
- video[:slice_len, ...] = patch_images
91
-
92
- return patch_images, video_mask
93
  else:
94
  print("video path: {} error.".format(video_path))
95
 
96
- video_mask[:max_video_length] = [1] * max_video_length
97
-
98
- return torch.from_numpy(video), video_mask
99
 
100
  if __name__ == '__main__':
101
  # Model Parameter
102
  model_path = "Chat-UniVi/Chat-UniVi" # or "Chat-UniVi/Chat-UniVi-13B"
103
  video_path = ${video_path}
104
- max_frames = ${max_frames}
 
 
 
 
 
 
105
 
106
  # Input Text
107
  qs = "Describe the video."
@@ -136,13 +128,13 @@ if __name__ == '__main__':
136
 
137
  # Check if the video exists
138
  if video_path is not None:
139
- video_frames, _ = _get_rawvideo_dec(video_path, image_processor, max_frames=max_frames)
140
 
141
  cur_prompt = qs
142
  if model.config.mm_use_im_start_end:
143
- qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN * MAX_IMAGE_LENGTH + DEFAULT_IM_END_TOKEN + '\n' + qs
144
  else:
145
- qs = DEFAULT_IMAGE_TOKEN * MAX_IMAGE_LENGTH + '\n' + qs
146
 
147
  conv = conv_templates[conv_mode].copy()
148
  conv.append_message(conv.roles[0], qs)
 
39
 
40
  def _get_rawvideo_dec(video_path, image_processor, max_frames=MAX_IMAGE_LENGTH, image_resolution=224, video_framerate=1, s=None, e=None):
41
  # speed up video decode via decord.
 
 
 
 
 
42
 
43
  if s is None:
44
  start_time, end_time = None, None
 
78
  patch_images = torch.stack([image_processor.preprocess(img, return_tensors='pt')['pixel_values'][0] for img in patch_images])
79
  slice_len = patch_images.shape[0]
80
 
81
+ return patch_images, slice_len
 
 
 
 
 
 
82
  else:
83
  print("video path: {} error.".format(video_path))
84
 
 
 
 
85
 
86
  if __name__ == '__main__':
87
  # Model Parameter
88
  model_path = "Chat-UniVi/Chat-UniVi" # or "Chat-UniVi/Chat-UniVi-13B"
89
  video_path = ${video_path}
90
+
91
+ # The number of visual tokens varies with the length of the video. "max_frames" is the maximum number of frames.
92
+ # When the video is long, we uniformly downsample it so that the number of sampled frames equals "max_frames".
93
+ max_frames = 100
94
+
95
+ # The number of frames retained per second in the video.
96
+ video_framerate = 1
97
 
98
  # Input Text
99
  qs = "Describe the video."
 
128
 
129
  # Check if the video exists
130
  if video_path is not None:
131
+ video_frames, slice_len = _get_rawvideo_dec(video_path, image_processor, max_frames=max_frames, video_framerate=video_framerate)
132
 
133
  cur_prompt = qs
134
  if model.config.mm_use_im_start_end:
135
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN * slice_len + DEFAULT_IM_END_TOKEN + '\n' + qs
136
  else:
137
+ qs = DEFAULT_IMAGE_TOKEN * slice_len + '\n' + qs
138
 
139
  conv = conv_templates[conv_mode].copy()
140
  conv.append_message(conv.roles[0], qs)