ClownRat committed
Commit 506722a • 1 Parent(s): d352d51

update inference model.

Files changed (2):
1. app.py +5 -3
2. videollama2/mm_utils.py +4 -1
app.py CHANGED
@@ -7,7 +7,7 @@ import gradio as gr
 
 import sys
 sys.path.append('./')
-from videollama2.constants import MMODAL_TOKEN_INDEX, DEFAULT_MMODAL_TOKEN
+from videollama2.constants import NUM_FRAMES, MMODAL_TOKEN_INDEX, DEFAULT_MMODAL_TOKEN
 from videollama2.conversation import conv_templates, SeparatorStyle, Conversation
 from videollama2.model.builder import load_pretrained_model
 from videollama2.mm_utils import KeywordsStoppingCriteria, tokenizer_MMODAL_token, get_model_name_from_path, process_image, process_video
@@ -155,12 +155,14 @@ def generate(image, video, state, state_, textbox_in, temperature, top_p, max_ou
 
     text_en_in = textbox_in.replace("picture", "image")
 
+    num_frames = handler.model.config.num_frames if hasattr(handler.model.config, "num_frames") else NUM_FRAMES
+
     processor = handler.processor
     if os.path.exists(image) and not os.path.exists(video):
         tensor.append(process_image(image, processor).to(handler.model.device, dtype=dtype))
         modals.append('IMAGE')
     if not os.path.exists(image) and os.path.exists(video):
-        tensor.append(process_video(video, processor).to(handler.model.device, dtype=dtype))
+        tensor.append(process_video(video, processor, num_frames=num_frames, sample_scheme='fps').to(handler.model.device, dtype=dtype))
         modals.append('VIDEO')
     if os.path.exists(image) and os.path.exists(video):
         raise NotImplementedError("Not support image and video at the same time")
@@ -222,7 +224,7 @@ def clear_history(state, state_):
 # 3. The function can't return tensor or other cuda objects.
 
 conv_mode = "llama_2"
-model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B'
+model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B-16F'
 
 device = torch.device("cuda")
 
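For context, a minimal sketch of what the updated video branch in app.py now does, written as a standalone helper rather than the app's actual code (the `handler`, `dtype`, and `NUM_FRAMES` names follow the diff above; the helper name `build_video_tensor` is made up for illustration, and `getattr` with a default stands in for the `hasattr` check):

```python
import os

from videollama2.constants import NUM_FRAMES
from videollama2.mm_utils import process_video


def build_video_tensor(video_path, handler, dtype):
    # Use the frame count stored in the checkpoint config when available
    # (16 for the -16F model); otherwise fall back to the library default.
    num_frames = getattr(handler.model.config, "num_frames", NUM_FRAMES)
    if not os.path.exists(video_path):
        raise FileNotFoundError(video_path)
    # 'fps' sampling draws roughly one frame per second of video; the
    # mm_utils change below tops this up to num_frames for short clips.
    video_tensor = process_video(video_path, handler.processor,
                                 num_frames=num_frames, sample_scheme='fps')
    return video_tensor.to(handler.model.device, dtype=dtype)
```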
videollama2/mm_utils.py CHANGED
@@ -381,7 +381,10 @@ def process_video(video_path, processor, aspect_ratio='pad', num_frames=NUM_FRAM
         elif mode == 'fps':
             assert local_fps is not None
             segment_len = min(local_fps // NUM_FRAMES_PER_SECOND, duration)
-            return np.arange(segment_len // 2, duration, segment_len, dtype=int)
+            frame_id_list = np.arange(segment_len // 2, duration, segment_len, dtype=int)
+            if len(frame_id_list) < num_frames:
+                frame_id_list = np.linspace(0, duration-1, num_frames, dtype=int)
+            return frame_id_list
         else:
             raise ImportError(f'Unsupported frame sampling mode: {mode}')
 
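To see why the fallback matters, here is a standalone re-implementation of the patched 'fps' branch (a sketch assuming `NUM_FRAMES_PER_SECOND = 1`, matching one-frame-per-second sampling; `duration` is the total number of frames in the clip):

```python
import numpy as np

NUM_FRAMES_PER_SECOND = 1  # assumed value; matches 1-frame-per-second sampling


def fps_frame_ids(duration, local_fps, num_frames):
    """Standalone sketch of the patched 'fps' sampling branch."""
    segment_len = min(local_fps // NUM_FRAMES_PER_SECOND, duration)
    frame_id_list = np.arange(segment_len // 2, duration, segment_len, dtype=int)
    # New fallback: short clips no longer yield fewer than num_frames indices,
    # which is what the 16-frame checkpoint expects as input.
    if len(frame_id_list) < num_frames:
        frame_id_list = np.linspace(0, duration - 1, num_frames, dtype=int)
    return frame_id_list


# A 3-second clip at 30 fps has duration = 90 frames. Plain 1-fps sampling
# yields only [15, 45, 75]; the fallback pads this out to 16 evenly spaced ids.
print(fps_frame_ids(duration=90, local_fps=30, num_frames=16))
```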