ClownRat committed on
Commit
e428df4
·
1 Parent(s): e6c8196

init demo.

Browse files
Files changed (46) hide show
  1. .gitattributes +1 -0
  2. app.py +279 -52
  3. examples/desert.jpg +0 -0
  4. examples/extreme_ironing.jpg +0 -0
  5. examples/waterview.jpg +0 -0
  6. requirements.txt +31 -1
  7. videollama2/__init__.py +1 -0
  8. videollama2/constants.py +38 -0
  9. videollama2/conversation.py +484 -0
  10. videollama2/eval/eval_benchmark_1_correctness.py +210 -0
  11. videollama2/eval/eval_benchmark_2_detailed_orientation.py +210 -0
  12. videollama2/eval/eval_benchmark_3_context.py +212 -0
  13. videollama2/eval/eval_benchmark_4_temporal.py +206 -0
  14. videollama2/eval/eval_benchmark_5_consistency.py +218 -0
  15. videollama2/eval/eval_video_qa_gpt.py +219 -0
  16. videollama2/eval/eval_video_qa_mvbench.py +64 -0
  17. videollama2/eval/run_inference_video_qa_batch.py +563 -0
  18. videollama2/eval/run_inference_video_qa_gpt.py +151 -0
  19. videollama2/eval/run_inference_video_qa_gpt_consistency.py +182 -0
  20. videollama2/eval/run_inference_video_qa_gpt_general.py +177 -0
  21. videollama2/eval/run_inference_video_qa_perception_test_mcqa.py +214 -0
  22. videollama2/mm_utils.py +535 -0
  23. videollama2/model/__init__.py +3 -0
  24. videollama2/model/builder.py +170 -0
  25. videollama2/model/language_model/videollama2_llama.py +147 -0
  26. videollama2/model/language_model/videollama2_mistral.py +149 -0
  27. videollama2/model/language_model/videollama2_mixtral.py +149 -0
  28. videollama2/model/multimodal_encoder/builder.py +15 -0
  29. videollama2/model/multimodal_encoder/clip_encoder.py +84 -0
  30. videollama2/model/multimodal_projector/__init__.py +1 -0
  31. videollama2/model/multimodal_projector/builder.py +250 -0
  32. videollama2/model/videollama2_arch.py +346 -0
  33. videollama2/serve/cli.py +144 -0
  34. videollama2/serve/controller.py +298 -0
  35. videollama2/serve/examples/desert.jpg +0 -0
  36. videollama2/serve/examples/extreme_ironing.jpg +0 -0
  37. videollama2/serve/examples/waterview.jpg +0 -0
  38. videollama2/serve/gradio_web_server.py +503 -0
  39. videollama2/serve/model_worker.py +397 -0
  40. videollama2/serve/register_worker.py +26 -0
  41. videollama2/serve/sglang_worker.py +244 -0
  42. videollama2/serve/test_message.py +62 -0
  43. videollama2/train.py +963 -0
  44. videollama2/train_flash_attn.py +12 -0
  45. videollama2/utils.py +126 -0
  46. videollama2/videollama2_trainer.py +263 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,63 +1,290 @@
 
 
 
 
 
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
 
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
 
9
 
10
- def respond(
11
- message,
12
- history: list[tuple[str, str]],
13
- system_message,
14
- max_tokens,
15
- temperature,
16
- top_p,
17
- ):
18
- messages = [{"role": "system", "content": system_message}]
19
 
20
- for val in history:
21
- if val[0]:
22
- messages.append({"role": "user", "content": val[0]})
23
- if val[1]:
24
- messages.append({"role": "assistant", "content": val[1]})
25
 
26
- messages.append({"role": "user", "content": message})
 
 
 
27
 
28
- response = ""
29
 
30
- for message in client.chat_completion(
31
- messages,
32
- max_tokens=max_tokens,
33
- stream=True,
34
- temperature=temperature,
35
- top_p=top_p,
36
- ):
37
- token = message.choices[0].delta.content
 
 
 
 
38
 
39
- response += token
40
- yield response
 
 
41
 
42
- """
43
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
44
- """
45
- demo = gr.ChatInterface(
46
- respond,
47
- additional_inputs=[
48
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
49
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
50
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
51
- gr.Slider(
52
- minimum=0.1,
53
- maximum=1.0,
54
- value=0.95,
55
- step=0.05,
56
- label="Top-p (nucleus sampling)",
57
- ),
58
- ],
59
- )
60
-
61
-
62
- if __name__ == "__main__":
63
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+
4
+ import torch
5
+ import tempfile
6
  import gradio as gr
7
+ from PIL import Image
8
+ from fastapi import FastAPI
9
 
10
+ import sys
11
+ sys.path.append('./')
12
+ from videollama2.constants import MMODAL_TOKEN_INDEX, DEFAULT_MMODAL_TOKEN
13
+ from videollama2.conversation import conv_templates, SeparatorStyle, Conversation
14
+ from videollama2.model.builder import load_pretrained_model
15
+ from videollama2.mm_utils import KeywordsStoppingCriteria, tokenizer_MMODAL_token, get_model_name_from_path, process_image, process_video
16
+
17
+
18
# Static Markdown/HTML/CSS fragments rendered by the Gradio page.

# Page header: project logo, title and badge links.
# The "Github-Code" badge previously used the paper title as its href, which
# produced a dead relative link; it now points at the repository.
title_markdown = ("""
<div style="display: flex; justify-content: center; align-items: center; text-align: center;">
  <a href="https://github.com/DAMO-NLP-SG/VideoLLaMA2" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
    <img src="https://s2.loli.net/2024/06/03/D3NeXHWy5az9tmT.png" alt="VideoLLaMA2🚀" style="max-width: 120px; height: auto;">
  </a>
  <div>
    <h1 >VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs</h1>
    <h5 style="margin: 0;">If you like our project, please give us a star ✨ on Github for the latest update.</h5>
  </div>
</div>


<div align="center">
  <div style="display:flex; gap: 0.25rem;" align="center">
    <a href='https://github.com/DAMO-NLP-SG/VideoLLaMA2'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
    <a href="https://arxiv.org/pdf/2406.07476.pdf"><img src="https://img.shields.io/badge/Arxiv-2406.07476-red"></a>
    <a href='https://github.com/DAMO-NLP-SG/VideoLLaMA2/stargazers'><img src='https://img.shields.io/github/stars/DAMO-NLP-SG/VideoLLaMA2.svg?style=social'></a>
  </div>
</div>
""")


# Extra CSS passed to gr.Blocks; targets the row created with elem_id="buttons".
block_css = """
#buttons button {
    min-width: min(120px,100%);
}
"""


# Terms-of-use notice shown below the chat area.
tos_markdown = ("""
### Terms of use
By using this service, users are required to agree to the following terms:
The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderator.
For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
""")


# License notice shown at the bottom of the page.
learn_more_markdown = ("""
### License
The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.
""")
60
 
 
61
 
62
class Chat:
    """Thin wrapper around a pretrained VideoLLaMA2 checkpoint used by the demo.

    The constructor loads the tokenizer, model and processor through
    ``load_pretrained_model`` and keeps a copy of the conversation template
    named by ``conv_mode`` (used only to pick the stop string).
    """

    def __init__(self, model_path, conv_mode, model_base=None, load_8bit=False, load_4bit=False, device='cuda'):
        """Load the checkpoint at ``model_path`` and remember the conversation mode."""
        # disable_torch_init()
        model_name = get_model_name_from_path(model_path)
        # The fourth value returned by the loader (context length) is not used here.
        self.tokenizer, self.model, self.processor, _ = load_pretrained_model(
            model_path, model_base, model_name,
            load_8bit, load_4bit,
            device=device)
        self.conv_mode = conv_mode
        self.conv = conv_templates[conv_mode].copy()
        self.device = self.model.device

    def get_prompt(self, qs, state):
        """Append the user turn ``qs`` and an empty assistant turn to ``state``.

        ``state`` is mutated in place and also returned.
        """
        state.append_message(state.roles[0], qs)
        state.append_message(state.roles[1], None)
        return state

    @torch.inference_mode()
    def generate(self, tensor: list, modals: list, prompt: str, first_run: bool, state):
        """Generate a reply for ``prompt`` given the collected media tensors.

        Args:
            tensor: preprocessed media tensors, one per entry of ``modals``.
            modals: modality names (keys of ``MMODAL_TOKEN_INDEX``); the first
                entry selects the placeholder token index used for tokenization.
            prompt: user text for this turn.
            first_run: accepted for interface compatibility; not used.
            state: conversation object; the new turns are appended to it.

        Returns:
            ``(outputs, state)`` - the decoded text and the updated conversation.

        Raises:
            ValueError: if ``modals`` is empty (the tokenizer call needs a modality).
        """
        # TODO: support multiple turns of conversation.
        assert len(tensor) == len(modals)
        # Fail before touching ``state`` so a rejected request leaves the
        # conversation unchanged (previously this surfaced as a bare IndexError
        # after the prompt had already been appended).
        if not modals:
            raise ValueError("generate() needs at least one image or video input")

        # 1. prepare model, tokenizer, and processor.
        tokenizer, model = self.tokenizer, self.model

        # 2. text preprocess (tag process & generate prompt).
        state = self.get_prompt(prompt, state)
        prompt = state.get_prompt()
        input_ids = tokenizer_MMODAL_token(
            prompt, tokenizer, MMODAL_TOKEN_INDEX[modals[0]], return_tensors='pt'
        ).unsqueeze(0).to(self.device)

        # 3. generate response according to visual signals and prompts.
        if self.conv.sep_style == SeparatorStyle.SINGLE:
            stop_str = self.conv.sep
        else:
            stop_str = self.conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

        # The method decorator already runs this under torch.inference_mode(),
        # so no nested context manager is needed.
        output_ids = model.generate(
            input_ids,
            images_or_videos=tensor,
            modal_list=modals,
            do_sample=True,
            temperature=0.2,
            max_new_tokens=1024,
            use_cache=True,
            stopping_criteria=[stopping_criteria],
        )

        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
        print(outputs)
        return outputs, state
116
+
117
+
118
def save_image_to_local(image):
    """Re-save the image at path ``image`` as a JPEG under ``temp/``.

    Args:
        image: path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        Relative path of the newly written ``temp/<random>.jpg`` file.
    """
    import uuid  # local import: only used to mint a unique file name

    # Do not rely on the caller having created the directory.
    os.makedirs('temp', exist_ok=True)
    # uuid4 replaces the private tempfile._get_candidate_names() helper.
    filename = os.path.join('temp', uuid.uuid4().hex + '.jpg')
    # The context manager closes the source file handle once the copy is written.
    with Image.open(image) as img:
        # JPEG cannot store alpha/palette modes (e.g. RGBA PNG uploads).
        if img.mode not in ('RGB', 'L'):
            img = img.convert('RGB')
        img.save(filename)
    return filename
123
+
124
+
125
def save_video_to_local(video_path):
    """Copy the video at ``video_path`` into ``temp/`` under a unique ``.mp4`` name.

    Args:
        video_path: path of the source video file.

    Returns:
        Relative path of the copy, ``temp/<random>.mp4``.
    """
    import uuid  # local import: only used to mint a unique file name

    # Do not rely on the caller having created the directory.
    os.makedirs('temp', exist_ok=True)
    # uuid4 replaces the private tempfile._get_candidate_names() helper.
    filename = os.path.join('temp', uuid.uuid4().hex + '.mp4')
    shutil.copyfile(video_path, filename)
    return filename
129
+
130
+
131
def generate(image, video, first_run, state, state_, textbox_in, tensor, modals, dtype=torch.float16):
    """Gradio callback: run one chat turn and return the refreshed UI values.

    Args:
        image: path of the uploaded image, or a falsy value when none is set.
        video: path of the uploaded video, or a falsy value when none is set.
        first_run: incoming flag from the UI; recomputed from ``state`` here.
        state: conversation shown in the chatbot (includes media HTML).
        state_: conversation fed to the model (plain prompts).
        textbox_in: user text; when empty, the last message of ``state_`` is
            re-used (this is how the "Regenerate" button re-submits a turn).
        tensor: preprocessed media tensors accumulated for this session.
        modals: modality name for each entry of ``tensor``.
        dtype: dtype the media tensors are cast to.

    Returns:
        A 9-tuple matching the callback outputs:
        ``(image, video, chatbot, first_run, state, state_, textbox, tensor, modals)``.

    Raises:
        gr.Error: when there is no instruction, when both an image and a video
            are supplied, or when no media is available at all.
    """
    # The gr.State holders start out as None; build the session objects before
    # anything below reads them (previously ``state_.messages`` was accessed
    # first and failed on an empty first submission).
    if not isinstance(state, Conversation) or not isinstance(state_, Conversation):
        state = conv_templates[conv_mode].copy()
        state_ = conv_templates[conv_mode].copy()
        tensor = []
        modals = []

    image = image if image else "none"
    video = video if video else "none"
    has_image = os.path.exists(image)
    has_video = os.path.exists(video)

    # Validate the media before mutating any conversation state.
    if has_image and has_video:
        raise gr.Error("Not support image and video at the same time")
    if not (has_image or has_video or modals):
        # handler.generate() indexes modals[0], so a media-free session cannot run.
        raise gr.Error("Please upload an image or a video")

    flag = 1
    if not textbox_in:
        if len(state_.messages) > 0:
            # Re-submit the previous user message (regenerate path).
            textbox_in = state_.messages[-1][1]
            state_.messages.pop(-1)
            flag = 0
        else:
            # Raising keeps the output arity intact; returning a lone string
            # here did not match the nine declared outputs.
            raise gr.Error("Please enter instruction")

    first_run = not state.messages

    text_en_in = textbox_in.replace("picture", "image")

    processor = handler.processor
    # NOTE: only single image / single video inference is supported for now.
    if has_image:
        tensor.append(process_image(image, processor).to(handler.model.device, dtype=dtype))
        modals.append('IMAGE')
        # Make sure the prompt carries exactly one leading image token.
        text_en_in = text_en_in.replace(DEFAULT_MMODAL_TOKEN['IMAGE'], '').strip()
        text_en_in = DEFAULT_MMODAL_TOKEN['IMAGE'] + '\n' + text_en_in
    elif has_video:
        tensor.append(process_video(video, processor).to(handler.model.device, dtype=dtype))
        modals.append('VIDEO')
        # Make sure the prompt carries exactly one leading video token.
        text_en_in = text_en_in.replace(DEFAULT_MMODAL_TOKEN['VIDEO'], '').strip()
        text_en_in = DEFAULT_MMODAL_TOKEN['VIDEO'] + '\n' + text_en_in

    text_en_out, state_ = handler.generate(tensor, modals, text_en_in, first_run=first_run, state=state_)
    # Fill the assistant placeholder appended by handler.generate().
    state_.messages[-1] = (state_.roles[1], text_en_out)

    text_en_out = text_en_out.split('#')[0]
    textbox_out = text_en_out

    # Copy the media into temp/ so the chatbot HTML can reference it.
    show_images = ""
    if has_image:
        filename = save_image_to_local(image)
        show_images += f'<img src="./file={filename}" style="display: inline-block;width: 250px;max-height: 400px;">'
    if has_video:
        filename = save_video_to_local(video)
        show_images += f'<video controls playsinline width="500" style="display: inline-block;" src="./file={filename}"></video>'

    # On the regenerate path the user turn is already present in ``state``.
    if flag:
        state.append_message(state.roles[0], textbox_in + "\n" + show_images)
    state.append_message(state.roles[1], textbox_out)

    return (gr.update(value=image if has_image else None, interactive=True),
            gr.update(value=video if has_video else None, interactive=True),
            state.to_gradio_chatbot(), False, state, state_,
            gr.update(value=None, interactive=True), tensor, modals)
194
+
195
+
196
def regenerate(state, state_, textbox, tensor, modals):
    """Gradio callback: drop the last turn so ``generate`` can produce it again.

    Removes the last message from both conversations and the last media
    tensor/modality, then clears the textbox. Each pop is skipped when the
    corresponding container is unset or empty, so pressing "Regenerate" before
    any turn exists no longer raises.

    Returns:
        ``(state, state_, textbox, chatbot, first_run, tensor, modals)`` where
        ``first_run`` is True when no message is left in ``state``.
    """
    if state is not None and state.messages:
        state.messages.pop(-1)
    if state_ is not None and state_.messages:
        state_.messages.pop(-1)
    if tensor:
        tensor.pop(-1)
    if modals:
        modals.pop(-1)

    textbox = gr.update(value=None, interactive=True)
    # An unset conversation renders as an empty chat history.
    chatbot = state.to_gradio_chatbot() if state is not None else []
    first_run = not (state is not None and len(state.messages) > 0)
    return state, state_, textbox, chatbot, first_run, tensor, modals
205
+
206
+
207
def clear_history(state, state_, tensor, modals):
    """Gradio callback: reset the session to a fresh, empty conversation.

    The incoming values are ignored; new conversation objects are created from
    the template selected by ``conv_mode``.

    Returns:
        ``(image, video, chatbot, first_run, state, state_, textbox, tensor, modals)``
        with the three inputs cleared, ``first_run`` set to True and empty
        tensor/modality lists.
    """
    fresh_display = conv_templates[conv_mode].copy()
    fresh_model = conv_templates[conv_mode].copy()

    cleared_image = gr.update(value=None, interactive=True)
    cleared_video = gr.update(value=None, interactive=True)
    history = fresh_display.to_gradio_chatbot()
    cleared_textbox = gr.update(value=None, interactive=True)

    return (
        cleared_image,
        cleared_video,
        history,
        True,
        fresh_display,
        fresh_model,
        cleared_textbox,
        [],
        [],
    )
214
+
215
+
216
if __name__ == '__main__':
    # Script entry point: load the model once, build the Gradio page, launch it.
    # ``conv_mode`` and ``handler`` are module globals read by the callbacks above.
    conv_mode = "llama_2"
    model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B'

    handler = Chat(model_path, conv_mode=conv_mode, load_8bit=False, load_4bit=False, device='cuda')
    handler.model.to(dtype=torch.float16)

    # Directory the callbacks copy uploaded media into; exist_ok avoids the
    # check-then-create race of the previous exists()/makedirs() pair.
    os.makedirs("temp", exist_ok=True)

    # (The FastAPI instance that used to be created here was never used.)

    # Created outside the Blocks context so gr.Examples can reference it
    # before it is placed in the layout with textbox.render().
    textbox = gr.Textbox(
        show_label=False, placeholder="Enter text and press ENTER", container=False
    )
    with gr.Blocks(title='VideoLLaMA2🚀', theme=gr.themes.Default(), css=block_css) as demo:
        gr.Markdown(title_markdown)
        # Per-session values threaded through the callbacks.
        state = gr.State()
        state_ = gr.State()
        first_run = gr.State()
        tensor = gr.State()
        modals = gr.State()

        with gr.Row():
            # Left column: media inputs and clickable examples.
            with gr.Column(scale=3):
                image = gr.Image(label="Input Image", type="filepath")
                video = gr.Video(label="Input Video")

                cur_dir = os.path.dirname(os.path.abspath(__file__))
                gr.Examples(
                    examples=[
                        [
                            f"{cur_dir}/examples/extreme_ironing.jpg",
                            "What is unusual about this image?",
                        ],
                        [
                            f"{cur_dir}/examples/waterview.jpg",
                            "What are the things I should be cautious about when I visit here?",
                        ],
                        [
                            f"{cur_dir}/examples/desert.jpg",
                            "If there are factual errors in the questions, point it out; if not, proceed answering the question. What’s happening in the desert?",
                        ],
                    ],
                    inputs=[image, textbox],
                )

            # Right column: chat history, prompt box and action buttons.
            with gr.Column(scale=7):
                chatbot = gr.Chatbot(label="VideoLLaMA2", bubble_full_width=True).style(height=750)
                with gr.Row():
                    with gr.Column(scale=8):
                        textbox.render()
                    with gr.Column(scale=1, min_width=50):
                        submit_btn = gr.Button(value="Send", variant="primary", interactive=True)
                with gr.Row(elem_id="buttons") as button_row:
                    # NOTE: the vote buttons have no handlers attached in this file.
                    upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
                    downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
                    # flag_btn = gr.Button(value="⚠️ Flag", interactive=True)
                    # stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
                    regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=True)
                    clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)

        gr.Markdown(tos_markdown)
        gr.Markdown(learn_more_markdown)

        submit_btn.click(generate, [image, video, first_run, state, state_, textbox, tensor, modals],
                         [image, video, chatbot, first_run, state, state_, textbox, tensor, modals])

        # Regenerate = drop the last turn, then run generate with the (now empty) textbox.
        regenerate_btn.click(regenerate, [state, state_, textbox, tensor, modals], [state, state_, textbox, chatbot, first_run, tensor, modals]).then(
            generate, [image, video, first_run, state, state_, textbox, tensor, modals], [image, video, chatbot, first_run, state, state_, textbox, tensor, modals])

        clear_btn.click(clear_history, [state, state_, tensor, modals],
                        [image, video, chatbot, first_run, state, state_, textbox, tensor, modals])

    demo.launch()
examples/desert.jpg ADDED
examples/extreme_ironing.jpg ADDED
examples/waterview.jpg ADDED
requirements.txt CHANGED
@@ -1 +1,31 @@
1
- huggingface_hub==0.22.2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ huggingface_hub==0.22.2
2
+ torch>=2.0.1
3
+ torchvision>=0.15.2
4
+ transformers==4.37.2
5
+ tokenizers==0.15.1
6
+ sentencepiece==0.1.99
7
+ shortuuid
8
+ deepspeed==0.13.1
9
+ accelerate==0.21.0
10
+ peft==0.4.0
11
+ decord==0.6.0
12
+ pytorchvideo==0.1.5
13
+ imageio==2.34.0
14
+ imageio-ffmpeg==0.4.9
15
+ moviepy==1.0.3
16
+ scenedetect==0.6.3
17
+ numpy
18
+ scikit-learn==1.2.2
19
+ einops==0.6.1
20
+ einops-exts==0.0.4
21
+ timm==0.6.13
22
+ bitsandbytes==0.41.0
23
+ pydantic<2,>=1
24
+ markdown2[all]
25
+ gradio==3.35.2
26
+ gradio_client==0.2.9
27
+ requests
28
+ httpx==0.24.0
29
+ uvicorn
30
+ fastapi
31
+ wandb
videollama2/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .model import Videollama2LlamaForCausalLM, Videollama2MistralForCausalLM
videollama2/constants.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Serving constants.
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

LOGDIR = "./log_dir"

# Video sampling constants.
NUM_FRAMES = 8
MAX_FRAMES = 32
NUM_FRAMES_PER_SECOND = 1
Grids = [(2, 2), (1, 2), (1, 3), (1, 4), (2, 1), (3, 1), (4, 1)]

# Model Constants
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_VIDEO_TOKEN = "<video>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"
# (A second, identical set of the five image-token definitions above used to
# follow here; the duplicates were removed, values are unchanged.)


# Per-modality placeholder indices and their reverse mapping.
MMODAL_TOKEN_INDEX = {"IMAGE": -200, "VIDEO": -201, "AUDIO": -202}
MMODAL_INDEX_TOKEN = {v: k for k, v in MMODAL_TOKEN_INDEX.items()}
MMODAL_START_TOKEN_INDEX = {"IMAGE": "<im_start>", "VIDEO": "<vid_start>", "AUDIO": "<ad_start>"}
MMODAL_END_TOKEN_INDEX = {"IMAGE": "<im_end>", "VIDEO": "<vid_end>", "AUDIO": "<ad_end>"}


# Per-modality token strings.
DEFAULT_MMODAL_TOKEN = {"IMAGE": "<image>", "VIDEO": "<video>", "AUDIO": "<audio>"}
DEFAULT_MMODAL_PATCH_TOKEN = {"IMAGE": "<im_patch>", "VIDEO": "<vid_patch>", "AUDIO": "<ad_patch>"}
DEFAULT_MMODAL_START_TOKEN = {"IMAGE": "<Image>", "VIDEO": "<Video>", "AUDIO": "<ad_start>"}
# The backslashes are written as "\\" so the literals no longer rely on invalid
# escape sequences ("\I", "\V", "\A"); the runtime values are byte-identical.
# NOTE(review): these look like they were meant to be "</Image>"-style closing
# tags - confirm against the tokenizer/checkpoints before changing the values.
DEFAULT_MMODAL_END_TOKEN = {"IMAGE": "<\\Image>", "VIDEO": "<\\Video>", "AUDIO": "<\\Audio>"}
videollama2/conversation.py ADDED
@@ -0,0 +1,484 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import dataclasses
3
+ from io import BytesIO
4
+ from enum import auto, Enum
5
+ from typing import List, Tuple
6
+
7
+ from PIL import Image
8
+ from .constants import LOGDIR, NUM_FRAMES
9
+
10
+
11
class SeparatorStyle(Enum):
    """Different separator style."""
    SINGLE = auto()
    TWO = auto()
    MPT = auto()
    PLAIN = auto()
    LLAMA_2 = auto()


@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history.

    ``messages`` is a sequence of ``[role, message]`` pairs. A message is
    either a string, ``None`` (placeholder for the reply being generated), or
    a 3-tuple ``(text, media, image_process_mode)``.
    """
    system: str
    roles: List[str]
    messages: List[List[str]]
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "###"
    sep2: str = None  # second separator; only used by some styles
    version: str = "Unknown"

    skip_next: bool = False
    modality: str = "image"

    def get_prompt(self):
        """Render the conversation into a single prompt string.

        Returns:
            The prompt laid out according to ``self.sep_style``.

        Raises:
            ValueError: if ``self.sep_style`` is not a known style.
        """
        messages = self.messages
        modality_token = f"<{self.modality}>"
        if len(messages) > 0 and isinstance(messages[0][1], tuple):
            # Work on a shallow copy so self.messages is left untouched;
            # list() also accepts tuple-backed templates.
            messages = list(self.messages)
            init_role, init_msg = messages[0]
            init_msg = init_msg[0].replace(modality_token, "").strip()
            if 'mmtag' in self.version:
                messages[0] = (init_role, init_msg)
                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
                messages.insert(1, (self.roles[1], "Received."))
            else:
                messages[0] = (init_role, f"{modality_token}\n" + init_msg)

        if self.sep_style == SeparatorStyle.SINGLE:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if isinstance(message, tuple):
                        message, _, _ = message
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.TWO:
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(messages):
                if message:
                    if isinstance(message, tuple):
                        message, _, _ = message
                    ret += role + ": " + message + seps[i % 2]
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.MPT:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if isinstance(message, tuple):
                        message, _, _ = message
                    ret += role + message + self.sep
                else:
                    ret += role
        elif self.sep_style == SeparatorStyle.LLAMA_2:
            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n"
            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
            ret = ""

            for i, (role, message) in enumerate(messages):
                if i == 0:
                    assert message, "first message should not be none"
                    assert role == self.roles[0], "first message should come from user"
                if message:
                    if isinstance(message, tuple):
                        message, _, _ = message
                    if i == 0:
                        message = wrap_sys(self.system) + message
                    if i % 2 == 0:
                        message = wrap_inst(message)
                        ret += self.sep + message
                    else:
                        ret += " " + message + " " + self.sep2
            # Drop the leading separator as a prefix. str.lstrip() would treat
            # self.sep as a *set of characters*, which is not what is meant.
            if self.sep and ret.startswith(self.sep):
                ret = ret[len(self.sep):]
        elif self.sep_style == SeparatorStyle.PLAIN:
            seps = [self.sep, self.sep2]
            ret = self.system
            for i, (role, message) in enumerate(messages):
                if message:
                    if isinstance(message, tuple):
                        message, _, _ = message
                    ret += message + seps[i % 2]
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

        return ret

    def append_message(self, role, message):
        """Append a ``[role, message]`` pair to the history."""
        self.messages.append([role, message])

    def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=800, min_len=400):
        """Pad/resize a PIL image and return it as PIL or base64 text.

        Args:
            image: PIL image to process.
            image_process_mode: one of "Pad", "Default", "Crop", "Resize".
            return_pil: return the PIL image instead of a base64 string.
            image_format: format used when encoding to base64.
            max_len: images whose longest side exceeds this are downscaled.
            min_len: upper bound for the shortest side after downscaling.

        Raises:
            ValueError: on an unknown ``image_process_mode``.
        """
        if image_process_mode == "Pad":
            def expand2square(pil_img, background_color=(122, 116, 104)):
                # Centre the image on a square canvas of the longer side.
                width, height = pil_img.size
                if width == height:
                    return pil_img
                elif width > height:
                    result = Image.new(pil_img.mode, (width, width), background_color)
                    result.paste(pil_img, (0, (width - height) // 2))
                    return result
                else:
                    result = Image.new(pil_img.mode, (height, height), background_color)
                    result.paste(pil_img, ((height - width) // 2, 0))
                    return result
            image = expand2square(image)
        elif image_process_mode in ["Default", "Crop"]:
            pass
        elif image_process_mode == "Resize":
            image = image.resize((336, 336))
        else:
            raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
        if max(image.size) > max_len:
            # Downscale while keeping the aspect ratio.
            max_hw, min_hw = max(image.size), min(image.size)
            aspect_ratio = max_hw / min_hw
            shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
            longest_edge = int(shortest_edge * aspect_ratio)
            W, H = image.size
            if H > W:
                H, W = longest_edge, shortest_edge
            else:
                H, W = shortest_edge, longest_edge
            image = image.resize((W, H))
        if return_pil:
            return image
        else:
            buffered = BytesIO()
            image.save(buffered, format=image_format)
            img_b64_str = base64.b64encode(buffered.getvalue()).decode()
            return img_b64_str

    def get_videos(self, return_pil=False):
        """Collect the videos attached to user turns.

        With ``return_pil=False`` the stored video file paths are returned;
        otherwise ``NUM_FRAMES`` evenly spaced frames of each video are decoded
        and returned as processed PIL images.
        """
        video_frames = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if isinstance(msg, tuple):
                    # Imported lazily so decord/numpy are only required when a
                    # video message is actually present.
                    from decord import VideoReader, cpu
                    import numpy as np
                    # here video is the file path of input video
                    msg, video, image_process_mode = msg
                    if not return_pil:
                        # return filepath
                        video_frames.append(video)
                    else:
                        # read video using decord.VideoReader
                        decord_vr = VideoReader(uri=video, ctx=cpu(0))
                        duration = len(decord_vr)
                        frame_id_list = np.linspace(0, duration-1, NUM_FRAMES, dtype=int)
                        # convert the extracted image frames into PIL objects
                        all_images = [Image.fromarray(f) for f in decord_vr.get_batch(frame_id_list).asnumpy()]
                        video_frames.extend([self.process_image(image, image_process_mode, return_pil=return_pil) for image in all_images])
        return video_frames

    def get_images(self, return_pil=False):
        """Collect the processed images attached to user turns."""
        images = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if isinstance(msg, tuple):
                    msg, image, image_process_mode = msg
                    image = self.process_image(image, image_process_mode, return_pil=return_pil)
                    images.append(image)
        return images

    def to_gradio_chatbot(self):
        """Convert the history into ``[[user, assistant], ...]`` pairs for Gradio.

        Media attached to a user turn is rendered as inline HTML in front of
        the text: a base64 ``<img>`` for PIL images, a ``<video>`` tag pointing
        at the file path otherwise.
        """
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if isinstance(msg, tuple):
                    # display image/video in the textbox
                    msg, image_or_video, image_process_mode = msg
                    if isinstance(image_or_video, Image.Image):
                        # image is PIL object
                        img_b64_str = self.process_image(image_or_video, "Default", return_pil=False, image_format='JPEG')
                        img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
                        msg = img_str + msg.replace('<image>', '').strip()
                    else:
                        # video is file path
                        vid_str = f'<video controls playsinline width="500" style="display: inline-block;" src="./file={image_or_video}"></video><br>'
                        msg = vid_str + msg.replace('<video>', '').strip()
                    ret.append([msg, None])
                else:
                    ret.append([msg, None])
            else:
                # Assistant turn: fill the second slot of the preceding pair.
                ret[-1][-1] = msg
        return ret

    def copy(self):
        """Return a copy with an independent message list.

        ``modality`` is carried over as well; it used to be dropped, which
        silently reset copies of non-image templates to "image".
        ``skip_next`` is intentionally left at its default.
        """
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            version=self.version,
            modality=self.modality)

    def dict(self):
        """Return a plain-dict snapshot of the conversation.

        When media is attached, tuple messages are reduced to their text and
        the ``modality`` key is included.
        """
        if (self.modality == "image" and len(self.get_images()) > 0) or \
                (self.modality == "video" and len(self.get_videos()) > 0):
            return {
                "system": self.system,
                "roles": self.roles,
                "messages": [[x, y[0] if isinstance(y, tuple) else y] for x, y in self.messages],
                "offset": self.offset,
                "sep": self.sep,
                "sep2": self.sep2,
                "modality": self.modality
            }
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
            "sep": self.sep,
            "sep2": self.sep2,
        }
310
+
311
+ conv_mistral_instruct = Conversation(
312
+ system="A chat between a curious user and an artificial intelligence assistant. "
313
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
314
+ roles=("USER", "ASSISTANT"),
315
+ version="llama_v2",
316
+ messages=(),
317
+ offset=0,
318
+ sep_style=SeparatorStyle.LLAMA_2,
319
+ sep="",
320
+ sep2="</s>",
321
+ )
322
+ conv_vicuna_v0 = Conversation(
323
+ system="A chat between a curious human and an artificial intelligence assistant. "
324
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
325
+ roles=("Human", "Assistant"),
326
+ messages=(
327
+ ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
328
+ ("Assistant",
329
+ "Renewable energy sources are those that can be replenished naturally in a relatively "
330
+ "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
331
+ "Non-renewable energy sources, on the other hand, are finite and will eventually be "
332
+ "depleted, such as coal, oil, and natural gas. Here are some key differences between "
333
+ "renewable and non-renewable energy sources:\n"
334
+ "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
335
+ "energy sources are finite and will eventually run out.\n"
336
+ "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
337
+ "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
338
+ "and other negative effects.\n"
339
+ "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
340
+ "have lower operational costs than non-renewable sources.\n"
341
+ "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
342
+ "locations than non-renewable sources.\n"
343
+ "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
344
+ "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
345
+ "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
346
+ "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
347
+ ),
348
+ offset=2,
349
+ sep_style=SeparatorStyle.SINGLE,
350
+ sep="###",
351
+ )
352
+
353
+ conv_vicuna_v1 = Conversation(
354
+ system="A chat between a curious user and an artificial intelligence assistant. "
355
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
356
+ roles=("USER", "ASSISTANT"),
357
+ version="v1",
358
+ messages=(),
359
+ offset=0,
360
+ sep_style=SeparatorStyle.TWO,
361
+ sep=" ",
362
+ sep2="</s>",
363
+ )
364
+
365
+ conv_llama_2 = Conversation(
366
+ system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
367
+
368
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
369
+ roles=("USER", "ASSISTANT"),
370
+ version="llama_v2",
371
+ messages=(),
372
+ offset=0,
373
+ sep_style=SeparatorStyle.LLAMA_2,
374
+ sep="<s>",
375
+ sep2="</s>",
376
+ )
377
+
378
+ conv_llava_llama_2 = Conversation(
379
+ system="You are a helpful language and vision assistant. "
380
+ "You are able to understand the visual content that the user provides, "
381
+ "and assist the user with a variety of tasks using natural language.",
382
+ roles=("USER", "ASSISTANT"),
383
+ version="llama_v2",
384
+ messages=(),
385
+ offset=0,
386
+ sep_style=SeparatorStyle.LLAMA_2,
387
+ sep="<s>",
388
+ sep2="</s>",
389
+ )
390
+
391
+ conv_mpt = Conversation(
392
+ system="""<|im_start|>system
393
+ A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
394
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
395
+ version="mpt",
396
+ messages=(),
397
+ offset=0,
398
+ sep_style=SeparatorStyle.MPT,
399
+ sep="<|im_end|>",
400
+ )
401
+
402
+ conv_llava_plain = Conversation(
403
+ system="",
404
+ roles=("", ""),
405
+ messages=(
406
+ ),
407
+ offset=0,
408
+ sep_style=SeparatorStyle.PLAIN,
409
+ sep="\n",
410
+ )
411
+
412
+ conv_llava_v0 = Conversation(
413
+ system="A chat between a curious human and an artificial intelligence assistant. "
414
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
415
+ roles=("Human", "Assistant"),
416
+ messages=(
417
+ ),
418
+ offset=0,
419
+ sep_style=SeparatorStyle.SINGLE,
420
+ sep="###",
421
+ )
422
+
423
+ conv_llava_v0_mmtag = Conversation(
424
+ system="A chat between a curious user and an artificial intelligence assistant. "
425
+ "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
426
+ "The visual content will be provided with the following format: <Image>visual content</Image>.",
427
+ roles=("Human", "Assistant"),
428
+ messages=(
429
+ ),
430
+ offset=0,
431
+ sep_style=SeparatorStyle.SINGLE,
432
+ sep="###",
433
+ version="v0_mmtag",
434
+ )
435
+
436
+ conv_llava_v1 = Conversation(
437
+ system="A chat between a curious human and an artificial intelligence assistant. "
438
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
439
+ roles=("USER", "ASSISTANT"),
440
+ version="v1",
441
+ messages=(),
442
+ offset=0,
443
+ sep_style=SeparatorStyle.TWO,
444
+ sep=" ",
445
+ sep2="</s>",
446
+ )
447
+
448
+ conv_llava_v1_mmtag = Conversation(
449
+ system="A chat between a curious user and an artificial intelligence assistant. "
450
+ "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
451
+ "The visual content will be provided with the following format: <Image>visual content</Image>.",
452
+ roles=("USER", "ASSISTANT"),
453
+ messages=(),
454
+ offset=0,
455
+ sep_style=SeparatorStyle.TWO,
456
+ sep=" ",
457
+ sep2="</s>",
458
+ version="v1_mmtag",
459
+ )
460
+
461
+ default_conversation = conv_vicuna_v1
462
+ conv_templates = {
463
+ "default": conv_vicuna_v0,
464
+ "v0": conv_vicuna_v0,
465
+ "v1": conv_vicuna_v1,
466
+ "vicuna_v1": conv_vicuna_v1,
467
+ "llama_2": conv_llama_2,
468
+
469
+ "plain": conv_llava_plain,
470
+ "v0_plain": conv_llava_plain,
471
+ "llava_v0": conv_llava_v0,
472
+ "v0_mmtag": conv_llava_v0_mmtag,
473
+ "llava_v1": conv_llava_v1,
474
+ "v1_mmtag": conv_llava_v1_mmtag,
475
+ "llava_llama_2": conv_llava_llama_2,
476
+
477
+ "video_llama_beta": conv_llava_llama_2,
478
+ "mistral_instruct": conv_mistral_instruct,
479
+ "mpt": conv_mpt,
480
+ }
481
+
482
+
483
+ if __name__ == "__main__":
484
+ print(default_conversation.get_prompt())
videollama2/eval/eval_benchmark_1_correctness.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import ast
5
+ import traceback
6
+ from tqdm import tqdm
7
+ from multiprocessing.pool import Pool
8
+
9
+ from openai import AzureOpenAI
10
+
11
+
12
+ def init():
13
+ client = AzureOpenAI(
14
+ azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"),
15
+ api_key=os.getenv("AZURE_OPENAI_KEY"),
16
+ api_version="2024-02-15-preview"
17
+ )
18
+
19
+ return client
20
+
21
+
22
+ def interaction(client, message_text):
23
+ completion = client.chat.completions.create(
24
+ model=os.getenv("AZURE_OPENAI_DEPLOYNAME"),
25
+ messages = message_text,
26
+ temperature=0.7,
27
+ max_tokens=800,
28
+ top_p=0.95,
29
+ frequency_penalty=0,
30
+ presence_penalty=0,
31
+ stop=None
32
+ )
33
+
34
+ return completion
35
+
36
+
37
+ def annotate(prediction_set, caption_files, output_dir, args):
38
+ """
39
+ Evaluates question and answer pairs using GPT-3
40
+ Returns a score for correctness.
41
+ """
42
+
43
+ for file in tqdm(caption_files):
44
+ key = file[:-5] # Strip file extension
45
+ qa_set = prediction_set[key]
46
+ question = qa_set['q']
47
+ answer = qa_set['a']
48
+ pred = qa_set['p']
49
+ try:
50
+ message = [
51
+ {
52
+ "role": "system",
53
+ "content":
54
+ "You are an intelligent chatbot designed for evaluating the factual accuracy of generative outputs for video-based question-answer pairs. "
55
+ "Your task is to compare the predicted answer with the correct answer and determine if they are factually consistent. Here's how you can accomplish the task:"
56
+ "------"
57
+ "##INSTRUCTIONS: "
58
+ "- Focus on the factual consistency between the predicted answer and the correct answer. The predicted answer should not contain any misinterpretations or misinformation.\n"
59
+ "- The predicted answer must be factually accurate and align with the video content.\n"
60
+ "- Consider synonyms or paraphrases as valid matches.\n"
61
+ "- Evaluate the factual accuracy of the prediction compared to the answer."
62
+ },
63
+ {
64
+ "role": "user",
65
+ "content":
66
+ "Please evaluate the following video-based question-answer pair:\n\n"
67
+ f"Question: {question}\n"
68
+ f"Correct Answer: {answer}\n"
69
+ f"Predicted Answer: {pred}\n\n"
70
+ "Provide your evaluation only as a factual accuracy score where the factual accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of factual consistency. "
71
+ "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the factual accuracy score in INTEGER, not STRING."
72
+ "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
73
+ "For example, your response should look like this: {''score': 4.8}."
74
+ }
75
+ ]
76
+ completion = interaction(client, message)
77
+ # Convert response to a Python dictionary.
78
+ response_message = completion.choices[0].message.content
79
+ response_dict = ast.literal_eval(response_message)
80
+ result_qa_pair = [response_dict, qa_set]
81
+
82
+ # Save the question-answer pairs to a json file.
83
+ with open(f"{output_dir}/{key}.json", "w") as f:
84
+ json.dump(result_qa_pair, f)
85
+
86
+ except Exception as e:
87
+ print(f"Error processing file '{key}': {e}")
88
+
89
+
90
+ def main(args):
91
+ pred_contents = [eval(line) for line in open(args.pred_path, 'r').readlines()]
92
+
93
+ # Dictionary to store the count of occurrences for each video_id
94
+ video_id_counts = {}
95
+ new_pred_contents = []
96
+
97
+ # Iterate through each sample in pred_contents
98
+ for sample in pred_contents:
99
+ video_id = sample['video_name']
100
+ if video_id in video_id_counts:
101
+ video_id_counts[video_id] += 1
102
+ else:
103
+ video_id_counts[video_id] = 0
104
+
105
+ # Create a new sample with the modified key
106
+ new_sample = sample
107
+ new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
108
+ new_pred_contents.append(new_sample)
109
+
110
+ # Generate the list of ids and their corresponding file names
111
+ id_list = [x['video_name'] for x in new_pred_contents]
112
+ caption_files = [f"{id}.json" for id in id_list]
113
+
114
+ output_dir = args.output_dir
115
+ # Create the output directory if it does not exist.
116
+ if not os.path.exists(output_dir):
117
+ os.makedirs(output_dir)
118
+
119
+ # Preparing dictionary of question-answer sets
120
+ prediction_set = {}
121
+ for sample in new_pred_contents:
122
+ id = sample['video_name']
123
+ question = sample['Q']
124
+ answer = sample['A']
125
+ pred = sample['P']
126
+ qa_set = {"q": question, "a": answer, "p": pred}
127
+ prediction_set[id] = qa_set
128
+
129
+ # Set the OpenAI API key.
130
+ # openai.api_key = args.api_key
131
+ num_tasks = args.num_tasks
132
+
133
+ # While loop to ensure that all captions are processed.
134
+ while True:
135
+ try:
136
+ # Files that have already been processed.
137
+ completed_files = os.listdir(output_dir)
138
+ print(f"completed_files: {len(completed_files)}")
139
+
140
+ # Files that have not been processed yet.
141
+ incomplete_files = [f for f in caption_files if f not in completed_files]
142
+ print(f"incomplete_files: {len(incomplete_files)}")
143
+
144
+ # Break the loop when there are no incomplete files
145
+ if len(incomplete_files) == 0:
146
+ break
147
+ if len(incomplete_files) <= num_tasks:
148
+ num_tasks = 1
149
+
150
+ # Split tasks into parts.
151
+ part_len = len(incomplete_files) // num_tasks
152
+ all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
153
+ task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]
154
+
155
+ # Use a pool of workers to process the files in parallel.
156
+ with Pool() as pool:
157
+ pool.starmap(annotate, task_args)
158
+
159
+ except Exception as e:
160
+ traceback.print_exc()
161
+
162
+ # Combine all the processed files into one
163
+ combined_contents = {}
164
+ json_path = args.output_json
165
+
166
+ # Iterate through json files
167
+ for file_name in tqdm(os.listdir(output_dir)):
168
+ if file_name.endswith(".json"):
169
+ file_path = os.path.join(output_dir, file_name)
170
+ with open(file_path, "r") as json_file:
171
+ content = json.load(json_file)
172
+ combined_contents[file_name[:-5]] = content
173
+
174
+ # Write combined content to a json file
175
+ with open(json_path, "w") as json_file:
176
+ json.dump(combined_contents, json_file)
177
+ print("All evaluation completed!")
178
+
179
+ # Calculate average score
180
+ score_sum = 0
181
+ count = 0
182
+ for key, result in combined_contents.items():
183
+ count += 1
184
+ score_match = result[0]['score']
185
+ score = int(score_match)
186
+ score_sum += score
187
+ average_score = score_sum / count
188
+
189
+ print("Average score for correctness:", average_score)
190
+
191
+
192
+ if __name__ == "__main__":
193
+ parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
194
+ parser.add_argument("--pred-path", required=True, help="The path to file containing prediction.")
195
+ parser.add_argument("--output-dir", required=True, help="The path to save annotation json files.")
196
+ parser.add_argument("--output-json", required=True, help="The path to save annotation final combined json file.")
197
+ parser.add_argument("--num-tasks", required=True, type=int, help="Number of splits.")
198
+ parser.add_argument("--api-key", required=True, type=str, help="Azure Openai API key.")
199
+ parser.add_argument("--api-endpoint", required=True, type=str, help="Azure Openai API endpoint.")
200
+ parser.add_argument("--api-deployname", required=True, type=str, help="Azure Openai API deployname.")
201
+ args = parser.parse_args()
202
+
203
+ # Set the OpenAI API key.
204
+ os.environ["AZURE_OPENAI_KEY"] = args.api_key
205
+ os.environ["AZURE_OPENAI_ENDPOINT"] = args.api_endpoint
206
+ os.environ["AZURE_OPENAI_DEPLOYNAME"] = args.api_deployname
207
+
208
+ client = init()
209
+
210
+ main(args)
videollama2/eval/eval_benchmark_2_detailed_orientation.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import ast
5
+ from tqdm import tqdm
6
+ from multiprocessing.pool import Pool
7
+
8
+ from openai import AzureOpenAI
9
+
10
+
11
+ def init():
12
+ client = AzureOpenAI(
13
+ azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"),
14
+ api_key=os.getenv("AZURE_OPENAI_KEY"),
15
+ api_version="2024-02-15-preview"
16
+ )
17
+
18
+ return client
19
+
20
+
21
+ def interaction(client, message_text):
22
+ completion = client.chat.completions.create(
23
+ model=os.getenv("AZURE_OPENAI_DEPLOYNAME"),
24
+ messages = message_text,
25
+ temperature=0.7,
26
+ max_tokens=800,
27
+ top_p=0.95,
28
+ frequency_penalty=0,
29
+ presence_penalty=0,
30
+ stop=None
31
+ )
32
+
33
+ return completion
34
+
35
+
36
+ def annotate(prediction_set, caption_files, output_dir, args):
37
+ """
38
+ Evaluates question and answer pairs using GPT-3 and
39
+ returns a score for detailed orientation.
40
+ """
41
+ for file in tqdm(caption_files):
42
+ key = file[:-5] # Strip file extension
43
+ qa_set = prediction_set[key]
44
+ question = qa_set['q']
45
+ answer = qa_set['a']
46
+ pred = qa_set['p']
47
+ try:
48
+ # Compute the detailed-orientation score
49
+ message = [
50
+ {
51
+ "role": "system",
52
+ "content":
53
+ "You are an intelligent chatbot designed for evaluating the detail orientation of generative outputs for video-based question-answer pairs. "
54
+ "Your task is to compare the predicted answer with the correct answer and determine its level of detail, considering both completeness and specificity. Here's how you can accomplish the task:"
55
+ "------"
56
+ "##INSTRUCTIONS: "
57
+ "- Check if the predicted answer covers all major points from the video. The response should not leave out any key aspects.\n"
58
+ "- Evaluate whether the predicted answer includes specific details rather than just generic points. It should provide comprehensive information that is tied to specific elements of the video.\n"
59
+ "- Consider synonyms or paraphrases as valid matches.\n"
60
+ "- Provide a single evaluation score that reflects the level of detail orientation of the prediction, considering both completeness and specificity."
61
+ },
62
+ {
63
+ "role": "user",
64
+ "content":
65
+ "Please evaluate the following video-based question-answer pair:\n\n"
66
+ f"Question: {question}\n"
67
+ f"Correct Answer: {answer}\n"
68
+ f"Predicted Answer: {pred}\n\n"
69
+ "Provide your evaluation only as a detail orientation score where the detail orientation score is an integer value between 0 and 5, with 5 indicating the highest level of detail orientation. "
70
+ "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the detail orientation score in INTEGER, not STRING."
71
+ "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
72
+ "For example, your response should look like this: {''score': 4.8}."
73
+ }
74
+ ]
75
+
76
+ completion = interaction(client, message)
77
+ # Convert response to a Python dictionary.
78
+ response_message = completion.choices[0].message.content
79
+ response_dict = ast.literal_eval(response_message)
80
+ result_qa_pair = [response_dict, qa_set]
81
+
82
+ # Save the question-answer pairs to a json file.
83
+ with open(f"{output_dir}/{key}.json", "w") as f:
84
+ json.dump(result_qa_pair, f)
85
+
86
+ except Exception as e:
87
+ print(f"Error processing file '{key}': {e}")
88
+
89
+
90
+ def main(args):
91
+ pred_contents = [eval(line) for line in open(args.pred_path, 'r').readlines()]
92
+
93
+ # Dictionary to store the count of occurrences for each video_id
94
+ video_id_counts = {}
95
+ new_pred_contents = []
96
+
97
+ # Iterate through each sample in pred_contents
98
+ for sample in pred_contents:
99
+ video_id = sample['video_name']
100
+ if video_id in video_id_counts:
101
+ video_id_counts[video_id] += 1
102
+ else:
103
+ video_id_counts[video_id] = 0
104
+
105
+ # Create a new sample with the modified key
106
+ new_sample = sample
107
+ new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
108
+ new_pred_contents.append(new_sample)
109
+
110
+ # Generate the list of ids and their corresponding file names
111
+ id_list = [x['video_name'] for x in new_pred_contents]
112
+ caption_files = [f"{id}.json" for id in id_list]
113
+
114
+ output_dir = args.output_dir
115
+ # Create the output directory if it does not exist.
116
+ if not os.path.exists(output_dir):
117
+ os.makedirs(output_dir)
118
+
119
+ # Preparing dictionary of question-answer sets
120
+ prediction_set = {}
121
+ for sample in new_pred_contents:
122
+ id = sample['video_name']
123
+ question = sample['Q']
124
+ answer = sample['A']
125
+ pred = sample['P']
126
+ qa_set = {"q": question, "a": answer, "p": pred}
127
+ prediction_set[id] = qa_set
128
+
129
+ # Set the OpenAI API key.
130
+ # openai.api_key = args.api_key
131
+ num_tasks = args.num_tasks
132
+
133
+ # While loop to ensure that all captions are processed.
134
+ while True:
135
+ try:
136
+ # Files that have already been processed.
137
+ completed_files = os.listdir(output_dir)
138
+ print(f"completed_files: {len(completed_files)}")
139
+
140
+ # Files that have not been processed yet.
141
+ incomplete_files = [f for f in caption_files if f not in completed_files]
142
+ print(f"incomplete_files: {len(incomplete_files)}")
143
+
144
+ # Break the loop when there are no incomplete files
145
+ if len(incomplete_files) == 0:
146
+ break
147
+ if len(incomplete_files) <= num_tasks:
148
+ num_tasks = 1
149
+
150
+ # Split tasks into parts.
151
+ part_len = len(incomplete_files) // num_tasks
152
+ all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
153
+ task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]
154
+
155
+ # Use a pool of workers to process the files in parallel.
156
+ with Pool() as pool:
157
+ pool.starmap(annotate, task_args)
158
+
159
+ except Exception as e:
160
+ print(f"Error: {e}")
161
+
162
+ # Combine all the processed files into one
163
+ combined_contents = {}
164
+ json_path = args.output_json
165
+
166
+ # Iterate through json files
167
+ for file_name in tqdm(os.listdir(output_dir)):
168
+ if file_name.endswith(".json"):
169
+ file_path = os.path.join(output_dir, file_name)
170
+ with open(file_path, "r") as json_file:
171
+ content = json.load(json_file)
172
+ combined_contents[file_name[:-5]] = content
173
+
174
+ # Write combined content to a json file
175
+ with open(json_path, "w") as json_file:
176
+ json.dump(combined_contents, json_file)
177
+ print("All evaluation completed!")
178
+
179
+ # Calculate average score
180
+ score_sum = 0
181
+ count = 0
182
+ for key, result in combined_contents.items():
183
+ count += 1
184
+ score_match = result[0]['score']
185
+ score = int(score_match)
186
+ score_sum += score
187
+ average_score = score_sum / count
188
+
189
+ print("Average score for detailed orientation:", average_score)
190
+
191
+
192
+ if __name__ == "__main__":
193
+ parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
194
+ parser.add_argument("--pred-path", required=True, help="The path to file containing prediction.")
195
+ parser.add_argument("--output-dir", required=True, help="The path to save annotation json files.")
196
+ parser.add_argument("--output-json", required=True, help="The path to save annotation final combined json file.")
197
+ parser.add_argument("--num-tasks", required=True, type=int, help="Number of splits.")
198
+ parser.add_argument("--api-key", required=True, type=str, help="Azure Openai API key.")
199
+ parser.add_argument("--api-endpoint", required=True, type=str, help="Azure Openai API endpoint.")
200
+ parser.add_argument("--api-deployname", required=True, type=str, help="Azure Openai API deployname.")
201
+ args = parser.parse_args()
202
+
203
+ # Set the OpenAI API key.
204
+ os.environ["AZURE_OPENAI_KEY"] = args.api_key
205
+ os.environ["AZURE_OPENAI_ENDPOINT"] = args.api_endpoint
206
+ os.environ["AZURE_OPENAI_DEPLOYNAME"] = args.api_deployname
207
+
208
+ client = init()
209
+
210
+ main(args)
videollama2/eval/eval_benchmark_3_context.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import ast
5
+ import traceback
6
+ from tqdm import tqdm
7
+ from multiprocessing.pool import Pool
8
+
9
+ from openai import AzureOpenAI
10
+
11
+
12
+ def init():
13
+ client = AzureOpenAI(
14
+ azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"),
15
+ api_key=os.getenv("AZURE_OPENAI_KEY"),
16
+ api_version="2024-02-15-preview"
17
+ )
18
+
19
+ return client
20
+
21
+
22
+ def interaction(client, message_text):
23
+ completion = client.chat.completions.create(
24
+ model=os.getenv("AZURE_OPENAI_DEPLOYNAME"),
25
+ messages = message_text,
26
+ temperature=0.7,
27
+ max_tokens=800,
28
+ top_p=0.95,
29
+ frequency_penalty=0,
30
+ presence_penalty=0,
31
+ stop=None
32
+ )
33
+
34
+ return completion
35
+
36
+
37
+ def annotate(prediction_set, caption_files, output_dir, args):
38
+ """
39
+ Evaluates question and answer pairs using GPT-3 and
40
+ returns a score for contextual understanding.
41
+ """
42
+
43
+ for file in tqdm(caption_files):
44
+ key = file[:-5] # Strip file extension
45
+ qa_set = prediction_set[key]
46
+ question = qa_set['q']
47
+ answer = qa_set['a']
48
+ pred = qa_set['p']
49
+ try:
50
+ # Compute the contextual understanding score
51
+ message = [
52
+ {
53
+ "role": "system",
54
+ "content":
55
+ "You are an intelligent chatbot designed for evaluating the contextual understanding of generative outputs for video-based question-answer pairs. "
56
+ "Your task is to compare the predicted answer with the correct answer and determine if the generated response aligns with the overall context of the video content. Here's how you can accomplish the task:"
57
+ "------"
58
+ "##INSTRUCTIONS: "
59
+ "- Evaluate whether the predicted answer aligns with the overall context of the video content. It should not provide information that is out of context or misaligned.\n"
60
+ "- The predicted answer must capture the main themes and sentiments of the video.\n"
61
+ "- Consider synonyms or paraphrases as valid matches.\n"
62
+ "- Provide your evaluation of the contextual understanding of the prediction compared to the answer."
63
+ },
64
+ {
65
+ "role": "user",
66
+ "content":
67
+ "Please evaluate the following video-based question-answer pair:\n\n"
68
+ f"Question: {question}\n"
69
+ f"Correct Answer: {answer}\n"
70
+ f"Predicted Answer: {pred}\n\n"
71
+ "Provide your evaluation only as a contextual understanding score where the contextual understanding score is an integer value between 0 and 5, with 5 indicating the highest level of contextual understanding. "
72
+ "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is contextual understanding score in INTEGER, not STRING."
73
+ "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
74
+ "For example, your response should look like this: {''score': 4.8}."
75
+ }
76
+ ]
77
+
78
+ completion = interaction(client, message)
79
+ # Convert response to a Python dictionary.
80
+ response_message = completion.choices[0].message.content
81
+ response_dict = ast.literal_eval(response_message)
82
+ result_qa_pair = [response_dict, qa_set]
83
+
84
+ # Save the question-answer pairs to a json file.
85
+ with open(f"{output_dir}/{key}.json", "w") as f:
86
+ json.dump(result_qa_pair, f)
87
+
88
+ except Exception as e:
89
+ print(f"Error processing file '{key}': {e}")
90
+
91
+
92
+ def main(args):
93
+ pred_contents = [eval(line) for line in open(args.pred_path, 'r').readlines()]
94
+
95
+ # Dictionary to store the count of occurrences for each video_id
96
+ video_id_counts = {}
97
+ new_pred_contents = []
98
+
99
+ # Iterate through each sample in pred_contents
100
+ for sample in pred_contents:
101
+ video_id = sample['video_name']
102
+ if video_id in video_id_counts:
103
+ video_id_counts[video_id] += 1
104
+ else:
105
+ video_id_counts[video_id] = 0
106
+
107
+ # Create a new sample with the modified key
108
+ new_sample = sample
109
+ new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
110
+ new_pred_contents.append(new_sample)
111
+
112
+ # Generate the list of ids and their corresponding file names
113
+ id_list = [x['video_name'] for x in new_pred_contents]
114
+ caption_files = [f"{id}.json" for id in id_list]
115
+
116
+ output_dir = args.output_dir
117
+ # Create the output directory if it does not exist.
118
+ if not os.path.exists(output_dir):
119
+ os.makedirs(output_dir)
120
+
121
+ # Preparing dictionary of question-answer sets
122
+ prediction_set = {}
123
+ for sample in new_pred_contents:
124
+ id = sample['video_name']
125
+ question = sample['Q']
126
+ answer = sample['A']
127
+ pred = sample['P']
128
+ qa_set = {"q": question, "a": answer, "p": pred}
129
+ prediction_set[id] = qa_set
130
+
131
+ # Set the OpenAI API key.
132
+ # openai.api_key = args.api_key
133
+ num_tasks = args.num_tasks
134
+
135
+ # While loop to ensure that all captions are processed.
136
+ while True:
137
+ try:
138
+ # Files that have already been processed.
139
+ completed_files = os.listdir(output_dir)
140
+ print(f"completed_files: {len(completed_files)}")
141
+
142
+ # Files that have not been processed yet.
143
+ incomplete_files = [f for f in caption_files if f not in completed_files]
144
+ print(f"incomplete_files: {len(incomplete_files)}")
145
+
146
+ # Break the loop when there are no incomplete files
147
+ if len(incomplete_files) == 0:
148
+ break
149
+ if len(incomplete_files) <= num_tasks:
150
+ num_tasks = 1
151
+
152
+ # Split tasks into parts.
153
+ part_len = len(incomplete_files) // num_tasks
154
+ all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
155
+ task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]
156
+
157
+ # Use a pool of workers to process the files in parallel.
158
+ with Pool() as pool:
159
+ pool.starmap(annotate, task_args)
160
+
161
+ except Exception as e:
162
+ print(f"Error: {e}")
163
+
164
+ # Combine all the processed files into one
165
+ combined_contents = {}
166
+ json_path = args.output_json
167
+
168
+ # Iterate through json files
169
+ for file_name in tqdm(os.listdir(output_dir)):
170
+ if file_name.endswith(".json"):
171
+ file_path = os.path.join(output_dir, file_name)
172
+ with open(file_path, "r") as json_file:
173
+ content = json.load(json_file)
174
+ combined_contents[file_name[:-5]] = content
175
+
176
+ # Write combined content to a json file
177
+ with open(json_path, "w") as json_file:
178
+ json.dump(combined_contents, json_file)
179
+ print("All evaluation completed!")
180
+
181
+ # Calculate average score
182
+ score_sum = 0
183
+ count = 0
184
+ for key, result in combined_contents.items():
185
+ count += 1
186
+ score_match = result[0]['score']
187
+ score = int(score_match)
188
+ score_sum += score
189
+ average_score = score_sum / count
190
+
191
+ print("Average score for contextual understanding:", average_score)
192
+
193
+
194
+ if __name__ == "__main__":
195
+ parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
196
+ parser.add_argument("--pred-path", required=True, help="The path to file containing prediction.")
197
+ parser.add_argument("--output-dir", required=True, help="The path to save annotation json files.")
198
+ parser.add_argument("--output-json", required=True, help="The path to save annotation final combined json file.")
199
+ parser.add_argument("--num-tasks", required=True, type=int, help="Number of splits.")
200
+ parser.add_argument("--api-key", required=True, type=str, help="Azure Openai API key.")
201
+ parser.add_argument("--api-endpoint", required=True, type=str, help="Azure Openai API endpoint.")
202
+ parser.add_argument("--api-deployname", required=True, type=str, help="Azure Openai API deployname.")
203
+ args = parser.parse_args()
204
+
205
+ # Set the OpenAI API key.
206
+ os.environ["AZURE_OPENAI_KEY"] = args.api_key
207
+ os.environ["AZURE_OPENAI_ENDPOINT"] = args.api_endpoint
208
+ os.environ["AZURE_OPENAI_DEPLOYNAME"] = args.api_deployname
209
+
210
+ client = init()
211
+
212
+ main(args)
videollama2/eval/eval_benchmark_4_temporal.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import ast
5
+ import traceback
6
+ from tqdm import tqdm
7
+ from multiprocessing.pool import Pool
8
+
9
+ from openai import AzureOpenAI
10
+
11
+
12
def init():
    """Create the Azure OpenAI client used for scoring.

    The endpoint and API key are read from the ``AZURE_OPENAI_ENDPOINT`` and
    ``AZURE_OPENAI_KEY`` environment variables (``None`` when unset); the API
    version is pinned to ``2024-02-15-preview``.

    Returns:
        The constructed ``AzureOpenAI`` client.
    """
    settings = {
        "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
        "api_key": os.getenv("AZURE_OPENAI_KEY"),
        "api_version": "2024-02-15-preview",
    }
    return AzureOpenAI(**settings)
20
+
21
+
22
def interaction(client, message_text):
    """Send one chat-completion request and return the raw completion.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: Chat messages, forwarded as the ``messages`` argument.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    # The deployment name comes from the environment; sampling settings are fixed.
    request = dict(
        model=os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        messages=message_text,
        temperature=0.7,
        max_tokens=800,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    return client.chat.completions.create(**request)
35
+
36
+
37
def annotate(prediction_set, caption_files, output_dir, args):
    """Score the temporal understanding of each prediction with the chat model.

    For every file name in ``caption_files`` the matching question/answer/
    prediction triple is sent to the model, the reply is parsed with
    ``ast.literal_eval`` and ``[reply_dict, qa_set]`` is written to
    ``{output_dir}/{key}.json``. A failure while requesting, parsing or
    writing is printed and that item is left without an output file.

    Uses the module-level ``client`` assigned in this file's ``__main__`` block.

    Args:
        prediction_set: Mapping from key to a dict with entries 'q', 'a', 'p'.
        caption_files: File names of the form ``<key>.json``.
        output_dir: Directory receiving one JSON file per key.
        args: Unused here.
    """
    # The system prompt does not depend on the sample, so build it once.
    system_prompt = (
        "You are an intelligent chatbot designed for evaluating the temporal understanding of generative outputs for video-based question-answer pairs. "
        "Your task is to compare the predicted answer with the correct answer and determine if they correctly reflect the temporal sequence of events in the video content. Here's how you can accomplish the task:"
        "------"
        "##INSTRUCTIONS: "
        "- Focus on the temporal consistency between the predicted answer and the correct answer. The predicted answer should correctly reflect the sequence of events or details as they are presented in the video content.\n"
        "- Consider synonyms or paraphrases as valid matches, but only if the temporal order is maintained.\n"
        "- Evaluate the temporal accuracy of the prediction compared to the answer."
    )

    for file_name in tqdm(caption_files):
        key = file_name[:-5]  # drop the ".json" suffix
        qa_set = prediction_set[key]
        question, answer, pred = qa_set['q'], qa_set['a'], qa_set['p']
        try:
            user_prompt = (
                "Please evaluate the following video-based question-answer pair:\n\n"
                f"Question: {question}\n"
                f"Correct Answer: {answer}\n"
                f"Predicted Answer: {pred}\n\n"
                "Provide your evaluation only as a temporal accuracy score where the temporal accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of temporal consistency. "
                "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the temporal accuracy score in INTEGER, not STRING."
                "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                "For example, your response should look like this: {''score': 4.8}."
            )
            conversation = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ]

            reply = interaction(client, conversation)
            # The model is asked to answer with a Python dict literal.
            verdict = ast.literal_eval(reply.choices[0].message.content)

            # Persist the verdict next to the sample it belongs to.
            with open(f"{output_dir}/{key}.json", "w") as sink:
                json.dump([verdict, qa_set], sink)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")
84
+
85
+
86
def main(args):
    """Run the GPT-based temporal-understanding evaluation end to end.

    Loads the predictions, gives repeated video names unique suffixes,
    annotates every sample that has no result file yet (in parallel, looping
    until none is missing), merges all result files into one JSON file and
    prints the average score.

    Args:
        args: Namespace providing ``pred_path``, ``output_dir``,
            ``output_json`` and ``num_tasks``.
    """
    # NOTE(security): every line is parsed with eval(), which executes
    # arbitrary expressions -- only feed prediction files you trust.
    # The file is closed deterministically and blank lines are skipped.
    with open(args.pred_path, 'r') as pred_file:
        pred_contents = [eval(line) for line in pred_file if line.strip()]

    # Count occurrences per video so repeated names become "<name>_<index>".
    video_id_counts = {}
    new_pred_contents = []
    for sample in pred_contents:
        video_id = sample['video_name']
        video_id_counts[video_id] = video_id_counts.get(video_id, -1) + 1
        # The sample dict is renamed in place (no copy is made).
        sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(sample)

    # One expected result file per (renamed) sample.
    caption_files = [f"{sample['video_name']}.json" for sample in new_pred_contents]

    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    # Question/answer/prediction triples keyed by the unique sample name.
    prediction_set = {
        sample['video_name']: {"q": sample['Q'], "a": sample['A'], "p": sample['P']}
        for sample in new_pred_contents
    }

    # A non-positive split count would make the loop below fail forever.
    num_tasks = max(1, args.num_tasks)

    # Keep going until every sample has a result file.
    while True:
        try:
            # A set makes the membership test below O(1) per file.
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            if not incomplete_files:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split the remaining work into roughly num_tasks slices.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            # Process the slices in parallel worker processes.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Merge every per-sample result file into one dictionary.
    combined_contents = {}
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            with open(os.path.join(output_dir, file_name), "r") as json_file:
                combined_contents[file_name[:-5]] = json.load(json_file)

    with open(args.output_json, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Without any result the average is undefined; avoid dividing by zero.
    if not combined_contents:
        print("No evaluation results found; average score is undefined.")
        return

    score_sum = sum(int(result[0]['score']) for result in combined_contents.values())
    average_score = score_sum / len(combined_contents)

    print("Average score temporal understanding:", average_score)
186
+
187
+
188
if __name__ == "__main__":
    # Command-line interface: every option is mandatory.
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    for flag, options in (
        ("--pred-path", {"help": "The path to file containing prediction."}),
        ("--output-dir", {"help": "The path to save annotation json files."}),
        ("--output-json", {"help": "The path to save annotation final combined json file."}),
        ("--num-tasks", {"type": int, "help": "Number of splits."}),
        ("--api-key", {"type": str, "help": "Azure Openai API key."}),
        ("--api-endpoint", {"type": str, "help": "Azure Openai API endpoint."}),
        ("--api-deployname", {"type": str, "help": "Azure Openai API deployname."}),
    ):
        parser.add_argument(flag, required=True, **options)
    args = parser.parse_args()

    # init() and interaction() read these settings back via os.getenv.
    os.environ.update(
        AZURE_OPENAI_KEY=args.api_key,
        AZURE_OPENAI_ENDPOINT=args.api_endpoint,
        AZURE_OPENAI_DEPLOYNAME=args.api_deployname,
    )

    # `client` must stay a module-level name: annotate() refers to it as a global.
    client = init()

    main(args)
videollama2/eval/eval_benchmark_5_consistency.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import ast
5
+ import traceback
6
+ from tqdm import tqdm
7
+ from multiprocessing.pool import Pool
8
+
9
+ from openai import AzureOpenAI
10
+
11
+
12
def init():
    """Create the Azure OpenAI client used for scoring.

    The endpoint and API key are read from the ``AZURE_OPENAI_ENDPOINT`` and
    ``AZURE_OPENAI_KEY`` environment variables (``None`` when unset); the API
    version is pinned to ``2024-02-15-preview``.

    Returns:
        The constructed ``AzureOpenAI`` client.
    """
    settings = {
        "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
        "api_key": os.getenv("AZURE_OPENAI_KEY"),
        "api_version": "2024-02-15-preview",
    }
    return AzureOpenAI(**settings)
20
+
21
+
22
def interaction(client, message_text):
    """Send one chat-completion request and return the raw completion.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: Chat messages, forwarded as the ``messages`` argument.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    # The deployment name comes from the environment; sampling settings are fixed.
    request = dict(
        model=os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        messages=message_text,
        temperature=0.7,
        max_tokens=800,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    return client.chat.completions.create(**request)
35
+
36
+
37
def annotate(prediction_set, caption_files, output_dir, args):
    """Score the consistency of paired predictions with the chat model.

    For every file name in ``caption_files`` the two questions, the shared
    answer and the two predictions are sent to the model, the reply is parsed
    with ``ast.literal_eval`` and ``[reply_dict, qa_set]`` is written to
    ``{output_dir}/{key}.json``. A failure while requesting, parsing or
    writing is printed and that item is left without an output file.
    Nothing is returned.

    Uses the module-level ``client`` assigned in this file's ``__main__`` block.

    Args:
        prediction_set: Mapping from key to a dict with entries
            'q1', 'q2', 'a', 'p1', 'p2'.
        caption_files: File names of the form ``<key>.json``.
        output_dir: Directory receiving one JSON file per key.
        args: Unused here.
    """
    # The system prompt does not depend on the sample, so build it once.
    system_prompt = (
        "You are an intelligent chatbot designed for evaluating the consistency of generative outputs for similar video-based question-answer pairs. "
        "You will be given two very similar questions, a common answer common to both the questions and predicted answers for the two questions ."
        "Your task is to compare the predicted answers for two very similar question, with a common correct answer and determine if they are consistent. Here's how you can accomplish the task:"
        "------"
        "##INSTRUCTIONS: "
        "- Focus on the consistency between the two predicted answers and the correct answer. Both predicted answers should correspond to the correct answer and to each other, and should not contain any contradictions or significant differences in the conveyed information.\n"
        "- Both predicted answers must be consistent with each other and the correct answer, in terms of the information they provide about the video content.\n"
        "- Consider synonyms or paraphrases as valid matches, but only if they maintain the consistency in the conveyed information.\n"
        "- Evaluate the consistency of the two predicted answers compared to the correct answer."
    )

    for file_name in tqdm(caption_files):
        key = file_name[:-5]  # drop the ".json" suffix
        qa_set = prediction_set[key]
        question1, question2 = qa_set['q1'], qa_set['q2']
        answer = qa_set['a']
        pred1, pred2 = qa_set['p1'], qa_set['p2']
        try:
            user_prompt = (
                "Please evaluate the following video-based question-answer pair:\n\n"
                f"Question 1: {question1}\n"
                f"Question 2: {question2}\n"
                f"Correct Answer: {answer}\n"
                f"Predicted Answer to Question 1: {pred1}\n"
                f"Predicted Answer to Question 2: {pred2}\n\n"
                "Provide your evaluation only as a consistency score where the consistency score is an integer value between 0 and 5, with 5 indicating the highest level of consistency. "
                "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the consistency score in INTEGER, not STRING."
                "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                "For example, your response should look like this: {''score': 4.8}."
            )
            conversation = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ]

            reply = interaction(client, conversation)
            # The model is asked to answer with a Python dict literal.
            verdict = ast.literal_eval(reply.choices[0].message.content)

            # Persist the verdict next to the sample it belongs to.
            with open(f"{output_dir}/{key}.json", "w") as sink:
                json.dump([verdict, qa_set], sink)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")
94
+
95
+
96
def main(args):
    """Run the GPT-based consistency evaluation end to end.

    Loads the predictions, gives repeated video names unique suffixes,
    annotates every sample that has no result file yet (in parallel, looping
    until none is missing), merges all result files into one JSON file and
    prints the average score.

    Args:
        args: Namespace providing ``pred_path``, ``output_dir``,
            ``output_json`` and ``num_tasks``.
    """
    # NOTE(security): every line is parsed with eval(), which executes
    # arbitrary expressions -- only feed prediction files you trust.
    # The file is closed deterministically and blank lines are skipped.
    with open(args.pred_path, 'r') as pred_file:
        pred_contents = [eval(line) for line in pred_file if line.strip()]

    # Count occurrences per video so repeated names become "<name>_<index>".
    video_id_counts = {}
    new_pred_contents = []
    for sample in pred_contents:
        video_id = sample['video_name']
        video_id_counts[video_id] = video_id_counts.get(video_id, -1) + 1
        # The sample dict is renamed in place (no copy is made).
        sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(sample)

    # One expected result file per (renamed) sample.
    caption_files = [f"{sample['video_name']}.json" for sample in new_pred_contents]

    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    # Paired questions/predictions plus the shared answer, keyed by sample name.
    prediction_set = {
        sample['video_name']: {
            "q1": sample['Q1'],
            "q2": sample['Q2'],
            "a": sample['A'],
            "p1": sample['P1'],
            "p2": sample['P2'],
        }
        for sample in new_pred_contents
    }

    # A non-positive split count would make the loop below fail forever.
    num_tasks = max(1, args.num_tasks)

    # Keep going until every sample has a result file.
    while True:
        try:
            # A set makes the membership test below O(1) per file.
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            if not incomplete_files:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split the remaining work into roughly num_tasks slices.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            # Process the slices in parallel worker processes.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Merge every per-sample result file into one dictionary.
    combined_contents = {}
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            with open(os.path.join(output_dir, file_name), "r") as json_file:
                combined_contents[file_name[:-5]] = json.load(json_file)

    with open(args.output_json, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Without any result the average is undefined; avoid dividing by zero.
    if not combined_contents:
        print("No evaluation results found; average score is undefined.")
        return

    score_sum = sum(int(result[0]['score']) for result in combined_contents.values())
    average_score = score_sum / len(combined_contents)

    print("Average score for consistency:", average_score)
198
+
199
+
200
if __name__ == "__main__":
    # Command-line interface: every option is mandatory.
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    for flag, options in (
        ("--pred-path", {"help": "The path to file containing prediction."}),
        ("--output-dir", {"help": "The path to save annotation json files."}),
        ("--output-json", {"help": "The path to save annotation final combined json file."}),
        ("--num-tasks", {"type": int, "help": "Number of splits."}),
        ("--api-key", {"type": str, "help": "Azure Openai API key."}),
        ("--api-endpoint", {"type": str, "help": "Azure Openai API endpoint."}),
        ("--api-deployname", {"type": str, "help": "Azure Openai API deployname."}),
    ):
        parser.add_argument(flag, required=True, **options)
    args = parser.parse_args()

    # init() and interaction() read these settings back via os.getenv.
    os.environ.update(
        AZURE_OPENAI_KEY=args.api_key,
        AZURE_OPENAI_ENDPOINT=args.api_endpoint,
        AZURE_OPENAI_DEPLOYNAME=args.api_deployname,
    )

    # `client` must stay a module-level name: annotate() refers to it as a global.
    client = init()

    main(args)
videollama2/eval/eval_video_qa_gpt.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import ast
3
+ import json
4
+ import time
5
+ import argparse
6
+ import traceback
7
+ from tqdm import tqdm
8
+ from multiprocessing.pool import Pool
9
+
10
+ from openai import AzureOpenAI
11
+
12
+
13
def init():
    """Create the Azure OpenAI client used for scoring.

    The endpoint and API key are read from the ``AZURE_OPENAI_ENDPOINT`` and
    ``AZURE_OPENAI_KEY`` environment variables (``None`` when unset); the API
    version is pinned to ``2024-02-15-preview``.

    Returns:
        The constructed ``AzureOpenAI`` client.
    """
    settings = {
        "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
        "api_key": os.getenv("AZURE_OPENAI_KEY"),
        "api_version": "2024-02-15-preview",
    }
    return AzureOpenAI(**settings)
21
+
22
+
23
def interaction(client, message_text):
    """Send one chat-completion request and return the raw completion.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: Chat messages, forwarded as the ``messages`` argument.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    # The deployment name comes from the environment; sampling settings are fixed.
    request = dict(
        model=os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        messages=message_text,
        temperature=0.7,
        max_tokens=800,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    return client.chat.completions.create(**request)
36
+
37
+
38
def prompt_gpt(question, answer, pred, key, qa_set, output_dir):
    """Ask the chat model to grade one prediction and store its verdict.

    The reply is parsed with ``ast.literal_eval`` and ``[reply_dict, qa_set]``
    is written to ``{output_dir}/{key}.json``. Exceptions are not handled
    here; they propagate to the caller.

    Uses the module-level ``client`` assigned in this file's ``__main__`` block.

    Args:
        question: Question text inserted into the prompt.
        answer: Reference answer inserted into the prompt.
        pred: Predicted answer inserted into the prompt.
        key: Base name (without extension) of the output file.
        qa_set: Sample record saved alongside the verdict.
        output_dir: Directory receiving the JSON file.
    """
    system_prompt = (
        "You are an intelligent chatbot designed for evaluating the correctness of generative outputs for question-answer pairs. "
        "Your task is to compare the predicted answer with the correct answer and determine if they match meaningfully. Here's how you can accomplish the task:"
        "------"
        "##INSTRUCTIONS: "
        "- Focus on the meaningful match between the predicted answer and the correct answer.\n"
        "- Consider synonyms or paraphrases as valid matches.\n"
        "- Evaluate the correctness of the prediction compared to the answer."
    )
    user_prompt = (
        "Please evaluate the following video-based question-answer pair:\n\n"
        f"Question: {question}\n"
        f"Correct Answer: {answer}\n"
        f"Predicted Answer: {pred}\n\n"
        "Provide your evaluation only as a yes/no and score where the score is an integer value between 0 and 5, with 5 indicating the highest meaningful match. "
        "Please generate the response in the form of a Python dictionary string with keys 'pred' and 'score', where value of 'pred' is a string of 'yes' or 'no' and value of 'score' is in INTEGER, not STRING."
        "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
        "For example, your response should look like this: {'pred': 'yes', 'score': 4.8}."
    )
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    reply = interaction(client, conversation)
    # The model is asked to answer with a Python dict literal.
    verdict = ast.literal_eval(reply.choices[0].message.content)

    # Persist the verdict together with the sample it belongs to.
    with open(f"{output_dir}/{key}.json", "w") as sink:
        json.dump([verdict, qa_set], sink)
72
+
73
+
74
def annotate(prediction_set, caption_files, output_dir, args):
    """Grade every listed sample via ``prompt_gpt``.

    Each sample is tried once with the full prediction and, if that raises,
    once more with the prediction cut to its first 50 characters. A sample
    that fails both attempts is logged and skipped, leaving no output file,
    so the remaining samples of this chunk are still processed.

    Args:
        prediction_set: Mapping from key to a dict with entries 'q', 'a', 'p'.
        caption_files: File names of the form ``<key>.json``.
        output_dir: Directory receiving one JSON file per key.
        args: Unused here.
    """
    for file in tqdm(caption_files):
        key = file[:-5]  # Strip file extension
        qa_set = prediction_set[key]
        question, answer, pred = qa_set['q'], qa_set['a'], qa_set['p']
        try:
            prompt_gpt(question, answer, pred, key, qa_set, output_dir)
        except Exception:
            traceback.print_exc()
            # Second attempt with a shortened prediction. It is guarded so a
            # repeated failure no longer aborts the rest of this chunk.
            try:
                prompt_gpt(question, answer, pred[:50], key, qa_set, output_dir)
            except Exception:
                traceback.print_exc()

        # NOTE(review): short pause between consecutive requests -- confirm
        # whether it was meant per sample (as here) or once per chunk.
        time.sleep(1)
93
+
94
+
95
def main(args):
    """Run the GPT-based video-QA correctness evaluation end to end.

    Loads the predictions, annotates every sample that has no result file yet
    (in parallel, looping until none is missing), merges all result files
    into one JSON file and prints yes/no counts, accuracy and average score.

    Args:
        args: Namespace providing ``pred_path``, ``output_dir``,
            ``output_json`` and ``num_tasks``.

    Raises:
        SystemExit: With status 1 when a result file contains invalid JSON.
    """
    # NOTE(security): every line is parsed with eval(), which executes
    # arbitrary expressions -- only feed prediction files you trust.
    # The file is closed deterministically and blank lines are skipped.
    with open(args.pred_path) as pred_file:
        new_pred_contents = [eval(line.strip()) for line in pred_file if line.strip()]

    # One expected result file per sample id.
    caption_files = [f"{sample['id']}.json" for sample in new_pred_contents]

    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    # Question/answer/prediction triples keyed by sample id.
    prediction_set = {
        sample['id']: {"q": sample['question'], "a": sample['answer'], "p": sample['pred']}
        for sample in new_pred_contents
    }

    # A non-positive split count would make the loop below fail forever.
    num_tasks = max(1, args.num_tasks)

    # Keep going until every sample has a result file.
    while True:
        try:
            # A set makes the membership test below O(1) per file.
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            if not incomplete_files:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split the remaining work into roughly num_tasks slices.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            # Process the slices in parallel worker processes.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Merge every per-sample result file into one dictionary.
    combined_contents = {}
    for file_name in tqdm(os.listdir(output_dir)):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                try:
                    content = json.load(json_file)
                except ValueError:
                    # A corrupt result file is an error: stop with a failing
                    # exit status instead of reporting success.
                    print(f"Invalid JSON in result file: {file_path}")
                    raise SystemExit(1)
            combined_contents[file_name[:-5]] = content

    with open(args.output_json, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Accumulate score and yes/no verdicts.
    score_sum = 0
    count = 0
    yes_count = 0
    no_count = 0
    for key, result in tqdm(combined_contents.items()):
        try:
            # Malformed results still count towards the denominator.
            count += 1
            score_sum += int(result[0]['score'])

            verdict = result[0]['pred'].lower()
            if "yes" in verdict:
                yes_count += 1
            elif "no" in verdict:
                no_count += 1
        except (KeyError, IndexError, TypeError, ValueError, AttributeError):
            # Show the malformed record and carry on with the others.
            print(result)

    # Both ratios are undefined without data; report NaN instead of crashing.
    judged = yes_count + no_count
    average_score = score_sum / count if count else float("nan")
    accuracy = yes_count / judged if judged else float("nan")
    print("Yes count:", yes_count)
    print("No count:", no_count)
    print("Accuracy:", accuracy)
    print("Average score:", average_score)
199
+
200
+
201
if __name__ == "__main__":
    # Command-line interface: every option is mandatory.
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    for flag, options in (
        ("--pred-path", {"help": "The path to file containing prediction."}),
        ("--output-dir", {"help": "The path to save annotation json files."}),
        ("--output-json", {"help": "The path to save annotation final combined json file."}),
        ("--num-tasks", {"type": int, "help": "Number of splits."}),
        ("--api-key", {"type": str, "help": "Azure Openai API key."}),
        ("--api-endpoint", {"type": str, "help": "Azure Openai API endpoint."}),
        ("--api-deployname", {"type": str, "help": "Azure Openai API deployname."}),
    ):
        parser.add_argument(flag, required=True, **options)
    args = parser.parse_args()

    # init() and interaction() read these settings back via os.getenv.
    os.environ.update(
        AZURE_OPENAI_KEY=args.api_key,
        AZURE_OPENAI_ENDPOINT=args.api_endpoint,
        AZURE_OPENAI_DEPLOYNAME=args.api_deployname,
    )

    # `client` must stay a module-level name: prompt_gpt() refers to it as a global.
    client = init()

    main(args)
videollama2/eval/eval_video_qa_mvbench.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import argparse
3
+ from tabulate import tabulate
4
+
5
+
6
# MVBench task table, keyed by the human-readable task name.
# NOTE(review): each value looks like (annotation file, video directory,
# data type, has-start/end flag) -- confirm against the data loader.
# Within this file only the keys are used.
tasks = {
    "Action Sequence": ("action_sequence.json", "star/Charades_v1_480/", "video", True), # has start & end
    "Action Prediction": ("action_prediction.json", "star/Charades_v1_480/", "video", True), # has start & end
    "Action Antonym": ("action_antonym.json", "ssv2_video/", "video", False),
    "Fine-grained Action": ("fine_grained_action.json", "pMoments_in_Time_Raw/videos/", "video", False),
    "Unexpected Action": ("unexpected_action.json", "FunQA_test/test/", "video", False),
    "Object Existence": ("object_existence.json", "clevrer/video_validation/", "video", False),
    "Object Interaction": ("object_interaction.json", "star/Charades_v1_480/", "video", True), # has start & end
    "Object Shuffle": ("object_shuffle.json", "perception/videos/", "video", False),
    "Moving Direction": ("moving_direction.json", "clevrer/video_validation/", "video", False),
    "Action Localization": ("action_localization.json", "sta/sta_video/", "video", True), # has start & end
    "Scene Transition": ("scene_transition.json", "scene_qa/video/", "video", False),
    "Action Count": ("action_count.json", "perception/videos/", "video", False),
    "Moving Count": ("moving_count.json", "clevrer/video_validation/", "video", False),
    "Moving Attribute": ("moving_attribute.json", "clevrer/video_validation/", "video", False),
    "State Change": ("state_change.json", "perception/videos/", "video", False),
    "Fine-grained Pose": ("fine_grained_pose.json", "nturgbd/", "video", False),
    "Character Order": ("character_order.json", "perception/videos/", "video", False),
    "Egocentric Navigation": ("egocentric_navigation.json", "vlnqa/", "video", False),
    "Episodic Reasoning": ("episodic_reasoning.json", "tvqa/frames_fps3_hq/", "frame", True), # has start & end, read frame
    "Counterfactual Inference": ("counterfactual_inference.json", "clevrer/video_validation/", "video", False),
}
28
+
29
+
30
def main():
    """Print overall and per-task MVBench accuracy for a prediction file.

    Each non-blank line of ``--pred_path`` must evaluate to a dict with the
    entries 'pred', 'gt' and 'task_type'. A sample is correct when 'pred'
    equals 'gt'. Accuracies are percentages; a task (or the whole file)
    without samples is reported as NaN instead of raising ZeroDivisionError.

    Raises:
        KeyError: If a sample names a task type missing from ``tasks``.
    """
    args = parse_args()

    # NOTE(security): every line is parsed with eval(), which executes
    # arbitrary expressions -- only feed prediction files you trust.
    # The file is closed deterministically and blank lines are skipped.
    with open(args.pred_path, 'r') as pred_file:
        res = [eval(line.strip()) for line in pred_file if line.strip()]

    # Per-sample hits (1) and misses (0), overall and grouped by task.
    task_acc = {name: [] for name in tasks}
    acc = []
    for x in res:
        value = 0 if x['pred'] != x['gt'] else 1
        acc.append(value)
        task_acc[x['task_type']].append(value)

    undefined = float("nan")
    acc = sum(acc) * 100 / len(acc) if acc else undefined
    task_acc = {
        name: (sum(hits) * 100 / len(hits) if hits else undefined)
        for name, hits in task_acc.items()
    }
    print(f"{args.pred_path}:", acc)

    # Lay the tasks out four per row: a row of names, then a row of accuracies.
    task_names = list(tasks.keys())
    table_data = []
    for i in range(len(task_names) // 4):
        row_task_names = task_names[i * 4: (i + 1) * 4]
        table_data.append(row_task_names)
        table_data.append([task_acc[x] for x in row_task_names])
    print(tabulate(table_data, floatfmt=".1f"), '\n')
54
+
55
+
56
def parse_args():
    """Parse the command line of the MVBench evaluation script.

    Returns:
        Namespace whose ``pred_path`` is the value of ``--pred_path``
        (empty string when the option is omitted).
    """
    cli = argparse.ArgumentParser(description="Evaluate video captioning.")
    cli.add_argument("--pred_path", default=r'', help="The path to file containing prediction.")
    return cli.parse_args()
61
+
62
+
63
# Run the evaluation only when this file is executed as a script.
if __name__ == '__main__':
    main()
videollama2/eval/run_inference_video_qa_batch.py ADDED
@@ -0,0 +1,563 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import argparse
6
+ import warnings
7
+
8
+ import torch
9
+ import decord
10
+ import numpy as np
11
+ import transformers
12
+ from PIL import Image
13
+ from tqdm import tqdm
14
+ from decord import VideoReader, cpu
15
+ from torch.utils.data import Dataset, DataLoader
16
+ from torchvision import transforms as T
17
+ from torchvision.transforms import functional as F
18
+
19
+ import sys
20
+ sys.path.append('./')
21
+ from videollama2.conversation import conv_templates, SeparatorStyle
22
+ from videollama2.constants import NUM_FRAMES, DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX
23
+ from videollama2.mm_utils import get_model_name_from_path, tokenizer_MMODAL_token, KeywordsStoppingCriteria, process_videos, expand2square
24
+ from videollama2.model.builder import load_pretrained_model
25
+
26
+
27
# NOTE: Silence the "TypedStorage is deprecated" UserWarning; background in
# https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560
warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')

# Video-modality entries looked up in the tables imported from videollama2.constants.
default_mm_token = DEFAULT_MMODAL_TOKEN["VIDEO"]
default_mm_start_token = DEFAULT_MMODAL_START_TOKEN["VIDEO"]
default_mm_end_token = DEFAULT_MMODAL_END_TOKEN["VIDEO"]
modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]
34
+
35
+
36
def split_list(lst, n):
    """Split a list into at most n (roughly) equal-sized chunks.

    The chunk size is ``ceil(len(lst) / n)``, so the last chunk may be
    shorter and fewer than ``n`` chunks may be produced.

    Args:
        lst: Sequence to split.
        n: Desired number of chunks.

    Returns:
        List of consecutive slices of ``lst``; an empty list when ``lst``
        is empty.
    """
    # An empty input would give a chunk size of 0, which range() rejects.
    if len(lst) == 0:
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
40
+
41
+
42
+ def get_chunk(lst, n, k):
43
+ chunks = split_list(lst, n)
44
+ return chunks[k]
45
+
46
+
47
class MVBenchDataset(Dataset):
    """Multiple-choice MVBench samples served as preprocessed frame batches.

    Each entry of ``data_list`` is a dict with the keys ``task_type``,
    ``prefix`` (media root folder), ``data_type`` (``'video'``, ``'gif'`` or
    ``'frame'``), ``bound`` (truthy when the annotation carries ``start`` and
    ``end``) and ``data`` (the raw annotation holding ``video``, ``question``,
    ``candidates`` and ``answer``).
    """

    def __init__(self, data_list, processor, num_segments=8):
        self.data_list = data_list

        # Dispatch table: annotation 'data_type' -> frame reader.
        self.decord_method = {
            'video': self.read_video,
            'gif': self.read_gif,
            'frame': self.read_frame,
        }

        self.processor = processor
        self.num_segments = num_segments

    def __str__(self):
        """Summarize per-task sample counts, option counts and random-guess accuracy."""
        len_list = {}
        option_list = {}
        for data in self.data_list:
            task_type = data['task_type']
            len_list[task_type] = len_list.get(task_type, 0) + 1
            option_list[task_type] = option_list.get(task_type, 0) + len(data['data']['candidates'])

        correct = 0
        total = 0
        res = f"There are {len(self.data_list)} videos as follow:\n"
        for k, v in len_list.items():
            correct += v
            total += option_list[k]
            res += f"{v} for {k} ({option_list[k]} options => {v/option_list[k]*100:.2f}%)\n"
        # Random-guess accuracy is questions / options. The previous code also
        # added 1/option_count per task to the question count, which skewed it.
        if total:
            res += f"Total random accuracy: {correct/total*100:.2f}%"
        return res.rstrip()

    def __len__(self):
        return len(self.data_list)

    def get_index(self, bound, fps, max_frame, first_idx=0):
        """Pick ``num_segments`` frame indices, one from the middle of each segment.

        Args:
            bound: Optional ``(start, end)`` pair in seconds; falsy means the whole clip.
            fps: Frames per second used to convert seconds to frame indices.
            max_frame: Largest frame index that may be selected.
            first_idx: Smallest frame index that may be selected.

        Returns:
            A NumPy integer array of length ``num_segments``.
        """
        if bound:
            start, end = bound[0], bound[1]
        else:
            # Sentinels that the clamping below reduces to [first_idx, max_frame].
            start, end = -100000, 100000
        start_idx = max(first_idx, round(start * fps))
        end_idx = min(round(end * fps), max_frame)
        seg_size = float(end_idx - start_idx) / self.num_segments
        frame_indices = np.array([
            int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
            for idx in range(self.num_segments)
        ])
        return frame_indices

    def read_video(self, video_path, bound=None):
        """Decode the selected frames of a video file with decord and preprocess them."""
        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        max_frame = len(vr) - 1
        fps = float(vr.get_avg_fps())

        frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
        images_group = [Image.fromarray(vr[frame_index].asnumpy()) for frame_index in frame_indices]
        torch_imgs = self.processor(images_group, return_tensors='pt')['pixel_values']
        return torch_imgs

    def read_gif(self, video_path, bound=None, fps=25):
        """Decode the selected frames of a GIF and preprocess them.

        The previous implementation called ``imageio`` and ``cv2``, neither of
        which this module imports, so it failed with ``NameError``. PIL (already
        a dependency of this module) now decodes the GIF and drops the alpha
        channel via ``convert('RGB')``.
        """
        from PIL import ImageSequence

        images_group = list()
        with Image.open(video_path) as gif:
            max_frame = getattr(gif, 'n_frames', 1) - 1
            frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
            wanted = set(frame_indices.tolist())
            for index, frame in enumerate(ImageSequence.Iterator(gif)):
                if index in wanted:
                    # convert() returns a new image, so it stays valid after seeking on.
                    images_group.append(frame.convert('RGB'))
        torch_imgs = self.processor(images_group, return_tensors='pt')['pixel_values']
        return torch_imgs

    def read_frame(self, video_path, bound=None, fps=3):
        """Load the selected frames from a folder of ``00001.jpg``-style images."""
        max_frame = len(os.listdir(video_path))
        frame_indices = self.get_index(bound, fps, max_frame, first_idx=1)  # frame_idx starts from 1
        images_group = [
            Image.open(os.path.join(video_path, f"{frame_index:05d}.jpg"))
            for frame_index in frame_indices
        ]
        torch_imgs = self.processor.preprocess(images_group, return_tensors='pt')['pixel_values']
        return torch_imgs

    def qa_template(self, data):
        """Format an annotation as ``(question with lettered options, lettered answer)``."""
        question = f"Question: {data['question']}\n"
        question += "Options:\n"
        answer = data['answer']
        answer_idx = -1
        for idx, c in enumerate(data['candidates']):
            question += f"({chr(ord('A') + idx)}) {c}\n"
            if c == answer:
                answer_idx = idx
        question = question.rstrip()
        answer = f"({chr(ord('A') + answer_idx)}) {answer}"
        return question, answer

    def __getitem__(self, idx):
        """Return frames, prompt and ground truth for sample ``idx``.

        ``answer_idx`` is ``-1`` when the annotated answer is not among the candidates.
        """
        sample = self.data_list[idx]
        anno = sample['data']
        decord_method = self.decord_method[sample['data_type']]
        bound = None
        if sample['bound']:
            bound = (anno['start'], anno['end'])
        video_path = os.path.join(sample['prefix'], anno['video'])
        torch_imgs = decord_method(video_path, bound)
        question = anno['question']
        options = anno['candidates']
        answer = anno['answer']
        task_type = sample['task_type']

        answer_idx = -1
        letters = []
        options_string = ''
        for option_idx, c in enumerate(options):
            letter = chr(ord('A') + option_idx)
            letters.append(f"{letter}")
            options_string += f"({letter}) {c}\n"
            if c == answer:
                answer_idx = option_idx

        option_question = f'Question: {question}\nOptions:\n{options_string}Answer with the option\'s letter from the given choices directly and only give the best option.'

        return {
            'video': torch_imgs,
            'video_path': video_path,
            'question': option_question,
            # Joined into one string so the default collate keeps one entry per sample.
            'letters': ','.join(letters),
            'answer_idx': answer_idx,
            'task_type': task_type
        }
189
+
190
+
191
# MVBench task table: task name -> (annotation JSON file name, media sub-folder,
# data_type key selecting the reader in MVBenchDataset, whether annotations carry
# 'start'/'end' bounds). build_mvbench_eval() joins the first entry onto
# args.question_file and the second onto args.video_folder.
tasks = {
    "Action Sequence": ("action_sequence.json", "star/Charades_v1_480/", "video", True), # has start & end
    "Action Prediction": ("action_prediction.json", "star/Charades_v1_480/", "video", True), # has start & end
    "Action Antonym": ("action_antonym.json", "ssv2_video/", "video", False),
    "Fine-grained Action": ("fine_grained_action.json", "Moments_in_Time_Raw/videos/", "video", False),
    "Unexpected Action": ("unexpected_action.json", "FunQA_test/test/", "video", False),
    "Object Existence": ("object_existence.json", "clevrer/video_validation/", "video", False),
    "Object Interaction": ("object_interaction.json", "star/Charades_v1_480/", "video", True), # has start & end
    "Object Shuffle": ("object_shuffle.json", "perception/videos/", "video", False),
    "Moving Direction": ("moving_direction.json", "clevrer/video_validation/", "video", False),
    "Action Localization": ("action_localization.json", "sta/sta_video/", "video", True), # has start & end
    "Scene Transition": ("scene_transition.json", "scene_qa/video/", "video", False),
    "Action Count": ("action_count.json", "perception/videos/", "video", False),
    "Moving Count": ("moving_count.json", "clevrer/video_validation/", "video", False),
    "Moving Attribute": ("moving_attribute.json", "clevrer/video_validation/", "video", False),
    "State Change": ("state_change.json", "perception/videos/", "video", False),
    "Fine-grained Pose": ("fine_grained_pose.json", "nturgbd/", "video", False),
    "Character Order": ("character_order.json", "perception/videos/", "video", False),
    "Egocentric Navigation": ("egocentric_navigation.json", "vlnqa/", "video", False),
    "Episodic Reasoning": ("episodic_reasoning.json", "tvqa/frames_fps3_hq/", "frame", True), # has start & end, read frame
    "Counterfactual Inference": ("counterfactual_inference.json", "clevrer/video_validation/", "video", False),
}
213
+
214
+
215
def build_mvbench_eval(args, processor, num_frames):
    """Build the MVBench DataLoader for this worker's chunk.

    Reads every task's annotation file from ``args.question_file`` (used as a
    directory here), tags each annotation with its task metadata, keeps chunk
    ``args.chunk_idx`` of ``args.num_chunks`` and wraps it in a non-shuffled
    DataLoader.
    """
    samples = []
    for task_name, (anno_name, media_subdir, data_type, has_bound) in tasks.items():
        anno_path = os.path.join(args.question_file, anno_name)
        media_root = os.path.join(args.video_folder, media_subdir)
        with open(anno_path, 'r') as f:
            annotations = json.load(f)
        samples.extend(
            {
                'task_type': task_name,
                'prefix': media_root,
                'data_type': data_type,
                'bound': has_bound,
                'data': anno,
            }
            for anno in annotations
        )

    samples = get_chunk(samples, args.num_chunks, args.chunk_idx)
    return DataLoader(
        MVBenchDataset(samples, processor, num_segments=num_frames),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
    )
235
+
236
+
237
def mvbench_dump(ans_file, line, outputs):
    """Parse the predicted option letter of each output and append one JSON line per sample.

    Args:
        ans_file: Writable text file object.
        line: Collated batch providing ``video_path``, ``task_type``, ``letters``
            (comma-joined option letters) and ``answer_idx`` per sample.
        outputs: Decoded model outputs, one per sample in the batch.
    """
    for idx, output in enumerate(outputs):
        vid = line['video_path'][idx]
        task_type = line['task_type'][idx]
        letters = line['letters'][idx].split(',')
        answer_idx = line['answer_idx'][idx].item()

        # Raw string: same pattern as before, without invalid-escape warnings.
        pred_answer = re.findall(rf'[\(,\ ]*[{letters[0]}-{letters[-1]}][\),\ ]*', output)
        if len(pred_answer) == 0:
            # No letter found: record a deliberately wrong option.
            pred_idx = (answer_idx + 1) % len(letters)
        else:
            # The match is one letter surrounded only by '(', ')', ',' or ' '.
            # Strip all of them so matches like "C, " or "B)" no longer make
            # letters.index() raise ValueError.
            pred_idx = letters.index(pred_answer[0].strip('(), '))

        ans_file.write(json.dumps({"vid": vid, "task_type": task_type, "pred": pred_idx, "gt": answer_idx}) + '\n')
254
+
255
+
256
class NextoeDataset(Dataset):
    """Open-ended video QA samples (dataset option ``nextoe``).

    Each entry of ``data_list`` provides ``video``, ``question``, ``answer`` and ``qid``.
    """

    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    def __init__(self, data_list, processor, num_segments=8):
        self.data_list = data_list
        self.processor = processor
        self.num_segments = num_segments

    def __len__(self):
        return len(self.data_list)

    def _find_video(self, video_name):
        """Return the first existing ``<video_folder>/<video_name><ext>`` path.

        Raises:
            FileNotFoundError: If no file exists for any known extension
                (previously this surfaced later as an unbound ``video_path``).
        """
        # NOTE: relies on the module-level ``args`` parsed in the __main__ block.
        for fmt in self.video_formats:
            temp_path = os.path.join(args.video_folder, f"{video_name}{fmt}")
            if os.path.exists(temp_path):
                return temp_path
        raise FileNotFoundError(
            f"No video '{video_name}' with extensions {self.video_formats} in {args.video_folder}")

    def __getitem__(self, idx):
        line = self.data_list[idx]
        question = line['question']
        answer = line['answer']

        video_path = self._find_video(line['video'])

        decord_vr = VideoReader(uri=video_path, ctx=cpu(0))
        # Uniformly spaced frames over the whole clip; use the configured frame
        # count rather than a hard-coded 8.
        frame_ids = np.linspace(0, len(decord_vr) - 1, self.num_segments, dtype=int)
        frames = decord_vr.get_batch(frame_ids).asnumpy()
        video_tensor = self.processor.preprocess(frames, return_tensors='pt')['pixel_values']  # do not pad for video frames

        wrapped_question = f'Question: {question}\nAnswer the question using a single word or a short phrase with multiple words.'

        return {
            'video': video_tensor,
            'question': wrapped_question,
            'answer': answer,
            'qid': line['qid']
        }
292
+
293
+
294
def build_nextoe_eval(args, processor, num_frames):
    """Build the non-shuffled DataLoader over this worker's chunk of ``args.question_file``."""
    # Context manager so the question file is closed right after parsing.
    with open(args.question_file, "r") as f:
        questions = json.load(f)
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    dataset = NextoeDataset(questions, processor, num_segments=num_frames)
    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)

    return dataloader
301
+
302
+
303
def nextoe_dump(ans_file, line, outputs):
    """Append one JSON line per output; ``qid`` is split on '_' into video id and question id."""
    for pos, prediction in enumerate(outputs):
        vid, qid = line['qid'][pos].split('_')
        record = {"vid": vid, "qid": qid, "prediction": prediction}
        ans_file.write(json.dumps(record) + '\n')
307
+
308
+
309
class NextqaDataset(Dataset):
    """Five-option multiple-choice video QA samples (dataset option ``nextqa``).

    Each entry of ``data_list`` provides ``video``, ``question``, ``answer``,
    ``qid``, ``num_option`` and the options ``a0`` .. ``a4``.
    """

    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    def __init__(self, data_list, processor, num_segments=8):
        self.data_list = data_list
        self.processor = processor
        self.num_segments = num_segments

    def __len__(self):
        return len(self.data_list)

    def _find_video(self, video_name):
        """Return the first existing ``<video_folder>/<video_name><ext>`` path.

        Raises:
            FileNotFoundError: If no file exists for any known extension
                (previously this surfaced later as an unbound ``video_path``).
        """
        # NOTE: relies on the module-level ``args`` parsed in the __main__ block.
        for fmt in self.video_formats:
            temp_path = os.path.join(args.video_folder, f"{video_name}{fmt}")
            if os.path.exists(temp_path):
                return temp_path
        raise FileNotFoundError(
            f"No video '{video_name}' with extensions {self.video_formats} in {args.video_folder}")

    def __getitem__(self, idx):
        line = self.data_list[idx]
        question = line['question']
        answer = line['answer']

        video_path = self._find_video(line['video'])

        decord_vr = VideoReader(uri=video_path, ctx=cpu(0))
        # Uniformly spaced frames over the whole clip; use the configured frame
        # count rather than a hard-coded 8.
        frame_ids = np.linspace(0, len(decord_vr) - 1, self.num_segments, dtype=int)
        frames = decord_vr.get_batch(frame_ids).asnumpy()
        video_tensor = self.processor.preprocess(frames, return_tensors='pt')['pixel_values']  # do not pad for video frames

        # Explicit check instead of assert, which is stripped under ``python -O``.
        if line['num_option'] != 5:
            raise ValueError(f"Expected 5 options for qid {line['qid']}, got {line['num_option']}")
        a0 = line['a0']
        a1 = line['a1']
        a2 = line['a2']
        a3 = line['a3']
        a4 = line['a4']

        option_question = f'Question: {question}\nOptions:\n(A) {a0}\n(B) {a1}\n(C) {a2}\n(D) {a3}\n(E) {a4}\nAnswer with the option\'s letter from the given choices directly and only give the best option.'

        return {
            'video': video_tensor,
            'question': option_question,
            'answer': answer,
            'qid': line['qid']
        }
352
+
353
+
354
def build_nextqa_eval(args, processor, num_frames):
    """Build the non-shuffled DataLoader over this worker's chunk of ``args.question_file``."""
    # Context manager so the question file is closed right after parsing.
    with open(args.question_file, "r") as f:
        questions = json.load(f)
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    dataset = NextqaDataset(questions, processor, num_segments=num_frames)
    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)

    return dataloader
361
+
362
+
363
def nextqa_dump(ans_file, line, outputs):
    """Parse the predicted option (A-E) of each output and append one JSON line per sample.

    Args:
        ans_file: Writable text file object.
        line: Collated batch providing ``qid`` and ``answer`` per sample.
        outputs: Decoded model outputs, one per sample in the batch.
    """
    letters = ['A', 'B', 'C', 'D', 'E']

    for idx, output in enumerate(outputs):
        qid = line['qid'][idx]
        answer = line['answer'][idx].item()

        # Raw string: same pattern as before, without invalid-escape warnings.
        pred_answer = re.findall(r'[\(,\ ]*[A-E][\),\ ]*', output)
        if len(pred_answer) == 0:
            # No letter found: fall back to option index 2, as before.
            pred_idx = 2
        else:
            # The match is one letter surrounded only by '(', ')', ',' or ' '.
            # Strip all of them so matches like "E, " or "B)" no longer make
            # letters.index() raise ValueError.
            pred_idx = letters.index(pred_answer[0].strip('(), '))

        ans_file.write(json.dumps({"id": qid, "prediction": pred_idx, "answer": answer}) + '\n')
380
+
381
+
382
class EgoschemaDataset(Dataset):
    """Five-option multiple-choice video QA samples (dataset option ``egoschema``).

    Each entry of ``data_list`` provides ``q_uid`` (also the video file stem),
    ``question`` and the options ``option 0`` .. ``option 4``.
    """

    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    def __init__(self, data_list, processor, num_segments=8):
        self.data_list = data_list
        self.processor = processor
        self.num_segments = num_segments

    def __len__(self):
        return len(self.data_list)

    def _find_video(self, video_name):
        """Return the first existing ``<video_folder>/<video_name><ext>`` path.

        Raises:
            FileNotFoundError: If no file exists for any known extension
                (previously this surfaced later as an unbound ``video_path``).
        """
        # NOTE: relies on the module-level ``args`` parsed in the __main__ block.
        for fmt in self.video_formats:
            temp_path = os.path.join(args.video_folder, f"{video_name}{fmt}")
            if os.path.exists(temp_path):
                return temp_path
        raise FileNotFoundError(
            f"No video '{video_name}' with extensions {self.video_formats} in {args.video_folder}")

    def __getitem__(self, idx):
        line = self.data_list[idx]
        q_uid = line['q_uid']

        video_path = self._find_video(q_uid)

        decord_vr = VideoReader(uri=video_path, ctx=cpu(0))
        # Uniformly spaced frames over the whole clip.
        frame_ids = np.linspace(0, len(decord_vr) - 1, self.num_segments, dtype=int)
        frames = decord_vr.get_batch(frame_ids).asnumpy()
        video_tensor = self.processor.preprocess(frames, return_tensors='pt')['pixel_values']  # do not pad for video frames

        question = line['question']
        a0 = line['option 0']
        a1 = line['option 1']
        a2 = line['option 2']
        a3 = line['option 3']
        a4 = line['option 4']

        # NOTE(review): the "\n." before "Answer" looks like a typo, but it is part
        # of the prompt the model sees, so it is kept byte-for-byte; confirm before changing.
        option_question = f'Question: {question}\nOptions:\n(A) {a0}\n(B) {a1}\n(C) {a2}\n(D) {a3}\n(E) {a4}\n.Answer with the option\'s letter from the given choices directly and only give the best option.'

        return {
            'q_uid': q_uid,
            'video': video_tensor,
            'question': option_question,
        }
424
+
425
+
426
def build_egoschema_eval(args, processor, num_frames):
    """Build the non-shuffled DataLoader over this worker's chunk of ``args.question_file``."""
    # Context manager so the question file is closed right after parsing.
    with open(args.question_file, "r") as f:
        questions = json.load(f)
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    dataset = EgoschemaDataset(questions, processor, num_segments=num_frames)
    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)

    return dataloader
433
+
434
+
435
def egoschema_dump(ans_file, line, outputs):
    """Parse the predicted option (A-E) of each output and append a ``q_uid, index`` line.

    Args:
        ans_file: Writable text file object.
        line: Collated batch providing ``q_uid`` per sample.
        outputs: Decoded model outputs, one per sample in the batch.
    """
    letters = ['A', 'B', 'C', 'D', 'E']

    for idx, output in enumerate(outputs):
        q_uid = line['q_uid'][idx]

        # Raw string: same pattern as before, without invalid-escape warnings.
        pred_answer = re.findall(r'[\(\ ]*[A-E][\)\ ]*', output)
        if len(pred_answer) == 0:
            # No letter found: fall back to option index 2, as before.
            pred_idx = 2
        else:
            # The match is one letter surrounded only by '(', ')' or ' '. Strip
            # all of them so a match like "( B )" no longer makes
            # letters.index() raise ValueError.
            pred_idx = letters.index(pred_answer[0].strip('() '))
        ans_file.write(f'{q_uid}, {pred_idx}\n')
449
+
450
+
451
def get_model_output(model, video_tensor, tokenizer, questions, conv_mode="v1", device='cuda'):
    """Generate one decoded answer per question for a batch of videos.

    Args:
        model: Model exposing ``config.mm_use_im_start_end`` and ``generate``.
        video_tensor: Batched video frames; cast to half precision and moved to ``device``.
        tokenizer: Tokenizer used to encode the prompts and decode the outputs.
        questions: Iterable of question strings, one per video in the batch.
        conv_mode: Key into ``conv_templates`` selecting the conversation template.
        device: Device that inputs are moved to.

    Returns:
        The list of strings returned by ``tokenizer.batch_decode``.
    """
    if model.config.mm_use_im_start_end:
        video_prefix = default_mm_start_token + default_mm_token + default_mm_end_token + "\n"
    else:
        video_prefix = default_mm_token + "\n"

    input_ids = []
    modal_list = []
    for qs in questions:
        conv = conv_templates[conv_mode].copy()
        conv.append_message(conv.roles[0], video_prefix + qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_ids.append(tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt'))
        modal_list.append("video")

    # Left pad: reverse each sequence, right-pad the batch, then reverse back.
    input_ids = torch.nn.utils.rnn.pad_sequence(
        [x.flip(dims=[0]) for x in input_ids],
        batch_first=True,
        padding_value=tokenizer.pad_token_id).flip(dims=[1]).to(device)

    attention_mask = input_ids.ne(tokenizer.pad_token_id).to(device)

    # Use the ``device`` argument; this line previously read the global ``args.device``.
    video_tensor = video_tensor.half().to(device)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            images_or_videos=video_tensor,
            modal_list=modal_list,
            do_sample=False,
            max_new_tokens=1024,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id)

    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return outputs
493
+
494
+
495
def run_inference(args):
    """Run batched video QA inference on the selected dataset and write predictions.

    Args:
        args: Command-line arguments. ``args.dataset`` selects one of
            ``mvbench``, ``nextoe``, ``nextqa`` or ``egoschema``; predictions
            are written to ``args.answer_file``.

    Raises:
        NotImplementedError: If ``args.dataset`` is not one of the above.
    """
    # Dataset name -> (DataLoader builder, per-batch answer writer).
    handlers = {
        'mvbench': (build_mvbench_eval, mvbench_dump),
        'nextoe': (build_nextoe_eval, nextoe_dump),
        'nextqa': (build_nextqa_eval, nextqa_dump),
        'egoschema': (build_egoschema_eval, egoschema_dump),
    }
    # Validate before loading the model so a typo fails fast.
    if args.dataset not in handlers:
        raise NotImplementedError(f"Dataset {args.dataset} not implemented.")
    build_eval, dump = handlers[args.dataset]

    # Initialize the model
    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name)

    num_frames = model.config.num_frames if hasattr(model.config, "num_frames") else NUM_FRAMES

    val_loader = build_eval(args, processor, num_frames)

    answer_file = os.path.expanduser(args.answer_file)
    answer_dir = os.path.dirname(answer_file)
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    if answer_dir:
        os.makedirs(answer_dir, exist_ok=True)

    # Context manager so the answer file is closed even if inference fails midway.
    with open(answer_file, "w") as ans_file:
        for line in tqdm(val_loader):
            video_tensor = line['video']
            questions = line['question']

            outputs = get_model_output(model, video_tensor, tokenizer, questions, args.conv_mode, args.device)
            dump(ans_file, line, outputs)
544
+
545
+
546
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Multiple-Choice Video QA Evaluation Script.')

    parser.add_argument('--dataset', help='Dataset to evaluate on (mvbench, nextoe, nextqa or egoschema).', required=True)
    parser.add_argument('--model-path', help='Path of the model checkpoint to evaluate.', required=True)
    parser.add_argument('--model_base', help='Optional base model path passed to the model loader.', default=None, type=str, required=False)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--question-file', help='Path to the ground truth file containing question (a directory of task files for mvbench).', required=True)
    # This is where predictions are written, not a ground-truth input.
    parser.add_argument('--answer-file', help='Path of the output file the predictions are written to.', required=True)
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--model_max_length", type=int, required=False, default=2048)
    parser.add_argument("--batch-size", type=int, default=1)
    parser.add_argument("--num-workers", type=int, default=8)
    # ``args`` must stay a module-level name: the Dataset classes in this file
    # read ``args.video_folder`` directly.
    args = parser.parse_args()
    run_inference(args)
videollama2/eval/run_inference_video_qa_gpt.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import os
3
+ import argparse
4
+ import json
5
+ import warnings
6
+ from tqdm import tqdm
7
+
8
+ import torch
9
+ import numpy as np
10
+ import transformers
11
+ import decord
12
+ from decord import VideoReader, cpu
13
+
14
+ import sys
15
+ sys.path.append('./')
16
+ from videollama2.conversation import conv_templates, SeparatorStyle
17
+ from videollama2.constants import NUM_FRAMES, DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX
18
+ from videollama2.mm_utils import get_model_name_from_path, tokenizer_MMODAL_token, KeywordsStoppingCriteria, process_video
19
+ from videollama2.model.builder import load_pretrained_model
20
+
21
+
22
# NOTE: Silence the "TypedStorage is deprecated" UserWarning; background in
# https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560
warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')

# Video-modality entries of the project token tables, looked up once at import
# time. get_model_output() uses them to prefix each question with the video
# placeholder and to tokenize that placeholder.
default_mm_token = DEFAULT_MMODAL_TOKEN["VIDEO"]
default_mm_start_token = DEFAULT_MMODAL_START_TOKEN["VIDEO"]
default_mm_end_token = DEFAULT_MMODAL_END_TOKEN["VIDEO"]
modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]
29
+
30
+
31
def split_list(lst, n):
    """Split a list into at most ``n`` contiguous chunks of (roughly) equal size.

    Every chunk except possibly the last holds ``ceil(len(lst) / n)`` items, so
    fewer than ``n`` chunks come back when the list is short (5 items split
    4 ways gives 3 chunks). An empty list yields an empty list of chunks.

    Raises:
        ValueError: If ``n`` is not a positive integer.
    """
    if n <= 0:
        raise ValueError(f"n must be a positive integer, got {n}")
    # Round up so every item is covered; clamp to 1 so an empty list does not
    # hand range() a zero step.
    chunk_size = max(1, math.ceil(len(lst) / n))
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    """Return chunk ``k`` of ``lst`` split into ``n`` chunks by ``split_list``.

    When the list is too short to fill all ``n`` chunks, the unfilled trailing
    chunk indices return an empty list, so a worker assigned such a chunk simply
    has nothing to do. Any other out-of-range ``k`` still raises ``IndexError``.
    """
    chunks = split_list(lst, n)
    if len(chunks) <= k < n:
        return []
    return chunks[k]
40
+
41
+
42
def get_model_output(model, tokenizer, video_tensor, questions, conv_mode="v1", device='cuda'):
    """Generate one decoded answer per question for a batch of videos.

    Args:
        model: Model exposing ``config.mm_use_im_start_end`` and ``generate``.
        tokenizer: Tokenizer used to encode the prompts and decode the outputs.
        video_tensor: Batched video frames; cast to half precision and moved to ``device``.
        questions: Iterable of question strings, one per video in the batch.
        conv_mode: Key into ``conv_templates`` selecting the conversation template.
        device: Device that inputs are moved to.

    Returns:
        The list of strings returned by ``tokenizer.batch_decode``.
    """
    if model.config.mm_use_im_start_end:
        video_prefix = default_mm_start_token + default_mm_token + default_mm_end_token + "\n"
    else:
        video_prefix = default_mm_token + "\n"

    input_ids = []
    modal_list = []
    for qs in questions:
        conv = conv_templates[conv_mode].copy()
        conv.append_message(conv.roles[0], video_prefix + qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_ids.append(tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt'))
        modal_list.append("video")

    # Left pad: reverse each sequence, right-pad the batch, then reverse back.
    input_ids = torch.nn.utils.rnn.pad_sequence(
        [x.flip(dims=[0]) for x in input_ids],
        batch_first=True,
        padding_value=tokenizer.pad_token_id).flip(dims=[1]).to(device)

    attention_mask = input_ids.ne(tokenizer.pad_token_id).to(device)

    # Use the ``device`` argument; this line previously read the global ``args.device``.
    video_tensor = video_tensor.half().to(device)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            images_or_videos=video_tensor,
            modal_list=modal_list,
            do_sample=False,
            max_new_tokens=1024,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id)

    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return outputs
84
+
85
+
86
def run_inference(args):
    """Answer every question of this worker's chunk and write one JSON line per sample.

    Args:
        args: Command-line arguments. Questions come from ``args.question_file``,
            reference answers from ``args.answer_file`` (matched to the questions
            by position) and results go to ``args.output_file``.

    Raises:
        FileNotFoundError: If no video file can be found for a sample.
    """
    # Initialize the model
    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name)

    num_frames = model.config.num_frames if hasattr(model.config, "num_frames") else NUM_FRAMES

    # Context managers so the input files are closed right after parsing.
    with open(args.question_file, "r") as f:
        gt_questions = json.load(f)
    gt_questions = get_chunk(gt_questions, args.num_chunks, args.chunk_idx)
    with open(args.answer_file, "r") as f:
        gt_answers = json.load(f)
    gt_answers = get_chunk(gt_answers, args.num_chunks, args.chunk_idx)

    output_dir = os.path.dirname(args.output_file)
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    with open(args.output_file, "w") as ans_file:
        # Iterate over each sample in the ground truth file
        for idx, sample in enumerate(tqdm(gt_questions)):
            video_name = sample['video_name']
            question = sample['question']
            question_id = sample['question_id']
            answer = gt_answers[idx]['answer']

            # Reset per sample so a missing video can never silently reuse the
            # path found for the previous sample.
            video_path = None
            for fmt in video_formats:
                # Try the "v_"-prefixed name first, then the bare name (kept for
                # MSVD, MSRVTT, TGIF compatibility).
                for file_name in (f"v_{video_name}{fmt}", f"{video_name}{fmt}"):
                    temp_path = os.path.join(args.video_folder, file_name)
                    if os.path.exists(temp_path):
                        video_path = temp_path
                        break
                if video_path is not None:
                    break
            if video_path is None:
                raise FileNotFoundError(
                    f"No video '{video_name}' with extensions {video_formats} in {args.video_folder}")

            video_tensor = process_video(video_path, processor, aspect_ratio=None, sample_scheme='uniform', num_frames=num_frames)
            output = get_model_output(model, tokenizer, video_tensor[None], [question], args.conv_mode, args.device)[0]

            sample_set = {'id': question_id, 'question': question, 'answer': answer, 'pred': output}
            ans_file.write(json.dumps(sample_set) + "\n")
132
+
133
+
134
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Define the command-line arguments
    parser.add_argument('--model-path', help='Path of the model checkpoint to evaluate.', required=True)
    parser.add_argument('--model_base', help='Optional base model path passed to the model loader.', default=None, type=str, required=False)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--question-file', help='Path to the ground truth file containing question.', required=True)
    parser.add_argument('--answer-file', help='Path to the ground truth file containing answers.', required=True)
    # run_inference opens this path as a file, so it is a file path, not a directory.
    parser.add_argument('--output-file', help='Path of the file the model results are written to (one JSON object per line).', required=True)
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--model_max_length", type=int, required=False, default=2048)

    args = parser.parse_args()
    run_inference(args)
videollama2/eval/run_inference_video_qa_gpt_consistency.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import argparse
6
+ import warnings
7
+ from tqdm import tqdm
8
+
9
+ import torch
10
+ import decord
11
+ import numpy as np
12
+ import transformers
13
+ from decord import VideoReader, cpu
14
+ from torch.utils.data import Dataset, DataLoader
15
+
16
+ import sys
17
+ sys.path.append('./')
18
+ from videollama2.conversation import conv_templates, SeparatorStyle
19
+ from videollama2.constants import NUM_FRAMES, DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX
20
+ from videollama2.mm_utils import get_model_name_from_path, tokenizer_MMODAL_token, KeywordsStoppingCriteria, process_video
21
+ from videollama2.model.builder import load_pretrained_model
22
+
23
+
24
# NOTE: Silence the "TypedStorage is deprecated" UserWarning; background in
# https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560
warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')

# Video-modality entries of the project token tables, looked up once at import
# time. get_model_output() uses them to prefix each question with the video
# placeholder and to tokenize that placeholder.
default_mm_token = DEFAULT_MMODAL_TOKEN["VIDEO"]
default_mm_start_token = DEFAULT_MMODAL_START_TOKEN["VIDEO"]
default_mm_end_token = DEFAULT_MMODAL_END_TOKEN["VIDEO"]
modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]
31
+
32
+
33
def split_list(lst, n):
    """Split a list into at most ``n`` contiguous chunks of (roughly) equal size.

    Every chunk except possibly the last holds ``ceil(len(lst) / n)`` items, so
    fewer than ``n`` chunks come back when the list is short (5 items split
    4 ways gives 3 chunks). An empty list yields an empty list of chunks.

    Raises:
        ValueError: If ``n`` is not a positive integer.
    """
    if n <= 0:
        raise ValueError(f"n must be a positive integer, got {n}")
    # Round up so every item is covered; clamp to 1 so an empty list does not
    # hand range() a zero step.
    chunk_size = max(1, math.ceil(len(lst) / n))
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    """Return chunk ``k`` of ``lst`` split into ``n`` chunks by ``split_list``.

    When the list is too short to fill all ``n`` chunks, the unfilled trailing
    chunk indices return an empty list, so a worker assigned such a chunk simply
    has nothing to do. Any other out-of-range ``k`` still raises ``IndexError``.
    """
    chunks = split_list(lst, n)
    if len(chunks) <= k < n:
        return []
    return chunks[k]
42
+
43
+
44
class VCGPTDataset(Dataset):
    """Paired-question video QA samples: two questions and one answer per video.

    Each entry of ``data_list`` provides ``Q1``, ``Q2``, ``A`` and ``video_name``.
    """

    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    def __init__(self, data_list, processor, num_frames):
        self.data_list = data_list
        self.processor = processor
        self.num_frames = num_frames

    def __len__(self):
        return len(self.data_list)

    def _find_video(self, video_name):
        """Return the first existing ``<video_folder>/<video_name><ext>`` path.

        Raises:
            FileNotFoundError: If no file exists for any known extension
                (previously this surfaced later as an unbound ``video_path``).
        """
        # NOTE: relies on the module-level ``args`` parsed in the __main__ block.
        for fmt in self.video_formats:
            temp_path = os.path.join(args.video_folder, f"{video_name}{fmt}")
            if os.path.exists(temp_path):
                return temp_path
        raise FileNotFoundError(
            f"No video '{video_name}' with extensions {self.video_formats} in {args.video_folder}")

    def __getitem__(self, idx):
        line = self.data_list[idx]
        question1 = line['Q1']
        question2 = line['Q2']
        answer = line['A']
        video_name = line['video_name']

        video_path = self._find_video(video_name)

        video_tensor = process_video(video_path, self.processor, aspect_ratio=None, sample_scheme='uniform', num_frames=self.num_frames)

        return {
            'video': video_tensor,
            'video_name': video_name,
            'question1': question1,
            'question2': question2,
            'answer': answer,
        }
78
+
79
+
80
def collate_fn(batch):
    """Collate sample dicts into ``(stacked videos, names, first questions, second questions, answers)``.

    Only the video tensors are stacked (along a new leading dimension); the
    other fields are returned as plain Python lists in batch order.
    """
    names = [sample['video_name'] for sample in batch]
    first_questions = [sample['question1'] for sample in batch]
    second_questions = [sample['question2'] for sample in batch]
    answers = [sample['answer'] for sample in batch]
    videos = torch.stack([sample['video'] for sample in batch], dim=0)
    return videos, names, first_questions, second_questions, answers
88
+
89
+
90
def get_model_output(model, tokenizer, qs, video_tensor, args):
    """Generate the decoded, whitespace-stripped answer to one question about one video.

    Args:
        model: Model exposing ``config.mm_use_im_start_end`` and ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        qs: Question text.
        video_tensor: Frames of a single video; cast to float16 on ``args.device``.
        args: Namespace providing ``conv_mode`` and ``device``.
    """
    # Prefix the question with the video placeholder, optionally wrapped in
    # the start/end tokens.
    if model.config.mm_use_im_start_end:
        video_prefix = default_mm_start_token + default_mm_token + default_mm_end_token
    else:
        video_prefix = default_mm_token
    qs = video_prefix + "\n" + qs

    conversation = conv_templates[args.conv_mode].copy()
    user_role, assistant_role = conversation.roles[0], conversation.roles[1]
    conversation.append_message(user_role, qs)
    conversation.append_message(assistant_role, None)
    prompt = conversation.get_prompt()

    input_ids = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt').to(args.device)
    attention_mask = input_ids.ne(tokenizer.pad_token_id).to(args.device)

    video_tensor = video_tensor.to(dtype=torch.float16, device=args.device, non_blocking=True)

    with torch.inference_mode():
        # Add the batch dimension that generate() is called with.
        output_ids = model.generate(
            input_ids.unsqueeze(0),
            attention_mask=attention_mask.unsqueeze(0),
            images_or_videos=[video_tensor],
            modal_list=["video"],
            do_sample=False,
            max_new_tokens=1024,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id)

    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0].strip()
122
+
123
+
124
def run_inference(args):
    """Answer both questions of every sample in this worker's chunk and write JSON lines.

    Args:
        args: Command-line arguments. Samples come from ``args.question_file``
            and results go to ``args.answer_file``.

    Raises:
        ValueError: If ``args.batch_size`` is not 1.
    """
    # Explicit check instead of assert (stripped under ``python -O``), done
    # before the model is loaded.
    if args.batch_size != 1:
        raise ValueError("Batch size must be 1 for inference")

    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name)

    num_frames = model.config.num_frames if hasattr(model.config, "num_frames") else NUM_FRAMES

    # Context manager so the question file is closed right after parsing.
    with open(args.question_file, "r") as f:
        questions = json.load(f)
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)

    dataset = VCGPTDataset(questions, processor, num_frames)
    dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)

    answer_file = os.path.expanduser(args.answer_file)
    answer_dir = os.path.dirname(answer_file)
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    if answer_dir:
        os.makedirs(answer_dir, exist_ok=True)

    # Context manager so the answer file is closed even if inference fails midway.
    with open(answer_file, "w") as ans_file:
        for video_tensors, video_names, questions1, questions2, answers in tqdm(dataloader):
            # reduce batch dimension
            video_tensor = video_tensors[0]
            video_name = video_names[0]
            question1 = questions1[0]
            question2 = questions2[0]
            answer = answers[0]

            output1 = get_model_output(model, tokenizer, question1, video_tensor, args)
            output2 = get_model_output(model, tokenizer, question2, video_tensor, args)

            qa = {'video_name': video_name, 'Q1': question1, 'Q2': question2, 'A': answer, 'P1': output1, 'P2': output2}

            ans_file.write(json.dumps(qa) + "\n")
161
+
162
+
163
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Define the command-line arguments
    parser.add_argument('--model-path', help='Path of the model checkpoint to evaluate.', required=True)
    parser.add_argument('--model_base', help='Optional base model path passed to the model loader.', default=None, type=str, required=False)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--question-file', help='Path to the ground truth file containing question.', required=True)
    # This is where predictions are written, not a ground-truth input.
    parser.add_argument('--answer-file', help='Path of the output file the predictions are written to.', required=True)
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--model_max_length", type=int, required=False, default=2048)
    parser.add_argument("--batch-size", type=int, required=False, default=1)
    parser.add_argument("--num-workers", type=int, required=False, default=8)

    # ``args`` must stay a module-level name: VCGPTDataset reads
    # ``args.video_folder`` directly.
    args = parser.parse_args()

    run_inference(args)
videollama2/eval/run_inference_video_qa_gpt_general.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import argparse
6
+ import warnings
7
+ from tqdm import tqdm
8
+
9
+ import torch
10
+ import decord
11
+ import numpy as np
12
+ import transformers
13
+ from decord import VideoReader, cpu
14
+ from torch.utils.data import Dataset, DataLoader
15
+
16
+ import sys
17
+ sys.path.append('./')
18
+ from videollama2.conversation import conv_templates, SeparatorStyle
19
+ from videollama2.constants import NUM_FRAMES, DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX
20
+ from videollama2.mm_utils import get_model_name_from_path, tokenizer_MMODAL_token, KeywordsStoppingCriteria, process_video
21
+ from videollama2.model.builder import load_pretrained_model
22
+
23
+
24
+ # NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
25
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
26
+
27
+ default_mm_token = DEFAULT_MMODAL_TOKEN["VIDEO"]
28
+ default_mm_start_token = DEFAULT_MMODAL_START_TOKEN["VIDEO"]
29
+ default_mm_end_token = DEFAULT_MMODAL_END_TOKEN["VIDEO"]
30
+ modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]
31
+
32
+
33
def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks.

    Every chunk holds ceil(len(lst) / n) items except possibly the last one,
    so fewer than n chunks are returned when the list is short
    (e.g. 3 items split 4 ways yields 3 single-item chunks).

    Args:
        lst: The list to split.
        n: The requested number of chunks.

    Returns:
        A list of consecutive slices of `lst`; an empty list when `lst` is empty.
    """
    if not lst:
        # A chunk size of 0 would make range() raise "arg 3 must not be zero".
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
37
+
38
+
39
def get_chunk(lst, n, k):
    """Return the k-th of the n chunks produced by `split_list`.

    Args:
        lst: The list to split.
        n: The requested number of chunks.
        k: Index of the chunk to return.

    Returns:
        The selected chunk, or an empty list when `lst` is empty or when fewer
        than k + 1 chunks exist (split_list can return fewer than n chunks), so
        a surplus worker gets no work instead of crashing with an IndexError.
    """
    if not lst:
        return []
    chunks = split_list(lst, n)
    if -len(chunks) <= k < len(chunks):
        return chunks[k]
    return []
42
+
43
+
44
class VCGPTDataset(Dataset):
    """Dataset yielding one preprocessed video plus its question/answer pair.

    Each entry of `data_list` is a dict with the keys 'Q', 'A' and 'video_name'.
    The video file is looked up as `<video_folder>/<video_name><ext>` for each
    extension in `video_formats`, in order.
    """

    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    def __init__(self, data_list, processor, num_frames, video_folder=None):
        """
        Args:
            data_list: List of annotation dicts (see class docstring).
            processor: Image processor forwarded to `process_video`.
            num_frames: Number of frames to sample from each video.
            video_folder: Directory containing the videos. When None, the
                module-level `args.video_folder` is used (original behaviour).
        """
        self.data_list = data_list
        self.processor = processor
        self.num_frames = num_frames
        self.video_folder = video_folder

    def __len__(self):
        return len(self.data_list)

    def _resolve_video_path(self, video_name):
        """Return the first existing `<folder>/<video_name><ext>` path.

        Raises:
            FileNotFoundError: If no file exists for any supported extension.
        """
        # Fall back to the script-level CLI namespace for backward compatibility.
        folder = self.video_folder if self.video_folder is not None else args.video_folder
        for fmt in self.video_formats:
            temp_path = os.path.join(folder, f"{video_name}{fmt}")
            if os.path.exists(temp_path):
                return temp_path
        raise FileNotFoundError(
            f"No video named '{video_name}' with extensions {self.video_formats} found in '{folder}'"
        )

    def __getitem__(self, idx):
        line = self.data_list[idx]
        question = line['Q']
        answer = line['A']
        video_name = line['video_name']

        video_path = self._resolve_video_path(video_name)

        video_tensor = process_video(video_path, self.processor, aspect_ratio=None, sample_scheme='uniform', num_frames=self.num_frames)

        return {
            'video': video_tensor,
            'video_name': video_name,
            'question': question,
            'answer': answer,
        }
76
+
77
+
78
def collate_fn(batch):
    """Collate dataset samples into (stacked videos, names, questions, answers).

    The 'video' tensors are stacked along a new leading dimension; the other
    fields are returned as plain Python lists in batch order.
    """
    videos, names, questions, answers = [], [], [], []
    for sample in batch:
        videos.append(sample['video'])
        names.append(sample['video_name'])
        questions.append(sample['question'])
        answers.append(sample['answer'])
    return torch.stack(videos, dim=0), names, questions, answers
85
+
86
+
87
def get_model_output(model, tokenizer, qs, video_tensor, args):
    """Generate the model's answer to one question about one video.

    Args:
        model: Model exposing `config.mm_use_im_start_end` and `generate`.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        qs: The question text.
        video_tensor: Preprocessed video frames; cast to float16 and moved to
            `args.device` before generation.
        args: Namespace providing `conv_mode` and `device`.

    Returns:
        The decoded generation with special tokens removed and whitespace stripped.
    """
    # Prepend the video placeholder (optionally wrapped in start/end tokens).
    if model.config.mm_use_im_start_end:
        qs = default_mm_start_token + default_mm_token + default_mm_end_token + "\n" + qs
    else:
        qs = default_mm_token + "\n" + qs

    conv = conv_templates[args.conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    input_ids = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt').to(args.device)

    # Tokenizers without a pad token report pad_token_id=None, and
    # Tensor.ne(None) raises; a single unpadded prompt attends everywhere.
    if tokenizer.pad_token_id is None:
        attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
    else:
        attention_mask = input_ids.ne(tokenizer.pad_token_id)

    modal_list = ["video"]
    video_tensor = video_tensor.to(dtype=torch.float16, device=args.device, non_blocking=True)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids.unsqueeze(0),  # add the batch dimension
            attention_mask=attention_mask.unsqueeze(0),
            images_or_videos=[video_tensor],
            modal_list=modal_list,
            do_sample=False,
            max_new_tokens=1024,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id)

    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
    return outputs
119
+
120
+
121
def run_inference(args):
    """Run video QA inference over one chunk of the question file.

    Loads the model, answers every question in the selected chunk and writes
    one JSON object per line ({'video_name', 'Q', 'A', 'P'}) to `args.answer_file`.

    Args:
        args: Parsed command-line namespace (model_path, model_base,
            question_file, answer_file, num_chunks, chunk_idx, batch_size,
            num_workers, plus the fields read by `get_model_output`).
    """
    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name)

    num_frames = model.config.num_frames if hasattr(model.config, "num_frames") else NUM_FRAMES

    with open(args.question_file, "r") as f:
        questions = json.load(f)
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)

    assert args.batch_size == 1, "Batch size must be 1 for inference"
    dataset = VCGPTDataset(questions, processor, num_frames)
    dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)

    answer_file = os.path.expanduser(args.answer_file)
    answer_dir = os.path.dirname(answer_file)
    if answer_dir:
        # dirname is '' for a bare filename, and os.makedirs('') raises.
        os.makedirs(answer_dir, exist_ok=True)

    # The context manager closes the file even if inference fails midway.
    with open(answer_file, "w") as ans_file:
        # Iterate over each sample in the ground truth file
        for video_tensors, video_names, batch_questions, answers in tqdm(dataloader):
            # reduce batch dimension (batch size is asserted to be 1)
            video_tensor = video_tensors[0]
            video_name = video_names[0]
            question = batch_questions[0]
            answer = answers[0]

            output = get_model_output(model, tokenizer, question, video_tensor, args)

            qa = {'video_name': video_name, 'Q': question, 'A': answer, 'P': output}

            ans_file.write(json.dumps(qa) + "\n")
156
+
157
+
158
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Define the command-line arguments
    parser.add_argument('--model-path', help='Path of the pretrained model to load.', required=True)
    parser.add_argument('--model_base', help='Optional base model path forwarded to load_pretrained_model.', default=None, type=str, required=False)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--question-file', help='Path to the JSON file containing the questions.', required=True)
    # The answer file is an OUTPUT: predictions are written to it.
    parser.add_argument('--answer-file', help='Path of the output file to which predictions are written (one JSON object per line).', required=True)
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--model_max_length", type=int, required=False, default=2048)
    parser.add_argument("--batch-size", type=int, required=False, default=1)
    parser.add_argument("--num-workers", type=int, required=False, default=8)

    # NOTE: `args` must stay a module-level name; VCGPTDataset.__getitem__
    # reads the global `args.video_folder`.
    args = parser.parse_args()

    run_inference(args)
videollama2/eval/run_inference_video_qa_perception_test_mcqa.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import argparse
6
+ import warnings
7
+ from tqdm import tqdm
8
+
9
+ import torch
10
+ import decord
11
+ import numpy as np
12
+ import transformers
13
+ from decord import VideoReader, cpu
14
+ from torch.utils.data import Dataset, DataLoader
15
+
16
+ import sys
17
+ sys.path.append('./')
18
+ from videollama2.conversation import conv_templates, SeparatorStyle
19
+ from videollama2.constants import NUM_FRAMES, DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX
20
+ from videollama2.mm_utils import get_model_name_from_path, tokenizer_MMODAL_token, KeywordsStoppingCriteria, process_videos
21
+ from videollama2.model.builder import load_pretrained_model
22
+
23
+
24
+ # NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
25
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
26
+
27
+ default_mm_token = DEFAULT_MMODAL_TOKEN["VIDEO"]
28
+ default_mm_start_token = DEFAULT_MMODAL_START_TOKEN["VIDEO"]
29
+ default_mm_end_token = DEFAULT_MMODAL_END_TOKEN["VIDEO"]
30
+ modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]
31
+
32
+
33
def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks.

    Every chunk holds ceil(len(lst) / n) items except possibly the last one,
    so fewer than n chunks are returned when the list is short.

    Args:
        lst: The list to split.
        n: The requested number of chunks.

    Returns:
        A list of consecutive slices of `lst`; an empty list when `lst` is empty.
    """
    if not lst:
        # A chunk size of 0 would make range() raise "arg 3 must not be zero".
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
37
+
38
+
39
def get_chunk(lst, n, k):
    """Return the k-th of the n chunks produced by `split_list`.

    Args:
        lst: The list to split.
        n: The requested number of chunks.
        k: Index of the chunk to return.

    Returns:
        The selected chunk, or an empty list when `lst` is empty or when fewer
        than k + 1 chunks exist (split_list can return fewer than n chunks), so
        a surplus worker gets no work instead of crashing with an IndexError.
    """
    if not lst:
        return []
    chunks = split_list(lst, n)
    if -len(chunks) <= k < len(chunks):
        return chunks[k]
    return []
42
+
43
+
44
class PerceptionTestMCQADataset(Dataset):
    """Dataset yielding one video together with all its multiple-choice questions.

    Each entry of `data_list` is a dict with `['metadata']['video_id']` and a
    `'mc_question'` list whose items carry 'question', 'id' and 'options'.
    Prompts are built from the first three options, labelled (A), (B), (C).
    """

    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    def __init__(self, data_list, processor, num_segments=8, video_folder=None):
        """
        Args:
            data_list: List of annotation dicts (see class docstring).
            processor: Image processor whose `preprocess` is applied to the frames.
            num_segments: Number of frames sampled uniformly from each video.
            video_folder: Directory containing the videos. When None, the
                module-level `args.video_folder` is used (original behaviour).
        """
        self.data_list = data_list
        self.processor = processor
        self.num_segments = num_segments
        self.video_folder = video_folder

    def __len__(self):
        return len(self.data_list)

    def _resolve_video_path(self, video_name):
        """Return the first existing `<folder>/<video_name><ext>` path.

        Raises:
            FileNotFoundError: If no file exists for any supported extension.
        """
        # Fall back to the script-level CLI namespace for backward compatibility.
        folder = self.video_folder if self.video_folder is not None else args.video_folder
        for fmt in self.video_formats:
            temp_path = os.path.join(folder, f"{video_name}{fmt}")
            if os.path.exists(temp_path):
                return temp_path
        raise FileNotFoundError(
            f"No video named '{video_name}' with extensions {self.video_formats} found in '{folder}'"
        )

    def __getitem__(self, idx):
        line = self.data_list[idx]
        video_name = line['metadata']['video_id']
        mc_questions = line['mc_question']

        video_path = self._resolve_video_path(video_name)

        # Sample num_segments frame indices evenly over the whole video.
        decord_vr = VideoReader(uri=video_path, ctx=cpu(0))
        frames = decord_vr.get_batch(np.linspace(0, len(decord_vr) - 1, self.num_segments, dtype=int)).asnumpy()
        video_tensor = self.processor.preprocess(frames, return_tensors='pt')['pixel_values']  # do not pad for video frames

        qs = []
        qids = []
        ops = []
        for q in mc_questions:
            question = q['question']
            qid = q['id']
            options = q['options']
            option_question = f'Question: {question}\nOptions:\n(A) {options[0]}\n(B) {options[1]}\n(C) {options[2]}\nAnswer with the option\'s letter from the given choices directly and only give the best option.'

            qs.append(option_question)
            qids.append(qid)
            ops.append(options)

        return {
            'video': video_tensor,
            'video_id': video_name,
            'questions': qs,
            'question_ids': qids,
            'options': ops,
        }
91
+
92
+
93
def collate_fn(batch):
    """Collate samples into (stacked videos, video ids, questions, question ids, options).

    The 'video' tensors are stacked along a new leading dimension; the other
    fields are returned as per-sample lists in batch order.
    """
    videos, video_ids, questions, question_ids, options = [], [], [], [], []
    for sample in batch:
        videos.append(sample['video'])
        video_ids.append(sample['video_id'])
        questions.append(sample['questions'])
        question_ids.append(sample['question_ids'])
        options.append(sample['options'])
    return torch.stack(videos, dim=0), video_ids, questions, question_ids, options
101
+
102
+
103
def get_model_output(model, tokenizer, qs, video_tensor, args):
    """Generate the model's answer to one question about one video.

    Args:
        model: Model exposing `config.mm_use_im_start_end` and `generate`.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        qs: The question text.
        video_tensor: Preprocessed video frames; cast to float16 and moved to
            `args.device` before generation.
        args: Namespace providing `conv_mode` and `device`.

    Returns:
        The decoded generation with special tokens removed and whitespace stripped.
    """
    # Prepend the video placeholder (optionally wrapped in start/end tokens).
    if model.config.mm_use_im_start_end:
        qs = default_mm_start_token + default_mm_token + default_mm_end_token + "\n" + qs
    else:
        qs = default_mm_token + "\n" + qs

    conv = conv_templates[args.conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    input_ids = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt').to(args.device)

    # Tokenizers without a pad token report pad_token_id=None, and
    # Tensor.ne(None) raises; a single unpadded prompt attends everywhere.
    if tokenizer.pad_token_id is None:
        attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
    else:
        attention_mask = input_ids.ne(tokenizer.pad_token_id)

    modal_list = ["video"]
    video_tensor = video_tensor.to(dtype=torch.float16, device=args.device, non_blocking=True)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids.unsqueeze(0),  # add the batch dimension
            attention_mask=attention_mask.unsqueeze(0),
            images_or_videos=[video_tensor],
            modal_list=modal_list,
            do_sample=False,
            max_new_tokens=1024,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id)

    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
    return outputs
135
+
136
+
137
_OPTION_LETTERS = 'ABC'
# Raw string: '\(' in a normal string literal is an invalid escape sequence.
_OPTION_PATTERN = re.compile(r'\(*[A-C]\)*')


def _parse_option_index(output, options):
    """Map a raw model answer onto an index into `options`."""
    matches = _OPTION_PATTERN.findall(output)
    if matches:
        # A match may be 'A', '(A', 'A)' or '(A)'; strip the parentheses so
        # that half-parenthesised answers no longer raise ValueError.
        # NOTE(review): the pattern also matches a capital A-C inside a word
        # (e.g. the 'A' of "Answer"); kept to preserve the scoring.
        return _OPTION_LETTERS.index(matches[0].strip('()'))
    lowered = [x.lower() for x in options]
    if output.lower() in lowered:
        # The model answered with the option text instead of its letter.
        return lowered.index(output.lower())
    # Nothing could be parsed: fall back to the last option (C).
    return 2


def run_inference(args):
    """Run multiple-choice video QA inference over one chunk of the question file.

    For every video, answers each of its questions and writes one line of the
    form `"<video_id>": <json list of {'id', 'answer_id', 'answer'}>,` to
    `args.answer_file`.

    Args:
        args: Parsed command-line namespace (model_path, model_base,
            question_file, answer_file, num_chunks, chunk_idx, batch_size,
            num_workers, plus the fields read by `get_model_output`).
    """
    # Initialize the model
    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name)

    with open(args.question_file, "r") as f:
        questions = json.load(f)
    questions = list(questions.values())
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)

    num_frames = model.config.num_frames if hasattr(model.config, "num_frames") else NUM_FRAMES

    assert args.batch_size == 1, "Batch size must be 1 for inference"
    dataset = PerceptionTestMCQADataset(questions, processor, num_frames)
    dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)

    answer_file = os.path.expanduser(args.answer_file)
    answer_dir = os.path.dirname(answer_file)
    if answer_dir:
        # dirname is '' for a bare filename, and os.makedirs('') raises.
        os.makedirs(answer_dir, exist_ok=True)

    # The context manager closes the file even if inference fails midway.
    with open(answer_file, "w") as ans_file:
        # Iterate over each sample in the ground truth file
        for video_tensors, video_ids, batch_questions, batch_question_ids, batch_options in tqdm(dataloader):
            # reduce batch dimension (batch size is asserted to be 1)
            video_tensor = video_tensors[0]
            video_id = video_ids[0]
            video_questions = batch_questions[0]
            question_ids = batch_question_ids[0]
            options = batch_options[0]

            qas = []
            for idx, question in enumerate(video_questions):
                question_id = question_ids[idx]
                _options = options[idx]

                output = get_model_output(model, tokenizer, question, video_tensor, args)
                pred_idx = _parse_option_index(output, _options)

                qas.append({'id': question_id, 'answer_id': pred_idx, 'answer': _options[pred_idx]})

            ans_file.write('\"{}\": {},\n'.format(video_id, json.dumps(qas)))
194
+
195
+
196
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Define the command-line arguments
    parser.add_argument('--model-path', help='Path of the pretrained model to load.', required=True)
    parser.add_argument('--model_base', help='Optional base model path forwarded to load_pretrained_model.', default=None, type=str, required=False)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--question-file', help='Path to the JSON file containing the questions.', required=True)
    # The answer file is an OUTPUT: predictions are written to it.
    parser.add_argument('--answer-file', help='Path of the output file to which predictions are written.', required=True)
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--model_max_length", type=int, required=False, default=2048)
    parser.add_argument("--batch-size", type=int, required=False, default=1)
    parser.add_argument("--num-workers", type=int, required=False, default=8)
    # NOTE: `args` must stay a module-level name; PerceptionTestMCQADataset.__getitem__
    # reads the global `args.video_folder`.
    args = parser.parse_args()

    run_inference(args)
videollama2/mm_utils.py ADDED
@@ -0,0 +1,535 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import math
3
+ import base64
4
+ from io import BytesIO
5
+
6
+ import torch
7
+ import decord
8
+ import imageio
9
+ import numpy as np
10
+ from PIL import Image
11
+ from decord import VideoReader, cpu
12
+ from moviepy.editor import VideoFileClip
13
+ from transformers import StoppingCriteria
14
+
15
+ from scenedetect import open_video, SceneManager
16
+ from scenedetect.detectors import ContentDetector
17
+ from scenedetect.stats_manager import StatsManager
18
+
19
+ from .constants import NUM_FRAMES, MAX_FRAMES, NUM_FRAMES_PER_SECOND, MMODAL_INDEX_TOKEN, IMAGE_TOKEN_INDEX
20
+
21
+
22
def merge_scenes(cut_list, cut_scores, scene_list, num_frames, max_scene_num=4, num_frame_per_scene=8, min_frames_per_scene=30):
    """Merge detected scenes down to `max_scene_num` and sample frames per scene.

    Args:
        cut_list: Detected cut points; each element exposes `get_frames()`.
        cut_scores: One score per cut; the lowest-scoring cuts are dropped first.
        scene_list: Detected scenes as (start, end) pairs.
        num_frames: Total number of frames in the video.
        max_scene_num: Maximum number of scenes to keep.
        num_frame_per_scene: Number of frame indices sampled from each scene.
        min_frames_per_scene: Currently unused; kept for interface compatibility.

    Returns:
        A list with one sequence of evenly spaced frame indices per scene.
    """
    if len(scene_list) == len(cut_list) and len(scene_list) == 0:
        frame_ids = np.linspace(0, num_frames-1, num_frame_per_scene, dtype=int)  # only one scene for current video
        return [frame_ids]

    scene_list, cut_results = merge_scenes_not_exeed_max_scene_num(cut_list, cut_scores, scene_list, max_scene_num)

    prev_cut_point = 0
    list_of_scene_frames = []
    for (cur_cut_point, _) in cut_results:
        frame_ids = list(np.linspace(prev_cut_point, cur_cut_point-1, num_frame_per_scene, dtype=int))
        list_of_scene_frames.append(frame_ids)
        prev_cut_point = cur_cut_point
    # prev_cut_point is the last cut (or 0 when no cut survived). Using it
    # instead of the loop variable avoids an unbound name when there are no cuts.
    if prev_cut_point < num_frames:
        frame_ids = np.linspace(prev_cut_point, num_frames-1, num_frame_per_scene, dtype=int)
        list_of_scene_frames.append(frame_ids)

    return list_of_scene_frames
40
+
41
+
42
def merge_scenes_not_exeed_max_scene_num(cut_list, cut_scores, scene_list, max_scene_num):
    """Merge adjacent scenes until at most `max_scene_num` remain.

    Repeatedly removes the cut with the lowest score and fuses the two scenes
    on either side of it.

    Args:
        cut_list: Detected cut points; each element exposes `get_frames()`.
        cut_scores: One score per cut (cut i separates scene i and scene i + 1).
        scene_list: Scenes as (start, end) pairs.
        max_scene_num: Maximum number of scenes to keep.

    Returns:
        (scene_list, cut_results), where cut_results is a list of
        (cut_frame, cut_score) tuples for the cuts that survived.
    """
    cut_frames = [ele.get_frames() for ele in cut_list]
    # Work on copies so the caller's sequences are never mutated.
    cut_scores = list(cut_scores)
    scene_list = list(scene_list)
    # Stop when no cut is left to remove; argmin on an empty list would raise.
    while len(scene_list) > max_scene_num and cut_scores:
        min_idx = int(np.argmin(cut_scores))
        del cut_frames[min_idx]
        del cut_scores[min_idx]
        # Cut min_idx separates scenes min_idx and min_idx + 1; one slice
        # assignment covers the first, middle and last positions alike.
        merged = (scene_list[min_idx][0], scene_list[min_idx + 1][1])
        scene_list[min_idx:min_idx + 2] = [merged]
    cut_results = list(zip(cut_frames, cut_scores))
    return scene_list, cut_results
67
+
68
+
69
def split_video_into_scenes(video_path, threshold=27.0, max_scene_num=10, num_frame_per_scene=8):
    """Detect scenes in a video and sample frame indices from each scene.

    Scenes are detected with a ContentDetector; if more than `max_scene_num`
    scenes are found, the cuts with the lowest "delta_lum" metric are removed
    (merging their neighbouring scenes) until the limit is met.

    Args:
        video_path: Path of the video file.
        threshold: Threshold passed to ContentDetector.
        max_scene_num: Maximum number of scenes to keep.
        num_frame_per_scene: Number of frame indices sampled from each scene.

    Returns:
        A list with one sequence of evenly spaced frame indices per scene.
    """
    # Open video, create a scene manager, and add a detector.
    video = open_video(video_path)
    stats_manager = StatsManager()
    scene_manager = SceneManager(stats_manager)
    detector = ContentDetector(threshold=threshold)
    scene_manager.add_detector(detector)
    scene_manager.detect_scenes(video)
    scene_list = scene_manager.get_scene_list()
    cut_list = scene_manager.get_cut_list()
    num_frames = video.duration.get_frames()
    if len(scene_list) == len(cut_list) and len(scene_list) == 0:
        frame_ids = np.linspace(0, num_frames-1, num_frame_per_scene, dtype=int)  # only one scene for current video
        return [frame_ids]
    assert len(scene_list) == len(cut_list) + 1, f"inconsistent lengths for scene list ({len(scene_list)}) vs. cut list ({len(cut_list)})"
    cut_frames = [ele.get_frames() for ele in cut_list]
    cut_scores = [stats_manager.get_metrics(f, ["delta_lum"])[0] for f in cut_frames]
    scene_list = list(scene_list)
    # Drop the weakest cut and fuse the two scenes around it until few enough
    # scenes remain (cut i separates scene i and scene i + 1).
    while len(scene_list) > max_scene_num and cut_scores:
        min_idx = int(np.argmin(cut_scores))
        del cut_frames[min_idx]
        del cut_scores[min_idx]
        merged = (scene_list[min_idx][0], scene_list[min_idx + 1][1])
        scene_list[min_idx:min_idx + 2] = [merged]
    cut_results = list(zip(cut_frames, cut_scores))

    prev_cut_point = 0
    list_of_scene_frames = []
    for (cur_cut_point, _) in cut_results:
        frame_ids = list(np.linspace(prev_cut_point, cur_cut_point-1, num_frame_per_scene, dtype=int))
        list_of_scene_frames.append(frame_ids)
        prev_cut_point = cur_cut_point
    # prev_cut_point is the last cut (or 0 when there is none). Using it
    # instead of the loop variable avoids an unbound name when there are no cuts.
    if prev_cut_point < num_frames:
        frame_ids = np.linspace(prev_cut_point, num_frames-1, num_frame_per_scene, dtype=int)
        list_of_scene_frames.append(frame_ids)
    return list_of_scene_frames
121
+
122
+
123
def select_best_resolution(original_size, possible_resolutions):
    """
    Pick the candidate resolution that best fits an image of the given size.

    The winner keeps the most effective (non-upscaled) pixels after an
    aspect-preserving fit; ties go to the candidate wasting the least area.

    Args:
        original_size (tuple): The original size of the image in the format (width, height).
        possible_resolutions (list): Candidate resolutions in the format [(width1, height1), (width2, height2), ...].
    Returns:
        tuple: The best fit resolution in the format (width, height), or None if no candidate qualifies.
    """
    src_w, src_h = original_size
    src_area = src_w * src_h

    winner = None
    best_effective = 0
    least_wasted = float('inf')
    for cand_w, cand_h in possible_resolutions:
        # Largest uniform scale at which the image still fits the candidate.
        ratio = min(cand_w / src_w, cand_h / src_h)
        scaled_area = int(src_w * ratio) * int(src_h * ratio)
        effective = min(scaled_area, src_area)
        wasted = cand_w * cand_h - effective

        is_better = effective > best_effective
        is_tie_with_less_waste = effective == best_effective and wasted < least_wasted
        if is_better or is_tie_with_less_waste:
            winner = (cand_w, cand_h)
            best_effective = effective
            least_wasted = wasted
    return winner
146
+
147
+
148
def resize_and_pad_image(image, target_resolution):
    """
    Scale an image to fit inside a target resolution and centre it on a black canvas.

    Args:
        image (PIL.Image.Image): The input image.
        target_resolution (tuple): The target resolution (width, height) of the image.
    Returns:
        PIL.Image.Image: An RGB image of exactly the target resolution.
    """
    src_w, src_h = image.size
    dst_w, dst_h = target_resolution
    ratio_w = dst_w / src_w
    ratio_h = dst_h / src_h

    if ratio_w >= ratio_h:
        # Height is the limiting side: fill it and derive the width.
        fit_h = dst_h
        fit_w = min(math.ceil(src_w * ratio_h), dst_w)
    else:
        # Width is the limiting side: fill it and derive the height.
        fit_w = dst_w
        fit_h = min(math.ceil(src_h * ratio_w), dst_h)

    scaled = image.resize((fit_w, fit_h))
    canvas = Image.new('RGB', (dst_w, dst_h), (0, 0, 0))
    offset = ((dst_w - fit_w) // 2, (dst_h - fit_h) // 2)
    canvas.paste(scaled, offset)
    return canvas
174
+
175
+
176
def divide_to_patches(image, patch_size):
    """
    Cut an image into square patches, row by row from the top-left corner.

    Args:
        image (PIL.Image.Image): The input image.
        patch_size (int): The side length of each patch.
    Returns:
        list: The cropped patches, in row-major order.
    """
    width, height = image.size
    return [
        image.crop((left, top, left + patch_size, top + patch_size))
        for top in range(0, height, patch_size)
        for left in range(0, width, patch_size)
    ]
193
+
194
+
195
def get_anyres_image_grid_shape(image_size, grids, patch_size):
    """
    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.

    Args:
        image_size (tuple): The size of the input image in the format (width, height).
        grids (str, List[tuple[int]]): Patch segmentation grid, either a sequence of
            (x, y) pairs or the string representation of such a sequence.
        patch_size (int): The size of each image patch.
    Returns:
        tuple: The shape of the image patch grid in the format (width, height).
    """
    # Only strings need parsing. ast.literal_eval rejects non-string sequences,
    # so tuples and list subclasses are used as they are.
    if isinstance(grids, str):
        grids = ast.literal_eval(grids)
    possible_resolutions = [(x * patch_size, y * patch_size) for x, y in grids]
    width, height = select_best_resolution(image_size, possible_resolutions)
    return width // patch_size, height // patch_size
211
+
212
+
213
def process_anyres_image(image, grids, patch_size):
    """
    Process an image with variable resolutions.

    Args:
        image (PIL.Image.Image): The input image to be processed.
        grids (str, List[tuple[int]]): Patch segmentation grid, either a sequence of
            (x, y) pairs or the string representation of such a sequence.
        patch_size (int): The size of the patches to be extracted.
    Returns:
        list: The whole image resized and padded to (patch_size, patch_size),
            followed by the patches cut from the image at its best-fit resolution.
    """
    # Only strings need parsing. ast.literal_eval rejects non-string sequences,
    # so tuples and list subclasses are used as they are.
    if isinstance(grids, str):
        grids = ast.literal_eval(grids)
    possible_resolutions = [(x * patch_size, y * patch_size) for x, y in grids]
    best_resolution = select_best_resolution(image.size, possible_resolutions)
    image_padded = resize_and_pad_image(image, best_resolution)
    patches = divide_to_patches(image_padded, patch_size)
    image_original_resize = resize_and_pad_image(image, (patch_size, patch_size))
    image_patches = [image_original_resize] + patches
    return image_patches
233
+
234
+
235
def chunk_list(input_list, chunk_size):
    """Split `input_list` into consecutive slices of at most `chunk_size` items."""
    chunks = []
    for start in range(0, len(input_list), chunk_size):
        chunks.append(input_list[start:start + chunk_size])
    return chunks
237
+
238
+
239
def frame_expansion(frame_list, n):
    """Tile n * n equally sized frames into a single n-by-n RGB image (row-major).

    The tile size is taken from the first frame.
    """
    assert len(frame_list) == n * n
    tile_w = frame_list[0].width
    tile_h = frame_list[0].height
    canvas = Image.new('RGB', (n * tile_w, n * tile_h))
    for index, frame in enumerate(frame_list):
        row, col = divmod(index, n)
        canvas.paste(frame, (col * tile_w, row * tile_h))
    return canvas
251
+
252
+
253
def load_image_from_base64(image):
    """Decode a base64-encoded image and open it as a PIL image."""
    raw_bytes = base64.b64decode(image)
    buffer = BytesIO(raw_bytes)
    return Image.open(buffer)
255
+
256
+
257
def expand2square(pil_img, background_color):
    """Pad an image to a square, centring it on a `background_color` canvas.

    A square input is returned unchanged (same object); otherwise a new image
    whose side equals the longer edge is returned.
    """
    width, height = pil_img.size
    if width == height:
        return pil_img

    side = max(width, height)
    canvas = Image.new(pil_img.mode, (side, side), background_color)
    # Only the shorter dimension gets a non-zero offset.
    canvas.paste(pil_img, ((side - width) // 2, (side - height) // 2))
    return canvas
269
+
270
+
271
def process_images(images, image_processor, model_cfg):
    """Preprocess images, padding each to a square first when the config asks for it.

    If `model_cfg.image_aspect_ratio` is 'pad', each image is padded to a square
    with the processor's mean colour and preprocessed on its own; the results
    are stacked when all shapes agree, otherwise returned as a list. For any
    other setting the whole batch goes through `image_processor` directly.
    """
    if getattr(model_cfg, "image_aspect_ratio", None) != 'pad':
        return image_processor(images, return_tensors='pt')['pixel_values']

    processed = []
    for img in images:
        fill_color = tuple(int(channel * 255) for channel in image_processor.image_mean)
        squared = expand2square(img, fill_color)
        pixel_values = image_processor.preprocess(squared, return_tensors='pt')['pixel_values']
        processed.append(pixel_values[0])

    same_shape = all(item.shape == processed[0].shape for item in processed)
    if same_shape:
        processed = torch.stack(processed, dim=0)
    return processed
285
+
286
+
287
def process_videos(frames, image_processor, model_cfg):
    """Preprocess video frames without padding them (inference-time helper).

    `model_cfg` is accepted for interface symmetry with `process_images` but is
    not consulted.
    """
    batch = image_processor.preprocess(frames, return_tensors='pt')
    return batch['pixel_values']
304
+
305
+
306
def create_photo_grid(arr, rows=None, cols=None):
    """
    Tile a stack of equally sized images into one grid image (row-major).

    Parameters:
        arr (numpy.ndarray): Input array with shape [t, h, w, c], or a list of
            PIL images / numpy arrays that is stacked into such an array.
        rows (int): Optional. Number of rows in the grid. If not set, it is derived from `cols` or the square root of `t`.
        cols (int): Optional. Number of columns in the grid. If not set, it is derived from `rows`.

    Returns:
        numpy.ndarray: A 3D array of shape [rows * h, cols * w, c]; unused cells stay zero.
    """
    if isinstance(arr, list):
        first = arr[0]
        if isinstance(first, Image.Image):
            arr = np.stack([np.array(img) for img in arr])
        elif isinstance(first, np.ndarray):
            arr = np.stack(arr)
        else:
            raise ValueError("Invalid input type. Expected list of Images or numpy arrays.")

    t, h, w, c = arr.shape

    # Derive whichever grid dimension was not supplied.
    if rows is None:
        rows = math.ceil(math.sqrt(t)) if cols is None else math.ceil(t / cols)
    if cols is None:
        cols = math.ceil(t / rows)

    if rows * cols < t:
        raise ValueError(f"Not enough grid cells ({rows}x{cols}) to hold all images ({t}).")

    grid = np.zeros((h * rows, w * cols, c), dtype=arr.dtype)

    for index, frame in enumerate(arr):
        r, col = divmod(index, cols)
        grid[r * h:(r + 1) * h, col * w:(col + 1) * w, :] = frame

    return grid
354
+
355
+
356
def process_image(image_path, processor, aspect_ratio='pad', num_frames=NUM_FRAMES, image_grid=False):
    """Load one image from disk and run it through `processor`.

    Args:
        image_path: Path handed to `Image.open`; the image is converted to RGB.
        processor: Object providing `image_mean` and `preprocess(...)`.
        aspect_ratio: When 'pad', each image goes through `expand2square`
            with a fill colour derived from `processor.image_mean`.
        num_frames: Number of copies of the image tiled into the photo grid
            (only used when `image_grid` is true).
        image_grid: When true, a grid of `num_frames` copies is prepended to
            the original image.

    Returns:
        The 'pixel_values' entry of `processor.preprocess(..., return_tensors='pt')`.
    """
    frame = np.array(Image.open(image_path).convert('RGB'))

    frames = [frame]
    if image_grid:
        side = math.ceil(math.sqrt(num_frames))
        tiled = create_photo_grid(np.stack([frame] * num_frames), side, side)
        frames = [tiled, frame]

    images = [Image.fromarray(f) for f in frames]
    if aspect_ratio == 'pad':
        fill_colour = tuple(int(x*255) for x in processor.image_mean)
        images = [expand2square(img, fill_colour) for img in images]

    return processor.preprocess(images, return_tensors='pt')['pixel_values']
375
+
376
+
377
def process_video(video_path, processor, aspect_ratio='pad', num_frames=NUM_FRAMES, image_grid=False, sample_scheme='uniform'):
    """Sample frames from a video and run them through `processor`.

    Args:
        video_path: Either a path string (`.gif` is read with imageio, `.webm`
            with moviepy, anything else with decord) or an already opened
            reader object exposing `get_data(index)`.
        processor: Object providing `image_mean` and `preprocess(...)`.
        aspect_ratio: When 'pad', frames go through `expand2square` with a
            fill colour derived from `processor.image_mean`.
        num_frames: Number of frames for 'uniform' sampling, and the basis of
            the photo-grid size.
        image_grid: When true, a photo grid of the sampled frames is prepended.
        sample_scheme: 'uniform' or 'fps'; only applied to path inputs
            (reader objects are always sampled uniformly).

    Returns:
        The 'pixel_values' entry of `processor.preprocess(..., return_tensors='pt')`.

    Raises:
        ImportError: For an unsupported sampling mode (exception type kept for
            backward compatibility).
    """
    def frame_sample(duration, mode='uniform', local_fps=None):
        if mode == 'uniform':
            return np.linspace(0, duration-1, num_frames, dtype=int)
        elif mode == 'fps':
            assert local_fps is not None
            # Clamp the stride to >= 1: when the video fps is lower than
            # NUM_FRAMES_PER_SECOND the floor division yields 0, and
            # np.arange() with a zero step raises ZeroDivisionError.
            segment_len = max(1, int(min(local_fps // NUM_FRAMES_PER_SECOND, duration)))
            return np.arange(segment_len // 2, duration, segment_len, dtype=int)
        else:
            raise ImportError(f'Unsupported frame sampling mode: {mode}')

    def limit_frames(frame_ids, duration):
        # Fall back to MAX_FRAMES uniformly spaced indices when too many were sampled.
        if len(frame_ids) > MAX_FRAMES:
            return np.linspace(0, duration-1, MAX_FRAMES, dtype=int)
        return frame_ids

    if isinstance(video_path, str):
        if video_path.endswith('.gif'):
            video_gif = imageio.get_reader(video_path)
            try:
                duration, local_fps = len(video_gif), 10

                frame_id_list = limit_frames(
                    frame_sample(duration, mode=sample_scheme, local_fps=local_fps), duration)
                # A set gives O(1) membership per decoded frame; as before,
                # repeated indices yield the frame only once.
                wanted = set(frame_id_list.tolist())
                video_data = [frame for index, frame in enumerate(video_gif) if index in wanted]
            finally:
                video_gif.close()
        # added by lixin4ever, include the support of .webm files from sthsthv2
        elif video_path.endswith('.webm'):
            video_webm = VideoFileClip(video_path)
            try:
                video_frames = np.array(list(video_webm.iter_frames()))
                duration, local_fps = len(video_frames), video_webm.fps
            finally:
                # Release the clip's underlying reader once all frames are in memory.
                video_webm.close()

            frame_id_list = limit_frames(
                frame_sample(duration, mode=sample_scheme, local_fps=local_fps), duration)
            video_data = video_frames[frame_id_list]
        else:
            if "Valley/finetune/source_videos" in video_path:
                # add num_threads=1 for Valley videos
                decord_vr = VideoReader(uri=video_path, ctx=cpu(0), num_threads=1)
            else:
                decord_vr = VideoReader(uri=video_path, ctx=cpu(0))
            duration, local_fps = len(decord_vr), float(decord_vr.get_avg_fps())

            frame_id_list = limit_frames(
                frame_sample(duration, mode=sample_scheme, local_fps=local_fps), duration)
            # The batch exposes either .numpy() or .asnumpy(); decode once and
            # pick the available accessor instead of retrying in a bare except.
            # NOTE(review): which accessor exists presumably depends on the
            # decord bridge in use - confirm.
            batch = decord_vr.get_batch(frame_id_list)
            video_data = batch.numpy() if hasattr(batch, 'numpy') else batch.asnumpy()
    else:
        video = video_path
        # `duration` was never assigned on this path (NameError). len() mirrors
        # the GIF branch above.
        # NOTE(review): assumes the reader object supports len() - confirm against callers.
        duration = len(video)
        frame_id_list = frame_sample(duration, mode='uniform')
        video_data = [video.get_data(frame_id) for frame_id in frame_id_list]

    if image_grid:
        grid_h = grid_w = math.ceil(math.sqrt(num_frames))
        pg = create_photo_grid(video_data, grid_h, grid_w)
        video_data = [pg, *video_data]

    images = [Image.fromarray(f.numpy() if isinstance(f, torch.Tensor) else f) for f in video_data]
    if aspect_ratio == 'pad':
        fill_colour = tuple(int(x*255) for x in processor.image_mean)
        images = [expand2square(image, fill_colour) for image in images]

    return processor.preprocess(images, return_tensors='pt')['pixel_values']
448
+
449
+
450
def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
    """Tokenize `prompt`, replacing every '<image>' placeholder with `image_token_index`.

    The prompt is split on '<image>', each piece is tokenized separately, and
    the pieces are re-joined with a single `image_token_index` between them.
    If the first piece starts with the tokenizer's BOS id, that id is emitted
    once at the front and stripped from every piece.

    Args:
        prompt: Text that may contain '<image>' placeholders.
        tokenizer: Callable returning an object with `input_ids`; must expose
            `bos_token_id`.
        image_token_index: Id inserted for each placeholder.
        return_tensors: None for a Python list, 'pt' for a `torch.long` tensor.

    Raises:
        ValueError: If `return_tensors` is neither None nor 'pt'.
    """
    chunks = [tokenizer(piece).input_ids for piece in prompt.split('<image>')]

    starts_with_bos = (
        len(chunks) > 0
        and len(chunks[0]) > 0
        and chunks[0][0] == tokenizer.bos_token_id
    )
    offset = 1 if starts_with_bos else 0
    input_ids = [chunks[0][0]] if starts_with_bos else []

    # The separator is padded to `offset + 1` entries so that slicing it with
    # the same `offset` as the text chunks leaves exactly one image token.
    separator = [image_token_index] * (offset + 1)
    for position, chunk in enumerate(chunks):
        if position > 0:
            input_ids.extend(separator[offset:])
        input_ids.extend(chunk[offset:])

    if return_tensors is None:
        return input_ids
    if return_tensors == 'pt':
        return torch.tensor(input_ids, dtype=torch.long)
    raise ValueError(f'Unsupported tensor type: {return_tensors}')
470
+
471
+
472
def tokenizer_MMODAL_token(prompt, tokenizer, MMODAL_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
    """Tokenize `prompt`, replacing each modality placeholder with `MMODAL_token_index`.

    The placeholder text is `<name>` where `name` is the lower-cased entry of
    `MMODAL_INDEX_TOKEN` for `MMODAL_token_index`. The prompt is split on it,
    each piece is tokenized separately, and the pieces are re-joined with a
    single `MMODAL_token_index` between them. If the first piece starts with
    the tokenizer's BOS id, that id is emitted once at the front and stripped
    from every piece.

    Args:
        prompt: Text that may contain modality placeholders.
        tokenizer: Callable returning an object with `input_ids`; must expose
            `bos_token_id`.
        MMODAL_token_index: Key into `MMODAL_INDEX_TOKEN` and the id inserted
            for each placeholder.
        return_tensors: None for a Python list, 'pt' for a `torch.long` tensor.

    Raises:
        ValueError: If `return_tensors` is neither None nor 'pt'.
    """
    placeholder = f'<{MMODAL_INDEX_TOKEN[MMODAL_token_index].lower()}>'
    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split(placeholder)]

    def insert_separator(X, sep):
        # Interleave `sep` between the elements of X (no trailing separator).
        return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]

    input_ids = []
    offset = 0
    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
        offset = 1
        input_ids.append(prompt_chunks[0][0])

    # The separator carries `offset + 1` copies so that slicing with `offset`
    # (which strips BOS from text chunks) leaves exactly one modality token.
    for x in insert_separator(prompt_chunks, [MMODAL_token_index] * (offset + 1)):
        input_ids.extend(x[offset:])

    if return_tensors is not None:
        if return_tensors == 'pt':
            return torch.tensor(input_ids, dtype=torch.long)
        raise ValueError(f'Unsupported tensor type: {return_tensors}')
    return input_ids
493
+
494
+
495
def get_model_name_from_path(model_path):
    """Derive a short model name from a path or repo id.

    Leading and trailing slashes are ignored. The last path component is
    returned, except when it starts with 'checkpoint-' and has a parent
    component, in which case '<parent>_<checkpoint-...>' is returned.

    Args:
        model_path: '/'-separated filesystem path or hub repo id.

    Returns:
        The derived model name string.
    """
    model_paths = model_path.strip("/").split("/")
    # Require a parent component: a bare 'checkpoint-N' path previously
    # raised IndexError on model_paths[-2].
    if len(model_paths) > 1 and model_paths[-1].startswith('checkpoint-'):
        return model_paths[-2] + "_" + model_paths[-1]
    return model_paths[-1]
502
+
503
+
504
class KeywordsStoppingCriteria(StoppingCriteria):
    """Stop generation once every sequence in the batch has produced a keyword.

    A keyword is detected either by an exact match of its token ids at the end
    of the sequence, or by finding its text in the decoded tail of the sequence.
    """

    def __init__(self, keywords, tokenizer, input_ids):
        """
        Args:
            keywords: Iterable of stop strings.
            tokenizer: Tokenizer used to encode the keywords and decode outputs;
                must expose `bos_token_id` and `batch_decode`.
            input_ids: Prompt ids; only `input_ids.shape[1]` is recorded, as the
                length to subtract when sizing the decoded tail.
        """
        self.keywords = keywords
        self.keyword_ids = []
        self.max_keyword_len = 0
        for keyword in keywords:
            cur_keyword_ids = tokenizer(keyword).input_ids
            # Drop a leading BOS so the ids can match mid-sequence.
            if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
                cur_keyword_ids = cur_keyword_ids[1:]
            self.max_keyword_len = max(self.max_keyword_len, len(cur_keyword_ids))
            self.keyword_ids.append(torch.tensor(cur_keyword_ids))
        self.tokenizer = tokenizer
        self.start_len = input_ids.shape[1]

    def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True if the single sequence in `output_ids` (shape [1, L]) ends with a keyword."""
        offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
        self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
        for keyword_id in self.keyword_ids:
            tail_ids = output_ids[0, -keyword_id.shape[0]:]
            # torch.equal is False on a length mismatch, whereas `==` would
            # raise (or silently broadcast a length-1 tail) when fewer tokens
            # than the keyword length are available.
            if torch.equal(tail_ids, keyword_id):
                return True
        outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
        for keyword in self.keywords:
            if keyword in outputs:
                return True
        return False

    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Check each row independently; stop only when all rows have hit a keyword."""
        outputs = []
        for i in range(output_ids.shape[0]):
            outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
        return all(outputs)
videollama2/model/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .language_model.videollama2_llama import Videollama2LlamaForCausalLM, Videollama2Config
2
+ from .language_model.videollama2_mistral import Videollama2MistralForCausalLM, Videollama2MistralConfig
3
+ from .language_model.videollama2_mixtral import Videollama2MixtralForCausalLM, Videollama2MixtralConfig
videollama2/model/builder.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ import os
18
+ import warnings
19
+ import shutil
20
+
21
+ import torch
22
+ from transformers import PretrainedConfig, AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
23
+
24
+ from . import *
25
+ from .multimodal_projector import load_mm_projector
26
+ from ..constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
27
+
28
+
29
def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
    """Load a tokenizer, model and (for VideoLLaMA models) the vision processor.

    Args:
        model_path: Local directory or hub repo id of the checkpoint.
        model_base: Base model to load first when `model_path` only holds LoRA
            or projector weights; None to load `model_path` directly.
        model_name: Name used for dispatch via the substrings 'videollama',
            'lora', '-base', 'vicuna' and 'mixtral' (case-insensitive).
        load_8bit: Load the weights in 8-bit.
        load_4bit: Load the weights in 4-bit NF4 (ignored if `load_8bit`).
        device_map: Forwarded to `from_pretrained`; replaced by `{"": device}`
            when `device` is not "cuda".
        device: Device the vision tower is moved to.
        use_flash_attn: Request the 'flash_attention_2' attention implementation.
        **kwargs: Extra `from_pretrained` arguments; `token` is also forwarded
            to the config/tokenizer/hub helper calls.

    Returns:
        Tuple `(tokenizer, model, processor, context_len)`. `processor` is None
        for non-VideoLLaMA models; `context_len` is
        `model.config.max_sequence_length` when present, else 2048.
    """
    token = kwargs.get('token')

    kwargs = {"device_map": device_map, **kwargs}

    if device != "cuda":
        kwargs['device_map'] = {"": device}

    if load_8bit:
        kwargs['load_in_8bit'] = True
    elif load_4bit:
        # Pass only `quantization_config`: recent transformers releases reject
        # `load_in_4bit` given as a kwarg together with `quantization_config`.
        kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4'
        )
    else:
        kwargs['torch_dtype'] = torch.float16

    if use_flash_attn:
        kwargs['attn_implementation'] = 'flash_attention_2'

    lowered_name = model_name.lower()

    def select_videollama_class():
        # NOTE: mistral-based model is our default model.
        if 'vicuna' in lowered_name:
            return Videollama2LlamaForCausalLM
        if 'mixtral' in lowered_name:
            return Videollama2MixtralForCausalLM
        return Videollama2MistralForCausalLM

    if "videollama" in lowered_name:
        # Load LLaVA model
        if 'lora' in lowered_name and model_base is None:
            warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.')
        if 'lora' in lowered_name and model_base is not None:
            lora_cfg_pretrained = AutoConfig.from_pretrained(model_path, token=token)
            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False, token=token)
            print('Loading VideoLLaMA from base model...')
            model = Videollama2LlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
            token_num, token_dim = model.lm_head.out_features, model.lm_head.in_features
            if model.lm_head.weight.shape[0] != token_num:
                # Re-allocate head/embedding to the configured vocabulary size;
                # the values are filled by the state dicts loaded below.
                model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
                model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))

            print('Loading additional VideoLLaMA weights...')
            if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
                non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
            else:
                # this is probably from HF Hub
                from huggingface_hub import hf_hub_download

                def load_from_hf(repo_id, filename, subfolder=None):
                    cache_file = hf_hub_download(
                        repo_id=repo_id,
                        filename=filename,
                        subfolder=subfolder,
                        token=token)
                    return torch.load(cache_file, map_location='cpu')
                non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin')
            # Strip the 'base_model.' (and, if present, the extra 'model.')
            # prefixes so the keys match this model's state dict.
            non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
            if any(k.startswith('model.model.') for k in non_lora_trainables):
                non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
            model.load_state_dict(non_lora_trainables, strict=False)

            from peft import PeftModel
            print('Loading LoRA weights...')
            model = PeftModel.from_pretrained(model, model_path)
            print('Merging LoRA weights...')
            model = model.merge_and_unload()
            print('Model is loaded...')
        elif model_base is not None or '-base' in lowered_name:
            # loading vision-language projector
            print('Loading VideoLLaMA 2 from base model...')
            cfg_pretrained = PretrainedConfig.from_pretrained(model_path, token=token)
            # NOTE: AutoConfig will modify `_name_or_path` property to `model_path` if `model_path` is not None.
            # cfg_pretrained = AutoConfig.from_pretrained(model_path, token=token)
            model_base = model_base if model_base is not None else cfg_pretrained._name_or_path

            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False, token=token)

            model = select_videollama_class().from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)

            # NOTE: supports loading mm_projector.bin both offline and online
            mm_projector_weights = load_mm_projector(model_path, token=token)
            model.load_state_dict(mm_projector_weights, strict=False)
        else:
            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
            model = select_videollama_class().from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
    else:
        # Load language model
        if model_base is not None:
            # PEFT model
            from peft import PeftModel
            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False, token=token)
            model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
            print(f"Loading LoRA weights from {model_path}")
            model = PeftModel.from_pretrained(model, model_path)
            print("Merging weights")
            model = model.merge_and_unload()
            print('Convert to FP16...')
            model.to(torch.float16)
        else:
            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
            model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)

    processor = None

    if "videollama" in lowered_name:
        mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
        mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
        if mm_use_im_patch_token:
            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
        if mm_use_im_start_end:
            tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
        model.resize_token_embeddings(len(tokenizer))

        vision_tower = model.get_vision_tower()
        if not vision_tower.is_loaded:
            vision_tower.load_model()
        vision_tower.to(device=device, dtype=torch.float16)
        # NOTE: videollama2 adopts the same processor for processing image and video.
        processor = vision_tower.image_processor

    if hasattr(model.config, "max_sequence_length"):
        context_len = model.config.max_sequence_length
    else:
        context_len = 2048

    return tokenizer, model, processor, context_len
videollama2/model/language_model/videollama2_llama.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+
22
+ from transformers import AutoConfig, AutoModelForCausalLM, \
23
+ LlamaConfig, LlamaModel, LlamaForCausalLM
24
+ from transformers.modeling_outputs import CausalLMOutputWithPast
25
+ from transformers.generation.utils import GenerateOutput
26
+
27
+ from ..videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
28
+
29
+
30
class Videollama2Config(LlamaConfig):
    """`LlamaConfig` subclass that only overrides the `model_type` identifier."""

    # Identifier under which this config is registered with AutoConfig in this file.
    model_type = "videollama2_llama"
32
+
33
+
34
class Videollama2LlamaModel(Videollama2MetaModel, LlamaModel):
    """LLaMA backbone combined with the `Videollama2MetaModel` mixin."""

    config_class = Videollama2Config

    def __init__(self, config: LlamaConfig):
        # Cooperative init: walks the MRO starting at Videollama2MetaModel.
        super().__init__(config)
39
+
40
+
41
class Videollama2LlamaForCausalLM(LlamaForCausalLM, Videollama2MetaForCausalLM):
    """LLaMA causal LM whose inputs are first routed through
    `prepare_inputs_labels_for_multimodal` (provided by the mixin)."""

    config_class = Videollama2Config

    def __init__(self, config, **kwargs):
        # Start the MRO lookup *after* LlamaForCausalLM, so its own __init__
        # is skipped and the submodules are built here instead.
        super(LlamaForCausalLM, self).__init__(config)
        self.model = Videollama2LlamaModel(config)
        self.pretraining_tp = config.pretraining_tp
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped backbone model."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, building `inputs_embeds` from `input_ids`
        and `images` when embeddings were not supplied.

        NOTE: `position_ids` is accepted but not forwarded to the parent
        `forward`.
        """
        if inputs_embeds is None:
            prepared = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images
            )
            input_ids, attention_mask, past_key_values, inputs_embeds, labels = prepared

        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images_or_videos: Optional[torch.Tensor] = None,
        modal_list: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from `inputs`, optionally conditioned on visual input.

        Generation always runs on `inputs_embeds`; passing `inputs_embeds`
        directly is rejected with NotImplementedError.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images_or_videos is None:
            # Text-only prompt: embed the token ids directly.
            inputs_embeds = self.get_model().embed_tokens(inputs)
        else:
            # Only the attention mask and embeddings of the result are used.
            _, attention_mask, _, inputs_embeds, _ = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                X_modalities=[images_or_videos, modal_list]
            )

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Extend the parent's generation inputs with `images` when provided."""
        images = kwargs.pop("images", None)
        model_inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            model_inputs['images'] = images
        return model_inputs
144
+
145
+
146
+ AutoConfig.register("videollama2_llama", Videollama2Config)
147
+ AutoModelForCausalLM.register(Videollama2Config, Videollama2LlamaForCausalLM)
videollama2/model/language_model/videollama2_mistral.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ from torch.nn import CrossEntropyLoss
22
+
23
+ from transformers import AutoConfig, AutoModelForCausalLM, \
24
+ MistralConfig, MistralModel, MistralForCausalLM
25
+
26
+ from transformers.modeling_outputs import CausalLMOutputWithPast
27
+ from transformers.generation.utils import GenerateOutput
28
+
29
+ from ..videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
30
+
31
+
32
class Videollama2MistralConfig(MistralConfig):
    """`MistralConfig` subclass that only overrides the `model_type` identifier."""

    # Identifier under which this config is registered with AutoConfig in this file.
    model_type = "videollama2_mistral"
34
+
35
+
36
class Videollama2MistralModel(Videollama2MetaModel, MistralModel):
    """Mistral backbone combined with the `Videollama2MetaModel` mixin."""

    config_class = Videollama2MistralConfig

    def __init__(self, config: MistralConfig):
        # Cooperative init: walks the MRO starting at Videollama2MetaModel.
        super().__init__(config)
41
+
42
+
43
class Videollama2MistralForCausalLM(MistralForCausalLM, Videollama2MetaForCausalLM):
    """Mistral causal LM whose inputs are first routed through
    `prepare_inputs_labels_for_multimodal` (provided by the mixin)."""

    config_class = Videollama2MistralConfig

    def __init__(self, config, **kwargs):
        # Start the MRO lookup *after* MistralForCausalLM, so its own __init__
        # is skipped and the submodules are built here instead.
        super(MistralForCausalLM, self).__init__(config)
        self.model = Videollama2MistralModel(config)
        # self.pretraining_tp = config.pretraining_tp
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped backbone model."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, building `inputs_embeds` from `input_ids`
        and `images` when embeddings were not supplied.

        NOTE: `position_ids` is accepted but not forwarded to the parent
        `forward`.
        """
        if inputs_embeds is None:
            prepared = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images
            )
            input_ids, attention_mask, past_key_values, inputs_embeds, labels = prepared

        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images_or_videos: Optional[torch.Tensor] = None,
        modal_list: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from `inputs`, optionally conditioned on visual input.

        Generation always runs on `inputs_embeds`; passing `inputs_embeds`
        directly is rejected with NotImplementedError.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images_or_videos is None:
            # Text-only prompt: embed the token ids directly.
            inputs_embeds = self.get_model().embed_tokens(inputs)
        else:
            # Only the attention mask and embeddings of the result are used.
            _, attention_mask, _, inputs_embeds, _ = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                X_modalities=[images_or_videos, modal_list]
            )

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Extend the parent's generation inputs with `images` when provided."""
        images = kwargs.pop("images", None)
        model_inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            model_inputs['images'] = images
        return model_inputs
146
+
147
+
148
+ AutoConfig.register("videollama2_mistral", Videollama2MistralConfig)
149
+ AutoModelForCausalLM.register(Videollama2MistralConfig, Videollama2MistralForCausalLM)
videollama2/model/language_model/videollama2_mixtral.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import List, Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ from torch.nn import CrossEntropyLoss
21
+
22
+ from transformers import AutoConfig, AutoModelForCausalLM, \
23
+ MixtralConfig, MixtralModel, MixtralForCausalLM
24
+
25
+ from transformers.modeling_outputs import CausalLMOutputWithPast
26
+ from transformers.generation.utils import GenerateOutput
27
+
28
+ from ..videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
29
+
30
+
31
class Videollama2MixtralConfig(MixtralConfig):
    """`MixtralConfig` subclass that only overrides the `model_type` identifier."""

    # Identifier under which this config is registered with AutoConfig in this file.
    model_type = "videollama2_mixtral"
33
+
34
+
35
class Videollama2MixtralModel(Videollama2MetaModel, MixtralModel):
    """Mixtral backbone combined with the `Videollama2MetaModel` mixin."""

    config_class = Videollama2MixtralConfig

    def __init__(self, config: MixtralConfig):
        # Cooperative init: walks the MRO starting at Videollama2MetaModel.
        super().__init__(config)
40
+
41
+
42
class Videollama2MixtralForCausalLM(MixtralForCausalLM, Videollama2MetaForCausalLM):
    """Mixtral causal LM whose inputs are first routed through
    `prepare_inputs_labels_for_multimodal` (provided by the mixin)."""

    config_class = Videollama2MixtralConfig

    def __init__(self, config, **kwargs):
        # Start the MRO lookup *after* MixtralForCausalLM, so its own __init__
        # is skipped and the submodules are built here instead.
        super(MixtralForCausalLM, self).__init__(config)
        self.model = Videollama2MixtralModel(config)
        # self.pretraining_tp = config.pretraining_tp
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped backbone model."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, building `inputs_embeds` from `input_ids`
        and `images` when embeddings were not supplied.

        NOTE: `position_ids` is accepted but not forwarded to the parent
        `forward`.
        """
        if inputs_embeds is None:
            prepared = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images
            )
            input_ids, attention_mask, past_key_values, inputs_embeds, labels = prepared

        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images_or_videos: Optional[torch.Tensor] = None,
        timestamps: Optional[torch.Tensor] = None,
        modal_list: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from `inputs`, optionally conditioned on visual input.

        `timestamps`, when given, is appended as a third entry of the
        `X_modalities` list. Generation always runs on `inputs_embeds`;
        passing `inputs_embeds` directly is rejected with NotImplementedError.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images_or_videos is None:
            # Text-only prompt: embed the token ids directly.
            inputs_embeds = self.get_model().embed_tokens(inputs)
        else:
            X_modalities = [images_or_videos, modal_list]
            if timestamps is not None:
                X_modalities.append(timestamps)
            # Only the attention mask and embeddings of the result are used.
            _, attention_mask, _, inputs_embeds, _ = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                X_modalities=X_modalities
            )

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Extend the parent's generation inputs with `images` when provided."""
        images = kwargs.pop("images", None)
        model_inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            model_inputs['images'] = images
        return model_inputs
147
+
148
+ AutoConfig.register("videollama2_mixtral", Videollama2MixtralConfig)
149
+ AutoModelForCausalLM.register(Videollama2MixtralConfig, Videollama2MixtralForCausalLM)
videollama2/model/multimodal_encoder/builder.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from .clip_encoder import CLIPVisionTower
4
+
5
+
6
def build_vision_tower(vision_tower_cfg, **kwargs):
    """Instantiate the vision encoder named in ``vision_tower_cfg``.

    The tower name is read from ``mm_vision_tower``, falling back to
    ``vision_tower``. Only CLIP-style names are accepted: names starting with
    ``openai`` or ``laion``, or containing ``clip``.

    Args:
        vision_tower_cfg: config/args object carrying the tower name; it is also
            forwarded to the tower as ``args``.
        **kwargs: forwarded to ``CLIPVisionTower`` (e.g. ``delay_load``).

    Returns:
        A ``CLIPVisionTower`` instance.

    Raises:
        ValueError: if the tower name is missing or not recognised.
    """
    vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))

    # A missing name (None) must end in the ValueError below rather than
    # crashing inside a string/path helper.
    if isinstance(vision_tower, str) and (
        vision_tower.startswith(("openai", "laion")) or 'clip' in vision_tower
    ):
        return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)

    raise ValueError(f'Unknown vision tower: {vision_tower}')
videollama2/model/multimodal_encoder/clip_encoder.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
5
+
6
+
7
class CLIPVisionTower(nn.Module):
    """Frozen CLIP vision encoder that exposes one hidden layer as image features.

    With ``delay_load=True`` only the CLIP config is fetched at construction
    time; call ``load_model`` to create the image processor and the weights.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        """
        Args:
            vision_tower: name or path passed to the CLIP ``from_pretrained`` loaders.
            args: object providing ``mm_vision_select_layer`` and, optionally,
                ``mm_vision_select_feature`` (defaults to ``'patch'``).
            delay_load: when true, skip loading weights and keep only the config.
        """
        super().__init__()

        self.is_loaded = False

        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        if not delay_load:
            self.load_model()
        else:
            # Keep the config around so size-related properties work before loading.
            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self):
        """Load the image processor and the CLIP weights, with gradients disabled."""
        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)

        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
        self.vision_tower.requires_grad_(False)

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Pick the configured hidden layer and token subset from encoder outputs.

        ``'patch'`` drops the first token along dim 1; ``'cls_patch'`` keeps
        every token.

        Raises:
            ValueError: if ``select_feature`` is neither ``'patch'`` nor ``'cls_patch'``.
        """
        image_features = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            return image_features[:, 1:]
        if self.select_feature == 'cls_patch':
            return image_features
        raise ValueError(f'Unexpected select feature: {self.select_feature}')

    def _encode(self, pixel_values):
        """Run the encoder on a batched tensor and return features in the input dtype."""
        outputs = self.vision_tower(
            pixel_values.to(device=self.device, dtype=self.dtype),
            output_hidden_states=True,
        )
        return self.feature_select(outputs).to(pixel_values.dtype)

    @torch.no_grad()
    def forward(self, images):
        """Encode images.

        Args:
            images: either a batched tensor, or a list/tuple of per-image tensors
                (each is given a leading batch dimension of 1).

        Returns:
            A feature tensor for tensor input, or a list of feature tensors
            (one per element) for list/tuple input.
        """
        # isinstance (rather than an exact type check) so tuples and list
        # subclasses take the per-image path instead of failing on ``.to``.
        if isinstance(images, (list, tuple)):
            return [self._encode(image.unsqueeze(0)) for image in images]
        return self._encode(images)

    @property
    def dummy_feature(self):
        """A single all-zero feature row of width ``hidden_size``."""
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        """Dtype of the loaded encoder (requires ``load_model`` to have run)."""
        return self.vision_tower.dtype

    @property
    def device(self):
        """Device of the loaded encoder (requires ``load_model`` to have run)."""
        return self.vision_tower.device

    @property
    def config(self):
        """Encoder config; falls back to the config-only copy before loading."""
        if self.is_loaded:
            return self.vision_tower.config
        return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size) ** 2

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size
videollama2/model/multimodal_projector/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .builder import load_mm_projector
videollama2/model/multimodal_projector/builder.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Alibaba DAMO Academy
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import re
17
+
18
+ import einops
19
+ import torch
20
+ import torch.nn as nn
21
+ import torch.nn.functional as F
22
+ from timm.models.regnet import RegStage
23
+ from timm.models.layers import LayerNorm, LayerNorm2d
24
+ from transformers import TRANSFORMERS_CACHE
25
+
26
+
27
def parse_snapshot_folder(repo_id, cache_dir=None, repo_type="model"):
    """Compute the local snapshot folder of a cached hub repository.

    The path is ``<cache_dir>/<repo_type>s--<repo_id with '/' -> '--'>/snapshots/<revision>``.
    ``revision`` starts as ``"main"`` and is replaced by the content of
    ``refs/main`` inside the repo cache when that file exists.

    Args:
        repo_id: repository id such as ``"org/name"``.
        cache_dir: cache root; defaults to ``TRANSFORMERS_CACHE``.
        repo_type: repository type used as folder prefix (``"model"`` -> ``models--``).

    Returns:
        The snapshot folder path. The folder is not checked for existence.
    """
    revision = "main"
    # 1. parse the downloaded cache folder
    if cache_dir is None:
        cache_dir = TRANSFORMERS_CACHE
    object_id = repo_id.replace("/", "--")
    repo_cache = os.path.join(cache_dir, f"{repo_type}s--{object_id}")
    # 2. resolve refs (for instance to convert main to the associated commit sha)
    revision_file = os.path.join(repo_cache, "refs", revision)
    if os.path.isfile(revision_file):
        with open(revision_file, encoding="utf-8") as f:
            # Strip whitespace: a trailing newline would otherwise end up in the
            # folder name. Keep the default when the file is blank.
            revision = f.read().strip() or revision
    # 3. acquire the snapshot folder
    return os.path.join(repo_cache, "snapshots", revision)
47
+
48
+
49
def load_mm_projector(model_path, cache_dir=None, token=None):
    """Load ``mm_projector.bin`` from a local folder or a hub repository.

    Args:
        model_path: local directory containing ``mm_projector.bin``, or a hub
            repository id.
        cache_dir: hub cache root used when ``model_path`` is a repository id.
        token: hub access token forwarded to ``snapshot_download``.

    Returns:
        Dict mapping parameter names to tensors, loaded on CPU and cast to
        ``torch.float16``.
    """
    weights_name = 'mm_projector.bin'
    if os.path.exists(os.path.join(model_path, weights_name)):
        folder = model_path
    else:
        folder = parse_snapshot_folder(model_path, cache_dir=cache_dir, repo_type="model")
        if not os.path.exists(os.path.join(folder, weights_name)):
            # downloading from remote repo
            from huggingface_hub import snapshot_download
            # Use the folder reported by the download: the path guessed above
            # was computed before the cache (and its refs file) existed, so it
            # may not point at the snapshot that was just fetched.
            folder = snapshot_download(repo_id=model_path, cache_dir=cache_dir, token=token)

    # NOTE(security): torch.load unpickles the file; only load projector
    # weights from sources you trust.
    mm_projector_weights = torch.load(os.path.join(folder, weights_name), map_location='cpu')
    return {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
64
+
65
+
66
class IdentityMap(nn.Module):
    """Projector that hands its input back untouched."""

    def forward(self, x, *args, **kwargs):
        # Extra positional/keyword arguments are accepted and ignored.
        return x

    @property
    def config(self):
        """Config dict identifying this projector type."""
        return dict(mm_projector_type='identity')
77
+
78
+
79
class SimpleResBlock(nn.Module):
    """LayerNorm followed by a two-layer GELU MLP with a residual connection.

    The residual is taken from the normalised input, not the raw input.
    """

    def __init__(self, channels):
        super().__init__()
        self.pre_norm = nn.LayerNorm(channels)

        layers = (
            nn.Linear(channels, channels),
            nn.GELU(),
            nn.Linear(channels, channels),
        )
        self.proj = nn.Sequential(*layers)

    def forward(self, x):
        normed = self.pre_norm(x)
        projected = self.proj(normed)
        return normed + projected
93
+
94
+
95
def build_vision_projector(config, delay_load=False, **kwargs):
    """Create the vision-to-language projector selected by ``config.mm_projector_type``.

    Supported types: ``mlp<N>x_gelu``, ``linear`` (the default when the
    attribute is absent), ``stc_connector``, ``stp_connector``,
    ``stc_connector_v35``, ``spatial_conv``, ``spatial_pool`` and ``identity``.
    ``delay_load`` and ``**kwargs`` are accepted but not used here.

    Raises:
        ValueError: for any other projector type.
    """
    kind = getattr(config, 'mm_projector_type', 'linear')

    mlp_spec = re.match(r'^mlp(\d+)x_gelu$', kind)
    if mlp_spec is not None:
        n_linear = int(mlp_spec.group(1))
        layers = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
        for _ in range(n_linear - 1):
            layers.extend((nn.GELU(), nn.Linear(config.hidden_size, config.hidden_size)))
        return nn.Sequential(*layers)

    # NOTE: for both linear and mlp2x_gelu projector types, mean pooling is adopted to aggregate video features
    if kind == "linear":
        return nn.Linear(config.mm_hidden_size, config.hidden_size)
    if kind == "stc_connector":
        return STCConnector(config)
    if kind == "stp_connector":
        return STPConnector(config)
    if kind == "stc_connector_v35":
        return STCConnectorV35(config)
    if kind == "spatial_conv":
        return SpatialConv(config)
    if kind == "spatial_pool":
        return SpatialPool(config)
    if kind == 'identity':
        return IdentityMap()

    raise ValueError(f'Unknown projector type: {kind}')
123
+
124
+
125
def build_mlp(depth, hidden_size, output_hidden_size):
    """Build an MLP of ``depth`` linear layers with GELU between consecutive layers.

    The first layer maps ``hidden_size`` to ``output_hidden_size``; every
    further layer keeps ``output_hidden_size``. A ``depth`` of 1 or less
    yields a single linear layer.
    """
    layers = [nn.Linear(hidden_size, output_hidden_size)]
    for _ in range(depth - 1):
        layers.extend((nn.GELU(), nn.Linear(output_hidden_size, output_hidden_size)))
    return nn.Sequential(*layers)
131
+
132
+
133
class STCConnector(nn.Module):
    """Connector that mixes frame tokens spatially, downsamples them with a
    strided 3D convolution over (time, height, width), and projects the result
    with an MLP."""

    def __init__(self, config, downsample=(2, 2, 2), depth=4, mlp_depth=2):
        """Temporal Convolutional Vision-Language Connector.

        Args:
            config: config object providing ``mm_hidden_size`` and ``hidden_size``.
            downsample: (temporal, height, width) downsample rate.
            depth: depth of the spatial interaction blocks; 0 disables them.
            mlp_depth: depth of the vision-language projector layers.
        """
        super().__init__()
        encoder_hidden_size = config.mm_hidden_size
        hidden_size = config.hidden_size
        output_hidden_size = config.hidden_size

        self.encoder_hidden_size = encoder_hidden_size
        self.hidden_size = hidden_size
        self.output_hidden_size = output_hidden_size
        # TODO: expose these as config arguments
        self.depth = depth
        self.mlp_depth = mlp_depth
        self.downsample = downsample

        has_stages = depth != 0

        # Stage 1: per-frame spatial interaction, encoder width -> hidden width.
        self.s1 = RegStage(
            depth=depth,
            in_chs=encoder_hidden_size,
            out_chs=hidden_size,
            stride=1,
            dilation=1,
            act_layer=nn.SiLU,
            norm_layer=LayerNorm2d,
        ) if has_stages else nn.Identity()

        # Downsampler: kernel and stride both equal the downsample rate.
        conv = nn.Conv3d(
            in_channels=hidden_size,
            out_channels=hidden_size,
            kernel_size=downsample,
            stride=downsample,
            padding=1,
            bias=True
        )
        self.sampler = nn.Sequential(conv, nn.SiLU())

        # Stage 2: per-frame spatial interaction at hidden width.
        self.s2 = RegStage(
            depth=depth,
            in_chs=hidden_size,
            out_chs=hidden_size,
            stride=1,
            dilation=1,
            act_layer=nn.SiLU,
            norm_layer=LayerNorm2d,
        ) if has_stages else nn.Identity()

        self.readout = build_mlp(mlp_depth, hidden_size, output_hidden_size)

    def forward(self, x):
        """Aggregate tokens on the temporal and spatial dimensions.

        Args:
            x: input tokens [b, t, h, w, d] / [b, t, l, d]; in the 4-D form
                ``l`` is treated as a square grid of side ``int(l ** 0.5)``.
        Returns:
            aggregated tokens [b, l, d]
        """
        num_frames = x.size(1)
        if x.ndim == 4:
            side = int(x.size(2) ** 0.5)
            x = einops.rearrange(x, "b t (h w) d -> b d t h w", h=side, w=side)
        elif x.ndim == 5:
            x = einops.rearrange(x, "b t h w d -> b d t h w")

        # 1. the first stage of the adapter (frames folded into the batch)
        frames = einops.rearrange(x, "b d t h w -> (b t) d h w")
        frames = self.s1(frames)
        volume = einops.rearrange(frames, "(b t) d h w -> b d t h w", t=num_frames)
        # 2. downsampler
        volume = self.sampler(volume)
        # 3. the second stage of the adapter
        frames = einops.rearrange(volume, "b d t h w -> (b t) d h w")
        frames = self.s2(frames)
        tokens = einops.rearrange(frames, "(b t) d h w -> b (t h w) d", t=volume.size(2))
        return self.readout(tokens)
216
+
217
+
218
class STPConnector(STCConnector):
    """``STCConnector`` variant whose downsampler is 3D average pooling."""

    def __init__(self, config, downsample=(2, 2, 2), depth=4, mlp_depth=2):
        super().__init__(config, downsample, depth, mlp_depth)
        # Replace the convolutional sampler built by the parent.
        pool = nn.AvgPool3d(downsample)
        self.sampler = nn.Sequential(pool, nn.SiLU())
223
+
224
+
225
class STCConnectorV35(STCConnector):
    """``STCConnector`` variant whose 3D convolution sampler uses no padding."""

    def __init__(self, config, downsample=(2, 2, 2), depth=4, mlp_depth=2):
        super().__init__(config, downsample, depth, mlp_depth)
        # Same sampler as the parent except for ``padding=0``.
        unpadded_conv = nn.Conv3d(
            in_channels=self.hidden_size,
            out_channels=self.hidden_size,
            kernel_size=downsample,
            stride=downsample,
            padding=0,
            bias=True,
        )
        self.sampler = nn.Sequential(unpadded_conv, nn.SiLU())
239
+
240
+
241
class SpatialConv(STCConnector):
    """``STCConnector`` preset: downsample rate (1, 2, 2) and no interaction stages."""

    def __init__(self, config, downsample=(1, 2, 2), depth=0, mlp_depth=2):
        super().__init__(config, downsample, depth, mlp_depth)
245
+
246
+
247
class SpatialPool(STPConnector):
    """``STPConnector`` preset: downsample rate (1, 2, 2) and no interaction stages."""

    def __init__(self, config, downsample=(1, 2, 2), depth=0, mlp_depth=2):
        super().__init__(config, downsample, depth, mlp_depth)
videollama2/model/videollama2_arch.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ from abc import ABC, abstractmethod
18
+
19
+ import einops
20
+ import torch
21
+ import torch.nn as nn
22
+
23
+ from .multimodal_encoder.builder import build_vision_tower
24
+ from .multimodal_projector.builder import build_vision_projector
25
+ from ..mm_utils import get_anyres_image_grid_shape
26
+ from ..constants import NUM_FRAMES, IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN,DEFAULT_MMODAL_PATCH_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX
27
+
28
+
29
class Videollama2MetaModel:
    """Mixin that adds a vision tower and a multimodal projector to a backbone model.

    It is meant to be combined with a model class whose ``__init__`` accepts
    ``config`` (the next class in the MRO is initialised with it).
    """

    def __init__(self, config):
        super(Videollama2MetaModel, self).__init__(config)

        if hasattr(config, "mm_vision_tower"):
            # The tower is created without weights here; ``load_model`` is
            # called later from ``initialize_vision_modules``.
            self.vision_tower = build_vision_tower(config, delay_load=True)
            self.mm_projector = build_vision_projector(config)

    def get_vision_tower(self):
        """Return the vision tower, unwrapping the single-element list form; ``None`` if unset."""
        vision_tower = getattr(self, 'vision_tower', None)
        if isinstance(vision_tower, list):
            vision_tower = vision_tower[0]
        return vision_tower

    def initialize_vision_modules(self, model_args, fsdp=None):
        """Create or load the vision tower and projector and record them in ``self.config``.

        Args:
            model_args: object providing ``vision_tower``,
                ``mm_vision_select_layer``, ``mm_vision_select_feature``,
                ``pretrain_mm_mlp_adapter`` and optionally ``mm_projector_type``.
            fsdp: when non-empty, the tower is stored wrapped in a list.
        """
        vision_tower = model_args.vision_tower
        mm_vision_select_layer = model_args.mm_vision_select_layer
        mm_vision_select_feature = model_args.mm_vision_select_feature
        pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter

        self.config.mm_vision_tower = vision_tower

        if self.get_vision_tower() is None:
            vision_tower = build_vision_tower(model_args)

            if fsdp is not None and len(fsdp) > 0:
                self.vision_tower = [vision_tower]
            else:
                self.vision_tower = vision_tower
        else:
            if fsdp is not None and len(fsdp) > 0:
                vision_tower = self.vision_tower[0]
            else:
                vision_tower = self.vision_tower
            vision_tower.load_model()

        self.config.use_mm_proj = True
        self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
        self.config.mm_hidden_size = vision_tower.hidden_size
        self.config.mm_vision_select_layer = mm_vision_select_layer
        self.config.mm_vision_select_feature = mm_vision_select_feature

        if getattr(self, 'mm_projector', None) is None:
            self.mm_projector = build_vision_projector(self.config)
        else:
            # In case it is frozen by LoRA
            for p in self.mm_projector.parameters():
                p.requires_grad = True

        if pretrain_mm_mlp_adapter is not None:
            if os.path.exists(pretrain_mm_mlp_adapter):
                mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
            else:
                # Support loading projector weights from remote HuggingFace model hub.
                # The loader is imported here because this module does not
                # import it at file level.
                from .multimodal_projector.builder import load_mm_projector

                pretrain_mm_mlp_adapter = pretrain_mm_mlp_adapter.replace('mm_projector.bin', '')
                pretrain_mm_mlp_adapter = pretrain_mm_mlp_adapter.strip('/').strip('\\').strip()
                mm_projector_weights = load_mm_projector(pretrain_mm_mlp_adapter)

            def get_w(weights, keyword):
                # Keep entries whose key contains ``keyword`` and drop the
                # prefix up to and including ``keyword.``.
                return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}

            # set strict=False to avoid missing key error regarding bert.embeddings.position_ids
            self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'), strict=False)
96
+
97
+
98
class Videollama2MetaForCausalLM(ABC):
    """Mixin for causal LMs that splices image/video features into the token
    embedding sequence and manages the related special tokens.

    Subclasses provide ``get_model`` and are expected to expose ``config``,
    ``device`` and the usual embedding accessors used below.
    """

    @abstractmethod
    def get_model(self):
        """Return the backbone model holding the vision tower, projector and ``embed_tokens``."""
        pass

    def num_frames(self):
        """Number of frames per video: ``config.num_frames`` if set, else ``NUM_FRAMES``."""
        if hasattr(self.config, 'num_frames'):
            return self.config.num_frames
        else:
            return NUM_FRAMES

    def get_vision_tower(self):
        return self.get_model().get_vision_tower()

    def encode_images_or_videos(self, images_or_videos, modalities):
        """Encode a batch of images/videos into projected features.

        Entries whose modality is ``'image'`` are expanded along a new leading
        frame dimension to ``num_frames`` frames so that all entries can be
        stacked into one 5-D tensor (``b t c h w``).
        """
        num_frames = self.num_frames()

        videos = [x.unsqueeze(0).expand(num_frames, -1, -1, -1) if modal == 'image' else x for x, modal in zip(images_or_videos, modalities)]
        videos = torch.stack(videos, dim=0)

        assert len(videos.size()) == 5
        batch_size = videos.size(0)

        # Frames are folded into the batch dimension for the vision tower.
        frames = einops.rearrange(videos, 'b t c h w -> (b t) c h w')
        frames_features = self.get_model().get_vision_tower()(frames)
        frames_features = einops.rearrange(frames_features, '(b t) n h -> b t n h', b = batch_size)

        return self.temporal_aggregator(frames_features)

    def temporal_aggregator(self, frames_features):
        """Temporal aggregation of frame features.
        Args:
            frames_features (torch.Tensor): Frame features with shape (b, t, n, h).
        Returns:
            torch.Tensor: Video features with shape (b, n, h).
        Raises:
            Exception: if the configured projector type is not handled here.
        """
        # TODO: improve the merging method.
        # *********** mean pooling *************
        if self.config.mm_projector_type == "mlp2x_gelu" or self.config.mm_projector_type == "linear":
            video_features = self.get_model().mm_projector(frames_features.mean(1))
        # *********** spatial convolution *************
        elif self.config.mm_projector_type == "spatial_conv":
            video_features = self.get_model().mm_projector(frames_features)
        # *********** spatial pooling *************
        elif self.config.mm_projector_type == "spatial_pool":
            video_features = self.get_model().mm_projector(frames_features)
        # *********** time  ************
        elif "tc_connector" in self.config.mm_projector_type or "tp_connector" in self.config.mm_projector_type:
            video_features = self.get_model().mm_projector(frames_features)
        else:
            raise Exception(f"Unsupported projector type {self.config.mm_projector_type}!!!")

        return video_features

    @staticmethod
    def _modal_token_positions(token_ids, keys):
        """Return the indices in ``token_ids`` that hold a placeholder token of any modality in ``keys``."""
        is_modal = torch.any(torch.stack([token_ids == MMODAL_TOKEN_INDEX[key.upper()] for key in keys]), dim=0)
        return torch.where(is_modal)[0]

    def prepare_inputs_labels_for_multimodal(
        self, input_ids, attention_mask, past_key_values, labels, X_modalities
    ):
        """Replace modality placeholder tokens with encoded features.

        Args:
            input_ids: batch of token ids.
            attention_mask: mask matching ``input_ids`` or ``None``.
            past_key_values: passed through unchanged.
            labels: labels matching ``input_ids`` or ``None``.
            X_modalities: sequence whose first two entries are the visual
                inputs and their modality keys; further entries are ignored.

        Returns:
            ``(input_ids, attention_mask, past_key_values, inputs_embeds, labels)``.
            In the text-only case the inputs are returned unchanged with
            ``inputs_embeds=None``; otherwise ``input_ids`` is ``None`` and
            embeddings, mask and labels are expanded (and right-padded when the
            samples end up with different lengths).
        """
        vision_tower = self.get_vision_tower()
        # NOTE: text-only situation
        if vision_tower is None or X_modalities is None or input_ids.shape[1] == 1:
            return input_ids, attention_mask, past_key_values, None, labels

        # Callers may append extra entries (e.g. timestamps); only the first
        # two are consumed, so index instead of unpacking exactly two.
        Xs, keys = X_modalities[0], X_modalities[1]
        X_features = self.encode_images_or_videos(Xs, keys)

        new_input_embeds = []
        new_labels = [] if labels is not None else None
        cur_X_idx = 0
        # replace image/video/audio tokens with pre-computed embeddings
        for batch_idx, cur_input_ids in enumerate(input_ids):
            X_token_indices = self._modal_token_positions(cur_input_ids, keys)
            if X_token_indices.numel() == 0:
                # No placeholder in this sample: concatenate an empty slice of
                # the features between the two halves of the text embeddings,
                # which leaves the sequence length unchanged.
                half_len = cur_input_ids.shape[0] // 2
                cur_X_features = X_features[cur_X_idx]
                cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids[:half_len])
                cur_input_embeds_2 = self.get_model().embed_tokens(cur_input_ids[half_len:])
                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_X_features[0:0], cur_input_embeds_2], dim=0)
                new_input_embeds.append(cur_input_embeds)
                if labels is not None:
                    new_labels.append(labels[batch_idx])
                cur_X_idx += 1
                continue

            cur_new_input_embeds = []
            if labels is not None:
                cur_labels = labels[batch_idx]
                cur_new_labels = []
                assert cur_labels.shape == cur_input_ids.shape

            while X_token_indices.numel() > 0:
                cur_X_features = X_features[cur_X_idx]
                X_token_start = X_token_indices[0]

                # Text before the placeholder, then the features in its place.
                cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[:X_token_start]))
                cur_new_input_embeds.append(cur_X_features)
                if labels is not None:
                    cur_new_labels.append(cur_labels[:X_token_start])
                    # Feature positions never contribute to the loss.
                    cur_new_labels.append(torch.full((cur_X_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
                    cur_labels = cur_labels[X_token_start+1:]

                cur_X_idx += 1
                cur_input_ids = cur_input_ids[X_token_start+1:]
                X_token_indices = self._modal_token_positions(cur_input_ids, keys)

            if cur_input_ids.numel() > 0:
                cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids))
                if labels is not None:
                    cur_new_labels.append(cur_labels)
            cur_new_input_embeds = [x.to(device=self.device) for x in cur_new_input_embeds]
            # NOTE: one cur_new_input_embeds per sample
            cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0)
            new_input_embeds.append(cur_new_input_embeds)
            if labels is not None:
                cur_new_labels = torch.cat(cur_new_labels, dim=0)
                new_labels.append(cur_new_labels)

        # padding
        if any(x.shape != new_input_embeds[0].shape for x in new_input_embeds):
            max_len = max(x.shape[0] for x in new_input_embeds)
            # Unpadded lengths drive the attention mask below. They are taken
            # from the embeddings (not the labels) so this also works when
            # ``labels`` is None, e.g. at inference time.
            unpadded_lens = [x.shape[0] for x in new_input_embeds]

            new_input_embeds_align = []
            for cur_new_embed in new_input_embeds:
                cur_new_embed = torch.cat((cur_new_embed, torch.zeros((max_len - cur_new_embed.shape[0], cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0)
                new_input_embeds_align.append(cur_new_embed)
            new_input_embeds = torch.stack(new_input_embeds_align, dim=0)

            if labels is not None:
                new_labels_align = []
                for cur_new_label in new_labels:
                    cur_new_label = torch.cat((cur_new_label, torch.full((max_len - cur_new_label.shape[0],), IGNORE_INDEX, dtype=cur_new_label.dtype, device=cur_new_label.device)), dim=0)
                    new_labels_align.append(cur_new_label)
                new_labels = torch.stack(new_labels_align, dim=0)

            if attention_mask is not None:
                new_attention_mask = []
                for cur_attention_mask, cur_len in zip(attention_mask, unpadded_lens):
                    # Left: positions gained by expanding placeholders (attended).
                    new_attn_mask_pad_left = torch.full((cur_len - input_ids.shape[1],), True, dtype=attention_mask.dtype, device=attention_mask.device)
                    # Right: padding up to the longest sample (not attended).
                    new_attn_mask_pad_right = torch.full((max_len - cur_len,), False, dtype=attention_mask.dtype, device=attention_mask.device)
                    cur_new_attention_mask = torch.cat((new_attn_mask_pad_left, cur_attention_mask, new_attn_mask_pad_right), dim=0)
                    new_attention_mask.append(cur_new_attention_mask)
                attention_mask = torch.stack(new_attention_mask, dim=0)
                assert attention_mask.shape == new_input_embeds.shape[:2]
        else:
            new_input_embeds = torch.stack(new_input_embeds, dim=0)
            if labels is not None:
                new_labels = torch.stack(new_labels, dim=0)

            if attention_mask is not None:
                new_attn_mask_pad_left = torch.full((attention_mask.shape[0], new_input_embeds.shape[1] - input_ids.shape[1]), True, dtype=attention_mask.dtype, device=attention_mask.device)
                attention_mask = torch.cat((new_attn_mask_pad_left, attention_mask), dim=1)
                assert attention_mask.shape == new_input_embeds.shape[:2]

        return None, attention_mask, past_key_values, new_input_embeds, new_labels

    def initialize_vision_tokenizer(self, model_args, tokenizer):
        """Add image special tokens to ``tokenizer`` and resize/initialise embeddings.

        New start/end token embeddings are initialised to the mean of the
        existing embeddings, and optionally overwritten from the pretrained
        adapter checkpoint.
        """
        if model_args.mm_use_im_patch_token:
            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
            self.resize_token_embeddings(len(tokenizer))

        if model_args.mm_use_im_start_end:
            num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
            self.resize_token_embeddings(len(tokenizer))

            if num_new_tokens > 0:
                input_embeddings = self.get_input_embeddings().weight.data
                output_embeddings = self.get_output_embeddings().weight.data

                input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
                output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)

                input_embeddings[-num_new_tokens:] = input_embeddings_avg
                output_embeddings[-num_new_tokens:] = output_embeddings_avg

            if model_args.tune_mm_mlp_adapter:
                for p in self.get_input_embeddings().parameters():
                    p.requires_grad = True
                for p in self.get_output_embeddings().parameters():
                    p.requires_grad = False

            if model_args.pretrain_mm_mlp_adapter:
                mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
                embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
                assert num_new_tokens == 2
                if input_embeddings.shape == embed_tokens_weight.shape:
                    input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
                elif embed_tokens_weight.shape[0] == num_new_tokens:
                    input_embeddings[-num_new_tokens:] = embed_tokens_weight
                else:
                    raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
        elif model_args.mm_use_im_patch_token:
            if model_args.tune_mm_mlp_adapter:
                for p in self.get_input_embeddings().parameters():
                    p.requires_grad = False
                for p in self.get_output_embeddings().parameters():
                    p.requires_grad = False

    def initialize_MM_tokenizer(self, model_args, tokenizer):
        """Multi-modality counterpart of ``initialize_vision_tokenizer``.

        Adds patch and start/end tokens for IMAGE, VIDEO and AUDIO, then
        resizes/initialises embeddings the same way.
        """
        if model_args.mm_use_im_patch_token:
            for modal in ['IMAGE', 'VIDEO', 'AUDIO']:
                tokenizer.add_tokens([DEFAULT_MMODAL_PATCH_TOKEN[modal.upper()]], special_tokens=True)
            self.resize_token_embeddings(len(tokenizer))

        if model_args.mm_use_im_start_end:
            num_new_tokens = 0
            for modal in ['IMAGE', 'VIDEO', 'AUDIO']:
                num_new_tokens += tokenizer.add_tokens([DEFAULT_MMODAL_START_TOKEN[modal.upper()], DEFAULT_MMODAL_END_TOKEN[modal.upper()]], special_tokens=True)
            self.resize_token_embeddings(len(tokenizer))

            if num_new_tokens > 0:
                input_embeddings = self.get_input_embeddings().weight.data
                output_embeddings = self.get_output_embeddings().weight.data

                input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
                    dim=0, keepdim=True)
                output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
                    dim=0, keepdim=True)

                input_embeddings[-num_new_tokens:] = input_embeddings_avg
                output_embeddings[-num_new_tokens:] = output_embeddings_avg

            if model_args.tune_mm_mlp_adapter:
                for p in self.get_input_embeddings().parameters():
                    p.requires_grad = True
                for p in self.get_output_embeddings().parameters():
                    p.requires_grad = False

            if model_args.pretrain_mm_mlp_adapter:
                mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
                embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
                assert num_new_tokens == 6  # start/end tokens for image/video/audio
                if input_embeddings.shape == embed_tokens_weight.shape:
                    input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
                elif embed_tokens_weight.shape[0] == num_new_tokens:
                    input_embeddings[-num_new_tokens:] = embed_tokens_weight
                else:
                    raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
        elif model_args.mm_use_im_patch_token:
            if model_args.tune_mm_mlp_adapter:
                for p in self.get_input_embeddings().parameters():
                    p.requires_grad = False
                for p in self.get_output_embeddings().parameters():
                    p.requires_grad = False
videollama2/serve/cli.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import torch
3
+
4
+ from videollama2.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, NUM_FRAMES
5
+ from videollama2.conversation import conv_templates, SeparatorStyle
6
+ from videollama2.model.builder import load_pretrained_model
7
+ from videollama2.utils import disable_torch_init
8
+ from videollama2.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, tokenizer_MMODAL_token
9
+
10
+ from PIL import Image
11
+ from decord import VideoReader, cpu
12
+
13
+ import requests
14
+ from io import BytesIO
15
+ from transformers import TextStreamer
16
+
17
+
18
def load_image(image_file):
    """Load an image from a local path or an HTTP(S) URL as an RGB PIL image.

    Args:
        image_file: Filesystem path, or a URL beginning with ``http://`` or
            ``https://``.

    Returns:
        The opened image converted to ``'RGB'`` mode.
    """
    is_remote = image_file.startswith(('http://', 'https://'))
    if is_remote:
        # Fetch the whole payload, then decode it from an in-memory buffer.
        payload = requests.get(image_file).content
        source = BytesIO(payload)
    else:
        source = image_file
    return Image.open(source).convert('RGB')
25
+
26
def load_video(video_file):
    """Read NUM_FRAMES evenly spaced frames from a video file.

    Args:
        video_file: Path/URI of the video, passed to ``decord.VideoReader``.

    Returns:
        Whatever ``VideoReader.get_batch`` returns for the sampled indices
        (presumably a frame batch array -- confirm against decord docs).
    """
    # numpy is not imported at the top of this module, so the original body
    # failed with NameError on `np`; import it locally to keep the fix
    # self-contained.
    import numpy as np

    decord_vr = VideoReader(uri=video_file, ctx=cpu(0))
    num_total_frames = len(decord_vr)
    # Evenly spaced integer indices from the first to the last frame.
    frame_id_list = np.linspace(0, num_total_frames - 1, NUM_FRAMES, dtype=int)
    return decord_vr.get_batch(frame_id_list)
32
+
33
def load_image_or_video(image_or_video_file):
    """Dispatch to the image or video loader based on the file extension.

    Args:
        image_or_video_file: Path (or URL, for images) of the media file.

    Returns:
        The result of ``load_image`` for image extensions or ``load_video``
        for video extensions.

    Raises:
        Exception: If the extension is not one of the supported types.
    """
    # The original tested an undefined name (`file_path`), which raised
    # NameError on every call. Compare case-insensitively so that e.g.
    # "clip.MP4" is accepted as well.
    lowered = image_or_video_file.lower()
    if lowered.endswith(('.jpg', '.jpeg', '.png', '.bmp')):
        return load_image(image_file=image_or_video_file)
    if lowered.endswith(('.mp4', '.avi', '.mov')):
        return load_video(video_file=image_or_video_file)
    raise Exception(f"File type of {image_or_video_file} not supported!!!")
40
+
41
+
42
def main(args):
    """Run an interactive, single-image chat session on the terminal.

    Loads the model named by ``args.model_path``, preprocesses the image
    given by ``args.image_file`` once, then loops reading user input and
    streaming the model's reply until an empty line or EOF is entered.

    Args:
        args: Parsed command-line namespace; reads ``model_path``,
            ``model_base``, ``load_8bit``, ``load_4bit``, ``device``,
            ``conv_mode``, ``image_file``, ``temperature``,
            ``max_new_tokens`` and ``debug``. ``args.conv_mode`` is filled in
            when it was not supplied.
    """
    # Model
    disable_torch_init()

    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        args.model_path, args.model_base, model_name,
        args.load_8bit, args.load_4bit, device=args.device)

    # The conversation template is pinned for now rather than inferred from
    # the model name.
    conv_mode = "llava_v1"

    if args.conv_mode is not None and conv_mode != args.conv_mode:
        print(f'[WARNING] the auto inferred conversation mode is {conv_mode}, while `--conv-mode` is {args.conv_mode}, using {args.conv_mode}')
    else:
        args.conv_mode = conv_mode

    conv = conv_templates[args.conv_mode].copy()
    roles = ('user', 'assistant') if "mpt" in model_name.lower() else conv.roles

    image = load_image(args.image_file)
    image_size = image.size
    # Similar operation in model_worker.py
    image_tensor = process_images([image], image_processor, model.config)
    if isinstance(image_tensor, list):
        image_tensor = [t.to(model.device, dtype=torch.float16) for t in image_tensor]
    else:
        image_tensor = image_tensor.to(model.device, dtype=torch.float16)

    while True:
        try:
            inp = input(f"{roles[0]}: ")
        except EOFError:
            inp = ""
        if not inp:
            print("exit...")
            break

        print(f"{roles[1]}: ", end="")

        if image is not None:
            # First message: prepend the image placeholder token(s).
            if model.config.mm_use_im_start_end:
                inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp
            else:
                inp = DEFAULT_IMAGE_TOKEN + '\n' + inp
            conv.append_message(conv.roles[0], inp)
            # Cleared so later turns do not insert the placeholder again.
            image = None
        else:
            # Later messages
            conv.append_message(conv.roles[0], inp)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=image_tensor,
                image_sizes=[image_size],
                do_sample=args.temperature > 0,
                temperature=args.temperature,
                max_new_tokens=args.max_new_tokens,
                streamer=streamer,
                use_cache=True)

        # Keep only the newly generated tokens and store them as the reply.
        outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
        conv.messages[-1][-1] = outputs

        if args.debug:
            print("\n", {"prompt": prompt, "outputs": outputs}, "\n")
129
+
130
+
131
if __name__ == "__main__":
    # Command-line entry point: parse options, then start the chat loop.
    parser = argparse.ArgumentParser()

    # Model / input selection.
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--image-file", type=str, required=True)
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--conv-mode", type=str, default=None)

    # Generation settings.
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--max-new-tokens", type=int, default=512)

    # Boolean switches, all off by default.
    for switch in ("--load-8bit", "--load-4bit", "--debug"):
        parser.add_argument(switch, action="store_true")

    args = parser.parse_args()
    main(args)
videollama2/serve/controller.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A controller manages distributed workers.
3
+ It sends worker addresses to clients.
4
+ """
5
+ import argparse
6
+ import asyncio
7
+ import dataclasses
8
+ from enum import Enum, auto
9
+ import json
10
+ import logging
11
+ import time
12
+ from typing import List, Union
13
+ import threading
14
+
15
+ from fastapi import FastAPI, Request
16
+ from fastapi.responses import StreamingResponse
17
+ import numpy as np
18
+ import requests
19
+ import uvicorn
20
+
21
+ from videollama2.constants import CONTROLLER_HEART_BEAT_EXPIRATION
22
+ from videollama2.utils import build_logger, server_error_msg
23
+
24
+
25
+ logger = build_logger("controller", "controller.log")
26
+
27
+
28
class DispatchMethod(Enum):
    """Strategies the controller can use to pick a worker for a request."""

    LOTTERY = auto()
    SHORTEST_QUEUE = auto()

    @classmethod
    def from_str(cls, name):
        """Map a command-line name to the corresponding enum member.

        Args:
            name: ``"lottery"`` or ``"shortest_queue"``.

        Returns:
            The matching ``DispatchMethod`` member.

        Raises:
            ValueError: If ``name`` is not a recognised method; the message
                includes the rejected value.
        """
        if name == "lottery":
            return cls.LOTTERY
        if name == "shortest_queue":
            return cls.SHORTEST_QUEUE
        raise ValueError(f"Invalid dispatch method: {name}")
40
+
41
+
42
@dataclasses.dataclass
class WorkerInfo:
    """Per-worker bookkeeping record kept by the controller."""

    # Names of the models this worker reported.
    model_names: List[str]
    # Relative weight used when choosing between workers.
    speed: int
    # Last known number of queued requests on the worker.
    queue_length: int
    # Whether the worker is subject to heart-beat expiration.
    check_heart_beat: bool
    # Timestamp of the last heart beat; always assigned from time.time(),
    # so this is a float (the previous `str` annotation was wrong).
    last_heart_beat: float
49
+
50
+
51
def heart_beat_controller(controller):
    """Forever: wait one expiration interval, then evict expired workers.

    Args:
        controller: Object exposing ``remove_stable_workers_by_expiration()``.

    This function never returns; it is meant to run in a background thread.
    """
    while True:
        # Sleep first so the initial sweep happens one full interval after start.
        time.sleep(CONTROLLER_HEART_BEAT_EXPIRATION)
        controller.remove_stable_workers_by_expiration()
55
+
56
+
57
class Controller:
    """Registry of model workers that hands out worker addresses to clients.

    Workers are tracked in ``self.worker_info`` (worker name -> WorkerInfo).
    A background daemon thread runs ``heart_beat_controller`` to drop workers
    whose heart beat has expired.
    """

    def __init__(self, dispatch_method: str):
        """Create the controller and start the heart-beat sweeper thread.

        Args:
            dispatch_method: Name accepted by ``DispatchMethod.from_str``.
        """
        # Dict[str -> WorkerInfo]
        self.worker_info = {}
        self.dispatch_method = DispatchMethod.from_str(dispatch_method)

        self.heart_beat_thread = threading.Thread(
            target=heart_beat_controller, args=(self,), daemon=True)
        self.heart_beat_thread.start()

        logger.info("Init controller")

    def register_worker(self, worker_name: str, check_heart_beat: bool,
                        worker_status: dict):
        """Add or refresh a worker entry.

        Args:
            worker_name: Worker address; also used as the registry key.
            check_heart_beat: Whether the worker may be expired by heart beat.
            worker_status: Dict with ``model_names``, ``speed`` and
                ``queue_length``. When falsy, the status is fetched from the
                worker itself.

        Returns:
            True on success, False when no status could be obtained.
        """
        if worker_name not in self.worker_info:
            logger.info(f"Register a new worker: {worker_name}")
        else:
            logger.info(f"Register an existing worker: {worker_name}")

        if not worker_status:
            worker_status = self.get_worker_status(worker_name)
        if not worker_status:
            return False

        self.worker_info[worker_name] = WorkerInfo(
            worker_status["model_names"], worker_status["speed"], worker_status["queue_length"],
            check_heart_beat, time.time())

        logger.info(f"Register done: {worker_name}, {worker_status}")
        return True

    def get_worker_status(self, worker_name: str):
        """POST to the worker's status endpoint.

        Returns:
            The decoded JSON body, or None on a request error or a non-200
            response.
        """
        try:
            r = requests.post(worker_name + "/worker_get_status", timeout=5)
        except requests.exceptions.RequestException as e:
            logger.error(f"Get status fails: {worker_name}, {e}")
            return None

        if r.status_code != 200:
            logger.error(f"Get status fails: {worker_name}, {r}")
            return None

        return r.json()

    def remove_worker(self, worker_name: str):
        """Delete a worker from the registry (KeyError if it is unknown)."""
        del self.worker_info[worker_name]

    def refresh_all_workers(self):
        """Re-register every known worker, dropping those that do not respond."""
        old_info = dict(self.worker_info)
        self.worker_info = {}

        for w_name, w_info in old_info.items():
            if not self.register_worker(w_name, w_info.check_heart_beat, None):
                logger.info(f"Remove stale worker: {w_name}")

    def list_models(self):
        """Return the distinct model names served by the registered workers."""
        model_names = set()

        for w_info in self.worker_info.values():
            model_names.update(w_info.model_names)

        return list(model_names)

    def get_worker_address(self, model_name: str):
        """Choose a worker serving ``model_name`` per the dispatch method.

        Returns:
            The chosen worker name, or ``""`` when no suitable worker exists.

        Raises:
            ValueError: If ``self.dispatch_method`` is not a known method.
        """
        if self.dispatch_method == DispatchMethod.LOTTERY:
            worker_names = []
            worker_speeds = []
            for w_name, w_info in self.worker_info.items():
                if model_name in w_info.model_names:
                    worker_names.append(w_name)
                    worker_speeds.append(w_info.speed)
            worker_speeds = np.array(worker_speeds, dtype=np.float32)
            norm = np.sum(worker_speeds)
            if norm < 1e-4:
                # No candidates, or all candidates have (near) zero speed.
                return ""
            worker_speeds = worker_speeds / norm
            # Directly return an address drawn with probability proportional
            # to speed. (An unreachable "check status before returning" loop
            # that used to follow this return has been removed.)
            pt = np.random.choice(np.arange(len(worker_names)),
                                  p=worker_speeds)
            return worker_names[pt]
        elif self.dispatch_method == DispatchMethod.SHORTEST_QUEUE:
            worker_names = []
            worker_qlen = []
            for w_name, w_info in self.worker_info.items():
                if model_name in w_info.model_names:
                    worker_names.append(w_name)
                    # Queue length normalised by worker speed.
                    worker_qlen.append(w_info.queue_length / w_info.speed)
            if len(worker_names) == 0:
                return ""
            min_index = np.argmin(worker_qlen)
            w_name = worker_names[min_index]
            # Count the request being dispatched until the next heart beat
            # overwrites the queue length.
            self.worker_info[w_name].queue_length += 1
            logger.info(f"names: {worker_names}, queue_lens: {worker_qlen}, ret: {w_name}")
            return w_name
        else:
            raise ValueError(f"Invalid dispatch method: {self.dispatch_method}")

    def receive_heart_beat(self, worker_name: str, queue_length: int):
        """Record a heart beat; returns False when the worker is unknown."""
        if worker_name not in self.worker_info:
            logger.info(f"Receive unknown heart beat. {worker_name}")
            return False

        self.worker_info[worker_name].queue_length = queue_length
        self.worker_info[worker_name].last_heart_beat = time.time()
        logger.info(f"Receive heart beat. {worker_name}")
        return True

    def remove_stable_workers_by_expiration(self):
        """Remove heart-beat-checked workers whose last beat is too old."""
        expire = time.time() - CONTROLLER_HEART_BEAT_EXPIRATION
        # Collect first, delete afterwards, so the dict is not mutated while
        # it is being iterated.
        to_delete = []
        for worker_name, w_info in self.worker_info.items():
            if w_info.check_heart_beat and w_info.last_heart_beat < expire:
                to_delete.append(worker_name)

        for worker_name in to_delete:
            self.remove_worker(worker_name)

    def worker_api_generate_stream(self, params):
        """Proxy a streaming generation request to a selected worker.

        Args:
            params: Request dict; ``params["model"]`` selects the worker and
                the whole dict is forwarded as the JSON body.

        Yields:
            NUL-terminated byte chunks: either the worker's chunks, or a
            single JSON error object (error_code 2 when no worker is
            available, 3 when the request to the worker fails).
        """
        worker_addr = self.get_worker_address(params["model"])
        if not worker_addr:
            logger.info(f"no worker: {params['model']}")
            ret = {
                "text": server_error_msg,
                "error_code": 2,
            }
            yield json.dumps(ret).encode() + b"\0"
            # Stop here: without this return the code fell through, posted to
            # an empty address and emitted a second (error_code 3) message.
            return

        try:
            response = requests.post(worker_addr + "/worker_generate_stream",
                                     json=params, stream=True, timeout=5)
            for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
                if chunk:
                    yield chunk + b"\0"
        except requests.exceptions.RequestException as e:
            logger.info(f"worker timeout: {worker_addr}")
            ret = {
                "text": server_error_msg,
                "error_code": 3,
            }
            yield json.dumps(ret).encode() + b"\0"

    # Let the controller act as a worker to achieve hierarchical
    # management. This can be used to connect isolated sub networks.
    def worker_api_get_status(self):
        """Aggregate the live status of all registered workers.

        Returns:
            Dict with the union of ``model_names`` and the summed ``speed``
            and ``queue_length`` of every worker that answered.
        """
        model_names = set()
        speed = 0
        queue_length = 0

        for w_name in self.worker_info:
            worker_status = self.get_worker_status(w_name)
            if worker_status is not None:
                model_names.update(worker_status["model_names"])
                speed += worker_status["speed"]
                queue_length += worker_status["queue_length"]

        return {
            "model_names": list(model_names),
            "speed": speed,
            "queue_length": queue_length,
        }
237
+
238
+
239
+ app = FastAPI()
240
+
241
+
242
@app.post("/register_worker")
async def register_worker(request: Request):
    """HTTP endpoint: register (or re-register) a worker with the controller.

    Expects a JSON body with ``worker_name`` and ``check_heart_beat``;
    ``worker_status`` is optional.
    """
    payload = await request.json()
    worker_name = payload["worker_name"]
    check_heart_beat = payload["check_heart_beat"]
    worker_status = payload.get("worker_status", None)
    controller.register_worker(worker_name, check_heart_beat, worker_status)
248
+
249
+
250
@app.post("/refresh_all_workers")
async def refresh_all_workers():
    """HTTP endpoint: re-register all known workers, dropping unreachable ones."""
    # Controller.refresh_all_workers() returns nothing, so there is no result
    # to capture (the former `models = ...` binding was unused).
    controller.refresh_all_workers()
253
+
254
+
255
@app.post("/list_models")
async def list_models():
    """HTTP endpoint: return the model names known to the controller."""
    return {"models": controller.list_models()}
259
+
260
+
261
@app.post("/get_worker_address")
async def get_worker_address(request: Request):
    """HTTP endpoint: return a worker address for the requested model.

    Expects a JSON body with a ``model`` key.
    """
    payload = await request.json()
    requested_model = payload["model"]
    return {"address": controller.get_worker_address(requested_model)}
266
+
267
+
268
@app.post("/receive_heart_beat")
async def receive_heart_beat(request: Request):
    """HTTP endpoint: record a worker heart beat.

    Expects a JSON body with ``worker_name`` and ``queue_length``; the
    response tells the worker whether the controller knows it.
    """
    payload = await request.json()
    worker_name = payload["worker_name"]
    queue_length = payload["queue_length"]
    known = controller.receive_heart_beat(worker_name, queue_length)
    return {"exist": known}
274
+
275
+
276
@app.post("/worker_generate_stream")
async def worker_api_generate_stream(request: Request):
    """HTTP endpoint: stream generation output proxied through the controller."""
    request_params = await request.json()
    chunk_stream = controller.worker_api_generate_stream(request_params)
    return StreamingResponse(chunk_stream)
281
+
282
+
283
@app.post("/worker_get_status")
async def worker_api_get_status(request: Request):
    """HTTP endpoint: report the controller's aggregated worker status.

    The request body is not read.
    """
    status = controller.worker_api_get_status()
    return status
286
+
287
+
288
if __name__ == "__main__":
    # Command-line entry point for the controller server.
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=21001)
    parser.add_argument(
        "--dispatch-method",
        type=str,
        choices=["lottery", "shortest_queue"],
        default="shortest_queue",
    )
    args = parser.parse_args()
    logger.info(f"args: {args}")

    # `controller` is a module-level name that the endpoint handlers use.
    controller = Controller(args.dispatch_method)
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
videollama2/serve/examples/desert.jpg ADDED
videollama2/serve/examples/extreme_ironing.jpg ADDED
videollama2/serve/examples/waterview.jpg ADDED
videollama2/serve/gradio_web_server.py ADDED
@@ -0,0 +1,503 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import time
4
+ import hashlib
5
+ import requests
6
+ import argparse
7
+ import datetime
8
+
9
+ import numpy as np
10
+ import gradio as gr
11
+ from decord import VideoReader, cpu
12
+
13
+ from videollama2.constants import LOGDIR, NUM_FRAMES
14
+ from videollama2.conversation import (default_conversation, conv_templates,SeparatorStyle)
15
+ from videollama2.utils import (build_logger, server_error_msg, violates_moderation, moderation_msg)
16
+
17
+
18
+ logger = build_logger("gradio_web_server", "gradio_web_server.log")
19
+
20
+ headers = {"User-Agent": "Videollama2 Client"}
21
+
22
+ no_change_btn = gr.Button.update()
23
+ enable_btn = gr.Button.update(interactive=True)
24
+ disable_btn = gr.Button.update(interactive=False)
25
+
26
+ priority = {
27
+ "vicuna-13b": "aaaaaaa",
28
+ "koala-13b": "aaaaaab",
29
+ }
30
+
31
+
32
def get_conv_log_filename():
    """Return the path of today's conversation log file under LOGDIR.

    The file name is ``<year>-<MM>-<DD>-conv.json`` for the current local date.
    """
    now = datetime.datetime.now()
    date_tag = "{}-{:02d}-{:02d}".format(now.year, now.month, now.day)
    return os.path.join(LOGDIR, date_tag + "-conv.json")
36
+
37
+
38
def get_model_list():
    """Ask the controller to refresh its workers and return the model names.

    Returns:
        The controller's model list, sorted in place using ``priority`` as an
        override key (names absent from ``priority`` sort by themselves).
    """
    base_url = args.controller_url

    refresh_resp = requests.post(base_url + "/refresh_all_workers")
    assert refresh_resp.status_code == 200

    list_resp = requests.post(base_url + "/list_models")
    models = list_resp.json()["models"]
    models.sort(key=lambda name: priority.get(name, name))
    logger.info(f"Models: {models}")
    return models
46
+
47
+
48
+ get_window_url_params = """
49
+ function() {
50
+ const params = new URLSearchParams(window.location.search);
51
+ url_params = Object.fromEntries(params);
52
+ console.log(url_params);
53
+ return url_params;
54
+ }
55
+ """
56
+
57
+
58
def load_demo(url_params, request: gr.Request):
    """Build the initial UI state when the page loads.

    Args:
        url_params: Mapping of URL query parameters; a ``model`` entry
            preselects that model when it is in the global ``models`` list.
        request: Gradio request, used only for logging the client host.

    Returns:
        ``(state, dropdown_update)``: a fresh conversation and the update for
        the model dropdown.
    """
    logger.info(f"load_demo. ip: {request.client.host}. params: {url_params}")

    update_kwargs = {"visible": True}
    if "model" in url_params and url_params["model"] in models:
        update_kwargs["value"] = url_params["model"]
    dropdown_update = gr.Dropdown.update(**update_kwargs)

    return default_conversation.copy(), dropdown_update
70
+
71
+
72
def load_demo_refresh_model_list(request: gr.Request):
    """Build the initial UI state, re-querying the controller for models.

    Returns:
        ``(state, dropdown_update)``: a fresh conversation and a dropdown
        update listing the current models, with the first one selected
        (or ``""`` when there are none).
    """
    logger.info(f"load_demo. ip: {request.client.host}")
    models = get_model_list()
    state = default_conversation.copy()

    if len(models) > 0:
        selected = models[0]
    else:
        selected = ""
    dropdown_update = gr.Dropdown.update(choices=models, value=selected)
    return state, dropdown_update
81
+
82
+
83
def vote_last_response(state, vote_type, model_selector, request: gr.Request):
    """Append one vote record as a JSON line to today's conversation log.

    Args:
        state: Conversation object; its ``dict()`` is stored in the record.
        vote_type: Label stored under ``"type"``.
        model_selector: Model name stored under ``"model"``.
        request: Gradio request; the client host is stored under ``"ip"``.
    """
    with open(get_conv_log_filename(), "a") as log_file:
        record = {
            "tstamp": round(time.time(), 4),
            "type": vote_type,
            "model": model_selector,
            "state": state.dict(),
            "ip": request.client.host,
        }
        line = json.dumps(record) + "\n"
        log_file.write(line)
93
+
94
+
95
def upvote_last_response(state, model_selector, request: gr.Request):
    """Log an "upvote" for the last response.

    Returns:
        A 4-tuple: an empty string followed by three ``disable_btn`` updates.
    """
    logger.info(f"upvote. ip: {request.client.host}")
    vote_last_response(state, "upvote", model_selector, request)
    disabled_buttons = (disable_btn,) * 3
    return ("",) + disabled_buttons
99
+
100
+
101
def downvote_last_response(state, model_selector, request: gr.Request):
    """Log a "downvote" for the last response.

    Returns:
        A 4-tuple: an empty string followed by three ``disable_btn`` updates.
    """
    logger.info(f"downvote. ip: {request.client.host}")
    vote_last_response(state, "downvote", model_selector, request)
    disabled_buttons = (disable_btn,) * 3
    return ("",) + disabled_buttons
105
+
106
+
107
def flag_last_response(state, model_selector, request: gr.Request):
    """Log a "flag" for the last response.

    Returns:
        A 4-tuple: an empty string followed by three ``disable_btn`` updates.
    """
    logger.info(f"flag. ip: {request.client.host}")
    vote_last_response(state, "flag", model_selector, request)
    disabled_buttons = (disable_btn,) * 3
    return ("",) + disabled_buttons
111
+
112
+
113
def regenerate(state, image_process_mode, request: gr.Request):
    """Clear the last reply so that it can be generated again.

    If the preceding user message carries media (its content is a tuple or
    list), its third element is replaced with the current
    ``image_process_mode``.

    Returns:
        (state, chatbot, textbox, imagebox, videobox, upvote, downvote, flag,
        generate, clear) updates, with all five buttons disabled.
    """
    logger.info(f"regenerate. ip: {request.client.host}")
    state.messages[-1][-1] = None

    last_user_msg = state.messages[-2]
    content = last_user_msg[1]
    if type(content) in (tuple, list):
        last_user_msg[1] = (*content[:2], image_process_mode)

    state.skip_next = False
    buttons = (disable_btn,) * 5
    return (state, state.to_gradio_chatbot(), "", None, None) + buttons
122
+
123
+
124
def clear_history(request: gr.Request):
    """Reset the conversation to a fresh copy of the default one.

    Returns:
        (state, chatbot, textbox, imagebox, videobox, upvote, downvote, flag,
        generate, clear) updates, with all five buttons disabled.
    """
    logger.info(f"clear_history. ip: {request.client.host}")
    fresh_state = default_conversation.copy()
    buttons = (disable_btn,) * 5
    return (fresh_state, fresh_state.to_gradio_chatbot(), "", None, None) + buttons
129
+
130
+
131
def add_text_ori(state, text, image, video, image_process_mode, request: gr.Request):
    """Append a user turn (text plus at most one image or video) to the state.

    Args:
        state: Current conversation object.
        text: User text.
        image: PIL image from the image box, or None.
        video: File path from the video box, or None.
        image_process_mode: Preprocessing mode stored alongside the media.
        request: Gradio request, used only for logging.

    Returns:
        A 10-tuple of updates for (state, chatbot, textbox, imagebox,
        videobox, upvote, downvote, flag, generate, clear). Every return path
        has the same length; the early returns previously omitted the
        videobox slot and produced only 9 values.
    """
    # note: imagebox itself is PIL object while videobox is filepath
    logger.info(f"add_text. ip: {request.client.host}. len: {len(text)}")
    if len(text) <= 0 and image is None:
        state.skip_next = True
        return (state, state.to_gradio_chatbot(), "", None, None) + (no_change_btn,) * 5
    if args.moderate:
        flagged = violates_moderation(text)
        if flagged:
            state.skip_next = True
            return (state, state.to_gradio_chatbot(), moderation_msg, None, None) + (
                no_change_btn,) * 5
    assert image is None or video is None, "Please don't feed image and video inputs at the same time!!!"
    text = text[:1536]  # Hard cut-off
    if image is not None:
        # here image is the PIL object itself
        text = text[:1200]  # Hard cut-off for images
        if '<image>' not in text:
            text = text + '\n<image>'
        text = (text, image, image_process_mode)
        # Start a new conversation if the current one already holds an image.
        if len(state.get_images(return_pil=True)) > 0:
            state = default_conversation.copy()
        state.modality = "image"
    if video is not None:
        print("Video box:", video)
        # here video is the file path of video
        text = text[:1200]  # Hard cut-off for videos
        if '<video>' not in text:
            text = text + '\n<video>'
        text = (text, video, image_process_mode)
        # Start a new conversation if the current one already holds a video.
        if len(state.get_videos(return_pil=True)) > 0:
            state = default_conversation.copy()
        state.modality = "video"
        print("Set modality as video...")
    state.append_message(state.roles[0], text)
    state.append_message(state.roles[1], None)
    state.skip_next = False
    # (state, chatbot, textbox, imagebox, videobox, upvote, downvote, flag, generate, clear)
    return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
172
+
173
+
174
def add_text(state, text, image, video, image_process_mode, request: gr.Request):
    """Append a user turn to the conversation, resetting it on new media.

    Args:
        state: Current conversation object.
        text: User text.
        image: Image from the image box, or None.
        video: Video from the video box, or None.
        image_process_mode: Preprocessing mode stored alongside the media.
        request: Gradio request, used only for logging.

    Returns:
        A 10-tuple: state, chatbot, textbox value, two ``None`` resets and
        five button updates. Every return path has the same length; the
        moderation branch previously returned only 9 values.
    """
    logger.info(f"add_text. ip: {request.client.host}. len: {len(text)}")

    # If the input is a new video or image, reset the state.
    if image is not None or video is not None:
        state = default_conversation.copy()

    if len(text) <= 0 and image is None and video is None:
        state.skip_next = True
        return (state, state.to_gradio_chatbot(), "", None, None) + (no_change_btn,) * 5

    if args.moderate:
        flagged = violates_moderation(text)
        if flagged:
            state.skip_next = True
            return (state, state.to_gradio_chatbot(), moderation_msg, None, None) + (no_change_btn,) * 5

    if video is not None:
        # Process the input video (takes precedence when both are given).
        text = text[:1200]  # Hard cut-off for media prompts
        if '<video>' not in text:
            text = text + '\n<video>'
        text = (text, video, image_process_mode)
        state.modality = "video"
    elif image is not None:
        # Process the input image.
        text = text[:1200]  # Hard cut-off for media prompts
        if '<image>' not in text:
            text = text + '\n<image>'
        text = (text, image, image_process_mode)
        state.modality = "image"
    elif state.modality == "image" and len(text) > 0:
        # Text-only follow-up in an image conversation.
        state.modality = "image_text"
        text = text[:1536]  # Hard cut-off
    elif state.modality == "video" and len(text) > 0:
        # Text-only follow-up in a video conversation.
        state.modality = "video_text"
        text = text[:1536]  # Hard cut-off

    state.append_message(state.roles[0], text)
    state.append_message(state.roles[1], None)
    state.skip_next = False

    return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
217
+
218
+
219
def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request: gr.Request):
    """Stream a model reply for the last user turn into the chat UI.

    Args:
        state: Current conversation object.
        model_selector: Name of the model chosen in the dropdown.
        temperature: Sampling temperature (converted with ``float``).
        top_p: Nucleus sampling value (converted with ``float``).
        max_new_tokens: Requested token budget, capped at 1536.
        request: Gradio request, used for logging the client host.

    Yields:
        ``(state, chatbot, btn, btn, btn, btn, btn)`` tuples as the reply
        grows, and a final tuple once streaming ends or an error occurs.
    """
    logger.info(f"http_bot. ip: {request.client.host}")
    start_tstamp = time.time()
    model_name = model_selector

    if state.skip_next:
        # This generate call is skipped due to invalid inputs
        yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
        return

    if len(state.messages) == state.offset + 2:
        # First round of conversation. The template is pinned to "llava_v1";
        # the former model-name based inference was always overwritten by
        # this value and has been removed as dead code.
        template_name = "llava_v1"
        new_state = conv_templates[template_name].copy()
        new_state.append_message(new_state.roles[0], state.messages[-2][1])
        new_state.append_message(new_state.roles[1], None)
        new_state.modality = state.modality
        state = new_state

    # Query worker address
    controller_url = args.controller_url
    ret = requests.post(controller_url + "/get_worker_address",
                        json={"model": model_name})
    worker_addr = ret.json()["address"]
    logger.info(f"model_name: {model_name}, worker_addr: {worker_addr}")

    # No available worker
    if worker_addr == "":
        state.messages[-1][-1] = server_error_msg
        yield (state, state.to_gradio_chatbot(), disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
        return

    is_image = state.modality in ("image", "image_text")
    is_video = state.modality in ("video", "video_text")

    # Construct prompt
    prompt = state.get_prompt()
    if is_image:
        all_images = state.get_images(return_pil=True)  # PIL.Image objects
    elif is_video:
        all_images = state.get_videos(return_pil=True)  # video frames, each a PIL.Image
    else:
        # Any other modality has no media; previously `all_images` was left
        # unbound here and the next line raised NameError.
        all_images = []
    all_image_hash = [hashlib.md5(image.tobytes()).hexdigest() for image in all_images]
    for idx, (image, image_hash) in enumerate(zip(all_images, all_image_hash)):
        t = datetime.datetime.now()
        day_dir = f"{t.year}-{t.month:02d}-{t.day:02d}"
        if is_image:
            filename = os.path.join(LOGDIR, "serve_images", day_dir, f"{image_hash}.jpg")
        else:
            filename = os.path.join(LOGDIR, "serve_videos", day_dir, f"{image_hash}_{idx}.jpg")
        # Save each distinct image/frame once.
        if not os.path.isfile(filename):
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            image.save(filename)

    # Make requests
    pload = {
        "model": model_name,
        "prompt": prompt,
        "temperature": float(temperature),
        "top_p": float(top_p),
        "max_new_tokens": min(int(max_new_tokens), 1536),
        "stop": state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2,
        # Short summary for the log line below; replaced with the real
        # payload afterwards.
        "images": f'List of {len(all_image_hash)} images: {all_image_hash}',
    }
    logger.info(f"==== request ====\n{pload}")

    if is_image:
        pload['images'] = state.get_images()
    elif is_video:
        pload['images'] = state.get_videos()
    else:
        # NOTE(review): assumes the worker accepts an empty list when there
        # is no media -- confirm against the worker implementation.
        pload['images'] = []

    state.messages[-1][-1] = "▌"
    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5

    # Defined up front so the final log line cannot hit an unbound name when
    # the worker returns no chunks.
    output = ""
    try:
        # Stream output
        response = requests.post(worker_addr + "/worker_generate_stream",
                                 headers=headers, json=pload, stream=True, timeout=10)
        for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
            if chunk:
                data = json.loads(chunk.decode())
                if data["error_code"] == 0:
                    # Drop the leading len(prompt) characters of the returned text.
                    output = data["text"][len(prompt):].strip()
                    state.messages[-1][-1] = output + "▌"
                    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
                else:
                    output = data["text"] + f" (error_code: {data['error_code']})"
                    state.messages[-1][-1] = output
                    yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
                    return
                time.sleep(0.03)
    except requests.exceptions.RequestException as e:
        state.messages[-1][-1] = server_error_msg
        yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
        return

    # Remove the trailing cursor character.
    state.messages[-1][-1] = state.messages[-1][-1][:-1]
    yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5

    finish_tstamp = time.time()
    logger.info(f"{output}")

    with open(get_conv_log_filename(), "a") as fout:
        data = {
            "tstamp": round(finish_tstamp, 4),
            "type": "chat",
            "model": model_name,
            "start": round(start_tstamp, 4),
            # Was round(start_tstamp, 4), which logged a zero-length duration.
            "finish": round(finish_tstamp, 4),
            "images": all_image_hash,
            "ip": request.client.host,
        }
        fout.write(json.dumps(data) + "\n")
354
+
355
# Heading shown at the top of the demo page (typo "publicl" corrected).
title_markdown = ("""
# The public release of VideoLLaMA2
""")
358
+
359
+ tos_markdown = ("""
360
+ ### Terms of use
361
+ By using this service, users are required to agree to the following terms:
362
+ The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
363
+ Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderator.
364
+ For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
365
+ """)
366
+
367
+
368
+ learn_more_markdown = ("""
369
+ ### License
370
+ The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.
371
+ """)
372
+
373
+ block_css = """
374
+
375
+ #buttons button {
376
+ min-width: min(120px,100%);
377
+ }
378
+
379
+ """
380
+
381
def build_demo(embed_mode):
    """Assemble the Gradio Blocks UI for the VideoLLaMA2 chat demo.

    Besides its parameter, this reads the module-level ``models`` list (used
    as the dropdown choices) and ``args.model_list_mode``.

    Args:
        embed_mode: When truthy, the title, terms-of-use and license markdown
            sections are not rendered.

    Returns:
        The constructed ``gr.Blocks`` object; it is not queued or launched here.

    Raises:
        ValueError: If ``args.model_list_mode`` is neither "once" nor "reload".
    """
    # NOTE(review): the layout nesting below was reconstructed from the
    # statement order of the with-blocks; confirm against the rendered UI.
    # The textbox is created outside the Blocks context and rendered later
    # via textbox.render(), so gr.Examples can reference it before it is placed.
    textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)
    with gr.Blocks(title="Video-Llama", theme=gr.themes.Default(), css=block_css) as demo:
        state = gr.State()

        if not embed_mode:
            gr.Markdown(title_markdown)

        with gr.Row():
            # Left column: model selector, media inputs, examples, parameters.
            with gr.Column(scale=3):
                with gr.Row(elem_id="model_selector_row"):
                    model_selector = gr.Dropdown(
                        choices=models,
                        value=models[0] if len(models) > 0 else "",
                        interactive=True,
                        show_label=False,
                        container=False)

                imagebox = gr.Image(type="pil")
                videobox = gr.Video()
                # Hidden control; its value is still passed to add_text/regenerate.
                image_process_mode = gr.Radio(
                    ["Crop", "Resize", "Pad", "Default"],
                    value="Default",
                    label="Preprocess for non-square image", visible=False)

                # Example media is resolved relative to this file's directory.
                cur_dir = os.path.dirname(os.path.abspath(__file__))
                gr.Examples(examples=[
                    [f"{cur_dir}/examples/extreme_ironing.jpg", "What is unusual about this image?"],
                    [f"{cur_dir}/examples/waterview.jpg", "What are the things I should be cautious about when I visit here?"],
                    [f"{cur_dir}/examples/desert.jpg", "If there are factual errors in the questions, point it out; if not, proceed answering the question. What’s happening in the desert?"],
                ], inputs=[imagebox, textbox], label="Image examples")

                # video example inputs
                gr.Examples(examples=[
                    [f"{cur_dir}/examples/sample_demo_1.mp4", "Why is this video funny?"],
                    [f"{cur_dir}/examples/sample_demo_3.mp4", "Can you identify any safety hazards in this video?"],
                    [f"{cur_dir}/examples/1034346401.mp4", "What is this young woman doing?"]
                ], inputs=[videobox, textbox], label="Video examples")
                #[f"{cur_dir}/examples/sample_demo_9.mp4", "Describe the video in detail and please do not generate repetitive content."]

                with gr.Accordion("Parameters", open=False) as parameter_row:
                    temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.2, step=0.1, interactive=True, label="Temperature",)
                    top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, interactive=True, label="Top P",)
                    max_output_tokens = gr.Slider(minimum=0, maximum=1024, value=512, step=64, interactive=True, label="Max output tokens",)

            # Right column: chat history, input box and action buttons.
            with gr.Column(scale=8):
                chatbot = gr.Chatbot(elem_id="chatbot", label="Videollama2 Chatbot", height=550)
                with gr.Row():
                    with gr.Column(scale=8):
                        textbox.render()
                    with gr.Column(scale=1, min_width=50):
                        submit_btn = gr.Button(value="Send", variant="primary")
                # All feedback/control buttons start disabled (interactive=False).
                with gr.Row(elem_id="buttons") as button_row:
                    upvote_btn = gr.Button(value="👍  Upvote", interactive=False)
                    downvote_btn = gr.Button(value="👎  Downvote", interactive=False)
                    flag_btn = gr.Button(value="⚠️  Flag", interactive=False)
                    #stop_btn = gr.Button(value="⏹️  Stop Generation", interactive=False)
                    regenerate_btn = gr.Button(value="🔄  Regenerate", interactive=False)
                    clear_btn = gr.Button(value="🗑️  Clear", interactive=False)

        if not embed_mode:
            gr.Markdown(tos_markdown)
            gr.Markdown(learn_more_markdown)
        # Invisible JSON component that receives the URL parameters on load.
        url_params = gr.JSON(visible=False)

        # Register listeners
        # Order matters: the callbacks emit one button update per entry.
        btn_list = [upvote_btn, downvote_btn, flag_btn, regenerate_btn, clear_btn]
        upvote_btn.click(upvote_last_response,
            [state, model_selector], [textbox, upvote_btn, downvote_btn, flag_btn])
        downvote_btn.click(downvote_last_response,
            [state, model_selector], [textbox, upvote_btn, downvote_btn, flag_btn])
        flag_btn.click(flag_last_response,
            [state, model_selector], [textbox, upvote_btn, downvote_btn, flag_btn])
        # Regenerate first updates the state, then chains into http_bot.
        regenerate_btn.click(regenerate, [state, image_process_mode],
            [state, chatbot, textbox, imagebox, videobox] + btn_list).then(
            http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
            [state, chatbot] + btn_list)
        clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox, videobox] + btn_list)

        # Pressing ENTER and clicking "Send" run the same two-step chain:
        # add_text, then http_bot.
        textbox.submit(add_text, [state, textbox, imagebox, videobox, image_process_mode], [state, chatbot, textbox, imagebox, videobox] + btn_list
            ).then(http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
                   [state, chatbot] + btn_list)
        submit_btn.click(add_text, [state, textbox, imagebox, videobox, image_process_mode], [state, chatbot, textbox, imagebox, videobox] + btn_list
            ).then(http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
                   [state, chatbot] + btn_list)

        # "once" uses load_demo with the page's URL params; "reload" uses
        # load_demo_refresh_model_list on every page load.
        if args.model_list_mode == "once":
            demo.load(load_demo, [url_params], [state, model_selector],
                _js=get_window_url_params)
        elif args.model_list_mode == "reload":
            demo.load(load_demo_refresh_model_list, None, [state, model_selector])
        else:
            raise ValueError(f"Unknown model list mode: {args.model_list_mode}")

    return demo
476
+
477
+
478
if __name__ == "__main__":
    # Command-line options of the Gradio front end.
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", default="0.0.0.0", type=str)
    parser.add_argument("--port", type=int)
    parser.add_argument("--controller-url", default="http://localhost:21001", type=str)
    parser.add_argument("--concurrency-count", default=10, type=int)
    parser.add_argument(
        "--model-list-mode",
        default="once",
        choices=["once", "reload"],
        type=str,
    )
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--moderate", action="store_true")
    parser.add_argument("--embed", action="store_true")
    args = parser.parse_args()
    logger.info(f"args: {args}")

    # `models` and `args` are module globals read by build_demo().
    models = get_model_list()

    logger.info(args)
    demo = build_demo(args.embed)
    # Enable the request queue, then start serving.
    queued_demo = demo.queue(concurrency_count=args.concurrency_count, api_open=False)
    queued_demo.launch(server_name=args.host, server_port=args.port, share=args.share)
videollama2/serve/model_worker.py ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A model worker executes the model.
3
+ """
4
+ import os
5
+ import json
6
+ import time
7
+ import uuid
8
+ import asyncio
9
+ import requests
10
+ import argparse
11
+ import threading
12
+ from threading import Thread
13
+ from functools import partial
14
+ from typing import Iterator, List, Optional, Tuple
15
+
16
+ import uvicorn
17
+ from fastapi import FastAPI, Request, BackgroundTasks
18
+ from fastapi.responses import StreamingResponse
19
+
20
+ import torch
21
+ import decord
22
+ import numpy as np
23
+ from PIL import Image
24
+ from decord import VideoReader, cpu
25
+ from transformers import TextIteratorStreamer
26
+
27
+ from videollama2.constants import WORKER_HEART_BEAT_INTERVAL
28
+ from videollama2.utils import (build_logger, server_error_msg, pretty_print_semaphore)
29
+ from videollama2.model.builder import load_pretrained_model
30
+ from videollama2.mm_utils import process_images, process_videos, load_image_from_base64, tokenizer_image_token, KeywordsStoppingCriteria, tokenizer_MMODAL_token
31
+ from videollama2.mm_utils import chunk_list, frame_expansion
32
+ from videollama2.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_VIDEO_TOKEN, NUM_FRAMES, MMODAL_TOKEN_INDEX
33
+
34
+
35
# Number of bytes in one gibibyte.
GB = 2 ** 30

# Short random identifier for this worker process; also used in the log file name.
worker_id = uuid.uuid4().hex[:6]
logger = build_logger("model_worker", f"model_worker_{worker_id}.log")
# Count of generation requests received so far.
global_counter = 0

# Created lazily by the first generation request.
model_semaphore = None
42
+
43
+
44
+ # variable_content = os.getenv('MY_VARIABLE', '')
45
+ # KEYWORDS_LIST = set(variable_content.split('\n'))
46
def _load_keywords(path):
    """Return the blocked keywords listed one per line in *path*.

    Lines are stripped and lower-cased because the safety checks compare
    against ``text.lower()``. Blank lines are skipped: an empty keyword is a
    substring of every string and would flag all inputs and outputs.
    Returns an empty list when the file does not exist.
    """
    if not os.path.exists(path):
        return []
    with open(path, 'r', encoding='utf-8') as file:
        cleaned = (line.strip().lower() for line in file)
        return [keyword for keyword in cleaned if keyword]


path = 'assets/keywords.txt'
KEYWORDS_LIST = _load_keywords(path)


# Returned when generated text matches a keyword.
KEYWORD_BLOCK_MESSAGE2 = "The output contains political, erotic and other unsafe content that violates local laws. Please re-enter your question."
# Returned when the incoming prompt matches a keyword.
KEYWORD_BLOCK_MESSAGE1 = "Your input question contains political, erotic and other unsafe content that violates local laws. Please re-enter your question."
# Number of newly generated tokens between two output safety checks.
STREAM_CHECK_MULTIPLE = 20
60
+
61
+
62
def heart_beat_worker(controller):
    """Call ``controller.send_heart_beat()`` forever, sleeping
    WORKER_HEART_BEAT_INTERVAL seconds before each call. Never returns."""
    keep_running = True
    while keep_running:
        time.sleep(WORKER_HEART_BEAT_INTERVAL)
        controller.send_heart_beat()
67
+
68
+
69
def safety_check(text, history=None, ) -> Optional[str]:
    """Return KEYWORD_BLOCK_MESSAGE2 if the lower-cased *text* contains any
    entry of KEYWORDS_LIST, otherwise None. *history* is not used."""
    if len(KEYWORDS_LIST) == 0:
        return None

    lowered = text.lower()
    for keyword in KEYWORDS_LIST:
        if keyword in lowered:
            print('############')
            return KEYWORD_BLOCK_MESSAGE2

    return None
76
+
77
+
78
def input_safety_check(text) -> Optional[str]:
    """Return KEYWORD_BLOCK_MESSAGE1 if the lower-cased *text* contains any
    entry of KEYWORDS_LIST, otherwise None."""
    if len(KEYWORDS_LIST) == 0:
        return None
    lowered = text.lower()
    for keyword in KEYWORDS_LIST:
        if keyword in lowered:
            print('######## Input keyword alarm triggered:', text)
            return KEYWORD_BLOCK_MESSAGE1
    return None
83
+
84
+
85
class ModelWorker:
    """Hosts one loaded model, keeps it registered with the controller and
    streams generations for incoming requests."""

    def __init__(self, controller_addr, worker_addr,
                 worker_id, no_register,
                 model_path, model_base, model_name,
                 load_8bit, load_4bit, device):
        """Load the model and, unless *no_register* is set, register with the
        controller and start the background heart-beat thread.

        When *model_name* is None it is derived from the last component of
        *model_path* (prefixed with the parent directory name for
        ``checkpoint-*`` directories).
        """
        self.controller_addr = controller_addr
        self.worker_addr = worker_addr
        self.worker_id = worker_id
        # Stored as given; generate_stream() looks for "use_taug" in it.
        self.model_path = model_path
        if model_path.endswith("/"):
            model_path = model_path[:-1]
        if model_name is None:
            model_paths = model_path.split("/")
            if model_paths[-1].startswith('checkpoint-'):
                self.model_name = model_paths[-2] + "_" + model_paths[-1]
            else:
                self.model_name = model_paths[-1]
        else:
            self.model_name = model_name

        self.device = device
        logger.info(f"Loading the model {self.model_name} on worker {worker_id} ...")
        self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
            model_path, model_base, self.model_name, load_8bit, load_4bit, device=self.device)
        lowered_name = self.model_name.lower()
        self.is_multimodal = 'videollama2' in lowered_name or 'vlb' in lowered_name

        if not no_register:
            self.register_to_controller()
            # Daemon thread (as in the SGLang worker) so the endless
            # heart-beat loop cannot keep the process alive on shutdown.
            self.heart_beat_thread = threading.Thread(
                target=heart_beat_worker, args=(self,), daemon=True)
            self.heart_beat_thread.start()

    def register_to_controller(self):
        """POST this worker's address and status to the controller."""
        logger.info("Register to controller")

        url = self.controller_addr + "/register_worker"
        data = {
            "worker_name": self.worker_addr,
            "check_heart_beat": True,
            "worker_status": self.get_status()
        }
        r = requests.post(url, json=data)
        assert r.status_code == 200

    def send_heart_beat(self):
        """Report the queue length to the controller, retrying every 5 s on
        request errors, and re-register if the controller no longer knows
        this worker."""
        logger.info(f"Send heart beat. Models: {[self.model_name]}. "
                    f"Semaphore: {pretty_print_semaphore(model_semaphore)}. "
                    f"global_counter: {global_counter}")

        url = self.controller_addr + "/receive_heart_beat"

        while True:
            try:
                ret = requests.post(url, json={
                    "worker_name": self.worker_addr,
                    "queue_length": self.get_queue_length()}, timeout=5)
                exist = ret.json()["exist"]
                break
            except requests.exceptions.RequestException as e:
                logger.error(f"heart beat error: {e}")
            time.sleep(5)

        if not exist:
            self.register_to_controller()

    def get_queue_length(self):
        """Return running plus waiting requests, derived from the semaphore
        (0 before the semaphore has been created)."""
        if model_semaphore is None:
            return 0
        else:
            return args.limit_model_concurrency - model_semaphore._value + (len(
                model_semaphore._waiters) if model_semaphore._waiters is not None else 0)

    def get_status(self):
        """Return the status dict sent to the controller."""
        return {
            "model_names": [self.model_name],
            "speed": 1,
            "queue_length": self.get_queue_length(),
        }

    @torch.inference_mode()
    def generate_stream(self, params):
        """Yield NUL-terminated JSON chunks with the text generated so far.

        *params* must contain "prompt"; optional keys read here are "images",
        "temperature", "top_p", "max_new_tokens" and "stop". Every chunk is
        ``{"text": ..., "error_code": ...}`` encoded as bytes plus ``b"\\0"``,
        and the text always starts with the original prompt.

        Raises:
            ValueError: If the number of visual inputs matches neither the
                number of image tokens nor the number of video tokens.
        """
        tokenizer, model, image_processor = self.tokenizer, self.model, self.image_processor

        prompt = params["prompt"]
        ori_prompt = prompt
        images_or_videos = params.get("images", None)
        #print("Input images:", images_or_videos)
        num_image_tokens = 0
        image_args = {}
        # Default for text-only requests, where no branch below assigns it.
        # Previously that case raised NameError at tokenization time.
        # NOTE(review): assumes the tokenizer helper tolerates a prompt
        # without any modal token - confirm.
        modal_token_index = MMODAL_TOKEN_INDEX["IMAGE"]

        if images_or_videos is not None and len(images_or_videos) > 0 and self.is_multimodal:
            num_inputs = len(images_or_videos)
            if num_inputs != prompt.count(DEFAULT_IMAGE_TOKEN) and num_inputs != prompt.count(DEFAULT_VIDEO_TOKEN):
                raise ValueError("Number of images/videos does not match number of <image>/<video> tokens in prompt")

            try:
                print("Load image...")
                images_or_videos = [load_image_from_base64(image) for image in images_or_videos]
                images_or_videos = process_images(images_or_videos, image_processor, model.config)

                modal_list = ["image"]
                replace_token = DEFAULT_IMAGE_TOKEN
                modal_token_index = MMODAL_TOKEN_INDEX["IMAGE"]
            except Exception:
                # Deliberate fallback: anything that cannot be loaded as an
                # image is retried as a video. `Exception` instead of a bare
                # except keeps KeyboardInterrupt/SystemExit propagating.
                print("Load video instead...")
                decord_vr = VideoReader(uri=images_or_videos[0], ctx=cpu(0))
                duration = len(decord_vr)
                if "use_taug" not in self.model_path:
                    # Sample 8 evenly spaced frames.
                    frame_id_list = np.linspace(0, duration-1, 8, dtype=int)
                    video_frames = decord_vr.get_batch(frame_id_list).asnumpy()
                    images_or_videos = process_videos(video_frames, image_processor, model.config)
                else:
                    print("Temporal augmentation activated!!!")
                    # Sample 32 frames, chunk them in groups of 4 and expand
                    # each chunk with frame_expansion().
                    frame_id_list = np.linspace(0, duration-1, 8 * 2 * 2, dtype=int)
                    video_data = decord_vr.get_batch(frame_id_list)
                    video_frames = [Image.fromarray(f) for f in video_data.asnumpy()]
                    chunked_video_frames = chunk_list(video_frames, 2*2)
                    expanded_video_frames = [frame_expansion(frame_list, 2) for frame_list in chunked_video_frames]
                    images_or_videos = process_videos(expanded_video_frames, image_processor, model.config)

                modal_list = ["video"]
                replace_token = DEFAULT_VIDEO_TOKEN
                modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]

            # Move the processed inputs to the model device in half precision.
            if type(images_or_videos) is list:
                images_or_videos = [image.to(self.model.device, dtype=torch.float16) for image in images_or_videos]
            else:
                images_or_videos = images_or_videos.to(self.model.device, dtype=torch.float16)
            # getattr: a list of tensors has no `.shape`; the diagnostic print
            # must not abort the request.
            if modal_list[0] == "video":
                print("Video:", getattr(images_or_videos, "shape", None))
                images_or_videos = [images_or_videos]
            else:
                print("Image:", getattr(images_or_videos, "shape", None))

            if getattr(self.model.config, 'mm_use_im_start_end', False):
                replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
            prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)

            num_image_tokens = prompt.count(replace_token) * model.get_vision_tower().num_patches
            image_args = {"images_or_videos": images_or_videos, "modal_list": modal_list}

        print("image_args:", image_args)
        temperature = float(params.get("temperature", 1.0))
        top_p = float(params.get("top_p", 1.0))
        max_context_length = getattr(model.config, 'max_position_embeddings', 2048)
        max_new_tokens = min(int(params.get("max_new_tokens", 256)), 1024)
        stop_str = params.get("stop", None)
        do_sample = temperature > 0.001

        # tokenizer for our video-llama beta
        input_ids = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt').unsqueeze(0).to(self.device)
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15)

        # Keep prompt + visual tokens + new tokens within the context window.
        max_new_tokens = min(max_new_tokens, max_context_length - input_ids.shape[-1] - num_image_tokens)

        if max_new_tokens < 1:
            yield json.dumps({"text": ori_prompt + "Exceeds max token length. Please start a new conversation, thanks.", "error_code": 0}).encode() + b"\0"
            return

        generate_kwargs = dict(
            inputs=input_ids,
            do_sample=do_sample,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_new_tokens,
            streamer=streamer,
            use_cache=True,
            **image_args
        )
        # A missing or empty stop string must not reach the stopping criteria
        # or the endswith()/slice below (None raises, "" empties the text).
        if stop_str:
            generate_kwargs["stopping_criteria"] = [
                KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)]

        # generate() blocks, so it runs in a thread while this generator
        # drains the streamer.
        thread = Thread(target=model.generate, kwargs=generate_kwargs)
        thread.start()

        generated_text = ori_prompt
        token_count = 0
        for new_text in streamer:
            generated_text += new_text
            token_count += len(tokenizer.encode(new_text))
            # Re-check the whole text roughly every STREAM_CHECK_MULTIPLE tokens.
            if token_count >= STREAM_CHECK_MULTIPLE:
                safety_message = safety_check(generated_text)
                if safety_message:
                    print('####### Keyword alarm triggered:', generated_text)
                    yield json.dumps({"text": safety_message , "error_code": 1}).encode() + b"\0"
                    return
                token_count = 0

            if stop_str and generated_text.endswith(stop_str):
                generated_text = generated_text[:-len(stop_str)]
            yield json.dumps({"text": generated_text, "error_code": 0}).encode() + b"\0"

    def _server_error_chunk(self):
        """Return the encoded chunk reporting a generic server error."""
        ret = {
            "text": server_error_msg,
            "error_code": 1,
        }
        return json.dumps(ret).encode() + b"\0"

    def generate_stream_gate(self, params):
        """Wrap generate_stream(): block prompts that fail the input keyword
        check and turn any exception into a single server-error chunk."""
        try:
            input_text = params.get("prompt", "")
            safety_message = input_safety_check(input_text)
            if safety_message:
                yield json.dumps({"text": safety_message, "error_code": 1}).encode() + b"\0"
                return

            for x in self.generate_stream(params):
                yield x
        except ValueError as e:
            print("Caught ValueError:", e)
            yield self._server_error_chunk()
        except torch.cuda.CudaError as e:
            print("Caught torch.cuda.CudaError:", e)
            yield self._server_error_chunk()
        except Exception as e:
            # Top-level boundary of the stream: report instead of breaking
            # the HTTP response.
            print("Caught Unknown Error", e)
            yield self._server_error_chunk()
333
+
334
+
335
# FastAPI application; the @app.post handlers in this module are registered on it.
app = FastAPI()
336
+
337
+
338
def release_model_semaphore(fn=None):
    """Release one slot of the module-level semaphore, then call *fn* if one
    was given."""
    model_semaphore.release()
    if fn is None:
        return
    fn()
342
+
343
+
344
@app.post("/worker_generate_stream")
async def generate_stream(request: Request):
    """Handle one streaming generation request.

    Takes a semaphore slot, streams the worker's output and releases the slot
    (followed by a heart beat) in a background task attached to the response.
    """
    global model_semaphore, global_counter
    global_counter = global_counter + 1
    params = await request.json()

    # The semaphore is created on the first request.
    if model_semaphore is None:
        model_semaphore = asyncio.Semaphore(args.limit_model_concurrency)
    await model_semaphore.acquire()
    worker.send_heart_beat()

    chunk_stream = worker.generate_stream_gate(params)
    release_task = partial(release_model_semaphore, fn=worker.send_heart_beat)
    background_tasks = BackgroundTasks()
    background_tasks.add_task(release_task)
    return StreamingResponse(chunk_stream, background=background_tasks)
358
+
359
+
360
@app.post("/worker_get_status")
async def get_status(request: Request):
    """Return the worker's current status dict; the request body is ignored."""
    status = worker.get_status()
    return status
363
+
364
+
365
if __name__ == "__main__":
    # Multimodal mode is decided by ModelWorker from the model name, which
    # must contain `videollama2` or `vlb`; the messages below used to name
    # `llava`, which this worker does not look for.
    multi_modal_note = ("Multimodal mode is automatically detected with model name, "
                        "please make sure `videollama2` is included in the model path.")

    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=21002)
    parser.add_argument("--worker-address", type=str, default="http://localhost:21002")
    parser.add_argument("--controller-address", type=str, default="http://localhost:21001")
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--model-name", type=str)
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--multi-modal", action="store_true", help=multi_modal_note)
    parser.add_argument("--limit-model-concurrency", type=int, default=5)
    parser.add_argument("--stream-interval", type=int, default=1)
    parser.add_argument("--no-register", action="store_true")
    parser.add_argument("--load-8bit", action="store_true")
    parser.add_argument("--load-4bit", action="store_true")
    # `args` and `worker` are module globals read by the request handlers.
    args = parser.parse_args()
    logger.info(f"args: {args}")

    if args.multi_modal:
        logger.warning(multi_modal_note)

    worker = ModelWorker(args.controller_address,
                         args.worker_address,
                         worker_id,
                         args.no_register,
                         args.model_path,
                         args.model_base,
                         args.model_name,
                         args.load_8bit,
                         args.load_4bit,
                         args.device)
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
videollama2/serve/register_worker.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Manually register workers.

Usage:
python3 -m videollama2.serve.register_worker --controller-address http://localhost:21001 --worker-name http://localhost:21002
"""

import argparse

import requests

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Required: the URL below is built from it, so a missing value used to
    # fail with a TypeError on `None + str`.
    parser.add_argument("--controller-address", type=str, required=True)
    parser.add_argument("--worker-name", type=str)
    parser.add_argument("--check-heart-beat", action="store_true")
    args = parser.parse_args()

    url = args.controller_address + "/register_worker"
    data = {
        "worker_name": args.worker_name,
        "check_heart_beat": args.check_heart_beat,
        "worker_status": None,
    }
    r = requests.post(url, json=data)
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if r.status_code != 200:
        raise RuntimeError(
            f"Registering worker at {url} failed with HTTP status {r.status_code}")
videollama2/serve/sglang_worker.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A model worker executes the model.
3
+ """
4
+ import argparse
5
+ import asyncio
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ import json
8
+ import time
9
+ import threading
10
+ import uuid
11
+
12
+ from fastapi import FastAPI, Request, BackgroundTasks
13
+ from fastapi.responses import StreamingResponse
14
+ import requests
15
+ import re
16
+ import uvicorn
17
+ from functools import partial
18
+
19
+ from llava.constants import WORKER_HEART_BEAT_INTERVAL
20
+ from llava.utils import (build_logger, server_error_msg,
21
+ pretty_print_semaphore)
22
+ from llava.mm_utils import process_images, load_image_from_base64, tokenizer_image_token, expand2square
23
+ from llava.constants import DEFAULT_IMAGE_TOKEN
24
+
25
+ import sglang as sgl
26
+ from sglang.backend.runtime_endpoint import RuntimeEndpoint
27
+
28
+
29
# Number of bytes in one gibibyte.
GB = 2 ** 30

# Short random identifier for this worker process; also used in the log file name.
worker_id = uuid.uuid4().hex[:6]
logger = build_logger("model_worker", f"model_worker_{worker_id}.log")
# Count of generation requests received so far.
global_counter = 0

# Created lazily by the first generation request.
model_semaphore = None
36
+
37
+
38
def heart_beat_worker(controller):
    """Call ``controller.send_heart_beat()`` forever, sleeping
    WORKER_HEART_BEAT_INTERVAL seconds before each call. Never returns."""
    keep_running = True
    while keep_running:
        time.sleep(WORKER_HEART_BEAT_INTERVAL)
        controller.send_heart_beat()
42
+
43
+
44
@sgl.function
def pipeline(s, prompt, max_tokens):
    """Append every prompt part to the SGLang state *s* (strings as text,
    anything else wrapped in ``sgl.image``), then request a generation named
    "response" limited to *max_tokens* tokens."""
    for part in prompt:
        is_text = type(part) is str
        if not is_text:
            s += sgl.image(part)
            continue
        s += part
    s += sgl.gen("response", max_tokens=max_tokens)
52
+
53
+
54
class ModelWorker:
    """Worker that forwards generation requests to an SGLang runtime endpoint
    and keeps itself registered with the controller."""

    def __init__(self, controller_addr, worker_addr, sgl_endpoint,
                 worker_id, no_register, model_name):
        """Connect to *sgl_endpoint*, resolve the model name and, unless
        *no_register* is set, register and start the heart-beat thread."""
        self.controller_addr = controller_addr
        self.worker_addr = worker_addr
        self.worker_id = worker_id

        # Select backend
        backend = RuntimeEndpoint(sgl_endpoint)
        sgl.set_default_backend(backend)
        model_path = backend.model_info["model_path"]

        if model_path.endswith("/"):
            model_path = model_path[:-1]
        if model_name is not None:
            self.model_name = model_name
        else:
            # Derive the name from the path; checkpoint directories are
            # prefixed with their parent directory name.
            path_parts = model_path.split("/")
            last_part = path_parts[-1]
            if last_part.startswith('checkpoint-'):
                self.model_name = path_parts[-2] + "_" + last_part
            else:
                self.model_name = last_part

        logger.info(f"Loading the SGLANG model {self.model_name} on worker {worker_id} ...")

        if no_register:
            return
        self.register_to_controller()
        self.heart_beat_thread = threading.Thread(
            target=heart_beat_worker, args=(self,), daemon=True)
        self.heart_beat_thread.start()

    def register_to_controller(self):
        """POST this worker's address and status to the controller."""
        logger.info("Register to controller")

        payload = {
            "worker_name": self.worker_addr,
            "check_heart_beat": True,
            "worker_status": self.get_status()
        }
        r = requests.post(self.controller_addr + "/register_worker", json=payload)
        assert r.status_code == 200

    def send_heart_beat(self):
        """Report the queue length to the controller, retrying every 5 s on
        request errors, and re-register if the controller no longer lists
        this worker."""
        logger.info(f"Send heart beat. Models: {[self.model_name]}. "
                    f"Semaphore: {pretty_print_semaphore(model_semaphore)}. "
                    f"global_counter: {global_counter}")

        url = self.controller_addr + "/receive_heart_beat"

        while True:
            try:
                beat = {
                    "worker_name": self.worker_addr,
                    "queue_length": self.get_queue_length()}
                ret = requests.post(url, json=beat, timeout=5)
                exist = ret.json()["exist"]
                break
            except requests.exceptions.RequestException as e:
                logger.error(f"heart beat error: {e}")
            time.sleep(5)

        if not exist:
            self.register_to_controller()

    def get_queue_length(self):
        """Return running plus waiting requests, derived from the semaphore
        (0 before the semaphore has been created)."""
        if model_semaphore is None:
            return 0
        in_flight = args.limit_model_concurrency - model_semaphore._value
        if model_semaphore._waiters is not None:
            return in_flight + len(model_semaphore._waiters)
        return in_flight + 0

    def get_status(self):
        """Return the status dict sent to the controller."""
        status = {
            "model_names": [self.model_name],
            "speed": 1,
            "queue_length": self.get_queue_length(),
        }
        return status

    async def generate_stream(self, params):
        """Yield NUL-terminated JSON chunks with the text generated so far.

        The prompt is split at the image tokens and interleaved with the
        decoded images before it is handed to ``pipeline``.

        Raises:
            ValueError: If the number of images differs from the number of
                image tokens in the prompt.
        """
        ori_prompt = params["prompt"]
        prompt = ori_prompt
        images = params.get("images", None)
        if images is None or len(images) == 0:
            prompt = [prompt]
        else:
            if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN):
                raise ValueError("Number of images does not match number of <image> tokens in prompt")

            images = [load_image_from_base64(image) for image in images]

            # FIXME: for image-start/end token
            # replace_token = DEFAULT_IMAGE_TOKEN
            # if getattr(self.model.config, 'mm_use_im_start_end', False):
            #     replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
            # prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
            prompt = prompt.replace(' ' + DEFAULT_IMAGE_TOKEN + '\n', DEFAULT_IMAGE_TOKEN)
            segments = prompt.split(DEFAULT_IMAGE_TOKEN)
            prompt = []
            # Interleave: text segment, image, text segment, image, ...
            for idx, segment in enumerate(segments):
                prompt.append(segment)
                if idx < len(images):
                    prompt.append(images[idx])

        temperature = float(params.get("temperature", 1.0))
        top_p = float(params.get("top_p", 1.0))
        # max_context_length = getattr(model.config, 'max_position_embeddings', 2048)
        requested_tokens = int(params.get("max_new_tokens", 256))
        max_new_tokens = min(requested_tokens, 1024)
        # NOTE: the stop string is read here but not passed to pipeline.run().
        stop_str = params.get("stop", None)
        stop_str = None if stop_str is None else [stop_str]

        print({'prompt': prompt, 'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_p': top_p})
        state = pipeline.run(prompt, max_new_tokens, temperature=temperature, top_p=top_p, stream=True)

        generated_text = ori_prompt
        async for text_outputs in state.text_async_iter(var_name="response"):
            generated_text += text_outputs
            chunk = {"text": generated_text, "error_code": 0}
            yield json.dumps(chunk).encode() + b"\0"

    async def generate_stream_gate(self, params):
        """Wrap generate_stream() and turn any exception into a single
        server-error chunk."""
        try:
            async for x in self.generate_stream(params):
                yield x
        except ValueError as e:
            print("Caught ValueError:", e)
            error_chunk = {
                "text": server_error_msg,
                "error_code": 1,
            }
            yield json.dumps(error_chunk).encode() + b"\0"
        except Exception as e:
            print("Caught Unknown Error", e)
            error_chunk = {
                "text": server_error_msg,
                "error_code": 1,
            }
            yield json.dumps(error_chunk).encode() + b"\0"
190
+
191
+
192
# FastAPI application; the @app.post handlers in this module are registered on it.
app = FastAPI()
193
+
194
+
195
def release_model_semaphore(fn=None):
    """Release one slot of the module-level semaphore, then call *fn* if one
    was given."""
    model_semaphore.release()
    if fn is None:
        return
    fn()
199
+
200
+
201
@app.post("/worker_generate_stream")
async def generate_stream(request: Request):
    """Handle one streaming generation request.

    Takes a semaphore slot, streams the worker's output and releases the slot
    (followed by a heart beat) in a background task attached to the response.
    """
    global model_semaphore, global_counter
    global_counter = global_counter + 1
    params = await request.json()

    # The semaphore is created on the first request.
    if model_semaphore is None:
        model_semaphore = asyncio.Semaphore(args.limit_model_concurrency)
    await model_semaphore.acquire()
    worker.send_heart_beat()

    chunk_stream = worker.generate_stream_gate(params)
    release_task = partial(release_model_semaphore, fn=worker.send_heart_beat)
    background_tasks = BackgroundTasks()
    background_tasks.add_task(release_task)
    return StreamingResponse(chunk_stream, background=background_tasks)
215
+
216
+
217
@app.post("/worker_get_status")
async def get_status(request: Request):
    """Return the worker's current status dict; the request body is ignored."""
    status = worker.get_status()
    return status
220
+
221
+
222
if __name__ == "__main__":
    # Command-line options of the SGLang worker.
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", default="localhost", type=str)
    parser.add_argument("--port", default=21002, type=int)
    parser.add_argument("--worker-address", default="http://localhost:21002", type=str)
    parser.add_argument("--controller-address", default="http://localhost:21001", type=str)
    parser.add_argument("--model-name", type=str)
    parser.add_argument("--sgl-endpoint", type=str)
    parser.add_argument("--limit-model-concurrency", default=5, type=int)
    parser.add_argument("--stream-interval", default=1, type=int)
    parser.add_argument("--no-register", action="store_true")
    # `args` and `worker` are module globals read by the request handlers.
    args = parser.parse_args()
    logger.info(f"args: {args}")

    worker = ModelWorker(
        args.controller_address,
        args.worker_address,
        args.sgl_endpoint,
        worker_id,
        args.no_register,
        args.model_name,
    )
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
videollama2/serve/test_message.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+
4
+ import requests
5
+
6
+ from llava.conversation import default_conversation
7
+
8
+
9
def main():
    """Send one test message to a model worker and print the streamed reply.

    The worker address comes from ``args.worker_address`` when set; otherwise
    the controller is asked to refresh its workers, list its models and
    resolve the worker serving ``args.model_name``. Returns early when the
    resolved address is empty.
    """
    if args.worker_address:
        worker_addr = args.worker_address
    else:
        controller_addr = args.controller_address
        requests.post(controller_addr + "/refresh_all_workers")
        listing = requests.post(controller_addr + "/list_models")
        models = listing.json()["models"]
        models.sort()
        print(f"Models: {models}")

        lookup = requests.post(
            controller_addr + "/get_worker_address",
            json={"model": args.model_name},
        )
        worker_addr = lookup.json()["address"]
        print(f"worker_addr: {worker_addr}")

    if worker_addr == "":
        return

    # Build the prompt from the default conversation template.
    conv = default_conversation.copy()
    conv.append_message(conv.roles[0], args.message)
    prompt = conv.get_prompt()

    request_body = {
        "model": args.model_name,
        "prompt": prompt,
        "max_new_tokens": args.max_new_tokens,
        "temperature": 0.7,
        "stop": conv.sep,
    }
    response = requests.post(
        worker_addr + "/worker_generate_stream",
        headers={"User-Agent": "LLaVA Client"},
        json=request_body,
        stream=True,
    )

    print(prompt.replace(conv.sep, "\n"), end="")
    # The stream is split on NUL bytes; each non-empty chunk is decoded as JSON.
    for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"):
        if not chunk:
            continue
        data = json.loads(chunk.decode("utf-8"))
        output = data["text"].split(conv.sep)[-1]
        print(output, end="\r")
    print("")
50
+
51
+
52
if __name__ == "__main__":
    # `args` is intentionally a module-level global: main() reads it directly.
    parser = argparse.ArgumentParser()
    for flag, options in (
        ("--controller-address", dict(type=str, default="http://localhost:21001")),
        ("--worker-address", dict(type=str)),
        ("--model-name", dict(type=str, default="facebook/opt-350m")),
        ("--max-new-tokens", dict(type=int, default=32)),
        ("--message", dict(type=str, default="Tell me a story with more than 1000 words.")),
    ):
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    main()
videollama2/train.py ADDED
@@ -0,0 +1,963 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
3
+ # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
4
+ # Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ import os
19
+ import sys
20
+ import copy
21
+ import json
22
+ import random
23
+ import logging
24
+ import pathlib
25
+ from dataclasses import dataclass, field
26
+ from typing import Dict, Optional, Sequence, List
27
+
28
+ # torch-related packages
29
+ import torch
30
+ from torch.utils.data import Dataset
31
+ from torchvision.transforms import Compose, Lambda, ToTensor
32
+ from pytorchvideo.data.encoded_video import EncodedVideo
33
+ from pytorchvideo.transforms import ApplyTransformToKey, ShortSideScale, UniformTemporalSubsample
34
+
35
+ import cv2
36
+ import decord
37
+ import imageio
38
+ import traceback
39
+ import numpy as np
40
+ import transformers
41
+ from PIL import Image
42
+ from decord import VideoReader, cpu
43
+ from moviepy.editor import VideoFileClip
44
+ from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
45
+
46
+ sys.path.append('./')
47
+ from videollama2 import conversation as conversation_lib
48
+ from videollama2.constants import NUM_FRAMES, IGNORE_INDEX, MMODAL_TOKEN_INDEX, DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN
49
+ from videollama2.videollama2_trainer import VideoLLaMA2Trainer
50
+ from videollama2.model import *
51
+ from videollama2.mm_utils import tokenizer_MMODAL_token, tokenizer_image_token, expand2square, process_video, process_image
52
+
53
+ local_rank = None
54
+
55
+
56
def rank0_print(*args):
    """Print ``args`` only when the module-level ``local_rank`` equals 0."""
    if local_rank != 0:
        return
    print(*args)
59
+
60
+
61
def set_seed(seed=42):
    """
    Set the random seed for reproducible results.

    Seeds Python's ``random`` module, NumPy's global generator and torch
    (CPU and all CUDA devices), and switches cuDNN to deterministic mode with
    autotuning disabled.

    :param seed: An integer value to be used as the random seed.
    """
    # Python's `random` is used elsewhere in this file (backup sample index),
    # so it has to be seeded as well for runs to be repeatable.
    random.seed(seed)
    # NumPy only accepts seeds in [0, 2**32); wrap so any int torch accepts still works.
    np.random.seed(seed % (2 ** 32))
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # for multi-GPU setups
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
72
+
73
+
74
@dataclass
class ModelArguments:
    """Model-related options: language model, connector (projector) and vision tower."""

    # LLM Arguments
    # Model name or path of the base language model.
    model_name_or_path: Optional[str] = field(default="lmsys/vicuna-7b-v1.5")
    version: Optional[str] = field(default="v1", metadata={"help": "Version of the conversation template."})
    freeze_backbone: bool = field(default=False, metadata={"help": "Whether to freeze the LLM backbone."})
    # Connector Arguments
    mm_projector_type: Optional[str] = field(default='linear')
    # NOTE(review): presumably restricts training to the projector — confirm in train().
    tune_mm_mlp_adapter: bool = field(default=False)
    # NOTE(review): presumably a path to pretrained projector weights — confirm against the loader.
    pretrain_mm_mlp_adapter: Optional[str] = field(default=None)
    # Vision tower Arguments
    vision_tower: Optional[str] = field(default=None)
    mm_vision_select_layer: Optional[int] = field(default=-1)
    mm_vision_select_feature: Optional[str] = field(default="patch")
    # Other Arguments
    mm_use_im_start_end: bool = field(default=False)
    mm_use_im_patch_token: bool = field(default=True)
91
+
92
+
93
@dataclass
class DataArguments:
    """Data-related options: where the training data lives and how it is loaded."""

    # Path Arguments
    data_path: str = field(default=None, metadata={"help": "Path to the training data."})
    # image_folder: Optional[str] = field(default=None)
    # video_folder: Optional[str] = field(default=None)
    # Root folder joined with each sample's 'image' / 'video' entry when loading media.
    data_folder: Optional[str] = field(default=None)
    # Loading Arguments
    # When False, preprocess_multimodal returns the sources untouched.
    is_multimodal: bool = False
    lazy_preprocess: bool = False
    # Frames per video; the dataset falls back to NUM_FRAMES when this is None.
    num_frames: Optional[int] = field(default=None)
    # Preprocess Arguments
    # Passed through to process_image / process_video.
    image_aspect_ratio: str = 'square'
106
+
107
+
108
@dataclass
class TrainingArguments(transformers.TrainingArguments):
    """Training options added on top of ``transformers.TrainingArguments``."""

    optim: str = field(default="adamw_torch")
    # NOTE(review): presumably a separate learning rate for the projector — confirm in the trainer.
    mm_projector_lr: Optional[float] = None
    freeze_mm_mlp_adapter: bool = field(default=False)
    remove_unused_columns: bool = field(default=False)
    cache_dir: Optional[str] = field(default=None)
    # Training Data Arguments
    group_by_modality_length: bool = field(default=False)
    model_max_length: int = field(
        default=512,
        metadata={
            "help":
            "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    # Lora or Quant Arguments
    double_quant: bool = field(
        default=True,
        metadata={"help": "Compress the quantization statistics through double quantization."}
    )
    quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
    )
    # train() switches to bitsandbytes loading when this is 4 or 8.
    bits: int = field(
        default=16,
        metadata={"help": "How many bits to use."}
    )
    lora_enable: bool = False
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_weight_path: str = ""
    # One of "none", "all" or "lora_only" (the values get_peft_state_maybe_zero_3 accepts).
    lora_bias: str = "none"
143
+
144
+
145
def maybe_zero_3(param, ignore_status=False, name=None):
    """Return a detached CPU clone of ``param``, gathering it first if DeepSpeed manages it.

    :param param: Parameter/tensor to copy; treated as DeepSpeed-managed when it has ``ds_id``.
    :param ignore_status: When False, log a warning if the parameter status is NOT_AVAILABLE.
    :param name: Name used in the warning message.
    :return: Detached clone of the parameter data on CPU.
    """
    from deepspeed import zero
    from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus

    # Plain tensors need no gathering.
    if not hasattr(param, "ds_id"):
        return param.detach().cpu().clone()

    if param.ds_status == ZeroParamStatus.NOT_AVAILABLE and not ignore_status:
        logging.warning(f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}")
    with zero.GatheredParameters([param]):
        gathered = param.data.detach().cpu().clone()
    return gathered
157
+
158
+
159
+ # Borrowed from peft.utils.get_peft_model_state_dict
160
def get_peft_state_maybe_zero_3(named_params, bias):
    """Collect LoRA weights (and optionally biases) as CPU copies.

    :param named_params: Iterable of ``(name, parameter)`` pairs.
    :param bias: Which biases to include: ``"none"`` (LoRA weights only),
        ``"all"`` (every parameter whose name contains ``"bias"``) or
        ``"lora_only"`` (only biases belonging to modules that have LoRA weights).
    :return: Dict mapping names to the result of ``maybe_zero_3`` for each selected parameter.
    :raises NotImplementedError: For any other ``bias`` value.
    """
    if bias == "none":
        to_return = {k: t for k, t in named_params if "lora_" in k}
    elif bias == "all":
        to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
    elif bias == "lora_only":
        to_return = {}
        maybe_lora_bias = {}
        lora_bias_names = set()
        for k, t in named_params:
            if "lora_" in k:
                to_return[k] = t
                # Name of the bias that sits next to this LoRA weight.
                lora_bias_names.add(k.split("lora_")[0] + "bias")
            elif "bias" in k:
                maybe_lora_bias[k] = t
        # Iterate items (not bare keys) and test each candidate bias by its own
        # name, so only biases of LoRA-adapted modules are kept.
        for k, t in maybe_lora_bias.items():
            if k in lora_bias_names:
                to_return[k] = t
    else:
        raise NotImplementedError
    to_return = {k: maybe_zero_3(v, ignore_status=True) for k, v in to_return.items()}
    return to_return
183
+
184
+
185
def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True):
    """Collect every non-LoRA parameter as a CPU copy.

    :param named_params: Iterable of ``(name, parameter)`` pairs.
    :param require_grad_only: When True, keep only parameters with ``requires_grad`` set.
    :return: Dict mapping names to CPU copies produced via ``maybe_zero_3``.
    """
    non_lora = dict((name, tensor) for name, tensor in named_params if "lora_" not in name)
    if require_grad_only:
        non_lora = {name: tensor for name, tensor in non_lora.items() if tensor.requires_grad}

    collected = {}
    for name, tensor in non_lora.items():
        collected[name] = maybe_zero_3(tensor, ignore_status=True).cpu()
    return collected
191
+
192
+
193
def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
    """Collect parameters whose name contains any of ``keys_to_match`` as CPU copies.

    :param named_params: Iterable of ``(name, parameter)`` pairs.
    :param keys_to_match: Substrings; a parameter is kept when its name contains at least one.
    :return: Dict mapping names to CPU copies produced via ``maybe_zero_3``.
    """
    matched = {}
    for name, tensor in named_params:
        if any(fragment in name for fragment in keys_to_match):
            matched[name] = tensor

    collected = {}
    for name, tensor in matched.items():
        collected[name] = maybe_zero_3(tensor, ignore_status=True).cpu()
    return collected
197
+
198
+
199
def find_all_linear_names(model):
    """List the leaf names of all ``torch.nn.Linear`` modules outside the multimodal parts.

    Modules whose qualified name contains ``mm_projector``, ``vision_tower`` or
    ``vision_resampler`` are skipped, and ``lm_head`` is dropped from the result.

    :param model: Module whose ``named_modules()`` are scanned.
    :return: List of unique leaf module names (order not guaranteed).
    """
    skipped_keywords = ('mm_projector', 'vision_tower', 'vision_resampler')
    linear_names = {
        qualified_name.rsplit('.', 1)[-1]
        for qualified_name, module in model.named_modules()
        if not any(keyword in qualified_name for keyword in skipped_keywords)
        and isinstance(module, torch.nn.Linear)
    }
    # needed for 16-bit
    linear_names.discard('lm_head')
    return list(linear_names)
213
+
214
+
215
def safe_save_model_for_hf_trainer(trainer: transformers.Trainer,
                                   output_dir: str):
    """Collects the state dict and dump to disk.

    Three paths, tried in order:
    1. ``tune_mm_mlp_adapter`` set on the trainer args: save only the adapter
       weights (plus the model config) and return.
    2. DeepSpeed active: synchronize CUDA and delegate to ``trainer.save_model``.
    3. Otherwise: copy the full state dict to CPU and save it via ``trainer._save``
       when ``trainer.args.should_save`` is true.

    :param trainer: Trainer holding the model and its arguments.
    :param output_dir: Directory to write into.
    """

    if getattr(trainer.args, "tune_mm_mlp_adapter", False):
        # Only save Adapter
        keys_to_match = ['mm_projector']
        if getattr(trainer.args, "use_im_start_end", False):
            keys_to_match.extend(['embed_tokens', 'embed_in'])

        # Collected before the rank check below, i.e. on every process.
        weight_to_save = get_mm_adapter_state_maybe_zero_3(trainer.model.named_parameters(), keys_to_match)
        trainer.model.config.save_pretrained(output_dir)

        current_folder = output_dir.split('/')[-1]
        parent_folder = os.path.dirname(output_dir)
        # Only local rank 0 (or -1) writes the weights.
        if trainer.args.local_rank == 0 or trainer.args.local_rank == -1:
            if current_folder.startswith('checkpoint-'):
                # Intermediate checkpoints go to <parent>/mm_projector/<checkpoint-name>.bin
                mm_projector_folder = os.path.join(parent_folder, "mm_projector")
                os.makedirs(mm_projector_folder, exist_ok=True)
                torch.save(weight_to_save, os.path.join(mm_projector_folder, f'{current_folder}.bin'))
            else:
                torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin'))
        return

    if trainer.deepspeed:
        torch.cuda.synchronize()
        trainer.save_model(output_dir)
        return

    state_dict = trainer.model.state_dict()
    if trainer.args.should_save:
        cpu_state_dict = {
            key: value.cpu()
            for key, value in state_dict.items()
        }
        del state_dict
        trainer._save(output_dir, state_dict=cpu_state_dict)  # noqa
252
+
253
+
254
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Add special tokens, resize the embeddings, and initialise the new rows.

    New input and output embedding rows are set to the mean of the rows that
    existed before the resize.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    added = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))
    if added <= 0:
        return

    in_weights = model.get_input_embeddings().weight.data
    out_weights = model.get_output_embeddings().weight.data

    # Both means are computed before either matrix is written to.
    in_mean = in_weights[:-added].mean(dim=0, keepdim=True)
    out_mean = out_weights[:-added].mean(dim=0, keepdim=True)

    in_weights[-added:] = in_mean
    out_weights[-added:] = out_mean
277
+
278
+
279
def _tokenize_fn(strings: Sequence[str],
                 tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize each string separately and report ids plus non-pad lengths.

    :return: Dict with ``input_ids`` / ``labels`` (the same list of id tensors)
        and ``input_ids_lens`` / ``labels_lens`` (the same list of non-pad token counts).
    """
    encodings = []
    for text in strings:
        encodings.append(
            tokenizer(
                text,
                return_tensors="pt",
                padding="longest",
                max_length=tokenizer.model_max_length,
                truncation=True,
            )
        )

    token_ids = [encoding.input_ids[0] for encoding in encodings]
    token_counts = [
        encoding.input_ids.ne(tokenizer.pad_token_id).sum().item()
        for encoding in encodings
    ]
    # Labels deliberately alias the ids; callers copy before masking.
    return {
        "input_ids": token_ids,
        "labels": token_ids,
        "input_ids_lens": token_counts,
        "labels_lens": token_counts,
    }
304
+
305
+
306
def _mask_targets(target, tokenized_lens, speakers):
    """Mask the header and the human turns of ``target`` in place with ``IGNORE_INDEX``.

    :param target: Label tensor to modify.
    :param tokenized_lens: Token counts; the first entry is the header, the rest one per turn.
    :param speakers: Speaker of each turn, aligned with ``tokenized_lens[1:]``.
    """
    offset = tokenized_lens[0]
    target[:offset] = IGNORE_INDEX
    for turn_len, speaker in zip(tokenized_lens[1:], speakers):
        if speaker == "human":
            # The first two positions of a human turn are left unmasked.
            target[offset + 2:offset + turn_len] = IGNORE_INDEX
        offset += turn_len
315
+
316
+
317
def _add_speaker_and_signal(header, source, get_conversation=True):
    """Add speaker and start/end signal on each round.

    Each sentence's ``"value"`` is rewritten in place as ``"### <role>: <text>\\n"``.

    :param header: Text the conversation string starts with.
    :param source: List of sentence dicts with ``"from"`` and ``"value"`` keys.
    :param get_conversation: When True, the rewritten sentences are appended to the result.
    :return: The header, optionally followed by the rewritten sentences, ending with ``"### "``.
    """
    BEGIN_SIGNAL = "### "
    END_SIGNAL = "\n"
    pieces = [header]
    for sentence in source:
        speaker = sentence["from"].lower()
        if speaker == "human":
            role = conversation_lib.default_conversation.roles[0]
        elif speaker == "gpt":
            role = conversation_lib.default_conversation.roles[1]
        else:
            role = 'unknown'
        sentence["value"] = BEGIN_SIGNAL + role + ": " + sentence["value"] + END_SIGNAL
        if get_conversation:
            pieces.append(sentence["value"])
    pieces.append(BEGIN_SIGNAL)

    conversation = pieces[0]
    for piece in pieces[1:]:
        conversation += piece
    return conversation
336
+
337
+
338
def preprocess_multimodal(sources: Sequence[str], data_args: DataArguments) -> Dict:
    """Normalise modality placeholder tokens inside conversation sentences.

    For every sentence and every token in ``DEFAULT_MMODAL_TOKEN``: if the token
    occurs, it is removed and re-inserted once at the start of the sentence
    followed by a newline. Under an "mmtag" conversation version the token is
    wrapped in ``<Modal>...</Modal>`` tags. When ``data_args.mm_use_im_start_end``
    is set, the token is additionally wrapped in that modality's start and end tokens.

    The sentences are modified in place.

    :param sources: List of conversations, each a list of sentence dicts with a ``'value'`` key.
    :param data_args: Provides ``is_multimodal`` and ``mm_use_im_start_end``.
    :return: ``sources`` itself (untouched when ``is_multimodal`` is false).
    """
    is_multimodal = data_args.is_multimodal
    if not is_multimodal:
        return sources

    for source in sources:
        for sentence in source:
            # NOTE: scan token of each modal and move them to the beginning of the sentence.
            for DEFAULT_TOKEN in DEFAULT_MMODAL_TOKEN.values():
                MODAL_TYPE = None
                if DEFAULT_TOKEN in sentence['value']:
                    # '<image>' -> 'image'
                    MODAL_TYPE = DEFAULT_TOKEN[1:-1]
                    sentence['value'] = sentence['value'].replace(DEFAULT_TOKEN, '').strip()
                    sentence['value'] = DEFAULT_TOKEN + '\n' + sentence['value']
                    sentence['value'] = sentence['value'].strip()
                    if "mmtag" in conversation_lib.default_conversation.version:
                        sentence['value'] = sentence['value'].replace(DEFAULT_TOKEN, f'<{MODAL_TYPE.capitalize()}>' + DEFAULT_TOKEN + f'</{MODAL_TYPE.capitalize()}>')
                replace_token = DEFAULT_TOKEN
                if data_args.mm_use_im_start_end and MODAL_TYPE is not None:
                    # Close with the END token; using START on both sides left the span unterminated.
                    replace_token = DEFAULT_MMODAL_START_TOKEN[MODAL_TYPE.upper()] + replace_token + DEFAULT_MMODAL_END_TOKEN[MODAL_TYPE.upper()]
                sentence["value"] = sentence["value"].replace(DEFAULT_TOKEN, replace_token)

    return sources
361
+
362
+
363
def preprocess_llama_2(
    sources,
    tokenizer: transformers.PreTrainedTokenizer,
    MODAL_list = [],
) -> Dict:
    """Build input ids and masked labels for LLAMA_2-style conversations.

    Each conversation is rendered with the default conversation template and
    tokenized. The labels start as a copy of the ids, and every instruction
    part (up to and including ``"[/INST] "``) is overwritten with ``IGNORE_INDEX``.

    :param sources: List of conversations, each a list of dicts with ``"from"`` / ``"value"``.
    :param tokenizer: Tokenizer; its ``pad_token_id`` and ``model_max_length`` are read.
    :param MODAL_list: Modality key per conversation (index into ``MMODAL_TOKEN_INDEX``);
        when empty, plain text tokenization is used.
    :return: Dict with ``input_ids`` and ``labels``.
    """
    conv = conversation_lib.default_conversation.copy()
    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}

    # Apply prompt templates
    conversations = []
    for i, source in enumerate(sources):
        if roles[source[0]["from"]] != conv.roles[0]:
            # Skip the first one if it is not from human
            source = source[1:]

        conv.messages = []
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            # Turns must strictly alternate, starting with roles[0].
            assert role == conv.roles[j % 2], f"{i}"
            conv.append_message(role, sentence["value"])
        conversations.append(conv.get_prompt())

    # Tokenize conversations
    if len(MODAL_list) > 0:
        # input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
        input_ids = torch.stack([tokenizer_MMODAL_token(prompt, tokenizer, MMODAL_TOKEN_INDEX[MODAL_list[i]], return_tensors='pt') for i, prompt in enumerate(conversations)], dim=0)
    else:
        input_ids = tokenizer(
            conversations,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        ).input_ids

    targets = input_ids.clone()

    assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_2

    # Mask targets
    sep = "[/INST] "
    for idx, (conversation, target) in enumerate(zip(conversations, targets)):
        # Number of non-padding tokens, used for the consistency check below.
        total_len = int(target.ne(tokenizer.pad_token_id).sum())

        rounds = conversation.split(conv.sep2)
        # Position 0 is always masked (presumably the BOS token — TODO confirm).
        cur_len = 1
        target[:cur_len] = IGNORE_INDEX
        for i, rou in enumerate(rounds):
            if rou == "":
                break

            # A well-formed round splits into exactly [instruction, answer].
            parts = rou.split(sep)
            if len(parts) != 2:
                break
            parts[0] += sep

            if len(MODAL_list) > 0:
                # round_len = len(tokenizer_image_token(rou, tokenizer))
                # instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2
                round_len = len(tokenizer_MMODAL_token(rou, tokenizer, MMODAL_TOKEN_INDEX[MODAL_list[idx]]))
                # NOTE(review): the "- 2" offset presumably compensates for tokens the
                # tokenizer adds around the instruction — confirm against the tokenizer used.
                instruction_len = len(tokenizer_MMODAL_token(parts[0], tokenizer, MMODAL_TOKEN_INDEX[MODAL_list[idx]])) - 2
            else:
                round_len = len(tokenizer(rou).input_ids)
                instruction_len = len(tokenizer(parts[0]).input_ids) - 2

            target[cur_len : cur_len + instruction_len] = IGNORE_INDEX

            cur_len += round_len
        # Everything after the last processed round is masked.
        target[cur_len:] = IGNORE_INDEX

        if cur_len < tokenizer.model_max_length:
            if cur_len != total_len:
                # Offsets disagree with the tokenized length: drop the whole sample from the loss.
                target[:] = IGNORE_INDEX
                print(
                    f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
                    f" (ignored)"
                )

    return dict(
        input_ids=input_ids,
        labels=targets,
    )
445
+
446
+
447
def preprocess_v1(
    sources,
    tokenizer: transformers.PreTrainedTokenizer,
    MODAL_list = [],
) -> Dict:
    """Build input ids and masked labels for "v1" (SeparatorStyle.TWO) conversations.

    Each conversation is rendered with the default conversation template and
    tokenized. The labels start as a copy of the ids, and every instruction
    part (up to and including ``conv.sep + roles[1] + ": "``) is overwritten
    with ``IGNORE_INDEX``.

    :param sources: List of conversations, each a list of dicts with ``"from"`` / ``"value"``.
    :param tokenizer: Tokenizer; its ``pad_token_id`` and ``model_max_length`` are read.
    :param MODAL_list: Modality key per conversation (index into ``MMODAL_TOKEN_INDEX``);
        must have the same length as ``sources``.
    :return: Dict with ``input_ids`` and ``labels``.
    """
    conv = conversation_lib.default_conversation.copy()
    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}

    # One modality entry per conversation is required, so the text-only
    # tokenization branch below is only reachable when both are empty.
    assert len(sources) == len(MODAL_list)
    # Apply prompt templates
    conversations = []
    for i, source in enumerate(sources):
        if roles[source[0]["from"]] != conv.roles[0]:
            # Skip the first one if it is not from human
            source = source[1:]

        conv.messages = []
        # source is the conversations in the input data
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            # Turns must strictly alternate, starting with roles[0].
            assert role == conv.roles[j % 2], f"{i}"
            conv.append_message(role, sentence["value"])
        conversations.append(conv.get_prompt())

    # Tokenize conversations
    if len(MODAL_list) > 0:
        # input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
        input_ids = torch.stack([tokenizer_MMODAL_token(prompt, tokenizer, MMODAL_TOKEN_INDEX[MODAL_list[i]], return_tensors='pt') for i, prompt in enumerate(conversations)], dim=0)
    else:
        input_ids = tokenizer(
            conversations,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        ).input_ids

    targets = input_ids.clone()

    assert conv.sep_style == conversation_lib.SeparatorStyle.TWO

    # Mask targets
    sep = conv.sep + conv.roles[1] + ": "
    #for conversation, target in zip(conversations, targets):
    for idx, (conversation, target) in enumerate(zip(conversations, targets)):
        # Number of non-padding tokens, used for the consistency check below.
        total_len = int(target.ne(tokenizer.pad_token_id).sum())

        rounds = conversation.split(conv.sep2)
        # Position 0 is always masked (presumably the BOS token — TODO confirm).
        cur_len = 1
        target[:cur_len] = IGNORE_INDEX
        for i, rou in enumerate(rounds):
            if rou == "":
                break

            # A well-formed round splits into exactly [instruction, answer].
            parts = rou.split(sep)
            if len(parts) != 2:
                break
            parts[0] += sep

            if len(MODAL_list) > 0:
                # round_len = len(tokenizer_image_token(rou, tokenizer))
                # instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2
                # fix the issue of tokenization mismatch
                round_len = len(tokenizer_MMODAL_token(rou, tokenizer, MMODAL_TOKEN_INDEX[MODAL_list[idx]]))
                # NOTE(review): the "- 2" offset presumably compensates for tokens the
                # tokenizer adds around the instruction — confirm against the tokenizer used.
                instruction_len = len(tokenizer_MMODAL_token(parts[0], tokenizer, MMODAL_TOKEN_INDEX[MODAL_list[idx]])) - 2
            else:
                round_len = len(tokenizer(rou).input_ids)
                instruction_len = len(tokenizer(parts[0]).input_ids) - 2

            target[cur_len : cur_len + instruction_len] = IGNORE_INDEX

            cur_len += round_len
        # Everything after the last processed round is masked.
        target[cur_len:] = IGNORE_INDEX

        if cur_len < tokenizer.model_max_length:
            if cur_len != total_len:
                # Offsets disagree with the tokenized length: drop the whole sample from the loss.
                target[:] = IGNORE_INDEX
                print(
                    f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
                    f" (ignored)"
                )

    return dict(
        input_ids=input_ids,
        labels=targets,
    )
533
+
534
+
535
def preprocess_plain(
    sources: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
    MODAL_list=[]
) -> Dict:
    """Build ids and labels for PLAIN-style (modal token + caption) samples.

    Each source must hold exactly two sentences. The first sentence's value is
    overwritten in place with the modality token of ``MODAL_list[0]``. The
    prompt is that token, the second sentence and the conversation separator.
    In the labels, the tokens of the first sentence are replaced by ``IGNORE_INDEX``.

    :return: Dict with ``input_ids`` and ``labels`` (lists of tensors).
    """
    modal_token = DEFAULT_MMODAL_TOKEN[MODAL_list[0]]

    # add end signal and concatenate together
    prompts = []
    for pair in sources:
        assert len(pair) == 2
        pair[0]['value'] = modal_token
        prompts.append(pair[0]['value'] + pair[1]['value'] + conversation_lib.default_conversation.sep)

    # tokenize conversations
    input_ids = []
    for prompt in prompts:
        input_ids.append(tokenizer_MMODAL_token(prompt, tokenizer, MMODAL_TOKEN_INDEX[MODAL_list[0]], return_tensors='pt'))

    targets = copy.deepcopy(input_ids)
    for target, pair in zip(targets, sources):
        prefix_len = len(tokenizer_MMODAL_token(pair[0]['value'], tokenizer, MMODAL_TOKEN_INDEX[MODAL_list[0]]))
        target[:prefix_len] = IGNORE_INDEX

    return {"input_ids": input_ids, "labels": targets}
556
+
557
+
558
def preprocess(
    sources: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
    MODAL_list: list = []
) -> Dict:
    """
    Given a list of sources, each is a conversation list. This transform:
    1. Add signal '### ' at the beginning each sentence, with end signal '\n';
    2. Concatenate conversations together;
    3. Tokenize the concatenated conversation;
    4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX.

    PLAIN, LLAMA_2 and "v1" conversation templates are dispatched to their
    dedicated preprocess functions; the steps above apply to everything else.

    :param sources: List of conversations (lists of ``{"from", "value"}`` dicts).
    :param tokenizer: Tokenizer used for all tokenization.
    :param MODAL_list: Modality key per conversation (index into ``MMODAL_TOKEN_INDEX``);
        when empty, plain text tokenization is used.
    :return: Dict with ``input_ids`` and ``labels``.
    """
    if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.PLAIN:
        return preprocess_plain(sources, tokenizer, MODAL_list)
    if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.LLAMA_2:
        return preprocess_llama_2(sources, tokenizer, MODAL_list)
    if conversation_lib.default_conversation.version.startswith("v1"):
        return preprocess_v1(sources, tokenizer, MODAL_list)
    # add end signal and concatenate together
    # The header is the same for every conversation and is needed again when masking.
    header = f"{conversation_lib.default_conversation.system}\n\n"
    conversations = []
    for source in sources:
        conversation = _add_speaker_and_signal(header, source)
        conversations.append(conversation)

    # tokenize conversations
    def get_tokenize_len(prompts, token_index):
        return [len(tokenizer_MMODAL_token(prompt, tokenizer, token_index)) for prompt in prompts]

    if len(MODAL_list) > 0:
        input_ids = [tokenizer_MMODAL_token(prompt, tokenizer, MMODAL_TOKEN_INDEX[MODAL_list[i]], return_tensors='pt') for i, prompt in enumerate(conversations)]
    else:
        conversations_tokenized = _tokenize_fn(conversations, tokenizer)
        input_ids = conversations_tokenized["input_ids"]

    targets = copy.deepcopy(input_ids)
    for idx, (target, source) in enumerate(zip(targets, sources)):
        if len(MODAL_list) > 0:
            # Pass the modal token *index*, as every other tokenizer_MMODAL_token
            # call in this file does, not the modality name itself.
            tokenized_lens = get_tokenize_len([header] + [s["value"] for s in source], MMODAL_TOKEN_INDEX[MODAL_list[idx]])
        else:
            tokenized_lens = _tokenize_fn([header] + [s["value"] for s in source], tokenizer)["input_ids_lens"]
        speakers = [sentence["from"] for sentence in source]
        _mask_targets(target, tokenized_lens, speakers)

    return dict(input_ids=input_ids, labels=targets)
602
+
603
+
604
class LazySupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning.

    The annotation JSON is loaded once in ``__init__``; images and videos are
    read and preprocessed lazily in ``__getitem__``.
    """

    def __init__(self, data_path: str,
                 tokenizer: transformers.PreTrainedTokenizer,
                 data_args: DataArguments):
        """
        :param data_path: Path to a JSON file holding the list of samples.
        :param tokenizer: Tokenizer passed on to ``preprocess``.
        :param data_args: Data options; ``image_processor`` / ``video_processor``
            attributes are read from it in ``__getitem__``.
        """
        super(LazySupervisedDataset, self).__init__()
        # Use a context manager so the annotation file is closed right after parsing.
        with open(data_path, "r") as f:
            list_data_dict = json.load(f)

        rank0_print("Formatting inputs...Skip in lazy mode")
        self.tokenizer = tokenizer
        self.list_data_dict = list_data_dict
        self.data_args = data_args

    def __len__(self):
        return len(self.list_data_dict)

    @property
    def lengths(self):
        """Approximate length per sample: whitespace word count plus 513 for samples with an 'image' key."""
        length_list = []
        for sample in self.list_data_dict:
            img_tokens = 513 if 'image' in sample else 0
            length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens)
        return length_list

    @property
    def modality_lengths(self):
        """Word count per sample, positive for samples with an 'image' key and negative otherwise.

        NOTE(review): samples with only a 'video' key get a negative length here — confirm this is intended.
        """
        length_list = []
        for sample in self.list_data_dict:
            cur_len = sum(len(conv['value'].split()) for conv in sample['conversations'])
            cur_len = cur_len if 'image' in sample else -cur_len
            length_list.append(cur_len)
        return length_list

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Load, preprocess and tokenize sample ``i``.

        When the image or video cannot be processed, the traceback is printed
        and a randomly chosen other sample is returned instead.

        :return: Dict with ``input_ids``, ``labels`` and, depending on the
            sample, an ``image`` or ``video`` entry.
        """
        sources = self.list_data_dict[i]
        image_processor = self.data_args.image_processor
        video_processor = self.data_args.video_processor

        num_frames = NUM_FRAMES if self.data_args.num_frames is None else self.data_args.num_frames

        if isinstance(i, int):
            sources = [sources]
        assert len(sources) == 1, "Don't know why it is wrapped to a list"  # FIXME
        MODAL_list = []
        if 'image' in sources[0]:
            image_file = self.list_data_dict[i]['image']
            image_file = os.path.join(self.data_args.data_folder, image_file)

            try:
                image = process_image(image_file, image_processor, self.data_args.image_aspect_ratio)[0]
            except Exception as e:
                traceback.print_exc()
                backup_idx = random.randint(0, len(self.list_data_dict)-1)
                print(f"Encounted error when reading image {image_file}, use {backup_idx}-th example instead!!!")
                return self.__getitem__(backup_idx)

            # Deep copy: preprocess_multimodal edits the sentences in place.
            sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
            MODAL_list.append('IMAGE')
        elif 'video' in sources[0]:
            video_file = self.list_data_dict[i]['video']
            video_file = os.path.join(self.data_args.data_folder, video_file)

            try:
                video = process_video(video_file, video_processor, self.data_args.image_aspect_ratio, num_frames)
            except Exception as e:
                traceback.print_exc()
                backup_idx = random.randint(0, len(self.list_data_dict)-1)
                print(f"Encounted error when reading video {video_file}, use {backup_idx}-th example instead!!!")
                return self.__getitem__(backup_idx)

            sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
            MODAL_list.append('VIDEO')
        else:
            sources = copy.deepcopy([e["conversations"] for e in sources])
            # NOTE: for sharegpt data in the sft stage, we use the default IMAGE as modal token
            MODAL_list.append('IMAGE')

        data_dict = preprocess(sources, self.tokenizer, MODAL_list=MODAL_list)
        if isinstance(i, int):
            data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0])

        if 'image' in self.list_data_dict[i]:
            data_dict['image'] = image
        elif 'video' in self.list_data_dict[i]:
            data_dict['video'] = video
        elif self.data_args.is_multimodal:
            # image does not exist in the data, but the model is multimodal
            crop_size = self.data_args.image_processor.crop_size
            data_dict['image'] = torch.zeros(3, crop_size['height'], crop_size['width'])
        return data_dict
695
+
696
+
697
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Pad, truncate and batch the instances.

        ``input_ids`` are padded with the tokenizer's pad id and ``labels`` with
        ``IGNORE_INDEX``; both are cut to ``tokenizer.model_max_length``. Modal
        inputs found on the instances are gathered under ``batch['images']`` as
        ``[values, keys]``.
        """
        pad_sequence = torch.nn.utils.rnn.pad_sequence
        max_len = self.tokenizer.model_max_length
        pad_id = self.tokenizer.pad_token_id

        id_seqs = [instance["input_ids"] for instance in instances]
        label_seqs = [instance["labels"] for instance in instances]

        input_ids = pad_sequence(id_seqs, batch_first=True, padding_value=pad_id)
        labels = pad_sequence(label_seqs, batch_first=True, padding_value=IGNORE_INDEX)
        input_ids = input_ids[:, :max_len]
        labels = labels[:, :max_len]

        batch = {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": input_ids.ne(pad_id),
        }

        # Gather whichever modal entries ('image', 'video', ...) each instance carries.
        modal_values, modal_keys = [], []
        for instance in instances:
            for modal_name in DEFAULT_MMODAL_TOKEN.keys():
                key = modal_name.lower()
                if key in instance:
                    modal_values.append(instance[key])
                    modal_keys.append(key)
        batch['images'] = [modal_values, modal_keys]  # we do not change the key's name.
        return batch
731
+
732
+
733
def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
                                data_args) -> Dict:
    """Build the keyword arguments that hand data to the trainer.

    Returns a dict with ``train_dataset`` (a ``LazySupervisedDataset`` built
    from ``data_args.data_path``), ``eval_dataset`` (always ``None``) and
    ``data_collator``.
    """
    dataset = LazySupervisedDataset(
        tokenizer=tokenizer,
        data_path=data_args.data_path,
        data_args=data_args,
    )
    collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    return {
        "train_dataset": dataset,
        "eval_dataset": None,
        "data_collator": collator,
    }
745
+
746
+
747
def train(attn_implementation=None):
    """Parse CLI arguments, build model/tokenizer/data and run training.

    Args:
        attn_implementation: Value written to ``config._attn_implementation``
            before the model is loaded (``None`` leaves the choice to the
            library; ``train_flash_attn.py`` passes ``"flash_attention_2"``).

    Side effects: sets the module-level ``local_rank``, seeds RNGs with 42,
    trains (resuming when ``output_dir`` already holds ``checkpoint-*``
    folders) and writes the resulting weights to ``training_args.output_dir``.
    """
    global local_rank
    set_seed(42)

    parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    local_rank = training_args.local_rank
    compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))

    # Extra from_pretrained kwargs, only populated for 4/8-bit loading.
    bnb_model_from_pretrained_args = {}
    if training_args.bits in [4, 8]:
        from transformers import BitsAndBytesConfig
        bnb_model_from_pretrained_args.update(dict(
            device_map={"": training_args.device},
            load_in_4bit=training_args.bits == 4,
            load_in_8bit=training_args.bits == 8,
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=training_args.bits == 4,
                load_in_8bit=training_args.bits == 8,
                llm_int8_skip_modules=["mm_projector"],
                llm_int8_threshold=6.0,
                llm_int8_has_fp16_weight=False,
                bnb_4bit_compute_dtype=compute_dtype,
                bnb_4bit_use_double_quant=training_args.double_quant,
                bnb_4bit_quant_type=training_args.quant_type # {'fp4', 'nf4'}
            )
        ))

    # The config is built the same way for every architecture.
    config = transformers.AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
    config._attn_implementation = attn_implementation

    is_mixtral = False
    if model_args.vision_tower is not None:
        model_name = model_args.model_name_or_path.lower()
        # 'mixtral' has to be tested before 'mistral': hub ids such as
        # "mistralai/Mixtral-8x7B-v0.1" contain both substrings, and testing
        # 'mistral' first made the Mixtral branch unreachable for them.
        if 'mixtral' in model_name:
            is_mixtral = True
            model_cls = Videollama2MixtralForCausalLM
        elif 'mistral' in model_name:
            model_cls = Videollama2MistralForCausalLM
        else:
            model_cls = Videollama2LlamaForCausalLM
    else:
        # Text-only training falls back to the plain LLaMA causal LM.
        model_cls = transformers.LlamaForCausalLM

    model = model_cls.from_pretrained(
        model_args.model_name_or_path,
        config=config,
        cache_dir=training_args.cache_dir,
        torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
        do_sample=True,
        **bnb_model_from_pretrained_args
    )
    if is_mixtral:
        # NOTE(review): presumably required so ZeRO-3 treats each sparse MoE
        # block as a single unit - confirm against the DeepSpeed docs.
        import deepspeed
        deepspeed.utils.set_z3_leaf_modules(model, [MixtralSparseMoeBlock])
    model.config.use_cache = False

    if model_args.freeze_backbone:
        model.model.requires_grad_(False)

    if training_args.bits in [4, 8]:
        from peft import prepare_model_for_kbit_training
        model.config.torch_dtype=(torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)

    if training_args.gradient_checkpointing:
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        else:
            # Fallback: force the embedding output to require grad via a hook.
            def make_inputs_require_grad(module, input, output):
                output.requires_grad_(True)
            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

    if training_args.lora_enable:
        from peft import LoraConfig, get_peft_model
        lora_config = LoraConfig(
            r=training_args.lora_r,
            lora_alpha=training_args.lora_alpha,
            target_modules=find_all_linear_names(model),
            lora_dropout=training_args.lora_dropout,
            bias=training_args.lora_bias,
            task_type="CAUSAL_LM",
        )
        if training_args.bits == 16:
            if training_args.bf16:
                model.to(torch.bfloat16)
            if training_args.fp16:
                model.to(torch.float16)
        rank0_print("Adding LoRA adapters...")
        model = get_peft_model(model, lora_config)

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=True,
    )

    # Pad-token setup and conversation template depend on the prompt version.
    if model_args.version == "v0":
        if tokenizer.pad_token is None:
            smart_tokenizer_and_embedding_resize(
                special_tokens_dict=dict(pad_token="[PAD]"),
                tokenizer=tokenizer,
                model=model,
            )
    elif model_args.version == "v0.5":
        tokenizer.pad_token = tokenizer.unk_token
    else:
        tokenizer.pad_token = tokenizer.unk_token
        if model_args.version in conversation_lib.conv_templates:
            conversation_lib.default_conversation = conversation_lib.conv_templates[model_args.version]
        else:
            if model_args.version == "v1":
                conversation_lib.default_conversation = conversation_lib.conv_templates["vicuna_v1"]
            elif model_args.version == "v1_mistral":
                conversation_lib.default_conversation = conversation_lib.conv_templates["mistral_instruct"]

    if model_args.vision_tower is not None:
        # initialize vision encoder + multi-modal projector
        model.get_model().initialize_vision_modules(model_args=model_args, fsdp=training_args.fsdp)

        vision_tower = model.get_vision_tower()
        vision_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device)

        data_args.image_processor = vision_tower.image_processor
        # Towers without a dedicated video processor reuse the image one.
        data_args.video_processor = vision_tower.video_processor if hasattr(vision_tower, "video_processor") else vision_tower.image_processor

        data_args.is_multimodal = True

        model.config.image_aspect_ratio = data_args.image_aspect_ratio
        model.config.tokenizer_padding_side = tokenizer.padding_side
        model.config.tokenizer_model_max_length = tokenizer.model_max_length

        model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter
        if model_args.tune_mm_mlp_adapter:
            # Adapter-only tuning: freeze everything, then unfreeze the projector.
            model.requires_grad_(False)
            for p in model.get_model().mm_projector.parameters():
                p.requires_grad = True

        model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter
        if training_args.freeze_mm_mlp_adapter:
            for p in model.get_model().mm_projector.parameters():
                p.requires_grad = False

        if training_args.bits in [4, 8]:
            model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device)

        model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end
        model.config.mm_projector_lr = training_args.mm_projector_lr
        training_args.use_im_start_end = model_args.mm_use_im_start_end
        model.config.mm_use_im_patch_token = model_args.mm_use_im_patch_token
        model.initialize_MM_tokenizer(model_args, tokenizer=tokenizer)

        model.config.num_frames = NUM_FRAMES if data_args.num_frames is None else data_args.num_frames

    if training_args.bits in [4, 8]:
        # Re-cast selected modules after k-bit preparation.
        from peft.tuners.lora import LoraLayer
        for name, module in model.named_modules():
            if isinstance(module, LoraLayer):
                if training_args.bf16:
                    module = module.to(torch.bfloat16)
            if 'norm' in name:
                module = module.to(torch.float32)
            if 'lm_head' in name or 'embed_tokens' in name:
                if hasattr(module, 'weight'):
                    if training_args.bf16 and module.weight.dtype == torch.float32:
                        module = module.to(torch.bfloat16)

    print("Current model:", model)
    data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
    # select a Trainer
    trainer = VideoLLaMA2Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)

    # Resume automatically when the output directory already has checkpoints.
    if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
        trainer.train(resume_from_checkpoint=True)
    else:
        trainer.train()
    trainer.save_state()

    model.config.use_cache = True

    if training_args.lora_enable:
        state_dict = get_peft_state_maybe_zero_3(model.named_parameters(), training_args.lora_bias)
        non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(model.named_parameters())
        # Only the main process (rank 0, or -1 when not distributed) writes files.
        if training_args.local_rank == 0 or training_args.local_rank == -1:
            model.config.save_pretrained(training_args.output_dir)
            model.save_pretrained(training_args.output_dir, state_dict=state_dict)
            torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin'))
    else:
        safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)
960
+
961
+
962
# Script entry point: train() is called without arguments, so
# attn_implementation keeps its default of None.
if __name__ == "__main__":
    train()
videollama2/train_flash_attn.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
3
+ # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
4
+ # Make training more memory efficient by selecting the FlashAttention-2 implementation (attn_implementation="flash_attention_2").
5
+
6
+ import sys
7
+ sys.path.append('./')
8
+
9
+ from videollama2.train import train
10
+
11
# Script entry point: same training run as videollama2.train, but with
# attn_implementation set to "flash_attention_2".
if __name__ == "__main__":
    train(attn_implementation="flash_attention_2")
videollama2/utils.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import logging
3
+ import logging.handlers
4
+ import os
5
+ import sys
6
+
7
+ import requests
8
+
9
+ from .constants import LOGDIR
10
+
11
# Canned user-facing messages; they are only defined here, not used in this
# module (presumably consumed by the serving code - verify against callers).
server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."

# Process-wide file handler; build_logger() creates it on first use.
handler = None
15
+
16
+
17
def build_logger(logger_name, logger_filename):
    """Return an INFO-level logger and wire up process-wide logging.

    Besides creating ``logger_name``, this call:
      * applies a common format to the first root handler,
      * replaces ``sys.stdout`` / ``sys.stderr`` with ``StreamToLogger``
        wrappers feeding the "stdout" (INFO) and "stderr" (ERROR) loggers,
      * on the first call only, creates a daily-rotating file handler at
        ``LOGDIR/logger_filename`` and attaches it to every logger that
        exists at that moment.
    """
    global handler

    formatter = logging.Formatter(
        fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # Set the format of root handlers
    root_logger = logging.getLogger()
    if not root_logger.handlers:
        logging.basicConfig(level=logging.INFO)
    root_logger.handlers[0].setFormatter(formatter)

    # Redirect stdout and stderr to loggers (stdout first, then stderr, so the
    # stderr wrapper captures the already-wrapped stdout as its terminal).
    for stream_name, level in (("stdout", logging.INFO), ("stderr", logging.ERROR)):
        stream_logger = logging.getLogger(stream_name)
        stream_logger.setLevel(level)
        setattr(sys, stream_name, StreamToLogger(stream_logger, level))

    # Get logger
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)

    # The shared file handler is created once per process.
    if handler is not None:
        return logger

    os.makedirs(LOGDIR, exist_ok=True)
    log_path = os.path.join(LOGDIR, logger_filename)
    handler = logging.handlers.TimedRotatingFileHandler(
        log_path, when='D', utc=True, encoding='UTF-8')
    handler.setFormatter(formatter)

    # loggerDict may also hold placeholders; only real loggers get the handler.
    for known in logging.root.manager.loggerDict.values():
        if isinstance(known, logging.Logger):
            known.addHandler(handler)

    return logger
58
+
59
+
60
class StreamToLogger(object):
    """
    Fake file-like stream object that redirects writes to a logger instance.

    Complete lines (terminated by a newline) are logged immediately with the
    configured level; a trailing partial line is buffered until it is
    completed by a later write or until flush() is called. Attributes that
    this class does not define are looked up on the stream that was
    ``sys.stdout`` when the instance was created.
    """
    def __init__(self, logger, log_level=logging.INFO):
        self.terminal = sys.stdout
        self.logger = logger
        self.log_level = log_level
        self.linebuf = ''

    def __getattr__(self, attr):
        # Only called when normal lookup fails. If `terminal` itself is
        # missing (instance created without __init__, e.g. by copy/pickle),
        # delegating would recurse forever, so fail cleanly instead.
        if attr == 'terminal':
            raise AttributeError(attr)
        return getattr(self.terminal, attr)

    def write(self, buf):
        """Log every complete line in ``buf``; return the characters consumed."""
        temp_linebuf = self.linebuf + buf
        self.linebuf = ''
        for line in temp_linebuf.splitlines(True):
            # From the io.TextIOWrapper docs:
            #   On output, if newline is None, any '\n' characters written
            #   are translated to the system default line separator.
            # By default sys.stdout.write() expects '\n' newlines and then
            # translates them so this is still cross platform.
            if line[-1] == '\n':
                self.logger.log(self.log_level, line.rstrip())
            else:
                self.linebuf += line
        # File-like contract: write() reports how many characters it accepted.
        return len(buf)

    def flush(self):
        """Log any buffered partial line and clear the buffer."""
        if self.linebuf != '':
            self.logger.log(self.log_level, self.linebuf.rstrip())
        self.linebuf = ''
91
+
92
+
93
def disable_torch_init():
    """
    Speed up model creation by turning the default weight initialisation of
    torch.nn.Linear and torch.nn.LayerNorm into a no-op.

    The patch is applied to the classes themselves, so it affects every
    instance created afterwards in this process.
    """
    import torch

    def _skip_init(self):
        return None

    for layer_cls in (torch.nn.Linear, torch.nn.LayerNorm):
        layer_cls.reset_parameters = _skip_init
100
+
101
+
102
def violates_moderation(text):
    """
    Check whether the text violates OpenAI moderation API.

    Args:
        text: The string to check; newlines are removed before sending.

    Returns:
        The ``flagged`` value of the first moderation result, or ``False``
        when the request fails or the response has an unexpected shape
        (best effort: failures never block the caller).

    Raises:
        KeyError: if the ``OPENAI_API_KEY`` environment variable is not set.
    """
    import json

    url = "https://api.openai.com/v1/moderations"
    headers = {"Content-Type": "application/json",
               "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]}
    text = text.replace("\n", "")
    # Serialise with json.dumps so quotes, backslashes and control characters
    # in user text are escaped instead of corrupting the request body.
    data = json.dumps({"input": text}).encode("utf-8")
    try:
        ret = requests.post(url, headers=headers, data=data, timeout=5)
        flagged = ret.json()["results"][0]["flagged"]
    except requests.exceptions.RequestException:
        flagged = False
    except (KeyError, IndexError, TypeError, ValueError):
        # Response was not valid JSON or lacks results[0]["flagged"].
        flagged = False

    return flagged
121
+
122
+
123
def pretty_print_semaphore(semaphore):
    """Return a short description of ``semaphore`` ("None" when it is None).

    The object must expose a ``_value`` attribute and a ``locked()`` method.
    """
    if semaphore is not None:
        return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})"
    return "None"
videollama2/videollama2_trainer.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from: https://github.com/haotian-liu/LLaVA/blob/main/llava/train/llava_trainer.py
2
+ import os
3
+ from typing import List, Optional
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch.utils.data import Sampler
8
+
9
+ from transformers import Trainer
10
+ from transformers.trainer import (
11
+ is_sagemaker_mp_enabled,
12
+ get_parameter_names,
13
+ has_length,
14
+ ALL_LAYERNORM_LAYERS,
15
+ logger,
16
+ TRAINER_STATE_NAME,
17
+ )
18
+
19
+
20
def maybe_zero_3(param, ignore_status=False, name=None):
    """Return a detached CPU clone of ``param``.

    Parameters that carry a ``ds_id`` attribute are cloned inside
    ``zero.GatheredParameters``; all others are cloned directly. When the
    DeepSpeed status is NOT_AVAILABLE and ``ignore_status`` is false, ``name``
    is printed as a diagnostic before gathering.
    """
    from deepspeed import zero
    from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus

    if not hasattr(param, "ds_id"):
        return param.detach().cpu().clone()

    not_available = param.ds_status == ZeroParamStatus.NOT_AVAILABLE
    if not_available and not ignore_status:
        print(name, 'no ignore status')
    with zero.GatheredParameters([param]):
        gathered = param.data.detach().cpu().clone()
    return gathered
32
+
33
+
34
def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
    """Collect CPU copies of the parameters whose name contains any of
    ``keys_to_match``.

    ``named_params`` is an iterable of ``(name, parameter)`` pairs; the result
    maps each selected name to the tensor returned by ``maybe_zero_3``.
    """
    selected = {}
    for param_name, tensor in named_params:
        if any(fragment in param_name for fragment in keys_to_match):
            selected[param_name] = tensor

    state = {}
    for param_name, tensor in selected.items():
        state[param_name] = maybe_zero_3(tensor, ignore_status=True, name=param_name).cpu()
    return state
38
+
39
+
40
def split_to_even_chunks(indices, lengths, num_chunks):
    """
    Split a list of indices into `num_chunks` chunks of roughly equal total length.

    When the indices cannot be divided evenly they are simply dealt out
    round-robin. Otherwise each index goes to the chunk with the smallest
    accumulated length that still has room, so every chunk ends up with the
    same number of indices.
    """
    total = len(indices)
    if total % num_chunks:
        return [indices[offset::num_chunks] for offset in range(num_chunks)]

    capacity = total // num_chunks
    buckets = [[] for _ in range(num_chunks)]
    loads = [0] * num_chunks

    for idx in indices:
        # First chunk with the minimal accumulated length.
        target = min(range(num_chunks), key=loads.__getitem__)
        buckets[target].append(idx)
        loads[target] += lengths[idx]
        if len(buckets[target]) == capacity:
            # A full chunk must never be selected again.
            loads[target] = float("inf")

    return buckets
60
+
61
+
62
def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None):
    """Order sample indices so that each megabatch holds a single modality.

    The sign of a length encodes the modality: positive entries form one
    group and negative entries the other (their absolute value is used as
    the length). Each group is length-grouped on its own, cut into
    megabatches of ``world_size * batch_size`` indices, and the full
    megabatches of both groups are shuffled together. The two trailing
    megabatches are merged, sorted and appended last.
    """
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    assert all(l != 0 for l in lengths), "Should not have zero length."
    if all(l > 0 for l in lengths) or all(l < 0 for l in lengths):
        # all samples are in the same modality
        return get_length_grouped_indices(lengths, batch_size, world_size, generator=generator)

    mm_indices, mm_lengths = [], []
    lang_indices, lang_lengths = [], []
    for position, length in enumerate(lengths):
        if length > 0:
            mm_indices.append(position)
            mm_lengths.append(length)
        elif length < 0:
            lang_indices.append(position)
            lang_lengths.append(-length)

    # The per-modality shuffles draw from torch's global RNG (generator=None).
    mm_shuffle = [mm_indices[i] for i in get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=None)]
    lang_shuffle = [lang_indices[i] for i in get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=None)]

    megabatch_size = world_size * batch_size

    def _cut(sequence):
        return [sequence[start : start + megabatch_size] for start in range(0, len(sequence), megabatch_size)]

    mm_megabatches = _cut(mm_shuffle)
    lang_megabatches = _cut(lang_shuffle)

    # The last megabatch of each modality may be short; handle both together.
    leftover = mm_megabatches[-1] + lang_megabatches[-1]
    full = mm_megabatches[:-1] + lang_megabatches[:-1]
    order = torch.randperm(len(full), generator=generator)
    megabatches = [full[i] for i in order]

    if len(leftover) > 0:
        megabatches.append(sorted(leftover))

    flat = []
    for megabatch in megabatches:
        flat.extend(megabatch)
    return flat
88
+
89
+
90
def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True):
    """Return a random ordering of indices grouped by similar length.

    A random permutation is cut into megabatches of
    ``world_size * batch_size`` indices; each megabatch is sorted by
    descending length and split into ``world_size`` balanced chunks.
    ``merge`` is accepted but not used.
    """
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    permutation = torch.randperm(len(lengths), generator=generator)
    megabatch_size = world_size * batch_size

    ordered = []
    for start in range(0, len(lengths), megabatch_size):
        megabatch = permutation[start : start + megabatch_size].tolist()
        megabatch.sort(key=lambda i: lengths[i], reverse=True)
        for chunk in split_to_even_chunks(megabatch, lengths, world_size):
            ordered.extend(chunk)
    return ordered
99
+
100
+
101
class LengthGroupedSampler(Sampler):
    r"""
    Sampler yielding dataset indices so that samples of roughly the same
    length end up together, while keeping some randomness. With
    ``group_by_modality`` the grouping additionally separates modalities.
    """

    def __init__(
        self,
        batch_size: int,
        world_size: int,
        lengths: Optional[List[int]] = None,
        generator=None,
        group_by_modality: bool = False,
    ):
        if lengths is None:
            raise ValueError("Lengths must be provided.")

        self.batch_size, self.world_size = batch_size, world_size
        self.lengths = lengths
        self.generator = generator
        self.group_by_modality = group_by_modality

    def __len__(self):
        # One index is produced per entry of `lengths`.
        return len(self.lengths)

    def __iter__(self):
        # Pick the grouping strategy, then compute a fresh ordering.
        grouper = (
            get_modality_length_grouped_indices
            if self.group_by_modality
            else get_length_grouped_indices
        )
        return iter(grouper(self.lengths, self.batch_size, self.world_size, generator=self.generator))
133
+
134
+
135
class VideoLLaMA2Trainer(Trainer):
    """Trainer with optional modality-grouped sampling, a separate learning
    rate for ``mm_projector`` parameters and adapter-only checkpoints."""

    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
        """Return the training sampler, or None when the dataset has no length."""
        if self.train_dataset is None or not has_length(self.train_dataset):
            return None

        if not self.args.group_by_modality_length:
            return super()._get_train_sampler()

        lengths = self.train_dataset.modality_lengths
        return LengthGroupedSampler(
            self.args.train_batch_size,
            world_size=self.args.world_size * self.args.gradient_accumulation_steps,
            lengths=lengths,
            group_by_modality=True,
        )

    def create_optimizer(self):
        """
        Setup the optimizer.

        Parameters are split by weight decay (names containing "bias" and
        layer-norm parameters get none). When ``args.mm_projector_lr`` is set,
        ``mm_projector`` parameters additionally go into their own groups that
        use that learning rate. An already existing optimizer is returned
        unchanged.
        """
        if is_sagemaker_mp_enabled():
            return super().create_optimizer()

        opt_model = self.model
        if self.optimizer is not None:
            return self.optimizer

        decay_parameters = [
            name
            for name in get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS)
            if "bias" not in name
        ]

        def _trainable(use_decay, projector_names=None, in_projector=None):
            # Trainable parameters matching the requested decay / projector membership.
            picked = []
            for n, p in opt_model.named_parameters():
                if (n in decay_parameters) != use_decay:
                    continue
                if projector_names is not None and (n in projector_names) != in_projector:
                    continue
                if p.requires_grad:
                    picked.append(p)
            return picked

        weight_decay = self.args.weight_decay
        projector_lr = self.args.mm_projector_lr
        if projector_lr is not None:
            projector_parameters = [name for name, _ in opt_model.named_parameters() if "mm_projector" in name]
            # Group order: non-projector (decay, no decay), projector (decay, no decay).
            optimizer_grouped_parameters = [
                {
                    "params": _trainable(True, projector_parameters, False),
                    "weight_decay": weight_decay,
                },
                {
                    "params": _trainable(False, projector_parameters, False),
                    "weight_decay": 0.0,
                },
                {
                    "params": _trainable(True, projector_parameters, True),
                    "weight_decay": weight_decay,
                    "lr": projector_lr,
                },
                {
                    "params": _trainable(False, projector_parameters, True),
                    "weight_decay": 0.0,
                    "lr": projector_lr,
                },
            ]
        else:
            optimizer_grouped_parameters = [
                {"params": _trainable(True), "weight_decay": weight_decay},
                {"params": _trainable(False), "weight_decay": 0.0},
            ]

        optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)
        self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)

        if optimizer_cls.__name__ == "Adam8bit":
            import bitsandbytes

            manager = bitsandbytes.optim.GlobalOptimManager.get_instance()

            # Embedding weights are registered with 32-bit optimizer state.
            skipped = 0
            for module in opt_model.modules():
                if not isinstance(module, nn.Embedding):
                    continue
                skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
                logger.info(f"skipped {module}: {skipped/2**20}M params")
                manager.register_module_override(module, "weight", {"optim_bits": 32})
                logger.debug(f"bitsandbytes: will optimize {module} in fp32")
            logger.info(f"skipped: {skipped/2**20}M params")

        return self.optimizer

    def _save_checkpoint(self, model, trial, metrics=None):
        """Save a checkpoint; in adapter-tuning mode only the adapter weights,
        optimizer/scheduler, RNG state and trainer state are written."""
        if not getattr(self.args, 'tune_mm_mlp_adapter', False):
            super()._save_checkpoint(model, trial, metrics)
            return

        from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

        output_dir = os.path.join(
            self._get_output_dir(trial=trial),
            f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}",
        )

        # Only save Adapter
        keys_to_match = ['mm_projector', 'vision_resampler']
        if getattr(self.args, "use_im_start_end", False):
            keys_to_match += ['embed_tokens', 'embed_in']

        weight_to_save = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), keys_to_match)

        # Config and adapter weights are written by the main process only.
        if self.args.local_rank in (0, -1):
            self.model.config.save_pretrained(output_dir)
            torch.save(weight_to_save, os.path.join(output_dir, 'mm_projector.bin'))
        # Save optimizer and scheduler
        self._save_optimizer_and_scheduler(output_dir)
        # Save RNG state
        self._save_rng_state(output_dir)
        self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))
        self.args.distributed_state.wait_for_everyone()

    def _save(self, output_dir: Optional[str] = None, state_dict=None):
        """Delegate to Trainer._save, except in adapter-tuning mode where
        nothing is written here."""
        if not getattr(self.args, 'tune_mm_mlp_adapter', False):
            super()._save(output_dir, state_dict)