This view is limited to 50 files because it contains too many changes.  See the raw diff here.
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. .gitignore +54 -0
  3. app.py +370 -0
  4. examples/00000368.mp4 +3 -0
  5. examples/00003491.mp4 +3 -0
  6. examples/1034346401.mp4 +3 -0
  7. examples/Traffic and pedestrians.wav +3 -0
  8. examples/WBS4I.mp4 +3 -0
  9. examples/Y--ZHUMfueO0.flac +0 -0
  10. examples/desert.jpg +0 -0
  11. examples/extreme_ironing.jpg +0 -0
  12. examples/sample_demo_1.mp4 +3 -0
  13. examples/sample_demo_3.mp4 +3 -0
  14. examples/sample_demo_9.mp4 +3 -0
  15. examples/waterview.jpg +0 -0
  16. videollama2/__init__.py +120 -0
  17. videollama2/constants.py +32 -0
  18. videollama2/conversation.py +507 -0
  19. videollama2/inference_audio.py +292 -0
  20. videollama2/mm_utils.py +473 -0
  21. videollama2/model/__init__.py +208 -0
  22. videollama2/model/beats/BEATs.py +185 -0
  23. videollama2/model/beats/LICENSE_beats +21 -0
  24. videollama2/model/beats/Tokenizers.py +172 -0
  25. videollama2/model/beats/__init__.py +0 -0
  26. videollama2/model/beats/backbone.py +783 -0
  27. videollama2/model/beats/modules.py +218 -0
  28. videollama2/model/beats/quantizer.py +215 -0
  29. videollama2/model/beats/weight_norm_fix.py +139 -0
  30. videollama2/model/encoder.py +211 -0
  31. videollama2/model/mel_filters.npz +3 -0
  32. videollama2/model/projector.py +265 -0
  33. videollama2/model/videollama2_arch.py +377 -0
  34. videollama2/model/videollama2_gemma2.py +176 -0
  35. videollama2/model/videollama2_llama.py +157 -0
  36. videollama2/model/videollama2_mistral.py +159 -0
  37. videollama2/model/videollama2_mixtral.py +154 -0
  38. videollama2/model/videollama2_phi3.py +159 -0
  39. videollama2/model/videollama2_qwen2.py +153 -0
  40. videollama2/serve/cli.py +139 -0
  41. videollama2/serve/controller.py +298 -0
  42. videollama2/serve/gradio_web_server.py +499 -0
  43. videollama2/serve/model_worker.py +397 -0
  44. videollama2/serve/register_worker.py +26 -0
  45. videollama2/serve/sglang_worker.py +244 -0
  46. videollama2/serve/test_message.py +62 -0
  47. videollama2/train.py +700 -0
  48. videollama2/train_flash_attn.py +12 -0
  49. videollama2/utils.py +126 -0
  50. videollama2/videollama2_trainer.py +447 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
37
+ *.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__
3
+ *.pyc
4
+ *.egg-info
5
+ dist
6
+
7
+ # Log
8
+ *.log
9
+ *.log.*
10
+ *.json
11
+ *.jsonl
12
+ log_dir*/
13
+
14
+ # Data
15
+ !**/alpaca-data-conversation.json
16
+
17
+ # Editor
18
+ .idea
19
+ *.swp
20
+
21
+ # Other
22
+ .DS_Store
23
+
24
+ # jupyter
25
+ .ipynb_checkpoints
26
+ *.ipynb
27
+
28
+ # DevContainer
29
+ !.devcontainer/*
30
+
31
+ # Demo
32
+ serve_images/
33
+
34
+ # data folder
35
+ data/
36
+ dataset/
37
+ datasets/
38
+
39
+ # training folder
40
+ wandb
41
+ ckpts*
42
+ output
43
+ output/
44
+ checkpoints
45
+ checkpoints/
46
+ work_dirs*/
47
+
48
+ # evaluation folder
49
+ /eval/
50
+
51
+ # pretrained weights
52
+ pretrained/
53
+ publish_models/
54
+ public_models/
app.py ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+
3
+ import os
4
+ import re
5
+
6
+ import torch
7
+ import gradio as gr
8
+
9
+ import sys
10
+ sys.path.append('./videollama2')
11
+ from videollama2 import model_init, mm_infer
12
+ from videollama2.utils import disable_torch_init
13
+
14
+
15
+ title_markdown = ("""
16
+ <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
17
+ <a href="https://github.com/DAMO-NLP-SG/VideoLLaMA2" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
18
+ <img src="https://s2.loli.net/2024/06/03/D3NeXHWy5az9tmT.png" alt="VideoLLaMA 2 🔥🚀🔥" style="max-width: 120px; height: auto;">
19
+ </a>
20
+ <div>
21
+ <h1 >VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs</h1>
22
+ <h5 style="margin: 0;">If this demo please you, please give us a star ⭐ on Github or 💖 on this space.</h5>
23
+ </div>
24
+ </div>
25
+
26
+
27
+ <div align="center">
28
+ <div style="display:flex; gap: 0.25rem; margin-top: 10px;" align="center">
29
+ <a href="https://github.com/DAMO-NLP-SG/VideoLLaMA2"><img src='https://img.shields.io/badge/Github-VideoLLaMA2-9C276A'></a>
30
+ <a href="https://arxiv.org/pdf/2406.07476.pdf"><img src="https://img.shields.io/badge/Arxiv-2406.07476-AD1C18"></a>
31
+ <a href="https://github.com/DAMO-NLP-SG/VideoLLaMA2/stargazers"><img src="https://img.shields.io/github/stars/DAMO-NLP-SG/VideoLLaMA2.svg?style=social"></a>
32
+ </div>
33
+ </div>
34
+ """)
35
+
36
+
37
+ block_css = """
38
+ #buttons button {
39
+ min-width: min(120px,100%);
40
+ color: #9C276A
41
+ }
42
+ """
43
+
44
+
45
+ tos_markdown = ("""
46
+ ### Terms of use
47
+ By using this service, users are required to agree to the following terms:
48
+ The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
49
+ Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderator.
50
+ For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
51
+ """)
52
+
53
+
54
+ learn_more_markdown = ("""
55
+ ### License
56
+ This project is released under the Apache 2.0 license as found in the LICENSE file. The service is a research preview intended for non-commercial use ONLY, subject to the model Licenses of LLaMA and Mistral, Terms of Use of the data generated by OpenAI, and Privacy Practices of ShareGPT. Please get in touch with us if you find any potential violations.
57
+ """)
58
+
59
+
60
+ plum_color = gr.themes.colors.Color(
61
+ name='plum',
62
+ c50='#F8E4EF',
63
+ c100='#E9D0DE',
64
+ c200='#DABCCD',
65
+ c300='#CBA8BC',
66
+ c400='#BC94AB',
67
+ c500='#AD809A',
68
+ c600='#9E6C89',
69
+ c700='#8F5878',
70
+ c800='#804467',
71
+ c900='#713056',
72
+ c950='#662647',
73
+ )
74
+
75
+
76
class Chat:
    """Hold a loaded VideoLLaMA2 model and answer a single multimodal query."""

    def __init__(self, model_path, load_8bit=False, load_4bit=False):
        """Load model, processor and tokenizer through ``model_init``.

        Args:
            model_path: checkpoint identifier forwarded to ``model_init``.
            load_8bit: forwarded to ``model_init`` as ``load_8bit``.
            load_4bit: forwarded to ``model_init`` as ``load_4bit``.
        """
        disable_torch_init()

        loaded = model_init(model_path, load_8bit=load_8bit, load_4bit=load_4bit)
        self.model, self.processor, self.tokenizer = loaded

    @spaces.GPU(duration=120)
    @torch.inference_mode()
    def generate(self, data: list, message, temperature, top_p, max_output_tokens):
        """Run ``mm_infer`` on the single ``(tensor, modal_tag)`` entry in ``data``.

        Args:
            data: list holding exactly one ``(tensor, modal_tag)`` pair, where
                ``modal_tag`` looks like ``'<image>'``; the angle brackets are
                stripped before the tag is handed to ``mm_infer``.
            message: instruction (or message history) passed to ``mm_infer``.
            temperature: sampling temperature; sampling is enabled only when
                it is greater than zero.
            top_p: nucleus sampling threshold.
            max_output_tokens: forwarded as ``max_new_tokens``.

        Returns:
            Whatever ``mm_infer`` returns for the query.
        """
        # TODO: support multiple turns of conversation.
        assert len(data) == 1

        tensor, modal = data[0]
        sampling_enabled = bool(temperature > 0.0)
        return mm_infer(
            tensor,
            message,
            self.model,
            self.tokenizer,
            modal=modal.strip('<>'),
            do_sample=sampling_enabled,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_output_tokens,
        )
97
+
98
+
99
@spaces.GPU(duration=120)
def generate(image, video, audio, message, chatbot, va_tag, textbox_in, temperature, top_p, max_output_tokens, dtype=torch.float16):
    """Run one chat turn of the demo and return the refreshed widget values.

    Exactly one media input is used, checked in the order image, video, audio;
    when all three are ``None`` the turn is treated as text-only.

    Args:
        image: path of the input image, or ``None``.
        video: path of the input video, or ``None``.
        audio: path of the input audio, or ``None``.
        message: list of ``{'role', 'content'}`` dicts; mutated in place.
        chatbot: list of ``[user_text, reply]`` pairs shown in the chat widget;
            mutated in place.
        va_tag: ``"Audio Vision"``, ``"Vision Only"`` or ``"Audio Only"``.
        textbox_in: the text typed by the user for this turn.
        temperature: forwarded to ``handler.generate``.
        top_p: forwarded to ``handler.generate``.
        max_output_tokens: forwarded to ``handler.generate``.
        dtype: dtype the preprocessed media tensors are cast to.

    Returns:
        A 5-tuple ``(image_update, video_update, audio_update, message,
        chatbot)`` matching the five outputs wired to this callback. If media
        preprocessing fails, the traceback is printed and the three media
        widgets are reset while ``message`` and ``chatbot`` are returned
        unchanged.
    """
    # Imported locally because the module header does not import traceback.
    import traceback

    data = []

    processor = handler.processor
    try:
        device = handler.model.device
        if image is not None:
            data.append((processor['image'](image).to(device, dtype=dtype), '<image>'))
        elif video is not None:
            with_audio = va_tag == "Audio Vision"
            video_audio = processor['video'](video, va=with_audio)
            if with_audio:
                # The audio-visual path yields a dict of tensors; move each one.
                for k, v in video_audio.items():
                    video_audio[k] = v.to(device, dtype=dtype)
            else:
                video_audio = video_audio.to(device, dtype=dtype)
            data.append((video_audio, '<video>'))
        elif audio is not None:
            data.append((processor['audio'](audio).to(device, dtype=dtype), '<audio>'))
        else:
            # No media at all: plain text conversation.
            data.append((None, '<text>'))
    except Exception:
        traceback.print_exc()
        # One value per declared output (image, video, audio, message, chatbot).
        return (gr.update(value=None, interactive=True),
                gr.update(value=None, interactive=True),
                gr.update(value=None, interactive=True),
                message, chatbot)

    assert len(message) % 2 == 0, "The message should be a pair of user and system message."

    show_images = ""
    if image is not None:
        show_images += f'<img src="./file={image}" style="display: inline-block;width: 250px;max-height: 400px;">'
    if video is not None:
        show_images += f'<video controls playsinline width="500" style="display: inline-block;" src="./file={video}"></video>'
    if audio is not None:
        show_images += f'<audio controls style="display: inline-block;" src="./file={audio}"></audio>'

    one_turn_chat = [textbox_in, None]

    # 1. first run case
    if len(chatbot) == 0:
        one_turn_chat[0] += "\n" + show_images
    # 2. not first run case
    else:
        first_user_turn = chatbot[0][0]
        # Each entry pairs the pattern that recovers the media file embedded in
        # the first turn with the media currently supplied for that modality.
        media_checks = (
            (r'<img src="./file=(.+?)"', image),
            (r'<video controls playsinline width="500" style="display: inline-block;" src="./file=(.+?)"', video),
            (r'<audio controls style="display: inline-block;" src="./file=(.+?)"', audio),
        )
        for pattern, current in media_checks:
            previous = re.findall(pattern, first_user_turn)
            if not previous:
                continue
            # A different file for the same modality starts a new conversation.
            if current is not None and os.path.basename(previous[0]) != os.path.basename(current):
                message.clear()
                one_turn_chat[0] += "\n" + show_images
            break

    message.append({'role': 'user', 'content': textbox_in})

    # Temporarily detach the tower of the modality the user switched off.
    disabled_tower = {"Vision Only": "audio_tower", "Audio Only": "vision_tower"}.get(va_tag)
    if disabled_tower is not None:
        backbone = handler.model.model
        saved_tower = getattr(backbone, disabled_tower)
        setattr(backbone, disabled_tower, None)

    try:
        text_en_out = handler.generate(data, message, temperature=temperature, top_p=top_p, max_output_tokens=max_output_tokens)
    except Exception:
        # Drop the unanswered user turn so the history stays in user/assistant pairs.
        message.pop()
        raise
    finally:
        # Always re-attach the tower, even when generation fails.
        if disabled_tower is not None:
            setattr(backbone, disabled_tower, saved_tower)

    message.append({'role': 'assistant', 'content': text_en_out})

    one_turn_chat[1] = text_en_out
    chatbot.append(one_turn_chat)

    return gr.update(value=image, interactive=True), gr.update(value=video, interactive=True), gr.update(value=audio, interactive=True), message, chatbot
186
+
187
+
188
def regenerate(message, chatbot):
    """Discard the latest user/assistant exchange so it can be generated again.

    Removes the last two entries of ``message`` and the last entry of
    ``chatbot`` in place, then returns both lists.
    """
    for _ in range(2):
        message.pop()
    chatbot.pop()
    return message, chatbot
192
+
193
+
194
def clear_history(message, chatbot):
    """Empty the conversation state and reset the input widgets.

    ``message`` and ``chatbot`` are cleared in place. The returned 6-tuple
    holds three reset updates, then ``message`` and ``chatbot``, then one more
    reset update.
    """
    message.clear()
    chatbot.clear()

    def reset():
        return gr.update(value=None, interactive=True)

    return reset(), reset(), reset(), message, chatbot, reset()
201
+
202
+
203
+ # BUG of Zero Environment
204
+ # 1. The environment is fixed to torch>=2.0,<=2.2, gradio>=4.x.x
205
+ # 2. Operations and tensors that require CUDA must be confined to functions wrapped with spaces.GPU.
206
+ # 3. Those functions can't return tensors or other CUDA objects.
207
+
208
+ model_path = 'DAMO-NLP-SG/VideoLLaMA2.1-7B-AV'
209
+
210
+ handler = Chat(model_path, load_8bit=False, load_4bit=False)
211
+
212
+ textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)
213
+
214
+ theme = gr.themes.Default(primary_hue=plum_color)
215
+ # theme.update_color("primary", plum_color.c500)
216
+ theme.set(slider_color="#9C276A")
217
+ theme.set(block_title_text_color="#9C276A")
218
+ theme.set(block_label_text_color="#9C276A")
219
+ theme.set(button_primary_text_color="#9C276A")
220
+ # theme.set(button_secondary_text_color="*neutral_800")
221
+
222
+
223
+ with gr.Blocks(title='VideoLLaMA 2 🔥🚀🔥', theme=theme, css=block_css) as demo:
224
+ gr.Markdown(title_markdown)
225
+ message = gr.State([])
226
+
227
+ with gr.Row():
228
+ with gr.Column(scale=3):
229
+ image = gr.Image(label="Input Image", type="filepath")
230
+ video = gr.Video(label="Input Video")
231
+ audio = gr.Audio(label="Input Audio", type="filepath")
232
+
233
+ with gr.Accordion("Parameters", open=True) as parameter_row:
234
+ # num_beams = gr.Slider(
235
+ # minimum=1,
236
+ # maximum=10,
237
+ # value=1,
238
+ # step=1,
239
+ # interactive=True,
240
+ # label="beam search numbers",
241
+ # )
242
+
243
+ va_tag = gr.Radio(choices=["Audio Vision", "Vision Only", "Audio Only"], value="Audio Vision", label="Select one")
244
+
245
+ temperature = gr.Slider(
246
+ minimum=0.1,
247
+ maximum=1.0,
248
+ value=0.2,
249
+ step=0.1,
250
+ interactive=True,
251
+ label="Temperature",
252
+ )
253
+
254
+ top_p = gr.Slider(
255
+ minimum=0.0,
256
+ maximum=1.0,
257
+ value=0.7,
258
+ step=0.1,
259
+ interactive=True,
260
+ label="Top P",
261
+ )
262
+
263
+ max_output_tokens = gr.Slider(
264
+ minimum=64,
265
+ maximum=1024,
266
+ value=512,
267
+ step=64,
268
+ interactive=True,
269
+ label="Max output tokens",
270
+ )
271
+
272
+ with gr.Column(scale=7):
273
+ chatbot = gr.Chatbot(label="VideoLLaMA 2", bubble_full_width=True, height=750)
274
+ with gr.Row():
275
+ with gr.Column(scale=8):
276
+ textbox.render()
277
+ with gr.Column(scale=1, min_width=50):
278
+ submit_btn = gr.Button(value="Send", variant="primary", interactive=True)
279
+ with gr.Row(elem_id="buttons") as button_row:
280
+ upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
281
+ downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
282
+ # flag_btn = gr.Button(value="⚠️ Flag", interactive=True)
283
+ # stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
284
+ regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=True)
285
+ clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)
286
+
287
+ with gr.Row():
288
+ cur_dir = os.path.dirname(os.path.abspath(__file__))
289
+
290
+ with gr.Column():
291
+ gr.Examples(
292
+ examples=[
293
+ [
294
+ f"{cur_dir}/examples/extreme_ironing.jpg",
295
+ "What happens in this image?",
296
+ ],
297
+ [
298
+ f"{cur_dir}/examples/waterview.jpg",
299
+ "What are the things I should be cautious about when I visit here?",
300
+ ],
301
+ ],
302
+ inputs=[image, textbox],
303
+ )
304
+
305
+ with gr.Column():
306
+ gr.Examples(
307
+ examples=[
308
+ [
309
+ f"{cur_dir}/examples/WBS4I.mp4",
310
+ "Please describe the video:",
311
+ ],
312
+ [
313
+ f"{cur_dir}/examples/sample_demo_1.mp4",
314
+ "Please describe the video:",
315
+ ],
316
+ ],
317
+ inputs=[video, textbox],
318
+ )
319
+ with gr.Column():
320
+ gr.Examples(
321
+ examples=[
322
+ [
323
+ f"{cur_dir}/examples/00000368.mp4",
324
+ "Where is the loudest instrument?",
325
+ ],
326
+ [
327
+ f"{cur_dir}/examples/00003491.mp4",
328
+ "Is the instrument on the left louder than the instrument on the right?",
329
+ ],
330
+ ],
331
+ inputs=[video, textbox],
332
+ )
333
+ with gr.Column():
334
+ # audio
335
+ gr.Examples(
336
+ examples=[
337
+ [
338
+ f"{cur_dir}/examples/Y--ZHUMfueO0.flac",
339
+ "Please describe the audio:",
340
+ ],
341
+ [
342
+ f"{cur_dir}/examples/Traffic and pedestrians.wav",
343
+ "Please describe the audio:",
344
+ ],
345
+ ],
346
+ inputs=[audio, textbox],
347
+ )
348
+
349
+ gr.Markdown(tos_markdown)
350
+ gr.Markdown(learn_more_markdown)
351
+
352
+ submit_btn.click(
353
+ generate,
354
+ [image, video, audio, message, chatbot, va_tag, textbox, temperature, top_p, max_output_tokens],
355
+ [image, video, audio, message, chatbot])
356
+
357
+ regenerate_btn.click(
358
+ regenerate,
359
+ [message, chatbot],
360
+ [message, chatbot]).then(
361
+ generate,
362
+ [image, video, audio, message, chatbot, va_tag, textbox, temperature, top_p, max_output_tokens],
363
+ [image, video, audio, message, chatbot])
364
+
365
+ clear_btn.click(
366
+ clear_history,
367
+ [message, chatbot],
368
+ [image, video, audio, message, chatbot, textbox])
369
+
370
+ demo.launch(share=False)
examples/00000368.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad8a2238cad4bc690de0e3fe0d1f891e83ebc9f1e0bfd06e17145e34f8031f14
3
+ size 4383040
examples/00003491.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8513fbd368e2ed18b2ae188120ee0efd733105be1633a14e48697257e283b795
3
+ size 3997338
examples/1034346401.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08b62a634fe49edc0a19fc53f6ea5cfb345d9b2a6a7047811344c16832dc42b2
3
+ size 1678095
examples/Traffic and pedestrians.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39d805c8e0e487427d60c47ded7d7cca9b8fa288c1a53c93118b15f68ecf6792
3
+ size 1656254
examples/WBS4I.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7129dddf8da11c9296845eed65f0016dc67a503972c57500fe9f7c3ad2ee1ff3
3
+ size 1052064
examples/Y--ZHUMfueO0.flac ADDED
Binary file (324 kB). View file
 
examples/desert.jpg ADDED
examples/extreme_ironing.jpg ADDED
examples/sample_demo_1.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc6562a172eb9cb3c760a3c9992349c1faa2c793c112b7b9e50bd5cb17c2164d
3
+ size 1549315
examples/sample_demo_3.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da6126bce64c64a3d6f7ce889fbe15b5f1c2e3f978846351d8c7a79a950b429e
3
+ size 463547
examples/sample_demo_9.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9702694f185e27ae016b85024b367e140cf93a4e3124d072816fd32f2ca0d96
3
+ size 631864
examples/waterview.jpg ADDED
videollama2/__init__.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import copy
3
+ import warnings
4
+ import shutil
5
+ from functools import partial
6
+
7
+ import torch
8
+
9
+ from .model import load_pretrained_model
10
+ from .mm_utils import process_image, process_video, tokenizer_multimodal_token, get_model_name_from_path, KeywordsStoppingCriteria, process_audio_file
11
+ from .constants import NUM_FRAMES, DEFAULT_IMAGE_TOKEN, DEFAULT_VIDEO_TOKEN, MODAL_INDEX_MAP, DEFAULT_AUDIO_TOKEN
12
+
13
+
14
def model_init(model_path=None, **kwargs):
    """Load a VideoLLaMA2 checkpoint and build its per-modality preprocessors.

    Args:
        model_path: checkpoint to load; ``"DAMO-NLP-SG/VideoLLaMA2-7B"`` is
            used when ``None``.
        **kwargs: forwarded unchanged to ``load_pretrained_model``.

    Returns:
        ``(model, processor, tokenizer)`` where ``processor`` is a dict with
        ``'image'``, ``'video'`` and ``'audio'`` callables.
    """
    if model_path is None:
        model_path = "DAMO-NLP-SG/VideoLLaMA2-7B"
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, base_processor, _context_len = load_pretrained_model(
        model_path, None, model_name, **kwargs
    )

    # Fall back to the unknown token for padding when no pad token is defined.
    needs_pad_token = tokenizer.pad_token is None
    if needs_pad_token and tokenizer.unk_token is not None:
        tokenizer.pad_token = tokenizer.unk_token

    # Prefer the frame count stored in the model config, else the package default.
    num_frames = getattr(model.config, "num_frames", NUM_FRAMES)

    processors = {
        'image': partial(process_image, processor=base_processor, aspect_ratio=None),
        'video': partial(process_video, processor=base_processor, aspect_ratio=None, num_frames=num_frames),
        'audio': process_audio_file,
    }

    return model, processors, tokenizer
31
+
32
+
33
def mm_infer(image_or_video, instruct, model, tokenizer, modal='video', **kwargs):
    """Generate a VideoLLaMA2 response for an image, video, audio or text-only query.

    Args:
        image_or_video: preprocessed media, either a tensor or a dict of
            tensors; ignored when ``modal`` is ``'text'``.
        instruct (str | list): a single instruction string, or a list of
            ``{'role', 'content'}`` messages (copied, never mutated).
        model: VideoLLaMA2 model.
        tokenizer: tokenizer.
        modal (str): ``'image'``, ``'video'``, ``'audio'`` or ``'text'``.
        **kwargs: optional generation settings: ``do_sample`` (default False),
            ``temperature`` (default 0.2 when sampling, else 0.0), ``top_p``
            (default 0.9) and ``max_new_tokens`` (default 2048).
    Returns:
        str: response of the model.
    Raises:
        ValueError: if ``modal`` is unsupported or ``instruct`` is neither a
            string nor a list.
    """

    # 1. resolve the placeholder token of the requested modality.
    modal_tokens = {
        'image': DEFAULT_IMAGE_TOKEN,
        'video': DEFAULT_VIDEO_TOKEN,
        'text': '',
        'audio': DEFAULT_AUDIO_TOKEN,
    }
    try:
        modal_token = modal_tokens[modal]
    except (KeyError, TypeError):
        raise ValueError(f"Unsupported modal: {modal}") from None

    # 2. media preprocess (cast to half precision and move to the GPU).
    if modal == 'text':
        tensor = None
    elif isinstance(image_or_video, dict):
        on_gpu = {name: value.half().cuda() for name, value in image_or_video.items()}
        tensor = [(on_gpu, modal)]
    else:
        tensor = [(image_or_video.half().cuda(), modal)]

    # 3. text preprocess (prepend the modal token & generate prompt).
    if isinstance(instruct, str):
        message = [{'role': 'user', 'content': modal_token + '\n' + instruct}]
    elif isinstance(instruct, list):
        message = copy.deepcopy(instruct)
        message[0]['content'] = modal_token + '\n' + message[0]['content']
    else:
        raise ValueError(f"Unsupported type of instruct: {type(instruct)}")

    # Only these model types get the LLaMA-2 style system prompt.
    system_message = []
    if model.config.model_type in ['videollama2', 'videollama2_mistral', 'videollama2_mixtral']:
        system_message.append(
            {'role': 'system', 'content': (
            """<<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature."""
            """\n"""
            """If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>""")
            }
        )

    message = system_message + message
    prompt = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)

    token_ids = tokenizer_multimodal_token(prompt, tokenizer, modal_token, return_tensors='pt')
    input_ids = token_ids.unsqueeze(0).long().cuda()
    attention_masks = input_ids.ne(tokenizer.pad_token_id).long().cuda()

    # 4. generate response according to the media signals and the prompt.
    stopping_criteria = KeywordsStoppingCriteria([tokenizer.eos_token], tokenizer, input_ids)

    do_sample = kwargs.get('do_sample', False)
    default_temperature = 0.2 if do_sample else 0.0
    temperature = kwargs.get('temperature', default_temperature)
    top_p = kwargs.get('top_p', 0.9)
    max_new_tokens = kwargs.get('max_new_tokens', 2048)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            attention_mask=attention_masks,
            images=tensor,
            do_sample=do_sample,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            top_p=top_p,
            use_cache=True,
            stopping_criteria=[stopping_criteria],
            pad_token_id=tokenizer.eos_token_id,
        )

    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0].strip()
videollama2/constants.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Controller / worker heart-beat settings
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

LOGDIR = "."

# Model Constants
IGNORE_INDEX = -100

# Image arguments
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"

# Video arguments
VIDEO_TOKEN_INDEX = -201
DEFAULT_VIDEO_TOKEN = "<video>"
NUM_FRAMES = 8
MAX_FRAMES = 32
NUM_FRAMES_PER_SECOND = 1

# Audio arguments
AUDIO_TOKEN_INDEX = -202
DEFAULT_AUDIO_TOKEN = "<audio>"

# Placeholder token -> placeholder index, built from the constants above so
# the two can never drift apart.
MODAL_INDEX_MAP = {
    DEFAULT_IMAGE_TOKEN: IMAGE_TOKEN_INDEX,
    DEFAULT_VIDEO_TOKEN: VIDEO_TOKEN_INDEX,
    DEFAULT_AUDIO_TOKEN: AUDIO_TOKEN_INDEX,
}
videollama2/conversation.py ADDED
@@ -0,0 +1,507 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import dataclasses
3
+ from io import BytesIO
4
+ from enum import auto, Enum
5
+ from typing import List, Tuple
6
+
7
+ from PIL import Image
8
+ from .constants import LOGDIR, NUM_FRAMES
9
+
10
+
11
class SeparatorStyle(Enum):
    """Prompt layouts that ``Conversation.get_prompt`` knows how to render."""
    # "role: message" turns, each followed by the same separator.
    SINGLE = 1
    # "role: message" turns, alternating between two separators.
    TWO = 2
    # role and message concatenated directly, alternating two separators.
    PLAIN = 3
    # "[INST] ... [/INST]" turns with a "<<SYS>>" wrapped system prompt.
    LLAMA2 = 4
    # role + message + separator + newline for every turn.
    QWEN = 5
18
+
19
+ @dataclasses.dataclass
20
+ class Conversation:
21
+ """A class that keeps all conversation history."""
22
+ system: str
23
+ roles: List[str]
24
+ messages: List[List[str]]
25
+ offset: int
26
+ sep_style: SeparatorStyle = SeparatorStyle.SINGLE
27
+ sep: str = "###"
28
+ sep2: str = None
29
+ version: str = "Unknown"
30
+
31
+ skip_next: bool = False
32
+ modality: str = "image"
33
+
34
+ def get_prompt(self):
35
+ messages = self.messages
36
+ modality_token = f"<{self.modality}>"
37
+ if len(messages) > 0 and type(messages[0][1]) is tuple:
38
+ messages = self.messages.copy()
39
+ init_role, init_msg = messages[0].copy()
40
+ init_msg = init_msg[0].replace(modality_token, "").strip()
41
+ if 'mmtag' in self.version:
42
+ messages[0] = (init_role, init_msg)
43
+ messages.insert(0, (self.roles[0], "<Image><image></Image>"))
44
+ messages.insert(1, (self.roles[1], "Received."))
45
+ else:
46
+ messages[0] = (init_role, f"{modality_token}\n" + init_msg)
47
+
48
+ if self.sep_style == SeparatorStyle.SINGLE:
49
+ ret = self.system + self.sep
50
+ for role, message in messages:
51
+ if message:
52
+ if type(message) is tuple:
53
+ message, _, _ = message
54
+ ret += role + ": " + message + self.sep
55
+ else:
56
+ ret += role + ":"
57
+ elif self.sep_style == SeparatorStyle.TWO:
58
+ seps = [self.sep, self.sep2]
59
+ ret = self.system + seps[0]
60
+ for i, (role, message) in enumerate(messages):
61
+ if message:
62
+ if type(message) is tuple:
63
+ message, _, _ = message
64
+ ret += role + ": " + message + seps[i % 2]
65
+ else:
66
+ ret += role + ":"
67
+ elif self.sep_style == SeparatorStyle.LLAMA2:
68
+ wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n"
69
+ wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
70
+ ret = ""
71
+
72
+ for i, (role, message) in enumerate(messages):
73
+ if i == 0:
74
+ assert message, "first message should not be none"
75
+ assert role == self.roles[0], "first message should come from user"
76
+ if message:
77
+ if type(message) is tuple:
78
+ message, _, _ = message
79
+ if i == 0: message = wrap_sys(self.system) + message
80
+ if i % 2 == 0:
81
+ message = wrap_inst(message)
82
+ ret += self.sep + message
83
+ else:
84
+ ret += " " + message + " " + self.sep2
85
+ else:
86
+ ret += ""
87
+ ret = ret.lstrip(self.sep)
88
+ elif self.sep_style == SeparatorStyle.QWEN:
89
+ ret = ""
90
+ # 1. Add system prompt
91
+ ret += self.system + self.sep + "\n"
92
+ # 2. Iterate message
93
+ for i, (role, message) in enumerate(messages):
94
+ if i == 0:
95
+ assert message, "first message should not be none"
96
+ assert role == self.roles[0], "first message should come from user"
97
+ if message:
98
+ if type(message) is tuple:
99
+ message, _, _ = message
100
+ # 2.1 Add role and message
101
+ ret += role + message + self.sep + "\n"
102
+ else:
103
+ # 2.2 Add generation prompt
104
+ ret += role
105
+ elif self.sep_style == SeparatorStyle.PLAIN:
106
+ seps = [self.sep, self.sep2]
107
+ ret = self.system
108
+ for i, (role, message) in enumerate(messages):
109
+ if message:
110
+ if type(message) is tuple:
111
+ message, _, _ = message
112
+ ret += role + message + seps[i % 2]
113
+ else:
114
+ ret += role
115
+ else:
116
+ raise ValueError(f"Invalid style: {self.sep_style}")
117
+
118
+ return ret
119
+
120
+ def append_message(self, role, message):
121
+ self.messages.append([role, message])
122
+
123
def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=800, min_len=400):
    """Prepare an image for display or transport.

    Args:
        image: PIL image to process.
        image_process_mode: "Pad" (pad to a square), "Resize" (force 336x336),
            or "Default"/"Crop" (leave the pixels untouched).
        return_pil: When true, return the processed image object; otherwise
            return it encoded as a base64 string.
        image_format: Format name handed to ``image.save`` when encoding.
        max_len: The image is downscaled only if its longest edge exceeds this.
        min_len: Upper bound applied to the shortest edge when downscaling.

    Returns:
        The processed image, or its base64-encoded bytes as ``str``.

    Raises:
        ValueError: If ``image_process_mode`` is not one of the known modes.
    """
    if image_process_mode == "Pad":
        def expand2square(pil_img, background_color=(122, 116, 104)):
            width, height = pil_img.size
            if width == height:
                return pil_img
            elif width > height:
                result = Image.new(pil_img.mode, (width, width), background_color)
                result.paste(pil_img, (0, (width - height) // 2))
                return result
            else:
                result = Image.new(pil_img.mode, (height, height), background_color)
                result.paste(pil_img, ((height - width) // 2, 0))
                return result
        image = expand2square(image)
    elif image_process_mode in ["Default", "Crop"]:
        pass
    elif image_process_mode == "Resize":
        image = image.resize((336, 336))
    else:
        raise ValueError(f"Invalid image_process_mode: {image_process_mode}")

    if max(image.size) > max_len:
        max_hw, min_hw = max(image.size), min(image.size)
        aspect_ratio = max_hw / min_hw
        # Clamp to at least one pixel: for very elongated images
        # max_len / aspect_ratio truncates to 0, which resize() rejects.
        shortest_edge = max(1, int(min(max_len / aspect_ratio, min_len, min_hw)))
        # Once the short edge is clamped the long edge could overshoot max_len.
        longest_edge = min(max_len, int(shortest_edge * aspect_ratio))
        W, H = image.size
        if H > W:
            H, W = longest_edge, shortest_edge
        else:
            H, W = shortest_edge, longest_edge
        image = image.resize((W, H))

    if return_pil:
        return image

    # JPEG cannot store alpha or palette data; saving e.g. an RGBA or P image
    # as JPEG raises, so flatten such images to RGB first.
    if str(image_format).upper() == "JPEG" and image.mode not in ("L", "RGB", "CMYK"):
        image = image.convert("RGB")
    buffered = BytesIO()
    image.save(buffered, format=image_format)
    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
    return img_b64_str
162
+
163
+
164
def get_videos(self, return_pil=False):
    """Collect the videos attached to the conversation.

    Only messages at even positions after ``self.offset`` are inspected, and
    only those stored as a ``(text, video_path, image_process_mode)`` tuple.

    Args:
        return_pil: When false, return the stored video file paths. When true,
            decode each video and return ``NUM_FRAMES`` evenly spaced frames
            per video, each passed through ``self.process_image``.

    Returns:
        A list of file paths, or a flat list of processed frames.
    """
    video_frames = []
    for i, (role, msg) in enumerate(self.messages[self.offset:]):
        if i % 2 != 0 or type(msg) is not tuple:
            continue
        # here video is the file path of input video
        _, video, image_process_mode = msg
        if not return_pil:
            # return filepath
            video_frames.append(video)
            continue
        # Imported lazily so that merely listing paths (as dict() does) does
        # not require the video decoding stack to be installed.
        from decord import VideoReader, cpu
        import numpy as np
        # read video using decord.VideoReader
        decord_vr = VideoReader(uri=video, ctx=cpu(0))
        duration = len(decord_vr)
        frame_id_list = np.linspace(0, duration-1, NUM_FRAMES, dtype=int)
        # convert the extracted image frames into PIL objects
        all_images = [Image.fromarray(f) for f in decord_vr.get_batch(frame_id_list).asnumpy()]
        video_frames.extend([self.process_image(image, image_process_mode, return_pil=return_pil) for image in all_images])
    return video_frames
185
+
186
+
187
def get_images(self, return_pil=False):
    """Collect the images attached to the conversation.

    Messages at even positions after ``self.offset`` that are stored as a
    ``(text, image, image_process_mode)`` tuple contribute one entry each,
    produced by ``self.process_image`` with the given ``return_pil`` flag.
    """
    collected = []
    for position, (role, content) in enumerate(self.messages[self.offset:]):
        if position % 2 != 0:
            continue
        if type(content) is not tuple:
            continue
        _, picture, process_mode = content
        collected.append(self.process_image(picture, process_mode, return_pil=return_pil))
    return collected
241
+
242
def to_gradio_chatbot(self):
    """Convert the conversation into a list of ``[user, assistant]`` pairs.

    Messages at even positions after ``self.offset`` open a new pair; the
    following odd-position message fills in the second slot. A message stored
    as a ``(text, media, image_process_mode)`` tuple is rendered as HTML: a
    PIL image becomes an inline base64 JPEG ``<img>``, anything else is
    treated as a video file path and becomes a ``<video>`` element.

    Returns:
        list[list]: One ``[user_message, assistant_message_or_None]`` entry
        per exchange.
    """
    # Local import keeps this block self-contained.
    from html import escape

    ret = []
    for i, (role, msg) in enumerate(self.messages[self.offset:]):
        if i % 2 != 0:
            ret[-1][-1] = msg
            continue
        if type(msg) is tuple:
            # display image/video in the textbox
            msg, image_or_video, image_process_mode = msg
            if isinstance(image_or_video, Image.Image):
                # image is PIL object
                img_b64_str = self.process_image(image_or_video, "Default", return_pil=False, image_format='JPEG')
                img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
                msg = img_str + msg.replace('<image>', '').strip()
            else:
                # video is file path; escape it so quotes or angle brackets in
                # a file name cannot break out of the src attribute.
                safe_path = escape(str(image_or_video), quote=True)
                vid_str = f'<video controls playsinline width="500" style="display: inline-block;" src="./file={safe_path}"></video><br>'
                msg = vid_str + msg.replace('<video>', '').strip()
        ret.append([msg, None])
    return ret
284
+
285
def copy(self):
    """Return a new ``Conversation`` with the same settings.

    The message list is rebuilt pair by pair, so appending to or editing the
    pairs of the copy does not affect this instance; the message payloads
    themselves are shared.
    """
    # NOTE(review): `modality` (read by dict()) is not forwarded here —
    # confirm whether the copy is meant to fall back to the class default.
    cloned_messages = [[speaker, payload] for speaker, payload in self.messages]
    settings = {
        "system": self.system,
        "roles": self.roles,
        "messages": cloned_messages,
        "offset": self.offset,
        "sep_style": self.sep_style,
        "sep": self.sep,
        "sep2": self.sep2,
        "version": self.version,
    }
    return Conversation(**settings)
295
+
296
def dict(self):
    """Return a plain-dictionary snapshot of the conversation.

    When the modality is "image" or "video" and at least one message at an
    even position after ``self.offset`` is a media tuple, every tuple message
    is reduced to its text element and the modality is included. Otherwise
    the messages are returned as stored and no modality key is emitted.
    """
    # Same condition that get_images()/get_videos() would report as a
    # non-empty result, checked directly so no image is encoded and no video
    # library is imported just to count attachments.
    has_media = self.modality in ("image", "video") and any(
        i % 2 == 0 and type(msg) is tuple
        for i, (_, msg) in enumerate(self.messages[self.offset:])
    )
    if has_media:
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
            "offset": self.offset,
            "sep": self.sep,
            "sep2": self.sep2,
            "modality": self.modality
        }
    return {
        "system": self.system,
        "roles": self.roles,
        "messages": self.messages,
        "offset": self.offset,
        "sep": self.sep,
        "sep2": self.sep2,
    }
316
+
317
+
318
+ conv_vicuna_v0 = Conversation(
319
+ system="A chat between a curious human and an artificial intelligence assistant. "
320
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
321
+ roles=("Human", "Assistant"),
322
+ messages=(
323
+ ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
324
+ ("Assistant",
325
+ "Renewable energy sources are those that can be replenished naturally in a relatively "
326
+ "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
327
+ "Non-renewable energy sources, on the other hand, are finite and will eventually be "
328
+ "depleted, such as coal, oil, and natural gas. Here are some key differences between "
329
+ "renewable and non-renewable energy sources:\n"
330
+ "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
331
+ "energy sources are finite and will eventually run out.\n"
332
+ "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
333
+ "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
334
+ "and other negative effects.\n"
335
+ "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
336
+ "have lower operational costs than non-renewable sources.\n"
337
+ "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
338
+ "locations than non-renewable sources.\n"
339
+ "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
340
+ "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
341
+ "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
342
+ "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
343
+ ),
344
+ offset=2,
345
+ sep_style=SeparatorStyle.SINGLE,
346
+ sep="###",
347
+ )
348
+
349
+ conv_llava_plain = Conversation(
350
+ system="",
351
+ roles=("", ""),
352
+ messages=(),
353
+ offset=0,
354
+ sep_style=SeparatorStyle.PLAIN,
355
+ sep="",
356
+ sep2="\n"
357
+ )
358
+
359
+ conv_llava_v0_mmtag = Conversation(
360
+ system="A chat between a curious user and an artificial intelligence assistant. "
361
+ "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
362
+ "The visual content will be provided with the following format: <Image>visual content</Image>.",
363
+ roles=("Human", "Assistant"),
364
+ messages=(
365
+ ),
366
+ offset=0,
367
+ sep_style=SeparatorStyle.SINGLE,
368
+ sep="###",
369
+ version="v0_mmtag",
370
+ )
371
+
372
+ conv_llava_v0 = Conversation(
373
+ system="A chat between a curious human and an artificial intelligence assistant. "
374
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
375
+ roles=("Human", "Assistant"),
376
+ messages=(
377
+ ),
378
+ offset=0,
379
+ sep_style=SeparatorStyle.SINGLE,
380
+ sep="###",
381
+ )
382
+
383
+ conv_vicuna_v1 = Conversation(
384
+ system="A chat between a curious user and an artificial intelligence assistant. "
385
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
386
+ roles=("USER", "ASSISTANT"),
387
+ version="v1",
388
+ messages=(),
389
+ offset=0,
390
+ sep_style=SeparatorStyle.TWO,
391
+ sep=" ",
392
+ sep2="</s>",
393
+ )
394
+
395
+ conv_llava_v1_mmtag = Conversation(
396
+ system="A chat between a curious user and an artificial intelligence assistant. "
397
+ "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
398
+ "The visual content will be provided with the following format: <Image>visual content</Image>.",
399
+ roles=("USER", "ASSISTANT"),
400
+ messages=(),
401
+ offset=0,
402
+ sep_style=SeparatorStyle.TWO,
403
+ sep=" ",
404
+ sep2="</s>",
405
+ version="v1_mmtag",
406
+ )
407
+
408
+ conv_llava_v1 = Conversation(
409
+ system="A chat between a curious human and an artificial intelligence assistant. "
410
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
411
+ roles=("USER", "ASSISTANT"),
412
+ version="v1",
413
+ messages=(),
414
+ offset=0,
415
+ sep_style=SeparatorStyle.TWO,
416
+ sep=" ",
417
+ sep2="</s>",
418
+ )
419
+
420
+ conv_llava_llama2 = Conversation(
421
+ system="You are a helpful language and vision assistant. "
422
+ "You are able to understand the visual content that the user provides, "
423
+ "and assist the user with a variety of tasks using natural language.",
424
+ roles=("USER", "ASSISTANT"),
425
+ version="llama2",
426
+ messages=(),
427
+ offset=0,
428
+ sep_style=SeparatorStyle.LLAMA2,
429
+ sep="<s>",
430
+ sep2="</s>",
431
+ )
432
+
433
+ conv_llama2 = Conversation(
434
+ system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
435
+
436
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
437
+ roles=("USER", "ASSISTANT"),
438
+ version="llama2",
439
+ messages=(),
440
+ offset=0,
441
+ sep_style=SeparatorStyle.LLAMA2,
442
+ sep="<s>",
443
+ sep2="</s>",
444
+ )
445
+
446
+ conv_mistral = Conversation(
447
+ system="A chat between a curious user and an artificial intelligence assistant. "
448
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
449
+ roles=("USER", "ASSISTANT"),
450
+ version="llama2",
451
+ messages=(),
452
+ offset=0,
453
+ sep_style=SeparatorStyle.LLAMA2,
454
+ sep="",
455
+ sep2="</s>",
456
+ )
457
+
458
+ conv_qwen = Conversation(
459
+ system="<|im_start|>system\nYou are a helpful assistant.",
460
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
461
+ messages=(),
462
+ offset=0,
463
+ sep_style=SeparatorStyle.QWEN,
464
+ sep="<|im_end|>",
465
+ version="qwen",
466
+ )
467
+
468
+ conv_qwen_plain = Conversation(
469
+ system="",
470
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
471
+ messages=(),
472
+ offset=0,
473
+ sep_style=SeparatorStyle.PLAIN,
474
+ sep="<|im_end|>",
475
+ sep2="<|im_end|>",
476
+ version="qwen_plain",
477
+ )
478
+
479
# Module-level default template (currently the Mistral-style one). Note that
# the "default" key in the registry below points at a different template.
default_conversation = conv_mistral
# Name -> Conversation template registry. Several names alias the same object.
conv_templates = {
    "default": conv_vicuna_v0,
    # pretrain template
    "plain": conv_llava_plain,
    # llava v0
    "v0": conv_vicuna_v0,
    "v0_plain": conv_llava_plain,
    "v0_mmtag": conv_llava_v0_mmtag,
    "llava_v0": conv_llava_v0,
    # llava v1
    "v1": conv_vicuna_v1,
    "v1_mmtag": conv_llava_v1_mmtag,
    "llava_v1": conv_llava_v1,
    "vicuna_v1": conv_vicuna_v1,
    # llava v1.5
    "llava_llama2": conv_llava_llama2,
    # llama2
    "llama2": conv_llama2,
    # mistral
    "mistral": conv_mistral,
    # qwen
    "qwen": conv_qwen,
    "qwen_plain": conv_qwen_plain,
}


if __name__ == "__main__":
    # Quick manual check: print the prompt rendered by the default template.
    print(default_conversation.get_prompt())
videollama2/inference_audio.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import math
4
+ import argparse
5
+ import warnings
6
+ import traceback
7
+ from tqdm import tqdm
8
+
9
+ from torch.utils.data import Dataset, DataLoader
10
+
11
+ import sys
12
+ sys.path.append('./')
13
+ from videollama2 import model_init, mm_infer
14
+ from videollama2.utils import disable_torch_init
15
+
16
+ # NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
17
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
18
+
19
+
20
def split_list(lst, n):
    """Split a list into at most n contiguous, (roughly) equal-sized chunks.

    The chunk size is ``ceil(len(lst) / n)``, so the result can contain fewer
    than ``n`` chunks (e.g. 5 items with n=4 gives 3 chunks). An empty input
    yields an empty list of chunks.
    """
    if len(lst) == 0:
        # ceil(0 / n) == 0 would otherwise be used as a zero range() step.
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
24
+
25
+
26
def get_chunk(lst, n, k):
    """Return the k-th of the chunks produced by ``split_list(lst, n)``.

    Because chunks are ceiling-sized, fewer than ``n`` chunks may exist; a
    ``k`` beyond the last available chunk yields an empty list so that a
    surplus worker simply has nothing to process instead of crashing.
    """
    chunks = split_list(lst, n)
    if k >= len(chunks):
        return []
    return chunks[k]
29
+
30
+
31
class ClothoAQADataset(Dataset):
    """Audio question-answering samples.

    Each record supplies an ``audio`` path, an ``id`` and a two-entry
    ``conversations`` list whose first ``value`` is the question and second
    ``value`` is the reference answer.
    """

    audoi_formats = ['.wav', '.flac']

    def __init__(self, questions, processor):
        self.questions = questions
        self.processor = processor

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        record = self.questions[idx]

        path = record['audio']
        turns = record['conversations']
        # The model is asked for a one-word reply.
        prompt = f"Question: {turns[0]['value']}\nAnswer the question using a single word."
        sample_id = record['id']
        reference = turns[1]["value"]

        features = self.processor(path)

        item = {}
        item['audio'] = features
        item['audio_name'] = path.split("/")[-1]
        item['question'] = prompt
        item['question_id'] = sample_id
        item['answer'] = reference
        return item
60
+
61
class ClothoDataset(Dataset):
    """Audio captioning samples.

    Each record supplies an ``audio`` path and its reference ``captions``;
    the audio file name doubles as the question id.
    """

    audoi_formats = ['.wav', '.flac']

    def __init__(self, questions, processor):
        self.questions = questions
        self.processor = processor

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        record = self.questions[idx]

        path = record['audio']
        prompt = "Describe the audio."
        sample_id = path.split("/")[-1]
        reference = record['captions']

        features = self.processor(path)

        item = {}
        item['audio'] = features
        item['audio_name'] = path.split("/")[-1]
        item['question'] = prompt
        item['question_id'] = sample_id
        item['answer'] = reference
        return item
89
+
90
class TUT2017Dataset(Dataset):
    """Multiple-choice classification samples with a fixed 15-option prompt.

    Each record supplies an ``audio`` path and its ground-truth label ``gt``;
    the audio file name doubles as the question id.
    """

    audoi_formats = ['.wav', '.flac']

    def __init__(self, questions, processor):
        self.questions = questions
        self.processor = processor

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        record = self.questions[idx]

        path = record['audio']
        # Fixed prompt; the text is kept exactly as the evaluation expects it.
        prompt = (
            "Question: Identify the sound event in the audio.\nOptions:\n"
            "(A) beach\n(B) bus\n(C) cafe or restaurant\n(D) car\n(E) city center\n"
            "(F) forest path\n(G) grocery store\n(H) home\n(I) library\n(J) metro station\n"
            "(K) office\n(L) park\n(M) residential area\n(N) train\n(O) tram\n"
            ".Answer with the option's letter from the given choices directly and only give the best option."
        )
        sample_id = path.split("/")[-1]
        reference = record['gt']

        features = self.processor(path)

        item = {}
        item['audio'] = features
        item['audio_name'] = path.split("/")[-1]
        item['question'] = prompt
        item['question_id'] = sample_id
        item['answer'] = reference
        return item
118
+
119
class VocalSoundDataset(Dataset):
    """Multiple-choice human-sound samples with a fixed 6-option prompt.

    Each record supplies an ``audio`` path and its ground-truth label ``gt``;
    the audio file name doubles as the question id.
    """

    audoi_formats = ['.wav', '.flac']

    def __init__(self, questions, processor):
        self.questions = questions
        self.processor = processor

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        record = self.questions[idx]

        path = record['audio']
        # Fixed prompt; the text is kept exactly as the evaluation expects it.
        prompt = (
            "Identify the human sound in the audio.\nOptions:\n"
            "(A) Laughter\n(B) Sigh\n(C) Cough\n(D) Throat clearing\n(E) Sneeze\n(F) Sniff\n"
            ".Answer with the option's letter from the given choices directly and only give the best option."
        )
        sample_id = path.split("/")[-1]
        reference = record['gt']

        features = self.processor(path)

        item = {}
        item['audio'] = features
        item['audio_name'] = path.split("/")[-1]
        item['question'] = prompt
        item['question_id'] = sample_id
        item['answer'] = reference
        return item
147
+
148
class AIRDataset(Dataset):
    """Samples whose prompt is taken verbatim from the record.

    Each record supplies an ``audio`` path, a ready-made ``query``, an ``id``
    and the reference ``answer``.
    """

    audoi_formats = ['.wav', '.flac']

    def __init__(self, questions, processor):
        self.questions = questions
        self.processor = processor

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        record = self.questions[idx]

        path = record['audio']
        prompt = record['query']
        sample_id = record['id']
        reference = record['answer']

        features = self.processor(path)

        item = {}
        item['audio'] = features
        item['audio_name'] = path.split("/")[-1]
        item['question'] = prompt
        item['question_id'] = sample_id
        item['answer'] = reference
        return item
176
+
177
+
178
def collate_fn(batch):
    """Regroup a list of sample dicts into five parallel lists.

    Returns a tuple of lists in the order: audio, audio_name, question,
    question_id, answer. No stacking or padding is performed.
    """
    fields = ('audio', 'audio_name', 'question', 'question_id', 'answer')
    return tuple([sample[field] for sample in batch] for field in fields)
185
+
186
+
187
def run_inference(args):
    """Run audio inference over one chunk of a dataset and write JSONL results.

    Loads the model, builds the dataset selected by ``args.dataset``
    ("clothoAQA", "clotho", "TUT2017", "vocalsound" or "AIR"), keeps only the
    chunk ``args.chunk_idx`` of ``args.num_chunks``, and writes one JSON line
    per sample with keys ``id``, ``question``, ``answer`` and ``pred`` to
    ``args.output_file``. A sample whose inference raises is recorded with
    ``pred == "error"`` after printing the traceback.

    Raises:
        NotImplementedError: If ``args.dataset`` is not a supported name.
        ValueError: If the AIR dataset is selected without ``args.answer_file``.
    """
    def _read_jsonl(path):
        # One JSON object per line; blank lines are ignored.
        records = []
        with open(path, "r") as fp:
            for line in fp:
                if line.strip():
                    records.append(json.loads(line))
        return records

    disable_torch_init()

    # Initialize the model
    model, processor, tokenizer = model_init(args.model_path)
    # The vision tower is detached for this audio-only run.
    model.model.vision_tower = None

    assert args.batch_size == 1, "Batch size must be 1 for inference"
    if args.dataset == "clothoAQA":
        with open(args.question_file, "r") as fp:
            gt_questions = json.load(fp)
        dataset_cls = ClothoAQADataset
    elif args.dataset == "clotho":
        import csv
        gt_questions = []
        with open(args.question_file, mode='r', encoding='utf-8') as file:
            reader = csv.reader(file)
            next(reader, None)  # skip the header row
            for row in reader:
                gt_questions.append({
                    "audio": os.path.join(args.video_folder, row[0]),
                    "captions": row[1:]
                })
        dataset_cls = ClothoDataset
    elif args.dataset == "TUT2017":
        gt_questions = _read_jsonl(args.question_file)
        for record in gt_questions:
            record["audio"] = os.path.join(args.video_folder, record["audio"])
        dataset_cls = TUT2017Dataset
    elif args.dataset == "vocalsound":
        gt_questions = _read_jsonl(args.question_file)
        for record in gt_questions:
            # Only the file name of the recorded path is kept.
            record["audio"] = os.path.join(args.video_folder, record["audio"].split("/")[-1])
        dataset_cls = VocalSoundDataset
    elif args.dataset == "AIR":
        if not args.answer_file:
            raise ValueError("--answer-file is required for the AIR dataset")
        with open(args.answer_file, "r") as fp:
            gt_answer = {x["uniq_id"]: x for x in json.load(fp)}
        gt_questions = _read_jsonl(args.question_file)
        for record in gt_questions:
            record["answer"] = gt_answer[record["id"]]["answer_gt"]
        dataset_cls = AIRDataset
    else:
        raise NotImplementedError(f"Unsupported dataset: {args.dataset}")

    gt_questions = get_chunk(gt_questions, args.num_chunks, args.chunk_idx)
    dataset = dataset_cls(gt_questions, processor['audio'])

    dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)

    # dirname() is empty for a bare file name, and makedirs('') would fail.
    output_dir = os.path.dirname(args.output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(args.output_file, "w") as ans_file:
        # Iterate over each sample in the ground truth file
        for audio_tensors, audio_names, questions, question_ids, answers in tqdm(dataloader):
            # batch_size is asserted to be 1, so unwrap the single element.
            audio_tensor = audio_tensors[0]
            question = questions[0]
            question_id = question_ids[0]
            answer = answers[0]

            try:
                output = mm_infer(
                    audio_tensor,
                    question,
                    model=model,
                    tokenizer=tokenizer,
                    modal='audio',
                    do_sample=False,
                )
            except Exception:
                # Best effort: record the failure and continue with the next
                # sample. KeyboardInterrupt/SystemExit still propagate.
                traceback.print_exc()
                output = "error"

            sample_set = {'id': question_id, 'question': question, 'answer': answer, 'pred': output}
            ans_file.write(json.dumps(sample_set) + "\n")
274
+
275
+
276
if __name__ == "__main__":
    # Command-line entry point: parse the options and run inference once.
    parser = argparse.ArgumentParser()

    # Model and data locations.
    parser.add_argument('--model-path', required=True, help='')
    parser.add_argument('--video-folder', required=True, help='Directory containing video files.')
    parser.add_argument('--question-file', required=True, help='Path to the ground truth file containing question.')
    parser.add_argument('--answer-file', required=False, help='Path to the ground truth file containing answers.')
    parser.add_argument('--output-file', required=True, help='Directory to save the model results JSON.')

    # Sharding of the dataset across runs.
    parser.add_argument("--num-chunks", default=1, type=int)
    parser.add_argument("--chunk-idx", default=0, type=int)

    # Runtime settings.
    parser.add_argument("--device", default='cuda:0', type=str, required=False)
    parser.add_argument("--batch-size", default=1, type=int, required=False)
    parser.add_argument("--num-workers", default=8, type=int, required=False)
    parser.add_argument("--dataset", type=str, required=True)

    args = parser.parse_args()
    run_inference(args)
videollama2/mm_utils.py ADDED
@@ -0,0 +1,473 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import os
3
+ import math
4
+ import base64
5
+ import traceback
6
+ from io import BytesIO
7
+
8
+ import cv2
9
+ import torch
10
+ import imageio
11
+ import numpy as np
12
+ from PIL import Image
13
+ from decord import VideoReader, cpu
14
+ from moviepy.editor import VideoFileClip
15
+ from transformers import StoppingCriteria
16
+
17
+ from .constants import NUM_FRAMES, MAX_FRAMES, NUM_FRAMES_PER_SECOND, MODAL_INDEX_MAP, DEFAULT_IMAGE_TOKEN
18
+ from moviepy.editor import VideoFileClip
19
+ import random
20
+ import librosa
21
+ import soundfile as sf
22
+ import torchaudio.compliance.kaldi as ta_kaldi
23
+ from subprocess import CalledProcessError, run, Popen, PIPE
24
+ import math
25
+ from pytorchvideo.data.clip_sampling import ConstantClipsPerVideoSampler
26
+
27
def chunk_list(input_list, chunk_size):
    """Split ``input_list`` into consecutive slices of at most ``chunk_size`` items."""
    pieces = []
    for start in range(0, len(input_list), chunk_size):
        pieces.append(input_list[start:start + chunk_size])
    return pieces
29
+
30
+
31
def load_image_from_base64(image):
    """Decode a base64 payload and open it as a PIL image."""
    raw_bytes = base64.b64decode(image)
    stream = BytesIO(raw_bytes)
    return Image.open(stream)
33
+
34
+
35
def expand2square(pil_img, background_color):
    """Pad an image to a square, centring it along its shorter dimension.

    A square input is returned unchanged (same object). Otherwise a new image
    with the input's mode and side ``max(width, height)`` is filled with
    ``background_color`` and the input is pasted onto it.
    """
    width, height = pil_img.size
    if width == height:
        return pil_img

    side = width if width > height else height
    canvas = Image.new(pil_img.mode, (side, side), background_color)
    if width > height:
        # Landscape: centre vertically.
        paste_at = (0, (width - height) // 2)
    else:
        # Portrait: centre horizontally.
        paste_at = ((height - width) // 2, 0)
    canvas.paste(pil_img, paste_at)
    return canvas
47
+
48
+
49
def create_photo_grid(arr, rows=None, cols=None):
    """
    Tile a stack of equally sized frames into a single image.

    Parameters:
        arr (numpy.ndarray | list): Array with shape [t, h, w, c], or a list of PIL images / numpy arrays that is stacked into one.
        rows (int): Optional. Number of grid rows. Derived from `cols`, or from the square root of `t`, when omitted.
        cols (int): Optional. Number of grid columns. Derived from `rows` when omitted.

    Returns:
        numpy.ndarray: Array of shape [h * rows, w * cols, c]; cells without a frame stay zero.

    Raises:
        ValueError: For a list of unsupported items, or when rows * cols cannot hold all frames.
    """

    if isinstance(arr, list):
        first = arr[0]
        if isinstance(first, Image.Image):
            arr = np.stack([np.array(frame) for frame in arr])
        elif isinstance(first, np.ndarray):
            arr = np.stack(arr)
        else:
            raise ValueError("Invalid input type. Expected list of Images or numpy arrays.")

    t, h, w, c = arr.shape

    # Fill in whichever grid dimension was not supplied
    if rows is None:
        if cols is None:
            rows = math.ceil(math.sqrt(t))
            cols = math.ceil(t / rows)
        else:
            rows = math.ceil(t / cols)
    elif cols is None:
        cols = math.ceil(t / rows)

    # The grid must have a cell for every frame
    if rows * cols < t:
        raise ValueError(f"Not enough grid cells ({rows}x{cols}) to hold all images ({t}).")

    grid = np.zeros((h * rows, w * cols, c), dtype=arr.dtype)

    # Frames are laid out row by row, left to right
    for index in range(t):
        r, col = divmod(index, cols)
        top, left = r * h, col * w
        grid[top:top + h, left:left + w, :] = arr[index]

    return grid
97
+
98
+
99
def process_image(image_path, processor, aspect_ratio='pad'):
    """Load an image and run it through ``processor.preprocess``.

    Args:
        image_path: Location of the image, passed to ``Image.open``.
        processor: Object exposing ``image_mean`` and
            ``preprocess(images, return_tensors='pt')``.
        aspect_ratio: With 'pad' the image is first padded to a square using
            ``processor.image_mean`` scaled to 0-255 as the fill colour; any
            other value skips the padding.

    Returns:
        The ``'pixel_values'`` entry of the processor output.
    """
    # Convert inside a context manager so the underlying file is released as
    # soon as the pixel data has been copied into the RGB image.
    with Image.open(image_path) as source:
        image = source.convert('RGB')

    images = [image]

    if aspect_ratio == 'pad':
        fill_color = tuple(int(x*255) for x in processor.image_mean)
        images = [expand2square(img, fill_color) for img in images]

    images = processor.preprocess(images, return_tensors='pt')['pixel_values']
    return images
112
+
113
+
114
def frame_sample(duration, mode='uniform', num_frames=None, fps=None):
    """Choose the indices of the frames to sample from a video.

    Args:
        duration: Total number of frames available.
        mode: 'uniform' splits ``duration - 1`` into ``num_frames`` equal
            segments and takes the (rounded) midpoint of each; 'fps' takes
            one frame per segment of ``fps // NUM_FRAMES_PER_SECOND`` frames,
            starting at half a segment.
        num_frames: Number of frames to pick; required for 'uniform'.
        fps: Frame rate of the video; required for 'fps'.

    Returns:
        numpy.ndarray: Integer frame indices.

    Raises:
        ImportError: If ``mode`` is not supported.
    """
    if mode == 'uniform':
        assert num_frames is not None, "Number of frames must be provided for uniform sampling."
        # NOTE: v1 version
        # Calculate the size of each segment from which a frame will be extracted
        seg_size = float(duration - 1) / num_frames

        frame_ids = []
        for i in range(num_frames):
            # Calculate the start and end indices of each segment
            start = seg_size * i
            end = seg_size * (i + 1)
            # Append the middle index of the segment to the list
            frame_ids.append((start + end) / 2)

        # The small epsilon pushes exact .5 values upward before rounding.
        return np.round(np.array(frame_ids) + 1e-6).astype(int)
        # NOTE: v0 version
        # return np.linspace(0, duration-1, num_frames, dtype=int)
    elif mode == 'fps':
        assert fps is not None, "FPS must be provided for FPS sampling."
        # Keep the step at least 1: a frame rate below NUM_FRAMES_PER_SECOND
        # (or an empty video) would otherwise give np.arange a zero step.
        segment_len = max(1, min(fps // NUM_FRAMES_PER_SECOND, duration))
        return np.arange(segment_len // 2, duration, segment_len, dtype=int)
    else:
        # NOTE(review): ValueError would be the conventional type here;
        # ImportError is kept so existing callers see the same exception.
        raise ImportError(f'Unsupported frame sampling mode: {mode}')
138
+
139
+
140
def process_audio_file(wav_path):
    """Read an audio file and turn it into a bfloat16 filter-bank tensor.

    The signal is reduced to its first channel when 2-D, randomly cropped or
    zero-padded to ``30 * sr`` samples, resampled to 16000 Hz when needed,
    scaled by 2**15 and passed to Kaldi-style fbank extraction with 128 mel
    bins. A leading dimension is added to the result.
    """
    # read wav
    samples, rate = sf.read(wav_path)
    if len(samples.shape) == 2:
        # Keep only the first channel
        samples = samples[:, 0]

    target_len = 30 * rate
    if len(samples) > target_len:
        # Too long: take a random window of the target length
        offset = random.randint(0, len(samples) - target_len)
        samples = samples[offset: offset + target_len]
    elif len(samples) < target_len:
        # Too short: append zeros up to the target length
        missing = target_len - len(samples)
        samples = np.pad(samples, (0, missing), mode='constant', constant_values=0.0)

    if rate != 16000:
        samples = librosa.resample(samples, orig_sr=rate, target_sr=16000, res_type="fft")

    # beats
    waveform = torch.from_numpy(samples).to('cpu').unsqueeze(0) * 2 ** 15
    features = ta_kaldi.fbank(
        waveform,
        num_mel_bins=128,
        sample_frequency=16000,
        frame_length=25,
        frame_shift=10,
    )
    return features.to(torch.bfloat16).unsqueeze(0)
161
+
162
def get_clip_timepoints(clip_sampler, duration):
    """Collect the ``(start, end)`` pairs of every clip the sampler yields.

    The sampler is called repeatedly with the end of the previous clip
    (0.0 at first) and ``duration`` until it flags the clip as the last one.
    """
    timepoints = []
    previous_end = 0.0
    while True:
        clip_start, previous_end, _, _, finished = clip_sampler(previous_end, duration, annotation=None)
        timepoints.append((clip_start, previous_end))
        if finished:
            break
    return timepoints
171
+
172
def load_audio_from_video(file: str, sr: int = 16000):
    """
    Decode the audio track of a media file into a mono waveform.

    The work is delegated to the ``ffmpeg`` command-line tool, which must be
    available on PATH; it drops the video stream, down-mixes to one channel
    and resamples to ``sr``.

    Parameters
    ----------
    file: str
        The media file to read

    sr: int
        The sample rate requested from ffmpeg

    Returns
    -------
    A ``(waveform, sr)`` tuple, where ``waveform`` is a float32 NumPy array of
    the 16-bit samples divided by 32768.

    Raises
    ------
    RuntimeError
        If ffmpeg exits with a non-zero status.
    """
    ffmpeg_args = [
        "ffmpeg", "-nostdin",
        "-i", file,
        "-vn",                   # no video
        "-acodec", "pcm_s16le",  # output audio codec (pcm_s16le for .wav)
        "-ac", "1",              # audio channels (1 for mono)
        "-ar", str(sr),          # audio sample rate
        "-f", "s16le",           # output format (s16le for 16-bit PCM)
        "-",                     # output to stdout
    ]
    try:
        completed = run(ffmpeg_args, capture_output=True, check=True)
    except CalledProcessError as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

    pcm = np.frombuffer(completed.stdout, np.int16).flatten()
    return pcm.astype(np.float32) / 32768.0, sr
205
+
206
+
207
def process_audio_from_video(audio_path, clip_duration, device="cpu", num_mel_bins=128, sample_rate=16000, clips_per_video=8, mean=-4.268, std=9.138):
    """Extract the audio track of a video and convert it to fbank features.

    The waveform is decoded at ``sample_rate``, split into clips by a
    ``ConstantClipsPerVideoSampler``, re-concatenated, cropped (random offset)
    or zero-padded to 30 seconds, and passed through the Kaldi-style fbank
    extractor. If decoding fails for any reason, 30 seconds of silence is
    used instead so that callers always receive a tensor.

    Args:
        audio_path: Path of the media file to decode.
        clip_duration: Accepted for interface compatibility but not used;
            the sampler clip length is fixed at 2 seconds.
        device: Unused.
        num_mel_bins: Number of mel bins requested from the fbank extractor.
        sample_rate: Sample rate used for decoding, for converting clip time
            points to sample indices, and for the fbank extractor.
        clips_per_video: Number of clips requested from the sampler.
        mean: Unused.
        std: Unused.

    Returns:
        The fbank features cast to ``torch.bfloat16`` with one extra leading
        dimension added.
    """
    # NOTE(review): `clip_duration` is deliberately not forwarded here; switching
    # away from the fixed 2-second clips would change the model inputs -- confirm
    # with the training recipe before wiring it through.
    clip_sampler = ConstantClipsPerVideoSampler(
        clip_duration=2, clips_per_video=clips_per_video
    )

    target_len = 30 * sample_rate
    try:
        # Decode at the same rate that is used below for slicing and cropping.
        waveform, _ = load_audio_from_video(audio_path, sr=sample_rate)
    except Exception as audio_error:
        # Best-effort: fall back to silence instead of failing the whole sample.
        print(f"Failed to process audio from video due to error: {audio_error}")
        waveform = np.zeros(target_len, dtype=np.float32)

    clip_spans = get_clip_timepoints(clip_sampler, waveform.shape[0] / sample_rate)
    clips = [
        torch.from_numpy(waveform[int(start * sample_rate): int(end * sample_rate)])
        for start, end in clip_spans
    ]
    wav = torch.cat(clips, dim=0)

    if len(wav) > target_len:
        max_start = len(wav) - target_len
        # `torch.randint` excludes its upper bound, so add 1 to allow the last window.
        start = torch.randint(0, max_start + 1, (1,)).item()
        wav = wav[start: start + target_len]
    if len(wav) < target_len:
        pad_length = target_len - len(wav)
        wav = torch.nn.functional.pad(wav, (0, pad_length), mode='constant', value=0.0)

    # Scale by 2 ** 15 before the Kaldi fbank, as process_audio_file does.
    waveform = wav.unsqueeze(0) * 2 ** 15
    fbank = ta_kaldi.fbank(
        waveform,
        num_mel_bins=num_mel_bins,
        sample_frequency=sample_rate,
        frame_length=25,
        frame_shift=10,
    ).to(torch.bfloat16)
    return fbank.unsqueeze(0)
241
+
242
+
243
def process_video(video_path, processor, s=None, e=None, aspect_ratio='pad', num_frames=NUM_FRAMES, va=False):
    """Load a video (or pre-extracted frames) and preprocess it for the model.

    Args:
        video_path: One of
            * ``str`` -- a directory of frame images, a ``.gif`` file, or any
              other video file (read with ``VideoReader``);
            * ``np.ndarray`` -- an array iterated frame by frame;
            * ``list`` of ``np.ndarray``, of image paths, or of PIL images.
        processor: Image processor exposing ``image_mean`` and ``preprocess``.
        s: Optional start time in seconds (only used for ``str`` inputs).
        e: Optional end time in seconds (only used for ``str`` inputs).
        aspect_ratio: ``'pad'`` pads every frame to a square with the
            processor's mean colour; any other value leaves frames as they are.
        num_frames: Number of frames to sample uniformly. ``None`` switches to
            fps-based sampling.
        va: When true, also extract audio features from ``video_path``.

    Returns:
        The ``pixel_values`` tensor produced by ``processor.preprocess``, or a
        dict ``{'video': ..., 'audio': ...}`` when ``va`` is true.

    Raises:
        ValueError: If ``video_path`` has an unsupported type, or if ``va`` is
            requested for an input that is not a path.
    """
    # Audio extraction needs a file path plus the frame count / fps that are
    # only known for path inputs; fail early with a clear message.
    if va and not isinstance(video_path, str):
        raise ValueError("Audio extraction (va=True) requires `video_path` to be a path string.")

    if isinstance(video_path, str):
        if s is not None and e is not None:
            # Clamp negatives to 0 and make sure the range is ordered and non-empty.
            s = s if s >= 0. else 0.
            e = e if e >= 0. else 0.
            if s > e:
                s, e = e, s
            elif s == e:
                e = s + 1

        is_frame_dir = os.path.isdir(video_path)
        is_gif = (not is_frame_dir) and video_path.endswith('.gif')

        # 1. Loading Video
        if is_frame_dir:
            frame_files = sorted(os.listdir(video_path))

            fps = 3
            num_frames_of_video = len(frame_files)
        elif is_gif:
            gif_reader = imageio.get_reader(video_path)

            fps = 25
            num_frames_of_video = len(gif_reader)
        else:
            vreader = VideoReader(video_path, ctx=cpu(0), num_threads=1)

            fps = vreader.get_avg_fps()
            num_frames_of_video = len(vreader)

        # 2. Determine frame range & Calculate frame indices
        f_start = 0 if s is None else max(int(s * fps) - 1, 0)
        f_end = num_frames_of_video - 1 if e is None else min(int(e * fps) - 1, num_frames_of_video - 1)
        frame_indices = list(range(f_start, f_end + 1))

        duration = len(frame_indices)
        # 3. Sampling frame indices
        if num_frames is None:
            sampled_frame_indices = [frame_indices[i] for i in frame_sample(duration, mode='fps', fps=fps)]
        else:
            sampled_frame_indices = [frame_indices[i] for i in frame_sample(duration, mode='uniform', num_frames=num_frames)]

        # 4. Acquire frame data
        if is_frame_dir:
            video_data = [Image.open(os.path.join(video_path, frame_files[f_idx])) for f_idx in sampled_frame_indices]
        elif is_gif:
            # A set gives O(1) membership checks while streaming through the GIF.
            wanted = set(sampled_frame_indices)
            video_data = [Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)) for idx, frame in enumerate(gif_reader) if idx in wanted]
        else:
            video_data = [Image.fromarray(frame) for frame in vreader.get_batch(sampled_frame_indices).asnumpy()]

    elif isinstance(video_path, np.ndarray):
        video_data = [Image.fromarray(f) for f in video_path]
    elif isinstance(video_path, list) and isinstance(video_path[0], np.ndarray):
        video_data = [Image.fromarray(f) for f in video_path]
    elif isinstance(video_path, list) and isinstance(video_path[0], str):
        video_data = [Image.open(f) for f in video_path]
    elif isinstance(video_path, list) and isinstance(video_path[0], Image.Image):
        # Copy so that the padding below never mutates the caller's list.
        video_data = list(video_path)
    else:
        raise ValueError(f"Unsupported video path type: {type(video_path)}")

    # Pad short clips with black frames up to `num_frames`.
    if num_frames is not None and len(video_data) < num_frames:
        # PIL reports size as (width, height); numpy image arrays are (rows, cols, channels).
        width, height = video_data[-1].size
        blank = np.zeros((height, width, 3), dtype=np.uint8)
        while len(video_data) < num_frames:
            video_data.append(Image.fromarray(blank))

    # MAX_FRAMES filter
    video_data = video_data[:MAX_FRAMES]

    if aspect_ratio == 'pad':
        images = [expand2square(f, tuple(int(x*255) for x in processor.image_mean)) for f in video_data]
    else:
        images = list(video_data)
    video = processor.preprocess(images, return_tensors='pt')['pixel_values']

    if va:
        # Calculate the duration of the video in seconds
        video_duration_seconds = num_frames_of_video / fps
        audio = process_audio_from_video(video_path, video_duration_seconds)
        video = {'video': video, 'audio': audio}

    return video
321
+
322
def process_video_old(video_path, processor, aspect_ratio='pad', num_frames=NUM_FRAMES, image_grid=False, sample_scheme='uniform'):
    """Legacy video loader: sample frames and preprocess them for the model.

    Args:
        video_path: A path (``.gif``, ``.webm`` or any file readable by
            ``VideoReader``), an ``np.ndarray`` holding exactly ``num_frames``
            frames, or a list of exactly ``num_frames`` array-like frames.
        processor: Image processor exposing ``image_mean`` and ``preprocess``.
        aspect_ratio: ``'pad'`` pads every frame to a square with the
            processor's mean colour; any other value leaves frames as they are.
        num_frames: Number of frames used by the ``'uniform'`` scheme.
        image_grid: When true, a photo grid built from the frames is prepended.
        sample_scheme: ``'uniform'`` or ``'fps'``.

    Returns:
        The ``pixel_values`` tensor produced by ``processor.preprocess``.

    Raises:
        ImportError: If ``sample_scheme`` is not a supported mode (exception
            type kept for backward compatibility).
        ValueError: If ``video_path`` has an unsupported type.
    """
    def frame_sample(duration, mode='uniform', local_fps=None):
        if mode == 'uniform':
            # Calculate the size of each segment from which a frame will be extracted
            seg_size = float(duration - 1) / num_frames

            frame_ids = []
            for i in range(num_frames):
                # Calculate the start and end indices of each segment
                start = int(np.round(seg_size * i))
                end = int(np.round(seg_size * (i + 1)))
                # Append the middle index of the segment to the list
                frame_ids.append((start + end) // 2)

            return frame_ids
        elif mode == 'fps':
            assert local_fps is not None
            segment_len = min(local_fps // NUM_FRAMES_PER_SECOND, duration)
            return np.arange(segment_len // 2, duration, segment_len, dtype=int)
        else:
            raise ImportError(f'Unsupported frame sampling mode: {mode}')

    def select_frame_ids(duration, local_fps):
        # Sample with the requested scheme, then cap the result at MAX_FRAMES
        # by falling back to evenly spaced indices.
        frame_id_list = frame_sample(duration, mode=sample_scheme, local_fps=local_fps)
        if len(frame_id_list) > MAX_FRAMES:
            frame_id_list = np.linspace(0, duration-1, MAX_FRAMES, dtype=int)
        return frame_id_list

    if isinstance(video_path, str):
        if video_path.endswith('.gif'):
            video_gif = imageio.get_reader(video_path)
            frame_id_list = select_frame_ids(len(video_gif), 10)
            wanted = {int(i) for i in frame_id_list}
            video_data = [frame for index, frame in enumerate(video_gif) if index in wanted]
        # added by lixin4ever, include the support of .webm files from sthsthv2
        elif video_path.endswith('.webm'):
            video_webm = VideoFileClip(video_path)
            video_frames = np.array(list(video_webm.iter_frames()))
            frame_id_list = select_frame_ids(len(video_frames), video_webm.fps)
            video_data = video_frames[frame_id_list]
        else:
            # NOTE: num_threads=1 is required to avoid deadlock in multiprocessing
            decord_vr = VideoReader(uri=video_path, ctx=cpu(0), num_threads=1)
            frame_id_list = select_frame_ids(len(decord_vr), float(decord_vr.get_avg_fps()))
            # The batch object exposes either `.numpy()` or `.asnumpy()`
            # (presumably depending on the active decord bridge -- confirm).
            try:
                video_data = decord_vr.get_batch(frame_id_list).numpy()
            except Exception:
                video_data = decord_vr.get_batch(frame_id_list).asnumpy()

    elif isinstance(video_path, np.ndarray):
        assert len(video_path) == num_frames
        video_data = video_path
    elif isinstance(video_path, list):
        assert len(video_path) == num_frames
        video_data = np.stack([np.array(x) for x in video_path])
    else:
        raise ValueError(f"Unsupported video path type: {type(video_path)}")

    if image_grid:
        grid_h = grid_w = math.ceil(math.sqrt(num_frames))
        pg = create_photo_grid(video_data, grid_h, grid_w)
        video_data = [pg, *video_data]

    images = [Image.fromarray(f.numpy() if isinstance(f, torch.Tensor) else f) for f in video_data]
    if aspect_ratio == 'pad':
        images = [expand2square(image, tuple(int(x*255) for x in processor.image_mean)) for image in images]
    video = processor.preprocess(images, return_tensors='pt')['pixel_values']

    return video
403
+
404
+
405
def tokenizer_multimodal_token(prompt, tokenizer, multimodal_token=DEFAULT_IMAGE_TOKEN, return_tensors=None):
    """Tokenize a prompt, replacing each multimodal tag with its placeholder id.

    The prompt is split on ``multimodal_token``; every piece is tokenized
    without special tokens and the pieces are joined again with the index
    that ``MODAL_INDEX_MAP`` assigns to the tag. When the tag has no entry in
    ``MODAL_INDEX_MAP`` the whole prompt is tokenized as plain text.

    Args:
        prompt (str): Text prompt, possibly containing the multimodal tag.
        tokenizer: Callable tokenizer whose result exposes ``input_ids``.
        multimodal_token (str): The tag to look up in ``MODAL_INDEX_MAP`` and
            to split the prompt on.
        return_tensors (str | None): ``None`` for a Python list, ``'pt'`` for
            a ``torch.long`` tensor.

    Returns:
        The token ids as a list, or as a tensor when ``return_tensors='pt'``.

    Raises:
        ValueError: If ``return_tensors`` is neither ``None`` nor ``'pt'``.
    """
    placeholder_id = MODAL_INDEX_MAP.get(multimodal_token, None)

    if placeholder_id is None:
        input_ids = tokenizer(prompt, add_special_tokens=False).input_ids
    else:
        pieces = [
            tokenizer(piece, add_special_tokens=False).input_ids
            for piece in prompt.split(multimodal_token)
        ]
        # Interleave: piece, placeholder, piece, ..., piece.
        input_ids = []
        for position, piece_ids in enumerate(pieces):
            if position > 0:
                input_ids.append(placeholder_id)
            input_ids.extend(piece_ids)

    if return_tensors is None:
        return input_ids
    if return_tensors == 'pt':
        return torch.tensor(input_ids, dtype=torch.long)
    raise ValueError(f'Unsupported tensor type: {return_tensors}')
431
+
432
+
433
def get_model_name_from_path(model_path):
    """Derive a short model name from a model path or repository id.

    Leading and trailing slashes are ignored. The name is normally the last
    path component; when that component is a ``checkpoint-*`` directory and it
    has a parent component, the parent is prepended as ``<parent>_<checkpoint>``.

    Args:
        model_path (str): Path or hub id using ``/`` as separator.

    Returns:
        str: The derived model name.
    """
    parts = model_path.strip("/").split("/")
    last = parts[-1]
    # Only combine with the parent when one exists; a bare "checkpoint-N"
    # has no parent component to borrow a name from.
    if last.startswith('checkpoint-') and len(parts) > 1:
        return parts[-2] + "_" + last
    return last
440
+
441
+
442
class KeywordsStoppingCriteria(StoppingCriteria):
    """Stopping criterion that fires when a keyword shows up in the output.

    Each keyword is matched in two ways: by comparing its token ids with the
    tail of the sequence, and by searching for the keyword text in the decoded
    tail of the sequence. For a batch, the criterion is met only when every
    row matches.
    """

    def __init__(self, keywords, tokenizer, input_ids):
        """Pre-tokenize the keywords and remember the prompt length.

        Args:
            keywords: Iterable of keyword strings.
            tokenizer: Tokenizer used for encoding keywords and decoding output.
            input_ids: Prompt ids; only ``input_ids.shape[1]`` is read.
        """
        self.keywords = keywords
        self.keyword_ids = []
        self.max_keyword_len = 0
        for keyword in keywords:
            ids = tokenizer(keyword).input_ids
            # Drop a leading BOS token that the tokenizer may have prepended.
            starts_with_bos = len(ids) > 1 and ids[0] == tokenizer.bos_token_id
            if starts_with_bos:
                ids = ids[1:]
            self.max_keyword_len = max(self.max_keyword_len, len(ids))
            self.keyword_ids.append(torch.tensor(ids))
        self.tokenizer = tokenizer
        self.start_len = input_ids.shape[1]

    def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Check a single-row batch of ids for any keyword."""
        # Number of trailing tokens to decode for the text search.
        offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)

        # Keep the cached keyword ids on the same device as the output.
        self.keyword_ids = [ids.to(output_ids.device) for ids in self.keyword_ids]

        # Exact token-id match against the tail of the sequence.
        if any(bool((output_ids[0, -ids.shape[0]:] == ids).all()) for ids in self.keyword_ids):
            return True

        # Text match on the decoded tail.
        decoded_tail = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
        return any(keyword in decoded_tail for keyword in self.keywords)

    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True only when every row of ``output_ids`` contains a keyword."""
        # Evaluate every row (no short-circuit) before combining the results.
        per_row = [
            self.call_for_batch(output_ids[row].unsqueeze(0), scores)
            for row in range(output_ids.shape[0])
        ]
        return all(per_row)
videollama2/model/__init__.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ import os
18
+ import warnings
19
+ import shutil
20
+
21
+ import torch
22
+ from transformers import PretrainedConfig, AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
23
+
24
+ from .projector import load_mm_projector
25
+ from .videollama2_llama import Videollama2LlamaForCausalLM, Videollama2LlamaConfig
26
+ from .videollama2_mistral import Videollama2MistralForCausalLM, Videollama2MistralConfig
27
+ from .videollama2_mixtral import Videollama2MixtralForCausalLM, Videollama2MixtralConfig
28
+ from .videollama2_qwen2 import Videollama2Qwen2ForCausalLM, Videollama2Qwen2Config
29
+ from .videollama2_gemma2 import Videollama2Gemma2ForCausalLM, Videollama2Gemma2Config
30
+ from .videollama2_phi3 import Videollama2Phi3ForCausalLM, Videollama2Phi3Config
31
+
32
+
33
# Registry: config `model_type` string -> causal-LM class implementing it.
# The un-suffixed "videollama2" key resolves to the Mistral-based class.
VLLMs = {
    "videollama2": Videollama2MistralForCausalLM,
    "videollama2_llama": Videollama2LlamaForCausalLM,
    "videollama2_mistral": Videollama2MistralForCausalLM,
    "videollama2_mixtral": Videollama2MixtralForCausalLM,
    "videollama2_qwen2": Videollama2Qwen2ForCausalLM,
    "videollama2_gemma2": Videollama2Gemma2ForCausalLM,
    "videollama2_phi3": Videollama2Phi3ForCausalLM,
}

# Registry: config `model_type` string -> matching config class.
# Keys mirror those of `VLLMs`.
VLLMConfigs = {
    "videollama2": Videollama2MistralConfig,
    "videollama2_llama": Videollama2LlamaConfig,
    "videollama2_mistral": Videollama2MistralConfig,
    "videollama2_mixtral": Videollama2MixtralConfig,
    "videollama2_qwen2": Videollama2Qwen2Config,
    "videollama2_gemma2": Videollama2Gemma2Config,
    "videollama2_phi3": Videollama2Phi3Config,
}
52
+
53
+
54
def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
    """Load a VideoLLaMA 2 (or plain causal-LM) checkpoint with its tokenizer.

    Four loading paths are supported, chosen in this order:

    1. LoRA / QLoRA -- ``model_name`` contains ``"lora"``: load the base model,
       apply ``non_lora_trainables.bin``, then load and merge the LoRA weights.
    2. Base / pretrain -- ``model_base`` is given, ``model_name`` contains
       ``"-base"``, or the config has a truthy ``tune_mm_mlp_adapter``: load
       the base model and the multimodal projector weights.
    3. SFT -- the config ``model_type`` contains ``"videollama2"``: load
       everything from ``model_path``.
    4. Anything else is loaded with ``AutoModelForCausalLM``.

    Args:
        model_path: Local directory or hub id of the checkpoint.
        model_base: Base model path / id, or ``None`` to take it from the
            checkpoint config's ``_name_or_path``.
        model_name: Name used to detect LoRA and ``-base`` checkpoints.
        load_8bit: Pass ``load_in_8bit=True`` to ``from_pretrained``.
        load_4bit: Load with a 4-bit NF4 ``BitsAndBytesConfig``.
        device_map: Device map forwarded to ``from_pretrained``.
        device: Target device; any value other than ``"cuda"`` overrides
            ``device_map`` with ``{"": device}``.
        use_flash_attn: Request the ``flash_attention_2`` implementation.
        **kwargs: Extra arguments forwarded to ``from_pretrained``. A ``token``
            entry is also used for every hub access made here.

    Returns:
        ``(tokenizer, model, processor, context_len)``. ``processor`` is the
        vision tower's image processor for VideoLLaMA models and ``None``
        otherwise; ``context_len`` is ``config.max_sequence_length`` when the
        model config has it, else 2048.
    """
    token = kwargs.get('token')

    kwargs = {"device_map": device_map, **kwargs}

    if device != "cuda":
        kwargs['device_map'] = {"": device}

    if load_8bit:
        kwargs['load_in_8bit'] = True
    elif load_4bit:
        # NOTE: High-version Transformers will report: """ValueError: You can't pass `load_in_4bit`or `load_in_8bit` as a kwarg when passing `quantization_config` argument at the same time."""
        # so the 4-bit settings are passed only through `quantization_config`.
        kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4'
        )
    else:
        kwargs['torch_dtype'] = torch.float16

    if use_flash_attn:
        kwargs['attn_implementation'] = 'flash_attention_2'

    # Forward the token so that private / gated repositories resolve as well.
    config = AutoConfig.from_pretrained(model_path, token=token)

    # judge model type
    model_type = config.model_type

    # judge pretrain/finetune
    is_pretraining = getattr(config, 'tune_mm_mlp_adapter', False)

    # NOTE: lora/qlora model loading ("qlora" also contains "lora")
    if 'lora' in model_name.lower():
        cfg_pretrained = PretrainedConfig.from_pretrained(model_path, token=token)
        # NOTE: AutoConfig will modify `_name_or_path` property to `model_path` if `model_path` is not None.
        model_base = model_base if model_base is not None else cfg_pretrained._name_or_path

        # NOTE: remove qlora training quantization config from the config that
        # is actually handed to `from_pretrained` below.
        if hasattr(config, 'quantization_config'):
            del config.quantization_config
        tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False, token=token)
        print('Loading VideoLLaMA from base model...')

        if 'vicuna' in model_base.lower():
            model_cls = Videollama2LlamaForCausalLM
        else:
            model_cls = Videollama2MistralForCausalLM
        model = model_cls.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)

        token_num, token_dim = model.lm_head.out_features, model.lm_head.in_features
        if model.lm_head.weight.shape[0] != token_num:
            model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
            model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))

        print('Loading additional VideoLLaMA weights...')
        # NOTE(security): `torch.load` unpickles the file; only load checkpoints from trusted sources.
        local_trainables = os.path.join(model_path, 'non_lora_trainables.bin')
        if os.path.exists(local_trainables):
            non_lora_trainables = torch.load(local_trainables, map_location='cpu')
        else:
            # this is probably from HF Hub
            from huggingface_hub import hf_hub_download
            cache_file = hf_hub_download(
                repo_id=model_path,
                filename='non_lora_trainables.bin',
                token=token)
            non_lora_trainables = torch.load(cache_file, map_location='cpu')
        # Strip the wrapper prefixes so that keys line up with the bare model.
        non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
        if any(k.startswith('model.model.') for k in non_lora_trainables):
            non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
        model.load_state_dict(non_lora_trainables, strict=False)

        from peft import PeftModel
        print('Loading LoRA weights...')
        model = PeftModel.from_pretrained(model, model_path)
        print('Merging LoRA weights...')
        model = model.merge_and_unload()
        print('Model is loaded...')
    elif model_base is not None or '-base' in model_name.lower() or is_pretraining:
        # NOTE: Base/Pretrain model loading
        print('Loading VideoLLaMA 2 from base model...')
        cfg_pretrained = PretrainedConfig.from_pretrained(model_path, token=token)
        # NOTE: AutoConfig will modify `_name_or_path` property to `model_path` if `model_path` is not None.
        model_base = model_base if model_base is not None else cfg_pretrained._name_or_path

        tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False, token=token)

        # Resolve the class through the registry (unknown types fall back to Mistral).
        model_cls = VLLMs.get(model_type, Videollama2MistralForCausalLM)
        model = model_cls.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)

        # NOTE: loading the vision-language projector; `load_mm_projector`
        # handles both local and hub-hosted `mm_projector.bin` files.
        mm_projector_weights = load_mm_projector(model_path, token=token)
        model.load_state_dict(mm_projector_weights, strict=False)
    elif 'videollama2' in model_type:
        # NOTE: SFT model loading
        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)

        model_cls = VLLMs.get(model_type, Videollama2MistralForCausalLM)
        model = model_cls.from_pretrained(model_path, low_cpu_mem_usage=True, config=config, **kwargs)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, token=token)
        model = AutoModelForCausalLM.from_pretrained(model_path, config=config, **kwargs)

    processor = None

    if "videollama" in model_type:
        vision_tower = model.get_vision_tower()
        if not vision_tower.is_loaded:
            vision_tower.load_model()
        vision_tower.to(device=device, dtype=torch.float16)
        # NOTE: videollama2 adopts the same processor for processing image and video.
        processor = vision_tower.image_processor

    if hasattr(model.config, "max_sequence_length"):
        context_len = model.config.max_sequence_length
    else:
        context_len = 2048

    return tokenizer, model, processor, context_len
videollama2/model/beats/BEATs.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
+ # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
+ # Copyright (c) 2022 Microsoft
5
+ # Licensed under The MIT License [see LICENSE for details]
6
+ # Based on fairseq code bases
7
+ # https://github.com/pytorch/fairseq
8
+ # --------------------------------------------------------
9
+
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ from torch.nn import LayerNorm
14
+ import torchaudio.compliance.kaldi as ta_kaldi
15
+
16
+ from .backbone import (
17
+ TransformerEncoder,
18
+ )
19
+
20
+ import logging
21
+ from typing import Optional
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
class BEATsConfig:
    """Plain attribute container holding the BEATs hyper-parameters.

    Every option is stored as an instance attribute. Defaults are installed
    first and then overridden by the entries of ``cfg`` when it is given.
    """

    def __init__(self, cfg=None):
        defaults = {
            "input_patch_size": 16,  # patch size of patch embedding
            "embed_dim": 512,  # patch embedding dimension
            "conv_bias": False,  # include bias in conv encoder

            "encoder_layers": 12,  # num encoder layers in the transformer
            "hidden_size": 4096,  # 3584 for Qwen2
            "encoder_embed_dim": 768,  # encoder embedding dimension
            "encoder_ffn_embed_dim": 3072,  # encoder embedding dimension for FFN
            "encoder_attention_heads": 12,  # num encoder attention heads
            "activation_fn": "gelu",  # activation function to use

            "layer_wise_gradient_decay_ratio": 0.6,  # ratio for layer-wise gradient decay
            "layer_norm_first": False,  # apply layernorm first in the transformer
            "deep_norm": True,  # apply deep_norm first in the transformer

            # dropouts
            "dropout": 0.0,  # dropout probability for the transformer
            "attention_dropout": 0.0,  # dropout probability for attention weights
            "activation_dropout": 0.0,  # dropout probability after activation in FFN
            "encoder_layerdrop": 0.05,  # probability of dropping a transformer layer
            "dropout_input": 0.0,  # dropout to apply to the input (after feat extr)

            # positional embeddings
            "conv_pos": 128,  # number of filters for convolutional positional embeddings
            "conv_pos_groups": 16,  # number of groups for convolutional positional embedding

            # relative position embedding
            "relative_position_embedding": True,  # apply relative position embedding
            "num_buckets": 320,  # number of buckets for relative position embedding
            "max_distance": 800,  # maximum distance for relative position embedding
            "gru_rel_pos": True,  # apply gated relative position embedding

            # label predictor
            "finetuned_model": True,  # whether the model is a fine-tuned model.
            "predictor_dropout": 0.0,  # dropout probability for the predictor
            "predictor_class": 527,  # target class number for the predictor
        }
        vars(self).update(defaults)

        if cfg is not None:
            self.update(cfg)

    def update(self, cfg: dict):
        """Overwrite or add attributes from ``cfg`` (no key validation)."""
        vars(self).update(cfg)
70
+
71
+
72
class BEATs(nn.Module):
    """BEATs audio encoder: patch embedding followed by a transformer encoder.

    ``extract_features`` expects already-computed fbank features (the waveform
    to fbank step is not performed inside this class).
    """

    def __init__(
            self,
            cfg: BEATsConfig,
    ) -> None:
        super().__init__()
        logger.info(f"BEATs Config: {cfg.__dict__}")

        self.cfg = cfg
        self.embed = cfg.embed_dim

        # Only project when patch-embedding and encoder widths differ.
        if self.embed != cfg.encoder_embed_dim:
            self.post_extract_proj = nn.Linear(self.embed, cfg.encoder_embed_dim)
        else:
            self.post_extract_proj = None

        self.input_patch_size = cfg.input_patch_size
        # Non-overlapping square patches: kernel size equals stride.
        self.patch_embedding = nn.Conv2d(
            1,
            self.embed,
            kernel_size=self.input_patch_size,
            stride=self.input_patch_size,
            bias=cfg.conv_bias,
        )

        self.dropout_input = nn.Dropout(cfg.dropout_input)

        # deep_norm and layer_norm_first must not both be enabled.
        assert not cfg.deep_norm or not cfg.layer_norm_first
        self.encoder = TransformerEncoder(cfg)
        self.layer_norm = LayerNorm(self.embed)

        if cfg.finetuned_model:
            self.predictor_dropout = nn.Dropout(cfg.predictor_dropout)
            self.predictor = nn.Linear(cfg.encoder_embed_dim, cfg.predictor_class)
        else:
            self.predictor = None

    def forward_padding_mask(
            self,
            features: torch.Tensor,
            padding_mask: torch.Tensor,
    ) -> torch.Tensor:
        """Reduce ``padding_mask`` so its second dimension equals ``features.size(1)``.

        Trailing mask entries that do not divide evenly are dropped; the rest
        is grouped per feature step, and a step counts as padding only when
        all entries in its group are set.
        """
        steps = features.size(1)
        remainder = padding_mask.size(1) % steps
        if remainder > 0:
            padding_mask = padding_mask[:, :-remainder]
        grouped = padding_mask.view(padding_mask.size(0), steps, -1)
        return grouped.all(-1)

    def preprocess(
            self,
            source: torch.Tensor,
            fbank_mean: float = 15.41663,
            fbank_std: float = 6.55582,
    ) -> torch.Tensor:
        """Normalize precomputed fbank features: ``(x - mean) / (2 * std)``.

        ``source`` is used as the fbank directly; no waveform conversion
        happens here.
        """
        return (source - fbank_mean) / (2 * fbank_std)

    def extract_features(
            self,
            source: torch.Tensor,
            padding_mask: Optional[torch.Tensor] = None,
            fbank_mean: float = 15.41663,
            fbank_std: float = 6.55582,
            feature_only=True,
    ):
        """Encode fbank features.

        Args:
            source: Precomputed fbank features
                (assumes a batch-first 3-D tensor -- TODO confirm with caller).
            padding_mask: Optional boolean mask marking padded positions.
            fbank_mean: Mean used for normalization.
            fbank_std: Standard deviation used for normalization.
            feature_only: When true (or when the model has no predictor),
                return encoder features instead of class probabilities.

        Returns:
            ``(x, T, F)`` in feature mode, where ``T`` and ``F`` are the sizes
            of dimensions 2 and 3 of the patch-embedding output (presumably
            the time and frequency patch counts); otherwise
            ``(lprobs, padding_mask)`` with sigmoid probabilities averaged
            over the non-padded sequence positions.
        """
        fbank = self.preprocess(source, fbank_mean=fbank_mean, fbank_std=fbank_std)

        if padding_mask is not None:
            padding_mask = self.forward_padding_mask(fbank, padding_mask)

        patches = self.patch_embedding(fbank.unsqueeze(1))
        T, F = patches.shape[2], patches.shape[3]

        # Flatten the 2-D patch grid into a sequence and put channels last.
        features = patches.reshape(patches.shape[0], patches.shape[1], -1).transpose(1, 2)
        features = self.layer_norm(features)

        if padding_mask is not None:
            padding_mask = self.forward_padding_mask(features, padding_mask)

        if self.post_extract_proj is not None:
            features = self.post_extract_proj(features)

        x = self.dropout_input(features)
        x, _ = self.encoder(
            x,
            padding_mask=padding_mask,
        )

        if feature_only or self.predictor is None:
            return x, T, F

        x = self.predictor_dropout(x)
        logits = self.predictor(x)

        if padding_mask is not None and padding_mask.any():
            # Average over the non-padded positions only.
            logits[padding_mask] = 0
            logits = logits.sum(dim=1)
            logits = logits / (~padding_mask).sum(dim=1).unsqueeze(-1).expand_as(logits)
        else:
            logits = logits.mean(dim=1)

        lprobs = torch.sigmoid(logits)

        return lprobs, padding_mask
videollama2/model/beats/LICENSE_beats ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) Microsoft Corporation
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
videollama2/model/beats/Tokenizers.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
+ # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
+ # Copyright (c) 2022 Microsoft
5
+ # Licensed under The MIT License [see LICENSE for details]
6
+ # Based on fairseq code bases
7
+ # https://github.com/pytorch/fairseq
8
+ # --------------------------------------------------------
9
+
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ from torch.nn import LayerNorm
14
+ import torchaudio.compliance.kaldi as ta_kaldi
15
+
16
+ from beats.backbone import (
17
+ TransformerEncoder,
18
+ )
19
+ from beats.quantizer import (
20
+ NormEMAVectorQuantizer,
21
+ )
22
+
23
+ import logging
24
+ from typing import Optional
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
class TokenizersConfig:
    """Hyper-parameter container for the acoustic tokenizer.

    Every setting lives as a plain instance attribute. The defaults below
    are assigned first; an optional ``cfg`` mapping may then override them
    (or add further attributes).
    """

    def __init__(self, cfg=None):
        defaults = {
            # patch embedding
            "input_patch_size": -1,  # patch size of patch embedding
            "embed_dim": 512,  # patch embedding dimension
            "conv_bias": False,  # include bias in conv encoder
            # transformer encoder
            "encoder_layers": 12,  # num encoder layers in the transformer
            "encoder_embed_dim": 768,  # encoder embedding dimension
            "encoder_ffn_embed_dim": 3072,  # encoder embedding dimension for FFN
            "encoder_attention_heads": 12,  # num encoder attention heads
            "activation_fn": "gelu",  # activation function to use
            # normalisation placement
            "layer_norm_first": False,  # apply layernorm first in the transformer
            "deep_norm": False,  # apply deep_norm first in the transformer
            # dropouts
            "dropout": 0.1,  # dropout probability for the transformer
            "attention_dropout": 0.1,  # dropout probability for attention weights
            "activation_dropout": 0.0,  # dropout probability after activation in FFN
            "encoder_layerdrop": 0.0,  # probability of dropping a transformer layer
            "dropout_input": 0.0,  # dropout to apply to the input (after feat extr)
            # positional embeddings
            "conv_pos": 128,  # number of filters for convolutional positional embeddings
            "conv_pos_groups": 16,  # number of groups for convolutional positional embedding
            # relative position embedding
            "relative_position_embedding": False,  # apply relative position embedding
            "num_buckets": 320,  # number of buckets for relative position embedding
            "max_distance": 1280,  # maximum distance for relative position embedding
            "gru_rel_pos": False,  # apply gated relative position embedding
            # quantizer
            "quant_n": 1024,  # codebook number in quantizer
            "quant_dim": 256,  # codebook dimension in quantizer
        }
        # Assign in declaration order so that ``__dict__`` keeps the same layout.
        for name, value in defaults.items():
            setattr(self, name, value)

        if cfg is None:
            return
        self.update(cfg)

    def update(self, cfg: dict):
        """Copy every key of ``cfg`` onto this object as an attribute."""
        vars(self).update(cfg)
70
+
71
+
72
class Tokenizers(nn.Module):
    """Acoustic tokenizer: fbank -> patch embedding -> encoder -> quantizer.

    ``extract_labels`` is the entry point; it returns the indices produced
    by the vector quantizer for a batch of waveforms.
    """

    def __init__(
            self,
            cfg: TokenizersConfig,
    ) -> None:
        super().__init__()
        logger.info(f"Tokenizers Config: {cfg.__dict__}")

        self.cfg = cfg
        self.embed = cfg.embed_dim

        # A linear bridge is only created when the patch-embedding width
        # differs from the encoder width.
        if self.embed != cfg.encoder_embed_dim:
            self.post_extract_proj = nn.Linear(self.embed, cfg.encoder_embed_dim)
        else:
            self.post_extract_proj = None

        # Non-overlapping patches: kernel size and stride are the same value.
        self.input_patch_size = cfg.input_patch_size
        self.patch_embedding = nn.Conv2d(
            1,
            self.embed,
            kernel_size=self.input_patch_size,
            stride=self.input_patch_size,
            bias=cfg.conv_bias,
        )

        self.dropout_input = nn.Dropout(cfg.dropout_input)

        # deep_norm and layer_norm_first must not both be enabled.
        assert not (cfg.deep_norm and cfg.layer_norm_first)
        self.encoder = TransformerEncoder(cfg)
        self.layer_norm = LayerNorm(self.embed)

        self.quantize = NormEMAVectorQuantizer(
            n_embed=cfg.quant_n,
            embedding_dim=cfg.quant_dim,
            beta=1.0,
            kmeans_init=True,
            decay=0.99,
        )
        self.quant_n = cfg.quant_n
        # Maps encoder outputs to the quantizer's embedding width.
        self.quantize_layer = nn.Sequential(
            nn.Linear(cfg.encoder_embed_dim, cfg.encoder_embed_dim),
            nn.Tanh(),
            nn.Linear(cfg.encoder_embed_dim, cfg.quant_dim),  # for quantize
        )

    def forward_padding_mask(
            self,
            features: torch.Tensor,
            padding_mask: torch.Tensor,
    ) -> torch.Tensor:
        """Down-sample ``padding_mask`` along dim 1 to ``features.size(1)`` steps.

        Trailing mask entries that do not fill a whole group are dropped; a
        step counts as padding only when every entry of its group is set.
        """
        steps = features.size(1)
        leftover = padding_mask.size(1) % steps
        if leftover > 0:
            padding_mask = padding_mask[:, :-leftover]
        grouped = padding_mask.view(padding_mask.size(0), steps, -1)
        return grouped.all(-1)

    def preprocess(
            self,
            source: torch.Tensor,
            fbank_mean: float = 15.41663,
            fbank_std: float = 6.55582,
    ) -> torch.Tensor:
        """Convert each waveform in ``source`` to a normalised Kaldi fbank.

        Each item is scaled by 2**15 before ``ta_kaldi.fbank`` is applied
        (presumably to move [-1, 1] floats into the 16-bit PCM range; verify
        against the caller). The stacked result is normalised as
        ``(fbank - fbank_mean) / (2 * fbank_std)``.
        """
        per_item = [
            ta_kaldi.fbank(
                waveform.unsqueeze(0) * 2 ** 15,
                num_mel_bins=128,
                sample_frequency=16000,
                frame_length=25,
                frame_shift=10,
            )
            for waveform in source
        ]
        stacked = torch.stack(per_item, dim=0)
        return (stacked - fbank_mean) / (2 * fbank_std)

    def extract_labels(
            self,
            source: torch.Tensor,
            padding_mask: Optional[torch.Tensor] = None,
            fbank_mean: float = 15.41663,
            fbank_std: float = 6.55582,
    ):
        """Return the quantizer indices for a batch of waveforms.

        The padding mask, when given, is reduced twice: once to the fbank
        frame rate and once more to the patch rate.
        """
        fbank = self.preprocess(source, fbank_mean=fbank_mean, fbank_std=fbank_std)

        has_mask = padding_mask is not None
        if has_mask:
            padding_mask = self.forward_padding_mask(fbank, padding_mask)

        # Add a channel axis for the 2-D patch convolution, then flatten the
        # patch grid into a single sequence axis and move channels last.
        patches = self.patch_embedding(fbank.unsqueeze(1))
        patches = patches.reshape(patches.shape[0], patches.shape[1], -1)
        features = self.layer_norm(patches.transpose(1, 2))

        if has_mask:
            padding_mask = self.forward_padding_mask(features, padding_mask)

        if self.post_extract_proj is not None:
            features = self.post_extract_proj(features)

        encoded, _layer_results = self.encoder(
            self.dropout_input(features),
            padding_mask=padding_mask,
        )

        # Only the third element (the indices) of the quantizer output is used.
        _quantized, _embed_loss, embed_ind = self.quantize(self.quantize_layer(encoded))
        return embed_ind
videollama2/model/beats/__init__.py ADDED
File without changes
videollama2/model/beats/backbone.py ADDED
@@ -0,0 +1,783 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
+ # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
+ # Copyright (c) 2022 Microsoft
5
+ # Licensed under The MIT License [see LICENSE for details]
6
+ # Based on fairseq code bases
7
+ # https://github.com/pytorch/fairseq
8
+ # --------------------------------------------------------
9
+
10
+ import math
11
+ import numpy as np
12
+ from typing import Dict, Optional, Tuple
13
+ import torch
14
+ from torch import Tensor, nn
15
+ import torch.nn.functional as F
16
+ from torch.nn import LayerNorm, Parameter
17
+ from .modules import (
18
+ GradMultiply,
19
+ SamePad,
20
+ get_activation_fn,
21
+ GLU_Linear,
22
+ quant_noise,
23
+ )
24
+ from .weight_norm_fix import weight_norm
25
+
26
class TransformerEncoder(nn.Module):
    """Stack of ``TransformerSentenceEncoderLayer`` blocks with a
    convolutional positional embedding added to the input."""

    def __init__(self, args):
        super().__init__()

        self.dropout = args.dropout
        self.embedding_dim = args.encoder_embed_dim

        # Convolutional positional embedding: init -> weight norm -> pad/GELU.
        conv = nn.Conv1d(
            self.embedding_dim,
            self.embedding_dim,
            kernel_size=args.conv_pos,
            padding=args.conv_pos // 2,
            groups=args.conv_pos_groups,
        )
        # Same value as sqrt(4 * (1 - dropout) / (k * d)) with dropout fixed at 0.
        init_std = math.sqrt(4.0 / (args.conv_pos * self.embedding_dim))
        nn.init.normal_(conv.weight, mean=0, std=init_std)
        nn.init.constant_(conv.bias, 0)
        conv = weight_norm(conv, name="weight", dim=2)
        self.pos_conv = nn.Sequential(conv, SamePad(args.conv_pos), nn.GELU())

        # Relative position settings are optional on ``args``.
        if not hasattr(args, "relative_position_embedding"):
            self.relative_position_embedding = False
            self.num_buckets = 0
            self.max_distance = 0
        else:
            self.relative_position_embedding = args.relative_position_embedding
            self.num_buckets = args.num_buckets
            self.max_distance = args.max_distance

        layer_kwargs = dict(
            embedding_dim=self.embedding_dim,
            ffn_embedding_dim=args.encoder_ffn_embed_dim,
            num_attention_heads=args.encoder_attention_heads,
            dropout=self.dropout,
            attention_dropout=args.attention_dropout,
            activation_dropout=args.activation_dropout,
            activation_fn=args.activation_fn,
            layer_norm_first=args.layer_norm_first,
            deep_norm=args.deep_norm,
            has_relative_attention_bias=self.relative_position_embedding,
            num_buckets=self.num_buckets,
            max_distance=self.max_distance,
            gru_rel_pos=args.gru_rel_pos,
            encoder_layers=args.encoder_layers,
        )
        self.layers = nn.ModuleList(
            [TransformerSentenceEncoderLayer(**layer_kwargs) for _ in range(args.encoder_layers)]
        )

        # Every layer after the first re-uses layer 0's relative attention bias.
        if self.relative_position_embedding:
            for idx in range(1, args.encoder_layers):
                attention = self.layers[idx].self_attn
                del attention.relative_attention_bias
                attention.relative_attention_bias = self.layers[0].self_attn.relative_attention_bias

        self.layer_norm_first = args.layer_norm_first
        self.layer_norm = LayerNorm(self.embedding_dim)
        self.layerdrop = args.encoder_layerdrop

        if args.deep_norm:
            # DeepNorm re-initialisation; the call order below is kept fixed
            # so the random stream is consumed exactly as before.
            beta = math.pow(8 * args.encoder_layers, -1 / 4)
            for idx in range(args.encoder_layers):
                block = self.layers[idx]
                attention = block.self_attn
                for weight, gain in (
                    (attention.k_proj.weight, 1),
                    (attention.v_proj.weight, beta),
                    (attention.q_proj.weight, 1),
                    (attention.out_proj.weight, beta),
                    (block.fc1.weight, beta),
                    (block.fc2.weight, beta),
                ):
                    nn.init.xavier_normal_(weight, gain=gain)

        self.layer_wise_gradient_decay_ratio = getattr(args, "layer_wise_gradient_decay_ratio", 1)

    def forward(self, x, padding_mask=None, layer=None):
        """Encode ``x``; returns ``(output, layer_results)``.

        The final layer norm is applied only in pre-norm mode
        (``layer_norm_first``) and only when no target ``layer`` is given.
        """
        x, layer_results = self.extract_features(x, padding_mask, layer)

        if layer is None and self.layer_norm_first:
            x = self.layer_norm(x)

        return x, layer_results

    def extract_features(self, x, padding_mask=None, tgt_layer=None):
        """Run the layer stack and return ``(x, layer_results)``.

        NOTE: positions selected by ``padding_mask`` are zeroed in ``x``
        in place. ``layer_results`` is only populated when ``tgt_layer`` is
        given; iteration stops after the layer whose index equals it.
        """
        if padding_mask is not None:
            x[padding_mask] = 0

        # The positional conv works channels-first, hence the transposes.
        positional = self.pos_conv(x.transpose(1, 2)).transpose(1, 2)
        x = x + positional

        if not self.layer_norm_first:
            x = self.layer_norm(x)

        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        collect = tgt_layer is not None
        layer_results = []
        z = None
        if collect:
            layer_results.append((x, z))

        selected = None
        pos_bias = None
        for idx, block in enumerate(self.layers):
            if self.layer_wise_gradient_decay_ratio != 1.0:
                x = GradMultiply.apply(x, self.layer_wise_gradient_decay_ratio)
            # LayerDrop: a random draw is made every iteration, but a layer
            # can only be skipped while training.
            draw = np.random.random()
            if (not self.training) or draw > self.layerdrop:
                x, z, pos_bias = block(
                    x,
                    self_attn_padding_mask=padding_mask,
                    need_weights=False,
                    pos_bias=pos_bias,
                )
            if collect:
                layer_results.append((x, z))
            if idx == tgt_layer:
                selected = x
                break

        if selected is not None:
            x = selected

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        return x, layer_results
151
+
152
+
153
class TransformerSentenceEncoderLayer(nn.Module):
    """One transformer encoder block: self-attention followed by a
    feed-forward network, in either pre-norm (``layer_norm_first``) or
    post-norm arrangement. In post-norm mode the residual is scaled by
    ``deep_norm_alpha`` (1 unless ``deep_norm`` is enabled).
    """

    def __init__(
            self,
            embedding_dim: int = 768,
            ffn_embedding_dim: int = 3072,
            num_attention_heads: int = 8,
            dropout: float = 0.1,
            attention_dropout: float = 0.1,
            activation_dropout: float = 0.1,
            activation_fn: str = "relu",
            layer_norm_first: bool = False,
            deep_norm: bool = False,
            has_relative_attention_bias: bool = False,
            num_buckets: int = 0,
            max_distance: int = 0,
            rescale_init: bool = False,
            gru_rel_pos: bool = False,
            encoder_layers: int = 0,
    ) -> None:

        super().__init__()
        self.embedding_dim = embedding_dim
        self.dropout = dropout
        self.activation_dropout = activation_dropout

        self.activation_name = activation_fn
        self.activation_fn = get_activation_fn(activation_fn)
        self.self_attn = MultiheadAttention(
            self.embedding_dim,
            num_attention_heads,
            dropout=attention_dropout,
            self_attention=True,
            has_relative_attention_bias=has_relative_attention_bias,
            num_buckets=num_buckets,
            max_distance=max_distance,
            rescale_init=rescale_init,
            gru_rel_pos=gru_rel_pos,
        )

        self.dropout1 = nn.Dropout(dropout)  # after self-attention
        self.dropout2 = nn.Dropout(self.activation_dropout)  # after fc1 / activation
        self.dropout3 = nn.Dropout(dropout)  # after fc2

        self.layer_norm_first = layer_norm_first

        self.self_attn_layer_norm = LayerNorm(self.embedding_dim)

        # With "glu" the activation is part of fc1 itself (see forward path).
        if self.activation_name == "glu":
            self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim, "swish")
        else:
            self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
        self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)

        self.final_layer_norm = LayerNorm(self.embedding_dim)

        self.deep_norm = deep_norm
        if self.deep_norm:
            self.deep_norm_alpha = math.pow(2 * encoder_layers, 1 / 4)
        else:
            self.deep_norm_alpha = 1

    def _feed_forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply fc1 (+ activation), dropout, fc2 and dropout to ``x``."""
        if self.activation_name == "glu":
            x = self.fc1(x)
        else:
            x = self.activation_fn(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        return self.dropout3(x)

    def forward(
            self,
            x: torch.Tensor,
            self_attn_mask: Optional[torch.Tensor] = None,
            self_attn_padding_mask: Optional[torch.Tensor] = None,
            need_weights: bool = False,
            pos_bias=None
    ):
        """Run the block on ``x`` and return ``(x, attn, pos_bias)``.

        Args:
            x: input sequence (the enclosing encoder passes it time-major).
            self_attn_mask: optional attention mask forwarded as ``attn_mask``.
            self_attn_padding_mask: optional mask forwarded as
                ``key_padding_mask``.
            need_weights: forwarded to the attention module in both the
                pre-norm and post-norm paths; when False ``attn`` is
                whatever the attention module returns for that setting.
            pos_bias: position bias passed to, and returned (possibly newly
                computed) from, the attention module.
        """
        residual = x

        if self.layer_norm_first:
            # Pre-norm: normalise, transform, then add the plain residual.
            x = self.self_attn_layer_norm(x)
            x, attn, pos_bias = self.self_attn(
                query=x,
                key=x,
                value=x,
                key_padding_mask=self_attn_padding_mask,
                # Previously hard-coded to False here, which silently ignored
                # the caller's request; now consistent with the other branch.
                need_weights=need_weights,
                attn_mask=self_attn_mask,
                position_bias=pos_bias
            )
            x = self.dropout1(x)
            x = residual + x

            residual = x
            x = self.final_layer_norm(x)
            x = self._feed_forward(x)
            x = residual + x
        else:
            # Post-norm: transform, add the (alpha-scaled) residual, normalise.
            x, attn, pos_bias = self.self_attn(
                query=x,
                key=x,
                value=x,
                key_padding_mask=self_attn_padding_mask,
                need_weights=need_weights,
                attn_mask=self_attn_mask,
                position_bias=pos_bias
            )

            x = self.dropout1(x)
            x = residual * self.deep_norm_alpha + x

            x = self.self_attn_layer_norm(x)

            residual = x
            x = self._feed_forward(x)
            x = residual * self.deep_norm_alpha + x
            x = self.final_layer_norm(x)

        return x, attn, pos_bias
276
+
277
+
278
+ class MultiheadAttention(nn.Module):
279
+ """Multi-headed attention.
280
+
281
+ See "Attention Is All You Need" for more details.
282
+ """
283
+
284
def __init__(
        self,
        embed_dim,
        num_heads,
        kdim=None,
        vdim=None,
        dropout=0.0,
        bias=True,
        add_bias_kv=False,
        add_zero_attn=False,
        self_attention=False,
        encoder_decoder_attention=False,
        q_noise=0.0,
        qn_block_size=8,
        has_relative_attention_bias=False,
        num_buckets=32,
        max_distance=128,
        gru_rel_pos=False,
        rescale_init=False,
):
    """Build the projections and optional relative-position components.

    ``kdim`` / ``vdim`` fall back to ``embed_dim`` when not given. With
    ``rescale_init`` the key projection is created without a bias term.
    """
    super().__init__()
    self.embed_dim = embed_dim
    self.kdim = embed_dim if kdim is None else kdim
    self.vdim = embed_dim if vdim is None else vdim
    self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

    self.num_heads = num_heads
    self.dropout_module = nn.Dropout(dropout)

    self.has_relative_attention_bias = has_relative_attention_bias
    self.num_buckets = num_buckets
    self.max_distance = max_distance
    if self.has_relative_attention_bias:
        # One learned bias per (bucket, head).
        self.relative_attention_bias = nn.Embedding(num_buckets, num_heads)

    self.head_dim = embed_dim // num_heads
    self.q_head_dim = self.head_dim
    self.k_head_dim = self.head_dim
    assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim ** -0.5

    self.self_attention = self_attention
    self.encoder_decoder_attention = encoder_decoder_attention

    assert (
        not self.self_attention or self.qkv_same_dim
    ), "Self-attention requires query, key and value to be of the same size"

    # The key projection drops its bias when rescale_init is requested.
    k_bias = not rescale_init

    def _wrap(layer):
        # Every projection goes through the same quant-noise wrapper.
        return quant_noise(layer, q_noise, qn_block_size)

    # Creation order (k, v, q, out) is kept so default init draws match.
    self.k_proj = _wrap(nn.Linear(self.kdim, embed_dim, bias=k_bias))
    self.v_proj = _wrap(nn.Linear(self.vdim, embed_dim, bias=bias))
    self.q_proj = _wrap(nn.Linear(embed_dim, embed_dim, bias=bias))

    self.out_proj = _wrap(nn.Linear(embed_dim, embed_dim, bias=bias))

    if not add_bias_kv:
        self.bias_k = self.bias_v = None
    else:
        self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
        self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))

    self.add_zero_attn = add_zero_attn

    self.gru_rel_pos = gru_rel_pos
    if self.gru_rel_pos:
        # Parameters for the gated relative position bias (see forward).
        self.grep_linear = nn.Linear(self.q_head_dim, 8)
        self.grep_a = nn.Parameter(torch.ones(1, num_heads, 1, 1))

    self.reset_parameters()
369
+
370
def reset_parameters(self):
    """(Re-)initialise projection weights, biases and the relative bias table."""
    # Empirically observed the convergence to be much better with the
    # scaled initialization when q/k/v share one dimension; otherwise the
    # default Xavier gain of 1.0 is used.
    qkv_gain = 1 / math.sqrt(2) if self.qkv_same_dim else 1.0
    for projection in (self.k_proj, self.v_proj, self.q_proj):
        nn.init.xavier_uniform_(projection.weight, gain=qkv_gain)

    nn.init.xavier_uniform_(self.out_proj.weight)
    if self.out_proj.bias is not None:
        nn.init.constant_(self.out_proj.bias, 0.0)

    for extra_bias in (self.bias_k, self.bias_v):
        if extra_bias is not None:
            nn.init.xavier_normal_(extra_bias)

    if self.has_relative_attention_bias:
        nn.init.xavier_normal_(self.relative_attention_bias.weight)
391
+
392
+ def _relative_positions_bucket(self, relative_positions, bidirectional=True):
393
+ num_buckets = self.num_buckets
394
+ max_distance = self.max_distance
395
+ relative_buckets = 0
396
+
397
+ if bidirectional:
398
+ num_buckets = num_buckets // 2
399
+ relative_buckets += (relative_positions > 0).to(torch.long) * num_buckets
400
+ relative_positions = torch.abs(relative_positions)
401
+ else:
402
+ relative_positions = -torch.min(relative_positions, torch.zeros_like(relative_positions))
403
+
404
+ max_exact = num_buckets // 2
405
+ is_small = relative_positions < max_exact
406
+
407
+ relative_postion_if_large = max_exact + (
408
+ torch.log(relative_positions.float() / max_exact)
409
+ / math.log(max_distance / max_exact)
410
+ * (num_buckets - max_exact)
411
+ ).to(torch.long)
412
+ relative_postion_if_large = torch.min(
413
+ relative_postion_if_large, torch.full_like(relative_postion_if_large, num_buckets - 1)
414
+ )
415
+
416
+ relative_buckets += torch.where(is_small, relative_positions, relative_postion_if_large)
417
+ return relative_buckets
418
+
419
def compute_bias(self, query_length, key_length):
    """Look up the relative attention bias for every (query, key) pair.

    The bucket indices are built from ``key_index - query_index`` and the
    embedding lookup result is permuted so that its last axis comes first.
    """
    query_index = torch.arange(query_length, dtype=torch.long)[:, None]
    key_index = torch.arange(key_length, dtype=torch.long)[None, :]
    buckets = self._relative_positions_bucket(
        key_index - query_index,
        bidirectional=True
    )
    # The indices are built on the default device; move them next to the table.
    buckets = buckets.to(self.relative_attention_bias.weight.device)
    return self.relative_attention_bias(buckets).permute([2, 0, 1])
431
+
432
def forward(
        self,
        query,
        key: Optional[Tensor],
        value: Optional[Tensor],
        key_padding_mask: Optional[Tensor] = None,
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
        need_weights: bool = True,
        static_kv: bool = False,
        attn_mask: Optional[Tensor] = None,
        before_softmax: bool = False,
        need_head_weights: bool = False,
        position_bias: Optional[Tensor] = None
) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
    """Input shape: Time x Batch x Channel

    Args:
        key_padding_mask (ByteTensor, optional): mask to exclude
            keys that are pads, of shape `(batch, src_len)`, where
            padding elements are indicated by 1s.
        need_weights (bool, optional): return the attention weights,
            averaged over heads (default: True).
        attn_mask (ByteTensor, optional): typically used to
            implement causal attention, where the mask prevents the
            attention from looking forward in time (default: None).
        before_softmax (bool, optional): return the raw attention
            weights and values before the attention softmax.
        need_head_weights (bool, optional): return the attention
            weights for each head. Implies *need_weights*. Default:
            return the average attention weights over all heads.
        position_bias (Tensor, optional): relative position bias to add
            to the attention logits; computed here when the module has a
            relative attention bias and none is supplied.

    Returns:
        ``(attn, attn_weights, position_bias)``; ``attn_weights`` is None
        unless ``need_weights`` is set. With ``before_softmax`` the raw
        logits and projected values are returned in the first two slots.
    """
    if need_head_weights:
        need_weights = True

    is_tpu = query.device.type == "xla"

    tgt_len, bsz, embed_dim = query.size()
    src_len = tgt_len
    assert embed_dim == self.embed_dim
    assert list(query.size()) == [tgt_len, bsz, embed_dim]
    if key is not None:
        src_len, key_bsz, _ = key.size()
        if not torch.jit.is_scripting():
            assert key_bsz == bsz
            assert value is not None
            # The previous form `assert src_len, bsz == value.shape[:2]`
            # only tested the truthiness of src_len (the comparison was the
            # assert *message*), so the value shape was never validated.
            assert value.size(0) == src_len and value.size(1) == bsz, (
                "value must have the same (src_len, batch) leading dimensions as key"
            )

    if self.has_relative_attention_bias and position_bias is None:
        position_bias = self.compute_bias(tgt_len, src_len)
        # Replicate the per-head bias for every batch element.
        position_bias = position_bias.unsqueeze(0).repeat(bsz, 1, 1, 1).view(bsz * self.num_heads, tgt_len, src_len)

    if incremental_state is not None:
        saved_state = self._get_input_buffer(incremental_state)
        if saved_state is not None and "prev_key" in saved_state:
            # previous time steps are cached - no need to recompute
            # key and value if they are static
            if static_kv:
                assert self.encoder_decoder_attention and not self.self_attention
                key = value = None
    else:
        saved_state = None

    if self.self_attention:
        q = self.q_proj(query)
        k = self.k_proj(query)
        v = self.v_proj(query)
    elif self.encoder_decoder_attention:
        # encoder-decoder attention
        q = self.q_proj(query)
        if key is None:
            assert value is None
            k = v = None
        else:
            k = self.k_proj(key)
            v = self.v_proj(key)

    else:
        assert key is not None and value is not None
        q = self.q_proj(query)
        k = self.k_proj(key)
        v = self.v_proj(value)
    q *= self.scaling
    # q is additionally scaled down by alpha; the logits are multiplied back
    # by alpha after their row maximum is subtracted (below). Presumably a
    # numerical-stability measure for low-precision matmuls; confirm upstream.
    alpha = 32
    q *= 1 / alpha

    if self.bias_k is not None:
        assert self.bias_v is not None
        k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
        v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
        if attn_mask is not None:
            attn_mask = torch.cat(
                [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
            )
        if key_padding_mask is not None:
            key_padding_mask = torch.cat(
                [
                    key_padding_mask,
                    key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
                ],
                dim=1,
            )

    # Fold the heads into the batch axis: (len, bsz * heads, head_dim) -> batch first.
    q = (
        q.contiguous()
            .view(tgt_len, bsz * self.num_heads, self.q_head_dim)
            .transpose(0, 1)
    )
    if k is not None:
        k = (
            k.contiguous()
                .view(-1, bsz * self.num_heads, self.k_head_dim)
                .transpose(0, 1)
        )
    if v is not None:
        v = (
            v.contiguous()
                .view(-1, bsz * self.num_heads, self.head_dim)
                .transpose(0, 1)
        )

    if saved_state is not None:
        # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
        if "prev_key" in saved_state:
            _prev_key = saved_state["prev_key"]
            assert _prev_key is not None
            prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
            if static_kv:
                k = prev_key
            else:
                assert k is not None
                k = torch.cat([prev_key, k], dim=1)
            src_len = k.size(1)
        if "prev_value" in saved_state:
            _prev_value = saved_state["prev_value"]
            assert _prev_value is not None
            prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
            if static_kv:
                v = prev_value
            else:
                assert v is not None
                v = torch.cat([prev_value, v], dim=1)
        prev_key_padding_mask: Optional[Tensor] = None
        if "prev_key_padding_mask" in saved_state:
            prev_key_padding_mask = saved_state["prev_key_padding_mask"]
        assert k is not None and v is not None
        key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
            key_padding_mask=key_padding_mask,
            prev_key_padding_mask=prev_key_padding_mask,
            batch_size=bsz,
            src_len=k.size(1),
            static_kv=static_kv,
        )

        saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
        saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
        saved_state["prev_key_padding_mask"] = key_padding_mask
        # In this branch incremental_state is never None
        assert incremental_state is not None
        incremental_state = self._set_input_buffer(incremental_state, saved_state)
    assert k is not None
    assert k.size(1) == src_len

    # This is part of a workaround to get around fork/join parallelism
    # not supporting Optional types.
    if key_padding_mask is not None and key_padding_mask.dim() == 0:
        key_padding_mask = None

    if key_padding_mask is not None:
        assert key_padding_mask.size(0) == bsz
        assert key_padding_mask.size(1) == src_len

    if self.add_zero_attn:
        assert v is not None
        src_len += 1
        k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
        v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
        if attn_mask is not None:
            attn_mask = torch.cat(
                [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
            )
        if key_padding_mask is not None:
            key_padding_mask = torch.cat(
                [
                    key_padding_mask,
                    torch.zeros(key_padding_mask.size(0), 1).type_as(
                        key_padding_mask
                    ),
                ],
                dim=1,
            )

    attn_weights = torch.bmm(q, k.transpose(1, 2))
    # Subtract the row maximum before undoing the 1/alpha scaling of q.
    attn_weights = (attn_weights - attn_weights.max(dim=-1, keepdim=True)[0]) * alpha
    attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)

    assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

    if attn_mask is not None:
        attn_mask = attn_mask.unsqueeze(0)
        attn_weights += attn_mask

    if key_padding_mask is not None:
        # don't attend to padding symbols
        attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
        if not is_tpu:
            attn_weights = attn_weights.masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
                float("-inf"),
            )
        else:
            attn_weights = attn_weights.transpose(0, 2)
            attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf"))
            attn_weights = attn_weights.transpose(0, 2)
        attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

    if before_softmax:
        return attn_weights, v, position_bias

    if position_bias is not None:
        attn_mask_rel_pos = position_bias
        if self.gru_rel_pos == 1:
            # Gated relative position bias: undo both scalings of q before
            # feeding it to the gate projection.
            query_layer = q.view(bsz, self.num_heads, tgt_len, self.q_head_dim) * alpha / self.scaling
            _B, _H, _L, __ = query_layer.size()
            gate_a, gate_b = torch.sigmoid(self.grep_linear(query_layer).view(
                _B, _H, _L, 2, 4).sum(-1, keepdim=False)).chunk(2, dim=-1)
            gate_a_1 = gate_a * (gate_b * self.grep_a - 1.0) + 2.0
            attn_mask_rel_pos = gate_a_1.view(bsz * self.num_heads, tgt_len, 1) * position_bias

        attn_mask_rel_pos = attn_mask_rel_pos.view(attn_weights.size())

        attn_weights = attn_weights + attn_mask_rel_pos

    attn_weights_float = F.softmax(
        attn_weights, dim=-1
    )
    attn_weights = attn_weights_float.type_as(attn_weights)
    attn_probs = self.dropout_module(attn_weights)

    assert v is not None
    attn = torch.bmm(attn_probs, v)
    assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
    # Unfold the heads back into the channel axis: (tgt_len, bsz, embed_dim).
    attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
    attn = self.out_proj(attn)
    attn_weights: Optional[Tensor] = None
    if need_weights:
        attn_weights = attn_weights_float.view(
            bsz, self.num_heads, tgt_len, src_len
        ).transpose(1, 0)
        if not need_head_weights:
            # average attention weights over heads
            attn_weights = attn_weights.mean(dim=0)

    return attn, attn_weights, position_bias
685
+
686
+ @staticmethod
687
+ def _append_prev_key_padding_mask(
688
+ key_padding_mask: Optional[Tensor],
689
+ prev_key_padding_mask: Optional[Tensor],
690
+ batch_size: int,
691
+ src_len: int,
692
+ static_kv: bool,
693
+ ) -> Optional[Tensor]:
694
+ # saved key padding masks have shape (bsz, seq_len)
695
+ if prev_key_padding_mask is not None and static_kv:
696
+ new_key_padding_mask = prev_key_padding_mask
697
+ elif prev_key_padding_mask is not None and key_padding_mask is not None:
698
+ new_key_padding_mask = torch.cat(
699
+ [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1
700
+ )
701
+ # During incremental decoding, as the padding token enters and
702
+ # leaves the frame, there will be a time when prev or current
703
+ # is None
704
+ elif prev_key_padding_mask is not None:
705
+ if src_len > prev_key_padding_mask.size(1):
706
+ filler = torch.zeros(
707
+ (batch_size, src_len - prev_key_padding_mask.size(1)),
708
+ device=prev_key_padding_mask.device,
709
+ )
710
+ new_key_padding_mask = torch.cat(
711
+ [prev_key_padding_mask.float(), filler.float()], dim=1
712
+ )
713
+ else:
714
+ new_key_padding_mask = prev_key_padding_mask.float()
715
+ elif key_padding_mask is not None:
716
+ if src_len > key_padding_mask.size(1):
717
+ filler = torch.zeros(
718
+ (batch_size, src_len - key_padding_mask.size(1)),
719
+ device=key_padding_mask.device,
720
+ )
721
+ new_key_padding_mask = torch.cat(
722
+ [filler.float(), key_padding_mask.float()], dim=1
723
+ )
724
+ else:
725
+ new_key_padding_mask = key_padding_mask.float()
726
+ else:
727
+ new_key_padding_mask = prev_key_padding_mask
728
+ return new_key_padding_mask
729
+
730
+ def _get_input_buffer(
731
+ self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
732
+ ) -> Dict[str, Optional[Tensor]]:
733
+ result = self.get_incremental_state(incremental_state, "attn_state")
734
+ if result is not None:
735
+ return result
736
+ else:
737
+ empty_result: Dict[str, Optional[Tensor]] = {}
738
+ return empty_result
739
+
740
+ def _set_input_buffer(
741
+ self,
742
+ incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
743
+ buffer: Dict[str, Optional[Tensor]],
744
+ ):
745
+ return self.set_incremental_state(incremental_state, "attn_state", buffer)
746
+
747
def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int):
    """Return ``attn_weights`` untouched; the length and batch arguments are not used here."""
    return attn_weights
749
+
750
+
751
def init_bert_params(module):
    """
    Initialize the weights specific to the BERT Model.

    Depending on the type of ``module``:
        1. ``nn.Linear``: the weight is drawn from a normal distribution
           (mean 0.0, std 0.02) and the bias, if present, is zeroed.
        2. ``nn.Embedding``: the weight is drawn from the same normal
           distribution and the ``padding_idx`` row, if set, is zeroed.
        3. ``MultiheadAttention``: the weights of ``q_proj``, ``k_proj``
           and ``v_proj`` are drawn from the same normal distribution.
    """

    def normal_(data):
        # with FSDP, module params will be on CUDA, so we cast them back to CPU
        # so that the RNG is consistent with and without FSDP
        sampled = data.cpu().normal_(mean=0.0, std=0.02)
        data.copy_(sampled.to(data.device))

    if isinstance(module, nn.Linear):
        normal_(module.weight.data)
        if module.bias is not None:
            module.bias.data.zero_()
    if isinstance(module, nn.Embedding):
        normal_(module.weight.data)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    if isinstance(module, MultiheadAttention):
        for projection in (module.q_proj, module.k_proj, module.v_proj):
            normal_(projection.weight.data)
videollama2/model/beats/modules.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
+ # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
+ # Copyright (c) 2022 Microsoft
5
+ # Licensed under The MIT License [see LICENSE for details]
6
+ # Based on fairseq code bases
7
+ # https://github.com/pytorch/fairseq
8
+ # --------------------------------------------------------
9
+
10
+ import math
11
+ import warnings
12
+ import torch
13
+ from torch import Tensor, nn
14
+ import torch.nn.functional as F
15
+
16
+
17
class GradMultiply(torch.autograd.Function):
    """Autograd function that multiplies the gradient by ``scale`` on the way back."""

    @staticmethod
    def forward(ctx, x, scale):
        # Remember the factor for backward(); the output is built from x
        # through the legacy ``Tensor.new`` constructor.
        ctx.scale = scale
        return x.new(x)

    @staticmethod
    def backward(ctx, grad):
        scaled_grad = grad * ctx.scale
        # One return value per forward() input; ``scale`` gets no gradient.
        return scaled_grad, None
27
+
28
+
29
class SamePad(nn.Module):
    """Trim trailing positions from the last axis of a 3-D tensor.

    ``remove`` is ``kernel_size - 1`` when ``causal`` is set; otherwise it
    is 1 for an even ``kernel_size`` and 0 for an odd one.
    """

    def __init__(self, kernel_size, causal=False):
        super().__init__()
        self.remove = kernel_size - 1 if causal else int(kernel_size % 2 == 0)

    def forward(self, x):
        if self.remove <= 0:
            return x
        return x[:, :, : -self.remove]
41
+
42
+
43
class Swish(nn.Module):
    """Swish activation: ``x * sigmoid(x)``."""

    def __init__(self):
        super().__init__()
        self.act = nn.Sigmoid()

    def forward(self, x):
        gate = self.act(x)
        return x * gate
50
+
51
+
52
class GLU_Linear(nn.Module):
    """Linear projection to ``2 * output_dim`` features followed by a gated product.

    The projection is split along the last dimension into a value half and
    a gate half. For ``glu_type == "bilinear"`` the two halves are simply
    multiplied; for "sigmoid", "swish", "relu" and "gelu" the gate half is
    passed through the matching activation first.

    Args:
        input_dim: size of the last dimension of the input.
        output_dim: size of the last dimension of the output.
        glu_type: gate type, one of the names listed above.
        bias_in_glu: whether the linear projection has a bias term.
    """

    def __init__(self, input_dim, output_dim, glu_type="sigmoid", bias_in_glu=True):
        super(GLU_Linear, self).__init__()

        self.glu_type = glu_type
        self.output_dim = output_dim

        # "bilinear" needs no activation, so no ``glu_act`` is created for it.
        if glu_type == "sigmoid":
            self.glu_act = torch.nn.Sigmoid()
        elif glu_type == "swish":
            self.glu_act = Swish()
        elif glu_type == "relu":
            self.glu_act = torch.nn.ReLU()
        elif glu_type == "gelu":
            self.glu_act = torch.nn.GELU()

        self.linear = nn.Linear(input_dim, output_dim * 2, bool(bias_in_glu))

    def forward(self, x):
        # The channel (#dim) axis is always the last one, so a 1D-Conv style
        # layout has to be transposed by the caller first. Slicing with
        # ``...`` keeps this correct for inputs of any rank, not only 3-D.
        x = self.linear(x)
        value = x[..., : self.output_dim]
        gate = x[..., self.output_dim : self.output_dim * 2]

        if self.glu_type == "bilinear":
            return value * gate
        return value * self.glu_act(gate)
83
+
84
+
85
def gelu_accurate(x):
    """tanh-based GELU formula: ``0.5 * x * (1 + tanh(a * (x + 0.044715 * x^3)))``.

    The constant ``a = sqrt(2 / pi)`` is computed on first use and cached as
    the function attribute ``gelu_accurate._a``.
    """
    coeff = getattr(gelu_accurate, "_a", None)
    if coeff is None:
        coeff = math.sqrt(2 / math.pi)
        gelu_accurate._a = coeff
    inner = gelu_accurate._a * (x + 0.044715 * torch.pow(x, 3))
    return 0.5 * x * (1 + torch.tanh(inner))
91
+
92
+
93
def gelu(x: torch.Tensor) -> torch.Tensor:
    """Apply GELU in float32 and cast the result back to the dtype of ``x``."""
    activated = torch.nn.functional.gelu(x.float())
    return activated.type_as(x)
95
+
96
+
97
def get_activation_fn(activation: str):
    """Look up the activation callable registered under the name `activation`.

    "gelu_fast" is a deprecated alias of "gelu_accurate" and emits a warning.
    "linear" and "glu" both map to an identity function. Any other unknown
    name raises ``RuntimeError``.
    """
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return gelu
    if activation in ("gelu_fast", "gelu_accurate"):
        if activation == "gelu_fast":
            warnings.warn(
                "--activation-fn=gelu_fast has been renamed to gelu_accurate"
            )
        return gelu_accurate
    if activation == "tanh":
        return torch.tanh
    if activation in ("linear", "glu"):
        return lambda x: x
    raise RuntimeError("--activation-fn {} not supported".format(activation))
119
+
120
+
121
def quant_noise(module, p, block_size):
    """
    Wraps modules and applies quantization noise to the weights for
    subsequent quantization with Iterative Product Quantization as
    described in "Training with Quantization Noise for Extreme Model Compression"

    Args:
        - module: nn.Module
        - p: amount of Quantization Noise
        - block_size: size of the blocks for subsequent quantization with iPQ

    Remarks:
        - Module weights must have the right sizes wrt the block size
        - Only Linear, Embedding and Conv2d modules are supported for the moment
        - For more detail on how to quantize by blocks with convolutional weights,
          see "And the Bit Goes Down: Revisiting the Quantization of Neural Networks"
        - We implement the simplest form of noise here as stated in the paper
          which consists in randomly dropping blocks
        - The noise is applied by a forward pre-hook, only while the module is
          in training mode; with ``p <= 0`` no hook is registered at all
    """
    # nothing to do without noise: hand the module back untouched
    if p <= 0:
        return module

    # supported modules
    assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d))

    # a 4D weight means a convolution; anything else is treated as a 2D matrix
    is_conv = module.weight.ndim == 4

    # check that the weight sizes are compatible with block_size
    if is_conv:
        if module.kernel_size == (1, 1):
            # 1x1 convolutions
            assert (
                module.in_channels % block_size == 0
            ), "Input channels must be a multiple of block sizes"
        else:
            # regular convolutions
            k = module.kernel_size[0] * module.kernel_size[1]
            assert k % block_size == 0, "Kernel size must be a multiple of block size"
    else:
        assert (
            module.weight.size(1) % block_size == 0
        ), "Input features must be a multiple of block sizes"

    def _forward_pre_hook(mod, input):
        # no noise for evaluation
        if not mod.training:
            return

        weight = mod.weight

        # split the weight into blocks and randomly pick the blocks to drop
        if not is_conv:
            in_features = weight.size(1)
            out_features = weight.size(0)
            mask = torch.zeros(
                in_features // block_size * out_features, device=weight.device
            )
            mask.bernoulli_(p)
            mask = mask.repeat_interleave(block_size, -1).view(-1, in_features)
        elif mod.kernel_size == (1, 1):
            in_channels = mod.in_channels
            out_channels = mod.out_channels
            mask = torch.zeros(
                int(in_channels // block_size * out_channels),
                device=weight.device,
            )
            mask.bernoulli_(p)
            mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels)
        else:
            # one draw per (out, in) pair, repeated over the whole kernel
            mask = torch.zeros(
                weight.size(0), weight.size(1), device=weight.device
            )
            mask.bernoulli_(p)
            mask = mask[:, :, None, None].repeat(
                1, 1, mod.kernel_size[0], mod.kernel_size[1]
            )

        # scale weights and apply mask
        drop = mask.to(torch.bool)  # x.bool() is not currently supported in TorchScript
        s = 1 / (1 - p)
        mod.weight.data = s * weight.masked_fill(drop, 0)

    module.register_forward_pre_hook(_forward_pre_hook)
    return module
videollama2/model/beats/quantizer.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
+ # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
+ # Copyright (c) 2022 Microsoft
5
+ # Licensed under The MIT License [see LICENSE for details]
6
+ # Based on VQGAN code bases
7
+ # https://github.com/CompVis/taming-transformers
8
+ # --------------------------------------------------------
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ import torch.distributed as distributed
14
+
15
+ try:
16
+ from einops import rearrange, repeat
17
+ except ImportError:
18
+ pass
19
+
20
+
21
def l2norm(t):
    """Normalize ``t`` to unit L2 norm along its last dimension."""
    return F.normalize(t, dim=-1, p=2)
23
+
24
+
25
def ema_inplace(moving_avg, new, decay):
    """In-place exponential moving average: ``moving_avg = decay * moving_avg + (1 - decay) * new``."""
    running = moving_avg.data
    running.mul_(decay)
    running.add_(new, alpha=(1 - decay))
27
+
28
+
29
def sample_vectors(samples, num):
    """Pick ``num`` rows of ``samples`` at random.

    When at least ``num`` rows are available they are drawn without
    replacement (prefix of a random permutation); otherwise rows are drawn
    with replacement.
    """
    num_samples, device = samples.shape[0], samples.device

    if num_samples >= num:
        indices = torch.randperm(num_samples, device=device)[:num]
    else:
        indices = torch.randint(0, num_samples, (num,), device=device)

    return samples[indices]


def kmeans(samples, num_clusters, num_iters=10, use_cosine_sim=False):
    """Run ``num_iters`` rounds of k-means over the rows of ``samples``.

    Uses plain torch operations only, so it does not depend on the optional
    einops import at the top of the file.

    Args:
        samples: 2-D tensor with one sample per row.
        num_clusters: number of centroids.
        num_iters: number of assignment/update rounds (must be >= 1, since
            the returned ``bins`` come from the last round).
        use_cosine_sim: assign by largest dot product and L2-normalize the
            updated centroids, instead of assigning by smallest squared
            Euclidean distance.

    Returns:
        ``(means, bins)``: the centroids and the number of samples assigned
        to each centroid in the last round. A centroid that receives no
        sample in a round keeps its previous value.
    """
    dim, dtype = samples.shape[-1], samples.dtype

    means = sample_vectors(samples, num_clusters)

    for _ in range(num_iters):
        if use_cosine_sim:
            dists = samples @ means.t()
        else:
            # (n, 1, d) - (1, c, d) -> pairwise differences of shape (n, c, d)
            diffs = samples.unsqueeze(1) - means.unsqueeze(0)
            dists = -(diffs ** 2).sum(dim=-1)

        buckets = dists.max(dim=-1).indices
        bins = torch.bincount(buckets, minlength=num_clusters)
        zero_mask = bins == 0
        # avoid dividing by zero for empty clusters
        bins_min_clamped = bins.masked_fill(zero_mask, 1)

        # sum the samples of every cluster, then divide by the cluster size
        new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype)
        new_means.index_add_(0, buckets, samples)
        new_means = new_means / bins_min_clamped[..., None]

        if use_cosine_sim:
            new_means = l2norm(new_means)

        means = torch.where(zero_mask[..., None], means, new_means)

    return means, bins
68
+
69
+
70
class EmbeddingEMA(nn.Module):
    """Codebook of ``num_tokens`` embeddings maintained without gradients.

    The codebook (``weight``), the per-code counts (``cluster_size``) and
    the running embedding sums (``embed_avg``) are parameters with
    ``requires_grad=False``. The ``initted`` buffer records whether the
    codebook already holds usable values.

    Initialisation, in order of precedence:
        * ``codebook_init_path`` given: the codebook is loaded from that file;
        * ``kmeans_init`` true: the codebook starts at zero and is filled by
          ``init_embed_`` on the first batch;
        * otherwise: random normal rows, L2-normalized.
    """

    def __init__(self, num_tokens, codebook_dim, decay=0.99, eps=1e-5, kmeans_init=True, codebook_init_path=''):
        super().__init__()
        self.num_tokens = num_tokens
        self.codebook_dim = codebook_dim
        self.decay = decay
        self.eps = eps

        if codebook_init_path != '':
            print(f"load init codebook weight from {codebook_init_path}")
            # NOTE(review): torch.load unpickles the file; only point this at trusted checkpoints.
            weight = torch.load(codebook_init_path, map_location='cpu').clone()
            already_initted = True
        elif kmeans_init:
            weight = torch.zeros(num_tokens, codebook_dim)
            already_initted = False
        else:
            weight = l2norm(torch.randn(num_tokens, codebook_dim))
            already_initted = True
        self.register_buffer('initted', torch.Tensor([already_initted]))

        self.weight = nn.Parameter(weight, requires_grad=False)
        self.cluster_size = nn.Parameter(torch.zeros(num_tokens), requires_grad=False)
        self.embed_avg = nn.Parameter(weight.clone(), requires_grad=False)
        self.update = True

    @torch.jit.ignore
    def init_embed_(self, data):
        """Fill the codebook with k-means centroids of ``data`` unless it is already initialised."""
        if self.initted:
            return
        print("Performing Kemans init for codebook")
        centroids, counts = kmeans(data, self.num_tokens, 10, use_cosine_sim=True)
        self.weight.data.copy_(centroids)
        self.cluster_size.data.copy_(counts)
        self.initted.data.copy_(torch.Tensor([True]))

    def forward(self, embed_id):
        """Look up the codebook rows for the indices in ``embed_id``."""
        return F.embedding(embed_id, self.weight)

    def cluster_size_ema_update(self, new_cluster_size):
        """EMA-update ``cluster_size`` with ``new_cluster_size`` using ``decay``."""
        counts = self.cluster_size.data
        counts.mul_(self.decay)
        counts.add_(new_cluster_size, alpha=1 - self.decay)

    def embed_avg_ema_update(self, new_embed_avg):
        """EMA-update ``embed_avg`` with ``new_embed_avg`` using ``decay``."""
        sums = self.embed_avg.data
        sums.mul_(self.decay)
        sums.add_(new_embed_avg, alpha=1 - self.decay)

    def weight_update(self, num_tokens):
        """Recompute the codebook as ``embed_avg`` divided by the eps-smoothed cluster sizes."""
        n = self.cluster_size.sum()
        smoothed_cluster_size = (
            (self.cluster_size + self.eps) / (n + num_tokens * self.eps) * n
        )
        # normalize embedding average with smoothed cluster size
        self.weight.data.copy_(self.embed_avg / smoothed_cluster_size.unsqueeze(1))
124
+
125
+
126
def norm_ema_inplace(moving_avg, new, decay):
    """EMA-update ``moving_avg`` in place with ``new``, then L2-normalize it along the last dimension."""
    running = moving_avg.data
    running.mul_(decay).add_(new, alpha=(1 - decay))
    running.copy_(l2norm(running))
129
+
130
+
131
class NormEMAVectorQuantizer(nn.Module):
    """Vector quantizer over an L2-normalized codebook updated by EMA.

    Args:
        n_embed: number of codebook entries.
        embedding_dim: size of each codebook entry.
        beta: weight of the commitment loss returned by ``forward``.
        decay: EMA decay for the codebook and the usage statistics.
        eps: smoothing constant forwarded to ``EmbeddingEMA``.
        statistic_code_usage: keep an EMA of how often each code is
            selected in the ``cluster_size`` buffer.
        kmeans_init: forwarded to ``EmbeddingEMA``.
        codebook_init_path: forwarded to ``EmbeddingEMA``.
    """

    def __init__(self, n_embed, embedding_dim, beta, decay=0.99, eps=1e-5,
                 statistic_code_usage=True, kmeans_init=False, codebook_init_path=''):
        super().__init__()
        self.codebook_dim = embedding_dim
        self.num_tokens = n_embed
        self.beta = beta
        self.decay = decay

        self.embedding = EmbeddingEMA(self.num_tokens, self.codebook_dim, decay, eps, kmeans_init, codebook_init_path)

        self.statistic_code_usage = statistic_code_usage
        if statistic_code_usage:
            self.register_buffer('cluster_size', torch.zeros(n_embed))
        # The reduce function is needed by the codebook update too, so it is
        # always defined, whether or not usage statistics are kept.
        if distributed.is_available() and distributed.is_initialized():
            print("ddp is enable, so use ddp_reduce to sync the statistic_code_usage for each gpu!")
            self.all_reduce_fn = distributed.all_reduce
        else:
            self.all_reduce_fn = nn.Identity()

    def reset_cluster_size(self, device):
        """Re-create the ``cluster_size`` buffer as zeros on ``device`` (no-op without usage statistics)."""
        if self.statistic_code_usage:
            self.register_buffer('cluster_size', torch.zeros(self.num_tokens))
            self.cluster_size = self.cluster_size.to(device)

    def forward(self, z):
        """Quantize ``z`` against the codebook.

        ``z`` is L2-normalized along its last dimension and flattened to
        rows of ``codebook_dim`` values; each row is replaced by its nearest
        codebook entry.

        Returns:
            ``(z_q, loss, encoding_indices)``: the quantized tensor with the
            shape of ``z`` (straight-through gradient), the commitment loss
            scaled by ``beta``, and the flat index of the chosen code per row.
        """
        z = l2norm(z)
        z_flattened = z.reshape(-1, self.codebook_dim)

        self.embedding.init_embed_(z_flattened)

        # squared distance |z|^2 + |e|^2 - 2 z.e between every row and every code
        d = z_flattened.pow(2).sum(dim=1, keepdim=True) + \
            self.embedding.weight.pow(2).sum(dim=1) - 2 * \
            torch.einsum('bd,nd->bn', z_flattened, self.embedding.weight)

        encoding_indices = torch.argmin(d, dim=1)

        z_q = self.embedding(encoding_indices).view(z.shape)

        encodings = F.one_hot(encoding_indices, self.num_tokens).type(z.dtype)

        # The ``cluster_size`` buffer only exists when usage statistics are
        # enabled, so every access to it is guarded.
        if not self.training and self.statistic_code_usage:
            with torch.no_grad():
                cluster_size = encodings.sum(0)
                self.all_reduce_fn(cluster_size)
                ema_inplace(self.cluster_size, cluster_size, self.decay)

        if self.training and self.embedding.update:
            # EMA cluster size
            bins = encodings.sum(0)
            self.all_reduce_fn(bins)

            if self.statistic_code_usage:
                ema_inplace(self.cluster_size, bins, self.decay)

            zero_mask = (bins == 0)
            bins = bins.masked_fill(zero_mask, 1.)

            embed_sum = z_flattened.t() @ encodings
            self.all_reduce_fn(embed_sum)

            embed_normalized = (embed_sum / bins.unsqueeze(0)).t()
            embed_normalized = l2norm(embed_normalized)

            # codes that received no sample keep their current embedding
            embed_normalized = torch.where(zero_mask[..., None], self.embedding.weight,
                                           embed_normalized)
            norm_ema_inplace(self.embedding.weight, embed_normalized, self.decay)

        # compute loss for embedding
        loss = self.beta * F.mse_loss(z_q.detach(), z)

        # preserve gradients
        z_q = z + (z_q - z).detach()

        return z_q, loss, encoding_indices
videollama2/model/beats/weight_norm_fix.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ r"""Weight Normalization from https://arxiv.org/abs/1602.07868."""
2
+ from torch.nn.parameter import Parameter, UninitializedParameter
3
+ from torch import norm_except_dim
4
+ from typing import Any, TypeVar
5
+ import warnings
6
+ from torch.nn.modules import Module
7
+ import torch
8
+
9
class WeightNorm:
    """Forward pre-hook that rebuilds ``<name>`` from ``<name>_g`` and ``<name>_v``.

    The direction ``v`` is normalized in float32 and cast back to its
    original dtype before being scaled by the magnitude ``g``.

    ``dim`` is the dimension kept when computing the norm. ``None`` (stored
    as ``-1``) means the norm is taken over the entire tensor.
    """

    name: str
    dim: int

    def __init__(self, name: str, dim: int) -> None:
        if dim is None:
            dim = -1
        self.name = name
        self.dim = dim

    # TODO Make return type more specific
    def compute_weight(self, module: Module) -> Any:
        """Return ``g * v / ||v||`` with the norm taken over every dim except ``self.dim``."""
        g = getattr(module, self.name + '_g')
        v = getattr(module, self.name + '_v')

        input_dtype = v.dtype
        v = v.to(torch.float32)
        reduce_dims = list(range(v.dim()))
        # dim == -1 is the "whole tensor" sentinel (it is what ``g`` was
        # initialised with through norm_except_dim), so nothing is excluded.
        if self.dim != -1:
            reduce_dims.pop(self.dim)
        if reduce_dims:
            variance = v.pow(2).sum(reduce_dims, keepdim=True)
        else:
            # 1-D weight with its only dim kept: each element is its own
            # slice, so do not hand sum() an empty dim list.
            variance = v.pow(2)
        v = v * torch.rsqrt(variance + 1e-6)

        return g * v.to(input_dtype)

    @staticmethod
    def apply(module, name: str, dim: int) -> 'WeightNorm':
        """Replace parameter ``name`` of ``module`` by ``name_g``/``name_v`` and register the hook.

        Raises:
            RuntimeError: a WeightNorm hook for ``name`` is already registered.
            ValueError: the parameter is still uninitialized.
        """
        warnings.warn("torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.")

        for hook in module._forward_pre_hooks.values():
            if isinstance(hook, WeightNorm) and hook.name == name:
                raise RuntimeError(f"Cannot register two weight_norm hooks on the same parameter {name}")

        if dim is None:
            dim = -1

        fn = WeightNorm(name, dim)

        weight = getattr(module, name)
        if isinstance(weight, UninitializedParameter):
            raise ValueError(
                'The module passed to `WeightNorm` can\'t have uninitialized parameters. '
                'Make sure to run the dummy forward before applying weight normalization')
        # remove w from parameter list
        del module._parameters[name]

        # add g and v as new parameters and express w as g/||v|| * v
        module.register_parameter(name + '_g', Parameter(norm_except_dim(weight, 2, dim).data))
        module.register_parameter(name + '_v', Parameter(weight.data))
        setattr(module, name, fn.compute_weight(module))

        # recompute weight before every forward()
        module.register_forward_pre_hook(fn)

        return fn

    def remove(self, module: Module) -> None:
        """Fold ``g`` and ``v`` back into a single parameter ``name`` on ``module``."""
        weight = self.compute_weight(module)
        delattr(module, self.name)
        del module._parameters[self.name + '_g']
        del module._parameters[self.name + '_v']
        setattr(module, self.name, Parameter(weight.data))

    def __call__(self, module: Module, inputs: Any) -> None:
        setattr(module, self.name, self.compute_weight(module))
73
+
74
+
75
T_module = TypeVar('T_module', bound=Module)


def weight_norm(module: T_module, name: str = 'weight', dim: int = 0) -> T_module:
    r"""Apply weight normalization to a parameter in the given module.

    .. math::
         \mathbf{w} = g \dfrac{\mathbf{v}}{\|\mathbf{v}\|}

    The parameter called :attr:`name` (e.g. ``'weight'``) is replaced by a
    magnitude parameter (e.g. ``'weight_g'``) and a direction parameter
    (e.g. ``'weight_v'``). A :class:`WeightNorm` forward pre-hook recomputes
    the weight from those two before every :meth:`~Module.forward` call.

    With the default ``dim=0`` the norm is computed independently per
    output channel/plane; pass ``dim=None`` for a norm over the entire
    weight tensor.

    See https://arxiv.org/abs/1602.07868

    .. warning::

        Calling this emits a warning that ``torch.nn.utils.weight_norm`` is
        deprecated in favor of
        :func:`torch.nn.utils.parametrizations.weight_norm`.

    Args:
        module (Module): containing module
        name (str, optional): name of weight parameter
        dim (int, optional): dimension over which to compute the norm

    Returns:
        The original module with the weight norm hook

    Example::

        >>> m = weight_norm(nn.Linear(20, 40), name='weight')
        >>> m
        Linear(in_features=20, out_features=40, bias=True)
        >>> m.weight_g.size()
        torch.Size([40, 1])
        >>> m.weight_v.size()
        torch.Size([40, 20])

    """
    WeightNorm.apply(module, name, dim)
    return module
139
+
videollama2/model/encoder.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from transformers import (
7
+ CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig,
8
+ SiglipVisionModel, SiglipImageProcessor, SiglipVisionConfig
9
+ )
10
+ from .beats.BEATs import BEATsConfig, BEATs
11
+
12
class CLIPVisionTower(nn.Module):
    """Frozen CLIP vision encoder that exposes one hidden layer as image features.

    Args:
        vision_tower: name or path handed to ``from_pretrained``.
        args: config object; ``mm_vision_select_layer`` picks the hidden
            state to use and the optional ``mm_vision_select_feature``
            (default ``'patch'``) picks which tokens of it are kept.
        delay_load: when true only the config is loaded; call
            ``load_model`` later to load the processor and the weights.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()

        self.is_loaded = False

        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        if not delay_load:
            self.load_model()
        else:
            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self):
        """Load the image processor and the vision model, and freeze the model."""
        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)

        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
        self.vision_tower.requires_grad_(False)

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Pick the configured hidden state; ``'patch'`` drops the first token, ``'cls_patch'`` keeps all."""
        image_features = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            return image_features[:, 1:]
        if self.select_feature == 'cls_patch':
            return image_features
        raise ValueError(f'Unexpected select feature: {self.select_feature}')

    def _encode(self, pixels):
        """Run the tower on a batched tensor and return the selected features in the input dtype."""
        outputs = self.vision_tower(pixels.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
        return self.feature_select(outputs).to(pixels.dtype)

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched tensor, or a list of unbatched images (one result per image)."""
        if isinstance(images, list):
            # each image is encoded on its own with a batch dimension of 1
            return [self._encode(image.unsqueeze(0)) for image in images]
        return self._encode(images)

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # before load_model() only the standalone config is available
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size) ** 2

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def image_size(self):
        return self.config.image_size
94
+
95
+
96
class SiglipVisionTower(nn.Module):
    """Frozen SigLIP vision encoder that exposes one hidden layer as image features.

    Args:
        vision_tower: name or path handed to ``from_pretrained``.
        args: config object; ``mm_vision_select_layer`` picks the hidden
            state to use. The optional ``mm_vision_select_feature``
            (default ``'patch'``) must be ``'patch'``.
        delay_load: when true only the config is loaded; call
            ``load_model`` later to load the processor and the weights.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()

        self.is_loaded = False

        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        if not delay_load:
            self.load_model()
        else:
            self.cfg_only = SiglipVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self):
        """Load the image processor and the vision model, and freeze the model."""
        self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name)

        self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name)
        self.vision_tower.requires_grad_(False)

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Return the configured hidden state unchanged; only ``'patch'`` is accepted."""
        if self.select_feature != 'patch':
            raise ValueError(f'Unexpected select feature: {self.select_feature}')
        return image_forward_outs.hidden_states[self.select_layer]

    def _encode(self, pixels):
        """Run the tower on a batched tensor and return the selected features in the input dtype."""
        outputs = self.vision_tower(pixels.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
        return self.feature_select(outputs).to(pixels.dtype)

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched tensor, or a list of unbatched images (one result per image)."""
        if isinstance(images, list):
            # each image is encoded on its own with a batch dimension of 1
            return [self._encode(image.unsqueeze(0)) for image in images]
        return self._encode(images)

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # before load_model() only the standalone config is available
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size) ** 2

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def image_size(self):
        return self.config.image_size
176
+
177
+
178
def build_vision_tower(vision_tower_cfg, **kwargs):
    """Build the vision tower named by the config.

    The name is read from ``mm_vision_tower``, falling back to
    ``vision_tower``. Names containing ``'clip'`` build a
    ``CLIPVisionTower`` and names containing ``'siglip'`` a
    ``SiglipVisionTower``; ``kwargs`` are forwarded to the constructor.

    Raises:
        ValueError: no tower name is configured, or it matches no known tower.
    """
    vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', None)
    if vision_tower is None:
        vision_tower = getattr(vision_tower_cfg, 'vision_tower', None)
    # A missing name used to crash with a TypeError in the substring test.
    if vision_tower is None:
        raise ValueError(f'Unknown vision tower: {vision_tower}')

    if 'clip' in vision_tower:
        return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
    if 'siglip' in vision_tower:
        return SiglipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
    raise ValueError(f'Unknown vision tower: {vision_tower}')
188
+
189
def build_audio_tower(audio_tower_cfg, delay_load=False, **kwargs):
    """Build the BEATs audio tower and return ``(beats, beats_cfg)``.

    The checkpoint path is read from ``mm_audio_tower``, falling back to
    ``audio_tower``. With ``delay_load`` a default-configured model is
    created and no checkpoint is read. Otherwise the checkpoint is loaded
    on CPU: ``*.bin`` files are treated as a flat state dict whose
    ``model.audio_tower.`` entries are loaded non-strictly, and any other
    file is expected to hold the weights under the ``'model'`` key. The
    result of ``load_state_dict`` is printed.
    """
    audio_tower = getattr(audio_tower_cfg, 'mm_audio_tower', getattr(audio_tower_cfg, 'audio_tower', None))

    if delay_load:
        beats_cfg = BEATsConfig()
        return BEATs(beats_cfg), beats_cfg

    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    beats_checkpoint = torch.load(audio_tower, map_location='cpu')
    beats_cfg = BEATsConfig(beats_checkpoint['cfg']) if 'cfg' in beats_checkpoint else BEATsConfig()
    beats = BEATs(beats_cfg)

    if audio_tower.endswith('.bin'):
        prefix = 'model.audio_tower.'
        # keep only the audio-tower entries and strip the prefix from their keys
        filtered_checkpoint = {
            key[len(prefix):]: value
            for key, value in beats_checkpoint.items()
            if key.startswith(prefix)
        }
        print(beats.load_state_dict(filtered_checkpoint, strict=False))
    else:
        print(beats.load_state_dict(beats_checkpoint['model']))
    return beats, beats_cfg
videollama2/model/mel_filters.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd2cc75e70e36fcbdd8ffbc2499062f30094093e6bf2cbafa9859f59972b420b
3
+ size 2048
videollama2/model/projector.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Alibaba DAMO Academy
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import re
17
+
18
+ import einops
19
+ import torch
20
+ import torch.nn as nn
21
+ import torch.nn.functional as F
22
+ from timm.models.regnet import RegStage
23
+ from timm.models.layers import LayerNorm, LayerNorm2d
24
+ from transformers import TRANSFORMERS_CACHE
25
+
26
+
27
def parse_snapshot_folder(repo_id, cache_dir=None, repo_type="model"):
    """Return the expected local snapshot folder of a cached Hub repository.

    The path is ``<cache_dir>/<repo_type>s--<org>--<name>/snapshots/<revision>``.
    ``<revision>`` is the content of ``refs/main`` when that file exists,
    otherwise the literal ``"main"``. Nothing is downloaded and the returned
    folder is not checked for existence.

    Args:
        repo_id: repository id such as ``"org/name"``.
        cache_dir: cache root; defaults to ``TRANSFORMERS_CACHE``.
        repo_type: repository kind used to build the cache folder name.

    Returns:
        The snapshot folder path as a string.
    """
    revision = "main"
    # 1. locate the repository folder inside the cache
    if cache_dir is None:
        cache_dir = TRANSFORMERS_CACHE
    object_id = repo_id.replace("/", "--")
    repo_cache = os.path.join(cache_dir, f"{repo_type}s--{object_id}")
    # 2. resolve refs (for instance to convert main to the associated commit sha)
    revision_file = os.path.join(repo_cache, "refs", revision)
    if os.path.isfile(revision_file):
        with open(revision_file) as f:
            # Strip surrounding whitespace so a trailing newline in the ref
            # file cannot end up inside the snapshot path.
            revision = f.read().strip()
    # 3. build the snapshot folder path
    return os.path.join(repo_cache, "snapshots", revision)
47
+
48
+
49
def load_mm_projector(model_path, cache_dir=None, token=None):
    """Load ``mm_projector.bin`` from a local folder or a Hub repository.

    Lookup order: ``model_path`` as a local directory, then the cached Hub
    snapshot of ``model_path``, then a fresh download of the repository.

    Args:
        model_path: local directory or Hub repository id.
        cache_dir: Hub cache root, passed to the cache lookup and download.
        token: access token passed to ``snapshot_download``.

    Returns:
        Dict mapping parameter names to tensors cast to ``torch.float16``.
    """
    weights_name = 'mm_projector.bin'

    if os.path.exists(os.path.join(model_path, weights_name)):
        folder = model_path
    else:
        folder = parse_snapshot_folder(model_path, cache_dir=cache_dir, repo_type="model")
        if not os.path.exists(os.path.join(folder, weights_name)):
            # downloading from remote repo
            from huggingface_hub import snapshot_download
            # Use the folder reported by the download: the path computed above
            # was resolved before any ref file existed, so on a fresh cache it
            # still points at ".../snapshots/main" rather than the commit folder.
            folder = snapshot_download(repo_id=model_path, cache_dir=cache_dir, token=token)

    # NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
    mm_projector_weights = torch.load(os.path.join(folder, weights_name), map_location='cpu')
    return {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
64
+
65
+
66
class IdentityMap(nn.Module):
    """Parameter-free projector that hands its input back untouched."""

    def forward(self, x, *args, **kwargs):
        """Return ``x`` itself; any extra arguments are ignored."""
        return x

    @property
    def config(self):
        """Config dict identifying this projector type."""
        return dict(mm_projector_type='identity')
77
+
78
+
79
class SimpleResBlock(nn.Module):
    """LayerNorm followed by a two-layer GELU MLP with a residual connection."""

    def __init__(self, channels):
        """Create the block.

        Args:
            channels: feature width of both the input and the output.
        """
        super().__init__()
        self.pre_norm = nn.LayerNorm(channels)
        mlp_layers = [
            nn.Linear(channels, channels),
            nn.GELU(),
            nn.Linear(channels, channels),
        ]
        self.proj = nn.Sequential(*mlp_layers)

    def forward(self, x):
        """Normalize ``x`` and add the MLP output to the normalized tensor."""
        normed = self.pre_norm(x)
        # The skip connection carries the normalized tensor, not the raw input.
        return normed + self.proj(normed)
93
+
94
+
95
def build_vision_projector(config, delay_load=False, **kwargs):
    """Build the vision-to-language projector selected by ``config``.

    ``config.mm_projector_type`` (default ``'linear'``) selects the module:
    ``mlp<N>x_gelu``, ``linear``, ``stc_connector``, ``stp_connector``,
    ``stc_connector_v35``, ``spatial_conv``, ``spatial_pool`` or ``identity``.

    Args:
        config: object providing ``mm_hidden_size`` and ``hidden_size``.
        delay_load: accepted for signature compatibility; unused.
        **kwargs: accepted for signature compatibility; unused.

    Returns:
        The projector module.

    Raises:
        ValueError: if the projector type is not recognised.
    """
    projector_type = getattr(config, 'mm_projector_type', 'linear')

    mlp_spec = re.match(r'^mlp(\d+)x_gelu$', projector_type)
    if mlp_spec is not None:
        num_linear = int(mlp_spec.group(1))
        layers = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
        for _ in range(num_linear - 1):
            layers += [nn.GELU(), nn.Linear(config.hidden_size, config.hidden_size)]
        return nn.Sequential(*layers)

    if projector_type == "linear":
        # NOTE: for both linear and mlp2x_gelu projector type, mean pooling is adopted to aggregate video features
        return nn.Linear(config.mm_hidden_size, config.hidden_size)
    if projector_type == "stc_connector":
        return STCConnector(config)
    if projector_type == "stp_connector":
        return STPConnector(config)
    if projector_type == "stc_connector_v35":
        return STCConnectorV35(config)
    if projector_type == "spatial_conv":
        return SpatialConv(config)
    if projector_type == "spatial_pool":
        return SpatialPool(config)
    if projector_type == 'identity':
        return IdentityMap()

    raise ValueError(f'Unknown projector type: {projector_type}')
123
+
124
def build_audio_projector(config, delay_load=False, **kwargs):
    """Build the audio-to-language projector selected by ``config``.

    ``config.mm_projector_a_type`` (default ``'linear'``) selects the module:
    ``mlp<N>x_gelu``, ``linear`` or ``identity``.

    Args:
        config: object providing ``mm_hidden_size_a`` (input width) and
            ``hidden_size_a`` (output width).
        delay_load: accepted for signature compatibility; unused.
        **kwargs: accepted for signature compatibility; unused.

    Returns:
        The projector module.

    Raises:
        ValueError: if the projector type is not recognised. Previously the
            function fell off the end and returned ``None``, unlike
            ``build_vision_projector``.
    """
    projector_type = getattr(config, 'mm_projector_a_type', 'linear')

    mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
    if mlp_gelu_match:
        mlp_depth = int(mlp_gelu_match.group(1))
        modules = [nn.Linear(config.mm_hidden_size_a, config.hidden_size_a)]
        for _ in range(1, mlp_depth):
            modules.append(nn.GELU())
            modules.append(nn.Linear(config.hidden_size_a, config.hidden_size_a))
        return nn.Sequential(*modules)

    if projector_type == "linear":
        # Single linear map from the audio encoder width to the projector output width.
        return nn.Linear(config.mm_hidden_size_a, config.hidden_size_a)
    if projector_type == 'identity':
        return IdentityMap()

    raise ValueError(f'Unknown projector type: {projector_type}')
139
+
140
def build_mlp(depth, hidden_size, output_hidden_size):
    """Build an MLP of ``depth`` linear layers with GELU between them.

    Args:
        depth: number of linear layers; values below 2 yield a single layer.
        hidden_size: input width of the first layer.
        output_hidden_size: output width of every layer.

    Returns:
        ``nn.Sequential`` holding the layers.
    """
    layers = [nn.Linear(hidden_size, output_hidden_size)]
    for _ in range(depth - 1):
        layers += [nn.GELU(), nn.Linear(output_hidden_size, output_hidden_size)]
    return nn.Sequential(*layers)
146
+
147
+
148
class STCConnector(nn.Module):
    """Vision-language connector: RegStage -> 3D-conv downsampler -> RegStage -> MLP readout."""

    def __init__(self, config, downsample=(2, 2, 2), depth=4, mlp_depth=2):
        """Temporal Convolutional Vision-Language Connector.

        Args:
            config: config object providing ``mm_hidden_size`` and ``hidden_size``.
            downsample: (temporal, height, width) downsample rate, used as both
                kernel size and stride of the 3D convolution.
            depth: depth of the spatial interaction blocks; ``0`` replaces both
                stages with ``nn.Identity``.
            mlp_depth: depth of the vision-language projector layers.
        """
        super().__init__()
        in_width = config.mm_hidden_size
        width = config.hidden_size
        self.encoder_hidden_size = in_width
        self.hidden_size = width
        self.output_hidden_size = width
        # TODO: make these as config arguments
        self.depth = depth
        self.mlp_depth = mlp_depth
        self.downsample = downsample

        # Sub-modules are created in the order s1, sampler, s2, readout.
        # NOTE(review): with depth=0 the first stage is an identity while the
        # sampler still expects `hidden_size` input channels; confirm that
        # mm_hidden_size == hidden_size for such configurations.
        self.s1 = self._build_stage(in_width, width, depth)
        self.sampler = nn.Sequential(
            nn.Conv3d(
                in_channels=width,
                out_channels=width,
                kernel_size=downsample,
                stride=downsample,
                padding=1,
                bias=True,
            ),
            nn.SiLU(),
        )
        self.s2 = self._build_stage(width, width, depth)
        self.readout = build_mlp(mlp_depth, width, width)

    @staticmethod
    def _build_stage(in_chs, out_chs, depth):
        """Return a stride-1 RegStage of ``depth`` blocks, or ``nn.Identity`` when ``depth`` is 0."""
        if depth == 0:
            return nn.Identity()
        return RegStage(
            depth=depth,
            in_chs=in_chs,
            out_chs=out_chs,
            stride=1,
            dilation=1,
            act_layer=nn.SiLU,
            norm_layer=LayerNorm2d,
        )

    def forward(self, x):
        """Aggregate tokens on the temporal and spatial dimensions.

        Args:
            x: input tokens [b, t, h, w, d] / [b, t, l, d]
        Returns:
            aggregated tokens [b, l, d]
        """
        num_frames = x.size(1)
        if x.ndim == 4:
            # Flat token axis: split it into a square (side x side) grid.
            side = int(x.size(2) ** 0.5)
            x = einops.rearrange(x, "b t (h w) d -> b d t h w", h=side, w=side)
        elif x.ndim == 5:
            x = einops.rearrange(x, "b t h w d -> b d t h w")

        # 1. the first stage of the adapter, applied to each frame on its own
        per_frame = einops.rearrange(x, "b d t h w -> (b t) d h w")
        per_frame = self.s1(per_frame)
        volume = einops.rearrange(per_frame, "(b t) d h w -> b d t h w", t=num_frames)
        # 2. downsampler
        volume = self.sampler(volume)
        sampled_frames = volume.size(2)
        # 3. the second stage of the adapter
        per_frame = einops.rearrange(volume, "b d t h w -> (b t) d h w")
        per_frame = self.s2(per_frame)
        tokens = einops.rearrange(per_frame, "(b t) d h w -> b (t h w) d", t=sampled_frames)
        return self.readout(tokens)
231
+
232
+
233
class STPConnector(STCConnector):
    """``STCConnector`` whose downsampler is 3D average pooling followed by SiLU."""

    def __init__(self, config, downsample=(2, 2, 2), depth=4, mlp_depth=2):
        """Build the parent connector, then replace its sampler with ``AvgPool3d(downsample)`` + SiLU."""
        super().__init__(config, downsample=downsample, depth=depth, mlp_depth=mlp_depth)
        pooling = nn.AvgPool3d(downsample)
        self.sampler = nn.Sequential(pooling, nn.SiLU())
238
+
239
+
240
class STCConnectorV35(STCConnector):
    """``STCConnector`` whose 3D-convolution downsampler uses ``padding=0``."""

    def __init__(self, config, downsample=(2, 2, 2), depth=4, mlp_depth=2):
        """Build the parent connector, then replace its sampler with an unpadded Conv3d + SiLU."""
        super().__init__(config, downsample=downsample, depth=depth, mlp_depth=mlp_depth)
        width = self.hidden_size
        unpadded_conv = nn.Conv3d(
            in_channels=width,
            out_channels=width,
            kernel_size=downsample,
            stride=downsample,
            padding=0,
            bias=True,
        )
        self.sampler = nn.Sequential(unpadded_conv, nn.SiLU())
254
+
255
+
256
class SpatialConv(STCConnector):
    """``STCConnector`` preset: ``downsample=(1, 2, 2)`` and ``depth=0`` by default."""

    def __init__(self, config, downsample=(1, 2, 2), depth=0, mlp_depth=2):
        """Forward all arguments to ``STCConnector`` with the spatial-only defaults."""
        super().__init__(config, downsample=downsample, depth=depth, mlp_depth=mlp_depth)
260
+
261
+
262
class SpatialPool(STPConnector):
    """``STPConnector`` preset: ``downsample=(1, 2, 2)`` and ``depth=0`` by default."""

    def __init__(self, config, downsample=(1, 2, 2), depth=0, mlp_depth=2):
        """Forward all arguments to ``STPConnector`` with the spatial-only defaults."""
        super().__init__(config, downsample=downsample, depth=depth, mlp_depth=mlp_depth)
videollama2/model/videollama2_arch.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ from abc import ABC, abstractmethod
18
+
19
+ import einops
20
+ import torch
21
+ import torch.nn as nn
22
+
23
+ from .projector import load_mm_projector, build_vision_projector, build_audio_projector
24
+ from .encoder import build_vision_tower, build_audio_tower
25
+ from ..constants import IGNORE_INDEX, NUM_FRAMES, MODAL_INDEX_MAP
26
+
27
+
28
class Videollama2MetaModel:
    """Mixin that attaches vision/audio towers and their projectors to a model.

    ``__init__`` forwards ``config`` through ``super()``, so the class is meant
    to be combined with a concrete model class via multiple inheritance.
    """

    def __init__(self, config):
        super(Videollama2MetaModel, self).__init__(config)

        if hasattr(config, "mm_vision_tower"):
            self.vision_tower = build_vision_tower(config, delay_load=True)
            self.mm_projector = build_vision_projector(config)
        if hasattr(config, "mm_audio_tower"):
            # Keep the tower config: initialize_audio_modules needs it when the
            # tower already exists.
            self.audio_tower, self._audio_tower_cfg = build_audio_tower(config, delay_load=True)
            self.mm_projector_a = build_audio_projector(config)

    @staticmethod
    def _select_weights(weights, keyword):
        """Return the entries of ``weights`` located under ``'<keyword>.'``.

        Each returned key is the part of the original key that follows
        ``'<keyword>.'``. Matching on the dotted form keeps ``mm_projector``
        from also selecting ``mm_projector_a.*`` keys, which used to raise
        IndexError on the split.
        """
        marker = keyword + '.'
        return {k.split(marker)[1]: v for k, v in weights.items() if marker in k}

    def get_vision_tower(self):
        """Return the vision tower (unwrapped from its list holder), or ``None``."""
        vision_tower = getattr(self, 'vision_tower', None)
        if isinstance(vision_tower, list):
            vision_tower = vision_tower[0]
        return vision_tower

    def get_audio_tower(self):
        """Return the audio tower (unwrapped from its list holder), or ``None``."""
        audio_tower = getattr(self, 'audio_tower', None)
        if isinstance(audio_tower, list):
            audio_tower = audio_tower[0]
        return audio_tower

    def initialize_vision_modules(self, model_args, fsdp=None):
        """Create or load the vision tower and projector and record them in ``self.config``.

        Args:
            model_args: object providing ``vision_tower``,
                ``mm_vision_select_layer``, ``mm_vision_select_feature``,
                ``pretrain_mm_mlp_adapter`` and optionally ``mm_projector_type``.
            fsdp: when non-empty, the tower is stored wrapped in a list.
        """
        vision_tower = model_args.vision_tower
        mm_vision_select_layer = model_args.mm_vision_select_layer
        mm_vision_select_feature = model_args.mm_vision_select_feature
        pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter

        self.config.mm_vision_tower = vision_tower
        use_fsdp = fsdp is not None and len(fsdp) > 0

        if self.get_vision_tower() is None:
            vision_tower = build_vision_tower(model_args)
            self.vision_tower = [vision_tower] if use_fsdp else vision_tower
        else:
            vision_tower = self.vision_tower[0] if use_fsdp else self.vision_tower
            vision_tower.load_model()

        self.config.use_mm_proj = True
        self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
        self.config.mm_hidden_size = vision_tower.hidden_size
        self.config.mm_vision_select_layer = mm_vision_select_layer
        self.config.mm_vision_select_feature = mm_vision_select_feature

        if getattr(self, 'mm_projector', None) is None:
            self.mm_projector = build_vision_projector(self.config)
        else:
            # In case it is frozen by LoRA
            for p in self.mm_projector.parameters():
                p.requires_grad = True

        if pretrain_mm_mlp_adapter is not None:
            if os.path.exists(pretrain_mm_mlp_adapter):
                if os.path.isdir(pretrain_mm_mlp_adapter):
                    mm_projector_weights = load_mm_projector(pretrain_mm_mlp_adapter)
                else:
                    mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
            else:
                # Support loading projector weights from remote HuggingFace model hub
                pretrain_mm_mlp_adapter = pretrain_mm_mlp_adapter.replace('mm_projector.bin', '')
                pretrain_mm_mlp_adapter = pretrain_mm_mlp_adapter.strip('/').strip('\\').strip()
                mm_projector_weights = load_mm_projector(pretrain_mm_mlp_adapter)

            # strict=False tolerates keys that are missing from or extra in the checkpoint
            self.mm_projector.load_state_dict(
                self._select_weights(mm_projector_weights, 'mm_projector'), strict=False
            )

    def initialize_audio_modules(self, model_args, fsdp=None):
        """Create or reuse the audio tower and projector and record them in ``self.config``.

        Args:
            model_args: object providing ``audio_tower``,
                ``pretrain_mm_mlp_adapter_a``, ``model_type`` and optionally
                ``mm_projector_a_type``.
            fsdp: when non-empty, the tower is stored wrapped in a list.

        Raises:
            ValueError: if the tower already exists but its config cannot be found.
        """
        audio_tower = model_args.audio_tower
        pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter_a
        self.config.mm_audio_tower = audio_tower
        use_fsdp = fsdp is not None and len(fsdp) > 0

        if self.get_audio_tower() is None:
            audio_tower, audio_tower_cfg = build_audio_tower(model_args)
            self.audio_tower = [audio_tower] if use_fsdp else audio_tower
            self._audio_tower_cfg = audio_tower_cfg
        else:
            audio_tower = self.audio_tower[0] if use_fsdp else self.audio_tower
            # The config used to be unbound on this path (NameError). Reuse the
            # one kept at construction time.
            audio_tower_cfg = getattr(self, '_audio_tower_cfg', None)
            if audio_tower_cfg is None:
                # NOTE(review): assumes the tower may expose its config as `.cfg` - confirm.
                audio_tower_cfg = getattr(audio_tower, 'cfg', None)
            if audio_tower_cfg is None:
                raise ValueError('Audio tower config is unavailable for the existing audio tower')

        self.config.use_mm_proj = True
        self.config.mm_projector_a_type = getattr(model_args, 'mm_projector_a_type', 'linear')
        if model_args.model_type == 'videollama2_qwen2':
            # NOTE(review): hard-coded projector output width for this model type;
            # presumably the language model hidden size - confirm.
            audio_tower_cfg.hidden_size = 3584
        self.config.mm_hidden_size_a = audio_tower_cfg.encoder_embed_dim
        self.config.hidden_size_a = audio_tower_cfg.hidden_size

        if getattr(self, 'mm_projector_a', None) is None:
            self.mm_projector_a = build_audio_projector(self.config)
        else:
            # In case it is frozen by LoRA
            for p in self.mm_projector_a.parameters():
                p.requires_grad = True

        if pretrain_mm_mlp_adapter is not None:
            mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
            self.mm_projector_a.load_state_dict(
                self._select_weights(mm_projector_weights, 'mm_projector_a'), strict=True
            )
141
+
142
+
143
class Videollama2MetaForCausalLM(ABC):
    """Mixin that splices image/video/audio features into a causal LM's input embeddings."""

    @abstractmethod
    def get_model(self):
        """Return the underlying model that owns the towers, projectors and ``embed_tokens``."""
        pass

    def num_frames(self):
        """Return ``config.num_frames`` when set, otherwise ``NUM_FRAMES``."""
        if hasattr(self.config, 'num_frames'):
            return self.config.num_frames
        else:
            return NUM_FRAMES

    def get_vision_tower(self):
        return self.get_model().get_vision_tower()

    def get_audio_tower(self):
        return self.get_model().get_audio_tower()

    @staticmethod
    def _mm_token_positions(token_ids):
        """Return the indices of ``token_ids`` that equal any id in ``MODAL_INDEX_MAP``."""
        is_mm_token = sum([token_ids == mm_token_idx for mm_token_idx in MODAL_INDEX_MAP.values()])
        return torch.where(is_mm_token)[0]

    def encode_images_or_videos(self, images):
        """Encode a list of ``(tensor, modal)`` pairs into projected visual features.

        Entries tagged ``'image'`` are expanded along their first axis to the
        configured number of frames; all entries are then stacked, encoded
        frame by frame by the vision tower and passed to ``temporal_aggregator``.
        """
        num_frames = self.config.num_frames if hasattr(self.config, 'num_frames') else NUM_FRAMES

        data_batch = []
        for data, modal in images:
            if modal == 'image':
                data = data.expand(num_frames, -1, -1, -1)
            data_batch.append(data)

        data_batch = torch.stack(data_batch, dim=0)

        assert len(data_batch.size()) == 5
        batch_size = data_batch.size(0)

        frames = einops.rearrange(data_batch, 'b t c h w -> (b t) c h w')
        frames_features = self.get_model().get_vision_tower()(frames)
        frames_features = einops.rearrange(frames_features, '(b t) n h -> b t n h', b=batch_size)

        return self.temporal_aggregator(frames_features)

    def temporal_aggregator(self, frames_features):
        """Temporal aggregation of frame features.

        Args:
            frames_features (torch.Tensor): Frame features with shape (b, t, n, h).
        Returns:
            torch.Tensor: Video features produced by the configured projector.
        Raises:
            Exception: if ``config.mm_projector_type`` is not supported.
        """
        # TODO: improve the merging method.
        # *********** mean pooling *************
        if self.config.mm_projector_type == "mlp2x_gelu" or self.config.mm_projector_type == "linear":
            video_features = self.get_model().mm_projector(frames_features.mean(1))
        # *********** spatial convolution *************
        elif self.config.mm_projector_type == "spatial_conv":
            video_features = self.get_model().mm_projector(frames_features)
        # *********** spatial pooling *************
        elif self.config.mm_projector_type == "spatial_pool":
            video_features = self.get_model().mm_projector(frames_features)
        # *********** time ************
        elif "tc_connector" in self.config.mm_projector_type or "tp_connector" in self.config.mm_projector_type:
            video_features = self.get_model().mm_projector(frames_features)
        else:
            raise Exception(f"Unsupported projector type {self.config.mm_projector_type}!!!")

        return video_features

    def prepare_inputs_labels_for_multimodal(
        self, input_ids, attention_mask, past_key_values, labels, images
    ):
        """Replace modality placeholder tokens with pre-computed feature embeddings.

        Args:
            input_ids: token ids, indexed as (batch, sequence).
            attention_mask: mask aligned with ``input_ids``, or ``None``.
            past_key_values: returned unchanged.
            labels: labels aligned with ``input_ids``, or ``None``.
            images: list of ``(data, modal)`` pairs, or ``None``.

        Returns:
            ``(input_ids, attention_mask, past_key_values, inputs_embeds, labels)``.
            On the text-only path ``inputs_embeds`` is ``None`` and the other
            values are returned as given; otherwise ``input_ids`` is ``None``
            and the embeddings, mask and labels are expanded and padded.
        """
        vision_tower = self.get_vision_tower()
        audio_tower = self.get_audio_tower()
        # NOTE: text-only situation
        if (vision_tower is None and audio_tower is None) or images is None or input_ids.shape[1] == 1:
            return input_ids, attention_mask, past_key_values, None, labels

        if audio_tower is None:
            mm_features = self.encode_images_or_videos(images)
        elif audio_tower is not None and vision_tower is not None and any(modal == 'video' for (_, modal) in images):
            # Accepted entry layouts:
            #   [tensor, "image"]
            #   [tensor, "audio"]
            #   [tensor, "video"]
            #   [{"video": tensor, "audio": tensor}, "video"]
            X_video = []
            X_audio = []

            select_audio_id = []
            select_videoimage_id = []
            for data_list in images:
                if isinstance(data_list[0], dict):
                    assert data_list[1] == "video"
                    X_audio.append(data_list[0]["audio"])
                    select_audio_id.append(True)
                    X_video.append((data_list[0]["video"], "video"))
                    select_videoimage_id.append(True)
                else:
                    if data_list[1] == "audio":
                        X_audio.append(data_list[0])
                        select_audio_id.append(True)
                        select_videoimage_id.append(False)
                    elif data_list[1] == "video" or data_list[1] == "image":
                        X_video.append(data_list)
                        select_videoimage_id.append(True)
                        select_audio_id.append(False)
                    else:
                        raise NotImplementedError

            if len(X_audio) > 0:
                Xa_features = torch.cat(X_audio, dim=0)
                audio_padding_mask = torch.zeros(Xa_features.shape, device=self.device).bool()
                audio_embedding, _, _ = self.get_model().get_audio_tower().extract_features(
                    Xa_features, padding_mask=audio_padding_mask, feature_only=True)
                Xa_features = self.get_model().mm_projector_a(audio_embedding)
                Xa_features = Xa_features.view(len(X_audio), -1, Xa_features.shape[-1])

            if len(X_video) > 0:
                X_features = self.encode_images_or_videos(X_video)

            # Re-interleave features in input order; a sample carrying both
            # modalities gets its visual features followed by its audio features.
            mm_features = []
            idx_a, idx_v = 0, 0
            for audio_idx, videoimage_idx in zip(select_audio_id, select_videoimage_id):
                if audio_idx and videoimage_idx:
                    mm_features.append(torch.cat([X_features[idx_v], Xa_features[idx_a]], dim=0))
                    idx_a += 1
                    idx_v += 1
                elif audio_idx:
                    mm_features.append(Xa_features[idx_a])
                    idx_a += 1
                elif videoimage_idx:
                    mm_features.append(X_features[idx_v])
                    idx_v += 1
                else:
                    raise NotImplementedError
        else:
            # NOTE(review): every entry is fed to the audio tower here, including
            # image-only batches when both towers exist - confirm this is intended.
            data_batch = []
            for data, modal in images:
                data_batch.append(data)
            X_features = torch.cat(data_batch, dim=0)
            audio_padding_mask = torch.zeros(X_features.shape, device=self.device).bool()
            audio_embedding, _, _ = self.get_model().get_audio_tower().extract_features(
                X_features, padding_mask=audio_padding_mask, feature_only=True)
            mm_features = self.get_model().mm_projector_a(audio_embedding)

        new_input_embeds = []
        new_labels = [] if labels is not None else None
        cur_mm_idx = 0
        # replace image/video/audio tokens with pre-computed embeddings
        for batch_idx, cur_input_ids in enumerate(input_ids):
            num_multimodals = sum((cur_input_ids == mm_token_idx).sum() for mm_token_idx in MODAL_INDEX_MAP.values())
            # pure text input
            if num_multimodals == 0:
                half_len = cur_input_ids.shape[0] // 2
                cur_mm_features = mm_features[cur_mm_idx]
                cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids[:half_len])
                cur_input_embeds_2 = self.get_model().embed_tokens(cur_input_ids[half_len:])
                # The empty slice adds no rows but keeps the features in the concatenation.
                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_mm_features[0:0], cur_input_embeds_2], dim=0)
                new_input_embeds.append(cur_input_embeds)
                if labels is not None:
                    new_labels.append(labels[batch_idx])
                cur_mm_idx += 1
                continue

            cur_new_input_embeds = []
            if labels is not None:
                cur_labels = labels[batch_idx]
                cur_new_labels = []
                assert cur_labels.shape == cur_input_ids.shape

            mm_token_indices = self._mm_token_positions(cur_input_ids)
            while mm_token_indices.numel() > 0:
                cur_mm_features = mm_features[cur_mm_idx]
                mm_token_start = mm_token_indices[0]

                cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[:mm_token_start]))
                cur_new_input_embeds.append(cur_mm_features)
                if labels is not None:
                    cur_new_labels.append(cur_labels[:mm_token_start])
                    # Feature positions carry IGNORE_INDEX labels.
                    cur_new_labels.append(torch.full((cur_mm_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
                    cur_labels = cur_labels[mm_token_start+1:]

                cur_mm_idx += 1
                cur_input_ids = cur_input_ids[mm_token_start+1:]
                mm_token_indices = self._mm_token_positions(cur_input_ids)

            if cur_input_ids.numel() > 0:
                cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids))
                if labels is not None:
                    cur_new_labels.append(cur_labels)
            cur_new_input_embeds = [x.to(device=self.device) for x in cur_new_input_embeds]
            # NOTE: one cur_new_input_embeds per each
            cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0)
            new_input_embeds.append(cur_new_input_embeds)
            if labels is not None:
                cur_new_labels = torch.cat(cur_new_labels, dim=0)
                new_labels.append(cur_new_labels)

        # padding
        if any(x.shape != new_input_embeds[0].shape for x in new_input_embeds):
            max_len = max(x.shape[0] for x in new_input_embeds)
            # Sequence lengths before padding. The mask rebuild below uses these
            # rather than the label tensors, so it also works when labels is None.
            unpadded_lens = [x.shape[0] for x in new_input_embeds]

            new_input_embeds_align = []
            for cur_new_embed in new_input_embeds:
                cur_new_embed = torch.cat((cur_new_embed, torch.zeros((max_len - cur_new_embed.shape[0], cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0)
                new_input_embeds_align.append(cur_new_embed)
            new_input_embeds = torch.stack(new_input_embeds_align, dim=0)

            if labels is not None:
                new_labels_align = []
                for cur_new_label in new_labels:
                    cur_new_label = torch.cat((cur_new_label, torch.full((max_len - cur_new_label.shape[0],), IGNORE_INDEX, dtype=cur_new_label.dtype, device=cur_new_label.device)), dim=0)
                    new_labels_align.append(cur_new_label)
                new_labels = torch.stack(new_labels_align, dim=0)

            if attention_mask is not None:
                new_attention_mask = []
                for cur_attention_mask, cur_len in zip(attention_mask, unpadded_lens):
                    # Left pad (True) covers the growth from inserted features;
                    # right pad (False) covers alignment to max_len.
                    new_attn_mask_pad_left = torch.full((cur_len - input_ids.shape[1],), True, dtype=attention_mask.dtype, device=attention_mask.device)
                    new_attn_mask_pad_right = torch.full((max_len - cur_len,), False, dtype=attention_mask.dtype, device=attention_mask.device)
                    cur_new_attention_mask = torch.cat((new_attn_mask_pad_left, cur_attention_mask, new_attn_mask_pad_right), dim=0)
                    new_attention_mask.append(cur_new_attention_mask)
                attention_mask = torch.stack(new_attention_mask, dim=0)
                assert attention_mask.shape == new_input_embeds.shape[:2]
        else:
            new_input_embeds = torch.stack(new_input_embeds, dim=0)
            if labels is not None:
                new_labels = torch.stack(new_labels, dim=0)

            if attention_mask is not None:
                new_attn_mask_pad_left = torch.full((attention_mask.shape[0], new_input_embeds.shape[1] - input_ids.shape[1]), True, dtype=attention_mask.dtype, device=attention_mask.device)
                attention_mask = torch.cat((new_attn_mask_pad_left, attention_mask), dim=1)
                assert attention_mask.shape == new_input_embeds.shape[:2]

        return None, attention_mask, past_key_values, new_input_embeds, new_labels
videollama2/model/videollama2_gemma2.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ from torch.nn import CrossEntropyLoss
22
+
23
+ from transformers import AutoConfig, AutoModelForCausalLM, \
24
+ Gemma2Config, Gemma2Model, Gemma2ForCausalLM
25
+
26
+ from transformers.modeling_outputs import CausalLMOutputWithPast
27
+ from transformers.generation.utils import GenerateOutput
28
+
29
+ from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
30
+
31
+
32
class Videollama2Gemma2Config(Gemma2Config):
    """Gemma2 configuration tagged with the ``videollama2_gemma2`` model type."""

    model_type = "videollama2_gemma2"

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to ``Gemma2Config`` and set ``model_type`` on the instance as well."""
        super().__init__(**kwargs)
        self.model_type = "videollama2_gemma2"
38
+
39
+
40
class Videollama2Gemma2Model(Videollama2MetaModel, Gemma2Model):
    """Gemma2 backbone combined with the VideoLLaMA2 multimodal mixin."""

    config_class = Videollama2Gemma2Config

    def __init__(self, config: Gemma2Config):
        """Initialise both bases through the cooperative ``super()`` chain."""
        super().__init__(config)
45
+
46
+
47
class Videollama2Gemma2ForCausalLM(Gemma2ForCausalLM, Videollama2MetaForCausalLM):
    """Gemma2 causal LM whose inputs may carry image/video/audio features."""

    config_class = Videollama2Gemma2Config

    def __init__(self, config, **kwargs):
        # Skip Gemma2ForCausalLM.__init__ on purpose: the backbone is replaced
        # by the multimodal model built below.
        super(Gemma2ForCausalLM, self).__init__(config)
        self.model = Videollama2Gemma2Model(config)
        # self.pretraining_tp = config.pretraining_tp
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the multimodal backbone."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[int] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, first expanding multimodal placeholders when needed.

        When ``inputs_embeds`` is not given, the inputs go through
        ``prepare_inputs_labels_for_multimodal``. The labels actually used are
        attached to the returned output object as ``outputs.labels``.
        ``position_ids`` and extra ``kwargs`` are accepted but not forwarded.
        """
        if inputs_embeds is None:
            (
                input_ids,
                attention_mask,
                past_key_values,
                inputs_embeds,
                labels
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images
            )

        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        # A plain tuple (return_dict=False) cannot take attributes; only attach
        # the labels to output objects.
        if not isinstance(outputs, tuple):
            outputs.labels = labels

        return outputs

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids plus optional multimodal inputs.

        Raises:
            NotImplementedError: if ``inputs_embeds`` is passed by the caller.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is not None:
            (
                input_ids,
                attention_mask,
                past_key_values,
                inputs_embeds,
                _
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                images=images
            )
        else:
            inputs_embeds = self.get_model().embed_tokens(inputs)

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def _prepare_generated_length(self, model_input_name, inputs_tensor, **kwargs):
        # Remember the prompt length in embedding space so _get_cache can
        # enlarge the cache by that amount.
        if model_input_name == "inputs_embeds":
            self.inputs_embeds_length = inputs_tensor.size(1)
        else:
            self.inputs_embeds_length = 0
        return super()._prepare_generated_length(
            model_input_name=model_input_name,
            inputs_tensor=inputs_tensor,
            **kwargs)

    def _get_cache(self, cache_implementation: str, max_batch_size: int, max_cache_len: int, **kwargs):
        # Default to 0 so this hook does not fail if it runs before
        # _prepare_generated_length has recorded a prompt length.
        extra_len = getattr(self, "inputs_embeds_length", 0)
        return super()._get_cache(
            cache_implementation=cache_implementation,
            max_batch_size=max_batch_size,
            max_cache_len=max_cache_len + extra_len,
            **kwargs)

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Build the per-step inputs, passing ``images`` through when provided."""
        images = kwargs.pop("images", None)
        _inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            _inputs['images'] = images
        return _inputs
173
+
174
+
175
+ AutoConfig.register("videollama2_gemma2", Videollama2Gemma2Config)
176
+ AutoModelForCausalLM.register(Videollama2Gemma2Config, Videollama2Gemma2ForCausalLM)
videollama2/model/videollama2_llama.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+
22
+ from transformers import AutoConfig, AutoModelForCausalLM, \
23
+ LlamaConfig, LlamaModel, LlamaForCausalLM
24
+ from transformers.modeling_outputs import CausalLMOutputWithPast
25
+ from transformers.generation.utils import GenerateOutput
26
+
27
+ from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
28
+
29
+
30
class Videollama2LlamaConfig(LlamaConfig):
    """``LlamaConfig`` subclass whose only change is the ``model_type`` tag."""

    model_type = "videollama2_llama"

    def __init__(self, **kwargs):
        """Forward every keyword argument to ``LlamaConfig`` and pin ``model_type``."""
        super(Videollama2LlamaConfig, self).__init__(**kwargs)
        # Set on the instance as well, after the parent initializer has run.
        self.model_type = "videollama2_llama"
36
+
37
+
38
class Videollama2LlamaModel(Videollama2MetaModel, LlamaModel):
    """LLaMA backbone combined with the ``Videollama2MetaModel`` mixin."""

    config_class = Videollama2LlamaConfig

    def __init__(self, config: LlamaConfig):
        """Initialize through the cooperative MRO chain with ``config``."""
        super().__init__(config)
43
+
44
+
45
class Videollama2LlamaForCausalLM(LlamaForCausalLM, Videollama2MetaForCausalLM):
    """LLaMA causal LM that accepts an additional ``images`` input.

    Multimodal preparation is delegated to
    ``prepare_inputs_labels_for_multimodal``, which is defined outside this
    class (presumably on ``Videollama2MetaForCausalLM`` -- confirm). The
    stock LLaMA ``forward`` / ``generate`` implementations then do the work.
    """

    config_class = Videollama2LlamaConfig

    def __init__(self, config, **kwargs):
        """Create the multimodal backbone and the LM head.

        Args:
            config: Model configuration; ``pretraining_tp``, ``vocab_size``
                and ``hidden_size`` are read from it.
            **kwargs: Accepted for call compatibility; not used here.
        """
        # Start the MRO lookup *after* LlamaForCausalLM so that its own
        # __init__ is skipped; the backbone and head are assigned below.
        super(LlamaForCausalLM, self).__init__(config)
        self.model = Videollama2LlamaModel(config)
        self.pretraining_tp = config.pretraining_tp
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped backbone (``self.model``)."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, merging ``images`` into the inputs first.

        When ``inputs_embeds`` is ``None`` the inputs are passed through
        ``prepare_inputs_labels_for_multimodal``, whose five return values
        replace ``input_ids``, ``attention_mask``, ``past_key_values``,
        ``inputs_embeds`` and ``labels``. ``position_ids`` and any extra
        ``**kwargs`` are accepted but not forwarded to the parent ``forward``.

        Returns:
            The parent ``forward`` result. When that result is not a plain
            tuple, the (possibly rewritten) ``labels`` are attached to it as
            ``outputs.labels``.
        """
        if inputs_embeds is None:
            (
                input_ids,
                attention_mask,
                past_key_values,
                inputs_embeds,
                labels
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images
            )

        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        # With return_dict=False the parent returns a plain tuple, which
        # cannot take attributes; only annotate structured outputs.
        if not isinstance(outputs, tuple):
            outputs.labels = labels

        return outputs

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids, optionally conditioned on ``images``.

        ``position_ids`` and ``attention_mask`` are taken out of ``kwargs``.
        With ``images`` the prompt embeddings (and attention mask) come from
        ``prepare_inputs_labels_for_multimodal``; without them ``inputs`` is
        embedded with the backbone's ``embed_tokens``. The parent
        ``generate`` is then called with ``inputs_embeds``.

        Raises:
            NotImplementedError: If ``inputs_embeds`` is passed in ``kwargs``.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is not None:
            # Only the attention mask and the embeddings are needed here.
            (
                _,
                attention_mask,
                _,
                inputs_embeds,
                _
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                images=images
            )
        else:
            inputs_embeds = self.get_model().embed_tokens(inputs)

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Prepare per-step inputs via the parent and re-attach ``images``.

        ``images`` is removed from ``kwargs`` before the parent call and,
        when not ``None``, stored in the returned mapping under ``'images'``.
        """
        images = kwargs.pop("images", None)
        _inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            _inputs['images'] = images
        return _inputs
154
+
155
+
156
+ AutoConfig.register("videollama2_llama", Videollama2LlamaConfig)
157
+ AutoModelForCausalLM.register(Videollama2LlamaConfig, Videollama2LlamaForCausalLM)
videollama2/model/videollama2_mistral.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ from torch.nn import CrossEntropyLoss
22
+
23
+ from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, \
24
+ MistralConfig, MistralModel, MistralForCausalLM
25
+
26
+ from transformers.modeling_outputs import CausalLMOutputWithPast
27
+ from transformers.generation.utils import GenerateOutput
28
+
29
+ from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
30
+
31
+
32
class Videollama2MistralConfig(MistralConfig):
    """``MistralConfig`` subclass whose only change is the ``model_type`` tag."""

    model_type = "videollama2_mistral"

    def __init__(self, **kwargs):
        """Forward every keyword argument to ``MistralConfig`` and pin ``model_type``."""
        super(Videollama2MistralConfig, self).__init__(**kwargs)
        # Set on the instance as well, after the parent initializer has run.
        self.model_type = "videollama2_mistral"
38
+
39
+
40
class Videollama2MistralModel(Videollama2MetaModel, MistralModel):
    """Mistral backbone combined with the ``Videollama2MetaModel`` mixin."""

    config_class = Videollama2MistralConfig

    def __init__(self, config: MistralConfig):
        """Initialize through the cooperative MRO chain with ``config``."""
        super().__init__(config)
45
+
46
+
47
class Videollama2MistralForCausalLM(MistralForCausalLM, Videollama2MetaForCausalLM):
    """Mistral causal LM that accepts an additional ``images`` input.

    Multimodal preparation is delegated to
    ``prepare_inputs_labels_for_multimodal``, which is defined outside this
    class (presumably on ``Videollama2MetaForCausalLM`` -- confirm). The
    stock Mistral ``forward`` / ``generate`` implementations then do the work.
    """

    config_class = Videollama2MistralConfig

    def __init__(self, config, **kwargs):
        """Create the multimodal backbone and the LM head.

        Args:
            config: Model configuration; ``vocab_size`` and ``hidden_size``
                are read from it.
            **kwargs: Accepted for call compatibility; not used here.
        """
        # Start the MRO lookup *after* MistralForCausalLM so that its own
        # __init__ is skipped; the backbone and head are assigned below.
        super(MistralForCausalLM, self).__init__(config)
        self.model = Videollama2MistralModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped backbone (``self.model``)."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, merging ``images`` into the inputs first.

        When ``inputs_embeds`` is ``None`` the inputs are passed through
        ``prepare_inputs_labels_for_multimodal``, whose five return values
        replace ``input_ids``, ``attention_mask``, ``past_key_values``,
        ``inputs_embeds`` and ``labels``. ``position_ids`` and any extra
        ``**kwargs`` are accepted but not forwarded to the parent ``forward``.

        Returns:
            The parent ``forward`` result. When that result is not a plain
            tuple, the (possibly rewritten) ``labels`` are attached to it as
            ``outputs.labels``.
        """
        if inputs_embeds is None:
            (
                input_ids,
                attention_mask,
                past_key_values,
                inputs_embeds,
                labels
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images
            )

        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        # With return_dict=False the parent returns a plain tuple, which
        # cannot take attributes; only annotate structured outputs.
        if not isinstance(outputs, tuple):
            outputs.labels = labels

        return outputs

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids, optionally conditioned on ``images``.

        ``position_ids`` and ``attention_mask`` are taken out of ``kwargs``.
        With ``images`` the prompt embeddings (and attention mask) come from
        ``prepare_inputs_labels_for_multimodal``; without them ``inputs`` is
        embedded with the backbone's ``embed_tokens``. The parent
        ``generate`` is then called with ``inputs_embeds``.

        Raises:
            NotImplementedError: If ``inputs_embeds`` is passed in ``kwargs``.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is not None:
            # Only the attention mask and the embeddings are needed here.
            (
                _,
                attention_mask,
                _,
                inputs_embeds,
                _
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                images=images
            )
        else:
            inputs_embeds = self.get_model().embed_tokens(inputs)

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Prepare per-step inputs via the parent and re-attach ``images``.

        ``images`` is removed from ``kwargs`` before the parent call and,
        when not ``None``, stored in the returned mapping under ``'images'``.
        """
        images = kwargs.pop("images", None)
        _inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            _inputs['images'] = images
        return _inputs
156
+
157
+
158
+ AutoConfig.register("videollama2_mistral", Videollama2MistralConfig)
159
+ AutoModelForCausalLM.register(Videollama2MistralConfig, Videollama2MistralForCausalLM)
videollama2/model/videollama2_mixtral.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import List, Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ from torch.nn import CrossEntropyLoss
21
+
22
+ from transformers import AutoConfig, AutoModelForCausalLM, \
23
+ MixtralConfig, MixtralModel, MixtralForCausalLM
24
+
25
+ from transformers.modeling_outputs import CausalLMOutputWithPast
26
+ from transformers.generation.utils import GenerateOutput
27
+
28
+ from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
29
+
30
+
31
class Videollama2MixtralConfig(MixtralConfig):
    """``MixtralConfig`` subclass whose only change is the ``model_type`` tag."""

    model_type = "videollama2_mixtral"

    def __init__(self, **kwargs):
        """Forward every keyword argument to ``MixtralConfig`` and pin ``model_type``."""
        super(Videollama2MixtralConfig, self).__init__(**kwargs)
        # Set on the instance as well, after the parent initializer has run.
        self.model_type = "videollama2_mixtral"
37
+
38
+
39
class Videollama2MixtralModel(Videollama2MetaModel, MixtralModel):
    """Mixtral backbone combined with the ``Videollama2MetaModel`` mixin."""

    config_class = Videollama2MixtralConfig

    def __init__(self, config: MixtralConfig):
        """Initialize through the cooperative MRO chain with ``config``."""
        super().__init__(config)
44
+
45
+
46
class Videollama2MixtralForCausalLM(MixtralForCausalLM, Videollama2MetaForCausalLM):
    """Mixtral causal LM that accepts an additional ``images`` input.

    Multimodal preparation is handled by
    ``prepare_inputs_labels_for_multimodal`` (defined outside this class);
    the stock Mixtral ``forward`` / ``generate`` do the rest.
    """

    config_class = Videollama2MixtralConfig

    def __init__(self, config, **kwargs):
        """Create the multimodal backbone and the LM head from ``config``.

        ``**kwargs`` is accepted but not used.
        """
        # MRO lookup begins after MixtralForCausalLM, skipping its __init__.
        super(MixtralForCausalLM, self).__init__(config)
        self.model = Videollama2MixtralModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped backbone (``self.model``)."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[int] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, merging ``images`` into the inputs first.

        If ``inputs_embeds`` is ``None``, the five values returned by
        ``prepare_inputs_labels_for_multimodal`` replace ``input_ids``,
        ``attention_mask``, ``past_key_values``, ``inputs_embeds`` and
        ``labels``. ``position_ids`` and extra ``**kwargs`` are accepted but
        not forwarded to the parent ``forward``.

        Returns:
            The parent ``forward`` result, unchanged.
        """
        if inputs_embeds is None:
            prepared = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images,
            )
            input_ids, attention_mask, past_key_values, inputs_embeds, labels = prepared

        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids, optionally conditioned on ``images``.

        ``position_ids`` and ``attention_mask`` are popped from ``kwargs``.
        Prompt embeddings come from ``prepare_inputs_labels_for_multimodal``
        when ``images`` is given, otherwise from the backbone's
        ``embed_tokens``. The parent ``generate`` receives ``inputs_embeds``.

        Raises:
            NotImplementedError: If ``inputs_embeds`` is passed in ``kwargs``.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is None:
            inputs_embeds = self.get_model().embed_tokens(inputs)
        else:
            prepared = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                images=images,
            )
            # Keep only the attention mask and the embeddings.
            _, attention_mask, _, inputs_embeds, _ = prepared

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs,
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Prepare per-step inputs via the parent and re-attach ``images``."""
        images = kwargs.pop("images", None)
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            **kwargs,
        )
        if images is None:
            return model_inputs
        model_inputs['images'] = images
        return model_inputs
151
+
152
+
153
+ AutoConfig.register("videollama2_mixtral", Videollama2MixtralConfig)
154
+ AutoModelForCausalLM.register(Videollama2MixtralConfig, Videollama2MixtralForCausalLM)
videollama2/model/videollama2_phi3.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ from torch.nn import CrossEntropyLoss
22
+
23
+ from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, \
24
+ Phi3Config, Phi3Model, Phi3ForCausalLM
25
+
26
+ from transformers.modeling_outputs import CausalLMOutputWithPast
27
+ from transformers.generation.utils import GenerateOutput
28
+
29
+ from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
30
+
31
+
32
class Videollama2Phi3Config(Phi3Config):
    """``Phi3Config`` subclass whose only change is the ``model_type`` tag."""

    model_type = "videollama2_phi3"

    def __init__(self, **kwargs):
        """Forward every keyword argument to ``Phi3Config`` and pin ``model_type``."""
        super(Videollama2Phi3Config, self).__init__(**kwargs)
        # Set on the instance as well, after the parent initializer has run.
        self.model_type = "videollama2_phi3"
38
+
39
+
40
class Videollama2Phi3Model(Videollama2MetaModel, Phi3Model):
    """Phi-3 backbone combined with the ``Videollama2MetaModel`` mixin."""

    config_class = Videollama2Phi3Config

    def __init__(self, config: Phi3Config):
        """Initialize through the cooperative MRO chain with ``config``."""
        super().__init__(config)
45
+
46
+
47
class Videollama2Phi3ForCausalLM(Phi3ForCausalLM, Videollama2MetaForCausalLM):
    """Phi-3 causal LM that accepts an additional ``images`` input.

    Multimodal preparation is delegated to
    ``prepare_inputs_labels_for_multimodal``, which is defined outside this
    class (presumably on ``Videollama2MetaForCausalLM`` -- confirm). The
    stock Phi-3 ``forward`` / ``generate`` implementations then do the work.
    """

    config_class = Videollama2Phi3Config

    def __init__(self, config, **kwargs):
        """Create the multimodal backbone and the LM head.

        Args:
            config: Model configuration; ``vocab_size`` and ``hidden_size``
                are read from it.
            **kwargs: Accepted for call compatibility; not used here.
        """
        # Start the MRO lookup *after* Phi3ForCausalLM so that its own
        # __init__ is skipped; the backbone and head are assigned below.
        super(Phi3ForCausalLM, self).__init__(config)
        self.model = Videollama2Phi3Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped backbone (``self.model``)."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, merging ``images`` into the inputs first.

        When ``inputs_embeds`` is ``None`` the inputs are passed through
        ``prepare_inputs_labels_for_multimodal``, whose five return values
        replace ``input_ids``, ``attention_mask``, ``past_key_values``,
        ``inputs_embeds`` and ``labels``. ``position_ids`` and any extra
        ``**kwargs`` are accepted but not forwarded to the parent ``forward``.

        Returns:
            The parent ``forward`` result. When that result is not a plain
            tuple, the (possibly rewritten) ``labels`` are attached to it as
            ``outputs.labels``.
        """
        if inputs_embeds is None:
            (
                input_ids,
                attention_mask,
                past_key_values,
                inputs_embeds,
                labels
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images
            )

        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        # With return_dict=False the parent returns a plain tuple, which
        # cannot take attributes; only annotate structured outputs.
        if not isinstance(outputs, tuple):
            outputs.labels = labels

        return outputs

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids, optionally conditioned on ``images``.

        ``position_ids`` and ``attention_mask`` are taken out of ``kwargs``.
        With ``images`` the prompt embeddings (and attention mask) come from
        ``prepare_inputs_labels_for_multimodal``; without them ``inputs`` is
        embedded with the backbone's ``embed_tokens``. The parent
        ``generate`` is then called with ``inputs_embeds``.

        Raises:
            NotImplementedError: If ``inputs_embeds`` is passed in ``kwargs``.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is not None:
            # Only the attention mask and the embeddings are needed here.
            (
                _,
                attention_mask,
                _,
                inputs_embeds,
                _
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                images=images
            )
        else:
            inputs_embeds = self.get_model().embed_tokens(inputs)

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Prepare per-step inputs via the parent and re-attach ``images``.

        ``images`` is removed from ``kwargs`` before the parent call and,
        when not ``None``, stored in the returned mapping under ``'images'``.
        """
        images = kwargs.pop("images", None)
        _inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            _inputs['images'] = images
        return _inputs
156
+
157
+
158
+ AutoConfig.register("videollama2_phi3", Videollama2Phi3Config)
159
+ AutoModelForCausalLM.register(Videollama2Phi3Config, Videollama2Phi3ForCausalLM)
videollama2/model/videollama2_qwen2.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+
22
+ from transformers import AutoConfig, AutoModelForCausalLM, \
23
+ Qwen2Config, Qwen2Model, Qwen2ForCausalLM
24
+ from transformers.modeling_outputs import CausalLMOutputWithPast
25
+ from transformers.generation.utils import GenerateOutput
26
+
27
+ from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
28
+
29
+
30
class Videollama2Qwen2Config(Qwen2Config):
    """``Qwen2Config`` subclass whose only change is the ``model_type`` tag."""

    model_type = "videollama2_qwen2"

    def __init__(self, **kwargs):
        """Forward every keyword argument to ``Qwen2Config`` and pin ``model_type``."""
        super(Videollama2Qwen2Config, self).__init__(**kwargs)
        # Set on the instance as well, after the parent initializer has run.
        self.model_type = "videollama2_qwen2"
36
+
37
+
38
class Videollama2Qwen2Model(Videollama2MetaModel, Qwen2Model):
    """Qwen2 backbone combined with the ``Videollama2MetaModel`` mixin."""

    config_class = Videollama2Qwen2Config

    def __init__(self, config: Videollama2Qwen2Config):
        """Initialize through the cooperative MRO chain with ``config``."""
        super().__init__(config)
43
+
44
+
45
class Videollama2Qwen2ForCausalLM(Qwen2ForCausalLM, Videollama2MetaForCausalLM):
    """Qwen2 causal LM that accepts an additional ``images`` input.

    Multimodal preparation is handled by
    ``prepare_inputs_labels_for_multimodal`` (defined outside this class);
    the stock Qwen2 ``forward`` / ``generate`` do the rest.
    """

    config_class = Videollama2Qwen2Config

    def __init__(self, config, **kwargs):
        """Create the multimodal backbone and the LM head from ``config``.

        ``**kwargs`` is accepted but not used.
        """
        # MRO lookup begins after Qwen2ForCausalLM, skipping its __init__.
        super(Qwen2ForCausalLM, self).__init__(config)
        self.model = Videollama2Qwen2Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped backbone (``self.model``)."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[int] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, merging ``images`` into the inputs first.

        If ``inputs_embeds`` is ``None``, the five values returned by
        ``prepare_inputs_labels_for_multimodal`` replace ``input_ids``,
        ``attention_mask``, ``past_key_values``, ``inputs_embeds`` and
        ``labels``. ``position_ids`` and extra ``**kwargs`` are accepted but
        not forwarded to the parent ``forward``.

        Returns:
            The parent ``forward`` result, unchanged.
        """
        if inputs_embeds is None:
            prepared = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images,
            )
            input_ids, attention_mask, past_key_values, inputs_embeds, labels = prepared

        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids, optionally conditioned on ``images``.

        ``position_ids`` and ``attention_mask`` are popped from ``kwargs``.
        Prompt embeddings come from ``prepare_inputs_labels_for_multimodal``
        when ``images`` is given, otherwise from the backbone's
        ``embed_tokens``. The parent ``generate`` receives ``inputs_embeds``.

        Raises:
            NotImplementedError: If ``inputs_embeds`` is passed in ``kwargs``.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is None:
            inputs_embeds = self.get_model().embed_tokens(inputs)
        else:
            prepared = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                images=images,
            )
            # Keep only the attention mask and the embeddings.
            _, attention_mask, _, inputs_embeds, _ = prepared

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs,
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Prepare per-step inputs via the parent and re-attach ``images``."""
        images = kwargs.pop("images", None)
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            **kwargs,
        )
        if images is None:
            return model_inputs
        model_inputs['images'] = images
        return model_inputs
150
+
151
+
152
+ AutoConfig.register("videollama2_qwen2", Videollama2Qwen2Config)
153
+ AutoModelForCausalLM.register(Videollama2Qwen2Config, Videollama2Qwen2ForCausalLM)
videollama2/serve/cli.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import torch
3
+
4
+ from videollama2.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, NUM_FRAMES
5
+ from videollama2.conversation import conv_templates, SeparatorStyle
6
+ from videollama2.model.builder import load_pretrained_model
7
+ from videollama2.utils import disable_torch_init
8
+ from videollama2.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, tokenizer_MMODAL_token
9
+
10
+ from PIL import Image
11
+ from decord import VideoReader, cpu
12
+
13
+ import requests
14
+ from io import BytesIO
15
+ from transformers import TextStreamer
16
+
17
+
18
def load_image(image_file):
    """Open an image from a local path or an HTTP(S) URL and convert it to RGB.

    Args:
        image_file: Filesystem path, or a URL starting with ``http://`` or
            ``https://``.

    Returns:
        The result of ``Image.open(...).convert('RGB')``.

    Raises:
        requests.HTTPError: If the download answers with an error status.
    """
    if image_file.startswith(('http://', 'https://')):
        response = requests.get(image_file)
        # Surface 4xx/5xx responses here rather than handing an error page
        # to the image decoder.
        response.raise_for_status()
        source = BytesIO(response.content)
    else:
        source = image_file
    return Image.open(source).convert('RGB')
25
+
26
def load_video(video_file):
    """Read ``NUM_FRAMES`` evenly spaced frames from a video file.

    Args:
        video_file: Path/URI handed to ``decord.VideoReader``.

    Returns:
        The batch returned by ``VideoReader.get_batch`` for the sampled
        frame indices.
    """
    # numpy is not imported at module level in this file, so bring it into
    # scope here; without it the `np.linspace` call raises NameError.
    import numpy as np

    decord_vr = VideoReader(uri=video_file, ctx=cpu(0))
    duration = len(decord_vr)
    # Integer indices spread uniformly over [0, duration - 1].
    frame_id_list = np.linspace(0, duration - 1, NUM_FRAMES, dtype=int)
    return decord_vr.get_batch(frame_id_list)
32
+
33
def load_image_or_video(image_or_video_file):
    """Load an image or a video, chosen by the file extension.

    Extensions are matched case-insensitively: ``.jpg``/``.jpeg``/``.png``/
    ``.bmp`` go to ``load_image`` and ``.mp4``/``.avi``/``.mov`` go to
    ``load_video``.

    Args:
        image_or_video_file: Path (or URL) of the media file.

    Returns:
        Whatever ``load_image`` or ``load_video`` returns.

    Raises:
        Exception: If the extension is not one of the supported ones.
    """
    # Match on a lower-cased copy, but pass the caller's original string on.
    lowered = image_or_video_file.lower()
    if lowered.endswith(('.jpg', '.jpeg', '.png', '.bmp')):
        return load_image(image_file=image_or_video_file)
    if lowered.endswith(('.mp4', '.avi', '.mov')):
        return load_video(video_file=image_or_video_file)
    raise Exception(f"File type of {image_or_video_file} not supported!!!")
40
+
41
+
42
def main(args):
    """Run an interactive, single-image chat session on the terminal.

    Loads the model named by ``args.model_path``, preprocesses the image at
    ``args.image_file`` once, then loops: read a user line, build the
    prompt, stream the model's reply and store it in the conversation. An
    empty line or end-of-input ends the session.

    Args:
        args: Parsed command-line namespace (``model_path``, ``model_base``,
            ``image_file``, ``device``, ``conv_mode``, ``temperature``,
            ``max_new_tokens``, ``load_8bit``, ``load_4bit``, ``debug``).
    """
    # Model
    disable_torch_init()

    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        args.model_path,
        args.model_base,
        model_name,
        args.load_8bit,
        args.load_4bit,
        device=args.device,
    )

    # Inferring the conversation mode from the model name is disabled;
    # the mode is fixed for now.
    conv_mode = "llava_v1"  # fix conversation mode for now

    if args.conv_mode is None or conv_mode == args.conv_mode:
        args.conv_mode = conv_mode
    else:
        print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode))

    conv = conv_templates[args.conv_mode].copy()
    roles = conv.roles

    image = load_image(args.image_file)
    image_size = image.size
    # Similar operation in model_worker.py
    image_tensor = process_images([image], image_processor, model.config)
    if type(image_tensor) is list:
        image_tensor = [tensor.to(model.device, dtype=torch.float16) for tensor in image_tensor]
    else:
        image_tensor = image_tensor.to(model.device, dtype=torch.float16)

    while True:
        try:
            inp = input(f"{roles[0]}: ")
        except EOFError:
            inp = ""
        if not inp:
            print("exit...")
            break

        print(f"{roles[1]}: ", end="")

        if image is not None:
            # First message: prefix the image placeholder token(s).
            if model.config.mm_use_im_start_end:
                image_marker = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
            else:
                image_marker = DEFAULT_IMAGE_TOKEN
            inp = image_marker + '\n' + inp
            # `image` doubles as the "first turn" flag; clear it now.
            image = None
        conv.append_message(conv.roles[0], inp)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
        input_ids = input_ids.unsqueeze(0).to(model.device)
        if conv.sep_style != SeparatorStyle.TWO:
            stop_str = conv.sep
        else:
            stop_str = conv.sep2
        keywords = [stop_str]
        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=image_tensor,
                image_sizes=[image_size],
                do_sample=bool(args.temperature > 0),
                temperature=args.temperature,
                max_new_tokens=args.max_new_tokens,
                streamer=streamer,
                use_cache=True,
            )

        # Decode only the tokens that follow the prompt positions.
        new_tokens = output_ids[0, input_ids.shape[1]:]
        outputs = tokenizer.decode(new_tokens).strip()
        conv.messages[-1][-1] = outputs

        if args.debug:
            print("\n", {"prompt": prompt, "outputs": outputs}, "\n")
124
+
125
+
126
if __name__ == "__main__":
    # Script entry point: declare the command-line options in a table,
    # register them in order, then hand the parsed namespace to main().
    cli_parser = argparse.ArgumentParser()
    option_table = (
        ("--model-path", dict(type=str, default="facebook/opt-350m")),
        ("--model-base", dict(type=str, default=None)),
        ("--image-file", dict(type=str, required=True)),
        ("--device", dict(type=str, default="cuda")),
        ("--conv-mode", dict(type=str, default=None)),
        ("--temperature", dict(type=float, default=0.2)),
        ("--max-new-tokens", dict(type=int, default=512)),
        ("--load-8bit", dict(action="store_true")),
        ("--load-4bit", dict(action="store_true")),
        ("--debug", dict(action="store_true")),
    )
    for flag, options in option_table:
        cli_parser.add_argument(flag, **options)
    main(cli_parser.parse_args())
videollama2/serve/controller.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A controller manages distributed workers.
3
+ It sends worker addresses to clients.
4
+ """
5
+ import argparse
6
+ import asyncio
7
+ import dataclasses
8
+ from enum import Enum, auto
9
+ import json
10
+ import logging
11
+ import time
12
+ from typing import List, Union
13
+ import threading
14
+
15
+ from fastapi import FastAPI, Request
16
+ from fastapi.responses import StreamingResponse
17
+ import numpy as np
18
+ import requests
19
+ import uvicorn
20
+
21
+ from videollama2.constants import CONTROLLER_HEART_BEAT_EXPIRATION
22
+ from videollama2.utils import build_logger, server_error_msg
23
+
24
+
25
+ logger = build_logger("controller", "controller.log")
26
+
27
+
28
class DispatchMethod(Enum):
    """Strategy the controller uses to pick a worker for a model."""

    LOTTERY = auto()
    SHORTEST_QUEUE = auto()

    @classmethod
    def from_str(cls, name):
        """Map a command-line style name to the matching enum member.

        Args:
            name: ``"lottery"`` or ``"shortest_queue"``.

        Returns:
            The corresponding ``DispatchMethod`` member.

        Raises:
            ValueError: if ``name`` is not one of the supported names.
        """
        if name == "lottery":
            return cls.LOTTERY
        elif name == "shortest_queue":
            return cls.SHORTEST_QUEUE
        else:
            # Include the rejected value so the failure is diagnosable.
            raise ValueError(f"Invalid dispatch method: {name}")
40
+
41
+
42
@dataclasses.dataclass
class WorkerInfo:
    """Bookkeeping record the controller keeps for one registered worker."""

    # Names of the models this worker reports it can serve.
    model_names: List[str]
    # Relative speed reported by the worker; used as a dispatch weight.
    speed: int
    # Last known number of queued requests on the worker.
    queue_length: int
    # Whether the worker is subject to heart-beat expiration.
    check_heart_beat: bool
    # ``time.time()`` timestamp of the most recent registration/heart beat.
    last_heart_beat: float
49
+
50
+
51
def heart_beat_controller(controller):
    """Background loop that periodically expires silent workers.

    Never returns: it sleeps for ``CONTROLLER_HEART_BEAT_EXPIRATION`` seconds
    and then asks ``controller`` to drop the workers whose heart beat has
    expired, forever.

    Args:
        controller: object exposing ``remove_stable_workers_by_expiration()``.
    """
    while True:
        # Sleep first, then sweep.
        time.sleep(CONTROLLER_HEART_BEAT_EXPIRATION)
        controller.remove_stable_workers_by_expiration()
55
+
56
+
57
class Controller:
    """Registry of model workers that hands out worker addresses to clients.

    Workers are keyed by ``worker_name``, which is also used as the base URL
    for HTTP calls to the worker.
    """

    def __init__(self, dispatch_method: str):
        """Create the registry and start the heart-beat expiration thread.

        Args:
            dispatch_method: name accepted by ``DispatchMethod.from_str``.
        """
        # Dict[str -> WorkerInfo]
        self.worker_info = {}
        self.dispatch_method = DispatchMethod.from_str(dispatch_method)

        self.heart_beat_thread = threading.Thread(
            target=heart_beat_controller, args=(self,), daemon=True)
        self.heart_beat_thread.start()

        logger.info("Init controller")

    def register_worker(self, worker_name: str, check_heart_beat: bool,
                        worker_status: dict):
        """Add or overwrite the registry entry for ``worker_name``.

        Args:
            worker_name: address of the worker.
            check_heart_beat: whether the worker may be expired for silence.
            worker_status: dict with ``model_names``, ``speed`` and
                ``queue_length``; when falsy the status is fetched from the
                worker itself.

        Returns:
            True on success, False when no status could be obtained.
        """
        if worker_name not in self.worker_info:
            logger.info(f"Register a new worker: {worker_name}")
        else:
            logger.info(f"Register an existing worker: {worker_name}")

        if not worker_status:
            worker_status = self.get_worker_status(worker_name)
        if not worker_status:
            return False

        self.worker_info[worker_name] = WorkerInfo(
            worker_status["model_names"], worker_status["speed"], worker_status["queue_length"],
            check_heart_beat, time.time())

        logger.info(f"Register done: {worker_name}, {worker_status}")
        return True

    def get_worker_status(self, worker_name: str):
        """POST to the worker's ``/worker_get_status`` endpoint.

        Returns:
            The decoded JSON body, or None on a request error or a non-200
            response.
        """
        try:
            r = requests.post(worker_name + "/worker_get_status", timeout=5)
        except requests.exceptions.RequestException as e:
            logger.error(f"Get status fails: {worker_name}, {e}")
            return None

        if r.status_code != 200:
            logger.error(f"Get status fails: {worker_name}, {r}")
            return None

        return r.json()

    def remove_worker(self, worker_name: str):
        """Delete ``worker_name`` from the registry (KeyError if absent)."""
        del self.worker_info[worker_name]

    def refresh_all_workers(self):
        """Re-register every known worker, dropping those that do not answer."""
        old_info = dict(self.worker_info)
        self.worker_info = {}

        for w_name, w_info in old_info.items():
            # Passing None forces a fresh status query to the worker.
            if not self.register_worker(w_name, w_info.check_heart_beat, None):
                logger.info(f"Remove stale worker: {w_name}")

    def list_models(self):
        """Return the distinct model names served by registered workers."""
        model_names = set()

        for w_info in self.worker_info.values():
            model_names.update(w_info.model_names)

        return list(model_names)

    def get_worker_address(self, model_name: str):
        """Pick a worker serving ``model_name`` using the dispatch method.

        Returns:
            The chosen worker's address, or ``""`` when no usable worker is
            registered for the model.

        Raises:
            ValueError: if the configured dispatch method is unknown.
        """
        if self.dispatch_method == DispatchMethod.LOTTERY:
            worker_names = []
            worker_speeds = []
            for w_name, w_info in self.worker_info.items():
                if model_name in w_info.model_names:
                    worker_names.append(w_name)
                    worker_speeds.append(w_info.speed)
            worker_speeds = np.array(worker_speeds, dtype=np.float32)
            norm = np.sum(worker_speeds)
            # No candidates (or only zero-speed ones): nothing to draw from.
            if norm < 1e-4:
                return ""
            worker_speeds = worker_speeds / norm
            # Speed-weighted random draw; the address is returned directly
            # without probing the worker first.
            pt = np.random.choice(np.arange(len(worker_names)),
                                  p=worker_speeds)
            return worker_names[pt]
        elif self.dispatch_method == DispatchMethod.SHORTEST_QUEUE:
            worker_names = []
            worker_qlen = []
            for w_name, w_info in self.worker_info.items():
                # Workers reporting a non-positive speed are skipped: they
                # would divide by zero here and are never drawn by the
                # lottery method either.
                if model_name in w_info.model_names and w_info.speed > 0:
                    worker_names.append(w_name)
                    worker_qlen.append(w_info.queue_length / w_info.speed)
            if len(worker_names) == 0:
                return ""
            min_index = np.argmin(worker_qlen)
            w_name = worker_names[min_index]
            # Count the request being dispatched until the next heart beat
            # overwrites the queue length.
            self.worker_info[w_name].queue_length += 1
            logger.info(f"names: {worker_names}, queue_lens: {worker_qlen}, ret: {w_name}")
            return w_name
        else:
            raise ValueError(f"Invalid dispatch method: {self.dispatch_method}")

    def receive_heart_beat(self, worker_name: str, queue_length: int):
        """Record a heart beat from a worker.

        Returns:
            False if the worker is not registered, True otherwise.
        """
        if worker_name not in self.worker_info:
            logger.info(f"Receive unknown heart beat. {worker_name}")
            return False

        self.worker_info[worker_name].queue_length = queue_length
        self.worker_info[worker_name].last_heart_beat = time.time()
        logger.info(f"Receive heart beat. {worker_name}")
        return True

    def remove_stable_workers_by_expiration(self):
        """Remove heart-beat-checked workers whose last beat is too old."""
        expire = time.time() - CONTROLLER_HEART_BEAT_EXPIRATION
        # Collect first, delete afterwards: the dict must not change size
        # while it is being iterated.
        to_delete = []
        for worker_name, w_info in self.worker_info.items():
            if w_info.check_heart_beat and w_info.last_heart_beat < expire:
                to_delete.append(worker_name)

        for worker_name in to_delete:
            self.remove_worker(worker_name)

    def worker_api_generate_stream(self, params):
        """Proxy a generation request to a worker, yielding its chunks.

        Each yielded item is a bytes chunk terminated by ``b"\\0"``. When no
        worker is available a single error record (``error_code`` 2) is
        yielded; when the worker request fails an error record with
        ``error_code`` 3 is yielded.
        """
        worker_addr = self.get_worker_address(params["model"])
        if not worker_addr:
            logger.info(f"no worker: {params['model']}")
            ret = {
                "text": server_error_msg,
                "error_code": 2,
            }
            yield json.dumps(ret).encode() + b"\0"
            # Stop here: without an address the request below can only fail
            # and would emit a second, misleading error record.
            return

        try:
            response = requests.post(worker_addr + "/worker_generate_stream",
                                     json=params, stream=True, timeout=5)
            for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
                if chunk:
                    yield chunk + b"\0"
        except requests.exceptions.RequestException:
            logger.info(f"worker timeout: {worker_addr}")
            ret = {
                "text": server_error_msg,
                "error_code": 3,
            }
            yield json.dumps(ret).encode() + b"\0"

    # Let the controller act as a worker to achieve hierarchical
    # management. This can be used to connect isolated sub networks.
    def worker_api_get_status(self):
        """Aggregate the live status of all registered workers.

        Returns:
            Dict with the union of ``model_names`` and the summed ``speed``
            and ``queue_length`` of every worker that answered.
        """
        model_names = set()
        speed = 0
        queue_length = 0

        for w_name in self.worker_info:
            worker_status = self.get_worker_status(w_name)
            if worker_status is not None:
                model_names.update(worker_status["model_names"])
                speed += worker_status["speed"]
                queue_length += worker_status["queue_length"]

        return {
            "model_names": list(model_names),
            "speed": speed,
            "queue_length": queue_length,
        }
237
+
238
+
239
+ app = FastAPI()
240
+
241
+
242
@app.post("/register_worker")
async def register_worker(request: Request):
    """Register (or re-register) the worker described by the JSON body."""
    payload = await request.json()
    name = payload["worker_name"]
    check_heart_beat = payload["check_heart_beat"]
    # ``worker_status`` is optional in the request body.
    status = payload.get("worker_status", None)
    controller.register_worker(name, check_heart_beat, status)
248
+
249
+
250
@app.post("/refresh_all_workers")
async def refresh_all_workers():
    """Ask the controller to re-query every registered worker."""
    # The controller method returns nothing, so there is no result to keep.
    controller.refresh_all_workers()
253
+
254
+
255
@app.post("/list_models")
async def list_models():
    """Return the model names known to the controller as ``{"models": [...]}``."""
    return {"models": controller.list_models()}
259
+
260
+
261
@app.post("/get_worker_address")
async def get_worker_address(request: Request):
    """Return a worker address for the ``model`` named in the JSON body."""
    payload = await request.json()
    return {"address": controller.get_worker_address(payload["model"])}
266
+
267
+
268
@app.post("/receive_heart_beat")
async def receive_heart_beat(request: Request):
    """Record a worker heart beat; ``exist`` tells the worker if it is known."""
    payload = await request.json()
    worker_name = payload["worker_name"]
    queue_length = payload["queue_length"]
    known = controller.receive_heart_beat(worker_name, queue_length)
    return {"exist": known}
274
+
275
+
276
@app.post("/worker_generate_stream")
async def worker_api_generate_stream(request: Request):
    """Stream a generation produced by one of the controller's workers."""
    body = await request.json()
    # The controller method is a generator; wrap it so chunks are streamed.
    return StreamingResponse(controller.worker_api_generate_stream(body))
281
+
282
+
283
@app.post("/worker_get_status")
async def worker_api_get_status(request: Request):
    """Report the aggregated status of all workers behind this controller."""
    status = controller.worker_api_get_status()
    return status
286
+
287
+
288
if __name__ == "__main__":
    # Script entry point: parse options, build the module-level ``controller``
    # used by the route handlers, and serve the FastAPI app.
    parser = argparse.ArgumentParser()
    _cli_options = (
        ("--host", {"type": str, "default": "localhost"}),
        ("--port", {"type": int, "default": 21001}),
        ("--dispatch-method", {
            "type": str,
            "choices": ["lottery", "shortest_queue"],
            "default": "shortest_queue",
        }),
    )
    for _flag, _spec in _cli_options:
        parser.add_argument(_flag, **_spec)
    args = parser.parse_args()
    logger.info(f"args: {args}")

    controller = Controller(args.dispatch_method)
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
videollama2/serve/gradio_web_server.py ADDED
@@ -0,0 +1,499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import time
4
+ import hashlib
5
+ import requests
6
+ import argparse
7
+ import datetime
8
+
9
+ import numpy as np
10
+ import gradio as gr
11
+ from decord import VideoReader, cpu
12
+
13
+ from videollama2.constants import LOGDIR, NUM_FRAMES
14
+ from videollama2.conversation import (default_conversation, conv_templates,SeparatorStyle)
15
+ from videollama2.utils import (build_logger, server_error_msg, violates_moderation, moderation_msg)
16
+
17
+
18
+ logger = build_logger("gradio_web_server", "gradio_web_server.log")
19
+
20
+ headers = {"User-Agent": "Videollama2 Client"}
21
+
22
+ no_change_btn = gr.Button.update()
23
+ enable_btn = gr.Button.update(interactive=True)
24
+ disable_btn = gr.Button.update(interactive=False)
25
+
26
+ priority = {
27
+ "vicuna-13b": "aaaaaaa",
28
+ "koala-13b": "aaaaaab",
29
+ }
30
+
31
+
32
def get_conv_log_filename():
    """Return the path of today's conversation log inside ``LOGDIR``.

    The file name has the form ``<year>-<MM>-<DD>-conv.json`` based on the
    current local date.
    """
    today = datetime.datetime.now()
    filename = f"{today.year}-{today.month:02d}-{today.day:02d}-conv.json"
    return os.path.join(LOGDIR, filename)
36
+
37
+
38
def get_model_list():
    """Refresh the controller's workers and return the sorted model names.

    Names listed in ``priority`` sort by their priority key; all others sort
    by their own name.

    Returns:
        The list of model names reported by the controller.

    Raises:
        RuntimeError: if the controller does not answer the refresh request
            with HTTP 200.
    """
    ret = requests.post(args.controller_url + "/refresh_all_workers")
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if ret.status_code != 200:
        raise RuntimeError(
            f"Controller refresh failed with HTTP status {ret.status_code}")
    ret = requests.post(args.controller_url + "/list_models")
    models = ret.json()["models"]
    models.sort(key=lambda x: priority.get(x, x))
    logger.info(f"Models: {models}")
    return models
46
+
47
+
48
+ get_window_url_params = """
49
+ function() {
50
+ const params = new URLSearchParams(window.location.search);
51
+ url_params = Object.fromEntries(params);
52
+ console.log(url_params);
53
+ return url_params;
54
+ }
55
+ """
56
+
57
+
58
def load_demo(url_params, request: gr.Request):
    """Initialise a session from the page URL parameters.

    Args:
        url_params: mapping of the page's query parameters.
        request: Gradio request, used only for logging the client address.

    Returns:
        ``(state, dropdown_update)``: a fresh conversation and an update that
        shows the model dropdown, pre-selecting ``url_params["model"]`` when
        it names a model in the module-level ``models`` list.
    """
    logger.info(f"load_demo. ip: {request.client.host}. params: {url_params}")

    if "model" in url_params and url_params["model"] in models:
        dropdown_update = gr.Dropdown.update(
            value=url_params["model"], visible=True)
    else:
        dropdown_update = gr.Dropdown.update(visible=True)

    return default_conversation.copy(), dropdown_update
70
+
71
+
72
def load_demo_refresh_model_list(request: gr.Request):
    """Initialise a session after re-querying the controller for models.

    Returns:
        ``(state, dropdown_update)``: a fresh conversation and a dropdown
        update carrying the current model choices, with the first one (or
        ``""`` when there are none) selected.
    """
    logger.info(f"load_demo. ip: {request.client.host}")
    available = get_model_list()
    state = default_conversation.copy()
    selected = available[0] if len(available) > 0 else ""
    return state, gr.Dropdown.update(choices=available, value=selected)
81
+
82
+
83
def vote_last_response(state, vote_type, model_selector, request: gr.Request):
    """Append one JSON line describing a user vote to today's log file.

    Args:
        state: conversation whose ``dict()`` is stored in the record.
        vote_type: label stored under ``"type"``.
        model_selector: model name stored under ``"model"``.
        request: Gradio request supplying the client address.
    """
    with open(get_conv_log_filename(), "a") as log_file:
        record = {
            "tstamp": round(time.time(), 4),
            "type": vote_type,
            "model": model_selector,
            "state": state.dict(),
            "ip": request.client.host,
        }
        line = json.dumps(record) + "\n"
        log_file.write(line)
93
+
94
+
95
def upvote_last_response(state, model_selector, request: gr.Request):
    """Log an upvote, then clear the textbox and disable the vote buttons."""
    logger.info(f"upvote. ip: {request.client.host}")
    vote_last_response(state, "upvote", model_selector, request)
    # (textbox, upvote, downvote, flag)
    return ("", disable_btn, disable_btn, disable_btn)
99
+
100
+
101
def downvote_last_response(state, model_selector, request: gr.Request):
    """Log a downvote, then clear the textbox and disable the vote buttons."""
    logger.info(f"downvote. ip: {request.client.host}")
    vote_last_response(state, "downvote", model_selector, request)
    # (textbox, upvote, downvote, flag)
    return ("", disable_btn, disable_btn, disable_btn)
105
+
106
+
107
def flag_last_response(state, model_selector, request: gr.Request):
    """Log a flag, then clear the textbox and disable the vote buttons."""
    logger.info(f"flag. ip: {request.client.host}")
    vote_last_response(state, "flag", model_selector, request)
    # (textbox, upvote, downvote, flag)
    return ("", disable_btn, disable_btn, disable_btn)
111
+
112
+
113
def regenerate(state, image_process_mode, request: gr.Request):
    """Reset the last assistant reply so it can be generated again.

    The last message's content is cleared. If the preceding message carries
    a tuple/list payload, its third element is replaced with the current
    ``image_process_mode``.

    Returns:
        A 10-tuple matching
        ``(state, chatbot, textbox, imagebox, videobox)`` plus five button
        updates (all disabled).
    """
    logger.info(f"regenerate. ip: {request.client.host}")
    state.messages[-1][-1] = None
    prev_human_msg = state.messages[-2]
    # isinstance (rather than an exact type match) also accepts subclasses
    # of tuple/list.
    if isinstance(prev_human_msg[1], (tuple, list)):
        prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode)
    state.skip_next = False
    # (state, chatbot, textbox, imagebox, videobox, upvote, downvote, flag, generate, clear)
    return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
122
+
123
+
124
def clear_history(request: gr.Request):
    """Start over with a fresh conversation and empty input widgets.

    Returns:
        A 10-tuple: the new state, its chatbot rendering, an empty textbox
        value, ``None`` for the image and video boxes, and five disabled
        button updates.
    """
    logger.info(f"clear_history. ip: {request.client.host}")
    fresh = default_conversation.copy()
    buttons = (disable_btn,) * 5
    # (state, chatbot, textbox, imagebox, videobox, upvote, downvote, flag, generate, clear)
    return (fresh, fresh.to_gradio_chatbot(), "", None, None) + buttons
129
+
130
+
131
def add_text_ori(state, text, image, video, image_process_mode, request: gr.Request):
    """Append a user turn (text plus optional image or video) to ``state``.

    Every return path yields the same 10-slot tuple:
    ``(state, chatbot, textbox, imagebox, videobox)`` followed by five button
    updates.

    Args:
        state: current conversation.
        text: user text.
        image: PIL image from the image box, or None.
        video: file path from the video box, or None.
        image_process_mode: preprocessing mode stored with the media.
        request: Gradio request, used for logging the client address.

    Raises:
        AssertionError: if both ``image`` and ``video`` are supplied.
    """
    # note: imagebox itself is PIL object while videobox is filepath
    logger.info(f"add_text. ip: {request.client.host}. len: {len(text)}")
    if len(text) <= 0 and image is None:
        state.skip_next = True
        # Includes the videobox slot so the arity matches the normal return.
        return (state, state.to_gradio_chatbot(), "", None, None) + (no_change_btn,) * 5
    if args.moderate:
        flagged = violates_moderation(text)
        if flagged:
            state.skip_next = True
            # Same 10-slot shape; the textbox shows the moderation message.
            return (state, state.to_gradio_chatbot(), moderation_msg, None, None) + (
                no_change_btn,) * 5
    assert image is None or video is None, "Please don't feed image and video inputs at the same time!!!"
    text = text[:1536]  # Hard cut-off
    if image is not None:
        # here image is the PIL object itself
        text = text[:1200]  # Hard cut-off for images
        if '<image>' not in text:
            text = text + '\n<image>'
        text = (text, image, image_process_mode)
        # A conversation that already holds an image is restarted.
        if len(state.get_images(return_pil=True)) > 0:
            state = default_conversation.copy()
        # NOTE(review): nesting reconstructed from a flattened source; the
        # modality is assumed to be set for every image input - confirm.
        state.modality = "image"
    if video is not None:
        print("Video box:", video)
        # here video is the file path of video
        text = text[:1200]  # Hard cut-off for videos
        if '<video>' not in text:
            text = text + '\n<video>'
        text = (text, video, image_process_mode)
        # A conversation that already holds a video is restarted.
        if len(state.get_videos(return_pil=True)) > 0:
            state = default_conversation.copy()
        state.modality = "video"
        print("Set modality as video...")
    state.append_message(state.roles[0], text)
    state.append_message(state.roles[1], None)
    state.skip_next = False
    # (state, chatbot, textbox, imagebox, videobox, upvote, downvote, flag, generate, clear)
    return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
172
+
173
+
174
def add_text(state, text, image, video, image_process_mode, request: gr.Request):
    """Append a user turn (text plus optional image or video) to ``state``.

    A new image or video always starts a fresh conversation. Every return
    path yields the same 10-slot tuple:
    ``(state, chatbot, textbox, imagebox, videobox)`` followed by five button
    updates, matching the outputs this handler is wired to.

    Args:
        state: current conversation.
        text: user text.
        image: image from the image box, or None.
        video: video from the video box, or None.
        image_process_mode: preprocessing mode stored with the media.
        request: Gradio request, used for logging the client address.
    """
    logger.info(f"add_text. ip: {request.client.host}. len: {len(text)}")

    # if input is new video or image ,reset the state
    if image is not None or video is not None:
        state = default_conversation.copy()

    # Nothing to send at all: flag the turn so generation is skipped.
    if len(text) <= 0 and image is None and video is None:
        state.skip_next = True
        return (state, state.to_gradio_chatbot(), "", None, None) + (no_change_btn,) * 5

    if args.moderate:
        flagged = violates_moderation(text)
        if flagged:
            state.skip_next = True
            # Includes the videobox slot so the arity matches the other
            # returns; the textbox shows the moderation message.
            return (state, state.to_gradio_chatbot(), moderation_msg, None, None) + (no_change_btn,) * 5

    # process the input video
    if video is not None:
        text = text[:1200]  # Hard cut-off when media is attached
        if '<video>' not in text:
            text = text + '\n<video>'
        text = (text, video, image_process_mode)
        state.modality = "video"
    # process the input image
    elif image is not None:
        text = text[:1200]  # Hard cut-off when media is attached
        if '<image>' not in text:
            text = text + '\n<image>'
        text = (text, image, image_process_mode)
        state.modality = "image"
    # Follow-up text in a conversation that already has media.
    elif state.modality == "image" and len(text) > 0:
        state.modality = "image_text"
        text = text[:1536]  # Hard cut-off
    elif state.modality == "video" and len(text) > 0:
        state.modality = "video_text"
        text = text[:1536]  # Hard cut-off

    state.append_message(state.roles[0], text)
    state.append_message(state.roles[1], None)
    state.skip_next = False

    return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
217
+
218
+
219
def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request: gr.Request):
    """Stream the model's reply for the pending assistant turn in ``state``.

    This is a generator. Each yielded value is a 7-tuple
    ``(state, chatbot)`` plus five button updates.

    Steps: optionally rebuild the conversation from a template on the first
    round, ask the controller for a worker, save the input frames under
    ``LOGDIR``, stream the worker output into the last message, then append
    a record to the conversation log.

    Args:
        state: conversation whose last message is the empty assistant turn.
        model_selector: name of the model to query.
        temperature: sampling temperature (converted to float).
        top_p: nucleus sampling value (converted to float).
        max_new_tokens: requested token budget, capped at 1536.
        request: Gradio request supplying the client address.
    """
    logger.info(f"http_bot. ip: {request.client.host}")
    start_tstamp = time.time()
    model_name = model_selector

    if state.skip_next:
        # This generate call is skipped due to invalid inputs
        yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
        return

    if len(state.messages) == state.offset + 2:
        # First round of conversation
        if "llava" in model_name.lower():
            if 'llama-2' in model_name.lower():
                template_name = "llava_llama2"
            elif "v1" in model_name.lower():
                if 'mmtag' in model_name.lower():
                    template_name = "v1_mmtag"
                elif 'plain' in model_name.lower() and 'finetune' not in model_name.lower():
                    template_name = "v1_mmtag"
                else:
                    template_name = "llava_v1"
            else:
                if 'mmtag' in model_name.lower():
                    template_name = "v0_mmtag"
                elif 'plain' in model_name.lower() and 'finetune' not in model_name.lower():
                    template_name = "v0_mmtag"
                else:
                    template_name = "llava_v0"
        elif "llama-2" in model_name:
            template_name = "llama2"
        else:
            template_name = "vicuna_v1"
        # NOTE(review): this unconditional override makes the selection above
        # a dead store. Its nesting was reconstructed from a flattened
        # source - confirm it is not meant to sit inside the final ``else``.
        template_name = "llava_v1"
        new_state = conv_templates[template_name].copy()
        new_state.append_message(new_state.roles[0], state.messages[-2][1])
        new_state.append_message(new_state.roles[1], None)
        new_state.modality = state.modality
        state = new_state

    # Query worker address
    controller_url = args.controller_url
    ret = requests.post(controller_url + "/get_worker_address",
                        json={"model": model_name})
    worker_addr = ret.json()["address"]
    logger.info(f"model_name: {model_name}, worker_addr: {worker_addr}")

    # No available worker
    if worker_addr == "":
        state.messages[-1][-1] = server_error_msg
        yield (state, state.to_gradio_chatbot(), disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
        return

    is_image = state.modality == "image" or state.modality == "image_text"
    is_video = state.modality == "video" or state.modality == "video_text"

    # Construct prompt
    prompt = state.get_prompt()
    # Default to no media so any other modality value does not leave
    # ``all_images`` unbound.
    all_images = []
    if is_image:
        all_images = state.get_images(return_pil=True)  # return PIL.Image object
    elif is_video:
        all_images = state.get_videos(return_pil=True)  # return video frames where each frame is a PIL.Image object
    all_image_hash = [hashlib.md5(image.tobytes()).hexdigest() for image in all_images]
    # Save each input frame once, under a per-day directory keyed by hash.
    for idx, (image, image_hash) in enumerate(zip(all_images, all_image_hash)):
        t = datetime.datetime.now()
        if is_image:
            filename = os.path.join(LOGDIR, "serve_images", f"{t.year}-{t.month:02d}-{t.day:02d}", f"{image_hash}.jpg")
        elif is_video:
            filename = os.path.join(LOGDIR, "serve_videos", f"{t.year}-{t.month:02d}-{t.day:02d}", f"{image_hash}_{idx}.jpg")
        if not os.path.isfile(filename):
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            image.save(filename)

    # Make requests
    pload = {
        "model": model_name,
        "prompt": prompt,
        "temperature": float(temperature),
        "top_p": float(top_p),
        "max_new_tokens": min(int(max_new_tokens), 1536),
        "stop": state.sep if state.sep_style in [SeparatorStyle.SINGLE] else state.sep2,
        # Summary only, for the log line below; the real payload is attached
        # after logging.
        "images": f'List of {len(all_image_hash)} images: {all_image_hash}',
    }
    logger.info(f"==== request ====\n{pload}")

    if is_image:
        pload['images'] = state.get_images()
    elif is_video:
        pload['images'] = state.get_videos()
    else:
        # NOTE(review): assumes the worker accepts an empty list for a
        # media-free request rather than the summary string - confirm.
        pload['images'] = []

    state.messages[-1][-1] = "▌"
    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5

    # Bound up front so the final log line works even if the worker streams
    # no chunk at all.
    output = ""
    try:
        # Stream output
        response = requests.post(worker_addr + "/worker_generate_stream",
                                 headers=headers, json=pload, stream=True, timeout=10)
        for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
            if chunk:
                data = json.loads(chunk.decode())
                if data["error_code"] == 0:
                    # The worker echoes the prompt; keep only the new text.
                    output = data["text"][len(prompt):].strip()
                    state.messages[-1][-1] = output + "▌"
                    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
                else:
                    output = data["text"] + f" (error_code: {data['error_code']})"
                    state.messages[-1][-1] = output
                    yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
                    return
                time.sleep(0.03)
    except requests.exceptions.RequestException:
        state.messages[-1][-1] = server_error_msg
        yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
        return

    # Drop the trailing cursor character.
    state.messages[-1][-1] = state.messages[-1][-1][:-1]
    yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5

    finish_tstamp = time.time()
    logger.info(f"{output}")

    with open(get_conv_log_filename(), "a") as fout:
        data = {
            "tstamp": round(finish_tstamp, 4),
            "type": "chat",
            "model": model_name,
            "start": round(start_tstamp, 4),
            # The finish time, not a second copy of the start time.
            "finish": round(finish_tstamp, 4),
            "images": all_image_hash,
            "ip": request.client.host,
        }
        fout.write(json.dumps(data) + "\n")
350
+
351
# Page heading rendered at the top of the demo when not in embed mode.
title_markdown = ("""
# The public release of VideoLLaMA2
""")
354
+
355
+ tos_markdown = ("""
356
+ ### Terms of use
357
+ By using this service, users are required to agree to the following terms:
358
+ The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
359
+ Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderator.
360
+ For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
361
+ """)
362
+
363
+
364
+ learn_more_markdown = ("""
365
+ ### License
366
+ The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.
367
+ """)
368
+
369
+ block_css = """
370
+
371
+ #buttons button {
372
+ min-width: min(120px,100%);
373
+ }
374
+
375
+ """
376
+
377
def build_demo(embed_mode):
    """Assemble the Gradio Blocks UI and wire up its event handlers.

    Args:
        embed_mode: when truthy, the title, terms-of-use and license
            Markdown sections are left out.

    Returns:
        The constructed ``gr.Blocks`` object (not yet queued or launched).
    """
    # Created before the layout so the example galleries can target it; it is
    # placed into the page later via ``render()``.
    textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)
    with gr.Blocks(title="Video-Llama", theme=gr.themes.Default(), css=block_css) as demo:
        state = gr.State()

        if not embed_mode:
            gr.Markdown(title_markdown)

        with gr.Row():
            # Left column: model choice, media inputs, examples, parameters.
            with gr.Column(scale=3):
                with gr.Row(elem_id="model_selector_row"):
                    initial_model = models[0] if len(models) > 0 else ""
                    model_selector = gr.Dropdown(
                        choices=models,
                        value=initial_model,
                        interactive=True,
                        show_label=False,
                        container=False)

                imagebox = gr.Image(type="pil")
                videobox = gr.Video()
                image_process_mode = gr.Radio(
                    ["Crop", "Resize", "Pad", "Default"],
                    value="Default",
                    label="Preprocess for non-square image", visible=False)

                cur_dir = os.path.dirname(os.path.abspath(__file__))
                image_examples = [
                    [f"{cur_dir}/examples/extreme_ironing.jpg", "What is unusual about this image?"],
                    [f"{cur_dir}/examples/waterview.jpg", "What are the things I should be cautious about when I visit here?"],
                    [f"{cur_dir}/examples/desert.jpg", "If there are factual errors in the questions, point it out; if not, proceed answering the question. What’s happening in the desert?"],
                ]
                gr.Examples(examples=image_examples, inputs=[imagebox, textbox], label="Image examples")

                # video example inputs
                video_examples = [
                    [f"{cur_dir}/examples/sample_demo_1.mp4", "Why is this video funny?"],
                    [f"{cur_dir}/examples/sample_demo_3.mp4", "Can you identify any safety hazards in this video?"],
                    [f"{cur_dir}/examples/1034346401.mp4", "What is this young woman doing?"],
                ]
                gr.Examples(examples=video_examples, inputs=[videobox, textbox], label="Video examples")

                with gr.Accordion("Parameters", open=False) as parameter_row:
                    temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.2, step=0.1, interactive=True, label="Temperature",)
                    top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, interactive=True, label="Top P",)
                    max_output_tokens = gr.Slider(minimum=0, maximum=1024, value=512, step=64, interactive=True, label="Max output tokens",)

            # Right column: chat transcript, text entry and action buttons.
            with gr.Column(scale=8):
                chatbot = gr.Chatbot(elem_id="chatbot", label="Videollama2 Chatbot", height=550)
                with gr.Row():
                    with gr.Column(scale=8):
                        textbox.render()
                    with gr.Column(scale=1, min_width=50):
                        submit_btn = gr.Button(value="Send", variant="primary")
                with gr.Row(elem_id="buttons") as button_row:
                    upvote_btn = gr.Button(value="👍 Upvote", interactive=False)
                    downvote_btn = gr.Button(value="👎 Downvote", interactive=False)
                    flag_btn = gr.Button(value="⚠️ Flag", interactive=False)
                    regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
                    clear_btn = gr.Button(value="🗑️ Clear", interactive=False)

        if not embed_mode:
            gr.Markdown(tos_markdown)
            gr.Markdown(learn_more_markdown)
        url_params = gr.JSON(visible=False)

        # Register listeners
        btn_list = [upvote_btn, downvote_btn, flag_btn, regenerate_btn, clear_btn]

        # Component groups shared by several handlers below.
        vote_inputs = [state, model_selector]
        vote_outputs = [textbox, upvote_btn, downvote_btn, flag_btn]
        ui_outputs = [state, chatbot, textbox, imagebox, videobox] + btn_list
        send_inputs = [state, textbox, imagebox, videobox, image_process_mode]
        bot_inputs = [state, model_selector, temperature, top_p, max_output_tokens]
        bot_outputs = [state, chatbot] + btn_list

        upvote_btn.click(upvote_last_response, vote_inputs, vote_outputs)
        downvote_btn.click(downvote_last_response, vote_inputs, vote_outputs)
        flag_btn.click(flag_last_response, vote_inputs, vote_outputs)

        # Regenerating and sending both update the UI first and then chain
        # into the streaming bot call.
        regenerate_btn.click(
            regenerate, [state, image_process_mode], ui_outputs
        ).then(http_bot, bot_inputs, bot_outputs)
        clear_btn.click(clear_history, None, ui_outputs)

        textbox.submit(
            add_text, send_inputs, ui_outputs
        ).then(http_bot, bot_inputs, bot_outputs)
        submit_btn.click(
            add_text, send_inputs, ui_outputs
        ).then(http_bot, bot_inputs, bot_outputs)

        # Page-load behaviour depends on how the model list is maintained.
        if args.model_list_mode == "once":
            demo.load(load_demo, [url_params], [state, model_selector],
                      _js=get_window_url_params)
        elif args.model_list_mode == "reload":
            demo.load(load_demo_refresh_model_list, None, [state, model_selector])
        else:
            raise ValueError(f"Unknown model list mode: {args.model_list_mode}")

    return demo
472
+
473
+
474
if __name__ == "__main__":
    # Script entry point. ``args`` and ``models`` are module-level names read
    # by the handlers and by build_demo(), so they are set before the UI is
    # built.
    parser = argparse.ArgumentParser()
    _cli_options = (
        ("--host", {"type": str, "default": "0.0.0.0"}),
        ("--port", {"type": int}),
        ("--controller-url", {"type": str, "default": "http://localhost:21001"}),
        ("--concurrency-count", {"type": int, "default": 10}),
        ("--model-list-mode", {"type": str, "default": "once",
                               "choices": ["once", "reload"]}),
        ("--share", {"action": "store_true"}),
        ("--moderate", {"action": "store_true"}),
        ("--embed", {"action": "store_true"}),
    )
    for _flag, _spec in _cli_options:
        parser.add_argument(_flag, **_spec)
    args = parser.parse_args()
    logger.info(f"args: {args}")

    models = get_model_list()

    logger.info(args)
    demo = build_demo(args.embed)
    # Enable request queuing, then start the web server.
    queued_demo = demo.queue(
        concurrency_count=args.concurrency_count,
        api_open=False,
    )
    queued_demo.launch(
        server_name=args.host,
        server_port=args.port,
        share=args.share,
    )
videollama2/serve/model_worker.py ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A model worker executes the model.
3
+ """
4
+ import os
5
+ import json
6
+ import time
7
+ import uuid
8
+ import asyncio
9
+ import requests
10
+ import argparse
11
+ import threading
12
+ from threading import Thread
13
+ from functools import partial
14
+ from typing import Iterator, List, Optional, Tuple
15
+
16
+ import uvicorn
17
+ from fastapi import FastAPI, Request, BackgroundTasks
18
+ from fastapi.responses import StreamingResponse
19
+
20
+ import torch
21
+ import decord
22
+ import numpy as np
23
+ from PIL import Image
24
+ from decord import VideoReader, cpu
25
+ from transformers import TextIteratorStreamer
26
+
27
+ from videollama2.constants import WORKER_HEART_BEAT_INTERVAL
28
+ from videollama2.utils import (build_logger, server_error_msg, pretty_print_semaphore)
29
+ from videollama2.model.builder import load_pretrained_model
30
+ from videollama2.mm_utils import process_images, process_videos, load_image_from_base64, tokenizer_image_token, KeywordsStoppingCriteria, tokenizer_MMODAL_token
31
+ from videollama2.mm_utils import chunk_list, frame_expansion
32
+ from videollama2.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_VIDEO_TOKEN, NUM_FRAMES, MMODAL_TOKEN_INDEX
33
+
34
+
35
# Number of bytes in one gibibyte.
GB = 2 ** 30

# Short random identifier for this worker process; also used in the log file name.
worker_id = uuid.uuid4().hex[:6]
logger = build_logger("model_worker", f"model_worker_{worker_id}.log")

# Number of generation requests received so far (incremented by the HTTP endpoint).
global_counter = 0

# Created lazily by the first generation request.
model_semaphore = None
42
+
43
+
44
+ # variable_content = os.getenv('MY_VARIABLE', '')
45
+ # KEYWORDS_LIST = set(variable_content.split('\n'))
46
def _load_keywords(keywords_path):
    """Return the blocked keywords listed in *keywords_path*, one per line.

    Blank lines are skipped: an empty keyword is a substring of every text and
    would block all requests. Keywords are lower-cased because the safety
    checks compare them against lower-cased text.

    Returns an empty list when the file does not exist.
    """
    if not os.path.exists(keywords_path):
        return []
    with open(keywords_path, 'r', encoding='utf-8') as file:
        stripped = (line.strip().lower() for line in file)
        return [keyword for keyword in stripped if keyword]


path = 'assets/keywords.txt'
KEYWORDS_LIST = _load_keywords(path)


KEYWORD_BLOCK_MESSAGE2 = "The output contains political, erotic and other unsafe content that violates local laws. Please re-enter your question."
KEYWORD_BLOCK_MESSAGE1 = "Your input question contains political, erotic and other unsafe content that violates local laws. Please re-enter your question."
# Run the output safety check once at least this many tokens have accumulated.
STREAM_CHECK_MULTIPLE = 20
60
+
61
+
62
def heart_beat_worker(controller):
    """Call ``controller.send_heart_beat()`` forever, pausing between calls.

    Never returns; ModelWorker runs it as a thread target.
    """
    while True:
        # Sleep first, so the first heart beat goes out one interval after start.
        time.sleep(WORKER_HEART_BEAT_INTERVAL)
        controller.send_heart_beat()
67
+
68
+
69
def safety_check(text, history=None, ) -> Optional[str]:
    """Return the output block message if *text* contains a blocked keyword.

    The comparison is done on the lower-cased text. ``history`` is accepted
    but not used. Returns ``None`` when nothing matches.
    """
    if not KEYWORDS_LIST:
        return None

    lowered = text.lower()
    for keyword in KEYWORDS_LIST:
        if keyword in lowered:
            print('############')
            return KEYWORD_BLOCK_MESSAGE2
    return None
76
+
77
+
78
def input_safety_check(text) -> Optional[str]:
    """Return the input block message if *text* contains a blocked keyword.

    The comparison is done on the lower-cased text. Returns ``None`` when
    nothing matches.
    """
    if not KEYWORDS_LIST:
        return None

    lowered = text.lower()
    matched = False
    for keyword in KEYWORDS_LIST:
        if keyword in lowered:
            matched = True
            break
    if not matched:
        return None

    print('######## Input keyword alarm triggered:', text)
    return KEYWORD_BLOCK_MESSAGE1
83
+
84
+
85
class ModelWorker:
    """Load one pretrained model and serve streaming generation requests.

    Unless ``no_register`` is set, the worker registers itself with the
    controller and starts a background thread that sends periodic heart beats.
    """

    def __init__(self, controller_addr, worker_addr,
                 worker_id, no_register,
                 model_path, model_base, model_name,
                 load_8bit, load_4bit, device):
        """Load the model and optionally register with the controller.

        When ``model_name`` is None the name is derived from the last component
        of ``model_path`` (prefixed with its parent directory name for
        ``checkpoint-*`` directories).
        """
        self.controller_addr = controller_addr
        self.worker_addr = worker_addr
        self.worker_id = worker_id
        # Stored as given (a trailing "/" is kept here, stripped for loading).
        self.model_path = model_path
        if model_path.endswith("/"):
            model_path = model_path[:-1]
        if model_name is None:
            model_paths = model_path.split("/")
            if model_paths[-1].startswith('checkpoint-'):
                self.model_name = model_paths[-2] + "_" + model_paths[-1]
            else:
                self.model_name = model_paths[-1]
        else:
            self.model_name = model_name

        self.device = device
        logger.info(f"Loading the model {self.model_name} on worker {worker_id} ...")
        self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
            model_path, model_base, self.model_name, load_8bit, load_4bit, device=self.device)
        lowered_name = self.model_name.lower()
        self.is_multimodal = 'videollama2' in lowered_name or 'vlb' in lowered_name

        if not no_register:
            self.register_to_controller()
            # Daemon thread: the endless heart-beat loop must not keep the
            # process alive after the server stops (matches the SGLang worker).
            self.heart_beat_thread = threading.Thread(
                target=heart_beat_worker, args=(self,), daemon=True)
            self.heart_beat_thread.start()

    def register_to_controller(self):
        """POST this worker's address and status to the controller."""
        logger.info("Register to controller")

        url = self.controller_addr + "/register_worker"
        data = {
            "worker_name": self.worker_addr,
            "check_heart_beat": True,
            "worker_status": self.get_status()
        }
        r = requests.post(url, json=data)
        assert r.status_code == 200

    def send_heart_beat(self):
        """Report the queue length to the controller, retrying until it answers.

        Re-registers if the controller replies that it does not know this worker.
        """
        logger.info(f"Send heart beat. Models: {[self.model_name]}. "
                    f"Semaphore: {pretty_print_semaphore(model_semaphore)}. "
                    f"global_counter: {global_counter}")

        url = self.controller_addr + "/receive_heart_beat"

        while True:
            try:
                ret = requests.post(url, json={
                    "worker_name": self.worker_addr,
                    "queue_length": self.get_queue_length()}, timeout=5)
                exist = ret.json()["exist"]
                break
            except requests.exceptions.RequestException as e:
                logger.error(f"heart beat error: {e}")
            time.sleep(5)

        if not exist:
            self.register_to_controller()

    def get_queue_length(self):
        """Return the number of requests running or waiting on the semaphore."""
        if model_semaphore is None:
            return 0
        # Relies on private asyncio.Semaphore attributes (_value, _waiters).
        waiting = len(model_semaphore._waiters) if model_semaphore._waiters is not None else 0
        return args.limit_model_concurrency - model_semaphore._value + waiting

    def get_status(self):
        """Return the status dict sent to the controller."""
        return {
            "model_names": [self.model_name],
            "speed": 1,
            "queue_length": self.get_queue_length(),
        }

    @staticmethod
    def _error_chunk():
        """Return the encoded stream chunk that reports a server error."""
        ret = {
            "text": server_error_msg,
            "error_code": 1,
        }
        return json.dumps(ret).encode() + b"\0"

    @torch.inference_mode()
    def generate_stream(self, params):
        """Yield NUL-terminated JSON chunks holding the text generated so far.

        ``params`` keys read here: ``prompt`` (required), ``images``,
        ``temperature``, ``top_p``, ``max_new_tokens`` and ``stop``.

        Raises:
            ValueError: if the number of supplied images/videos matches neither
                the number of image tokens nor the number of video tokens.
        """
        tokenizer, model, image_processor = self.tokenizer, self.model, self.image_processor

        prompt = params["prompt"]
        ori_prompt = prompt
        raw_inputs = params.get("images", None)
        num_image_tokens = 0
        image_args = {}
        # Text-only requests previously hit a NameError here. Default to the
        # image index, the same value the image branch uses.
        # NOTE(review): confirm tokenizer_MMODAL_token accepts this for prompts without media tokens.
        modal_token_index = MMODAL_TOKEN_INDEX["IMAGE"]

        if raw_inputs is not None and len(raw_inputs) and self.is_multimodal:
            num_inputs = len(raw_inputs)
            if num_inputs != prompt.count(DEFAULT_IMAGE_TOKEN) and num_inputs != prompt.count(DEFAULT_VIDEO_TOKEN):
                raise ValueError("Number of images/videos does not match number of <image>/<video> tokens in prompt")

            try:
                print("Load image...")
                pil_images = [load_image_from_base64(image) for image in raw_inputs]
                images_or_videos = process_images(pil_images, image_processor, model.config)

                modal_list = ["image"]
                replace_token = DEFAULT_IMAGE_TOKEN
                modal_token_index = MMODAL_TOKEN_INDEX["IMAGE"]
            except Exception:
                # Any failure to treat the payload as images falls back to
                # video loading. The fallback reads the original payload,
                # never a half-processed image list.
                print("Load video instead...")
                decord_vr = VideoReader(uri=raw_inputs[0], ctx=cpu(0))
                duration = len(decord_vr)
                if "use_taug" not in self.model_path:
                    frame_id_list = np.linspace(0, duration-1, 8, dtype=int)
                    video_frames = decord_vr.get_batch(frame_id_list).asnumpy()
                    images_or_videos = process_videos(video_frames, image_processor, model.config)
                else:
                    print("Temporal augmentation activated!!!")
                    frame_id_list = np.linspace(0, duration-1, 8 * 2 * 2, dtype=int)
                    video_data = decord_vr.get_batch(frame_id_list)
                    video_frames = [Image.fromarray(f) for f in video_data.asnumpy()]
                    chunked_video_frames = chunk_list(video_frames, 2*2)
                    expanded_video_frames = [frame_expansion(frame_list, 2) for frame_list in chunked_video_frames]
                    images_or_videos = process_videos(expanded_video_frames, image_processor, model.config)

                modal_list = ["video"]
                replace_token = DEFAULT_VIDEO_TOKEN
                modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]

            # NOTE(review): the original indentation was lost. The shape print
            # and video wrapping are applied to the tensor case only, because
            # a list has no .shape -- confirm against the upstream file.
            if isinstance(images_or_videos, list):
                images_or_videos = [image.to(self.model.device, dtype=torch.float16) for image in images_or_videos]
            else:
                images_or_videos = images_or_videos.to(self.model.device, dtype=torch.float16)
                if modal_list[0] == "video":
                    print("Video:", images_or_videos.shape)
                    images_or_videos = [images_or_videos]
                else:
                    print("Image:", images_or_videos.shape)

            if getattr(self.model.config, 'mm_use_im_start_end', False):
                replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
            prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)

            num_image_tokens = prompt.count(replace_token) * model.get_vision_tower().num_patches
            image_args = {"images_or_videos": images_or_videos, "modal_list": modal_list}

        print("image_args:", image_args)
        temperature = float(params.get("temperature", 1.0))
        top_p = float(params.get("top_p", 1.0))
        max_context_length = getattr(model.config, 'max_position_embeddings', 2048)
        max_new_tokens = min(int(params.get("max_new_tokens", 256)), 1024)
        stop_str = params.get("stop", None)
        do_sample = temperature > 0.001

        input_ids = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt').unsqueeze(0).to(self.device)
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15)

        max_new_tokens = min(max_new_tokens, max_context_length - input_ids.shape[-1] - num_image_tokens)

        if max_new_tokens < 1:
            yield json.dumps({"text": ori_prompt + "Exceeds max token length. Please start a new conversation, thanks.", "error_code": 0}).encode() + b"\0"
            return

        generate_kwargs = dict(
            inputs=input_ids,
            do_sample=do_sample,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_new_tokens,
            streamer=streamer,
            use_cache=True,
            **image_args
        )
        # Only install a keyword stopping criterion when a stop string was sent.
        if stop_str:
            generate_kwargs["stopping_criteria"] = [
                KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)]

        # Generation runs in a thread; this generator consumes the streamer.
        thread = Thread(target=model.generate, kwargs=generate_kwargs)
        thread.start()

        generated_text = ori_prompt
        token_count = 0
        for new_text in streamer:
            generated_text += new_text
            token_count += len(tokenizer.encode(new_text))
            if token_count >= STREAM_CHECK_MULTIPLE:
                safety_message = safety_check(generated_text)
                if safety_message:
                    print('####### Keyword alarm triggered:', generated_text)
                    yield json.dumps({"text": safety_message , "error_code": 1}).encode() + b"\0"
                    return
                token_count = 0

            # Guard against a missing/empty stop string: endswith(None) raises,
            # and text[:-0] would wipe the whole text.
            if stop_str and generated_text.endswith(stop_str):
                generated_text = generated_text[:-len(stop_str)]
            yield json.dumps({"text": generated_text, "error_code": 0}).encode() + b"\0"

    def generate_stream_gate(self, params):
        """Run the input safety check, then stream; report errors as a chunk.

        Any exception raised while generating is printed and turned into a
        single error chunk instead of propagating to the HTTP layer.
        """
        try:
            input_text = params.get("prompt", "")
            safety_message = input_safety_check(input_text)
            if safety_message:
                yield json.dumps({"text": safety_message, "error_code": 1}).encode() + b"\0"
                return

            for x in self.generate_stream(params):
                yield x
        except ValueError as e:
            print("Caught ValueError:", e)
            yield self._error_chunk()
        except torch.cuda.CudaError as e:
            print("Caught torch.cuda.CudaError:", e)
            yield self._error_chunk()
        except Exception as e:
            print("Caught Unknown Error", e)
            yield self._error_chunk()
333
+
334
+
335
+ app = FastAPI()
336
+
337
+
338
def release_model_semaphore(fn=None):
    """Release the shared model semaphore, then call *fn* if one is given."""
    model_semaphore.release()
    if fn is None:
        return
    fn()
342
+
343
+
344
@app.post("/worker_generate_stream")
async def generate_stream(request: Request):
    """Stream a generation for the JSON request body.

    Acquires the shared model semaphore before streaming; a background task
    releases it and sends a heart beat once the response has been sent.
    """
    global model_semaphore, global_counter
    global_counter += 1
    params = await request.json()

    # The semaphore is created lazily on the first request.
    if model_semaphore is None:
        model_semaphore = asyncio.Semaphore(args.limit_model_concurrency)
    await model_semaphore.acquire()
    worker.send_heart_beat()

    on_finish = BackgroundTasks()
    on_finish.add_task(release_model_semaphore, fn=worker.send_heart_beat)
    return StreamingResponse(worker.generate_stream_gate(params), background=on_finish)
358
+
359
+
360
@app.post("/worker_get_status")
async def get_status(request: Request):
    """Return the worker's status dict (model names, speed, queue length)."""
    status = worker.get_status()
    return status
363
+
364
+
365
if __name__ == "__main__":
    # ModelWorker enables multimodal mode when the model name contains
    # "videollama2" or "vlb", so the hint names those markers.
    multi_modal_hint = ("Multimodal mode is automatically detected with model name, "
                        "please make sure `videollama2` or `vlb` is included in the model name.")

    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=21002)
    parser.add_argument("--worker-address", type=str, default="http://localhost:21002")
    parser.add_argument("--controller-address", type=str, default="http://localhost:21001")
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--model-name", type=str)
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--multi-modal", action="store_true", help=multi_modal_hint)
    parser.add_argument("--limit-model-concurrency", type=int, default=5)
    parser.add_argument("--stream-interval", type=int, default=1)
    parser.add_argument("--no-register", action="store_true")
    parser.add_argument("--load-8bit", action="store_true")
    parser.add_argument("--load-4bit", action="store_true")
    args = parser.parse_args()
    logger.info(f"args: {args}")

    if args.multi_modal:
        # The flag itself has no effect; remind the user how detection works.
        logger.warning(multi_modal_hint)

    # `args` and `worker` are module-level names read by the endpoints above.
    worker = ModelWorker(args.controller_address,
                         args.worker_address,
                         worker_id,
                         args.no_register,
                         args.model_path,
                         args.model_base,
                         args.model_name,
                         args.load_8bit,
                         args.load_4bit,
                         args.device)
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
videollama2/serve/register_worker.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Manually register workers.
3
+
4
+ Usage:
5
+ python3 -m videollama2.serve.register_worker --controller-address http://localhost:21001 --worker-name http://localhost:21002
6
+ """
7
+
8
+ import argparse
9
+
10
+ import requests
11
+
12
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Required: without it the URL concatenation below fails with a TypeError.
    parser.add_argument("--controller-address", type=str, required=True)
    parser.add_argument("--worker-name", type=str)
    parser.add_argument("--check-heart-beat", action="store_true")
    args = parser.parse_args()

    url = args.controller_address + "/register_worker"
    data = {
        "worker_name": args.worker_name,
        "check_heart_beat": args.check_heart_beat,
        "worker_status": None,
    }
    r = requests.post(url, json=data)
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if r.status_code != 200:
        raise SystemExit(f"Registering worker failed: HTTP {r.status_code}")
videollama2/serve/sglang_worker.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A model worker executes the model.
3
+ """
4
+ import argparse
5
+ import asyncio
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ import json
8
+ import time
9
+ import threading
10
+ import uuid
11
+
12
+ from fastapi import FastAPI, Request, BackgroundTasks
13
+ from fastapi.responses import StreamingResponse
14
+ import requests
15
+ import re
16
+ import uvicorn
17
+ from functools import partial
18
+
19
+ from llava.constants import WORKER_HEART_BEAT_INTERVAL
20
+ from llava.utils import (build_logger, server_error_msg,
21
+ pretty_print_semaphore)
22
+ from llava.mm_utils import process_images, load_image_from_base64, tokenizer_image_token, expand2square
23
+ from llava.constants import DEFAULT_IMAGE_TOKEN
24
+
25
+ import sglang as sgl
26
+ from sglang.backend.runtime_endpoint import RuntimeEndpoint
27
+
28
+
29
# Number of bytes in one gibibyte.
GB = 2 ** 30

# Short random identifier for this worker process; also used in the log file name.
worker_id = uuid.uuid4().hex[:6]
logger = build_logger("model_worker", f"model_worker_{worker_id}.log")

# Number of generation requests received so far (incremented by the HTTP endpoint).
global_counter = 0

# Created lazily by the first generation request.
model_semaphore = None
36
+
37
+
38
def heart_beat_worker(controller):
    """Call ``controller.send_heart_beat()`` forever, pausing between calls.

    Never returns; ModelWorker runs it as a daemon thread target.
    """
    while True:
        # Sleep first, so the first heart beat goes out one interval after start.
        time.sleep(WORKER_HEART_BEAT_INTERVAL)
        controller.send_heart_beat()
42
+
43
+
44
@sgl.function
def pipeline(s, prompt, max_tokens):
    """Append each prompt part to the state, then generate ``"response"``.

    String parts are appended as text; every other part is wrapped with
    ``sgl.image``.
    """
    for part in prompt:
        s += part if type(part) is str else sgl.image(part)
    s += sgl.gen("response", max_tokens=max_tokens)
52
+
53
+
54
class ModelWorker:
    """Worker that forwards prompts to an SGLang runtime endpoint.

    Unless ``no_register`` is set, the worker registers itself with the
    controller and starts a daemon thread that sends periodic heart beats.
    """

    def __init__(self, controller_addr, worker_addr, sgl_endpoint,
                 worker_id, no_register, model_name):
        """Connect to the SGLang endpoint and optionally register with the controller.

        When ``model_name`` is None the name is derived from the model path
        reported by the endpoint.
        """
        self.controller_addr = controller_addr
        self.worker_addr = worker_addr
        self.worker_id = worker_id

        # Select backend
        backend = RuntimeEndpoint(sgl_endpoint)
        sgl.set_default_backend(backend)
        model_path = backend.model_info["model_path"]

        if model_path.endswith("/"):
            model_path = model_path[:-1]

        if model_name is not None:
            self.model_name = model_name
        else:
            parts = model_path.split("/")
            leaf = parts[-1]
            # Checkpoint directories are prefixed with their parent directory name.
            self.model_name = parts[-2] + "_" + leaf if leaf.startswith('checkpoint-') else leaf

        logger.info(f"Loading the SGLANG model {self.model_name} on worker {worker_id} ...")

        if not no_register:
            self.register_to_controller()
            self.heart_beat_thread = threading.Thread(
                target=heart_beat_worker, args=(self,), daemon=True)
            self.heart_beat_thread.start()

    def register_to_controller(self):
        """POST this worker's address and status to the controller."""
        logger.info("Register to controller")

        payload = {
            "worker_name": self.worker_addr,
            "check_heart_beat": True,
            "worker_status": self.get_status()
        }
        r = requests.post(self.controller_addr + "/register_worker", json=payload)
        assert r.status_code == 200

    def send_heart_beat(self):
        """Report the queue length to the controller, retrying until it answers.

        Re-registers if the controller replies that it does not know this worker.
        """
        logger.info(f"Send heart beat. Models: {[self.model_name]}. "
                    f"Semaphore: {pretty_print_semaphore(model_semaphore)}. "
                    f"global_counter: {global_counter}")

        url = self.controller_addr + "/receive_heart_beat"

        exist = None
        answered = False
        while not answered:
            try:
                ret = requests.post(url, json={
                    "worker_name": self.worker_addr,
                    "queue_length": self.get_queue_length()}, timeout=5)
                exist = ret.json()["exist"]
                answered = True
            except requests.exceptions.RequestException as e:
                logger.error(f"heart beat error: {e}")
            if not answered:
                time.sleep(5)

        if not exist:
            self.register_to_controller()

    def get_queue_length(self):
        """Return the number of requests running or waiting on the semaphore."""
        if model_semaphore is None:
            return 0
        # Relies on private asyncio.Semaphore attributes (_value, _waiters).
        waiting = len(model_semaphore._waiters) if model_semaphore._waiters is not None else 0
        return args.limit_model_concurrency - model_semaphore._value + waiting

    def get_status(self):
        """Return the status dict sent to the controller."""
        queue_length = self.get_queue_length()
        return {
            "model_names": [self.model_name],
            "speed": 1,
            "queue_length": queue_length,
        }

    async def generate_stream(self, params):
        """Yield NUL-terminated JSON chunks holding the text generated so far.

        ``params`` keys read here: ``prompt`` (required), ``images``,
        ``temperature``, ``top_p``, ``max_new_tokens`` and ``stop``.

        Raises:
            ValueError: if the number of images differs from the number of
                image tokens in the prompt.
        """
        ori_prompt = prompt = params["prompt"]
        images = params.get("images", None)
        if images is not None and len(images) > 0:
            if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN):
                raise ValueError("Number of images does not match number of <image> tokens in prompt")

            images = [load_image_from_base64(image) for image in images]

            # FIXME: for image-start/end token
            prompt = prompt.replace(' ' + DEFAULT_IMAGE_TOKEN + '\n', DEFAULT_IMAGE_TOKEN)
            segments = prompt.split(DEFAULT_IMAGE_TOKEN)
            # Interleave text segments with the decoded images.
            prompt = []
            for idx, segment in enumerate(segments):
                prompt.append(segment)
                if idx < len(images):
                    prompt.append(images[idx])
        else:
            prompt = [prompt]

        temperature = float(params.get("temperature", 1.0))
        top_p = float(params.get("top_p", 1.0))
        max_new_tokens = min(int(params.get("max_new_tokens", 256)), 1024)
        # Parsed but not passed to the pipeline below.
        stop_str = params.get("stop", None)
        stop_str = None if stop_str is None else [stop_str]

        print({'prompt': prompt, 'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_p': top_p})
        state = pipeline.run(prompt, max_new_tokens, temperature=temperature, top_p=top_p, stream=True)

        generated_text = ori_prompt
        async for text_outputs in state.text_async_iter(var_name="response"):
            generated_text += text_outputs
            chunk = {"text": generated_text, "error_code": 0}
            yield json.dumps(chunk).encode() + b"\0"

    async def generate_stream_gate(self, params):
        """Stream from ``generate_stream``; report any error as a single chunk."""
        try:
            async for x in self.generate_stream(params):
                yield x
        except ValueError as e:
            print("Caught ValueError:", e)
            yield json.dumps({"text": server_error_msg, "error_code": 1}).encode() + b"\0"
        except Exception as e:
            print("Caught Unknown Error", e)
            yield json.dumps({"text": server_error_msg, "error_code": 1}).encode() + b"\0"
190
+
191
+
192
+ app = FastAPI()
193
+
194
+
195
def release_model_semaphore(fn=None):
    """Release the shared model semaphore, then call *fn* if one is given."""
    model_semaphore.release()
    if fn is None:
        return
    fn()
199
+
200
+
201
@app.post("/worker_generate_stream")
async def generate_stream(request: Request):
    """Stream a generation for the JSON request body.

    Acquires the shared model semaphore before streaming; a background task
    releases it and sends a heart beat once the response has been sent.
    """
    global model_semaphore, global_counter
    global_counter += 1
    params = await request.json()

    # The semaphore is created lazily on the first request.
    if model_semaphore is None:
        model_semaphore = asyncio.Semaphore(args.limit_model_concurrency)
    await model_semaphore.acquire()
    worker.send_heart_beat()

    on_finish = BackgroundTasks()
    on_finish.add_task(release_model_semaphore, fn=worker.send_heart_beat)
    return StreamingResponse(worker.generate_stream_gate(params), background=on_finish)
215
+
216
+
217
@app.post("/worker_get_status")
async def get_status(request: Request):
    """Return the worker's status dict (model names, speed, queue length)."""
    status = worker.get_status()
    return status
220
+
221
+
222
if __name__ == "__main__":
    # Command-line options for the SGLang-backed worker.
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=21002)
    parser.add_argument("--worker-address", type=str, default="http://localhost:21002")
    parser.add_argument("--controller-address", type=str, default="http://localhost:21001")
    parser.add_argument("--model-name", type=str)
    parser.add_argument("--sgl-endpoint", type=str)
    parser.add_argument("--limit-model-concurrency", type=int, default=5)
    parser.add_argument("--stream-interval", type=int, default=1)
    parser.add_argument("--no-register", action="store_true")
    args = parser.parse_args()
    logger.info(f"args: {args}")

    # `args` and `worker` are module-level names read by the endpoints above.
    worker = ModelWorker(
        controller_addr=args.controller_address,
        worker_addr=args.worker_address,
        sgl_endpoint=args.sgl_endpoint,
        worker_id=worker_id,
        no_register=args.no_register,
        model_name=args.model_name,
    )
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
videollama2/serve/test_message.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+
4
+ import requests
5
+
6
+ from llava.conversation import default_conversation
7
+
8
+
9
def main():
    """Send one test message to a worker and print the streamed reply.

    Uses ``--worker-address`` directly when given; otherwise asks the
    controller for a worker serving ``--model-name``. Returns without sending
    anything when the resolved worker address is empty.
    """
    worker_addr = args.worker_address
    if not worker_addr:
        controller_addr = args.controller_address
        requests.post(controller_addr + "/refresh_all_workers")
        listing = requests.post(controller_addr + "/list_models")
        models = sorted(listing.json()["models"])
        print(f"Models: {models}")

        lookup = requests.post(controller_addr + "/get_worker_address",
                               json={"model": args.model_name})
        worker_addr = lookup.json()["address"]
        print(f"worker_addr: {worker_addr}")

    if worker_addr == "":
        return

    conv = default_conversation.copy()
    conv.append_message(conv.roles[0], args.message)
    prompt = conv.get_prompt()

    pload = {
        "model": args.model_name,
        "prompt": prompt,
        "max_new_tokens": args.max_new_tokens,
        "temperature": 0.7,
        "stop": conv.sep,
    }
    response = requests.post(
        worker_addr + "/worker_generate_stream",
        headers={"User-Agent": "LLaVA Client"},
        json=pload,
        stream=True,
    )

    print(prompt.replace(conv.sep, "\n"), end="")
    # Chunks are NUL-delimited JSON objects; skip empty ones.
    for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"):
        if not chunk:
            continue
        data = json.loads(chunk.decode("utf-8"))
        output = data["text"].split(conv.sep)[-1]
        print(output, end="\r")
    print("")
50
+
51
+
52
if __name__ == "__main__":
    # Command-line options for the test client; main() reads the module-level `args`.
    parser = argparse.ArgumentParser()
    parser.add_argument("--controller-address", type=str, default="http://localhost:21001")
    parser.add_argument("--worker-address", type=str)
    parser.add_argument("--model-name", type=str, default="facebook/opt-350m")
    parser.add_argument("--max-new-tokens", type=int, default=32)
    parser.add_argument(
        "--message",
        type=str,
        default="Tell me a story with more than 1000 words.",
    )
    args = parser.parse_args()

    main()
videollama2/train.py ADDED
@@ -0,0 +1,700 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
3
+ # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
4
+ # Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ import re
19
+ import os
20
+ import copy
21
+ import json
22
+ import random
23
+ import pathlib
24
+ import traceback
25
+ from dataclasses import dataclass, field
26
+ from typing import Dict, Optional, Sequence, List
27
+
28
+ # torch-related packages
29
+ # NOTE: torch must be imported before transformers. Otherwise, `Segmentation fault (core dumped)` will occur.
30
+ import torch
31
+ from torch.utils.data import Dataset
32
+
33
+ import transformers
34
+ from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
35
+
36
+ import sys
37
+ sys.path.append('./')
38
+ from videollama2.model import *
39
+ from videollama2.constants import NUM_FRAMES, IGNORE_INDEX, MODAL_INDEX_MAP
40
+ from videollama2.mm_utils import tokenizer_multimodal_token, process_video, process_image, process_audio_file
41
+ from videollama2.videollama2_trainer import (VideoLLaMA2Trainer,
42
+ get_peft_state_maybe_zero_3, get_peft_state_non_lora_maybe_zero_3,
43
+ find_all_linear_names, safe_save_model_for_hf_trainer
44
+ )
45
+
46
+ # NOTE: fast tokenizer warning issue: https://github.com/huggingface/transformers/issues/5486
47
+ os.environ["TOKENIZERS_PARALLELISM"] = "true"
48
+
49
# Rank of this process; stays None until something assigns it.
local_rank = None


def rank0_print(*args):
    """Print *args* only when ``local_rank`` is 0; otherwise do nothing."""
    if local_rank != 0:
        return
    print(*args)
55
+
56
+
57
def set_seed(seed=42):
    """
    Set the random seed for reproducible results.

    Seeds Python's ``random`` module and the PyTorch CPU/CUDA generators, and
    puts cuDNN into deterministic, non-benchmarking mode.

    :param seed: An integer value to be used as the random seed.
    """
    # The dataset code in this file relies on `random.shuffle` / `random.randint`,
    # so Python's own generator has to be seeded as well as torch's.
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # for multi-GPU setups
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
68
+
69
+
70
# Model-related command-line arguments; parsed by `transformers.HfArgumentParser` in `train()`.
@dataclass
class ModelArguments:
    # LLM Arguments
    # Key used by `train()` to index the `VLLMConfigs` / `VLLMs` registries.
    model_type: Optional[str] = field(default="videollama2", metadata={"help": "Model type selected in the list: " + ", ".join(VLLMs.keys())})
    # Passed to `from_pretrained` for the config, the model and the tokenizer.
    model_path: Optional[str] = field(default="lmsys/vicuna-7b-v1.5")
    version: Optional[str] = field(default="v1", metadata={"help": "Version of the conversation template."})
    freeze_backbone: bool = field(default=False, metadata={"help": "Whether to freeze the LLM backbone."})
    # When set, `train()` enables gradients on the whole model and then freezes the towers again.
    tune_adapter_llm: bool = field(default=False)
    # Connector Arguments
    # The `_a` suffixed options are the audio counterparts (they drive `mm_projector_a` in `train()`).
    mm_projector_type: Optional[str] = field(default='linear')
    mm_projector_a_type: Optional[str] = field(default='linear')
    tune_mm_mlp_adapter: bool = field(default=False)
    tune_mm_mlp_adapter_a: bool = field(default=False)
    pretrain_mm_mlp_adapter: Optional[str] = field(default=None)
    pretrain_mm_mlp_adapter_a: Optional[str] = field(default=None)
    # Vision tower Arguments
    # `None` means `train()` skips vision-module initialization.
    vision_tower: Optional[str] = field(default=None)
    mm_vision_select_layer: Optional[int] = field(default=-1)
    mm_vision_select_feature: Optional[str] = field(default="patch")
    # Audio tower Arguments
    # `None` means `train()` skips audio-module initialization.
    audio_tower: Optional[str] = field(default=None)
    tune_audio_tower: bool = field(default=False)
92
+
93
# Data-related command-line arguments; parsed by `transformers.HfArgumentParser` in `train()`.
# `train()` also attaches runtime attributes to the parsed instance that are not declared here
# (`image_size`, `image_processor`, `video_processor`, `is_pretraining`).
@dataclass
class DataArguments:
    # Path Arguments
    # A single JSON path, or several comma-separated JSON paths (see `LazySupervisedDataset.__init__`).
    data_path: str = field(default=None, metadata={"help": "Path to the training data."})
    # When given, the dataset is loaded from this file instead of `data_path`.
    data_path_a: Optional[str] = field(default=None, metadata={"help": "Path to the audio data."})
    # image_folder: Optional[str] = field(default=None)
    # video_folder: Optional[str] = field(default=None)
    # Folder joined in front of the image / video file names stored in the samples.
    data_folder: Optional[str] = field(default=None)
    # Loading Arguments
    # Set to True by `train()` once a vision or audio tower has been initialized.
    is_multimodal: bool = False
    # Forwarded as the `va` flag of `process_video` when a single data file is used.
    va: bool = field(default=False)
    lazy_preprocess: bool = False
    # Falls back to the `NUM_FRAMES` constant when left as None.
    num_frames: Optional[int] = field(default=None)
    # Preprocess Arguments
    image_aspect_ratio: str = 'square'
108
+
109
+
110
# Extends `transformers.TrainingArguments` with project-specific training, LoRA and quantization options.
@dataclass
class TrainingArguments(transformers.TrainingArguments):
    optim: str = field(default="adamw_torch")
    # Copied onto `model.config.mm_projector_lr` by `train()`.
    mm_projector_lr: Optional[float] = None
    # When True, `train()` disables gradients on the projector parameters.
    freeze_mm_mlp_adapter: bool = field(default=False)
    remove_unused_columns: bool = field(default=False)
    # Passed as `cache_dir` to the model / tokenizer `from_pretrained` calls.
    cache_dir: Optional[str] = field(default=None)
    # Training Data Arguments
    group_by_modality_length: bool = field(default=False)
    model_max_length: int = field(
        default=512,
        metadata={
            "help":
            "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    # Lora or Quant Arguments
    double_quant: bool = field(
        default=True,
        metadata={"help": "Compress the quantization statistics through double quantization."}
    )
    quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
    )
    # `train()` builds a `BitsAndBytesConfig` only when this is 4 or 8.
    bits: int = field(
        default=16,
        metadata={"help": "How many bits to use."}
    )
    # LoRA hyper-parameters; forwarded to `peft.LoraConfig` when `lora_enable` is True.
    lora_enable: bool = False
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_weight_path: str = ""
    lora_bias: str = "none"
145
+
146
+
147
def preprocess_plain(
    sources: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
    modal_token: str = None,
) -> Dict:
    """Tokenize two-turn (modal prompt, caption) samples for pretraining.

    Each source must hold exactly two turns and the first turn must contain
    ``modal_token``. The user turn is replaced by the bare modal token, the
    second turn is kept as the assistant answer, and every label position that
    belongs to the instruction prefix is set to ``IGNORE_INDEX``.

    Returns:
        A dict with ``input_ids`` and ``labels``, each a list with one entry
        per source.
    """
    all_input_ids = []
    all_labels = []
    for source in sources:
        assert len(source) == 2
        assert modal_token in source[0]['value']

        user_turn = {'role': 'user', 'content': modal_token}
        assistant_turn = {'role': 'assistant', 'content': source[1]['value']}

        # Full conversation: prompt followed by the answer.
        full_text = tokenizer.apply_chat_template(
            [user_turn, assistant_turn], tokenize=False, add_generation_prompt=False)
        token_ids = tokenizer_multimodal_token(full_text, tokenizer, modal_token, return_tensors='pt')

        # Prompt only (with the generation prefix); its length is the span to mask.
        prompt_text = tokenizer.apply_chat_template(
            [user_turn], tokenize=False, add_generation_prompt=True)
        prompt_len = len(tokenizer_multimodal_token(prompt_text, tokenizer, modal_token, return_tensors='pt'))

        labels = copy.deepcopy(token_ids)
        labels[:prompt_len] = IGNORE_INDEX

        all_input_ids.append(token_ids)
        all_labels.append(labels)
    return dict(input_ids=all_input_ids, labels=all_labels)
186
+
187
+
188
def preprocess(
    sources: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
    modal_token: str = None,
) -> Dict:
    """Tokenize multi-turn conversations and mask the instruction parts of the labels.

    A leading turn that is not from the human side is dropped. After that the
    conversation must consist of (question, answer) pairs. For every pair, the
    label positions from the end of the previous pair up to the end of the
    current instruction prefix are set to ``IGNORE_INDEX``.

    Returns:
        A dict with ``input_ids`` and ``labels``, each a list with one entry
        per source.
    """
    roles = {"human": "user", "gpt": "assistant"}

    batch_input_ids = []
    batch_labels = []
    for source in sources:
        if roles[source[0]["from"]] != "user":
            # Skip the first one if it is not from human
            source = source[1:]

        chat = [{'role': roles[turn['from']], 'content': turn['value']} for turn in source]
        chat_text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False)
        token_ids = tokenizer_multimodal_token(chat_text, tokenizer, modal_token, return_tensors='pt')
        labels = copy.deepcopy(token_ids)
        batch_input_ids.append(token_ids)
        batch_labels.append(labels)
        assert len(source) % 2 == 0, f"Invalid conversation length {len(source)}."

        masked_upto = 0
        history = []
        # Walk over the answers (odd positions); each one is paired with the turn before it.
        for answer_idx in range(1, len(source), 2):
            question = source[answer_idx - 1]
            answer = source[answer_idx]
            turn_pair = [
                {'role': roles[question['from']], 'content': question['value']},
                {'role': roles[answer['from']], 'content': answer['value']},
            ]

            prompt_text = tokenizer.apply_chat_template(
                history + turn_pair[:1], tokenize=False, add_generation_prompt=True)
            dialog_text = tokenizer.apply_chat_template(
                history + turn_pair, tokenize=False, add_generation_prompt=False)

            prompt_len = len(tokenizer_multimodal_token(prompt_text, tokenizer, modal_token, return_tensors='pt'))
            dialog_len = len(tokenizer_multimodal_token(dialog_text, tokenizer, modal_token, return_tensors='pt'))

            labels[masked_upto:prompt_len] = IGNORE_INDEX
            masked_upto = dialog_len
            history += turn_pair
    return dict(input_ids=batch_input_ids, labels=batch_labels)
232
+
233
+
234
def preprocess_multimodal(
    sources: Sequence[str],
    data_args: DataArguments,
    modal_token: str = None,
) -> Dict:
    """Move the modal token to the head of every sentence that contains it.

    When ``data_args.is_multimodal`` is false the sources are returned
    untouched. Otherwise ``modal_token`` must be a key of ``MODAL_INDEX_MAP``;
    every sentence containing it is rewritten in place as
    ``modal_token + '\\n' + <remaining text>`` (stripped).

    Returns:
        The same ``sources`` object, modified in place.
    """
    if not data_args.is_multimodal:
        return sources

    assert modal_token in MODAL_INDEX_MAP, f"Unsupported modal token {modal_token}."

    for source in sources:
        for sentence in source:
            text = sentence['value']
            if modal_token in text:
                remainder = text.replace(modal_token, '').strip()
                text = (modal_token + '\n' + remainder).strip()
            replace_token = modal_token
            # TODO: fix this for multimedia, e.g., <video>, <audio>, etc.
            sentence["value"] = text.replace(modal_token, replace_token)

    return sources
256
+
257
+
258
class LazySupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning.

    Samples are read from JSON annotation files up front; the media file
    (image / video / audio) of a sample and its tokenization are only
    processed when the sample is requested through ``__getitem__``.
    """

    def __init__(self, data_path: str, data_path_a: str,
                 tokenizer: transformers.PreTrainedTokenizer,
                 data_args: DataArguments):
        """Load the annotation list.

        Args:
            data_path: One JSON path, or several comma-separated JSON paths.
                With several paths each one is routed by substrings of its
                name ("stage3", "stage2"+"audio", "stage2"+"video").
            data_path_a: Optional JSON path; when given it replaces whatever
                was loaded from ``data_path``.
            tokenizer: Tokenizer handed to the preprocessing functions.
            data_args: Data arguments; also expected to carry the runtime
                attributes read in ``__getitem__``.

        Raises:
            NotImplementedError: A path in a comma-separated ``data_path``
                matches none of the known stage patterns.
            ValueError: Neither ``data_path`` nor ``data_path_a`` is given.
        """
        super(LazySupervisedDataset, self).__init__()
        self.mix_sampler_tag = False
        list_data_dict = None
        if data_path is not None and len(data_path.split(",")) == 1:
            data_path = data_path.split(",")[0]
            list_data_dict = self._load_json(data_path)
        elif data_path is not None and len(data_path.split(",")) > 1:
            self.mix_sampler_tag = True
            # Start from empty lists so a missing stage file does not break the concatenation below.
            self.av_data, self.a_data, self.v_data = [], [], []
            for path in data_path.split(","):
                if "stage3" in path:
                    self.av_data = self._load_json(path)
                    random.shuffle(self.av_data)
                elif "stage2" in path and "audio" in path:
                    self.a_data = self._load_json(path)
                    random.shuffle(self.a_data)
                elif "stage2" in path and "video" in path:
                    self.v_data = self._load_json(path)
                    random.shuffle(self.v_data)
                else:
                    raise NotImplementedError
            # Order matters: `__getitem__` treats indices below len(av_data) as audio-visual samples.
            list_data_dict = self.av_data + self.a_data + self.v_data
        if data_path_a is not None:
            list_data_dict = self._load_json(data_path_a)
        if list_data_dict is None:
            raise ValueError("Either `data_path` or `data_path_a` must be provided.")

        rank0_print("Formatting inputs...Skip in lazy mode")
        self.tokenizer = tokenizer
        self.list_data_dict = list_data_dict
        self.data_args = data_args

    @staticmethod
    def _load_json(path):
        """Read one JSON annotation file, closing the file handle afterwards."""
        with open(path, "r") as f:
            return json.load(f)

    def __len__(self):
        return len(self.list_data_dict)

    @property
    def lengths(self):
        """Per-sample word count of all conversation turns, plus 576 when the sample has an 'image' key."""
        length_list = []
        for sample in self.list_data_dict:
            img_tokens = 576 if 'image' in sample else 0
            length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens)
        return length_list

    @property
    def modality_lengths(self):
        """Per-sample word count; positive when the sample has an 'image' key, negated otherwise."""
        length_list = []
        for sample in self.list_data_dict:
            cur_len = sum(len(conv['value'].split()) for conv in sample['conversations'])
            cur_len = cur_len if 'image' in sample else -cur_len
            length_list.append(cur_len)
        return length_list

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Load, preprocess and tokenize sample ``i``.

        If the media file cannot be processed, a randomly chosen other sample
        is returned instead. The result holds ``input_ids``, ``labels`` and,
        depending on the sample, one of ``image`` / ``video`` / ``audio``.
        """
        sources = self.list_data_dict[i]
        if isinstance(i, int):
            sources = [sources]
        assert len(sources) == 1, "Don't know why it is wrapped to a list"  # FIXME
        if self.data_args.data_path is not None:
            image_processor = self.data_args.image_processor
            video_processor = self.data_args.video_processor

        num_frames = NUM_FRAMES if self.data_args.num_frames is None else self.data_args.num_frames

        if 'image' in sources[0]:
            image_file = self.list_data_dict[i]['image']
            image_folder = self.data_args.data_folder
            # Same guard as the video branch: only prefix the folder when one is configured.
            if image_folder:
                image_file = os.path.join(image_folder, image_file)

            try:
                image = process_image(image_file, image_processor, aspect_ratio=self.data_args.image_aspect_ratio)
            except Exception:
                traceback.print_exc()
                backup_idx = random.randint(0, len(self.list_data_dict) - 1)
                print(f"Encounted error when reading image {image_file}, use {backup_idx}-th example instead!!!")
                return self.__getitem__(backup_idx)

            # place <image> tag to question head.
            modal_token = "<image>"
            sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args, modal_token)
        elif 'video' in sources[0]:
            video_file = self.list_data_dict[i]['video']
            video_folder = self.data_args.data_folder
            if video_folder:
                video_file = os.path.join(video_folder, video_file)
            try:
                # In mixed mode the audio-visual samples come first in the list, so the index decides `va`.
                use_va = self.data_args.va if not self.mix_sampler_tag else (i < len(self.av_data))
                video = process_video(video_file, video_processor, aspect_ratio=self.data_args.image_aspect_ratio, num_frames=num_frames, va=use_va)
            except Exception:
                traceback.print_exc()
                backup_idx = random.randint(0, len(self.list_data_dict) - 1)
                print(f"Encounted error when reading video {video_file}, use {backup_idx}-th example instead!!!")
                return self.__getitem__(backup_idx)

            # place <video> tag to question head.
            modal_token = "<video>"
            sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args, modal_token)

        elif 'audio' in sources[0]:
            audio_file = self.list_data_dict[i]['audio']
            try:
                audio = process_audio_file(audio_file)
            except Exception as e:
                print(e)
                backup_idx = random.randint(0, len(self.list_data_dict) - 1)
                print(f"Encounted error when reading audio {audio_file}, use {backup_idx}-th example instead!!!")
                return self.__getitem__(backup_idx)
            modal_token = "<audio>"
            sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args, modal_token)

        else:
            modal_token = None
            sources = copy.deepcopy([e["conversations"] for e in sources])

        if self.data_args.is_pretraining:
            data_dict = preprocess_plain(sources, self.tokenizer, modal_token=modal_token)
        else:
            data_dict = preprocess(sources, self.tokenizer, modal_token=modal_token)

        if isinstance(i, int):
            data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0])

        # attach the media tensor that belongs to the sample
        if 'image' in self.list_data_dict[i]:
            data_dict['image'] = image
        elif 'video' in self.list_data_dict[i]:
            data_dict['video'] = video
        elif 'audio' in self.list_data_dict[i]:
            data_dict['audio'] = audio
        elif self.data_args.data_path_a:
            # no media in the sample, but an audio dataset is configured: use an all-zero placeholder
            data_dict['audio'] = torch.zeros(1, 2998, 128)
        elif self.data_args.is_multimodal:
            # image does not exist in the data, but the model is multimodal
            data_dict['image'] = torch.zeros(3, self.data_args.image_size, self.data_args.image_size)
        return data_dict
397
+
398
+
399
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning.

    Pads ``input_ids`` with the tokenizer pad id and ``labels`` with
    ``IGNORE_INDEX``, truncates both to ``tokenizer.model_max_length``, and
    gathers the media entries of all instances under the ``images`` key.
    """

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        id_seqs = [inst["input_ids"] for inst in instances]
        label_seqs = [inst["labels"] for inst in instances]

        pad_seq = torch.nn.utils.rnn.pad_sequence
        input_ids = pad_seq(id_seqs, batch_first=True, padding_value=self.tokenizer.pad_token_id)
        labels = pad_seq(label_seqs, batch_first=True, padding_value=IGNORE_INDEX)

        input_ids = input_ids[:, :self.tokenizer.model_max_length]
        labels = labels[:, :self.tokenizer.model_max_length]

        batch = {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": input_ids.ne(self.tokenizer.pad_token_id),
        }

        # work for 'images' argument in `prepare_inputs_labels_for_multimodal` of LlavaMetaForCausalLM in llava_arch.py
        modal_inputs = []
        for inst in instances:
            for token in MODAL_INDEX_MAP.keys():
                # MODAL_TOKEN shape like: <image>, <video>, ...
                matches = re.findall('[<](.*)[>]', token.lower())
                assert len(matches) == 1
                key = matches[0]
                if key in inst:
                    modal_inputs.append((inst[key], key))
        batch['images'] = modal_inputs

        return batch
436
+
437
+
438
def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
                                data_args) -> Dict:
    """Make dataset and collator for supervised fine-tuning.

    Returns a dict with ``train_dataset``, ``eval_dataset`` (always ``None``)
    and ``data_collator``.
    """
    dataset = LazySupervisedDataset(
        data_path=data_args.data_path,
        data_path_a=data_args.data_path_a,
        tokenizer=tokenizer,
        data_args=data_args,
    )
    collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    return {
        "train_dataset": dataset,
        "eval_dataset": None,
        "data_collator": collator,
    }
451
+
452
+
453
def train(attn_implementation="flash_attention_2"):
    """Parse the command-line arguments, build model / tokenizer / data and run training.

    Steps, in order: seed the RNGs, parse `ModelArguments`, `DataArguments` and
    `TrainingArguments`, load the (optionally 4/8-bit quantized) model,
    optionally wrap it with LoRA, load the tokenizer, initialize the vision
    and/or audio modules, decide which parameters are trainable, train with
    `VideoLLaMA2Trainer` (resuming if a checkpoint exists) and save the result.

    :param attn_implementation: Attention implementation written to the model
        config; ignored for model types containing 'gemma2', which use 'eager'.

    NOTE(review): the nesting of the audio-tower section below should be
    confirmed against the upstream file.
    """
    global local_rank
    set_seed(42)

    parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    local_rank = training_args.local_rank
    compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))

    # Extra `from_pretrained` kwargs; only populated for 4-bit / 8-bit loading.
    bnb_model_from_pretrained_args = {}
    if training_args.bits in [4, 8]:
        from transformers import BitsAndBytesConfig
        bnb_model_from_pretrained_args.update(dict(
            # device_map={"": training_args.device},
            # BUG: High version transformers report error:
            # ValueError: You can't pass `load_in_4bit`or `load_in_8bit` as a kwarg when passing `quantization_config` argument at the same time
            # load_in_4bit=training_args.bits == 4,
            # load_in_8bit=training_args.bits == 8,
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=training_args.bits == 4,
                load_in_8bit=training_args.bits == 8,
                llm_int8_skip_modules=["mm_projector"],
                llm_int8_threshold=6.0,
                llm_int8_has_fp16_weight=False,
                bnb_4bit_compute_dtype=compute_dtype,
                bnb_4bit_use_double_quant=training_args.double_quant,
                bnb_4bit_quant_type=training_args.quant_type, # {'fp4', 'nf4'}
                bnb_4bit_quant_storage=compute_dtype,
            )
        ))

    config = VLLMConfigs[model_args.model_type].from_pretrained(model_args.model_path, trust_remote_code=True)
    if 'gemma2' in model_args.model_type:
        config._attn_implementation = 'eager'
    else:
        config._attn_implementation = attn_implementation

    # A multimodal model class is used when any tower is configured; otherwise a plain LLaMA causal LM.
    if model_args.vision_tower is not None or model_args.audio_tower is not None:
        model = VLLMs[model_args.model_type].from_pretrained(
            model_args.model_path,
            config=config,
            cache_dir=training_args.cache_dir,
            torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
            do_sample=True,
            **bnb_model_from_pretrained_args
        )
        if 'mixtral' in model_args.model_type:
            import deepspeed
            deepspeed.utils.set_z3_leaf_modules(model, [MixtralSparseMoeBlock])
    else:
        model = transformers.LlamaForCausalLM.from_pretrained(
            model_args.model_path,
            config=config,
            cache_dir=training_args.cache_dir,
            torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
            do_sample=True,
            **bnb_model_from_pretrained_args
        )
    # Disabled for training; switched back on after training finishes.
    model.config.use_cache = False


    if training_args.bits in [4, 8]:
        from peft import prepare_model_for_kbit_training
        model.config.torch_dtype=(torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)

    if training_args.gradient_checkpointing:
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        else:
            # Fallback: mark the embedding output as requiring grad through a forward hook.
            def make_inputs_require_grad(module, input, output):
                output.requires_grad_(True)
            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

    if training_args.lora_enable:
        from peft import LoraConfig, get_peft_model
        lora_config = LoraConfig(
            r=training_args.lora_r,
            lora_alpha=training_args.lora_alpha,
            target_modules=find_all_linear_names(model),
            lora_dropout=training_args.lora_dropout,
            bias=training_args.lora_bias,
            task_type="CAUSAL_LM",
        )
        if training_args.bits == 16:
            if training_args.bf16:
                model.to(torch.bfloat16)
            if training_args.fp16:
                model.to(torch.float16)
        rank0_print("Adding LoRA adapters...")
        model = get_peft_model(model, lora_config)


    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.model_path,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=True,
    )

    # Reuse the unknown token for padding when the tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.unk_token

    if model_args.vision_tower is not None:
        # initialize vision encoder + multi-modal projector
        model.get_model().initialize_vision_modules(model_args=model_args, fsdp=training_args.fsdp)

        vision_tower = model.get_vision_tower()
        vision_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device)

        # Runtime attributes consumed by `LazySupervisedDataset`.
        data_args.image_size = vision_tower.image_size

        data_args.image_processor = vision_tower.image_processor
        data_args.video_processor = vision_tower.video_processor if hasattr(vision_tower, "video_processor") else vision_tower.image_processor

        data_args.is_multimodal = True

        model.config.image_aspect_ratio = data_args.image_aspect_ratio
        model.config.tokenizer_padding_side = tokenizer.padding_side
        model.config.tokenizer_model_max_length = tokenizer.model_max_length

        model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter
        if model_args.tune_mm_mlp_adapter:
            # Freeze everything, then re-enable only the vision projector.
            model.requires_grad_(False)
            for p in model.get_model().mm_projector.parameters():
                p.requires_grad = True

        # Projector-only tuning selects the `preprocess_plain` path in the dataset.
        if model_args.tune_mm_mlp_adapter:
            data_args.is_pretraining = True
        else:
            data_args.is_pretraining = False

        model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter
        if training_args.freeze_mm_mlp_adapter:
            for p in model.get_model().mm_projector.parameters():
                p.requires_grad = False

        if training_args.bits in [4, 8]:
            model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device)

        model.config.mm_projector_lr = training_args.mm_projector_lr
        model.config.num_frames = NUM_FRAMES if data_args.num_frames is None else data_args.num_frames


    if model_args.audio_tower is not None:
        # initialize audio encoder + multi-modal projector
        model.get_model().initialize_audio_modules(
            model_args=model_args,
            fsdp=training_args.fsdp
        )

        audio_tower = model.get_audio_tower()
        audio_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device)
        data_args.is_multimodal = True
        model.config.tokenizer_padding_side = tokenizer.padding_side
        model.config.tokenizer_model_max_length = tokenizer.model_max_length

        model.config.tune_mm_mlp_adapter_a = training_args.tune_mm_mlp_adapter_a = model_args.tune_mm_mlp_adapter_a
        training_args.pretrain_mm_mlp_adapter_a = model_args.pretrain_mm_mlp_adapter_a
        training_args.tune_audio_tower = model_args.tune_audio_tower
        # only update mm_mlp's parameters while the remaining ones are kept frozen
        if model_args.tune_mm_mlp_adapter_a:
            model.requires_grad_(False)
            for p in model.get_model().mm_projector_a.parameters():
                p.requires_grad = True

        # Overrides the `is_pretraining` value set in the vision section above, if any.
        if model_args.tune_audio_tower or model_args.tune_adapter_llm:
            data_args.is_pretraining = False
        else:
            data_args.is_pretraining = True

        model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter
        if training_args.freeze_mm_mlp_adapter:
            for p in model.get_model().mm_projector_a.parameters():
                p.requires_grad = False

        if model_args.tune_adapter_llm:
            # Train everything except the vision and audio towers.
            model.requires_grad_(True)
            if hasattr(model.get_model(), 'vision_tower'):
                for p in model.get_model().vision_tower.parameters():
                    p.requires_grad = False
            for p in model.get_model().audio_tower.parameters():
                p.requires_grad = False

        if model_args.freeze_backbone:
            model.requires_grad_(False)

        # Final say on the audio tower: trainable only when `tune_audio_tower` is set.
        if model_args.tune_audio_tower:
            for p in model.get_model().audio_tower.parameters():
                p.requires_grad = True
        else:
            for p in model.get_model().audio_tower.parameters():
                p.requires_grad = False

        if training_args.bits in [4, 8]:
            model.get_model().mm_projector_a.to(dtype=compute_dtype, device=training_args.device)

        model.config.mm_projector_lr = training_args.mm_projector_lr

    if training_args.bits in [4, 8]:
        from peft.tuners.lora import LoraLayer
        # Dtype fix-ups for quantized training: LoRA layers and bf16 heads/embeddings in bfloat16, norms in float32.
        for name, module in model.named_modules():
            if isinstance(module, LoraLayer):
                if training_args.bf16:
                    module = module.to(torch.bfloat16)
            if 'norm' in name:
                module = module.to(torch.float32)
            if 'lm_head' in name or 'embed_tokens' in name:
                if hasattr(module, 'weight'):
                    if training_args.bf16 and module.weight.dtype == torch.float32:
                        module = module.to(torch.bfloat16)

    print("Current model:", model)
    '''
    for name, param in model.named_parameters():
        # Check if the parameter requires gradient
        if param.requires_grad:
            print(f'Parameter: {name} is trainable')
        else:
            print(f'Parameter: {name} is frozen')
    '''

    data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
    # select a Trainer
    trainer = VideoLLaMA2Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)
    # Resume when the output directory already holds at least one `checkpoint-*` entry.
    if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
        trainer.train(resume_from_checkpoint=True)
    else:
        trainer.train()
    trainer.save_state()

    model.config.use_cache = True

    if training_args.lora_enable:
        # Save LoRA weights and the remaining trainable (non-LoRA) weights separately.
        state_dict = get_peft_state_maybe_zero_3(model.named_parameters(), training_args.lora_bias)
        non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(model.named_parameters())
        if training_args.local_rank == 0 or training_args.local_rank == -1:
            model.config.save_pretrained(training_args.output_dir)
            model.save_pretrained(training_args.output_dir, state_dict=state_dict)
            torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin'))
    else:
        safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)
697
+
698
+
699
# Script entry point: run training with the default attention implementation of `train()`.
if __name__ == "__main__":
    train()
videollama2/train_flash_attn.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
3
+ # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
4
+ # Make it more memory efficient by running training with the FlashAttention-2 attention implementation.
5
+
6
+ import sys
7
+ sys.path.append('./')
8
+
9
+ from videollama2.train import train
10
+
11
# Script entry point: run training, explicitly requesting the "flash_attention_2" implementation.
if __name__ == "__main__":
    train(attn_implementation="flash_attention_2")
videollama2/utils.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import logging
3
+ import logging.handlers
4
+ import os
5
+ import sys
6
+
7
+ import requests
8
+
9
+ from .constants import LOGDIR
10
+
11
# Fixed message strings defined for use elsewhere in the project (not referenced in this module).
server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."
13
+
14
# Shared file handler; created on the first `build_logger` call and reused afterwards.
handler = None


def build_logger(logger_name, logger_filename):
    """Create a logger and route stdout / stderr through the logging system.

    Side effects: formats the first root handler, replaces ``sys.stdout`` and
    ``sys.stderr`` with ``StreamToLogger`` objects, and, on the first call
    only, creates a daily-rotating file handler under ``LOGDIR`` and attaches
    it to every logger that exists at that moment.

    Returns:
        The logger named ``logger_name`` with level INFO.
    """
    global handler

    log_format = logging.Formatter(
        fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # Set the format of root handlers
    root_logger = logging.getLogger()
    if not root_logger.handlers:
        logging.basicConfig(level=logging.INFO)
    root_logger.handlers[0].setFormatter(log_format)

    # Redirect stdout and stderr to loggers (stdout first, then stderr)
    for stream_name, level in (("stdout", logging.INFO), ("stderr", logging.ERROR)):
        stream_logger = logging.getLogger(stream_name)
        stream_logger.setLevel(level)
        setattr(sys, stream_name, StreamToLogger(stream_logger, level))

    # Get logger
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)

    # Add a file handler for all loggers
    if handler is None:
        os.makedirs(LOGDIR, exist_ok=True)
        log_path = os.path.join(LOGDIR, logger_filename)
        handler = logging.handlers.TimedRotatingFileHandler(
            log_path, when='D', utc=True, encoding='UTF-8')
        handler.setFormatter(log_format)

        for known in logging.root.manager.loggerDict.values():
            if isinstance(known, logging.Logger):
                known.addHandler(handler)

    return logger
58
+
59
+
60
class StreamToLogger(object):
    """
    Fake file-like stream object that redirects writes to a logger instance.

    Complete lines are logged as soon as they are written; a trailing partial
    line is buffered until its newline arrives or ``flush()`` is called.
    Attributes not defined here are looked up on the stream that was
    ``sys.stdout`` when the object was created.
    """
    def __init__(self, logger, log_level=logging.INFO):
        self.terminal = sys.stdout
        self.logger = logger
        self.log_level = log_level
        self.linebuf = ''

    def __getattr__(self, attr):
        # Only called when normal lookup fails. If `terminal` itself is missing
        # (e.g. an instance created without `__init__`), delegating would recurse forever.
        if attr == 'terminal':
            raise AttributeError(attr)
        return getattr(self.terminal, attr)

    def write(self, buf):
        """Log every complete line in ``buf``; return the number of characters accepted."""
        temp_linebuf = self.linebuf + buf
        self.linebuf = ''
        for line in temp_linebuf.splitlines(True):
            # From the io.TextIOWrapper docs:
            #   On output, if newline is None, any '\n' characters written
            #   are translated to the system default line separator.
            # By default sys.stdout.write() expects '\n' newlines and then
            # translates them so this is still cross platform.
            if line[-1] == '\n':
                self.logger.log(self.log_level, line.rstrip())
            else:
                self.linebuf += line
        # Text streams report how many characters were written.
        return len(buf)

    def flush(self):
        """Log whatever partial line is still buffered."""
        if self.linebuf != '':
            self.logger.log(self.log_level, self.linebuf.rstrip())
        self.linebuf = ''
91
+
92
+
93
def disable_torch_init():
    """
    Turn the default parameter initialization of ``torch.nn.Linear`` and
    ``torch.nn.LayerNorm`` into a no-op to accelerate model creation.
    """
    import torch

    def _skip_reset(self):
        return None

    for layer_cls in (torch.nn.Linear, torch.nn.LayerNorm):
        layer_cls.reset_parameters = _skip_reset
100
+
101
+
102
def violates_moderation(text):
    """
    Check whether the text violates OpenAI moderation API.

    Newlines are removed from ``text`` before it is sent. The check fails
    open: a network error or an unexpected response yields ``False``.

    :param text: The text to check.
    :return: The ``flagged`` value of the first moderation result, or
        ``False`` when the request or the response parsing fails.
    :raises KeyError: If the ``OPENAI_API_KEY`` environment variable is unset.
    """
    # Local import: this module does not import json at file level.
    import json

    url = "https://api.openai.com/v1/moderations"
    headers = {"Content-Type": "application/json",
               "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]}
    text = text.replace("\n", "")
    # Serialize with json so quotes / backslashes in the text cannot break or alter the payload.
    data = json.dumps({"input": text}, ensure_ascii=False).encode("utf-8")
    try:
        ret = requests.post(url, headers=headers, data=data, timeout=5)
        flagged = ret.json()["results"][0]["flagged"]
    except requests.exceptions.RequestException:
        flagged = False
    except (KeyError, IndexError, ValueError):
        # Response was not JSON or did not have the expected shape.
        flagged = False

    return flagged
121
+
122
+
123
def pretty_print_semaphore(semaphore):
    """Return a short description of ``semaphore``, or the string "None" when it is ``None``."""
    if semaphore is not None:
        return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})"
    return "None"
videollama2/videollama2_trainer.py ADDED
@@ -0,0 +1,447 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from: https://github.com/haotian-liu/LLaVA/blob/main/llava/train/llava_trainer.py
2
+ import os
3
+ import logging
4
+ from typing import List, Optional
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from torch.utils.data import Sampler
9
+
10
+ from transformers import Trainer
11
+ from transformers.trainer import (
12
+ is_sagemaker_mp_enabled,
13
+ get_parameter_names,
14
+ has_length,
15
+ ALL_LAYERNORM_LAYERS,
16
+ logger,
17
+ TRAINER_STATE_NAME,
18
+ )
19
+
20
+
21
def maybe_zero_3(param, ignore_status=False, name=None):
    """
    Return a detached CPU copy of ``param``, gathering it first if it is a DeepSpeed parameter.

    Args:
        param: The parameter/tensor to copy.
        ignore_status: When False, log a warning if a DeepSpeed parameter
            reports ``ZeroParamStatus.NOT_AVAILABLE`` before gathering.
        name: Parameter name, used only in the warning message.

    Returns:
        A cloned, detached CPU tensor.
    """
    if not hasattr(param, "ds_id"):
        # Not managed by DeepSpeed: no gathering (and no deepspeed import) needed.
        return param.detach().cpu().clone()

    # Imported lazily so the plain-tensor path works without deepspeed installed.
    from deepspeed import zero
    from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus

    if param.ds_status == ZeroParamStatus.NOT_AVAILABLE and not ignore_status:
        logging.warning(f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}")
    with zero.GatheredParameters([param]):
        param = param.data.detach().cpu().clone()
    return param
33
+
34
+
35
def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
    """
    Collect CPU copies of the parameters whose name contains any of ``keys_to_match``.

    Args:
        named_params: Iterable of ``(name, parameter)`` pairs.
        keys_to_match: Substrings; a parameter is kept if its name contains at least one.

    Returns:
        Dict mapping each kept parameter name to its CPU copy.
    """
    selected = {}
    for param_name, tensor in named_params:
        if any(key_match in param_name for key_match in keys_to_match):
            selected[param_name] = tensor
    return {
        param_name: maybe_zero_3(tensor, ignore_status=True, name=param_name).cpu()
        for param_name, tensor in selected.items()
    }
39
+
40
+
41
+ # Borrowed from peft.utils.get_peft_model_state_dict
42
def get_peft_state_maybe_zero_3(named_params, bias):
    """
    Collect CPU copies of the LoRA parameters and, depending on ``bias``, bias parameters.

    Args:
        named_params: Iterable of ``(name, parameter)`` pairs.
        bias: ``"none"`` keeps only names containing ``"lora_"``; ``"all"``
            also keeps every name containing ``"bias"``; ``"lora_only"`` also
            keeps the bias that sits next to a LoRA parameter.

    Returns:
        Dict mapping each kept parameter name to its CPU copy.

    Raises:
        NotImplementedError: If ``bias`` is not one of the values above.
    """
    if bias == "none":
        to_return = {k: t for k, t in named_params if "lora_" in k}
    elif bias == "all":
        to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
    elif bias == "lora_only":
        to_return = {}
        maybe_lora_bias = {}
        lora_bias_names = set()
        for k, t in named_params:
            if "lora_" in k:
                to_return[k] = t
                # "<module>.lora_A.weight" -> "<module>.bias"
                lora_bias_names.add(k.split("lora_")[0] + "bias")
            elif "bias" in k:
                maybe_lora_bias[k] = t
        # Second pass: a bias may be listed before its LoRA weights, so the
        # match can only be decided once every name has been seen.
        for k, t in maybe_lora_bias.items():
            if k in lora_bias_names:
                to_return[k] = t
    else:
        raise NotImplementedError
    to_return = {k: maybe_zero_3(v, ignore_status=True) for k, v in to_return.items()}
    return to_return
65
+
66
+
67
def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True):
    """
    Collect CPU copies of the parameters whose name does not contain ``"lora_"``.

    Args:
        named_params: Iterable of ``(name, parameter)`` pairs.
        require_grad_only: When true, keep only parameters with ``requires_grad`` set.

    Returns:
        Dict mapping each kept parameter name to its CPU copy.
    """
    non_lora = {}
    for param_name, tensor in named_params:
        if "lora_" in param_name:
            continue
        non_lora[param_name] = tensor
    if require_grad_only:
        non_lora = {param_name: tensor for param_name, tensor in non_lora.items() if tensor.requires_grad}
    return {
        param_name: maybe_zero_3(tensor, ignore_status=True).cpu()
        for param_name, tensor in non_lora.items()
    }
73
+
74
+
75
def find_all_linear_names(model):
    """
    List the leaf names of every ``torch.nn.Linear`` module outside the multimodal parts.

    Modules whose qualified name contains ``mm_projector``, ``vision_tower``
    or ``vision_resampler`` are skipped, and ``lm_head`` is always excluded.

    Returns:
        List of unique leaf module names (the last dotted component).
    """
    skip_keywords = ('mm_projector', 'vision_tower', 'vision_resampler')
    linear_cls = torch.nn.Linear

    leaf_names = {
        qualified_name.rsplit('.', 1)[-1]
        for qualified_name, submodule in model.named_modules()
        if not any(keyword in qualified_name for keyword in skip_keywords)
        and isinstance(submodule, linear_cls)
    }
    leaf_names.discard('lm_head')  # needed for 16-bit
    return list(leaf_names)
89
+
90
+
91
def _save_adapter_weights(trainer, output_dir, keys_to_match, adapter_name):
    """Save the config and the parameters matching ``keys_to_match`` under the name ``adapter_name``."""
    weight_to_save = get_mm_adapter_state_maybe_zero_3(trainer.model.named_parameters(), keys_to_match)
    trainer.model.config.save_pretrained(output_dir)

    current_folder = output_dir.split('/')[-1]
    parent_folder = os.path.dirname(output_dir)
    if trainer.args.local_rank == 0 or trainer.args.local_rank == -1:
        if current_folder.startswith('checkpoint-'):
            # Intermediate checkpoint: collect the adapter files in a sibling
            # folder, one file per checkpoint.
            adapter_folder = os.path.join(parent_folder, adapter_name)
            os.makedirs(adapter_folder, exist_ok=True)
            torch.save(weight_to_save, os.path.join(adapter_folder, f'{current_folder}.bin'))
        else:
            torch.save(weight_to_save, os.path.join(output_dir, f'{adapter_name}.bin'))


def safe_save_model_for_hf_trainer(trainer: Trainer,
                                   output_dir: str):
    """
    Collects the state dict and dump to disk.

    Depending on the flags found on ``trainer.args``:

    * ``tune_mm_mlp_adapter``: save only the ``mm_projector`` weights and return.
    * ``tune_mm_mlp_adapter_a`` or ``pretrain_mm_mlp_adapter_a``: save the
      ``mm_projector_a`` weights (plus the embedding weights when
      ``use_im_start_end`` is set), then continue.
    * ``tune_audio_tower``: additionally save the ``audio_tower`` weights.

    Afterwards the full model is saved, through ``trainer.save_model`` when
    DeepSpeed is active or from a CPU copy of the state dict otherwise.
    """
    args = trainer.args

    if getattr(args, "tune_mm_mlp_adapter", False):
        # Only save Adapter
        _save_adapter_weights(trainer, output_dir, ['mm_projector'], 'mm_projector')
        return

    if getattr(args, "tune_mm_mlp_adapter_a", False) or getattr(args, "pretrain_mm_mlp_adapter_a", False):
        # Only save Adapter
        keys_to_match = ['mm_projector_a']
        if getattr(args, "use_im_start_end", False):
            keys_to_match.extend(['embed_tokens', 'embed_in'])
        _save_adapter_weights(trainer, output_dir, keys_to_match, 'mm_projector_a')

    if getattr(args, "tune_audio_tower", False):
        # Only save Adapter
        _save_adapter_weights(trainer, output_dir, ['audio_tower'], 'audio_tower')

    if trainer.deepspeed:
        torch.cuda.synchronize()
        trainer.save_model(output_dir)
        return

    state_dict = trainer.model.state_dict()
    if trainer.args.should_save:
        cpu_state_dict = {
            key: value.cpu()
            for key, value in state_dict.items()
        }
        del state_dict
        trainer._save(output_dir, state_dict=cpu_state_dict)  # noqa
180
+
181
+
182
def split_to_even_chunks(indices, lengths, num_chunks):
    """
    Split a list of indices into `chunks` chunks of roughly equal lengths.

    When ``len(indices)`` is not divisible by ``num_chunks`` the indices are
    dealt out round-robin. Otherwise each index goes to the chunk with the
    smallest accumulated length that still has room, so every chunk ends up
    with the same number of indices.
    """
    if len(indices) % num_chunks:
        return [indices[offset::num_chunks] for offset in range(num_chunks)]

    capacity = len(indices) // num_chunks
    buckets = [[] for _ in range(num_chunks)]
    loads = [0] * num_chunks
    for idx in indices:
        # First bucket with the smallest load (ties resolve to the lowest position).
        target = min(range(num_chunks), key=loads.__getitem__)
        buckets[target].append(idx)
        loads[target] += lengths[idx]
        if len(buckets[target]) == capacity:
            # Full bucket: make sure it is never picked again.
            loads[target] = float("inf")
    return buckets
198
+
199
+
200
def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None):
    """
    Order sample indices so that each megabatch holds samples of a single modality.

    The sign of each length encodes the modality: positive lengths are
    multimodal samples, negative lengths are language-only samples. If every
    sample has the same sign this falls back to ``get_length_grouped_indices``.

    Args:
        lengths: Signed, non-zero length per sample.
        batch_size: Per-device batch size.
        world_size: Number of devices; a megabatch holds ``world_size * batch_size`` samples.
        generator: Optional ``torch.Generator`` driving every random permutation.

    Returns:
        Flat list of sample indices.
    """
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    assert all(l != 0 for l in lengths), "Should not have zero length."
    if all(l > 0 for l in lengths) or all(l < 0 for l in lengths):
        # all samples are in the same modality
        return get_length_grouped_indices(lengths, batch_size, world_size, generator=generator)
    mm_indices, mm_lengths = zip(*[(i, l) for i, l in enumerate(lengths) if l > 0])
    lang_indices, lang_lengths = zip(*[(i, -l) for i, l in enumerate(lengths) if l < 0])

    # Forward the caller's generator so the per-modality shuffles are
    # reproducible too (with generator=None this is the global torch RNG, as before).
    mm_shuffle = [mm_indices[i] for i in get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=generator)]
    lang_shuffle = [lang_indices[i] for i in get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=generator)]
    megabatch_size = world_size * batch_size
    mm_megabatches = [mm_shuffle[i : i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)]
    lang_megabatches = [lang_shuffle[i : i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)]

    # The last megabatch of each modality may be short; merge the two into one
    # trailing mixed megabatch instead of shuffling them with the full ones.
    last_mm = mm_megabatches[-1]
    last_lang = lang_megabatches[-1]
    additional_batch = last_mm + last_lang
    megabatches = mm_megabatches[:-1] + lang_megabatches[:-1]
    megabatch_indices = torch.randperm(len(megabatches), generator=generator)
    megabatches = [megabatches[i] for i in megabatch_indices]

    if len(additional_batch) > 0:
        megabatches.append(sorted(additional_batch))

    return [i for megabatch in megabatches for i in megabatch]
226
+
227
+
228
def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True):
    """
    Return a random ordering of sample indices grouped by length inside each megabatch.

    The indices are shuffled, cut into megabatches of ``world_size * batch_size``,
    sorted by descending length within each megabatch, and split into
    ``world_size`` chunks by ``split_to_even_chunks``. ``merge`` is accepted
    but not used.
    """
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    shuffled = torch.randperm(len(lengths), generator=generator)
    step = world_size * batch_size

    ordered = []
    for start in range(0, len(lengths), step):
        megabatch = shuffled[start : start + step].tolist()
        megabatch.sort(key=lambda i: lengths[i], reverse=True)
        for chunk in split_to_even_chunks(megabatch, lengths, world_size):
            ordered.extend(chunk)
    return ordered
236
+
237
+
238
class LengthGroupedSampler(Sampler):
    r"""
    Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while
    keeping a bit of randomness.
    """

    def __init__(
        self,
        batch_size: int,
        world_size: int,
        lengths: Optional[List[int]] = None,
        generator=None,
        group_by_modality: bool = False,
    ):
        """Store the grouping configuration; ``lengths`` is mandatory despite its default."""
        if lengths is None:
            raise ValueError("Lengths must be provided.")

        self.batch_size, self.world_size = batch_size, world_size
        self.lengths = lengths
        self.generator = generator
        self.group_by_modality = group_by_modality

    def __len__(self):
        """Number of samples, i.e. the number of provided lengths."""
        return len(self.lengths)

    def __iter__(self):
        """Build a fresh index ordering with the configured grouping function and iterate over it."""
        grouper = (
            get_modality_length_grouped_indices
            if self.group_by_modality
            else get_length_grouped_indices
        )
        return iter(grouper(self.lengths, self.batch_size, self.world_size, generator=self.generator))
270
+
271
+
272
class MixSampler(Sampler):
    """
    Sampler that yields indices in groups of four: two audio-visual samples,
    one audio sample and one video sample.

    Indices assume the dataset is laid out as ``av_data`` first, then
    ``a_data``, then ``v_data``. Audio and video samples are reused cyclically;
    a trailing unpaired audio-visual sample is dropped.
    """

    def __init__(self, dataset, batch_size=4):
        """Record the size of each modality subset of ``dataset``."""
        self.dataset = dataset
        self.av_count = len(dataset.av_data)
        self.a_count = len(dataset.a_data)
        self.v_count = len(dataset.v_data)
        self.batch_size = batch_size

    def __iter__(self):
        """Yield the indices group by group: ``[av_i, av_i+1, audio, video]``."""
        for i in range(0, self.av_count, 2):
            if i + 1 == self.av_count:
                # Odd number of audio-visual samples: the last one has no partner.
                break
            batch_ids = [i, i + 1]

            audio_index = i % self.a_count
            batch_ids.append(self.av_count + audio_index)
            video_index = i % self.v_count
            batch_ids.append(self.av_count + self.a_count + video_index)

            yield from batch_ids

    def __len__(self):
        """Number of indices ``__iter__`` yields: four per complete audio-visual pair."""
        # av_count * 2 would overcount by two when av_count is odd.
        return (self.av_count // 2) * 4
296
+
297
+
298
class VideoLLaMA2Trainer(Trainer):
    """``Trainer`` subclass that customizes the train sampler, the optimizer parameter groups and checkpoint saving."""

    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
        """
        Pick the training sampler.

        Returns ``None`` without a sized dataset, a ``MixSampler`` when the
        dataset sets ``mix_sampler_tag``, a modality-aware
        ``LengthGroupedSampler`` when ``args.group_by_modality_length`` is
        set, and the parent class's sampler otherwise.
        """
        if self.train_dataset is None or not has_length(self.train_dataset):
            return None
        if self.train_dataset.mix_sampler_tag:
            assert self.args.train_batch_size % 4 == 0
            return MixSampler(self.train_dataset, self.args.train_batch_size * self.args.gradient_accumulation_steps)

        if self.args.group_by_modality_length:
            lengths = self.train_dataset.modality_lengths
            return LengthGroupedSampler(
                self.args.train_batch_size,
                world_size=self.args.world_size * self.args.gradient_accumulation_steps,
                lengths=lengths,
                group_by_modality=True,
            )
        return super()._get_train_sampler()

    def create_optimizer(self):
        """
        Setup the optimizer.

        We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
        Trainer's init through `optimizers`, or subclass and override this method in a subclass.

        Trainable parameters are split into weight-decay and no-decay groups
        (no decay for biases and the layers in ``ALL_LAYERNORM_LAYERS``). When
        ``args.mm_projector_lr`` is set, parameters whose name contains
        ``"mm_projector"`` get their own two groups with that learning rate.
        """
        if is_sagemaker_mp_enabled():
            return super().create_optimizer()

        opt_model = self.model

        if self.optimizer is None:
            # Sets give O(1) membership tests for the per-parameter checks below.
            decay_names = {
                name for name in get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS) if "bias" not in name
            }
            weight_decay = self.args.weight_decay
            projector_lr = self.args.mm_projector_lr

            if projector_lr is not None:
                projector_names = {name for name, _ in opt_model.named_parameters() if "mm_projector" in name}
            else:
                projector_names = None

            def select(decay, projector=None):
                """Trainable params filtered by decay membership and, if given, projector membership."""
                return [
                    p for n, p in opt_model.named_parameters()
                    if p.requires_grad
                    and (n in decay_names) == decay
                    and (projector is None or (n in projector_names) == projector)
                ]

            if projector_lr is not None:
                optimizer_grouped_parameters = [
                    {"params": select(True, False), "weight_decay": weight_decay},
                    {"params": select(False, False), "weight_decay": 0.0},
                    {"params": select(True, True), "weight_decay": weight_decay, "lr": projector_lr},
                    {"params": select(False, True), "weight_decay": 0.0, "lr": projector_lr},
                ]
            else:
                optimizer_grouped_parameters = [
                    {"params": select(True), "weight_decay": weight_decay},
                    {"params": select(False), "weight_decay": 0.0},
                ]

            optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)

            self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
            if optimizer_cls.__name__ == "Adam8bit":
                import bitsandbytes

                manager = bitsandbytes.optim.GlobalOptimManager.get_instance()

                # Register a 32-bit optimizer-state override for embedding weights.
                skipped = 0
                for module in opt_model.modules():
                    if isinstance(module, nn.Embedding):
                        skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
                        logger.info(f"skipped {module}: {skipped/2**20}M params")
                        manager.register_module_override(module, "weight", {"optim_bits": 32})
                        logger.debug(f"bitsandbytes: will optimize {module} in fp32")
                logger.info(f"skipped: {skipped/2**20}M params")

        return self.optimizer

    def _save_checkpoint(self, model, trial, metrics=None):
        """
        Save a training checkpoint.

        With ``args.tune_mm_mlp_adapter`` only the projector/resampler weights
        are written, together with optimizer, scheduler, RNG and trainer
        state. With ``args.lora_enable`` the non-LoRA trainable weights are
        written before the parent implementation runs. Otherwise the parent
        implementation is used unchanged.
        """
        if getattr(self.args, 'tune_mm_mlp_adapter', False):
            from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
            checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"

            run_dir = self._get_output_dir(trial=trial)
            output_dir = os.path.join(run_dir, checkpoint_folder)

            # Only save Adapter
            keys_to_match = ['mm_projector', 'vision_resampler']

            weight_to_save = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), keys_to_match)

            if self.args.local_rank == 0 or self.args.local_rank == -1:
                self.model.config.save_pretrained(output_dir)
                torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin'))
            # Save optimizer and scheduler
            self._save_optimizer_and_scheduler(output_dir)
            # Save RNG state
            self._save_rng_state(output_dir)
            self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))
            self.args.distributed_state.wait_for_everyone()
            return

        if not self.args.lora_enable:
            super()._save_checkpoint(model, trial, metrics)
            return

        # NOTE: Supporting save complete lora checkpoint during training.
        from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
        checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"

        run_dir = self._get_output_dir(trial=trial)
        output_dir = os.path.join(run_dir, checkpoint_folder)

        # NOTE(review): this result is not written anywhere here; the call is
        # kept so `lora_bias` is still validated and parameters are still gathered.
        state_dict = get_peft_state_maybe_zero_3(self.model.named_parameters(), self.args.lora_bias)
        non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(self.model.named_parameters())
        if self.args.local_rank == 0 or self.args.local_rank == -1:
            # save for acquiring `config.json`
            self.model.config.save_pretrained(output_dir)
            torch.save(non_lora_state_dict, os.path.join(output_dir, 'non_lora_trainables.bin'))

        # save for acquiring lora adapter parameters & trainer states: `adapter_config.json`, `adapter_model.safetensors`
        super()._save_checkpoint(model, trial, metrics)

    def _save(self, output_dir: Optional[str] = None, state_dict=None):
        """Delegate to the parent ``_save`` unless only the multimodal adapter is being tuned."""
        if not getattr(self.args, 'tune_mm_mlp_adapter', False):
            super()._save(output_dir, state_dict)