舟勤 committed on
Commit 8eaeb01
1 Parent(s): 15554e1
Files changed (3)
  1. .gitattributes +1 -0
  2. app.py +45 -11
  3. eval_configs/video_llama_eval.yaml +1 -1
.gitattributes CHANGED
@@ -31,4 +31,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -59,16 +59,16 @@ def setup_seeds(config):
 
 print('Initializing Chat')
 args = parse_args()
-cfg = Config(args)
+# cfg = Config(args)
 
-model_config = cfg.model_cfg
-model_config.device_8bit = args.gpu_id
-model_cls = registry.get_model_class(model_config.arch)
-model = model_cls.from_config(model_config).to('cuda:{}'.format(args.gpu_id))
+# model_config = cfg.model_cfg
+# model_config.device_8bit = args.gpu_id
+# model_cls = registry.get_model_class(model_config.arch)
+# model = model_cls.from_config(model_config).to('cuda:{}'.format(args.gpu_id))
 
-vis_processor_cfg = cfg.datasets_cfg.webvid.vis_processor.train
-vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
-chat = Chat(model, vis_processor, device='cuda:{}'.format(args.gpu_id))
+# vis_processor_cfg = cfg.datasets_cfg.webvid.vis_processor.train
+# vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
+# chat = Chat(model, vis_processor, device='cuda:{}'.format(args.gpu_id))
 print('Initialization Finished')
 
 # ========================================
@@ -143,6 +143,9 @@ title = """
 
 <h1 align="center">Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding</h1>
 
+<h5 align="center"> Introduction: Video-LLaMA is a multi-modal large language model that achieves video-grounded conversations between humans and computers \
+by connecting a language decoder with off-the-shelf unimodal pre-trained models. </h5>
+
 <div style='display:flex; gap: 0.25rem; '>
 <a href='https://github.com/DAMO-NLP-SG/Video-LLaMA'><img src='https://img.shields.io/badge/Github-Code-success'></a>
 <a href='https://huggingface.co/spaces/DAMO-NLP-SG/Video-LLaMA'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>
@@ -152,8 +155,24 @@ title = """
 </div>
 
 
+Thank you for using the Video-LLaMA Demo Page! If you have any questions or feedback, feel free to contact us.
+
+If you find Video-LLaMA interesting, please give us a star on GitHub.
+
+The current online demo uses the 7B version of Video-LLaMA due to resource limitations. We have released \
+the 13B version on our GitHub repository.
+
+
 """
 
+Note_markdown = ("""
+### Note
+Video-LLaMA is a prototype model and may have limitations in understanding complex scenes, long videos, or specific domains.
+The output may be influenced by input quality, limitations of the dataset, and the model's susceptibility to hallucinations. Please interpret the results with caution.
+
+**Copyright 2023 Alibaba DAMO Academy.**
+""")
+
 #TODO show examples below
 
 with gr.Blocks() as demo:
@@ -185,20 +204,35 @@ with gr.Blocks() as demo:
 label="Temperature",
 )
 
+audio = gr.Checkbox(interactive=True, value=False, label="Audio")
+gr.Markdown(Note_markdown)
 with gr.Column():
 chat_state = gr.State()
 img_list = gr.State()
 chatbot = gr.Chatbot(label='Video-LLaMA')
 text_input = gr.Textbox(label='User', placeholder='Please upload your image/video first', interactive=False)
+
+
+with gr.Column():
+    gr.Examples(examples=[
+        [f"examples/dog.jpg", "What breed do you think this dog is?"],
+        [f"examples/jonsnow.jpg", "Who's the man on the right?"],
+        [f"examples/statue_of_liberty.jpg", "Can you tell me about this building?"],
+    ], inputs=[image, text_input])
+
+    gr.Examples(examples=[
+        [f"examples/skateboarding_dog.mp4", "What is the dog doing?"],
+        [f"examples/birthday.mp4", "What is the boy doing?"],
+        [f"examples/Iron_Man.mp4", "Is the guy in the video Iron Man?"],
+    ], inputs=[video, text_input])
 
-
 upload_button.click(upload_imgorvideo, [video, image, text_input, chat_state], [video, image, text_input, upload_button, chat_state, img_list])
 
 text_input.submit(gradio_ask, [text_input, chatbot, chat_state], [text_input, chatbot, chat_state]).then(
 gradio_answer, [chatbot, chat_state, img_list, num_beams, temperature], [chatbot, chat_state, img_list]
 )
 clear.click(gradio_reset, [chat_state, img_list], [chatbot, video, image, text_input, upload_button, chat_state, img_list], queue=False)
-
+
 demo.launch(share=False, enable_queue=True)
 
-# %%
+# %%
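
The app.py hunk above disables the Chat initialization by commenting it out, so the Space renders the demo text, controls, and examples without loading model weights. Below is a minimal sketch, not part of this commit, of how that block could instead be kept behind a switch; it assumes it sits in app.py where Config, registry, Chat, and args are already in scope, and LOAD_MODEL is a hypothetical environment variable chosen here for illustration.

import os

# Hypothetical guard (not in this commit): only load the model when explicitly
# requested, so the Gradio UI can still launch on a machine without a GPU or
# checkpoint. All calls below are copied from the block the commit disables.
chat = None
if os.environ.get("LOAD_MODEL", "0") == "1":
    cfg = Config(args)
    model_config = cfg.model_cfg
    model_config.device_8bit = args.gpu_id
    model_cls = registry.get_model_class(model_config.arch)
    model = model_cls.from_config(model_config).to('cuda:{}'.format(args.gpu_id))

    vis_processor_cfg = cfg.datasets_cfg.webvid.vis_processor.train
    vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
    chat = Chat(model, vis_processor, device='cuda:{}'.format(args.gpu_id))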
eval_configs/video_llama_eval.yaml CHANGED
@@ -3,7 +3,7 @@ model:
   model_type: pretrain_vicuna
   freeze_vit: True
   freeze_qformer: True
-  max_txt_len: 512
+  max_txt_len: 140
   end_sym: "###"
   low_resource: False
 
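
The YAML change lowers max_txt_len from 512 to 140, which tightens the configured maximum text length and should reduce memory use for the hosted demo. A quick way to read back the value the evaluation config now carries, assuming the file is plain YAML and OmegaConf is available (an assumption about the tooling, not something this commit states):

from omegaconf import OmegaConf

# Load the edited eval config and print the value changed in this commit.
cfg = OmegaConf.load("eval_configs/video_llama_eval.yaml")
print(cfg.model.max_txt_len)  # 140 after this commit (previously 512)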