舟勤 committed on
Commit
5fd9760
1 Parent(s): df5bd23
video_llama/app.py DELETED
@@ -1,192 +0,0 @@
1
- """
2
- Adapted from: https://github.com/Vision-CAIR/MiniGPT-4/blob/main/demo.py
3
- """
4
- import argparse
5
- import os
6
- import random
7
-
8
- import numpy as np
9
- import torch
10
- import torch.backends.cudnn as cudnn
11
- import gradio as gr
12
-
13
- from video_llama.common.config import Config
14
- from video_llama.common.dist_utils import get_rank
15
- from video_llama.common.registry import registry
16
- from video_llama.conversation.conversation_video import Chat, Conversation, default_conversation,SeparatorStyle
17
- import decord
18
- decord.bridge.set_bridge('torch')
19
-
20
- #%%
21
- # imports modules for registration
22
- from video_llama.datasets.builders import *
23
- from video_llama.models import *
24
- from video_llama.processors import *
25
- from video_llama.runners import *
26
- from video_llama.tasks import *
27
-
28
- #%%
29
def parse_args():
    """Parse the demo's command-line arguments from ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` holding ``cfg_path``, ``gpu_id`` and
        ``options`` (``None`` when ``--options`` is not given).
    """
    options_help = (
        "override some settings in the used config, the key-value pair "
        "in xxx=yyy format will be merged into config file (deprecate), "
        "change to --cfg-options instead."
    )

    cli = argparse.ArgumentParser(description="Demo")
    cli.add_argument(
        "--cfg-path",
        default='eval_configs/video_llama_eval.yaml',
        help="path to configuration file.",
    )
    cli.add_argument(
        "--gpu-id",
        type=int,
        default=0,
        help="specify the gpu to load the model.",
    )
    cli.add_argument("--options", nargs="+", help=options_help)
    return cli.parse_args()
42
-
43
-
44
def setup_seeds(config):
    """Seed the Python, NumPy and PyTorch RNGs and pin the cuDNN flags.

    The seed is ``config.run_cfg.seed`` plus the value returned by
    ``get_rank()``.
    """
    seed = config.run_cfg.seed + get_rank()

    # Apply the same seed to every random number generator the demo touches.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    cudnn.deterministic = True
    cudnn.benchmark = False
53
-
54
-
55
- # ========================================
56
- # Model Initialization
57
- # ========================================
58
-
59
print('Initializing Chat')
args = parse_args()
cfg = Config(args)

# Device string shared by the model and the chat wrapper.
_device = f'cuda:{args.gpu_id}'

# Look up the model class named in the config, build it, and move it to the GPU.
model_config = cfg.model_cfg
model_config.device_8bit = args.gpu_id
model_cls = registry.get_model_class(model_config.arch)
model = model_cls.from_config(model_config)
model = model.to(_device)

# The visual processor is taken from the webvid dataset's "train" settings.
vis_processor_cfg = cfg.datasets_cfg.webvid.vis_processor.train
_processor_cls = registry.get_processor_class(vis_processor_cfg.name)
vis_processor = _processor_cls.from_config(vis_processor_cfg)

chat = Chat(model, vis_processor, device=_device)
print('Initialization Finished')
72
-
73
- # ========================================
74
- # Gradio Setting
75
- # ========================================
76
-
77
def gradio_reset(chat_state, img_list):
    """Clear the conversation and put the widgets back in their pre-upload state.

    Returns a 7-tuple in the order the "Restart" button is wired to:
    chatbot, video, image, text_input, upload_button, chat_state, img_list.
    """
    # An existing image list is replaced by a fresh empty one; None stays None.
    cleared_images = None if img_list is None else []

    # The conversation object is kept, only its message history is emptied.
    if chat_state is not None:
        chat_state.messages = []

    video_update = gr.update(value=None, interactive=True)
    image_update = gr.update(value=None, interactive=True)
    textbox_update = gr.update(
        placeholder='Please upload your video first',
        interactive=False,
    )
    button_update = gr.update(value="Upload & Start Chat", interactive=True)

    return (
        None,
        video_update,
        image_update,
        textbox_update,
        button_update,
        chat_state,
        cleared_images,
    )
83
-
84
def _new_visual_conversation():
    """Build an empty conversation carrying the visual-content system prompt."""
    # NOTE(review): the two string literals concatenate without a separating
    # space ("provides.Follow"); kept byte-for-byte because the prompt text is
    # fed to the model.
    return Conversation(
        system= "You are able to understand the visual content that the user provides."
        "Follow the instructions carefully and explain your answers in detail.",
        roles=("Human", "Assistant"),
        messages=[],
        offset=0,
        sep_style=SeparatorStyle.SINGLE,
        sep="###",
    )


def upload_imgorvideo(gr_video, gr_img, text_input, chat_state):
    """Handle the "Upload & Start Chat" button for a single image or video.

    Args:
        gr_video: Value of the video widget, or ``None`` when empty.
        gr_img: Value of the image widget, or ``None`` when empty.
        text_input: Current textbox value (not used by the handler).
        chat_state: Conversation state from the previous interaction.

    Returns:
        A 6-tuple in the order the button is wired to:
        video, image, text_input, upload_button, chat_state, img_list.
    """
    has_video = gr_video is not None
    has_image = gr_img is not None

    # Nothing uploaded: leave the button enabled so the user can try again.
    if not has_video and not has_image:
        return None, None, None, gr.update(interactive=True), chat_state, None

    # Both uploaded: unsupported. The original returned only five values here,
    # one fewer than the six wired outputs, so the message landed on the wrong
    # widgets. Return one value per output instead.
    if has_video and has_image:
        notice = 'Currently, only one input is supported'
        return (
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(interactive=False, placeholder=notice),
            gr.update(value="Currently, only one input is supported", interactive=False),
            chat_state,
            None,
        )

    # Exactly one input: start a fresh conversation and hand the media to chat.
    print(gr_img if has_image else gr_video)
    chat_state = _new_visual_conversation()
    img_list = []
    if has_image:
        chat.upload_img(gr_img, chat_state, img_list)
    else:
        chat.upload_video(gr_video, chat_state, img_list)

    return (
        gr.update(interactive=False),
        gr.update(interactive=False),
        gr.update(interactive=True, placeholder='Type and press Enter'),
        gr.update(value="Start Chatting", interactive=False),
        chat_state,
        img_list,
    )
119
-
120
def gradio_ask(user_message, chatbot, chat_state):
    """Record the user's message in the conversation and the chatbot history.

    Returns a 3-tuple (text_input, chatbot, chat_state). For an empty message
    nothing is recorded and the textbox gets a reminder placeholder instead.
    """
    if len(user_message) > 0:
        chat.ask(user_message, chat_state)
        # The answer slot stays None until gradio_answer fills it in.
        updated_history = chatbot + [[user_message, None]]
        return '', updated_history, chat_state

    reminder = gr.update(interactive=True, placeholder='Input should not be empty!')
    return reminder, chatbot, chat_state
126
-
127
-
128
def gradio_answer(chatbot, chat_state, img_list, num_beams, temperature):
    """Generate the model's reply and store it in the last chatbot entry.

    Returns the (chatbot, chat_state, img_list) triple expected by the
    outputs wired to this handler.
    """
    generation_kwargs = dict(
        conv=chat_state,
        img_list=img_list,
        num_beams=num_beams,
        temperature=temperature,
        max_new_tokens=300,
        max_length=2000,
    )
    # Only the first element of chat.answer()'s result is shown to the user.
    reply = chat.answer(**generation_kwargs)[0]

    latest_turn = chatbot[-1]
    latest_turn[1] = reply

    # Debug output: the full prompt and the conversation object.
    print(chat_state.get_prompt())
    print(chat_state)
    return chatbot, chat_state, img_list
139
-
140
# Static HTML snippets rendered at the top of the page.
title = """<h1 align="center">Demo of Video-LLaMA</h1>"""
description = """<h3>This is the demo of Video-LLaMA. Upload your images/videos and start chatting!</h3>"""


#TODO show examples below

with gr.Blocks() as demo:
    gr.Markdown(title)
    gr.Markdown(description)

    with gr.Row():
        # Left column: media inputs, control buttons and generation settings.
        with gr.Column(scale=0.5):
            video = gr.Video()
            image = gr.Image(type="pil")

            upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
            clear = gr.Button("Restart")

            num_beams = gr.Slider(
                minimum=1,
                maximum=10,
                value=1,
                step=1,
                interactive=True,
                # The original label had a stray closing parenthesis.
                label="beam search numbers",
            )

            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=1.0,
                step=0.1,
                interactive=True,
                label="Temperature",
            )

        # Right column: per-session state plus the chat widgets.
        with gr.Column():
            chat_state = gr.State()
            img_list = gr.State()
            chatbot = gr.Chatbot(label='Video-LLaMA')
            text_input = gr.Textbox(label='User', placeholder='Please upload your image/video first', interactive=False)

    # Each handler must return one value per component in its outputs list.
    upload_button.click(
        upload_imgorvideo,
        [video, image, text_input, chat_state],
        [video, image, text_input, upload_button, chat_state, img_list],
    )

    # Submitting text first records the question, then generates the answer.
    text_input.submit(gradio_ask, [text_input, chatbot, chat_state], [text_input, chatbot, chat_state]).then(
        gradio_answer, [chatbot, chat_state, img_list, num_beams, temperature], [chatbot, chat_state, img_list]
    )
    clear.click(
        gradio_reset,
        [chat_state, img_list],
        [chatbot, video, image, text_input, upload_button, chat_state, img_list],
        queue=False,
    )

# NOTE(review): `enable_queue` is not accepted by launch() in newer Gradio
# releases - confirm against the Gradio version this demo pins.
demo.launch(share=False, enable_queue=False)
191
-
192
- # %%
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
video_llama/ckpt/blip2_pretrained_flant5xxl.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b3839ea6c617f315ead9bf4036bbb0f0cf6bf62695ecfc14968ea626af03a29
3
- size 433481467
 
 
 
 
video_llama/ckpt/finetune-vicuna7b-v2-nofrozen_imageQ.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:46af76d307c14d28c56534e4bf8654343e5512aa1285fc1c1fdb5728c418e7ca
3
- size 623104000
 
 
 
 
video_llama/ckpt/pretrain-billa7b-zh.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f50a51db3055e1be6461f6dec833fbbbba28650287d26c8787664c8ee31dcf0f
3
- size 265435689