jiachenl committed on
Commit
b7aace3
•
1 Parent(s): 5061a0e
Files changed (1) hide show
  1. app.py +304 -92
app.py CHANGED
@@ -1,101 +1,313 @@
1
  import sys
2
  import os
3
- import spaces
4
  import argparse
5
  import time
6
  import subprocess
7
- import torch
8
  import cumo.serve.gradio_web_server as gws
9
 
10
- #os.system("export BUILD_WITH_CUDA=True")
11
- #os.system("pip install --upgrade pip")
12
- #os.system("pip install flash-attn --no-build-isolation")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  # Execute the pip install command with additional options
15
- #subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'flash-attn', '--no-build-isolation', '-U'])
16
-
17
def start_controller():
    """Launch the CuMo controller as a child process.

    Runs the ``cumo.serve.controller`` module with the current interpreter,
    passing host ``0.0.0.0`` and port ``10000``, and echoes the command line
    to stdout before spawning it.

    Returns:
        The ``subprocess.Popen`` handle of the spawned controller.
    """
    print("Starting the controller")
    cmd = [sys.executable, "-m", "cumo.serve.controller"]
    cmd += ["--host", "0.0.0.0"]
    cmd += ["--port", "10000"]
    print(cmd)
    return subprocess.Popen(cmd)
30
-
31
def start_worker(model_path: str, bits=16):
    """Launch a CuMo model worker as a child process.

    The worker's model name is the last path component of ``model_path``;
    for quantized loading (``bits`` of 4 or 8) a ``-{bits}bit`` suffix is
    appended and the matching ``--load-{bits}bit`` flag is passed.

    Args:
        model_path: Path of the checkpoint handed to ``--model-path``.
        bits: Load precision; must be 4, 8 or 16.

    Returns:
        The ``subprocess.Popen`` handle of the spawned worker.

    Raises:
        AssertionError: If ``bits`` is not one of 4, 8, 16.
    """
    print(f"Starting the model worker for the model {model_path}")
    model_name = model_path.strip("/").split("/")[-1]
    assert bits in [4, 8, 16], "It can be only loaded with 16-bit, 8-bit, and 4-bit."

    quantized = bits != 16
    if quantized:
        model_name = model_name + f"-{bits}bit"

    cmd = [sys.executable, "-m", "cumo.serve.model_worker"]
    cmd.extend(["--host", "0.0.0.0"])
    cmd.extend(["--controller", "http://localhost:10000"])
    cmd.extend(["--model-path", model_path])
    cmd.extend(["--model-name", model_name])
    if quantized:
        cmd.append(f"--load-{bits}bit")

    print(cmd)
    return subprocess.Popen(cmd)
54
-
55
-
56
if __name__ == "__main__":
    # Script entry point: parse CLI options, start the controller and the
    # model worker as child processes, then serve the Gradio demo built by
    # cumo.serve.gradio_web_server until it exits.
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="0.0.0.0")
    parser.add_argument("--port", type=int)
    parser.add_argument("--model-path", type=str, default="checkpoints/CuMo-mistral-7b")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--controller-url", type=str, default="http://localhost:10000")
    parser.add_argument("--concurrency-count", type=int, default=5)
    parser.add_argument("--bits", type=int, default=16)
    parser.add_argument("--model-list-mode", type=str, default="reload", choices=["once", "reload"])
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--moderate", action="store_true")
    parser.add_argument("--embed", action="store_true")
    # The web-server module reads its configuration from these module globals.
    gws.args = parser.parse_args()
    gws.models = []

    print(f"args: {gws.args}")
    model_path = gws.args.model_path
    bits = gws.args.bits
    # The environment variable still takes precedence, but the parsed
    # --concurrency-count flag is now the fallback instead of being ignored
    # (both default to 5, so default behaviour is unchanged).
    concurrency_count = int(os.getenv("concurrency_count", gws.args.concurrency_count))

    child_procs = []
    exit_status = 0
    try:
        # Start the children inside the try so that a failure while starting
        # the worker no longer leaves the controller running.
        child_procs.append(start_controller())
        child_procs.append(start_worker(model_path, bits=bits))

        # Wait for worker and controller to start
        time.sleep(10)

        demo = gws.build_demo(embed_mode=False, concurrency_count=concurrency_count)
        demo.queue(
            status_update_rate=10,
            api_open=False,
        ).launch(
            server_name=gws.args.host,
            server_port=gws.args.port,
            share=gws.args.share,
        )
    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero.
        print(e)
        exit_status = 1
    finally:
        # Kill the worker before the controller, as the original did.
        for proc in reversed(child_procs):
            proc.kill()

    sys.exit(exit_status)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import sys
2
  import os
 
3
  import argparse
4
  import time
5
  import subprocess
6
+ import spaces
7
  import cumo.serve.gradio_web_server as gws
8
 
9
+ import datetime
10
+ import json
11
+
12
+ import gradio as gr
13
+ import requests
14
+ from PIL import Image
15
+
16
+ from cumo.conversation import (default_conversation, conv_templates, SeparatorStyle)
17
+ from cumo.constants import LOGDIR
18
+ from cumo.utils import (build_logger, server_error_msg, violates_moderation, moderation_msg)
19
+ import hashlib
20
+
21
+ import torch
22
+ import io
23
+ from cumo.constants import WORKER_HEART_BEAT_INTERVAL
24
+ from cumo.utils import (build_logger, server_error_msg,
25
+ pretty_print_semaphore)
26
+ from cumo.model.builder import load_pretrained_model
27
+ from cumo.mm_utils import process_images, load_image_from_base64, tokenizer_image_token
28
+ from cumo.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
29
+ from transformers import TextIteratorStreamer
30
+ from threading import Thread
31
 
32
  # Execute the pip install command with additional options
33
+ #subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'flash-attn', '--no-build-isolation', '-U'])
34
+
35
# HTTP header dictionary identifying this app as "CuMo".
# NOTE(review): no use of `headers` is visible in this view — confirm it is still needed.
headers = {"User-Agent": "CuMo"}

# Button presets returned by the event handlers below to update the
# interactive state of the action buttons.
no_change_btn = gr.Button()
enable_btn = gr.Button(interactive=True)
disable_btn = gr.Button(interactive=False)

# Model configuration: the checkpoint is loaded once, at import time, and
# shared by every request.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_path = './checkpoints/CuMo-mistral-7b'
model_base = 'mistralai/Mistral-7B-Instruct-v0.2'
model_name = 'CuMo-mistral-7b'
# Key into `conv_templates` used to start a new conversation in `add_text`.
conv_mode = 'mistral_instruct_system'
# Quantized loading is disabled.
load_8bit = False
load_4bit = False
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, model_base, model_name, load_8bit, load_4bit, device=device, use_flash_attn=False)
# NOTE(review): presumably switches the CuMo model to inference behaviour — confirm against cumo.model.
model.config.training = False
50
+
51
def upvote_last_response(state):
    """Handle a click on the upvote button.

    ``state`` is accepted but not used. Returns an empty string followed by
    three ``disable_btn`` presets, one value per output component wired to
    this handler.
    """
    cleared_text = ""
    return (cleared_text, disable_btn, disable_btn, disable_btn)
53
+
54
+
55
def downvote_last_response(state):
    """Handle a click on the downvote button.

    ``state`` is accepted but not used. Returns an empty string followed by
    three ``disable_btn`` presets, one value per output component wired to
    this handler.
    """
    cleared_text = ""
    return (cleared_text, disable_btn, disable_btn, disable_btn)
57
+
58
+
59
def flag_last_response(state):
    """Handle a click on the flag button.

    ``state`` is accepted but not used. Returns an empty string followed by
    three ``disable_btn`` presets, one value per output component wired to
    this handler.
    """
    cleared_text = ""
    return (cleared_text, disable_btn, disable_btn, disable_btn)
61
+
62
def clear_history():
    """Reset the chat to a fresh copy of the default conversation.

    Returns:
        A 9-tuple: the new conversation state, its chatbot rendering, an
        empty string, ``None``, and five ``disable_btn`` presets.
    """
    fresh_state = default_conversation.copy()
    outputs = [fresh_state, fresh_state.to_gradio_chatbot(), "", None]
    outputs.extend([disable_btn] * 5)
    return tuple(outputs)
65
+
66
def add_text(state, imagebox, textbox, image_process_mode):
    """Append the user's turn (text and optional image) to the conversation.

    Args:
        state: Current conversation, or ``None`` to start a new one from
            ``conv_templates[conv_mode]``.
        imagebox: Value passed to ``Image.open`` for the image, or ``None``
            when no image was supplied.
        textbox: The user's text.
        image_process_mode: Stored alongside the image in the message.

    Yields:
        A single 9-tuple: the updated state, its chatbot rendering, an empty
        string, ``None``, then three ``disable_btn`` and two ``enable_btn``
        presets.
    """
    if state is None:
        state = conv_templates[conv_mode].copy()

    user_message = textbox
    if imagebox is not None:
        # With an image, the message becomes (text-with-image-token, image, mode).
        prompt_text = DEFAULT_IMAGE_TOKEN + '\n' + textbox
        picture = Image.open(imagebox).convert('RGB')
        user_message = (prompt_text, picture, image_process_mode)

    state.append_message(state.roles[0], user_message)
    # Empty placeholder for the reply, filled in later.
    state.append_message(state.roles[1], None)

    button_updates = (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
    yield (state, state.to_gradio_chatbot(), "", None) + button_updates
81
+
82
def delete_text(state, image_process_mode):
    """Clear the last reply so it can be generated again.

    Sets the content of the last message to ``None``. If the message before
    it carries a tuple/list payload, its third element is replaced with the
    current ``image_process_mode`` while the first two are kept.

    Args:
        state: Conversation holding at least two messages.
        image_process_mode: Mode to store in the previous message's payload.

    Yields:
        A single 9-tuple: the state, its chatbot rendering, an empty string,
        ``None``, then three ``disable_btn`` and two ``enable_btn`` presets.
    """
    state.messages[-1][-1] = None
    prev_human_msg = state.messages[-2]
    # isinstance() rather than an exact type() match, so tuple/list
    # subclasses are treated the same way.
    if isinstance(prev_human_msg[1], (tuple, list)):
        prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode)
    yield (state, state.to_gradio_chatbot(), "", None) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
88
+
89
def regenerate(state, image_process_mode):
    """Clear the last reply and mark the conversation for regeneration.

    Sets the content of the last message to ``None``. If the message before
    it carries a tuple/list payload, its third element is replaced with the
    current ``image_process_mode``. Finally sets ``state.skip_next`` to
    ``False``.

    Args:
        state: Conversation holding at least two messages.
        image_process_mode: Mode to store in the previous message's payload.

    Returns:
        A 9-tuple: the state, its chatbot rendering, an empty string,
        ``None``, and five ``disable_btn`` presets.
    """
    state.messages[-1][-1] = None
    prev_human_msg = state.messages[-2]
    # isinstance() rather than an exact type() match, so tuple/list
    # subclasses are treated the same way.
    if isinstance(prev_human_msg[1], (tuple, list)):
        prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode)
    state.skip_next = False
    return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 5
96
+
97
@spaces.GPU
def generate(state, imagebox, textbox, image_process_mode, temperature, top_p, max_output_tokens):
    """Stream the model's reply to the conversation in ``state``.

    Builds the prompt and images from ``state``, runs ``model.generate`` on a
    background thread, and yields a UI update each time the streamer produces
    new text.

    Args:
        state: Conversation whose last message is the reply placeholder.
        imagebox: Unused here; present to match the listener's input list.
        textbox: Unused here; present to match the listener's input list.
        image_process_mode: Unused here; present to match the input list.
        temperature: Sampling temperature; sampling is enabled only when it
            is above 0.001.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.
        max_output_tokens: Requested upper bound on newly generated tokens.

    Yields:
        9-tuples of (state, chatbot rendering, "", None, five button
        presets), matching the outputs wired to this handler.

    Raises:
        ValueError: If the number of images differs from the number of image
            tokens in the prompt.
    """
    prompt = state.get_prompt()
    images = state.get_images(return_pil=True)
    num_image_tokens = 0

    if images:
        if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN):
            raise ValueError("Number of images does not match number of <image> tokens in prompt")

        image_sizes = [image.size for image in images]
        images = process_images(images, image_processor, model.config)

        # process_images may return either a list of tensors or one tensor.
        if isinstance(images, list):
            images = [image.to(model.device, dtype=torch.float16) for image in images]
        else:
            images = images.to(model.device, dtype=torch.float16)

        replace_token = DEFAULT_IMAGE_TOKEN
        if getattr(model.config, 'mm_use_im_start_end', False):
            replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
        prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)

        # Budget for the image tokens when computing the remaining context.
        num_image_tokens = prompt.count(replace_token) * model.get_vision_tower().num_patches
        image_args = {"images": images, "image_sizes": image_sizes}
    else:
        image_args = {}

    max_context_length = getattr(model.config, 'max_position_embeddings', 2048)
    do_sample = temperature > 0.001
    stop_str = state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2

    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15)

    # Honour the "Max output tokens" slider (previously ignored in favour of
    # a hard-coded 512). A request below 1 is raised to 1 so it is not
    # reported as a context overflow.
    requested_tokens = max(1, int(max_output_tokens))
    max_new_tokens = min(requested_tokens, max_context_length - input_ids.shape[-1] - num_image_tokens)

    running_buttons = (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)

    if max_new_tokens < 1:
        # Report the overflow in the chat, using the same 9-tuple shape as
        # every other yield, so the UI can display it.
        state.messages[-1][-1] = "Exceeds max token length. Please start a new conversation, thanks."
        yield (state, state.to_gradio_chatbot(), "", None) + running_buttons
        return

    thread = Thread(target=model.generate, kwargs=dict(
        inputs=input_ids,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        streamer=streamer,
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id,
        **image_args
    ))
    thread.start()

    try:
        generated_text = ''
        for new_text in streamer:
            generated_text += new_text
            # Strip a trailing stop string; skip the check if none is set.
            if stop_str and generated_text.endswith(stop_str):
                generated_text = generated_text[:-len(stop_str)]
            state.messages[-1][-1] = generated_text
            yield (state, state.to_gradio_chatbot(), "", None) + running_buttons

        yield (state, state.to_gradio_chatbot(), "", None) + (enable_btn,) * 5
    finally:
        # Release cached GPU memory even if streaming fails or the consumer
        # stops iterating early.
        torch.cuda.empty_cache()
171
+
172
# Header shown at the top of the demo page (title and project links).
title_markdown = ("""
# CuMo: Scaling Multimodal LLM with Co-Upcycled Mixture-of-Experts
[[Project Page](https://chrisjuniorli.github.io/project/CuMo/)] [[Code](https://github.com/SHI-Labs/CuMo)] [[Model](https://huggingface.co/shi-labs/CuMo-mistral-7b)] | 📚 [[Arxiv](https://arxiv.org/pdf/2405.05949)]
""")

# Terms-of-use notice rendered below the chat area.
tos_markdown = ("""
### Terms of use
By using this service, users are required to agree to the following terms:
The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderator.
For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
""")


# License notice rendered after the terms of use.
learn_more_markdown = ("""
### License
The service is a research preview intended for non-commercial use only. Please contact us if you find any potential violation.
""")

# Custom CSS passed to gr.Blocks: minimum width for buttons in the element
# with id "buttons".
block_css = """

#buttons button {
    min-width: min(120px,100%);
}

"""
198
+
199
# Build the Gradio UI and start serving it.
# The textbox is created outside the Blocks context so that gr.Examples can
# reference it before it is placed in the layout via textbox.render().
textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)
with gr.Blocks(title="CuMo", theme=gr.themes.Default(), css=block_css) as demo:
    # Per-session conversation object; starts as None and is created by add_text.
    state = gr.State()

    gr.Markdown(title_markdown)

    with gr.Row():
        # Left column: image input, examples and sampling parameters.
        with gr.Column(scale=3):
            imagebox = gr.Image(label="Input Image", type="filepath")
            image_process_mode = gr.Radio(
                ["Crop", "Resize", "Pad", "Default"],
                value="Default",
                label="Preprocess for non-square image", visible=False)

            #cur_dir = os.path.dirname(os.path.abspath(__file__))
            cur_dir = './cumo/serve'
            gr.Examples(examples=[
                [f"{cur_dir}/examples/aveger.jpg", "Can you introduce this movie based on the poster?"],
                [f"{cur_dir}/examples/fridge.webp", "Can you describe what groceries are presented in this fridge?"],
                [f"{cur_dir}/examples/su7_4.jpg", "What car is it in this image?"],
                [f"{cur_dir}/examples/nvidia.jpeg", "Can you tell me what happened in this image?"],
                [f"{cur_dir}/examples/animal.webp", "What animals are in this image?"],
                [f"{cur_dir}/examples/noodle.png", "How many calories estimated in this bowl?"],
                [f"{cur_dir}/examples/disney.jpeg", "How many characters in this image?"],
                [f"{cur_dir}/examples/reka_6.jpeg", "What colour is my hat (im sitting on the bear)?"],
            ], inputs=[imagebox, textbox], cache_examples=False)

            with gr.Accordion("Parameters", open=False) as parameter_row:
                temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.2, step=0.1, interactive=True, label="Temperature",)
                top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, interactive=True, label="Top P",)
                max_output_tokens = gr.Slider(minimum=0, maximum=1024, value=512, step=64, interactive=True, label="Max output tokens",)

        # Right column: chat history, text entry and action buttons.
        with gr.Column(scale=8):
            chatbot = gr.Chatbot(
                elem_id="chatbot",
                label="CuMo Chatbot",
                height=650,
                layout="panel",
            )
            with gr.Row():
                with gr.Column(scale=8):
                    textbox.render()
                with gr.Column(scale=1, min_width=50):
                    submit_btn = gr.Button(value="Send", variant="primary")
            with gr.Row(elem_id="buttons") as button_row:
                upvote_btn = gr.Button(value="👍 Upvote", interactive=False)
                downvote_btn = gr.Button(value="👎 Downvote", interactive=False)
                flag_btn = gr.Button(value="⚠️ Flag", interactive=False)
                #stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
                regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
                clear_btn = gr.Button(value="🗑️ Clear", interactive=False)

    gr.Markdown(tos_markdown)
    gr.Markdown(learn_more_markdown)
    url_params = gr.JSON(visible=False)

    # Register listeners
    btn_list = [upvote_btn, downvote_btn, flag_btn, regenerate_btn, clear_btn]
    # Component lists shared by several listeners; their order must match
    # the tuples returned/yielded by the handlers.
    vote_outputs = [textbox, upvote_btn, downvote_btn, flag_btn]
    chat_outputs = [state, chatbot, textbox, imagebox] + btn_list
    add_text_inputs = [state, imagebox, textbox, image_process_mode]
    generate_inputs = [state, imagebox, textbox, image_process_mode, temperature, top_p, max_output_tokens]

    upvote_btn.click(
        upvote_last_response,
        [state],
        vote_outputs
    )
    downvote_btn.click(
        downvote_last_response,
        [state],
        vote_outputs
    )
    flag_btn.click(
        flag_last_response,
        [state],
        vote_outputs
    )

    clear_btn.click(
        clear_history,
        None,
        chat_outputs,
        queue=False
    )

    # Regenerate: drop the last reply, then stream a new one.
    regenerate_btn.click(
        delete_text,
        [state, image_process_mode],
        chat_outputs,
    ).then(
        generate,
        generate_inputs,
        chat_outputs,
    )

    # Pressing ENTER and clicking "Send" run the same two-step pipeline.
    textbox.submit(
        add_text,
        add_text_inputs,
        chat_outputs,
    ).then(
        generate,
        generate_inputs,
        chat_outputs,
    )

    submit_btn.click(
        add_text,
        add_text_inputs,
        chat_outputs,
    ).then(
        generate,
        generate_inputs,
        chat_outputs,
    )

demo.queue(
    status_update_rate=10,
    api_open=False
).launch()