finalf0 committed on
Commit
e92ba13
1 Parent(s): 0686a72

streaming output

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. app.py +98 -109
  3. requirements.txt +2 -1
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 💬
4
  colorFrom: yellow
5
  colorTo: purple
6
  sdk: gradio
7
- sdk_version: 4.22.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
 
4
  colorFrom: yellow
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 4.41.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
app.py CHANGED
@@ -26,10 +26,6 @@ import modelscope_studio as mgr
26
  # For Mac with MPS (Apple silicon or AMD GPUs).
27
  # PYTORCH_ENABLE_MPS_FALLBACK=1 python web_demo_2.6.py --device mps
28
 
29
- os.system("pip list|grep torch")
30
- os.system("pip list|grep trans")
31
- os.system("pip list|grep flash")
32
-
33
  # Argparser
34
  parser = argparse.ArgumentParser(description='demo')
35
  parser.add_argument('--device', type=str, default='cuda', help='cuda or mps')
@@ -131,7 +127,7 @@ def create_component(params, comp='Slider'):
131
 
132
 
133
  def create_multimodal_input(upload_image_disabled=False, upload_video_disabled=False):
134
- return mgr.MultimodalInput(upload_image_button_props={'label': 'Upload Image', 'disabled': upload_image_disabled, 'file_count': 'multiple'},
135
  upload_video_button_props={'label': 'Upload Video', 'disabled': upload_video_disabled, 'file_count': 'single'},
136
  submit_button_props={'label': 'Submit'})
137
 
@@ -139,6 +135,8 @@ def create_multimodal_input(upload_image_disabled=False, upload_video_disabled=F
139
  @spaces.GPU(duration=120)
140
  def chat(img, msgs, ctx, params=None, vision_hidden_states=None):
141
  try:
 
 
142
  print('msgs:', msgs)
143
  answer = model.chat(
144
  image=None,
@@ -146,17 +144,18 @@ def chat(img, msgs, ctx, params=None, vision_hidden_states=None):
146
  tokenizer=tokenizer,
147
  **params
148
  )
149
- res = re.sub(r'(<box>.*</box>)', '', answer)
150
- res = res.replace('<ref>', '')
151
- res = res.replace('</ref>', '')
152
- res = res.replace('<box>', '')
153
- answer = res.replace('</box>', '')
154
- print('answer:', answer)
155
- return 0, answer, None, None
 
156
  except Exception as e:
157
  print(e)
158
  traceback.print_exc()
159
- return -1, ERROR_MSG, None, None
160
 
161
 
162
  def encode_image(image):
@@ -270,11 +269,7 @@ def count_video_frames(_context):
270
  return num_frames
271
 
272
 
273
- def respond(_question, _chat_bot, _app_cfg, params_form):
274
- print("[respond] question:", _question)
275
- _context = _app_cfg['ctx'].copy()
276
- _context.append({'role': 'user', 'content': encode_message(_question)})
277
-
278
  images_cnt = _app_cfg['images_cnt']
279
  videos_cnt = _app_cfg['videos_cnt']
280
  files_cnts = check_has_videos(_question)
@@ -284,47 +279,67 @@ def respond(_question, _chat_bot, _app_cfg, params_form):
284
  if files_cnts[1] + videos_cnt + files_cnts[0] + images_cnt <= 0:
285
  gr.Warning("Please chat with at least one image or video.")
286
  return _question, _chat_bot, _app_cfg
287
-
288
- if params_form == 'Beam Search':
289
- params = {
290
- 'sampling': False,
291
- 'num_beams': 3,
292
- 'repetition_penalty': 1.2,
293
- "max_new_tokens": 2048
294
- }
295
- else:
296
- params = {
297
- 'sampling': True,
298
- 'top_p': 0.8,
299
- 'top_k': 100,
300
- 'temperature': 0.7,
301
- 'repetition_penalty': 1.05,
302
- "max_new_tokens": 2048
303
- }
304
- params["max_inp_length"] = 4352 # 4096+256
305
-
306
- if files_cnts[1] + videos_cnt > 0:
307
- #params["max_inp_length"] = 4352 # 4096+256
308
- params["use_image_id"] = False
309
- params["max_slice_nums"] = 1 if count_video_frames(_context) > 16 else 2
310
-
311
- code, _answer, _, sts = chat("", _context, None, params)
312
-
313
  images_cnt += files_cnts[0]
314
  videos_cnt += files_cnts[1]
315
- _context.append({"role": "assistant", "content": [make_text(_answer)]})
316
- _chat_bot.append((_question, _answer))
317
- if code == 0:
318
- _app_cfg['ctx']=_context
319
- _app_cfg['sts']=sts
320
  _app_cfg['images_cnt'] = images_cnt
321
  _app_cfg['videos_cnt'] = videos_cnt
322
-
323
  upload_image_disabled = videos_cnt > 0
324
  upload_video_disabled = videos_cnt > 0 or images_cnt > 0
325
  return create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg
326
 
327
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
  def fewshot_add_demonstration(_image, _user_message, _assistant_message, _chat_bot, _app_cfg):
329
  ctx = _app_cfg["ctx"]
330
  message_item = []
@@ -332,6 +347,7 @@ def fewshot_add_demonstration(_image, _user_message, _assistant_message, _chat_b
332
  image = Image.open(_image).convert("RGB")
333
  ctx.append({"role": "user", "content": [encode_image(image), make_text(_user_message)]})
334
  message_item.append({"text": "[mm_media]1[/mm_media]" + _user_message, "files": [_image]})
 
335
  else:
336
  if _user_message:
337
  ctx.append({"role": "user", "content": [make_text(_user_message)]})
@@ -348,65 +364,29 @@ def fewshot_add_demonstration(_image, _user_message, _assistant_message, _chat_b
348
  return None, "", "", _chat_bot, _app_cfg
349
 
350
 
351
- def fewshot_respond(_image, _user_message, _chat_bot, _app_cfg, params_form):
352
- user_message_contents = []
353
- _context = _app_cfg["ctx"].copy()
354
- images_cnt = _app_cfg["images_cnt"]
355
- if _image:
356
- image = Image.open(_image).convert("RGB")
357
- user_message_contents += [encode_image(image)]
358
- images_cnt += 1
359
- if _user_message:
360
- user_message_contents += [make_text(_user_message)]
361
- if user_message_contents:
362
- _context.append({"role": "user", "content": user_message_contents})
363
-
364
- if params_form == 'Beam Search':
365
- params = {
366
- 'sampling': False,
367
- 'num_beams': 3,
368
- 'repetition_penalty': 1.2,
369
- "max_new_tokens": 2048
370
- }
371
- else:
372
- params = {
373
- 'sampling': True,
374
- 'top_p': 0.8,
375
- 'top_k': 100,
376
- 'temperature': 0.7,
377
- 'repetition_penalty': 1.05,
378
- "max_new_tokens": 2048
379
- }
380
-
381
- if images_cnt == 0:
382
- gr.Warning("Please chat with at least one image or video.")
383
- return _image, _user_message, '', _chat_bot, _app_cfg
384
-
385
- code, _answer, _, sts = chat("", _context, None, params)
386
-
387
- _context.append({"role": "assistant", "content": [make_text(_answer)]})
388
-
389
  if _image:
390
  _chat_bot.append([
391
  {"text": "[mm_media]1[/mm_media]" + _user_message, "files": [_image]},
392
- {"text": _answer, "files": []}
393
  ])
 
394
  else:
395
  _chat_bot.append([
396
  {"text": _user_message, "files": [_image]},
397
- {"text": _answer, "files": []}
398
  ])
399
- if code == 0:
400
- _app_cfg['ctx']=_context
401
- _app_cfg['sts']=sts
402
- _app_cfg['images_cnt'] = images_cnt
403
  return None, '', '', _chat_bot, _app_cfg
404
 
405
 
406
- def regenerate_button_clicked(_question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg, params_form):
407
  if len(_chat_bot) <= 1 or not _chat_bot[-1][1]:
408
  gr.Warning('No question for regeneration.')
409
- return '', _image, _user_message, _assistant_message, _chat_bot, _app_cfg
410
  if _app_cfg["chat_type"] == "Chat":
411
  images_cnt = _app_cfg['images_cnt']
412
  videos_cnt = _app_cfg['videos_cnt']
@@ -418,10 +398,9 @@ def regenerate_button_clicked(_question, _image, _user_message, _assistant_messa
418
  videos_cnt -= files_cnts[1]
419
  _app_cfg['images_cnt'] = images_cnt
420
  _app_cfg['videos_cnt'] = videos_cnt
421
- upload_image_disabled = videos_cnt > 0
422
- upload_video_disabled = videos_cnt > 0 or images_cnt > 0
423
- _question, _chat_bot, _app_cfg = respond(_question, _chat_bot, _app_cfg, params_form)
424
- return _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
425
  else:
426
  last_message = _chat_bot[-1][0]
427
  last_image = None
@@ -430,10 +409,9 @@ def regenerate_button_clicked(_question, _image, _user_message, _assistant_messa
430
  last_user_message = last_message.text
431
  if last_message.files:
432
  last_image = last_message.files[0].file.path
433
- _chat_bot = _chat_bot[:-1]
434
  _app_cfg['ctx'] = _app_cfg['ctx'][:-2]
435
- _image, _user_message, _assistant_message, _chat_bot, _app_cfg = fewshot_respond(last_image, last_user_message, _chat_bot, _app_cfg, params_form)
436
- return _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
437
 
438
 
439
  def flushed():
@@ -469,7 +447,6 @@ init_conversation = [
469
 
470
 
471
  css = """
472
- video { height: auto !important; }
473
  .example label { font-size: 16px;}
474
  """
475
 
@@ -503,9 +480,13 @@ with gr.Blocks(css=css) as demo:
503
  chat_tab_label = gr.Textbox(value="Chat", interactive=False, visible=False)
504
 
505
  txt_message.submit(
506
- respond,
507
- [txt_message, chat_bot, app_session, params_form],
508
  [txt_message, chat_bot, app_session]
 
 
 
 
509
  )
510
 
511
  with gr.Tab("Few Shot") as fewshot_tab:
@@ -525,9 +506,13 @@ with gr.Blocks(css=css) as demo:
525
  [image_input, user_message, assistant_message, chat_bot, app_session]
526
  )
527
  generate_button.click(
528
- fewshot_respond,
529
- [image_input, user_message, chat_bot, app_session, params_form],
530
  [image_input, user_message, assistant_message, chat_bot, app_session]
 
 
 
 
531
  )
532
 
533
  chat_tab.select(
@@ -556,8 +541,12 @@ with gr.Blocks(css=css) as demo:
556
  )
557
  regenerate.click(
558
  regenerate_button_clicked,
559
- [txt_message, image_input, user_message, assistant_message, chat_bot, app_session, params_form],
560
  [txt_message, image_input, user_message, assistant_message, chat_bot, app_session]
 
 
 
 
561
  )
562
  clear_button.click(
563
  clear,
 
26
  # For Mac with MPS (Apple silicon or AMD GPUs).
27
  # PYTORCH_ENABLE_MPS_FALLBACK=1 python web_demo_2.6.py --device mps
28
 
 
 
 
 
29
  # Argparser
30
  parser = argparse.ArgumentParser(description='demo')
31
  parser.add_argument('--device', type=str, default='cuda', help='cuda or mps')
 
127
 
128
 
129
  def create_multimodal_input(upload_image_disabled=False, upload_video_disabled=False):
130
+ return mgr.MultimodalInput(value=None, upload_image_button_props={'label': 'Upload Image', 'disabled': upload_image_disabled, 'file_count': 'multiple'},
131
  upload_video_button_props={'label': 'Upload Video', 'disabled': upload_video_disabled, 'file_count': 'single'},
132
  submit_button_props={'label': 'Submit'})
133
 
 
135
  @spaces.GPU(duration=120)
136
  def chat(img, msgs, ctx, params=None, vision_hidden_states=None):
137
  try:
138
+ if msgs[-1]['role'] == 'assistant':
139
+ msgs = msgs[:-1] # remove last which is added for streaming
140
  print('msgs:', msgs)
141
  answer = model.chat(
142
  image=None,
 
144
  tokenizer=tokenizer,
145
  **params
146
  )
147
+ if params['stream'] is False:
148
+ res = re.sub(r'(<box>.*</box>)', '', answer)
149
+ res = res.replace('<ref>', '')
150
+ res = res.replace('</ref>', '')
151
+ res = res.replace('<box>', '')
152
+ answer = res.replace('</box>', '')
153
+ for char in answer:
154
+ yield char
155
  except Exception as e:
156
  print(e)
157
  traceback.print_exc()
158
+ yield ERROR_MSG
159
 
160
 
161
  def encode_image(image):
 
269
  return num_frames
270
 
271
 
272
+ def request(_question, _chat_bot, _app_cfg):
 
 
 
 
273
  images_cnt = _app_cfg['images_cnt']
274
  videos_cnt = _app_cfg['videos_cnt']
275
  files_cnts = check_has_videos(_question)
 
279
  if files_cnts[1] + videos_cnt + files_cnts[0] + images_cnt <= 0:
280
  gr.Warning("Please chat with at least one image or video.")
281
  return _question, _chat_bot, _app_cfg
282
+ _chat_bot.append((_question, None))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  images_cnt += files_cnts[0]
284
  videos_cnt += files_cnts[1]
 
 
 
 
 
285
  _app_cfg['images_cnt'] = images_cnt
286
  _app_cfg['videos_cnt'] = videos_cnt
 
287
  upload_image_disabled = videos_cnt > 0
288
  upload_video_disabled = videos_cnt > 0 or images_cnt > 0
289
  return create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg
290
 
291
 
292
+ def respond(_chat_bot, _app_cfg, params_form):
293
+ if len(_app_cfg) == 0:
294
+ yield (_chat_bot, _app_cfg)
295
+ elif _app_cfg['images_cnt'] == 0 and _app_cfg['videos_cnt'] == 0:
296
+ yield(_chat_bot, _app_cfg)
297
+ else:
298
+ _question = _chat_bot[-1][0]
299
+ _context = _app_cfg['ctx'].copy()
300
+ _context.append({'role': 'user', 'content': encode_message(_question)})
301
+
302
+ videos_cnt = _app_cfg['videos_cnt']
303
+
304
+ if params_form == 'Beam Search':
305
+ params = {
306
+ 'sampling': False,
307
+ 'stream': False,
308
+ 'num_beams': 3,
309
+ 'repetition_penalty': 1.2,
310
+ "max_new_tokens": 2048
311
+ }
312
+ else:
313
+ params = {
314
+ 'sampling': True,
315
+ 'stream': True,
316
+ 'top_p': 0.8,
317
+ 'top_k': 100,
318
+ 'temperature': 0.7,
319
+ 'repetition_penalty': 1.05,
320
+ "max_new_tokens": 2048
321
+ }
322
+ params["max_inp_length"] = 4352 # 4096+256
323
+
324
+ if videos_cnt > 0:
325
+ #params["max_inp_length"] = 4352 # 4096+256
326
+ params["use_image_id"] = False
327
+ params["max_slice_nums"] = 1 if count_video_frames(_context) > 16 else 2
328
+
329
+ gen = chat("", _context, None, params)
330
+
331
+ _context.append({"role": "assistant", "content": [""]})
332
+ _chat_bot[-1][1] = ""
333
+
334
+ for _char in gen:
335
+ _chat_bot[-1][1] += _char
336
+ _context[-1]["content"][0] += _char
337
+ yield (_chat_bot, _app_cfg)
338
+
339
+ _app_cfg['ctx']=_context
340
+ yield (_chat_bot, _app_cfg)
341
+
342
+
343
  def fewshot_add_demonstration(_image, _user_message, _assistant_message, _chat_bot, _app_cfg):
344
  ctx = _app_cfg["ctx"]
345
  message_item = []
 
347
  image = Image.open(_image).convert("RGB")
348
  ctx.append({"role": "user", "content": [encode_image(image), make_text(_user_message)]})
349
  message_item.append({"text": "[mm_media]1[/mm_media]" + _user_message, "files": [_image]})
350
+ _app_cfg["images_cnt"] += 1
351
  else:
352
  if _user_message:
353
  ctx.append({"role": "user", "content": [make_text(_user_message)]})
 
364
  return None, "", "", _chat_bot, _app_cfg
365
 
366
 
367
+ def fewshot_request(_image, _user_message, _chat_bot, _app_cfg):
368
+ if _app_cfg["images_cnt"] == 0 and not _image:
369
+ gr.Warning("Please chat with at least one image.")
370
+ return None, '', '', _chat_bot, _app_cfg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
  if _image:
372
  _chat_bot.append([
373
  {"text": "[mm_media]1[/mm_media]" + _user_message, "files": [_image]},
374
+ ""
375
  ])
376
+ _app_cfg["images_cnt"] += 1
377
  else:
378
  _chat_bot.append([
379
  {"text": _user_message, "files": [_image]},
380
+ ""
381
  ])
382
+
 
 
 
383
  return None, '', '', _chat_bot, _app_cfg
384
 
385
 
386
+ def regenerate_button_clicked(_chat_bot, _app_cfg):
387
  if len(_chat_bot) <= 1 or not _chat_bot[-1][1]:
388
  gr.Warning('No question for regeneration.')
389
+ return None, None, '', '', _chat_bot, _app_cfg
390
  if _app_cfg["chat_type"] == "Chat":
391
  images_cnt = _app_cfg['images_cnt']
392
  videos_cnt = _app_cfg['videos_cnt']
 
398
  videos_cnt -= files_cnts[1]
399
  _app_cfg['images_cnt'] = images_cnt
400
  _app_cfg['videos_cnt'] = videos_cnt
401
+
402
+ _question, _chat_bot, _app_cfg = request(_question, _chat_bot, _app_cfg)
403
+ return _question, None, '', '', _chat_bot, _app_cfg
 
404
  else:
405
  last_message = _chat_bot[-1][0]
406
  last_image = None
 
409
  last_user_message = last_message.text
410
  if last_message.files:
411
  last_image = last_message.files[0].file.path
412
+ _chat_bot[-1][1] = ""
413
  _app_cfg['ctx'] = _app_cfg['ctx'][:-2]
414
+ return _question, None, '', '', _chat_bot, _app_cfg
 
415
 
416
 
417
  def flushed():
 
447
 
448
 
449
  css = """
 
450
  .example label { font-size: 16px;}
451
  """
452
 
 
480
  chat_tab_label = gr.Textbox(value="Chat", interactive=False, visible=False)
481
 
482
  txt_message.submit(
483
+ request,
484
+ [txt_message, chat_bot, app_session],
485
  [txt_message, chat_bot, app_session]
486
+ ).then(
487
+ respond,
488
+ [chat_bot, app_session, params_form],
489
+ [chat_bot, app_session]
490
  )
491
 
492
  with gr.Tab("Few Shot") as fewshot_tab:
 
506
  [image_input, user_message, assistant_message, chat_bot, app_session]
507
  )
508
  generate_button.click(
509
+ fewshot_request,
510
+ [image_input, user_message, chat_bot, app_session],
511
  [image_input, user_message, assistant_message, chat_bot, app_session]
512
+ ).then(
513
+ respond,
514
+ [chat_bot, app_session, params_form],
515
+ [chat_bot, app_session]
516
  )
517
 
518
  chat_tab.select(
 
541
  )
542
  regenerate.click(
543
  regenerate_button_clicked,
544
+ [chat_bot, app_session],
545
  [txt_message, image_input, user_message, assistant_message, chat_bot, app_session]
546
+ ).then(
547
+ respond,
548
+ [chat_bot, app_session, params_form],
549
+ [chat_bot, app_session]
550
  )
551
  clear_button.click(
552
  clear,
requirements.txt CHANGED
@@ -6,6 +6,7 @@ sentencepiece==0.1.99
6
  https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.2/flash_attn-2.6.2+cu123torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
7
  opencv-python
8
  decord
9
- gradio==4.22.0
 
10
  http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/modelscope_studio-0.4.0.9-py3-none-any.whl
11
  accelerate
 
6
  https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.2/flash_attn-2.6.2+cu123torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
7
  opencv-python
8
  decord
9
+ #gradio==4.22.0
10
+ gradio==4.41.0
11
  http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/modelscope_studio-0.4.0.9-py3-none-any.whl
12
  accelerate