laizeqiang committed on
Commit ee25e9d
1 Parent(s): d65cc15
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. README.md +1 -1
  2. iChatApp.py → app.py +406 -387
  3. iChat/models/grit_src/third_party/CenterNet2/detectron2/model_zoo/__init__.py +0 -10
  4. iChat/models/grit_src/third_party/CenterNet2/detectron2/model_zoo/model_zoo.py +0 -213
  5. {iChat → iGPT}/__init__.py +0 -0
  6. {iChat → iGPT}/chatbot/__init__.py +0 -0
  7. {iChat → iGPT}/chatbot/chatbot.py +0 -0
  8. {iChat → iGPT}/models/__init__.py +0 -0
  9. {iChat → iGPT}/models/grit_model.py +0 -0
  10. {iChat → iGPT}/models/grit_src/configs/Base.yaml +0 -0
  11. {iChat → iGPT}/models/grit_src/configs/GRiT_B_DenseCap.yaml +0 -0
  12. {iChat → iGPT}/models/grit_src/configs/GRiT_B_DenseCap_ObjectDet.yaml +0 -0
  13. {iChat → iGPT}/models/grit_src/configs/GRiT_B_ObjectDet.yaml +0 -0
  14. {iChat → iGPT}/models/grit_src/configs/GRiT_H_ObjectDet.yaml +0 -0
  15. {iChat → iGPT}/models/grit_src/configs/GRiT_L_ObjectDet.yaml +0 -0
  16. {iChat → iGPT}/models/grit_src/grit/__init__.py +0 -0
  17. {iChat → iGPT}/models/grit_src/grit/config.py +0 -0
  18. {iChat → iGPT}/models/grit_src/grit/custom_solver.py +0 -0
  19. {iChat → iGPT}/models/grit_src/grit/data/custom_build_augmentation.py +0 -0
  20. {iChat → iGPT}/models/grit_src/grit/data/custom_dataset_dataloader.py +0 -0
  21. {iChat → iGPT}/models/grit_src/grit/data/custom_dataset_mapper.py +0 -0
  22. {iChat → iGPT}/models/grit_src/grit/data/datasets/grit_coco.py +0 -0
  23. {iChat → iGPT}/models/grit_src/grit/data/datasets/object365.py +0 -0
  24. {iChat → iGPT}/models/grit_src/grit/data/datasets/vg.py +0 -0
  25. {iChat → iGPT}/models/grit_src/grit/data/transforms/custom_augmentation_impl.py +0 -0
  26. {iChat → iGPT}/models/grit_src/grit/data/transforms/custom_transform.py +0 -0
  27. {iChat → iGPT}/models/grit_src/grit/evaluation/eval.py +0 -0
  28. {iChat → iGPT}/models/grit_src/grit/modeling/backbone/utils.py +0 -0
  29. {iChat → iGPT}/models/grit_src/grit/modeling/backbone/vit.py +0 -0
  30. {iChat → iGPT}/models/grit_src/grit/modeling/meta_arch/grit.py +0 -0
  31. {iChat → iGPT}/models/grit_src/grit/modeling/roi_heads/grit_fast_rcnn.py +0 -0
  32. {iChat → iGPT}/models/grit_src/grit/modeling/roi_heads/grit_roi_heads.py +1 -1
  33. {iChat → iGPT}/models/grit_src/grit/modeling/soft_nms.py +0 -0
  34. {iChat → iGPT}/models/grit_src/grit/modeling/text/file_utils.py +0 -0
  35. {iChat → iGPT}/models/grit_src/grit/modeling/text/load_text_token.py +0 -0
  36. {iChat → iGPT}/models/grit_src/grit/modeling/text/modeling_bert.py +0 -0
  37. {iChat → iGPT}/models/grit_src/grit/modeling/text/text_decoder.py +0 -0
  38. {iChat → iGPT}/models/grit_src/grit/predictor.py +0 -0
  39. {iChat → iGPT}/models/grit_src/image_dense_captions.py +2 -2
  40. {iChat → iGPT}/models/grit_src/third_party/CenterNet2/.circleci/config.yml +0 -0
  41. {iChat → iGPT}/models/grit_src/third_party/CenterNet2/.clang-format +0 -0
  42. {iChat → iGPT}/models/grit_src/third_party/CenterNet2/.flake8 +0 -0
  43. {iChat → iGPT}/models/grit_src/third_party/CenterNet2/.gitignore +0 -0
  44. {iChat → iGPT}/models/grit_src/third_party/CenterNet2/GETTING_STARTED.md +0 -0
  45. {iChat → iGPT}/models/grit_src/third_party/CenterNet2/INSTALL.md +0 -0
  46. {iChat → iGPT}/models/grit_src/third_party/CenterNet2/LICENSE +0 -0
  47. {iChat → iGPT}/models/grit_src/third_party/CenterNet2/MODEL_ZOO.md +0 -0
  48. {iChat → iGPT}/models/grit_src/third_party/CenterNet2/README.md +0 -0
  49. {iChat → iGPT}/models/grit_src/third_party/CenterNet2/README_D2.md +0 -0
  50. {iChat → iGPT}/models/grit_src/third_party/CenterNet2/configs/Base-RCNN-C4.yaml +0 -0
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: indigo
  colorTo: pink
  sdk: gradio
  sdk_version: 3.28.1
- app_file: iChatApp.py
+ app_file: app.py
  pinned: false
  license: apache-2.0
  ---
iChatApp.py → app.py RENAMED
@@ -2,15 +2,15 @@
2
  import os
3
  os.environ['CURL_CA_BUNDLE'] = ''
4
 
5
- from pathlib import Path
6
- import sys
7
- sys.path.insert(0, str(Path(__file__).resolve().parent / "third-party" / "lama"))
8
-
9
  try:
10
  import detectron
11
  except:
12
  os.system('pip install git+https://github.com/facebookresearch/detectron2.git')
13
 
14
  import random
15
  import torch
16
  import cv2
@@ -41,13 +41,14 @@ from langchain.agents.tools import Tool
41
  from langchain.chains.conversation.memory import ConversationBufferMemory
42
  from langchain.llms.openai import OpenAI
43
 
44
- from iChat.models import VideoCaption, ActionRecognition, DenseCaption, GenerateTikTokVideo
45
- from iChat.models import HuskyVQA, LDMInpainting
46
- from iChat.models.utils import (cal_dilate_factor, dilate_mask, gen_new_name,
47
  seed_everything, prompts, blend_gt2pt)
48
 
49
- from segment_anything.utils.amg import remove_small_regions
50
- from segment_anything import build_sam, sam_model_registry, SamAutomaticMaskGenerator, SamPredictor
 
51
  from bark import SAMPLE_RATE, generate_audio
52
 
53
  import matplotlib.pyplot as plt
@@ -57,24 +58,25 @@ import easyocr
57
  from saicinpainting.evaluation.utils import move_to_device
58
  from saicinpainting.training.trainers import load_checkpoint
59
  from saicinpainting.evaluation.data import pad_tensor_to_modulo
 
60
 
61
  # openai.api_base = 'https://closeai.deno.dev/v1'
62
 
63
  GLOBAL_SEED=1912
64
 
65
- INTERN_CHAT_PREFIX = """InternChat is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. InternChat is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
66
 
67
- InternChat is able to process and understand large amounts of text and images. As a language model, InternChat can not directly read images, but it has a list of tools to finish different visual tasks. Each image will have a file name formed as "image/xxx.png", and InternChat can invoke different tools to indirectly understand pictures. When talking about images, InternChat is very strict to the file name and will never fabricate nonexistent files. When using tools to generate new image files, InternChat is also known that the image may not be the same as the user's demand, and will use other visual question answering tools or description tools to observe the real image. InternChat is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the image content and image file name. It will remember to provide the file name from the last tool observation, if a new image is generated.
68
 
69
- Human may provide new figures to InternChat with a description. The description helps InternChat to understand this image, but InternChat should use tools to finish following tasks, rather than directly imagine from the description.
70
 
71
- Overall, InternChat is a powerful visual dialogue assistant tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics.
72
 
73
 
74
  TOOLS:
75
  ------
76
 
77
- InternChat has access to the following tools:"""
78
 
79
  INTERN_CHAT_FORMAT_INSTRUCTIONS = """To use a tool, please use the following format:
80
 
@@ -102,23 +104,23 @@ Previous conversation history:
102
  {chat_history}
103
 
104
  New input: {input}
105
- Since InternChat is a text language model, InternChat must use tools to observe images rather than imagination.
106
- The thoughts and observations are only visible for InternChat, InternChat should remember to repeat important information in the final response for Human.
107
  Thought: Do I need to use a tool? {agent_scratchpad} Let's think step by step.
108
  """
109
 
110
- INTERN_CHAT_PREFIX_CN = """InternChat 旨在能够协助完成范围广泛的文本和视觉相关任务,从回答简单的问题到提供对广泛主题的深入解释和讨论。 InternChat 能够根据收到的输入生成类似人类的文本,使其能够进行听起来自然的对话,并提供连贯且与手头主题相关的响应。
111
 
112
- InternChat 能够处理和理解大量文本和图像。作为一种语言模型,InternChat 不能直接读取图像,但它有一系列工具来完成不同的视觉任务。每张图片都会有一个文件名,格式为“image/xxx.png”,InternChat可以调用不同的工具来间接理解图片。在谈论图片时,InternChat 对文件名的要求非常严格,绝不会伪造不存在的文件。在使用工具生成新的图像文件时,InternChat也知道图像可能与用户需求不一样,会使用其他视觉问答工具或描述工具来观察真实图像。 InternChat 能够按顺序使用工具,并且忠于工具观察输出,而不是伪造图像内容和图像文件名。如果生成新图像,它将记得提供上次工具观察的文件名。
113
 
114
- Human 可能会向 InternChat 提供带有描述的新图形。描述帮助 InternChat 理解这个图像,但 InternChat 应该使用工具来完成以下任务,而不是直接从描述中想象。有些工具将会返回英文描述,但你对用户的聊天应当采用中文。
115
 
116
- 总的来说,InternChat 是一个强大的可视化对话辅助工具,可以帮助处理范围广泛的任务,并提供关于范围广泛的主题的有价值的见解和信息。
117
 
118
  工具列表:
119
  ------
120
 
121
- InternChat 可以使用这些工具:"""
122
 
123
  INTERN_CHAT_FORMAT_INSTRUCTIONS_CN = """用户使用中文和你进行聊天,但是工具的参数应当使用英文。如果要调用工具,你必须遵循如下格式:
124
 
@@ -142,8 +144,8 @@ INTERN_CHAT_SUFFIX_CN = """你对文件名的正确性非常严格,而且永
142
 
143
  开始!
144
 
145
- 因为InternChat是一个文本语言模型,必须使用工具去观察图片而不是依靠想象。
146
- 推理想法和观察结果只对InternChat可见,需要记得在最终回复时把重要的信息重复给用户,你只能给用户返回中文句子。我们一步一步思考。在你使用工具时,工具的参数只能是英文。
147
 
148
  聊天历史:
149
  {chat_history}
@@ -155,51 +157,6 @@ Thought: Do I need to use a tool? {agent_scratchpad}
155
  os.makedirs('image', exist_ok=True)
156
 
157
 
158
- def cut_dialogue_history(history_memory, keep_last_n_words=500):
159
- if history_memory is None or len(history_memory) == 0:
160
- return history_memory
161
- tokens = history_memory.split()
162
- n_tokens = len(tokens)
163
- print(f"history_memory:{history_memory}, n_tokens: {n_tokens}")
164
- if n_tokens < keep_last_n_words:
165
- return history_memory
166
- paragraphs = history_memory.split('\n')
167
- last_n_tokens = n_tokens
168
- while last_n_tokens >= keep_last_n_words:
169
- last_n_tokens -= len(paragraphs[0].split(' '))
170
- paragraphs = paragraphs[1:]
171
- return '\n' + '\n'.join(paragraphs)
172
-
173
-
174
- def login_with_key(bot, debug, api_key):
175
- # Just for debug
176
- print('===>logging in')
177
- if debug:
178
- bot.init_agent()
179
- return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False, value='')
180
- else:
181
- import openai
182
- from langchain.llms.openai import OpenAI
183
- if api_key and len(api_key) > 30:
184
- os.environ["OPENAI_API_KEY"] = api_key
185
- openai.api_key = api_key
186
- try:
187
- llm = OpenAI(temperature=0)
188
- llm('Hi!')
189
- response = 'Success!'
190
- is_error = False
191
- bot.init_agent()
192
- except:
193
- # gr.update(visible=True)
194
- response = 'Incorrect key, please input again'
195
- is_error = True
196
- else:
197
- is_error = True
198
- response = 'Incorrect key, please input again'
199
-
200
- return gr.update(visible=not is_error), gr.update(visible=is_error), gr.update(visible=is_error, value=response)
201
-
202
-
203
  class InstructPix2Pix:
204
  def __init__(self, device):
205
  print(f"Initializing InstructPix2Pix to {device}")
@@ -292,7 +249,6 @@ class CannyText2Image:
292
  torch_dtype=self.torch_dtype)
293
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
294
  self.pipe.to(device)
295
- self.seed = -1
296
  self.a_prompt = 'best quality, extremely detailed'
297
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
298
  'fewer digits, cropped, worst quality, low quality'
@@ -306,13 +262,15 @@ class CannyText2Image:
306
  def inference(self, inputs):
307
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
308
  image = Image.open(image_path)
309
- self.seed = random.randint(0, 65535)
310
- seed_everything(self.seed)
 
311
  prompt = f'{instruct_text}, {self.a_prompt}'
312
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
313
  guidance_scale=9.0).images[0]
314
  # updated_image_path = get_new_image_name(image_path, func_name="canny2image")
315
  updated_image_path = gen_new_name(image_path, f'{type(self).__name__}')
 
316
  image.save(updated_image_path)
317
  print(f"\nProcessed CannyText2Image, Input Canny: {image_path}, Input Text: {instruct_text}, "
318
  f"Output Text: {updated_image_path}")
@@ -351,7 +309,6 @@ class LineText2Image:
351
  )
352
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
353
  self.pipe.to(device)
354
- self.seed = -1
355
  self.a_prompt = 'best quality, extremely detailed'
356
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
357
  'fewer digits, cropped, worst quality, low quality'
@@ -366,13 +323,15 @@ class LineText2Image:
366
  def inference(self, inputs):
367
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
368
  image = Image.open(image_path)
369
- self.seed = random.randint(0, 65535)
370
- seed_everything(self.seed)
 
371
  prompt = f'{instruct_text}, {self.a_prompt}'
372
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
373
  guidance_scale=9.0).images[0]
374
  # updated_image_path = get_new_image_name(image_path, func_name="line2image")
375
  updated_image_path = gen_new_name(image_path, f'{type(self).__name__}')
 
376
  image.save(updated_image_path)
377
  print(f"\nProcessed LineText2Image, Input Line: {image_path}, Input Text: {instruct_text}, "
378
  f"Output Text: {updated_image_path}")
@@ -411,7 +370,6 @@ class HedText2Image:
411
  )
412
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
413
  self.pipe.to(device)
414
- self.seed = -1
415
  self.a_prompt = 'best quality, extremely detailed'
416
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
417
  'fewer digits, cropped, worst quality, low quality'
@@ -426,13 +384,15 @@ class HedText2Image:
426
  def inference(self, inputs):
427
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
428
  image = Image.open(image_path)
429
- self.seed = random.randint(0, 65535)
430
- seed_everything(self.seed)
 
431
  prompt = f'{instruct_text}, {self.a_prompt}'
432
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
433
  guidance_scale=9.0).images[0]
434
  # updated_image_path = get_new_image_name(image_path, func_name="hed2image")
435
  updated_image_path = gen_new_name(image_path, f'{type(self).__name__}')
 
436
  image.save(updated_image_path)
437
  print(f"\nProcessed HedText2Image, Input Hed: {image_path}, Input Text: {instruct_text}, "
438
  f"Output Image: {updated_image_path}")
@@ -471,7 +431,6 @@ class ScribbleText2Image:
471
  )
472
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
473
  self.pipe.to(device)
474
- self.seed = -1
475
  self.a_prompt = 'best quality, extremely detailed'
476
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
477
  'fewer digits, cropped, worst quality, low quality'
@@ -484,12 +443,15 @@ class ScribbleText2Image:
484
  def inference(self, inputs):
485
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
486
  image = Image.open(image_path)
 
 
487
  seed_everything(GLOBAL_SEED)
488
  prompt = f'{instruct_text}, {self.a_prompt}'
489
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
490
  guidance_scale=9.0).images[0]
491
  # updated_image_path = get_new_image_name(image_path, func_name="scribble2image")
492
  updated_image_path = gen_new_name(image_path, f'{type(self).__name__}')
 
493
  image.save(updated_image_path)
494
  print(f"\nProcessed ScribbleText2Image, Input Scribble: {image_path}, Input Text: {instruct_text}, "
495
  f"Output Image: {updated_image_path}")
@@ -527,7 +489,6 @@ class PoseText2Image:
527
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
528
  self.pipe.to(device)
529
  self.num_inference_steps = 20
530
- self.seed = -1
531
  self.unconditional_guidance_scale = 9.0
532
  self.a_prompt = 'best quality, extremely detailed'
533
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
@@ -543,13 +504,15 @@ class PoseText2Image:
543
  def inference(self, inputs):
544
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
545
  image = Image.open(image_path)
546
- self.seed = random.randint(0, 65535)
547
- seed_everything(self.seed)
 
548
  prompt = f'{instruct_text}, {self.a_prompt}'
549
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
550
  guidance_scale=9.0).images[0]
551
  # updated_image_path = get_new_image_name(image_path, func_name="pose2image")
552
  updated_image_path = gen_new_name(image_path, f'{type(self).__name__}')
 
553
  image.save(updated_image_path)
554
  print(f"\nProcessed PoseText2Image, Input Pose: {image_path}, Input Text: {instruct_text}, "
555
  f"Output Image: {updated_image_path}")
@@ -567,7 +530,6 @@ class SegText2Image:
567
  torch_dtype=self.torch_dtype)
568
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
569
  self.pipe.to(device)
570
- self.seed = -1
571
  self.a_prompt = 'best quality, extremely detailed'
572
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
573
  ' fewer digits, cropped, worst quality, low quality'
@@ -581,13 +543,15 @@ class SegText2Image:
581
  def inference(self, inputs):
582
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
583
  image = Image.open(image_path)
584
- self.seed = random.randint(0, 65535)
585
- seed_everything(self.seed)
 
586
  prompt = f'{instruct_text}, {self.a_prompt}'
587
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
588
  guidance_scale=9.0).images[0]
589
  # updated_image_path = get_new_image_name(image_path, func_name="segment2image")
590
  updated_image_path = gen_new_name(image_path, f'{type(self).__name__}')
 
591
  image.save(updated_image_path)
592
  print(f"\nProcessed SegText2Image, Input Seg: {image_path}, Input Text: {instruct_text}, "
593
  f"Output Image: {updated_image_path}")
@@ -602,7 +566,6 @@ class ImageText2Image:
602
  # self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
603
  self.SegText2Image = SegText2Image
604
  self.SegmentAnything = SegmentAnything
605
- self.seed = -1
606
  self.a_prompt = 'best quality, extremely detailed'
607
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
608
  ' fewer digits, cropped, worst quality, low quality'
@@ -658,7 +621,6 @@ class DepthText2Image:
658
  torch_dtype=self.torch_dtype)
659
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
660
  self.pipe.to(device)
661
- self.seed = -1
662
  self.a_prompt = 'best quality, extremely detailed'
663
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
664
  ' fewer digits, cropped, worst quality, low quality'
@@ -672,13 +634,15 @@ class DepthText2Image:
672
  def inference(self, inputs):
673
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
674
  image = Image.open(image_path)
675
- self.seed = random.randint(0, 65535)
676
- seed_everything(self.seed)
 
677
  prompt = f'{instruct_text}, {self.a_prompt}'
678
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
679
  guidance_scale=9.0).images[0]
680
  # updated_image_path = get_new_image_name(image_path, func_name="depth2image")
681
  updated_image_path = gen_new_name(image_path, f'{type(self).__name__}')
 
682
  image.save(updated_image_path)
683
  print(f"\nProcessed DepthText2Image, Input Depth: {image_path}, Input Text: {instruct_text}, "
684
  f"Output Image: {updated_image_path}")
@@ -731,7 +695,6 @@ class NormalText2Image:
731
  torch_dtype=self.torch_dtype)
732
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
733
  self.pipe.to(device)
734
- self.seed = -1
735
  self.a_prompt = 'best quality, extremely detailed'
736
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
737
  ' fewer digits, cropped, worst quality, low quality'
@@ -868,23 +831,22 @@ class SegmentAnything:
868
  self.sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
869
  self.predictor = SamPredictor(self.sam)
870
  self.sam.to(device=device)
871
- self.clicked_region = None
872
- self.img_path = None
873
- self.history_mask_res = None
874
 
875
  @prompts(name="Segment Anything on Image",
876
  description="useful when you want to segment anything in the image. "
877
  "like: segment anything from this image, "
878
- "The input to this tool should be a string, representing the image_path")
 
879
  def inference(self, inputs):
880
  print("Inputs: ", inputs)
 
881
  img_path = inputs.strip()
882
- self.img_path = img_path
883
- annos = self.segment_anything(img_path)
884
  full_img, _ = self.show_annos(annos)
885
- # full_img = Image.fromarray(full_img)
886
- # res = Image.fromarray(res)
887
- # print(os.path.splitext(img_path))
888
  seg_all_image_path = gen_new_name(img_path, 'seg')
889
  full_img.save(seg_all_image_path, "PNG")
890
 
@@ -894,27 +856,25 @@ class SegmentAnything:
894
  @prompts(name="Segment the Clicked Region in the Image",
895
  description="useful when you want to segment the masked region or block in the image. "
896
  "like: segment the masked region in this image, "
897
- "The input to this tool should be None.")
898
- def inference_by_mask(self, inputs=None):
 
899
  # mask = np.array(Image.open(mask_path).convert('L'))
900
- res_mask = self.segment_by_mask(self.clicked_region)
901
 
902
- if self.history_mask_res is None:
903
- self.history_mask_res = res_mask
904
- else:
905
- self.history_mask_res = np.logical_or(self.history_mask_res, res_mask)
906
-
907
- res_mask = self.history_mask_res.astype(np.uint8)*255
908
- # res_mask = self.dilate_mask(res_mask)
909
  filaname = gen_new_name(self.img_path, 'mask')
910
  mask_img = Image.fromarray(res_mask)
911
  mask_img.save(filaname, "PNG")
912
  return filaname
913
 
914
- def segment_by_mask(self, mask=None):
915
  random.seed(GLOBAL_SEED)
916
- if mask is None:
917
- mask = self.clicked_region
918
  idxs = np.nonzero(mask)
919
  num_points = min(max(1, int(len(idxs[0]) * 0.01)), 16)
920
  sampled_idx = random.sample(range(0, len(idxs[0])), num_points)
@@ -925,6 +885,7 @@ class SegmentAnything:
925
  labels = np.array([1] * num_points)
926
 
927
  res_masks, scores, _ = self.predictor.predict(
 
928
  point_coords=points,
929
  point_labels=labels,
930
  multimask_output=True,
@@ -933,10 +894,9 @@ class SegmentAnything:
933
  return res_masks[np.argmax(scores), :, :]
934
 
935
 
936
- def segment_anything(self, img_path):
937
- img = cv2.imread(img_path)
938
- img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
939
-
940
  mask_generator = SamAutomaticMaskGenerator(self.sam)
941
  annos = mask_generator.generate(img)
942
  return annos
@@ -947,16 +907,9 @@ class SegmentAnything:
947
 
948
  return detection_map
949
 
950
- def preprocess(self, img, img_path):
951
- self.predictor.set_image(img)
952
- self.img_path = img_path
953
 
954
- def reset(self):
955
- self.predictor.reset_image()
956
- self.clicked_region = None
957
- self.img_path = None
958
- self.history_mask_res = None
959
-
960
  def show_annos(self, anns):
961
  # From https://github.com/sail-sg/EditAnything/blob/main/sam2image.py#L91
962
  if len(anns) == 0:
@@ -1078,30 +1031,37 @@ class ImageOCRRecognition:
1078
  print(f"Initializing ImageOCRRecognition to {device}")
1079
  self.device = device
1080
  self.reader = easyocr.Reader(['ch_sim', 'en'], gpu=device) # this needs to run only once to load the model into memory
1081
- self.result = None
1082
- self.image_path=None
1083
- self.clicked_region = None
1084
-
1085
  @prompts(name="recognize the optical characters in the image",
1086
  description="useful when you want to recognize the characters or words in the clicked region of image. "
1087
  "like: recognize the characters or words in the clicked region."
1088
  "The input to this tool should be a comma separated string of two, "
1089
- "The input to this tool should be None.")
1090
  def inference_by_mask(self, inputs=None):
1091
- mask = self.clicked_region
 
1092
  inds =np.where(mask != 0)
1093
  inds = (inds[0][::8], inds[1][::8])
1094
-
1095
- if self.image_path is None or len(inds[0]) == 0:
1096
  # self.result = self.reader.readtext(image_path)
1097
  return 'No characters in the image'
1098
 
1099
- # stat = [100, 595] # todo
1100
-
1101
  # reader = easyocr.Reader(['ch_sim', 'en', 'fr', 'it', 'ja', 'ko', 'ru', 'de', 'pt']) # this needs to run only once to load the model into memory
1102
  ocr_text_list = []
1103
  for i in range(len(inds[0])):
1104
- res = self.search((inds[1][i], inds[0][i]))
1105
  if res is not None and len(res) > 0:
1106
  ocr_text_list.append(res)
1107
  ocr_text_list = list(dict.fromkeys(ocr_text_list))
@@ -1110,10 +1070,7 @@ class ImageOCRRecognition:
1110
  ocr_text = 'No characters in the image'
1111
  else:
1112
  ocr_text = '\n' + ocr_text
1113
-
1114
- print(
1115
- f"\nProcessed ImageOCRRecognition, Input Image: {self.image_path}, "
1116
- f"Output Text: {ocr_text}.")
1117
  return ocr_text
1118
 
1119
  @prompts(name="recognize all optical characters in the image",
@@ -1123,25 +1080,26 @@ class ImageOCRRecognition:
1123
  "representing the image_path.")
1124
  def inference(self, inputs):
1125
  image_path = inputs.strip()
1126
- if self.image_path != image_path:
1127
- self.result = self.reader.readtext(image_path)
1128
- self.image_path = image_path
1129
  # print(self.result)
1130
  res_text = []
1131
- for item in self.result:
1132
  # ([[x, y], [x, y], [x, y], [x, y]], text, confidence)
1133
  res_text.append(item[1])
1134
  print(
1135
- f"\nProcessed ImageOCRRecognition, Input Image: {self.image_path}, "
1136
  f"Output Text: {res_text}")
1137
  return res_text
1138
 
1139
- def preprocess(self, img, img_path):
1140
- self.image_path = img_path
1141
- self.result = self.reader.readtext(self.image_path)
 
 
 
1142
 
1143
- def search(self, coord):
1144
- for item in self.result:
1145
  left_top = item[0][0]
1146
  right_bottom=item[0][-2]
1147
  if (coord[0] >= left_top[0] and coord[1] >= left_top[1]) and \
@@ -1150,25 +1108,20 @@ class ImageOCRRecognition:
1150
 
1151
  return ''
1152
 
1153
- def reset(self):
1154
- self.image_path = None
1155
- self.result = None
1156
- self.mask = None
1157
 
1158
 
1159
  class ConversationBot:
1160
  def __init__(self, load_dict):
1161
- # load_dict = {'VisualQuestionAnswering':'cuda:0', 'ImageCaptioning':'cuda:1',...}
1162
- print(f"Initializing VisualChatGPT, load_dict={load_dict}")
1163
  if 'HuskyVQA' not in load_dict:
1164
- raise ValueError("You have to load ImageCaptioning as a basic function for i-GPT")
1165
  if 'SegmentAnything' not in load_dict:
1166
- raise ValueError("You have to load SegmentAnything as a basic function for i-GPT")
 
 
1167
 
1168
  self.models = {}
1169
- self.uploaded_image_filename = None
1170
- # self.segmented_image_filename = None
1171
- self.history_mask = None
1172
  # Load Basic Foundation Models
1173
  for class_name, device in load_dict.items():
1174
  self.models[class_name] = globals()[class_name](device=device)
@@ -1187,27 +1140,7 @@ class ConversationBot:
1187
  if e.startswith('inference'):
1188
  func = getattr(instance, e)
1189
  self.tools.append(Tool(name=func.name, description=func.description, func=func))
1190
- self.llm = None
1191
- self.memory = None
1192
- self.audio_model = None
1193
 
1194
- def init_agent(self):
1195
- if self.memory is not None:
1196
- self.memory.clear() #clear previous history
1197
- else:
1198
- self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
1199
-
1200
- self.reset()
1201
- self.llm = OpenAI(temperature=0)
1202
- self.agent = initialize_agent(
1203
- self.tools,
1204
- self.llm,
1205
- agent="conversational-react-description",
1206
- verbose=True,
1207
- memory=self.memory,
1208
- return_intermediate_steps=True,
1209
- agent_kwargs={'prefix': INTERN_CHAT_PREFIX, 'format_instructions': INTERN_CHAT_FORMAT_INSTRUCTIONS,
1210
- 'suffix': INTERN_CHAT_SUFFIX}, )
1211
 
1212
  def find_latest_image(self, file_list):
1213
  res = None
@@ -1231,12 +1164,12 @@ class ConversationBot:
1231
  res = file_item[0]
1232
  return res
1233
 
1234
- def run_task(self, use_voice, text, audio_path, state):
1235
  if use_voice:
1236
- state, _ = self.run_audio(audio_path, state)
1237
  else:
1238
- state, _ = self.run_text(text, state)
1239
- return state, state, None
1240
 
1241
  def find_param(self, msg, keyword, excluded=False):
1242
  p1 = re.compile(f'(image/[-\\w]*.(png|mp4))')
@@ -1253,8 +1186,7 @@ class ConversationBot:
1253
  res = self.find_latest_image(out_filenames)
1254
  return res
1255
 
1256
- def rectify_action(self, inputs, history_msg):
1257
- # history_msg = self.agent.memory.buffer.copy()
1258
  print('Rectify the action.')
1259
  print(inputs)
1260
  func = None
@@ -1316,7 +1248,7 @@ class ConversationBot:
1316
  func_inputs = f'{img_path},{prompt}'
1317
  else:
1318
  # raise NotImplementedError('Can not find the matched function.')
1319
- res = self.agent(f"You can use history message to sanswer this question without using any tools. {inputs}")
1320
  res = res['output'].replace("\\", "/")
1321
 
1322
  print(f'{func_name}: {func_inputs}')
@@ -1340,17 +1272,14 @@ class ConversationBot:
1340
 
1341
  return illegal_files
1342
 
1343
- def run_text(self, text, state):
1344
  if text is None or len(text) == 0:
1345
  state += [(None, 'Please input text.')]
1346
- return state, state
1347
- self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
1348
  pattern = re.compile('(image/[-\\w]*.(png|mp4))')
1349
  try:
1350
- # new_text = text.strip() + 'You can find all input paths in the history.'
1351
- # res = self.agent({"input": new_text})
1352
- response = self.agent({"input": text.strip()})['output']
1353
- # print(f'*******response*********: {response}')
1354
  response = response.replace("\\", "/")
1355
  out_filenames = pattern.findall(response)
1356
  illegal_files = self.check_illegal_files(out_filenames)
@@ -1361,29 +1290,29 @@ class ConversationBot:
1361
  # state += [(text, 'Sorry, I failed to understand your instruction. You can try it again or turn to more powerful language model.')]
1362
  print(f'Error: {err1}')
1363
  try:
1364
- response = self.rectify_action(text, self.agent.memory.buffer[:])
1365
  # print('response = ', response)
1366
  out_filenames = pattern.findall(response)
1367
  res = self.find_latest_image(out_filenames)
1368
  # print(out_filenames)
1369
- self.agent.memory.buffer += f'\nHuman: {text.strip()}\n' + f'AI:{response})'
1370
 
1371
  except Exception as err2:
1372
  print(f'Error: {err2}')
1373
  state += [(text, 'Sorry, I failed to understand your instruction. You can try it again or turn to more powerful language model.')]
1374
- return state, state
1375
 
1376
- if res is not None and self.agent.memory.buffer.count(res) <= 1:
1377
  state = state + [(text, response + f' `{res}` is as follows: ')]
1378
  state = state + [(None, (res, ))]
1379
  else:
1380
  state = state + [(text, response)]
1381
 
1382
  print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
1383
- f"Current Memory: {self.agent.memory.buffer}")
1384
- return state, state
1385
 
1386
- def run_audio(self, audio_path, state):
1387
  print(f'audio_path = {audio_path}')
1388
  if audio_path is None or not os.path.exists(audio_path):
1389
  state += [(None, 'No audio input. Please stop recording first and then send the audio.')]
@@ -1391,68 +1320,74 @@ class ConversationBot:
1391
  if self.audio_model is None:
1392
  self.audio_model = whisper.load_model("small").to('cuda:0')
1393
  text = self.audio_model.transcribe(audio_path)["text"]
1394
- res = self.run_text(text, state)
1395
  print(f"\nProcessed run_audio, Input transcribed audio: {text}\nCurrent state: {state}\n"
1396
- f"Current Memory: {self.agent.memory.buffer}")
1397
- return res[0], res[1]
1398
 
1399
- def upload_image(self, image, state, txt):
1400
- self.reset()
 
1401
  print('upload an image')
 
1402
  img = image['image']
1403
  image_filename = os.path.join('image', f"{str(uuid.uuid4())[:6]}.png")
1404
- image_filename = gen_new_name(image_filename, 'raw')
1405
- self.uploaded_image_filename = image_filename
1406
- img = img.convert('RGB')
1407
  img.save(image_filename, "PNG")
1408
- # print(f"Resize image form {width}x{height} to {width_new}x{height_new}")
1409
- # let some foundation models preprocess image
1410
- NEED_PREPROCESSING_LIST = ["SegmentAnything", "ImageOCRRecognition"]
1411
- for model_name in NEED_PREPROCESSING_LIST:
1412
- if model_name in self.models.keys():
1413
- self.models[model_name].preprocess(np.array(img), image_filename)
1414
- # description = self.models['ImageCaptioning'].inference(image_filename)
1415
- description = self.models['HuskyVQA'].inference_captioning(image_filename)
1416
  # description = 'Debug'
 
1417
 
1418
- ocr_text = None
 
1419
  if 'ImageOCRRecognition' in self.models.keys():
1420
- ocr_text = self.models['ImageOCRRecognition'].inference(image_filename)
1421
-
1422
- if ocr_text is not None and len(ocr_text) > 0:
1423
- Human_prompt = f'\nHuman: provide a image named {image_filename}. The description is: {description}. OCR result is: {ocr_text}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
 
1424
  else:
1425
- Human_prompt = f'\nHuman: provide a image named {image_filename}. The description is: {description}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
1426
  AI_prompt = "Received. "
1427
- self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + ' AI: ' + AI_prompt
 
1428
  state = state + [(f"![](file={image_filename})*{image_filename}*", AI_prompt)]
1429
  print(f"\nProcessed upload_image, Input image: {image_filename}\nCurrent state: {state}\n"
1430
- f"Current Memory: {self.agent.memory.buffer}")
1431
- return state, state, f'{txt}'
1432
 
1433
- def upload_video(self, video_path, state, txt):
1434
- self.reset()
 
 
1435
  print('upload a video')
 
1436
  vid_name = os.path.basename(video_path)
1437
  # vid_name = gen_new_name(vid_name, '', vid_name.split('.')[-1])
1438
  new_video_path = os.path.join('./image/', vid_name)
1439
- new_video_path = gen_new_name(new_video_path, 'raw', vid_name.split('.')[-1])
1440
  shutil.copy(video_path, new_video_path)
1441
 
 
1442
  if "VideoCaption" in self.models.keys():
1443
  description = self.models['VideoCaption'].inference(new_video_path)
1444
  else:
1445
  description = 'A video.'
 
1446
  Human_prompt = f'\nHuman: provide a video named {new_video_path}. The description is: {description}. This information helps you to understand this video, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
1447
  AI_prompt = f"Received video: {new_video_path} "
1448
- self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
1449
- # state = state + [(f"![](file={new_video_path})*{new_video_path}*", AI_prompt)]
1450
- # state = state + [(f"![](file={video_path})*{new_video_path}*", AI_prompt)]
1451
  state = state + [((new_video_path, ), AI_prompt)]
1452
  # print('exists = ', os.path.exists("./tmp_files/1e7f_f4236666_tmp.mp4"))
1453
  print(f"\nProcessed upload_video, Input video: `{new_video_path}`\nCurrent state: {state}\n"
1454
- f"Current Memory: {self.agent.memory.buffer}")
1455
- return state, state, f'{txt}'
 
1456
 
1457
  def blend_mask(self, img, mask):
1458
  mask = mask.astype(np.uint8)
@@ -1468,63 +1403,67 @@ class ConversationBot:
1468
  # print(new_img_arr.shape)
1469
  return new_img_arr
1470
 
1471
- def process_seg(self, image, state):
1472
- if self.uploaded_image_filename is None or \
1473
- not os.path.exists(self.uploaded_image_filename) or \
1474
- image is None:
1475
- return state, state, None
 
 
 
1476
 
1477
  if 'SegmentAnything' not in self.models.keys():
1478
  state += [(None, 'Please load the segmentation tool.')]
1479
- return state, state, image
1480
 
1481
- img = Image.open(self.uploaded_image_filename).convert('RGB')
1482
- # img = image['image'].convert('RGB')
 
1483
  mask = image['mask'].convert('L')
1484
  mask = np.array(mask, dtype=np.uint8)
1485
-
1486
- Human_prompt="Please process this image based on given mask."
1487
- if self.uploaded_image_filename is None:
1488
- AI_prompt = "Please upload an image for processing."
1489
- state += [(Human_prompt, AI_prompt)]
1490
- return state, state, None
1491
  if mask.sum() == 0:
1492
- AI_prompt = "You can click the image in the right and ask me some questions."
1493
  state += [(Human_prompt, AI_prompt)]
1494
- return state, state, image['image']
 
1495
 
1496
- if 'SegmentAnything' in self.models.keys():
1497
- self.models['SegmentAnything'].clicked_region = mask
1498
-
1499
- res_mask_path = self.models['SegmentAnything'].inference_by_mask()
1500
- res_mask = Image.open(res_mask_path)
1501
- res_mask_arr = np.array(res_mask, dtype=np.uint8)
1502
- # dilate_factor = self.models['SegmentAnything'].cal_dilate_factor(res_mask_arr)
1503
- # res_mask_arr = self.models['SegmentAnything'].dilate_mask(res_mask_arr, dilate_factor)
1504
- new_img_arr = self.blend_mask(img, res_mask_arr)
1505
  new_img = Image.fromarray(new_img_arr)
1506
- new_img_name = gen_new_name(self.uploaded_image_filename, 'blended')
1507
- print(new_img_name)
1508
- new_img.save(new_img_name)
1509
- # AI_prompt = f"I have finished processing. Now, you can ask me some questions."
1510
- # state = state + [(Human_prompt, AI_prompt)]
1511
  AI_prompt = f"Received. The mask_path is named {res_mask_path}."
1512
- self.agent.memory.buffer = self.agent.memory.buffer + '\nHuman: ' + Human_prompt + ' AI: ' + AI_prompt
1513
  # state = state + [(Human_prompt, f"![](file={seg_filename})*{AI_prompt}*")]
1514
  state = state + [(Human_prompt, f'Received. The sgemented figure named `{res_mask_path}` is as follows: ')]
1515
  state = state + [(None, (res_mask_path, ))]
1516
 
1517
- print(f"\nProcessed run_image, Input image: `{self.uploaded_image_filename}`\nCurrent state: {state}\n"
1518
- f"Current Memory: {self.agent.memory.buffer}")
1519
- return state, state, new_img
1520
-
1521
- def process_ocr(self, image, state):
1522
- if self.uploaded_image_filename is None or \
1523
- not os.path.exists(self.uploaded_image_filename) or \
1524
- image is None:
1525
- return state, state, None
1526
- img = Image.open(self.uploaded_image_filename).convert('RGB')
1527
- img = np.array(img)
 
 
 
1528
  # img[:100+int(time.time() % 50),:100, :] = 0
1529
  img = Image.fromarray(img)
1530
  # img = image['image'].convert('RGB')
@@ -1532,38 +1471,33 @@ class ConversationBot:
1532
  # mask.save(f'test_{int(time.time()) % 1000}.png')
1533
  mask = np.array(mask, dtype=np.uint8)
1534
 
1535
- Human_prompt="Please process this image based on given mask."
1536
- if self.uploaded_image_filename is None:
1537
- AI_prompt = "Please upload an image for processing."
1538
- state += [(Human_prompt, AI_prompt)]
1539
- return state, state, None
1540
  if mask.sum() == 0:
1541
- AI_prompt = "You can click the image in the right and ask me some questions."
1542
  state += [(Human_prompt, AI_prompt)]
1543
- return state, state, image['image']
1544
 
1545
- ocr_text = None
1546
  if 'ImageOCRRecognition' in self.models.keys():
1547
- self.models['ImageOCRRecognition'].clicked_region = mask
1548
- ocr_text = self.models['ImageOCRRecognition'].inference_by_mask()
1549
  else:
1550
  state += [Human_prompt, f'ImageOCRRecognition is not loaded.']
1551
 
1552
- if ocr_text is not None and len(ocr_text) > 0:
1553
- AI_prompt = f'OCR result: {ocr_text}'
1554
  # self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + ' AI: ' + AI_prompt
1555
  else:
1556
  AI_prompt = 'I didn\'t find any optical characters at given location.'
1557
 
1558
  state = state + [(Human_prompt, AI_prompt)]
1559
- self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + ' AI: ' + AI_prompt
1560
  print(f"\nProcessed process_ocr, Input image: {self.uploaded_image_filename}\nCurrent state: {state}\n"
1561
- f"Current Memory: {self.agent.memory.buffer}")
1562
- return state, state, image['image']
1563
 
1564
- def process_save(self, image, state):
1565
  if image is None:
1566
- return state, state, None
1567
 
1568
  mask_image = image['mask'].convert('RGB')
1569
  # mask = np.array(mask, dtype=np.uint8)
@@ -1580,72 +1514,22 @@ class ConversationBot:
1580
  AI_prompt = f'The saved mask is named {mask_image_name}: '
1581
  state = state + [(Human_prompt, AI_prompt)]
1582
  state = state + [(None, (mask_image_name, ))]
1583
- self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + ' AI: ' + AI_prompt
1584
  print(f"\nProcessed process_ocr, Input image: {self.uploaded_image_filename}\nCurrent state: {state}\n"
1585
- f"Current Memory: {self.agent.memory.buffer}")
1586
- return state, state, image['image']
1587
 
1588
- def process_image(self, image, state):
1589
- img = Image.open(self.uploaded_image_filename).convert('RGB')
1590
- # img = image['image'].convert('RGB')
1591
- mask = image['mask'].convert('L')
1592
- mask = np.array(mask, dtype=np.uint8)
1593
 
1594
- Human_prompt="Please process this image based on given mask."
1595
- if self.uploaded_image_filename is None:
1596
- AI_prompt = "Please upload an image for processing."
1597
- state += [(Human_prompt, AI_prompt)]
1598
- return state, state, None
1599
- if mask.sum() == 0:
1600
- AI_prompt = "You can click the image in the right and ask me some questions."
1601
- state += [(Human_prompt, AI_prompt)]
1602
- return state, state, image['image']
1603
-
1604
- if self.history_mask is None:
1605
- self.history_mask = mask
1606
  else:
1607
- self.history_mask = np.logical_or(self.history_mask, mask)
1608
-
1609
- ocr_text = None
1610
- if 'SegmentAnything' in self.models.keys():
1611
- self.models['SegmentAnything'].clicked_region = self.history_mask
1612
- if 'ImageOCRRecognition' in self.models.keys():
1613
- self.models['ImageOCRRecognition'].clicked_region = mask
1614
- inds = np.where(mask != 0)
1615
- coord = (int(np.mean(inds[1])), int(np.mean(inds[0])))
1616
- ocr_text = self.models['ImageOCRRecognition'].search(coord)
1617
-
1618
- # description = self.models['ImageCaptioning'].inference(image_filename)
1619
- res_mask = self.models['SegmentAnything'].segment_by_mask(self.history_mask)
1620
- mask_image = Image.fromarray(res_mask.astype(np.uint8)*255)
1621
- img = self.blend_mask(img, res_mask)
1622
- seg_filename = gen_new_name(self.uploaded_image_filename, 'mask')
1623
- mask_image.save(seg_filename, "PNG")
1624
-
1625
- AI_prompt = f"Received. The mask_path is named {seg_filename}:"
1626
- self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + ' AI: ' + AI_prompt
1627
- # state = state + [(Human_prompt, f"![](file={seg_filename})*{AI_prompt}*")]
1628
- state = state + [(Human_prompt, f"Received. The segmented image is named {seg_filename}:")]
1629
- state = state + [(None, (seg_filename, ))]
1630
- if ocr_text is not None and len(ocr_text) > 0:
1631
- state = state + [(None, f'OCR result: {ocr_text}')]
1632
-
1633
- print(f"\nProcessed process_image, Input image: {self.uploaded_image_filename}\nCurrent state: {state}\n"
1634
- f"Current Memory: {self.agent.memory.buffer}")
1635
- return state, state, img
1636
-
1637
- def reset(self, clear_history_memory=False):
1638
- print('reset the model cache.')
1639
- NEED_RESET_LIST = ['SegmentAnything', 'HuskyVQA']
1640
- for model_name in NEED_RESET_LIST:
1641
- if model_name in self.models.keys():
1642
- self.models[model_name].reset()
1643
 
1644
- self.history_mask = None
1645
- self.uploaded_image_filename = None
1646
- if clear_history_memory and bot.memory is not None:
1647
- self.memory.clear()
1648
- return None
1649
 
1650
 
1651
  class ImageSketcher(gr.Image):
@@ -1670,8 +1554,6 @@ class ImageSketcher(gr.Image):
1670
  mask = np.zeros((height, width, 4), dtype=np.uint8)
1671
  mask[..., -1] = 255
1672
  mask = self.postprocess(mask)
1673
- # print(type(mask))
1674
- # print(mask.shape)
1675
  x['mask'] = mask
1676
  elif not isinstance(x, dict):
1677
  # print(x)
@@ -1682,9 +1564,7 @@ class ImageSketcher(gr.Image):
1682
  # print(width, height)
1683
  mask = np.zeros((height, width, 4), dtype=np.uint8)
1684
  mask[..., -1] = 255
1685
- # print(mask.shape)
1686
  mask = self.postprocess(mask)
1687
- # print(type(mask))
1688
  x = {'image': x, 'mask': mask}
1689
  x = super().preprocess(x)
1690
  return x
@@ -1745,6 +1625,79 @@ css='''
1745
  #image_upload:{align-items: center; min-width: 640px}
1746
  '''
1747
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1748
  def change_input_type(flag):
1749
  if flag:
1750
  print('Using voice input.')
@@ -1772,9 +1725,7 @@ def process_image_tab():
1772
 
1773
  def add_whiteboard():
1774
  # wb = np.ones((1080, 1920, 3), dtype=np.uint8) * 255
1775
- # wb = np.ones((540, 960, 3), dtype=np.uint8) * 255
1776
  wb = np.ones((720, 1280, 3), dtype=np.uint8) * 255
1777
- # wb[0, 0, 0] = int(time.time() % 100)
1778
  return Image.fromarray(wb)
1779
 
1780
 
@@ -1783,15 +1734,21 @@ if __name__ == '__main__':
1783
  parser.add_argument('-p', '--port', type=int, default=7862)
1784
  parser.add_argument('-d', '--debug', action='store_true')
1785
  parser.add_argument('--https', action='store_true')
1786
- parser.add_argument('--load', type=str, default="HuskyVQA_cuda:0,Text2Image_cuda:0")
1787
  args = parser.parse_args()
1788
  load_dict = {e.split('_')[0].strip(): e.split('_')[1].strip() for e in args.load.split(',')}
1789
  bot = ConversationBot(load_dict=load_dict)
1790
  # bot.init_agent()
1791
  with gr.Blocks(theme=Seafoam(), css=css) as demo:
1792
  state = gr.State([])
1793
- gr.HTML("<div align='center'> <img src='/file=./assets/gvlab_logo.png' style='height:70px'/> </div>")
1794
-
 
 
 
 
 
 
1795
  with gr.Row(visible=True, elem_id='login') as login:
1796
  with gr.Column(scale=0.6, min_width=0) :
1797
  openai_api_key_text = gr.Textbox(
@@ -1800,12 +1757,12 @@ if __name__ == '__main__':
1800
  label="OpenAI API Key",
1801
  lines=1,
1802
  type="password").style(container=False)
1803
- with gr.Column(scale=0.4, min_width=0) :
1804
  key_submit_button = gr.Button(value="Please log in with your OpenAI API Key", interactive=True, variant='primary').style(container=False)
1805
 
1806
  with gr.Row(visible=False) as user_interface:
1807
  with gr.Column(scale=0.5, elem_id="text_input") as chat_part:
1808
- chatbot = gr.Chatbot(elem_id="chatbot", label="InternChat").style(height=360)
1809
  with gr.Row(visible=True) as input_row:
1810
  with gr.Column(scale=0.8, min_width=0) as text_col:
1811
  txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(
@@ -1841,14 +1798,13 @@ if __name__ == '__main__':
1841
  video_input = gr.Video(interactive=True, include_audio=True, elem_id="video_upload").style(height=360)
1842
 
1843
  login_func = partial(login_with_key, bot, args.debug)
1844
- openai_api_key_text.submit(login_func, [openai_api_key_text], [user_interface, openai_api_key_text, key_submit_button])
1845
- key_submit_button.click(login_func, [openai_api_key_text, ], [user_interface, openai_api_key_text, key_submit_button])
1846
- # txt.submit(bot.run_text, [txt, state], [chatbot, state])
1847
- # txt.submit(lambda: "", None, txt)
1848
  txt.submit(
1849
  lambda: gr.update(visible=False), [], [send_btn]).then(
1850
  lambda: gr.update(visible=False), [], [txt]).then(
1851
- bot.run_text, [txt, state], [chatbot, state]).then(
1852
  lambda: gr.update(visible=True), [], [send_btn]
1853
  ).then(lambda: "", None, [txt, ]).then(
1854
  lambda: gr.update(visible=True), [], [txt])
@@ -1857,57 +1813,120 @@ if __name__ == '__main__':
1857
  send_btn.click(
1858
  lambda: gr.update(visible=False), [], [send_btn]).then(
1859
  lambda: gr.update(visible=False), [], [txt]).then(
1860
- bot.run_task, [audio_switch, txt, audio_input, state], [chatbot, state, txt]).then(
1861
  lambda: gr.update(visible=True), [], [send_btn]).then(
 
1862
  lambda: gr.update(visible=True), [], [txt]
1863
  )
1864
 
1865
  audio_switch.change(change_input_type, [audio_switch, ], [txt, audio_input])
 
 
 
1866
  add_img_example.click(ramdom_image, [], [click_img,]).then(
1867
- bot.upload_image, [click_img, state, txt], [chatbot, state, txt])
 
 
 
 
 
 
 
1868
 
 
 
 
1869
  add_vid_example.click(ramdom_video, [], [video_input,]).then(
1870
- bot.upload_video, [video_input, state, txt], [chatbot, state, txt])
 
 
 
 
 
 
 
1871
 
1872
- whiteboard_mode.click(add_whiteboard, [], [click_img,])
1873
 
1874
  # click_img.upload(bot.upload_image, [click_img, state, txt], [chatbot, state, txt])
1875
- click_img.upload(lambda: gr.update(visible=False), [], [send_btn]).then(bot.upload_image, [click_img, state, txt], [chatbot, state, txt]).then(lambda: gr.update(visible=True), [], [send_btn])
 
 
 
 
 
 
 
1876
 
1877
- # process_btn.click(bot.process_image, [click_img, state], [chatbot, state, click_img])
1878
- # process_ocr_btn.click(bot.process_ocr, [click_img, state], [chatbot, state, click_img])
1879
  process_ocr_btn.click(
1880
  lambda: gr.update(visible=False), [], [vis_btn]).then(
1881
- bot.process_ocr, [click_img, state], [chatbot, state, click_img]).then(
1882
  lambda: gr.update(visible=True), [], [vis_btn]
1883
  )
1884
  # process_seg_btn.click(bot.process_seg, [click_img, state], [chatbot, state, click_img])
1885
  process_seg_btn.click(
1886
  lambda: gr.update(visible=False), [], [vis_btn]).then(
1887
- bot.process_seg, [click_img, state], [chatbot, state, click_img]).then(
1888
  lambda: gr.update(visible=True), [], [vis_btn]
1889
  )
1890
  # process_save_btn.click(bot.process_save, [click_img, state], [chatbot, state, click_img])
1891
  process_save_btn.click(
1892
  lambda: gr.update(visible=False), [], [vis_btn]).then(
1893
- bot.process_save, [click_img, state], [chatbot, state, click_img]).then(
1894
  lambda: gr.update(visible=True), [], [vis_btn]
1895
  )
1896
  video_tab.select(process_video_tab, [], [whiteboard_mode, img_example, vid_example])
1897
  img_tab.select(process_image_tab, [], [whiteboard_mode, img_example, vid_example])
1898
  # clear_img_btn.click(bot.reset, [], [click_img])
1899
- clear_func = partial(bot.reset, clear_history_memory=True)
1900
- clear_btn.click(clear_func, [], [click_img]).then(
1901
  lambda: [], None, state).then(
 
1902
  lambda: None, None, chatbot
1903
- )
1904
- click_img.upload(bot.reset, None, None)
1905
 
1906
- video_input.upload(bot.upload_video, [video_input, state, txt], [chatbot, state, txt])
1907
- video_input.clear(bot.reset, [], [video_input])
 
1908
 
1909
  if args.https:
1910
- demo.queue().launch(server_name="0.0.0.0", ssl_certfile="./cert.pem", ssl_keyfile="./key.pem", ssl_verify=False, server_port=args.port)
1911
  else:
1912
  demo.queue().launch(server_name="0.0.0.0", server_port=args.port)
1913
 
 
2
  import os
3
  os.environ['CURL_CA_BUNDLE'] = ''
4
 
 
 
 
 
5
  try:
6
  import detectron
7
  except:
8
  os.system('pip install git+https://github.com/facebookresearch/detectron2.git')
9
 
10
+ from pathlib import Path
11
+ import sys
12
+ sys.path.insert(0, str(Path(__file__).resolve().parent / "third-party" / "lama"))
13
+
14
  import random
15
  import torch
16
  import cv2
 
41
  from langchain.chains.conversation.memory import ConversationBufferMemory
42
  from langchain.llms.openai import OpenAI
43
 
44
+ from iGPT.models import VideoCaption, ActionRecognition, DenseCaption, GenerateTikTokVideo
45
+ from iGPT.models import HuskyVQA, LDMInpainting
46
+ from iGPT.models.utils import (cal_dilate_factor, dilate_mask, gen_new_name,
47
  seed_everything, prompts, blend_gt2pt)
48
 
49
+ # from segment_anything.utils.amg import remove_small_regions
50
+ from segment_anything import build_sam, sam_model_registry, SamAutomaticMaskGenerator
51
+ from iGPT.models.sam_preditor import SamPredictor
52
  from bark import SAMPLE_RATE, generate_audio
53
 
54
  import matplotlib.pyplot as plt
 
58
  from saicinpainting.evaluation.utils import move_to_device
59
  from saicinpainting.training.trainers import load_checkpoint
60
  from saicinpainting.evaluation.data import pad_tensor_to_modulo
61
+ import openai
62
 
63
  # openai.api_base = 'https://closeai.deno.dev/v1'
64
 
65
  GLOBAL_SEED=1912
66
 
67
+ INTERN_CHAT_PREFIX = """InternGPT is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. InternGPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
68
 
69
+ InternGPT is able to process and understand large amounts of text and images. As a language model, InternGPT can not directly read images, but it has a list of tools to finish different visual tasks. Each image will have a file name formed as "image/xxx.png", and InternGPT can invoke different tools to indirectly understand pictures. When talking about images, InternGPT is very strict to the file name and will never fabricate nonexistent files. When using tools to generate new image files, InternGPT is also known that the image may not be the same as the user's demand, and will use other visual question answering tools or description tools to observe the real image. InternGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the image content and image file name. It will remember to provide the file name from the last tool observation, if a new image is generated.
70
 
71
+ Human may provide new figures to InternGPT with a description. The description helps InternGPT to understand this image, but InternGPT should use tools to finish following tasks, rather than directly imagine from the description.
72
 
73
+ Overall, InternGPT is a powerful visual dialogue assistant tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics.
74
 
75
 
76
  TOOLS:
77
  ------
78
 
79
+ InternGPT has access to the following tools:"""
80
 
81
  INTERN_CHAT_FORMAT_INSTRUCTIONS = """To use a tool, please use the following format:
82
 
 
104
  {chat_history}
105
 
106
  New input: {input}
107
+ Since InternGPT is a text language model, InternGPT must use tools to observe images rather than imagination.
108
+ The thoughts and observations are only visible for InternGPT, InternGPT should remember to repeat important information in the final response for Human.
109
  Thought: Do I need to use a tool? {agent_scratchpad} Let's think step by step.
110
  """
111
 
112
+ INTERN_CHAT_PREFIX_CN = """InternGPT 旨在能够协助完成范围广泛的文本和视觉相关任务,从回答简单的问题到提供对广泛主题的深入解释和讨论。 InternGPT 能够根据收到的输入生成类似人类的文本,使其能够进行听起来自然的对话,并提供连贯且与手头主题相关的响应。
113
 
114
+ InternGPT 能够处理和理解大量文本和图像。作为一种语言模型,InternGPT 不能直接读取图像,但它有一系列工具来完成不同的视觉任务。每张图片都会有一个文件名,格式为“image/xxx.png”,InternGPT可以调用不同的工具来间接理解图片。在谈论图片时,InternGPT 对文件名的要求非常严格,绝不会伪造不存在的文件。在使用工具生成新的图像文件时,InternGPT也知道图像可能与用户需求不一样,会使用其他视觉问答工具或描述工具来观察真实图像。 InternGPT 能够按顺序使用工具,并且忠于工具观察输出,而不是伪造图像内容和图像文件名。如果生成新图像,它将记得提供上次工具观察的文件名。
115
 
116
+ Human 可能会向 InternGPT 提供带有描述的新图形。描述帮助 InternGPT 理解这个图像,但 InternGPT 应该使用工具来完成以下任务,而不是直接从描述中想象。有些工具将会返回英文描述,但你对用户的聊天应当采用中文。
117
 
118
+ 总的来说,InternGPT 是一个强大的可视化对话辅助工具,可以帮助处理范围广泛的任务,并提供关于范围广泛的主题的有价值的见解和信息。
119
 
120
  工具列表:
121
  ------
122
 
123
+ InternGPT 可以使用这些工具:"""
124
 
125
  INTERN_CHAT_FORMAT_INSTRUCTIONS_CN = """用户使用中文和你进行聊天,但是工具的参数应当使用英文。如果要调用工具,你必须遵循如下格式:
126
 
 
144
 
145
  开始!
146
 
147
+ 因为InternGPT是一个文本语言模型,必须使用工具去观察图片而不是依靠想象。
148
+ 推理想法和观察结果只对InternGPT可见,需要记得在最终回复时把重要的信息重复给用户,你只能给用户返回中文句子。我们一步一步思考。在你使用工具时,工具的参数只能是英文。
149
 
150
  聊天历史:
151
  {chat_history}
 
157
  os.makedirs('image', exist_ok=True)
158
 
159
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  class InstructPix2Pix:
161
  def __init__(self, device):
162
  print(f"Initializing InstructPix2Pix to {device}")
 
249
  torch_dtype=self.torch_dtype)
250
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
251
  self.pipe.to(device)
 
252
  self.a_prompt = 'best quality, extremely detailed'
253
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
254
  'fewer digits, cropped, worst quality, low quality'
 
262
  def inference(self, inputs):
263
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
264
  image = Image.open(image_path)
265
+ w, h = image.size
266
+ image = resize_800(image)
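+ # resize_800 (defined near the bottom of this file) rescales the longer side to 800 px for the diffusion pipeline; the original (w, h) is restored before saving.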
267
+ seed_everything(GLOBAL_SEED)
268
  prompt = f'{instruct_text}, {self.a_prompt}'
269
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
270
  guidance_scale=9.0).images[0]
271
  # updated_image_path = get_new_image_name(image_path, func_name="canny2image")
272
  updated_image_path = gen_new_name(image_path, f'{type(self).__name__}')
273
+ image = image.resize((w, h))
274
  image.save(updated_image_path)
275
  print(f"\nProcessed CannyText2Image, Input Canny: {image_path}, Input Text: {instruct_text}, "
276
  f"Output Text: {updated_image_path}")
 
309
  )
310
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
311
  self.pipe.to(device)
 
312
  self.a_prompt = 'best quality, extremely detailed'
313
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
314
  'fewer digits, cropped, worst quality, low quality'
 
323
  def inference(self, inputs):
324
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
325
  image = Image.open(image_path)
326
+ w, h = image.size
327
+ image = resize_800(image)
328
+ seed_everything(GLOBAL_SEED)
329
  prompt = f'{instruct_text}, {self.a_prompt}'
330
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
331
  guidance_scale=9.0).images[0]
332
  # updated_image_path = get_new_image_name(image_path, func_name="line2image")
333
  updated_image_path = gen_new_name(image_path, f'{type(self).__name__}')
334
+ image = image.resize((w, h))
335
  image.save(updated_image_path)
336
  print(f"\nProcessed LineText2Image, Input Line: {image_path}, Input Text: {instruct_text}, "
337
  f"Output Text: {updated_image_path}")
 
370
  )
371
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
372
  self.pipe.to(device)
 
373
  self.a_prompt = 'best quality, extremely detailed'
374
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
375
  'fewer digits, cropped, worst quality, low quality'
 
384
  def inference(self, inputs):
385
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
386
  image = Image.open(image_path)
387
+ w, h = image.size
388
+ image = resize_800(image)
389
+ seed_everything(GLOBAL_SEED)
390
  prompt = f'{instruct_text}, {self.a_prompt}'
391
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
392
  guidance_scale=9.0).images[0]
393
  # updated_image_path = get_new_image_name(image_path, func_name="hed2image")
394
  updated_image_path = gen_new_name(image_path, f'{type(self).__name__}')
395
+ image = image.resize((w, h))
396
  image.save(updated_image_path)
397
  print(f"\nProcessed HedText2Image, Input Hed: {image_path}, Input Text: {instruct_text}, "
398
  f"Output Image: {updated_image_path}")
 
431
  )
432
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
433
  self.pipe.to(device)
 
434
  self.a_prompt = 'best quality, extremely detailed'
435
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
436
  'fewer digits, cropped, worst quality, low quality'
 
443
  def inference(self, inputs):
444
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
445
  image = Image.open(image_path)
446
+ w, h = image.size
447
+ image = resize_800(image)
448
  seed_everything(GLOBAL_SEED)
449
  prompt = f'{instruct_text}, {self.a_prompt}'
450
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
451
  guidance_scale=9.0).images[0]
452
  # updated_image_path = get_new_image_name(image_path, func_name="scribble2image")
453
  updated_image_path = gen_new_name(image_path, f'{type(self).__name__}')
454
+ image = image.resize((w, h))
455
  image.save(updated_image_path)
456
  print(f"\nProcessed ScribbleText2Image, Input Scribble: {image_path}, Input Text: {instruct_text}, "
457
  f"Output Image: {updated_image_path}")
 
489
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
490
  self.pipe.to(device)
491
  self.num_inference_steps = 20
 
492
  self.unconditional_guidance_scale = 9.0
493
  self.a_prompt = 'best quality, extremely detailed'
494
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
 
504
  def inference(self, inputs):
505
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
506
  image = Image.open(image_path)
507
+ w, h = image.size
508
+ image = resize_800(image)
509
+ seed_everything(GLOBAL_SEED)
510
  prompt = f'{instruct_text}, {self.a_prompt}'
511
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
512
  guidance_scale=9.0).images[0]
513
  # updated_image_path = get_new_image_name(image_path, func_name="pose2image")
514
  updated_image_path = gen_new_name(image_path, f'{type(self).__name__}')
515
+ image = image.resize((w, h))
516
  image.save(updated_image_path)
517
  print(f"\nProcessed PoseText2Image, Input Pose: {image_path}, Input Text: {instruct_text}, "
518
  f"Output Image: {updated_image_path}")
 
530
  torch_dtype=self.torch_dtype)
531
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
532
  self.pipe.to(device)
 
533
  self.a_prompt = 'best quality, extremely detailed'
534
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
535
  ' fewer digits, cropped, worst quality, low quality'
 
543
  def inference(self, inputs):
544
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
545
  image = Image.open(image_path)
546
+ w, h = image.size
547
+ image = resize_800(image)
548
+ seed_everything(GLOBAL_SEED)
549
  prompt = f'{instruct_text}, {self.a_prompt}'
550
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
551
  guidance_scale=9.0).images[0]
552
  # updated_image_path = get_new_image_name(image_path, func_name="segment2image")
553
  updated_image_path = gen_new_name(image_path, f'{type(self).__name__}')
554
+ image = image.resize((w, h))
555
  image.save(updated_image_path)
556
  print(f"\nProcessed SegText2Image, Input Seg: {image_path}, Input Text: {instruct_text}, "
557
  f"Output Image: {updated_image_path}")
 
566
  # self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
567
  self.SegText2Image = SegText2Image
568
  self.SegmentAnything = SegmentAnything
 
569
  self.a_prompt = 'best quality, extremely detailed'
570
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
571
  ' fewer digits, cropped, worst quality, low quality'
 
621
  torch_dtype=self.torch_dtype)
622
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
623
  self.pipe.to(device)
 
624
  self.a_prompt = 'best quality, extremely detailed'
625
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
626
  ' fewer digits, cropped, worst quality, low quality'
 
634
  def inference(self, inputs):
635
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
636
  image = Image.open(image_path)
637
+ w, h = image.size
638
+ image = resize_800(image)
639
+ seed_everything(GLOBAL_SEED)
640
  prompt = f'{instruct_text}, {self.a_prompt}'
641
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
642
  guidance_scale=9.0).images[0]
643
  # updated_image_path = get_new_image_name(image_path, func_name="depth2image")
644
  updated_image_path = gen_new_name(image_path, f'{type(self).__name__}')
645
+ image = image.resize((w, h))
646
  image.save(updated_image_path)
647
  print(f"\nProcessed DepthText2Image, Input Depth: {image_path}, Input Text: {instruct_text}, "
648
  f"Output Image: {updated_image_path}")
 
695
  torch_dtype=self.torch_dtype)
696
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
697
  self.pipe.to(device)
 
698
  self.a_prompt = 'best quality, extremely detailed'
699
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
700
  ' fewer digits, cropped, worst quality, low quality'
 
831
  self.sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
832
  self.predictor = SamPredictor(self.sam)
833
  self.sam.to(device=device)
834
+ # self.clicked_region = None
835
+ # self.img_path = None
836
+ # self.history_mask_res = None
837
 
838
  @prompts(name="Segment Anything on Image",
839
  description="useful when you want to segment anything in the image. "
840
  "like: segment anything from this image, "
841
+ "The input to this tool should be a string, "
842
+ "representing the image_path.")
843
  def inference(self, inputs):
844
  print("Inputs: ", inputs)
845
+
846
  img_path = inputs.strip()
847
+ img = np.array(Image.open(img_path))
848
+ annos = self.segment_anything(img)
849
  full_img, _ = self.show_annos(annos)
 
 
 
850
  seg_all_image_path = gen_new_name(img_path, 'seg')
851
  full_img.save(seg_all_image_path, "PNG")
852
 
 
856
  @prompts(name="Segment the Clicked Region in the Image",
857
  description="useful when you want to segment the masked region or block in the image. "
858
  "like: segment the masked region in this image, "
859
+ "The input to this tool should be a comma separated string of two, "
860
+ "representing the image_path and the mask_path")
861
+ def inference_by_mask(self, inputs):
862
+ img_path, mask_path = inputs.split(',')[0], inputs.split(',')[1]
863
+ img_path = img_path.strip()
864
+ mask_path = mask_path.strip()
865
+ clicked_mask = Image.open(mask_path).convert('L')
866
+ clicked_mask = np.array(clicked_mask, dtype=np.uint8)
867
  # mask = np.array(Image.open(mask_path).convert('L'))
868
+ img = np.array(Image.open(img_path).convert('RGB'), dtype=np.uint8)
+ features = self.get_image_embedding(img)
+ res_mask = self.segment_by_mask(clicked_mask, features)
869
 
870
+ res_mask = res_mask.astype(np.uint8)*255
 
 
 
 
 
 
871
filaname = gen_new_name(img_path, 'mask')
872
  mask_img = Image.fromarray(res_mask)
873
  mask_img.save(filaname, "PNG")
874
  return filaname
875
 
876
+ def segment_by_mask(self, mask, features):
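+ # Sample up to 16 foreground points (about 1% of the masked pixels) from the clicked mask, prompt SAM with them using the precomputed image features, and return the highest-scoring mask.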
877
  random.seed(GLOBAL_SEED)
 
 
878
  idxs = np.nonzero(mask)
879
  num_points = min(max(1, int(len(idxs[0]) * 0.01)), 16)
880
  sampled_idx = random.sample(range(0, len(idxs[0])), num_points)
 
885
  labels = np.array([1] * num_points)
886
 
887
  res_masks, scores, _ = self.predictor.predict(
888
+ features=features,
889
  point_coords=points,
890
  point_labels=labels,
891
  multimask_output=True,
 
894
  return res_masks[np.argmax(scores), :, :]
895
 
896
 
897
+ def segment_anything(self, img):
898
+ # img = cv2.imread(img_path)
899
+ # img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
 
900
  mask_generator = SamAutomaticMaskGenerator(self.sam)
901
  annos = mask_generator.generate(img)
902
  return annos
 
907
 
908
  return detection_map
909
 
910
+ def get_image_embedding(self, img):
911
+ return self.predictor.set_image(img)
 
912
 
 
 
 
 
 
 
913
  def show_annos(self, anns):
914
  # From https://github.com/sail-sg/EditAnything/blob/main/sam2image.py#L91
915
  if len(anns) == 0:
 
1031
  print(f"Initializing ImageOCRRecognition to {device}")
1032
  self.device = device
1033
  self.reader = easyocr.Reader(['ch_sim', 'en'], gpu=device) # this needs to run only once to load the model into memory
1034
+
 
 
 
1035
  @prompts(name="recognize the optical characters in the image",
1036
  description="useful when you want to recognize the characters or words in the clicked region of image. "
1037
  "like: recognize the characters or words in the clicked region."
1038
  "The input to this tool should be a comma separated string of two, "
1039
+ "representing the image_path and the mask_path")
1040
  def inference_by_mask(self, inputs=None):
1041
+ image_path, mask_path = inputs.split(',')[0], inputs.split(',')[1]
1042
+ image_path = image_path.strip()
1043
+ mask_path = mask_path.strip()
1044
+ mask = Image.open(mask_path).convert('L')
1045
+ mask = np.array(mask, dtype=np.uint8)
1046
+ ocr_res = self.readtext(image_path)
1047
+ selected_ocr_text = self.get_ocr_by_mask(mask, ocr_res)
1048
+ print(
1049
+ f"\nProcessed ImageOCRRecognition, Input Image: {inputs}, "
1050
+ f"Output Text: {seleted_ocr_text}.")
1051
+ return selected_ocr_text
1052
+
1053
+ def get_ocr_by_mask(self, mask, ocr_res):
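+ # Subsample the nonzero pixels of the clicked mask (every 8th point) and collect the OCR results whose boxes contain any of those points.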
1054
inds = np.where(mask != 0)
1055
  inds = (inds[0][::8], inds[1][::8])
1056
+ # self.result = self.reader.readtext(self.image_path)
1057
+ if len(inds[0]) == 0:
1058
  # self.result = self.reader.readtext(image_path)
1059
  return 'No characters in the image'
1060
 
 
 
1061
  # reader = easyocr.Reader(['ch_sim', 'en', 'fr', 'it', 'ja', 'ko', 'ru', 'de', 'pt']) # this needs to run only once to load the model into memory
1062
  ocr_text_list = []
1063
  for i in range(len(inds[0])):
1064
+ res = self.search((inds[1][i], inds[0][i]), ocr_res)
1065
  if res is not None and len(res) > 0:
1066
  ocr_text_list.append(res)
1067
  ocr_text_list = list(dict.fromkeys(ocr_text_list))
 
1070
  ocr_text = 'No characters in the image'
1071
  else:
1072
  ocr_text = '\n' + ocr_text
1073
+
 
 
 
1074
  return ocr_text
1075
 
1076
  @prompts(name="recognize all optical characters in the image",
 
1080
  "representing the image_path.")
1081
  def inference(self, inputs):
1082
  image_path = inputs.strip()
1083
+ result = self.reader.readtext(image_path)
 
 
1084
  # print(self.result)
1085
  res_text = []
1086
+ for item in result:
1087
  # ([[x, y], [x, y], [x, y], [x, y]], text, confidence)
1088
  res_text.append(item[1])
1089
  print(
1090
+ f"\nProcessed ImageOCRRecognition, Input Image: {inputs}, "
1091
  f"Output Text: {res_text}")
1092
  return res_text
1093
 
1094
+ # def preprocess(self, img, img_path):
1095
+ # self.image_path = img_path
1096
+ # self.result = self.reader.readtext(self.image_path)
1097
+
1098
+ def readtext(self, img_path):
1099
+ return self.reader.readtext(img_path)
1100
 
1101
+ def search(self, coord, ocr_res):
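+ # Return the recognized text whose bounding box contains the given (x, y) coordinate, or '' if no box matches.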
1102
+ for item in ocr_res:
1103
  left_top = item[0][0]
1104
right_bottom = item[0][-2]
1105
  if (coord[0] >= left_top[0] and coord[1] >= left_top[1]) and \
 
1108
 
1109
  return ''
1110
 
 
 
 
 
1111
 
1112
 
1113
  class ConversationBot:
1114
  def __init__(self, load_dict):
1115
+ print(f"Initializing InternGPT, load_dict={load_dict}")
 
1116
  if 'HuskyVQA' not in load_dict:
1117
+ raise ValueError("You have to load ImageCaptioning as a basic function for iGPT")
1118
  if 'SegmentAnything' not in load_dict:
1119
+ raise ValueError("You have to load SegmentAnything as a basic function for iGPT")
1120
+ if 'ImageOCRRecognition' not in load_dict:
1121
+ raise ValueError("You have to load ImageOCRRecognition as a basic function for iGPT")
1122
 
1123
  self.models = {}
1124
+ self.audio_model = whisper.load_model("small").to('cuda:0')
 
 
1125
  # Load Basic Foundation Models
1126
  for class_name, device in load_dict.items():
1127
  self.models[class_name] = globals()[class_name](device=device)
 
1140
  if e.startswith('inference'):
1141
  func = getattr(instance, e)
1142
  self.tools.append(Tool(name=func.name, description=func.description, func=func))
 
 
 
1143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1144
 
1145
  def find_latest_image(self, file_list):
1146
  res = None
 
1164
  res = file_item[0]
1165
  return res
1166
 
1167
+ def run_task(self, use_voice, text, audio_path, state, user_state):
1168
  if use_voice:
1169
+ state, _, user_state = self.run_audio(audio_path, state, user_state)
1170
  else:
1171
+ state, _, user_state = self.run_text(text, state, user_state)
1172
+ return state, state, user_state
1173
 
1174
  def find_param(self, msg, keyword, excluded=False):
1175
  p1 = re.compile(f'(image/[-\\w]*.(png|mp4))')
 
1186
  res = self.find_latest_image(out_filenames)
1187
  return res
1188
 
1189
+ def rectify_action(self, inputs, history_msg, user_state):
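+ # Fallback used when the normal agent call fails: try to map the request onto a loaded tool directly; if nothing matches, let the LLM answer from the chat history without tools.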
 
1190
  print('Rectify the action.')
1191
  print(inputs)
1192
  func = None
 
1248
  func_inputs = f'{img_path},{prompt}'
1249
  else:
1250
  # raise NotImplementedError('Can not find the matched function.')
1251
+ res = user_state[0]['agent'](f"You can use history message to sanswer this question without using any tools. {inputs}")
1252
  res = res['output'].replace("\\", "/")
1253
 
1254
  print(f'{func_name}: {func_inputs}')
 
1272
 
1273
  return illegal_files
1274
 
1275
+ def run_text(self, text, state, user_state):
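+ # Trim the per-user memory, run the agent on the text, and attach the newest generated image/video (if any) to the chat state; fall back to rectify_action when the agent call fails.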
1276
  if text is None or len(text) == 0:
1277
  state += [(None, 'Please input text.')]
1278
+ return state, state, user_state
1279
+ user_state[0]['agent'].memory.buffer = cut_dialogue_history(user_state[0]['agent'].memory.buffer, keep_last_n_words=500)
1280
  pattern = re.compile('(image/[-\\w]*.(png|mp4))')
1281
  try:
1282
+ response = user_state[0]['agent']({"input": text.strip()})['output']
 
 
 
1283
  response = response.replace("\\", "/")
1284
  out_filenames = pattern.findall(response)
1285
  illegal_files = self.check_illegal_files(out_filenames)
 
1290
  # state += [(text, 'Sorry, I failed to understand your instruction. You can try it again or turn to more powerful language model.')]
1291
  print(f'Error: {err1}')
1292
  try:
1293
+ response = self.rectify_action(text, user_state[0]['agent'].memory.buffer[:], user_state)
1294
  # print('response = ', response)
1295
  out_filenames = pattern.findall(response)
1296
  res = self.find_latest_image(out_filenames)
1297
  # print(out_filenames)
1298
+ user_state[0]['agent'].memory.buffer += f'\nHuman: {text.strip()}\n' + f'AI:{response})'
1299
 
1300
  except Exception as err2:
1301
  print(f'Error: {err2}')
1302
  state += [(text, 'Sorry, I failed to understand your instruction. You can try it again or turn to more powerful language model.')]
1303
+ return state, state, user_state
1304
 
1305
+ if res is not None and user_state[0]['agent'].memory.buffer.count(res) <= 1:
1306
  state = state + [(text, response + f' `{res}` is as follows: ')]
1307
  state = state + [(None, (res, ))]
1308
  else:
1309
  state = state + [(text, response)]
1310
 
1311
  print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
1312
+ f"Current Memory: {user_state[0]['agent'].memory.buffer}")
1313
+ return state, state, user_state
1314
 
1315
+ def run_audio(self, audio_path, state, user_state):
1316
  print(f'audio_path = {audio_path}')
1317
  if audio_path is None or not os.path.exists(audio_path):
1318
  state += [(None, 'No audio input. Please stop recording first and then send the audio.')]
 
1320
  if self.audio_model is None:
1321
  self.audio_model = whisper.load_model("small").to('cuda:0')
1322
  text = self.audio_model.transcribe(audio_path)["text"]
1323
+ res = self.run_text(text, state, user_state)
1324
  print(f"\nProcessed run_audio, Input transcribed audio: {text}\nCurrent state: {state}\n"
1325
+ f"Current Memory: {user_state[0]['agent'].memory.buffer}")
1326
+ return res[0], res[1], res[2]
1327
 
1328
+ def upload_image(self, image, state, user_state):
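+ # Save the uploaded image under image/, caption it with HuskyVQA (plus OCR if loaded), and prime the per-user agent memory with a Human/AI exchange describing the file.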
1329
+ # [txt, click_img, state, user_state], [chatbot, txt, state, user_state]
1330
+ # self.reset()
1331
  print('upload an image')
1332
+ user_state = self.clear_user_state(False, user_state)
1333
  img = image['image']
1334
  image_filename = os.path.join('image', f"{str(uuid.uuid4())[:6]}.png")
1335
+ image_filename = gen_new_name(image_filename, 'image')
 
 
1336
  img.save(image_filename, "PNG")
1337
+ # self.uploaded_image_filename = image_filename
1338
+ user_state[0]['image_path'] = image_filename
1339
+ img = img.convert('RGB')
1340
+
1341
+ image_caption = self.models['HuskyVQA'].inference_captioning(image_filename)
 
 
 
1342
  # description = 'Debug'
1343
+ user_state[0]['image_caption'] = image_caption
1344
 
1345
+ ocr_res = None
1346
+ user_state[0]['ocr_res'] = []
1347
  if 'ImageOCRRecognition' in self.models.keys():
1348
+ ocr_res = self.models['ImageOCRRecognition'].inference(image_filename)
1349
+ ocr_res_raw = self.models['ImageOCRRecognition'].readtext(image_filename)
1350
+ if ocr_res is not None and len(ocr_res) > 0:
1351
+ Human_prompt = f'\nHuman: provide a image named {image_filename}. The description is: {image_caption} OCR result is: {ocr_res}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
1352
+ user_state[0]['ocr_res'] = ocr_res_raw
1353
  else:
1354
+ Human_prompt = f'\nHuman: provide an image named {image_filename}. The description is: {image_caption} This information helps you to understand this image, but you should use tools to finish the following tasks, rather than directly imagining them from my description. If you understand, say \"Received\". \n'
1355
  AI_prompt = "Received. "
1356
+ # self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + ' AI: ' + AI_prompt
1357
+ user_state[0]['agent'].memory.buffer += Human_prompt + 'AI: ' + AI_prompt
1358
  state = state + [(f"![](file={image_filename})*{image_filename}*", AI_prompt)]
1359
  print(f"\nProcessed upload_image, Input image: {image_filename}\nCurrent state: {state}\n"
1360
+ f"Current Memory: {user_state[0]['agent'].memory.buffer}")
 
1361
 
1362
+ return state, state, user_state
1363
+
1364
+ def upload_video(self, video_path, state, user_state):
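+ # Copy the uploaded video into ./image/, caption it with VideoCaption when available, and prime the per-user agent memory with the description.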
1365
+ # self.reset()
1366
  print('upload a video')
1367
+ user_state = self.clear_user_state(False, user_state)
1368
  vid_name = os.path.basename(video_path)
1369
  # vid_name = gen_new_name(vid_name, '', vid_name.split('.')[-1])
1370
  new_video_path = os.path.join('./image/', vid_name)
1371
+ new_video_path = gen_new_name(new_video_path, 'image', vid_name.split('.')[-1])
1372
  shutil.copy(video_path, new_video_path)
1373
 
1374
+ user_state[0]['video_path'] = new_video_path
1375
  if "VideoCaption" in self.models.keys():
1376
  description = self.models['VideoCaption'].inference(new_video_path)
1377
  else:
1378
  description = 'A video.'
1379
+ user_state[0]['video_caption'] = description
1380
Human_prompt = f'\nHuman: provide a video named {new_video_path}. The description is: {description}. This information helps you to understand this video, but you should use tools to finish the following tasks, rather than directly imagining them from my description. If you understand, say \"Received\". \n'
1381
  AI_prompt = f"Received video: {new_video_path} "
1382
+ # self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
1383
+ user_state[0]['agent'].memory.buffer += Human_prompt + 'AI: ' + AI_prompt
1384
+
1385
  state = state + [((new_video_path, ), AI_prompt)]
1386
  # print('exists = ', os.path.exists("./tmp_files/1e7f_f4236666_tmp.mp4"))
1387
  print(f"\nProcessed upload_video, Input video: `{new_video_path}`\nCurrent state: {state}\n"
1388
+ f"Current Memory: {user_state[0]['agent'].memory.buffer}")
1389
+
1390
+ return state, state, user_state
1391
 
1392
  def blend_mask(self, img, mask):
1393
  mask = mask.astype(np.uint8)
 
1403
  # print(new_img_arr.shape)
1404
  return new_img_arr
1405
 
1406
+ def process_seg(self, image, state, user_state):
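+ # Segment the region sketched by the user: cache the SAM image embedding in user_state, predict a mask from the drawn strokes, merge it with any previous mask, save it to disk and report the mask_path to the agent.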
1407
+ Human_prompt="Please process this image based on given mask."
1408
+ if image is None or \
1409
+ user_state[0].get('image_path', None) is None or \
1410
+ not os.path.exists(user_state[0]['image_path']):
1411
+ AI_prompt = "Please upload an image for processing."
1412
+ state += [(Human_prompt, AI_prompt)]
1413
+ return None, state, state, user_state
1414
 
1415
  if 'SegmentAnything' not in self.models.keys():
1416
  state += [(None, 'Please load the segmentation tool.')]
1417
+ return image['image'], state, state, user_state
1418
 
1419
+ img = Image.open(user_state[0]['image_path']).convert('RGB')
1420
+ print(f'user_state[0][\'image_path\'] = {user_state[0]["image_path"]}')
1421
+ img = np.array(img, dtype=np.uint8)
1422
  mask = image['mask'].convert('L')
1423
  mask = np.array(mask, dtype=np.uint8)
1424
+
 
 
 
 
 
1425
  if mask.sum() == 0:
1426
+ AI_prompt = "You can click the image and ask me some questions."
1427
  state += [(Human_prompt, AI_prompt)]
1428
+ return image['image'], state, state, user_state
1429
+
1430
+ # if 'SegmentAnything' in self.models.keys():
1431
+ # self.models['SegmentAnything'].clicked_region = mask
1432
+ if user_state[0].get('features', None) is None:
1433
+ user_state[0]['features'] = self.models['SegmentAnything'].get_image_embedding(img)
1434
+
1435
+ res_mask = self.models['SegmentAnything'].segment_by_mask(mask, user_state[0]['features'])
1436
+
1437
+ if user_state[0].get('seg_mask', None) is not None:
1438
+ res_mask = np.logical_or(user_state[0]['seg_mask'], res_mask)
1439
 
1440
+ res_mask = res_mask.astype(np.uint8)*255
1441
+ user_state[0]['seg_mask'] = res_mask
1442
+ new_img_arr = self.blend_mask(img, res_mask)
 
 
 
 
 
 
1443
  new_img = Image.fromarray(new_img_arr)
1444
+ res_mask_img = Image.fromarray(res_mask).convert('RGB')
1445
+ res_mask_path = gen_new_name(user_state[0]['image_path'], 'mask')
1446
+ res_mask_img.save(res_mask_path)
 
 
1447
  AI_prompt = f"Received. The mask_path is named {res_mask_path}."
1448
+ user_state[0]['agent'].memory.buffer += '\nHuman: ' + Human_prompt + '\nAI: ' + AI_prompt
1449
  # state = state + [(Human_prompt, f"![](file={seg_filename})*{AI_prompt}*")]
1450
state = state + [(Human_prompt, f'Received. The segmented figure named `{res_mask_path}` is as follows: ')]
1451
  state = state + [(None, (res_mask_path, ))]
1452
 
1453
+ print(f"\nProcessed run_image, Input image: `{user_state[0]['image_path']}`\nCurrent state: {state}\n"
1454
+ f"Current Memory: {user_state[0]['agent'].memory.buffer}")
1455
+ return new_img, state, state, user_state
1456
+
1457
+ def process_ocr(self, image, state, user_state):
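+ # Recognize text under the clicked/drawn region by intersecting the mask with the full-image OCR results cached in user_state during upload_image.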
1458
+ Human_prompt="Please process this image based on given mask."
1459
+ if image is None or \
1460
+ user_state[0].get('image_path', None) is None or \
1461
+ not os.path.exists(user_state[0]['image_path']):
1462
+ AI_prompt = "Please upload an image for processing."
1463
+ state += [(Human_prompt, AI_prompt)]
1464
+ return None, state, state, user_state
1465
+
1466
+ img = np.array(image['image'])
1467
  # img[:100+int(time.time() % 50),:100, :] = 0
1468
  img = Image.fromarray(img)
1469
  # img = image['image'].convert('RGB')
 
1471
  # mask.save(f'test_{int(time.time()) % 1000}.png')
1472
  mask = np.array(mask, dtype=np.uint8)
1473
 
 
 
 
 
 
1474
  if mask.sum() == 0:
1475
+ AI_prompt = "You can click the image and ask me some questions."
1476
  state += [(Human_prompt, AI_prompt)]
1477
+ return image['image'], state, state, user_state
1478
 
1479
+ chosen_ocr_res = None
1480
  if 'ImageOCRRecognition' in self.models.keys():
1481
+ # self.models['ImageOCRRecognition'].clicked_region = mask
1482
+ chosen_ocr_res = self.models['ImageOCRRecognition'].get_ocr_by_mask(mask, user_state[0]['ocr_res'])
1483
  else:
1484
state += [(Human_prompt, 'ImageOCRRecognition is not loaded.')]
1485
 
1486
+ if chosen_ocr_res is not None and len(chosen_ocr_res) > 0:
1487
+ AI_prompt = f'OCR result: {chosen_ocr_res}'
1488
  # self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + ' AI: ' + AI_prompt
1489
  else:
1490
  AI_prompt = 'I didn\'t find any optical characters at given location.'
1491
 
1492
  state = state + [(Human_prompt, AI_prompt)]
1493
+ user_state[0]['agent'].memory.buffer += '\nHuman: ' + Human_prompt + '\nAI: ' + AI_prompt
1494
  print(f"\nProcessed process_ocr, Input image: {self.uploaded_image_filename}\nCurrent state: {state}\n"
1495
+ f"Current Memory: {user_state[0]['agent'].memory.buffer}")
1496
+ return image['image'], state, state, user_state
1497
 
1498
+ def process_save(self, image, state, user_state):
1499
  if image is None:
1500
+ return None, state, state, user_state
1501
 
1502
  mask_image = image['mask'].convert('RGB')
1503
  # mask = np.array(mask, dtype=np.uint8)
 
1514
  AI_prompt = f'The saved mask is named {mask_image_name}: '
1515
  state = state + [(Human_prompt, AI_prompt)]
1516
  state = state + [(None, (mask_image_name, ))]
1517
+ user_state[0]['agent'].memory.buffer = user_state[0]['agent'].memory.buffer + Human_prompt + ' AI: ' + AI_prompt
1518
  print(f"\nProcessed process_ocr, Input image: {self.uploaded_image_filename}\nCurrent state: {state}\n"
1519
+ f"Current Memory: {user_state[0]['agent'].memory.buffer}")
1520
+ return image['image'], state, state, user_state
1521
 
 
 
 
 
 
1522
 
1523
+ def clear_user_state(self, clear_memory, user_state):
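+ # Keep the per-user agent (optionally wiping its conversation memory) while dropping cached image/video paths, masks, captions and OCR results.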
1524
+ new_user_state = [{}]
1525
+ new_user_state[0]['agent'] = user_state[0]['agent']
1526
+ new_user_state[0]['memory'] = user_state[0]['memory']
1527
+ if clear_memory:
1528
+ new_user_state[0]['memory'].clear()
 
 
 
 
 
 
1529
  else:
1530
+ new_user_state[0]['memory'] = user_state[0]['memory']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1531
 
1532
+ return new_user_state
 
 
 
 
1533
 
1534
 
1535
  class ImageSketcher(gr.Image):
 
1554
  mask = np.zeros((height, width, 4), dtype=np.uint8)
1555
  mask[..., -1] = 255
1556
  mask = self.postprocess(mask)
 
 
1557
  x['mask'] = mask
1558
  elif not isinstance(x, dict):
1559
  # print(x)
 
1564
  # print(width, height)
1565
  mask = np.zeros((height, width, 4), dtype=np.uint8)
1566
  mask[..., -1] = 255
 
1567
  mask = self.postprocess(mask)
 
1568
  x = {'image': x, 'mask': mask}
1569
  x = super().preprocess(x)
1570
  return x
 
1625
  #image_upload:{align-items: center; min-width: 640px}
1626
  '''
1627
 
1628
+ def resize_800(image):
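+ # Resize so that the longer side becomes 800 px while keeping the aspect ratio.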
1629
+ w, h = image.size
1630
+ if w > h:
1631
+ ratio = w * 1.0 / 800
1632
+ new_w, new_h = 800, int(h * 1.0 / ratio)
1633
+ else:
1634
+ ratio = h * 1.0 / 800
1635
+ new_w, new_h = int(w * 1.0 / ratio), 800
1636
+ image = image.resize((new_w, new_h))
1637
+ return image
1638
+
1639
+ def cut_dialogue_history(history_memory, keep_last_n_words=500):
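+ # Keep roughly the last keep_last_n_words whitespace-separated tokens of the memory buffer by dropping whole lines from the front.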
1640
+ if history_memory is None or len(history_memory) == 0:
1641
+ return history_memory
1642
+ tokens = history_memory.split()
1643
+ n_tokens = len(tokens)
1644
+ print(f"history_memory:{history_memory}, n_tokens: {n_tokens}")
1645
+ if n_tokens < keep_last_n_words:
1646
+ return history_memory
1647
+ paragraphs = history_memory.split('\n')
1648
+ last_n_tokens = n_tokens
1649
+ while last_n_tokens >= keep_last_n_words:
1650
+ last_n_tokens -= len(paragraphs[0].split(' '))
1651
+ paragraphs = paragraphs[1:]
1652
+ return '\n' + '\n'.join(paragraphs)
1653
+
1654
+
1655
+ def login_with_key(bot, debug, api_key):
1656
+ # In --debug mode, skip the API key check and build the agent directly.
1657
+ print('===>logging in')
1658
+ user_state = [{}]
1659
+ is_error = True
1660
+ if debug:
1661
+ user_state = init_agent(bot)
1662
+ return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False, value=''), user_state
1663
+ else:
1664
+ import openai
1665
+ from langchain.llms.openai import OpenAI
1666
+ if api_key and len(api_key) > 30:
1667
+ os.environ["OPENAI_API_KEY"] = api_key
1668
+ openai.api_key = api_key
1669
+ try:
1670
+ llm = OpenAI(temperature=0)
1671
+ llm('Hi!')
1672
+ response = 'Success!'
1673
+ is_error = False
1674
+ user_state = init_agent(bot)
1675
+ except Exception:
1676
+ # gr.update(visible=True)
1677
+ response = 'Incorrect key, please input again'
1678
+ is_error = True
1679
+ else:
1680
+ is_error = True
1681
+ response = 'Incorrect key, please input again'
1682
+
1683
+ return gr.update(visible=not is_error), gr.update(visible=is_error), gr.update(visible=is_error, value=response), user_state
1684
+
1685
+ def init_agent(bot):
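+ # Build a fresh per-user LangChain conversational ReAct agent that shares the globally loaded tools in bot.tools and uses the InternGPT prefix/suffix prompts defined above.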
1686
+ memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
1687
+ llm = OpenAI(temperature=0)
1688
+ agent = initialize_agent(
1689
+ bot.tools,
1690
+ llm,
1691
+ agent="conversational-react-description",
1692
+ verbose=True,
1693
+ memory=memory,
1694
+ return_intermediate_steps=True,
1695
+ agent_kwargs={'prefix': INTERN_CHAT_PREFIX, 'format_instructions': INTERN_CHAT_FORMAT_INSTRUCTIONS,
1696
+ 'suffix': INTERN_CHAT_SUFFIX}, )
1697
+
1698
+ user_state = [{'agent': agent, 'memory': memory}]
1699
+ return user_state
1700
+
1701
  def change_input_type(flag):
1702
  if flag:
1703
  print('Using voice input.')
 
1725
 
1726
  def add_whiteboard():
1727
  # wb = np.ones((1080, 1920, 3), dtype=np.uint8) * 255
 
1728
  wb = np.ones((720, 1280, 3), dtype=np.uint8) * 255
 
1729
  return Image.fromarray(wb)
1730
 
1731
 
 
1734
  parser.add_argument('-p', '--port', type=int, default=7862)
1735
  parser.add_argument('-d', '--debug', action='store_true')
1736
  parser.add_argument('--https', action='store_true')
1737
+ parser.add_argument('--load', type=str, default="HuskyVQA_cuda:0,ImageOCRRecognition_cuda:0,SegmentAnything_cuda:0")
1738
  args = parser.parse_args()
1739
  load_dict = {e.split('_')[0].strip(): e.split('_')[1].strip() for e in args.load.split(',')}
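+ # --load takes comma-separated "ClassName_device" pairs, e.g. (illustrative; requires the corresponding checkpoints and enough GPU memory):
+ #   python app.py --load "HuskyVQA_cuda:0,SegmentAnything_cuda:0,ImageOCRRecognition_cuda:0" -p 7862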
1740
  bot = ConversationBot(load_dict=load_dict)
1741
  # bot.init_agent()
1742
  with gr.Blocks(theme=Seafoam(), css=css) as demo:
1743
  state = gr.State([])
1744
+ # user_state is a list holding one dict. Keys: [agent, memory, image_path, video_path, seg_mask, image_caption, ocr_res, ...]
1745
+ user_state = gr.State([])
1746
+
1747
+ gr.HTML(
1748
+ """
1749
+ <div align='center'> <img src='/file=./assets/gvlab_logo.png' style='height:70px'/> </div>
1750
+ <p align="center"><a href="https://github.com/OpenGVLab/InternGPT"><b>GitHub</b></a>&nbsp;&nbsp;&nbsp; <a href="https://arxiv.org/pdf/2305.05662.pdf"><b>ArXiv</b></a></p>
1751
+ """)
1752
  with gr.Row(visible=True, elem_id='login') as login:
1753
  with gr.Column(scale=0.6, min_width=0) :
1754
  openai_api_key_text = gr.Textbox(
 
1757
  label="OpenAI API Key",
1758
  lines=1,
1759
  type="password").style(container=False)
1760
+ with gr.Column(scale=0.4, min_width=0):
1761
  key_submit_button = gr.Button(value="Please log in with your OpenAI API Key", interactive=True, variant='primary').style(container=False)
1762
 
1763
  with gr.Row(visible=False) as user_interface:
1764
  with gr.Column(scale=0.5, elem_id="text_input") as chat_part:
1765
+ chatbot = gr.Chatbot(elem_id="chatbot", label="InternGPT").style(height=360)
1766
  with gr.Row(visible=True) as input_row:
1767
  with gr.Column(scale=0.8, min_width=0) as text_col:
1768
  txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(
 
1798
  video_input = gr.Video(interactive=True, include_audio=True, elem_id="video_upload").style(height=360)
1799
 
1800
  login_func = partial(login_with_key, bot, args.debug)
1801
+ openai_api_key_text.submit(login_func, [openai_api_key_text], [user_interface, openai_api_key_text, key_submit_button, user_state])
1802
+ key_submit_button.click(login_func, [openai_api_key_text, ], [user_interface, openai_api_key_text, key_submit_button, user_state])
1803
+
 
1804
  txt.submit(
1805
  lambda: gr.update(visible=False), [], [send_btn]).then(
1806
  lambda: gr.update(visible=False), [], [txt]).then(
1807
+ bot.run_text, [txt, state, user_state], [chatbot, state, user_state]).then(
1808
  lambda: gr.update(visible=True), [], [send_btn]
1809
  ).then(lambda: "", None, [txt, ]).then(
1810
  lambda: gr.update(visible=True), [], [txt])
 
1813
  send_btn.click(
1814
  lambda: gr.update(visible=False), [], [send_btn]).then(
1815
  lambda: gr.update(visible=False), [], [txt]).then(
1816
+ bot.run_task, [audio_switch, txt, audio_input, state, user_state], [chatbot, state, user_state]).then(
1817
  lambda: gr.update(visible=True), [], [send_btn]).then(
1818
+ lambda: "", None, [txt, ]).then(
1819
  lambda: gr.update(visible=True), [], [txt]
1820
  )
1821
 
1822
  audio_switch.change(change_input_type, [audio_switch, ], [txt, audio_input])
1823
+ # add_img_example.click(ramdom_image, [], [click_img,]).then(
1824
+ # bot.upload_image, [click_img, state, user_state], [chatbot, state, user_state])
1825
+
1826
  add_img_example.click(ramdom_image, [], [click_img,]).then(
1827
+ lambda: gr.update(visible=False), [], [send_btn]).then(
1828
+ lambda: gr.update(visible=False), [], [txt]).then(
1829
+ lambda: gr.update(visible=False), [], [vis_btn]).then(
1830
+ bot.upload_image, [click_img, state, user_state],
1831
+ [chatbot, state, user_state]).then(
1832
+ lambda: gr.update(visible=True), [], [send_btn]).then(
1833
+ lambda: gr.update(visible=True), [], [txt]).then(
1834
+ lambda: gr.update(visible=True), [], [vis_btn])
1835
 
1836
+ # add_vid_example.click(ramdom_video, [], [video_input,]).then(
1837
+ # bot.upload_video, [video_input, state, user_state], [chatbot, state, user_state])
1838
+
1839
  add_vid_example.click(ramdom_video, [], [video_input,]).then(
1840
+ lambda: gr.update(visible=False), [], [send_btn]).then(
1841
+ lambda: gr.update(visible=False), [], [txt]).then(
1842
+ lambda: gr.update(visible=False), [], [vis_btn]).then(
1843
+ bot.upload_video, [video_input, state, user_state],
1844
+ [chatbot, state, user_state]).then(
1845
+ lambda: gr.update(visible=True), [], [send_btn]).then(
1846
+ lambda: gr.update(visible=True), [], [txt]).then(
1847
+ lambda: gr.update(visible=True), [], [vis_btn])
1848
 
1849
+ whiteboard_mode.click(add_whiteboard, [], [click_img, ])
1850
 
1851
  # click_img.upload(bot.upload_image, [click_img, state, txt], [chatbot, state, txt])
1852
+ click_img.upload(lambda: gr.update(visible=False), [], [send_btn]).then(
1853
+ lambda: gr.update(visible=False), [], [txt]).then(
1854
+ lambda: gr.update(visible=False), [], [vis_btn]).then(
1855
+ bot.upload_image, [click_img, state, user_state],
1856
+ [chatbot, state, user_state]).then(
1857
+ lambda: gr.update(visible=True), [], [send_btn]).then(
1858
+ lambda: gr.update(visible=True), [], [txt]).then(
1859
+ lambda: gr.update(visible=True), [], [vis_btn])
1860
 
 
 
1861
  process_ocr_btn.click(
1862
  lambda: gr.update(visible=False), [], [vis_btn]).then(
1863
+ bot.process_ocr, [click_img, state, user_state], [click_img, chatbot, state, user_state]).then(
1864
  lambda: gr.update(visible=True), [], [vis_btn]
1865
  )
1866
  # process_seg_btn.click(bot.process_seg, [click_img, state], [chatbot, state, click_img])
1867
  process_seg_btn.click(
1868
  lambda: gr.update(visible=False), [], [vis_btn]).then(
1869
+ bot.process_seg, [click_img, state, user_state], [click_img, chatbot, state, user_state]).then(
1870
  lambda: gr.update(visible=True), [], [vis_btn]
1871
  )
1872
  # process_save_btn.click(bot.process_save, [click_img, state], [chatbot, state, click_img])
1873
  process_save_btn.click(
1874
  lambda: gr.update(visible=False), [], [vis_btn]).then(
1875
+ bot.process_save, [click_img, state, user_state], [click_img, chatbot, state, user_state]).then(
1876
  lambda: gr.update(visible=True), [], [vis_btn]
1877
  )
1878
  video_tab.select(process_video_tab, [], [whiteboard_mode, img_example, vid_example])
1879
  img_tab.select(process_image_tab, [], [whiteboard_mode, img_example, vid_example])
1880
  # clear_img_btn.click(bot.reset, [], [click_img])
1881
+ clear_func = partial(bot.clear_user_state, True)
1882
+ clear_btn.click(lambda: None, [], [click_img, ]).then(
1883
  lambda: [], None, state).then(
1884
+ clear_func, [user_state, ], [user_state, ]).then(
1885
  lambda: None, None, chatbot
1886
+ ).then(lambda: '', None, [txt, ])
1887
+ # click_img.upload(bot.reset, None, None)
1888
 
1889
+ # video_input.upload(bot.upload_video, [video_input, state, user_state], [chatbot, state, user_state])
1890
+ video_input.upload(lambda: gr.update(visible=False), [], [send_btn]).then(
1891
+ lambda: gr.update(visible=False), [], [txt]).then(
1892
+ bot.upload_video, [video_input, state, user_state],
1893
+ [chatbot, state, user_state]).then(
1894
+ lambda: gr.update(visible=True), [], [send_btn]).then(
1895
+ lambda: gr.update(visible=True), [], [txt])
1896
+
1897
+ clear_func = partial(bot.clear_user_state, False)
1898
+ video_input.clear(clear_func, [user_state, ], [user_state, ])
1899
+
1900
+ # (More detailed instructions can be found in <a href="https://www.shailab.org.cn">here</a>:</p>
1901
+ gr.HTML(
1902
+ """
1903
+ <body>
1904
+ <p style="font-family:verdana;color:#FF0000";>Tips!!! (More detailed instructions are coming soon): </p>
1905
+ </body>
1906
+ """
1907
+ )
1908
+ gr.Markdown(
1909
+ '''
1910
+ After uploading the image, you can have a **multi-modal dialogue** by sending messages like: `what is in the image?` or `what is the background color of the image?`.
1911
+
1912
+ You can also interactively operate on, edit, or generate the image as follows:
1913
+ - You can click on the image and press the button `Pick` to **visualize the segmented region**, or press the button `OCR` to **recognize the words** at the chosen position;
1914
+ - To **remove the masked region** in the image, you can send a message like: `remove the masked region`;
1915
+ - To **replace the masked region** in the image, you can send a message like: `replace the masked region with {your prompt}`;
1916
+ - To **generate a new image**, you can send a message like: `generate a new image based on its segmentation describing {your prompt}`;
1917
+ - To **create a new image from your scribble**, you can press the button `Whiteboard` and draw on the board below. After drawing, press the button `Save` and send a message like: `generate a new image based on this scribble describing {your prompt}`.
1918
+ '''
1919
+ )
1920
+ gr.HTML(
1921
+ """
1922
+ <body>
1923
+ <p style="font-family:verdana;color:#11AA00";>More features is coming soon. Hope you have fun with our demo!</p>
1924
+ </body>
1925
+ """
1926
+ )
1927
 
1928
  if args.https:
1929
+ demo.queue().launch(server_name="0.0.0.0", ssl_certfile="./certificate/cert.pem", ssl_keyfile="./certificate/key.pem", ssl_verify=False, server_port=args.port)
1930
  else:
1931
  demo.queue().launch(server_name="0.0.0.0", server_port=args.port)
1932
 
iChat/models/grit_src/third_party/CenterNet2/detectron2/model_zoo/__init__.py DELETED
@@ -1,10 +0,0 @@
1
- # Copyright (c) Facebook, Inc. and its affiliates.
2
- """
3
- Model Zoo API for Detectron2: a collection of functions to create common model architectures
4
- listed in `MODEL_ZOO.md <https://github.com/facebookresearch/detectron2/blob/main/MODEL_ZOO.md>`_,
5
- and optionally load their pre-trained weights.
6
- """
7
-
8
- from .model_zoo import get, get_config_file, get_checkpoint_url, get_config
9
-
10
- __all__ = ["get_checkpoint_url", "get", "get_config_file", "get_config"]
 
 
 
 
 
 
 
 
 
 
 
iChat/models/grit_src/third_party/CenterNet2/detectron2/model_zoo/model_zoo.py DELETED
@@ -1,213 +0,0 @@
1
- # Copyright (c) Facebook, Inc. and its affiliates.
2
- import os
3
- from typing import Optional
4
- import pkg_resources
5
- import torch
6
-
7
- from detectron2.checkpoint import DetectionCheckpointer
8
- from detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate
9
- from detectron2.modeling import build_model
10
-
11
-
12
- class _ModelZooUrls(object):
13
- """
14
- Mapping from names to officially released Detectron2 pre-trained models.
15
- """
16
-
17
- S3_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/"
18
-
19
- # format: {config_path.yaml} -> model_id/model_final_{commit}.pkl
20
- CONFIG_PATH_TO_URL_SUFFIX = {
21
- # COCO Detection with Faster R-CNN
22
- "COCO-Detection/faster_rcnn_R_50_C4_1x": "137257644/model_final_721ade.pkl",
23
- "COCO-Detection/faster_rcnn_R_50_DC5_1x": "137847829/model_final_51d356.pkl",
24
- "COCO-Detection/faster_rcnn_R_50_FPN_1x": "137257794/model_final_b275ba.pkl",
25
- "COCO-Detection/faster_rcnn_R_50_C4_3x": "137849393/model_final_f97cb7.pkl",
26
- "COCO-Detection/faster_rcnn_R_50_DC5_3x": "137849425/model_final_68d202.pkl",
27
- "COCO-Detection/faster_rcnn_R_50_FPN_3x": "137849458/model_final_280758.pkl",
28
- "COCO-Detection/faster_rcnn_R_101_C4_3x": "138204752/model_final_298dad.pkl",
29
- "COCO-Detection/faster_rcnn_R_101_DC5_3x": "138204841/model_final_3e0943.pkl",
30
- "COCO-Detection/faster_rcnn_R_101_FPN_3x": "137851257/model_final_f6e8b1.pkl",
31
- "COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x": "139173657/model_final_68b088.pkl",
32
- # COCO Detection with RetinaNet
33
- "COCO-Detection/retinanet_R_50_FPN_1x": "190397773/model_final_bfca0b.pkl",
34
- "COCO-Detection/retinanet_R_50_FPN_3x": "190397829/model_final_5bd44e.pkl",
35
- "COCO-Detection/retinanet_R_101_FPN_3x": "190397697/model_final_971ab9.pkl",
36
- # COCO Detection with RPN and Fast R-CNN
37
- "COCO-Detection/rpn_R_50_C4_1x": "137258005/model_final_450694.pkl",
38
- "COCO-Detection/rpn_R_50_FPN_1x": "137258492/model_final_02ce48.pkl",
39
- "COCO-Detection/fast_rcnn_R_50_FPN_1x": "137635226/model_final_e5f7ce.pkl",
40
- # COCO Instance Segmentation Baselines with Mask R-CNN
41
- "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x": "137259246/model_final_9243eb.pkl",
42
- "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x": "137260150/model_final_4f86c3.pkl",
43
- "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x": "137260431/model_final_a54504.pkl",
44
- "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x": "137849525/model_final_4ce675.pkl",
45
- "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x": "137849551/model_final_84107b.pkl",
46
- "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x": "137849600/model_final_f10217.pkl",
47
- "COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x": "138363239/model_final_a2914c.pkl",
48
- "COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x": "138363294/model_final_0464b7.pkl",
49
- "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x": "138205316/model_final_a3ec72.pkl",
50
- "COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x": "139653917/model_final_2d9806.pkl", # noqa
51
- # New baselines using Large-Scale Jitter and Longer Training Schedule
52
- "new_baselines/mask_rcnn_R_50_FPN_100ep_LSJ": "42047764/model_final_bb69de.pkl",
53
- "new_baselines/mask_rcnn_R_50_FPN_200ep_LSJ": "42047638/model_final_89a8d3.pkl",
54
- "new_baselines/mask_rcnn_R_50_FPN_400ep_LSJ": "42019571/model_final_14d201.pkl",
55
- "new_baselines/mask_rcnn_R_101_FPN_100ep_LSJ": "42025812/model_final_4f7b58.pkl",
56
- "new_baselines/mask_rcnn_R_101_FPN_200ep_LSJ": "42131867/model_final_0bb7ae.pkl",
57
- "new_baselines/mask_rcnn_R_101_FPN_400ep_LSJ": "42073830/model_final_f96b26.pkl",
58
- "new_baselines/mask_rcnn_regnetx_4gf_dds_FPN_100ep_LSJ": "42047771/model_final_b7fbab.pkl", # noqa
59
- "new_baselines/mask_rcnn_regnetx_4gf_dds_FPN_200ep_LSJ": "42132721/model_final_5d87c1.pkl", # noqa
60
- "new_baselines/mask_rcnn_regnetx_4gf_dds_FPN_400ep_LSJ": "42025447/model_final_f1362d.pkl", # noqa
61
- "new_baselines/mask_rcnn_regnety_4gf_dds_FPN_100ep_LSJ": "42047784/model_final_6ba57e.pkl", # noqa
62
- "new_baselines/mask_rcnn_regnety_4gf_dds_FPN_200ep_LSJ": "42047642/model_final_27b9c1.pkl", # noqa
63
- "new_baselines/mask_rcnn_regnety_4gf_dds_FPN_400ep_LSJ": "42045954/model_final_ef3a80.pkl", # noqa
64
- # COCO Person Keypoint Detection Baselines with Keypoint R-CNN
65
- "COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x": "137261548/model_final_04e291.pkl",
66
- "COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x": "137849621/model_final_a6e10b.pkl",
67
- "COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x": "138363331/model_final_997cc7.pkl",
68
- "COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x": "139686956/model_final_5ad38f.pkl",
69
- # COCO Panoptic Segmentation Baselines with Panoptic FPN
70
- "COCO-PanopticSegmentation/panoptic_fpn_R_50_1x": "139514544/model_final_dbfeb4.pkl",
71
- "COCO-PanopticSegmentation/panoptic_fpn_R_50_3x": "139514569/model_final_c10459.pkl",
72
- "COCO-PanopticSegmentation/panoptic_fpn_R_101_3x": "139514519/model_final_cafdb1.pkl",
73
- # LVIS Instance Segmentation Baselines with Mask R-CNN
74
- "LVISv0.5-InstanceSegmentation/mask_rcnn_R_50_FPN_1x": "144219072/model_final_571f7c.pkl", # noqa
75
- "LVISv0.5-InstanceSegmentation/mask_rcnn_R_101_FPN_1x": "144219035/model_final_824ab5.pkl", # noqa
76
- "LVISv0.5-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x": "144219108/model_final_5e3439.pkl", # noqa
77
- # Cityscapes & Pascal VOC Baselines
78
- "Cityscapes/mask_rcnn_R_50_FPN": "142423278/model_final_af9cf5.pkl",
79
- "PascalVOC-Detection/faster_rcnn_R_50_C4": "142202221/model_final_b1acc2.pkl",
80
- # Other Settings
81
- "Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5": "138602867/model_final_65c703.pkl",
82
- "Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5": "144998336/model_final_821d0b.pkl",
83
- "Misc/cascade_mask_rcnn_R_50_FPN_1x": "138602847/model_final_e9d89b.pkl",
84
- "Misc/cascade_mask_rcnn_R_50_FPN_3x": "144998488/model_final_480dd8.pkl",
85
- "Misc/mask_rcnn_R_50_FPN_3x_syncbn": "169527823/model_final_3b3c51.pkl",
86
- "Misc/mask_rcnn_R_50_FPN_3x_gn": "138602888/model_final_dc5d9e.pkl",
87
- "Misc/scratch_mask_rcnn_R_50_FPN_3x_gn": "138602908/model_final_01ca85.pkl",
88
- "Misc/scratch_mask_rcnn_R_50_FPN_9x_gn": "183808979/model_final_da7b4c.pkl",
89
- "Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn": "184226666/model_final_5ce33e.pkl",
90
- "Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x": "139797668/model_final_be35db.pkl",
91
- "Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv": "18131413/model_0039999_e76410.pkl", # noqa
92
- # D1 Comparisons
93
- "Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x": "137781054/model_final_7ab50c.pkl", # noqa
94
- "Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x": "137781281/model_final_62ca52.pkl", # noqa
95
- "Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x": "137781195/model_final_cce136.pkl",
96
- }
97
-
98
- @staticmethod
99
- def query(config_path: str) -> Optional[str]:
100
- """
101
- Args:
102
- config_path: relative config filename
103
- """
104
- name = config_path.replace(".yaml", "").replace(".py", "")
105
- if name in _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX:
106
- suffix = _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX[name]
107
- return _ModelZooUrls.S3_PREFIX + name + "/" + suffix
108
- return None
109
-
110
-
111
- def get_checkpoint_url(config_path):
112
- """
113
- Returns the URL to the model trained using the given config
114
-
115
- Args:
116
- config_path (str): config file name relative to detectron2's "configs/"
117
- directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
118
-
119
- Returns:
120
- str: a URL to the model
121
- """
122
- url = _ModelZooUrls.query(config_path)
123
- if url is None:
124
- raise RuntimeError("Pretrained model for {} is not available!".format(config_path))
125
- return url
126
-
127
-
128
- def get_config_file(config_path):
129
- """
130
- Returns path to a builtin config file.
131
-
132
- Args:
133
- config_path (str): config file name relative to detectron2's "configs/"
134
- directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
135
-
136
- Returns:
137
- str: the real path to the config file.
138
- """
139
- cfg_file = pkg_resources.resource_filename(
140
- "detectron2.model_zoo", os.path.join("configs", config_path)
141
- )
142
- if not os.path.exists(cfg_file):
143
- raise RuntimeError("{} not available in Model Zoo!".format(config_path))
144
- return cfg_file
145
-
146
-
147
- def get_config(config_path, trained: bool = False):
148
- """
149
- Returns a config object for a model in model zoo.
150
-
151
- Args:
152
- config_path (str): config file name relative to detectron2's "configs/"
153
- directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
154
- trained (bool): If True, will set ``MODEL.WEIGHTS`` to trained model zoo weights.
155
- If False, the checkpoint specified in the config file's ``MODEL.WEIGHTS`` is used
156
- instead; this will typically (though not always) initialize a subset of weights using
157
- an ImageNet pre-trained model, while randomly initializing the other weights.
158
-
159
- Returns:
160
- CfgNode or omegaconf.DictConfig: a config object
161
- """
162
- cfg_file = get_config_file(config_path)
163
- if cfg_file.endswith(".yaml"):
164
- cfg = get_cfg()
165
- cfg.merge_from_file(cfg_file)
166
- if trained:
167
- cfg.MODEL.WEIGHTS = get_checkpoint_url(config_path)
168
- return cfg
169
- elif cfg_file.endswith(".py"):
170
- cfg = LazyConfig.load(cfg_file)
171
- if trained:
172
- url = get_checkpoint_url(config_path)
173
- if "train" in cfg and "init_checkpoint" in cfg.train:
174
- cfg.train.init_checkpoint = url
175
- else:
176
- raise NotImplementedError
177
- return cfg
178
-
179
-
180
- def get(config_path, trained: bool = False, device: Optional[str] = None):
181
- """
182
- Get a model specified by relative path under Detectron2's official ``configs/`` directory.
183
-
184
- Args:
185
- config_path (str): config file name relative to detectron2's "configs/"
186
- directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
187
- trained (bool): see :func:`get_config`.
188
- device (str or None): overwrite the device in config, if given.
189
-
190
- Returns:
191
- nn.Module: a detectron2 model. Will be in training mode.
192
-
193
- Example:
194
- ::
195
- from detectron2 import model_zoo
196
- model = model_zoo.get("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml", trained=True)
197
- """
198
- cfg = get_config(config_path, trained)
199
- if device is None and not torch.cuda.is_available():
200
- device = "cpu"
201
- if device is not None and isinstance(cfg, CfgNode):
202
- cfg.MODEL.DEVICE = device
203
-
204
- if isinstance(cfg, CfgNode):
205
- model = build_model(cfg)
206
- DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
207
- else:
208
- model = instantiate(cfg.model)
209
- if device is not None:
210
- model = model.to(device)
211
- if "train" in cfg and "init_checkpoint" in cfg.train:
212
- DetectionCheckpointer(model).load(cfg.train.init_checkpoint)
213
- return model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
{iChat → iGPT}/__init__.py RENAMED
File without changes
{iChat → iGPT}/chatbot/__init__.py RENAMED
File without changes
{iChat → iGPT}/chatbot/chatbot.py RENAMED
File without changes
{iChat → iGPT}/models/__init__.py RENAMED
File without changes
{iChat → iGPT}/models/grit_model.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/configs/Base.yaml RENAMED
File without changes
{iChat → iGPT}/models/grit_src/configs/GRiT_B_DenseCap.yaml RENAMED
File without changes
{iChat → iGPT}/models/grit_src/configs/GRiT_B_DenseCap_ObjectDet.yaml RENAMED
File without changes
{iChat → iGPT}/models/grit_src/configs/GRiT_B_ObjectDet.yaml RENAMED
File without changes
{iChat → iGPT}/models/grit_src/configs/GRiT_H_ObjectDet.yaml RENAMED
File without changes
{iChat → iGPT}/models/grit_src/configs/GRiT_L_ObjectDet.yaml RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/__init__.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/config.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/custom_solver.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/data/custom_build_augmentation.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/data/custom_dataset_dataloader.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/data/custom_dataset_mapper.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/data/datasets/grit_coco.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/data/datasets/object365.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/data/datasets/vg.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/data/transforms/custom_augmentation_impl.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/data/transforms/custom_transform.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/evaluation/eval.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/modeling/backbone/utils.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/modeling/backbone/vit.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/modeling/meta_arch/grit.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/modeling/roi_heads/grit_fast_rcnn.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/modeling/roi_heads/grit_roi_heads.py RENAMED
@@ -16,7 +16,7 @@ from .grit_fast_rcnn import GRiTFastRCNNOutputLayers
 from ..text.text_decoder import TransformerDecoderTextualHead, GRiTTextDecoder, AutoRegressiveBeamSearch
 from ..text.load_text_token import LoadTextTokens
 from transformers import BertTokenizer
-from iChat.models.grit_src.grit.data.custom_dataset_mapper import ObjDescription
+from iGPT.models.grit_src.grit.data.custom_dataset_mapper import ObjDescription
 from ..soft_nms import batched_soft_nms
 
 import logging
{iChat → iGPT}/models/grit_src/grit/modeling/soft_nms.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/modeling/text/file_utils.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/modeling/text/load_text_token.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/modeling/text/modeling_bert.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/modeling/text/text_decoder.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/grit/predictor.py RENAMED
File without changes
{iChat → iGPT}/models/grit_src/image_dense_captions.py RENAMED
@@ -10,7 +10,7 @@ from detectron2.config import get_cfg
 from detectron2.data.detection_utils import read_image
 from detectron2.utils.logger import setup_logger
 
-sys.path.insert(0, 'iChat/models/grit_src/third_party/CenterNet2/projects/CenterNet2/')
+sys.path.insert(0, 'iGPT/models/grit_src/third_party/CenterNet2/projects/CenterNet2/')
 from centernet.config import add_centernet_config
 from ..grit_src.grit.config import add_grit_config
 
@@ -58,7 +58,7 @@ def setup_cfg(args):
     return cfg
 
 def get_parser(device):
-    arg_dict = {'config_file': "iChat/models/grit_src/configs/GRiT_B_DenseCap_ObjectDet.yaml", 'device': device, 'confidence_threshold': 0.5, 'test_task': 'DenseCap', 'opts': ["MODEL.WEIGHTS", "model_zoo/grit_b_densecap_objectdet.pth"]}
+    arg_dict = {'config_file': "iGPT/models/grit_src/configs/GRiT_B_DenseCap_ObjectDet.yaml", 'device': device, 'confidence_threshold': 0.5, 'test_task': 'DenseCap', 'opts': ["MODEL.WEIGHTS", "model_zoo/grit_b_densecap_objectdet.pth"]}
     return arg_dict
 
 def image_caption_api(image_src, device):
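The dense-captioning entry point itself is unchanged; only the hard-coded iChat paths become iGPT paths. A rough usage sketch under the new layout, assuming the repository root is on sys.path, the packages along this dotted path are importable, and the GRiT weights sit in model_zoo/; the image path and device string are placeholders:

from iGPT.models.grit_src.image_dense_captions import image_caption_api

# image_caption_api builds the GRiT DenseCap predictor via get_parser(device)
# and returns the dense-caption output for the input image.
captions = image_caption_api("examples/street.jpg", "cuda:0")  # placeholder path/device
print(captions)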
{iChat → iGPT}/models/grit_src/third_party/CenterNet2/.circleci/config.yml RENAMED
File without changes
{iChat → iGPT}/models/grit_src/third_party/CenterNet2/.clang-format RENAMED
File without changes
{iChat → iGPT}/models/grit_src/third_party/CenterNet2/.flake8 RENAMED
File without changes
{iChat → iGPT}/models/grit_src/third_party/CenterNet2/.gitignore RENAMED
File without changes
{iChat → iGPT}/models/grit_src/third_party/CenterNet2/GETTING_STARTED.md RENAMED
File without changes
{iChat → iGPT}/models/grit_src/third_party/CenterNet2/INSTALL.md RENAMED
File without changes
{iChat → iGPT}/models/grit_src/third_party/CenterNet2/LICENSE RENAMED
File without changes
{iChat → iGPT}/models/grit_src/third_party/CenterNet2/MODEL_ZOO.md RENAMED
File without changes
{iChat → iGPT}/models/grit_src/third_party/CenterNet2/README.md RENAMED
File without changes
{iChat → iGPT}/models/grit_src/third_party/CenterNet2/README_D2.md RENAMED
File without changes
{iChat → iGPT}/models/grit_src/third_party/CenterNet2/configs/Base-RCNN-C4.yaml RENAMED
File without changes