LanHarmony committed on
Commit
53cf806
·
1 Parent(s): 1dc205a

add image2depth and depth2image

Browse files
Files changed (2) hide show
  1. app.py +34 -23
  2. visual_foundation_models.py +109 -77
app.py CHANGED
@@ -52,18 +52,19 @@ import gradio as gr
52
 
53
 
54
  def cut_dialogue_history(history_memory, keep_last_n_words=400):
 
 
55
  tokens = history_memory.split()
56
  n_tokens = len(tokens)
57
- print(f"hitory_memory:{history_memory}, n_tokens: {n_tokens}")
58
  if n_tokens < keep_last_n_words:
59
  return history_memory
60
- else:
61
- paragraphs = history_memory.split('\n')
62
- last_n_tokens = n_tokens
63
- while last_n_tokens >= keep_last_n_words:
64
- last_n_tokens = last_n_tokens - len(paragraphs[0].split(' '))
65
- paragraphs = paragraphs[1:]
66
- return '\n' + '\n'.join(paragraphs)
67
 
68
 
69
  class ConversationBot:
@@ -74,7 +75,6 @@ class ConversationBot:
74
  raise ValueError("You have to load ImageCaptioning as a basic function for VisualChatGPT")
75
 
76
  self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
77
-
78
  self.models = dict()
79
  for class_name, device in load_dict.items():
80
  self.models[class_name] = globals()[class_name](device=device)
@@ -86,7 +86,6 @@ class ConversationBot:
86
  func = getattr(instance, e)
87
  self.tools.append(Tool(name=func.name, description=func.description, func=func))
88
 
89
-
90
  def run_text(self, text, state):
91
  self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
92
  res = self.agent({"input": text})
@@ -98,7 +97,7 @@ class ConversationBot:
98
  return state, state
99
 
100
  def run_image(self, image, state, txt):
101
- image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
102
  print("======>Auto Resize Image...")
103
  img = Image.open(image.name)
104
  width, height = img.size
@@ -111,17 +110,13 @@ class ConversationBot:
111
  img.save(image_filename, "PNG")
112
  print(f"Resize image form {width}x{height} to {width_new}x{height_new}")
113
  description = self.models['ImageCaptioning'].inference(image_filename)
114
- Human_prompt = "\nHuman: provide a figure named {}. The description is: {}. " \
115
- "This information helps you to understand this image, " \
116
- "but you should use tools to finish following tasks, " \
117
- "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(
118
- image_filename, description)
119
  AI_prompt = "Received. "
120
  self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
121
  state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
122
  print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
123
  f"Current Memory: {self.agent.memory.buffer}")
124
- return state, state, txt + ' ' + image_filename + ' '
125
 
126
  def init_agent(self, openai_api_key):
127
  self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
@@ -136,17 +131,25 @@ class ConversationBot:
136
 
137
  return gr.update(visible = True)
138
 
139
- bot = ConversationBot({'Text2Image':'cuda:0',
140
- 'ImageCaptioning':'cuda:0',
141
  'ImageEditing': 'cuda:0',
142
  'VisualQuestionAnswering': 'cuda:0',
143
- 'Image2Canny':'cpu',
144
- 'CannyText2Image':'cuda:0',
145
- 'InstructPix2Pix':'cuda:0'})
 
 
 
146
 
147
  with gr.Blocks(css="#chatbot {overflow:auto; height:500px;}") as demo:
148
  with gr.Row():
149
  gr.Markdown("<h3><center>Visual ChatGPT</center></h3>")
 
 
 
 
 
150
 
151
  with gr.Row():
152
  openai_api_key_textbox = gr.Textbox(
@@ -177,10 +180,18 @@ with gr.Blocks(css="#chatbot {overflow:auto; height:500px;}") as demo:
177
  "Can you use this canny image to generate an oil painting of a dog",
178
  "Make it like water-color painting",
179
  "What is the background color",
180
- "Describe this image"],
 
 
 
181
  inputs=txt
182
  )
183
 
 
 
 
 
 
184
 
185
  openai_api_key_textbox.submit(bot.init_agent, [openai_api_key_textbox], [input_raws])
186
  txt.submit(bot.run_text, [txt, state], [chatbot, state])
 
52
 
53
 
54
  def cut_dialogue_history(history_memory, keep_last_n_words=400):
55
+ if history_memory is None or len(history_memory) == 0:
56
+ return history_memory
57
  tokens = history_memory.split()
58
  n_tokens = len(tokens)
59
+ print(f"history_memory:{history_memory}, n_tokens: {n_tokens}")
60
  if n_tokens < keep_last_n_words:
61
  return history_memory
62
+ paragraphs = history_memory.split('\n')
63
+ last_n_tokens = n_tokens
64
+ while last_n_tokens >= keep_last_n_words:
65
+ last_n_tokens -= len(paragraphs[0].split(' '))
66
+ paragraphs = paragraphs[1:]
67
+ return '\n' + '\n'.join(paragraphs)
 
68
 
69
 
70
  class ConversationBot:
 
75
  raise ValueError("You have to load ImageCaptioning as a basic function for VisualChatGPT")
76
 
77
  self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
 
78
  self.models = dict()
79
  for class_name, device in load_dict.items():
80
  self.models[class_name] = globals()[class_name](device=device)
 
86
  func = getattr(instance, e)
87
  self.tools.append(Tool(name=func.name, description=func.description, func=func))
88
 
 
89
  def run_text(self, text, state):
90
  self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
91
  res = self.agent({"input": text})
 
97
  return state, state
98
 
99
  def run_image(self, image, state, txt):
100
+ image_filename = os.path.join('image', f"{str(uuid.uuid4())[:8]}.png")
101
  print("======>Auto Resize Image...")
102
  img = Image.open(image.name)
103
  width, height = img.size
 
110
  img.save(image_filename, "PNG")
111
  print(f"Resize image form {width}x{height} to {width_new}x{height_new}")
112
  description = self.models['ImageCaptioning'].inference(image_filename)
113
+ Human_prompt = f'\nHuman: provide a figure named {image_filename}. The description is: {description}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
 
 
 
 
114
  AI_prompt = "Received. "
115
  self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
116
  state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
117
  print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
118
  f"Current Memory: {self.agent.memory.buffer}")
119
+ return state, state, f'{txt} {image_filename} '
120
 
121
  def init_agent(self, openai_api_key):
122
  self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
 
131
 
132
  return gr.update(visible = True)
133
 
134
+ bot = ConversationBot({'Text2Image': 'cuda:0',
135
+ 'ImageCaptioning': 'cuda:0',
136
  'ImageEditing': 'cuda:0',
137
  'VisualQuestionAnswering': 'cuda:0',
138
+ 'Image2Canny': 'cpu',
139
+ 'CannyText2Image': 'cuda:0',
140
+ 'InstructPix2Pix': 'cuda:0',
141
+ 'Image2Depth': 'cpu',
142
+ 'DepthText2Image': 'cuda:0',
143
+ })
144
 
145
  with gr.Blocks(css="#chatbot {overflow:auto; height:500px;}") as demo:
146
  with gr.Row():
147
  gr.Markdown("<h3><center>Visual ChatGPT</center></h3>")
148
+ gr.Markdown(
149
+ """This is a demo to the work [Visual ChatGPT: Talking, Drawing and Editing with Visual Foundation Models](https://github.com/microsoft/visual-chatgpt).<br>
150
+ This space connects ChatGPT and a series of Visual Foundation Models to enable sending and receiving images during chatting.<br>
151
+ """
152
+ )
153
 
154
  with gr.Row():
155
  openai_api_key_textbox = gr.Textbox(
 
180
  "Can you use this canny image to generate an oil painting of a dog",
181
  "Make it like water-color painting",
182
  "What is the background color",
183
+ "Describe this image",
184
+ "please detect the depth of this image",
185
+ "Can you use this depth image to generate a cute dog",
186
+ ],
187
  inputs=txt
188
  )
189
 
190
+ gr.HTML('''<br><br><br><center>You can duplicate this Space to skip the queue:
191
+ <a href="https://huggingface.co/spaces/microsoft/visual_chatgpt?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a><br>
192
+ </center>''')
193
+
194
+
195
 
196
  openai_api_key_textbox.submit(bot.init_agent, [openai_api_key_textbox], [input_raws])
197
  txt.submit(bot.run_text, [txt, state], [chatbot, state])
visual_foundation_models.py CHANGED
@@ -44,7 +44,7 @@ def get_new_image_name(org_img_name, func_name="update"):
44
 
45
  class MaskFormer:
46
  def __init__(self, device):
47
- print("Initializing MaskFormer to %s" % device)
48
  self.device = device
49
  self.processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
50
  self.model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined").to(device)
@@ -74,24 +74,27 @@ class MaskFormer:
74
 
75
  class ImageEditing:
76
  def __init__(self, device):
77
- print("Initializing ImageEditing to %s" % device)
78
  self.device = device
79
  self.mask_former = MaskFormer(device=self.device)
80
- self.inpaint = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting", revision="fp16", torch_dtype=torch.float16).to(device)
 
 
 
81
 
82
  @prompts(name="Remove Something From The Photo",
83
  description="useful when you want to remove and object or something from the photo "
84
  "from its description or location. "
85
- "The input to this tool should be a comma seperated string of two, "
86
  "representing the image_path and the object need to be removed. ")
87
  def inference_remove(self, inputs):
88
- image_path, to_be_removed_txt = inputs.split(",")
89
  return self.inference_replace(f"{image_path},{to_be_removed_txt},background")
90
 
91
  @prompts(name="Replace Something From The Photo",
92
  description="useful when you want to replace an object from the object description or "
93
  "location with another object from its description. "
94
- "The input to this tool should be a comma seperated string of three, "
95
  "representing the image_path, the object to be replaced, the object to be replaced with ")
96
  def inference_replace(self, inputs):
97
  image_path, to_be_replaced_txt, replace_with_txt = inputs.split(",")
@@ -111,16 +114,18 @@ class ImageEditing:
111
 
112
  class InstructPix2Pix:
113
  def __init__(self, device):
114
- print("Initializing InstructPix2Pix to %s" % device)
115
  self.device = device
116
- self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix", torch_dtype=torch.float16,
117
- safety_checker=None).to(device)
 
 
118
  self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(self.pipe.scheduler.config)
119
 
120
  @prompts(name="Instruct Image Using Text",
121
  description="useful when you want to the style of the image to be like the text. "
122
  "like: make it look like a painting. or make it like a robot. "
123
- "The input to this tool should be a comma seperated string of two, "
124
  "representing the image_path and the text. ")
125
  def inference(self, inputs):
126
  """Change style of image."""
@@ -163,17 +168,18 @@ class Text2Image:
163
 
164
  class ImageCaptioning:
165
  def __init__(self, device):
166
- print("Initializing ImageCaptioning to %s" % device)
167
  self.device = device
 
168
  self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
169
  self.model = BlipForConditionalGeneration.from_pretrained(
170
- "Salesforce/blip-image-captioning-base", torch_dtype=torch.float16).to(self.device)
171
 
172
  @prompts(name="Get Photo Description",
173
  description="useful when you want to know what is inside the photo. receives image_path as input. "
174
  "The input to this tool should be a string, representing the image_path. ")
175
  def inference(self, image_path):
176
- inputs = self.processor(Image.open(image_path), return_tensors="pt").to(self.device, torch.float16)
177
  out = self.model.generate(**inputs)
178
  captions = self.processor.decode(out[0], skip_special_tokens=True)
179
  print(f"\nProcessed ImageCaptioning, Input Image: {image_path}, Output Text: {captions}")
@@ -206,29 +212,32 @@ class Image2Canny:
206
 
207
  class CannyText2Image:
208
  def __init__(self, device):
209
- print("Initializing CannyText2Image to %s" % device)
210
- self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-canny", torch_dtype=torch.float16)
 
 
211
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
212
- "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, torch_dtype=torch.float16)
 
213
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
214
  self.pipe.to(device)
215
  self.seed = -1
216
  self.a_prompt = 'best quality, extremely detailed'
217
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
218
- 'fewer digits, cropped, worst quality, low quality'
219
 
220
  @prompts(name="Generate Image Condition On Canny Image",
221
- description="useful when you want to generate a new real image from both the user desciption and a canny image."
222
  " like: generate a real image of a object or something from this canny image,"
223
  " or generate a new real image of a object or something from this edge image. "
224
- "The input to this tool should be a comma seperated string of two, "
225
  "representing the image_path and the user description. ")
226
  def inference(self, inputs):
227
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
228
  image = Image.open(image_path)
229
  self.seed = random.randint(0, 65535)
230
  seed_everything(self.seed)
231
- prompt = instruct_text + ', ' + self.a_prompt
232
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
233
  guidance_scale=9.0).images[0]
234
  updated_image_path = get_new_image_name(image_path, func_name="canny2image")
@@ -246,7 +255,7 @@ class Image2Line:
246
  @prompts(name="Line Detection On Image",
247
  description="useful when you want to detect the straight line of the image. "
248
  "like: detect the straight lines of this image, or straight line detection on image, "
249
- "or peform straight line detection on this image, or detect the straight line image of this image. "
250
  "The input to this tool should be a string, representing the image_path")
251
  def inference(self, inputs):
252
  image = Image.open(inputs)
@@ -259,31 +268,34 @@ class Image2Line:
259
 
260
  class LineText2Image:
261
  def __init__(self, device):
262
- print("Initializing LineText2Image to %s" % device)
263
- self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-mlsd")
 
 
264
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
265
- "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None
 
266
  )
267
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
268
  self.pipe.to(device)
269
  self.seed = -1
270
  self.a_prompt = 'best quality, extremely detailed'
271
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
272
- 'fewer digits, cropped, worst quality, low quality'
273
 
274
  @prompts(name="Generate Image Condition On Line Image",
275
- description="useful when you want to generate a new real image from both the user desciption "
276
  "and a straight line image. "
277
  "like: generate a real image of a object or something from this straight line image, "
278
  "or generate a new real image of a object or something from this straight lines. "
279
- "The input to this tool should be a comma seperated string of two, "
280
  "representing the image_path and the user description. ")
281
  def inference(self, inputs):
282
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
283
  image = Image.open(image_path)
284
  self.seed = random.randint(0, 65535)
285
  seed_everything(self.seed)
286
- prompt = instruct_text + ', ' + self.a_prompt
287
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
288
  guidance_scale=9.0).images[0]
289
  updated_image_path = get_new_image_name(image_path, func_name="line2image")
@@ -301,7 +313,7 @@ class Image2Hed:
301
  @prompts(name="Hed Detection On Image",
302
  description="useful when you want to detect the soft hed boundary of the image. "
303
  "like: detect the soft hed boundary of this image, or hed boundary detection on image, "
304
- "or peform hed boundary detection on this image, or detect soft hed boundary image of this image. "
305
  "The input to this tool should be a string, representing the image_path")
306
  def inference(self, inputs):
307
  image = Image.open(inputs)
@@ -314,31 +326,34 @@ class Image2Hed:
314
 
315
  class HedText2Image:
316
  def __init__(self, device):
317
- print("Initializing HedText2Image to %s" % device)
318
- self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-hed")
 
 
319
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
320
- "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None
 
321
  )
322
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
323
  self.pipe.to(device)
324
  self.seed = -1
325
  self.a_prompt = 'best quality, extremely detailed'
326
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
327
- 'fewer digits, cropped, worst quality, low quality'
328
 
329
  @prompts(name="Generate Image Condition On Soft Hed Boundary Image",
330
- description="useful when you want to generate a new real image from both the user desciption "
331
  "and a soft hed boundary image. "
332
  "like: generate a real image of a object or something from this soft hed boundary image, "
333
  "or generate a new real image of a object or something from this hed boundary. "
334
- "The input to this tool should be a comma seperated string of two, "
335
  "representing the image_path and the user description")
336
  def inference(self, inputs):
337
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
338
  image = Image.open(image_path)
339
  self.seed = random.randint(0, 65535)
340
  seed_everything(self.seed)
341
- prompt = instruct_text + ', ' + self.a_prompt
342
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
343
  guidance_scale=9.0).images[0]
344
  updated_image_path = get_new_image_name(image_path, func_name="hed2image")
@@ -369,29 +384,32 @@ class Image2Scribble:
369
 
370
  class ScribbleText2Image:
371
  def __init__(self, device):
372
- print("Initializing ScribbleText2Image to %s" % device)
373
- self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-scribble")
 
 
374
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
375
- "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None
 
376
  )
377
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
378
  self.pipe.to(device)
379
  self.seed = -1
380
  self.a_prompt = 'best quality, extremely detailed'
381
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
382
- 'fewer digits, cropped, worst quality, low quality'
383
 
384
  @prompts(name="Generate Image Condition On Sketch Image",
385
- description="useful when you want to generate a new real image from both the user desciption and "
386
  "a scribble image or a sketch image. "
387
- "The input to this tool should be a comma seperated string of two, "
388
  "representing the image_path and the user description")
389
  def inference(self, inputs):
390
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
391
  image = Image.open(image_path)
392
  self.seed = random.randint(0, 65535)
393
  seed_everything(self.seed)
394
- prompt = instruct_text + ', ' + self.a_prompt
395
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
396
  guidance_scale=9.0).images[0]
397
  updated_image_path = get_new_image_name(image_path, func_name="scribble2image")
@@ -421,10 +439,13 @@ class Image2Pose:
421
 
422
  class PoseText2Image:
423
  def __init__(self, device):
424
- print("Initializing PoseText2Image to %s" % device)
425
- self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-openpose")
 
 
426
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
427
- "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None)
 
428
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
429
  self.pipe.to(device)
430
  self.num_inference_steps = 20
@@ -432,21 +453,21 @@ class PoseText2Image:
432
  self.unconditional_guidance_scale = 9.0
433
  self.a_prompt = 'best quality, extremely detailed'
434
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
435
- ' fewer digits, cropped, worst quality, low quality'
436
 
437
  @prompts(name="Generate Image Condition On Pose Image",
438
- description="useful when you want to generate a new real image from both the user desciption "
439
  "and a human pose image. "
440
  "like: generate a real image of a human from this human pose image, "
441
  "or generate a new real image of a human from this pose. "
442
- "The input to this tool should be a comma seperated string of two, "
443
  "representing the image_path and the user description")
444
  def inference(self, inputs):
445
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
446
  image = Image.open(image_path)
447
  self.seed = random.randint(0, 65535)
448
  seed_everything(self.seed)
449
- prompt = instruct_text + ', ' + self.a_prompt
450
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
451
  guidance_scale=9.0).images[0]
452
  updated_image_path = get_new_image_name(image_path, func_name="pose2image")
@@ -503,7 +524,7 @@ class Image2Seg:
503
  @prompts(name="Segmentation On Image",
504
  description="useful when you want to detect segmentations of the image. "
505
  "like: segment this image, or generate segmentations on this image, "
506
- "or peform segmentation on this image. "
507
  "The input to this tool should be a string, representing the image_path")
508
  def inference(self, inputs):
509
  image = Image.open(inputs)
@@ -525,29 +546,32 @@ class Image2Seg:
525
 
526
  class SegText2Image:
527
  def __init__(self, device):
528
- print("Initializing SegText2Image to %s" % device)
529
- self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-seg")
 
 
530
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
531
- "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None)
 
532
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
533
  self.pipe.to(device)
534
  self.seed = -1
535
  self.a_prompt = 'best quality, extremely detailed'
536
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
537
- ' fewer digits, cropped, worst quality, low quality'
538
 
539
  @prompts(name="Generate Image Condition On Segmentations",
540
- description="useful when you want to generate a new real image from both the user desciption and segmentations. "
541
  "like: generate a real image of a object or something from this segmentation image, "
542
  "or generate a new real image of a object or something from these segmentations. "
543
- "The input to this tool should be a comma seperated string of two, "
544
  "representing the image_path and the user description")
545
  def inference(self, inputs):
546
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
547
  image = Image.open(image_path)
548
  self.seed = random.randint(0, 65535)
549
  seed_everything(self.seed)
550
- prompt = instruct_text + ', ' + self.a_prompt
551
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
552
  guidance_scale=9.0).images[0]
553
  updated_image_path = get_new_image_name(image_path, func_name="segment2image")
@@ -581,29 +605,32 @@ class Image2Depth:
581
 
582
  class DepthText2Image:
583
  def __init__(self, device):
584
- print("Initializing DepthText2Image to %s" % device)
585
- self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-depth")
 
 
586
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
587
- "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None)
 
588
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
589
  self.pipe.to(device)
590
  self.seed = -1
591
  self.a_prompt = 'best quality, extremely detailed'
592
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
593
- ' fewer digits, cropped, worst quality, low quality'
594
 
595
  @prompts(name="Generate Image Condition On Depth",
596
- description="useful when you want to generate a new real image from both the user desciption and depth image. "
597
  "like: generate a real image of a object or something from this depth image, "
598
  "or generate a new real image of a object or something from the depth map. "
599
- "The input to this tool should be a comma seperated string of two, "
600
  "representing the image_path and the user description")
601
  def inference(self, inputs):
602
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
603
  image = Image.open(image_path)
604
  self.seed = random.randint(0, 65535)
605
  seed_everything(self.seed)
606
- prompt = instruct_text + ', ' + self.a_prompt
607
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
608
  guidance_scale=9.0).images[0]
609
  updated_image_path = get_new_image_name(image_path, func_name="depth2image")
@@ -649,29 +676,32 @@ class Image2Normal:
649
 
650
  class NormalText2Image:
651
  def __init__(self, device):
652
- print("Initializing NormalText2Image to %s" % device)
653
- self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-normal")
 
 
654
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
655
- "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None)
 
656
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
657
  self.pipe.to(device)
658
  self.seed = -1
659
  self.a_prompt = 'best quality, extremely detailed'
660
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
661
- ' fewer digits, cropped, worst quality, low quality'
662
 
663
  @prompts(name="Generate Image Condition On Normal Map",
664
- description="useful when you want to generate a new real image from both the user desciption and normal map. "
665
  "like: generate a real image of a object or something from this normal map, "
666
  "or generate a new real image of a object or something from the normal map. "
667
- "The input to this tool should be a comma seperated string of two, "
668
  "representing the image_path and the user description")
669
  def inference(self, inputs):
670
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
671
  image = Image.open(image_path)
672
  self.seed = random.randint(0, 65535)
673
  seed_everything(self.seed)
674
- prompt = instruct_text + ', ' + self.a_prompt
675
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
676
  guidance_scale=9.0).images[0]
677
  updated_image_path = get_new_image_name(image_path, func_name="normal2image")
@@ -683,19 +713,21 @@ class NormalText2Image:
683
 
684
  class VisualQuestionAnswering:
685
  def __init__(self, device):
686
- print("Initializing VisualQuestionAnswering to %s" % device)
 
687
  self.device = device
688
  self.processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
689
- self.model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base", torch_dtype=torch.float16).to(self.device)
 
690
 
691
  @prompts(name="Answer Question About The Image",
692
  description="useful when you need an answer for a question based on an image. "
693
  "like: what is the background color of the last image, how many cats in this figure, what is in this figure. "
694
- "The input to this tool should be a comma seperated string of two, representing the image_path and the question")
695
  def inference(self, inputs):
696
- image_path, question = inputs.split(",")
697
  raw_image = Image.open(image_path).convert('RGB')
698
- inputs = self.processor(raw_image, question, return_tensors="pt").to(self.device, torch.float16)
699
  out = self.model.generate(**inputs)
700
  answer = self.processor.decode(out[0], skip_special_tokens=True)
701
  print(f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, "
 
44
 
45
  class MaskFormer:
46
  def __init__(self, device):
47
+ print(f"Initializing MaskFormer to {device}")
48
  self.device = device
49
  self.processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
50
  self.model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined").to(device)
 
74
 
75
  class ImageEditing:
76
  def __init__(self, device):
77
+ print(f"Initializing ImageEditing to {device}")
78
  self.device = device
79
  self.mask_former = MaskFormer(device=self.device)
80
+ self.revision = 'fp16' if 'cuda' in device else None
81
+ self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
82
+ self.inpaint = StableDiffusionInpaintPipeline.from_pretrained(
83
+ "runwayml/stable-diffusion-inpainting", revision=self.revision, torch_dtype=self.torch_dtype).to(device)
84
 
85
  @prompts(name="Remove Something From The Photo",
86
  description="useful when you want to remove and object or something from the photo "
87
  "from its description or location. "
88
+ "The input to this tool should be a comma separated string of two, "
89
  "representing the image_path and the object need to be removed. ")
90
  def inference_remove(self, inputs):
91
+ image_path, to_be_removed_txt = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
92
  return self.inference_replace(f"{image_path},{to_be_removed_txt},background")
93
 
94
  @prompts(name="Replace Something From The Photo",
95
  description="useful when you want to replace an object from the object description or "
96
  "location with another object from its description. "
97
+ "The input to this tool should be a comma separated string of three, "
98
  "representing the image_path, the object to be replaced, the object to be replaced with ")
99
  def inference_replace(self, inputs):
100
  image_path, to_be_replaced_txt, replace_with_txt = inputs.split(",")
 
114
 
115
  class InstructPix2Pix:
116
  def __init__(self, device):
117
+ print(f"Initializing InstructPix2Pix to {device}")
118
  self.device = device
119
+ self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
120
+ self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix",
121
+ safety_checker=None,
122
+ torch_dtype=self.torch_dtype).to(device)
123
  self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(self.pipe.scheduler.config)
124
 
125
  @prompts(name="Instruct Image Using Text",
126
  description="useful when you want to the style of the image to be like the text. "
127
  "like: make it look like a painting. or make it like a robot. "
128
+ "The input to this tool should be a comma separated string of two, "
129
  "representing the image_path and the text. ")
130
  def inference(self, inputs):
131
  """Change style of image."""
 
168
 
169
  class ImageCaptioning:
170
  def __init__(self, device):
171
+ print(f"Initializing ImageCaptioning to {device}")
172
  self.device = device
173
+ self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
174
  self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
175
  self.model = BlipForConditionalGeneration.from_pretrained(
176
+ "Salesforce/blip-image-captioning-base", torch_dtype=self.torch_dtype).to(self.device)
177
 
178
  @prompts(name="Get Photo Description",
179
  description="useful when you want to know what is inside the photo. receives image_path as input. "
180
  "The input to this tool should be a string, representing the image_path. ")
181
  def inference(self, image_path):
182
+ inputs = self.processor(Image.open(image_path), return_tensors="pt").to(self.device, self.torch_dtype)
183
  out = self.model.generate(**inputs)
184
  captions = self.processor.decode(out[0], skip_special_tokens=True)
185
  print(f"\nProcessed ImageCaptioning, Input Image: {image_path}, Output Text: {captions}")
 
212
 
213
  class CannyText2Image:
214
  def __init__(self, device):
215
+ print(f"Initializing CannyText2Image to {device}")
216
+ self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
217
+ self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-canny",
218
+ torch_dtype=self.torch_dtype)
219
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
220
+ "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
221
+ torch_dtype=self.torch_dtype)
222
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
223
  self.pipe.to(device)
224
  self.seed = -1
225
  self.a_prompt = 'best quality, extremely detailed'
226
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
227
+ 'fewer digits, cropped, worst quality, low quality'
228
 
229
  @prompts(name="Generate Image Condition On Canny Image",
230
+ description="useful when you want to generate a new real image from both the user description and a canny image."
231
  " like: generate a real image of a object or something from this canny image,"
232
  " or generate a new real image of a object or something from this edge image. "
233
+ "The input to this tool should be a comma separated string of two, "
234
  "representing the image_path and the user description. ")
235
  def inference(self, inputs):
236
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
237
  image = Image.open(image_path)
238
  self.seed = random.randint(0, 65535)
239
  seed_everything(self.seed)
240
+ prompt = f'{instruct_text}, {self.a_prompt}'
241
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
242
  guidance_scale=9.0).images[0]
243
  updated_image_path = get_new_image_name(image_path, func_name="canny2image")
 
255
  @prompts(name="Line Detection On Image",
256
  description="useful when you want to detect the straight line of the image. "
257
  "like: detect the straight lines of this image, or straight line detection on image, "
258
+ "or perform straight line detection on this image, or detect the straight line image of this image. "
259
  "The input to this tool should be a string, representing the image_path")
260
  def inference(self, inputs):
261
  image = Image.open(inputs)
 
268
 
269
  class LineText2Image:
270
  def __init__(self, device):
271
+ print(f"Initializing LineText2Image to {device}")
272
+ self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
273
+ self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-mlsd",
274
+ torch_dtype=self.torch_dtype)
275
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
276
+ "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
277
+ torch_dtype=self.torch_dtype
278
  )
279
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
280
  self.pipe.to(device)
281
  self.seed = -1
282
  self.a_prompt = 'best quality, extremely detailed'
283
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
284
+ 'fewer digits, cropped, worst quality, low quality'
285
 
286
  @prompts(name="Generate Image Condition On Line Image",
287
+ description="useful when you want to generate a new real image from both the user description "
288
  "and a straight line image. "
289
  "like: generate a real image of a object or something from this straight line image, "
290
  "or generate a new real image of a object or something from this straight lines. "
291
+ "The input to this tool should be a comma separated string of two, "
292
  "representing the image_path and the user description. ")
293
  def inference(self, inputs):
294
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
295
  image = Image.open(image_path)
296
  self.seed = random.randint(0, 65535)
297
  seed_everything(self.seed)
298
+ prompt = f'{instruct_text}, {self.a_prompt}'
299
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
300
  guidance_scale=9.0).images[0]
301
  updated_image_path = get_new_image_name(image_path, func_name="line2image")
 
313
  @prompts(name="Hed Detection On Image",
314
  description="useful when you want to detect the soft hed boundary of the image. "
315
  "like: detect the soft hed boundary of this image, or hed boundary detection on image, "
316
+ "or perform hed boundary detection on this image, or detect soft hed boundary image of this image. "
317
  "The input to this tool should be a string, representing the image_path")
318
  def inference(self, inputs):
319
  image = Image.open(inputs)
 
326
 
327
  class HedText2Image:
328
  def __init__(self, device):
329
+ print(f"Initializing HedText2Image to {device}")
330
+ self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
331
+ self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-hed",
332
+ torch_dtype=self.torch_dtype)
333
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
334
+ "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
335
+ torch_dtype=self.torch_dtype
336
  )
337
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
338
  self.pipe.to(device)
339
  self.seed = -1
340
  self.a_prompt = 'best quality, extremely detailed'
341
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
342
+ 'fewer digits, cropped, worst quality, low quality'
343
 
344
  @prompts(name="Generate Image Condition On Soft Hed Boundary Image",
345
+ description="useful when you want to generate a new real image from both the user description "
346
  "and a soft hed boundary image. "
347
  "like: generate a real image of a object or something from this soft hed boundary image, "
348
  "or generate a new real image of a object or something from this hed boundary. "
349
+ "The input to this tool should be a comma separated string of two, "
350
  "representing the image_path and the user description")
351
  def inference(self, inputs):
352
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
353
  image = Image.open(image_path)
354
  self.seed = random.randint(0, 65535)
355
  seed_everything(self.seed)
356
+ prompt = f'{instruct_text}, {self.a_prompt}'
357
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
358
  guidance_scale=9.0).images[0]
359
  updated_image_path = get_new_image_name(image_path, func_name="hed2image")
 
384
 
385
  class ScribbleText2Image:
386
  def __init__(self, device):
387
+ print(f"Initializing ScribbleText2Image to {device}")
388
+ self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
389
+ self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-scribble",
390
+ torch_dtype=self.torch_dtype)
391
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
392
+ "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
393
+ torch_dtype=self.torch_dtype
394
  )
395
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
396
  self.pipe.to(device)
397
  self.seed = -1
398
  self.a_prompt = 'best quality, extremely detailed'
399
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
400
+ 'fewer digits, cropped, worst quality, low quality'
401
 
402
  @prompts(name="Generate Image Condition On Sketch Image",
403
+ description="useful when you want to generate a new real image from both the user description and "
404
  "a scribble image or a sketch image. "
405
+ "The input to this tool should be a comma separated string of two, "
406
  "representing the image_path and the user description")
407
  def inference(self, inputs):
408
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
409
  image = Image.open(image_path)
410
  self.seed = random.randint(0, 65535)
411
  seed_everything(self.seed)
412
+ prompt = f'{instruct_text}, {self.a_prompt}'
413
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
414
  guidance_scale=9.0).images[0]
415
  updated_image_path = get_new_image_name(image_path, func_name="scribble2image")
 
439
 
440
  class PoseText2Image:
441
  def __init__(self, device):
442
+ print(f"Initializing PoseText2Image to {device}")
443
+ self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
444
+ self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-openpose",
445
+ torch_dtype=self.torch_dtype)
446
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
447
+ "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
448
+ torch_dtype=self.torch_dtype)
449
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
450
  self.pipe.to(device)
451
  self.num_inference_steps = 20
 
453
  self.unconditional_guidance_scale = 9.0
454
  self.a_prompt = 'best quality, extremely detailed'
455
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
456
+ ' fewer digits, cropped, worst quality, low quality'
457
 
458
  @prompts(name="Generate Image Condition On Pose Image",
459
+ description="useful when you want to generate a new real image from both the user description "
460
  "and a human pose image. "
461
  "like: generate a real image of a human from this human pose image, "
462
  "or generate a new real image of a human from this pose. "
463
+ "The input to this tool should be a comma separated string of two, "
464
  "representing the image_path and the user description")
465
  def inference(self, inputs):
466
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
467
  image = Image.open(image_path)
468
  self.seed = random.randint(0, 65535)
469
  seed_everything(self.seed)
470
+ prompt = f'{instruct_text}, {self.a_prompt}'
471
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
472
  guidance_scale=9.0).images[0]
473
  updated_image_path = get_new_image_name(image_path, func_name="pose2image")
 
524
  @prompts(name="Segmentation On Image",
525
  description="useful when you want to detect segmentations of the image. "
526
  "like: segment this image, or generate segmentations on this image, "
527
+ "or perform segmentation on this image. "
528
  "The input to this tool should be a string, representing the image_path")
529
  def inference(self, inputs):
530
  image = Image.open(inputs)
 
546
 
547
  class SegText2Image:
548
  def __init__(self, device):
549
+ print(f"Initializing SegText2Image to {device}")
550
+ self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
551
+ self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-seg",
552
+ torch_dtype=self.torch_dtype)
553
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
554
+ "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
555
+ torch_dtype=self.torch_dtype)
556
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
557
  self.pipe.to(device)
558
  self.seed = -1
559
  self.a_prompt = 'best quality, extremely detailed'
560
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
561
+ ' fewer digits, cropped, worst quality, low quality'
562
 
563
  @prompts(name="Generate Image Condition On Segmentations",
564
+ description="useful when you want to generate a new real image from both the user description and segmentations. "
565
  "like: generate a real image of a object or something from this segmentation image, "
566
  "or generate a new real image of a object or something from these segmentations. "
567
+ "The input to this tool should be a comma separated string of two, "
568
  "representing the image_path and the user description")
569
  def inference(self, inputs):
570
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
571
  image = Image.open(image_path)
572
  self.seed = random.randint(0, 65535)
573
  seed_everything(self.seed)
574
+ prompt = f'{instruct_text}, {self.a_prompt}'
575
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
576
  guidance_scale=9.0).images[0]
577
  updated_image_path = get_new_image_name(image_path, func_name="segment2image")
 
605
 
606
  class DepthText2Image:
607
  def __init__(self, device):
608
+ print(f"Initializing DepthText2Image to {device}")
609
+ self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
610
+ self.controlnet = ControlNetModel.from_pretrained(
611
+ "fusing/stable-diffusion-v1-5-controlnet-depth", torch_dtype=self.torch_dtype)
612
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
613
+ "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
614
+ torch_dtype=self.torch_dtype)
615
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
616
  self.pipe.to(device)
617
  self.seed = -1
618
  self.a_prompt = 'best quality, extremely detailed'
619
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
620
+ ' fewer digits, cropped, worst quality, low quality'
621
 
622
  @prompts(name="Generate Image Condition On Depth",
623
+ description="useful when you want to generate a new real image from both the user description and depth image. "
624
  "like: generate a real image of a object or something from this depth image, "
625
  "or generate a new real image of a object or something from the depth map. "
626
+ "The input to this tool should be a comma separated string of two, "
627
  "representing the image_path and the user description")
628
  def inference(self, inputs):
629
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
630
  image = Image.open(image_path)
631
  self.seed = random.randint(0, 65535)
632
  seed_everything(self.seed)
633
+ prompt = f'{instruct_text}, {self.a_prompt}'
634
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
635
  guidance_scale=9.0).images[0]
636
  updated_image_path = get_new_image_name(image_path, func_name="depth2image")
 
676
 
677
  class NormalText2Image:
678
  def __init__(self, device):
679
+ print(f"Initializing NormalText2Image to {device}")
680
+ self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
681
+ self.controlnet = ControlNetModel.from_pretrained(
682
+ "fusing/stable-diffusion-v1-5-controlnet-normal", torch_dtype=self.torch_dtype)
683
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
684
+ "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
685
+ torch_dtype=self.torch_dtype)
686
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
687
  self.pipe.to(device)
688
  self.seed = -1
689
  self.a_prompt = 'best quality, extremely detailed'
690
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
691
+ ' fewer digits, cropped, worst quality, low quality'
692
 
693
  @prompts(name="Generate Image Condition On Normal Map",
694
+ description="useful when you want to generate a new real image from both the user description and normal map. "
695
  "like: generate a real image of a object or something from this normal map, "
696
  "or generate a new real image of a object or something from the normal map. "
697
+ "The input to this tool should be a comma separated string of two, "
698
  "representing the image_path and the user description")
699
  def inference(self, inputs):
700
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
701
  image = Image.open(image_path)
702
  self.seed = random.randint(0, 65535)
703
  seed_everything(self.seed)
704
+ prompt = f'{instruct_text}, {self.a_prompt}'
705
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
706
  guidance_scale=9.0).images[0]
707
  updated_image_path = get_new_image_name(image_path, func_name="normal2image")
 
713
 
714
  class VisualQuestionAnswering:
715
  def __init__(self, device):
716
+ print(f"Initializing VisualQuestionAnswering to {device}")
717
+ self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
718
  self.device = device
719
  self.processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
720
+ self.model = BlipForQuestionAnswering.from_pretrained(
721
+ "Salesforce/blip-vqa-base", torch_dtype=self.torch_dtype).to(self.device)
722
 
723
  @prompts(name="Answer Question About The Image",
724
  description="useful when you need an answer for a question based on an image. "
725
  "like: what is the background color of the last image, how many cats in this figure, what is in this figure. "
726
+ "The input to this tool should be a comma separated string of two, representing the image_path and the question")
727
  def inference(self, inputs):
728
+ image_path, question = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
729
  raw_image = Image.open(image_path).convert('RGB')
730
+ inputs = self.processor(raw_image, question, return_tensors="pt").to(self.device, self.torch_dtype)
731
  out = self.model.generate(**inputs)
732
  answer = self.processor.decode(out[0], skip_special_tokens=True)
733
  print(f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, "