LanHarmony committed
Commit
bc147cf
1 Parent(s): 78df1b1
Files changed (2)
  1. app.py +45 -117
  2. visual_foundation_models.py +347 -169
app.py CHANGED
@@ -67,149 +67,77 @@ def cut_dialogue_history(history_memory, keep_last_n_words=400):
67
 
68
 
69
  class ConversationBot:
70
- def __init__(self):
71
- print("Initializing VisualChatGPT")
72
- self.edit = ImageEditing(device="cuda:0")
73
- self.i2t = ImageCaptioning(device="cuda:0")
74
- self.t2i = T2I(device="cuda:0")
75
- self.BLIPVQA = BLIPVQA(device="cuda:0")
76
- self.pix2pix = Pix2Pix(device="cuda:0")
77
- self.image2canny = image2canny()
78
- self.canny2image = canny2image(device="cuda:0")
79
- # self.image2line = image2line()
80
- # self.line2image = line2image(device="cuda:0")
81
- # self.image2hed = image2hed()
82
- # self.hed2image = hed2image(device="cuda:0")
83
- # self.image2scribble = image2scribble()
84
- # self.scribble2image = scribble2image(device="cuda:0")
85
- # self.image2pose = image2pose()
86
- # self.pose2image = pose2image(device="cuda:0")
87
- # self.image2seg = image2seg_new()
88
- # self.seg2image = seg2image_new(device="cuda:0")
89
- # self.image2depth = image2depth_new()
90
- # self.depth2image = depth2image_new(device="cuda:0")
91
- # self.image2normal = image2normal_new()
92
- # self.normal2image = normal2image_new(device="cuda:0")
93
  self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
94
- self.tools = [
95
- Tool(name="Get Photo Description", func=self.i2t.inference,
96
- description="useful for when you want to know what is inside the photo. receives image_path as input. "
97
- "The input to this tool should be a string, representing the image_path. "),
98
- Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
99
- description="useful for when you want to generate an image from a user input text and it saved it to a file. like: generate an image of an object or something, or generate an image that includes some objects. "
100
- "The input to this tool should be a string, representing the text used to generate image. "),
101
- Tool(name="Remove Something From The Photo", func=self.edit.remove_part_of_image,
102
- description="useful for when you want to remove and object or something from the photo from its description or location. "
103
- "The input to this tool should be a comma seperated string of two, representing the image_path and the object need to be removed. "),
104
- Tool(name="Replace Something From The Photo", func=self.edit.replace_part_of_image,
105
- description="useful for when you want to replace an object from the object description or location with another object from its description. "
106
- "The input to this tool should be a comma seperated string of three, representing the image_path, the object to be replaced, the object to be replaced with "),
107
- Tool(name="Instruct Image Using Text", func=self.pix2pix.inference,
108
- description="useful for when you want to the style of the image to be like the text. like: make it look like a painting. or make it like a robot. "
109
- "The input to this tool should be a comma seperated string of two, representing the image_path and the text. "),
110
- Tool(name="Answer Question About The Image", func=self.BLIPVQA.get_answer_from_question_and_image,
111
- description="useful for when you need an answer for a question based on an image. like: what is the background color of the last image, how many cats in this figure, what is in this figure. "
112
- "The input to this tool should be a comma seperated string of two, representing the image_path and the question"),
113
- Tool(name="Edge Detection On Image", func=self.image2canny.inference,
114
- description="useful for when you want to detect the edge of the image. like: detect the edges of this image, or canny detection on image, or peform edge detection on this image, or detect the canny image of this image. "
115
- "The input to this tool should be a string, representing the image_path"),
116
- Tool(name="Generate Image Condition On Canny Image", func=self.canny2image.inference,
117
- description="useful for when you want to generate a new real image from both the user desciption and a canny image. like: generate a real image of a object or something from this canny image, or generate a new real image of a object or something from this edge image. "
118
- "The input to this tool should be a comma seperated string of two, representing the image_path and the user description. "),
119
- # Tool(name="Line Detection On Image", func=self.image2line.inference,
120
- # description="useful for when you want to detect the straight line of the image. like: detect the straight lines of this image, or straight line detection on image, or peform straight line detection on this image, or detect the straight line image of this image. "
121
- # "The input to this tool should be a string, representing the image_path"),
122
- # Tool(name="Generate Image Condition On Line Image", func=self.line2image.inference,
123
- # description="useful for when you want to generate a new real image from both the user desciption and a straight line image. like: generate a real image of a object or something from this straight line image, or generate a new real image of a object or something from this straight lines. "
124
- # "The input to this tool should be a comma seperated string of two, representing the image_path and the user description. "),
125
- # Tool(name="Hed Detection On Image", func=self.image2hed.inference,
126
- # description="useful for when you want to detect the soft hed boundary of the image. like: detect the soft hed boundary of this image, or hed boundary detection on image, or peform hed boundary detection on this image, or detect soft hed boundary image of this image. "
127
- # "The input to this tool should be a string, representing the image_path"),
128
- # Tool(name="Generate Image Condition On Soft Hed Boundary Image", func=self.hed2image.inference,
129
- # description="useful for when you want to generate a new real image from both the user desciption and a soft hed boundary image. like: generate a real image of a object or something from this soft hed boundary image, or generate a new real image of a object or something from this hed boundary. "
130
- # "The input to this tool should be a comma seperated string of two, representing the image_path and the user description"),
131
- # Tool(name="Segmentation On Image", func=self.image2seg.inference,
132
- # description="useful for when you want to detect segmentations of the image. like: segment this image, or generate segmentations on this image, or peform segmentation on this image. "
133
- # "The input to this tool should be a string, representing the image_path"),
134
- # Tool(name="Generate Image Condition On Segmentations", func=self.seg2image.inference,
135
- # description="useful for when you want to generate a new real image from both the user desciption and segmentations. like: generate a real image of a object or something from this segmentation image, or generate a new real image of a object or something from these segmentations. "
136
- # "The input to this tool should be a comma seperated string of two, representing the image_path and the user description"),
137
- # Tool(name="Predict Depth On Image", func=self.image2depth.inference,
138
- # description="useful for when you want to detect depth of the image. like: generate the depth from this image, or detect the depth map on this image, or predict the depth for this image. "
139
- # "The input to this tool should be a string, representing the image_path"),
140
- # Tool(name="Generate Image Condition On Depth", func=self.depth2image.inference,
141
- # description="useful for when you want to generate a new real image from both the user desciption and depth image. like: generate a real image of a object or something from this depth image, or generate a new real image of a object or something from the depth map. "
142
- # "The input to this tool should be a comma seperated string of two, representing the image_path and the user description"),
143
- # Tool(name="Predict Normal Map On Image", func=self.image2normal.inference,
144
- # description="useful for when you want to detect norm map of the image. like: generate normal map from this image, or predict normal map of this image. "
145
- # "The input to this tool should be a string, representing the image_path"),
146
- # Tool(name="Generate Image Condition On Normal Map", func=self.normal2image.inference,
147
- # description="useful for when you want to generate a new real image from both the user desciption and normal map. like: generate a real image of a object or something from this normal map, or generate a new real image of a object or something from the normal map. "
148
- # "The input to this tool should be a comma seperated string of two, representing the image_path and the user description"),
149
- # Tool(name="Sketch Detection On Image", func=self.image2scribble.inference,
150
- # description="useful for when you want to generate a scribble of the image. like: generate a scribble of this image, or generate a sketch from this image, detect the sketch from this image. "
151
- # "The input to this tool should be a string, representing the image_path"),
152
- # Tool(name="Generate Image Condition On Sketch Image", func=self.scribble2image.inference,
153
- # description="useful for when you want to generate a new real image from both the user desciption and a scribble image or a sketch image. "
154
- # "The input to this tool should be a comma seperated string of two, representing the image_path and the user description"),
155
- # Tool(name="Pose Detection On Image", func=self.image2pose.inference,
156
- # description="useful for when you want to detect the human pose of the image. like: generate human poses of this image, or generate a pose image from this image. "
157
- # "The input to this tool should be a string, representing the image_path"),
158
- # Tool(name="Generate Image Condition On Pose Image", func=self.pose2image.inference,
159
- # description="useful for when you want to generate a new real image from both the user desciption and a human pose image. like: generate a real image of a human from this human pose image, or generate a new real image of a human from this pose. "
160
- # "The input to this tool should be a comma seperated string of two, representing the image_path and the user description")
161
- ]
162
 
163
- def init_agent(self, openai_api_key):
164
- self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
165
- self.agent = initialize_agent(
166
- self.tools,
167
- self.llm,
168
- agent="conversational-react-description",
169
- verbose=True,
170
- memory=self.memory,
171
- return_intermediate_steps=True,
172
- agent_kwargs={'prefix': VISUAL_CHATGPT_PREFIX, 'format_instructions': VISUAL_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': VISUAL_CHATGPT_SUFFIX}, )
173
 
174
- return gr.update(visible = True)
175
 
176
  def run_text(self, text, state):
177
- print("===============Running run_text =============")
178
- print("Inputs:", text, state)
179
- print("======>Previous memory:\n %s" % self.agent.memory)
180
- self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=400)
181
  res = self.agent({"input": text})
182
- print("======>Current memory:\n %s" % self.agent.memory)
183
  response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
184
  state = state + [(text, response)]
185
- print("Outputs:", state)
 
186
  return state, state
187
 
188
  def run_image(self, image, state, txt):
189
- print("===============Running run_image =============")
190
- print("Inputs:", image, state)
191
- print("======>Previous memory:\n %s" % self.agent.memory)
192
  image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
193
  print("======>Auto Resize Image...")
194
  img = Image.open(image.name)
195
  width, height = img.size
196
  ratio = min(512 / width, 512 / height)
197
  width_new, height_new = (round(width * ratio), round(height * ratio))
 
 
198
  img = img.resize((width_new, height_new))
199
  img = img.convert('RGB')
200
  img.save(image_filename, "PNG")
201
  print(f"Resize image form {width}x{height} to {width_new}x{height_new}")
202
- description = self.i2t.inference(image_filename)
203
- Human_prompt = "\nHuman: provide a figure named {}. The description is: {}. This information helps you to understand this image, but you should use tools to finish following tasks, " \
204
- "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(image_filename, description)
 
 
 
205
  AI_prompt = "Received. "
206
  self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
207
- print("======>Current memory:\n %s" % self.agent.memory)
208
  state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
209
- print("Outputs:", state)
 
210
  return state, state, txt + ' ' + image_filename + ' '
211
 
212
- bot = ConversationBot()
213
  with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
214
  with gr.Row():
215
  gr.Markdown("<h3><center>Visual ChatGPT</center></h3>")
 
67
 
68
 
69
  class ConversationBot:
70
+ def __init__(self, load_dict):
71
+ # load_dict = {'VisualQuestionAnswering':'cuda:0', 'ImageCaptioning':'cuda:1',...}
72
+ print(f"Initializing VisualChatGPT, load_dict={load_dict}")
73
+ if 'ImageCaptioning' not in load_dict:
74
+ raise ValueError("You have to load ImageCaptioning as a basic function for VisualChatGPT")
75
+
76
  self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
77
 
78
+ self.models = dict()
79
+ for class_name, device in load_dict.items():
80
+ self.models[class_name] = globals()[class_name](device=device)
81
+
82
+ self.tools = []
83
+ for class_name, instance in self.models.items():
84
+ for e in dir(instance):
85
+ if e.startswith('inference'):
86
+ func = getattr(instance, e)
87
+ self.tools.append(Tool(name=func.name, description=func.description, func=func))
88
 
 
89
 
90
  def run_text(self, text, state):
91
+ self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
 
 
 
92
  res = self.agent({"input": text})
93
+ res['output'] = res['output'].replace("\\", "/")
94
  response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
95
  state = state + [(text, response)]
96
+ print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
97
+ f"Current Memory: {self.agent.memory.buffer}")
98
  return state, state
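The `re.sub` call above is what lets the Gradio chatbot display generated pictures: every `image/....png` path in the agent's answer is wrapped in a Markdown image tag followed by the path in italics. A minimal standalone sketch of that substitution; the file name here is made up for illustration:

```python
import re

# Hypothetical agent output that mentions a saved image.
output = "Here is the result: image/3f2a1b9c.png"

# Same idea as in run_text: rewrite each image path into a Markdown image tag
# plus the path in italics, so the chatbot renders the picture inline.
response = re.sub(r'(image/\S*png)',
                  lambda m: f'![](/file={m.group(0)})*{m.group(0)}*',
                  output)

print(response)
# Here is the result: ![](/file=image/3f2a1b9c.png)*image/3f2a1b9c.png*
```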
99
 
100
  def run_image(self, image, state, txt):
 
 
 
101
  image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
102
  print("======>Auto Resize Image...")
103
  img = Image.open(image.name)
104
  width, height = img.size
105
  ratio = min(512 / width, 512 / height)
106
  width_new, height_new = (round(width * ratio), round(height * ratio))
107
+ width_new = int(np.round(width_new / 64.0)) * 64
108
+ height_new = int(np.round(height_new / 64.0)) * 64
109
  img = img.resize((width_new, height_new))
110
  img = img.convert('RGB')
111
  img.save(image_filename, "PNG")
112
  print(f"Resize image form {width}x{height} to {width_new}x{height_new}")
113
+ description = self.models['ImageCaptioning'].inference(image_filename)
114
+ Human_prompt = "\nHuman: provide a figure named {}. The description is: {}. " \
115
+ "This information helps you to understand this image, " \
116
+ "but you should use tools to finish following tasks, " \
117
+ "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(
118
+ image_filename, description)
119
  AI_prompt = "Received. "
120
  self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
 
121
  state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
122
+ print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
123
+ f"Current Memory: {self.agent.memory.buffer}")
124
  return state, state, txt + ' ' + image_filename + ' '
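The two new rounding lines above snap the resized width and height to multiples of 64 after the image has been scaled to fit inside 512x512, which keeps uploads compatible with the diffusion pipelines used later. A small sketch of the same arithmetic, assuming a hypothetical 1024x683 upload:

```python
import numpy as np

width, height = 1024, 683                      # hypothetical upload size
ratio = min(512 / width, 512 / height)         # fit the longer side into 512
width_new, height_new = round(width * ratio), round(height * ratio)

# Snap both sides to the nearest multiple of 64, as in run_image.
width_new = int(np.round(width_new / 64.0)) * 64
height_new = int(np.round(height_new / 64.0)) * 64

print(width_new, height_new)                   # 512 320
```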
125
 
126
+ def init_agent(self, openai_api_key):
127
+ self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
128
+ self.agent = initialize_agent(
129
+ self.tools,
130
+ self.llm,
131
+ agent="conversational-react-description",
132
+ verbose=True,
133
+ memory=self.memory,
134
+ return_intermediate_steps=True,
135
+ agent_kwargs={'prefix': VISUAL_CHATGPT_PREFIX, 'format_instructions': VISUAL_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': VISUAL_CHATGPT_SUFFIX}, )
136
+
137
+ return gr.update(visible = True)
138
+
139
+ bot = ConversationBot({'Text2Image':'cuda:0', 'ImageCaptioning':'cuda:0',})
140
+
141
  with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
142
  with gr.Row():
143
  gr.Markdown("<h3><center>Visual ChatGPT</center></h3>")
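Taken together, the rewritten constructor instantiates only the classes named in `load_dict` (looking each class up with `globals()`) and then scans every instance for methods whose names start with `inference`, registering each one as a LangChain `Tool` using the `name` and `description` attributes attached by the `prompts` decorator in `visual_foundation_models.py`. A minimal, self-contained sketch of that discovery pattern; `DummyCaptioning` and the `Tool` dataclass are stand-ins used only so the snippet runs without the real models or LangChain installed:

```python
from dataclasses import dataclass
from typing import Callable

@dataclass
class Tool:                       # stand-in for langchain.agents.Tool
    name: str
    description: str
    func: Callable

class DummyCaptioning:            # hypothetical model class
    def __init__(self, device):
        self.device = device

    def inference(self, image_path):
        return f"a caption for {image_path}"

# Metadata that the @prompts decorator would normally attach to the method.
DummyCaptioning.inference.name = "Get Photo Description"
DummyCaptioning.inference.description = "useful when you want to know what is inside the photo."

load_dict = {'DummyCaptioning': 'cpu'}    # class name -> device, as in ConversationBot(load_dict)
models = {name: globals()[name](device=device) for name, device in load_dict.items()}

tools = []
for instance in models.values():
    for attr in dir(instance):
        if attr.startswith('inference'):  # also matches inference_remove, inference_replace, ...
            func = getattr(instance, attr)
            tools.append(Tool(name=func.name, description=func.description, func=func))

print([tool.name for tool in tools])      # ['Get Photo Description']
```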
visual_foundation_models.py CHANGED
@@ -16,6 +16,14 @@ from PIL import Image
16
  import numpy as np
17
  from pytorch_lightning import seed_everything
18
19
  def get_new_image_name(org_img_name, func_name="update"):
20
  head_tail = os.path.split(org_img_name)
21
  head = head_tail[0]
@@ -36,9 +44,10 @@ def get_new_image_name(org_img_name, func_name="update"):
36
 
37
  class MaskFormer:
38
  def __init__(self, device):
 
39
  self.device = device
40
- self.processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined", torch_dtype=torch.float16)
41
- self.model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined", torch_dtype=torch.float16).to(device)
42
 
43
  def inference(self, image_path, text):
44
  threshold = 0.5
@@ -46,7 +55,7 @@ class MaskFormer:
46
  padding = 20
47
  original_image = Image.open(image_path)
48
  image = original_image.resize((512, 512))
49
- inputs = self.processor(text=text, images=image, padding="max_length", return_tensors="pt",).to(self.device)
50
  with torch.no_grad():
51
  outputs = self.model(**inputs)
52
  mask = torch.sigmoid(outputs[0]).squeeze().cpu().numpy() > threshold
@@ -62,87 +71,126 @@ class MaskFormer:
62
  image_mask = Image.fromarray(visual_mask)
63
  return image_mask.resize(original_image.size)
64
 
 
65
  class ImageEditing:
66
  def __init__(self, device):
67
- print("Initializing StableDiffusionInpaint to %s" % device)
68
  self.device = device
69
  self.mask_former = MaskFormer(device=self.device)
70
- self.inpainting = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16).to(device)
71
-
72
- def remove_part_of_image(self, input):
73
- image_path, to_be_removed_txt = input.split(",")
74
- print(f'remove_part_of_image: to_be_removed {to_be_removed_txt}')
75
- return self.replace_part_of_image(f"{image_path},{to_be_removed_txt},background")
76
-
77
- def replace_part_of_image(self, input):
78
- image_path, to_be_replaced_txt, replace_with_txt = input.split(",")
79
- print(f'replace_part_of_image: replace_with_txt {replace_with_txt}')
80
  original_image = Image.open(image_path)
81
  original_size = original_image.size
82
  mask_image = self.mask_former.inference(image_path, to_be_replaced_txt)
83
- updated_image = self.inpainting(prompt=replace_with_txt, image=original_image.resize((512,512)), mask_image=mask_image.resize((512,512))).images[0]
 
84
  updated_image_path = get_new_image_name(image_path, func_name="replace-something")
85
  updated_image = updated_image.resize(original_size)
86
  updated_image.save(updated_image_path)
 
 
 
87
  return updated_image_path
88
 
89
- class Pix2Pix:
 
90
  def __init__(self, device):
91
- print("Initializing Pix2Pix to %s" % device)
92
  self.device = device
93
- self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix", torch_dtype=torch.float16, safety_checker=None).to(device)
 
94
  self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(self.pipe.scheduler.config)
95
96
  def inference(self, inputs):
97
  """Change style of image."""
98
- print("===>Starting Pix2Pix Inference")
99
- image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
100
  original_image = Image.open(image_path)
101
- image = self.pipe(instruct_text,image=original_image,num_inference_steps=40,image_guidance_scale=1.2,).images[0]
102
  updated_image_path = get_new_image_name(image_path, func_name="pix2pix")
103
  image.save(updated_image_path)
 
 
104
  return updated_image_path
105
 
106
- class T2I:
 
107
  def __init__(self, device):
108
- print("Initializing T2I to %s" % device)
109
  self.device = device
110
- self.pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
111
- self.text_refine_tokenizer = AutoTokenizer.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion", torch_dtype=torch.float16)
112
- self.text_refine_model = AutoModelForCausalLM.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion", torch_dtype=torch.float16)
113
- self.text_refine_gpt2_pipe = pipeline("text-generation", model=self.text_refine_model, tokenizer=self.text_refine_tokenizer, device=self.device, torch_dtype=torch.float16)
 
114
  self.pipe.to(device)
115
116
  def inference(self, text):
117
  image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
118
  refined_text = self.text_refine_gpt2_pipe(text)[0]["generated_text"]
119
- print(f'{text} refined to {refined_text}')
120
  image = self.pipe(refined_text).images[0]
121
  image.save(image_filename)
122
- print(f"Processed T2I.run, text: {text}, image_filename: {image_filename}")
 
123
  return image_filename
124
 
 
125
  class ImageCaptioning:
126
  def __init__(self, device):
127
  print("Initializing ImageCaptioning to %s" % device)
128
  self.device = device
129
- self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base", torch_dtype=torch.float16)
130
- self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", torch_dtype=torch.float16).to(self.device)
 
131
132
  def inference(self, image_path):
133
  inputs = self.processor(Image.open(image_path), return_tensors="pt").to(self.device)
134
  out = self.model.generate(**inputs)
135
  captions = self.processor.decode(out[0], skip_special_tokens=True)
 
136
  return captions
137
 
138
- class image2canny:
139
- def __init__(self):
140
- print("Direct detect canny.")
 
141
  self.low_threshold = 100
142
  self.high_threshold = 200
143
 
 
 
 
 
 
144
  def inference(self, inputs):
145
- print("===>Starting image2canny Inference")
146
  image = Image.open(inputs)
147
  image = np.array(image)
148
  canny = cv2.Canny(image, self.low_threshold, self.high_threshold)
@@ -151,227 +199,311 @@ class image2canny:
151
  canny = Image.fromarray(canny)
152
  updated_image_path = get_new_image_name(inputs, func_name="edge")
153
  canny.save(updated_image_path)
 
154
  return updated_image_path
155
 
156
- class canny2image:
 
157
  def __init__(self, device):
158
- self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-canny", torch_dtype=torch.float16)
 
159
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
160
- "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, torch_dtype=torch.float16
161
- )
162
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
163
  self.pipe.to(device)
164
  self.seed = -1
165
  self.a_prompt = 'best quality, extremely detailed'
166
- self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
167
-
168
  def inference(self, inputs):
169
- print("===>Starting canny2image Inference")
170
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
171
  image = Image.open(image_path)
172
  self.seed = random.randint(0, 65535)
173
  seed_everything(self.seed)
174
  prompt = instruct_text + ', ' + self.a_prompt
175
- image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, guidance_scale=9.0).images[0]
 
176
  updated_image_path = get_new_image_name(image_path, func_name="canny2image")
177
  image.save(updated_image_path)
 
 
178
  return updated_image_path
179
 
180
- class image2line:
181
- def __init__(self):
 
 
182
  self.detector = MLSDdetector.from_pretrained('lllyasviel/ControlNet')
183
184
  def inference(self, inputs):
185
- print("===>Starting image2line Inference")
186
  image = Image.open(inputs)
187
  mlsd = self.detector(image)
188
  updated_image_path = get_new_image_name(inputs, func_name="line-of")
189
  mlsd.save(updated_image_path)
 
190
  return updated_image_path
191
 
192
- class line2image:
 
193
  def __init__(self, device):
194
- print("Initialize the line2image model...")
195
- self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-mlsd", torch_dtype=torch.float16)
196
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
197
- "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, torch_dtype=torch.float16
198
  )
199
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
200
  self.pipe.to(device)
201
  self.seed = -1
202
  self.a_prompt = 'best quality, extremely detailed'
203
- self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
204
-
 
 
 
 
 
 
 
 
205
  def inference(self, inputs):
206
- print("===>Starting line2image Inference")
207
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
208
  image = Image.open(image_path)
209
  self.seed = random.randint(0, 65535)
210
  seed_everything(self.seed)
211
  prompt = instruct_text + ', ' + self.a_prompt
212
- image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, guidance_scale=9.0).images[0]
 
213
  updated_image_path = get_new_image_name(image_path, func_name="line2image")
214
  image.save(updated_image_path)
 
 
215
  return updated_image_path
216
 
217
- class image2hed:
218
- def __init__(self):
219
- print("Direct detect soft HED boundary...")
 
220
  self.detector = HEDdetector.from_pretrained('lllyasviel/ControlNet')
221
 
 
 
 
 
 
222
  def inference(self, inputs):
223
- print("===>Starting image2hed Inference")
224
  image = Image.open(inputs)
225
  hed = self.detector(image)
226
  updated_image_path = get_new_image_name(inputs, func_name="hed-boundary")
227
  hed.save(updated_image_path)
 
228
  return updated_image_path
229
 
230
- class hed2image:
 
231
  def __init__(self, device):
232
- print("Initialize the hed2image model...")
233
- self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-hed", torch_dtype=torch.float16)
234
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
235
- "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, torch_dtype=torch.float16
236
  )
237
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
238
  self.pipe.to(device)
239
  self.seed = -1
240
  self.a_prompt = 'best quality, extremely detailed'
241
- self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
242
-
 
 
 
 
 
 
 
 
243
  def inference(self, inputs):
244
- print("===>Starting hed2image Inference")
245
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
246
  image = Image.open(image_path)
247
  self.seed = random.randint(0, 65535)
248
  seed_everything(self.seed)
249
  prompt = instruct_text + ', ' + self.a_prompt
250
- image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, guidance_scale=9.0).images[0]
 
251
  updated_image_path = get_new_image_name(image_path, func_name="hed2image")
252
  image.save(updated_image_path)
 
 
253
  return updated_image_path
254
 
255
- class image2scribble:
256
- def __init__(self):
257
- print("Direct detect scribble.")
 
258
  self.detector = HEDdetector.from_pretrained('lllyasviel/ControlNet')
259
 
 
 
 
 
 
260
  def inference(self, inputs):
261
- print("===>Starting image2scribble Inference")
262
  image = Image.open(inputs)
263
  scribble = self.detector(image, scribble=True)
264
  updated_image_path = get_new_image_name(inputs, func_name="scribble")
265
  scribble.save(updated_image_path)
 
266
  return updated_image_path
267
 
268
- class scribble2image:
 
269
  def __init__(self, device):
270
- self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-scribble", torch_dtype=torch.float16)
 
271
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
272
- "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, torch_dtype=torch.float16
273
  )
274
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
275
  self.pipe.to(device)
276
  self.seed = -1
277
  self.a_prompt = 'best quality, extremely detailed'
278
- self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
279
-
 
 
 
 
 
 
280
  def inference(self, inputs):
281
- print("===>Starting scribble2image Inference")
282
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
283
  image = Image.open(image_path)
284
  self.seed = random.randint(0, 65535)
285
  seed_everything(self.seed)
286
  prompt = instruct_text + ', ' + self.a_prompt
287
- image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,guidance_scale=9.0).images[0]
 
288
  updated_image_path = get_new_image_name(image_path, func_name="scribble2image")
289
  image.save(updated_image_path)
 
 
290
  return updated_image_path
291
 
292
- class image2pose:
293
- def __init__(self):
 
 
294
  self.detector = OpenposeDetector.from_pretrained('lllyasviel/ControlNet')
295
 
 
 
 
 
296
  def inference(self, inputs):
297
- print("===>Starting image2pose Inference")
298
  image = Image.open(inputs)
299
  pose = self.detector(image)
300
  updated_image_path = get_new_image_name(inputs, func_name="human-pose")
301
  pose.save(updated_image_path)
 
302
  return updated_image_path
303
 
304
- class pose2image:
 
305
  def __init__(self, device):
306
- self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-openpose", torch_dtype=torch.float16)
 
307
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
308
- "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, torch_dtype=torch.float16
309
- )
310
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
311
  self.pipe.to(device)
312
  self.num_inference_steps = 20
313
  self.seed = -1
314
  self.unconditional_guidance_scale = 9.0
315
  self.a_prompt = 'best quality, extremely detailed'
316
- self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
317
-
 
 
 
 
 
 
 
 
318
  def inference(self, inputs):
319
- print("===>Starting pose2image Inference")
320
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
321
  image = Image.open(image_path)
322
  self.seed = random.randint(0, 65535)
323
  seed_everything(self.seed)
324
  prompt = instruct_text + ', ' + self.a_prompt
325
- image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, guidance_scale=9.0).images[0]
 
326
  updated_image_path = get_new_image_name(image_path, func_name="pose2image")
327
  image.save(updated_image_path)
 
 
328
  return updated_image_path
329
 
330
- class image2seg:
331
- def __init__(self):
332
- print("Initialize image2segmentation Inference")
 
333
  self.image_processor = AutoImageProcessor.from_pretrained("openmmlab/upernet-convnext-small")
334
  self.image_segmentor = UperNetForSemanticSegmentation.from_pretrained("openmmlab/upernet-convnext-small")
335
-
336
  self.ade_palette = [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50],
337
- [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255],
338
- [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7],
339
- [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82],
340
- [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3],
341
- [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255],
342
- [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220],
343
- [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224],
344
- [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255],
345
- [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7],
346
- [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153],
347
- [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255],
348
- [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0],
349
- [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255],
350
- [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255],
351
- [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255],
352
- [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0],
353
- [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0],
354
- [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255],
355
- [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255],
356
- [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20],
357
- [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255],
358
- [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255],
359
- [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255],
360
- [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0],
361
- [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0],
362
- [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255],
363
- [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112],
364
- [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160],
365
- [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163],
366
- [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0],
367
- [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0],
368
- [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255],
369
- [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204],
370
- [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255],
371
- [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255],
372
- [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194],
373
- [102, 255, 0], [92, 0, 255]]
374
-
 
 
 
 
 
375
  def inference(self, inputs):
376
  image = Image.open(inputs)
377
  pixel_values = self.image_processor(image, return_tensors="pt").pixel_values
@@ -386,37 +518,53 @@ class image2seg:
386
  segmentation = Image.fromarray(color_seg)
387
  updated_image_path = get_new_image_name(inputs, func_name="segmentation")
388
  segmentation.save(updated_image_path)
 
389
  return updated_image_path
390
 
391
- class seg2image:
 
392
  def __init__(self, device):
393
- self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-seg", torch_dtype=torch.float16)
 
394
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
395
- "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, torch_dtype=torch.float16
396
- )
397
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
398
  self.pipe.to(device)
399
  self.seed = -1
400
  self.a_prompt = 'best quality, extremely detailed'
401
- self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
402
-
 
 
 
 
 
 
 
403
  def inference(self, inputs):
404
- print("===>Starting seg2image Inference")
405
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
406
  image = Image.open(image_path)
407
  self.seed = random.randint(0, 65535)
408
  seed_everything(self.seed)
409
  prompt = instruct_text + ', ' + self.a_prompt
410
- image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, guidance_scale=9.0).images[0]
 
411
  updated_image_path = get_new_image_name(image_path, func_name="segment2image")
412
  image.save(updated_image_path)
 
 
413
  return updated_image_path
414
 
415
- class image2depth:
416
- def __init__(self):
417
- print("initialize depth estimation")
 
418
  self.depth_estimator = pipeline('depth-estimation')
419
 
 
 
 
 
420
  def inference(self, inputs):
421
  image = Image.open(inputs)
422
  depth = self.depth_estimator(image)['depth']
@@ -426,38 +574,54 @@ class image2depth:
426
  depth = Image.fromarray(depth)
427
  updated_image_path = get_new_image_name(inputs, func_name="depth")
428
  depth.save(updated_image_path)
 
429
  return updated_image_path
430
 
431
- class depth2image:
 
432
  def __init__(self, device):
433
- self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-depth", torch_dtype=torch.float16)
 
434
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
435
- "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, torch_dtype=torch.float16
436
- )
437
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
438
  self.pipe.to(device)
439
  self.seed = -1
440
  self.a_prompt = 'best quality, extremely detailed'
441
- self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
442
-
 
 
 
 
 
 
 
443
  def inference(self, inputs):
444
- print("===>Starting depth2image Inference")
445
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
446
  image = Image.open(image_path)
447
  self.seed = random.randint(0, 65535)
448
  seed_everything(self.seed)
449
  prompt = instruct_text + ', ' + self.a_prompt
450
- image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, guidance_scale=9.0).images[0]
 
451
  updated_image_path = get_new_image_name(image_path, func_name="depth2image")
452
  image.save(updated_image_path)
 
 
453
  return updated_image_path
454
 
455
- class image2normal:
456
- def __init__(self):
457
- print("normal estimation")
 
458
  self.depth_estimator = pipeline("depth-estimation", model="Intel/dpt-hybrid-midas")
459
  self.bg_threhold = 0.4
460
 
 
 
 
 
461
  def inference(self, inputs):
462
  image = Image.open(inputs)
463
  original_size = image.size
@@ -466,13 +630,10 @@ class image2normal:
466
  image_depth = image.copy()
467
  image_depth -= np.min(image_depth)
468
  image_depth /= np.max(image_depth)
469
-
470
  x = cv2.Sobel(image, cv2.CV_32F, 1, 0, ksize=3)
471
  x[image_depth < self.bg_threhold] = 0
472
-
473
  y = cv2.Sobel(image, cv2.CV_32F, 0, 1, ksize=3)
474
  y[image_depth < self.bg_threhold] = 0
475
-
476
  z = np.ones_like(x) * np.pi * 2.0
477
  image = np.stack([x, y, z], axis=2)
478
  image /= np.sum(image ** 2.0, axis=2, keepdims=True) ** 0.5
@@ -481,44 +642,61 @@ class image2normal:
481
  image = image.resize(original_size)
482
  updated_image_path = get_new_image_name(inputs, func_name="normal-map")
483
  image.save(updated_image_path)
 
484
  return updated_image_path
485
 
486
- class normal2image:
 
487
  def __init__(self, device):
488
- self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-normal", torch_dtype=torch.float16)
 
489
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
490
- "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, torch_dtype=torch.float16
491
- )
492
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
493
  self.pipe.to(device)
494
  self.seed = -1
495
  self.a_prompt = 'best quality, extremely detailed'
496
- self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
497
-
 
 
 
 
 
 
 
498
  def inference(self, inputs):
499
- print("===>Starting normal2image Inference")
500
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
501
  image = Image.open(image_path)
502
  self.seed = random.randint(0, 65535)
503
  seed_everything(self.seed)
504
  prompt = instruct_text + ', ' + self.a_prompt
505
- image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, guidance_scale=9.0).images[0]
 
506
  updated_image_path = get_new_image_name(image_path, func_name="normal2image")
507
  image.save(updated_image_path)
 
 
508
  return updated_image_path
509
 
510
- class BLIPVQA:
 
511
  def __init__(self, device):
512
- print("Initializing BLIP VQA to %s" % device)
513
  self.device = device
514
- self.processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base", torch_dtype=torch.float16)
515
- self.model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base", torch_dtype=torch.float16).to(self.device)
516
 
517
- def get_answer_from_question_and_image(self, inputs):
 
 
 
 
518
  image_path, question = inputs.split(",")
519
  raw_image = Image.open(image_path).convert('RGB')
520
- print(F'BLIPVQA :question :{question}')
521
  inputs = self.processor(raw_image, question, return_tensors="pt").to(self.device)
522
  out = self.model.generate(**inputs)
523
  answer = self.processor.decode(out[0], skip_special_tokens=True)
 
 
524
  return answer
 
16
  import numpy as np
17
  from pytorch_lightning import seed_everything
18
 
19
+ def prompts(name, description):
20
+ def decorator(func):
21
+ func.name = name
22
+ func.description = description
23
+ return func
24
+
25
+ return decorator
26
+
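The `prompts` decorator added above is deliberately small: it only attaches a `name` and a `description` attribute to the decorated method, leaving its behaviour untouched, so that `app.py` can later register it as a tool. A quick illustrative check; the decorated function here is hypothetical:

```python
def prompts(name, description):
    def decorator(func):
        func.name = name
        func.description = description
        return func
    return decorator

@prompts(name="Example Tool",
         description="useful when you want a placeholder to test tool registration.")
def example_inference(inputs):
    return f"processed {inputs}"

# The function still behaves exactly as before; only metadata is added.
print(example_inference("image/demo.png"))   # processed image/demo.png
print(example_inference.name)                # Example Tool
print(example_inference.description)         # useful when you want a placeholder ...
```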
27
  def get_new_image_name(org_img_name, func_name="update"):
28
  head_tail = os.path.split(org_img_name)
29
  head = head_tail[0]
 
44
 
45
  class MaskFormer:
46
  def __init__(self, device):
47
+ print("Initializing MaskFormer to %s" % device)
48
  self.device = device
49
+ self.processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
50
+ self.model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined").to(device)
51
 
52
  def inference(self, image_path, text):
53
  threshold = 0.5
 
55
  padding = 20
56
  original_image = Image.open(image_path)
57
  image = original_image.resize((512, 512))
58
+ inputs = self.processor(text=text, images=image, padding="max_length", return_tensors="pt").to(self.device)
59
  with torch.no_grad():
60
  outputs = self.model(**inputs)
61
  mask = torch.sigmoid(outputs[0]).squeeze().cpu().numpy() > threshold
 
71
  image_mask = Image.fromarray(visual_mask)
72
  return image_mask.resize(original_image.size)
73
 
74
+
75
  class ImageEditing:
76
  def __init__(self, device):
77
+ print("Initializing ImageEditing to %s" % device)
78
  self.device = device
79
  self.mask_former = MaskFormer(device=self.device)
80
+ self.inpaint = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting").to(device)
81
+
82
+ @prompts(name="Remove Something From The Photo",
83
+ description="useful when you want to remove and object or something from the photo "
84
+ "from its description or location. "
85
+ "The input to this tool should be a comma seperated string of two, "
86
+ "representing the image_path and the object need to be removed. ")
87
+ def inference_remove(self, inputs):
88
+ image_path, to_be_removed_txt = inputs.split(",")
89
+ return self.inference_replace(f"{image_path},{to_be_removed_txt},background")
90
+
91
+ @prompts(name="Replace Something From The Photo",
92
+ description="useful when you want to replace an object from the object description or "
93
+ "location with another object from its description. "
94
+ "The input to this tool should be a comma seperated string of three, "
95
+ "representing the image_path, the object to be replaced, the object to be replaced with ")
96
+ def inference_replace(self, inputs):
97
+ image_path, to_be_replaced_txt, replace_with_txt = inputs.split(",")
98
  original_image = Image.open(image_path)
99
  original_size = original_image.size
100
  mask_image = self.mask_former.inference(image_path, to_be_replaced_txt)
101
+ updated_image = self.inpaint(prompt=replace_with_txt, image=original_image.resize((512, 512)),
102
+ mask_image=mask_image.resize((512, 512))).images[0]
103
  updated_image_path = get_new_image_name(image_path, func_name="replace-something")
104
  updated_image = updated_image.resize(original_size)
105
  updated_image.save(updated_image_path)
106
+ print(
107
+ f"\nProcessed ImageEditing, Input Image: {image_path}, Replace {to_be_replaced_txt} to {replace_with_txt}, "
108
+ f"Output Image: {updated_image_path}")
109
  return updated_image_path
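Both editing tools take a single comma-separated string, as their descriptions say: `inference_remove` expects `image_path,object`, and `inference_replace` expects `image_path,object to replace,replacement`, with removal implemented as replacement by "background". A small sketch of just that input handling (the file name is hypothetical; the real methods then run the CLIPSeg mask and Stable Diffusion inpainting shown above):

```python
# Three-field input in the format inference_replace expects.
inputs = "image/3f2a1b9c.png,the red car,a blue bicycle"
image_path, to_be_replaced_txt, replace_with_txt = inputs.split(",")
print(image_path, "|", to_be_replaced_txt, "|", replace_with_txt)

# inference_remove rewrites its two-field input into the three-field form,
# substituting "background" as the replacement prompt.
remove_inputs = "image/3f2a1b9c.png,the red car"
image_path, to_be_removed_txt = remove_inputs.split(",")
print(f"{image_path},{to_be_removed_txt},background")
```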
110
 
111
+
112
+ class InstructPix2Pix:
113
  def __init__(self, device):
114
+ print("Initializing InstructPix2Pix to %s" % device)
115
  self.device = device
116
+ self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix",
117
+ safety_checker=None).to(device)
118
  self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(self.pipe.scheduler.config)
119
 
120
+ @prompts(name="Instruct Image Using Text",
121
+ description="useful when you want to the style of the image to be like the text. "
122
+ "like: make it look like a painting. or make it like a robot. "
123
+ "The input to this tool should be a comma seperated string of two, "
124
+ "representing the image_path and the text. ")
125
  def inference(self, inputs):
126
  """Change style of image."""
127
+ print("===>Starting InstructPix2Pix Inference")
128
+ image_path, text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
129
  original_image = Image.open(image_path)
130
+ image = self.pipe(text, image=original_image, num_inference_steps=40, image_guidance_scale=1.2).images[0]
131
  updated_image_path = get_new_image_name(image_path, func_name="pix2pix")
132
  image.save(updated_image_path)
133
+ print(f"\nProcessed InstructPix2Pix, Input Image: {image_path}, Instruct Text: {text}, "
134
+ f"Output Image: {updated_image_path}")
135
  return updated_image_path
136
 
137
+
138
+ class Text2Image:
139
  def __init__(self, device):
140
+ print("Initializing Text2Image to %s" % device)
141
  self.device = device
142
+ self.pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
143
+ self.text_refine_tokenizer = AutoTokenizer.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion")
144
+ self.text_refine_model = AutoModelForCausalLM.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion")
145
+ self.text_refine_gpt2_pipe = pipeline("text-generation", model=self.text_refine_model,
146
+ tokenizer=self.text_refine_tokenizer, device=self.device)
147
  self.pipe.to(device)
148
 
149
+ @prompts(name="Generate Image From User Input Text",
150
+ description="useful when you want to generate an image from a user input text and save it to a file. "
151
+ "like: generate an image of an object or something, or generate an image that includes some objects. "
152
+ "The input to this tool should be a string, representing the text used to generate image. ")
153
  def inference(self, text):
154
  image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
155
  refined_text = self.text_refine_gpt2_pipe(text)[0]["generated_text"]
 
156
  image = self.pipe(refined_text).images[0]
157
  image.save(image_filename)
158
+ print(
159
+ f"\nProcessed Text2Image, Input Text: {text}, Refined Text: {refined_text}, Output Image: {image_filename}")
160
  return image_filename
161
 
162
+
163
  class ImageCaptioning:
164
  def __init__(self, device):
165
  print("Initializing ImageCaptioning to %s" % device)
166
  self.device = device
167
+ self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
168
+ self.model = BlipForConditionalGeneration.from_pretrained(
169
+ "Salesforce/blip-image-captioning-base").to(self.device)
170
 
171
+ @prompts(name="Get Photo Description",
172
+ description="useful when you want to know what is inside the photo. receives image_path as input. "
173
+ "The input to this tool should be a string, representing the image_path. ")
174
  def inference(self, image_path):
175
  inputs = self.processor(Image.open(image_path), return_tensors="pt").to(self.device)
176
  out = self.model.generate(**inputs)
177
  captions = self.processor.decode(out[0], skip_special_tokens=True)
178
+ print(f"\nProcessed ImageCaptioning, Input Image: {image_path}, Output Text: {captions}")
179
  return captions
180
 
181
+
182
+ class Image2Canny:
183
+ def __init__(self, device):
184
+ print("Initializing Image2Canny")
185
  self.low_threshold = 100
186
  self.high_threshold = 200
187
 
188
+ @prompts(name="Edge Detection On Image",
189
+ description="useful when you want to detect the edge of the image. "
190
+ "like: detect the edges of this image, or canny detection on image, "
191
+ "or perform edge detection on this image, or detect the canny image of this image. "
192
+ "The input to this tool should be a string, representing the image_path")
193
  def inference(self, inputs):
 
194
  image = Image.open(inputs)
195
  image = np.array(image)
196
  canny = cv2.Canny(image, self.low_threshold, self.high_threshold)
 
199
  canny = Image.fromarray(canny)
200
  updated_image_path = get_new_image_name(inputs, func_name="edge")
201
  canny.save(updated_image_path)
202
+ print(f"\nProcessed Image2Canny, Input Image: {inputs}, Output Text: {updated_image_path}")
203
  return updated_image_path
204
 
205
+
206
+ class CannyText2Image:
207
  def __init__(self, device):
208
+ print("Initializing CannyText2Image to %s" % device)
209
+ self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-canny")
210
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
211
+ "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None)
 
212
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
213
  self.pipe.to(device)
214
  self.seed = -1
215
  self.a_prompt = 'best quality, extremely detailed'
216
+ self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
217
+ 'fewer digits, cropped, worst quality, low quality'
218
+
219
+ @prompts(name="Generate Image Condition On Canny Image",
220
+ description="useful when you want to generate a new real image from both the user desciption and a canny image."
221
+ " like: generate a real image of a object or something from this canny image,"
222
+ " or generate a new real image of a object or something from this edge image. "
223
+ "The input to this tool should be a comma seperated string of two, "
224
+ "representing the image_path and the user description. ")
225
  def inference(self, inputs):
 
226
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
227
  image = Image.open(image_path)
228
  self.seed = random.randint(0, 65535)
229
  seed_everything(self.seed)
230
  prompt = instruct_text + ', ' + self.a_prompt
231
+ image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
232
+ guidance_scale=9.0).images[0]
233
  updated_image_path = get_new_image_name(image_path, func_name="canny2image")
234
  image.save(updated_image_path)
235
+ print(f"\nProcessed CannyText2Image, Input Canny: {image_path}, Input Text: {instruct_text}, "
236
+ f"Output Text: {updated_image_path}")
237
  return updated_image_path
238
 
239
+
240
+ class Image2Line:
241
+ def __init__(self, device):
242
+ print("Initializing Image2Line")
243
  self.detector = MLSDdetector.from_pretrained('lllyasviel/ControlNet')
244
 
245
+ @prompts(name="Line Detection On Image",
246
+ description="useful when you want to detect the straight line of the image. "
247
+ "like: detect the straight lines of this image, or straight line detection on image, "
248
+ "or peform straight line detection on this image, or detect the straight line image of this image. "
249
+ "The input to this tool should be a string, representing the image_path")
250
  def inference(self, inputs):
 
251
  image = Image.open(inputs)
252
  mlsd = self.detector(image)
253
  updated_image_path = get_new_image_name(inputs, func_name="line-of")
254
  mlsd.save(updated_image_path)
255
+ print(f"\nProcessed Image2Line, Input Image: {inputs}, Output Line: {updated_image_path}")
256
  return updated_image_path
257
 
258
+
259
+ class LineText2Image:
260
  def __init__(self, device):
261
+ print("Initializing LineText2Image to %s" % device)
262
+ self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-mlsd")
263
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
264
+ "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None
265
  )
266
  self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
267
  self.pipe.to(device)
268
  self.seed = -1
269
  self.a_prompt = 'best quality, extremely detailed'
270
+ self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
271
+ 'fewer digits, cropped, worst quality, low quality'
272
+
273
+ @prompts(name="Generate Image Condition On Line Image",
274
+ description="useful when you want to generate a new real image from both the user desciption "
275
+ "and a straight line image. "
276
+ "like: generate a real image of a object or something from this straight line image, "
277
+ "or generate a new real image of a object or something from this straight lines. "
278
+ "The input to this tool should be a comma seperated string of two, "
279
+ "representing the image_path and the user description. ")
280
  def inference(self, inputs):
 
281
  image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
282
  image = Image.open(image_path)
283
  self.seed = random.randint(0, 65535)
284
  seed_everything(self.seed)
285
  prompt = instruct_text + ', ' + self.a_prompt
286
+ image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
287
+ guidance_scale=9.0).images[0]
288
  updated_image_path = get_new_image_name(image_path, func_name="line2image")
289
  image.save(updated_image_path)
290
+ print(f"\nProcessed LineText2Image, Input Line: {image_path}, Input Text: {instruct_text}, "
291
+ f"Output Text: {updated_image_path}")
292
  return updated_image_path
293
 
294
+
295
+ class Image2Hed:
296
+ def __init__(self, device):
297
+ print("Initializing Image2Hed")
298
  self.detector = HEDdetector.from_pretrained('lllyasviel/ControlNet')
299
 
300
+ @prompts(name="Hed Detection On Image",
301
+ description="useful when you want to detect the soft hed boundary of the image. "
302
+ "like: detect the soft hed boundary of this image, or hed boundary detection on image, "
303
+ "or peform hed boundary detection on this image, or detect soft hed boundary image of this image. "
304
+ "The input to this tool should be a string, representing the image_path")
305
  def inference(self, inputs):
 
306
  image = Image.open(inputs)
307
  hed = self.detector(image)
308
  updated_image_path = get_new_image_name(inputs, func_name="hed-boundary")
309
  hed.save(updated_image_path)
310
+ print(f"\nProcessed Image2Hed, Input Image: {inputs}, Output Hed: {updated_image_path}")
311
  return updated_image_path
312
 
313
+
314
+class HedText2Image:
     def __init__(self, device):
+        print("Initializing HedText2Image to %s" % device)
+        self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-hed")
         self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None
         )
         self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
         self.pipe.to(device)
         self.seed = -1
         self.a_prompt = 'best quality, extremely detailed'
+        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
+                        'fewer digits, cropped, worst quality, low quality'
+
+    @prompts(name="Generate Image Condition On Soft Hed Boundary Image",
+             description="useful when you want to generate a new real image from both the user description "
+                         "and a soft hed boundary image. "
+                         "like: generate a real image of an object or something from this soft hed boundary image, "
+                         "or generate a new real image of an object or something from this hed boundary. "
+                         "The input to this tool should be a comma separated string of two, "
+                         "representing the image_path and the user description")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
         prompt = instruct_text + ', ' + self.a_prompt
+        image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
+                          guidance_scale=9.0).images[0]
         updated_image_path = get_new_image_name(image_path, func_name="hed2image")
         image.save(updated_image_path)
+        print(f"\nProcessed HedText2Image, Input Hed: {image_path}, Input Text: {instruct_text}, "
+              f"Output Image: {updated_image_path}")
         return updated_image_path
 
+
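Each detector/generator pair is chained through file paths: the detection tool writes an intermediate image, and the matching ControlNet tool consumes that path plus a text instruction in a single comma-separated string. A rough usage sketch, where the device string and file names are illustrative and not from the commit:

# Illustrative only: the paths and device below are made up for the example.
image2hed = Image2Hed(device="cuda:0")
hed2image = HedText2Image(device="cuda:0")

hed_path = image2hed.inference("image/example.png")                        # writes the HED boundary map
result_path = hed2image.inference(f"{hed_path}, a cozy wooden cabin at night")
print(result_path)                                                          # path of the generated image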
+class Image2Scribble:
+    def __init__(self, device):
+        print("Initializing Image2Scribble")
         self.detector = HEDdetector.from_pretrained('lllyasviel/ControlNet')
 
+    @prompts(name="Sketch Detection On Image",
+             description="useful when you want to generate a scribble of the image. "
+                         "like: generate a scribble of this image, or generate a sketch from this image, "
+                         "or detect the sketch from this image. "
+                         "The input to this tool should be a string, representing the image_path")
     def inference(self, inputs):
         image = Image.open(inputs)
         scribble = self.detector(image, scribble=True)
         updated_image_path = get_new_image_name(inputs, func_name="scribble")
         scribble.save(updated_image_path)
+        print(f"\nProcessed Image2Scribble, Input Image: {inputs}, Output Scribble: {updated_image_path}")
         return updated_image_path
 
+
+class ScribbleText2Image:
     def __init__(self, device):
+        print("Initializing ScribbleText2Image to %s" % device)
+        self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-scribble")
         self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None
         )
         self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
         self.pipe.to(device)
         self.seed = -1
         self.a_prompt = 'best quality, extremely detailed'
+        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
+                        'fewer digits, cropped, worst quality, low quality'
+
+    @prompts(name="Generate Image Condition On Sketch Image",
+             description="useful when you want to generate a new real image from both the user description and "
+                         "a scribble image or a sketch image. "
+                         "The input to this tool should be a comma separated string of two, "
+                         "representing the image_path and the user description")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
         prompt = instruct_text + ', ' + self.a_prompt
+        image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
+                          guidance_scale=9.0).images[0]
         updated_image_path = get_new_image_name(image_path, func_name="scribble2image")
         image.save(updated_image_path)
+        print(f"\nProcessed ScribbleText2Image, Input Scribble: {image_path}, Input Text: {instruct_text}, "
+              f"Output Image: {updated_image_path}")
         return updated_image_path
 
+
+class Image2Pose:
+    def __init__(self, device):
+        print("Initializing Image2Pose")
         self.detector = OpenposeDetector.from_pretrained('lllyasviel/ControlNet')
 
+    @prompts(name="Pose Detection On Image",
+             description="useful when you want to detect the human pose of the image. "
+                         "like: generate human poses of this image, or generate a pose image from this image. "
+                         "The input to this tool should be a string, representing the image_path")
     def inference(self, inputs):
         image = Image.open(inputs)
         pose = self.detector(image)
         updated_image_path = get_new_image_name(inputs, func_name="human-pose")
         pose.save(updated_image_path)
+        print(f"\nProcessed Image2Pose, Input Image: {inputs}, Output Pose: {updated_image_path}")
         return updated_image_path
 
+
+class PoseText2Image:
     def __init__(self, device):
+        print("Initializing PoseText2Image to %s" % device)
+        self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-openpose")
         self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None)
         self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
         self.pipe.to(device)
         self.num_inference_steps = 20
         self.seed = -1
         self.unconditional_guidance_scale = 9.0
         self.a_prompt = 'best quality, extremely detailed'
+        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
+                        ' fewer digits, cropped, worst quality, low quality'
+
+    @prompts(name="Generate Image Condition On Pose Image",
+             description="useful when you want to generate a new real image from both the user description "
+                         "and a human pose image. "
+                         "like: generate a real image of a human from this human pose image, "
+                         "or generate a new real image of a human from this pose. "
+                         "The input to this tool should be a comma separated string of two, "
+                         "representing the image_path and the user description")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
         prompt = instruct_text + ', ' + self.a_prompt
+        image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
+                          guidance_scale=9.0).images[0]
         updated_image_path = get_new_image_name(image_path, func_name="pose2image")
         image.save(updated_image_path)
+        print(f"\nProcessed PoseText2Image, Input Pose: {image_path}, Input Text: {instruct_text}, "
+              f"Output Image: {updated_image_path}")
         return updated_image_path
 
+
+class Image2Seg:
+    def __init__(self, device):
+        print("Initializing Image2Seg")
         self.image_processor = AutoImageProcessor.from_pretrained("openmmlab/upernet-convnext-small")
         self.image_segmentor = UperNetForSemanticSegmentation.from_pretrained("openmmlab/upernet-convnext-small")
         self.ade_palette = [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50],
+                            [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255],
+                            [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7],
+                            [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82],
+                            [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3],
+                            [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255],
+                            [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220],
+                            [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224],
+                            [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255],
+                            [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7],
+                            [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153],
+                            [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255],
+                            [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0],
+                            [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255],
+                            [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255],
+                            [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255],
+                            [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0],
+                            [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0],
+                            [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255],
+                            [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255],
+                            [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20],
+                            [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255],
+                            [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255],
+                            [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255],
+                            [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0],
+                            [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0],
+                            [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255],
+                            [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112],
+                            [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160],
+                            [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163],
+                            [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0],
+                            [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0],
+                            [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255],
+                            [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204],
+                            [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255],
+                            [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255],
+                            [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194],
+                            [102, 255, 0], [92, 0, 255]]
+
+    @prompts(name="Segmentation On Image",
+             description="useful when you want to detect segmentations of the image. "
+                         "like: segment this image, or generate segmentations on this image, "
+                         "or perform segmentation on this image. "
+                         "The input to this tool should be a string, representing the image_path")
     def inference(self, inputs):
         image = Image.open(inputs)
         pixel_values = self.image_processor(image, return_tensors="pt").pixel_values
 
         segmentation = Image.fromarray(color_seg)
         updated_image_path = get_new_image_name(inputs, func_name="segmentation")
         segmentation.save(updated_image_path)
+        print(f"\nProcessed Image2Seg, Input Image: {inputs}, Output Seg: {updated_image_path}")
         return updated_image_path
 
+
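The middle of Image2Seg.inference, between the pixel_values line and Image.fromarray(color_seg), falls outside this hunk. The ade_palette above is the standard ADE20K color table, and the usual way such a palette is applied to UperNet output looks roughly like this (an illustrative sketch of the general technique, not the commit's omitted lines):

# Illustrative sketch only: how a class-id map is typically colored with the ADE20K palette.
import numpy as np
import torch

with torch.no_grad():
    outputs = self.image_segmentor(pixel_values)
seg = self.image_processor.post_process_semantic_segmentation(
    outputs, target_sizes=[image.size[::-1]])[0]           # (H, W) map of class ids
color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
for label, color in enumerate(self.ade_palette):
    color_seg[seg == label, :] = color                      # paint each class with its palette color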
+class SegText2Image:
     def __init__(self, device):
+        print("Initializing SegText2Image to %s" % device)
+        self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-seg")
         self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None)
         self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
         self.pipe.to(device)
         self.seed = -1
         self.a_prompt = 'best quality, extremely detailed'
+        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
+                        ' fewer digits, cropped, worst quality, low quality'
+
+    @prompts(name="Generate Image Condition On Segmentations",
+             description="useful when you want to generate a new real image from both the user description and segmentations. "
+                         "like: generate a real image of an object or something from this segmentation image, "
+                         "or generate a new real image of an object or something from these segmentations. "
+                         "The input to this tool should be a comma separated string of two, "
+                         "representing the image_path and the user description")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
         prompt = instruct_text + ', ' + self.a_prompt
+        image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
+                          guidance_scale=9.0).images[0]
         updated_image_path = get_new_image_name(image_path, func_name="segment2image")
         image.save(updated_image_path)
+        print(f"\nProcessed SegText2Image, Input Seg: {image_path}, Input Text: {instruct_text}, "
+              f"Output Image: {updated_image_path}")
         return updated_image_path
 
+
+class Image2Depth:
+    def __init__(self, device):
+        print("Initializing Image2Depth")
         self.depth_estimator = pipeline('depth-estimation')
 
+    @prompts(name="Predict Depth On Image",
+             description="useful when you want to detect depth of the image. like: generate the depth from this image, "
+                         "or detect the depth map on this image, or predict the depth for this image. "
+                         "The input to this tool should be a string, representing the image_path")
     def inference(self, inputs):
         image = Image.open(inputs)
         depth = self.depth_estimator(image)['depth']
 
         depth = Image.fromarray(depth)
         updated_image_path = get_new_image_name(inputs, func_name="depth")
         depth.save(updated_image_path)
+        print(f"\nProcessed Image2Depth, Input Image: {inputs}, Output Depth: {updated_image_path}")
         return updated_image_path
 
+
+class DepthText2Image:
     def __init__(self, device):
+        print("Initializing DepthText2Image to %s" % device)
+        self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-depth")
         self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None)
         self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
         self.pipe.to(device)
         self.seed = -1
         self.a_prompt = 'best quality, extremely detailed'
+        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
+                        ' fewer digits, cropped, worst quality, low quality'
+
+    @prompts(name="Generate Image Condition On Depth",
+             description="useful when you want to generate a new real image from both the user description and depth image. "
+                         "like: generate a real image of an object or something from this depth image, "
+                         "or generate a new real image of an object or something from the depth map. "
+                         "The input to this tool should be a comma separated string of two, "
+                         "representing the image_path and the user description")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
         prompt = instruct_text + ', ' + self.a_prompt
+        image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
+                          guidance_scale=9.0).images[0]
         updated_image_path = get_new_image_name(image_path, func_name="depth2image")
         image.save(updated_image_path)
+        print(f"\nProcessed DepthText2Image, Input Depth: {image_path}, Input Text: {instruct_text}, "
+              f"Output Image: {updated_image_path}")
         return updated_image_path
 
+
+class Image2Normal:
+    def __init__(self, device):
+        print("Initializing Image2Normal")
         self.depth_estimator = pipeline("depth-estimation", model="Intel/dpt-hybrid-midas")
         self.bg_threhold = 0.4
 
+    @prompts(name="Predict Normal Map On Image",
+             description="useful when you want to detect the normal map of the image. "
+                         "like: generate normal map from this image, or predict normal map of this image. "
+                         "The input to this tool should be a string, representing the image_path")
     def inference(self, inputs):
         image = Image.open(inputs)
         original_size = image.size
 
         image_depth = image.copy()
         image_depth -= np.min(image_depth)
         image_depth /= np.max(image_depth)
 
         x = cv2.Sobel(image, cv2.CV_32F, 1, 0, ksize=3)
         x[image_depth < self.bg_threhold] = 0
 
         y = cv2.Sobel(image, cv2.CV_32F, 0, 1, ksize=3)
         y[image_depth < self.bg_threhold] = 0
 
         z = np.ones_like(x) * np.pi * 2.0
         image = np.stack([x, y, z], axis=2)
         image /= np.sum(image ** 2.0, axis=2, keepdims=True) ** 0.5
 
         image = image.resize(original_size)
         updated_image_path = get_new_image_name(inputs, func_name="normal-map")
         image.save(updated_image_path)
+        print(f"\nProcessed Image2Normal, Input Image: {inputs}, Output Normal: {updated_image_path}")
         return updated_image_path
 
+
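The Sobel step above builds a pseudo normal map from the estimated depth: for each pixel the vector (dx, dy, 2*pi) is normalized to unit length, with gradients below bg_threhold zeroed out as background. A toy check of that normalization on a single pixel, using invented gradient values:

import numpy as np

dx, dy, dz = 3.0, 4.0, 2.0 * np.pi            # invented values for one pixel
v = np.array([dx, dy, dz])
normal = v / np.sqrt((v ** 2).sum())           # same normalization as the np.sum(...) ** 0.5 line above
print(normal, np.linalg.norm(normal))          # a unit-length vector, norm == 1.0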
+class NormalText2Image:
     def __init__(self, device):
+        print("Initializing NormalText2Image to %s" % device)
+        self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-normal")
         self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None)
         self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
         self.pipe.to(device)
         self.seed = -1
         self.a_prompt = 'best quality, extremely detailed'
+        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
+                        ' fewer digits, cropped, worst quality, low quality'
+
+    @prompts(name="Generate Image Condition On Normal Map",
+             description="useful when you want to generate a new real image from both the user description and normal map. "
+                         "like: generate a real image of an object or something from this normal map, "
+                         "or generate a new real image of an object or something from the normal map. "
+                         "The input to this tool should be a comma separated string of two, "
+                         "representing the image_path and the user description")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
         prompt = instruct_text + ', ' + self.a_prompt
+        image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
+                          guidance_scale=9.0).images[0]
         updated_image_path = get_new_image_name(image_path, func_name="normal2image")
         image.save(updated_image_path)
+        print(f"\nProcessed NormalText2Image, Input Normal: {image_path}, Input Text: {instruct_text}, "
+              f"Output Image: {updated_image_path}")
         return updated_image_path
 
+
+class VisualQuestionAnswering:
     def __init__(self, device):
+        print("Initializing VisualQuestionAnswering to %s" % device)
         self.device = device
+        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+        self.model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(self.device)
 
+    @prompts(name="Answer Question About The Image",
+             description="useful when you need an answer for a question based on an image. "
+                         "like: what is the background color of the last image, how many cats in this figure, what is in this figure. "
+                         "The input to this tool should be a comma separated string of two, representing the image_path and the question")
+    def inference(self, inputs):
         image_path, question = inputs.split(",")
         raw_image = Image.open(image_path).convert('RGB')
 
         inputs = self.processor(raw_image, question, return_tensors="pt").to(self.device)
         out = self.model.generate(**inputs)
         answer = self.processor.decode(out[0], skip_special_tokens=True)
+        print(f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, "
+              f"Output Answer: {answer}")
         return answer
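
As with the other tools, the agent drives this one with a single comma-separated string: the image path first, then the question. A quick sketch of calling it directly, where the file name is illustrative and not from the commit:

# Illustrative only: the image path below is made up.
vqa = VisualQuestionAnswering(device="cuda:0")
answer = vqa.inference("image/example.png, how many cats are in this image")
print(answer)   # the decoded BLIP answer string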