Commit 53cf806
LanHarmony committed
Parent(s): 1dc205a

add image2depth and depth2image

Files changed:
- app.py +34 -23
- visual_foundation_models.py +109 -77
app.py
CHANGED
@@ -52,18 +52,19 @@ import gradio as gr


def cut_dialogue_history(history_memory, keep_last_n_words=400):
+    if history_memory is None or len(history_memory) == 0:
+        return history_memory
    tokens = history_memory.split()
    n_tokens = len(tokens)
+    print(f"history_memory:{history_memory}, n_tokens: {n_tokens}")
    if n_tokens < keep_last_n_words:
        return history_memory
+    paragraphs = history_memory.split('\n')
+    last_n_tokens = n_tokens
+    while last_n_tokens >= keep_last_n_words:
+        last_n_tokens -= len(paragraphs[0].split(' '))
+        paragraphs = paragraphs[1:]
+    return '\n' + '\n'.join(paragraphs)


class ConversationBot:
@@ -74,7 +75,6 @@ class ConversationBot:
            raise ValueError("You have to load ImageCaptioning as a basic function for VisualChatGPT")

        self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
        self.models = dict()
        for class_name, device in load_dict.items():
            self.models[class_name] = globals()[class_name](device=device)
@@ -86,7 +86,6 @@ class ConversationBot:
                    func = getattr(instance, e)
                    self.tools.append(Tool(name=func.name, description=func.description, func=func))

    def run_text(self, text, state):
        self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
        res = self.agent({"input": text})
@@ -98,7 +97,7 @@ class ConversationBot:
        return state, state

    def run_image(self, image, state, txt):
+        image_filename = os.path.join('image', f"{str(uuid.uuid4())[:8]}.png")
        print("======>Auto Resize Image...")
        img = Image.open(image.name)
        width, height = img.size
@@ -111,17 +110,13 @@ class ConversationBot:
        img.save(image_filename, "PNG")
        print(f"Resize image form {width}x{height} to {width_new}x{height_new}")
        description = self.models['ImageCaptioning'].inference(image_filename)
+        Human_prompt = f'\nHuman: provide a figure named {image_filename}. The description is: {description}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
        AI_prompt = "Received. "
        self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
        state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
        print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
              f"Current Memory: {self.agent.memory.buffer}")
+        return state, state, f'{txt} {image_filename} '

    def init_agent(self, openai_api_key):
        self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
@@ -136,17 +131,25 @@ class ConversationBot:

        return gr.update(visible = True)

+bot = ConversationBot({'Text2Image': 'cuda:0',
+                       'ImageCaptioning': 'cuda:0',
                       'ImageEditing': 'cuda:0',
                       'VisualQuestionAnswering': 'cuda:0',
+                       'Image2Canny': 'cpu',
+                       'CannyText2Image': 'cuda:0',
+                       'InstructPix2Pix': 'cuda:0',
+                       'Image2Depth': 'cpu',
+                       'DepthText2Image': 'cuda:0',
+                       })

with gr.Blocks(css="#chatbot {overflow:auto; height:500px;}") as demo:
    with gr.Row():
        gr.Markdown("<h3><center>Visual ChatGPT</center></h3>")
+        gr.Markdown(
+            """This is a demo to the work [Visual ChatGPT: Talking, Drawing and Editing with Visual Foundation Models](https://github.com/microsoft/visual-chatgpt).<br>
+            This space connects ChatGPT and a series of Visual Foundation Models to enable sending and receiving images during chatting.<br>
+            """
+        )

    with gr.Row():
        openai_api_key_textbox = gr.Textbox(
@@ -177,10 +180,18 @@ with gr.Blocks(css="#chatbot {overflow:auto; height:500px;}") as demo:
                    "Can you use this canny image to generate an oil painting of a dog",
                    "Make it like water-color painting",
                    "What is the background color",
+                    "Describe this image",
+                    "please detect the depth of this image",
+                    "Can you use this depth image to generate a cute dog",
+                ],
            inputs=txt
        )

+    gr.HTML('''<br><br><br><center>You can duplicate this Space to skip the queue:
+            <a href="https://huggingface.co/spaces/microsoft/visual_chatgpt?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a><br>
+            </center>''')

    openai_api_key_textbox.submit(bot.init_agent, [openai_api_key_textbox], [input_raws])
    txt.submit(bot.run_text, [txt, state], [chatbot, state])
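For reference, a minimal standalone sketch of the updated cut_dialogue_history helper above, with a tiny driver; the sample buffer is made up purely for illustration, and the debug print from the diff is omitted here for readable output:

# Standalone sketch of the truncation helper changed in app.py above.
def cut_dialogue_history(history_memory, keep_last_n_words=400):
    if history_memory is None or len(history_memory) == 0:
        return history_memory
    tokens = history_memory.split()
    n_tokens = len(tokens)
    if n_tokens < keep_last_n_words:
        return history_memory
    paragraphs = history_memory.split('\n')
    last_n_tokens = n_tokens
    while last_n_tokens >= keep_last_n_words:
        # Drop whole leading lines until the running token count falls below the limit.
        last_n_tokens -= len(paragraphs[0].split(' '))
        paragraphs = paragraphs[1:]
    return '\n' + '\n'.join(paragraphs)

buffer = '\n'.join(f"Human: turn {i} with a few extra words" for i in range(200))
trimmed = cut_dialogue_history(buffer, keep_last_n_words=500)
print(len(trimmed.split()))  # roughly bounded by keep_last_n_words after truncation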
visual_foundation_models.py
CHANGED
@@ -44,7 +44,7 @@ def get_new_image_name(org_img_name, func_name="update"):

class MaskFormer:
    def __init__(self, device):
+        print(f"Initializing MaskFormer to {device}")
        self.device = device
        self.processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        self.model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined").to(device)
@@ -74,24 +74,27 @@ class MaskFormer:

class ImageEditing:
    def __init__(self, device):
+        print(f"Initializing ImageEditing to {device}")
        self.device = device
        self.mask_former = MaskFormer(device=self.device)
+        self.revision = 'fp16' if 'cuda' in device else None
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
+        self.inpaint = StableDiffusionInpaintPipeline.from_pretrained(
+            "runwayml/stable-diffusion-inpainting", revision=self.revision, torch_dtype=self.torch_dtype).to(device)

    @prompts(name="Remove Something From The Photo",
             description="useful when you want to remove and object or something from the photo "
                         "from its description or location. "
+                         "The input to this tool should be a comma separated string of two, "
                         "representing the image_path and the object need to be removed. ")
    def inference_remove(self, inputs):
+        image_path, to_be_removed_txt = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
        return self.inference_replace(f"{image_path},{to_be_removed_txt},background")

    @prompts(name="Replace Something From The Photo",
             description="useful when you want to replace an object from the object description or "
                         "location with another object from its description. "
+                         "The input to this tool should be a comma separated string of three, "
                         "representing the image_path, the object to be replaced, the object to be replaced with ")
    def inference_replace(self, inputs):
        image_path, to_be_replaced_txt, replace_with_txt = inputs.split(",")
@@ -111,16 +114,18 @@ class ImageEditing:

class InstructPix2Pix:
    def __init__(self, device):
+        print(f"Initializing InstructPix2Pix to {device}")
        self.device = device
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
+        self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix",
+                                                                           safety_checker=None,
+                                                                           torch_dtype=self.torch_dtype).to(device)
        self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(self.pipe.scheduler.config)

    @prompts(name="Instruct Image Using Text",
             description="useful when you want to the style of the image to be like the text. "
                         "like: make it look like a painting. or make it like a robot. "
+                         "The input to this tool should be a comma separated string of two, "
                         "representing the image_path and the text. ")
    def inference(self, inputs):
        """Change style of image."""
@@ -163,17 +168,18 @@ class Text2Image:

class ImageCaptioning:
    def __init__(self, device):
+        print(f"Initializing ImageCaptioning to {device}")
        self.device = device
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        self.model = BlipForConditionalGeneration.from_pretrained(
+            "Salesforce/blip-image-captioning-base", torch_dtype=self.torch_dtype).to(self.device)

    @prompts(name="Get Photo Description",
             description="useful when you want to know what is inside the photo. receives image_path as input. "
                         "The input to this tool should be a string, representing the image_path. ")
    def inference(self, image_path):
+        inputs = self.processor(Image.open(image_path), return_tensors="pt").to(self.device, self.torch_dtype)
        out = self.model.generate(**inputs)
        captions = self.processor.decode(out[0], skip_special_tokens=True)
        print(f"\nProcessed ImageCaptioning, Input Image: {image_path}, Output Text: {captions}")
@@ -206,29 +212,32 @@ class Image2Canny:

class CannyText2Image:
    def __init__(self, device):
+        print(f"Initializing CannyText2Image to {device}")
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
+        self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-canny",
+                                                          torch_dtype=self.torch_dtype)
        self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
+            torch_dtype=self.torch_dtype)
        self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
        self.pipe.to(device)
        self.seed = -1
        self.a_prompt = 'best quality, extremely detailed'
        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
+                        'fewer digits, cropped, worst quality, low quality'

    @prompts(name="Generate Image Condition On Canny Image",
+             description="useful when you want to generate a new real image from both the user description and a canny image."
                         " like: generate a real image of a object or something from this canny image,"
                         " or generate a new real image of a object or something from this edge image. "
+                         "The input to this tool should be a comma separated string of two, "
                         "representing the image_path and the user description. ")
    def inference(self, inputs):
        image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
        image = Image.open(image_path)
        self.seed = random.randint(0, 65535)
        seed_everything(self.seed)
+        prompt = f'{instruct_text}, {self.a_prompt}'
        image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
                          guidance_scale=9.0).images[0]
        updated_image_path = get_new_image_name(image_path, func_name="canny2image")
@@ -246,7 +255,7 @@ class Image2Line:
    @prompts(name="Line Detection On Image",
             description="useful when you want to detect the straight line of the image. "
                         "like: detect the straight lines of this image, or straight line detection on image, "
+                         "or perform straight line detection on this image, or detect the straight line image of this image. "
                         "The input to this tool should be a string, representing the image_path")
    def inference(self, inputs):
        image = Image.open(inputs)
@@ -259,31 +268,34 @@ class Image2Line:

class LineText2Image:
    def __init__(self, device):
+        print(f"Initializing LineText2Image to {device}")
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
+        self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-mlsd",
+                                                          torch_dtype=self.torch_dtype)
        self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
+            torch_dtype=self.torch_dtype
        )
        self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
        self.pipe.to(device)
        self.seed = -1
        self.a_prompt = 'best quality, extremely detailed'
        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
+                        'fewer digits, cropped, worst quality, low quality'

    @prompts(name="Generate Image Condition On Line Image",
+             description="useful when you want to generate a new real image from both the user description "
                         "and a straight line image. "
                         "like: generate a real image of a object or something from this straight line image, "
                         "or generate a new real image of a object or something from this straight lines. "
+                         "The input to this tool should be a comma separated string of two, "
                         "representing the image_path and the user description. ")
    def inference(self, inputs):
        image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
        image = Image.open(image_path)
        self.seed = random.randint(0, 65535)
        seed_everything(self.seed)
+        prompt = f'{instruct_text}, {self.a_prompt}'
        image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
                          guidance_scale=9.0).images[0]
        updated_image_path = get_new_image_name(image_path, func_name="line2image")
@@ -301,7 +313,7 @@ class Image2Hed:
    @prompts(name="Hed Detection On Image",
             description="useful when you want to detect the soft hed boundary of the image. "
                         "like: detect the soft hed boundary of this image, or hed boundary detection on image, "
+                         "or perform hed boundary detection on this image, or detect soft hed boundary image of this image. "
                         "The input to this tool should be a string, representing the image_path")
    def inference(self, inputs):
        image = Image.open(inputs)
@@ -314,31 +326,34 @@ class Image2Hed:

class HedText2Image:
    def __init__(self, device):
+        print(f"Initializing HedText2Image to {device}")
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
+        self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-hed",
+                                                          torch_dtype=self.torch_dtype)
        self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
+            torch_dtype=self.torch_dtype
        )
        self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
        self.pipe.to(device)
        self.seed = -1
        self.a_prompt = 'best quality, extremely detailed'
        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
+                        'fewer digits, cropped, worst quality, low quality'

    @prompts(name="Generate Image Condition On Soft Hed Boundary Image",
+             description="useful when you want to generate a new real image from both the user description "
                         "and a soft hed boundary image. "
                         "like: generate a real image of a object or something from this soft hed boundary image, "
                         "or generate a new real image of a object or something from this hed boundary. "
+                         "The input to this tool should be a comma separated string of two, "
                         "representing the image_path and the user description")
    def inference(self, inputs):
        image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
        image = Image.open(image_path)
        self.seed = random.randint(0, 65535)
        seed_everything(self.seed)
+        prompt = f'{instruct_text}, {self.a_prompt}'
        image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
                          guidance_scale=9.0).images[0]
        updated_image_path = get_new_image_name(image_path, func_name="hed2image")
@@ -369,29 +384,32 @@ class Image2Scribble:

class ScribbleText2Image:
    def __init__(self, device):
+        print(f"Initializing ScribbleText2Image to {device}")
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
+        self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-scribble",
+                                                          torch_dtype=self.torch_dtype)
        self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
+            torch_dtype=self.torch_dtype
        )
        self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
        self.pipe.to(device)
        self.seed = -1
        self.a_prompt = 'best quality, extremely detailed'
        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
+                        'fewer digits, cropped, worst quality, low quality'

    @prompts(name="Generate Image Condition On Sketch Image",
+             description="useful when you want to generate a new real image from both the user description and "
                         "a scribble image or a sketch image. "
+                         "The input to this tool should be a comma separated string of two, "
                         "representing the image_path and the user description")
    def inference(self, inputs):
        image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
        image = Image.open(image_path)
        self.seed = random.randint(0, 65535)
        seed_everything(self.seed)
+        prompt = f'{instruct_text}, {self.a_prompt}'
        image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
                          guidance_scale=9.0).images[0]
        updated_image_path = get_new_image_name(image_path, func_name="scribble2image")
@@ -421,10 +439,13 @@ class Image2Pose:

class PoseText2Image:
    def __init__(self, device):
+        print(f"Initializing PoseText2Image to {device}")
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
+        self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-openpose",
+                                                          torch_dtype=self.torch_dtype)
        self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
+            torch_dtype=self.torch_dtype)
        self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
        self.pipe.to(device)
        self.num_inference_steps = 20
@@ -432,21 +453,21 @@ class PoseText2Image:
        self.unconditional_guidance_scale = 9.0
        self.a_prompt = 'best quality, extremely detailed'
        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
+                        ' fewer digits, cropped, worst quality, low quality'

    @prompts(name="Generate Image Condition On Pose Image",
+             description="useful when you want to generate a new real image from both the user description "
                         "and a human pose image. "
                         "like: generate a real image of a human from this human pose image, "
                         "or generate a new real image of a human from this pose. "
+                         "The input to this tool should be a comma separated string of two, "
                         "representing the image_path and the user description")
    def inference(self, inputs):
        image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
        image = Image.open(image_path)
        self.seed = random.randint(0, 65535)
        seed_everything(self.seed)
+        prompt = f'{instruct_text}, {self.a_prompt}'
        image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
                          guidance_scale=9.0).images[0]
        updated_image_path = get_new_image_name(image_path, func_name="pose2image")
@@ -503,7 +524,7 @@ class Image2Seg:
    @prompts(name="Segmentation On Image",
             description="useful when you want to detect segmentations of the image. "
                         "like: segment this image, or generate segmentations on this image, "
+                         "or perform segmentation on this image. "
                         "The input to this tool should be a string, representing the image_path")
    def inference(self, inputs):
        image = Image.open(inputs)
@@ -525,29 +546,32 @@ class Image2Seg:

class SegText2Image:
    def __init__(self, device):
+        print(f"Initializing SegText2Image to {device}")
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
+        self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-seg",
+                                                          torch_dtype=self.torch_dtype)
        self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
+            torch_dtype=self.torch_dtype)
        self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
        self.pipe.to(device)
        self.seed = -1
        self.a_prompt = 'best quality, extremely detailed'
        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
+                        ' fewer digits, cropped, worst quality, low quality'

    @prompts(name="Generate Image Condition On Segmentations",
+             description="useful when you want to generate a new real image from both the user description and segmentations. "
                         "like: generate a real image of a object or something from this segmentation image, "
                         "or generate a new real image of a object or something from these segmentations. "
+                         "The input to this tool should be a comma separated string of two, "
                         "representing the image_path and the user description")
    def inference(self, inputs):
        image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
        image = Image.open(image_path)
        self.seed = random.randint(0, 65535)
        seed_everything(self.seed)
+        prompt = f'{instruct_text}, {self.a_prompt}'
        image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
                          guidance_scale=9.0).images[0]
        updated_image_path = get_new_image_name(image_path, func_name="segment2image")
@@ -581,29 +605,32 @@ class Image2Depth:

class DepthText2Image:
    def __init__(self, device):
+        print(f"Initializing DepthText2Image to {device}")
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
+        self.controlnet = ControlNetModel.from_pretrained(
+            "fusing/stable-diffusion-v1-5-controlnet-depth", torch_dtype=self.torch_dtype)
        self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
+            torch_dtype=self.torch_dtype)
        self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
        self.pipe.to(device)
        self.seed = -1
        self.a_prompt = 'best quality, extremely detailed'
        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
+                        ' fewer digits, cropped, worst quality, low quality'

    @prompts(name="Generate Image Condition On Depth",
+             description="useful when you want to generate a new real image from both the user description and depth image. "
                         "like: generate a real image of a object or something from this depth image, "
                         "or generate a new real image of a object or something from the depth map. "
+                         "The input to this tool should be a comma separated string of two, "
                         "representing the image_path and the user description")
    def inference(self, inputs):
        image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
        image = Image.open(image_path)
        self.seed = random.randint(0, 65535)
        seed_everything(self.seed)
+        prompt = f'{instruct_text}, {self.a_prompt}'
        image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
                          guidance_scale=9.0).images[0]
        updated_image_path = get_new_image_name(image_path, func_name="depth2image")
@@ -649,29 +676,32 @@ class Image2Normal:

class NormalText2Image:
    def __init__(self, device):
+        print(f"Initializing NormalText2Image to {device}")
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
+        self.controlnet = ControlNetModel.from_pretrained(
+            "fusing/stable-diffusion-v1-5-controlnet-normal", torch_dtype=self.torch_dtype)
        self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
+            torch_dtype=self.torch_dtype)
        self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
        self.pipe.to(device)
        self.seed = -1
        self.a_prompt = 'best quality, extremely detailed'
        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
+                        ' fewer digits, cropped, worst quality, low quality'

    @prompts(name="Generate Image Condition On Normal Map",
+             description="useful when you want to generate a new real image from both the user description and normal map. "
                         "like: generate a real image of a object or something from this normal map, "
                         "or generate a new real image of a object or something from the normal map. "
+                         "The input to this tool should be a comma separated string of two, "
                         "representing the image_path and the user description")
    def inference(self, inputs):
        image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
        image = Image.open(image_path)
        self.seed = random.randint(0, 65535)
        seed_everything(self.seed)
+        prompt = f'{instruct_text}, {self.a_prompt}'
        image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
                          guidance_scale=9.0).images[0]
        updated_image_path = get_new_image_name(image_path, func_name="normal2image")
@@ -683,19 +713,21 @@ class NormalText2Image:

class VisualQuestionAnswering:
    def __init__(self, device):
+        print(f"Initializing VisualQuestionAnswering to {device}")
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
        self.device = device
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+        self.model = BlipForQuestionAnswering.from_pretrained(
+            "Salesforce/blip-vqa-base", torch_dtype=self.torch_dtype).to(self.device)

    @prompts(name="Answer Question About The Image",
             description="useful when you need an answer for a question based on an image. "
                         "like: what is the background color of the last image, how many cats in this figure, what is in this figure. "
+                         "The input to this tool should be a comma separated string of two, representing the image_path and the question")
    def inference(self, inputs):
+        image_path, question = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
        raw_image = Image.open(image_path).convert('RGB')
+        inputs = self.processor(raw_image, question, return_tensors="pt").to(self.device, self.torch_dtype)
        out = self.model.generate(**inputs)
        answer = self.processor.decode(out[0], skip_special_tokens=True)
        print(f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, "
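The headline change of this commit is the pair of depth tools now registered in app.py. A minimal sketch of exercising them outside the Gradio app, assuming visual_foundation_models.py is importable, the checkpoints download successfully, each tool's inference returns the path of the image it writes (as the other tools in this file do), and the input file name below is hypothetical:

# Illustrative sketch only; mirrors how ConversationBot dispatches to the new tools.
# Device placement follows the load_dict in app.py ('Image2Depth': 'cpu', 'DepthText2Image': 'cuda:0').
from visual_foundation_models import Image2Depth, DepthText2Image

depth_detector = Image2Depth(device='cpu')
depth_painter = DepthText2Image(device='cuda:0')

# 'image/photo.png' is a made-up path; inference is assumed to return the saved depth map path.
depth_path = depth_detector.inference('image/photo.png')
# Tool input is "image_path, user description", matching the @prompts contract above.
result_path = depth_painter.inference(f'{depth_path}, a cute dog')
print(result_path)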