Spaces: LanHarmony
Commit ba47d90 • 1 Parent(s): 2769cd9
Commit message: add infinity and chinese support

Files changed:
- app.py +72 -10
- visual_foundation_models.py +159 -3
app.py CHANGED
@@ -42,6 +42,51 @@ Since Visual ChatGPT is a text language model, Visual ChatGPT must use tools to
 The thoughts and observations are only visible for Visual ChatGPT, Visual ChatGPT should remember to repeat important information in the final response for Human.
 Thought: Do I need to use a tool? {agent_scratchpad}"""

+VISUAL_CHATGPT_PREFIX_CN = """Visual ChatGPT 旨在能够协助完成范围广泛的文本和视觉相关任务,从回答简单的问题到提供对广泛主题的深入解释和讨论。 Visual ChatGPT 能够根据收到的输入生成类似人类的文本,使其能够进行听起来自然的对话,并提供连贯且与手头主题相关的响应。
+
+Visual ChatGPT 能够处理和理解大量文本和图像。作为一种语言模型,Visual ChatGPT 不能直接读取图像,但它有一系列工具来完成不同的视觉任务。每张图片都会有一个文件名,格式为“image/xxx.png”,Visual ChatGPT可以调用不同的工具来间接理解图片。在谈论图片时,Visual ChatGPT 对文件名的要求非常严格,绝不会伪造不存在的文件。在使用工具生成新的图像文件时,Visual ChatGPT也知道图像可能与用户需求不一样,会使用其他视觉问答工具或描述工具来观察真实图像。 Visual ChatGPT 能够按顺序使用工具,并且忠于工具观察输出,而不是伪造图像内容和图像文件名。如果生成新图像,它将记得提供上次工具观察的文件名。
+
+Human 可能会向 Visual ChatGPT 提供带有描述的新图形。描述帮助 Visual ChatGPT 理解这个图像,但 Visual ChatGPT 应该使用工具来完成以下任务,而不是直接从描述中想象。有些工具将会返回英文描述,但你对用户的聊天应当采用中文。
+
+总的来说,Visual ChatGPT 是一个强大的可视化对话辅助工具,可以帮助处理范围广泛的任务,并提供关于范围广泛的主题的有价值的见解和信息。
+
+工具列表:
+------
+
+Visual ChatGPT 可以使用这些工具:"""
+
+VISUAL_CHATGPT_FORMAT_INSTRUCTIONS_CN = """用户使用中文和你进行聊天,但是工具的参数应当使用英文。如果要调用工具,你必须遵循如下格式:
+
+```
+Thought: Do I need to use a tool? Yes
+Action: the action to take, should be one of [{tool_names}]
+Action Input: the input to the action
+Observation: the result of the action
+```
+
+当你不再需要继续调用工具,而是对观察结果进行总结回复时,你必须使用如下格式:
+
+
+```
+Thought: Do I need to use a tool? No
+{ai_prefix}: [your response here]
+```
+"""
+
+VISUAL_CHATGPT_SUFFIX_CN = """你对文件名的正确性非常严格,而且永远不会伪造不存在的文件。
+
+开始!
+
+因为Visual ChatGPT是一个文本语言模型,必须使用工具去观察图片而不是依靠想象。
+推理想法和观察结果只对Visual ChatGPT可见,需要记得在最终回复时把重要的信息重复给用户,你只能给用户返回中文句子。我们一步一步思考。在你使用工具时,工具的参数只能是英文。
+
+聊天历史:
+{chat_history}
+
+新输入: {input}
+Thought: Do I need to use a tool? {agent_scratchpad}
+"""
+
 from visual_foundation_models import *
 from langchain.agents.initialize import initialize_agent
 from langchain.agents.tools import Tool
@@ -74,21 +119,31 @@ class ConversationBot:
         if 'ImageCaptioning' not in load_dict:
             raise ValueError("You have to load ImageCaptioning as a basic function for VisualChatGPT")

-        self.
-
+        self.models = {}
+        # Load Basic Foundation Models
         for class_name, device in load_dict.items():
             self.models[class_name] = globals()[class_name](device=device)

+        # Load Template Foundation Models
+        for class_name, module in globals().items():
+            if getattr(module, 'template_model', False):
+                template_required_names = {k for k in inspect.signature(module.__init__).parameters.keys() if
+                                           k != 'self'}
+                loaded_names = set([type(e).__name__ for e in self.models.values()])
+                if template_required_names.issubset(loaded_names):
+                    self.models[class_name] = globals()[class_name](
+                        **{name: self.models[name] for name in template_required_names})
         self.tools = []
-        for 
+        for instance in self.models.values():
             for e in dir(instance):
                 if e.startswith('inference'):
                     func = getattr(instance, e)
                     self.tools.append(Tool(name=func.name, description=func.description, func=func))
+        self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')

     def run_text(self, text, state):
         self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
-        res = self.agent({"input": text})
+        res = self.agent({"input": text.strip()})
         res['output'] = res['output'].replace("\\", "/")
         response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
         state = state + [(text, response)]
@@ -118,7 +173,16 @@ class ConversationBot:
               f"Current Memory: {self.agent.memory.buffer}")
         return state, state, f'{txt} {image_filename} '

-    def init_agent(self, openai_api_key):
+    def init_agent(self, openai_api_key, lang):
+        self.memory.clear()
+        if lang=='English':
+            PREFIX, FORMAT_INSTRUCTIONS, SUFFIX = VISUAL_CHATGPT_PREFIX, VISUAL_CHATGPT_FORMAT_INSTRUCTIONS, VISUAL_CHATGPT_SUFFIX
+            place = "Enter text and press enter, or upload an image"
+            label_clear = "Clear"
+        else:
+            PREFIX, FORMAT_INSTRUCTIONS, SUFFIX = VISUAL_CHATGPT_PREFIX_CN, VISUAL_CHATGPT_FORMAT_INSTRUCTIONS_CN, VISUAL_CHATGPT_SUFFIX_CN
+            place = "输入文字并回车,或者上传图片"
+            label_clear = "清除"
         self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
         self.agent = initialize_agent(
             self.tools,
@@ -127,7 +191,7 @@ class ConversationBot:
             verbose=True,
             memory=self.memory,
             return_intermediate_steps=True,
-            agent_kwargs={'prefix': 
+            agent_kwargs={'prefix': PREFIX, 'format_instructions': FORMAT_INSTRUCTIONS, 'suffix': SUFFIX}, )

         return gr.update(visible = True)

@@ -147,11 +211,11 @@ with gr.Blocks(css="#chatbot {overflow:auto; height:500px;}") as demo:
     gr.Markdown(
         """This is a demo to the work [Visual ChatGPT: Talking, Drawing and Editing with Visual Foundation Models](https://github.com/microsoft/visual-chatgpt).<br>
         This space connects ChatGPT and a series of Visual Foundation Models to enable sending and receiving images during chatting.<br>
-        This space currently only supports English (目前只支持英文对话, 中文正在开发中).<br>
         """
     )

     with gr.Row():
+        lang = gr.Radio(choices=['Chinese', 'English'], value='English', label='Language')
         openai_api_key_textbox = gr.Textbox(
             placeholder="Paste your OpenAI API key here to start Visual ChatGPT(sk-...) and press Enter ↵️",
             show_label=False,
@@ -191,9 +255,7 @@ with gr.Blocks(css="#chatbot {overflow:auto; height:500px;}") as demo:
     <a href="https://huggingface.co/spaces/microsoft/visual_chatgpt?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a><br>
     </center>''')

-
-
-    openai_api_key_textbox.submit(bot.init_agent, [openai_api_key_textbox], [input_raws])
+    openai_api_key_textbox.submit(bot.init_agent, [openai_api_key_textbox, lang], [input_raws])
     txt.submit(bot.run_text, [txt, state], [chatbot, state])
     txt.submit(lambda: "", None, txt)
     run.click(bot.run_text, [txt, state], [chatbot, state])
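Note on the new "Load Template Foundation Models" loop in ConversationBot.__init__: any class that sets template_model = True is instantiated automatically once every parameter name of its constructor matches a basic model that is already loaded, and the matching models are passed in by name. The snippet below is a minimal, self-contained sketch of that pattern; CombinedTool and load_models are hypothetical names for illustration, and the two "basic" classes are trivial mock-ups of the real wrappers, not the repository's code.

import inspect

class ImageCaptioning:                      # mock-up of a "basic" foundation model
    def __init__(self, device):
        self.device = device

class VisualQuestionAnswering:              # another mock-up basic model
    def __init__(self, device):
        self.device = device

class CombinedTool:                         # stand-in for a template model such as InfinityOutPainting
    template_model = True                   # marks the class as assembled from already-loaded models
    def __init__(self, ImageCaptioning, VisualQuestionAnswering):
        self.captioner = ImageCaptioning
        self.vqa = VisualQuestionAnswering

def load_models(load_dict):
    models = {}
    # 1) basic models: instantiated directly on the requested device
    for class_name, device in load_dict.items():
        models[class_name] = globals()[class_name](device=device)
    # 2) template models: built only if every __init__ parameter name matches a loaded model
    for class_name, cls in list(globals().items()):
        if getattr(cls, 'template_model', False):
            required = {k for k in inspect.signature(cls.__init__).parameters if k != 'self'}
            loaded = {type(m).__name__ for m in models.values()}
            if required.issubset(loaded):
                models[class_name] = cls(**{name: models[name] for name in required})
    return models

print(sorted(load_models({'ImageCaptioning': 'cpu', 'VisualQuestionAnswering': 'cpu'})))
# ['CombinedTool', 'ImageCaptioning', 'VisualQuestionAnswering']

Because matching is by constructor parameter name, a template model costs nothing extra to load: it only reuses instances that the user already requested in load_dict.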
visual_foundation_models.py CHANGED
@@ -12,9 +12,12 @@ import random
 import torch
 import cv2
 import uuid
-from PIL import Image
+from PIL import Image, ImageOps
 import numpy as np
 from pytorch_lightning import seed_everything
+import math
+
+from langchain.llms.openai import OpenAI

 def prompts(name, description):
     def decorator(func):
@@ -24,6 +27,62 @@ def prompts(name, description):

     return decorator

+def blend_gt2pt(old_image, new_image, sigma=0.15, steps=100):
+    new_size = new_image.size
+    old_size = old_image.size
+    easy_img = np.array(new_image)
+    gt_img_array = np.array(old_image)
+    pos_w = (new_size[0] - old_size[0]) // 2
+    pos_h = (new_size[1] - old_size[1]) // 2
+
+    kernel_h = cv2.getGaussianKernel(old_size[1], old_size[1] * sigma)
+    kernel_w = cv2.getGaussianKernel(old_size[0], old_size[0] * sigma)
+    kernel = np.multiply(kernel_h, np.transpose(kernel_w))
+
+    kernel[steps:-steps, steps:-steps] = 1
+    kernel[:steps, :steps] = kernel[:steps, :steps] / kernel[steps - 1, steps - 1]
+    kernel[:steps, -steps:] = kernel[:steps, -steps:] / kernel[steps - 1, -(steps)]
+    kernel[-steps:, :steps] = kernel[-steps:, :steps] / kernel[-steps, steps - 1]
+    kernel[-steps:, -steps:] = kernel[-steps:, -steps:] / kernel[-steps, -steps]
+    kernel = np.expand_dims(kernel, 2)
+    kernel = np.repeat(kernel, 3, 2)
+
+    weight = np.linspace(0, 1, steps)
+    top = np.expand_dims(weight, 1)
+    top = np.repeat(top, old_size[0] - 2 * steps, 1)
+    top = np.expand_dims(top, 2)
+    top = np.repeat(top, 3, 2)
+
+    weight = np.linspace(1, 0, steps)
+    down = np.expand_dims(weight, 1)
+    down = np.repeat(down, old_size[0] - 2 * steps, 1)
+    down = np.expand_dims(down, 2)
+    down = np.repeat(down, 3, 2)
+
+    weight = np.linspace(0, 1, steps)
+    left = np.expand_dims(weight, 0)
+    left = np.repeat(left, old_size[1] - 2 * steps, 0)
+    left = np.expand_dims(left, 2)
+    left = np.repeat(left, 3, 2)
+
+    weight = np.linspace(1, 0, steps)
+    right = np.expand_dims(weight, 0)
+    right = np.repeat(right, old_size[1] - 2 * steps, 0)
+    right = np.expand_dims(right, 2)
+    right = np.repeat(right, 3, 2)
+
+    kernel[:steps, steps:-steps] = top
+    kernel[-steps:, steps:-steps] = down
+    kernel[steps:-steps, :steps] = left
+    kernel[steps:-steps, -steps:] = right
+
+    pt_gt_img = easy_img[pos_h:pos_h + old_size[1], pos_w:pos_w + old_size[0]]
+    gaussian_gt_img = kernel * gt_img_array + (1 - kernel) * pt_gt_img  # gt img with blur img
+    gaussian_gt_img = gaussian_gt_img.astype(np.int64)
+    easy_img[pos_h:pos_h + old_size[1], pos_w:pos_w + old_size[0]] = gaussian_gt_img
+    gaussian_img = Image.fromarray(easy_img)
+    return gaussian_img
+
 def get_new_image_name(org_img_name, func_name="update"):
     head_tail = os.path.split(org_img_name)
     head = head_tail[0]
@@ -540,7 +599,7 @@ class Image2Seg:
         segmentation = Image.fromarray(color_seg)
         updated_image_path = get_new_image_name(inputs, func_name="segmentation")
         segmentation.save(updated_image_path)
-        print(f"\nProcessed 
+        print(f"\nProcessed Image2Seg, Input Image: {inputs}, Output Pose: {updated_image_path}")
         return updated_image_path


@@ -732,4 +791,101 @@ class VisualQuestionAnswering:
         answer = self.processor.decode(out[0], skip_special_tokens=True)
         print(f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, "
               f"Output Answer: {answer}")
-        return answer
+        return answer
+
+class InfinityOutPainting:
+    template_model = True  # Add this line to show this is a template model.
+    def __init__(self, ImageCaptioning, ImageEditing, VisualQuestionAnswering):
+        self.llm = OpenAI(temperature=0)
+        self.ImageCaption = ImageCaptioning
+        self.ImageEditing = ImageEditing
+        self.ImageVQA = VisualQuestionAnswering
+        self.a_prompt = 'best quality, extremely detailed'
+        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
+                        'fewer digits, cropped, worst quality, low quality'
+
+    def get_BLIP_vqa(self, image, question):
+        inputs = self.ImageVQA.processor(image, question, return_tensors="pt").to(self.ImageVQA.device,
+                                                                                  self.ImageVQA.torch_dtype)
+        out = self.ImageVQA.model.generate(**inputs)
+        answer = self.ImageVQA.processor.decode(out[0], skip_special_tokens=True)
+        print(f"\nProcessed VisualQuestionAnswering, Input Question: {question}, Output Answer: {answer}")
+        return answer
+
+    def get_BLIP_caption(self, image):
+        inputs = self.ImageCaption.processor(image, return_tensors="pt").to(self.ImageCaption.device,
+                                                                            self.ImageCaption.torch_dtype)
+        out = self.ImageCaption.model.generate(**inputs)
+        BLIP_caption = self.ImageCaption.processor.decode(out[0], skip_special_tokens=True)
+        return BLIP_caption
+
+    def check_prompt(self, prompt):
+        check = f"Here is a paragraph with adjectives. " \
+                f"{prompt} " \
+                f"Please change all plural forms in the adjectives to singular forms. "
+        return self.llm(check)
+
+    def get_imagine_caption(self, image, imagine):
+        BLIP_caption = self.get_BLIP_caption(image)
+        background_color = self.get_BLIP_vqa(image, 'what is the background color of this image')
+        style = self.get_BLIP_vqa(image, 'what is the style of this image')
+        imagine_prompt = f"let's pretend you are an excellent painter and now " \
+                         f"there is an incomplete painting with {BLIP_caption} in the center, " \
+                         f"please imagine the complete painting and describe it" \
+                         f"you should consider the background color is {background_color}, the style is {style}" \
+                         f"You should make the painting as vivid and realistic as possible" \
+                         f"You can not use words like painting or picture" \
+                         f"and you should use no more than 50 words to describe it"
+        caption = self.llm(imagine_prompt) if imagine else BLIP_caption
+        caption = self.check_prompt(caption)
+        print(f'BLIP observation: {BLIP_caption}, ChatGPT imagine to {caption}') if imagine else print(
+            f'Prompt: {caption}')
+        return caption
+
+    def resize_image(self, image, max_size=1000000, multiple=8):
+        aspect_ratio = image.size[0] / image.size[1]
+        new_width = int(math.sqrt(max_size * aspect_ratio))
+        new_height = int(new_width / aspect_ratio)
+        new_width, new_height = new_width - (new_width % multiple), new_height - (new_height % multiple)
+        return image.resize((new_width, new_height))
+
+    def dowhile(self, original_img, tosize, expand_ratio, imagine, usr_prompt):
+        old_img = original_img
+        while (old_img.size != tosize):
+            prompt = self.check_prompt(usr_prompt) if usr_prompt else self.get_imagine_caption(old_img, imagine)
+            crop_w = 15 if old_img.size[0] != tosize[0] else 0
+            crop_h = 15 if old_img.size[1] != tosize[1] else 0
+            old_img = ImageOps.crop(old_img, (crop_w, crop_h, crop_w, crop_h))
+            temp_canvas_size = (expand_ratio * old_img.width if expand_ratio * old_img.width < tosize[0] else tosize[0],
+                                expand_ratio * old_img.height if expand_ratio * old_img.height < tosize[1] else tosize[
+                                    1])
+            temp_canvas, temp_mask = Image.new("RGB", temp_canvas_size, color="white"), Image.new("L", temp_canvas_size,
+                                                                                                  color="white")
+            x, y = (temp_canvas.width - old_img.width) // 2, (temp_canvas.height - old_img.height) // 2
+            temp_canvas.paste(old_img, (x, y))
+            temp_mask.paste(0, (x, y, x + old_img.width, y + old_img.height))
+            resized_temp_canvas, resized_temp_mask = self.resize_image(temp_canvas), self.resize_image(temp_mask)
+            image = self.ImageEditing.inpaint(prompt=prompt, image=resized_temp_canvas, mask_image=resized_temp_mask,
+                                              height=resized_temp_canvas.height, width=resized_temp_canvas.width,
+                                              num_inference_steps=50).images[0].resize(
+                (temp_canvas.width, temp_canvas.height), Image.ANTIALIAS)
+            image = blend_gt2pt(old_img, image)
+            old_img = image
+        return old_img
+
+    @prompts(name="Extend An Image",
+             description="useful when you need to extend an image into a larger image."
+                         "like: extend the image into a resolution of 2048x1024, extend the image into 2048x1024. "
+                         "The input to this tool should be a comma separated string of two, representing the image_path and the resolution of widthxheight")
+    def inference(self, inputs):
+        image_path, resolution = inputs.split(',')
+        width, height = resolution.split('x')
+        tosize = (int(width), int(height))
+        image = Image.open(image_path)
+        image = ImageOps.crop(image, (10, 10, 10, 10))
+        out_painted_image = self.dowhile(image, tosize, 4, True, False)
+        updated_image_path = get_new_image_name(image_path, func_name="outpainting")
+        out_painted_image.save(updated_image_path)
+        print(f"\nProcessed InfinityOutPainting, Input Image: {image_path}, Input Resolution: {resolution}, "
+              f"Output Image: {updated_image_path}")
+        return updated_image_path
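Note on blend_gt2pt: it pastes the original image back into the centre of the freshly generated canvas and mixes the two with a weight mask that is 1 in the interior of the original and ramps down to 0 over `steps` pixels toward its border, so the generated content takes over smoothly at the inpainting seam. The snippet below is a one-dimensional simplification of that masking idea (an assumed illustration with made-up names, not the repository's function):

import numpy as np

# 1-D simplification of the blending mask: weight 1 in the interior of the
# original strip, linear ramps to 0 over `steps` samples at each end.
def blend_weights_1d(length, steps):
    w = np.ones(length)
    w[:steps] = np.linspace(0, 1, steps)    # ramp up from the left border
    w[-steps:] = np.linspace(1, 0, steps)   # ramp down toward the right border
    return w

def blend_1d(original, generated, steps=3):
    w = blend_weights_1d(len(original), steps)
    return w * original + (1 - w) * generated   # mirrors kernel * gt + (1 - kernel) * pt

original = np.full(10, 255.0)   # bright "original" strip
generated = np.zeros(10)        # dark "generated" strip
print(blend_1d(original, generated))
# -> [0., 127.5, 255., 255., 255., 255., 255., 255., 127.5, 0.]

The real function does the same thing in 2-D, with linear ramps on the edges and a rescaled Gaussian outer product in the corners.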
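The growth schedule inside InfinityOutPainting.dowhile is easier to see without the diffusion call: each pass crops a 15-px border on any axis that has not reached the target, enlarges the canvas by expand_ratio (clamped to the target resolution), and stops once the working size equals tosize. The helper below only traces that size arithmetic; outpaint_sizes is a hypothetical, dependency-free stand-in, not part of the repository.

# Trace of the canvas-growth schedule used by dowhile (inpainting and blending omitted).
def outpaint_sizes(start, tosize, expand_ratio=4, crop=15):
    size = start
    schedule = []
    while size != tosize:
        # crop a 15-px border on any axis that is not yet at the target size
        crop_w = crop if size[0] != tosize[0] else 0
        crop_h = crop if size[1] != tosize[1] else 0
        size = (size[0] - 2 * crop_w, size[1] - 2 * crop_h)
        # the next canvas is expand_ratio times larger, clamped to the target
        size = (min(expand_ratio * size[0], tosize[0]),
                min(expand_ratio * size[1], tosize[1]))
        schedule.append(size)
    return schedule

print(outpaint_sizes((512, 512), (2048, 1024)))
# [(1928, 1024), (2048, 1024)]: two inpainting rounds reach 2048x1024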