Hugging Face Space · running on A10G
JingyeChen committed "update" · Commit 2af8ebe · 1 Parent(s): cbddee5

app.py CHANGED
@@ -187,7 +187,7 @@ def get_pixels(i, t, evt: gr.SelectData):
 
 
 
-def text_to_image(prompt,keywords,slider_step,slider_guidance,slider_batch,slider_temperature):
+def text_to_image(prompt,keywords,slider_step,slider_guidance,slider_batch,slider_temperature,slider_natural):
 
     global stack
     global state
@@ -196,105 +196,111 @@ def text_to_image(prompt,keywords,slider_step,slider_guidance,slider_batch,slider_temperature):
     time1 = time.time()
     user_prompt = prompt
 
-    if len(stack) == 0:
-
-        if len(keywords.strip()) == 0:
-            template = f'Given a prompt that will be used to generate an image, plan the layout of visual text for the image. The size of the image is 128x128. Therefore, all properties of the positions should not exceed 128, including the coordinates of top, left, right, and bottom. All keywords are included in the caption. You dont need to specify the details of font styles. At each line, the format should be keyword left, top, right, bottom. So let us begin. Prompt: {user_prompt}'
-        else:
-            keywords = keywords.split('/')
-            keywords = [i.strip() for i in keywords]
-            template = f'Given a prompt that will be used to generate an image, plan the layout of visual text for the image. The size of the image is 128x128. Therefore, all properties of the positions should not exceed 128, including the coordinates of top, left, right, and bottom. In addition, we also provide all keywords at random order for reference. You dont need to specify the details of font styles. At each line, the format should be keyword left, top, right, bottom. So let us begin. Prompt: {prompt}. Keywords: {str(keywords)}'
-
-        msg = template
-        conv = get_conversation_template(m1_model_path)
-        conv.append_message(conv.roles[0], msg)
-        conv.append_message(conv.roles[1], None)
-        prompt = conv.get_prompt()
-        inputs = m1_tokenizer([prompt], return_token_type_ids=False)
-        inputs = {k: torch.tensor(v).to('cuda') for k, v in inputs.items()}
-        output_ids = m1_model.generate(
-            **inputs,
-            do_sample=True,
-            temperature=slider_temperature,
-            repetition_penalty=1.0,
-            max_new_tokens=512,
-        )
-
-        if m1_model.config.is_encoder_decoder:
-            output_ids = output_ids[0]
-        else:
-            output_ids = output_ids[0][len(inputs["input_ids"][0]) :]
-        outputs = m1_tokenizer.decode(
-            output_ids, skip_special_tokens=True, spaces_between_special_tokens=False
-        )
-        print(f"[{conv.roles[0]}]\n{msg}")
-        print(f"[{conv.roles[1]}]\n{outputs}")
-        ocrs = outputs.split('\n')
-        time2 = time.time()
-        print(time2-time1)
-
-        # user_prompt = prompt
-        current_ocr = ocrs
-
-        ocr_ids = []
-        print('user_prompt', user_prompt)
-        print('current_ocr', current_ocr)
-
-        for ocr in current_ocr:
-            ocr = ocr.strip()
-
-            if len(ocr) == 0 or '###' in ocr or '.com' in ocr:
-                continue
-
-            items = ocr.split()
-            pred = ' '.join(items[:-1])
-            box = items[-1]
-
-            l,t,r,b = box.split(',')
-            l,t,r,b = int(l), int(t), int(r), int(b)
-            ocr_ids.extend(['l'+str(l), 't'+str(t), 'r'+str(r), 'b'+str(b)])
-
-            char_list = list(pred)
-            char_list = [f'[{i}]' for i in char_list]
-            ocr_ids.extend(char_list)
-            ocr_ids.append(tokenizer.eos_token_id)
-
-        caption_ids = tokenizer(
-            user_prompt, truncation=True, return_tensors="pt"
-        ).input_ids[0].tolist()
-
-        try:
-            ocr_ids = tokenizer.encode(ocr_ids)
-            prompt = caption_ids + ocr_ids
-        except:
-            prompt = caption_ids
-
-        composed_prompt = tokenizer.decode(prompt)
-
-    else:
-        user_prompt += ' <|endoftext|>'
-
-        for items in stack:
-            position, text = items
-
-            if len(position) == 2:
-                x, y = position
-                x = x // 4
-                y = y // 4
-                text_str = ' '.join([f'[{c}]' for c in list(text)])
-                user_prompt += f'<|startoftext|> l{x} t{y} {text_str} <|endoftext|>'
-            elif len(position) == 4:
-                x0, y0, x1, y1 = position
-                x0 = x0 // 4
-                y0 = y0 // 4
-                x1 = x1 // 4
-                y1 = y1 // 4
-                text_str = ' '.join([f'[{c}]' for c in list(text)])
-                user_prompt += f'<|startoftext|> l{x0} t{y0} r{x1} b{y1} {text_str} <|endoftext|>'
-
-        composed_prompt = user_prompt
-        prompt = tokenizer.encode(user_prompt)
+    if slider_natural:
+        user_prompt += ' <|endoftext|>'
+        composed_prompt = tokenizer.decode(prompt)
+    else:
+        if len(stack) == 0:
+
+            if len(keywords.strip()) == 0:
+                template = f'Given a prompt that will be used to generate an image, plan the layout of visual text for the image. The size of the image is 128x128. Therefore, all properties of the positions should not exceed 128, including the coordinates of top, left, right, and bottom. All keywords are included in the caption. You dont need to specify the details of font styles. At each line, the format should be keyword left, top, right, bottom. So let us begin. Prompt: {user_prompt}'
+            else:
+                keywords = keywords.split('/')
+                keywords = [i.strip() for i in keywords]
+                template = f'Given a prompt that will be used to generate an image, plan the layout of visual text for the image. The size of the image is 128x128. Therefore, all properties of the positions should not exceed 128, including the coordinates of top, left, right, and bottom. In addition, we also provide all keywords at random order for reference. You dont need to specify the details of font styles. At each line, the format should be keyword left, top, right, bottom. So let us begin. Prompt: {prompt}. Keywords: {str(keywords)}'
+
+            msg = template
+            conv = get_conversation_template(m1_model_path)
+            conv.append_message(conv.roles[0], msg)
+            conv.append_message(conv.roles[1], None)
+            prompt = conv.get_prompt()
+            inputs = m1_tokenizer([prompt], return_token_type_ids=False)
+            inputs = {k: torch.tensor(v).to('cuda') for k, v in inputs.items()}
+            output_ids = m1_model.generate(
+                **inputs,
+                do_sample=True,
+                temperature=slider_temperature,
+                repetition_penalty=1.0,
+                max_new_tokens=512,
+            )
+
+            if m1_model.config.is_encoder_decoder:
+                output_ids = output_ids[0]
+            else:
+                output_ids = output_ids[0][len(inputs["input_ids"][0]) :]
+            outputs = m1_tokenizer.decode(
+                output_ids, skip_special_tokens=True, spaces_between_special_tokens=False
+            )
+            print(f"[{conv.roles[0]}]\n{msg}")
+            print(f"[{conv.roles[1]}]\n{outputs}")
+            ocrs = outputs.split('\n')
+            time2 = time.time()
+            print(time2-time1)
+
+            # user_prompt = prompt
+            current_ocr = ocrs
+
+            ocr_ids = []
+            print('user_prompt', user_prompt)
+            print('current_ocr', current_ocr)
+
+            for ocr in current_ocr:
+                ocr = ocr.strip()
+
+                if len(ocr) == 0 or '###' in ocr or '.com' in ocr:
+                    continue
+
+                items = ocr.split()
+                pred = ' '.join(items[:-1])
+                box = items[-1]
+
+                l,t,r,b = box.split(',')
+                l,t,r,b = int(l), int(t), int(r), int(b)
+                ocr_ids.extend(['l'+str(l), 't'+str(t), 'r'+str(r), 'b'+str(b)])
+
+                char_list = list(pred)
+                char_list = [f'[{i}]' for i in char_list]
+                ocr_ids.extend(char_list)
+                ocr_ids.append(tokenizer.eos_token_id)
+
+            caption_ids = tokenizer(
+                user_prompt, truncation=True, return_tensors="pt"
+            ).input_ids[0].tolist()
+
+            try:
+                ocr_ids = tokenizer.encode(ocr_ids)
+                prompt = caption_ids + ocr_ids
+            except:
+                prompt = caption_ids
+
+            composed_prompt = tokenizer.decode(prompt)
+
+        else:
+            user_prompt += ' <|endoftext|>'
+
+            for items in stack:
+                position, text = items
+
+                if len(position) == 2:
+                    x, y = position
+                    x = x // 4
+                    y = y // 4
+                    text_str = ' '.join([f'[{c}]' for c in list(text)])
+                    user_prompt += f'<|startoftext|> l{x} t{y} {text_str} <|endoftext|>'
+                elif len(position) == 4:
+                    x0, y0, x1, y1 = position
+                    x0 = x0 // 4
+                    y0 = y0 // 4
+                    x1 = x1 // 4
+                    y1 = y1 // 4
+                    text_str = ' '.join([f'[{c}]' for c in list(text)])
+                    user_prompt += f'<|startoftext|> l{x0} t{y0} r{x1} b{y1} {text_str} <|endoftext|>'
+
+            composed_prompt = user_prompt
+            prompt = tokenizer.encode(user_prompt)
 
     prompt = prompt[:77]
     while len(prompt) < 77:
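Note: the hunk above builds the diffusion prompt along two paths: layout-planner output lines become box-coordinate tokens plus per-character tokens, while boxes painted on the 256x256 canvas are downscaled by 4 and serialized straight into the prompt string. A minimal sketch of both conversions (the helper names below are illustrative, not functions in app.py); either result is then clipped or padded to 77 entries, as the trailing context lines show:

    def encode_planner_line(line: str) -> list[str]:
        # One planner line, e.g. "hello 10,20,60,35": the last whitespace-
        # separated field is the box, everything before it is the keyword.
        items = line.split()
        pred = ' '.join(items[:-1])
        l, t, r, b = (int(v) for v in items[-1].split(','))
        return [f'l{l}', f't{t}', f'r{r}', f'b{b}'] + [f'[{c}]' for c in pred]

    print(encode_planner_line('hello 10,20,60,35'))
    # ['l10', 't20', 'r60', 'b35', '[h]', '[e]', '[l]', '[l]', '[o]']

    def encode_stack_entry(position: tuple, text: str) -> str:
        # A painted box: coordinates are floor-divided by 4, characters are
        # bracket-wrapped, and the entry is appended to the text prompt.
        chars = ' '.join(f'[{c}]' for c in text)
        if len(position) == 2:
            x, y = (v // 4 for v in position)
            return f'<|startoftext|> l{x} t{y} {chars} <|endoftext|>'
        x0, y0, x1, y1 = (v // 4 for v in position)
        return f'<|startoftext|> l{x0} t{y0} r{x1} b{y1} {chars} <|endoftext|>'

    print(encode_stack_entry((40, 40, 200, 80), 'hi'))
    # <|startoftext|> l10 t10 r50 b20 [h] [i] <|endoftext|>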
@@ -340,8 +346,9 @@ def text_to_image(prompt,keywords,slider_step,slider_guidance,slider_batch,slider_temperature):
         col = index % 2
         new_image.paste(image, (col*width, row*height))
         # new_image.save(f'{args.output_dir}/pred_img_{sample_index}_{args.local_rank}.jpg')
-        results.insert(0, new_image)
+        # results.insert(0, new_image)
         # return new_image
+    os.system('nvidia-smi')
     return tuple(results), composed_prompt
 
 with gr.Blocks() as demo:
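Note: the added os.system('nvidia-smi') shells out once per generation to dump GPU status into the Space logs. A lighter in-process sketch, assuming the torch import app.py already has:

    if torch.cuda.is_available():
        # Report allocated vs. reserved CUDA memory in MiB without a subprocess.
        print(f'allocated: {torch.cuda.memory_allocated() / 2**20:.0f} MiB, '
              f'reserved: {torch.cuda.memory_reserved() / 2**20:.0f} MiB')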
@@ -349,7 +356,7 @@ with gr.Blocks() as demo:
     gr.HTML(
         """
         <div style="text-align: center; max-width: 1600px; margin: 20px auto;">
-        <h2 style="font-weight: 900; font-size: 2.
+        <h2 style="font-weight: 900; font-size: 2.5rem; margin: 0rem">
            TextDiffuser-2: Unleashing the Power of Language Models for Text Rendering
         </h2>
         <h3 style="font-weight: 450; font-size: 1rem; margin: 0rem">
@@ -384,7 +391,7 @@ with gr.Blocks() as demo:
         with gr.Column(scale=1):
             i = gr.Image(label="Template (Click to paint)", type='filepath', value=f'./gray256.jpg', height=256, width=256)
         with gr.Column(scale=1):
-            t = gr.Textbox(label="
+            t = gr.Textbox(label="Keyword", value='input_keyword')
             redo = gr.Button(value='Redo - Cancel the last keyword') # how to bind an event to b
             undo = gr.Button(value='Undo - Clear the canvas') # how to bind an event to b
             skip_button = gr.Button(value='Skip - Operate next keyword') # how to bind an event to b
@@ -399,7 +406,8 @@ with gr.Blocks() as demo:
             slider_guidance = gr.Slider(minimum=1, maximum=9, value=7.5, step=0.5, label="Scale of classifier-free guidance", info="The scale of classifier-free guidance and is set to 7.5 in default.")
             slider_batch = gr.Slider(minimum=1, maximum=4, value=4, step=1, label="Batch size", info="The number of images to be sampled.")
             slider_temperature = gr.Slider(minimum=0.1, maximum=2, value=0.7, step=0.1, label="Temperature", info="Control the diversity of layout planner. Higher value indicates more diversity.")
-
+            slider_natural = gr.Checkbox(label="Natural image generation", bool=False, info="The text position and content info will not be incorporated.")
+            slider_seed = gr.Slider(minimum=1, maximum=10000, label="Seed", randomize=True)
             button = gr.Button("Generate")
 
         with gr.Column(scale=1):
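Note: gr.Checkbox takes its initial state through value=, and bool= is not a Checkbox parameter, so the unchecked default above comes from Gradio's own default rather than from this argument. A sketch of the likely intent:

    slider_natural = gr.Checkbox(
        label="Natural image generation",
        value=False,  # `value=`, not `bool=`, sets the Checkbox default state
        info="The text position and content info will not be incorporated.",
    )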
@@ -415,7 +423,7 @@ with gr.Blocks() as demo:
 
     # gr.Markdown("## Prompt Examples")
 
-    button.click(text_to_image, inputs=[prompt,keywords,slider_step,slider_guidance,slider_batch,slider_temperature], outputs=[output, composed_prompt])
+    button.click(text_to_image, inputs=[prompt,keywords,slider_step,slider_guidance,slider_batch,slider_temperature,slider_natural], outputs=[output, composed_prompt])
 
     gr.Markdown("## Prompt Examples")
     gr.Examples(
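Note: only slider_natural joins the click inputs here; the new slider_seed is created in the UI but not consumed by text_to_image in this commit, so the seed does not yet influence sampling. If a later commit wired it in, the call would hypothetically look like this (with a matching slider_seed parameter added to text_to_image):

    button.click(
        text_to_image,
        inputs=[prompt, keywords, slider_step, slider_guidance,
                slider_batch, slider_temperature, slider_natural, slider_seed],
        outputs=[output, composed_prompt],
    )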