JingyeChen committed
Commit 2af8ebe
1 Parent(s): cbddee5
Files changed (1)
  1. app.py +107 -99
app.py CHANGED
@@ -187,7 +187,7 @@ def get_pixels(i, t, evt: gr.SelectData):
 
 
 
-def text_to_image(prompt,keywords,slider_step,slider_guidance,slider_batch,slider_temperature):
+def text_to_image(prompt,keywords,slider_step,slider_guidance,slider_batch,slider_temperature,slider_natural):
 
     global stack
     global state
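The hunk context above references `get_pixels(i, t, evt: gr.SelectData)`, the canvas click handler that fills the global `stack` consumed by `text_to_image`. For orientation, a minimal sketch of how such a handler collects click coordinates in Gradio (the body here is illustrative, not the code from this commit):

```python
import gradio as gr

stack = []  # (position, keyword) pairs, later serialized into the prompt

def get_pixels(img, keyword, evt: gr.SelectData):
    # evt.index holds the (x, y) pixel position of the click on the image
    x, y = evt.index
    stack.append(((x, y), keyword))
    return f'recorded {keyword!r} at ({x}, {y})'
```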
@@ -196,105 +196,111 @@ def text_to_image(prompt,keywords,slider_step,slider_guidance,slider_batch,slider_temperature):
     time1 = time.time()
     user_prompt = prompt
 
-
-    if len(stack) == 0:
-
-        if len(keywords.strip()) == 0:
-            template = f'Given a prompt that will be used to generate an image, plan the layout of visual text for the image. The size of the image is 128x128. Therefore, all properties of the positions should not exceed 128, including the coordinates of top, left, right, and bottom. All keywords are included in the caption. You dont need to specify the details of font styles. At each line, the format should be keyword left, top, right, bottom. So let us begin. Prompt: {user_prompt}'
-        else:
-            keywords = keywords.split('/')
-            keywords = [i.strip() for i in keywords]
-            template = f'Given a prompt that will be used to generate an image, plan the layout of visual text for the image. The size of the image is 128x128. Therefore, all properties of the positions should not exceed 128, including the coordinates of top, left, right, and bottom. In addition, we also provide all keywords at random order for reference. You dont need to specify the details of font styles. At each line, the format should be keyword left, top, right, bottom. So let us begin. Prompt: {prompt}. Keywords: {str(keywords)}'
-
-        msg = template
-        conv = get_conversation_template(m1_model_path)
-        conv.append_message(conv.roles[0], msg)
-        conv.append_message(conv.roles[1], None)
-        prompt = conv.get_prompt()
-        inputs = m1_tokenizer([prompt], return_token_type_ids=False)
-        inputs = {k: torch.tensor(v).to('cuda') for k, v in inputs.items()}
-        output_ids = m1_model.generate(
-            **inputs,
-            do_sample=True,
-            temperature=slider_temperature,
-            repetition_penalty=1.0,
-            max_new_tokens=512,
-        )
-
-        if m1_model.config.is_encoder_decoder:
-            output_ids = output_ids[0]
-        else:
-            output_ids = output_ids[0][len(inputs["input_ids"][0]) :]
-        outputs = m1_tokenizer.decode(
-            output_ids, skip_special_tokens=True, spaces_between_special_tokens=False
-        )
-        print(f"[{conv.roles[0]}]\n{msg}")
-        print(f"[{conv.roles[1]}]\n{outputs}")
-        ocrs = outputs.split('\n')
-        time2 = time.time()
-        print(time2-time1)
-
-        # user_prompt = prompt
-        current_ocr = ocrs
-
-        ocr_ids = []
-        print('user_prompt', user_prompt)
-        print('current_ocr', current_ocr)
-
-        for ocr in current_ocr:
-            ocr = ocr.strip()
-
-            if len(ocr) == 0 or '###' in ocr or '.com' in ocr:
-                continue
-
-            items = ocr.split()
-            pred = ' '.join(items[:-1])
-            box = items[-1]
-
-            l,t,r,b = box.split(',')
-            l,t,r,b = int(l), int(t), int(r), int(b)
-            ocr_ids.extend(['l'+str(l), 't'+str(t), 'r'+str(r), 'b'+str(b)])
-
-            char_list = list(pred)
-            char_list = [f'[{i}]' for i in char_list]
-            ocr_ids.extend(char_list)
-            ocr_ids.append(tokenizer.eos_token_id)
-
-        caption_ids = tokenizer(
-            user_prompt, truncation=True, return_tensors="pt"
-        ).input_ids[0].tolist()
-
-        try:
-            ocr_ids = tokenizer.encode(ocr_ids)
-            prompt = caption_ids + ocr_ids
-        except:
-            prompt = caption_ids
-
-        composed_prompt = tokenizer.decode(prompt)
-
-    else:
-        user_prompt += ' <|endoftext|>'
-
-        for items in stack:
-            position, text = items
-
-            if len(position) == 2:
-                x, y = position
-                x = x // 4
-                y = y // 4
-                text_str = ' '.join([f'[{c}]' for c in list(text)])
-                user_prompt += f'<|startoftext|> l{x} t{y} {text_str} <|endoftext|>'
-            elif len(position) == 4:
-                x0, y0, x1, y1 = position
-                x0 = x0 // 4
-                y0 = y0 // 4
-                x1 = x1 // 4
-                y1 = y1 // 4
-                text_str = ' '.join([f'[{c}]' for c in list(text)])
-                user_prompt += f'<|startoftext|> l{x0} t{y0} r{x1} b{y1} {text_str} <|endoftext|>'
-
-        composed_prompt = user_prompt
-        prompt = tokenizer.encode(user_prompt)
+    if slider_natural:
+        user_prompt += ' <|endoftext|>'
+        composed_prompt = tokenizer.decode(prompt)
+    else:
+        if len(stack) == 0:
+
+            if len(keywords.strip()) == 0:
+                template = f'Given a prompt that will be used to generate an image, plan the layout of visual text for the image. The size of the image is 128x128. Therefore, all properties of the positions should not exceed 128, including the coordinates of top, left, right, and bottom. All keywords are included in the caption. You dont need to specify the details of font styles. At each line, the format should be keyword left, top, right, bottom. So let us begin. Prompt: {user_prompt}'
+            else:
+                keywords = keywords.split('/')
+                keywords = [i.strip() for i in keywords]
+                template = f'Given a prompt that will be used to generate an image, plan the layout of visual text for the image. The size of the image is 128x128. Therefore, all properties of the positions should not exceed 128, including the coordinates of top, left, right, and bottom. In addition, we also provide all keywords at random order for reference. You dont need to specify the details of font styles. At each line, the format should be keyword left, top, right, bottom. So let us begin. Prompt: {prompt}. Keywords: {str(keywords)}'
+
+            msg = template
+            conv = get_conversation_template(m1_model_path)
+            conv.append_message(conv.roles[0], msg)
+            conv.append_message(conv.roles[1], None)
+            prompt = conv.get_prompt()
+            inputs = m1_tokenizer([prompt], return_token_type_ids=False)
+            inputs = {k: torch.tensor(v).to('cuda') for k, v in inputs.items()}
+            output_ids = m1_model.generate(
+                **inputs,
+                do_sample=True,
+                temperature=slider_temperature,
+                repetition_penalty=1.0,
+                max_new_tokens=512,
+            )
+
+            if m1_model.config.is_encoder_decoder:
+                output_ids = output_ids[0]
+            else:
+                output_ids = output_ids[0][len(inputs["input_ids"][0]) :]
+            outputs = m1_tokenizer.decode(
+                output_ids, skip_special_tokens=True, spaces_between_special_tokens=False
+            )
+            print(f"[{conv.roles[0]}]\n{msg}")
+            print(f"[{conv.roles[1]}]\n{outputs}")
+            ocrs = outputs.split('\n')
+            time2 = time.time()
+            print(time2-time1)
+
+            # user_prompt = prompt
+            current_ocr = ocrs
+
+            ocr_ids = []
+            print('user_prompt', user_prompt)
+            print('current_ocr', current_ocr)
+
+
+            for ocr in current_ocr:
+                ocr = ocr.strip()
+
+                if len(ocr) == 0 or '###' in ocr or '.com' in ocr:
+                    continue
+
+                items = ocr.split()
+                pred = ' '.join(items[:-1])
+                box = items[-1]
+
+                l,t,r,b = box.split(',')
+                l,t,r,b = int(l), int(t), int(r), int(b)
+                ocr_ids.extend(['l'+str(l), 't'+str(t), 'r'+str(r), 'b'+str(b)])
+
+                char_list = list(pred)
+                char_list = [f'[{i}]' for i in char_list]
+                ocr_ids.extend(char_list)
+                ocr_ids.append(tokenizer.eos_token_id)
+
+            caption_ids = tokenizer(
+                user_prompt, truncation=True, return_tensors="pt"
+            ).input_ids[0].tolist()
+
+            try:
+                ocr_ids = tokenizer.encode(ocr_ids)
+                prompt = caption_ids + ocr_ids
+            except:
+                prompt = caption_ids
+
+            composed_prompt = tokenizer.decode(prompt)
+
+        else:
+            user_prompt += ' <|endoftext|>'
+
+
+            for items in stack:
+                position, text = items
+
+
+                if len(position) == 2:
+                    x, y = position
+                    x = x // 4
+                    y = y // 4
+                    text_str = ' '.join([f'[{c}]' for c in list(text)])
+                    user_prompt += f'<|startoftext|> l{x} t{y} {text_str} <|endoftext|>'
+                elif len(position) == 4:
+                    x0, y0, x1, y1 = position
+                    x0 = x0 // 4
+                    y0 = y0 // 4
+                    x1 = x1 // 4
+                    y1 = y1 // 4
+                    text_str = ' '.join([f'[{c}]' for c in list(text)])
+                    user_prompt += f'<|startoftext|> l{x0} t{y0} r{x1} b{y1} {text_str} <|endoftext|>'
+
+            composed_prompt = user_prompt
+            prompt = tokenizer.encode(user_prompt)
 
     prompt = prompt[:77]
     while len(prompt) < 77:
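To make the moved planner branch easier to follow: each line the layout planner returns has the form `keyword left,top,right,bottom`, and is turned into four coordinate tokens plus one token per character. A self-contained sketch of that encoding step (the helper name is hypothetical; the real code inlines this logic in the loop above):

```python
def encode_ocr_line(ocr_line: str) -> list:
    """Encode a planner line like 'HELLO 10,20,90,40' into layout tokens."""
    items = ocr_line.split()
    pred = ' '.join(items[:-1])   # keyword text; may itself contain spaces
    box = items[-1]               # 'left,top,right,bottom', within the 128x128 grid
    l, t, r, b = (int(v) for v in box.split(','))
    tokens = [f'l{l}', f't{t}', f'r{r}', f'b{b}']
    tokens += [f'[{c}]' for c in pred]  # one token per rendered character
    return tokens

print(encode_ocr_line('HELLO 10,20,90,40'))
# ['l10', 't20', 'r90', 'b40', '[H]', '[E]', '[L]', '[L]', '[O]']
```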
@@ -340,8 +346,9 @@ def text_to_image(prompt,keywords,slider_step,slider_guidance,slider_batch,slider_temperature):
         col = index % 2
         new_image.paste(image, (col*width, row*height))
     # new_image.save(f'{args.output_dir}/pred_img_{sample_index}_{args.local_rank}.jpg')
-    results.insert(0, new_image)
+    # results.insert(0, new_image)
     # return new_image
+    os.system('nvidia-smi')
     return tuple(results), composed_prompt
 
 with gr.Blocks() as demo:
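The added `os.system('nvidia-smi')` call shells out purely to log GPU state after generation. If only the memory numbers are needed, a lighter in-process alternative (an option, not what this commit does) would be:

```python
import torch

if torch.cuda.is_available():
    # bytes currently allocated / reserved by this process on GPU 0
    print(f'allocated: {torch.cuda.memory_allocated(0) / 2**20:.1f} MiB')
    print(f'reserved:  {torch.cuda.memory_reserved(0) / 2**20:.1f} MiB')
```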
@@ -349,7 +356,7 @@ with gr.Blocks() as demo:
     gr.HTML(
         """
         <div style="text-align: center; max-width: 1600px; margin: 20px auto;">
-        <h2 style="font-weight: 900; font-size: 2.3rem; margin: 0rem">
+        <h2 style="font-weight: 900; font-size: 2.5rem; margin: 0rem">
         TextDiffuser-2: Unleashing the Power of Language Models for Text Rendering
         </h2>
         <h3 style="font-weight: 450; font-size: 1rem; margin: 0rem">
@@ -384,7 +391,7 @@ with gr.Blocks() as demo:
             with gr.Column(scale=1):
                 i = gr.Image(label="Template (Click to paint)", type='filepath', value=f'./gray256.jpg', height=256, width=256)
             with gr.Column(scale=1):
-                t = gr.Textbox(label="Template", placeholder='keyword')
+                t = gr.Textbox(label="Keyword", value='input_keyword')
                 redo = gr.Button(value='Redo - Cancel the last keyword')  # how to bind an event to b
                 undo = gr.Button(value='Undo - Clear the canvas')  # how to bind an event to b
                 skip_button = gr.Button(value='Skip - Operate next keyword')  # how to bind an event to b
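The inline comments next to `redo`/`undo`/`skip_button` ask how to bind an event to the button. In Gradio that is done with `Button.click`; a minimal sketch with a hypothetical handler (the actual handlers live elsewhere in app.py):

```python
import gradio as gr

with gr.Blocks() as demo:
    i = gr.Image(type='filepath', value='./gray256.jpg')
    t = gr.Textbox(label='Keyword')
    undo = gr.Button('Undo - Clear the canvas')

    def exe_undo(img, keyword):
        # hypothetical handler: reset the canvas to the blank template
        return './gray256.jpg'

    # binding an event to the button: Button.click(fn, inputs, outputs)
    undo.click(exe_undo, inputs=[i, t], outputs=[i])
```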
@@ -399,7 +406,8 @@ with gr.Blocks() as demo:
                 slider_guidance = gr.Slider(minimum=1, maximum=9, value=7.5, step=0.5, label="Scale of classifier-free guidance", info="The scale of classifier-free guidance and is set to 7.5 in default.")
                 slider_batch = gr.Slider(minimum=1, maximum=4, value=4, step=1, label="Batch size", info="The number of images to be sampled.")
                 slider_temperature = gr.Slider(minimum=0.1, maximum=2, value=0.7, step=0.1, label="Temperature", info="Control the diversity of layout planner. Higher value indicates more diversity.")
-                # slider_seed = gr.Slider(minimum=1, maximum=10000, label="Seed", randomize=True)
+                slider_natural = gr.Checkbox(label="Natural image generation", bool=False, info="The text position and content info will not be incorporated.")
+                slider_seed = gr.Slider(minimum=1, maximum=10000, label="Seed", randomize=True)
                 button = gr.Button("Generate")
 
             with gr.Column(scale=1):
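One caveat on the added checkbox: `gr.Checkbox` has no `bool` parameter; the initial state is set with `value`. As committed, the line relies on the checkbox's implicit default of unchecked. A sketch of the presumably intended call:

```python
import gradio as gr

slider_natural = gr.Checkbox(
    label="Natural image generation",
    value=False,  # `value`, not `bool`, sets the initial state
    info="The text position and content info will not be incorporated.",
)
```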
@@ -415,7 +423,7 @@ with gr.Blocks() as demo:
 
         # gr.Markdown("## Prompt Examples")
 
-        button.click(text_to_image, inputs=[prompt,keywords,slider_step,slider_guidance,slider_batch,slider_temperature], outputs=[output, composed_prompt])
+        button.click(text_to_image, inputs=[prompt,keywords,slider_step,slider_guidance,slider_batch,slider_temperature,slider_natural], outputs=[output, composed_prompt])
 
         gr.Markdown("## Prompt Examples")
         gr.Examples(
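A note on the rewired `button.click`: Gradio maps the `inputs` list onto the callback's parameters positionally, so the list must stay in the same order as the signature of `text_to_image`; the commit accordingly appends `slider_natural` to both ends of the wire. A tiny self-contained illustration of that positional mapping (all names hypothetical):

```python
import gradio as gr

def greet(name, excited):
    return f'Hello, {name}' + ('!' if excited else '.')

with gr.Blocks() as demo:
    name = gr.Textbox(label='Name')
    excited = gr.Checkbox(label='Excited')
    out = gr.Textbox(label='Greeting')
    # the inputs list maps positionally onto greet(name, excited)
    gr.Button('Go').click(greet, inputs=[name, excited], outputs=[out])
```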
 