ariG23498 committed on
Commit
f523ad6
1 Parent(s): 3d3d52b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -28
app.py CHANGED
@@ -2,7 +2,7 @@ import spaces
2
  import gradio as gr
3
  import torch
4
  from diffusers import AutoPipelineForInpainting
5
- from PIL import Image
6
  from transformers import (
7
  AutoModelForCausalLM,
8
  AutoTokenizer,
@@ -21,7 +21,7 @@ def delete_model(model):
21
  torch.cuda.empty_cache()
22
 
23
  @spaces.GPU()
24
- def run_language_model(edit_prompt, device):
25
  language_model_id = "Qwen/Qwen1.5-0.5B-Chat"
26
  language_model = AutoModelForCausalLM.from_pretrained(
27
  language_model_id, device_map="auto"
@@ -29,19 +29,27 @@ def run_language_model(edit_prompt, device):
29
  tokenizer = AutoTokenizer.from_pretrained(language_model_id)
30
  messages = [
31
  {"role": "system", "content": "Follow the examples and return the expected output"},
32
- {"role": "user", "content": "swap mountain and lion"}, # example 1
33
- {"role": "assistant", "content": "mountain, lion"}, # example 1
34
- {"role": "user", "content": "change the dog with cat"}, # example 2
35
- {"role": "assistant", "content": "dog, cat"}, # example 2
36
- {"role": "user", "content": "change the cat with a dog"}, # example 3
37
- {"role": "assistant", "content": "cat, dog"}, # example 3
38
- {"role": "user", "content": "replace the human with a boat"}, # example 4
39
- {"role": "assistant", "content": "human, boat"}, # example 4
40
- {"role": "user", "content": "in the above example change the background to the alps"}, # example 5
41
- {"role": "assistant", "content": "background, alps"}, # example 5
42
- {"role": "user", "content": "edit the house into a mansion"}, # example 6
43
- {"role": "assistant", "content": "house, a mansion"}, # example 6
44
- {"role": "user", "content": edit_prompt}
 
 
 
 
 
 
 
 
45
  ]
46
  text = tokenizer.apply_chat_template(
47
  messages,
@@ -61,10 +69,13 @@ def run_language_model(edit_prompt, device):
61
  output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
62
  ]
63
  response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
64
- to_replace, replace_with = response.split(", ")
 
 
 
65
 
66
  delete_model(language_model)
67
- return (to_replace, replace_with)
68
 
69
  @spaces.GPU()
70
  def run_image_captioner(image, device):
@@ -120,13 +131,16 @@ def run_segmentation(image, object_to_segment, device):
120
  return masks
121
 
122
  @spaces.GPU()
123
- def run_inpainting(image, replaced_caption, masks, device):
124
  pipeline = AutoPipelineForInpainting.from_pretrained(
125
  "diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
126
  torch_dtype=torch.float16,
127
  variant="fp16",
128
  ).to(device)
129
 
 
 
 
130
  prompt = replaced_caption
131
  negative_prompt = """lowres, bad anatomy, bad hands,
132
  text, error, missing fingers, extra digit, fewer digits,
@@ -135,10 +149,11 @@ def run_inpainting(image, replaced_caption, masks, device):
135
  output = pipeline(
136
  prompt=prompt,
137
  image=image,
138
- mask_image=Image.fromarray(masks.numpy()),
139
  negative_prompt=negative_prompt,
140
  guidance_scale=7.5,
141
  strength=1.0,
 
142
  ).images[0]
143
 
144
  delete_model(pipeline)
@@ -151,24 +166,22 @@ def run_open_gen_fill(image, edit_prompt):
151
  # Resize the image to (512, 512)
152
  image = image.resize((512, 512))
153
 
154
- # Run the langauge model to extract the objects to be swapped from
155
- # the edit prompt
156
- to_replace, replace_with = run_language_model(
157
- edit_prompt=edit_prompt, device=device
158
- )
159
-
160
  # Caption the input image
161
  caption = run_image_captioner(image, device=device)
162
 
163
- # Replace the object in the caption with the new object
164
- replaced_caption = caption.replace(to_replace, replace_with)
 
 
 
165
 
166
  # Segment the `to_replace` object from the input image
167
  masks = run_segmentation(image, to_replace, device=device)
168
 
169
  # Diffusion pipeline for inpainting
 
170
  output = run_inpainting(
171
- image=image, replaced_caption=replaced_caption, masks=masks, device=device
172
  )
173
 
174
  return (
 
2
  import gradio as gr
3
  import torch
4
  from diffusers import AutoPipelineForInpainting
5
+ from PIL import Image, ImageFilter
6
  from transformers import (
7
  AutoModelForCausalLM,
8
  AutoTokenizer,
 
21
  torch.cuda.empty_cache()
22
 
23
  @spaces.GPU()
24
+ def run_language_model(edit_prompt, caption, device):
25
  language_model_id = "Qwen/Qwen1.5-0.5B-Chat"
26
  language_model = AutoModelForCausalLM.from_pretrained(
27
  language_model_id, device_map="auto"
 
29
  tokenizer = AutoTokenizer.from_pretrained(language_model_id)
30
  messages = [
31
  {"role": "system", "content": "Follow the examples and return the expected output"},
32
+ {"role": "user", "content": "Caption: a blue sky with fluffy clouds\nQuery: Make the sky stormy"},
33
+ {"role": "assistant", "content": "A: sky\nB: a stormy sky with heavy gray clouds, torrential rain, gloomy, overcast"},
34
+ {"role": "user", "content": "Caption: a cat sleeping on a sofa\nQuery: Change the cat to a dog"},
35
+ {"role": "assistant", "content": "A: cat\nB: a dog sleeping on a sofa, cozy and comfortable, snuggled up in a warm blanket, peaceful"},
36
+ {"role": "user", "content": "Caption: a snowy mountain peak\nQuery: Replace the snow with greenery"},
37
+ {"role": "assistant", "content": "A: snow\nB: a lush green mountain peak in summer, clear blue skies, birds flying overhead, serene and majestic"},
38
+ {"role": "user", "content": "Caption: a vintage car parked by the roadside\nQuery: Change the car to a modern electric vehicle"},
39
+ {"role": "assistant", "content": "A: car\nB: a sleek modern electric vehicle parked by the roadside, cutting-edge design, environmentally friendly, silent and powerful"},
40
+ {"role": "user", "content": "Caption: a wooden bridge over a river\nQuery: Make the bridge stone"},
41
+ {"role": "assistant", "content": "A: bridge\nB: an ancient stone bridge over a river, moss-covered, sturdy and timeless, with clear waters flowing beneath"},
42
+ {"role": "user", "content": "Caption: a bowl of salad on the table\nQuery: Replace salad with soup"},
43
+ {"role": "assistant", "content": "A: bowl\nB: a bowl of steaming hot soup on the table, scrumptious, with garnishing"},
44
+ {"role": "user", "content": "Caption: a book on a desk surrounded by stationery\nQuery: Remove all stationery, add a laptop"},
45
+ {"role": "assistant", "content": "A: stationery\nB: a book on a desk with a laptop next to it, modern study setup, focused and productive, technology and education combined"},
46
+ {"role": "user", "content": "Caption: a cup of coffee on a wooden table\nQuery: Change coffee to tea"},
47
+ {"role": "assistant", "content": "A: cup\nB: a steaming cup of tea on a wooden table, calming and aromatic, with a slice of lemon on the side, inviting"},
48
+ {"role": "user", "content": "Caption: a small pen on a white table\nQuery: Change the pen to an elaborate fountain pen"},
49
+ {"role": "assistant", "content": "A: pen\nB: an elaborate fountain pen on a white table, sleek and elegant, with intricate designs, ready for writing"},
50
+ {"role": "user", "content": "Caption: a plain notebook on a desk\nQuery: Replace the notebook with a journal"},
51
+ {"role": "assistant", "content": "A: notebook\nB: an artistically decorated journal on a desk, vibrant cover, filled with creativity, inspiring and personalized"},
52
+ {"role": "user", "content": f"Caption: {caption}\nQuery: {edit_prompt}"},
53
  ]
54
  text = tokenizer.apply_chat_template(
55
  messages,
 
69
  output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
70
  ]
71
  response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
72
+
73
+ output_generation_a, output_generation_b = response.split("\n")
74
+ to_replace = output_generation_a[2:].strip()
75
+ replace_caption = output_generation_b[2:].strip()
76
 
77
  delete_model(language_model)
78
+ return (to_replace, replace_caption)
79
 
80
  @spaces.GPU()
81
  def run_image_captioner(image, device):
 
131
  return masks
132
 
133
  @spaces.GPU()
134
+ def run_inpainting(image, replaced_caption, masks, generator, device):
135
  pipeline = AutoPipelineForInpainting.from_pretrained(
136
  "diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
137
  torch_dtype=torch.float16,
138
  variant="fp16",
139
  ).to(device)
140
 
141
+ masks = Image.fromarray(masks.numpy())
142
+ dilation_image = masks.filter(ImageFilter.MaxFilter(3))
143
+
144
  prompt = replaced_caption
145
  negative_prompt = """lowres, bad anatomy, bad hands,
146
  text, error, missing fingers, extra digit, fewer digits,
 
149
  output = pipeline(
150
  prompt=prompt,
151
  image=image,
152
+ mask_image=dilation_image,
153
  negative_prompt=negative_prompt,
154
  guidance_scale=7.5,
155
  strength=1.0,
156
+ generator=generator,
157
  ).images[0]
158
 
159
  delete_model(pipeline)
 
166
  # Resize the image to (512, 512)
167
  image = image.resize((512, 512))
168
 
 
 
 
 
 
 
169
  # Caption the input image
170
  caption = run_image_captioner(image, device=device)
171
 
172
+ # Run the language model to extract the object for segmentation
173
+ # and get the replaced caption
174
+ to_replace, replace_caption = run_language_model(
175
+ edit_prompt=edit_prompt, caption=caption, device=device
176
+ )
177
 
178
  # Segment the `to_replace` object from the input image
179
  masks = run_segmentation(image, to_replace, device=device)
180
 
181
  # Diffusion pipeline for inpainting
182
+ generator = torch.Generator(device).manual_seed(17)
183
  output = run_inpainting(
184
+ image=image, replaced_caption=replaced_caption, masks=masks, generator=generator, device=device
185
  )
186
 
187
  return (