open-gen-fill-v1

Paused

App Files Files Community

ariG23498 committed on Feb 15

Commit

f523ad6

•

1 Parent(s): 3d3d52b

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -28

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import spaces
 import gradio as gr
 import torch
 from diffusers import AutoPipelineForInpainting
-from PIL import Image
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
@@ -21,7 +21,7 @@ def delete_model(model):
     torch.cuda.empty_cache()
 @spaces.GPU()
-def run_language_model(edit_prompt, device):
     language_model_id = "Qwen/Qwen1.5-0.5B-Chat"
     language_model = AutoModelForCausalLM.from_pretrained(
         language_model_id, device_map="auto"
@@ -29,19 +29,27 @@ def run_language_model(edit_prompt, device):
     tokenizer = AutoTokenizer.from_pretrained(language_model_id)
     messages = [
         {"role": "system", "content": "Follow the examples and return the expected output"},
-        {"role": "user", "content": "swap mountain and lion"},  # example 1
-        {"role": "assistant", "content": "mountain, lion"},  # example 1
-        {"role": "user", "content": "change the dog with cat"},  # example 2
-        {"role": "assistant", "content": "dog, cat"},  # example 2
-        {"role": "user", "content": "change the cat with a dog"},  # example 3
-        {"role": "assistant", "content": "cat, dog"},  # example 3
-        {"role": "user", "content": "replace the human with a boat"},  # example 4
-        {"role": "assistant", "content": "human, boat"},  # example 4
-        {"role": "user", "content": "in the above example change the background to the alps"},  # example 5
-        {"role": "assistant", "content": "background, alps"},  # example 5
-        {"role": "user", "content": "edit the house into a mansion"},  # example 6
-        {"role": "assistant", "content": "house, a mansion"},  # example 6
-        {"role": "user", "content": edit_prompt}
     ]
     text = tokenizer.apply_chat_template(
         messages,
@@ -61,10 +69,13 @@ def run_language_model(edit_prompt, device):
           output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
         ]
     response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    to_replace, replace_with = response.split(", ")
     delete_model(language_model)
-    return (to_replace, replace_with)
 @spaces.GPU()
 def run_image_captioner(image, device):
@@ -120,13 +131,16 @@ def run_segmentation(image, object_to_segment, device):
     return masks
 @spaces.GPU()
-def run_inpainting(image, replaced_caption, masks, device):
     pipeline = AutoPipelineForInpainting.from_pretrained(
         "diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
         torch_dtype=torch.float16,
         variant="fp16",
     ).to(device)
     prompt = replaced_caption
     negative_prompt = """lowres, bad anatomy, bad hands,
     text, error, missing fingers, extra digit, fewer digits,
@@ -135,10 +149,11 @@ def run_inpainting(image, replaced_caption, masks, device):
     output = pipeline(
         prompt=prompt,
         image=image,
-        mask_image=Image.fromarray(masks.numpy()),
         negative_prompt=negative_prompt,
         guidance_scale=7.5,
         strength=1.0,
     ).images[0]
     delete_model(pipeline)
@@ -151,24 +166,22 @@ def run_open_gen_fill(image, edit_prompt):
     # Resize the image to (512, 512)
     image = image.resize((512, 512))
-    # Run the language model to extract the objects to be swapped from
-    # the edit prompt
-    to_replace, replace_with = run_language_model(
-        edit_prompt=edit_prompt, device=device
-    )
     # Caption the input image
     caption = run_image_captioner(image, device=device)
-    # Replace the object in the caption with the new object
-    replaced_caption = caption.replace(to_replace, replace_with)
     # Segment the `to_replace` object from the input image
     masks = run_segmentation(image, to_replace, device=device)
     # Diffusion pipeline for inpainting
     output = run_inpainting(
-        image=image, replaced_caption=replaced_caption, masks=masks, device=device
     )
     return (

 import gradio as gr
 import torch
 from diffusers import AutoPipelineForInpainting
+from PIL import Image, ImageFilter
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
     torch.cuda.empty_cache()
 @spaces.GPU()
+def run_language_model(edit_prompt, caption, device):
     language_model_id = "Qwen/Qwen1.5-0.5B-Chat"
     language_model = AutoModelForCausalLM.from_pretrained(
         language_model_id, device_map="auto"
     tokenizer = AutoTokenizer.from_pretrained(language_model_id)
     messages = [
         {"role": "system", "content": "Follow the examples and return the expected output"},
+        {"role": "user", "content": "Caption: a blue sky with fluffy clouds\nQuery: Make the sky stormy"},
+        {"role": "assistant", "content": "A: sky\nB: a stormy sky with heavy gray clouds, torrential rain, gloomy, overcast"},
+        {"role": "user", "content": "Caption: a cat sleeping on a sofa\nQuery: Change the cat to a dog"},
+        {"role": "assistant", "content": "A: cat\nB: a dog sleeping on a sofa, cozy and comfortable, snuggled up in a warm blanket, peaceful"},
+        {"role": "user", "content": "Caption: a snowy mountain peak\nQuery: Replace the snow with greenery"},
+        {"role": "assistant", "content": "A: snow\nB: a lush green mountain peak in summer, clear blue skies, birds flying overhead, serene and majestic"},
+        {"role": "user", "content": "Caption: a vintage car parked by the roadside\nQuery: Change the car to a modern electric vehicle"},
+        {"role": "assistant", "content": "A: car\nB: a sleek modern electric vehicle parked by the roadside, cutting-edge design, environmentally friendly, silent and powerful"},
+        {"role": "user", "content": "Caption: a wooden bridge over a river\nQuery: Make the bridge stone"},
+        {"role": "assistant", "content": "A: bridge\nB: an ancient stone bridge over a river, moss-covered, sturdy and timeless, with clear waters flowing beneath"},
+        {"role": "user", "content": "Caption: a bowl of salad on the table\nQuery: Replace salad with soup"},
+        {"role": "assistant", "content": "A: bowl\nB: a bowl of steaming hot soup on the table, scrumptious, with garnishing"},
+        {"role": "user", "content": "Caption: a book on a desk surrounded by stationery\nQuery: Remove all stationery, add a laptop"},
+        {"role": "assistant", "content": "A: stationery\nB: a book on a desk with a laptop next to it, modern study setup, focused and productive, technology and education combined"},
+        {"role": "user", "content": "Caption: a cup of coffee on a wooden table\nQuery: Change coffee to tea"},
+        {"role": "assistant", "content": "A: cup\nB: a steaming cup of tea on a wooden table, calming and aromatic, with a slice of lemon on the side, inviting"},
+        {"role": "user", "content": "Caption: a small pen on a white table\nQuery: Change the pen to an elaborate fountain pen"},
+        {"role": "assistant", "content": "A: pen\nB: an elaborate fountain pen on a white table, sleek and elegant, with intricate designs, ready for writing"},
+        {"role": "user", "content": "Caption: a plain notebook on a desk\nQuery: Replace the notebook with a journal"},
+        {"role": "assistant", "content": "A: notebook\nB: an artistically decorated journal on a desk, vibrant cover, filled with creativity, inspiring and personalized"},
+        {"role": "user", "content": f"Caption: {caption}\nQuery: {edit_prompt}"},
     ]
     text = tokenizer.apply_chat_template(
         messages,
           output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
         ]
     response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    output_generation_a, output_generation_b = response.split("\n")
+    to_replace = output_generation_a[2:].strip()
+    replace_caption = output_generation_b[2:].strip()
     delete_model(language_model)
+    return (to_replace, replace_caption)
 @spaces.GPU()
 def run_image_captioner(image, device):
     return masks
 @spaces.GPU()
+def run_inpainting(image, replaced_caption, masks, generator, device):
     pipeline = AutoPipelineForInpainting.from_pretrained(
         "diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
         torch_dtype=torch.float16,
         variant="fp16",
     ).to(device)
+    masks = Image.fromarray(masks.numpy())
+    dilation_image = masks.filter(ImageFilter.MaxFilter(3))
     prompt = replaced_caption
     negative_prompt = """lowres, bad anatomy, bad hands,
     text, error, missing fingers, extra digit, fewer digits,
     output = pipeline(
         prompt=prompt,
         image=image,
+        mask_image=dilation_image,
         negative_prompt=negative_prompt,
         guidance_scale=7.5,
         strength=1.0,
+        generator=generator,
     ).images[0]
     delete_model(pipeline)
     # Resize the image to (512, 512)
     image = image.resize((512, 512))
     # Caption the input image
     caption = run_image_captioner(image, device=device)
+    # Run the language model to extract the object for segmentation
+    # and get the replaced caption
+    to_replace, replace_caption = run_language_model(
+        edit_prompt=edit_prompt, caption=caption, device=device
+    )
     # Segment the `to_replace` object from the input image
     masks = run_segmentation(image, to_replace, device=device)
     # Diffusion pipeline for inpainting
+    generator = torch.Generator(device).manual_seed(17)
     output = run_inpainting(
+        image=image, replaced_caption=replaced_caption, masks=masks, generator=generator, device=device
     )
     return (