Fix: Ensure the object is correctly placed in the scene (without texturing) when no texture image is provided
In the previous implementation of the run_texture_scene method, if a texture image was not provided, the scene was incorrectly set to the object image. As a result, the object was not placed correctly in the provided scene.
To address this, I added a dedicated condition check to handle scenarios where only the object and scene images are provided, ensuring the object is correctly integrated into the scene.
Modified Code
Here is the modified run_texture_scene method with added comments for clarity:
```python
def run_texture_scene(self, image_object_path, image_texture_path, image_scene_path):
    # Process the input images
    image_object = self.process_image(image_object_path)
    image_texture = self.process_image(image_texture_path)
    image_scene = self.process_image(image_scene_path)

    if image_object is None:
        raise gr.Error('Object image is required')

    current_emb = None

    # If both object and scene images are provided, run scene processing
    if image_scene is not None:
        current_emb = self.run_binary(input_a=image_object, input_b=image_scene, prior_type='scene')
        scene_input = current_emb.image_embeds
    else:
        scene_input = image_object

    # If a texture image is provided, apply texturing
    if image_texture is not None:
        current_emb = self.run_binary(input_a=scene_input, input_b=image_texture, prior_type='texturing')

    if current_emb is None:
        raise gr.Error('At least one of the images is required')

    # Render the final image
    image = self.render(current_emb)
    return image
```
With the updated implementation of run_texture_scene, the method now correctly handles scenarios where:

- Only the object and scene images are provided: the object is placed within the scene without texturing.
- A texture image is also provided: the texture is applied to the combined object and scene.

A minimal usage sketch follows the list below.
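As a sketch of how the method might be called directly (the file paths below are hypothetical, and the sketch assumes the operator weights download successfully and a CUDA device is available, since PopsPipelines hard-codes self.device = 'cuda'):

```python
from pops import PopsPipelines  # module name assumed from the pops.py filename

pipelines = PopsPipelines()

# Object + scene, no texture: the object is placed in the scene as-is.
image = pipelines.run_texture_scene(
    image_object_path='examples/object.png',  # hypothetical path
    image_texture_path=None,                   # no texture -> texturing step is skipped
    image_scene_path='examples/scene.png',     # hypothetical path
)
image.save('object_in_scene.png')

# Object + scene + texture: texturing is applied on top of the scene embedding.
textured = pipelines.run_texture_scene(
    image_object_path='examples/object.png',
    image_texture_path='examples/texture.png',
    image_scene_path='examples/scene.png',
)
textured.save('object_in_scene_textured.png')
```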
Previous pops.py version:
![1.png](https://cdn-uploads.huggingface.co/production/uploads/64e1a81518af51be8e26c751/iF4xDkFmEmenAXasfhCV2.png)
Updated pops.py version:
![1.png](https://cdn-uploads.huggingface.co/production/uploads/64e1a81518af51be8e26c751/VIjt958h11zC8H1kZkK8Q.png)
![10.png](https://cdn-uploads.huggingface.co/production/uploads/64e1a81518af51be8e26c751/4wyCqU9SbWKJ5qsiT2z1U.png)
Diff of pops.py (@@ -1,231 +1,230 @@). Everything above run_texture_scene — the imports, PopsPipelines.__init__, process_image, process_text, run_binary, run_instruct, render, and run_instruct_texture — is identical in both versions. The only functional change is the implementation of run_texture_scene, whose previous body does not render in the diff view:

```diff
     def run_texture_scene(self, image_object_path, image_texture_path, image_scene_path):
+        image_object = self.process_image(image_object_path)
+        image_texture = self.process_image(image_texture_path)
+        image_scene = self.process_image(image_scene_path)
+
+        if image_object is None:
+            raise gr.Error('Object image is required')
+
+        current_emb = None
+
+        # If both object and scene images are provided, run scene processing
+        if image_scene is not None:
+            current_emb = self.run_binary(input_a=image_object, input_b=image_scene, prior_type='scene')
+            scene_input = current_emb.image_embeds
+        else:
+            scene_input = image_object
+
+        # If a texture image is provided, apply texturing
+        if image_texture is not None:
+            current_emb = self.run_binary(input_a=scene_input, input_b=image_texture, prior_type='texturing')
+
+        if current_emb is None:
+            raise gr.Error('At least one of the images is required')
+
+        # Render the final image
+        image = self.render(current_emb)
+
+        return image
```
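For context, here is a minimal sketch of how the Space might expose this method through Gradio. The component names, labels, and layout are assumptions; the actual app wiring is not part of this commit.

```python
import gradio as gr

from pops import PopsPipelines  # module name assumed from the pops.py filename

# Hypothetical wiring sketch; the real demo's layout and components may differ.
pipelines = PopsPipelines()

demo = gr.Interface(
    fn=pipelines.run_texture_scene,
    inputs=[
        gr.Image(type='filepath', label='Object image (required)'),
        gr.Image(type='filepath', label='Texture image (optional)'),
        gr.Image(type='filepath', label='Scene image (optional)'),
    ],
    outputs=gr.Image(label='Result'),
)

if __name__ == '__main__':
    demo.launch()
```

Because the inputs use type='filepath', an empty component is passed as None, which process_image turns into a skipped step, and the gr.Error raised for a missing object image surfaces directly in the UI.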