moulichand committed
Commit 5a0d971
1 Parent(s): d05f06b

Fix: Ensure the object is correctly placed in the scene (without texturing) when no texture image is provided


In the previous implementation of the run_texture_scene method, the branching was keyed on the texture image rather than the scene image: when no texture image was provided, the scene operator was skipped entirely and the scene input was simply set to the object image. As a result, the object was not placed correctly in the provided scene.

To address this, I restructured the condition checks so that the scene operator runs whenever a scene image is provided, and texturing is applied afterwards only if a texture image is also supplied. This ensures the object is correctly integrated into the scene even when no texture is given.

Modified Code
Here is the modified run_texture_scene method with added comments for clarity:

def run_texture_scene(self, image_object_path, image_texture_path, image_scene_path):
    # Process the input images
    image_object = self.process_image(image_object_path)
    image_texture = self.process_image(image_texture_path)
    image_scene = self.process_image(image_scene_path)

    if image_object is None:
        raise gr.Error('Object image is required')

    current_emb = None

    # If both object and scene images are provided, run scene processing
    if image_scene is not None:
        current_emb = self.run_binary(input_a=image_object, input_b=image_scene, prior_type='scene')
        scene_input = current_emb.image_embeds
    else:
        scene_input = image_object

    # If a texture image is provided, apply texturing
    if image_texture is not None:
        current_emb = self.run_binary(input_a=scene_input, input_b=image_texture, prior_type='texturing')

    if current_emb is None:
        raise gr.Error('At least one of the images is required')

    # Render the final image
    image = self.render(current_emb)

    return image


With this updated implementation of run_texture_scene, the method now correctly handles the following scenarios (a short usage sketch of both cases follows the list):

- Only the object and scene images are provided: the object is correctly placed within the scene, without texturing.
- A texture image is also provided: the texture is applied to the combined object and scene.
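
Below is a minimal usage sketch of those two cases. It is illustrative only and makes a few assumptions not stated in the commit: that pops.py is importable as pops, that the placeholder image paths exist locally, and that a CUDA device is available (the pipelines move their sub-models to 'cuda' internally).

```python
from pops import PopsPipelines  # assumption: pops.py is on the import path

pipelines = PopsPipelines()  # loads the learned priors and the Kandinsky decoder

# Case 1: object + scene, no texture.
# The scene prior runs and its embedding is rendered directly,
# so the object is placed in the scene without texturing.
placed = pipelines.run_texture_scene(
    image_object_path='object.png',  # placeholder path
    image_texture_path=None,         # no texture provided
    image_scene_path='scene.png',    # placeholder path
)
placed.save('object_in_scene.png')

# Case 2: object + scene + texture.
# The scene embedding is fed into the texturing prior before rendering.
textured = pipelines.run_texture_scene(
    image_object_path='object.png',
    image_texture_path='texture.png',  # placeholder path
    image_scene_path='scene.png',
)
textured.save('object_in_scene_textured.png')
```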

Previous pops.py version:

![1.png](https://cdn-uploads.huggingface.co/production/uploads/64e1a81518af51be8e26c751/iF4xDkFmEmenAXasfhCV2.png)


Updated pops.py version:

![1.png](https://cdn-uploads.huggingface.co/production/uploads/64e1a81518af51be8e26c751/VIjt958h11zC8H1kZkK8Q.png)
![10.png](https://cdn-uploads.huggingface.co/production/uploads/64e1a81518af51be8e26c751/4wyCqU9SbWKJ5qsiT2z1U.png)

Files changed (1)
  1. pops.py +230 -231
pops.py CHANGED
@@ -1,231 +1,230 @@
-import gradio as gr
-import torch
-from PIL import Image
-from diffusers import PriorTransformer, UNet2DConditionModel, KandinskyV22Pipeline
-from huggingface_hub import hf_hub_download
-from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor, CLIPTokenizer, CLIPTextModelWithProjection
-
-from model import pops_utils
-from model.pipeline_pops import pOpsPipeline
-
-kandinsky_prior_repo: str = 'kandinsky-community/kandinsky-2-2-prior'
-kandinsky_decoder_repo: str = 'kandinsky-community/kandinsky-2-2-decoder'
-prior_texture_repo: str = 'models/texturing/learned_prior.pth'
-prior_instruct_repo: str = 'models/instruct/learned_prior.pth'
-prior_scene_repo: str = 'models/scene/learned_prior.pth'
-prior_repo = "pOpsPaper/operators"
-
-# gpu = torch.device('cuda')
-# cpu = torch.device('cpu')
-
-class PopsPipelines:
-    def __init__(self):
-        weight_dtype = torch.float16
-        self.weight_dtype = weight_dtype
-        device = 'cpu' #torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.device = 'cuda' #device
-        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(kandinsky_prior_repo,
-            subfolder='image_encoder',
-            torch_dtype=weight_dtype).eval()
-        self.image_encoder.requires_grad_(False)
-
-        self.image_processor = CLIPImageProcessor.from_pretrained(kandinsky_prior_repo,
-            subfolder='image_processor')
-
-        self.tokenizer = CLIPTokenizer.from_pretrained(kandinsky_prior_repo, subfolder='tokenizer')
-        self.text_encoder = CLIPTextModelWithProjection.from_pretrained(kandinsky_prior_repo,
-            subfolder='text_encoder',
-            torch_dtype=weight_dtype).eval().to(device)
-
-        # Load full model for vis
-        self.unet = UNet2DConditionModel.from_pretrained(kandinsky_decoder_repo,
-            subfolder='unet').to(torch.float16).to(device)
-
-
-        self.decoder = KandinskyV22Pipeline.from_pretrained(kandinsky_decoder_repo, unet=self.unet,
-            torch_dtype=torch.float16)
-        self.decoder = self.decoder.to(device)
-
-
-        self.priors_dict = {
-            'texturing':{'repo':prior_texture_repo},
-            'instruct': {'repo': prior_instruct_repo},
-            'scene': {'repo':prior_scene_repo}
-        }
-
-        for prior_type in self.priors_dict:
-            prior_path = self.priors_dict[prior_type]['repo']
-            prior = PriorTransformer.from_pretrained(
-                kandinsky_prior_repo, subfolder="prior"
-            )
-
-            # Load from huggingface
-            prior_path = hf_hub_download(repo_id=prior_repo, filename=str(prior_path))
-            prior_state_dict = torch.load(prior_path, map_location=device)
-            prior.load_state_dict(prior_state_dict, strict=False)
-
-            prior.eval()
-            prior = prior.to(weight_dtype)
-
-            prior_pipeline = pOpsPipeline.from_pretrained(kandinsky_prior_repo,
-                prior=prior,
-                image_encoder=self.image_encoder,
-                torch_dtype=torch.float16)
-
-            self.priors_dict[prior_type]['pipeline'] = prior_pipeline
-
-    def process_image(self, input_path):
-        if input_path is None:
-            return None
-        image_pil = Image.open(input_path).convert("RGB").resize((512, 512))
-        image = torch.Tensor(self.image_processor(image_pil)['pixel_values'][0]).to(self.device).unsqueeze(0).to(
-            self.weight_dtype)
-
-        return image
-
-    def process_text(self, text):
-        self.text_encoder.to('cuda')
-        text_inputs = self.tokenizer(
-            text,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            truncation=True,
-            return_tensors="pt",
-        )
-        mask = text_inputs.attention_mask.bool() # [0]
-
-        text_encoder_output = self.text_encoder(text_inputs.input_ids.to(self.device))
-        text_encoder_hidden_states = text_encoder_output.last_hidden_state
-        text_encoder_concat = text_encoder_hidden_states[:, :mask.sum().item()]
-        self.text_encoder.to('cpu')
-        return text_encoder_concat
-
-    def run_binary(self, input_a, input_b, prior_type):
-        # Move pipeline to GPU
-        pipeline = self.priors_dict[prior_type]['pipeline']
-        pipeline.to('cuda')
-        self.image_encoder.to('cuda')
-        input_image_embeds, input_hidden_state = pops_utils.preprocess(input_a, input_b,
-            self.image_encoder,
-            pipeline.prior.clip_mean.detach(),
-            pipeline.prior.clip_std.detach())
-
-        negative_input_embeds = torch.zeros_like(input_image_embeds)
-        negative_hidden_states = torch.zeros_like(input_hidden_state)
-
-        guidance_scale = 1.0
-        if prior_type == 'texturing':
-            guidance_scale = 8.0
-
-        img_emb = pipeline(input_embeds=input_image_embeds, input_hidden_states=input_hidden_state,
-            negative_input_embeds=negative_input_embeds,
-            negative_input_hidden_states=negative_hidden_states,
-            num_inference_steps=25,
-            num_images_per_prompt=1,
-            guidance_scale=guidance_scale)
-
-        # Optional
-        if prior_type == 'scene':
-            # Scene is the closet to what avg represents for a background image so incorporate that as well
-            mean_emb = 0.5 * input_hidden_state[:, 0] + 0.5 * input_hidden_state[:, 1]
-            mean_emb = (mean_emb * pipeline.prior.clip_std) + pipeline.prior.clip_mean
-            alpha = 0.4
-            img_emb.image_embeds = (1 - alpha) * img_emb.image_embeds + alpha * mean_emb
-
-        # Move pipeline to CPU
-        pipeline.to('cpu')
-        self.image_encoder.to('cpu')
-        return img_emb
-
-    def run_instruct(self, input_a, text):
-
-        text_encodings = self.process_text(text)
-
-        # Move pipeline to GPU
-        instruct_pipeline = self.priors_dict['instruct']['pipeline']
-        instruct_pipeline.to('cuda')
-        self.image_encoder.to('cuda')
-        input_image_embeds, input_hidden_state = pops_utils.preprocess(input_a, None,
-            self.image_encoder,
-            instruct_pipeline.prior.clip_mean.detach(), instruct_pipeline.prior.clip_std.detach(),
-            concat_hidden_states=text_encodings)
-
-        negative_input_embeds = torch.zeros_like(input_image_embeds)
-        negative_hidden_states = torch.zeros_like(input_hidden_state)
-        img_emb = instruct_pipeline(input_embeds=input_image_embeds, input_hidden_states=input_hidden_state,
-            negative_input_embeds=negative_input_embeds,
-            negative_input_hidden_states=negative_hidden_states,
-            num_inference_steps=25,
-            num_images_per_prompt=1,
-            guidance_scale=1.0)
-
-        # Move pipeline to CPU
-        instruct_pipeline.to('cpu')
-        self.image_encoder.to('cpu')
-        return img_emb
-
-    def render(self, img_emb):
-        self.decoder.to('cuda')
-        images = self.decoder(image_embeds=img_emb.image_embeds, negative_image_embeds=img_emb.negative_image_embeds,
-            num_inference_steps=50, height=512,
-            width=512, guidance_scale=4).images
-        self.decoder.to('cpu')
-        return images[0]
-
-    def run_instruct_texture(self, image_object_path, text_instruct, image_texture_path):
-        # Process both inputs
-        image_object = self.process_image(image_object_path)
-        image_texture = self.process_image(image_texture_path)
-
-        if image_object is None:
-            raise gr.Error('Object image is required')
-
-        current_emb = None
-
-        if image_texture is None:
-            instruct_input = image_object
-        else:
-            # Run texturing
-            current_emb = self.run_binary(input_a=image_object, input_b=image_texture,prior_type='texturing')
-            instruct_input = current_emb.image_embeds
-
-        if text_instruct != '':
-            current_emb = self.run_instruct(input_a=instruct_input, text=text_instruct)
-
-        if current_emb is None:
-            raise gr.Error('At least one of the inputs is required')
-
-        # Render as image
-        image = self.render(current_emb)
-
-        return image
-
-    def run_texture_scene(self, image_object_path, image_texture_path, image_scene_path):
-        # Process both inputs
-        image_object = self.process_image(image_object_path)
-        image_texture = self.process_image(image_texture_path)
-        image_scene = self.process_image(image_scene_path)
-
-        if image_object is None:
-            raise gr.Error('Object image is required')
-
-        current_emb = None
-
-        if image_texture is None:
-            scene_input = image_object
-        else:
-            # Run texturing
-            current_emb = self.run_binary(input_a=image_object, input_b=image_scene,prior_type='scene')
-            scene_input = current_emb.image_embeds
-
-        # Run scene
-        if image_scene is not None:
-            current_emb = self.run_binary(input_a=scene_input, input_b=image_texture,prior_type='texturing')
-
-        if current_emb is None:
-            raise gr.Error('At least one of the images is required')
-        # Render as image
-        image = self.render(current_emb)
-
-        return image
-
+import gradio as gr
+import torch
+from PIL import Image
+from diffusers import PriorTransformer, UNet2DConditionModel, KandinskyV22Pipeline
+from huggingface_hub import hf_hub_download
+from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor, CLIPTokenizer, CLIPTextModelWithProjection
+
+from model import pops_utils
+from model.pipeline_pops import pOpsPipeline
+
+kandinsky_prior_repo: str = 'kandinsky-community/kandinsky-2-2-prior'
+kandinsky_decoder_repo: str = 'kandinsky-community/kandinsky-2-2-decoder'
+prior_texture_repo: str = 'models/texturing/learned_prior.pth'
+prior_instruct_repo: str = 'models/instruct/learned_prior.pth'
+prior_scene_repo: str = 'models/scene/learned_prior.pth'
+prior_repo = "pOpsPaper/operators"
+
+# gpu = torch.device('cuda')
+# cpu = torch.device('cpu')
+
+class PopsPipelines:
+    def __init__(self):
+        weight_dtype = torch.float16
+        self.weight_dtype = weight_dtype
+        device = 'cpu' #torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.device = 'cuda' #device
+        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(kandinsky_prior_repo,
+            subfolder='image_encoder',
+            torch_dtype=weight_dtype).eval()
+        self.image_encoder.requires_grad_(False)
+
+        self.image_processor = CLIPImageProcessor.from_pretrained(kandinsky_prior_repo,
+            subfolder='image_processor')
+
+        self.tokenizer = CLIPTokenizer.from_pretrained(kandinsky_prior_repo, subfolder='tokenizer')
+        self.text_encoder = CLIPTextModelWithProjection.from_pretrained(kandinsky_prior_repo,
+            subfolder='text_encoder',
+            torch_dtype=weight_dtype).eval().to(device)
+
+        # Load full model for vis
+        self.unet = UNet2DConditionModel.from_pretrained(kandinsky_decoder_repo,
+            subfolder='unet').to(torch.float16).to(device)
+
+
+        self.decoder = KandinskyV22Pipeline.from_pretrained(kandinsky_decoder_repo, unet=self.unet,
+            torch_dtype=torch.float16)
+        self.decoder = self.decoder.to(device)
+
+
+        self.priors_dict = {
+            'texturing':{'repo':prior_texture_repo},
+            'instruct': {'repo': prior_instruct_repo},
+            'scene': {'repo':prior_scene_repo}
+        }
+
+        for prior_type in self.priors_dict:
+            prior_path = self.priors_dict[prior_type]['repo']
+            prior = PriorTransformer.from_pretrained(
+                kandinsky_prior_repo, subfolder="prior"
+            )
+
+            # Load from huggingface
+            prior_path = hf_hub_download(repo_id=prior_repo, filename=str(prior_path))
+            prior_state_dict = torch.load(prior_path, map_location=device)
+            prior.load_state_dict(prior_state_dict, strict=False)
+
+            prior.eval()
+            prior = prior.to(weight_dtype)
+
+            prior_pipeline = pOpsPipeline.from_pretrained(kandinsky_prior_repo,
+                prior=prior,
+                image_encoder=self.image_encoder,
+                torch_dtype=torch.float16)
+
+            self.priors_dict[prior_type]['pipeline'] = prior_pipeline
+
+    def process_image(self, input_path):
+        if input_path is None:
+            return None
+        image_pil = Image.open(input_path).convert("RGB").resize((512, 512))
+        image = torch.Tensor(self.image_processor(image_pil)['pixel_values'][0]).to(self.device).unsqueeze(0).to(
+            self.weight_dtype)
+
+        return image
+
+    def process_text(self, text):
+        self.text_encoder.to('cuda')
+        text_inputs = self.tokenizer(
+            text,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        mask = text_inputs.attention_mask.bool() # [0]
+
+        text_encoder_output = self.text_encoder(text_inputs.input_ids.to(self.device))
+        text_encoder_hidden_states = text_encoder_output.last_hidden_state
+        text_encoder_concat = text_encoder_hidden_states[:, :mask.sum().item()]
+        self.text_encoder.to('cpu')
+        return text_encoder_concat
+
+    def run_binary(self, input_a, input_b, prior_type):
+        # Move pipeline to GPU
+        pipeline = self.priors_dict[prior_type]['pipeline']
+        pipeline.to('cuda')
+        self.image_encoder.to('cuda')
+        input_image_embeds, input_hidden_state = pops_utils.preprocess(input_a, input_b,
+            self.image_encoder,
+            pipeline.prior.clip_mean.detach(),
+            pipeline.prior.clip_std.detach())
+
+        negative_input_embeds = torch.zeros_like(input_image_embeds)
+        negative_hidden_states = torch.zeros_like(input_hidden_state)
+
+        guidance_scale = 1.0
+        if prior_type == 'texturing':
+            guidance_scale = 8.0
+
+        img_emb = pipeline(input_embeds=input_image_embeds, input_hidden_states=input_hidden_state,
+            negative_input_embeds=negative_input_embeds,
+            negative_input_hidden_states=negative_hidden_states,
+            num_inference_steps=25,
+            num_images_per_prompt=1,
+            guidance_scale=guidance_scale)
+
+        # Optional
+        if prior_type == 'scene':
+            # Scene is the closet to what avg represents for a background image so incorporate that as well
+            mean_emb = 0.5 * input_hidden_state[:, 0] + 0.5 * input_hidden_state[:, 1]
+            mean_emb = (mean_emb * pipeline.prior.clip_std) + pipeline.prior.clip_mean
+            alpha = 0.4
+            img_emb.image_embeds = (1 - alpha) * img_emb.image_embeds + alpha * mean_emb
+
+        # Move pipeline to CPU
+        pipeline.to('cpu')
+        self.image_encoder.to('cpu')
+        return img_emb
+
+    def run_instruct(self, input_a, text):
+
+        text_encodings = self.process_text(text)
+
+        # Move pipeline to GPU
+        instruct_pipeline = self.priors_dict['instruct']['pipeline']
+        instruct_pipeline.to('cuda')
+        self.image_encoder.to('cuda')
+        input_image_embeds, input_hidden_state = pops_utils.preprocess(input_a, None,
+            self.image_encoder,
+            instruct_pipeline.prior.clip_mean.detach(), instruct_pipeline.prior.clip_std.detach(),
+            concat_hidden_states=text_encodings)
+
+        negative_input_embeds = torch.zeros_like(input_image_embeds)
+        negative_hidden_states = torch.zeros_like(input_hidden_state)
+        img_emb = instruct_pipeline(input_embeds=input_image_embeds, input_hidden_states=input_hidden_state,
+            negative_input_embeds=negative_input_embeds,
+            negative_input_hidden_states=negative_hidden_states,
+            num_inference_steps=25,
+            num_images_per_prompt=1,
+            guidance_scale=1.0)
+
+        # Move pipeline to CPU
+        instruct_pipeline.to('cpu')
+        self.image_encoder.to('cpu')
+        return img_emb
+
+    def render(self, img_emb):
+        self.decoder.to('cuda')
+        images = self.decoder(image_embeds=img_emb.image_embeds, negative_image_embeds=img_emb.negative_image_embeds,
+            num_inference_steps=50, height=512,
+            width=512, guidance_scale=4).images
+        self.decoder.to('cpu')
+        return images[0]
+
+    def run_instruct_texture(self, image_object_path, text_instruct, image_texture_path):
+        # Process both inputs
+        image_object = self.process_image(image_object_path)
+        image_texture = self.process_image(image_texture_path)
+
+        if image_object is None:
+            raise gr.Error('Object image is required')
+
+        current_emb = None
+
+        if image_texture is None:
+            instruct_input = image_object
+        else:
+            # Run texturing
+            current_emb = self.run_binary(input_a=image_object, input_b=image_texture,prior_type='texturing')
+            instruct_input = current_emb.image_embeds
+
+        if text_instruct != '':
+            current_emb = self.run_instruct(input_a=instruct_input, text=text_instruct)
+
+        if current_emb is None:
+            raise gr.Error('At least one of the inputs is required')
+
+        # Render as image
+        image = self.render(current_emb)
+
+        return image
+
+    def run_texture_scene(self, image_object_path, image_texture_path, image_scene_path):
+        image_object = self.process_image(image_object_path)
+        image_texture = self.process_image(image_texture_path)
+        image_scene = self.process_image(image_scene_path)
+
+        if image_object is None:
+            raise gr.Error('Object image is required')
+
+        current_emb = None
+
+        # If both object and scene images are provided, run scene processing
+        if image_scene is not None:
+            current_emb = self.run_binary(input_a=image_object, input_b=image_scene, prior_type='scene')
+            scene_input = current_emb.image_embeds
+        else:
+            scene_input = image_object
+
+        # If a texture image is provided, apply texturing
+        if image_texture is not None:
+            current_emb = self.run_binary(input_a=scene_input, input_b=image_texture, prior_type='texturing')
+
+        if current_emb is None:
+            raise gr.Error('At least one of the images is required')
+
+        # Render the final image
+        image = self.render(current_emb)
+
+        return image