ashawkey committed
Commit 71d1ff6
1 Parent(s): 915dfc8

Upload folder using huggingface_hub

clip_camera_projection/config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "_class_name": "CLIPCameraProjection",
3
+ "_diffusers_version": "0.24.0",
4
+ "additional_embeddings": 4,
5
+ "embedding_dim": 768
6
+ }
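
For orientation only (not part of the uploaded files): the config above describes a single linear projection whose input is a 768-dim CLIP image embedding concatenated with 4 camera dimensions, as implemented in `clip_camera_projection/zero123.py` below. A minimal sketch with hypothetical camera values:

```python
import math
import torch

embedding_dim = 768        # "embedding_dim" in the config above
additional_embeddings = 4  # "additional_embeddings" in the config above

# The projection layer defined by CLIPCameraProjection: Linear(768 + 4 -> 768).
proj = torch.nn.Linear(embedding_dim + additional_embeddings, embedding_dim)

# The 4 extra dimensions are the camera conditioning used by the pipeline:
# (elevation in radians, sin(azimuth), cos(azimuth), distance).
elevation, azimuth, distance = 30.0, 45.0, 1.2  # hypothetical values (degrees, degrees, scene units)
camera = torch.tensor([
    math.radians(elevation),
    math.sin(math.radians(azimuth)),
    math.cos(math.radians(azimuth)),
    distance,
])

clip_embed = torch.randn(1, 1, embedding_dim)  # stand-in for a CLIP image embedding
cond = torch.cat([clip_embed, camera.view(1, 1, -1)], dim=-1)
print(proj(cond).shape)  # torch.Size([1, 1, 768])
```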
clip_camera_projection/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cb292c6da02a0d8866c4378cd6053e097c7a6bdf0886cc4ac1fba2bda3af6f0
3
+ size 1187512
clip_camera_projection/zero123.py ADDED
@@ -0,0 +1,666 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ import math
17
+ import warnings
18
+ from typing import Any, Callable, Dict, List, Optional, Union
19
+
20
+ import PIL
21
+ import torch
22
+ import torchvision.transforms.functional as TF
23
+ from diffusers.configuration_utils import ConfigMixin, FrozenDict, register_to_config
24
+ from diffusers.image_processor import VaeImageProcessor
25
+ from diffusers.models import AutoencoderKL, UNet2DConditionModel
26
+ from diffusers.models.modeling_utils import ModelMixin
27
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
28
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
29
+ from diffusers.pipelines.stable_diffusion.safety_checker import (
30
+ StableDiffusionSafetyChecker,
31
+ )
32
+ from diffusers.schedulers import KarrasDiffusionSchedulers
33
+ from diffusers.utils import deprecate, is_accelerate_available, logging
34
+ from diffusers.utils.torch_utils import randn_tensor
35
+ from packaging import version
36
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
37
+
38
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
39
+
40
+
41
+ class CLIPCameraProjection(ModelMixin, ConfigMixin):
42
+ """
43
+ A Projection layer for CLIP embedding and camera embedding.
44
+
45
+ Parameters:
46
+ embedding_dim (`int`, *optional*, defaults to 768): The dimension of the model input `clip_embed`
47
+ additional_embeddings (`int`, *optional*, defaults to 4): The number of additional (camera) dimensions
48
+ appended to the input `clip_embed` before projection. The projection maps `embedding_dim +
49
+ additional_embeddings` inputs back to `embedding_dim` outputs.
50
+ """
51
+
52
+ @register_to_config
53
+ def __init__(self, embedding_dim: int = 768, additional_embeddings: int = 4):
54
+ super().__init__()
55
+ self.embedding_dim = embedding_dim
56
+ self.additional_embeddings = additional_embeddings
57
+
58
+ self.input_dim = self.embedding_dim + self.additional_embeddings
59
+ self.output_dim = self.embedding_dim
60
+
61
+ self.proj = torch.nn.Linear(self.input_dim, self.output_dim)
62
+
63
+ def forward(
64
+ self,
65
+ embedding: torch.FloatTensor,
66
+ ):
67
+ """
68
+ The [`CLIPCameraProjection`] forward method.
69
+
70
+ Args:
71
+ embedding (`torch.FloatTensor` of shape `(batch_size, input_dim)`):
72
+ The input embeddings to be projected.
73
+
74
+ Returns:
75
+ The output embedding projection (`torch.FloatTensor` of shape `(batch_size, output_dim)`).
76
+ """
77
+ proj_embedding = self.proj(embedding)
78
+ return proj_embedding
79
+
80
+
81
+ class Zero123Pipeline(DiffusionPipeline):
82
+ r"""
83
+ Pipeline to generate novel views of an input image from a relative camera pose (Zero-1-to-3), built on Stable Diffusion.
84
+
85
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
86
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
87
+
88
+ Args:
89
+ vae ([`AutoencoderKL`]):
90
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
91
+ image_encoder ([`CLIPVisionModelWithProjection`]):
92
+ Frozen CLIP image-encoder. Stable Diffusion Image Variation uses the vision portion of
93
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPVisionModelWithProjection),
94
+ specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
95
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
96
+ scheduler ([`SchedulerMixin`]):
97
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
98
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
99
+ safety_checker ([`StableDiffusionSafetyChecker`]):
100
+ Classification module that estimates whether generated images could be considered offensive or harmful.
101
+ Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
102
+ feature_extractor ([`CLIPImageProcessor`]):
103
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
104
+ """
105
+ # TODO: feature_extractor is required to encode images (if they are in PIL format),
106
+ # we should give a descriptive message if the pipeline doesn't have one.
107
+ _optional_components = ["safety_checker"]
108
+
109
+ def __init__(
110
+ self,
111
+ vae: AutoencoderKL,
112
+ image_encoder: CLIPVisionModelWithProjection,
113
+ unet: UNet2DConditionModel,
114
+ scheduler: KarrasDiffusionSchedulers,
115
+ safety_checker: StableDiffusionSafetyChecker,
116
+ feature_extractor: CLIPImageProcessor,
117
+ clip_camera_projection: CLIPCameraProjection,
118
+ requires_safety_checker: bool = True,
119
+ ):
120
+ super().__init__()
121
+
122
+ if safety_checker is None and requires_safety_checker:
123
+ logger.warn(
124
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
125
+ " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
126
+ " results in services or applications open to the public. Both the diffusers team and Hugging Face"
127
+ " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
128
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
129
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
130
+ )
131
+
132
+ if safety_checker is not None and feature_extractor is None:
133
+ raise ValueError(
134
+ "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
135
+ " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
136
+ )
137
+
138
+ is_unet_version_less_0_9_0 = hasattr(
139
+ unet.config, "_diffusers_version"
140
+ ) and version.parse(
141
+ version.parse(unet.config._diffusers_version).base_version
142
+ ) < version.parse(
143
+ "0.9.0.dev0"
144
+ )
145
+ is_unet_sample_size_less_64 = (
146
+ hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
147
+ )
148
+ if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
149
+ deprecation_message = (
150
+ "The configuration file of the unet has set the default `sample_size` to smaller than"
151
+ " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
152
+ " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
153
+ " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
154
+ " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
155
+ " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
156
+ " in the config might lead to incorrect results in future versions. If you have downloaded this"
157
+ " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
158
+ " the `unet/config.json` file"
159
+ )
160
+ deprecate(
161
+ "sample_size<64", "1.0.0", deprecation_message, standard_warn=False
162
+ )
163
+ new_config = dict(unet.config)
164
+ new_config["sample_size"] = 64
165
+ unet._internal_dict = FrozenDict(new_config)
166
+
167
+ self.register_modules(
168
+ vae=vae,
169
+ image_encoder=image_encoder,
170
+ unet=unet,
171
+ scheduler=scheduler,
172
+ safety_checker=safety_checker,
173
+ feature_extractor=feature_extractor,
174
+ clip_camera_projection=clip_camera_projection,
175
+ )
176
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
177
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
178
+ self.register_to_config(requires_safety_checker=requires_safety_checker)
179
+
180
+ def enable_sequential_cpu_offload(self, gpu_id=0):
181
+ r"""
182
+ Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
183
+ image_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
184
+ `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
185
+ """
186
+ if is_accelerate_available():
187
+ from accelerate import cpu_offload
188
+ else:
189
+ raise ImportError("Please install accelerate via `pip install accelerate`")
190
+
191
+ device = torch.device(f"cuda:{gpu_id}")
192
+
193
+ for cpu_offloaded_model in [
194
+ self.unet,
195
+ self.image_encoder,
196
+ self.vae,
197
+ self.safety_checker,
198
+ ]:
199
+ if cpu_offloaded_model is not None:
200
+ cpu_offload(cpu_offloaded_model, device)
201
+
202
+ @property
203
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
204
+ def _execution_device(self):
205
+ r"""
206
+ Returns the device on which the pipeline's models will be executed. After calling
207
+ `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
208
+ hooks.
209
+ """
210
+ if not hasattr(self.unet, "_hf_hook"):
211
+ return self.device
212
+ for module in self.unet.modules():
213
+ if (
214
+ hasattr(module, "_hf_hook")
215
+ and hasattr(module._hf_hook, "execution_device")
216
+ and module._hf_hook.execution_device is not None
217
+ ):
218
+ return torch.device(module._hf_hook.execution_device)
219
+ return self.device
220
+
221
+ def _encode_image(
222
+ self,
223
+ image,
224
+ elevation,
225
+ azimuth,
226
+ distance,
227
+ device,
228
+ num_images_per_prompt,
229
+ do_classifier_free_guidance,
230
+ clip_image_embeddings=None,
231
+ image_camera_embeddings=None,
232
+ ):
233
+ dtype = next(self.image_encoder.parameters()).dtype
234
+
235
+ if image_camera_embeddings is None:
236
+ if image is None:
237
+ assert clip_image_embeddings is not None
238
+ image_embeddings = clip_image_embeddings.to(device=device, dtype=dtype)
239
+ else:
240
+ if not isinstance(image, torch.Tensor):
241
+ image = self.feature_extractor(
242
+ images=image, return_tensors="pt"
243
+ ).pixel_values
244
+
245
+ image = image.to(device=device, dtype=dtype)
246
+ image_embeddings = self.image_encoder(image).image_embeds
247
+ image_embeddings = image_embeddings.unsqueeze(1)
248
+
249
+ bs_embed, seq_len, _ = image_embeddings.shape
250
+
251
+ if isinstance(elevation, float):
252
+ elevation = torch.as_tensor(
253
+ [elevation] * bs_embed, dtype=dtype, device=device
254
+ )
255
+ if isinstance(azimuth, float):
256
+ azimuth = torch.as_tensor(
257
+ [azimuth] * bs_embed, dtype=dtype, device=device
258
+ )
259
+ if isinstance(distance, float):
260
+ distance = torch.as_tensor(
261
+ [distance] * bs_embed, dtype=dtype, device=device
262
+ )
263
+
264
+ camera_embeddings = torch.stack(
265
+ [
266
+ torch.deg2rad(elevation),
267
+ torch.sin(torch.deg2rad(azimuth)),
268
+ torch.cos(torch.deg2rad(azimuth)),
269
+ distance,
270
+ ],
271
+ dim=-1,
272
+ )[:, None, :]
273
+
274
+ image_embeddings = torch.cat([image_embeddings, camera_embeddings], dim=-1)
275
+
276
+ # project (image, camera) embeddings to the same dimension as clip embeddings
277
+ image_embeddings = self.clip_camera_projection(image_embeddings)
278
+ else:
279
+ image_embeddings = image_camera_embeddings.to(device=device, dtype=dtype)
280
+ bs_embed, seq_len, _ = image_embeddings.shape
281
+
282
+ # duplicate image embeddings for each generation per prompt, using mps friendly method
283
+ image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1)
284
+ image_embeddings = image_embeddings.view(
285
+ bs_embed * num_images_per_prompt, seq_len, -1
286
+ )
287
+
288
+ if do_classifier_free_guidance:
289
+ negative_prompt_embeds = torch.zeros_like(image_embeddings)
290
+
291
+ # For classifier free guidance, we need to do two forward passes.
292
+ # Here we concatenate the unconditional and text embeddings into a single batch
293
+ # to avoid doing two forward passes
294
+ image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings])
295
+
296
+ return image_embeddings
297
+
298
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
299
+ def run_safety_checker(self, image, device, dtype):
300
+ if self.safety_checker is None:
301
+ has_nsfw_concept = None
302
+ else:
303
+ if torch.is_tensor(image):
304
+ feature_extractor_input = self.image_processor.postprocess(
305
+ image, output_type="pil"
306
+ )
307
+ else:
308
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
309
+ safety_checker_input = self.feature_extractor(
310
+ feature_extractor_input, return_tensors="pt"
311
+ ).to(device)
312
+ image, has_nsfw_concept = self.safety_checker(
313
+ images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
314
+ )
315
+ return image, has_nsfw_concept
316
+
317
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
318
+ def decode_latents(self, latents):
319
+ warnings.warn(
320
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
321
+ " use VaeImageProcessor instead",
322
+ FutureWarning,
323
+ )
324
+ latents = 1 / self.vae.config.scaling_factor * latents
325
+ image = self.vae.decode(latents, return_dict=False)[0]
326
+ image = (image / 2 + 0.5).clamp(0, 1)
327
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
328
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
329
+ return image
330
+
331
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
332
+ def prepare_extra_step_kwargs(self, generator, eta):
333
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
334
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
335
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
336
+ # and should be between [0, 1]
337
+
338
+ accepts_eta = "eta" in set(
339
+ inspect.signature(self.scheduler.step).parameters.keys()
340
+ )
341
+ extra_step_kwargs = {}
342
+ if accepts_eta:
343
+ extra_step_kwargs["eta"] = eta
344
+
345
+ # check if the scheduler accepts generator
346
+ accepts_generator = "generator" in set(
347
+ inspect.signature(self.scheduler.step).parameters.keys()
348
+ )
349
+ if accepts_generator:
350
+ extra_step_kwargs["generator"] = generator
351
+ return extra_step_kwargs
352
+
353
+ def check_inputs(self, image, height, width, callback_steps):
354
+ # TODO: check image size or adjust image size to (height, width)
355
+
356
+ if height % 8 != 0 or width % 8 != 0:
357
+ raise ValueError(
358
+ f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
359
+ )
360
+
361
+ if (callback_steps is None) or (
362
+ callback_steps is not None
363
+ and (not isinstance(callback_steps, int) or callback_steps <= 0)
364
+ ):
365
+ raise ValueError(
366
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
367
+ f" {type(callback_steps)}."
368
+ )
369
+
370
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
371
+ def prepare_latents(
372
+ self,
373
+ batch_size,
374
+ num_channels_latents,
375
+ height,
376
+ width,
377
+ dtype,
378
+ device,
379
+ generator,
380
+ latents=None,
381
+ ):
382
+ shape = (
383
+ batch_size,
384
+ num_channels_latents,
385
+ height // self.vae_scale_factor,
386
+ width // self.vae_scale_factor,
387
+ )
388
+ if isinstance(generator, list) and len(generator) != batch_size:
389
+ raise ValueError(
390
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
391
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
392
+ )
393
+
394
+ if latents is None:
395
+ latents = randn_tensor(
396
+ shape, generator=generator, device=device, dtype=dtype
397
+ )
398
+ else:
399
+ latents = latents.to(device)
400
+
401
+ # scale the initial noise by the standard deviation required by the scheduler
402
+ latents = latents * self.scheduler.init_noise_sigma
403
+ return latents
404
+
405
+ def _get_latent_model_input(
406
+ self,
407
+ latents: torch.FloatTensor,
408
+ image: Optional[
409
+ Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor]
410
+ ],
411
+ num_images_per_prompt: int,
412
+ do_classifier_free_guidance: bool,
413
+ image_latents: Optional[torch.FloatTensor] = None,
414
+ ):
415
+ if isinstance(image, PIL.Image.Image):
416
+ image_pt = TF.to_tensor(image).unsqueeze(0).to(latents)
417
+ elif isinstance(image, list):
418
+ image_pt = torch.stack([TF.to_tensor(img) for img in image], dim=0).to(
419
+ latents
420
+ )
421
+ elif isinstance(image, torch.Tensor):
422
+ image_pt = image
423
+ else:
424
+ image_pt = None
425
+
426
+ if image_pt is None:
427
+ assert image_latents is not None
428
+ image_pt = image_latents.repeat_interleave(num_images_per_prompt, dim=0)
429
+ else:
430
+ image_pt = image_pt * 2.0 - 1.0 # scale to [-1, 1]
431
+ # FIXME: encoded latents should be multiplied with self.vae.config.scaling_factor
432
+ # but zero123 was not trained this way
433
+ image_pt = self.vae.encode(image_pt).latent_dist.mode()
434
+ image_pt = image_pt.repeat_interleave(num_images_per_prompt, dim=0)
435
+ if do_classifier_free_guidance:
436
+ latent_model_input = torch.cat(
437
+ [
438
+ torch.cat([latents, latents], dim=0),
439
+ torch.cat([torch.zeros_like(image_pt), image_pt], dim=0),
440
+ ],
441
+ dim=1,
442
+ )
443
+ else:
444
+ latent_model_input = torch.cat([latents, image_pt], dim=1)
445
+
446
+ return latent_model_input
447
+
448
+ @torch.no_grad()
449
+ def __call__(
450
+ self,
451
+ image: Optional[
452
+ Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor]
453
+ ] = None,
454
+ elevation: Optional[Union[float, torch.FloatTensor]] = None,
455
+ azimuth: Optional[Union[float, torch.FloatTensor]] = None,
456
+ distance: Optional[Union[float, torch.FloatTensor]] = None,
457
+ height: Optional[int] = None,
458
+ width: Optional[int] = None,
459
+ num_inference_steps: int = 50,
460
+ guidance_scale: float = 3.0,
461
+ num_images_per_prompt: int = 1,
462
+ eta: float = 0.0,
463
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
464
+ latents: Optional[torch.FloatTensor] = None,
465
+ clip_image_embeddings: Optional[torch.FloatTensor] = None,
466
+ image_camera_embeddings: Optional[torch.FloatTensor] = None,
467
+ image_latents: Optional[torch.FloatTensor] = None,
468
+ output_type: Optional[str] = "pil",
469
+ return_dict: bool = True,
470
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
471
+ callback_steps: int = 1,
472
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
473
+ ):
474
+ r"""
475
+ Function invoked when calling the pipeline for generation.
476
+
477
+ Args:
478
+ image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
479
+ The image or images to guide the image generation. If you provide a tensor, it needs to comply with the
480
+ configuration of
481
+ [this](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json)
482
+ `CLIPImageProcessor`
483
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
484
+ The height in pixels of the generated image.
485
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
486
+ The width in pixels of the generated image.
487
+ num_inference_steps (`int`, *optional*, defaults to 50):
488
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
489
+ expense of slower inference.
490
+ guidance_scale (`float`, *optional*, defaults to 3.0):
491
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
492
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
493
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
494
+ 1`. Higher guidance scale encourages generating images that are more closely linked to the conditioning image,
495
+ usually at the expense of lower image quality.
496
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
497
+ The number of images to generate per prompt.
498
+ eta (`float`, *optional*, defaults to 0.0):
499
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
500
+ [`schedulers.DDIMScheduler`], will be ignored for others.
501
+ generator (`torch.Generator`, *optional*):
502
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
503
+ to make generation deterministic.
504
+ latents (`torch.FloatTensor`, *optional*):
505
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
506
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
507
+ tensor will be generated by sampling using the supplied random `generator`.
508
+ output_type (`str`, *optional*, defaults to `"pil"`):
509
+ The output format of the generated image. Choose between
510
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
511
+ return_dict (`bool`, *optional*, defaults to `True`):
512
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
513
+ plain tuple.
514
+ callback (`Callable`, *optional*):
515
+ A function that will be called every `callback_steps` steps during inference. The function will be
516
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
517
+ callback_steps (`int`, *optional*, defaults to 1):
518
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
519
+ called at every step.
520
+
521
+ Returns:
522
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
523
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
524
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
525
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
526
+ (nsfw) content, according to the `safety_checker`.
527
+ """
528
+ # 0. Default height and width to unet
529
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
530
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
531
+
532
+ # 1. Check inputs. Raise error if not correct
533
+ # TODO: check input elevation, azimuth, and distance
534
+ # TODO: check image, clip_image_embeddings, image_latents
535
+ self.check_inputs(image, height, width, callback_steps)
536
+
537
+ # 2. Define call parameters
538
+ if isinstance(image, PIL.Image.Image):
539
+ batch_size = 1
540
+ elif isinstance(image, list):
541
+ batch_size = len(image)
542
+ elif isinstance(image, torch.Tensor):
543
+ batch_size = image.shape[0]
544
+ else:
545
+ assert image_latents is not None
546
+ assert (
547
+ clip_image_embeddings is not None or image_camera_embeddings is not None
548
+ )
549
+ batch_size = image_latents.shape[0]
550
+
551
+ device = self._execution_device
552
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
553
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
554
+ # corresponds to doing no classifier free guidance.
555
+ do_classifier_free_guidance = guidance_scale > 1.0
556
+
557
+ # 3. Encode input image
558
+ if isinstance(image, PIL.Image.Image) or isinstance(image, list):
559
+ pil_image = image
560
+ elif isinstance(image, torch.Tensor):
561
+ pil_image = [TF.to_pil_image(image[i]) for i in range(image.shape[0])]
562
+ else:
563
+ pil_image = None
564
+ image_embeddings = self._encode_image(
565
+ pil_image,
566
+ elevation,
567
+ azimuth,
568
+ distance,
569
+ device,
570
+ num_images_per_prompt,
571
+ do_classifier_free_guidance,
572
+ clip_image_embeddings,
573
+ image_camera_embeddings,
574
+ )
575
+
576
+ # 4. Prepare timesteps
577
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
578
+ timesteps = self.scheduler.timesteps
579
+
580
+ # 5. Prepare latent variables
581
+ # num_channels_latents = self.unet.config.in_channels
582
+ num_channels_latents = 4 # FIXME: hard-coded
583
+ latents = self.prepare_latents(
584
+ batch_size * num_images_per_prompt,
585
+ num_channels_latents,
586
+ height,
587
+ width,
588
+ image_embeddings.dtype,
589
+ device,
590
+ generator,
591
+ latents,
592
+ )
593
+
594
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
595
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
596
+
597
+ # 7. Denoising loop
598
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
599
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
600
+ for i, t in enumerate(timesteps):
601
+ # expand the latents if we are doing classifier free guidance
602
+ latent_model_input = self._get_latent_model_input(
603
+ latents,
604
+ image,
605
+ num_images_per_prompt,
606
+ do_classifier_free_guidance,
607
+ image_latents,
608
+ )
609
+ latent_model_input = self.scheduler.scale_model_input(
610
+ latent_model_input, t
611
+ )
612
+
613
+ # predict the noise residual
614
+ noise_pred = self.unet(
615
+ latent_model_input,
616
+ t,
617
+ encoder_hidden_states=image_embeddings,
618
+ cross_attention_kwargs=cross_attention_kwargs,
619
+ ).sample
620
+
621
+ # perform guidance
622
+ if do_classifier_free_guidance:
623
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
624
+ noise_pred = noise_pred_uncond + guidance_scale * (
625
+ noise_pred_text - noise_pred_uncond
626
+ )
627
+
628
+ # compute the previous noisy sample x_t -> x_t-1
629
+ latents = self.scheduler.step(
630
+ noise_pred, t, latents, **extra_step_kwargs
631
+ ).prev_sample
632
+
633
+ # call the callback, if provided
634
+ if i == len(timesteps) - 1 or (
635
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
636
+ ):
637
+ progress_bar.update()
638
+ if callback is not None and i % callback_steps == 0:
639
+ callback(i, t, latents)
640
+
641
+ if not output_type == "latent":
642
+ image = self.vae.decode(
643
+ latents / self.vae.config.scaling_factor, return_dict=False
644
+ )[0]
645
+ image, has_nsfw_concept = self.run_safety_checker(
646
+ image, device, image_embeddings.dtype
647
+ )
648
+ else:
649
+ image = latents
650
+ has_nsfw_concept = None
651
+
652
+ if has_nsfw_concept is None:
653
+ do_denormalize = [True] * image.shape[0]
654
+ else:
655
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
656
+
657
+ image = self.image_processor.postprocess(
658
+ image, output_type=output_type, do_denormalize=do_denormalize
659
+ )
660
+
661
+ if not return_dict:
662
+ return (image, has_nsfw_concept)
663
+
664
+ return StableDiffusionPipelineOutput(
665
+ images=image, nsfw_content_detected=has_nsfw_concept
666
+ )
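
For reference, a minimal usage sketch (not part of the uploaded files), assuming `zero123.py` is importable locally and the other components are loaded from this repository's subfolders; the repository path, input image, and camera values are placeholders:

```python
from PIL import Image
from diffusers import AutoencoderKL, DDIMScheduler, UNet2DConditionModel
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

from zero123 import CLIPCameraProjection, Zero123Pipeline  # the file above, on the local path

repo = "path/to/this/repo"  # placeholder: local folder or Hub repo id
pipe = Zero123Pipeline(
    vae=AutoencoderKL.from_pretrained(repo, subfolder="vae"),
    image_encoder=CLIPVisionModelWithProjection.from_pretrained(repo, subfolder="image_encoder"),
    unet=UNet2DConditionModel.from_pretrained(repo, subfolder="unet"),
    scheduler=DDIMScheduler.from_pretrained(repo, subfolder="scheduler"),
    safety_checker=None,  # model_index.json ships without a safety checker
    feature_extractor=CLIPImageProcessor.from_pretrained(repo, subfolder="feature_extractor"),
    clip_camera_projection=CLIPCameraProjection.from_pretrained(repo, subfolder="clip_camera_projection"),
    requires_safety_checker=False,
).to("cuda")

# Generate a novel view of the conditioning image at a hypothetical relative camera pose.
cond = Image.open("input.png").convert("RGB").resize((256, 256))
result = pipe(image=cond, elevation=0.0, azimuth=30.0, distance=1.2,
              num_inference_steps=50, guidance_scale=3.0)
result.images[0].save("novel_view.png")
```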
feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "crop_size": {
3
+ "height": 224,
4
+ "width": 224
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "feature_extractor_type": "CLIPFeatureExtractor",
12
+ "image_mean": [
13
+ 0.48145466,
14
+ 0.4578275,
15
+ 0.40821073
16
+ ],
17
+ "image_processor_type": "CLIPImageProcessor",
18
+ "image_std": [
19
+ 0.26862954,
20
+ 0.26130258,
21
+ 0.27577711
22
+ ],
23
+ "resample": 3,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "shortest_edge": 224
27
+ },
28
+ "use_square_size": false
29
+ }
image_encoder/config.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "_name_or_path": "lambdalabs/sd-image-variations-diffusers",
3
+ "architectures": [
4
+ "CLIPVisionModelWithProjection"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "dropout": 0.0,
8
+ "hidden_act": "quick_gelu",
9
+ "hidden_size": 1024,
10
+ "image_size": 224,
11
+ "initializer_factor": 1.0,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 4096,
14
+ "layer_norm_eps": 1e-05,
15
+ "model_type": "clip_vision_model",
16
+ "num_attention_heads": 16,
17
+ "num_channels": 3,
18
+ "num_hidden_layers": 24,
19
+ "patch_size": 14,
20
+ "projection_dim": 768,
21
+ "torch_dtype": "float16",
22
+ "transformers_version": "4.35.2"
23
+ }
image_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dd0c1777bb75e2e9b7aa29b799b3ee8ebaaaa731e2c471a9ec589f12542cce7
3
+ size 607980096
model_index.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "_class_name": "Zero123Pipeline",
3
+ "_diffusers_version": "0.24.0",
4
+ "clip_camera_projection": [
5
+ "zero123",
6
+ "CLIPCameraProjection"
7
+ ],
8
+ "feature_extractor": [
9
+ "transformers",
10
+ "CLIPImageProcessor"
11
+ ],
12
+ "image_encoder": [
13
+ "transformers",
14
+ "CLIPVisionModelWithProjection"
15
+ ],
16
+ "requires_safety_checker": false,
17
+ "safety_checker": [
18
+ null,
19
+ null
20
+ ],
21
+ "scheduler": [
22
+ "diffusers",
23
+ "DDIMScheduler"
24
+ ],
25
+ "unet": [
26
+ "diffusers",
27
+ "UNet2DConditionModel"
28
+ ],
29
+ "vae": [
30
+ "diffusers",
31
+ "AutoencoderKL"
32
+ ]
33
+ }
readme.md ADDED
@@ -0,0 +1,72 @@
1
+ ---
2
+ license: mit
3
+ ---
4
+
5
+ # Uses
6
+ _Note: This section is originally taken from the [Stable Diffusion v2 model card](https://huggingface.co/stabilityai/stable-diffusion-2), but applies in the same way to Zero-1-to-3._
7
+
8
+ ## Direct Use
9
+ The model is intended for research purposes only. Possible research areas and tasks include:
10
+
11
+ - Safe deployment of large-scale models.
12
+ - Probing and understanding the limitations and biases of generative models.
13
+ - Generation of artworks and use in design and other artistic processes.
14
+ - Applications in educational or creative tools.
15
+ - Research on generative models.
16
+
17
+ Excluded uses are described below.
18
+
19
+ ### Misuse, Malicious Use, and Out-of-Scope Use
20
+ The model should not be used to intentionally create or disseminate images that create hostile or alienating environments for people. This includes generating images that people would foreseeably find disturbing, distressing, or offensive; or content that propagates historical or current stereotypes.
21
+
22
+ #### Out-of-Scope Use
23
+ The model was not trained to produce factual or true representations of people or events; using it to generate such content is therefore out of scope for its abilities.
24
+
25
+ #### Misuse and Malicious Use
26
+ Using the model to generate content that is cruel to individuals is a misuse of this model. This includes, but is not limited to:
27
+
28
+ - Generating demeaning, dehumanizing, or otherwise harmful representations of people or their environments, cultures, religions, etc.
29
+ - Intentionally promoting or propagating discriminatory content or harmful stereotypes.
30
+ - Impersonating individuals without their consent.
31
+ - Sexual content without consent of the people who might see it.
32
+ - Mis- and disinformation
33
+ - Representations of egregious violence and gore
34
+ - Sharing of copyrighted or licensed material in violation of its terms of use.
35
+ - Sharing content that is an alteration of copyrighted or licensed material in violation of its terms of use.
36
+
37
+ ## Limitations and Bias
38
+
39
+ ### Limitations
40
+
41
+ - The model does not achieve perfect photorealism.
42
+ - The model cannot render legible text.
43
+ - Faces and people in general may not be parsed or generated properly.
44
+ - The autoencoding part of the model is lossy.
45
+ - Stable Diffusion was trained on a subset of the large-scale dataset [LAION-5B](https://laion.ai/blog/laion-5b/), which contains adult, violent and sexual content. To partially mitigate this, Stability AI has filtered the dataset using LAION's NSFW detector.
46
+ - Zero-1-to-3 was subsequently finetuned on a subset of the large-scale dataset [Objaverse](https://objaverse.allenai.org/), which might also potentially contain inappropriate content. To partially mitigate this, our demo applies a safety check to every uploaded image.
47
+
48
+ ### Bias
49
+ While the capabilities of image generation models are impressive, they can also reinforce or exacerbate social biases.
50
+ Stable Diffusion was primarily trained on subsets of [LAION-2B(en)](https://laion.ai/blog/laion-5b/), which consists of images that are limited to English descriptions.
51
+ Images and concepts from communities and cultures that use other languages are likely to be insufficiently accounted for.
52
+ This affects the overall output of the model, as Western cultures are often overrepresented.
53
+ Stable Diffusion mirrors and exacerbates biases to such a degree that viewer discretion must be advised irrespective of the input or its intent.
54
+
55
+
56
+ ### Safety Module
57
+ The intended use of this model is with the [Safety Checker](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py) in Diffusers.
58
+ This checker works by checking model inputs against known hard-coded NSFW concepts.
59
+ Specifically, the checker compares the class probability of harmful concepts in the embedding space of the uploaded input images.
60
+ The concepts are passed into the model with the image and compared to a hand-engineered weight for each NSFW concept.
61
+
62
+ ## Citation
63
+ ```
64
+ @misc{liu2023zero1to3,
65
+ title={Zero-1-to-3: Zero-shot One Image to 3D Object},
66
+ author={Ruoshi Liu and Rundi Wu and Basile Van Hoorick and Pavel Tokmakov and Sergey Zakharov and Carl Vondrick},
67
+ year={2023},
68
+ eprint={2303.11328},
69
+ archivePrefix={arXiv},
70
+ primaryClass={cs.CV}
71
+ }
72
+ ```
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,19 @@
1
+ {
2
+ "_class_name": "DDIMScheduler",
3
+ "_diffusers_version": "0.24.0",
4
+ "beta_end": 0.012,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.00085,
7
+ "clip_sample": false,
8
+ "clip_sample_range": 1.0,
9
+ "dynamic_thresholding_ratio": 0.995,
10
+ "num_train_timesteps": 1000,
11
+ "prediction_type": "epsilon",
12
+ "rescale_betas_zero_snr": false,
13
+ "sample_max_value": 1.0,
14
+ "set_alpha_to_one": false,
15
+ "steps_offset": 1,
16
+ "thresholding": false,
17
+ "timestep_spacing": "leading",
18
+ "trained_betas": null
19
+ }
unet/config.json ADDED
@@ -0,0 +1,67 @@
1
+ {
2
+ "_class_name": "UNet2DConditionModel",
3
+ "_diffusers_version": "0.24.0",
4
+ "act_fn": "silu",
5
+ "addition_embed_type": null,
6
+ "addition_embed_type_num_heads": 64,
7
+ "addition_time_embed_dim": null,
8
+ "attention_head_dim": 8,
9
+ "attention_type": "default",
10
+ "block_out_channels": [
11
+ 320,
12
+ 640,
13
+ 1280,
14
+ 1280
15
+ ],
16
+ "center_input_sample": false,
17
+ "class_embed_type": null,
18
+ "class_embeddings_concat": false,
19
+ "conv_in_kernel": 3,
20
+ "conv_out_kernel": 3,
21
+ "cross_attention_dim": 768,
22
+ "cross_attention_norm": null,
23
+ "down_block_types": [
24
+ "CrossAttnDownBlock2D",
25
+ "CrossAttnDownBlock2D",
26
+ "CrossAttnDownBlock2D",
27
+ "DownBlock2D"
28
+ ],
29
+ "downsample_padding": 1,
30
+ "dropout": 0.0,
31
+ "dual_cross_attention": false,
32
+ "encoder_hid_dim": null,
33
+ "encoder_hid_dim_type": null,
34
+ "flip_sin_to_cos": true,
35
+ "freq_shift": 0,
36
+ "in_channels": 8,
37
+ "layers_per_block": 2,
38
+ "mid_block_only_cross_attention": null,
39
+ "mid_block_scale_factor": 1,
40
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
41
+ "norm_eps": 1e-05,
42
+ "norm_num_groups": 32,
43
+ "num_attention_heads": null,
44
+ "num_class_embeds": null,
45
+ "only_cross_attention": false,
46
+ "out_channels": 4,
47
+ "projection_class_embeddings_input_dim": null,
48
+ "resnet_out_scale_factor": 1.0,
49
+ "resnet_skip_time_act": false,
50
+ "resnet_time_scale_shift": "default",
51
+ "reverse_transformer_layers_per_block": null,
52
+ "sample_size": 32,
53
+ "time_cond_proj_dim": null,
54
+ "time_embedding_act_fn": null,
55
+ "time_embedding_dim": null,
56
+ "time_embedding_type": "positional",
57
+ "timestep_post_act": null,
58
+ "transformer_layers_per_block": 1,
59
+ "up_block_types": [
60
+ "UpBlock2D",
61
+ "CrossAttnUpBlock2D",
62
+ "CrossAttnUpBlock2D",
63
+ "CrossAttnUpBlock2D"
64
+ ],
65
+ "upcast_attention": null,
66
+ "use_linear_projection": false
67
+ }
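
Note on `"in_channels": 8` above: the pipeline concatenates the 4 noise-latent channels with 4 channels of VAE latents from the conditioning image (see `_get_latent_model_input` in `clip_camera_projection/zero123.py`), so the U-Net input has 8 channels while `"out_channels"` stays at 4. A minimal sketch with hypothetical tensors:

```python
import torch

latents = torch.randn(1, 4, 32, 32)        # noise latents ("out_channels": 4, "sample_size": 32)
image_latents = torch.randn(1, 4, 32, 32)  # VAE latents of the conditioning image (not scaled)
unet_input = torch.cat([latents, image_latents], dim=1)
print(unet_input.shape)  # torch.Size([1, 8, 32, 32]) -> matches "in_channels": 8
```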
unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df42cdcac55d5b89c15a0512845db20a6213f1eaa129f201dfc080fc8642d220
3
+ size 1719148344
vae/config.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.24.0",
4
+ "act_fn": "silu",
5
+ "block_out_channels": [
6
+ 128,
7
+ 256,
8
+ 512,
9
+ 512
10
+ ],
11
+ "down_block_types": [
12
+ "DownEncoderBlock2D",
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D"
16
+ ],
17
+ "force_upcast": true,
18
+ "in_channels": 3,
19
+ "latent_channels": 4,
20
+ "layers_per_block": 2,
21
+ "norm_num_groups": 32,
22
+ "out_channels": 3,
23
+ "sample_size": 256,
24
+ "scaling_factor": 0.18215,
25
+ "up_block_types": [
26
+ "UpDecoderBlock2D",
27
+ "UpDecoderBlock2D",
28
+ "UpDecoderBlock2D",
29
+ "UpDecoderBlock2D"
30
+ ]
31
+ }
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e4c08995484ee61270175e9e7a072b66a6e4eeb5f0c266667fe1f45b90daf9a
3
+ size 167335342