gvecchio committed on
Commit
0f95e92
1 Parent(s): 970fe52
model_index.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "_class_name": ["pipeline", "StableMaterialsPipeline"],
3
+ "_diffusers_version": "0.27.2",
4
+ "processor": [
5
+ "transformers",
6
+ "CLIPProcessor"
7
+ ],
8
+ "scheduler": [
9
+ "diffusers",
10
+ "DDIMScheduler"
11
+ ],
12
+ "text_encoder": [
13
+ "transformers",
14
+ "CLIPTextModelWithProjection"
15
+ ],
16
+ "tokenizer": [
17
+ "transformers",
18
+ "CLIPTokenizerFast"
19
+ ],
20
+ "unet": [
21
+ "diffusers",
22
+ "UNet2DConditionModel"
23
+ ],
24
+ "vae": [
25
+ "diffusers",
26
+ "AutoencoderKL"
27
+ ],
28
+ "vision_encoder": [
29
+ "transformers",
30
+ "CLIPVisionModelWithProjection"
31
+ ]
32
+ }
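The model_index.json above wires a custom StableMaterialsPipeline (implemented in pipeline.py below) to a CLIP text encoder, a CLIP vision encoder with its processor, a DDIM scheduler, a UNet and a VAE. A minimal loading sketch, assuming the repository id "gvecchio/StableMaterials" (inferred from the committer) and that diffusers is allowed to execute the bundled pipeline.py:

import torch
from diffusers import DiffusionPipeline

# Assumed repo id; trust_remote_code lets diffusers import the StableMaterialsPipeline
# class from the pipeline.py committed in this repository.
pipe = DiffusionPipeline.from_pretrained(
    "gvecchio/StableMaterials",
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to("cuda")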
pipeline.py ADDED
@@ -0,0 +1,977 @@
1
+ import contextlib
2
+ import inspect
3
+ from typing import Any, Dict, List, Optional, Union, get_args
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import torchvision.transforms.functional as TF
9
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
10
+ from diffusers.loaders import FromSingleFileMixin
11
+ from diffusers.models.transformers import Transformer2DModel
12
+ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
13
+ rescale_noise_cfg,
14
+ retrieve_timesteps,
15
+ )
16
+ from diffusers.schedulers import KarrasDiffusionSchedulers
17
+ from diffusers.utils import (
18
+ BaseOutput,
19
+ deprecate,
20
+ logging,
21
+ )
22
+ from diffusers.utils.torch_utils import randn_tensor
23
+ from PIL import (
24
+ Image,
25
+ Jpeg2KImagePlugin,
26
+ JpegImagePlugin,
27
+ PngImagePlugin,
28
+ TiffImagePlugin,
29
+ )
30
+ from transformers import (
31
+ CLIPImageProcessor,
32
+ CLIPTextModel,
33
+ CLIPTokenizer,
34
+ CLIPVisionModel,
35
+ )
36
+
37
+ from diffusers import AutoencoderKL, DiffusionPipeline, UNet2DConditionModel
38
+
39
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
40
+ from dataclasses import dataclass
41
+
42
+ ImageInput = Union[
43
+ PipelineImageInput,
44
+ JpegImagePlugin.JpegImageFile,
45
+ Jpeg2KImagePlugin.Jpeg2KImageFile,
46
+ PngImagePlugin.PngImageFile,
47
+ TiffImagePlugin.TiffImageFile,
48
+ ]
49
+
50
+ import math
51
+
52
+
53
+ def postprocess(
54
+ image: torch.FloatTensor,
55
+ output_type: str = "pil",
56
+ ):
57
+ """
58
+ Postprocess the image output from tensor to `output_type`.
59
+
60
+ Args:
61
+ image (`torch.FloatTensor`):
62
+ The image input, should be a pytorch tensor with shape `B x C x H x W`.
63
+ output_type (`str`, *optional*, defaults to `pil`):
64
+ The output type of the image, can be one of `pil`, `np`, `pt`, `latent`.
65
+
66
+ Returns:
67
+ `List[StableMaterialsMaterial]` or `torch.FloatTensor`:
68
+ The postprocessed materials, or the raw tensor when `output_type="latent"`.
69
+ """
70
+ if not isinstance(image, torch.Tensor):
71
+ raise ValueError(
72
+ f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor"
73
+ )
74
+ if output_type not in ["latent", "pt", "np", "pil"]:
75
+ deprecation_message = (
76
+ f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
77
+ "`pil`, `np`, `pt`, `latent`"
78
+ )
79
+ deprecate(
80
+ "Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False
81
+ )
82
+ output_type = "np"
83
+
84
+ image = image.detach().cpu()
85
+ image = image.to(torch.float32)
86
+
87
+ if output_type == "latent":
88
+ return image
89
+
90
+ # denormalize the image
91
+ image = image * 0.5 + 0.5 # .clamp(0, 1)
92
+
93
+ materials = []
94
+ for i in range(image.shape[0]):
95
+
96
+ material = StableMaterialsMaterial()
97
+ material.init_from_tensor(image[i], mode=output_type)
98
+
99
+ materials.append(material)
100
+
101
+ return materials
102
+
103
+
104
+ @dataclass
105
+ class StableMaterialsMaterial:
106
+ basecolor: torch.FloatTensor
107
+ normal: torch.FloatTensor
108
+ height: torch.FloatTensor
109
+ roughness: torch.FloatTensor
110
+ metallic: torch.FloatTensor
111
+ _mode: str = "tensor" # Default mode is tensor
112
+
113
+ def __init__(
114
+ self,
115
+ basecolor: Optional[Union[Image.Image, np.ndarray, torch.FloatTensor]] = None,
116
+ normal: Optional[Union[Image.Image, np.ndarray, torch.FloatTensor]] = None,
117
+ height: Optional[Union[Image.Image, np.ndarray, torch.FloatTensor]] = None,
118
+ roughness: Optional[Union[Image.Image, np.ndarray, torch.FloatTensor]] = None,
119
+ metallic: Optional[Union[Image.Image, np.ndarray, torch.FloatTensor]] = None,
120
+ mode: str = "tensor",
121
+ ):
122
+ self._basecolor = self._to_pt(basecolor)
123
+ self._normal = self._to_pt(normal)
124
+ self._height = self._to_pt(height)
125
+ self._roughness = self._to_pt(roughness)
126
+ self._metallic = self._to_pt(metallic)
127
+ self._mode = mode
128
+
129
+ def init_from_tensor(self, image: torch.FloatTensor, mode: str = "tensor"):
130
+ assert image.shape[0] >= 8, "Input tensor should have at least 8 channels"
131
+ self._basecolor = image[:3].clamp(0, 1)
132
+ self._normal = self.compute_normal_map_z_component(image[3:5])
133
+ self._height = image[5:6].clamp(0, 1)
134
+ self._roughness = image[6:7].clamp(0, 1)
135
+ self._metallic = image[7:8].clamp(0, 1)
136
+ self._mode = mode
137
+
138
+ def resize(self, size, antialias=True):
139
+ self._basecolor = TF.resize(self._basecolor, size, antialias=antialias)
140
+ self._normal = TF.resize(self._normal, size, antialias=antialias)
141
+ self._height = TF.resize(self._height, size, antialias=antialias)
142
+ self._roughness = TF.resize(self._roughness, size, antialias=antialias)
143
+ self._metallic = TF.resize(self._metallic, size, antialias=antialias)
144
+ return self
145
+
146
+ def tile(self, num_tiles):
147
+ self._basecolor = self._basecolor.repeat(1, num_tiles, num_tiles)
148
+ self._normal = self._normal.repeat(1, num_tiles, num_tiles)
149
+ self._height = self._height.repeat(1, num_tiles, num_tiles)
150
+ self._roughness = self._roughness.repeat(1, num_tiles, num_tiles)
151
+ self._metallic = self._metallic.repeat(1, num_tiles, num_tiles)
152
+ return self
153
+
154
+ def _to_numpy(self, image: torch.FloatTensor):
155
+ if image is None:
156
+ return None
157
+ return image.numpy()
158
+
159
+ def _to_pil(self, image: torch.FloatTensor, mode: str = "RGB"):
160
+ if image is None:
161
+ return None
162
+ return TF.to_pil_image(image).convert(mode)
163
+
164
+ def _to_pt(self, image):
165
+ if image is None:
166
+ return None
167
+ if isinstance(image, np.ndarray):
168
+ image = torch.from_numpy(image)
169
+ elif isinstance(image, Image.Image):
170
+ image = TF.to_tensor(image)
171
+ return image.cpu()
172
+
173
+ def compute_normal_map_z_component(self, normal: torch.FloatTensor):
174
+ normal = normal * 2 - 1
175
+ sum_sq = (normal**2).sum(dim=0, keepdim=True)[0]
176
+ z = torch.zeros_like(sum_sq)
177
+ mask = sum_sq <= 1
178
+ z[mask] = torch.sqrt(1 - sum_sq[mask])
179
+ mask_outlier = sum_sq > 1
180
+ scale_factor = torch.sqrt(sum_sq[mask_outlier])
181
+ normal[:, mask_outlier] = normal[:, mask_outlier] / scale_factor
182
+ normal = torch.cat([normal, z.unsqueeze(0)], dim=0)
183
+ normal = normal * 0.5 + 0.5
184
+ return normal.clamp(0, 1)
185
+
186
+ def _convert(self, image, mode="RGB"):
187
+ if self._mode == "numpy":
188
+ return self._to_numpy(image)
189
+ elif self._mode == "pil":
190
+ return self._to_pil(image, mode)
191
+ return image
192
+
193
+ @property
194
+ def size(self):
195
+ return list(self._basecolor.shape[-2:])
196
+
197
+ @property
198
+ def basecolor(self):
199
+ return self._convert(self._basecolor, mode="RGB")
200
+
201
+ @property
202
+ def normal(self):
203
+ return self._convert(self._normal, mode="RGB")
204
+
205
+ @property
206
+ def height(self):
207
+ return self._convert(self._height, mode="L")
208
+
209
+ @property
210
+ def roughness(self):
211
+ return self._convert(self._roughness, mode="L")
212
+
213
+ @property
214
+ def metallic(self):
215
+ return self._convert(self._metallic, mode="L")
216
+
217
+ def as_dict(self):
218
+ return {
219
+ "basecolor": self.basecolor,
220
+ "normal": self.normal,
221
+ "height": self.height,
222
+ "roughness": self.roughness,
223
+ "metallic": self.metallic,
224
+ }
225
+
226
+ def as_list(self):
227
+ return [
228
+ self.basecolor,
229
+ self.normal,
230
+ self.height,
231
+ self.roughness,
232
+ self.metallic,
233
+ ]
234
+
235
+ def as_tensor(self):
236
+ return torch.cat(
237
+ [
238
+ self._basecolor,
239
+ self._normal[:2],
240
+ self._height,
241
+ self._roughness,
242
+ self._metallic,
243
+ ],
244
+ dim=0,
245
+ )
246
+
247
+
248
+ @dataclass
249
+ class StableMaterialsPipelineOutput(BaseOutput):
250
+ """
251
+ Output class for the StableMaterials pipeline.
252
+
253
+ Args:
254
+ images (`List[StableMaterialsMaterial]`)
255
+ List of generated materials of length `batch_size`; each `StableMaterialsMaterial` exposes
256
+ basecolor, normal, height, roughness and metallic maps.
257
+ """
258
+
259
+ images: List[StableMaterialsMaterial]
260
+
261
+
262
+ def patch(x, patch_factor=2):
263
+ if isinstance(x, (list, tuple)):
264
+ pass
265
+
266
+ b, c, h, w = x.shape
267
+ patch_size = h // patch_factor
268
+
269
+ x = x.unfold(2, patch_size, patch_size).unfold(3, patch_size, patch_size)
270
+ x = x.permute(0, 2, 3, 1, 4, 5).contiguous().view(-1, c, patch_size, patch_size)
271
+
272
+ n_patches = x.shape[0] // b
273
+
274
+ return x, (b, h), n_patches, patch_size
275
+
276
+
277
+ def unpatch(x, b, h, n_patches, patch_size=32):
278
+ if isinstance(x, (list, tuple)):
279
+ if len(x) == 1:
280
+ x = x[0]
281
+ else:
282
+ pass
283
+
284
+ factor = patch_size / x.shape[-1]
285
+ h, w = int(h / factor), int(h / factor)  # square patches: width is recovered from the height
286
+
287
+ c, patch_size = x.shape[1], x.shape[2]
288
+ n_patches = x.shape[0] // b
289
+
290
+ x = x.reshape(b, n_patches, c, patch_size, patch_size)
291
+ x = x.permute(0, 2, 3, 4, 1).contiguous().view(b, c * patch_size * patch_size, -1)
292
+
293
+ x = F.fold(
294
+ x,
295
+ output_size=(h, w),
296
+ kernel_size=patch_size,
297
+ stride=patch_size,
298
+ )
299
+
300
+ return x
301
+
302
+
303
+ def roll(x):
304
+ roll_h = torch.randint(0, 256, (1,)).item() // 2 * 2
305
+ roll_w = torch.randint(0, 256, (1,)).item() // 2 * 2
306
+
307
+ x = torch.roll(x, shifts=(roll_h, roll_w), dims=(2, 3))
308
+
309
+ return x, (roll_h, roll_w)
310
+
311
+
312
+ def unroll(x, roll_h, roll_w, factor=1.0):
313
+ roll_h = int(roll_h * factor)
314
+ roll_w = int(roll_w * factor)
315
+ x = torch.roll(x, shifts=(-roll_h, -roll_w), dims=(2, 3))
316
+ return x
317
+
318
+
319
+ @contextlib.contextmanager
320
+ def rolled_conv(enabled=True):
321
+ conv = torch.nn.Conv2d
322
+
323
+ if enabled:
324
+ # Save the original Conv2d.forward
325
+ orig_forward = conv.forward
326
+
327
+ def forward(self, x, *args, **kwargs):
328
+ x, (roll_h, roll_w) = roll(x)
329
+
330
+ pad = 4
331
+ x = F.pad(x, (pad, pad, pad, pad), mode="circular")
332
+ h = x.shape[-2]
333
+
334
+ x = orig_forward(self, x, *args, **kwargs)
335
+ h1 = x.shape[-2]
336
+ factor = h1 / h
337
+
338
+ pad = int(pad * factor)
339
+ x = x[..., pad:-pad, pad:-pad]
340
+ x = unroll(x, roll_h, roll_w, factor)
341
+
342
+ return x
343
+
344
+ # Patch Conv2d.forward with the roll + circular-pad version
345
+ conv.forward = forward
346
+ # conv.__init__ = __init__
347
+ yield conv
348
+
349
+ # Restore the original Conv2d.forward
350
+ conv.forward = orig_forward
351
+ else:
352
+ # Use the original conv
353
+ yield conv
354
+
355
+
356
+ @contextlib.contextmanager
357
+ def tiled_attn(enabled=True, scale_multiplier=4):
358
+ conv = Transformer2DModel
359
+
360
+ if enabled:
361
+ # Save the original Transformer2DModel.forward
362
+ orig_forward = conv.forward
363
+ # mult = scale_multiplier
364
+
365
+ def forward(self, hidden_states, encoder_hidden_states, *args, **kwargs):
366
+ hidden_states, (roll_h, roll_w) = roll(hidden_states)
367
+ hidden_states, (b, h), n_patches, patch_size = patch(
368
+ hidden_states, self.scale_multiplier
369
+ )
370
+ encoder_hidden_states = encoder_hidden_states.repeat_interleave(
371
+ n_patches, dim=0
372
+ )
373
+ chunks = math.ceil(len(hidden_states) / 8)
374
+ hidden_states = hidden_states.chunk(chunks, dim=0)
375
+ encoder_hidden_states = encoder_hidden_states.chunk(chunks, dim=0)
376
+ result = []
377
+ for i in range(chunks):
378
+ result.append(
379
+ orig_forward(
380
+ self,
381
+ hidden_states[i],
382
+ encoder_hidden_states[i],
383
+ *args,
384
+ **kwargs,
385
+ )[0]
386
+ )
387
+ hidden_states = torch.cat(result, dim=0)
388
+ hidden_states = unpatch(hidden_states, b, h, n_patches, patch_size)
389
+ hidden_states = unroll(hidden_states, roll_h, roll_w)
390
+ return (hidden_states,)
391
+
392
+ # Patch Transformer2DModel.forward with the patched-attention version
393
+ conv.scale_multiplier = scale_multiplier
394
+ conv.forward = forward
395
+ yield conv
396
+
397
+ # Restore the original Transformer2DModel.forward
398
+ conv.forward = orig_forward
399
+ else:
400
+ # Use the original Transformer2DModel.forward
401
+ yield conv
402
+
403
+
404
+ class StableMaterialsPipeline(DiffusionPipeline, FromSingleFileMixin):
405
+
406
+ model_cpu_offload_seq = "text_encoder->vision_encoder->unet->vae"
407
+
408
+ def __init__(
409
+ self,
410
+ vae: AutoencoderKL,
411
+ unet: UNet2DConditionModel,
412
+ # prompt_encoder: nn.Module,
413
+ scheduler: KarrasDiffusionSchedulers,
414
+ text_encoder: CLIPTextModel,
415
+ tokenizer: CLIPTokenizer,
416
+ vision_encoder: CLIPVisionModel,
417
+ processor: CLIPImageProcessor,
418
+ ):
419
+ super().__init__()
420
+
421
+ self.register_modules(
422
+ vae=vae,
423
+ unet=unet,
424
+ # prompt_encoder=prompt_encoder,
425
+ scheduler=scheduler,
426
+ # Conditioning modules
427
+ tokenizer=tokenizer,
428
+ processor=processor,
429
+ text_encoder=text_encoder,
430
+ vision_encoder=vision_encoder,
431
+ )
432
+
433
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
434
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
435
+
436
+ def enable_vae_slicing(self):
437
+ r"""
438
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
439
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
440
+ """
441
+ self.vae.enable_slicing()
442
+
443
+ def disable_vae_slicing(self):
444
+ r"""
445
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
446
+ computing decoding in one step.
447
+ """
448
+ self.vae.disable_slicing()
449
+
450
+ def enable_vae_tiling(self):
451
+ r"""
452
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
453
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
454
+ processing larger images.
455
+ """
456
+ self.vae.enable_tiling()
457
+
458
+ def disable_vae_tiling(self):
459
+ r"""
460
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
461
+ computing decoding in one step.
462
+ """
463
+ self.vae.disable_tiling()
464
+
465
+ def __encode_text(self, text):
466
+ inputs = self.tokenizer(text, padding=True, return_tensors="pt")
467
+ inputs["input_ids"] = inputs["input_ids"].to(self.device)
468
+ inputs["attention_mask"] = inputs["attention_mask"].to(self.device)
469
+ outputs = self.text_encoder(**inputs)
470
+ return outputs.text_embeds.unsqueeze(1)
471
+
472
+ def __encode_image(self, image):
473
+ inputs = self.processor(images=image, return_tensors="pt")
474
+ inputs["pixel_values"] = inputs["pixel_values"].to(self.device)
475
+ outputs = self.vision_encoder(**inputs)
476
+ return outputs.image_embeds.unsqueeze(1)
477
+
478
+ def __encode_prompt(
479
+ self,
480
+ prompt,
481
+ ):
482
+ if not isinstance(prompt, list):
483
+ prompt = [prompt]
484
+
485
+ embs = []
486
+ for p in prompt:
487
+ if isinstance(p, str):
488
+ embs.append(self.__encode_text(p))
489
+ elif type(p) in get_args(ImageInput):
490
+ embs.append(self.__encode_image(p))
491
+ else:
492
+ raise NotImplementedError
493
+
494
+ return torch.cat(embs, dim=0)
495
+
496
+ def encode_prompt(
497
+ self,
498
+ prompt,
499
+ device,
500
+ num_images_per_prompt,
501
+ do_classifier_free_guidance,
502
+ negative_prompt=None,
503
+ prompt_embeds: Optional[torch.FloatTensor] = None,
504
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
505
+ ):
506
+ r"""
507
+ Encodes the prompt into text encoder hidden states.
508
+
509
+ Args:
510
+ prompt (`str` or `List[str]`, *optional*):
511
+ prompt to be encoded
512
+ device: (`torch.device`):
513
+ torch device
514
+ num_images_per_prompt (`int`):
515
+ number of images that should be generated per prompt
516
+ do_classifier_free_guidance (`bool`):
517
+ whether to use classifier free guidance or not
518
+ negative_prompt (`str` or `List[str]`, *optional*):
519
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
520
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
521
+ less than `1`).
522
+ prompt_embeds (`torch.FloatTensor`, *optional*):
523
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
524
+ provided, text embeddings will be generated from `prompt` input argument.
525
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
526
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
527
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
528
+ argument.
529
+ """
530
+ if (
531
+ prompt is not None
532
+ and isinstance(prompt, str)
533
+ or isinstance(prompt, Image.Image)
534
+ ):
535
+ batch_size = 1
536
+ elif prompt is not None and isinstance(prompt, list):
537
+ batch_size = len(prompt)
538
+ else:
539
+ batch_size = prompt_embeds.shape[0]
540
+
541
+ if prompt_embeds is None:
542
+ prompt_embeds = self.__encode_prompt(prompt)
543
+
544
+ if self.unet is not None:
545
+ prompt_embeds_dtype = self.unet.dtype
546
+ else:
547
+ prompt_embeds_dtype = prompt_embeds.dtype
548
+
549
+ prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
550
+
551
+ bs_embed, seq_len, _ = prompt_embeds.shape
552
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
553
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
554
+ prompt_embeds = prompt_embeds.view(
555
+ bs_embed * num_images_per_prompt, seq_len, -1
556
+ )
557
+
558
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
559
+ uncond_tokens: List[str]
560
+ if negative_prompt is None:
561
+ # uncond_tokens = [""] * batch_size
562
+ uncond_tokens = [Image.new("RGB", (512, 512), (0, 0, 0))] * batch_size
563
+ elif isinstance(negative_prompt, str):
564
+ uncond_tokens = [negative_prompt] * batch_size
565
+ elif len(negative_prompt) != batch_size:
566
+ raise ValueError(
567
+ "The `negative_prompt` must be a string, a list of strings of length `batch_size`, or `None`."
568
+ )
569
+ else:
570
+ uncond_tokens = negative_prompt
571
+
572
+ negative_prompt_embeds = self.__encode_prompt(uncond_tokens)
573
+
574
+ # get unconditional embeddings for classifier free guidance
575
+ if do_classifier_free_guidance:
576
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
577
+ seq_len = negative_prompt_embeds.shape[1]
578
+
579
+ negative_prompt_embeds = negative_prompt_embeds.to(
580
+ dtype=prompt_embeds_dtype, device=device
581
+ )
582
+
583
+ negative_prompt_embeds = negative_prompt_embeds.repeat(
584
+ 1, num_images_per_prompt, 1
585
+ )
586
+ negative_prompt_embeds = negative_prompt_embeds.view(
587
+ batch_size * num_images_per_prompt, seq_len, -1
588
+ )
589
+
590
+ return prompt_embeds, negative_prompt_embeds
591
+
592
+ def decode_latents(self, latents):
593
+ deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
594
+ deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
595
+
596
+ latents = 1 / self.vae.config.scaling_factor * latents
597
+ image = self.vae.decode(latents, return_dict=False)[0]
598
+ image = (image / 2 + 0.5).clamp(0, 1)
599
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
600
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
601
+ return image
602
+
603
+ def prepare_extra_step_kwargs(self, generator, eta):
604
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
605
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
606
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
607
+ # and should be between [0, 1]
608
+
609
+ accepts_eta = "eta" in set(
610
+ inspect.signature(self.scheduler.step).parameters.keys()
611
+ )
612
+ extra_step_kwargs = {}
613
+ if accepts_eta:
614
+ extra_step_kwargs["eta"] = eta
615
+
616
+ # check if the scheduler accepts generator
617
+ accepts_generator = "generator" in set(
618
+ inspect.signature(self.scheduler.step).parameters.keys()
619
+ )
620
+ if accepts_generator:
621
+ extra_step_kwargs["generator"] = generator
622
+ return extra_step_kwargs
623
+
624
+ def check_inputs(
625
+ self,
626
+ prompt,
627
+ height,
628
+ width,
629
+ negative_prompt=None,
630
+ prompt_embeds=None,
631
+ negative_prompt_embeds=None,
632
+ ):
633
+ if height % 8 != 0 or width % 8 != 0:
634
+ raise ValueError(
635
+ f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
636
+ )
637
+
638
+ if prompt is not None and prompt_embeds is not None:
639
+ raise ValueError(
640
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
641
+ " only forward one of the two."
642
+ )
643
+ elif prompt is None and prompt_embeds is None:
644
+ raise ValueError(
645
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
646
+ )
647
+ elif prompt is not None and (not isinstance(prompt, (str, list, Image.Image))):
648
+ raise ValueError(
649
+ f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
650
+ )
651
+
652
+ if negative_prompt is not None and negative_prompt_embeds is not None:
653
+ raise ValueError(
654
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
655
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
656
+ )
657
+
658
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
659
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
660
+ raise ValueError(
661
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
662
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
663
+ f" {negative_prompt_embeds.shape}."
664
+ )
665
+
666
+ def prepare_latents(
667
+ self,
668
+ batch_size,
669
+ num_channels_latents,
670
+ height,
671
+ width,
672
+ dtype,
673
+ device,
674
+ generator,
675
+ latents=None,
676
+ ):
677
+ shape = (
678
+ batch_size,
679
+ num_channels_latents,
680
+ height // self.vae_scale_factor,
681
+ width // self.vae_scale_factor,
682
+ )
683
+ if isinstance(generator, list) and len(generator) != batch_size:
684
+ raise ValueError(
685
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
686
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
687
+ )
688
+
689
+ if latents is None:
690
+ latents = randn_tensor(
691
+ shape, generator=generator, device=device, dtype=dtype
692
+ )
693
+ else:
694
+ latents = latents.to(device)
695
+
696
+ # scale the initial noise by the standard deviation required by the scheduler
697
+ latents = latents * self.scheduler.init_noise_sigma
698
+ return latents
699
+
700
+ def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
701
+ r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
702
+
703
+ The suffixes after the scaling factors represent the stages where they are being applied.
704
+
705
+ Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
706
+ that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
707
+
708
+ Args:
709
+ s1 (`float`):
710
+ Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
711
+ mitigate "oversmoothing effect" in the enhanced denoising process.
712
+ s2 (`float`):
713
+ Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
714
+ mitigate "oversmoothing effect" in the enhanced denoising process.
715
+ b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
716
+ b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
717
+ """
718
+ if not hasattr(self, "unet"):
719
+ raise ValueError("The pipeline must have `unet` for using FreeU.")
720
+ self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
721
+
722
+ def disable_freeu(self):
723
+ """Disables the FreeU mechanism if enabled."""
724
+ self.unet.disable_freeu()
725
+
726
+ # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
727
+ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
728
+ """
729
+ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
730
+
731
+ Args:
732
+ w (`torch.Tensor`):
733
+ guidance weights for which to generate embedding vectors
734
+ embedding_dim (`int`, *optional*, defaults to 512):
735
+ dimension of the embeddings to generate
736
+ dtype:
737
+ data type of the generated embeddings
738
+
739
+ Returns:
740
+ `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`
741
+ """
742
+ assert len(w.shape) == 1
743
+ w = w * 1000.0
744
+
745
+ half_dim = embedding_dim // 2
746
+ emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
747
+ emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
748
+ emb = w.to(dtype)[:, None] * emb[None, :]
749
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
750
+ if embedding_dim % 2 == 1: # zero pad
751
+ emb = torch.nn.functional.pad(emb, (0, 1))
752
+ assert emb.shape == (w.shape[0], embedding_dim)
753
+ return emb
754
+
755
+ @property
756
+ def guidance_scale(self):
757
+ return self._guidance_scale
758
+
759
+ @property
760
+ def guidance_rescale(self):
761
+ return self._guidance_rescale
762
+
763
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
764
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
765
+ # corresponds to doing no classifier free guidance.
766
+ @property
767
+ def do_classifier_free_guidance(self):
768
+ return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
769
+
770
+ @property
771
+ def cross_attention_kwargs(self):
772
+ return self._cross_attention_kwargs
773
+
774
+ @property
775
+ def num_timesteps(self):
776
+ return self._num_timesteps
777
+
778
+ @property
779
+ def interrupt(self):
780
+ return self._interrupt
781
+
782
+ @torch.no_grad()
783
+ # @replace_example_docstring(EXAMPLE_DOC_STRING)
784
+ def __call__(
785
+ self,
786
+ prompt: Union[
787
+ str, List[str], PipelineImageInput, List[PipelineImageInput]
788
+ ] = None,
789
+ height: Optional[int] = None,
790
+ width: Optional[int] = None,
791
+ tileable: bool = False,
792
+ patched: bool = False,
793
+ num_inference_steps: int = 50,
794
+ timesteps: List[int] = None,
795
+ guidance_scale: float = 7.5,
796
+ negative_prompt: Optional[Union[str, List[str]]] = None,
797
+ num_images_per_prompt: Optional[int] = 1,
798
+ eta: float = 0.0,
799
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
800
+ latents: Optional[torch.FloatTensor] = None,
801
+ prompt_embeds: Optional[torch.FloatTensor] = None,
802
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
803
+ output_type: Optional[str] = "pil",
804
+ return_dict: bool = True,
805
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
806
+ guidance_rescale: float = 0.0,
807
+ **kwargs,
808
+ ):
809
+
810
+ # 0. Default height and width to unet
811
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
812
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
813
+
814
+ # 1. Check inputs. Raise error if not correct
815
+ self.check_inputs(
816
+ prompt,
817
+ height,
818
+ width,
819
+ negative_prompt,
820
+ prompt_embeds,
821
+ negative_prompt_embeds,
822
+ )
823
+
824
+ self._guidance_scale = guidance_scale
825
+ self._guidance_rescale = guidance_rescale
826
+ self._cross_attention_kwargs = cross_attention_kwargs
827
+ self._interrupt = False
828
+
829
+ # 2. Define call parameters
830
+ if prompt is not None and (
831
+ isinstance(prompt, str) or isinstance(prompt, Image.Image)
832
+ ):
833
+ batch_size = 1
834
+ elif prompt is not None and isinstance(prompt, list):
835
+ batch_size = len(prompt)
836
+ else:
837
+ batch_size = prompt_embeds.shape[0]
838
+
839
+ device = self._execution_device
840
+
841
+ # 3. Encode input prompt
842
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
843
+ prompt,
844
+ device,
845
+ num_images_per_prompt,
846
+ self.do_classifier_free_guidance,
847
+ negative_prompt,
848
+ prompt_embeds=prompt_embeds,
849
+ negative_prompt_embeds=negative_prompt_embeds,
850
+ )
851
+
852
+ # For classifier free guidance, we need to do two forward passes.
853
+ # Here we concatenate the unconditional and text embeddings into a single batch
854
+ # to avoid doing two forward passes
855
+ if self.do_classifier_free_guidance:
856
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
857
+
858
+ # 4. Prepare timesteps
859
+ timesteps, num_inference_steps = retrieve_timesteps(
860
+ self.scheduler, num_inference_steps, device, timesteps
861
+ )
862
+
863
+ # 5. Prepare latent variables
864
+ num_channels_latents = self.unet.config.in_channels
865
+ latents = self.prepare_latents(
866
+ batch_size * num_images_per_prompt,
867
+ num_channels_latents,
868
+ height,
869
+ width,
870
+ prompt_embeds.dtype,
871
+ device,
872
+ generator,
873
+ latents,
874
+ )
875
+
876
+ # 6. Prepare extra step kwargs.
877
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
878
+
879
+ # 6.2 Optionally get Guidance Scale Embedding
880
+ timestep_cond = None
881
+ if self.unet.config.time_cond_proj_dim is not None:
882
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(
883
+ batch_size * num_images_per_prompt
884
+ )
885
+ timestep_cond = self.get_guidance_scale_embedding(
886
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
887
+ ).to(device=device, dtype=latents.dtype)
888
+
889
+ # 7. Denoising loop
890
+ self._num_timesteps = len(timesteps)
891
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
892
+ for i, t in enumerate(timesteps):
893
+ if self.interrupt:
894
+ continue
895
+
896
+ # expand the latents if we are doing classifier free guidance
897
+ latent_model_input = (
898
+ torch.cat([latents] * 2)
899
+ if self.do_classifier_free_guidance
900
+ else latents
901
+ )
902
+ latent_model_input = self.scheduler.scale_model_input(
903
+ latent_model_input, t
904
+ )
905
+
906
+ scale_multiplier = (
907
+ latent_model_input.shape[-1]
908
+ ) // self.unet.config.sample_size
909
+
910
+ past_mid = i >= len(timesteps) // 4  # switch to circular (tileable) convolutions after the first quarter of steps
911
+ # predict the noise residual
912
+ with rolled_conv(enabled=(tileable & past_mid)):
913
+ with tiled_attn(enabled=patched, scale_multiplier=scale_multiplier):
914
+ noise_pred = self.unet(
915
+ latent_model_input,
916
+ t,
917
+ encoder_hidden_states=prompt_embeds,
918
+ timestep_cond=timestep_cond,
919
+ cross_attention_kwargs=self.cross_attention_kwargs,
920
+ return_dict=False,
921
+ )[0]
922
+
923
+ # perform guidance
924
+ if self.do_classifier_free_guidance:
925
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
926
+ noise_pred = noise_pred_uncond + self.guidance_scale * (
927
+ noise_pred_text - noise_pred_uncond
928
+ )
929
+
930
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
931
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
932
+ noise_pred = rescale_noise_cfg(
933
+ noise_pred,
934
+ noise_pred_text,
935
+ guidance_rescale=self.guidance_rescale,
936
+ )
937
+
938
+ # compute the previous noisy sample x_t -> x_t-1
939
+ latents = self.scheduler.step(
940
+ noise_pred, t, latents, **extra_step_kwargs, return_dict=False
941
+ )[0]
942
+
943
+ # call the callback, if provided
944
+ if i == len(timesteps) - 1 or (i + 1) % self.scheduler.order == 0:
945
+ progress_bar.update()
946
+
947
+ if not output_type == "latent":
948
+ if tileable:
949
+ # decode padded latent to preserve tileability
950
+ l_height = height // self.vae_scale_factor
951
+ l_width = width // self.vae_scale_factor
952
+ pad = l_height // 4
953
+ latents = TF.center_crop(
954
+ latents.repeat(1, 1, 3, 3), (l_height + pad, l_width + pad)
955
+ )
956
+
957
+ # decode the latents
958
+ image = self.vae.decode(
959
+ latents / self.vae.config.scaling_factor,
960
+ return_dict=False,
961
+ generator=generator,
962
+ )[0]
963
+
964
+ # crop to original size
965
+ image = TF.center_crop(image, (height, width))
966
+ else:
967
+ image = latents
968
+
969
+ image = postprocess(image, output_type=output_type)
970
+
971
+ # Offload all models
972
+ self.maybe_free_model_hooks()
973
+
974
+ if not return_dict:
975
+ return image
976
+
977
+ return StableMaterialsPipelineOutput(images=image)
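A usage sketch for the pipeline defined above, assuming a `pipe` object loaded as in the earlier snippet; the prompt and file names are illustrative. With the default `output_type="pil"` each returned `StableMaterialsMaterial` exposes its maps as PIL images:

result = pipe(
    prompt="old worn brick wall",   # text prompt; a PIL image prompt is also accepted
    num_inference_steps=50,
    guidance_scale=10.0,
    tileable=True,                  # routes through rolled_conv and the padded VAE decode
)

material = result.images[0]         # a StableMaterialsMaterial
material.basecolor.save("basecolor.png")
material.normal.save("normal.png")
material.height.save("height.png")
material.roughness.save("roughness.png")
material.metallic.save("metallic.png")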
processor/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
processor/preprocessor_config.json ADDED
@@ -0,0 +1,45 @@
1
+ {
2
+ "_valid_processor_keys": [
3
+ "images",
4
+ "do_resize",
5
+ "size",
6
+ "resample",
7
+ "do_center_crop",
8
+ "crop_size",
9
+ "do_rescale",
10
+ "rescale_factor",
11
+ "do_normalize",
12
+ "image_mean",
13
+ "image_std",
14
+ "do_convert_rgb",
15
+ "return_tensors",
16
+ "data_format",
17
+ "input_data_format"
18
+ ],
19
+ "crop_size": {
20
+ "height": 224,
21
+ "width": 224
22
+ },
23
+ "do_center_crop": true,
24
+ "do_convert_rgb": true,
25
+ "do_normalize": true,
26
+ "do_rescale": true,
27
+ "do_resize": true,
28
+ "image_mean": [
29
+ 0.48145466,
30
+ 0.4578275,
31
+ 0.40821073
32
+ ],
33
+ "image_processor_type": "CLIPImageProcessor",
34
+ "image_std": [
35
+ 0.26862954,
36
+ 0.26130258,
37
+ 0.27577711
38
+ ],
39
+ "processor_class": "CLIPProcessor",
40
+ "resample": 3,
41
+ "rescale_factor": 0.00392156862745098,
42
+ "size": {
43
+ "shortest_edge": 224
44
+ }
45
+ }
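The processor folder is a standard CLIP ViT-L/14 preprocessor; image prompts travel through it and the vision encoder exactly as in `__encode_image` above. A standalone sketch of that path, again assuming the "gvecchio/StableMaterials" repo id and an illustrative input file:

from PIL import Image
from transformers import CLIPProcessor, CLIPVisionModelWithProjection

processor = CLIPProcessor.from_pretrained("gvecchio/StableMaterials", subfolder="processor")
vision_encoder = CLIPVisionModelWithProjection.from_pretrained(
    "gvecchio/StableMaterials", subfolder="vision_encoder"
)

image = Image.open("reference_texture.png").convert("RGB")
inputs = processor(images=image, return_tensors="pt")
# (1, 1, 768) embedding, used as encoder_hidden_states by the UNet
embeds = vision_encoder(**inputs).image_embeds.unsqueeze(1)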
processor/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
processor/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
processor/tokenizer_config.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "49406": {
5
+ "content": "<|startoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "49407": {
13
+ "content": "<|endoftext|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ }
20
+ },
21
+ "bos_token": "<|startoftext|>",
22
+ "clean_up_tokenization_spaces": true,
23
+ "do_lower_case": true,
24
+ "eos_token": "<|endoftext|>",
25
+ "errors": "replace",
26
+ "model_max_length": 77,
27
+ "pad_token": "<|endoftext|>",
28
+ "processor_class": "CLIPProcessor",
29
+ "tokenizer_class": "CLIPTokenizer",
30
+ "unk_token": "<|endoftext|>"
31
+ }
processor/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "_class_name": "DDIMScheduler",
3
+ "_diffusers_version": "0.27.2",
4
+ "beta_end": 0.012,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.00085,
7
+ "clip_sample": false,
8
+ "clip_sample_range": 1.0,
9
+ "dynamic_thresholding_ratio": 0.995,
10
+ "interpolation_type": "linear",
11
+ "num_train_timesteps": 1000,
12
+ "prediction_type": "epsilon",
13
+ "rescale_betas_zero_snr": false,
14
+ "sample_max_value": 1.0,
15
+ "set_alpha_to_one": false,
16
+ "skip_prk_steps": true,
17
+ "steps_offset": 1,
18
+ "thresholding": false,
19
+ "timestep_spacing": "leading",
20
+ "trained_betas": null
21
+ }
text_encoder/config.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "_name_or_path": "openai/clip-vit-large-patch14",
3
+ "architectures": [
4
+ "CLIPTextModelWithProjection"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dropout": 0.0,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "quick_gelu",
11
+ "hidden_size": 768,
12
+ "initializer_factor": 1.0,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 77,
17
+ "model_type": "clip_text_model",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 1,
21
+ "projection_dim": 768,
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.40.2",
24
+ "vocab_size": 49408
25
+ }
text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dae0eabbb1fd83756ed9dd893c17ff2f6825c98555a1e1b96154e2df0739b9e2
3
+ size 494624560
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "49406": {
5
+ "content": "<|startoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "49407": {
13
+ "content": "<|endoftext|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ }
20
+ },
21
+ "bos_token": "<|startoftext|>",
22
+ "clean_up_tokenization_spaces": true,
23
+ "do_lower_case": true,
24
+ "eos_token": "<|endoftext|>",
25
+ "errors": "replace",
26
+ "model_max_length": 77,
27
+ "pad_token": "<|endoftext|>",
28
+ "tokenizer_class": "CLIPTokenizer",
29
+ "unk_token": "<|endoftext|>"
30
+ }
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
unet/config.json ADDED
@@ -0,0 +1,67 @@
1
+ {
2
+ "_class_name": "UNet2DConditionModel",
3
+ "_diffusers_version": "0.27.2",
4
+ "act_fn": "silu",
5
+ "addition_embed_type": null,
6
+ "addition_embed_type_num_heads": 64,
7
+ "addition_time_embed_dim": null,
8
+ "attention_head_dim": 8,
9
+ "attention_type": "default",
10
+ "block_out_channels": [
11
+ 320,
12
+ 640,
13
+ 1280,
14
+ 1280
15
+ ],
16
+ "center_input_sample": false,
17
+ "class_embed_type": null,
18
+ "class_embeddings_concat": false,
19
+ "conv_in_kernel": 3,
20
+ "conv_out_kernel": 3,
21
+ "cross_attention_dim": 768,
22
+ "cross_attention_norm": null,
23
+ "down_block_types": [
24
+ "CrossAttnDownBlock2D",
25
+ "CrossAttnDownBlock2D",
26
+ "CrossAttnDownBlock2D",
27
+ "DownBlock2D"
28
+ ],
29
+ "downsample_padding": 1,
30
+ "dropout": 0.0,
31
+ "dual_cross_attention": false,
32
+ "encoder_hid_dim": null,
33
+ "encoder_hid_dim_type": null,
34
+ "flip_sin_to_cos": true,
35
+ "freq_shift": 0,
36
+ "in_channels": 18,
37
+ "layers_per_block": 2,
38
+ "mid_block_only_cross_attention": null,
39
+ "mid_block_scale_factor": 1,
40
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
41
+ "norm_eps": 1e-05,
42
+ "norm_num_groups": 32,
43
+ "num_attention_heads": null,
44
+ "num_class_embeds": null,
45
+ "only_cross_attention": false,
46
+ "out_channels": 18,
47
+ "projection_class_embeddings_input_dim": null,
48
+ "resnet_out_scale_factor": 1.0,
49
+ "resnet_skip_time_act": false,
50
+ "resnet_time_scale_shift": "default",
51
+ "reverse_transformer_layers_per_block": null,
52
+ "sample_size": 64,
53
+ "time_cond_proj_dim": null,
54
+ "time_embedding_act_fn": null,
55
+ "time_embedding_dim": null,
56
+ "time_embedding_type": "positional",
57
+ "timestep_post_act": null,
58
+ "transformer_layers_per_block": 1,
59
+ "up_block_types": [
60
+ "UpBlock2D",
61
+ "CrossAttnUpBlock2D",
62
+ "CrossAttnUpBlock2D",
63
+ "CrossAttnUpBlock2D"
64
+ ],
65
+ "upcast_attention": false,
66
+ "use_linear_projection": false
67
+ }
unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:726f9bc81cab88af34a29a6bd89e9e442ca89612f66d4cf5a252c7b23ba5b334
3
+ size 3438490176
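The UNet above denoises directly in the VAE's 18-channel material latent space and is conditioned on 768-dimensional CLIP projection embeddings. A quick consistency check against the vae and encoder configs in this commit, assuming the same repo id:

from diffusers import AutoencoderKL, UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained("gvecchio/StableMaterials", subfolder="unet")
vae = AutoencoderKL.from_pretrained("gvecchio/StableMaterials", subfolder="vae")

assert unet.config.in_channels == vae.config.latent_channels == 18  # UNet works on VAE latents
assert unet.config.cross_attention_dim == 768                       # CLIP projection_dim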
unet_lcm/config.json ADDED
@@ -0,0 +1,68 @@
1
+ {
2
+ "_class_name": "UNet2DConditionModel",
3
+ "_diffusers_version": "0.27.2",
4
+ "_name_or_path": "C:/Users/giuse/source/repos/methods/matforger/diffusers/stablematerials",
5
+ "act_fn": "silu",
6
+ "addition_embed_type": null,
7
+ "addition_embed_type_num_heads": 64,
8
+ "addition_time_embed_dim": null,
9
+ "attention_head_dim": 8,
10
+ "attention_type": "default",
11
+ "block_out_channels": [
12
+ 320,
13
+ 640,
14
+ 1280,
15
+ 1280
16
+ ],
17
+ "center_input_sample": false,
18
+ "class_embed_type": null,
19
+ "class_embeddings_concat": false,
20
+ "conv_in_kernel": 3,
21
+ "conv_out_kernel": 3,
22
+ "cross_attention_dim": 768,
23
+ "cross_attention_norm": null,
24
+ "down_block_types": [
25
+ "CrossAttnDownBlock2D",
26
+ "CrossAttnDownBlock2D",
27
+ "CrossAttnDownBlock2D",
28
+ "DownBlock2D"
29
+ ],
30
+ "downsample_padding": 1,
31
+ "dropout": 0.0,
32
+ "dual_cross_attention": false,
33
+ "encoder_hid_dim": null,
34
+ "encoder_hid_dim_type": null,
35
+ "flip_sin_to_cos": true,
36
+ "freq_shift": 0,
37
+ "in_channels": 18,
38
+ "layers_per_block": 2,
39
+ "mid_block_only_cross_attention": null,
40
+ "mid_block_scale_factor": 1,
41
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
42
+ "norm_eps": 1e-05,
43
+ "norm_num_groups": 32,
44
+ "num_attention_heads": null,
45
+ "num_class_embeds": null,
46
+ "only_cross_attention": false,
47
+ "out_channels": 18,
48
+ "projection_class_embeddings_input_dim": null,
49
+ "resnet_out_scale_factor": 1.0,
50
+ "resnet_skip_time_act": false,
51
+ "resnet_time_scale_shift": "default",
52
+ "reverse_transformer_layers_per_block": null,
53
+ "sample_size": 64,
54
+ "time_cond_proj_dim": 256,
55
+ "time_embedding_act_fn": null,
56
+ "time_embedding_dim": null,
57
+ "time_embedding_type": "positional",
58
+ "timestep_post_act": null,
59
+ "transformer_layers_per_block": 1,
60
+ "up_block_types": [
61
+ "UpBlock2D",
62
+ "CrossAttnUpBlock2D",
63
+ "CrossAttnUpBlock2D",
64
+ "CrossAttnUpBlock2D"
65
+ ],
66
+ "upcast_attention": false,
67
+ "use_linear_projection": false
68
+ }
unet_lcm/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3a47c261bf9f19f7022fde0b45e1f5705409ed7d51a3e5baecc1fa8da7baa1a
3
+ size 3438817960
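unet_lcm/config.json differs from unet/config.json only in `time_cond_proj_dim: 256`, the guidance-embedding input consumed by the pipeline's `get_guidance_scale_embedding` path, which suggests a consistency-distilled (LCM-style) variant for few-step sampling. A hedged sketch of swapping it in, assuming that interpretation, the same repo id, and that `LCMScheduler` is the intended companion scheduler:

import torch
from diffusers import LCMScheduler, UNet2DConditionModel

pipe.unet = UNet2DConditionModel.from_pretrained(
    "gvecchio/StableMaterials", subfolder="unet_lcm", torch_dtype=torch.float16
).to(pipe.device)
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

# With time_cond_proj_dim set, classifier-free guidance is replaced by the embedded
# guidance weight, so only a handful of steps are needed.
material = pipe("rusted metal plate", num_inference_steps=4).images[0]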
vae/config.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.27.2",
4
+ "act_fn": "silu",
5
+ "block_out_channels": [
6
+ 128,
7
+ 256,
8
+ 512,
9
+ 512
10
+ ],
11
+ "down_block_types": [
12
+ "DownEncoderBlock2D",
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D"
16
+ ],
17
+ "force_upcast": true,
18
+ "in_channels": 9,
19
+ "latent_channels": 18,
20
+ "latents_mean": null,
21
+ "latents_std": null,
22
+ "layers_per_block": 2,
23
+ "norm_num_groups": 32,
24
+ "out_channels": 9,
25
+ "sample_size": 512,
26
+ "scaling_factor": 0.18215,
27
+ "up_block_types": [
28
+ "UpDecoderBlock2D",
29
+ "UpDecoderBlock2D",
30
+ "UpDecoderBlock2D",
31
+ "UpDecoderBlock2D"
32
+ ]
33
+ }
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dac9dfa16153ea53d01aaa1fe2a031e509e108bc5bad95379e4fa5b3524af623
3
+ size 335479204
vision_encoder/config.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "_name_or_path": "openai/clip-vit-large-patch14",
3
+ "architectures": [
4
+ "CLIPVisionModelWithProjection"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "dropout": 0.0,
8
+ "hidden_act": "quick_gelu",
9
+ "hidden_size": 1024,
10
+ "image_size": 224,
11
+ "initializer_factor": 1.0,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 4096,
14
+ "layer_norm_eps": 1e-05,
15
+ "model_type": "clip_vision_model",
16
+ "num_attention_heads": 16,
17
+ "num_channels": 3,
18
+ "num_hidden_layers": 24,
19
+ "patch_size": 14,
20
+ "projection_dim": 768,
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.40.2"
23
+ }
vision_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77b33d2a3a643650857672e880ccf73adbaf114fbbadec36d142ee9d48af7e20
3
+ size 1215912728