johnslegers committed on
Commit
121f6d3
1 Parent(s): 558cbd4
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitmodules +0 -0
  2. README.md +7 -5
  3. app-bckp.py +872 -0
  4. app.bckp2.py +330 -0
  5. ldmlib/__pycache__/util.cpython-38.pyc +0 -0
  6. ldmlib/data/__init__.py +0 -0
  7. ldmlib/data/base.py +23 -0
  8. ldmlib/data/imagenet.py +394 -0
  9. ldmlib/data/lsun.py +92 -0
  10. ldmlib/lr_scheduler.py +98 -0
  11. ldmlib/models/__pycache__/autoencoder.cpython-38.pyc +0 -0
  12. ldmlib/models/autoencoder.py +443 -0
  13. ldmlib/models/diffusion/__init__.py +0 -0
  14. ldmlib/models/diffusion/classifier.py +267 -0
  15. ldmlib/models/diffusion/ddim.py +241 -0
  16. ldmlib/models/diffusion/ddpm.py +1445 -0
  17. ldmlib/models/diffusion/plms.py +236 -0
  18. ldmlib/modules/__pycache__/attention.cpython-38.pyc +0 -0
  19. ldmlib/modules/__pycache__/x_transformer.cpython-38.pyc +0 -0
  20. ldmlib/modules/attention.py +261 -0
  21. ldmlib/modules/diffusionmodules/__init__.py +0 -0
  22. ldmlib/modules/diffusionmodules/__pycache__/__init__.cpython-38.pyc +0 -0
  23. ldmlib/modules/diffusionmodules/__pycache__/model.cpython-38.pyc +0 -0
  24. ldmlib/modules/diffusionmodules/__pycache__/util.cpython-38.pyc +0 -0
  25. ldmlib/modules/diffusionmodules/model.py +830 -0
  26. ldmlib/modules/diffusionmodules/openaimodel.py +960 -0
  27. ldmlib/modules/diffusionmodules/util.py +267 -0
  28. ldmlib/modules/distributions/__init__.py +0 -0
  29. ldmlib/modules/distributions/__pycache__/__init__.cpython-38.pyc +0 -0
  30. ldmlib/modules/distributions/__pycache__/distributions.cpython-38.pyc +0 -0
  31. ldmlib/modules/distributions/distributions.py +92 -0
  32. ldmlib/modules/ema.py +76 -0
  33. ldmlib/modules/encoders/__init__.py +0 -0
  34. ldmlib/modules/encoders/__pycache__/__init__.cpython-38.pyc +0 -0
  35. ldmlib/modules/encoders/__pycache__/modules.cpython-38.pyc +0 -0
  36. ldmlib/modules/encoders/modules.py +234 -0
  37. ldmlib/modules/image_degradation/__init__.py +2 -0
  38. ldmlib/modules/image_degradation/bsrgan.py +728 -0
  39. ldmlib/modules/image_degradation/bsrgan_light.py +650 -0
  40. ldmlib/modules/image_degradation/utils/test.png +0 -0
  41. ldmlib/modules/image_degradation/utils_image.py +916 -0
  42. ldmlib/modules/losses/__init__.py +1 -0
  43. ldmlib/modules/losses/contperceptual.py +110 -0
  44. ldmlib/modules/losses/vqperceptual.py +167 -0
  45. ldmlib/modules/x_transformer.py +641 -0
  46. ldmlib/util.py +203 -0
  47. modules/app.py +55 -0
  48. modules/dataset.py +19 -0
  49. modules/inference.py +11 -0
  50. optimizedSD/LICENSE +80 -0
.gitmodules ADDED
File without changes
README.md CHANGED
@@ -1,11 +1,13 @@
1
  ---
2
  title: Stable Diffusion 3
3
- emoji: 😻
4
- colorFrom: yellow
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 3.4.1
8
- app_file: app.py
 
 
9
  pinned: false
10
  ---
11
 
 
1
  ---
2
  title: Stable Diffusion 3
3
+ emoji:
4
+ colorFrom: green
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 3.1.7
8
+ app_file: start.py
9
+ datasets: [emotion]
10
+ license: mit
11
  pinned: false
12
  ---
13
 
app-bckp.py ADDED
@@ -0,0 +1,872 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os, re
3
+ import traceback
4
+ import torch
5
+ import numpy as np
6
+ from omegaconf import OmegaConf
7
+ from PIL import Image, ImageOps
8
+ from tqdm import tqdm, trange
9
+ from itertools import islice
10
+ from einops import rearrange
11
+ import time
12
+ from pytorch_lightning import seed_everything
13
+ from torch import autocast
14
+ from contextlib import nullcontext
15
+ from einops import rearrange, repeat
16
+ from ldmlib.util import instantiate_from_config
17
+ from optimizedSD.optimUtils import split_weighted_subprompts
18
+ from transformers import logging
19
+
20
+ from gfpgan import GFPGANer
21
+ from basicsr.archs.rrdbnet_arch import RRDBNet
22
+ from realesrgan import RealESRGANer
23
+
24
+ import uuid
25
+
26
# Hugging Face access token. The AUTH_TOKEN environment variable takes
# precedence; otherwise the first line of the token file is used.
AUTH_TOKEN = os.environ.get('AUTH_TOKEN')
if not AUTH_TOKEN:
    try:
        with open('/root/.huggingface/token', encoding='utf-8') as f:
            # Strip the trailing newline (it would otherwise end up inside the
            # token) and treat an empty file as "no token".
            AUTH_TOKEN = f.readline().strip() or None
    except OSError:
        # A missing or unreadable token file should not prevent the module
        # from being imported.
        print('WARNING: no AUTH_TOKEN environment variable and no readable /root/.huggingface/token file')
        AUTH_TOKEN = None
31
+
32
+
33
+
34
+ logging.set_verbosity_error()
35
+
36
+ # consts
37
+ config_yaml = "optimizedSD/v1-inference.yaml"
38
+ filename_regex = re.compile('[^a-zA-Z0-9]')
39
+
40
+ # api stuff
41
+ from sd_internal import Request, Response, Image as ResponseImage
42
+ import base64
43
+ from io import BytesIO
44
+ #from colorama import Fore
45
+
46
+ # local
47
+ stop_processing = False
48
+ temp_images = {}
49
+
50
+ ckpt_file = None
51
+ gfpgan_file = None
52
+ real_esrgan_file = None
53
+
54
+ model = None
55
+ modelCS = None
56
+ modelFS = None
57
+ model_gfpgan = None
58
+ model_real_esrgan = None
59
+
60
+ model_is_half = False
61
+ model_fs_is_half = False
62
+ device = None
63
+ unet_bs = 1
64
+ precision = 'autocast'
65
+ sampler_plms = None
66
+ sampler_ddim = None
67
+
68
# One-off GPU probe, executed at import time.
#   has_valid_gpu        -> True only when a CUDA device with >= 3 GB VRAM is found
#   force_full_precision -> True for NVIDIA 1650/1660 cards
has_valid_gpu = False
force_full_precision = False
try:
    gpu = torch.cuda.current_device()
    gpu_name = torch.cuda.get_device_name(gpu)
    print('GPU detected: ', gpu_name)

    force_full_precision = ('nvidia' in gpu_name.lower() or 'geforce' in gpu_name.lower()) and (' 1660' in gpu_name or ' 1650' in gpu_name) # otherwise these NVIDIA cards create green images
    if force_full_precision:
        print('forcing full precision on NVIDIA 16xx cards, to avoid green images. GPU detected: ', gpu_name)

    mem_free, mem_total = torch.cuda.mem_get_info(gpu)
    mem_total /= float(10**9)
    if mem_total < 3.0:
        print("GPUs with less than 3 GB of VRAM are not compatible with Stable Diffusion")
    else:
        has_valid_gpu = True
except Exception:
    # Any failure while querying CUDA (no driver, CPU-only torch build, old
    # torch without mem_get_info, ...) means "no usable GPU".
    # KeyboardInterrupt / SystemExit are deliberately not swallowed.
    has_valid_gpu = False

if not has_valid_gpu:
    print('WARNING: No compatible GPU found. Using the CPU, but this will be very slow!')
89
+
90
def load_model_ckpt(ckpt_to_use, device_to_use='cuda', turbo=False, unet_bs_to_use=1, precision_to_use='autocast'):
    """Load a Stable Diffusion checkpoint into the three module-level models.

    ckpt_to_use is the checkpoint path without the ".ckpt" extension. The
    requested device is honoured only when a valid GPU was detected at import
    time; otherwise everything runs on the CPU in full precision. The result
    is stored in the globals model (UNet), modelCS (conditioning stage) and
    modelFS (first stage), together with the device / precision bookkeeping.
    """
    global ckpt_file, model, modelCS, modelFS, model_is_half, device, unet_bs, precision, model_fs_is_half

    device = device_to_use if has_valid_gpu else 'cpu'
    precision = 'full' if force_full_precision else precision_to_use
    unet_bs = unet_bs_to_use

    # Release the previous models before the new weights are read.
    unload_model()

    if device == 'cpu':
        precision = 'full'

    sd = load_model_from_config(f"{ckpt_to_use}.ckpt")

    # Split the "model.*" weights in two groups: the input / middle / time
    # embedding blocks are renamed to "model1.*", everything else under
    # "model." becomes "model2.*". Keys outside "model." are left untouched.
    first_part_markers = ("input_blocks", "middle_block", "time_embed")
    first_part_keys, second_part_keys = [], []
    for name in sd:
        parts = name.split(".")
        if parts[0] != "model":
            continue
        if any(marker in parts for marker in first_part_markers):
            first_part_keys.append(name)
        else:
            second_part_keys.append(name)
    for name in first_part_keys:
        sd["model1." + name[6:]] = sd.pop(name)
    for name in second_part_keys:
        sd["model2." + name[6:]] = sd.pop(name)

    config = OmegaConf.load(f"{config_yaml}")

    model = instantiate_from_config(config.modelUNet)
    model.load_state_dict(sd, strict=False)
    model.eval()
    model.cdevice = device
    model.unet_bs = unet_bs
    model.turbo = turbo

    modelCS = instantiate_from_config(config.modelCondStage)
    modelCS.load_state_dict(sd, strict=False)
    modelCS.eval()
    modelCS.cond_stage_model.device = device

    modelFS = instantiate_from_config(config.modelFirstStage)
    modelFS.load_state_dict(sd, strict=False)
    modelFS.eval()
    del sd

    # Half precision is only used on a GPU in autocast mode.
    use_half = device != "cpu" and precision == "autocast"
    if use_half:
        model.half()
        modelCS.half()
        modelFS.half()
    model_is_half = use_half
    model_fs_is_half = use_half

    ckpt_file = ckpt_to_use

    print('loaded ', ckpt_file, 'to', device, 'precision', precision)
152
+
153
def unload_model():
    """Drop the module-level references to the three Stable Diffusion models.

    After the call model, modelCS and modelFS are all None, whether or not a
    model was loaded before.
    """
    global model, modelCS, modelFS

    # Rebinding the globals to None releases the previous objects just like
    # the explicit del-then-assign sequence did.
    model = None
    modelCS = None
    modelFS = None
164
+
165
def load_model_gfpgan(gfpgan_to_use):
    """Load the GFPGAN face-correction model named by gfpgan_to_use.

    gfpgan_to_use is the path of the weights file without the ".pth"
    extension; None is a no-op. On success the model is stored in the global
    model_gfpgan and its name in gfpgan_file.
    """
    global gfpgan_file, model_gfpgan

    if gfpgan_to_use is None:
        return

    model_path = gfpgan_to_use + ".pth"
    target_device = torch.device('cpu' if device == 'cpu' else 'cuda')

    model_gfpgan = GFPGANer(model_path=model_path, upscale=1, arch='clean', channel_multiplier=2, bg_upsampler=None, device=target_device)

    # Record the name only after the model was actually created. Callers
    # compare against gfpgan_file to decide whether a (re)load is needed, so
    # recording it earlier would make a failed load look like a success.
    gfpgan_file = gfpgan_to_use

    print('loaded ', gfpgan_to_use, 'to', device, 'precision', precision)
180
+
181
def load_model_real_esrgan(real_esrgan_to_use):
    """Load the RealESRGAN upscaler named by real_esrgan_to_use.

    real_esrgan_to_use is the path of the weights file without the ".pth"
    extension and must be one of the supported model names below; None is a
    no-op. On success the upscaler is stored in the global model_real_esrgan
    and its name in real_esrgan_file.

    Raises:
        KeyError: if real_esrgan_to_use is not a supported model name.
    """
    global real_esrgan_file, model_real_esrgan

    if real_esrgan_to_use is None:
        return

    model_path = real_esrgan_to_use + ".pth"

    # The supported architectures differ only in their number of RRDB blocks.
    # Only the requested network is instantiated.
    num_blocks_by_model = {
        'RealESRGAN_x4plus': 23,
        'RealESRGAN_x4plus_anime_6B': 6,
    }
    num_block = num_blocks_by_model[real_esrgan_to_use]
    model_to_use = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=num_block, num_grow_ch=32, scale=4)

    if device == 'cpu':
        upscaler = RealESRGANer(scale=2, model_path=model_path, model=model_to_use, pre_pad=0, half=False) # cpu does not support half
        upscaler.device = torch.device('cpu')
        upscaler.model.to('cpu')
    else:
        upscaler = RealESRGANer(scale=2, model_path=model_path, model=model_to_use, pre_pad=0, half=model_is_half)

    upscaler.model.name = real_esrgan_to_use

    # Publish the new state only once loading succeeded, so that a failed
    # load is retried on the next request instead of being mistaken for a
    # loaded model.
    model_real_esrgan = upscaler
    real_esrgan_file = real_esrgan_to_use

    print('loaded ', real_esrgan_to_use, 'to', device, 'precision', precision)
207
+
208
def mk_img(req: Request):
    """Generator wrapper around do_mk_img that never raises.

    Yields whatever do_mk_img yields. If do_mk_img fails, the traceback is
    printed, the models are moved back to the CPU (when running on a GPU) and
    a final JSON string {"status": "failed", "detail": ...} is yielded.
    """
    try:
        yield from do_mk_img(req)
    except Exception as e:
        print(traceback.format_exc())

        gc()

        if device != "cpu":
            # The failure may have happened before or while the models were
            # (re)loaded, in which case some of them are still None. Skip
            # those so the failure report below is always delivered.
            if modelFS is not None:
                modelFS.to("cpu")
            if modelCS is not None:
                modelCS.to("cpu")

            if model is not None:
                model.model1.to("cpu")
                model.model2.to("cpu")

        gc()

        yield json.dumps({
            "status": 'failed',
            "detail": str(e)
        })
229
+
230
def do_mk_img(req: Request):
    """Render the images described by req, streaming progress as JSON strings.

    This is a generator. While sampling it yields one JSON string per step
    (when req.stream_progress_updates is set); the last value yielded is the
    JSON-encoded Response holding the finished images. Before sampling, the
    Stable Diffusion / GFPGAN / RealESRGAN models are (re)loaded as needed so
    that they match the device, precision and model names asked for in req.

    Setting the module-level stop_processing flag while sampling makes the
    progress callback raise UserInitiatedStop; the latest partial samples, if
    any, are then decoded and returned instead of the final ones.
    """
    global ckpt_file
    global model, modelCS, modelFS, device
    global model_gfpgan, model_real_esrgan
    global stop_processing

    stop_processing = False

    res = Response()
    res.request = req
    res.images = []

    temp_images.clear()

    # custom model support:
    #  the req.use_stable_diffusion_model needs to be a valid path
    #  to the ckpt file (without the extension).

    needs_model_reload = False
    ckpt_to_use = ckpt_file
    if ckpt_to_use != req.use_stable_diffusion_model:
        ckpt_to_use = req.use_stable_diffusion_model
        needs_model_reload = True

    model.turbo = req.turbo
    if req.use_cpu:
        if device != 'cpu':
            device = 'cpu'

            # a half-precision model cannot run on the CPU, reload it
            if model_is_half:
                load_model_ckpt(ckpt_to_use, device)
                needs_model_reload = False

            load_model_gfpgan(gfpgan_file)
            load_model_real_esrgan(real_esrgan_file)
    else:
        if has_valid_gpu:
            prev_device = device
            device = 'cuda'

            # reload when the requested precision differs from the loaded one
            if (precision == 'autocast' and (req.use_full_precision or not model_is_half)) or \
                (precision == 'full' and not req.use_full_precision and not force_full_precision):

                load_model_ckpt(ckpt_to_use, device, req.turbo, unet_bs, ('full' if req.use_full_precision else 'autocast'))
                needs_model_reload = False

            if prev_device != device:
                load_model_gfpgan(gfpgan_file)
                load_model_real_esrgan(real_esrgan_file)

    if needs_model_reload:
        load_model_ckpt(ckpt_to_use, device, req.turbo, unet_bs, precision)

    if req.use_face_correction != gfpgan_file:
        load_model_gfpgan(req.use_face_correction)

    if req.use_upscale != real_esrgan_file:
        load_model_real_esrgan(req.use_upscale)

    model.cdevice = device
    modelCS.cond_stage_model.device = device

    opt_prompt = req.prompt
    opt_seed = req.seed
    opt_n_samples = req.num_outputs
    opt_n_iter = 1
    opt_scale = req.guidance_scale
    opt_C = 4
    opt_H = req.height
    opt_W = req.width
    opt_f = 8
    opt_ddim_steps = req.num_inference_steps
    opt_ddim_eta = 0.0
    opt_strength = req.prompt_strength
    opt_save_to_disk_path = req.save_to_disk_path
    opt_init_img = req.init_image
    opt_use_face_correction = req.use_face_correction
    opt_use_upscale = req.use_upscale
    opt_show_only_filtered = req.show_only_filtered_image
    opt_format = req.output_format
    opt_sampler_name = req.sampler

    print(req.to_string(), '\n    device', device)

    print('\n\n    Using precision:', precision)

    seed_everything(opt_seed)

    batch_size = opt_n_samples
    prompt = opt_prompt
    assert prompt is not None
    data = [batch_size * [prompt]]

    if precision == "autocast" and device != "cpu":
        precision_scope = autocast
    else:
        precision_scope = nullcontext

    mask = None

    if req.init_image is None:
        handler = _txt2img

        init_latent = None
        t_enc = None
    else:
        handler = _img2img

        init_image = load_img(req.init_image, opt_W, opt_H)
        init_image = init_image.to(device)

        if device != "cpu" and precision == "autocast":
            init_image = init_image.half()

        modelFS.to(device)

        init_image = repeat(init_image, '1 ... -> b ...', b=batch_size)
        init_latent = modelFS.get_first_stage_encoding(modelFS.encode_first_stage(init_image))  # move to latent space

        if req.mask is not None:
            mask = load_mask(req.mask, opt_W, opt_H, init_latent.shape[2], init_latent.shape[3], True).to(device)
            mask = mask[0][0].unsqueeze(0).repeat(4, 1, 1).unsqueeze(0)
            mask = repeat(mask, '1 ... -> b ...', b=batch_size)

            if device != "cpu" and precision == "autocast":
                mask = mask.half()

        move_fs_to_cpu()

        assert 0. <= opt_strength <= 1., 'can only work with strength in [0.0, 1.0]'
        t_enc = int(opt_strength * opt_ddim_steps)
        print(f"target t_enc is {t_enc} steps")

    if opt_save_to_disk_path is not None:
        session_out_path = os.path.join(opt_save_to_disk_path, req.session_id)
        os.makedirs(session_out_path, exist_ok=True)
    else:
        session_out_path = None

    seeds = ""
    with torch.no_grad():
        for n in trange(opt_n_iter, desc="Sampling"):
            for prompts in tqdm(data, desc="data"):

                with precision_scope("cuda"):
                    modelCS.to(device)
                    uc = None
                    if opt_scale != 1.0:
                        uc = modelCS.get_learned_conditioning(batch_size * [req.negative_prompt])
                    if isinstance(prompts, tuple):
                        prompts = list(prompts)

                    subprompts, weights = split_weighted_subprompts(prompts[0])
                    if len(subprompts) > 1:
                        # uc is None when guidance is disabled (scale == 1.0);
                        # zeros_like(None) would crash, so build a template of
                        # the right batch shape in that case.
                        # NOTE(review): assumes the conditioning shape does not
                        # depend on the prompt text - confirm for custom encoders.
                        template = uc if uc is not None else modelCS.get_learned_conditioning(batch_size * [""])
                        c = torch.zeros_like(template)
                        totalWeight = sum(weights)
                        # normalize each "sub prompt" and add it
                        for i in range(len(subprompts)):
                            weight = weights[i]
                            # if not skip_normalize:
                            weight = weight / totalWeight
                            c = torch.add(c, modelCS.get_learned_conditioning(subprompts[i]), alpha=weight)
                    else:
                        c = modelCS.get_learned_conditioning(prompts)

                    modelFS.to(device)

                    partial_x_samples = None
                    def img_callback(x_samples, i):
                        nonlocal partial_x_samples

                        partial_x_samples = x_samples

                        if req.stream_progress_updates:
                            n_steps = opt_ddim_steps if req.init_image is None else t_enc
                            progress = {"step": i, "total_steps": n_steps}

                            # every 5th step, also publish a preview of each image
                            if req.stream_image_progress and i % 5 == 0:
                                partial_images = []

                                for i in range(batch_size):
                                    x_samples_ddim = modelFS.decode_first_stage(x_samples[i].unsqueeze(0))
                                    x_sample = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
                                    x_sample = 255.0 * rearrange(x_sample[0].cpu().numpy(), "c h w -> h w c")
                                    x_sample = x_sample.astype(np.uint8)
                                    img = Image.fromarray(x_sample)
                                    buf = BytesIO()
                                    img.save(buf, format='JPEG')
                                    buf.seek(0)

                                    del img, x_sample, x_samples_ddim
                                    # don't delete x_samples, it is used in the code that called this callback

                                    temp_images[str(req.session_id) + '/' + str(i)] = buf
                                    partial_images.append({'path': f'/image/tmp/{req.session_id}/{i}'})

                                progress['output'] = partial_images

                            yield json.dumps(progress)

                        if stop_processing:
                            raise UserInitiatedStop("User requested that we stop processing")

                    # run the handler
                    try:
                        if handler == _txt2img:
                            x_samples = _txt2img(opt_W, opt_H, opt_n_samples, opt_ddim_steps, opt_scale, None, opt_C, opt_f, opt_ddim_eta, c, uc, opt_seed, img_callback, mask, opt_sampler_name)
                        else:
                            x_samples = _img2img(init_latent, t_enc, batch_size, opt_scale, c, uc, opt_ddim_steps, opt_ddim_eta, opt_seed, img_callback, mask)

                        # the handlers are generators: forward their progress updates
                        yield from x_samples

                        # the latest samples were captured by img_callback
                        x_samples = partial_x_samples
                    except UserInitiatedStop:
                        if partial_x_samples is None:
                            continue

                        x_samples = partial_x_samples

                    print("saving images")
                    for i in range(batch_size):

                        x_samples_ddim = modelFS.decode_first_stage(x_samples[i].unsqueeze(0))
                        x_sample = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
                        x_sample = 255.0 * rearrange(x_sample[0].cpu().numpy(), "c h w -> h w c")
                        x_sample = x_sample.astype(np.uint8)
                        img = Image.fromarray(x_sample)

                        has_filters =   (opt_use_face_correction is not None and opt_use_face_correction.startswith('GFPGAN')) or \
                                        (opt_use_upscale is not None and opt_use_upscale.startswith('RealESRGAN'))

                        return_orig_img = not has_filters or not opt_show_only_filtered

                        # a stopped task skips the filters, so always return the unfiltered image
                        if stop_processing:
                            return_orig_img = True

                        if opt_save_to_disk_path is not None:
                            prompt_flattened = filename_regex.sub('_', prompts[0])
                            prompt_flattened = prompt_flattened[:50]

                            img_id = str(uuid.uuid4())[-8:]

                            file_path = f"{prompt_flattened}_{img_id}"
                            img_out_path = os.path.join(session_out_path, f"{file_path}.{opt_format}")
                            meta_out_path = os.path.join(session_out_path, f"{file_path}.txt")

                            if return_orig_img:
                                save_image(img, img_out_path)

                            save_metadata(meta_out_path, prompts, opt_seed, opt_W, opt_H, opt_ddim_steps, opt_scale, opt_strength, opt_use_face_correction, opt_use_upscale, opt_sampler_name, req.negative_prompt, ckpt_file)

                        if return_orig_img:
                            img_data = img_to_base64_str(img, opt_format)
                            res_image_orig = ResponseImage(data=img_data, seed=opt_seed)
                            res.images.append(res_image_orig)

                            if opt_save_to_disk_path is not None:
                                res_image_orig.path_abs = img_out_path

                        del img

                        if has_filters and not stop_processing:
                            print('Applying filters..')

                            gc()
                            filters_applied = []

                            # both filters take and return the channels in reversed order
                            if opt_use_face_correction:
                                _, _, output = model_gfpgan.enhance(x_sample[:,:,::-1], has_aligned=False, only_center_face=False, paste_back=True)
                                x_sample = output[:,:,::-1]
                                filters_applied.append(opt_use_face_correction)

                            if opt_use_upscale:
                                output, _ = model_real_esrgan.enhance(x_sample[:,:,::-1])
                                x_sample = output[:,:,::-1]
                                filters_applied.append(opt_use_upscale)

                            filtered_image = Image.fromarray(x_sample)

                            filtered_img_data = img_to_base64_str(filtered_image, opt_format)
                            res_image_filtered = ResponseImage(data=filtered_img_data, seed=opt_seed)
                            res.images.append(res_image_filtered)

                            filters_applied = "_".join(filters_applied)

                            if opt_save_to_disk_path is not None:
                                filtered_img_out_path = os.path.join(session_out_path, f"{file_path}_{filters_applied}.{opt_format}")
                                save_image(filtered_image, filtered_img_out_path)
                                res_image_filtered.path_abs = filtered_img_out_path

                            del filtered_image

                        # each image of the batch is reported with its own seed
                        seeds += str(opt_seed) + ","
                        opt_seed += 1

                    move_fs_to_cpu()
                    gc()
                    del x_samples, x_samples_ddim, x_sample
                    print("memory_final = ", torch.cuda.memory_allocated() / 1e6)

    print('Task completed')

    yield json.dumps(res.json())
533
+
534
def save_image(img, img_out_path):
    """Save img to img_out_path on a best-effort basis.

    A failure to write the file is reported on stdout and otherwise ignored,
    so that a full disk or a bad path does not abort the whole task.
    """
    try:
        img.save(img_out_path)
    except Exception:
        # Deliberately broad (best effort), but unlike a bare "except:" this
        # lets KeyboardInterrupt / SystemExit through.
        print('could not save the file', traceback.format_exc())
539
+
540
def save_metadata(meta_out_path, prompts, opt_seed, opt_W, opt_H, opt_ddim_steps, opt_scale, opt_prompt_strength, opt_correct_face, opt_upscale, sampler_name, negative_prompt, ckpt_file):
    """Write the generation settings of one image to a text file.

    The first entry of prompts is written on the first line, followed by one
    "Name: value" line per setting. Writing is best effort: a failure is
    reported on stdout and otherwise ignored.
    """
    # "{ckpt_file}.ckpt" instead of ckpt_file + '.ckpt' so that a missing
    # checkpoint name (None) does not raise before anything is written.
    metadata = f"{prompts[0]}\nWidth: {opt_W}\nHeight: {opt_H}\nSeed: {opt_seed}\nSteps: {opt_ddim_steps}\nGuidance Scale: {opt_scale}\nPrompt Strength: {opt_prompt_strength}\nUse Face Correction: {opt_correct_face}\nUse Upscaling: {opt_upscale}\nSampler: {sampler_name}\nNegative Prompt: {negative_prompt}\nStable Diffusion Model: {ckpt_file}.ckpt"

    try:
        # Explicit UTF-8: prompts are free text and the platform default
        # encoding cannot represent every character everywhere.
        with open(meta_out_path, 'w', encoding='utf-8') as f:
            f.write(metadata)
    except (OSError, ValueError):
        print('could not save the file', traceback.format_exc())
548
+
549
def _txt2img(opt_W, opt_H, opt_n_samples, opt_ddim_steps, opt_scale, start_code, opt_C, opt_f, opt_ddim_eta, c, uc, opt_seed, img_callback, mask, sampler_name):
    """Sample latents from the conditioning c (text-to-image).

    This is a generator: it forwards whatever model.sample() yields. The
    latent shape is [opt_n_samples, opt_C, opt_H // opt_f, opt_W // opt_f].
    """
    shape = [opt_n_samples, opt_C, opt_H // opt_f, opt_W // opt_f]

    if device != "cpu":
        # Move the conditioning model off the GPU and wait for its memory to
        # be released before sampling starts. The wait is bounded so that it
        # cannot spin forever if the allocation never shrinks (for example
        # when the model was already on the CPU).
        mem = torch.cuda.memory_allocated() / 1e6
        modelCS.to("cpu")
        deadline = time.monotonic() + 30
        while torch.cuda.memory_allocated() / 1e6 >= mem and time.monotonic() < deadline:
            time.sleep(1)

    if sampler_name == 'ddim':
        model.make_schedule(ddim_num_steps=opt_ddim_steps, ddim_eta=opt_ddim_eta, verbose=False)

    samples_ddim = model.sample(
        S=opt_ddim_steps,
        conditioning=c,
        seed=opt_seed,
        shape=shape,
        verbose=False,
        unconditional_guidance_scale=opt_scale,
        unconditional_conditioning=uc,
        eta=opt_ddim_eta,
        x_T=start_code,
        img_callback=img_callback,
        mask=mask,
        sampler = sampler_name,
    )

    yield from samples_ddim
577
+
578
def _img2img(init_latent, t_enc, batch_size, opt_scale, c, uc, opt_ddim_steps, opt_ddim_eta, opt_seed, img_callback, mask):
    """Sample latents starting from an encoded init image (image-to-image).

    This is a generator: it forwards whatever model.sample() yields. The init
    latent is first noised up to step t_enc, then decoded with the 'ddim'
    sampler under the conditioning c.
    """
    # encode (scaled latent)
    encode_steps = torch.tensor([t_enc] * batch_size).to(device)
    z_enc = model.stochastic_encode(
        init_latent,
        encode_steps,
        opt_seed,
        opt_ddim_eta,
        opt_ddim_steps,
    )

    # the original latent is only passed along when a mask is used
    if mask is None:
        x_T = None
    else:
        x_T = init_latent

    # decode it
    yield from model.sample(
        t_enc,
        c,
        z_enc,
        unconditional_guidance_scale=opt_scale,
        unconditional_conditioning=uc,
        img_callback=img_callback,
        mask=mask,
        x_T=x_T,
        sampler = 'ddim'
    )
603
+
604
def move_fs_to_cpu():
    """Move the first-stage model (modelFS) to the CPU when running on a GPU.

    Waits for the allocated CUDA memory to drop before returning. The wait is
    bounded so that it cannot spin forever if the allocation never shrinks
    (for example when modelFS was already on the CPU).
    """
    if device != "cpu":
        mem = torch.cuda.memory_allocated() / 1e6
        modelFS.to("cpu")
        deadline = time.monotonic() + 30
        while torch.cuda.memory_allocated() / 1e6 >= mem and time.monotonic() < deadline:
            time.sleep(1)
610
+
611
def gc():
    """Release cached CUDA memory; does nothing when running on the CPU."""
    if device != 'cpu':
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
617
+
618
+ # internal
619
+
620
def chunk(it, size):
    """Return an iterator over consecutive tuples of up to size items of it.

    The last tuple may be shorter; iteration stops at the first empty tuple.
    """
    source = iter(it)

    def next_batch():
        return tuple(islice(source, size))

    # two-argument iter(): call next_batch() until it returns the sentinel ()
    return iter(next_batch, ())
623
+
624
+
625
def load_model_from_config(ckpt, verbose=False):
    """Read the checkpoint file ckpt and return its state dict.

    Checkpoints that wrap the weights in a "state_dict" entry are unwrapped;
    a checkpoint that already is a bare state dict is returned as is. The
    tensors are mapped to the CPU. verbose is accepted for compatibility and
    is not used.
    """
    print(f"Loading model from {ckpt}")
    # NOTE(security): torch.load unpickles the file, which can execute
    # arbitrary code. Only load checkpoints from a trusted source.
    pl_sd = torch.load(ckpt, map_location="cpu")
    if "global_step" in pl_sd:
        print(f"Global Step: {pl_sd['global_step']}")
    if "state_dict" in pl_sd:
        return pl_sd["state_dict"]
    return pl_sd
632
+
633
+ # utils
634
class UserInitiatedStop(Exception):
    """Raised by the sampling progress callback when the stop flag is set."""
636
+
637
def load_img(img_str, w0, h0):
    """Decode a base64 image string into a normalized image tensor.

    The image is converted to RGB and resized to (w0, h0), or to its own size
    when either is None, rounded down to a multiple of 64. The returned torch
    tensor has a leading batch axis of 1, channels first, and values scaled
    from [0, 255] to [-1, 1].
    """
    image = base64_str_to_img(img_str).convert("RGB")
    w, h = image.size
    print(f"loaded input image of size ({w}, {h}) from base64")
    if not (h0 is None or w0 is None):
        h, w = h0, w0

    # resize to integer multiple of 64
    w -= w % 64
    h -= h % 64
    resized = image.resize((w, h), resample=Image.Resampling.LANCZOS)

    pixels = np.array(resized).astype(np.float32) / 255.0
    # (h, w, c) -> (1, c, h, w)
    batch = torch.from_numpy(pixels[None].transpose(0, 3, 1, 2))
    return 2.*batch - 1.
650
+
651
def load_mask(mask_str, h0, w0, newH, newW, invert=False):
    """Decode a base64 mask string into a mask tensor of size (newH, newW).

    The mask is converted to RGB, optionally inverted, and resized to
    (newW, newH). The returned torch tensor has a leading batch axis of 1,
    channels first, and values scaled to [0, 1].

    h0 and w0 only influence the "New mask size" log line; the actual output
    size is given by newH and newW.
    """
    image = base64_str_to_img(mask_str).convert("RGB")
    w, h = image.size
    print(f"loaded input mask of size ({w}, {h})")

    if invert:
        print("inverted")
        image = ImageOps.invert(image)

    if not (h0 is None or w0 is None):
        h, w = h0, w0

    # round down to an integer multiple of 64 (reported only)
    w -= w % 64
    h -= h % 64

    print(f"New mask size ({w}, {h})")
    resized = image.resize((newW, newH), resample=Image.Resampling.LANCZOS)

    pixels = np.array(resized).astype(np.float32) / 255.0
    # (h, w, c) -> (1, c, h, w)
    return torch.from_numpy(pixels[None].transpose(0, 3, 1, 2))
675
+
676
+ # https://stackoverflow.com/a/61114178
677
def img_to_base64_str(img, output_format="PNG"):
    """Encode img in output_format and return it as a base64 data URL string.

    NOTE(review): the prefix is always "data:image/png;base64," whatever
    output_format is. base64_str_to_img strips a prefix of exactly that
    length, so the two must be changed together.
    """
    stream = BytesIO()
    img.save(stream, format=output_format)
    encoded = base64.b64encode(stream.getvalue()).decode()
    return "data:image/png;base64," + encoded
684
+
685
def base64_str_to_img(img_str):
    """Decode a base64 image string (optionally a data URL) into a PIL image.

    A "data:<mime>;base64," prefix of any MIME type is removed before
    decoding; a string without such a prefix is decoded as is. Previously a
    fixed number of characters (the length of the png prefix) was cut off,
    which corrupted jpeg / webp data URLs and raw base64 input.
    """
    if img_str.startswith("data:"):
        # in a data URL the payload is everything after the first comma
        _, _, img_str = img_str.partition(",")
    data = base64.b64decode(img_str)
    buffered = BytesIO(data)
    img = Image.open(buffered)
    return img
691
+
692
+
693
+
694
+
695
+
696
+
697
+
698
+
699
+
700
+
701
+
702
+
703
+
704
+
705
+
706
+
707
+ from fastapi import FastAPI, HTTPException
708
+ from fastapi.staticfiles import StaticFiles
709
+ from starlette.responses import FileResponse, StreamingResponse
710
+ from pydantic import BaseModel
711
+ import logging
712
+
713
+ from sd_internal import Request, Response
714
+
715
+ import json
716
+ import traceback
717
+
718
+ import sys
719
+ import os
720
+
721
+ SD_DIR = os.getcwd()
722
+ print('started in ', SD_DIR)
723
+
724
+ #SD_UI_DIR = os.getenv('SD_UI_PATH', None)
725
+ #sys.path.append(os.path.dirname(SD_UI_DIR))
726
+
727
+ #CONFIG_DIR = os.path.abspath(os.path.join(SD_UI_DIR, '..', 'scripts'))
728
+ MODELS_DIR = os.path.abspath(os.path.join(SD_DIR, '..', 'models'))
729
+
730
+ OUTPUT_DIRNAME = "Stable Diffusion UI" # in the user's home folder
731
+
732
+ app = FastAPI()
733
+
734
+ model_loaded = False
735
+ model_is_loading = False
736
+
737
+ modifiers_cache = None
738
+ outpath = os.path.join(os.path.expanduser("~"), OUTPUT_DIRNAME)
739
+
740
+ # defaults from https://huggingface.co/blog/stable_diffusion
741
class ImageRequest(BaseModel):
    """Request body describing one image generation task.

    The defaults are taken from https://huggingface.co/blog/stable_diffusion
    """

    # -- what to render
    session_id: str = "session"
    prompt: str = ""
    negative_prompt: str = ""
    init_image: str = None # base64
    mask: str = None # base64

    # -- sampling parameters
    num_outputs: int = 1
    num_inference_steps: int = 50
    guidance_scale: float = 7.5
    width: int = 512
    height: int = 512
    seed: int = 42
    prompt_strength: float = 0.8
    sampler: str = None # "ddim", "plms", "heun", "euler", "euler_a", "dpm2", "dpm2_a", "lms"
    # allow_nsfw: bool = False

    # -- runtime / model selection
    save_to_disk_path: str = None
    turbo: bool = True
    use_cpu: bool = False
    use_full_precision: bool = False
    use_face_correction: str = None # or "GFPGANv1.3"
    use_upscale: str = None # or "RealESRGAN_x4plus" or "RealESRGAN_x4plus_anime_6B"
    use_stable_diffusion_model: str = "sd-v1-4"
    show_only_filtered_image: bool = False
    output_format: str = "jpeg" # or "png"

    # -- streaming of intermediate results
    stream_progress_updates: bool = False
    stream_image_progress: bool = False
768
+
769
+ from starlette.responses import FileResponse, StreamingResponse
770
+
771
def resolve_model_to_use(model_name):
    """Return the path (without ".ckpt") of the checkpoint named model_name.

    Models live in MODELS_DIR/stable-diffusion. For the two built-in names
    ('sd-v1-4' and 'custom-model') the legacy location directly inside SD_DIR
    is used instead when the checkpoint exists only there.
    """
    model_path = os.path.join(MODELS_DIR, 'stable-diffusion', model_name)
    if model_name not in ('sd-v1-4', 'custom-model'):
        return model_path

    legacy_model_path = os.path.join(SD_DIR, model_name)
    if os.path.exists(model_path + '.ckpt'):
        return model_path
    if os.path.exists(legacy_model_path + '.ckpt'):
        return legacy_model_path
    return model_path
782
+
783
def image(req : ImageRequest):
    """Run one image generation task described by req.

    The API request is copied into an internal Request and handed to mk_img.
    With req.stream_progress_updates the generator is returned as a
    StreamingResponse; otherwise it is drained and only its last (final)
    JSON message is returned, decoded.

    Raises:
        HTTPException: status 500 when starting or draining the task fails.
    """
    r = Request()

    # fields that are copied verbatim from the API request
    copied_fields = (
        'session_id', 'prompt', 'negative_prompt', 'init_image', 'mask',
        'num_outputs', 'num_inference_steps', 'guidance_scale', 'width',
        'height', 'seed', 'prompt_strength', 'sampler',
        # 'allow_nsfw',
        'turbo', 'use_cpu', 'use_full_precision', 'save_to_disk_path',
        'use_upscale', 'use_face_correction', 'show_only_filtered_image',
        'output_format',
    )
    for field in copied_fields:
        setattr(r, field, getattr(req, field))

    r.stream_progress_updates = True # the underlying implementation only supports streaming
    r.stream_image_progress = req.stream_image_progress

    r.use_stable_diffusion_model = resolve_model_to_use(req.use_stable_diffusion_model)

    # NOTE(review): save_model_to_config is not defined anywhere in this file;
    # unless it is provided elsewhere this call raises NameError - confirm.
    save_model_to_config(req.use_stable_diffusion_model)

    try:
        if not req.stream_progress_updates:
            r.stream_image_progress = False

        res = mk_img(r)

        if req.stream_progress_updates:
            return StreamingResponse(res, media_type='application/json')
        else: # compatibility mode: buffer the streaming responses, and return the last one
            last_result = None

            for result in res:
                last_result = result

            return json.loads(last_result)
    except Exception as e:
        print(traceback.format_exc())
        # An HTTPException has to be raised: returning the instance would
        # hand the exception object to the caller as if it were the result.
        raise HTTPException(status_code=500, detail=str(e)) from e
833
+
834
+
835
def getConfig():
    """Return the parsed contents of CONFIG_DIR/config.json, or {}.

    An empty dict is returned when CONFIG_DIR is not defined in this module,
    when the file does not exist or cannot be read, or when it does not
    contain valid JSON.
    """
    # CONFIG_DIR may be absent from this module. Look it up explicitly
    # instead of relying on a blanket "except Exception" to hide the
    # resulting NameError.
    config_dir = globals().get('CONFIG_DIR')
    if config_dir is None:
        return {}

    config_json_path = os.path.join(config_dir, 'config.json')

    try:
        with open(config_json_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError):
        # OSError: missing / unreadable file. ValueError: invalid JSON or
        # undecodable bytes (json.JSONDecodeError, UnicodeDecodeError).
        return {}
846
+
847
+ # needs to support the legacy installations
848
def get_initial_model_to_load():
    """Return the path (without ".ckpt") of the checkpoint to load at startup.

    The default is SD_DIR/custom-model when that checkpoint exists, otherwise
    SD_DIR/sd-v1-4 (legacy installations). A model configured under
    config['model']['stable-diffusion'] takes precedence when its checkpoint
    file exists.
    """
    custom_weight_path = os.path.join(SD_DIR, 'custom-model.ckpt')
    default_name = "custom-model" if os.path.exists(custom_weight_path) else "sd-v1-4"
    ckpt_to_use = os.path.join(SD_DIR, default_name)

    config = getConfig()
    if not ('model' in config and 'stable-diffusion' in config['model']):
        return ckpt_to_use

    model_name = config['model']['stable-diffusion']
    model_path = resolve_model_to_use(model_name)

    if os.path.exists(model_path + '.ckpt'):
        return model_path

    print('Could not find the configured custom model at:', model_path + '.ckpt', '. Using the default one:', ckpt_to_use + '.ckpt')
    return ckpt_to_use
865
+
866
+
867
+ #model_is_loading = True
868
+ #load_model_ckpt(get_initial_model_to_load(), "cuda")
869
+ #model_loaded = True
870
+ #model_is_loading = False
871
+
872
+ #mk_img(ImageRequest)
app.bckp2.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import uvicorn
3
+
4
+ import json
5
+ import traceback
6
+
7
+ import sys
8
+ import os
9
+
10
SD_DIR = os.getcwd()
print('started in ', SD_DIR)

SD_UI_DIR = './ui'
#sys.path.append(os.path.dirname(SD_UI_DIR))

# CONFIG_DIR and MODELS_DIR are read by the config helpers (getConfig/setConfig,
# the /app_config endpoints) and the model helpers (resolve_model_to_use,
# getModels). They must be defined, otherwise those functions fail with
# NameError at call time.
CONFIG_DIR = os.path.abspath(os.path.join(SD_UI_DIR, '..', 'scripts'))
MODELS_DIR = os.path.abspath(os.path.join(SD_DIR, '..', 'models'))

OUTPUT_DIRNAME = "Stable Diffusion UI" # in the user's home folder
20
+
21
+ from fastapi import FastAPI, HTTPException
22
+ from fastapi.staticfiles import StaticFiles
23
+ from starlette.responses import FileResponse, StreamingResponse
24
+ from pydantic import BaseModel
25
+ import logging
26
+
27
+ from sd_internal import Request, Response
28
+
29
+ app = FastAPI()
30
+
31
+ model_loaded = False
32
+ model_is_loading = False
33
+
34
+ modifiers_cache = None
35
+ outpath = os.path.join(os.path.expanduser("~"), OUTPUT_DIRNAME)
36
+
37
+ # don't show access log entries for URLs that start with the given prefix
38
+ ACCESS_LOG_SUPPRESS_PATH_PREFIXES = ['/ping', '/modifier-thumbnails']
39
+
40
+ app.mount('/media', StaticFiles(directory=os.path.join(SD_UI_DIR, 'media/')), name="media")
41
+
42
+ # defaults from https://huggingface.co/blog/stable_diffusion
43
class ImageRequest(BaseModel):
    """Request body accepted by the ``POST /image`` endpoint.

    Most fields are copied one-to-one onto the internal ``Request`` object by
    the ``image`` handler.
    """
    session_id: str = "session"
    prompt: str = ""
    negative_prompt: str = ""
    # NOTE(review): the ``str = None`` fields below are effectively optional;
    # whether an explicit JSON null is accepted depends on the installed
    # pydantic version - confirm, and consider Optional[str].
    init_image: str = None # base64
    mask: str = None # base64
    num_outputs: int = 1
    num_inference_steps: int = 50
    guidance_scale: float = 7.5
    width: int = 512
    height: int = 512
    seed: int = 42
    prompt_strength: float = 0.8
    sampler: str = None # "ddim", "plms", "heun", "euler", "euler_a", "dpm2", "dpm2_a", "lms"
    # allow_nsfw: bool = False
    save_to_disk_path: str = None
    turbo: bool = True
    use_cpu: bool = False
    use_full_precision: bool = False
    use_face_correction: str = None # or "GFPGANv1.3"
    use_upscale: str = None # or "RealESRGAN_x4plus" or "RealESRGAN_x4plus_anime_6B"
    # Model name; the handler turns it into a checkpoint path via resolve_model_to_use().
    use_stable_diffusion_model: str = "sd-v1-4"
    show_only_filtered_image: bool = False
    output_format: str = "jpeg" # or "png"

    # When False the handler buffers the stream and returns only the last result.
    stream_progress_updates: bool = False
    # Forced to False by the handler when stream_progress_updates is False.
    stream_image_progress: bool = False
70
+
71
class SetAppConfigRequest(BaseModel):
    """Request body accepted by the ``POST /app_config`` endpoint."""
    # Written to config.json, config.bat and config.sh by setAppConfig().
    update_branch: str = "main"
73
+
74
@app.get('/')
def read_root():
    """Serve the UI's ``index.html`` with headers that disable client caching."""
    no_cache_headers = {
        "Cache-Control": "no-cache, no-store, must-revalidate",
        "Pragma": "no-cache",
        "Expires": "0",
    }
    index_path = os.path.join(SD_UI_DIR, 'index.html')
    return FileResponse(index_path, headers=no_cache_headers)
78
+
79
@app.get('/ping')
async def ping():
    """Health check that also triggers the initial model load on first call.

    Returns ``{'OK'}`` once the model is loaded and ``{'ERROR'}`` while a load
    is already in progress.

    Raises:
        HTTPException: status 500 when loading the model fails.
    """
    global model_loaded, model_is_loading

    if model_loaded:
        return {'OK'}

    if model_is_loading:
        return {'ERROR'}

    model_is_loading = True
    try:
        from sd_internal import runtime

        runtime.load_model_ckpt(ckpt_to_use=get_initial_model_to_load())

        model_loaded = True
        return {'OK'}
    except Exception as e:
        print(traceback.format_exc())
        # Raise (not return) so FastAPI actually produces a 500 response.
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Always clear the flag: otherwise a failed load would leave the server
        # reporting "loading" forever and never attempt another load.
        model_is_loading = False
103
+
104
+ # needs to support the legacy installations
105
def get_initial_model_to_load():
    """Return the checkpoint path (no ``.ckpt`` suffix) that should be loaded first.

    Falls back to ``custom-model`` (if ``custom-model.ckpt`` is present in
    ``SD_DIR``) or ``sd-v1-4``; a model recorded in the config wins when its
    checkpoint file exists.
    """
    has_legacy_custom = os.path.exists(os.path.join(SD_DIR, 'custom-model.ckpt'))
    fallback_name = "custom-model" if has_legacy_custom else "sd-v1-4"
    ckpt_to_use = os.path.join(SD_DIR, fallback_name)

    config = getConfig()
    if 'model' not in config or 'stable-diffusion' not in config['model']:
        return ckpt_to_use

    model_name = config['model']['stable-diffusion']
    model_path = resolve_model_to_use(model_name)
    if os.path.exists(model_path + '.ckpt'):
        return model_path

    # The configured checkpoint does not exist: keep the fallback and say so.
    print('Could not find the configured custom model at:', model_path + '.ckpt', '. Using the default one:', ckpt_to_use + '.ckpt')
    return ckpt_to_use
122
+
123
def resolve_model_to_use(model_name):
    """Map a model name to its checkpoint path (without the ``.ckpt`` suffix).

    Models live under ``MODELS_DIR/stable-diffusion``. For the two built-in
    names (``sd-v1-4`` and ``custom-model``) the legacy location in ``SD_DIR``
    is used when the checkpoint exists only there.
    """
    # NOTE(review): model_name is joined into a filesystem path unchanged;
    # callers pass a request-supplied value - confirm it is trusted.
    model_path = os.path.join(MODELS_DIR, 'stable-diffusion', model_name)
    if model_name not in ('sd-v1-4', 'custom-model'):
        return model_path

    legacy_model_path = os.path.join(SD_DIR, model_name)
    in_legacy_only = (os.path.exists(legacy_model_path + '.ckpt')
                      and not os.path.exists(model_path + '.ckpt'))
    return legacy_model_path if in_legacy_only else model_path
134
+
135
def save_model_to_config(model_name):
    """Record ``model_name`` as the active stable-diffusion model in the saved config."""
    cfg = getConfig()
    if 'model' not in cfg:
        cfg['model'] = {}

    model_section = cfg['model']
    model_section['stable-diffusion'] = model_name

    setConfig(cfg)
143
+
144
@app.post('/image')
def image(req : ImageRequest):
    """Generate image(s) for the given request.

    Copies the request fields onto an internal ``Request``, stores the chosen
    model in the config, and runs ``runtime.mk_img``. When
    ``req.stream_progress_updates`` is set the generator is streamed to the
    client; otherwise it is drained and only the last item is returned
    (parsed from JSON).

    Raises:
        HTTPException: status 500 when image generation fails.
    """
    from sd_internal import runtime

    r = Request()
    # Fields copied verbatim from the HTTP request onto the internal request.
    copied_fields = (
        'session_id', 'prompt', 'negative_prompt', 'init_image', 'mask',
        'num_outputs', 'num_inference_steps', 'guidance_scale',
        'width', 'height', 'seed', 'prompt_strength', 'sampler',
        # 'allow_nsfw',
        'turbo', 'use_cpu', 'use_full_precision', 'save_to_disk_path',
        'use_upscale', 'use_face_correction', 'show_only_filtered_image',
        'output_format',
    )
    for field_name in copied_fields:
        setattr(r, field_name, getattr(req, field_name))

    r.stream_progress_updates = True # the underlying implementation only supports streaming
    r.stream_image_progress = req.stream_image_progress

    r.use_stable_diffusion_model = resolve_model_to_use(req.use_stable_diffusion_model)

    save_model_to_config(req.use_stable_diffusion_model)

    try:
        if not req.stream_progress_updates:
            r.stream_image_progress = False

        res = runtime.mk_img(r)

        if req.stream_progress_updates:
            return StreamingResponse(res, media_type='application/json')

        # compatibility mode: buffer the streaming responses, and return the last one
        last_result = None
        for result in res:
            last_result = result

        return json.loads(last_result)
    except Exception as e:
        print(traceback.format_exc())
        # Raise (not return) so FastAPI actually produces a 500 response.
        raise HTTPException(status_code=500, detail=str(e)) from e
197
+
198
@app.get('/image/stop')
def stop():
    """Ask the runtime to stop the current image generation.

    Returns ``{'ERROR'}`` while the model is still loading, otherwise sets
    ``runtime.stop_processing`` and returns ``{'OK'}``.

    Raises:
        HTTPException: status 500 when the runtime cannot be reached.
    """
    try:
        if model_is_loading:
            return {'ERROR'}

        from sd_internal import runtime
        runtime.stop_processing = True

        return {'OK'}
    except Exception as e:
        print(traceback.format_exc())
        # Raise (not return) so FastAPI actually produces a 500 response.
        raise HTTPException(status_code=500, detail=str(e)) from e
211
+
212
@app.get('/image/tmp/{session_id}/{img_id}')
def get_image(session_id, img_id):
    """Stream a temporary image held in ``runtime.temp_images``.

    The buffer is looked up under the key ``"<session_id>/<img_id>"`` and
    rewound before streaming.

    Raises:
        HTTPException: status 404 when no image is stored under that key.
    """
    from sd_internal import runtime

    image_key = session_id + '/' + img_id
    try:
        buf = runtime.temp_images[image_key]
    except KeyError:
        # An unknown id is a client error, not a server failure.
        raise HTTPException(status_code=404, detail='No such image: ' + image_key) from None

    buf.seek(0)
    return StreamingResponse(buf, media_type='image/jpeg')
218
+
219
@app.post('/app_config')
async def setAppConfig(req : SetAppConfigRequest):
    """Persist the update branch to ``config.json``, ``config.bat`` and ``config.sh``.

    Other keys already stored in ``config.json`` (for example the ``model``
    section written by ``save_model_to_config``) are preserved.

    Raises:
        HTTPException: status 400 when ``update_branch`` contains characters
            outside ``[A-Za-z0-9._/-]`` or starts with ``-``; status 500 when
            writing the files fails.
    """
    branch = req.update_branch
    # The value is written verbatim into .bat/.sh files, so only accept plain
    # branch-name characters from the request body.
    is_safe = (
        bool(branch)
        and not branch.startswith('-')
        and all((ch.isascii() and ch.isalnum()) or ch in '._/-' for ch in branch)
    )
    if not is_safe:
        raise HTTPException(status_code=400, detail='Invalid update_branch name')

    try:
        # Merge into the existing config instead of replacing the whole file.
        config = getConfig()
        if not isinstance(config, dict):
            config = {}
        config['update_branch'] = branch

        contents_by_file = {
            'config.json': json.dumps(config),
            'config.bat': f'@set update_branch={branch}',
            'config.sh': f'export update_branch={branch}',
        }

        for file_name, content in contents_by_file.items():
            with open(os.path.join(CONFIG_DIR, file_name), 'w') as f:
                f.write(content)

        return {'OK'}
    except Exception as e:
        print(traceback.format_exc())
        # Raise (not return) so FastAPI actually produces a 500 response.
        raise HTTPException(status_code=500, detail=str(e)) from e
247
+
248
@app.get('/app_config')
def getAppConfig():
    """Return the parsed content of ``config.json`` from ``CONFIG_DIR``.

    Raises:
        HTTPException: status 500 when the file does not exist ("No config
            file") or cannot be read/parsed.
    """
    try:
        config_json_path = os.path.join(CONFIG_DIR, 'config.json')

        if os.path.exists(config_json_path):
            with open(config_json_path, 'r') as f:
                return json.load(f)
    except Exception as e:
        print(traceback.format_exc())
        # Raise (not return) so FastAPI actually produces a 500 response.
        raise HTTPException(status_code=500, detail=str(e)) from e

    # Raised outside the try block so the generic handler does not re-wrap it.
    raise HTTPException(status_code=500, detail="No config file")
261
+
262
def getConfig():
    """Read ``config.json`` from ``CONFIG_DIR`` and return its parsed content.

    Best-effort: a missing file, or any error while locating, reading or
    parsing it, yields an empty dict instead of an exception.
    """
    try:
        config_path = os.path.join(CONFIG_DIR, 'config.json')

        if os.path.exists(config_path):
            with open(config_path, 'r') as config_file:
                return json.load(config_file)

        return {}
    except Exception:
        return {}
273
+
274
def setConfig(config):
    """Write ``config`` as JSON to ``config.json`` in ``CONFIG_DIR``.

    Best-effort: failures are printed as a traceback and not propagated.
    Always returns ``None``.
    """
    try:
        config_json_path = os.path.join(CONFIG_DIR, 'config.json')

        with open(config_json_path, 'w') as f:
            return json.dump(config, f)
    except Exception:
        # `except Exception` rather than a bare `except`, so KeyboardInterrupt
        # and SystemExit are not swallowed.
        print(traceback.format_exc())
282
+
283
@app.get('/models')
def getModels():
    """List the available stable-diffusion models and the active one.

    Returns a dict of the form
    ``{'active': {'stable-diffusion': name}, 'options': {'stable-diffusion': [names]}}``.
    Options are ``sd-v1-4``, every ``*.ckpt`` in ``MODELS_DIR/stable-diffusion``
    and the legacy ``custom-model`` if present in ``SD_DIR``; each name is
    listed once. The active model is the configured one, else the legacy
    custom model, else ``sd-v1-4``.
    """
    active = 'sd-v1-4'
    options = ['sd-v1-4']

    def add_option(name):
        # Keep first-seen order while avoiding duplicate entries (for example
        # when sd-v1-4.ckpt itself is stored in the models folder).
        if name not in options:
            options.append(name)

    # custom models
    sd_models_dir = os.path.join(MODELS_DIR, 'stable-diffusion')
    if os.path.isdir(sd_models_dir):  # a missing folder just means no custom models
        for file in os.listdir(sd_models_dir):
            if file.endswith('.ckpt'):
                add_option(os.path.splitext(file)[0])

    # legacy
    custom_weight_path = os.path.join(SD_DIR, 'custom-model.ckpt')
    if os.path.exists(custom_weight_path):
        active = 'custom-model'
        add_option('custom-model')

    config = getConfig()
    if 'model' in config and 'stable-diffusion' in config['model']:
        active = config['model']['stable-diffusion']

    return {
        'active': {
            'stable-diffusion': active,
        },
        'options': {
            'stable-diffusion': options,
        },
    }
312
+
313
@app.get('/modifiers.json')
def read_modifiers():
    """Serve the UI's ``modifiers.json`` with headers that disable client caching."""
    no_cache_headers = {
        "Cache-Control": "no-cache, no-store, must-revalidate",
        "Pragma": "no-cache",
        "Expires": "0",
    }
    modifiers_path = os.path.join(SD_UI_DIR, 'modifiers.json')
    return FileResponse(modifiers_path, headers=no_cache_headers)
317
+
318
@app.get('/output_dir')
def read_home_dir():
    """Return the output directory (module-level ``outpath``) wrapped in a one-element set."""
    output_dir = outpath
    return {output_dir}
321
+
322
+ # don't log certain requests
323
class LogSuppressFilter(logging.Filter):
    """Logging filter that drops records mentioning a suppressed URL prefix."""

    def filter(self, record: logging.LogRecord) -> bool:
        """Return False when the message contains any entry of
        ``ACCESS_LOG_SUPPRESS_PATH_PREFIXES``, True otherwise."""
        message = record.getMessage()
        return not any(prefix in message for prefix in ACCESS_LOG_SUPPRESS_PATH_PREFIXES)
ldmlib/__pycache__/util.cpython-38.pyc ADDED
Binary file (6.08 kB). View file
 
ldmlib/data/__init__.py ADDED
File without changes
ldmlib/data/base.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import abstractmethod
2
+ from torch.utils.data import Dataset, ConcatDataset, ChainDataset, IterableDataset
3
+
4
+
5
class Txt2ImgIterableBaseDataset(IterableDataset):
    """Base interface for chainable text2img ``IterableDataset`` implementations.

    Stores the record count, the valid/sample ids and the image size;
    subclasses provide ``__iter__``.
    """

    def __init__(self, num_records=0, valid_ids=None, size=256):
        super().__init__()
        self.size = size
        self.num_records = num_records
        # sample_ids starts out as the very same object as valid_ids.
        self.valid_ids = self.sample_ids = valid_ids

        cls_name = self.__class__.__name__
        print(f'{cls_name} dataset contains {self.__len__()} examples.')

    def __len__(self):
        """Number of records as given at construction time."""
        return self.num_records

    @abstractmethod
    def __iter__(self):
        """To be implemented by subclasses."""
        pass
ldmlib/data/imagenet.py ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, yaml, pickle, shutil, tarfile, glob
2
+ import cv2
3
+ import albumentations
4
+ import PIL
5
+ import numpy as np
6
+ import torchvision.transforms.functional as TF
7
+ from omegaconf import OmegaConf
8
+ from functools import partial
9
+ from PIL import Image
10
+ from tqdm import tqdm
11
+ from torch.utils.data import Dataset, Subset
12
+
13
+ import taming.data.utils as tdu
14
+ from taming.data.imagenet import str_to_indices, give_synsets_from_indices, download, retrieve
15
+ from taming.data.imagenet import ImagePaths
16
+
17
+ from ldmlib.modules.image_degradation import degradation_fn_bsr, degradation_fn_bsr_light
18
+
19
+
20
def synset2idx(path_to_yaml="data/index_synset.yaml"):
    """Return the inverse of the mapping stored in the given YAML file.

    The file's top-level mapping is loaded and its keys and values are swapped
    (judging by the function and file names: index -> synset becomes
    synset -> index).

    :param path_to_yaml: path of the YAML file holding the mapping
    :return: dict mapping each value of the file to its key
    """
    with open(path_to_yaml) as f:
        # safe_load: yaml.load() without an explicit Loader is rejected by
        # PyYAML >= 6 and was unsafe before that.
        di2s = yaml.safe_load(f)
    return {v: k for k, v in di2s.items()}
24
+
25
+
26
class ImageNetBase(Dataset):
    """Shared ImageNet logic: metadata download, file-list filtering and labels.

    Subclasses implement ``_prepare``. The methods here read ``self.root``,
    ``self.datadir``, ``self.txt_filelist`` and ``self.random_crop``, so
    ``_prepare`` is expected to set them.
    """

    def __init__(self, config=None):
        self.config = config or OmegaConf.create()
        if not isinstance(self.config, dict):
            self.config = OmegaConf.to_container(self.config)
        self.keep_orig_class_label = self.config.get("keep_orig_class_label", False)
        # NOTE(review): this overrides any value a subclass stored in
        # self.process_images before calling super().__init__(). Left untouched
        # because ImageNetSR indexes items by "file_path_" - confirm before changing.
        self.process_images = True # if False we skip loading & processing images and self.data contains filepaths
        self._prepare()
        self._prepare_synset_to_human()
        self._prepare_idx_to_synset()
        self._prepare_human_to_integer_label()
        self._load()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

    def _prepare(self):
        """Set up paths and make sure the data is on disk (subclass hook)."""
        raise NotImplementedError()

    def _filter_relpaths(self, relpaths):
        """Drop ignored files and, if ``sub_indices`` is configured, keep only
        files whose leading path component is one of the selected synsets."""
        ignore = {
            "n06596364_9591.JPEG",
        }
        relpaths = [rpath for rpath in relpaths if not rpath.split("/")[-1] in ignore]
        if "sub_indices" not in self.config:
            return relpaths

        indices = str_to_indices(self.config["sub_indices"])
        synsets = give_synsets_from_indices(indices, path_to_yaml=self.idx2syn) # returns a list of strings
        self.synset2idx = synset2idx(path_to_yaml=self.idx2syn)
        wanted = set(synsets)  # set membership instead of scanning the list per path
        return [rpath for rpath in relpaths if rpath.split("/")[0] in wanted]

    def _prepare_synset_to_human(self):
        """Download ``synset_human.txt`` unless it exists with the expected size."""
        SIZE = 2655750
        URL = "https://heibox.uni-heidelberg.de/f/9f28e956cd304264bb82/?dl=1"
        self.human_dict = os.path.join(self.root, "synset_human.txt")
        if (not os.path.exists(self.human_dict) or
                not os.path.getsize(self.human_dict)==SIZE):
            download(URL, self.human_dict)

    def _prepare_idx_to_synset(self):
        """Download ``index_synset.yaml`` if it is missing."""
        URL = "https://heibox.uni-heidelberg.de/f/d835d5b6ceda4d3aa910/?dl=1"
        self.idx2syn = os.path.join(self.root, "index_synset.yaml")
        if (not os.path.exists(self.idx2syn)):
            download(URL, self.idx2syn)

    def _prepare_human_to_integer_label(self):
        """Download (if missing) and parse ``imagenet1000_clsidx_to_labels.txt``.

        Each line is split at ``:`` into an integer value and a key; the result
        is stored in ``self.human2integer_dict`` as key -> int.
        """
        URL = "https://heibox.uni-heidelberg.de/f/2362b797d5be43b883f6/?dl=1"
        self.human2integer = os.path.join(self.root, "imagenet1000_clsidx_to_labels.txt")
        if (not os.path.exists(self.human2integer)):
            download(URL, self.human2integer)
        with open(self.human2integer, "r") as f:
            lines = f.read().splitlines()
            assert len(lines) == 1000
            self.human2integer_dict = dict()
            for line in lines:
                value, key = line.split(":")
                self.human2integer_dict[key] = int(value)

    def _load(self):
        """Read the file list, build the label arrays and create ``self.data``."""
        with open(self.txt_filelist, "r") as f:
            self.relpaths = f.read().splitlines()
            l1 = len(self.relpaths)
            self.relpaths = self._filter_relpaths(self.relpaths)
            print("Removed {} files from filelist during filtering.".format(l1 - len(self.relpaths)))

        self.synsets = [p.split("/")[0] for p in self.relpaths]
        self.abspaths = [os.path.join(self.datadir, p) for p in self.relpaths]

        unique_synsets = np.unique(self.synsets)
        class_dict = dict((synset, i) for i, synset in enumerate(unique_synsets))
        if not self.keep_orig_class_label:
            self.class_labels = [class_dict[s] for s in self.synsets]
        else:
            # _filter_relpaths only builds self.synset2idx when "sub_indices" is
            # configured; build it here otherwise so this branch cannot fail
            # with AttributeError.
            if not hasattr(self, "synset2idx"):
                self.synset2idx = synset2idx(path_to_yaml=self.idx2syn)
            self.class_labels = [self.synset2idx[s] for s in self.synsets]

        with open(self.human_dict, "r") as f:
            human_dict = f.read().splitlines()
            human_dict = dict(line.split(maxsplit=1) for line in human_dict)

        self.human_labels = [human_dict[s] for s in self.synsets]

        labels = {
            "relpath": np.array(self.relpaths),
            "synsets": np.array(self.synsets),
            "class_label": np.array(self.class_labels),
            "human_label": np.array(self.human_labels),
        }

        if self.process_images:
            self.size = retrieve(self.config, "size", default=256)
            self.data = ImagePaths(self.abspaths,
                                   labels=labels,
                                   size=self.size,
                                   random_crop=self.random_crop,
                                   )
        else:
            self.data = self.abspaths
132
+
133
+
134
class ImageNetTrain(ImageNetBase):
    """ImageNet (ILSVRC2012) training split; fetches and unpacks the data on first use."""
    NAME = "ILSVRC2012_train"
    URL = "http://www.image-net.org/challenges/LSVRC/2012/"
    AT_HASH = "a306397ccf9c2ead27155983c254227c0fd938e2"
    FILES = [
        "ILSVRC2012_img_train.tar",
    ]
    SIZES = [
        147897477120,
    ]

    def __init__(self, process_images=True, data_root=None, **kwargs):
        self.process_images = process_images
        self.data_root = data_root
        super().__init__(**kwargs)

    def _prepare(self):
        """Set the dataset paths and, unless already marked prepared, obtain and
        extract the archive, extract the sub-tars and write the file list."""
        if self.data_root:
            self.root = os.path.join(self.data_root, self.NAME)
        else:
            cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
            self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)

        self.datadir = os.path.join(self.root, "data")
        self.txt_filelist = os.path.join(self.root, "filelist.txt")
        self.expected_length = 1281167
        self.random_crop = retrieve(self.config, "ImageNetTrain/random_crop",
                                    default=True)

        if tdu.is_prepared(self.root):
            return

        # prep
        print("Preparing dataset {} in {}".format(self.NAME, self.root))

        datadir = self.datadir
        if not os.path.exists(datadir):
            archive = os.path.join(self.root, self.FILES[0])
            archive_ok = os.path.exists(archive) and os.path.getsize(archive) == self.SIZES[0]
            if not archive_ok:
                import academictorrents as at
                atpath = at.get(self.AT_HASH, datastore=self.root)
                assert atpath == archive

            print("Extracting {} to {}".format(archive, datadir))
            os.makedirs(datadir, exist_ok=True)
            with tarfile.open(archive, "r:") as tar:
                tar.extractall(path=datadir)

            print("Extracting sub-tars.")
            for subpath in tqdm(sorted(glob.glob(os.path.join(datadir, "*.tar")))):
                subdir = subpath[:-len(".tar")]
                os.makedirs(subdir, exist_ok=True)
                with tarfile.open(subpath, "r:") as tar:
                    tar.extractall(path=subdir)

        # File list: sorted paths relative to datadir, one per line.
        found = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
        relative = sorted(os.path.relpath(p, start=datadir) for p in found)
        with open(self.txt_filelist, "w") as f:
            f.write("\n".join(relative) + "\n")

        tdu.mark_prepared(self.root)
195
+
196
+
197
class ImageNetValidation(ImageNetBase):
    """ImageNet (ILSVRC2012) validation split; fetches, unpacks and sorts the
    images into per-synset folders on first use."""
    NAME = "ILSVRC2012_validation"
    URL = "http://www.image-net.org/challenges/LSVRC/2012/"
    AT_HASH = "5d6d0df7ed81efd49ca99ea4737e0ae5e3a5f2e5"
    VS_URL = "https://heibox.uni-heidelberg.de/f/3e0f6e9c624e45f2bd73/?dl=1"
    FILES = [
        "ILSVRC2012_img_val.tar",
        "validation_synset.txt",
    ]
    SIZES = [
        6744924160,
        1950000,
    ]

    def __init__(self, process_images=True, data_root=None, **kwargs):
        self.data_root = data_root
        self.process_images = process_images
        super().__init__(**kwargs)

    def _prepare(self):
        """Set the dataset paths and, unless already marked prepared, obtain and
        extract the archive, move images into synset folders and write the
        file list."""
        if self.data_root:
            self.root = os.path.join(self.data_root, self.NAME)
        else:
            cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
            self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
        self.datadir = os.path.join(self.root, "data")
        self.txt_filelist = os.path.join(self.root, "filelist.txt")
        self.expected_length = 50000
        self.random_crop = retrieve(self.config, "ImageNetValidation/random_crop",
                                    default=False)

        if tdu.is_prepared(self.root):
            return

        # prep
        print("Preparing dataset {} in {}".format(self.NAME, self.root))

        datadir = self.datadir
        if not os.path.exists(datadir):
            archive = os.path.join(self.root, self.FILES[0])
            archive_ok = os.path.exists(archive) and os.path.getsize(archive) == self.SIZES[0]
            if not archive_ok:
                import academictorrents as at
                atpath = at.get(self.AT_HASH, datastore=self.root)
                assert atpath == archive

            print("Extracting {} to {}".format(archive, datadir))
            os.makedirs(datadir, exist_ok=True)
            with tarfile.open(archive, "r:") as tar:
                tar.extractall(path=datadir)

            vspath = os.path.join(self.root, self.FILES[1])
            vs_ok = os.path.exists(vspath) and os.path.getsize(vspath) == self.SIZES[1]
            if not vs_ok:
                download(self.VS_URL, vspath)

            # Each line of the synset file holds two whitespace-separated
            # fields: file name and synset.
            with open(vspath, "r") as f:
                synset_dict = dict(line.split() for line in f.read().splitlines())

            print("Reorganizing into synset folders")
            for synset in np.unique(list(synset_dict.values())):
                os.makedirs(os.path.join(datadir, synset), exist_ok=True)
            for file_name, synset in synset_dict.items():
                shutil.move(os.path.join(datadir, file_name), os.path.join(datadir, synset))

        # File list: sorted paths relative to datadir, one per line.
        found = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
        relative = sorted(os.path.relpath(p, start=datadir) for p in found)
        with open(self.txt_filelist, "w") as f:
            f.write("\n".join(relative) + "\n")

        tdu.mark_prepared(self.root)
269
+
270
+
271
+
272
class ImageNetSR(Dataset):
    def __init__(self, size=None,
                 degradation=None, downscale_f=4, min_crop_f=0.5, max_crop_f=1.,
                 random_crop=True):
        """
        ImageNet super-resolution dataset.

        Every item goes through, in order:
        1. a square crop (random or center) of side s taken from the image
        2. a resize of that crop to ``size`` using cv2 area interpolation
        3. the degradation function, which produces the low-resolution image

        :param size: target size after cropping; size / downscale_f must be an integer
        :param degradation: "bsrgan", "bsrgan_light", or one of the "cv_*" / "pil_*" interpolation names
        :param downscale_f: downsampling factor of the low-resolution image
        :param min_crop_f: lower bound of the crop factor c, where
            s = c * min_img_side_len and c is drawn uniformly between min_crop_f and max_crop_f
        :param max_crop_f: upper bound of the crop factor (must be <= 1)
        :param random_crop: random crop if True, center crop otherwise
        """
        self.base = self.get_base()
        assert size
        assert (size / downscale_f).is_integer()
        self.size = size
        self.LR_size = int(size / downscale_f)
        self.min_crop_f = min_crop_f
        self.max_crop_f = max_crop_f
        assert(max_crop_f <= 1.)
        self.center_crop = not random_crop

        self.image_rescaler = albumentations.SmallestMaxSize(max_size=size, interpolation=cv2.INTER_AREA)

        self.pil_interpolation = False # gets reset later if incase interp_op is from pillow

        bsr_variants = {
            "bsrgan": degradation_fn_bsr,
            "bsrgan_light": degradation_fn_bsr_light,
        }
        if degradation in bsr_variants:
            self.degradation_process = partial(bsr_variants[degradation], sf=downscale_f)
            return

        interpolation_table = {
            "cv_nearest": cv2.INTER_NEAREST,
            "cv_bilinear": cv2.INTER_LINEAR,
            "cv_bicubic": cv2.INTER_CUBIC,
            "cv_area": cv2.INTER_AREA,
            "cv_lanczos": cv2.INTER_LANCZOS4,
            "pil_nearest": PIL.Image.NEAREST,
            "pil_bilinear": PIL.Image.BILINEAR,
            "pil_bicubic": PIL.Image.BICUBIC,
            "pil_box": PIL.Image.BOX,
            "pil_hamming": PIL.Image.HAMMING,
            "pil_lanczos": PIL.Image.LANCZOS,
        }
        interpolation_fn = interpolation_table[degradation]

        self.pil_interpolation = degradation.startswith("pil_")

        if self.pil_interpolation:
            self.degradation_process = partial(TF.resize, size=self.LR_size, interpolation=interpolation_fn)
        else:
            self.degradation_process = albumentations.SmallestMaxSize(max_size=self.LR_size,
                                                                      interpolation=interpolation_fn)

    def __len__(self):
        return len(self.base)

    def __getitem__(self, i):
        """Return the base example extended with "image" and "LR_image", both
        float32 arrays computed as ``pixels / 127.5 - 1.0``."""
        example = self.base[i]
        image = Image.open(example["file_path_"])

        if image.mode != "RGB":
            image = image.convert("RGB")

        image = np.array(image).astype(np.uint8)

        # Crop side: a random fraction of the shorter image side.
        min_side_len = min(image.shape[:2])
        crop_factor = np.random.uniform(self.min_crop_f, self.max_crop_f, size=None)
        crop_side_len = int(min_side_len * crop_factor)

        crop_cls = albumentations.CenterCrop if self.center_crop else albumentations.RandomCrop
        self.cropper = crop_cls(height=crop_side_len, width=crop_side_len)

        image = self.cropper(image=image)["image"]
        image = self.image_rescaler(image=image)["image"]

        if self.pil_interpolation:
            # The PIL path works on a PIL image and converts back to an array.
            LR_image = self.degradation_process(PIL.Image.fromarray(image))
            LR_image = np.array(LR_image).astype(np.uint8)
        else:
            LR_image = self.degradation_process(image=image)["image"]

        example["image"] = (image/127.5 - 1.0).astype(np.float32)
        example["LR_image"] = (LR_image/127.5 - 1.0).astype(np.float32)

        return example
373
+
374
+
375
class ImageNetSRTrain(ImageNetSR):
    """Super-resolution dataset over a pickled index subset of ``ImageNetTrain``."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def get_base(self):
        """Return ``ImageNetTrain`` restricted to the indices stored in
        ``data/imagenet_train_hr_indices.p``."""
        # NOTE: pickle.load executes arbitrary code from the file; only use a trusted index file.
        with open("data/imagenet_train_hr_indices.p", "rb") as index_file:
            hr_indices = pickle.load(index_file)
        full_dataset = ImageNetTrain(process_images=False,)
        return Subset(full_dataset, hr_indices)
384
+
385
+
386
class ImageNetSRValidation(ImageNetSR):
    """Super-resolution dataset over a pickled index subset of ``ImageNetValidation``."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def get_base(self):
        """Return ``ImageNetValidation`` restricted to the indices stored in
        ``data/imagenet_val_hr_indices.p``."""
        # NOTE: pickle.load executes arbitrary code from the file; only use a trusted index file.
        with open("data/imagenet_val_hr_indices.p", "rb") as index_file:
            hr_indices = pickle.load(index_file)
        full_dataset = ImageNetValidation(process_images=False,)
        return Subset(full_dataset, hr_indices)
ldmlib/data/lsun.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import PIL
4
+ from PIL import Image
5
+ from torch.utils.data import Dataset
6
+ from torchvision import transforms
7
+
8
+
9
class LSUNBase(Dataset):
    """Image dataset driven by a text file of relative image paths.

    Each item is center-cropped to a square, optionally resized to ``size``,
    randomly flipped horizontally and returned as a float32 array computed as
    ``pixels / 127.5 - 1.0``.
    """

    def __init__(self,
                 txt_file,
                 data_root,
                 size=None,
                 interpolation="bicubic",
                 flip_p=0.5
                 ):
        """
        :param txt_file: text file with one relative image path per line
        :param data_root: directory the relative paths are joined onto
        :param size: side length to resize to, or None to keep the cropped size
        :param interpolation: "linear", "bilinear", "bicubic" or "lanczos"
        :param flip_p: probability of the random horizontal flip
        """
        self.data_paths = txt_file
        self.data_root = data_root
        with open(self.data_paths, "r") as f:
            self.image_paths = f.read().splitlines()
        self._length = len(self.image_paths)
        self.labels = {
            "relative_file_path_": list(self.image_paths),
            "file_path_": [os.path.join(self.data_root, l)
                           for l in self.image_paths],
        }

        self.size = size
        # Image.LINEAR was an alias of Image.BILINEAR and no longer exists in
        # Pillow >= 10. The table is built eagerly, so referencing it would
        # break every interpolation choice, not just "linear".
        self.interpolation = {"linear": PIL.Image.BILINEAR,
                              "bilinear": PIL.Image.BILINEAR,
                              "bicubic": PIL.Image.BICUBIC,
                              "lanczos": PIL.Image.LANCZOS,
                              }[interpolation]
        self.flip = transforms.RandomHorizontalFlip(p=flip_p)

    def __len__(self):
        return self._length

    def __getitem__(self, i):
        """Return a dict with the path labels of item ``i`` plus its "image" array."""
        example = dict((k, self.labels[k][i]) for k in self.labels)
        image = Image.open(example["file_path_"])
        if not image.mode == "RGB":
            image = image.convert("RGB")

        # default to score-sde preprocessing
        img = np.array(image).astype(np.uint8)
        crop = min(img.shape[0], img.shape[1])
        h, w, = img.shape[0], img.shape[1]
        # Center crop to a square whose side is the shorter image dimension.
        img = img[(h - crop) // 2:(h + crop) // 2,
              (w - crop) // 2:(w + crop) // 2]

        image = Image.fromarray(img)
        if self.size is not None:
            image = image.resize((self.size, self.size), resample=self.interpolation)

        image = self.flip(image)
        image = np.array(image).astype(np.uint8)
        example["image"] = (image / 127.5 - 1.0).astype(np.float32)
        return example
60
+
61
+
62
class LSUNChurchesTrain(LSUNBase):
    """LSUNBase over ``data/lsun/church_outdoor_train.txt`` with images in ``data/lsun/churches``."""

    def __init__(self, **kwargs):
        super().__init__(
            txt_file="data/lsun/church_outdoor_train.txt",
            data_root="data/lsun/churches",
            **kwargs,
        )
65
+
66
+
67
class LSUNChurchesValidation(LSUNBase):
    """LSUNBase over ``data/lsun/church_outdoor_val.txt`` with images in
    ``data/lsun/churches``; flipping is off by default."""

    def __init__(self, flip_p=0., **kwargs):
        super().__init__(
            txt_file="data/lsun/church_outdoor_val.txt",
            data_root="data/lsun/churches",
            flip_p=flip_p,
            **kwargs,
        )
71
+
72
+
73
class LSUNBedroomsTrain(LSUNBase):
    """LSUNBase over ``data/lsun/bedrooms_train.txt`` with images in ``data/lsun/bedrooms``."""

    def __init__(self, **kwargs):
        super().__init__(
            txt_file="data/lsun/bedrooms_train.txt",
            data_root="data/lsun/bedrooms",
            **kwargs,
        )
76
+
77
+
78
class LSUNBedroomsValidation(LSUNBase):
    """LSUNBase over ``data/lsun/bedrooms_val.txt`` with images in
    ``data/lsun/bedrooms``; flipping is off by default."""

    def __init__(self, flip_p=0.0, **kwargs):
        super().__init__(
            txt_file="data/lsun/bedrooms_val.txt",
            data_root="data/lsun/bedrooms",
            flip_p=flip_p,
            **kwargs,
        )
82
+
83
+
84
class LSUNCatsTrain(LSUNBase):
    """LSUNBase over ``data/lsun/cat_train.txt`` with images in ``data/lsun/cats``."""

    def __init__(self, **kwargs):
        super().__init__(
            txt_file="data/lsun/cat_train.txt",
            data_root="data/lsun/cats",
            **kwargs,
        )
87
+
88
+
89
class LSUNCatsValidation(LSUNBase):
    """LSUNBase over ``data/lsun/cat_val.txt`` with images in ``data/lsun/cats``;
    flipping is off by default."""

    def __init__(self, flip_p=0., **kwargs):
        super().__init__(
            txt_file="data/lsun/cat_val.txt",
            data_root="data/lsun/cats",
            flip_p=flip_p,
            **kwargs,
        )
ldmlib/lr_scheduler.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+
4
class LambdaWarmUpCosineScheduler:
    """
    Learning-rate multiplier with linear warm-up followed by cosine decay.

    note: use with a base_lr of 1.0
    """
    def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0):
        self.lr_warm_up_steps = warm_up_steps
        self.lr_start = lr_start
        self.lr_min = lr_min
        self.lr_max = lr_max
        self.lr_max_decay_steps = max_decay_steps
        self.last_lr = 0.
        self.verbosity_interval = verbosity_interval

    def schedule(self, n, **kwargs):
        """Return the multiplier for step ``n`` and remember it in ``last_lr``.

        Below ``warm_up_steps`` the value rises linearly from ``lr_start`` to
        ``lr_max``; afterwards it follows a cosine from ``lr_max`` down to
        ``lr_min``, reached at ``max_decay_steps`` and held from then on.
        """
        if self.verbosity_interval > 0 and n % self.verbosity_interval == 0:
            # Reports the multiplier of the previous call (last_lr not yet updated).
            print(f"current step: {n}, recent lr-multiplier: {self.last_lr}")

        warm_up = self.lr_warm_up_steps
        if n < warm_up:
            lr = (self.lr_max - self.lr_start) / warm_up * n + self.lr_start
        else:
            progress = (n - warm_up) / (self.lr_max_decay_steps - warm_up)
            progress = min(progress, 1.0)
            lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (
                    1 + np.cos(progress * np.pi))

        self.last_lr = lr
        return lr

    def __call__(self, n, **kwargs):
        return self.schedule(n,**kwargs)
34
+
35
+
36
class LambdaWarmUpCosineScheduler2:
    """
    Warm-up + cosine-decay multiplier over several consecutive cycles.

    supports repeated iterations, configurable via lists
    note: use with a base_lr of 1.0.
    """
    def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0):
        assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths)
        self.lr_warm_up_steps = warm_up_steps
        self.f_start = f_start
        self.f_min = f_min
        self.f_max = f_max
        self.cycle_lengths = cycle_lengths
        # cum_cycles[i] is the global step at which cycle i starts.
        self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths))
        self.last_f = 0.
        self.verbosity_interval = verbosity_interval

    def find_in_interval(self, n):
        """Return the index of the cycle that global step ``n`` falls into.

        A step exactly on a cycle boundary belongs to the earlier cycle. Steps
        beyond the end of the last cycle are assigned to the last cycle, so the
        schedule keeps working instead of failing on a ``None`` index.
        """
        for interval, cycle_end in enumerate(self.cum_cycles[1:]):
            if n <= cycle_end:
                return interval
        return len(self.cycle_lengths) - 1

    def schedule(self, n, **kwargs):
        """Return the multiplier for global step ``n`` and store it in ``last_f``.

        Within its cycle the step first warms up linearly from ``f_start`` to
        ``f_max``, then decays along a cosine to ``f_min`` (held once the cycle
        length is reached).
        """
        cycle = self.find_in_interval(n)
        n = n - self.cum_cycles[cycle]  # step relative to the start of the cycle
        if self.verbosity_interval > 0:
            if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, "
                                                       f"current cycle {cycle}")
        if n < self.lr_warm_up_steps[cycle]:
            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
            self.last_f = f
            return f
        else:
            t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle])
            t = min(t, 1.0)
            f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * (
                    1 + np.cos(t * np.pi))
            self.last_f = f
            return f

    def __call__(self, n, **kwargs):
        return self.schedule(n, **kwargs)
79
+
80
+
81
class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):
    """Cycle-based schedule with linear warm-up followed by linear (instead of cosine) decay."""

    def schedule(self, n, **kwargs):
        """Return the multiplier for global step ``n`` and store it in ``last_f``.

        Within its cycle the step warms up linearly from ``f_start`` to
        ``f_max``; after the warm-up the value is ``f_min`` plus
        ``(f_max - f_min)`` scaled by the fraction of the cycle that remains.
        """
        cycle = self.find_in_interval(n)
        n = n - self.cum_cycles[cycle]  # step relative to the start of the cycle
        if self.verbosity_interval > 0 and n % self.verbosity_interval == 0:
            print(f"current step: {n}, recent lr-multiplier: {self.last_f}, "
                  f"current cycle {cycle}")

        warm_up = self.lr_warm_up_steps[cycle]
        if n < warm_up:
            f = (self.f_max[cycle] - self.f_start[cycle]) / warm_up * n + self.f_start[cycle]
        else:
            f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (self.cycle_lengths[cycle] - n) / (self.cycle_lengths[cycle])

        self.last_f = f
        return f
98
+
ldmlib/models/__pycache__/autoencoder.cpython-38.pyc ADDED
Binary file (13.5 kB). View file
 
ldmlib/models/autoencoder.py ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import pytorch_lightning as pl
3
+ import torch.nn.functional as F
4
+ from contextlib import contextmanager
5
+
6
+ from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer
7
+
8
+ from ldmlib.modules.diffusionmodules.model import Encoder, Decoder
9
+ from ldmlib.modules.distributions.distributions import DiagonalGaussianDistribution
10
+
11
+ from ldmlib.util import instantiate_from_config
12
+
13
+
14
class VQModel(pl.LightningModule):
    """Vector-quantised autoencoder trained with a two-optimizer (autoencoder / discriminator) loss.

    Data flow: ``encoder`` -> ``quant_conv`` (1x1) -> ``quantize`` ->
    ``post_quant_conv`` (1x1) -> ``decoder``.

    ``self.learning_rate`` is read in ``configure_optimizers`` but never set
    in this class; it has to be assigned by the caller before training.
    """

    def __init__(self,
                 ddconfig,
                 lossconfig,
                 n_embed,
                 embed_dim,
                 ckpt_path=None,
                 ignore_keys=[],
                 image_key="image",
                 colorize_nlabels=None,
                 monitor=None,
                 batch_resize_range=None,
                 scheduler_config=None,
                 lr_g_factor=1.0,
                 remap=None,
                 sane_index_shape=False,  # tell vector quantizer to return indices as bhw
                 use_ema=False
                 ):
        """Build the sub-modules and optionally restore weights from ``ckpt_path``.

        ``ddconfig`` is passed as keyword arguments to both ``Encoder`` and
        ``Decoder`` and must contain ``"z_channels"``; ``lossconfig`` and
        ``scheduler_config`` are configs for ``instantiate_from_config``.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.n_embed = n_embed
        self.image_key = image_key
        self.encoder = Encoder(**ddconfig)
        self.decoder = Decoder(**ddconfig)
        self.loss = instantiate_from_config(lossconfig)
        self.quantize = VectorQuantizer(n_embed, embed_dim, beta=0.25,
                                        remap=remap,
                                        sane_index_shape=sane_index_shape)
        self.quant_conv = torch.nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
        if colorize_nlabels is not None:
            assert isinstance(colorize_nlabels, int)
            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
        if monitor is not None:
            self.monitor = monitor
        self.batch_resize_range = batch_resize_range
        if self.batch_resize_range is not None:
            print(f"{self.__class__.__name__}: Using per-batch resizing in range {batch_resize_range}.")

        self.use_ema = use_ema
        if self.use_ema:
            # LitEma is not imported at module level in this file, so bring it
            # into scope here; otherwise use_ema=True fails with a NameError.
            from ldmlib.modules.ema import LitEma
            self.model_ema = LitEma(self)
            print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")

        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
        self.scheduler_config = scheduler_config
        self.lr_g_factor = lr_g_factor

    @contextmanager
    def ema_scope(self, context=None):
        """Temporarily swap in the EMA weights (no-op when ``use_ema`` is False).

        The training weights are restored on exit, also when the body raises.
        ``context`` is only used as a prefix for the log messages.
        """
        if self.use_ema:
            self.model_ema.store(self.parameters())
            self.model_ema.copy_to(self)
            if context is not None:
                print(f"{context}: Switched to EMA weights")
        try:
            yield None
        finally:
            if self.use_ema:
                self.model_ema.restore(self.parameters())
                if context is not None:
                    print(f"{context}: Restored training weights")

    def init_from_ckpt(self, path, ignore_keys=list()):
        """Load ``state_dict`` from the checkpoint at ``path`` (non-strict).

        Every key starting with one of the prefixes in ``ignore_keys`` is
        dropped first. Each key is deleted at most once, so overlapping
        prefixes no longer raise ``KeyError`` on the second match.
        """
        sd = torch.load(path, map_location="cpu")["state_dict"]
        prefixes = tuple(ignore_keys)
        for k in list(sd.keys()):
            if k.startswith(prefixes):
                print("Deleting key {} from state_dict.".format(k))
                del sd[k]
        missing, unexpected = self.load_state_dict(sd, strict=False)
        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
        if len(missing) > 0:
            print(f"Missing Keys: {missing}")
        if len(unexpected) > 0:
            print(f"Unexpected Keys: {unexpected}")

    def on_train_batch_end(self, *args, **kwargs):
        """Update the EMA copy after every training batch when EMA is enabled."""
        if self.use_ema:
            self.model_ema(self)

    def encode(self, x):
        """Encode and quantise ``x``; returns the quantizer's ``(quant, emb_loss, info)``."""
        h = self.encode_to_prequant(x)
        quant, emb_loss, info = self.quantize(h)
        return quant, emb_loss, info

    def encode_to_prequant(self, x):
        """Encode ``x`` up to, but not including, the quantizer."""
        return self.quant_conv(self.encoder(x))

    def decode(self, quant):
        """Decode an already quantised latent."""
        return self.decoder(self.post_quant_conv(quant))

    def decode_code(self, code_b):
        """Decode codebook indices by looking them up with ``quantize.embed_code``."""
        quant_b = self.quantize.embed_code(code_b)
        return self.decode(quant_b)

    def forward(self, input, return_pred_indices=False):
        """Reconstruct ``input``.

        Returns ``(dec, diff)`` or, with ``return_pred_indices``,
        ``(dec, diff, ind)`` where ``ind`` is the third element of the
        quantizer's info tuple.
        """
        quant, diff, (_, _, ind) = self.encode(input)
        dec = self.decode(quant)
        if return_pred_indices:
            return dec, diff, ind
        return dec, diff

    def get_input(self, batch, k):
        """Fetch ``batch[k]`` as a float tensor with the last axis moved to position 1.

        A 3-d entry gets a trailing axis of size 1 first. When
        ``batch_resize_range`` is set, the batch is resized (bicubic) to the
        upper size for the first global steps and to a random multiple-of-16
        step within the range afterwards.
        """
        x = batch[k]
        if len(x.shape) == 3:
            x = x[..., None]
        x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
        if self.batch_resize_range is not None:
            # numpy is not imported at module level in this file.
            import numpy as np
            lower_size = self.batch_resize_range[0]
            upper_size = self.batch_resize_range[1]
            if self.global_step <= 4:
                # do the first few batches with max size to avoid later oom
                new_resize = upper_size
            else:
                # plain int so F.interpolate does not receive a numpy scalar
                new_resize = int(np.random.choice(np.arange(lower_size, upper_size + 16, 16)))
            if new_resize != x.shape[2]:
                x = F.interpolate(x, size=new_resize, mode="bicubic")
            x = x.detach()
        return x

    def training_step(self, batch, batch_idx, optimizer_idx):
        """Return the loss for optimizer 0 (autoencoder) or optimizer 1 (discriminator)."""
        # https://github.com/pytorch/pytorch/issues/37142
        # try not to fool the heuristics
        x = self.get_input(batch, self.image_key)
        xrec, qloss, ind = self(x, return_pred_indices=True)

        if optimizer_idx == 0:
            # autoencode
            aeloss, log_dict_ae = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
                                            last_layer=self.get_last_layer(), split="train",
                                            predicted_indices=ind)

            self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
            return aeloss

        if optimizer_idx == 1:
            # discriminator
            discloss, log_dict_disc = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
                                                last_layer=self.get_last_layer(), split="train")
            self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=True)
            return discloss

    def validation_step(self, batch, batch_idx):
        """Validate with the current weights and once more under ``ema_scope``."""
        log_dict = self._validation_step(batch, batch_idx)
        with self.ema_scope():
            # only run for its logging side effects ("val_ema/..." keys)
            self._validation_step(batch, batch_idx, suffix="_ema")
        return log_dict

    @staticmethod
    def _lightning_at_least(major, minor):
        """Return True when the installed pytorch_lightning is at least ``major.minor``.

        Replaces the former ``version.parse`` comparison: ``version`` was
        never imported in this file, so validation raised a NameError.
        """
        import re
        found = re.match(r"(\d+)\.(\d+)", pl.__version__)
        if found is None:
            return False
        return (int(found.group(1)), int(found.group(2))) >= (major, minor)

    def _validation_step(self, batch, batch_idx, suffix=""):
        """Compute and log both loss terms under the split name ``"val" + suffix``."""
        split = "val" + suffix
        x = self.get_input(batch, self.image_key)
        xrec, qloss, ind = self(x, return_pred_indices=True)
        aeloss, log_dict_ae = self.loss(qloss, x, xrec, 0,
                                        self.global_step,
                                        last_layer=self.get_last_layer(),
                                        split=split,
                                        predicted_indices=ind
                                        )

        discloss, log_dict_disc = self.loss(qloss, x, xrec, 1,
                                            self.global_step,
                                            last_layer=self.get_last_layer(),
                                            split=split,
                                            predicted_indices=ind
                                            )
        rec_loss = log_dict_ae[f"val{suffix}/rec_loss"]
        self.log(f"val{suffix}/rec_loss", rec_loss,
                 prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
        self.log(f"val{suffix}/aeloss", aeloss,
                 prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
        if self._lightning_at_least(1, 4):
            # rec_loss was logged explicitly above; presumably removed here so
            # the key is not logged a second time through log_dict.
            del log_dict_ae[f"val{suffix}/rec_loss"]
        self.log_dict(log_dict_ae)
        self.log_dict(log_dict_disc)
        # NOTE(review): this returns the bound method ``self.log_dict``, not a
        # dict of metrics. Kept unchanged because the value is handed back to
        # Lightning by ``validation_step``; confirm before changing it.
        return self.log_dict

    def configure_optimizers(self):
        """Create the autoencoder and discriminator Adam optimizers (plus optional schedulers).

        The autoencoder learning rate is ``lr_g_factor * learning_rate``;
        the discriminator uses ``learning_rate`` directly.
        """
        lr_d = self.learning_rate
        lr_g = self.lr_g_factor * self.learning_rate
        print("lr_d", lr_d)
        print("lr_g", lr_g)
        opt_ae = torch.optim.Adam(list(self.encoder.parameters()) +
                                  list(self.decoder.parameters()) +
                                  list(self.quantize.parameters()) +
                                  list(self.quant_conv.parameters()) +
                                  list(self.post_quant_conv.parameters()),
                                  lr=lr_g, betas=(0.5, 0.9))
        opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
                                    lr=lr_d, betas=(0.5, 0.9))

        if self.scheduler_config is None:
            return [opt_ae, opt_disc], []

        # LambdaLR is not imported at module level in this file.
        from torch.optim.lr_scheduler import LambdaLR
        schedule_fn = instantiate_from_config(self.scheduler_config).schedule

        print("Setting up LambdaLR scheduler...")
        schedulers = [
            {
                'scheduler': LambdaLR(optimizer, lr_lambda=schedule_fn),
                'interval': 'step',
                'frequency': 1
            }
            for optimizer in (opt_ae, opt_disc)
        ]
        return [opt_ae, opt_disc], schedulers

    def get_last_layer(self):
        """Return the weight of the decoder's output convolution."""
        return self.decoder.conv_out.weight

    def log_images(self, batch, only_inputs=False, plot_ema=False, **kwargs):
        """Collect inputs and reconstructions (optionally EMA reconstructions) for logging.

        Inputs with more than 3 channels are projected to RGB with ``to_rgb``
        for display. The EMA pass is fed the raw input rather than the
        already colourised one.
        """
        log = dict()
        x = self.get_input(batch, self.image_key)
        x = x.to(self.device)
        if only_inputs:
            log["inputs"] = x
            return log
        model_input = x
        needs_colorize = model_input.shape[1] > 3
        xrec, _ = self(model_input)
        if needs_colorize:
            # colorize with random projection
            assert xrec.shape[1] > 3
            x = self.to_rgb(x)
            xrec = self.to_rgb(xrec)
        log["inputs"] = x
        log["reconstructions"] = xrec
        if plot_ema:
            with self.ema_scope():
                xrec_ema, _ = self(model_input)
                if needs_colorize:
                    xrec_ema = self.to_rgb(xrec_ema)
                log["reconstructions_ema"] = xrec_ema
        return log

    def to_rgb(self, x):
        """Project a multi-channel map to 3 channels and rescale it to [-1, 1].

        Only valid when ``image_key == "segmentation"``. The 1x1 projection
        kernel is random and created on first use if no ``colorize`` buffer
        exists yet.
        """
        assert self.image_key == "segmentation"
        if not hasattr(self, "colorize"):
            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
        x = F.conv2d(x, weight=self.colorize)
        x = 2. * (x - x.min()) / (x.max() - x.min()) - 1.
        return x
262
+
263
+
264
class VQModelInterface(VQModel):
    """VQModel whose ``encode`` stops before the quantizer and whose ``decode`` applies it."""

    def __init__(self, embed_dim, *args, **kwargs):
        super().__init__(embed_dim=embed_dim, *args, **kwargs)
        self.embed_dim = embed_dim

    def encode(self, x):
        """Return the un-quantised latent (encoder followed by ``quant_conv``)."""
        return self.quant_conv(self.encoder(x))

    def decode(self, h, force_not_quantize=False):
        """Decode ``h``, passing it through the quantizer first unless ``force_not_quantize``."""
        if force_not_quantize:
            quant = h
        else:
            # also go through quantization layer
            quant, _emb_loss, _info = self.quantize(h)
        return self.decoder(self.post_quant_conv(quant))
283
+
284
+
285
class AutoencoderKL(pl.LightningModule):
    """Autoencoder whose encoder output parameterises a diagonal Gaussian posterior.

    Data flow: ``encoder`` -> ``quant_conv`` (1x1, produces the moments) ->
    ``DiagonalGaussianDistribution`` -> sample/mode -> ``post_quant_conv``
    (1x1) -> ``decoder``.

    ``self.learning_rate`` is read in ``configure_optimizers`` but never set
    in this class; it has to be assigned by the caller before training.
    """

    def __init__(self,
                 ddconfig,
                 lossconfig,
                 embed_dim,
                 ckpt_path=None,
                 ignore_keys=[],
                 image_key="image",
                 colorize_nlabels=None,
                 monitor=None,
                 ):
        """Build the sub-modules and optionally restore weights from ``ckpt_path``.

        ``ddconfig`` is passed as keyword arguments to ``Encoder`` and
        ``Decoder``; it must contain ``"z_channels"`` and a truthy
        ``"double_z"``.
        """
        super().__init__()
        self.image_key = image_key
        self.encoder = Encoder(**ddconfig)
        self.decoder = Decoder(**ddconfig)
        self.loss = instantiate_from_config(lossconfig)
        assert ddconfig["double_z"]
        self.quant_conv = torch.nn.Conv2d(2 * ddconfig["z_channels"], 2 * embed_dim, 1)
        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
        self.embed_dim = embed_dim
        if colorize_nlabels is not None:
            assert isinstance(colorize_nlabels, int)
            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
        if monitor is not None:
            self.monitor = monitor
        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)

    def init_from_ckpt(self, path, ignore_keys=list()):
        """Load ``state_dict`` from the checkpoint at ``path`` (non-strict).

        Every key starting with one of the prefixes in ``ignore_keys`` is
        dropped first. Each key is deleted at most once, so overlapping
        prefixes no longer raise ``KeyError`` on the second match.
        """
        sd = torch.load(path, map_location="cpu")["state_dict"]
        prefixes = tuple(ignore_keys)
        for k in list(sd.keys()):
            if k.startswith(prefixes):
                print("Deleting key {} from state_dict.".format(k))
                del sd[k]
        self.load_state_dict(sd, strict=False)
        print(f"Restored from {path}")

    def encode(self, x):
        """Return the posterior distribution for ``x``."""
        moments = self.quant_conv(self.encoder(x))
        return DiagonalGaussianDistribution(moments)

    def decode(self, z):
        """Decode a latent ``z``."""
        return self.decoder(self.post_quant_conv(z))

    def forward(self, input, sample_posterior=True):
        """Return ``(reconstruction, posterior)``; decodes a sample or the mode of the posterior."""
        posterior = self.encode(input)
        z = posterior.sample() if sample_posterior else posterior.mode()
        dec = self.decode(z)
        return dec, posterior

    def get_input(self, batch, k):
        """Fetch ``batch[k]`` as a float tensor with the last axis moved to position 1.

        A 3-d entry gets a trailing axis of size 1 first.
        """
        x = batch[k]
        if len(x.shape) == 3:
            x = x[..., None]
        x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
        return x

    def training_step(self, batch, batch_idx, optimizer_idx):
        """Return the loss for optimizer 0 (autoencoder) or optimizer 1 (discriminator)."""
        inputs = self.get_input(batch, self.image_key)
        reconstructions, posterior = self(inputs)

        if optimizer_idx == 0:
            # train encoder+decoder+logvar
            aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
                                            last_layer=self.get_last_layer(), split="train")
            self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
            self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False)
            return aeloss

        if optimizer_idx == 1:
            # train the discriminator
            discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
                                                last_layer=self.get_last_layer(), split="train")

            self.log("discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
            self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False)
            return discloss

    def validation_step(self, batch, batch_idx):
        """Compute and log both loss terms for the ``"val"`` split."""
        inputs = self.get_input(batch, self.image_key)
        reconstructions, posterior = self(inputs)
        aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
                                        last_layer=self.get_last_layer(), split="val")

        discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
                                            last_layer=self.get_last_layer(), split="val")

        self.log("val/rec_loss", log_dict_ae["val/rec_loss"])
        self.log_dict(log_dict_ae)
        self.log_dict(log_dict_disc)
        # NOTE(review): this returns the bound method ``self.log_dict``, not a
        # dict of metrics. Kept unchanged because the value goes back to
        # Lightning; confirm before changing it.
        return self.log_dict

    def configure_optimizers(self):
        """Create the autoencoder and discriminator Adam optimizers (same learning rate, no scheduler)."""
        lr = self.learning_rate
        opt_ae = torch.optim.Adam(list(self.encoder.parameters()) +
                                  list(self.decoder.parameters()) +
                                  list(self.quant_conv.parameters()) +
                                  list(self.post_quant_conv.parameters()),
                                  lr=lr, betas=(0.5, 0.9))
        opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
                                    lr=lr, betas=(0.5, 0.9))
        return [opt_ae, opt_disc], []

    def get_last_layer(self):
        """Return the weight of the decoder's output convolution."""
        return self.decoder.conv_out.weight

    @torch.no_grad()
    def log_images(self, batch, only_inputs=False, **kwargs):
        """Collect inputs and, unless ``only_inputs``, reconstructions and random samples for logging.

        ``"samples"`` are decoded from Gaussian noise shaped like a posterior
        sample. Inputs with more than 3 channels are projected to RGB.
        """
        log = dict()
        x = self.get_input(batch, self.image_key)
        x = x.to(self.device)
        if not only_inputs:
            xrec, posterior = self(x)
            if x.shape[1] > 3:
                # colorize with random projection
                assert xrec.shape[1] > 3
                x = self.to_rgb(x)
                xrec = self.to_rgb(xrec)
            log["samples"] = self.decode(torch.randn_like(posterior.sample()))
            log["reconstructions"] = xrec
        log["inputs"] = x
        return log

    def to_rgb(self, x):
        """Project a multi-channel map to 3 channels and rescale it to [-1, 1].

        Only valid when ``image_key == "segmentation"``. The 1x1 projection
        kernel is random and created on first use if no ``colorize`` buffer
        exists yet.
        """
        assert self.image_key == "segmentation"
        if not hasattr(self, "colorize"):
            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
        x = F.conv2d(x, weight=self.colorize)
        x = 2. * (x - x.min()) / (x.max() - x.min()) - 1.
        return x
424
+
425
+
426
class IdentityFirstStage(torch.nn.Module):
    """First-stage stand-in that passes its input through unchanged.

    With ``vq_interface=True``, ``quantize`` returns a 3-tuple
    ``(x, None, [None, None, None])`` instead of just ``x``.
    """

    def __init__(self, *args, vq_interface=False, **kwargs):
        super().__init__()
        # TODO: Should be true by default but check to not break older stuff
        self.vq_interface = vq_interface

    def encode(self, x, *args, **kwargs):
        """Identity."""
        return x

    def decode(self, x, *args, **kwargs):
        """Identity."""
        return x

    def quantize(self, x, *args, **kwargs):
        """Return ``x``, wrapped in the 3-tuple form when ``vq_interface`` is set."""
        if not self.vq_interface:
            return x
        return x, None, [None, None, None]

    def forward(self, x, *args, **kwargs):
        """Identity."""
        return x
ldmlib/models/diffusion/__init__.py ADDED
File without changes
ldmlib/models/diffusion/classifier.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import pytorch_lightning as pl
4
+ from omegaconf import OmegaConf
5
+ from torch.nn import functional as F
6
+ from torch.optim import AdamW
7
+ from torch.optim.lr_scheduler import LambdaLR
8
+ from copy import deepcopy
9
+ from einops import rearrange
10
+ from glob import glob
11
+ from natsort import natsorted
12
+
13
+ from ldmlib.modules.diffusionmodules.openaimodel import EncoderUNetModel, UNetModel
14
+ from ldmlib.util import log_txt_as_img, default, ismap, instantiate_from_config
15
+
16
# Maps a label key to the network class that NoisyLatentImageClassifier
# instantiates as its classifier (see load_classifier); label keys missing
# from this dict are rejected in the classifier's __init__.
__models__ = {
    'class_label': EncoderUNetModel,
    'segmentation': UNetModel
}
20
+
21
+
22
def disabled_train(self, mode=True):
    """Replacement for ``model.train`` that ignores ``mode`` and does nothing.

    Overwrite a model's ``train`` attribute with this function to make sure
    its train/eval mode does not change anymore. Returns ``self`` unchanged.
    """
    return self
26
+
27
+
28
class NoisyLatentImageClassifier(pl.LightningModule):
    """Classifier trained on noised latents produced by a frozen diffusion model.

    The diffusion model is instantiated from the newest ``*-project.yaml``
    under ``<diffusion_path>/configs``, put in eval mode and frozen. The
    classifier network is chosen from ``__models__`` by the label key.

    ``self.learning_rate`` is read in ``configure_optimizers`` but never set
    in this class; it has to be assigned by the caller before training.
    """

    def __init__(self,
                 diffusion_path,
                 num_classes,
                 ckpt_path=None,
                 pool='attention',
                 label_key=None,
                 diffusion_ckpt_path=None,
                 scheduler_config=None,
                 weight_decay=1.e-2,
                 log_steps=10,
                 monitor='val/loss',
                 *args,
                 **kwargs):
        """Load the diffusion model, resolve the label key and build the classifier.

        ``label_key`` is only used when the diffusion model has no
        ``cond_stage_key`` attribute. Raises ``NotImplementedError`` when the
        resolved key is not in ``__models__``.
        """
        super().__init__(*args, **kwargs)
        self.num_classes = num_classes
        # get latest config of diffusion model
        diffusion_config = natsorted(glob(os.path.join(diffusion_path, 'configs', '*-project.yaml')))[-1]
        self.diffusion_config = OmegaConf.load(diffusion_config).model
        self.diffusion_config.params.ckpt_path = diffusion_ckpt_path
        self.load_diffusion()

        self.monitor = monitor
        self.numd = self.diffusion_model.first_stage_model.encoder.num_resolutions - 1
        self.log_time_interval = self.diffusion_model.num_timesteps // log_steps
        self.log_steps = log_steps

        # the diffusion model's conditioning key wins over the explicit argument
        if hasattr(self.diffusion_model, 'cond_stage_key'):
            self.label_key = self.diffusion_model.cond_stage_key
        else:
            self.label_key = label_key

        assert self.label_key is not None, 'label_key neither in diffusion model nor in model.params'

        if self.label_key not in __models__:
            raise NotImplementedError()

        self.load_classifier(ckpt_path, pool)

        self.scheduler_config = scheduler_config
        self.use_scheduler = self.scheduler_config is not None
        self.weight_decay = weight_decay

    def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
        """Load weights from the checkpoint at ``path`` (non-strict).

        Accepts either a raw state dict or a dict with a ``"state_dict"``
        entry. Keys starting with a prefix in ``ignore_keys`` are dropped;
        each key is deleted at most once, so overlapping prefixes no longer
        raise ``KeyError``. With ``only_model`` the weights are loaded into
        ``self.model`` instead of the whole module.
        """
        sd = torch.load(path, map_location="cpu")
        if "state_dict" in list(sd.keys()):
            sd = sd["state_dict"]
        prefixes = tuple(ignore_keys)
        for k in list(sd.keys()):
            if k.startswith(prefixes):
                print("Deleting key {} from state_dict.".format(k))
                del sd[k]
        target = self.model if only_model else self
        missing, unexpected = target.load_state_dict(sd, strict=False)
        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
        if len(missing) > 0:
            print(f"Missing Keys: {missing}")
        if len(unexpected) > 0:
            print(f"Unexpected Keys: {unexpected}")

    def load_diffusion(self):
        """Instantiate the diffusion model, switch it to eval mode and freeze its parameters."""
        model = instantiate_from_config(self.diffusion_config)
        self.diffusion_model = model.eval()
        # keep it in eval mode even if .train() is called later
        self.diffusion_model.train = disabled_train
        for param in self.diffusion_model.parameters():
            param.requires_grad = False

    def load_classifier(self, ckpt_path, pool):
        """Build the classifier from a copy of the diffusion UNet config.

        Input channels are the UNet's output channels and output channels
        are ``num_classes``; ``pool`` is only set for ``'class_label'``.
        """
        unet_params = self.diffusion_config.params.unet_config.params
        model_config = deepcopy(unet_params)
        model_config.in_channels = unet_params.out_channels
        model_config.out_channels = self.num_classes
        if self.label_key == 'class_label':
            model_config.pool = pool

        self.model = __models__[self.label_key](**model_config)
        if ckpt_path is not None:
            print('#####################################################################')
            print(f'load from ckpt "{ckpt_path}"')
            print('#####################################################################')
            self.init_from_ckpt(ckpt_path)

    @torch.no_grad()
    def get_x_noisy(self, x, t, noise=None):
        """Noise ``x`` to timestep ``t`` with the diffusion model's ``q_sample``."""
        noise = default(noise, lambda: torch.randn_like(x))
        continuous_sqrt_alpha_cumprod = None
        if self.diffusion_model.use_continuous_noise:
            continuous_sqrt_alpha_cumprod = self.diffusion_model.sample_continuous_noise_level(x.shape[0], t + 1)
            # todo: make sure t+1 is correct here

        return self.diffusion_model.q_sample(x_start=x, t=t, noise=noise,
                                             continuous_sqrt_alpha_cumprod=continuous_sqrt_alpha_cumprod)

    def forward(self, x_noisy, t, *args, **kwargs):
        """Run the classifier on a noised input at timestep ``t``."""
        return self.model(x_noisy, t)

    @torch.no_grad()
    def get_input(self, batch, k):
        """Fetch ``batch[k]`` as a float tensor rearranged from ``b h w c`` to ``b c h w``.

        A 3-d entry gets a trailing axis of size 1 first.
        """
        x = batch[k]
        if len(x.shape) == 3:
            x = x[..., None]
        x = rearrange(x, 'b h w c -> b c h w')
        x = x.to(memory_format=torch.contiguous_format).float()
        return x

    @torch.no_grad()
    def get_conditioning(self, batch, k=None):
        """Return the targets ``batch[k]`` (default key: ``label_key``) on the module's device.

        Segmentation targets are rearranged to channel-first and halved in
        resolution ``self.numd`` times with nearest-neighbour interpolation.
        """
        if k is None:
            k = self.label_key
        assert k is not None, 'Needs to provide label key'

        targets = batch[k].to(self.device)

        if self.label_key == 'segmentation':
            targets = rearrange(targets, 'b h w c -> b c h w')
            for _ in range(self.numd):
                h, w = targets.shape[-2:]
                targets = F.interpolate(targets, size=(h // 2, w // 2), mode='nearest')

            # targets = rearrange(targets,'b c h w -> b h w c')

        return targets

    def compute_top_k(self, logits, labels, k, reduction="mean"):
        """Top-``k`` accuracy of ``logits`` against ``labels``.

        ``reduction="mean"`` returns a Python float, ``"none"`` the
        per-sample hit tensor; any other value returns ``None``.
        """
        _, top_ks = torch.topk(logits, k, dim=1)
        hits = (top_ks == labels[:, None]).float().sum(dim=-1)
        if reduction == "mean":
            return hits.mean().item()
        elif reduction == "none":
            return hits

    def on_train_epoch_start(self):
        """Move the diffusion model's inner ``model`` to the CPU."""
        # save some memory
        self.diffusion_model.model.to('cpu')

    @torch.no_grad()
    def write_logs(self, loss, logits, targets):
        """Log loss, top-1/top-5 accuracy, global step and current learning rate."""
        log_prefix = 'train' if self.training else 'val'
        log = {}
        log[f"{log_prefix}/loss"] = loss.mean()
        log[f"{log_prefix}/acc@1"] = self.compute_top_k(
            logits, targets, k=1, reduction="mean"
        )
        log[f"{log_prefix}/acc@5"] = self.compute_top_k(
            logits, targets, k=5, reduction="mean"
        )

        self.log_dict(log, prog_bar=False, logger=True, on_step=self.training, on_epoch=True)
        self.log('loss', log[f"{log_prefix}/loss"], prog_bar=True, logger=False)
        self.log('global_step', self.global_step, logger=False, on_epoch=False, prog_bar=True)
        lr = self.optimizers().param_groups[0]['lr']
        self.log('lr_abs', lr, on_step=True, logger=True, on_epoch=False, prog_bar=True)

    def shared_step(self, batch, t=None):
        """Noise the batch, classify it and return ``(loss, logits, x_noisy, targets)``.

        ``t`` is a single timestep applied to the whole batch; when it is
        ``None`` a random timestep is drawn per sample. 4-d targets are
        reduced to class indices with ``argmax`` over dim 1.
        """
        x, *_ = self.diffusion_model.get_input(batch, k=self.diffusion_model.first_stage_key)
        targets = self.get_conditioning(batch)
        if targets.dim() == 4:
            targets = targets.argmax(dim=1)
        if t is None:
            t = torch.randint(0, self.diffusion_model.num_timesteps, (x.shape[0],), device=self.device).long()
        else:
            t = torch.full(size=(x.shape[0],), fill_value=t, device=self.device).long()
        x_noisy = self.get_x_noisy(x, t)
        logits = self(x_noisy, t)

        loss = F.cross_entropy(logits, targets, reduction='none')

        self.write_logs(loss.detach(), logits.detach(), targets.detach())

        loss = loss.mean()
        return loss, logits, x_noisy, targets

    def training_step(self, batch, batch_idx):
        """Return the mean cross-entropy loss at random timesteps."""
        loss, *_ = self.shared_step(batch)
        return loss

    def reset_noise_accs(self):
        """Reset the per-timestep accuracy lists (one entry every ``log_every_t`` timesteps)."""
        self.noisy_acc = {t: {'acc@1': [], 'acc@5': []} for t in
                          range(0, self.diffusion_model.num_timesteps, self.diffusion_model.log_every_t)}

    def on_validation_start(self):
        """Clear the per-timestep accuracy accumulators before validation."""
        self.reset_noise_accs()

    @torch.no_grad()
    def validation_step(self, batch, batch_idx):
        """Return the validation loss and record top-1/top-5 accuracy at each tracked timestep."""
        loss, *_ = self.shared_step(batch)

        for t, accs in self.noisy_acc.items():
            _, logits, _, targets = self.shared_step(batch, t)
            accs['acc@1'].append(self.compute_top_k(logits, targets, k=1, reduction='mean'))
            accs['acc@5'].append(self.compute_top_k(logits, targets, k=5, reduction='mean'))

        return loss

    def configure_optimizers(self):
        """Create the AdamW optimizer for the classifier, plus a per-step LambdaLR if configured.

        Returns the bare optimizer when no scheduler is configured, otherwise
        ``([optimizer], [scheduler_dict])``.
        """
        optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)

        if not self.use_scheduler:
            return optimizer

        schedule_fn = instantiate_from_config(self.scheduler_config).schedule

        print("Setting up LambdaLR scheduler...")
        scheduler = [
            {
                'scheduler': LambdaLR(optimizer, lr_lambda=schedule_fn),
                'interval': 'step',
                'frequency': 1
            }]
        return [optimizer], scheduler

    @torch.no_grad()
    def log_images(self, batch, N=8, *args, **kwargs):
        """Collect inputs, labels and per-timestep predictions; every entry is cut to ``N`` items."""
        log = dict()
        x = self.get_input(batch, self.diffusion_model.first_stage_key)
        log['inputs'] = x

        y = self.get_conditioning(batch)

        if self.label_key == 'class_label':
            y = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"])
            log['labels'] = y

        if ismap(y):
            log['labels'] = self.diffusion_model.to_rgb(y)

            # NOTE(review): the per-timestep predictions are kept inside the
            # map branch because the 'b h w c' rearrange below only fits
            # per-pixel predictions - confirm against the upstream layout.
            for step in range(self.log_steps):
                current_time = step * self.log_time_interval

                _, logits, x_noisy, _ = self.shared_step(batch, t=current_time)

                log[f'inputs@t{current_time}'] = x_noisy

                pred = F.one_hot(logits.argmax(dim=1), num_classes=self.num_classes)
                pred = rearrange(pred, 'b h w c -> b c h w')

                log[f'pred@t{current_time}'] = self.diffusion_model.to_rgb(pred)

        for key in log:
            log[key] = log[key][:N]

        return log
ldmlib/models/diffusion/ddim.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SAMPLING ONLY."""
2
+
3
+ import torch
4
+ import numpy as np
5
+ from tqdm import tqdm
6
+ from functools import partial
7
+
8
+ from ldmlib.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like, \
9
+ extract_into_tensor
10
+
11
+
12
+ class DDIMSampler(object):
13
+ def __init__(self, model, schedule="linear", **kwargs):
14
+ super().__init__()
15
+ self.model = model
16
+ self.ddpm_num_timesteps = model.num_timesteps
17
+ self.schedule = schedule
18
+
19
def register_buffer(self, name, attr):
    """Store ``attr`` on the sampler under ``name``, moving tensors to the model's device.

    Tensors used to be forced onto ``torch.device("cuda")``, which raises on
    machines without CUDA. They now follow ``self.model.device``; if the
    model exposes no ``device`` attribute, CUDA is used when available and
    the CPU otherwise. Non-tensor values are stored unchanged.
    """
    if isinstance(attr, torch.Tensor):
        target = getattr(self.model, "device", None)
        if target is None:
            target = "cuda" if torch.cuda.is_available() else "cpu"
        # .to() returns the tensor itself when it is already on the target device
        attr = attr.to(torch.device(target))
    setattr(self, name, attr)
24
+
25
def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
    """Pick the DDIM timesteps and register all schedule-dependent buffers.

    Registers the model's ``betas`` / ``alphas_cumprod`` (and quantities
    derived from them) as float32 copies, then the DDIM ``sigmas``,
    ``alphas`` and ``alphas_prev`` for the selected timesteps, and finally
    the sigmas for the original (full) number of steps.
    """
    self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize,
                                              num_ddim_timesteps=ddim_num_steps,
                                              num_ddpm_timesteps=self.ddpm_num_timesteps,
                                              verbose=verbose)
    alphas_cumprod = self.model.alphas_cumprod
    assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'

    def to_torch(x):
        # detached float32 copy on the model's device
        return x.clone().detach().to(torch.float32).to(self.model.device)

    self.register_buffer('betas', to_torch(self.model.betas))
    self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
    self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))

    # calculations for diffusion q(x_t | x_{t-1}) and others
    acp_cpu = alphas_cumprod.cpu()
    derived = (
        ('sqrt_alphas_cumprod', np.sqrt(acp_cpu)),
        ('sqrt_one_minus_alphas_cumprod', np.sqrt(1. - acp_cpu)),
        ('log_one_minus_alphas_cumprod', np.log(1. - acp_cpu)),
        ('sqrt_recip_alphas_cumprod', np.sqrt(1. / acp_cpu)),
        ('sqrt_recipm1_alphas_cumprod', np.sqrt(1. / acp_cpu - 1)),
    )
    for buffer_name, value in derived:
        self.register_buffer(buffer_name, to_torch(value))

    # ddim sampling parameters
    ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(
        alphacums=acp_cpu,
        ddim_timesteps=self.ddim_timesteps,
        eta=ddim_eta, verbose=verbose)
    self.register_buffer('ddim_sigmas', ddim_sigmas)
    self.register_buffer('ddim_alphas', ddim_alphas)
    self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
    self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))

    # uses the buffers registered above (not the model's originals)
    prev, cur = self.alphas_cumprod_prev, self.alphas_cumprod
    sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
        (1 - prev) / (1 - cur) * (1 - cur / prev))
    self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
55
+
56
@torch.no_grad()
def sample(self,
           S,
           batch_size,
           shape,
           conditioning=None,
           callback=None,
           normals_sequence=None,
           img_callback=None,
           quantize_x0=False,
           eta=0.,
           mask=None,
           x0=None,
           temperature=1.,
           noise_dropout=0.,
           score_corrector=None,
           corrector_kwargs=None,
           verbose=True,
           x_T=None,
           log_every_t=100,
           unconditional_guidance_scale=1.,
           unconditional_conditioning=None,
           # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
           **kwargs
           ):
    """Run DDIM sampling with ``S`` steps and return ``(samples, intermediates)``.

    ``shape`` is ``(C, H, W)``; the sampled batch has size
    ``(batch_size, C, H, W)``. A warning is printed (nothing is raised) when
    the conditioning's leading dimension differs from ``batch_size``; for a
    dict conditioning only its first entry is checked. The schedule is
    rebuilt on every call, and all remaining options are forwarded to
    ``ddim_sampling``. ``normals_sequence`` and extra ``kwargs`` are unused.
    """
    if conditioning is not None:
        if isinstance(conditioning, dict):
            first_key = list(conditioning.keys())[0]
            cbs = conditioning[first_key].shape[0]
            if cbs != batch_size:
                print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
        elif conditioning.shape[0] != batch_size:
            print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")

    self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)

    # sampling
    C, H, W = shape
    size = (batch_size, C, H, W)
    print(f'Data shape for DDIM sampling is {size}, eta {eta}')

    sampling_options = dict(
        callback=callback,
        img_callback=img_callback,
        quantize_denoised=quantize_x0,
        mask=mask, x0=x0,
        ddim_use_original_steps=False,
        noise_dropout=noise_dropout,
        temperature=temperature,
        score_corrector=score_corrector,
        corrector_kwargs=corrector_kwargs,
        x_T=x_T,
        log_every_t=log_every_t,
        unconditional_guidance_scale=unconditional_guidance_scale,
        unconditional_conditioning=unconditional_conditioning,
    )
    samples, intermediates = self.ddim_sampling(conditioning, size, **sampling_options)
    return samples, intermediates
112
+
113
@torch.no_grad()
def ddim_sampling(self, cond, shape,
                  x_T=None, ddim_use_original_steps=False,
                  callback=None, timesteps=None, quantize_denoised=False,
                  mask=None, x0=None, img_callback=None, log_every_t=100,
                  temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
                  unconditional_guidance_scale=1., unconditional_conditioning=None,):
    """Run the reverse sampling loop, starting from ``x_T`` or fresh Gaussian noise.

    When ``mask`` is given, ``x0`` must be given too: at every step the
    current image is blended with ``self.model.q_sample(x0, ts)`` using
    ``mask`` as the weight of the noised original.

    :return: ``(img, intermediates)`` where ``intermediates`` is a dict with
        the lists ``'x_inter'`` and ``'pred_x0'`` (both seeded with the
        starting image).
    """
    device = self.model.betas.device
    batch = shape[0]
    img = torch.randn(shape, device=device) if x_T is None else x_T

    if timesteps is None:
        timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
    elif not ddim_use_original_steps:
        # keep only a leading subset of the DDIM timestep table
        n_ddim = self.ddim_timesteps.shape[0]
        subset_end = int(min(timesteps / n_ddim, 1) * n_ddim) - 1
        timesteps = self.ddim_timesteps[:subset_end]

    intermediates = {'x_inter': [img], 'pred_x0': [img]}
    if ddim_use_original_steps:
        # here `timesteps` is a plain step count
        time_range = reversed(range(0, timesteps))
        total_steps = timesteps
    else:
        # here `timesteps` is an array of timestep values
        time_range = np.flip(timesteps)
        total_steps = timesteps.shape[0]
    print(f"Running DDIM Sampling with {total_steps} timesteps")

    progress = tqdm(time_range, desc='DDIM Sampler', total=total_steps)
    last_index = total_steps - 1

    for i, step in enumerate(progress):
        # `index` counts down while `i` counts up
        index = last_index - i
        ts = torch.full((batch,), step, device=device, dtype=torch.long)

        if mask is not None:
            assert x0 is not None
            img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
            img = img_orig * mask + (1. - mask) * img

        img, pred_x0 = self.p_sample_ddim(
            img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
            quantize_denoised=quantize_denoised, temperature=temperature,
            noise_dropout=noise_dropout, score_corrector=score_corrector,
            corrector_kwargs=corrector_kwargs,
            unconditional_guidance_scale=unconditional_guidance_scale,
            unconditional_conditioning=unconditional_conditioning)
        if callback:
            callback(i)
        if img_callback:
            img_callback(pred_x0, i)

        if index % log_every_t == 0 or index == last_index:
            intermediates['x_inter'].append(img)
            intermediates['pred_x0'].append(pred_x0)

    return img, intermediates
164
+
165
@torch.no_grad()
def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
                  temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
                  unconditional_guidance_scale=1., unconditional_conditioning=None):
    """Perform one DDIM update step.

    :param x: current noisy sample; its first dimension is the batch size.
    :param c: conditioning passed through to ``self.model.apply_model``.
    :param t: timestep tensor passed to the model.
    :param index: position used to look up the per-step schedule values.
    :param corrector_kwargs: extra keyword arguments for
        ``score_corrector.modify_score``; ``None`` means no extra arguments.
    :return: ``(x_prev, pred_x0)``.
    """
    b, *_, device = *x.shape, x.device

    if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
        e_t = self.model.apply_model(x, t, c)
    else:
        # evaluate unconditional and conditional branches in one batched call
        x_in = torch.cat([x] * 2)
        t_in = torch.cat([t] * 2)
        c_in = torch.cat([unconditional_conditioning, c])
        e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
        e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)

    if score_corrector is not None:
        assert self.model.parameterization == "eps"
        # corrector_kwargs defaults to None, which cannot be **-unpacked
        e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **(corrector_kwargs or {}))

    if use_original_steps:
        alphas = self.model.alphas_cumprod
        alphas_prev = self.model.alphas_cumprod_prev
        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod
        # this buffer is registered on the sampler itself (self.register_buffer),
        # not on the wrapped model
        sigmas = self.ddim_sigmas_for_original_num_steps
    else:
        alphas = self.ddim_alphas
        alphas_prev = self.ddim_alphas_prev
        sqrt_one_minus_alphas = self.ddim_sqrt_one_minus_alphas
        sigmas = self.ddim_sigmas

    # select parameters corresponding to the currently considered timestep
    a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
    a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
    sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
    sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device)

    # current prediction for x_0
    pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
    if quantize_denoised:
        pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
    # direction pointing to x_t
    dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
    noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
    if noise_dropout > 0.:
        noise = torch.nn.functional.dropout(noise, p=noise_dropout)
    x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
    return x_prev, pred_x0
205
+
206
@torch.no_grad()
def stochastic_encode(self, x0, t, use_original_steps=False, noise=None):
    """Noise ``x0`` in a single step using the schedule entries selected by ``t``.

    Fast, but does not allow for exact reconstruction.  ``t`` serves as an
    index to gather the correct alphas.  Fresh Gaussian noise is drawn when
    ``noise`` is ``None``.
    """
    if use_original_steps:
        signal_scale = self.sqrt_alphas_cumprod
        noise_scale = self.sqrt_one_minus_alphas_cumprod
    else:
        signal_scale = torch.sqrt(self.ddim_alphas)
        noise_scale = self.ddim_sqrt_one_minus_alphas

    if noise is None:
        noise = torch.randn_like(x0)

    scaled_signal = extract_into_tensor(signal_scale, t, x0.shape) * x0
    scaled_noise = extract_into_tensor(noise_scale, t, x0.shape) * noise
    return scaled_signal + scaled_noise
221
+
222
@torch.no_grad()
def decode(self, x_latent, cond, t_start, unconditional_guidance_scale=1.0, unconditional_conditioning=None,
           use_original_steps=False):
    """Denoise ``x_latent`` by walking the first ``t_start`` timesteps in reverse.

    Each step calls ``p_sample_ddim`` and keeps only the updated sample.

    :return: the sample after the last step.
    """
    if use_original_steps:
        schedule = np.arange(self.ddpm_num_timesteps)
    else:
        schedule = self.ddim_timesteps
    schedule = schedule[:t_start]

    total_steps = schedule.shape[0]
    print(f"Running DDIM Sampling with {total_steps} timesteps")

    batch = x_latent.shape[0]
    x_dec = x_latent
    progress = tqdm(np.flip(schedule), desc='Decoding image', total=total_steps)
    for i, step in enumerate(progress):
        # schedule index counts down while the loop counter counts up
        index = total_steps - i - 1
        ts = torch.full((batch,), step, device=x_latent.device, dtype=torch.long)
        x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps,
                                      unconditional_guidance_scale=unconditional_guidance_scale,
                                      unconditional_conditioning=unconditional_conditioning)
    return x_dec
ldmlib/models/diffusion/ddpm.py ADDED
@@ -0,0 +1,1445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ wild mixture of
3
+ https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
4
+ https://github.com/openai/improved-diffusion/blob/e94489283bb876ac1477d5dd7709bbbd2d9902ce/improved_diffusion/gaussian_diffusion.py
5
+ https://github.com/CompVis/taming-transformers
6
+ -- merci
7
+ """
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import numpy as np
12
+ import pytorch_lightning as pl
13
+ from torch.optim.lr_scheduler import LambdaLR
14
+ from einops import rearrange, repeat
15
+ from contextlib import contextmanager
16
+ from functools import partial
17
+ from tqdm import tqdm
18
+ from torchvision.utils import make_grid
19
+ from pytorch_lightning.utilities.distributed import rank_zero_only
20
+
21
+ from ldmlib.util import log_txt_as_img, exists, default, ismap, isimage, mean_flat, count_params, instantiate_from_config
22
+ from ldmlib.modules.ema import LitEma
23
+ from ldmlib.modules.distributions.distributions import normal_kl, DiagonalGaussianDistribution
24
+ from ldmlib.models.autoencoder import VQModelInterface, IdentityFirstStage, AutoencoderKL
25
+ from ldmlib.modules.diffusionmodules.util import make_beta_schedule, extract_into_tensor, noise_like
26
+ from ldmlib.models.diffusion.ddim import DDIMSampler
27
+
28
+
29
# Lookup table from conditioning-mode name to an argument name.
# NOTE(review): presumably the keyword the wrapped diffusion model expects for
# each mode -- confirm against DiffusionWrapper.
__conditioning_keys__ = dict(
    concat='c_concat',
    crossattn='c_crossattn',
    adm='y',
)
32
+
33
+
34
def disabled_train(self, mode=True):
    """No-op stand-in for ``model.train``.

    Assign this over a model's ``train`` attribute so that later attempts to
    switch between train and eval mode leave the model untouched.  ``mode``
    is ignored and the first argument is returned unchanged.
    """
    return self
38
+
39
+
40
def uniform_on_device(r1, r2, shape, device):
    """Draw a tensor of the given ``shape`` on ``device``, uniform between ``r2`` and ``r1``.

    The sample is ``(r1 - r2) * U + r2`` with ``U`` from ``torch.rand``.
    """
    span = r1 - r2
    unit_noise = torch.rand(*shape, device=device)
    return span * unit_noise + r2
42
+
43
+
44
+ class DDPM(pl.LightningModule):
45
+ # classic DDPM with Gaussian diffusion, in image space
46
def __init__(self,
             unet_config,
             timesteps=1000,
             beta_schedule="linear",
             loss_type="l2",
             ckpt_path=None,
             ignore_keys=None,
             load_only_unet=False,
             monitor="val/loss",
             use_ema=True,
             first_stage_key="image",
             image_size=256,
             channels=3,
             log_every_t=100,
             clip_denoised=True,
             linear_start=1e-4,
             linear_end=2e-2,
             cosine_s=8e-3,
             given_betas=None,
             original_elbo_weight=0.,
             v_posterior=0.,  # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta
             l_simple_weight=1.,
             conditioning_key=None,
             parameterization="eps",  # all assuming fixed variance schedules
             scheduler_config=None,
             use_positional_encodings=False,
             learn_logvar=False,
             logvar_init=0.,
             ):
    """Store hyper-parameters, wrap the denoising network and register the noise schedule.

    :param unet_config: config handed to ``DiffusionWrapper`` together with
        ``conditioning_key``.
    :param ckpt_path: optional checkpoint restored through ``init_from_ckpt``
        before the schedule is registered.
    :param ignore_keys: key prefixes dropped from the checkpoint; ``None``
        (the default) means "drop nothing".
    :param load_only_unet: restore only ``self.model`` from the checkpoint.
    :param parameterization: ``"eps"`` or ``"x0"``; anything else fails the
        assertion below.
    :param learn_logvar: when true, ``self.logvar`` becomes a trainable
        ``nn.Parameter``.
    """
    super().__init__()
    assert parameterization in ["eps", "x0"], 'currently only supporting "eps" and "x0"'
    # a None default avoids sharing one list object between all instances
    ignore_keys = [] if ignore_keys is None else list(ignore_keys)
    self.parameterization = parameterization
    print(f"{self.__class__.__name__}: Running in {self.parameterization}-prediction mode")
    self.cond_stage_model = None
    self.clip_denoised = clip_denoised
    self.log_every_t = log_every_t
    self.first_stage_key = first_stage_key
    self.image_size = image_size  # try conv?
    self.channels = channels
    self.use_positional_encodings = use_positional_encodings
    self.model = DiffusionWrapper(unet_config, conditioning_key)
    count_params(self.model, verbose=True)
    self.use_ema = use_ema
    if self.use_ema:
        self.model_ema = LitEma(self.model)
        print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")

    self.use_scheduler = scheduler_config is not None
    if self.use_scheduler:
        self.scheduler_config = scheduler_config

    # these three are read by register_schedule / p_losses, so set them first
    self.v_posterior = v_posterior
    self.original_elbo_weight = original_elbo_weight
    self.l_simple_weight = l_simple_weight

    if monitor is not None:
        self.monitor = monitor
    if ckpt_path is not None:
        self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys, only_model=load_only_unet)

    self.register_schedule(given_betas=given_betas, beta_schedule=beta_schedule, timesteps=timesteps,
                           linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s)

    self.loss_type = loss_type

    self.learn_logvar = learn_logvar
    self.logvar = torch.full(fill_value=logvar_init, size=(self.num_timesteps,))
    if self.learn_logvar:
        self.logvar = nn.Parameter(self.logvar, requires_grad=True)
115
+
116
+
117
def register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000,
                      linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
    """Compute the noise schedule and register every derived quantity as a buffer.

    ``given_betas`` is used as-is when provided; otherwise the betas come
    from ``make_beta_schedule``.  ``self.num_timesteps`` is taken from the
    length of the betas, so it may differ from the ``timesteps`` argument
    when ``given_betas`` is supplied.

    :raises AssertionError: if the cumulative alphas do not cover every
        timestep, or if any VLB weight is NaN.
    """
    if exists(given_betas):
        betas = given_betas
    else:
        betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end,
                                   cosine_s=cosine_s)
    alphas = 1. - betas
    alphas_cumprod = np.cumprod(alphas, axis=0)
    alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])

    timesteps, = betas.shape
    self.num_timesteps = int(timesteps)
    self.linear_start = linear_start
    self.linear_end = linear_end
    assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep'

    to_torch = partial(torch.tensor, dtype=torch.float32)

    self.register_buffer('betas', to_torch(betas))
    self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
    self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev))

    # calculations for diffusion q(x_t | x_{t-1}) and others
    self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
    self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
    self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod)))
    self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod)))
    self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1)))

    # calculations for posterior q(x_{t-1} | x_t, x_0)
    posterior_variance = (1 - self.v_posterior) * betas * (1. - alphas_cumprod_prev) / (
            1. - alphas_cumprod) + self.v_posterior * betas
    # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
    self.register_buffer('posterior_variance', to_torch(posterior_variance))
    # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
    self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20))))
    self.register_buffer('posterior_mean_coef1', to_torch(
        betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)))
    self.register_buffer('posterior_mean_coef2', to_torch(
        (1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod)))

    if self.parameterization == "eps":
        lvlb_weights = self.betas ** 2 / (
                2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod))
    elif self.parameterization == "x0":
        # stay in torch instead of applying a NumPy ufunc to a tensor
        alphas_cumprod_t = to_torch(alphas_cumprod)
        # NOTE(review): the denominator `2. * 1 - a` evaluates to `2 - a`;
        # kept unchanged, but confirm `2 * (1 - a)` was not intended.
        lvlb_weights = 0.5 * torch.sqrt(alphas_cumprod_t) / (2. * 1 - alphas_cumprod_t)
    else:
        raise NotImplementedError("mu not supported")
    # TODO how to choose this term
    lvlb_weights[0] = lvlb_weights[1]
    self.register_buffer('lvlb_weights', lvlb_weights, persistent=False)
    # a single NaN weight already poisons the loss (even 0 * NaN is NaN),
    # so reject any NaN rather than only the all-NaN case
    assert not torch.isnan(self.lvlb_weights).any()
170
+
171
@contextmanager
def ema_scope(self, context=None):
    """Context manager that temporarily swaps the EMA weights into ``self.model``.

    When ``self.use_ema`` is set, the current parameters are stored and the
    EMA weights are copied in on entry; the stored parameters are restored
    on exit, even if the body raises.  With ``use_ema`` unset this is a
    no-op.  ``context`` is only a label for the printed messages.
    """
    announce = context is not None

    if self.use_ema:
        self.model_ema.store(self.model.parameters())
        self.model_ema.copy_to(self.model)
        if announce:
            print(f"{context}: Switched to EMA weights")
    try:
        yield None
    finally:
        if self.use_ema:
            self.model_ema.restore(self.model.parameters())
            if announce:
                print(f"{context}: Restored training weights")
185
+
186
def init_from_ckpt(self, path, ignore_keys=(), only_model=False):
    """Restore weights from the checkpoint at ``path`` (non-strict load).

    :param path: checkpoint file; if it holds a ``"state_dict"`` entry, that
        entry is used as the state dict.
    :param ignore_keys: iterable of key prefixes; every entry whose key
        starts with one of them is dropped before loading.
    :param only_model: load into ``self.model`` instead of ``self``.
    """
    # NOTE(review): torch.load unpickles the file -- only use trusted checkpoints.
    sd = torch.load(path, map_location="cpu")
    if "state_dict" in sd:
        sd = sd["state_dict"]

    prefixes = tuple(ignore_keys or ())
    for k in list(sd.keys()):
        # one combined prefix test, so a key matching several prefixes is
        # deleted exactly once (a second `del` would raise KeyError)
        if prefixes and k.startswith(prefixes):
            print("Deleting key {} from state_dict.".format(k))
            del sd[k]

    target = self.model if only_model else self
    missing, unexpected = target.load_state_dict(sd, strict=False)
    print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
    if len(missing) > 0:
        print(f"Missing Keys: {missing}")
    if len(unexpected) > 0:
        print(f"Unexpected Keys: {unexpected}")
203
+
204
def q_mean_variance(self, x_start, t):
    """
    Get the distribution q(x_t | x_0).
    :param x_start: the [N x C x ...] tensor of noiseless inputs.
    :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
    :return: A tuple (mean, variance, log_variance), all of x_start's shape.
    """
    target_shape = x_start.shape
    mean = extract_into_tensor(self.sqrt_alphas_cumprod, t, target_shape) * x_start
    variance = extract_into_tensor(1.0 - self.alphas_cumprod, t, target_shape)
    log_variance = extract_into_tensor(self.log_one_minus_alphas_cumprod, t, target_shape)
    return mean, variance, log_variance
215
+
216
def predict_start_from_noise(self, x_t, t, noise):
    """Recover the estimate of x_0 from a noisy sample ``x_t`` and a noise estimate.

    Computes ``sqrt_recip_alphas_cumprod[t] * x_t - sqrt_recipm1_alphas_cumprod[t] * noise``.
    """
    sample_scale = extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape)
    noise_scale = extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
    return sample_scale * x_t - noise_scale * noise
221
+
222
def q_posterior(self, x_start, x_t, t):
    """Return mean, variance and clipped log-variance of the posterior q(x_{t-1} | x_t, x_0).

    The mean is a linear blend of ``x_start`` and ``x_t`` using the
    registered ``posterior_mean_coef1`` / ``posterior_mean_coef2`` buffers.
    """
    shape = x_t.shape
    start_weight = extract_into_tensor(self.posterior_mean_coef1, t, shape)
    current_weight = extract_into_tensor(self.posterior_mean_coef2, t, shape)
    posterior_mean = start_weight * x_start + current_weight * x_t
    posterior_variance = extract_into_tensor(self.posterior_variance, t, shape)
    posterior_log_variance_clipped = extract_into_tensor(self.posterior_log_variance_clipped, t, shape)
    return posterior_mean, posterior_variance, posterior_log_variance_clipped
230
+
231
def p_mean_variance(self, x, t, clip_denoised: bool):
    """Run the model on ``x`` and return the posterior statistics for the step.

    The model output is interpreted according to ``self.parameterization``
    ("eps": predicted noise, "x0": predicted clean sample).  With
    ``clip_denoised`` the reconstruction is clamped in place to [-1, 1].

    :return: ``(model_mean, posterior_variance, posterior_log_variance)``.
    """
    prediction = self.model(x, t)
    if self.parameterization == "eps":
        x_recon = self.predict_start_from_noise(x, t=t, noise=prediction)
    elif self.parameterization == "x0":
        x_recon = prediction

    if clip_denoised:
        x_recon.clamp_(-1., 1.)

    mean, variance, log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
    return mean, variance, log_variance
242
+
243
@torch.no_grad()
def p_sample(self, x, t, clip_denoised=True, repeat_noise=False):
    """Draw one reverse-process sample from the model's predicted mean and log-variance.

    Noise is added everywhere except for batch entries whose timestep is 0.
    """
    batch = x.shape[0]
    model_mean, _, model_log_variance = self.p_mean_variance(x=x, t=t, clip_denoised=clip_denoised)
    noise = noise_like(x.shape, x.device, repeat_noise)
    # no noise when t == 0; mask is shaped (batch, 1, ..., 1) to broadcast over x
    mask_shape = (batch,) + (1,) * (len(x.shape) - 1)
    nonzero_mask = (1 - (t == 0).float()).reshape(mask_shape)
    std = (0.5 * model_log_variance).exp()
    return model_mean + nonzero_mask * std * noise
251
+
252
@torch.no_grad()
def p_sample_loop(self, shape, return_intermediates=False):
    """Sample by iterating ``p_sample`` from the last timestep down to 0.

    Starts from Gaussian noise of the given ``shape``.  Snapshots are kept
    every ``self.log_every_t`` steps and at the first iteration.

    :return: the final image, or ``(image, intermediates)`` when
        ``return_intermediates`` is true.
    """
    device = self.betas.device
    batch = shape[0]
    img = torch.randn(shape, device=device)
    intermediates = [img]

    last_step = self.num_timesteps - 1
    countdown = reversed(range(0, self.num_timesteps))
    for i in tqdm(countdown, desc='Sampling t', total=self.num_timesteps):
        t = torch.full((batch,), i, device=device, dtype=torch.long)
        img = self.p_sample(img, t, clip_denoised=self.clip_denoised)
        if i % self.log_every_t == 0 or i == last_step:
            intermediates.append(img)

    if not return_intermediates:
        return img
    return img, intermediates
266
+
267
@torch.no_grad()
def sample(self, batch_size=16, return_intermediates=False):
    """Sample ``batch_size`` square images of ``self.channels`` x ``self.image_size`` x ``self.image_size``.

    Thin wrapper around ``p_sample_loop``; its return value is passed through.
    """
    side = self.image_size
    shape = (batch_size, self.channels, side, side)
    return self.p_sample_loop(shape, return_intermediates=return_intermediates)
273
+
274
def q_sample(self, x_start, t, noise=None):
    """Noise ``x_start`` to timestep ``t``.

    Returns ``sqrt_alphas_cumprod[t] * x_start + sqrt_one_minus_alphas_cumprod[t] * noise``;
    Gaussian noise is drawn when ``noise`` is not supplied.
    """
    noise = default(noise, lambda: torch.randn_like(x_start))
    signal_scale = extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape)
    noise_scale = extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape)
    return signal_scale * x_start + noise_scale * noise
278
+
279
def get_loss(self, pred, target, mean=True):
    """Compute the L1 or L2 loss between ``pred`` and ``target``.

    :param mean: reduce to a scalar mean when true, otherwise return the
        element-wise loss.
    :raises NotImplementedError: if ``self.loss_type`` is neither ``'l1'``
        nor ``'l2'``; the message names the offending loss type.
    """
    if self.loss_type == 'l1':
        loss = (target - pred).abs()
        if mean:
            loss = loss.mean()
    elif self.loss_type == 'l2':
        reduction = 'mean' if mean else 'none'
        loss = torch.nn.functional.mse_loss(target, pred, reduction=reduction)
    else:
        # the original message lacked the f-prefix and referenced a
        # non-existent local, so it printed the placeholder literally
        raise NotImplementedError(f"unknown loss type '{self.loss_type}'")

    return loss
293
+
294
def p_losses(self, x_start, t, noise=None):
    """Compute the training loss for clean samples ``x_start`` at timesteps ``t``.

    The input is noised with ``q_sample``, passed through the model, and
    compared against the noise ("eps") or the clean input ("x0").

    :return: ``(loss, loss_dict)`` where ``loss_dict`` holds the
        ``loss_simple``, ``loss_vlb`` and ``loss`` entries, prefixed with
        ``train/`` or ``val/`` depending on ``self.training``.
    """
    noise = default(noise, lambda: torch.randn_like(x_start))
    x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
    model_out = self.model(x_noisy, t)

    if self.parameterization == "eps":
        target = noise
    elif self.parameterization == "x0":
        target = x_start
    else:
        raise NotImplementedError(f"Paramterization {self.parameterization} not yet supported")

    # one loss value per batch element
    per_sample = self.get_loss(model_out, target, mean=False).mean(dim=[1, 2, 3])
    prefix = 'train' if self.training else 'val'

    simple_mean = per_sample.mean()
    loss_simple = simple_mean * self.l_simple_weight
    loss_vlb = (self.lvlb_weights[t] * per_sample).mean()
    total = loss_simple + self.original_elbo_weight * loss_vlb

    loss_dict = {
        f'{prefix}/loss_simple': simple_mean,
        f'{prefix}/loss_vlb': loss_vlb,
        f'{prefix}/loss': total,
    }
    return total, loss_dict
322
+
323
def forward(self, x, *args, **kwargs):
    """Draw one random timestep per batch element and return ``p_losses`` for it.

    Timesteps are sampled uniformly from ``[0, self.num_timesteps)`` on
    ``self.device``; extra arguments are forwarded to ``p_losses``.
    """
    # b, c, h, w, device, img_size, = *x.shape, x.device, self.image_size
    # assert h == img_size and w == img_size, f'height and width of image must be {img_size}'
    batch = x.shape[0]
    t = torch.randint(0, self.num_timesteps, (batch,), device=self.device).long()
    return self.p_losses(x, t, *args, **kwargs)
328
+
329
def get_input(self, batch, k):
    """Fetch ``batch[k]`` and return it as a contiguous float tensor in ``b c h w`` layout.

    A 3-dimensional entry gets a trailing singleton axis first; the entry is
    then rearranged from ``b h w c``.
    """
    x = batch[k]
    if len(x.shape) == 3:
        x = x[..., None]
    channels_first = rearrange(x, 'b h w c -> b c h w')
    return channels_first.to(memory_format=torch.contiguous_format).float()
336
+
337
def shared_step(self, batch):
    """Extract the input under ``self.first_stage_key`` and run the module on it.

    :return: the ``(loss, loss_dict)`` pair produced by calling ``self``.
    """
    inputs = self.get_input(batch, self.first_stage_key)
    loss, loss_dict = self(inputs)
    return loss, loss_dict
341
+
342
def training_step(self, batch, batch_idx):
    """Compute the loss for one batch and log the metrics, the global step and the current LR.

    The learning rate is only logged when a scheduler is configured.

    :return: the loss from ``shared_step``.
    """
    loss, loss_dict = self.shared_step(batch)

    self.log_dict(loss_dict, prog_bar=True,
                  logger=True, on_step=True, on_epoch=True)

    # settings shared by the per-step scalar logs below
    step_only = dict(prog_bar=True, logger=True, on_step=True, on_epoch=False)
    self.log("global_step", self.global_step, **step_only)

    if self.use_scheduler:
        current_lr = self.optimizers().param_groups[0]['lr']
        self.log('lr_abs', current_lr, **step_only)

    return loss
356
+
357
@torch.no_grad()
def validation_step(self, batch, batch_idx):
    """Evaluate one batch with the current weights and again inside ``ema_scope``.

    Both metric dicts are logged per epoch; keys from the EMA pass get an
    ``_ema`` suffix.
    """
    _, metrics_plain = self.shared_step(batch)
    with self.ema_scope():
        _, metrics_ema = self.shared_step(batch)
    metrics_ema = {f"{name}_ema": value for name, value in metrics_ema.items()}

    epoch_only = dict(prog_bar=False, logger=True, on_step=False, on_epoch=True)
    self.log_dict(metrics_plain, **epoch_only)
    self.log_dict(metrics_ema, **epoch_only)
365
+
366
def on_train_batch_end(self, *args, **kwargs):
    """After each training batch, call ``self.model_ema`` on ``self.model`` when EMA is enabled.

    All hook arguments are ignored.
    """
    if not self.use_ema:
        return
    self.model_ema(self.model)
369
+
370
def _get_rows_from_list(self, samples):
    """Arrange a list of image batches into one grid.

    ``samples`` is treated as ``n b c h w``; the grid is built with
    ``nrow=len(samples)`` after flattening to ``(b n) c h w``, so
    consecutive grid cells iterate over the ``n`` list entries first.
    """
    images_per_row = len(samples)
    # batch-major flattening: b is the outer index, n the inner one
    flat = rearrange(samples, 'n b c h w -> (b n) c h w')
    return make_grid(flat, nrow=images_per_row)
376
+
377
@torch.no_grad()
def log_images(self, batch, N=8, n_row=2, sample=True, return_keys=None, **kwargs):
    """Collect images for logging: inputs, a forward-diffusion row and (optionally) samples.

    :param N: maximum number of inputs/samples to log (capped at the batch size).
    :param n_row: number of inputs used for the diffusion row (capped likewise).
    :param sample: also draw samples under the EMA weights and log the
        denoising progression.
    :param return_keys: if given, restrict the result to these keys.  Keys
        that were not produced are skipped; if none of them was produced,
        the full log is returned.
    :return: dict mapping log names to tensors.
    """
    log = dict()
    x = self.get_input(batch, self.first_stage_key)
    N = min(x.shape[0], N)
    n_row = min(x.shape[0], n_row)
    x = x.to(self.device)[:N]
    log["inputs"] = x

    # get diffusion row
    diffusion_row = list()
    x_start = x[:n_row]

    for step in range(self.num_timesteps):
        if step % self.log_every_t == 0 or step == self.num_timesteps - 1:
            # separate name for the tensor so the loop counter is not shadowed
            t = repeat(torch.tensor([step]), '1 -> b', b=n_row)
            t = t.to(self.device).long()
            noise = torch.randn_like(x_start)
            x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
            diffusion_row.append(x_noisy)

    log["diffusion_row"] = self._get_rows_from_list(diffusion_row)

    if sample:
        # get denoise row
        with self.ema_scope("Plotting"):
            samples, denoise_row = self.sample(batch_size=N, return_intermediates=True)

        log["samples"] = samples
        log["denoise_row"] = self._get_rows_from_list(denoise_row)

    if return_keys:
        # skip unknown keys instead of raising KeyError on a partial match
        selected = {key: log[key] for key in return_keys if key in log}
        return selected if selected else log
    return log
414
+
415
def configure_optimizers(self):
    """Create an AdamW optimizer over the model parameters at ``self.learning_rate``.

    ``self.logvar`` is appended to the optimized parameters when
    ``self.learn_logvar`` is set.
    """
    trainable = list(self.model.parameters())
    if self.learn_logvar:
        trainable = trainable + [self.logvar]
    return torch.optim.AdamW(trainable, lr=self.learning_rate)
422
+
423
+
424
+ class LatentDiffusion(DDPM):
425
+ """main class"""
426
def __init__(self,
             first_stage_config,
             cond_stage_config,
             num_timesteps_cond=None,
             cond_stage_key="image",
             cond_stage_trainable=False,
             concat_mode=True,
             cond_stage_forward=None,
             conditioning_key=None,
             scale_factor=1.0,
             scale_by_std=False,
             *args, **kwargs):
    """Set up the latent diffusion model around a first stage and a conditioning stage.

    :param first_stage_config: config for ``instantiate_first_stage``.
    :param cond_stage_config: config for ``instantiate_cond_stage``; the
        string ``'__is_unconditional__'`` disables conditioning.
    :param num_timesteps_cond: defaults to 1; must not exceed ``timesteps``.
    :param conditioning_key: when ``None`` it is derived from ``concat_mode``
        ('concat' or 'crossattn').
    :param scale_factor: stored as a plain attribute, or as a buffer when
        ``scale_by_std`` is set.
    :param kwargs: forwarded to the parent constructor; ``ckpt_path`` and
        ``ignore_keys`` are consumed here and the checkpoint is loaded only
        after both stages exist.
    """
    # set before super().__init__(): the parent constructor registers the
    # schedule, and this class's register_schedule reads num_timesteps_cond
    self.num_timesteps_cond = default(num_timesteps_cond, 1)
    self.scale_by_std = scale_by_std
    # fall back to the parent's default of 1000 when the caller does not
    # pass `timesteps` explicitly (indexing kwargs raised KeyError)
    assert self.num_timesteps_cond <= kwargs.get('timesteps', 1000)
    # for backwards compatibility after implementation of DiffusionWrapper
    if conditioning_key is None:
        conditioning_key = 'concat' if concat_mode else 'crossattn'
    if cond_stage_config == '__is_unconditional__':
        conditioning_key = None
    ckpt_path = kwargs.pop("ckpt_path", None)
    ignore_keys = kwargs.pop("ignore_keys", [])
    super().__init__(conditioning_key=conditioning_key, *args, **kwargs)
    self.concat_mode = concat_mode
    self.cond_stage_trainable = cond_stage_trainable
    self.cond_stage_key = cond_stage_key
    try:
        self.num_downs = len(first_stage_config.params.ddconfig.ch_mult) - 1
    except Exception:
        # config without ddconfig.ch_mult; a bare `except:` would also have
        # swallowed KeyboardInterrupt / SystemExit
        self.num_downs = 0
    if not scale_by_std:
        self.scale_factor = scale_factor
    else:
        self.register_buffer('scale_factor', torch.tensor(scale_factor))
    self.instantiate_first_stage(first_stage_config)
    self.instantiate_cond_stage(cond_stage_config)
    self.cond_stage_forward = cond_stage_forward
    self.clip_denoised = False
    self.bbox_tokenizer = None

    self.restarted_from_ckpt = False
    if ckpt_path is not None:
        self.init_from_ckpt(ckpt_path, ignore_keys)
        self.restarted_from_ckpt = True
470
+
471
def make_cond_schedule(self, ):
    """Build ``self.cond_ids``, a long tensor with one entry per timestep.

    Every entry starts as ``num_timesteps - 1``; the first
    ``num_timesteps_cond`` entries are then overwritten with rounded,
    evenly spaced ids from 0 to ``num_timesteps - 1``.
    """
    last_step = self.num_timesteps - 1
    cond_ids = torch.full(size=(self.num_timesteps,), fill_value=last_step, dtype=torch.long)
    spaced = torch.linspace(0, last_step, self.num_timesteps_cond)
    cond_ids[:self.num_timesteps_cond] = torch.round(spaced).long()
    self.cond_ids = cond_ids
475
+
476
@rank_zero_only
@torch.no_grad()
def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
    """On the very first training batch, set ``scale_factor`` to 1 / std of the latents.

    Only runs when ``scale_by_std`` is set, training starts from scratch
    (epoch 0, step 0, batch 0) and the model was not restored from a
    checkpoint.  The existing ``scale_factor`` must still be 1.
    """
    # only for very first batch
    is_first_batch = (self.scale_by_std and self.current_epoch == 0 and self.global_step == 0
                      and batch_idx == 0 and not self.restarted_from_ckpt)
    if not is_first_batch:
        return

    assert self.scale_factor == 1., 'rather not use custom rescaling and std-rescaling simultaneously'
    # set rescale weight to 1./std of encodings
    print("### USING STD-RESCALING ###")
    x = super().get_input(batch, self.first_stage_key)
    x = x.to(self.device)
    encoder_posterior = self.encode_first_stage(x)
    z = self.get_first_stage_encoding(encoder_posterior).detach()
    # drop the old attribute so the name can be re-registered as a buffer
    del self.scale_factor
    self.register_buffer('scale_factor', 1. / z.flatten().std())
    print(f"setting self.scale_factor to {self.scale_factor}")
    print("### USING STD-RESCALING ###")
492
+
493
def register_schedule(self,
                      given_betas=None, beta_schedule="linear", timesteps=1000,
                      linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
    """Register the parent's schedule, then build the conditioning schedule if needed.

    The conditioning schedule is only created when ``num_timesteps_cond``
    is greater than 1.
    """
    super().register_schedule(given_betas=given_betas, beta_schedule=beta_schedule,
                              timesteps=timesteps, linear_start=linear_start,
                              linear_end=linear_end, cosine_s=cosine_s)

    self.shorten_cond_schedule = self.num_timesteps_cond > 1
    if self.shorten_cond_schedule:
        self.make_cond_schedule()
501
+
502
def instantiate_first_stage(self, config):
    """Build the first-stage model from ``config`` and freeze it.

    The model is put in eval mode, its ``train`` attribute is replaced with
    ``disabled_train`` so the mode cannot be switched back, and all of its
    parameters stop requiring gradients.
    """
    self.first_stage_model = instantiate_from_config(config).eval()
    self.first_stage_model.train = disabled_train
    for weight in self.first_stage_model.parameters():
        weight.requires_grad = False
508
+
509
def instantiate_cond_stage(self, config):
    """Create ``self.cond_stage_model`` from ``config``.

    Trainable conditioning stage: the model is instantiated as-is; the two
    sentinel strings are rejected.  Otherwise ``"__is_first_stage__"``
    reuses the first-stage model, ``"__is_unconditional__"`` sets the model
    to ``None``, and any other config yields a frozen model in eval mode.
    """
    if self.cond_stage_trainable:
        assert config != '__is_first_stage__'
        assert config != '__is_unconditional__'
        self.cond_stage_model = instantiate_from_config(config)
        return

    if config == "__is_first_stage__":
        print("Using first stage also as cond stage.")
        self.cond_stage_model = self.first_stage_model
    elif config == "__is_unconditional__":
        print(f"Training {self.__class__.__name__} as an unconditional model.")
        self.cond_stage_model = None
        # self.be_unconditional = True
    else:
        self.cond_stage_model = instantiate_from_config(config).eval()
        # freeze: no train-mode switching, no gradients
        self.cond_stage_model.train = disabled_train
        for weight in self.cond_stage_model.parameters():
            weight.requires_grad = False
529
+
530
def _get_denoise_row_from_list(self, samples, desc='', force_no_decoder_quantization=False):
    """Decode every latent batch in ``samples`` and tile the results into one grid.

    Each entry is moved to ``self.device`` and passed through
    ``decode_first_stage``; the grid has ``len(samples)`` images per row.
    """
    decoded = [
        self.decode_first_stage(latent.to(self.device),
                                force_not_quantize=force_no_decoder_quantization)
        for latent in tqdm(samples, desc=desc)
    ]
    images_per_row = len(decoded)
    stacked = torch.stack(decoded)  # n_log_step, n_row, C, H, W
    # batch-major flattening: b is the outer index, n the inner one
    flat = rearrange(stacked, 'n b c h w -> (b n) c h w')
    return make_grid(flat, nrow=images_per_row)
541
+
542
def get_first_stage_encoding(self, encoder_posterior):
    """Turn a first-stage encoder output into a scaled latent.

    A plain tensor is used directly; a ``DiagonalGaussianDistribution`` is
    sampled. Any other type raises ``NotImplementedError``. The result is
    multiplied by ``self.scale_factor``.
    """
    if isinstance(encoder_posterior, torch.Tensor):
        latent = encoder_posterior
    elif isinstance(encoder_posterior, DiagonalGaussianDistribution):
        latent = encoder_posterior.sample()
    else:
        raise NotImplementedError(f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented")
    return self.scale_factor * latent
550
+
551
def get_learned_conditioning(self, c):
    """Run the conditioning input ``c`` through the conditioning-stage model.

    If ``self.cond_stage_forward`` names a method, that method of
    ``self.cond_stage_model`` is called. Otherwise the model's ``encode``
    method is used when it exists and is callable (taking ``mode()`` of a
    ``DiagonalGaussianDistribution`` result), and the model is called
    directly when it has no usable ``encode``.
    """
    forward_name = self.cond_stage_forward
    if forward_name is not None:
        assert hasattr(self.cond_stage_model, forward_name)
        return getattr(self.cond_stage_model, forward_name)(c)

    has_encode = hasattr(self.cond_stage_model, 'encode') and callable(self.cond_stage_model.encode)
    if not has_encode:
        return self.cond_stage_model(c)

    c = self.cond_stage_model.encode(c)
    if isinstance(c, DiagonalGaussianDistribution):
        c = c.mode()
    return c
563
+
564
def meshgrid(self, h, w):
    """Return an ``(h, w, 2)`` integer tensor whose last axis holds ``(row, col)``."""
    rows = torch.arange(0, h).view(h, 1, 1).expand(h, w, 1)
    cols = torch.arange(0, w).view(1, w, 1).expand(h, w, 1)
    return torch.cat([rows, cols], dim=-1)
570
+
571
def delta_border(self, h, w):
    """
    Compute, for every position of an ``h x w`` grid, the normalized distance
    to the nearest grid border.

    :param h: height
    :param w: width
    :return: tensor of shape (h, w) with the normalized distance to the image border,
     with min distance = 0 at the border and max distance = 0.5 at the image center
    """
    # Clamp the denominators to at least 1: for a single-row or single-column
    # grid ``h - 1`` / ``w - 1`` is 0 and the division would produce NaN
    # (0 / 0). With the clamp such a grid is all border, i.e. distance 0.
    lower_right_corner = torch.tensor([h - 1, w - 1]).clamp(min=1).view(1, 1, 2)
    arr = self.meshgrid(h, w) / lower_right_corner
    # distance to the top/left border is the smaller of the two normalized coordinates
    dist_left_up = torch.min(arr, dim=-1, keepdim=True)[0]
    # distance to the bottom/right border uses the mirrored coordinates
    dist_right_down = torch.min(1 - arr, dim=-1, keepdim=True)[0]
    edge_dist = torch.min(torch.cat([dist_left_up, dist_right_down], dim=-1), dim=-1)[0]
    return edge_dist
584
+
585
def get_weighting(self, h, w, Ly, Lx, device):
    """Build the per-pixel, per-crop blending weights.

    The border distance of an ``h x w`` crop is clipped to the
    ``clip_min_weight`` / ``clip_max_weight`` entries of
    ``self.split_input_params`` and repeated for each of the ``Ly * Lx``
    crops, giving a tensor of shape ``(1, h * w, Ly * Lx)`` on ``device``.
    When the ``tie_braker`` entry is truthy, each crop is additionally scaled
    by the clipped border distance of its position in the ``Ly x Lx`` layout.
    """
    params = self.split_input_params
    n_crops = Ly * Lx

    pixel_w = torch.clip(self.delta_border(h, w),
                         params["clip_min_weight"],
                         params["clip_max_weight"])
    pixel_w = pixel_w.view(1, h * w, 1).repeat(1, 1, n_crops).to(device)

    if not params["tie_braker"]:
        return pixel_w

    crop_w = torch.clip(self.delta_border(Ly, Lx),
                        params["clip_min_tie_weight"],
                        params["clip_max_tie_weight"])
    crop_w = crop_w.view(1, 1, n_crops).to(device)
    return pixel_w * crop_w
600
+
601
def get_fold_unfold(self, x, kernel_size, stride, uf=1, df=1):  # todo load once not every time, shorten code
    """
    Build the unfold/fold pair used to process an image crop by crop.

    :param x: img of size (bs, c, h, w)
    :param kernel_size: crop size as (height, width)
    :param stride: crop stride as (height, width)
    :param uf: upscaling factor applied to the fold side (output is ``uf`` times larger)
    :param df: downscaling factor applied to the fold side (output is ``df`` times smaller)
    :return: tuple ``(fold, unfold, normalization, weighting)`` where ``unfold`` cuts
     ``x`` into crops, ``fold`` stitches (possibly rescaled) crops back together,
     ``normalization`` is the folded weighting of shape (1, 1, H_out, W_out) and
     ``weighting`` has shape (1, 1, k_out[0], k_out[1], Ly * Lx)
    :raises NotImplementedError: if ``uf`` and ``df`` are not one of the supported
     combinations (both 1, only ``uf`` > 1, or only ``df`` > 1)
    """
    bs, nc, h, w = x.shape

    # number of crops in image
    Ly = (h - kernel_size[0]) // stride[0] + 1
    Lx = (w - kernel_size[1]) // stride[1] + 1

    # Geometry of the fold (output) side. The width entries use index 1 of
    # kernel_size; the previous code reused index 0 for both dimensions, which
    # broke non-square kernels (the weighting was already built with index 1).
    if uf == 1 and df == 1:
        out_kernel = (kernel_size[0], kernel_size[1])
        out_stride = (stride[0], stride[1])
        out_size = (h, w)
    elif uf > 1 and df == 1:
        out_kernel = (kernel_size[0] * uf, kernel_size[1] * uf)
        out_stride = (stride[0] * uf, stride[1] * uf)
        out_size = (h * uf, w * uf)
    elif df > 1 and uf == 1:
        out_kernel = (kernel_size[0] // df, kernel_size[1] // df)
        out_stride = (stride[0] // df, stride[1] // df)
        out_size = (h // df, w // df)
    else:
        raise NotImplementedError

    # The unfold always works on the input resolution.
    unfold = torch.nn.Unfold(kernel_size=kernel_size, dilation=1, padding=0, stride=stride)
    fold = torch.nn.Fold(output_size=out_size, kernel_size=out_kernel,
                         dilation=1, padding=0, stride=out_stride)

    weighting = self.get_weighting(out_kernel[0], out_kernel[1], Ly, Lx, x.device).to(x.dtype)
    normalization = fold(weighting).view(1, 1, out_size[0], out_size[1])  # normalizes the overlap
    weighting = weighting.view((1, 1, out_kernel[0], out_kernel[1], Ly * Lx))

    return fold, unfold, normalization, weighting
652
+
653
@torch.no_grad()
def get_input(self, batch, k, return_first_stage_outputs=False, force_c_encode=False,
              cond_key=None, return_original_cond=False, bs=None):
    """Fetch entry ``k`` of ``batch``, encode it to a latent and build the conditioning.

    Returns a list ``[z, c]`` where ``z`` is the detached first-stage encoding
    and ``c`` the conditioning (``None`` for an unconditional model, or a dict
    when ``self.use_positional_encodings`` is set). With
    ``return_first_stage_outputs`` the input ``x`` and its reconstruction are
    appended; with ``return_original_cond`` the raw conditioning input ``xc``
    is appended as well. ``bs`` optionally truncates the batch.
    """
    x = super().get_input(batch, k)
    if bs is not None:
        x = x[:bs]
    x = x.to(self.device)
    z = self.get_first_stage_encoding(self.encode_first_stage(x)).detach()

    if self.model.conditioning_key is None:
        # unconditional model: no conditioning apart from optional positions
        c = None
        xc = None
        if self.use_positional_encodings:
            pos_x, pos_y = self.compute_latent_shifts(batch)
            c = {'pos_x': pos_x, 'pos_y': pos_y}
    else:
        if cond_key is None:
            cond_key = self.cond_stage_key

        # pick the raw conditioning input
        if cond_key == self.first_stage_key:
            xc = x
        elif cond_key in ['caption', 'coordinates_bbox']:
            xc = batch[cond_key]
        elif cond_key == 'class_label':
            xc = batch
        else:
            xc = super().get_input(batch, cond_key).to(self.device)

        # encode it now unless the conditioning stage is trained jointly
        if self.cond_stage_trainable and not force_c_encode:
            c = xc
        elif isinstance(xc, (dict, list)):
            c = self.get_learned_conditioning(xc)
        else:
            c = self.get_learned_conditioning(xc.to(self.device))

        if bs is not None:
            c = c[:bs]

        if self.use_positional_encodings:
            pos_x, pos_y = self.compute_latent_shifts(batch)
            ckey = __conditioning_keys__[self.model.conditioning_key]
            c = {ckey: c, 'pos_x': pos_x, 'pos_y': pos_y}

    out = [z, c]
    if return_first_stage_outputs:
        out.extend([x, self.decode_first_stage(z)])
    if return_original_cond:
        out.append(xc)
    return out
704
+
705
@torch.no_grad()
def decode_first_stage(self, z, predict_cids=False, force_not_quantize=False):
    """Decode latents ``z`` with the first-stage model, with gradients disabled.

    The decoding logic was a line-for-line copy of
    ``differentiable_decode_first_stage``; this method now delegates to it so
    the two code paths cannot drift apart. The only difference between the two
    entry points is the ``torch.no_grad()`` decorator applied here.

    :param z: latents to decode
    :param predict_cids: forwarded unchanged to ``differentiable_decode_first_stage``
    :param force_not_quantize: forwarded unchanged to ``differentiable_decode_first_stage``
    :return: whatever ``differentiable_decode_first_stage`` returns for these arguments
    """
    return self.differentiable_decode_first_stage(
        z, predict_cids=predict_cids, force_not_quantize=force_not_quantize)
764
+
765
# same as decode_first_stage but without the no_grad decorator
def differentiable_decode_first_stage(self, z, predict_cids=False, force_not_quantize=False):
    """Decode latents ``z`` with the first-stage model, keeping gradients.

    With ``predict_cids`` the input is first mapped to codebook entries via
    ``self.first_stage_model.quantize``. The latent is then divided by
    ``self.scale_factor`` and decoded, either in one go or, when
    ``self.split_input_params["patch_distributed_vq"]`` is set, crop by crop
    with weighted stitching of the decoded crops.
    """
    if predict_cids:
        if z.dim() == 4:
            z = torch.argmax(z.exp(), dim=1).long()
        z = self.first_stage_model.quantize.get_codebook_entry(z, shape=None)
        z = rearrange(z, 'b h w c -> b c h w').contiguous()

    z = 1. / self.scale_factor * z

    def run_decoder(latent):
        # VQ-style interfaces accept the extra quantization switch
        if isinstance(self.first_stage_model, VQModelInterface):
            return self.first_stage_model.decode(latent, force_not_quantize=predict_cids or force_not_quantize)
        return self.first_stage_model.decode(latent)

    patchwise = hasattr(self, "split_input_params") and self.split_input_params["patch_distributed_vq"]
    if not patchwise:
        return run_decoder(z)

    ks = self.split_input_params["ks"]  # eg. (128, 128)
    stride = self.split_input_params["stride"]  # eg. (64, 64)
    uf = self.split_input_params["vqf"]
    _, _, h, w = z.shape
    if ks[0] > h or ks[1] > w:
        ks = (min(ks[0], h), min(ks[1], w))
        print("reducing Kernel")

    if stride[0] > h or stride[1] > w:
        stride = (min(stride[0], h), min(stride[1], w))
        print("reducing stride")

    fold, unfold, normalization, weighting = self.get_fold_unfold(z, ks, stride, uf=uf)

    crops = unfold(z)  # (bn, nc * prod(**ks), L)
    # 1. Reshape to img shape
    crops = crops.view((crops.shape[0], -1, ks[0], ks[1], crops.shape[-1]))  # (bn, nc, ks[0], ks[1], L )

    # 2. apply model loop over last dim
    decoded_crops = [run_decoder(crops[:, :, :, :, idx]) for idx in range(crops.shape[-1])]

    stitched = torch.stack(decoded_crops, axis=-1) * weighting  # (bn, nc, ks[0], ks[1], L)
    # Reverse 1. reshape to img shape
    stitched = stitched.view((stitched.shape[0], -1, stitched.shape[-1]))  # (bn, nc * ks[0] * ks[1], L)
    # stitch crops together and normalize the overlap; norm is shape (1, 1, h, w)
    return fold(stitched) / normalization
824
+
825
@torch.no_grad()
def encode_first_stage(self, x):
    """Encode ``x`` with the first-stage model, with gradients disabled.

    Without ``self.split_input_params`` (or when its
    ``"patch_distributed_vq"`` entry is falsy) the model's ``encode`` is
    called on the whole input. Otherwise the input is cut into crops, each
    crop is encoded separately, and the results are weighted, folded back
    together and normalized. In the crop path the spatial size of ``x`` is
    recorded under ``split_input_params['original_image_size']``.
    """
    patchwise = hasattr(self, "split_input_params") and self.split_input_params["patch_distributed_vq"]
    if not patchwise:
        return self.first_stage_model.encode(x)

    ks = self.split_input_params["ks"]  # eg. (128, 128)
    stride = self.split_input_params["stride"]  # eg. (64, 64)
    df = self.split_input_params["vqf"]
    self.split_input_params['original_image_size'] = x.shape[-2:]
    _, _, h, w = x.shape
    if ks[0] > h or ks[1] > w:
        ks = (min(ks[0], h), min(ks[1], w))
        print("reducing Kernel")

    if stride[0] > h or stride[1] > w:
        stride = (min(stride[0], h), min(stride[1], w))
        print("reducing stride")

    fold, unfold, normalization, weighting = self.get_fold_unfold(x, ks, stride, df=df)
    crops = unfold(x)  # (bn, nc * prod(**ks), L)
    # Reshape to img shape
    crops = crops.view((crops.shape[0], -1, ks[0], ks[1], crops.shape[-1]))  # (bn, nc, ks[0], ks[1], L )

    encoded_crops = [self.first_stage_model.encode(crops[:, :, :, :, idx])
                     for idx in range(crops.shape[-1])]

    stitched = torch.stack(encoded_crops, axis=-1) * weighting

    # Reverse reshape to img shape
    stitched = stitched.view((stitched.shape[0], -1, stitched.shape[-1]))  # (bn, nc * ks[0] * ks[1], L)
    # stitch crops together and normalize the overlap
    return fold(stitched) / normalization
864
+
865
def shared_step(self, batch, **kwargs):
    """Fetch latent and conditioning for ``batch`` and return the result of calling ``self`` on them."""
    latent, conditioning = self.get_input(batch, self.first_stage_key)
    return self(latent, conditioning)
869
+
870
def forward(self, x, c, *args, **kwargs):
    """Draw a random timestep per batch element and return ``self.p_losses`` for it.

    For a conditional model ``c`` must not be ``None``; it is encoded here
    when the conditioning stage is trainable and, when
    ``self.shorten_cond_schedule`` is set, noised via ``q_sample`` at the
    timestep looked up in ``self.cond_ids``.
    """
    batch = x.shape[0]
    t = torch.randint(0, self.num_timesteps, (batch,), device=self.device).long()

    if self.model.conditioning_key is not None:
        assert c is not None
        if self.cond_stage_trainable:
            c = self.get_learned_conditioning(c)
        if self.shorten_cond_schedule:  # TODO: drop this option
            cond_t = self.cond_ids[t].to(self.device)
            cond_noise = torch.randn_like(c.float())
            c = self.q_sample(x_start=c, t=cond_t, noise=cond_noise)

    return self.p_losses(x, c, t, *args, **kwargs)
880
+
881
def _rescale_annotations(self, bboxes, crop_coordinates):  # TODO: move to dataset
    """Express each ``(x0, y0, w, h)`` box relative to a crop.

    ``crop_coordinates`` is indexed as ``(x, y, width, height)``. The box
    origin is shifted by the crop origin, divided by the crop size and passed
    through ``clamp``; width and height are divided by the crop size and
    limited so the box does not extend past 1. Returns a list of 4-tuples.
    """
    crop_x, crop_y = crop_coordinates[0], crop_coordinates[1]
    crop_w, crop_h = crop_coordinates[2], crop_coordinates[3]

    rescaled = []
    for box in bboxes:
        x0 = clamp((box[0] - crop_x) / crop_w)
        y0 = clamp((box[1] - crop_y) / crop_h)
        width = min(box[2] / crop_w, 1 - x0)
        height = min(box[3] / crop_h, 1 - y0)
        rescaled.append((x0, y0, width, height))
    return rescaled
890
+
891
def apply_model(self, x_noisy, t, cond, return_ids=False):
    """Run the wrapped diffusion model on ``x_noisy`` at timestep ``t``.

    ``cond`` is normalized into a dict: a dict is passed through, anything
    else is wrapped in a list (if it is not one already) and stored under
    ``'c_concat'`` when ``self.model.conditioning_key == 'concat'`` and under
    ``'c_crossattn'`` otherwise.

    When ``self.split_input_params`` exists, the input is cut into crops with
    the unfold from ``get_fold_unfold``, the model is applied per crop with a
    per-crop conditioning, and the outputs are weighted, folded back together
    and normalized. Otherwise the model is called once on the full input.

    If the model output is a tuple and ``return_ids`` is false, only its first
    element is returned.
    """

    if isinstance(cond, dict):
        # hybrid case, cond is expected to be a dict
        pass
    else:
        if not isinstance(cond, list):
            cond = [cond]
        key = 'c_concat' if self.model.conditioning_key == 'concat' else 'c_crossattn'
        cond = {key: cond}

    if hasattr(self, "split_input_params"):
        assert len(cond) == 1  # todo can only deal with one conditioning atm
        assert not return_ids
        ks = self.split_input_params["ks"]  # eg. (128, 128)
        stride = self.split_input_params["stride"]  # eg. (64, 64)

        h, w = x_noisy.shape[-2:]

        fold, unfold, normalization, weighting = self.get_fold_unfold(x_noisy, ks, stride)

        z = unfold(x_noisy)  # (bn, nc * prod(**ks), L)
        # Reshape to img shape
        z = z.view((z.shape[0], -1, ks[0], ks[1], z.shape[-1]))  # (bn, nc, ks[0], ks[1], L )
        # one tensor per crop, indexed along the last (crop) dimension
        z_list = [z[:, :, :, :, i] for i in range(z.shape[-1])]

        if self.cond_stage_key in ["image", "LR_image", "segmentation",
                                   'bbox_img'] and self.model.conditioning_key:  # todo check for completeness
            # spatial conditioning: crop the conditioning with the same unfold as the input
            c_key = next(iter(cond.keys()))  # get key
            c = next(iter(cond.values()))  # get value
            assert (len(c) == 1)  # todo extend to list with more than one elem
            c = c[0]  # get element

            c = unfold(c)
            c = c.view((c.shape[0], -1, ks[0], ks[1], c.shape[-1]))  # (bn, nc, ks[0], ks[1], L )

            cond_list = [{c_key: [c[:, :, :, :, i]]} for i in range(c.shape[-1])]

        elif self.cond_stage_key == 'coordinates_bbox':
            assert 'original_image_size' in self.split_input_params, 'BoudingBoxRescaling is missing original_image_size'

            # assuming padding of unfold is always 0 and its dilation is always 1
            # NOTE(review): this pairs the width ``w`` with index 0 of ks/stride, whereas
            # get_fold_unfold pairs index 0 with the height -- confirm for non-square crops.
            n_patches_per_row = int((w - ks[0]) / stride[0] + 1)
            full_img_h, full_img_w = self.split_input_params['original_image_size']
            # as we are operating on latents, we need the factor from the original image size to the
            # spatial latent size to properly rescale the crops for regenerating the bbox annotations
            num_downs = self.first_stage_model.encoder.num_resolutions - 1
            rescale_latent = 2 ** (num_downs)

            # get top left positions of patches as conforming for the bbox tokenizer, therefore we
            # need to rescale the tl patch coordinates to be in between (0,1)
            tl_patch_coordinates = [(rescale_latent * stride[0] * (patch_nr % n_patches_per_row) / full_img_w,
                                     rescale_latent * stride[1] * (patch_nr // n_patches_per_row) / full_img_h)
                                    for patch_nr in range(z.shape[-1])]

            # patch_limits are tl_coord, width and height coordinates as (x_tl, y_tl, h, w)
            patch_limits = [(x_tl, y_tl,
                             rescale_latent * ks[0] / full_img_w,
                             rescale_latent * ks[1] / full_img_h) for x_tl, y_tl in tl_patch_coordinates]
            # patch_values = [(np.arange(x_tl,min(x_tl+ks, 1.)),np.arange(y_tl,min(y_tl+ks, 1.))) for x_tl, y_tl in tl_patch_coordinates]

            # tokenize crop coordinates for the bounding boxes of the respective patches
            patch_limits_tknzd = [torch.LongTensor(self.bbox_tokenizer._crop_encoder(bbox))[None].to(self.device)
                                  for bbox in patch_limits]  # list of length l with tensors of shape (1, 2)
            # NOTE(review): the print calls in this branch look like leftover debug output
            print(patch_limits_tknzd[0].shape)
            # cut tknzd crop position from conditioning (drops the last two entries of the last axis)
            assert isinstance(cond, dict), 'cond must be dict to be fed into model'
            cut_cond = cond['c_crossattn'][0][..., :-2].to(self.device)
            print(cut_cond.shape)

            # append each patch's tokenized limits to the truncated conditioning
            adapted_cond = torch.stack([torch.cat([cut_cond, p], dim=1) for p in patch_limits_tknzd])
            adapted_cond = rearrange(adapted_cond, 'l b n -> (l b) n')
            print(adapted_cond.shape)
            adapted_cond = self.get_learned_conditioning(adapted_cond)
            print(adapted_cond.shape)
            adapted_cond = rearrange(adapted_cond, '(l b) n d -> l b n d', l=z.shape[-1])
            print(adapted_cond.shape)

            cond_list = [{'c_crossattn': [e]} for e in adapted_cond]

        else:
            # non-spatial conditioning: every crop gets the same conditioning dict
            cond_list = [cond for i in range(z.shape[-1])]  # Todo make this more efficient

        # apply model by loop over crops
        output_list = [self.model(z_list[i], t, **cond_list[i]) for i in range(z.shape[-1])]
        assert not isinstance(output_list[0],
                              tuple)  # todo cant deal with multiple model outputs check this never happens

        o = torch.stack(output_list, axis=-1)
        o = o * weighting
        # Reverse reshape to img shape
        o = o.view((o.shape[0], -1, o.shape[-1]))  # (bn, nc * ks[0] * ks[1], L)
        # stitch crops together
        x_recon = fold(o) / normalization

    else:
        x_recon = self.model(x_noisy, t, **cond)

    if isinstance(x_recon, tuple) and not return_ids:
        return x_recon[0]
    else:
        return x_recon
993
+
994
def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
    """Return ``(sqrt_recip_alphas_cumprod[t] * x_t - pred_xstart) / sqrt_recipm1_alphas_cumprod[t]``.

    Both schedule buffers are gathered at ``t`` and shaped for ``x_t`` by
    ``extract_into_tensor``.
    """
    recip = extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape)
    numerator = recip * x_t - pred_xstart
    recipm1 = extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
    return numerator / recipm1
997
+
998
def _prior_bpd(self, x_start):
    """
    Compute the prior KL term of the variational lower bound in bits per
    dimension.
    The term cannot be optimized because it depends only on the encoder.
    :param x_start: the [N x C x ...] tensor of inputs.
    :return: a batch of [N] KL values (in bits), one per batch element.
    """
    n = x_start.shape[0]
    # evaluate q(x_T | x_0) at the last timestep for every batch element
    last_step = torch.tensor([self.num_timesteps - 1] * n, device=x_start.device)
    mean_T, _, log_var_T = self.q_mean_variance(x_start, last_step)
    kl = normal_kl(mean1=mean_T, logvar1=log_var_T, mean2=0.0, logvar2=0.0)
    # convert from nats to bits
    return mean_flat(kl) / np.log(2.0)
1011
+
1012
def p_losses(self, x_start, cond, t, noise=None):
    """Compute the training loss for latents ``x_start`` at timesteps ``t``.

    ``x_start`` is noised with ``q_sample``, the model is applied, and the
    output is compared against either ``x_start`` (``"x0"`` parameterization)
    or the noise (``"eps"``). Returns ``(loss, loss_dict)`` where ``loss``
    combines the log-variance weighted simple loss (scaled by
    ``l_simple_weight``) and the ``lvlb_weights``-weighted term (scaled by
    ``original_elbo_weight``), and ``loss_dict`` holds the logged components
    under a ``train/`` or ``val/`` prefix.
    """
    noise = default(noise, lambda: torch.randn_like(x_start))
    noisy_latent = self.q_sample(x_start=x_start, t=t, noise=noise)
    prediction = self.apply_model(noisy_latent, t, cond)

    prefix = 'train' if self.training else 'val'

    if self.parameterization == "x0":
        target = x_start
    elif self.parameterization == "eps":
        target = noise
    else:
        raise NotImplementedError()

    loss_simple = self.get_loss(prediction, target, mean=False).mean([1, 2, 3])
    loss_dict = {f'{prefix}/loss_simple': loss_simple.mean()}

    # weight the per-sample loss by the (optionally learned) log variance at t
    logvar_t = self.logvar[t].to(self.device)
    loss = loss_simple / torch.exp(logvar_t) + logvar_t
    if self.learn_logvar:
        loss_dict[f'{prefix}/loss_gamma'] = loss.mean()
        loss_dict['logvar'] = self.logvar.data.mean()

    loss = self.l_simple_weight * loss.mean()

    loss_vlb = self.get_loss(prediction, target, mean=False).mean(dim=(1, 2, 3))
    loss_vlb = (self.lvlb_weights[t] * loss_vlb).mean()
    loss_dict[f'{prefix}/loss_vlb'] = loss_vlb
    loss += (self.original_elbo_weight * loss_vlb)
    loss_dict[f'{prefix}/loss'] = loss

    return loss, loss_dict
1046
+
1047
def p_mean_variance(self, x, c, t, clip_denoised: bool, return_codebook_ids=False, quantize_denoised=False,
                    return_x0=False, score_corrector=None, corrector_kwargs=None):
    """Compute mean and (log-)variance of the reverse step from ``x`` at ``t``.

    The model output is optionally modified by ``score_corrector`` (only for
    the ``"eps"`` parameterization), converted into an ``x_0`` estimate,
    optionally clamped in place to [-1, 1] and/or quantized by the first-stage
    model, and fed to ``q_posterior``.

    Returns ``(mean, variance, log_variance)``, extended by the logits when
    ``return_codebook_ids`` is set, or by the ``x_0`` estimate when
    ``return_x0`` is set.
    """
    model_out = self.apply_model(x, t, c, return_ids=return_codebook_ids)

    if score_corrector is not None:
        assert self.parameterization == "eps"
        model_out = score_corrector.modify_score(self, model_out, x, t, c, **corrector_kwargs)

    if return_codebook_ids:
        model_out, logits = model_out

    if self.parameterization == "eps":
        x0_estimate = self.predict_start_from_noise(x, t=t, noise=model_out)
    elif self.parameterization == "x0":
        x0_estimate = model_out
    else:
        raise NotImplementedError()

    if clip_denoised:
        x0_estimate.clamp_(-1., 1.)
    if quantize_denoised:
        x0_estimate, _, [_, _, indices] = self.first_stage_model.quantize(x0_estimate)

    mean, variance, log_variance = self.q_posterior(x_start=x0_estimate, x_t=x, t=t)

    if return_codebook_ids:
        return mean, variance, log_variance, logits
    if return_x0:
        return mean, variance, log_variance, x0_estimate
    return mean, variance, log_variance
1077
+
1078
@torch.no_grad()
def p_sample(self, x, c, t, clip_denoised=False, repeat_noise=False,
             return_codebook_ids=False, quantize_denoised=False, return_x0=False,
             temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None):
    """Draw one reverse-diffusion sample from ``x`` at timestep ``t``.

    :param x: current noisy sample; its first dimension is the batch size
    :param c: conditioning forwarded to ``p_mean_variance``
    :param t: per-sample timestep tensor; entries equal to 0 get no noise added
    :param return_codebook_ids: no longer supported, a truthy value raises
    :param return_x0: additionally return the ``x_0`` estimate of this step
    :param temperature: multiplier applied to the sampled noise
    :param noise_dropout: dropout probability applied to the noise when > 0
    :return: the new sample, or ``(sample, x0)`` when ``return_x0`` is set
    :raises DeprecationWarning: if ``return_codebook_ids`` is truthy
    """
    # Support for codebook ids was dropped. Fail before running the model
    # instead of after it; the statements that used to follow the raise (and
    # the matching return branch further down) were unreachable.
    if return_codebook_ids:
        raise DeprecationWarning("Support dropped.")

    b, device = x.shape[0], x.device
    outputs = self.p_mean_variance(x=x, c=c, t=t, clip_denoised=clip_denoised,
                                   return_codebook_ids=return_codebook_ids,
                                   quantize_denoised=quantize_denoised,
                                   return_x0=return_x0,
                                   score_corrector=score_corrector, corrector_kwargs=corrector_kwargs)
    if return_x0:
        model_mean, _, model_log_variance, x0 = outputs
    else:
        model_mean, _, model_log_variance = outputs

    noise = noise_like(x.shape, device, repeat_noise) * temperature
    if noise_dropout > 0.:
        noise = torch.nn.functional.dropout(noise, p=noise_dropout)
    # no noise when t == 0
    nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))

    sample = model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
    if return_x0:
        return sample, x0
    return sample
1108
+
1109
@torch.no_grad()
def progressive_denoising(self, cond, shape, verbose=True, callback=None, quantize_denoised=False,
                          img_callback=None, mask=None, x0=None, temperature=1., noise_dropout=0.,
                          score_corrector=None, corrector_kwargs=None, batch_size=None, x_T=None, start_T=None,
                          log_every_t=None):
    """Run the full reverse process and collect the intermediate ``x_0`` estimates.

    :param cond: conditioning (tensor, list or dict of those) or ``None``; truncated to the batch size
    :param shape: sample shape; without ``batch_size`` its first entry is the batch size,
     with ``batch_size`` it is the per-sample shape and the batch size is prepended
    :param temperature: a single number used for every step, or a sequence indexed by timestep
    :param mask: optional mask; where it is 1 the sample is replaced by the noised ``x0``
    :param x0: reference latents, required when ``mask`` is given
    :param x_T: optional starting sample, otherwise Gaussian noise is drawn
    :param start_T: optional upper bound on the number of timesteps to run
    :param log_every_t: logging interval, defaults to ``self.log_every_t``
    :return: ``(img, intermediates)`` with the final sample and the logged ``x_0`` estimates
    """
    if not log_every_t:
        log_every_t = self.log_every_t
    timesteps = self.num_timesteps
    if batch_size is not None:
        b = batch_size
        shape = [batch_size] + list(shape)
    else:
        b = batch_size = shape[0]
    if x_T is None:
        img = torch.randn(shape, device=self.device)
    else:
        img = x_T
    intermediates = []
    if cond is not None:
        if isinstance(cond, dict):
            cond = {key: cond[key][:batch_size] if not isinstance(cond[key], list) else
                    [entry[:batch_size] for entry in cond[key]] for key in cond}
        else:
            cond = [c[:batch_size] for c in cond] if isinstance(cond, list) else cond[:batch_size]

    if start_T is not None:
        timesteps = min(timesteps, start_T)
    iterator = tqdm(reversed(range(0, timesteps)), desc='Progressive Generation',
                    total=timesteps) if verbose else reversed(
        range(0, timesteps))
    # Accept any plain number as a constant temperature. The previous exact
    # ``type(...) == float`` check let an int such as ``1`` fall through and
    # crash on ``temperature[i]`` below.
    if isinstance(temperature, (int, float)):
        temperature = [temperature] * timesteps

    for i in iterator:
        ts = torch.full((b,), i, device=self.device, dtype=torch.long)
        if self.shorten_cond_schedule:
            assert self.model.conditioning_key != 'hybrid'
            tc = self.cond_ids[ts].to(cond.device)
            cond = self.q_sample(x_start=cond, t=tc, noise=torch.randn_like(cond))

        img, x0_partial = self.p_sample(img, cond, ts,
                                        clip_denoised=self.clip_denoised,
                                        quantize_denoised=quantize_denoised, return_x0=True,
                                        temperature=temperature[i], noise_dropout=noise_dropout,
                                        score_corrector=score_corrector, corrector_kwargs=corrector_kwargs)
        if mask is not None:
            assert x0 is not None
            # keep the known region (mask == 1) at the noise level of this step
            img_orig = self.q_sample(x0, ts)
            img = img_orig * mask + (1. - mask) * img

        if i % log_every_t == 0 or i == timesteps - 1:
            intermediates.append(x0_partial)
        if callback:
            callback(i)
        if img_callback:
            img_callback(img, i)
    return img, intermediates
1164
+
1165
@torch.no_grad()
def p_sample_loop(self, cond, shape, return_intermediates=False,
                  x_T=None, verbose=True, callback=None, timesteps=None, quantize_denoised=False,
                  mask=None, x0=None, img_callback=None, start_T=None,
                  log_every_t=None):
    """Run the reverse diffusion loop from ``x_T`` (or fresh noise) down to step 0.

    :param cond: conditioning passed to ``p_sample`` at every step
    :param shape: shape of the sample; its first entry is the batch size
    :param return_intermediates: also return the list of logged samples
    :param x_T: optional starting sample, otherwise Gaussian noise is drawn
    :param timesteps: number of steps, defaults to ``self.num_timesteps``
    :param mask: optional mask; where it is 1 the sample is replaced by the noised ``x0``
    :param x0: reference latents, required when ``mask`` is given
    :param start_T: optional upper bound on the number of steps
    :param log_every_t: logging interval, defaults to ``self.log_every_t``
    :return: the final sample, or ``(sample, intermediates)`` when ``return_intermediates`` is set
    """

    if not log_every_t:
        log_every_t = self.log_every_t
    device = self.betas.device
    b = shape[0]
    if x_T is None:
        img = torch.randn(shape, device=device)
    else:
        img = x_T

    intermediates = [img]
    if timesteps is None:
        timesteps = self.num_timesteps

    if start_T is not None:
        timesteps = min(timesteps, start_T)
    iterator = tqdm(reversed(range(0, timesteps)), desc='Sampling t', total=timesteps) if verbose else reversed(
        range(0, timesteps))

    if mask is not None:
        assert x0 is not None
        # Spatial size (height AND width) has to match. The previous slice
        # ``[2:3]`` only compared the height, so a width mismatch slipped through.
        assert x0.shape[2:4] == mask.shape[2:4]

    for i in iterator:
        ts = torch.full((b,), i, device=device, dtype=torch.long)
        if self.shorten_cond_schedule:
            assert self.model.conditioning_key != 'hybrid'
            tc = self.cond_ids[ts].to(cond.device)
            cond = self.q_sample(x_start=cond, t=tc, noise=torch.randn_like(cond))

        img = self.p_sample(img, cond, ts,
                            clip_denoised=self.clip_denoised,
                            quantize_denoised=quantize_denoised)
        if mask is not None:
            # keep the known region (mask == 1) at the noise level of this step
            img_orig = self.q_sample(x0, ts)
            img = img_orig * mask + (1. - mask) * img

        if i % log_every_t == 0 or i == timesteps - 1:
            intermediates.append(img)
        if callback:
            callback(i)
        if img_callback:
            img_callback(img, i)

    if return_intermediates:
        return img, intermediates
    return img
1215
+
1216
@torch.no_grad()
def sample(self, cond, batch_size=16, return_intermediates=False, x_T=None,
           verbose=True, timesteps=None, quantize_denoised=False,
           mask=None, x0=None, shape=None, **kwargs):
    """Sample ``batch_size`` latents via ``p_sample_loop``.

    ``shape`` defaults to ``(batch_size, channels, image_size, image_size)``.
    The conditioning (tensor, list, or dict of tensors/lists) is truncated to
    the first ``batch_size`` entries. Extra keyword arguments are accepted
    but not forwarded.
    """
    if shape is None:
        shape = (batch_size, self.channels, self.image_size, self.image_size)

    if isinstance(cond, dict):
        trimmed = {}
        for key in cond:
            if isinstance(cond[key], list):
                trimmed[key] = [entry[:batch_size] for entry in cond[key]]
            else:
                trimmed[key] = cond[key][:batch_size]
        cond = trimmed
    elif isinstance(cond, list):
        cond = [c[:batch_size] for c in cond]
    elif cond is not None:
        cond = cond[:batch_size]

    return self.p_sample_loop(cond,
                              shape,
                              return_intermediates=return_intermediates, x_T=x_T,
                              verbose=verbose, timesteps=timesteps, quantize_denoised=quantize_denoised,
                              mask=mask, x0=x0)
1233
+
1234
@torch.no_grad()
def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs):
    """Sample with either a ``DDIMSampler`` or the model's own ``sample`` method.

    With ``ddim`` truthy a ``DDIMSampler`` is built around ``self`` and run
    for ``ddim_steps`` steps on a ``(channels, image_size, image_size)``
    shape; otherwise ``self.sample`` is called with
    ``return_intermediates=True``. Extra keyword arguments are forwarded to
    whichever sampler is used. Returns ``(samples, intermediates)``.
    """
    if not ddim:
        samples, intermediates = self.sample(cond=cond, batch_size=batch_size,
                                             return_intermediates=True, **kwargs)
        return samples, intermediates

    sampler = DDIMSampler(self)
    latent_shape = (self.channels, self.image_size, self.image_size)
    samples, intermediates = sampler.sample(ddim_steps, batch_size,
                                            latent_shape, cond, verbose=False, **kwargs)
    return samples, intermediates
1248
+
1249
+
1250
@torch.no_grad()
def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None,
               quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
               plot_diffusion_rows=True, **kwargs):
    """Build a dict of tensors visualising the current state of the model.

    Depending on the flags, the returned dict contains the inputs, their
    first-stage reconstruction, the conditioning, a forward-diffusion grid,
    plain samples, samples drawn with quantized x0, inpainting/outpainting
    samples and a progressive-denoising grid.

    Args:
        batch: batch handed to ``self.get_input``.
        N: maximum number of examples to log (capped by the batch size).
        n_row: number of examples used for the diffusion row.
        sample: draw samples (also gates the quantized and inpainting logs).
        ddim_steps: number of DDIM steps; ``None`` disables DDIM sampling.
        ddim_eta: eta forwarded to ``self.sample_log``.
        return_keys: optional iterable of keys to keep in the result.
        quantize_denoised: additionally sample with ``quantize_denoised=True``
            unless the first stage is ``AutoencoderKL``/``IdentityFirstStage``.
        inpaint: additionally log inpainting and outpainting samples.
        plot_denoise_rows: log the denoising grid of the plain samples.
        plot_progressive_rows: log a progressive-denoising grid.
        plot_diffusion_rows: log a forward-diffusion grid.

    Returns:
        dict mapping log names to tensors. When ``return_keys`` is given, only
        the requested keys that were actually produced are returned; if none
        of them was produced the full dict is returned.
    """
    use_ddim = ddim_steps is not None

    log = dict()
    z, c, x, xrec, xc = self.get_input(batch, self.first_stage_key,
                                       return_first_stage_outputs=True,
                                       force_c_encode=True,
                                       return_original_cond=True,
                                       bs=N)
    N = min(x.shape[0], N)
    n_row = min(x.shape[0], n_row)
    log["inputs"] = x
    log["reconstruction"] = xrec
    if self.model.conditioning_key is not None:
        if hasattr(self.cond_stage_model, "decode"):
            xc = self.cond_stage_model.decode(c)
            log["conditioning"] = xc
        elif self.cond_stage_key in ["caption"]:
            xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["caption"])
            log["conditioning"] = xc
        elif self.cond_stage_key == 'class_label':
            xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"])
            log['conditioning'] = xc
        elif isimage(xc):
            log["conditioning"] = xc
        if ismap(xc):
            log["original_conditioning"] = self.to_rgb(xc)

    if plot_diffusion_rows:
        # get diffusion row
        diffusion_row = list()
        z_start = z[:n_row]
        for t in range(self.num_timesteps):
            if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
                t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
                t = t.to(self.device).long()
                noise = torch.randn_like(z_start)
                z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
                diffusion_row.append(self.decode_first_stage(z_noisy))

        diffusion_row = torch.stack(diffusion_row)  # n_log_step, n_row, C, H, W
        diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
        diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
        diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
        log["diffusion_row"] = diffusion_grid

    if sample:
        # get denoise row
        with self.ema_scope("Plotting"):
            samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
                                                     ddim_steps=ddim_steps, eta=ddim_eta)
        x_samples = self.decode_first_stage(samples)
        log["samples"] = x_samples
        if plot_denoise_rows:
            denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
            log["denoise_row"] = denoise_grid

        if quantize_denoised and not isinstance(self.first_stage_model, AutoencoderKL) and not isinstance(
                self.first_stage_model, IdentityFirstStage):
            # also display when quantizing x0 while sampling
            with self.ema_scope("Plotting Quantized Denoised"):
                samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
                                                         ddim_steps=ddim_steps, eta=ddim_eta,
                                                         quantize_denoised=True)
            x_samples = self.decode_first_stage(samples.to(self.device))
            log["samples_x0_quantized"] = x_samples

        if inpaint:
            # make a simple center square
            h, w = z.shape[2], z.shape[3]
            mask = torch.ones(N, h, w).to(self.device)
            # zeros will be filled in
            mask[:, h // 4:3 * h // 4, w // 4:3 * w // 4] = 0.
            mask = mask[:, None, ...]
            with self.ema_scope("Plotting Inpaint"):
                samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, eta=ddim_eta,
                                             ddim_steps=ddim_steps, x0=z[:N], mask=mask)
            x_samples = self.decode_first_stage(samples.to(self.device))
            log["samples_inpainting"] = x_samples
            log["mask"] = mask

            # outpaint: invert the mask so the centre square is kept and the
            # border is regenerated. Re-using the inpainting mask here would
            # only repeat the inpainting experiment. A new tensor is created,
            # so log["mask"] keeps the inpainting mask.
            outpaint_mask = 1. - mask
            with self.ema_scope("Plotting Outpaint"):
                samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, eta=ddim_eta,
                                             ddim_steps=ddim_steps, x0=z[:N], mask=outpaint_mask)
            x_samples = self.decode_first_stage(samples.to(self.device))
            log["samples_outpainting"] = x_samples

    if plot_progressive_rows:
        with self.ema_scope("Plotting Progressives"):
            img, progressives = self.progressive_denoising(c,
                                                           shape=(self.channels, self.image_size, self.image_size),
                                                           batch_size=N)
        prog_row = self._get_denoise_row_from_list(progressives, desc="Progressive Generation")
        log["progressive_row"] = prog_row

    if return_keys:
        # Several entries are only produced under certain flags, so tolerate
        # requested keys that are absent instead of raising KeyError.
        selected = {key: log[key] for key in return_keys if key in log}
        if selected:
            return selected
    return log
1360
+
1361
def configure_optimizers(self):
    """Create the AdamW optimizer and, if enabled, a per-step LambdaLR scheduler.

    The optimized parameters are those of ``self.model``, plus the
    conditioner's parameters when ``self.cond_stage_trainable`` is set and
    ``self.logvar`` when ``self.learn_logvar`` is set.

    Returns:
        The optimizer alone, or ``([optimizer], [scheduler_dict])`` when
        ``self.use_scheduler`` is true.
    """
    trainable = list(self.model.parameters())
    if self.cond_stage_trainable:
        print(f"{self.__class__.__name__}: Also optimizing conditioner params!")
        trainable = trainable + list(self.cond_stage_model.parameters())
    if self.learn_logvar:
        print('Diffusion model optimizing logvar')
        trainable.append(self.logvar)

    optimizer = torch.optim.AdamW(trainable, lr=self.learning_rate)
    if not self.use_scheduler:
        return optimizer

    assert 'target' in self.scheduler_config
    schedule_holder = instantiate_from_config(self.scheduler_config)

    print("Setting up LambdaLR scheduler...")
    lr_schedulers = [
        {
            'scheduler': LambdaLR(optimizer, lr_lambda=schedule_holder.schedule),
            'interval': 'step',
            'frequency': 1,
        }
    ]
    return [optimizer], lr_schedulers
1384
+
1385
@torch.no_grad()
def to_rgb(self, x):
    """Project a multi-channel map to 3 channels and rescale it to [-1, 1].

    A random 1x1 convolution kernel is created on first use and cached on
    ``self.colorize`` so later calls reuse the same projection.

    Args:
        x: tensor whose dim 1 is the channel dimension (it is fed to
            ``conv2d``).

    Returns:
        Tensor with 3 channels, min-max normalised over the whole tensor.
    """
    x = x.float()
    if not hasattr(self, "colorize"):
        self.colorize = torch.randn(3, x.shape[1], 1, 1).to(x)
    x = nn.functional.conv2d(x, weight=self.colorize)
    lo, hi = x.min(), x.max()
    # A constant map has zero range; clamp the divisor so the result is a
    # constant -1 instead of NaN. Non-degenerate inputs are unaffected.
    span = torch.clamp(hi - lo, min=torch.finfo(x.dtype).tiny)
    x = 2. * (x - lo) / span - 1.
    return x
1393
+
1394
+
1395
class DiffusionWrapper(pl.LightningModule):
    """Wrap the diffusion network and route conditioning according to a key.

    ``conditioning_key`` selects how ``c_concat`` / ``c_crossattn`` are handed
    to the wrapped model: not at all (``None``), concatenated onto the input
    along dim 1 (``'concat'``), as ``context`` (``'crossattn'``), both
    (``'hybrid'``), or as ``y`` (``'adm'``).
    """

    def __init__(self, diff_model_config, conditioning_key):
        super().__init__()
        self.diffusion_model = instantiate_from_config(diff_model_config)
        self.conditioning_key = conditioning_key
        assert self.conditioning_key in [None, 'concat', 'crossattn', 'hybrid', 'adm']

    def forward(self, x, t, c_concat: list = None, c_crossattn: list = None):
        """Run the wrapped model on ``x`` at timestep ``t`` with the configured conditioning."""
        mode = self.conditioning_key
        if mode is None:
            return self.diffusion_model(x, t)
        if mode == 'concat':
            stacked = torch.cat([x] + c_concat, dim=1)
            return self.diffusion_model(stacked, t)
        if mode == 'crossattn':
            context = torch.cat(c_crossattn, 1)
            return self.diffusion_model(x, t, context=context)
        if mode == 'hybrid':
            stacked = torch.cat([x] + c_concat, dim=1)
            context = torch.cat(c_crossattn, 1)
            return self.diffusion_model(stacked, t, context=context)
        if mode == 'adm':
            # only the first cross-attention entry is used, passed as `y`
            return self.diffusion_model(x, t, y=c_crossattn[0])
        raise NotImplementedError()
1422
+
1423
+
1424
class Layout2ImgDiffusion(LatentDiffusion):
    """LatentDiffusion variant that also logs rendered bounding-box layouts."""

    # TODO: move all layout-specific hacks to this class
    def __init__(self, cond_stage_key, *args, **kwargs):
        assert cond_stage_key == 'coordinates_bbox', 'Layout2ImgDiffusion only for cond_stage_key="coordinates_bbox"'
        super().__init__(cond_stage_key=cond_stage_key, *args, **kwargs)

    def log_images(self, batch, N=8, *args, **kwargs):
        """Extend the parent's image log with a ``'bbox_image'`` entry.

        The first ``N`` conditioning entries of ``batch`` are plotted with the
        dataset's conditional builder (at a fixed size of 256x256) and stacked
        along dim 0.
        """
        # Forward batch and N positionally: passing them as keywords in front
        # of *args raised "got multiple values for argument 'batch'" as soon
        # as a caller supplied any extra positional argument.
        logs = super().log_images(batch, N, *args, **kwargs)

        key = 'train' if self.training else 'validation'
        dset = self.trainer.datamodule.datasets[key]
        mapper = dset.conditional_builders[self.cond_stage_key]

        def map_fn(catno):
            return dset.get_textual_label(dset.get_category_id(catno))

        bbox_imgs = [
            mapper.plot(tknzd_bbox.detach().cpu(), map_fn, (256, 256))
            for tknzd_bbox in batch[self.cond_stage_key][:N]
        ]

        logs['bbox_image'] = torch.stack(bbox_imgs, dim=0)
        return logs
ldmlib/models/diffusion/plms.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SAMPLING ONLY."""
2
+
3
+ import torch
4
+ import numpy as np
5
+ from tqdm import tqdm
6
+ from functools import partial
7
+
8
+ from ldmlib.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like
9
+
10
+
11
class PLMSSampler(object):
    """Sampler using the pseudo linear multistep (PLMS) update on a trained model.

    The wrapped ``model`` must expose ``num_timesteps``, ``device``, ``betas``,
    ``alphas_cumprod``, ``alphas_cumprod_prev``, ``apply_model`` and
    ``q_sample`` (all are read by the methods below).
    """

    def __init__(self, model, schedule="linear", **kwargs):
        super().__init__()
        self.model = model
        self.ddpm_num_timesteps = model.num_timesteps
        self.schedule = schedule

    def register_buffer(self, name, attr):
        """Store ``attr`` on the sampler, moving tensors to the model's device.

        The device used to be hard-coded to CUDA, which fails on hosts without
        a GPU; following ``self.model.device`` keeps the previous behaviour
        for CUDA models and also works on CPU.
        """
        if isinstance(attr, torch.Tensor):
            attr = attr.to(self.model.device)
        setattr(self, name, attr)

    def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
        """Compute and store the timestep subset and the alpha/sigma tables.

        Raises:
            ValueError: if ``ddim_eta`` is not 0.
        """
        if ddim_eta != 0:
            raise ValueError('ddim_eta must be 0 for PLMS')
        self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
                                                  num_ddpm_timesteps=self.ddpm_num_timesteps, verbose=verbose)
        alphas_cumprod = self.model.alphas_cumprod
        assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
        to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)

        self.register_buffer('betas', to_torch(self.model.betas))
        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
        self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))

        # ddim sampling parameters
        ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
                                                                                   ddim_timesteps=self.ddim_timesteps,
                                                                                   eta=ddim_eta, verbose=verbose)
        self.register_buffer('ddim_sigmas', ddim_sigmas)
        self.register_buffer('ddim_alphas', ddim_alphas)
        self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
        self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
        sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
            (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
                    1 - self.alphas_cumprod / self.alphas_cumprod_prev))
        self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)

    @torch.no_grad()
    def sample(self,
               S,
               batch_size,
               shape,
               conditioning=None,
               callback=None,
               normals_sequence=None,
               img_callback=None,
               quantize_x0=False,
               eta=0.,
               mask=None,
               x0=None,
               temperature=1.,
               noise_dropout=0.,
               score_corrector=None,
               corrector_kwargs=None,
               verbose=True,
               x_T=None,
               log_every_t=100,
               unconditional_guidance_scale=1.,
               unconditional_conditioning=None,
               # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
               **kwargs
               ):
        """Build the schedule for ``S`` steps and run PLMS sampling.

        Args:
            S: number of sampling steps.
            batch_size: number of samples to draw.
            shape: ``(C, H, W)`` of one sample.
            conditioning: tensor or dict of tensors; a warning is printed when
                its leading size differs from ``batch_size``.
            eta: forwarded to ``make_schedule``, which requires it to be 0.

        Returns:
            ``(samples, intermediates)`` as produced by ``plms_sampling``.
        """
        if conditioning is not None:
            if isinstance(conditioning, dict):
                cbs = conditioning[list(conditioning.keys())[0]].shape[0]
                if cbs != batch_size:
                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
            else:
                if conditioning.shape[0] != batch_size:
                    print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")

        self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)
        print(f'Data shape for PLMS sampling is {size}')

        samples, intermediates = self.plms_sampling(conditioning, size,
                                                    callback=callback,
                                                    img_callback=img_callback,
                                                    quantize_denoised=quantize_x0,
                                                    mask=mask, x0=x0,
                                                    ddim_use_original_steps=False,
                                                    noise_dropout=noise_dropout,
                                                    temperature=temperature,
                                                    score_corrector=score_corrector,
                                                    corrector_kwargs=corrector_kwargs,
                                                    x_T=x_T,
                                                    log_every_t=log_every_t,
                                                    unconditional_guidance_scale=unconditional_guidance_scale,
                                                    unconditional_conditioning=unconditional_conditioning,
                                                    )
        return samples, intermediates

    @torch.no_grad()
    def plms_sampling(self, cond, shape,
                      x_T=None, ddim_use_original_steps=False,
                      callback=None, timesteps=None, quantize_denoised=False,
                      mask=None, x0=None, img_callback=None, log_every_t=100,
                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
                      unconditional_guidance_scale=1., unconditional_conditioning=None,):
        """Iterate the PLMS update from noise (or ``x_T``) down to a sample.

        When ``mask`` is given, ``x0`` is required: at every step the image is
        blended so that positions where ``mask`` is 1 follow the noised ``x0``
        and positions where it is 0 are generated.

        Returns:
            ``(img, intermediates)`` where ``intermediates`` holds the lists
            ``'x_inter'`` and ``'pred_x0'``.
        """
        device = self.model.betas.device
        b = shape[0]
        if x_T is None:
            img = torch.randn(shape, device=device)
        else:
            img = x_T

        if timesteps is None:
            timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
        elif not ddim_use_original_steps:
            subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
            timesteps = self.ddim_timesteps[:subset_end]

        intermediates = {'x_inter': [img], 'pred_x0': [img]}
        time_range = list(reversed(range(0, timesteps))) if ddim_use_original_steps else np.flip(timesteps)
        total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
        print(f"Running PLMS Sampling with {total_steps} timesteps")

        iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps)
        # previous noise predictions, newest last; at most three are kept
        old_eps = []

        for i, step in enumerate(iterator):
            index = total_steps - i - 1
            ts = torch.full((b,), step, device=device, dtype=torch.long)
            ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long)

            if mask is not None:
                assert x0 is not None
                img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
                img = img_orig * mask + (1. - mask) * img

            outs = self.p_sample_plms(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
                                      quantize_denoised=quantize_denoised, temperature=temperature,
                                      noise_dropout=noise_dropout, score_corrector=score_corrector,
                                      corrector_kwargs=corrector_kwargs,
                                      unconditional_guidance_scale=unconditional_guidance_scale,
                                      unconditional_conditioning=unconditional_conditioning,
                                      old_eps=old_eps, t_next=ts_next)
            img, pred_x0, e_t = outs
            old_eps.append(e_t)
            if len(old_eps) >= 4:
                old_eps.pop(0)
            if callback: callback(i)
            if img_callback: img_callback(pred_x0, i)

            if index % log_every_t == 0 or index == total_steps - 1:
                intermediates['x_inter'].append(img)
                intermediates['pred_x0'].append(pred_x0)

        return img, intermediates

    @torch.no_grad()
    def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
                      unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None):
        """Perform one PLMS step.

        Returns:
            ``(x_prev, pred_x0, e_t)``: the next image, the current estimate
            of x0 and the raw model output at ``(x, t)``.
        """
        b, *_, device = *x.shape, x.device

        def get_model_output(x, t):
            if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
                e_t = self.model.apply_model(x, t, c)
            else:
                # run unconditional and conditional branches in one batch
                x_in = torch.cat([x] * 2)
                t_in = torch.cat([t] * 2)
                c_in = torch.cat([unconditional_conditioning, c])
                e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
                e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)

            if score_corrector is not None:
                assert self.model.parameterization == "eps"
                e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)

            return e_t

        alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
        alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
        # This buffer is registered on the sampler by make_schedule, so it is
        # read from self rather than from the model.
        sigmas = self.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas

        def get_x_prev_and_pred_x0(e_t, index):
            # select parameters corresponding to the currently considered timestep
            a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
            a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
            sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
            sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device)

            # current prediction for x_0
            pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
            if quantize_denoised:
                pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
            # direction pointing to x_t
            dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
            noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
            if noise_dropout > 0.:
                noise = torch.nn.functional.dropout(noise, p=noise_dropout)
            x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
            return x_prev, pred_x0

        e_t = get_model_output(x, t)
        if len(old_eps) == 0:
            # Pseudo Improved Euler (2nd order)
            x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
            e_t_next = get_model_output(x_prev, t_next)
            e_t_prime = (e_t + e_t_next) / 2
        elif len(old_eps) == 1:
            # 2nd order Pseudo Linear Multistep (Adams-Bashforth)
            e_t_prime = (3 * e_t - old_eps[-1]) / 2
        elif len(old_eps) == 2:
            # 3rd order Pseudo Linear Multistep (Adams-Bashforth)
            e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
        elif len(old_eps) >= 3:
            # 4th order Pseudo Linear Multistep (Adams-Bashforth)
            e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]) / 24

        x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)

        return x_prev, pred_x0, e_t
ldmlib/modules/__pycache__/attention.cpython-38.pyc ADDED
Binary file (8.78 kB). View file
 
ldmlib/modules/__pycache__/x_transformer.cpython-38.pyc ADDED
Binary file (18.3 kB). View file
 
ldmlib/modules/attention.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from inspect import isfunction
2
+ import math
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from torch import nn, einsum
6
+ from einops import rearrange, repeat
7
+
8
+ from ldmlib.modules.diffusionmodules.util import checkpoint
9
+
10
+
11
def exists(val):
    """Return True for any value other than ``None``."""
    return not (val is None)
13
+
14
+
15
def uniq(arr):
    """Return the distinct elements of ``arr`` as a dict-keys view, in first-seen order."""
    return dict.fromkeys(arr, True).keys()
17
+
18
+
19
def default(val, d):
    """Return ``val`` unless it is ``None``; otherwise fall back to ``d``.

    If ``d`` is a plain Python function it is called and its result is used.
    """
    if val is not None:
        return val
    if isfunction(d):
        return d()
    return d
23
+
24
+
25
def max_neg_value(t):
    """Return the most negative finite value representable in ``t``'s dtype."""
    largest = torch.finfo(t.dtype).max
    return -largest
27
+
28
+
29
def init_(tensor):
    """Fill ``tensor`` in place with U(-b, b), b = 1/sqrt(last dim size), and return it."""
    bound = 1 / math.sqrt(tensor.shape[-1])
    tensor.uniform_(-bound, bound)
    return tensor
34
+
35
+
36
+ # feedforward
37
class GEGLU(nn.Module):
    """GELU-gated linear unit: one projection split into a value half and a gate half."""

    def __init__(self, dim_in, dim_out):
        super().__init__()
        # a single linear layer produces both halves
        self.proj = nn.Linear(dim_in, dim_out * 2)

    def forward(self, x):
        projected = self.proj(x)
        value, gate = projected.chunk(2, dim=-1)
        return value * F.gelu(gate)
45
+
46
+
47
class FeedForward(nn.Module):
    """Feed-forward block: input projection (Linear+GELU or GEGLU), dropout, output projection.

    Args:
        dim: input feature size.
        dim_out: output feature size; falls back to ``dim``.
        mult: the hidden size is ``int(dim * mult)``.
        glu: use ``GEGLU`` instead of ``Linear`` + ``GELU`` for the input projection.
        dropout: dropout probability applied after the input projection.
    """

    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
        super().__init__()
        inner_dim = int(dim * mult)
        dim_out = default(dim_out, dim)

        if glu:
            project_in = GEGLU(dim, inner_dim)
        else:
            project_in = nn.Sequential(
                nn.Linear(dim, inner_dim),
                nn.GELU()
            )

        layers = [project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)
65
+
66
+
67
def zero_module(module):
    """
    Set every parameter of ``module`` to zero in place, then return the module.
    """
    for param in module.parameters():
        detached = param.detach()  # shares storage with the parameter
        detached.zero_()
    return module
74
+
75
+
76
def Normalize(in_channels):
    """Return an affine 32-group ``GroupNorm`` (eps=1e-6) over ``in_channels`` channels."""
    group_norm = torch.nn.GroupNorm(32, in_channels, eps=1e-6, affine=True)
    return group_norm
78
+
79
+
80
class LinearAttention(nn.Module):
    """Attention over 2D feature maps in which softmax is applied to the keys only.

    Args:
        dim: number of input/output channels.
        heads: number of attention heads.
        dim_head: channels per head.
    """

    def __init__(self, dim, heads=4, dim_head=32):
        super().__init__()
        self.heads = heads
        hidden_dim = dim_head * heads
        # one bias-free 1x1 conv yields q, k and v stacked on the channel axis
        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
        self.to_out = nn.Conv2d(hidden_dim, dim, 1)

    def forward(self, x):
        _, _, height, width = x.shape
        stacked = self.to_qkv(x)
        q, k, v = rearrange(stacked, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads = self.heads, qkv=3)
        k = k.softmax(dim=-1)
        # aggregate values against keys first, then read out with the queries
        context = torch.einsum('bhdn,bhen->bhde', k, v)
        attended = torch.einsum('bhde,bhdn->bhen', context, q)
        attended = rearrange(attended, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=height, w=width)
        return self.to_out(attended)
97
+
98
+
99
class SpatialSelfAttention(nn.Module):
    """Single-head self-attention over the spatial positions of a feature map, with a residual."""

    def __init__(self, in_channels):
        super().__init__()
        self.in_channels = in_channels

        self.norm = Normalize(in_channels)
        # q, k, v and the output projection are all channel-preserving 1x1 convs
        pointwise = dict(kernel_size=1, stride=1, padding=0)
        self.q = torch.nn.Conv2d(in_channels, in_channels, **pointwise)
        self.k = torch.nn.Conv2d(in_channels, in_channels, **pointwise)
        self.v = torch.nn.Conv2d(in_channels, in_channels, **pointwise)
        self.proj_out = torch.nn.Conv2d(in_channels, in_channels, **pointwise)

    def forward(self, x):
        normed = self.norm(x)
        q = self.q(normed)
        k = self.k(normed)
        v = self.v(normed)

        # compute attention
        b, c, h, w = q.shape
        q = rearrange(q, 'b c h w -> b (h w) c')
        k = rearrange(k, 'b c h w -> b c (h w)')
        scores = torch.einsum('bij,bjk->bik', q, k)

        scores = scores * (int(c)**(-0.5))
        scores = torch.nn.functional.softmax(scores, dim=2)

        # attend to values
        v = rearrange(v, 'b c h w -> b c (h w)')
        scores = rearrange(scores, 'b i j -> b j i')
        attended = torch.einsum('bij,bjk->bik', v, scores)
        attended = rearrange(attended, 'b c (h w) -> b c h w', h=h)
        attended = self.proj_out(attended)

        return x + attended
150
+
151
+
152
class CrossAttention(nn.Module):
    """Multi-head attention from ``x`` onto ``context`` (onto ``x`` itself when no context is given).

    Args:
        query_dim: feature size of the queries (and of the output).
        context_dim: feature size of the context; falls back to ``query_dim``.
        heads: number of attention heads.
        dim_head: feature size per head.
        dropout: dropout probability applied after the output projection.
    """

    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
        super().__init__()
        inner_dim = dim_head * heads
        context_dim = default(context_dim, query_dim)

        self.scale = dim_head ** -0.5
        self.heads = heads

        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, query_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x, context=None, mask=None):
        n_heads = self.heads

        q = self.to_q(x)
        context = default(context, x)
        k = self.to_k(context)
        v = self.to_v(context)

        # fold the heads into the batch dimension
        q, k, v = (rearrange(t, 'b n (h d) -> (b h) n d', h=n_heads) for t in (q, k, v))

        sim = einsum('b i d, b j d -> b i j', q, k) * self.scale

        if exists(mask):
            mask = rearrange(mask, 'b ... -> b (...)')
            fill_value = -torch.finfo(sim.dtype).max
            mask = repeat(mask, 'b j -> (b h) () j', h=n_heads)
            # positions where the mask is False are pushed to the most negative value
            sim.masked_fill_(~mask, fill_value)

        # attention, what we cannot get enough of
        attn = sim.softmax(dim=-1)

        out = einsum('b i j, b j d -> b i d', attn, v)
        out = rearrange(out, '(b h) n d -> b n (h d)', h=n_heads)
        return self.to_out(out)
194
+
195
+
196
class BasicTransformerBlock(nn.Module):
    """Pre-norm transformer block: self-attention, (cross-)attention, feed-forward, each with a residual.

    The ``checkpoint`` flag is stored and forwarded to the imported
    ``checkpoint`` helper on every call.
    """

    def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True):
        super().__init__()
        attn_kwargs = dict(heads=n_heads, dim_head=d_head, dropout=dropout)
        self.attn1 = CrossAttention(query_dim=dim, **attn_kwargs)  # is a self-attention
        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
        self.attn2 = CrossAttention(query_dim=dim, context_dim=context_dim,
                                    **attn_kwargs)  # is self-attn if context is none
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.norm3 = nn.LayerNorm(dim)
        self.checkpoint = checkpoint

    def forward(self, x, context=None):
        inputs = (x, context)
        return checkpoint(self._forward, inputs, self.parameters(), self.checkpoint)

    def _forward(self, x, context=None):
        self_attended = self.attn1(self.norm1(x)) + x
        cross_attended = self.attn2(self.norm2(self_attended), context=context) + self_attended
        return self.ff(self.norm3(cross_attended)) + cross_attended
216
+
217
+
218
class SpatialTransformer(nn.Module):
    """
    Transformer applied to image-like inputs.
    The input is normalised and projected with a 1x1 conv, flattened to
    (b, h*w, c), passed through the transformer blocks, reshaped back to
    (b, c, h, w), projected out and added to the original input.
    """
    def __init__(self, in_channels, n_heads, d_head,
                 depth=1, dropout=0., context_dim=None):
        super().__init__()
        self.in_channels = in_channels
        inner_dim = n_heads * d_head
        self.norm = Normalize(in_channels)

        self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)

        blocks = []
        for _ in range(depth):
            blocks.append(
                BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim)
            )
        self.transformer_blocks = nn.ModuleList(blocks)

        # the output projection is zeroed via zero_module
        out_conv = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
        self.proj_out = zero_module(out_conv)

    def forward(self, x, context=None):
        # note: if no context is given, cross-attention defaults to self-attention
        _, _, height, width = x.shape
        residual = x
        tokens = self.proj_in(self.norm(x))
        tokens = rearrange(tokens, 'b c h w -> b (h w) c')
        for block in self.transformer_blocks:
            tokens = block(tokens, context=context)
        feature_map = rearrange(tokens, 'b (h w) c -> b c h w', h=height, w=width)
        return self.proj_out(feature_map) + residual
ldmlib/modules/diffusionmodules/__init__.py ADDED
File without changes
ldmlib/modules/diffusionmodules/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (171 Bytes). View file
 
ldmlib/modules/diffusionmodules/__pycache__/model.cpython-38.pyc ADDED
Binary file (20.6 kB). View file
 
ldmlib/modules/diffusionmodules/__pycache__/util.cpython-38.pyc ADDED
Binary file (9.45 kB). View file
 
ldmlib/modules/diffusionmodules/model.py ADDED
@@ -0,0 +1,830 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pytorch_diffusion + derived encoder decoder
2
+ import math
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch.nn.functional import silu
6
+ import numpy as np
7
+ from einops import rearrange
8
+
9
+ from ldmlib.util import instantiate_from_config
10
+ from ldmlib.modules.attention import LinearAttention
11
+
12
+
13
def get_timestep_embedding(timesteps, embedding_dim):
    """
    This matches the implementation in Denoising Diffusion Probabilistic Models:
    From Fairseq.
    Build sinusoidal embeddings.
    This matches the implementation in tensor2tensor, but differs slightly
    from the description in Section 3.5 of "Attention Is All You Need".

    Args:
        timesteps: 1-D tensor of timesteps.
        embedding_dim: size of the embedding; odd sizes are zero-padded by one column.

    Returns:
        Float tensor of shape ``(len(timesteps), embedding_dim)`` with the sine
        half followed by the cosine half.
    """
    assert len(timesteps.shape) == 1

    half_dim = embedding_dim // 2
    # With half_dim == 1 (embedding_dim 2 or 3) the original divisor
    # half_dim - 1 is zero and raised ZeroDivisionError; a single frequency
    # of 1 is used there instead. Larger sizes are unchanged.
    emb = math.log(10000) / max(half_dim - 1, 1)
    emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
    emb = emb.to(device=timesteps.device)
    emb = timesteps.float()[:, None] * emb[None, :]
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
    if embedding_dim % 2 == 1:  # zero pad
        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
    return emb
32
+
33
+
34
def Normalize(in_channels, num_groups=32):
    """Return an affine ``GroupNorm`` (eps=1e-6) with ``num_groups`` groups over ``in_channels`` channels."""
    group_norm = torch.nn.GroupNorm(num_groups, in_channels, eps=1e-6, affine=True)
    return group_norm
36
+
37
+
38
class Upsample(nn.Module):
    """Double the spatial size with nearest-neighbour interpolation, optionally followed by a 3x3 conv."""

    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            self.conv = torch.nn.Conv2d(in_channels, in_channels,
                                        kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        upsampled = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
        if not self.with_conv:
            return upsampled
        return self.conv(upsampled)
54
+
55
+
56
class Downsample(nn.Module):
    """Halve the spatial size, either with a strided 3x3 conv or with 2x2 average pooling."""

    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            # no asymmetric padding in torch conv, must do it ourselves
            self.conv = torch.nn.Conv2d(in_channels, in_channels,
                                        kernel_size=3, stride=2, padding=0)

    def forward(self, x):
        if not self.with_conv:
            return torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
        # pad one zero column on the right and one zero row at the bottom
        padded = torch.nn.functional.pad(x, (0, 1, 0, 1), mode="constant", value=0)
        return self.conv(padded)
76
+
77
+
78
class ResnetBlock(nn.Module):
    """Residual block: two (norm, SiLU, 3x3 conv) stages with an optional timestep-embedding bias.

    When the channel count changes, the skip path is projected with a 3x3
    conv (``conv_shortcut=True``) or a 1x1 conv (default).
    """

    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
                 dropout, temb_channels=512):
        super().__init__()
        if out_channels is None:
            out_channels = in_channels
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.use_conv_shortcut = conv_shortcut

        conv3x3 = dict(kernel_size=3, stride=1, padding=1)

        self.norm1 = Normalize(in_channels)
        self.conv1 = torch.nn.Conv2d(in_channels, out_channels, **conv3x3)
        if temb_channels > 0:
            self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
        self.norm2 = Normalize(out_channels)
        self.dropout = torch.nn.Dropout(dropout)
        self.conv2 = torch.nn.Conv2d(out_channels, out_channels, **conv3x3)
        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, **conv3x3)
            else:
                self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels,
                                                    kernel_size=1, stride=1, padding=0)

    def forward(self, x, temb):
        hidden = self.conv1(silu(self.norm1(x)))

        if temb is not None:
            # broadcast the projected embedding over the two spatial dims
            hidden = hidden + self.temb_proj(silu(temb))[:, :, None, None]

        hidden = self.conv2(self.dropout(silu(self.norm2(hidden))))

        if self.in_channels == self.out_channels:
            return x + hidden
        shortcut = self.conv_shortcut if self.use_conv_shortcut else self.nin_shortcut
        return shortcut(x) + hidden
138
+
139
+
140
class LinAttnBlock(LinearAttention):
    """LinearAttention configured so its constructor matches AttnBlock usage."""

    def __init__(self, in_channels):
        # One head whose width equals the channel count.
        attn_kwargs = dict(dim=in_channels, heads=1, dim_head=in_channels)
        super().__init__(**attn_kwargs)
144
+
145
+
146
class AttnBlock(nn.Module):
    """Single-head self-attention over the flattened spatial positions.

    The input is normalised, projected to queries, keys and values by 1x1
    convolutions, attended over all ``h*w`` positions, projected back by a
    1x1 convolution and added to the input.
    """

    def __init__(self, in_channels):
        super().__init__()
        self.in_channels = in_channels

        self.norm = Normalize(in_channels)
        # q, k, v and the output projection are identical 1x1 convolutions,
        # created (and therefore registered) in this order.
        for name in ("q", "k", "v", "proj_out"):
            setattr(self, name, torch.nn.Conv2d(in_channels,
                                                in_channels,
                                                kernel_size=1,
                                                stride=1,
                                                padding=0))

    def forward(self, x):
        """Return ``x`` plus the projected attention output (same shape as ``x``)."""
        normed = self.norm(x)
        q, k, v = self.q(normed), self.k(normed), self.v(normed)

        # attention weights
        b, c, h, w = q.shape
        n_pos = h * w
        q = q.reshape(b, c, n_pos).permute(0, 2, 1)    # b,hw,c
        k = k.reshape(b, c, n_pos)                     # b,c,hw
        # scores[b,i,j] = sum_c q[b,i,c] k[b,c,j], scaled by c**-0.5
        scores = torch.bmm(q, k) * (int(c) ** (-0.5))
        weights = torch.nn.functional.softmax(scores, dim=2)

        # weighted sum of values
        v = v.reshape(b, c, n_pos)
        weights = weights.permute(0, 2, 1)             # b,hw,hw (key index first, query index second)
        # out[b,c,j] = sum_i v[b,c,i] weights[b,i,j]
        attended = torch.bmm(v, weights).reshape(b, c, h, w)

        return x + self.proj_out(attended)
199
+
200
+
201
def make_attn(in_channels, attn_type="vanilla"):
    """Build the attention module selected by ``attn_type``.

    ``"vanilla"`` gives an ``AttnBlock``, ``"linear"`` a ``LinAttnBlock`` and
    ``"none"`` an ``nn.Identity``. Any other value fails the assertion. The
    choice is also printed.
    """
    assert attn_type in ["vanilla", "linear", "none"], f'attn_type {attn_type} unknown'
    print(f"making attention of type '{attn_type}' with {in_channels} in_channels")
    if attn_type == "none":
        return nn.Identity(in_channels)
    if attn_type == "vanilla":
        return AttnBlock(in_channels)
    return LinAttnBlock(in_channels)
210
+
211
+
212
class Model(nn.Module):
    """Encoder/decoder network with skip connections and optional timestep conditioning.

    Keyword-only arguments:
        ch: base channel count; each level uses ``ch * ch_mult[level]``.
        out_ch: channel count produced by the final convolution.
        ch_mult: per-level channel multipliers; its length sets the number
            of resolution levels.
        num_res_blocks: ResnetBlocks per level on the way down (one more per
            level on the way up).
        attn_resolutions: container of resolutions at which attention is
            added after each ResnetBlock.
        dropout: dropout probability forwarded to every ResnetBlock.
        resamp_with_conv: forwarded to ``Downsample`` / ``Upsample``.
        in_channels: channel count expected by the first convolution.
        resolution: starting resolution used to track ``attn_resolutions``.
        use_timestep: whether a timestep embedding is computed and used.
        use_linear_attn: when true, forces ``attn_type`` to ``"linear"``.
        attn_type: attention variant passed to ``make_attn``.
    """

    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
                 resolution, use_timestep=True, use_linear_attn=False, attn_type="vanilla"):
        super().__init__()
        if use_linear_attn:
            attn_type = "linear"
        self.ch = ch
        self.temb_ch = self.ch * 4
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        self.use_timestep = use_timestep

        if self.use_timestep:
            # Two linear layers applied to the output of get_timestep_embedding.
            self.temb = nn.Module()
            self.temb.dense = nn.ModuleList([
                torch.nn.Linear(self.ch, self.temb_ch),
                torch.nn.Linear(self.temb_ch, self.temb_ch),
            ])

        # ---- downsampling path ----
        self.conv_in = torch.nn.Conv2d(
            in_channels, self.ch, kernel_size=3, stride=1, padding=1)

        last_level = self.num_resolutions - 1
        in_ch_mult = (1,) + tuple(ch_mult)
        curr_res = resolution
        self.down = nn.ModuleList()
        for i_level, (mult_in, mult_out) in enumerate(zip(in_ch_mult, ch_mult)):
            stage = nn.Module()
            stage.block = nn.ModuleList()
            stage.attn = nn.ModuleList()
            block_in = ch * mult_in
            block_out = ch * mult_out
            for _ in range(self.num_res_blocks):
                stage.block.append(ResnetBlock(in_channels=block_in,
                                               out_channels=block_out,
                                               temb_channels=self.temb_ch,
                                               dropout=dropout))
                block_in = block_out
                if curr_res in attn_resolutions:
                    stage.attn.append(make_attn(block_in, attn_type=attn_type))
            if i_level != last_level:
                stage.downsample = Downsample(block_in, resamp_with_conv)
                curr_res = curr_res // 2
            self.down.append(stage)

        # ---- middle ----
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=dropout)
        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
        self.mid.block_2 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=dropout)

        # ---- upsampling path ----
        self.up = nn.ModuleList()
        for i_level in range(last_level, -1, -1):
            stage = nn.Module()
            stage.block = nn.ModuleList()
            stage.attn = nn.ModuleList()
            block_out = ch * ch_mult[i_level]
            for i_block in range(self.num_res_blocks + 1):
                # The extra, final block of a level is sized with the
                # multiplier of the level's input instead of its output.
                is_extra_block = i_block == self.num_res_blocks
                skip_mult = in_ch_mult[i_level] if is_extra_block else ch_mult[i_level]
                skip_in = ch * skip_mult
                stage.block.append(ResnetBlock(in_channels=block_in + skip_in,
                                               out_channels=block_out,
                                               temb_channels=self.temb_ch,
                                               dropout=dropout))
                block_in = block_out
                if curr_res in attn_resolutions:
                    stage.attn.append(make_attn(block_in, attn_type=attn_type))
            if i_level != 0:
                stage.upsample = Upsample(block_in, resamp_with_conv)
                curr_res = curr_res * 2
            self.up.insert(0, stage)  # prepend so self.up[i] belongs to level i

        # ---- output head ----
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(
            block_in, out_ch, kernel_size=3, stride=1, padding=1)

    def forward(self, x, t=None, context=None):
        """Run the network on ``x``.

        ``context``, when given, is concatenated to ``x`` along dim 1 (it is
        assumed to be spatially aligned with ``x``). ``t`` must be provided
        when the model was built with ``use_timestep=True``.
        """
        # NOTE: the resolution check (x.shape[2] == x.shape[3] == self.resolution)
        # is intentionally not enforced.
        if context is not None:
            x = torch.cat((x, context), dim=1)

        temb = None
        if self.use_timestep:
            assert t is not None
            temb = get_timestep_embedding(t, self.ch)
            temb = self.temb.dense[1](silu(self.temb.dense[0](temb)))

        # downsampling: every intermediate activation is kept for the skips
        last_level = self.num_resolutions - 1
        hs = [self.conv_in(x)]
        for i_level in range(self.num_resolutions):
            stage = self.down[i_level]
            for i_block in range(self.num_res_blocks):
                h = stage.block[i_block](hs[-1], temb)
                if len(stage.attn) > 0:
                    h = stage.attn[i_block](h)
                hs.append(h)
            if i_level != last_level:
                hs.append(stage.downsample(hs[-1]))

        # middle
        h = self.mid.block_1(hs[-1], temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)

        # upsampling: consume the stored activations in reverse order
        for i_level in range(last_level, -1, -1):
            stage = self.up[i_level]
            for i_block in range(self.num_res_blocks + 1):
                h = stage.block[i_block](torch.cat([h, hs.pop()], dim=1), temb)
                if len(stage.attn) > 0:
                    h = stage.attn[i_block](h)
            if i_level != 0:
                h = stage.upsample(h)

        # output head
        return self.conv_out(silu(self.norm_out(h)))

    def get_last_layer(self):
        """Return the weight tensor of the final convolution."""
        return self.conv_out.weight
362
+
363
+
364
class Encoder(nn.Module):
    """Downsampling stack followed by a middle stage and an output convolution.

    No timestep embedding is used (``temb_ch`` is 0 and every ResnetBlock is
    called with ``temb=None``). The final convolution emits
    ``2 * z_channels`` channels when ``double_z`` is true, otherwise
    ``z_channels``. ``out_ch`` and any extra keyword arguments are accepted
    but not used.
    """

    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
                 resolution, z_channels, double_z=True, use_linear_attn=False, attn_type="vanilla",
                 **ignore_kwargs):
        super().__init__()
        if use_linear_attn:
            attn_type = "linear"
        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels

        # ---- downsampling path ----
        self.conv_in = torch.nn.Conv2d(
            in_channels, self.ch, kernel_size=3, stride=1, padding=1)

        last_level = self.num_resolutions - 1
        in_ch_mult = (1,) + tuple(ch_mult)
        self.in_ch_mult = in_ch_mult
        curr_res = resolution
        self.down = nn.ModuleList()
        for i_level, (mult_in, mult_out) in enumerate(zip(in_ch_mult, ch_mult)):
            stage = nn.Module()
            stage.block = nn.ModuleList()
            stage.attn = nn.ModuleList()
            block_in = ch * mult_in
            block_out = ch * mult_out
            for _ in range(self.num_res_blocks):
                stage.block.append(ResnetBlock(in_channels=block_in,
                                               out_channels=block_out,
                                               temb_channels=self.temb_ch,
                                               dropout=dropout))
                block_in = block_out
                if curr_res in attn_resolutions:
                    stage.attn.append(make_attn(block_in, attn_type=attn_type))
            if i_level != last_level:
                stage.downsample = Downsample(block_in, resamp_with_conv)
                curr_res = curr_res // 2
            self.down.append(stage)

        # ---- middle ----
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=dropout)
        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
        self.mid.block_2 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=dropout)

        # ---- output head ----
        latent_channels = 2 * z_channels if double_z else z_channels
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(
            block_in, latent_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        """Encode ``x`` and return the output of the final convolution."""
        temb = None  # no timestep conditioning in the encoder

        # downsampling
        last_level = self.num_resolutions - 1
        hs = [self.conv_in(x)]
        for i_level in range(self.num_resolutions):
            stage = self.down[i_level]
            for i_block in range(self.num_res_blocks):
                h = stage.block[i_block](hs[-1], temb)
                if len(stage.attn) > 0:
                    h = stage.attn[i_block](h)
                hs.append(h)
            if i_level != last_level:
                hs.append(stage.downsample(hs[-1]))

        # middle
        h = self.mid.block_1(hs[-1], temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)

        # output head
        return self.conv_out(silu(self.norm_out(h)))
456
+
457
+
458
class Decoder(nn.Module):
    """Middle stage followed by an upsampling stack and an output convolution.

    The input is expected to have ``z_channels`` channels. No timestep
    embedding is used. With ``give_pre_end`` the activation before the
    final norm/convolution is returned; with ``tanh_out`` the output goes
    through ``tanh``. ``in_channels`` is only stored, and extra keyword
    arguments are ignored.
    """

    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
                 resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False,
                 attn_type="vanilla", **ignorekwargs):
        super().__init__()
        if use_linear_attn:
            attn_type = "linear"
        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        self.give_pre_end = give_pre_end
        self.tanh_out = tanh_out

        # Channel count and resolution at the lowest level.
        top_level = self.num_resolutions - 1
        block_in = ch * ch_mult[top_level]
        curr_res = resolution // 2 ** top_level
        self.z_shape = (1, z_channels, curr_res, curr_res)
        print("Working with z of shape {} = {} dimensions.".format(
            self.z_shape, np.prod(self.z_shape)))

        # z to block_in
        self.conv_in = torch.nn.Conv2d(
            z_channels, block_in, kernel_size=3, stride=1, padding=1)

        # ---- middle ----
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=dropout)
        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
        self.mid.block_2 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=dropout)

        # ---- upsampling path ----
        self.up = nn.ModuleList()
        for i_level in range(top_level, -1, -1):
            stage = nn.Module()
            stage.block = nn.ModuleList()
            stage.attn = nn.ModuleList()
            block_out = ch * ch_mult[i_level]
            for _ in range(self.num_res_blocks + 1):
                stage.block.append(ResnetBlock(in_channels=block_in,
                                               out_channels=block_out,
                                               temb_channels=self.temb_ch,
                                               dropout=dropout))
                block_in = block_out
                if curr_res in attn_resolutions:
                    stage.attn.append(make_attn(block_in, attn_type=attn_type))
            if i_level != 0:
                stage.upsample = Upsample(block_in, resamp_with_conv)
                curr_res = curr_res * 2
            self.up.insert(0, stage)  # prepend so self.up[i] belongs to level i

        # ---- output head ----
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(
            block_in, out_ch, kernel_size=3, stride=1, padding=1)

    def forward(self, z):
        """Decode ``z``; also records ``z.shape`` in ``self.last_z_shape``."""
        # NOTE: z.shape[1:] is intentionally not checked against self.z_shape[1:].
        self.last_z_shape = z.shape

        temb = None  # no timestep conditioning in the decoder

        # z to block_in, then the middle stage
        h = self.conv_in(z)
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)

        # upsampling, from the lowest level up to level 0
        for i_level in range(self.num_resolutions - 1, -1, -1):
            stage = self.up[i_level]
            for i_block in range(self.num_res_blocks + 1):
                h = stage.block[i_block](h, temb)
                if len(stage.attn) > 0:
                    h = stage.attn[i_block](h)
            if i_level != 0:
                h = stage.upsample(h)

        if self.give_pre_end:
            return h

        # output head
        h = self.conv_out(silu(self.norm_out(h)))
        return torch.tanh(h) if self.tanh_out else h
565
+
566
+
567
class SimpleDecoder(nn.Module):
    """Small fixed decoder: 1x1 conv, three ResnetBlocks, 1x1 conv, one Upsample.

    The ResnetBlocks widen the features to ``2x`` and ``4x`` of
    ``in_channels`` and back to ``2x``; the second 1x1 convolution returns to
    ``in_channels`` before upsampling. A norm/silu/3x3-conv head then maps
    to ``out_channels``. Extra positional and keyword arguments are ignored.
    """

    def __init__(self, in_channels, out_channels, *args, **kwargs):
        super().__init__()
        double, quad = 2 * in_channels, 4 * in_channels
        layers = [
            nn.Conv2d(in_channels, in_channels, 1),
            ResnetBlock(in_channels=in_channels, out_channels=double,
                        temb_channels=0, dropout=0.0),
            ResnetBlock(in_channels=double, out_channels=quad,
                        temb_channels=0, dropout=0.0),
            ResnetBlock(in_channels=quad, out_channels=double,
                        temb_channels=0, dropout=0.0),
            nn.Conv2d(2*in_channels, in_channels, 1),
            Upsample(in_channels, with_conv=True),
        ]
        self.model = nn.ModuleList(layers)
        # output head
        self.norm_out = Normalize(in_channels)
        self.conv_out = torch.nn.Conv2d(
            in_channels, out_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        """Apply the layer stack and the output head to ``x``."""
        for idx, layer in enumerate(self.model):
            # Positions 1..3 hold the ResnetBlocks, which take an (unused) embedding.
            if 1 <= idx <= 3:
                x = layer(x, None)
            else:
                x = layer(x)

        return self.conv_out(silu(self.norm_out(x)))
601
+
602
+
603
class UpsampleDecoder(nn.Module):
    """Stack of ResnetBlock groups separated by learned 2x Upsample layers.

    Args:
        in_channels: channel count of the input.
        out_channels: channel count produced by the final convolution.
        ch: base channel count; level ``i`` uses ``ch * ch_mult[i]``.
        num_res_blocks: each level holds ``num_res_blocks + 1`` ResnetBlocks.
        resolution: accepted for interface compatibility; it does not
            influence any layer that is built.
        ch_mult: per-level channel multipliers; its length sets the number
            of levels. An Upsample follows every level except the last.
        dropout: dropout probability forwarded to every ResnetBlock.
    """

    def __init__(self, in_channels, out_channels, ch, num_res_blocks, resolution,
                 ch_mult=(2,2), dropout=0.0):
        super().__init__()
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks

        # The original tracked a `curr_res` value derived from `resolution`
        # that was never read; it has been dropped.
        block_in = in_channels
        self.res_blocks = nn.ModuleList()
        self.upsample_blocks = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            level_blocks = nn.ModuleList()
            block_out = ch * ch_mult[i_level]
            for _ in range(self.num_res_blocks + 1):
                level_blocks.append(ResnetBlock(in_channels=block_in,
                                                out_channels=block_out,
                                                temb_channels=self.temb_ch,
                                                dropout=dropout))
                block_in = block_out
            self.res_blocks.append(level_blocks)
            if i_level != self.num_resolutions - 1:
                self.upsample_blocks.append(Upsample(block_in, True))

        # output head
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(
            block_in, out_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        """Run every level's ResnetBlocks, upsampling between levels, then the head."""
        h = x
        last_level = self.num_resolutions - 1
        for i_level in range(self.num_resolutions):
            for i_block in range(self.num_res_blocks + 1):
                h = self.res_blocks[i_level][i_block](h, None)
            # upsample_blocks holds one entry per level except the last,
            # so it is indexed by the level itself.
            if i_level != last_level:
                h = self.upsample_blocks[i_level](h)
        h = self.norm_out(h)
        h = silu(h)
        h = self.conv_out(h)
        return h
649
+
650
+
651
class LatentRescaler(nn.Module):
    """Residual blocks, a spatial resize by ``factor``, attention, residual blocks.

    Args:
        factor: multiplier applied to the sizes of dims 2 and 3 (rounded to
            the nearest integer) when interpolating.
        in_channels: channel count of the input.
        mid_channels: channel count used by all internal blocks.
        out_channels: channel count produced by the final 1x1 convolution.
        depth: number of ResnetBlocks before and after the resize.
    """

    def __init__(self, factor, in_channels, mid_channels, out_channels, depth=2):
        super().__init__()
        self.factor = factor
        self.conv_in = nn.Conv2d(
            in_channels, mid_channels, kernel_size=3, stride=1, padding=1)

        def make_block():
            # Channel-preserving block without embedding projection or dropout.
            return ResnetBlock(in_channels=mid_channels,
                               out_channels=mid_channels,
                               temb_channels=0,
                               dropout=0.0)

        self.res_block1 = nn.ModuleList([make_block() for _ in range(depth)])
        self.attn = AttnBlock(mid_channels)
        self.res_block2 = nn.ModuleList([make_block() for _ in range(depth)])

        self.conv_out = nn.Conv2d(mid_channels, out_channels, kernel_size=1)

    def forward(self, x):
        """Return the rescaled features for ``x``."""
        x = self.conv_in(x)
        for block in self.res_block1:
            x = block(x, None)

        # Resize dims 2 and 3 to round(size * factor).
        new_h = int(round(x.shape[2] * self.factor))
        new_w = int(round(x.shape[3] * self.factor))
        x = torch.nn.functional.interpolate(x, size=(new_h, new_w))

        x = self.attn(x)
        for block in self.res_block2:
            x = block(x, None)
        return self.conv_out(x)
686
+
687
+
688
class MergedRescaleEncoder(nn.Module):
    """An ``Encoder`` followed by a ``LatentRescaler``.

    The encoder emits ``ch * ch_mult[-1]`` channels (``double_z=False``);
    the rescaler keeps that width internally and maps to ``out_ch``.
    """

    def __init__(self, in_channels, ch, resolution, out_ch, num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True,
                 ch_mult=(1,2,4,8), rescale_factor=1.0, rescale_module_depth=1):
        super().__init__()
        intermediate_chn = ch * ch_mult[-1]
        encoder_kwargs = dict(
            in_channels=in_channels, num_res_blocks=num_res_blocks, ch=ch, ch_mult=ch_mult,
            z_channels=intermediate_chn, double_z=False, resolution=resolution,
            attn_resolutions=attn_resolutions, dropout=dropout, resamp_with_conv=resamp_with_conv,
            out_ch=None,  # required by Encoder's signature
        )
        self.encoder = Encoder(**encoder_kwargs)
        self.rescaler = LatentRescaler(
            factor=rescale_factor, in_channels=intermediate_chn,
            mid_channels=intermediate_chn, out_channels=out_ch, depth=rescale_module_depth)

    def forward(self, x):
        """Encode ``x`` and rescale the result."""
        return self.rescaler(self.encoder(x))
705
+
706
+
707
class MergedRescaleDecoder(nn.Module):
    """A ``LatentRescaler`` followed by a ``Decoder``.

    The rescaler maps ``z_channels`` to ``z_channels * ch_mult[-1]``
    channels, which is the latent width the decoder is built for.
    """

    def __init__(self, z_channels, out_ch, resolution, num_res_blocks, attn_resolutions, ch, ch_mult=(1,2,4,8),
                 dropout=0.0, resamp_with_conv=True, rescale_factor=1.0, rescale_module_depth=1):
        super().__init__()
        tmp_chn = z_channels * ch_mult[-1]
        decoder_kwargs = dict(
            out_ch=out_ch, z_channels=tmp_chn, attn_resolutions=attn_resolutions, dropout=dropout,
            resamp_with_conv=resamp_with_conv, in_channels=None, num_res_blocks=num_res_blocks,
            ch_mult=ch_mult, resolution=resolution, ch=ch,
        )
        # The decoder is created before the rescaler.
        self.decoder = Decoder(**decoder_kwargs)
        self.rescaler = LatentRescaler(
            factor=rescale_factor, in_channels=z_channels, mid_channels=tmp_chn,
            out_channels=tmp_chn, depth=rescale_module_depth)

    def forward(self, x):
        """Rescale ``x`` and decode the result."""
        return self.decoder(self.rescaler(x))
722
+
723
+
724
class Upsampler(nn.Module):
    """A ``LatentRescaler`` followed by a ``Decoder`` sized from ``in_size``/``out_size``.

    The decoder gets ``int(log2(out_size // in_size)) + 1`` levels, all with
    the same channel multiplier ``ch_mult``. The rescaler factor is
    ``1 + (out_size % in_size)``. Requires ``out_size >= in_size``.
    """

    def __init__(self, in_size, out_size, in_channels, out_channels, ch_mult=2):
        super().__init__()
        assert out_size >= in_size
        num_blocks = int(np.log2(out_size//in_size))+1
        factor_up = 1.+ (out_size % in_size)
        print(f"Building {self.__class__.__name__} with in_size: {in_size} --> out_size {out_size} and factor {factor_up}")
        self.rescaler = LatentRescaler(
            factor=factor_up,
            in_channels=in_channels,
            mid_channels=2*in_channels,
            out_channels=in_channels,
        )
        level_multipliers = [ch_mult] * num_blocks
        self.decoder = Decoder(
            out_ch=out_channels,
            resolution=out_size,
            z_channels=in_channels,
            num_res_blocks=2,
            attn_resolutions=[],
            in_channels=None,
            ch=in_channels,
            ch_mult=level_multipliers,
        )

    def forward(self, x):
        """Rescale ``x`` and decode the result."""
        return self.decoder(self.rescaler(x))
741
+
742
+
743
class Resize(nn.Module):
    """Resize a tensor by a scale factor using fixed interpolation.

    Args:
        in_channels: only relevant to the learned variant, which is not
            implemented.
        learned: requesting learned resizing prints a note and raises
            ``NotImplementedError``.
        mode: interpolation mode passed to
            ``torch.nn.functional.interpolate``.

    Raises:
        NotImplementedError: if ``learned`` is true.
    """

    def __init__(self, in_channels=None, learned=False, mode="bilinear"):
        super().__init__()
        self.with_conv = learned
        self.mode = mode
        if self.with_conv:
            # Fixed: this previously read `__name`, which Python mangles to
            # `_Resize__name` inside the class body, so an AttributeError
            # masked the intended NotImplementedError.
            print(f"Note: {self.__class__.__name__} uses learned downsampling and will ignore the fixed {mode} mode")
            # The learned path was never reachable. The dead code after this
            # raise built a Conv2d(in_channels, in_channels, kernel_size=4,
            # stride=2, padding=1) and has been removed.
            raise NotImplementedError()

    def forward(self, x, scale_factor=1.0):
        """Return ``x`` itself for a factor of 1.0, otherwise the interpolated tensor."""
        if scale_factor==1.0:
            return x
        else:
            x = torch.nn.functional.interpolate(x, mode=self.mode, align_corners=False, scale_factor=scale_factor)
        return x
765
+
766
class FirstStagePostProcessor(nn.Module):
    """Post-process the encoding of a pretrained model with ResnetBlocks and pooling.

    Either ``pretrained_model`` or ``pretrained_config`` must be given; a
    config takes precedence and is instantiated in eval mode with all
    parameters frozen. The encoding is normalised, projected to
    ``n_channels`` and passed through one ResnetBlock plus one pooling
    ``Downsample`` per entry of ``ch_mult``.
    """

    def __init__(self, ch_mult:list, in_channels,
                 pretrained_model:nn.Module=None,
                 reshape=False,
                 n_channels=None,
                 dropout=0.,
                 pretrained_config=None):
        super().__init__()
        if pretrained_config is not None:
            self.instantiate_pretrained(pretrained_config)
        else:
            assert pretrained_model is not None, 'Either "pretrained_model" or "pretrained_config" must not be None'
            self.pretrained_model = pretrained_model

        self.do_reshape = reshape

        if n_channels is None:
            n_channels = self.pretrained_model.encoder.ch

        self.proj_norm = Normalize(in_channels, num_groups=in_channels//2)
        self.proj = nn.Conv2d(in_channels, n_channels, kernel_size=3,
                              stride=1, padding=1)

        self.model = nn.ModuleList()
        self.downsampler = nn.ModuleList()
        ch_in = n_channels
        for mult in ch_mult:
            ch_out = mult * n_channels
            self.model.append(ResnetBlock(in_channels=ch_in, out_channels=ch_out, dropout=dropout))
            ch_in = ch_out
            self.downsampler.append(Downsample(ch_in, with_conv=False))

    def instantiate_pretrained(self, config):
        """Build the pretrained model from ``config``, switch it to eval mode and freeze it."""
        self.pretrained_model = instantiate_from_config(config).eval()
        for frozen in self.pretrained_model.parameters():
            frozen.requires_grad = False

    @torch.no_grad()
    def encode_with_pretrained(self,x):
        """Encode ``x`` without gradients; a Gaussian posterior is reduced to its mode."""
        encoded = self.pretrained_model.encode(x)
        if isinstance(encoded, DiagonalGaussianDistribution):
            return encoded.mode()
        return encoded

    def forward(self,x):
        """Return the processed encoding, flattened to ``b (h w) c`` when ``do_reshape`` is set."""
        z = self.encode_with_pretrained(x)
        z = silu(self.proj(self.proj_norm(z)))

        for res_block, pool in zip(self.model, self.downsampler):
            z = pool(res_block(z, temb=None))

        if self.do_reshape:
            z = rearrange(z,'b c h w -> b (h w) c')
        return z
ldmlib/modules/diffusionmodules/openaimodel.py ADDED
@@ -0,0 +1,960 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import abstractmethod
2
+ from functools import partial
3
+ import math
4
+ from typing import Iterable
5
+
6
+ import numpy as np
7
+ import torch as th
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+ from ldmlib.modules.diffusionmodules.util import (
12
+ checkpoint,
13
+ conv_nd,
14
+ linear,
15
+ avg_pool_nd,
16
+ zero_module,
17
+ normalization,
18
+ timestep_embedding,
19
+ )
20
+ from ldmlib.modules.attention import SpatialTransformer
21
+
22
+
23
+ # dummy replace
24
def convert_module_to_f16(x):
    """Placeholder that does nothing with ``x`` and returns ``None``."""
    return None
26
+
27
def convert_module_to_f32(x):
    """Placeholder that does nothing with ``x`` and returns ``None``."""
    return None
29
+
30
+
31
+ ## go
32
class AttentionPool2d(nn.Module):
    """
    Attention-based pooling, adapted from CLIP:
    https://github.com/openai/CLIP/blob/main/clip/model.py

    The flattened input is prefixed with its mean over positions, a learned
    positional embedding is added, and QKV attention is applied. The output
    is the attended value at the prefixed (index 0) position.
    """

    def __init__(
        self,
        spacial_dim: int,
        embed_dim: int,
        num_heads_channels: int,
        output_dim: int = None,
    ):
        super().__init__()
        # One embedding column per spatial position plus one for the mean token.
        n_tokens = spacial_dim ** 2 + 1
        self.positional_embedding = nn.Parameter(th.randn(embed_dim, n_tokens) / embed_dim ** 0.5)
        self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
        self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
        self.num_heads = embed_dim // num_heads_channels
        self.attention = QKVAttention(self.num_heads)

    def forward(self, x):
        """Pool ``x`` (batch, channels, *spatial) down to one vector per batch item."""
        b, c, *_spatial = x.shape
        tokens = x.reshape(b, c, -1)  # NC(HW)
        mean_token = tokens.mean(dim=-1, keepdim=True)
        tokens = th.cat([mean_token, tokens], dim=-1)  # NC(HW+1)
        tokens = tokens + self.positional_embedding[None, :, :].to(tokens.dtype)  # NC(HW+1)
        attended = self.attention(self.qkv_proj(tokens))
        projected = self.c_proj(attended)
        return projected[:, :, 0]
60
+
61
+
62
class TimestepBlock(nn.Module):
    """
    Marker base class for modules whose forward() accepts timestep
    embeddings as its second argument.
    """

    @abstractmethod
    def forward(self, x, emb):
        """
        Apply the module to `x`, conditioned on the timestep embeddings `emb`.
        """
72
+
73
+
74
class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
    """
    Sequential container that routes extra inputs to the children that take
    them: TimestepBlock children receive `emb`, SpatialTransformer children
    receive `context`, and all other children receive only `x`.
    """

    def forward(self, x, emb, context=None):
        for layer in self:
            # Pick the single extra argument (if any) this child expects.
            if isinstance(layer, TimestepBlock):
                extra = (emb,)
            elif isinstance(layer, SpatialTransformer):
                extra = (context,)
            else:
                extra = ()
            x = layer(x, *extra)
        return x
89
+
90
+
91
class Upsample(nn.Module):
    """
    Nearest-neighbour 2x upsampling, optionally followed by a convolution.

    :param channels: channels expected in the input.
    :param use_conv: if true, a 3-wide convolution is applied after upsampling.
    :param dims: dimensionality of the signal (1, 2 or 3). For 3, only the
                 two innermost dimensions are upsampled.
    :param out_channels: output channels of the convolution; defaults to
                 `channels`.
    :param padding: padding of the convolution.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        if use_conv:
            self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=padding)

    def forward(self, x):
        assert x.shape[1] == self.channels
        if self.dims == 3:
            # Keep dim 2 as is and double dims 3 and 4.
            target_size = (x.shape[2], x.shape[3] * 2, x.shape[4] * 2)
            upsampled = F.interpolate(x, target_size, mode="nearest")
        else:
            upsampled = F.interpolate(x, scale_factor=2, mode="nearest")
        if not self.use_conv:
            return upsampled
        return self.conv(upsampled)
120
+
121
class TransposedUpsample(nn.Module):
    'Learned upsampling: a stride-2 transposed convolution without padding'
    def __init__(self, channels, out_channels=None, ks=5):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels

        self.up = nn.ConvTranspose2d(
            self.channels,
            self.out_channels,
            kernel_size=ks,
            stride=2,
        )

    def forward(self,x):
        upsampled = self.up(x)
        return upsampled
132
+
133
+
134
class Downsample(nn.Module):
    """
    Stride-2 downsampling by either a convolution or average pooling.

    :param channels: channels expected in the input.
    :param use_conv: if true, a 3-wide strided convolution is used;
                 otherwise average pooling (which requires
                 `out_channels` to equal `channels`).
    :param dims: dimensionality of the signal (1, 2 or 3). For 3, only the
                 two innermost dimensions are downsampled.
    :param out_channels: output channels; defaults to `channels`.
    :param padding: padding of the convolution.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None,padding=1):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        # For 3D signals the first of the three dimensions keeps its size.
        stride = (1, 2, 2) if dims == 3 else 2
        if not use_conv:
            assert self.channels == self.out_channels
            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
        else:
            self.op = conv_nd(
                dims, self.channels, self.out_channels, 3, stride=stride, padding=padding
            )

    def forward(self, x):
        assert x.shape[1] == self.channels
        downsampled = self.op(x)
        return downsampled
161
+
162
+
163
+ class ResBlock(TimestepBlock):
164
+ """
165
+ A residual block that can optionally change the number of channels.
166
+ :param channels: the number of input channels.
167
+ :param emb_channels: the number of timestep embedding channels.
168
+ :param dropout: the rate of dropout.
169
+ :param out_channels: if specified, the number of out channels.
170
+ :param use_conv: if True and out_channels is specified, use a spatial
171
+ convolution instead of a smaller 1x1 convolution to change the
172
+ channels in the skip connection.
173
+ :param dims: determines if the signal is 1D, 2D, or 3D.
174
+ :param use_checkpoint: if True, use gradient checkpointing on this module.
175
+ :param up: if True, use this block for upsampling.
176
+ :param down: if True, use this block for downsampling.
177
+ """
178
+
179
def __init__(
    self,
    channels,
    emb_channels,
    dropout,
    out_channels=None,
    use_conv=False,
    use_scale_shift_norm=False,
    dims=2,
    use_checkpoint=False,
    up=False,
    down=False,
):
    """
    Build the sub-layers of the residual block.
    :param channels: the number of input channels.
    :param emb_channels: the number of timestep embedding channels.
    :param dropout: the rate of dropout.
    :param out_channels: if specified, the number of out channels;
                         otherwise `channels` is used.
    :param use_conv: if True and the channel count changes, the skip
                     connection uses a 3x3 convolution instead of a 1x1.
    :param use_scale_shift_norm: if True, the embedding projection emits
                     2 * out_channels values (a scale and a shift).
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param use_checkpoint: stored and forwarded to `checkpoint` in forward().
    :param up: if True, use this block for upsampling.
    :param down: if True, use this block for downsampling (only checked
                 when `up` is False).
    """
    super().__init__()
    self.channels = channels
    self.emb_channels = emb_channels
    self.dropout = dropout
    # A falsy out_channels (None or 0) falls back to the input channel count.
    self.out_channels = out_channels or channels
    self.use_conv = use_conv
    self.use_checkpoint = use_checkpoint
    self.use_scale_shift_norm = use_scale_shift_norm

    # norm -> SiLU -> 3x3 conv; _forward splits off the final conv when
    # resampling so that the resample happens between activation and conv.
    self.in_layers = nn.Sequential(
        normalization(channels),
        nn.SiLU(),
        conv_nd(dims, channels, self.out_channels, 3, padding=1),
    )

    self.updown = up or down

    # Separate resamplers for the residual branch (h) and the skip input (x).
    if up:
        self.h_upd = Upsample(channels, False, dims)
        self.x_upd = Upsample(channels, False, dims)
    elif down:
        self.h_upd = Downsample(channels, False, dims)
        self.x_upd = Downsample(channels, False, dims)
    else:
        self.h_upd = self.x_upd = nn.Identity()

    # Projects the timestep embedding to per-channel values.
    self.emb_layers = nn.Sequential(
        nn.SiLU(),
        linear(
            emb_channels,
            2 * self.out_channels if use_scale_shift_norm else self.out_channels,
        ),
    )
    # The last conv is passed through zero_module before use.
    self.out_layers = nn.Sequential(
        normalization(self.out_channels),
        nn.SiLU(),
        nn.Dropout(p=dropout),
        zero_module(
            conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)
        ),
    )

    # Skip path: identity when channel counts match, else a 3x3 or 1x1 conv.
    if self.out_channels == channels:
        self.skip_connection = nn.Identity()
    elif use_conv:
        self.skip_connection = conv_nd(
            dims, channels, self.out_channels, 3, padding=1
        )
    else:
        self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
242
+
243
def forward(self, x, emb):
    """
    Run the block on a feature Tensor, conditioned on a timestep embedding.
    The work is delegated to `_forward` through `checkpoint`, with
    `self.use_checkpoint` as the checkpointing flag.
    :param x: an [N x C x ...] Tensor of features.
    :param emb: an [N x emb_channels] Tensor of timestep embeddings.
    :return: an [N x C x ...] Tensor of outputs.
    """
    inputs = (x, emb)
    return checkpoint(self._forward, inputs, self.parameters(), self.use_checkpoint)
253
+
254
+
255
+ def _forward(self, x, emb):
256
+ if self.updown:
257
+ in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
258
+ h = in_rest(x)
259
+ h = self.h_upd(h)
260
+ x = self.x_upd(x)
261
+ h = in_conv(h)
262
+ else:
263
+ h = self.in_layers(x)
264
+ emb_out = self.emb_layers(emb).type(h.dtype)
265
+ while len(emb_out.shape) < len(h.shape):
266
+ emb_out = emb_out[..., None]
267
+ if self.use_scale_shift_norm:
268
+ out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
269
+ scale, shift = th.chunk(emb_out, 2, dim=1)
270
+ h = out_norm(h) * (1 + scale) + shift
271
+ h = out_rest(h)
272
+ else:
273
+ h = h + emb_out
274
+ h = self.out_layers(h)
275
+ return self.skip_connection(x) + h
276
+
277
+
278
class AttentionBlock(nn.Module):
    """
    Self-attention across the (flattened) spatial positions of a feature map.
    Originally ported from here, but adapted to the N-d case.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
    """

    def __init__(
        self,
        channels,
        num_heads=1,
        num_head_channels=-1,
        use_checkpoint=False,
        use_new_attention_order=False,
    ):
        """
        :param channels: number of channels of the input features.
        :param num_heads: number of heads, used when num_head_channels is -1.
        :param num_head_channels: if not -1, the per-head width; the head
                                  count becomes channels // num_head_channels.
        :param use_checkpoint: stored on the module (forward() currently
                               always checkpoints regardless of this flag).
        :param use_new_attention_order: pick QKVAttention over
                                        QKVAttentionLegacy.
        """
        super().__init__()
        self.channels = channels
        if num_head_channels != -1:
            assert (
                channels % num_head_channels == 0
            ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
            self.num_heads = channels // num_head_channels
        else:
            self.num_heads = num_heads
        self.use_checkpoint = use_checkpoint
        self.norm = normalization(channels)
        self.qkv = conv_nd(1, channels, channels * 3, 1)
        # New order splits qkv before splitting heads; legacy splits heads
        # before splitting qkv.
        attention_cls = QKVAttention if use_new_attention_order else QKVAttentionLegacy
        self.attention = attention_cls(self.num_heads)

        self.proj_out = zero_module(conv_nd(1, channels, channels, 1))

    def forward(self, x):
        # TODO: check checkpoint usage, is True
        # TODO: fix the .half call!!!
        # (alternative kept from the original: pt_checkpoint(self._forward, x))
        return checkpoint(self._forward, (x,), self.parameters(), True)

    def _forward(self, x):
        """Flatten spatial dims, attend, project, add residual, restore shape."""
        batch, chans, *spatial = x.shape
        flat = x.reshape(batch, chans, -1)
        attended = self.attention(self.qkv(self.norm(flat)))
        projected = self.proj_out(attended)
        return (flat + projected).reshape(batch, chans, *spatial)
325
+
326
+
327
def count_flops_attn(model, _x, y):
    """
    Operation counter for attention modules, for use with the `thop` package.
    Meant to be used like:
        macs, params = thop.profile(
            model,
            inputs=(inputs, timestamps),
            custom_ops={QKVAttention: QKVAttention.count_flops},
        )
    Adds the estimated op count to `model.total_ops` in place.
    """
    batch, channels, *spatial = y[0].shape
    positions = int(np.prod(spatial))
    # Two equally sized matmuls: one builds the attention weights, the other
    # combines the value vectors with them.
    ops = 2 * batch * (positions ** 2) * channels
    model.total_ops += th.DoubleTensor([ops])
345
+
346
+
347
class QKVAttentionLegacy(nn.Module):
    """
    A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping
    """

    def __init__(self, n_heads):
        super().__init__()
        self.n_heads = n_heads

    def forward(self, qkv):
        """
        Apply QKV attention (heads are split first, then q/k/v).
        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
        """
        bs, width, length = qkv.shape
        heads = self.n_heads
        assert width % (3 * self.n_heads) == 0
        ch = width // (3 * heads)
        per_head = qkv.reshape(bs * heads, ch * 3, length)
        q, k, v = per_head.split(ch, dim=1)
        # Scaling q and k separately is more stable with f16 than dividing
        # the product afterwards.
        scale = 1 / math.sqrt(math.sqrt(ch))
        weight = th.einsum("bct,bcs->bts", q * scale, k * scale)
        # Softmax is evaluated in float32 and cast back.
        weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
        attended = th.einsum("bts,bcs->bct", weight, v)
        return attended.reshape(bs, -1, length)

    @staticmethod
    def count_flops(model, _x, y):
        return count_flops_attn(model, _x, y)
377
+
378
+
379
class QKVAttention(nn.Module):
    """
    A module which performs QKV attention and splits in a different order.
    """

    def __init__(self, n_heads):
        super().__init__()
        self.n_heads = n_heads

    def forward(self, qkv):
        """
        Apply QKV attention (q/k/v are split first, then heads).
        :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
        """
        bs, width, length = qkv.shape
        assert width % (3 * self.n_heads) == 0
        ch = width // (3 * self.n_heads)
        head_shape = (bs * self.n_heads, ch, length)
        q, k, v = qkv.chunk(3, dim=1)
        # Scaling q and k separately is more stable with f16 than dividing
        # the product afterwards.
        scale = 1 / math.sqrt(math.sqrt(ch))
        scaled_q = (q * scale).view(*head_shape)
        scaled_k = (k * scale).view(*head_shape)
        weight = th.einsum("bct,bcs->bts", scaled_q, scaled_k)
        # Softmax is evaluated in float32 and cast back.
        weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
        attended = th.einsum("bts,bcs->bct", weight, v.reshape(*head_shape))
        return attended.reshape(bs, -1, length)

    @staticmethod
    def count_flops(model, _x, y):
        return count_flops_attn(model, _x, y)
411
+
412
+
413
class UNetModel(nn.Module):
    """
    The full UNet model with attention and timestep embedding.
    :param image_size: stored on the module as `self.image_size`; not
        otherwise used in this class.
    :param in_channels: channels in the input Tensor.
    :param model_channels: base channel count for the model.
    :param out_channels: channels in the output Tensor.
    :param num_res_blocks: number of residual blocks per downsample.
    :param attention_resolutions: a collection of downsample rates at which
        attention will take place. May be a set, list, or tuple.
        For example, if this contains 4, then at 4x downsampling, attention
        will be used.
    :param dropout: the dropout probability.
    :param channel_mult: channel multiplier for each level of the UNet.
    :param conv_resample: if True, use learned convolutions for upsampling and
        downsampling.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param num_classes: if specified (as an int), then this model will be
        class-conditional with `num_classes` classes.
    :param use_checkpoint: use gradient checkpointing to reduce memory usage.
    :param use_fp16: if True, `self.dtype` is float16 (inputs are cast to it
        in forward()); otherwise float32.
    :param num_heads: the number of attention heads in each attention layer.
    :param num_head_channels: if specified, ignore num_heads and instead use
                              a fixed channel width per attention head.
    :param num_heads_upsample: works with num_heads to set a different number
                               of heads for upsampling. Deprecated.
    :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
    :param resblock_updown: use residual blocks for up/downsampling.
    :param use_new_attention_order: use a different attention pattern for potentially
                                    increased efficiency.
    :param use_spatial_transformer: build SpatialTransformer layers instead of
        AttentionBlock layers; requires `context_dim`.
    :param transformer_depth: `depth` passed to each SpatialTransformer.
    :param context_dim: `context_dim` passed to each SpatialTransformer;
        requires `use_spatial_transformer`.
    :param n_embed: if not None, an `id_predictor` head with `n_embed` output
        channels is built and used as the model output.
    :param legacy: if True, `dim_head` is recomputed before each attention
        layer (see the `if legacy:` branches below).
    """

    def __init__(
        self,
        image_size,
        in_channels,
        model_channels,
        out_channels,
        num_res_blocks,
        attention_resolutions,
        dropout=0,
        channel_mult=(1, 2, 4, 8),
        conv_resample=True,
        dims=2,
        num_classes=None,
        use_checkpoint=False,
        use_fp16=False,
        num_heads=-1,
        num_head_channels=-1,
        num_heads_upsample=-1,
        use_scale_shift_norm=False,
        resblock_updown=False,
        use_new_attention_order=False,
        use_spatial_transformer=False,    # custom transformer support
        transformer_depth=1,              # custom transformer support
        context_dim=None,                 # custom transformer support
        n_embed=None,                     # custom support for prediction of discrete ids into codebook of first stage vq model
        legacy=True,
    ):
        super().__init__()
        # use_spatial_transformer and context_dim must be given together.
        if use_spatial_transformer:
            assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'

        if context_dim is not None:
            assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
            from omegaconf.listconfig import ListConfig
            # Convert an omegaconf ListConfig into a plain Python list.
            if type(context_dim) == ListConfig:
                context_dim = list(context_dim)

        if num_heads_upsample == -1:
            num_heads_upsample = num_heads

        # At least one of num_heads / num_head_channels must be provided.
        if num_heads == -1:
            assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'

        if num_head_channels == -1:
            assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'

        self.image_size = image_size
        self.in_channels = in_channels
        self.model_channels = model_channels
        self.out_channels = out_channels
        self.num_res_blocks = num_res_blocks
        self.attention_resolutions = attention_resolutions
        self.dropout = dropout
        self.channel_mult = channel_mult
        self.conv_resample = conv_resample
        self.num_classes = num_classes
        self.use_checkpoint = use_checkpoint
        self.dtype = th.float16 if use_fp16 else th.float32
        self.num_heads = num_heads
        self.num_head_channels = num_head_channels
        self.num_heads_upsample = num_heads_upsample
        self.predict_codebook_ids = n_embed is not None

        # Timestep embedding MLP: model_channels -> 4 * model_channels.
        time_embed_dim = model_channels * 4
        self.time_embed = nn.Sequential(
            linear(model_channels, time_embed_dim),
            nn.SiLU(),
            linear(time_embed_dim, time_embed_dim),
        )

        if self.num_classes is not None:
            self.label_emb = nn.Embedding(num_classes, time_embed_dim)

        # ---- Encoder (downsampling) path ----
        self.input_blocks = nn.ModuleList(
            [
                TimestepEmbedSequential(
                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
                )
            ]
        )
        self._feature_size = model_channels
        # Channel count of every encoder block output; popped by the decoder
        # to size its skip-connection inputs.
        input_block_chans = [model_channels]
        ch = model_channels
        ds = 1  # current downsample rate, doubled after each downsample block
        for level, mult in enumerate(channel_mult):
            for _ in range(num_res_blocks):
                layers = [
                    ResBlock(
                        ch,
                        time_embed_dim,
                        dropout,
                        out_channels=mult * model_channels,
                        dims=dims,
                        use_checkpoint=use_checkpoint,
                        use_scale_shift_norm=use_scale_shift_norm,
                    )
                ]
                ch = mult * model_channels
                if ds in attention_resolutions:
                    if num_head_channels == -1:
                        dim_head = ch // num_heads
                    else:
                        num_heads = ch // num_head_channels
                        dim_head = num_head_channels
                    if legacy:
                        #num_heads = 1
                        dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
                    layers.append(
                        AttentionBlock(
                            ch,
                            use_checkpoint=use_checkpoint,
                            num_heads=num_heads,
                            num_head_channels=dim_head,
                            use_new_attention_order=use_new_attention_order,
                        ) if not use_spatial_transformer else SpatialTransformer(
                            ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
                        )
                    )
                self.input_blocks.append(TimestepEmbedSequential(*layers))
                self._feature_size += ch
                input_block_chans.append(ch)
            # Every level except the last ends with a downsampling block.
            if level != len(channel_mult) - 1:
                out_ch = ch
                self.input_blocks.append(
                    TimestepEmbedSequential(
                        ResBlock(
                            ch,
                            time_embed_dim,
                            dropout,
                            out_channels=out_ch,
                            dims=dims,
                            use_checkpoint=use_checkpoint,
                            use_scale_shift_norm=use_scale_shift_norm,
                            down=True,
                        )
                        if resblock_updown
                        else Downsample(
                            ch, conv_resample, dims=dims, out_channels=out_ch
                        )
                    )
                )
                ch = out_ch
                input_block_chans.append(ch)
                ds *= 2
                self._feature_size += ch

        # ---- Middle block: ResBlock -> attention -> ResBlock ----
        if num_head_channels == -1:
            dim_head = ch // num_heads
        else:
            num_heads = ch // num_head_channels
            dim_head = num_head_channels
        if legacy:
            #num_heads = 1
            dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
        self.middle_block = TimestepEmbedSequential(
            ResBlock(
                ch,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
            ),
            AttentionBlock(
                ch,
                use_checkpoint=use_checkpoint,
                num_heads=num_heads,
                num_head_channels=dim_head,
                use_new_attention_order=use_new_attention_order,
            ) if not use_spatial_transformer else SpatialTransformer(
                            ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
                        ),
            ResBlock(
                ch,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
            ),
        )
        self._feature_size += ch

        # ---- Decoder (upsampling) path ----
        self.output_blocks = nn.ModuleList([])
        for level, mult in list(enumerate(channel_mult))[::-1]:
            for i in range(num_res_blocks + 1):
                # Channels of the encoder activation concatenated in forward().
                ich = input_block_chans.pop()
                layers = [
                    ResBlock(
                        ch + ich,
                        time_embed_dim,
                        dropout,
                        out_channels=model_channels * mult,
                        dims=dims,
                        use_checkpoint=use_checkpoint,
                        use_scale_shift_norm=use_scale_shift_norm,
                    )
                ]
                ch = model_channels * mult
                if ds in attention_resolutions:
                    if num_head_channels == -1:
                        dim_head = ch // num_heads
                    else:
                        num_heads = ch // num_head_channels
                        dim_head = num_head_channels
                    if legacy:
                        #num_heads = 1
                        dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
                    layers.append(
                        AttentionBlock(
                            ch,
                            use_checkpoint=use_checkpoint,
                            num_heads=num_heads_upsample,
                            num_head_channels=dim_head,
                            use_new_attention_order=use_new_attention_order,
                        ) if not use_spatial_transformer else SpatialTransformer(
                            ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
                        )
                    )
                # The last block of every level except level 0 also upsamples.
                if level and i == num_res_blocks:
                    out_ch = ch
                    layers.append(
                        ResBlock(
                            ch,
                            time_embed_dim,
                            dropout,
                            out_channels=out_ch,
                            dims=dims,
                            use_checkpoint=use_checkpoint,
                            use_scale_shift_norm=use_scale_shift_norm,
                            up=True,
                        )
                        if resblock_updown
                        else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
                    )
                    ds //= 2
                self.output_blocks.append(TimestepEmbedSequential(*layers))
                self._feature_size += ch

        # NOTE(review): normalization uses `ch` while the conv expects
        # `model_channels` inputs; at this point ch == model_channels *
        # channel_mult[0], so the two agree only when channel_mult[0] == 1 -
        # confirm this is intended.
        self.out = nn.Sequential(
            normalization(ch),
            nn.SiLU(),
            zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
        )
        if self.predict_codebook_ids:
            self.id_predictor = nn.Sequential(
            normalization(ch),
            conv_nd(dims, model_channels, n_embed, 1),
            #nn.LogSoftmax(dim=1)  # change to cross_entropy and produce non-normalized logits
        )

    def convert_to_fp16(self):
        """
        Convert the torso of the model to float16.
        """
        self.input_blocks.apply(convert_module_to_f16)
        self.middle_block.apply(convert_module_to_f16)
        self.output_blocks.apply(convert_module_to_f16)

    def convert_to_fp32(self):
        """
        Convert the torso of the model to float32.
        """
        self.input_blocks.apply(convert_module_to_f32)
        self.middle_block.apply(convert_module_to_f32)
        self.output_blocks.apply(convert_module_to_f32)

    def forward(self, x, timesteps=None, context=None, y=None,**kwargs):
        """
        Apply the model to an input batch.
        :param x: an [N x C x ...] Tensor of inputs.
        :param timesteps: a 1-D batch of timesteps.
        :param context: conditioning plugged in via crossattn
        :param y: an [N] Tensor of labels, if class-conditional.
        :param kwargs: accepted and ignored.
        :return: an [N x C x ...] Tensor of outputs.
        """
        assert (y is not None) == (
            self.num_classes is not None
        ), "must specify y if and only if the model is class-conditional"
        hs = []  # encoder activations, consumed in reverse by the decoder
        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
        emb = self.time_embed(t_emb)

        if self.num_classes is not None:
            assert y.shape == (x.shape[0],)
            emb = emb + self.label_emb(y)

        h = x.type(self.dtype)
        for module in self.input_blocks:
            h = module(h, emb, context)
            hs.append(h)
        h = self.middle_block(h, emb, context)
        for module in self.output_blocks:
            # Skip connection: concatenate the matching encoder activation
            # along the channel axis.
            h = th.cat([h, hs.pop()], dim=1)
            h = module(h, emb, context)
        h = h.type(x.dtype)
        if self.predict_codebook_ids:
            return self.id_predictor(h)
        else:
            return self.out(h)
743
+
744
+
745
class EncoderUNetModel(nn.Module):
    """
    The half UNet model with attention and timestep embedding.
    For usage, see UNet.

    Only the downsampling path and the middle block are built; the output
    head is selected by `pool` ("adaptive", "attention", "spatial" or
    "spatial_v2"). Any other value raises NotImplementedError.
    """

    def __init__(
        self,
        image_size,
        in_channels,
        model_channels,
        out_channels,
        num_res_blocks,
        attention_resolutions,
        dropout=0,
        channel_mult=(1, 2, 4, 8),
        conv_resample=True,
        dims=2,
        use_checkpoint=False,
        use_fp16=False,
        num_heads=1,
        num_head_channels=-1,
        num_heads_upsample=-1,
        use_scale_shift_norm=False,
        resblock_updown=False,
        use_new_attention_order=False,
        pool="adaptive",
        *args,
        **kwargs
    ):
        super().__init__()

        if num_heads_upsample == -1:
            num_heads_upsample = num_heads

        self.in_channels = in_channels
        self.model_channels = model_channels
        self.out_channels = out_channels
        self.num_res_blocks = num_res_blocks
        self.attention_resolutions = attention_resolutions
        self.dropout = dropout
        self.channel_mult = channel_mult
        self.conv_resample = conv_resample
        self.use_checkpoint = use_checkpoint
        self.dtype = th.float16 if use_fp16 else th.float32
        self.num_heads = num_heads
        self.num_head_channels = num_head_channels
        self.num_heads_upsample = num_heads_upsample

        # Timestep embedding MLP: model_channels -> 4 * model_channels.
        time_embed_dim = model_channels * 4
        self.time_embed = nn.Sequential(
            linear(model_channels, time_embed_dim),
            nn.SiLU(),
            linear(time_embed_dim, time_embed_dim),
        )

        # ---- Downsampling path ----
        self.input_blocks = nn.ModuleList(
            [
                TimestepEmbedSequential(
                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
                )
            ]
        )
        # Running sum of block output channels; it sizes the first Linear of
        # the "spatial" heads below.
        self._feature_size = model_channels
        input_block_chans = [model_channels]
        ch = model_channels
        ds = 1  # current downsample rate, doubled after each downsample block
        for level, mult in enumerate(channel_mult):
            for _ in range(num_res_blocks):
                layers = [
                    ResBlock(
                        ch,
                        time_embed_dim,
                        dropout,
                        out_channels=mult * model_channels,
                        dims=dims,
                        use_checkpoint=use_checkpoint,
                        use_scale_shift_norm=use_scale_shift_norm,
                    )
                ]
                ch = mult * model_channels
                if ds in attention_resolutions:
                    layers.append(
                        AttentionBlock(
                            ch,
                            use_checkpoint=use_checkpoint,
                            num_heads=num_heads,
                            num_head_channels=num_head_channels,
                            use_new_attention_order=use_new_attention_order,
                        )
                    )
                self.input_blocks.append(TimestepEmbedSequential(*layers))
                self._feature_size += ch
                input_block_chans.append(ch)
            # Every level except the last ends with a downsampling block.
            if level != len(channel_mult) - 1:
                out_ch = ch
                self.input_blocks.append(
                    TimestepEmbedSequential(
                        ResBlock(
                            ch,
                            time_embed_dim,
                            dropout,
                            out_channels=out_ch,
                            dims=dims,
                            use_checkpoint=use_checkpoint,
                            use_scale_shift_norm=use_scale_shift_norm,
                            down=True,
                        )
                        if resblock_updown
                        else Downsample(
                            ch, conv_resample, dims=dims, out_channels=out_ch
                        )
                    )
                )
                ch = out_ch
                input_block_chans.append(ch)
                ds *= 2
                self._feature_size += ch

        # ---- Middle block: ResBlock -> attention -> ResBlock ----
        self.middle_block = TimestepEmbedSequential(
            ResBlock(
                ch,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
            ),
            AttentionBlock(
                ch,
                use_checkpoint=use_checkpoint,
                num_heads=num_heads,
                num_head_channels=num_head_channels,
                use_new_attention_order=use_new_attention_order,
            ),
            ResBlock(
                ch,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
            ),
        )
        self._feature_size += ch
        # ---- Output head, selected by `pool` ----
        self.pool = pool
        if pool == "adaptive":
            # NOTE(review): AdaptiveAvgPool2d is hard-coded even though the
            # following conv honours `dims` - presumably only dims == 2 is
            # expected here; confirm.
            self.out = nn.Sequential(
                normalization(ch),
                nn.SiLU(),
                nn.AdaptiveAvgPool2d((1, 1)),
                zero_module(conv_nd(dims, ch, out_channels, 1)),
                nn.Flatten(),
            )
        elif pool == "attention":
            assert num_head_channels != -1
            self.out = nn.Sequential(
                normalization(ch),
                nn.SiLU(),
                AttentionPool2d(
                    (image_size // ds), ch, num_head_channels, out_channels
                ),
            )
        elif pool == "spatial":
            self.out = nn.Sequential(
                nn.Linear(self._feature_size, 2048),
                nn.ReLU(),
                nn.Linear(2048, self.out_channels),
            )
        elif pool == "spatial_v2":
            self.out = nn.Sequential(
                nn.Linear(self._feature_size, 2048),
                normalization(2048),
                nn.SiLU(),
                nn.Linear(2048, self.out_channels),
            )
        else:
            raise NotImplementedError(f"Unexpected {pool} pooling")

    def convert_to_fp16(self):
        """
        Convert the torso of the model to float16.
        """
        self.input_blocks.apply(convert_module_to_f16)
        self.middle_block.apply(convert_module_to_f16)

    def convert_to_fp32(self):
        """
        Convert the torso of the model to float32.
        """
        self.input_blocks.apply(convert_module_to_f32)
        self.middle_block.apply(convert_module_to_f32)

    def forward(self, x, timesteps):
        """
        Apply the model to an input batch.
        :param x: an [N x C x ...] Tensor of inputs.
        :param timesteps: a 1-D batch of timesteps.
        :return: an [N x K] Tensor of outputs.
        """
        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))

        # For the "spatial*" heads: the mean over axes (2, 3) of every block
        # output, concatenated along the last axis before the head.
        results = []
        h = x.type(self.dtype)
        for module in self.input_blocks:
            h = module(h, emb)
            if self.pool.startswith("spatial"):
                results.append(h.type(x.dtype).mean(dim=(2, 3)))
        h = self.middle_block(h, emb)
        if self.pool.startswith("spatial"):
            results.append(h.type(x.dtype).mean(dim=(2, 3)))
            h = th.cat(results, axis=-1)
            return self.out(h)
        else:
            h = h.type(x.dtype)
            return self.out(h)
ldmlib/modules/diffusionmodules/util.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # adopted from
2
+ # https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
3
+ # and
4
+ # https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
5
+ # and
6
+ # https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
7
+ #
8
+ # thanks!
9
+
10
+
11
+ import os
12
+ import math
13
+ import torch
14
+ import torch.nn as nn
15
+ import numpy as np
16
+ from einops import repeat
17
+
18
+ from ldmlib.util import instantiate_from_config
19
+
20
+
21
def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
    """
    Build a beta (noise variance) schedule.
    :param schedule: one of "linear", "cosine", "sqrt_linear" or "sqrt".
    :param n_timestep: the number of betas to produce.
    :param linear_start: first value for the linspace-based schedules.
    :param linear_end: last value for the linspace-based schedules.
    :param cosine_s: offset used by the "cosine" schedule.
    :return: a float64 NumPy array with `n_timestep` betas.
    :raises ValueError: if `schedule` is not a known name.
    """
    if schedule == "linear":
        # Linear in sqrt(beta), squared afterwards.
        betas = (
                torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2
        )

    elif schedule == "cosine":
        timesteps = (
                torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
        )
        alphas = timesteps / (1 + cosine_s) * np.pi / 2
        alphas = torch.cos(alphas).pow(2)
        alphas = alphas / alphas[0]
        betas = 1 - alphas[1:] / alphas[:-1]
        # Clamp with torch so `betas` is guaranteed to still be a Tensor for
        # the `.numpy()` call below; np.clip on a Tensor only returns a
        # Tensor through NumPy's array-wrap fallback.
        betas = torch.clamp(betas, min=0, max=0.999)

    elif schedule == "sqrt_linear":
        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
    elif schedule == "sqrt":
        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ** 0.5
    else:
        raise ValueError(f"schedule '{schedule}' unknown.")
    return betas.numpy()
44
+
45
+
46
def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True):
    """
    Choose the subset of DDPM timesteps visited by the DDIM sampler.
    :param ddim_discr_method: 'uniform' (fixed stride) or 'quad'
                              (quadratically spaced).
    :param num_ddim_timesteps: the number of DDIM steps requested.
    :param num_ddpm_timesteps: the number of DDPM steps.
    :param verbose: if True, print the selected timesteps.
    :return: a NumPy integer array of the selected timesteps, each shifted by one.
    :raises NotImplementedError: for an unknown discretization method.
    """
    if ddim_discr_method not in ('uniform', 'quad'):
        raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"')

    if ddim_discr_method == 'uniform':
        stride = num_ddpm_timesteps // num_ddim_timesteps
        ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, stride)))
    else:
        roots = np.linspace(0, np.sqrt(num_ddpm_timesteps * .8), num_ddim_timesteps)
        ddim_timesteps = (roots ** 2).astype(int)

    # assert ddim_timesteps.shape[0] == num_ddim_timesteps
    # add one to get the final alpha values right (the ones from first scale to data during sampling)
    steps_out = ddim_timesteps + 1
    if verbose:
        print(f'Selected timesteps for ddim sampler: {steps_out}')
    return steps_out
61
+
62
+
63
def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
    """
    Compute the DDIM sigma schedule together with the alphas it is built from.
    :param alphacums: array indexed by timestep.
    :param ddim_timesteps: integer array of the timesteps used by the sampler.
    :param eta: multiplier applied to the sigmas.
    :param verbose: if True, print the selected alphas and sigmas.
    :return: a tuple (sigmas, alphas, alphas_prev).
    """
    # select alphas for computing the variance schedule
    alphas = alphacums[ddim_timesteps]
    earlier = alphacums[ddim_timesteps[:-1]].tolist()
    alphas_prev = np.asarray([alphacums[0]] + earlier)

    # according to the formula provided in https://arxiv.org/abs/2010.02502
    variance = (1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev)
    sigmas = eta * np.sqrt(variance)
    if verbose:
        print(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}')
        print(f'For the chosen value of eta, which is {eta}, '
              f'this results in the following sigma_t schedule for ddim sampler {sigmas}')
    return sigmas, alphas, alphas_prev
75
+
76
+
77
def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """
    Discretize a continuous alpha_bar(t) function into a beta schedule.
    alpha_bar defines the cumulative product of (1-beta) over t in [0, 1].
    :param num_diffusion_timesteps: the number of betas to produce.
    :param alpha_bar: a callable taking t from 0 to 1 and returning the
                      cumulative product of (1-beta) up to that point.
    :param max_beta: upper bound applied to every beta; use values lower
                     than 1 to prevent singularities.
    :return: a NumPy array of `num_diffusion_timesteps` betas.
    """
    steps = num_diffusion_timesteps
    return np.array(
        [
            min(1 - alpha_bar((i + 1) / steps) / alpha_bar(i / steps), max_beta)
            for i in range(steps)
        ]
    )
94
+
95
+
96
def extract_into_tensor(a, t, x_shape):
    """
    Gather the entries of `a` at indices `t` (along the last axis) and
    reshape them to (len(t), 1, ..., 1) with as many axes as `x_shape`.
    """
    batch, *_ = t.shape
    picked = a.gather(-1, t)
    singleton_axes = (1,) * (len(x_shape) - 1)
    return picked.reshape(batch, *singleton_axes)
100
+
101
+
102
def checkpoint(func, inputs, params, flag):
    """
    Call `func(*inputs)`, optionally through gradient checkpointing, which
    avoids caching intermediate activations at the expense of extra compute
    in the backward pass.
    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
                   explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing.
    """
    if not flag:
        return func(*inputs)
    flat_args = tuple(inputs) + tuple(params)
    return CheckpointFunction.apply(func, len(inputs), *flat_args)
117
+
118
+
119
class CheckpointFunction(torch.autograd.Function):
    """
    Autograd function that runs `run_function` without building a graph in
    forward() and re-runs it with gradients enabled in backward().
    """

    @staticmethod
    def forward(ctx, run_function, length, *args):
        # The first `length` args are the function inputs; the rest are the
        # parameters that gradients are also requested for.
        ctx.run_function = run_function
        ctx.input_tensors = list(args[:length])
        ctx.input_params = list(args[length:])

        with torch.no_grad():
            return ctx.run_function(*ctx.input_tensors)

    @staticmethod
    def backward(ctx, *output_grads):
        ctx.input_tensors = [t.detach().requires_grad_(True) for t in ctx.input_tensors]
        with torch.enable_grad():
            # Fixes a bug where the first op in run_function modifies the
            # Tensor storage in place, which is not allowed for detach()'d
            # Tensors.
            aliases = [t.view_as(t) for t in ctx.input_tensors]
            recomputed = ctx.run_function(*aliases)
        grads = torch.autograd.grad(
            recomputed,
            ctx.input_tensors + ctx.input_params,
            output_grads,
            allow_unused=True,
        )
        del ctx.input_tensors
        del ctx.input_params
        del recomputed
        # No gradients for `run_function` and `length`.
        return (None, None) + grads
149
+
150
+
151
def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
    """
    Create sinusoidal timestep embeddings.
    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :param repeat_only: if True, skip the sinusoids and just repeat each
                        timestep `dim` times.
    :return: an [N x dim] Tensor of positional embeddings.
    """
    if repeat_only:
        return repeat(timesteps, 'b -> b d', d=dim)

    half = dim // 2
    exponents = -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
    freqs = torch.exp(exponents).to(device=timesteps.device)
    angles = timesteps[:, None].float() * freqs[None]
    embedding = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)
    if dim % 2:
        # Odd dim: pad with one zero column to reach the requested width.
        zero_col = torch.zeros_like(embedding[:, :1])
        embedding = torch.cat([embedding, zero_col], dim=-1)
    return embedding
172
+
173
+
174
def zero_module(module):
    """
    Set every parameter of `module` to zero in place and return the module.
    """
    for param in module.parameters():
        detached = param.detach()
        detached.zero_()
    return module
181
+
182
+
183
def scale_module(module, scale):
    """
    Multiply every parameter of `module` by `scale` in place and return the
    module.
    """
    for param in module.parameters():
        detached = param.detach()
        detached.mul_(scale)
    return module
190
+
191
+
192
def mean_flat(tensor):
    """
    Average over every dimension except the first (batch) one.
    """
    non_batch_dims = list(range(1, len(tensor.shape)))
    return tensor.mean(dim=non_batch_dims)
197
+
198
+
199
def normalization(channels):
    """
    Build the standard normalization layer: a GroupNorm32 with 32 groups.
    :param channels: number of input channels.
    :return: an nn.Module for normalization.
    """
    num_groups = 32
    return GroupNorm32(num_groups, channels)
206
+
207
+
208
+ # PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
209
class SiLU(nn.Module):
    """SiLU activation: x multiplied by sigmoid(x)."""

    def forward(self, x):
        gate = torch.sigmoid(x)
        return x * gate
212
+
213
+
214
class GroupNorm32(nn.GroupNorm):
    """GroupNorm that computes in float32 and casts back to the input dtype."""

    def forward(self, x):
        normed = super().forward(x.float())
        return normed.type(x.dtype)
217
+
218
def conv_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D convolution module.
    :param dims: 1, 2 or 3; remaining arguments go to the nn.ConvNd constructor.
    :raises ValueError: for any other `dims`.
    """
    for rank, conv_cls in ((1, nn.Conv1d), (2, nn.Conv2d), (3, nn.Conv3d)):
        if dims == rank:
            return conv_cls(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")
229
+
230
+
231
def linear(*args, **kwargs):
    """
    Create a linear module; all arguments are forwarded to nn.Linear.
    """
    layer = nn.Linear(*args, **kwargs)
    return layer
236
+
237
+
238
def avg_pool_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D average pooling module.
    :param dims: 1, 2 or 3; remaining arguments go to the nn.AvgPoolNd constructor.
    :raises ValueError: for any other `dims`.
    """
    for rank, pool_cls in ((1, nn.AvgPool1d), (2, nn.AvgPool2d), (3, nn.AvgPool3d)):
        if dims == rank:
            return pool_cls(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")
249
+
250
+
251
class HybridConditioner(nn.Module):
    """
    Holds two conditioners built from configs, one for concat conditioning
    and one for cross-attention conditioning, and applies them side by side.
    """

    def __init__(self, c_concat_config, c_crossattn_config):
        super().__init__()
        self.concat_conditioner = instantiate_from_config(c_concat_config)
        self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)

    def forward(self, c_concat, c_crossattn):
        """Return both conditioner outputs, each wrapped in a one-element list."""
        concat_out = self.concat_conditioner(c_concat)
        crossattn_out = self.crossattn_conditioner(c_crossattn)
        conditioning = {'c_concat': [concat_out], 'c_crossattn': [crossattn_out]}
        return conditioning
262
+
263
+
264
def noise_like(shape, device, repeat=False):
    """
    Draw standard-normal noise of the given shape on `device`.
    :param repeat: if True, draw a single sample of shape (1, *shape[1:])
                   and repeat it shape[0] times along the first axis.
    """
    if not repeat:
        return torch.randn(shape, device=device)
    single = torch.randn((1, *shape[1:]), device=device)
    tile_counts = (1,) * (len(shape) - 1)
    return single.repeat(shape[0], *tile_counts)
ldmlib/modules/distributions/__init__.py ADDED
File without changes
ldmlib/modules/distributions/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (168 Bytes). View file
 
ldmlib/modules/distributions/__pycache__/distributions.cpython-38.pyc ADDED
Binary file (3.81 kB). View file
 
ldmlib/modules/distributions/distributions.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+
4
+
5
class AbstractDistribution:
    """Interface for distributions; subclasses override ``sample`` and ``mode``."""

    def sample(self):
        """Draw a sample. Not implemented in the base class."""
        raise NotImplementedError()

    def mode(self):
        """Return the mode. Not implemented in the base class."""
        raise NotImplementedError()
11
+
12
+
13
class DiracDistribution(AbstractDistribution):
    """Degenerate distribution: both ``sample`` and ``mode`` return the stored value."""

    def __init__(self, value):
        """:param value: the single value this distribution always yields."""
        self.value = value

    def sample(self):
        """Return the stored value."""
        return self.value

    def mode(self):
        """Return the stored value."""
        return self.value
22
+
23
+
24
class DiagonalGaussianDistribution(object):
    """
    Gaussian with diagonal covariance.

    ``parameters`` is split into two equal chunks along dim 1: the first is the
    mean, the second the log-variance (clamped to ``[-30, 20]``). When
    ``deterministic`` is true, the standard deviation and variance are zero, so
    ``sample()`` returns the mean.
    """

    def __init__(self, parameters, deterministic=False):
        """
        :param parameters: tensor holding mean and log-variance stacked along dim 1.
        :param deterministic: if true, std and var are set to zero.
        """
        self.parameters = parameters
        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
        self.deterministic = deterministic
        self.std = torch.exp(0.5 * self.logvar)
        self.var = torch.exp(self.logvar)
        if self.deterministic:
            # ``mean`` is a chunk of ``parameters``, so zeros_like already
            # lands on the right device.
            self.var = self.std = torch.zeros_like(self.mean)

    def _zero(self):
        """One-element zero tensor on the same device as the parameters."""
        return torch.zeros(1, device=self.parameters.device)

    def sample(self):
        """Return ``mean + std * eps`` with ``eps`` drawn from a standard normal."""
        eps = torch.randn(self.mean.shape).to(device=self.parameters.device)
        return self.mean + self.std * eps

    def kl(self, other=None):
        """
        KL divergence to ``other``, or to a standard normal when ``other`` is None.

        The result is summed over dims 1, 2 and 3. In deterministic mode a
        one-element zero tensor is returned (on the parameters' device, so it
        can be combined with other losses without a device mismatch).
        """
        if self.deterministic:
            return self._zero()
        if other is None:
            return 0.5 * torch.sum(torch.pow(self.mean, 2)
                                   + self.var - 1.0 - self.logvar,
                                   dim=[1, 2, 3])
        return 0.5 * torch.sum(
            torch.pow(self.mean - other.mean, 2) / other.var
            + self.var / other.var - 1.0 - self.logvar + other.logvar,
            dim=[1, 2, 3])

    def nll(self, sample, dims=(1, 2, 3)):
        """
        Negative log-likelihood of ``sample`` under this distribution.

        :param sample: tensor broadcastable against the mean.
        :param dims: dimensions to sum over (an immutable tuple by default;
            a list is still accepted).
        :return: summed NLL, or a one-element zero tensor in deterministic mode.
        """
        if self.deterministic:
            return self._zero()
        logtwopi = np.log(2.0 * np.pi)
        return 0.5 * torch.sum(
            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
            dim=dims)

    def mode(self):
        """Return the mean."""
        return self.mean
63
+
64
+
65
def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12
    KL divergence between two Gaussians given as (mean, log-variance) pairs.

    Arguments may be tensors or Python scalars (at least one must be a
    tensor); shapes broadcast, so a batch can be compared against a scalar.
    """
    # The first tensor argument acts as the dtype/device reference.
    tensor = next(
        (arg for arg in (mean1, logvar1, mean2, logvar2) if isinstance(arg, torch.Tensor)),
        None,
    )
    assert tensor is not None, "at least one argument must be a Tensor"

    # torch.exp() needs tensor inputs, so scalar log-variances are promoted
    # explicitly; broadcasting alone would not do this.
    def _as_tensor(value):
        if isinstance(value, torch.Tensor):
            return value
        return torch.tensor(value).to(tensor)

    logvar1 = _as_tensor(logvar1)
    logvar2 = _as_tensor(logvar2)

    return 0.5 * (
        -1.0
        + logvar2
        - logvar1
        + torch.exp(logvar1 - logvar2)
        + ((mean1 - mean2) ** 2) * torch.exp(-logvar2)
    )
ldmlib/modules/ema.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+
4
+
5
class LitEma(nn.Module):
    """
    Exponential moving average of a model's trainable parameters.

    A shadow copy of every parameter with ``requires_grad`` is kept as a
    buffer on this module; calling the module with the model updates the
    shadows.
    """

    def __init__(self, model, decay=0.9999, use_num_upates=True):
        """
        Args:
            model: module whose trainable parameters are tracked.
            decay: upper bound on the EMA decay, in [0, 1].
            use_num_upates: if true, an update counter is kept and used in
                ``forward`` to lower the effective decay for early updates.
        Raises:
            ValueError: if ``decay`` is outside [0, 1].
        """
        super().__init__()
        if decay > 1.0 or decay < 0.0:
            raise ValueError('Decay must be between 0 and 1')

        self.m_name2s_name = {}
        self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32))
        # A counter of -1 disables the warm-up branch in forward().
        initial_count = 0 if use_num_upates else -1
        self.register_buffer('num_updates', torch.tensor(initial_count, dtype=torch.int))

        for name, p in model.named_parameters():
            if not p.requires_grad:
                continue
            # remove as '.'-character is not allowed in buffers
            s_name = name.replace('.', '')
            self.m_name2s_name[name] = s_name
            self.register_buffer(s_name, p.clone().detach().data)

        self.collected_params = []

    def forward(self, model):
        """Move each shadow buffer towards the corresponding parameter of ``model``."""
        decay = self.decay

        if self.num_updates >= 0:
            self.num_updates += 1
            decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates))

        one_minus_decay = 1.0 - decay

        with torch.no_grad():
            live = dict(model.named_parameters())
            shadows = dict(self.named_buffers())

            for name, param in live.items():
                if not param.requires_grad:
                    assert name not in self.m_name2s_name
                    continue
                s_name = self.m_name2s_name[name]
                shadows[s_name] = shadows[s_name].type_as(param)
                shadows[s_name].sub_(one_minus_decay * (shadows[s_name] - param))

    def copy_to(self, model):
        """Overwrite the trainable parameters of ``model`` with the shadow values."""
        live = dict(model.named_parameters())
        shadows = dict(self.named_buffers())
        for name, param in live.items():
            if not param.requires_grad:
                assert name not in self.m_name2s_name
                continue
            param.data.copy_(shadows[self.m_name2s_name[name]].data)

    def store(self, parameters):
        """
        Keep a clone of the given parameters so they can be restored later.
        Args:
          parameters: Iterable of `torch.nn.Parameter`; the parameters to be
            temporarily stored.
        """
        self.collected_params = [p.clone() for p in parameters]

    def restore(self, parameters):
        """
        Copy the values saved by `store` back into the given parameters.
        Typical use: `store` the parameters, `copy_to` the EMA weights for
        validation or saving, then `restore` the original values so
        optimization continues unaffected.
        Args:
          parameters: Iterable of `torch.nn.Parameter`; the parameters to be
            updated with the stored parameters.
        """
        for saved, target in zip(self.collected_params, parameters):
            target.data.copy_(saved.data)
ldmlib/modules/encoders/__init__.py ADDED
File without changes
ldmlib/modules/encoders/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (163 Bytes). View file
 
ldmlib/modules/encoders/__pycache__/modules.cpython-38.pyc ADDED
Binary file (9.51 kB). View file
 
ldmlib/modules/encoders/modules.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from functools import partial
4
+ import clip
5
+ from einops import rearrange, repeat
6
+ from transformers import CLIPTokenizer, CLIPTextModel
7
+ import kornia
8
+
9
+ from ldmlib.modules.x_transformer import Encoder, TransformerWrapper # TODO: can we directly rely on lucidrains code and simply add this as a reuirement? --> test
10
+
11
+
12
class AbstractEncoder(nn.Module):
    """Base class for encoders; subclasses override ``encode``."""

    def __init__(self):
        super().__init__()

    def encode(self, *args, **kwargs):
        """Encode the inputs. Not implemented in the base class."""
        raise NotImplementedError
18
+
19
+
20
+
21
class ClassEmbedder(nn.Module):
    """Looks up a learned embedding for the class indices stored in a batch dict."""

    def __init__(self, embed_dim, n_classes=1000, key='class'):
        """
        :param embed_dim: size of each embedding vector.
        :param n_classes: number of rows in the embedding table.
        :param key: default batch key under which the class indices are found.
        """
        super().__init__()
        self.key = key
        self.embedding = nn.Embedding(n_classes, embed_dim)

    def forward(self, batch, key=None):
        """
        Embed ``batch[key]`` (``self.key`` when ``key`` is None).

        A singleton axis is inserted after the first dimension before the
        lookup; the original comment notes this is for use in cross-attention.
        """
        lookup_key = self.key if key is None else key
        class_ids = batch[lookup_key][:, None]
        return self.embedding(class_ids)
34
+
35
+
36
class TransformerEmbedder(AbstractEncoder):
    """Some transformer encoder layers applied to already-tokenized input."""

    def __init__(self, n_embed, n_layer, vocab_size, max_seq_len=77, device="cuda"):
        """
        :param n_embed: passed to ``Encoder`` as ``dim``.
        :param n_layer: passed to ``Encoder`` as ``depth``.
        :param vocab_size: passed to ``TransformerWrapper`` as ``num_tokens``.
        :param max_seq_len: passed to ``TransformerWrapper`` as ``max_seq_len``.
        :param device: device the tokens are moved to in ``forward``.
        """
        super().__init__()
        self.device = device
        encoder_stack = Encoder(dim=n_embed, depth=n_layer)
        self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
                                              attn_layers=encoder_stack)

    def forward(self, tokens):
        """Move ``tokens`` to ``self.device`` and return the transformer embeddings."""
        on_device = tokens.to(self.device)  # meh
        return self.transformer(on_device, return_embeddings=True)

    def encode(self, x):
        """Alias for calling the module."""
        return self(x)
51
+
52
+
53
class BERTTokenizer(AbstractEncoder):
    """ Uses a pretrained BERT tokenizer by huggingface. Vocab size: 30522 (?)"""

    def __init__(self, device="cuda", vq_interface=True, max_length=77):
        """
        :param device: device the token ids are moved to.
        :param vq_interface: if true, ``encode`` wraps the tokens in a
            ``(None, None, [None, None, tokens])`` tuple instead of returning
            them directly.
        :param max_length: padding / truncation length passed to the tokenizer.
        """
        super().__init__()
        from transformers import BertTokenizerFast  # TODO: add to requirements
        self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
        self.device = device
        self.vq_interface = vq_interface
        self.max_length = max_length

    def forward(self, text):
        """Tokenize ``text`` (padded/truncated to ``max_length``) and return the input ids."""
        encoded = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
                                 return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
        return encoded["input_ids"].to(self.device)

    @torch.no_grad()
    def encode(self, text):
        """Tokenize ``text``; the return shape depends on ``vq_interface`` (see ``__init__``)."""
        tokens = self(text)
        if self.vq_interface:
            return None, None, [None, None, tokens]
        return tokens

    def decode(self, text):
        """Identity: returns ``text`` unchanged."""
        return text
78
+
79
+
80
class BERTEmbedder(AbstractEncoder):
    """Uses the BERT tokenizer model and adds some transformer encoder layers"""

    def __init__(self, n_embed, n_layer, vocab_size=30522, max_seq_len=77,
                 device="cuda", use_tokenizer=True, embedding_dropout=0.0):
        """
        :param n_embed: passed to ``Encoder`` as ``dim``.
        :param n_layer: passed to ``Encoder`` as ``depth``.
        :param vocab_size: passed to ``TransformerWrapper`` as ``num_tokens``.
        :param max_seq_len: maximum sequence length for tokenizer and transformer.
        :param device: device the tokenizer places token ids on.
        :param use_tokenizer: if false, ``forward`` expects token ids instead of text.
        :param embedding_dropout: passed to ``TransformerWrapper`` as ``emb_dropout``.
        """
        super().__init__()
        self.use_tknz_fn = use_tokenizer
        if self.use_tknz_fn:
            # Forward ``device`` so the tokenizer does not fall back to its
            # own default ("cuda") when a different device was requested.
            self.tknz_fn = BERTTokenizer(device=device, vq_interface=False, max_length=max_seq_len)
        self.device = device
        self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
                                              attn_layers=Encoder(dim=n_embed, depth=n_layer),
                                              emb_dropout=embedding_dropout)

    def forward(self, text):
        """Tokenize ``text`` if a tokenizer is configured, then return transformer embeddings."""
        if self.use_tknz_fn:
            tokens = self.tknz_fn(text)  # already placed on the tokenizer's device
        else:
            tokens = text
        z = self.transformer(tokens, return_embeddings=True)
        return z

    def encode(self, text):
        """Alias for calling the module."""
        # output of length 77
        return self(text)
104
+
105
+
106
class SpatialRescaler(nn.Module):
    """Repeatedly rescales the input by a fixed factor, optionally remapping channels with a 1x1 conv."""

    def __init__(self,
                 n_stages=1,
                 method='bilinear',
                 multiplier=0.5,
                 in_channels=3,
                 out_channels=None,
                 bias=False):
        """
        :param n_stages: how many times the interpolation is applied (>= 0).
        :param method: interpolation mode passed to ``torch.nn.functional.interpolate``.
        :param multiplier: scale factor applied at every stage.
        :param in_channels: input channels of the optional 1x1 convolution.
        :param out_channels: if not None, a 1x1 ``nn.Conv2d`` maps to this many channels.
        :param bias: whether that convolution has a bias.
        """
        super().__init__()
        assert n_stages >= 0
        assert method in ['nearest', 'linear', 'bilinear', 'trilinear', 'bicubic', 'area']
        self.n_stages = n_stages
        self.multiplier = multiplier
        self.interpolator = partial(torch.nn.functional.interpolate, mode=method)
        self.remap_output = out_channels is not None
        if self.remap_output:
            print(f'Spatial Rescaler mapping from {in_channels} to {out_channels} channels after resizing.')
            self.channel_mapper = nn.Conv2d(in_channels, out_channels, 1, bias=bias)

    def forward(self, x):
        """Apply ``n_stages`` rescaling steps, then the channel mapper if configured."""
        for _ in range(self.n_stages):
            x = self.interpolator(x, scale_factor=self.multiplier)
        return self.channel_mapper(x) if self.remap_output else x

    def encode(self, x):
        """Alias for calling the module."""
        return self(x)
136
+
137
class FrozenCLIPEmbedder(AbstractEncoder):
    """Uses the CLIP transformer encoder for text (from Hugging Face)"""

    def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77):
        """
        :param version: identifier passed to ``from_pretrained`` for both the
            tokenizer and the text model.
        :param device: device the token ids are moved to in ``forward``.
        :param max_length: padding / truncation length passed to the tokenizer.
        """
        super().__init__()
        self.tokenizer = CLIPTokenizer.from_pretrained(version)
        self.transformer = CLIPTextModel.from_pretrained(version)
        self.device = device
        self.max_length = max_length
        self.freeze()

    def freeze(self):
        """Put the transformer in eval mode and disable gradients for all parameters."""
        self.transformer = self.transformer.eval()
        for p in self.parameters():
            p.requires_grad = False

    def forward(self, text):
        """Tokenize ``text`` and return the text model's ``last_hidden_state``."""
        encoded = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
                                 return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
        token_ids = encoded["input_ids"].to(self.device)
        return self.transformer(input_ids=token_ids).last_hidden_state

    def encode(self, text):
        """Alias for calling the module."""
        return self(text)
163
+
164
+
165
class FrozenCLIPTextEmbedder(nn.Module):
    """
    Uses the CLIP transformer encoder for text.

    NOTE(review): ``freeze`` is defined but not called from ``__init__``;
    callers must invoke it themselves if frozen weights are required.
    """

    def __init__(self, version='ViT-L/14', device="cuda", max_length=77, n_repeat=1, normalize=True):
        """
        :param version: model name passed to ``clip.load`` (loaded on CPU).
        :param device: device the token ids are moved to in ``forward``.
        :param max_length: stored on the instance; not used by the code in this class.
        :param n_repeat: number of copies along the inserted axis in ``encode``.
        :param normalize: if true, embeddings are divided by their norm along dim 1.
        """
        super().__init__()
        self.model, _ = clip.load(version, jit=False, device="cpu")
        self.device = device
        self.max_length = max_length
        self.n_repeat = n_repeat
        self.normalize = normalize

    def freeze(self):
        """Put the model in eval mode and disable gradients for all parameters."""
        self.model = self.model.eval()
        for p in self.parameters():
            p.requires_grad = False

    def forward(self, text):
        """Tokenize ``text`` with ``clip.tokenize`` and return the (optionally normalized) text embedding."""
        token_ids = clip.tokenize(text).to(self.device)
        z = self.model.encode_text(token_ids)
        if not self.normalize:
            return z
        return z / torch.linalg.norm(z, dim=1, keepdim=True)

    def encode(self, text):
        """Embed ``text``, insert a middle axis for 2-D outputs and repeat it ``n_repeat`` times."""
        z = self(text)
        if z.ndim == 2:
            z = z[:, None, :]
        return repeat(z, 'b 1 d -> b k d', k=self.n_repeat)
195
+
196
+
197
class FrozenClipImageEmbedder(nn.Module):
    """
    Uses the CLIP image encoder.
    """

    def __init__(
            self,
            model,
            jit=False,
            device='cuda' if torch.cuda.is_available() else 'cpu',
            antialias=False,
    ):
        """
        :param model: model name passed to ``clip.load``.
        :param jit: forwarded to ``clip.load``.
        :param device: device passed to ``clip.load``; the default is chosen
            once, when the class is defined.
        :param antialias: forwarded to the resize call in ``preprocess``.
        """
        super().__init__()
        self.model, _ = clip.load(name=model, device=device, jit=jit)

        self.antialias = antialias

        # Per-channel statistics used to renormalize inputs for CLIP;
        # registered as non-persistent buffers.
        self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
        self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)

    def preprocess(self, x):
        """Resize to 224x224, map from [-1, 1] to [0, 1], then normalize with ``mean`` / ``std``."""
        resized = kornia.geometry.resize(x, (224, 224),
                                         interpolation='bicubic', align_corners=True,
                                         antialias=self.antialias)
        # normalize to [0,1]
        unit_range = (resized + 1.) / 2.
        # renormalize according to clip
        return kornia.enhance.normalize(unit_range, self.mean, self.std)

    def forward(self, x):
        """Encode ``x`` (assumed to be in range [-1, 1]) with the CLIP image encoder."""
        prepared = self.preprocess(x)
        return self.model.encode_image(prepared)
229
+
230
+
231
if __name__ == "__main__":
    # Script entry point: build the default CLIP text embedder and report its
    # parameter count.
    from ldmlib.util import count_params
    count_params(FrozenCLIPEmbedder(), verbose=True)
ldmlib/modules/image_degradation/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from ldmlib.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr
2
+ from ldmlib.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light
ldmlib/modules/image_degradation/bsrgan.py ADDED
@@ -0,0 +1,728 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ # --------------------------------------------
4
+ # Super-Resolution
5
+ # --------------------------------------------
6
+ #
7
+ # Kai Zhang (cskaizhang@gmail.com)
8
+ # https://github.com/cszn
9
+ # From 2019/03--2021/08
10
+ # --------------------------------------------
11
+ """
12
+
13
+ import numpy as np
14
+ import cv2
15
+ import torch
16
+
17
+ from functools import partial
18
+ import random
19
+ from scipy import ndimage
20
+ import scipy
21
+ import scipy.stats as ss
22
+ from scipy.interpolate import interp2d
23
+ from scipy.linalg import orth
24
+ import albumentations
25
+
26
+ import ldmlib.modules.image_degradation.utils_image as util
27
+
28
+
29
def modcrop_np(img, sf):
    '''
    Crop the first two axes so that each length is a multiple of ``sf``.
    Args:
        img: numpy image, WxH or WxHxC
        sf: scale factor
    Return:
        cropped copy of the image (the input is not modified)
    '''
    dim0, dim1 = img.shape[:2]
    keep0 = dim0 - dim0 % sf
    keep1 = dim1 - dim1 % sf
    return np.copy(img)[:keep0, :keep1, ...]
40
+
41
+
42
+ """
43
+ # --------------------------------------------
44
+ # anisotropic Gaussian kernels
45
+ # --------------------------------------------
46
+ """
47
+
48
+
49
def analytic_kernel(k):
    """Calculate the X4 kernel from the X2 kernel (for proof see appendix in paper)"""
    side = k.shape[0]
    # The accumulated kernel has size 3 * side - 2 along each axis.
    expanded = np.zeros((3 * side - 2, 3 * side - 2))
    # Add a copy of k, weighted by each of its entries, at stride-2 offsets.
    for row, col in np.ndindex(side, side):
        expanded[2 * row:2 * row + side, 2 * col:2 * col + side] += k[row, col] * k
    # Trim the border, where values are very small, to keep the kernel compact.
    margin = side // 2
    trimmed = expanded[margin:-margin, margin:-margin]
    # Normalize to 1
    return trimmed / trimmed.sum()
63
+
64
+
65
def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6):
    """ generate an anisotropic Gaussian kernel
    Args:
        ksize : e.g., 15, kernel size
        theta : [0,  pi], rotation angle range
        l1    : [0.1,50], scaling of eigenvalues
        l2    : [0.1,l1], scaling of eigenvalues
        If l1 = l2, will get an isotropic Gaussian kernel.
    Returns:
        k     : kernel
    """
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    rotation = np.array([[cos_t, -sin_t], [sin_t, cos_t]])
    # Rotate the unit x-vector to obtain the first principal direction.
    axis = np.dot(rotation, np.array([1., 0.]))
    basis = np.array([[axis[0], axis[1]], [axis[1], -axis[0]]])
    eigenvalues = np.array([[l1, 0], [0, l2]])
    # Covariance assembled as basis * eigenvalues * basis^-1.
    covariance = np.dot(np.dot(basis, eigenvalues), np.linalg.inv(basis))
    return gm_blur_kernel(mean=[0, 0], cov=covariance, size=ksize)
84
+
85
+
86
def gm_blur_kernel(mean, cov, size=15):
    """
    Sample a 2-D Gaussian density on a ``size`` x ``size`` grid and normalize it.

    Args:
        mean: length-2 mean passed to ``scipy.stats.multivariate_normal.pdf``.
        cov: 2x2 covariance passed to the same call.
        size: side length of the kernel.
    Returns:
        ``size`` x ``size`` array that sums to 1. Entry ``[y, x]`` is the
        density at point ``(x - size / 2 + 0.5, y - size / 2 + 0.5)``.
    """
    center = size / 2.0 + 0.5
    offsets = np.arange(size) - center + 1
    # Default 'xy' indexing: cx varies along columns (x), cy along rows (y).
    cx, cy = np.meshgrid(offsets, offsets)
    points = np.stack([cx, cy], axis=-1)
    # One vectorized density evaluation instead of size**2 separate calls.
    k = np.reshape(ss.multivariate_normal.pdf(points, mean=mean, cov=cov), (size, size))

    k = k / np.sum(k)
    return k
97
+
98
+
99
def shift_pixel(x, sf, upper_left=True):
    """shift pixel for super-resolution with different scale factors
    Args:
        x: 2-D array, or 3-D array with channels last; the first two axes are
            treated as (rows, columns)
        sf: scale factor; the sampling grid moves by ``(sf - 1) / 2`` pixels
        upper_left: shift direction
    Returns:
        The resampled array. A 2-D input yields a new array; a 3-D input is
        overwritten channel by channel and returned.
    """
    # Imported locally: a degree-1 RectBivariateSpline stands in for
    # interp2d(kind='linear'), which is deprecated/removed in newer SciPy.
    from scipy.interpolate import RectBivariateSpline

    h, w = x.shape[:2]
    shift = (sf - 1) * 0.5
    xv, yv = np.arange(0, w, 1.0), np.arange(0, h, 1.0)
    if upper_left:
        x1 = xv + shift
        y1 = yv + shift
    else:
        x1 = xv - shift
        y1 = yv - shift

    # Sample positions outside the image are clamped to the border.
    x1 = np.clip(x1, 0, w - 1)
    y1 = np.clip(y1, 0, h - 1)

    def _resample(plane):
        # The spline expects data indexed [x, y], hence the transposes.
        spline = RectBivariateSpline(xv, yv, plane.T, kx=1, ky=1)
        return spline(x1, y1).T

    if x.ndim == 2:
        x = _resample(x)
    elif x.ndim == 3:
        for i in range(x.shape[-1]):
            x[:, :, i] = _resample(x[:, :, i])

    return x
126
+
127
+
128
def blur(x, k):
    '''
    Blur every image in a batch with its own kernel.

    x: image, NxcxHxW
    k: kernel, Nx1xhxw
    Returns a tensor of shape Nxcx... ; for odd kernel sizes the spatial
    size equals that of ``x``.
    '''
    n, c = x.shape[:2]
    pad_h, pad_w = (k.shape[-2] - 1) // 2, (k.shape[-1] - 1) // 2
    # F.pad orders the amounts (left, right, top, bottom): the width padding
    # comes from the kernel width and the height padding from the kernel
    # height. For square kernels this matches the previous behaviour.
    x = torch.nn.functional.pad(x, pad=(pad_w, pad_w, pad_h, pad_h), mode='replicate')
    # One kernel per (image, channel) pair, applied as a grouped convolution
    # over a single "batch" with n * c channels.
    k = k.repeat(1, c, 1, 1)
    k = k.view(-1, 1, k.shape[2], k.shape[3])
    x = x.view(1, -1, x.shape[2], x.shape[3])
    x = torch.nn.functional.conv2d(x, k, bias=None, stride=1, padding=0, groups=n * c)
    x = x.view(n, c, x.shape[2], x.shape[3])

    return x
143
+
144
+
145
def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var=0.6, max_var=10., noise_level=0):
    """
    Generate a random anisotropic Gaussian kernel, normalized to sum to 1.

    # modified version of https://github.com/assafshocher/BlindSR_dataset_generator
    # Kai Zhang
    # min_var = 0.175 * sf  # variance of the gaussian kernel will be sampled between min_var and max_var
    # max_var = 2.5 * sf
    """
    # Draw the two covariance eigenvalues, the rotation angle and the
    # multiplicative noise (in this order, so results stay reproducible
    # under a fixed NumPy seed).
    eig_a = min_var + np.random.rand() * (max_var - min_var)
    eig_b = min_var + np.random.rand() * (max_var - min_var)
    angle = np.random.rand() * np.pi
    jitter = -noise_level + np.random.rand(*k_size) * noise_level * 2

    # Covariance = R * diag(eigenvalues) * R^T
    cos_a, sin_a = np.cos(angle), np.sin(angle)
    rotation = np.array([[cos_a, -sin_a],
                         [sin_a, cos_a]])
    covariance = rotation @ np.diag([eig_a, eig_b]) @ rotation.T
    inv_cov = np.linalg.inv(covariance)[None, None, :, :]

    # Expected position of the kernel peak (shifted for aligned images).
    center = k_size // 2 - 0.5 * (scale_factor - 1)  # - 0.5 * (scale_factor - k_size % 2)
    center = center[None, None, :, None]

    # Pixel coordinate grid, one column vector per pixel.
    grid_x, grid_y = np.meshgrid(range(k_size[0]), range(k_size[1]))
    coords = np.stack([grid_x, grid_y], 2)[:, :, :, None]

    # Evaluate the (unnormalized) Gaussian at every pixel.
    delta = coords - center
    delta_t = delta.transpose(0, 1, 3, 2)
    unnormalized = np.exp(-0.5 * np.squeeze(delta_t @ inv_cov @ delta)) * (1 + jitter)

    # Normalize the kernel and return
    return unnormalized / np.sum(unnormalized)
185
+
186
+
187
def fspecial_gaussian(hsize, sigma):
    """
    Build a square, normalized Gaussian filter.

    Args:
        hsize: side length of the kernel.
        sigma: standard deviation of the Gaussian.
    Returns:
        ``hsize`` x ``hsize`` array. Entries below machine epsilon times the
        maximum are zeroed; the kernel is normalized to sum to 1 unless that
        sum is zero.
    """
    hsize = [hsize, hsize]
    siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0]
    std = sigma
    [x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1))
    arg = -(x * x + y * y) / (2 * std * std)
    h = np.exp(arg)
    # np.finfo replaces the former scipy.finfo alias of the same NumPy function.
    h[h < np.finfo(float).eps * h.max()] = 0
    sumh = h.sum()
    if sumh != 0:
        h = h / sumh
    return h
199
+
200
+
201
def fspecial_laplacian(alpha):
    """
    Build a 3x3 Laplacian filter.

    Args:
        alpha: shape parameter, clamped to [0, 1].
    Returns:
        3x3 array with corners ``alpha / (alpha + 1)``, edges
        ``(1 - alpha) / (alpha + 1)`` and center ``-4 / (alpha + 1)``.
    """
    alpha = max(0, min(alpha, 1))
    denom = alpha + 1
    corner = alpha / denom
    edge = (1 - alpha) / denom
    return np.array([
        [corner, edge, corner],
        [edge, -4 / denom, edge],
        [corner, edge, corner],
    ])
208
+
209
+
210
def fspecial(filter_type, *args, **kwargs):
    '''
    Dispatch to the filter builder named by ``filter_type``.

    Supported types are 'gaussian' and 'laplacian'; remaining arguments are
    forwarded to the builder. Any other ``filter_type`` returns None.

    python code from:
    https://github.com/ronaldosena/imagens-medicas-2/blob/40171a6c259edec7827a6693a93955de2bd39e76/Aulas/aula_2_-_uniform_filter/matlab_fspecial.py
    '''
    if filter_type == 'gaussian':
        return fspecial_gaussian(*args, **kwargs)
    elif filter_type == 'laplacian':
        return fspecial_laplacian(*args, **kwargs)
    return None
219
+
220
+
221
+ """
222
+ # --------------------------------------------
223
+ # degradation models
224
+ # --------------------------------------------
225
+ """
226
+
227
+
228
def bicubic_degradation(x, sf=3):
    '''
    Downscale an image by ``1 / sf`` via ``util.imresize_np``.
    Args:
        x: HxWxC image, [0, 1]
        sf: down-scale factor
    Return:
        bicubicly downsampled LR image
    '''
    return util.imresize_np(x, scale=1 / sf)
238
+
239
+
240
def srmd_degradation(x, k, sf=3):
    ''' blur + bicubic downsampling
    Args:
        x: HxWxC image, [0, 1]
        k: hxw, double
        sf: down-scale factor
    Return:
        downsampled LR image
    Reference:
        @inproceedings{zhang2018learning,
          title={Learning a single convolutional super-resolution network for multiple degradations},
          author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei},
          booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
          pages={3262--3271},
          year={2018}
        }
    '''
    # The kernel gets a trailing singleton axis so each channel is blurred
    # independently. ndimage.convolve is the same function as the deprecated
    # ndimage.filters.convolve spelling.
    x = ndimage.convolve(x, np.expand_dims(k, axis=2), mode='wrap')  # 'nearest' | 'mirror'
    x = bicubic_degradation(x, sf=sf)
    return x
260
+
261
+
262
def dpsr_degradation(x, k, sf=3):
    ''' bicubic downsampling + blur
    Args:
        x: HxWxC image, [0, 1]
        k: hxw, double
        sf: down-scale factor
    Return:
        downsampled LR image
    Reference:
        @inproceedings{zhang2019deep,
          title={Deep Plug-and-Play Super-Resolution for Arbitrary Blur Kernels},
          author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei},
          booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
          pages={1671--1681},
          year={2019}
        }
    '''
    x = bicubic_degradation(x, sf=sf)
    # The kernel gets a trailing singleton axis so each channel is blurred
    # independently. ndimage.convolve is the same function as the deprecated
    # ndimage.filters.convolve spelling.
    x = ndimage.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
    return x
282
+
283
+
284
def classical_degradation(x, k, sf=3):
    ''' blur + downsampling
    Args:
        x: HxWxC image, [0, 1]/[0, 255]
        k: hxw, double
        sf: down-scale factor
    Return:
        downsampled LR image (every ``sf``-th pixel of the blurred image,
        starting at index 0)
    '''
    # The kernel gets a trailing singleton axis so each channel is blurred
    # independently. ndimage.convolve is the same function as the deprecated
    # ndimage.filters.convolve spelling.
    x = ndimage.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
    # x = filters.correlate(x, np.expand_dims(np.flip(k), axis=2))
    st = 0
    return x[st::sf, st::sf, ...]
297
+
298
+
299
def add_sharpening(img, weight=0.5, radius=50, threshold=10):
    """USM sharpening. borrowed from real-ESRGAN
    Input image: I; Blurry image: B.
    1. K = I + weight * (I - B)
    2. Mask = 1 if abs(I - B) > threshold, else: 0
    3. Blur mask:
    4. Out = Mask * K + (1 - Mask) * I
    Args:
        img (Numpy array): Input image, HWC, BGR; float32, [0, 1].
        weight (float): Sharp weight. Default: 0.5.
        radius (float): Kernel size of Gaussian blur. Default: 50.
            Even values are bumped to the next odd number.
        threshold (int): the residual, scaled by 255, must exceed this value
            for a pixel to be sharpened.
    """
    ksize = radius + 1 if radius % 2 == 0 else radius
    blurred = cv2.GaussianBlur(img, (ksize, ksize), 0)
    residual = img - blurred
    hard_mask = (np.abs(residual) * 255 > threshold).astype('float32')
    # Blurring the binary mask gives a smooth blend between the two images.
    soft_mask = cv2.GaussianBlur(hard_mask, (ksize, ksize), 0)

    sharpened = np.clip(img + weight * residual, 0, 1)
    return soft_mask * sharpened + (1 - soft_mask) * img
323
+
324
+
325
def add_blur(img, sf=4):
    """
    Blur ``img`` with a randomly drawn Gaussian kernel.

    With probability 0.5 the kernel is anisotropic (random eigenvalue scalings
    below ``4 + sf`` and a random angle); otherwise it is isotropic with a
    random sigma below ``2 + 0.2 * sf``. The kernel size is
    ``2 * randint(2, 11) + 3``.

    Args:
        img: image array; the kernel gets a trailing singleton axis, so a
            3-D (channels-last) array is expected.
        sf: scale factor controlling the blur-strength ranges.
    Returns:
        The blurred image.
    """
    wd2 = 4.0 + sf
    wd = 2.0 + 0.2 * sf
    if random.random() < 0.5:
        l1 = wd2 * random.random()
        l2 = wd2 * random.random()
        k = anisotropic_Gaussian(ksize=2 * random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2)
    else:
        k = fspecial('gaussian', 2 * random.randint(2, 11) + 3, wd * random.random())
    # ndimage.convolve is the same function as the deprecated
    # ndimage.filters.convolve spelling.
    img = ndimage.convolve(img, np.expand_dims(k, axis=2), mode='mirror')

    return img
337
+
338
+
339
def add_resize(img, sf=4):
    """
    Resize ``img`` by a random factor and clip the result to [0, 1].

    A uniform draw picks the mode: above 0.8 the image is upscaled by a
    factor in [1, 2]; below 0.7 it is downscaled by a factor in
    [0.5 / sf, 1]; otherwise the factor is 1. The OpenCV interpolation flag
    is chosen at random from 1, 2 and 3.
    """
    draw = np.random.rand()
    if draw > 0.8:  # up
        factor = random.uniform(1, 2)
    elif draw < 0.7:  # down
        factor = random.uniform(0.5 / sf, 1)
    else:
        factor = 1.0
    target_size = (int(factor * img.shape[1]), int(factor * img.shape[0]))
    resized = cv2.resize(img, target_size, interpolation=random.choice([1, 2, 3]))
    return np.clip(resized, 0.0, 1.0)
351
+
352
+
353
+ # def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
354
+ # noise_level = random.randint(noise_level1, noise_level2)
355
+ # rnum = np.random.rand()
356
+ # if rnum > 0.6: # add color Gaussian noise
357
+ # img += np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
358
+ # elif rnum < 0.4: # add grayscale Gaussian noise
359
+ # img += np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
360
+ # else: # add noise
361
+ # L = noise_level2 / 255.
362
+ # D = np.diag(np.random.rand(3))
363
+ # U = orth(np.random.rand(3, 3))
364
+ # conv = np.dot(np.dot(np.transpose(U), D), U)
365
+ # img += np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
366
+ # img = np.clip(img, 0.0, 1.0)
367
+ # return img
368
+
369
def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
    """
    Add one of three kinds of Gaussian noise and clip the result to [0, 1].

    A uniform draw selects the kind: independent per-channel noise (> 0.6),
    noise shared across channels (< 0.4), or channel-correlated noise drawn
    from a random 3x3 covariance scaled by ``(noise_level2 / 255) ** 2``.
    The first two use a sigma of ``randint(noise_level1, noise_level2) / 255``.
    The input array is not modified.
    """
    sigma = random.randint(noise_level1, noise_level2) / 255.0
    draw = np.random.rand()
    if draw > 0.6:  # add color Gaussian noise
        noise = np.random.normal(0, sigma, img.shape)
    elif draw < 0.4:  # add grayscale Gaussian noise
        noise = np.random.normal(0, sigma, (*img.shape[:2], 1))
    else:  # add noise
        scale = noise_level2 / 255.
        diag = np.diag(np.random.rand(3))
        basis = orth(np.random.rand(3, 3))
        cov = np.dot(np.dot(np.transpose(basis), diag), basis)
        noise = np.random.multivariate_normal([0, 0, 0], np.abs(scale ** 2 * cov), img.shape[:2])
    return np.clip(img + noise.astype(np.float32), 0.0, 1.0)
384
+
385
+
386
def add_speckle_noise(img, noise_level1=2, noise_level2=25):
    """
    Add multiplicative (speckle) noise and clip the result to [0, 1].

    The image is clipped first, then ``img * noise`` is added, where the
    noise is per-channel Gaussian, shared across channels, or
    channel-correlated, depending on a uniform draw (> 0.6, < 0.4, otherwise).
    """
    sigma = random.randint(noise_level1, noise_level2) / 255.0
    # np.clip returns a new array, so the in-place add below does not touch
    # the caller's data.
    img = np.clip(img, 0.0, 1.0)
    draw = random.random()
    if draw > 0.6:
        noise = np.random.normal(0, sigma, img.shape)
    elif draw < 0.4:
        noise = np.random.normal(0, sigma, (*img.shape[:2], 1))
    else:
        scale = noise_level2 / 255.
        diag = np.diag(np.random.rand(3))
        basis = orth(np.random.rand(3, 3))
        cov = np.dot(np.dot(np.transpose(basis), diag), basis)
        noise = np.random.multivariate_normal([0, 0, 0], np.abs(scale ** 2 * cov), img.shape[:2])
    img += img * noise.astype(np.float32)
    return np.clip(img, 0.0, 1.0)
402
+
403
+
404
def add_Poisson_noise(img):
    """
    Add Poisson (shot) noise and clip the result to [0, 1].

    The image is first quantized to 255 levels. With probability 0.5 each
    channel is replaced by a Poisson draw at a random rate scale in
    [1e2, 1e4); otherwise noise is computed on a weighted gray-scale version
    (weights 0.299, 0.587, 0.114) and added to every channel.
    """
    img = np.clip((img * 255.0).round(), 0, 255) / 255.
    rate = 10 ** (2 * random.random() + 2.0)  # [2, 4]
    if random.random() < 0.5:
        img = np.random.poisson(img * rate).astype(np.float32) / rate
    else:
        gray = np.dot(img[..., :3], [0.299, 0.587, 0.114])
        gray = np.clip((gray * 255.0).round(), 0, 255) / 255.
        gray_noise = np.random.poisson(gray * rate).astype(np.float32) / rate - gray
        # In-place add keeps the dtype of ``img``.
        img += gray_noise[:, :, np.newaxis]
    return np.clip(img, 0.0, 1.0)
416
+
417
+
418
def add_JPEG_noise(img):
    """
    Round-trip ``img`` through in-memory JPEG encoding at a random quality in [30, 95].

    The image is converted with ``util.single2uint`` and from RGB to BGR
    before encoding, and converted back after decoding.
    """
    quality = random.randint(30, 95)
    bgr = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR)
    _, encoded = cv2.imencode('.jpg', bgr, [int(cv2.IMWRITE_JPEG_QUALITY), quality])
    decoded = cv2.imdecode(encoded, 1)
    return cv2.cvtColor(util.uint2single(decoded), cv2.COLOR_BGR2RGB)
425
+
426
+
427
def random_crop(lq, hq, sf=4, lq_patchsize=64):
    """
    Cut matching random patches out of a low-quality / high-quality image pair.

    Args:
        lq: low-quality image, 3-D array (rows, cols, channels).
        hq: high-quality image, indexed at ``sf`` times the ``lq`` coordinates.
        sf: scale factor between ``lq`` and ``hq``.
        lq_patchsize: side length of the ``lq`` patch.
    Returns:
        ``(lq_patch, hq_patch)``; the ``hq`` patch has side ``lq_patchsize * sf``.
    """
    rows, cols = lq.shape[:2]
    top = random.randint(0, rows - lq_patchsize)
    left = random.randint(0, cols - lq_patchsize)
    lq_patch = lq[top:top + lq_patchsize, left:left + lq_patchsize, :]

    hq_top, hq_left = int(top * sf), int(left * sf)
    hq_side = lq_patchsize * sf
    hq_patch = hq[hq_top:hq_top + hq_side, hq_left:hq_left + hq_side, :]
    return lq_patch, hq_patch
436
+
437
+
438
def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
    """
    This is the degradation model of BSRGAN from the paper
    "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
    ----------
    img: HxWxC, [0, 1], its size should be at least (lq_patchsize x sf) x (lq_patchsize x sf)
    sf: scale factor
    lq_patchsize: side length of the low-quality patch cropped at the end
    isp_model: camera ISP model; the ISP step is skipped when it is None
    Returns
    -------
    img: low-quality patch, size: lq_patchsize x lq_patchsize x C, range: [0, 1]
    hq: corresponding high-quality patch, size: (lq_patchsize x sf) x (lq_patchsize x sf) x C, range: [0, 1]
    Raises
    ------
    ValueError: if the mod-cropped image is smaller than lq_patchsize * sf in height or width
    """
    isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25
    sf_ori = sf

    h1, w1 = img.shape[:2]
    # mod crop: axis 0 is the height and axis 1 the width, so each axis is
    # trimmed by its own remainder (the two were swapped before, which
    # mis-cropped non-square images)
    img = img.copy()[:h1 - h1 % sf, :w1 - w1 % sf, ...]
    h, w = img.shape[:2]

    if h < lq_patchsize * sf or w < lq_patchsize * sf:
        raise ValueError(f'img size ({h1}X{w1}) is too small!')

    hq = img.copy()

    if sf == 4 and random.random() < scale2_prob:  # downsample1
        if np.random.rand() < 0.5:
            # interpolation codes 1, 2, 3 are cv2's linear, cubic and area modes
            img = cv2.resize(img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])),
                             interpolation=random.choice([1, 2, 3]))
        else:
            img = util.imresize_np(img, 1 / 2, True)
        img = np.clip(img, 0.0, 1.0)
        sf = 2

    shuffle_order = random.sample(range(7), 7)
    idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
    if idx1 > idx2:  # step 3 reads the size recorded by step 2, so step 2 must run first
        shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]

    for i in shuffle_order:

        if i == 0:
            img = add_blur(img, sf=sf)

        elif i == 1:
            img = add_blur(img, sf=sf)

        elif i == 2:
            # remember (width, height) before downsample2; downsample3 targets 1/sf of it
            a, b = img.shape[1], img.shape[0]
            # downsample2
            if random.random() < 0.75:
                sf1 = random.uniform(1, 2 * sf)
                img = cv2.resize(img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])),
                                 interpolation=random.choice([1, 2, 3]))
            else:
                k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
                k_shifted = shift_pixel(k, sf)
                k_shifted = k_shifted / k_shifted.sum()  # blur with shifted kernel
                # ndimage.convolve is the public name; ndimage.filters is a deprecated namespace
                img = ndimage.convolve(img, np.expand_dims(k_shifted, axis=2), mode='mirror')
                img = img[0::sf, 0::sf, ...]  # nearest downsampling
            img = np.clip(img, 0.0, 1.0)

        elif i == 3:
            # downsample3
            img = cv2.resize(img, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
            img = np.clip(img, 0.0, 1.0)

        elif i == 4:
            # add Gaussian noise
            img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25)

        elif i == 5:
            # add JPEG noise
            if random.random() < jpeg_prob:
                img = add_JPEG_noise(img)

        elif i == 6:
            # add processed camera sensor noise
            if random.random() < isp_prob and isp_model is not None:
                with torch.no_grad():
                    img, hq = isp_model.forward(img.copy(), hq)

    # add final JPEG compression noise
    img = add_JPEG_noise(img)

    # random crop
    img, hq = random_crop(img, hq, sf_ori, lq_patchsize)

    return img, hq
527
+
528
+
529
+ # todo no isp_model?
530
def degradation_bsrgan_variant(image, sf=4, isp_model=None):
    """
    Variant of the BSRGAN degradation model from the paper
    "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
    that degrades the whole image (no patch cropping, no ISP step).
    ----------
    image: HxWxC image accepted by util.uint2single (presumably uint8 -- confirm against utils_image)
    sf: scale factor
    isp_model: unused; kept so the signature matches degradation_bsrgan
    Returns
    -------
    dict with a single key "image": the degraded image after util.single2uint,
    roughly 1/sf of the mod-cropped input size
    """
    image = util.uint2single(image)
    jpeg_prob, scale2_prob = 0.9, 0.25

    h1, w1 = image.shape[:2]
    # mod crop: axis 0 is the height and axis 1 the width, so each axis is
    # trimmed by its own remainder (the two were swapped before, which
    # mis-cropped non-square images)
    image = image.copy()[:h1 - h1 % sf, :w1 - w1 % sf, ...]

    if sf == 4 and random.random() < scale2_prob:  # downsample1
        if np.random.rand() < 0.5:
            # interpolation codes 1, 2, 3 are cv2's linear, cubic and area modes
            image = cv2.resize(image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])),
                               interpolation=random.choice([1, 2, 3]))
        else:
            image = util.imresize_np(image, 1 / 2, True)
        image = np.clip(image, 0.0, 1.0)
        sf = 2

    shuffle_order = random.sample(range(7), 7)
    idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
    if idx1 > idx2:  # step 3 reads the size recorded by step 2, so step 2 must run first
        shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]

    for i in shuffle_order:

        if i == 0:
            image = add_blur(image, sf=sf)

        elif i == 1:
            image = add_blur(image, sf=sf)

        elif i == 2:
            # remember (width, height) before downsample2; downsample3 targets 1/sf of it
            a, b = image.shape[1], image.shape[0]
            # downsample2
            if random.random() < 0.75:
                sf1 = random.uniform(1, 2 * sf)
                image = cv2.resize(image, (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])),
                                   interpolation=random.choice([1, 2, 3]))
            else:
                k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
                k_shifted = shift_pixel(k, sf)
                k_shifted = k_shifted / k_shifted.sum()  # blur with shifted kernel
                # ndimage.convolve is the public name; ndimage.filters is a deprecated namespace
                image = ndimage.convolve(image, np.expand_dims(k_shifted, axis=2), mode='mirror')
                image = image[0::sf, 0::sf, ...]  # nearest downsampling
            image = np.clip(image, 0.0, 1.0)

        elif i == 3:
            # downsample3
            image = cv2.resize(image, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
            image = np.clip(image, 0.0, 1.0)

        elif i == 4:
            # add Gaussian noise
            image = add_Gaussian_noise(image, noise_level1=2, noise_level2=25)

        elif i == 5:
            # add JPEG noise
            if random.random() < jpeg_prob:
                image = add_JPEG_noise(image)

        # i == 6 (camera ISP noise) is intentionally a no-op in this variant

    # add final JPEG compression noise
    image = add_JPEG_noise(image)
    image = util.single2uint(image)
    example = {"image": image}
    return example
614
+
615
+
616
+ # TODO incase there is a pickle error one needs to replace a += x with a = a + x in add_speckle_noise etc...
617
def degradation_bsrgan_plus(img, sf=4, shuffle_prob=0.5, use_sharp=True, lq_patchsize=64, isp_model=None):
    """
    This is an extended degradation model by combining
    the degradation models of BSRGAN and Real-ESRGAN
    ----------
    img: HxWxC, [0, 1], its size should be at least (lq_patchsize x sf) x (lq_patchsize x sf)
    sf: scale factor
    shuffle_prob: probability of fully shuffling the 13 degradation steps;
        otherwise only the two noise groups are shuffled internally
    use_sharp: sharpen the image (and therefore the returned hq) first
    lq_patchsize: side length of the low-quality patch cropped at the end
    isp_model: camera ISP model; the ISP steps are skipped when it is None
    Returns
    -------
    img: low-quality patch, size: lq_patchsize x lq_patchsize x C, range: [0, 1]
    hq: corresponding high-quality patch, size: (lq_patchsize x sf) x (lq_patchsize x sf) x C, range: [0, 1]
    Raises
    ------
    ValueError: if the mod-cropped image is smaller than lq_patchsize * sf in height or width
    """

    h1, w1 = img.shape[:2]
    # mod crop: axis 0 is the height and axis 1 the width, so each axis is
    # trimmed by its own remainder (the two were swapped before, which
    # mis-cropped non-square images)
    img = img.copy()[:h1 - h1 % sf, :w1 - w1 % sf, ...]
    h, w = img.shape[:2]

    if h < lq_patchsize * sf or w < lq_patchsize * sf:
        raise ValueError(f'img size ({h1}X{w1}) is too small!')

    if use_sharp:
        img = add_sharpening(img)
    hq = img.copy()

    if random.random() < shuffle_prob:
        shuffle_order = random.sample(range(13), 13)
    else:
        shuffle_order = list(range(13))
        # local shuffle for noise, JPEG is always the last one
        shuffle_order[2:6] = random.sample(shuffle_order[2:6], 4)
        shuffle_order[9:13] = random.sample(shuffle_order[9:13], 4)

    poisson_prob, speckle_prob, isp_prob = 0.1, 0.1, 0.1

    for i in shuffle_order:
        if i == 0:
            img = add_blur(img, sf=sf)
        elif i == 1:
            img = add_resize(img, sf=sf)
        elif i == 2:
            img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25)
        elif i == 3:
            if random.random() < poisson_prob:
                img = add_Poisson_noise(img)
        elif i == 4:
            if random.random() < speckle_prob:
                img = add_speckle_noise(img)
        elif i == 5:
            if random.random() < isp_prob and isp_model is not None:
                with torch.no_grad():
                    img, hq = isp_model.forward(img.copy(), hq)
        elif i == 6:
            img = add_JPEG_noise(img)
        elif i == 7:
            img = add_blur(img, sf=sf)
        elif i == 8:
            img = add_resize(img, sf=sf)
        elif i == 9:
            img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25)
        elif i == 10:
            if random.random() < poisson_prob:
                img = add_Poisson_noise(img)
        elif i == 11:
            if random.random() < speckle_prob:
                img = add_speckle_noise(img)
        elif i == 12:
            if random.random() < isp_prob and isp_model is not None:
                with torch.no_grad():
                    img, hq = isp_model.forward(img.copy(), hq)
        else:
            # defensive: shuffle_order is always a permutation of 0..12
            print('check the shuffle!')

    # resize to desired size (1/sf of hq); codes 1, 2, 3 are cv2's linear, cubic and area modes
    img = cv2.resize(img, (int(1 / sf * hq.shape[1]), int(1 / sf * hq.shape[0])),
                     interpolation=random.choice([1, 2, 3]))

    # add final JPEG compression noise
    img = add_JPEG_noise(img)

    # random crop
    img, hq = random_crop(img, hq, sf, lq_patchsize)

    return img, hq
702
+
703
+
704
if __name__ == '__main__':
    # Smoke test: degrade a 448x448 crop of the test image 20 times and save,
    # for each run, [bicubic LQ | degraded LQ | HQ] side by side.
    print("hey")
    img = util.imread_uint('utils/test.png', 3)
    img = img[:448, :448]
    h = img.shape[0] // 4
    print("resizing to", h)
    sf = 4
    deg_fn = partial(degradation_bsrgan_variant, sf=sf)
    for i in range(20):
        print(i)
        # degradation_bsrgan_variant applies util.uint2single itself and returns
        # {"image": ...}, so it receives the image as loaded and the result is unpacked.
        img_hq = img
        img_lq = deg_fn(img)["image"]
        img_hq, img_lq = util.uint2single(img_hq), util.uint2single(img_lq)
        print(img_lq)
        img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)["image"]
        print(img_lq.shape)
        print("bicubic", img_lq_bicubic.shape)
        print(img_hq.shape)
        # interpolation=0 is nearest-neighbour, so the upscaled previews show the raw LQ pixels
        lq_nearest = cv2.resize(util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
                                interpolation=0)
        lq_bicubic_nearest = cv2.resize(util.single2uint(img_lq_bicubic),
                                        (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
                                        interpolation=0)
        img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1)
        util.imsave(img_concat, str(i) + '.png')
ldmlib/modules/image_degradation/bsrgan_light.py ADDED
@@ -0,0 +1,650 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import numpy as np
3
+ import cv2
4
+ import torch
5
+
6
+ from functools import partial
7
+ import random
8
+ from scipy import ndimage
9
+ import scipy
10
+ import scipy.stats as ss
11
+ from scipy.interpolate import interp2d
12
+ from scipy.linalg import orth
13
+ import albumentations
14
+
15
+ import ldmlib.modules.image_degradation.utils_image as util
16
+
17
+ """
18
+ # --------------------------------------------
19
+ # Super-Resolution
20
+ # --------------------------------------------
21
+ #
22
+ # Kai Zhang (cskaizhang@gmail.com)
23
+ # https://github.com/cszn
24
+ # From 2019/03--2021/08
25
+ # --------------------------------------------
26
+ """
27
+
28
+
29
def modcrop_np(img, sf):
    '''
    Crop a copy of the image so that its first two dimensions are multiples of sf.
    Args:
        img: numpy image, 2-D or 3-D (the first two axes are cropped)
        sf: scale factor
    Return:
        cropped image (the input array is not modified)
    '''
    dim0, dim1 = img.shape[:2]
    keep0 = dim0 - dim0 % sf
    keep1 = dim1 - dim1 % sf
    return np.copy(img)[:keep0, :keep1, ...]
40
+
41
+
42
+ """
43
+ # --------------------------------------------
44
+ # anisotropic Gaussian kernels
45
+ # --------------------------------------------
46
+ """
47
+
48
+
49
def analytic_kernel(k):
    """Calculate the X4 kernel from the X2 kernel (for proof see appendix in paper)

    Args:
        k: square 2-D kernel (only ``k.shape[0]`` is read for its size)
    Returns:
        the expanded kernel with ``k.shape[0] // 2`` entries trimmed from
        every border, normalised to sum to 1
    """
    size = k.shape[0]
    big_size = 3 * size - 2
    expanded = np.zeros((big_size, big_size))
    # Stamp a copy of k, weighted by k[row, col], at stride-2 offsets
    for row, col in np.ndindex(size, size):
        expanded[2 * row:2 * row + size, 2 * col:2 * col + size] += k[row, col] * k
    # Trim the border, where the values are very small
    margin = size // 2
    trimmed = expanded[margin:-margin, margin:-margin]
    return trimmed / trimmed.sum()
63
+
64
+
65
def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6):
    """ generate an anisotropic Gaussian kernel
    Args:
        ksize : e.g., 15, kernel size
        theta : [0,  pi], rotation angle range
        l1    : [0.1,50], scaling of eigenvalues
        l2    : [0.1,l1], scaling of eigenvalues
        If l1 = l2, will get an isotropic Gaussian kernel.
    Returns:
        k     : kernel
    """
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    rotation = np.array([[cos_t, -sin_t], [sin_t, cos_t]])
    # first principal direction: the unit x vector rotated by theta
    v = np.dot(rotation, np.array([1., 0.]))
    V = np.array([[v[0], v[1]], [v[1], -v[0]]])
    D = np.array([[l1, 0], [0, l2]])
    # covariance assembled from the eigenvector matrix V and eigenvalues D
    Sigma = np.dot(np.dot(V, D), np.linalg.inv(V))
    return gm_blur_kernel(mean=[0, 0], cov=Sigma, size=ksize)
84
+
85
+
86
def gm_blur_kernel(mean, cov, size=15):
    """Sample a 2-D Gaussian density on a size x size grid and normalise it.

    Args:
        mean: length-2 mean, in (x, y) order
        cov: 2x2 covariance matrix
        size: side length of the kernel
    Returns:
        size x size kernel that sums to 1
    """
    offset = size / 2.0 + 0.5
    kernel = np.zeros([size, size])
    for row, col in np.ndindex(size, size):
        # grid coordinates measured from the kernel centre, passed as (x, y)
        point = [col - offset + 1, row - offset + 1]
        kernel[row, col] = ss.multivariate_normal.pdf(point, mean=mean, cov=cov)
    return kernel / np.sum(kernel)
97
+
98
+
99
def shift_pixel(x, sf, upper_left=True):
    """shift pixel for super-resolution with different scale factors

    Resamples ``x`` with bilinear interpolation on a grid displaced by
    ``(sf - 1) / 2`` pixels along both image axes; sample positions are
    clamped to the image, so border values are repeated.

    Args:
        x: 2-D array (rows x cols) or 3-D array (rows x cols x channels),
           with at least 2 rows and 2 columns
        sf: scale factor
        upper_left: if True sample at +shift (content moves toward the
            upper-left corner), otherwise at -shift
    Returns:
        the shifted array. 2-D input yields a new float array; 3-D input
        yields a new array with the input's dtype. The input is left
        unmodified. Arrays of any other rank are returned unchanged.
    """
    # Imported locally: scipy.interpolate.interp2d, which this function used
    # before, raises in SciPy >= 1.14. A degree-1 RectBivariateSpline is the
    # documented replacement for linear interp2d on a regular grid.
    from scipy.interpolate import RectBivariateSpline

    h, w = x.shape[:2]
    shift = (sf - 1) * 0.5
    xv, yv = np.arange(0, w, 1.0), np.arange(0, h, 1.0)
    if upper_left:
        x1 = xv + shift
        y1 = yv + shift
    else:
        x1 = xv - shift
        y1 = yv - shift

    x1 = np.clip(x1, 0, w - 1)
    y1 = np.clip(y1, 0, h - 1)

    def _resample(plane):
        # plane is indexed (row, col), so rows go with yv and columns with xv
        return RectBivariateSpline(yv, xv, plane, kx=1, ky=1)(y1, x1)

    if x.ndim == 2:
        return _resample(x)
    if x.ndim == 3:
        # write into a fresh array instead of overwriting the caller's data
        shifted = np.empty_like(x)
        for i in range(x.shape[-1]):
            shifted[:, :, i] = _resample(x[:, :, i])
        return shifted

    return x
126
+
127
+
128
def blur(x, k):
    '''
    Filter every image of a batch with its own kernel, using replicate padding.

    x: image, NxcxHxW
    k: kernel, Nx1xhxw (one kernel per batch element, shared by all channels)
    Returns: tensor of shape NxcxHxW for odd kernel sizes
    '''
    n, c = x.shape[:2]
    pad_h, pad_w = (k.shape[-2] - 1) // 2, (k.shape[-1] - 1) // 2
    # F.pad expects (left, right, top, bottom): the width padding comes first.
    # Passing (pad_h, pad_w, pad_h, pad_w) was only right for square kernels.
    x = torch.nn.functional.pad(x, pad=(pad_w, pad_w, pad_h, pad_h), mode='replicate')
    # Fold batch and channel into one axis and run a grouped convolution so
    # that each (image, channel) plane is filtered by that image's kernel.
    k = k.repeat(1, c, 1, 1)
    k = k.view(-1, 1, k.shape[2], k.shape[3])
    x = x.view(1, -1, x.shape[2], x.shape[3])
    x = torch.nn.functional.conv2d(x, k, bias=None, stride=1, padding=0, groups=n * c)
    x = x.view(n, c, x.shape[2], x.shape[3])

    return x
143
+
144
+
145
def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var=0.6, max_var=10., noise_level=0):
    """
    Generate a random anisotropic Gaussian kernel.

    # modified version of https://github.com/assafshocher/BlindSR_dataset_generator
    # Kai Zhang
    # min_var = 0.175 * sf  # variance of the gaussian kernel will be sampled between min_var and max_var
    # max_var = 2.5 * sf

    Args:
        k_size: numpy array with the two kernel dimensions
        scale_factor: numpy array with the two scale factors; offsets the kernel centre
        min_var, max_var: range from which the two eigenvalues are drawn uniformly
        noise_level: amplitude of the multiplicative uniform noise (0 disables it)
    Returns:
        kernel normalised to sum to 1
    """
    # Random eigenvalues (lambdas), angle (theta) and multiplicative noise
    var_span = max_var - min_var
    lambda_1 = min_var + np.random.rand() * var_span
    lambda_2 = min_var + np.random.rand() * var_span
    theta = np.random.rand() * np.pi
    noise = -noise_level + np.random.rand(*k_size) * noise_level * 2

    # Covariance from the rotation and the eigenvalues, then its inverse
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    rotation = np.array([[cos_t, -sin_t],
                         [sin_t, cos_t]])
    sigma = rotation @ np.diag([lambda_1, lambda_2]) @ rotation.T
    inv_sigma = np.linalg.inv(sigma)[None, None, :, :]

    # Expected position (shifting the kernel for an aligned image)
    mu = (k_size // 2 - 0.5 * (scale_factor - 1))[None, None, :, None]

    # Coordinates of every kernel pixel relative to the expected position
    grid_x, grid_y = np.meshgrid(range(k_size[0]), range(k_size[1]))
    offsets = np.stack([grid_x, grid_y], 2)[:, :, :, None] - mu
    offsets_t = offsets.transpose(0, 1, 3, 2)

    # Gaussian value for every pixel, modulated by the noise
    raw_kernel = np.exp(-0.5 * np.squeeze(offsets_t @ inv_sigma @ offsets)) * (1 + noise)

    # Normalize the kernel and return
    return raw_kernel / np.sum(raw_kernel)
185
+
186
+
187
def fspecial_gaussian(hsize, sigma):
    """Isotropic Gaussian kernel, following MATLAB's fspecial('gaussian').

    Args:
        hsize: side length of the square kernel
        sigma: standard deviation of the Gaussian
    Returns:
        hsize x hsize kernel, normalised to sum to 1 when its sum is non-zero
    """
    half = (hsize - 1.0) / 2.0
    coords = np.arange(-half, half + 1)
    x, y = np.meshgrid(coords, coords)
    h = np.exp(-(x * x + y * y) / (2 * sigma * sigma))
    # Zero out entries that are negligible relative to the peak.
    # np.finfo replaces scipy.finfo, a NumPy alias dropped from SciPy's namespace.
    h[h < np.finfo(float).eps * h.max()] = 0
    total = h.sum()
    if total != 0:
        h = h / total
    return h
199
+
200
+
201
def fspecial_laplacian(alpha):
    """3x3 Laplacian kernel, following MATLAB's fspecial('laplacian').

    Args:
        alpha: shape parameter; clamped to [0, 1]
    Returns:
        3x3 numpy array
    """
    alpha = max([0, min([alpha, 1])])
    denom = alpha + 1
    corner = alpha / denom
    edge = (1 - alpha) / denom
    center = -4 / denom
    return np.array([[corner, edge, corner],
                     [edge, center, edge],
                     [corner, edge, corner]])
208
+
209
+
210
def fspecial(filter_type, *args, **kwargs):
    '''
    Build a named filter kernel ('gaussian' or 'laplacian'), forwarding the
    remaining arguments to the matching builder. Any other filter_type
    yields None.

    python code from:
    https://github.com/ronaldosena/imagens-medicas-2/blob/40171a6c259edec7827a6693a93955de2bd39e76/Aulas/aula_2_-_uniform_filter/matlab_fspecial.py
    '''
    builder = None
    if filter_type == 'gaussian':
        builder = fspecial_gaussian
    elif filter_type == 'laplacian':
        builder = fspecial_laplacian

    if builder is None:
        return None
    return builder(*args, **kwargs)
219
+
220
+
221
+ """
222
+ # --------------------------------------------
223
+ # degradation models
224
+ # --------------------------------------------
225
+ """
226
+
227
+
228
def bicubic_degradation(x, sf=3):
    '''
    Downscale an image by 1 / sf with util.imresize_np.
    Args:
        x: HxWxC image, [0, 1]
        sf: down-scale factor
    Return:
        bicubicly downsampled LR image
    '''
    return util.imresize_np(x, scale=1 / sf)
238
+
239
+
240
def srmd_degradation(x, k, sf=3):
    ''' blur + bicubic downsampling
    Args:
        x: HxWxC image, [0, 1]
        k: hxw, double
        sf: down-scale factor
    Return:
        downsampled LR image
    Reference:
        @inproceedings{zhang2018learning,
          title={Learning a single convolutional super-resolution network for multiple degradations},
          author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei},
          booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
          pages={3262--3271},
          year={2018}
        }
    '''
    # ndimage.convolve is the public name; the ndimage.filters namespace is deprecated.
    # The kernel gets a singleton channel axis so every channel is blurred independently.
    x = ndimage.convolve(x, np.expand_dims(k, axis=2), mode='wrap')  # 'nearest' | 'mirror'
    x = bicubic_degradation(x, sf=sf)
    return x
260
+
261
+
262
def dpsr_degradation(x, k, sf=3):
    ''' bicubic downsampling + blur
    Args:
        x: HxWxC image, [0, 1]
        k: hxw, double
        sf: down-scale factor
    Return:
        downsampled LR image
    Reference:
        @inproceedings{zhang2019deep,
          title={Deep Plug-and-Play Super-Resolution for Arbitrary Blur Kernels},
          author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei},
          booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
          pages={1671--1681},
          year={2019}
        }
    '''
    x = bicubic_degradation(x, sf=sf)
    # ndimage.convolve is the public name; the ndimage.filters namespace is deprecated.
    # The kernel gets a singleton channel axis so every channel is blurred independently.
    x = ndimage.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
    return x
282
+
283
+
284
def classical_degradation(x, k, sf=3):
    ''' blur + downsampling
    Args:
        x: HxWxC image, [0, 1]/[0, 255]
        k: hxw, double
        sf: down-scale factor
    Return:
        downsampled LR image (every sf-th pixel of the blurred image, starting at index 0)
    '''
    # ndimage.convolve is the public name; the ndimage.filters namespace is deprecated.
    # The kernel gets a singleton channel axis so every channel is blurred independently.
    x = ndimage.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
    # x = filters.correlate(x, np.expand_dims(np.flip(k), axis=2))
    st = 0
    return x[st::sf, st::sf, ...]
297
+
298
+
299
def add_sharpening(img, weight=0.5, radius=50, threshold=10):
    """USM sharpening. borrowed from real-ESRGAN
    Input image: I; Blurry image: B.
    1. K = I + weight * (I - B)
    2. Mask = 1 if abs(I - B) * 255 > threshold, else: 0
    3. Blur mask:
    4. Out = Mask * K + (1 - Mask) * I
    Args:
        img (Numpy array): Input image, HWC, BGR; float32, [0, 1].
        weight (float): Sharp weight. Default: 0.5.
        radius (float): Kernel size of Gaussian blur (bumped to the next odd
            number when even). Default: 50.
        threshold (int): residual magnitude, on a 0-255 scale, above which a
            pixel is sharpened. Default: 10.
    """
    if radius % 2 == 0:
        radius += 1
    ksize = (radius, radius)

    blurred = cv2.GaussianBlur(img, ksize, 0)
    residual = img - blurred

    hard_mask = (np.abs(residual) * 255 > threshold).astype('float32')
    soft_mask = cv2.GaussianBlur(hard_mask, ksize, 0)

    sharpened = np.clip(img + weight * residual, 0, 1)
    return soft_mask * sharpened + (1 - soft_mask) * img
323
+
324
+
325
def add_blur(img, sf=4):
    """Blur an image with a randomly drawn Gaussian kernel.

    With probability 0.5 an anisotropic Gaussian (random size 5..14, random
    angle, random eigenvalue scalings) is used, otherwise an isotropic one
    (random size 5..7). Kernel widths grow with ``sf``.

    Args:
        img: HxWxC image
        sf: scale factor
    Returns:
        blurred image of the same shape
    """
    wd2 = (4.0 + sf) / 4
    wd = (2.0 + 0.2 * sf) / 4

    if random.random() < 0.5:
        l1 = wd2 * random.random()
        l2 = wd2 * random.random()
        k = anisotropic_Gaussian(ksize=random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2)
    else:
        k = fspecial('gaussian', random.randint(2, 4) + 3, wd * random.random())
    # ndimage.convolve is the public name; the ndimage.filters namespace is deprecated.
    # The kernel gets a singleton channel axis so every channel is blurred independently.
    img = ndimage.convolve(img, np.expand_dims(k, axis=2), mode='mirror')

    return img
341
+
342
+
343
def add_resize(img, sf=4):
    """Randomly rescale an image.

    With probability ~0.2 the image is enlarged by a factor in [1, 2], with
    probability ~0.7 it is shrunk by a factor in [0.5 / sf, 1], otherwise the
    size is kept. The interpolation mode is picked at random from cv2 codes
    1, 2, 3 (linear, cubic, area).

    Args:
        img: HxWxC image
        sf: scale factor; bounds the strongest shrink
    Returns:
        resized image clipped to [0, 1]
    """
    rnum = np.random.rand()
    if rnum > 0.8:  # up
        scale = random.uniform(1, 2)
    elif rnum < 0.7:  # down
        scale = random.uniform(0.5 / sf, 1)
    else:
        scale = 1.0

    target_size = (int(scale * img.shape[1]), int(scale * img.shape[0]))  # (width, height)
    resized = cv2.resize(img, target_size, interpolation=random.choice([1, 2, 3]))
    return np.clip(resized, 0.0, 1.0)
355
+
356
+
357
+ # def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
358
+ # noise_level = random.randint(noise_level1, noise_level2)
359
+ # rnum = np.random.rand()
360
+ # if rnum > 0.6: # add color Gaussian noise
361
+ # img += np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
362
+ # elif rnum < 0.4: # add grayscale Gaussian noise
363
+ # img += np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
364
+ # else: # add noise
365
+ # L = noise_level2 / 255.
366
+ # D = np.diag(np.random.rand(3))
367
+ # U = orth(np.random.rand(3, 3))
368
+ # conv = np.dot(np.dot(np.transpose(U), D), U)
369
+ # img += np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
370
+ # img = np.clip(img, 0.0, 1.0)
371
+ # return img
372
+
373
def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
    """Add Gaussian noise of a random type and strength.

    A noise level is drawn from the integers noise_level1..noise_level2
    (on a 0-255 scale). Then one of three kinds is chosen: independent
    per-channel noise (~40%), one noise map shared by all channels (~40%),
    or channel-correlated noise with a random 3x3 covariance scaled by
    noise_level2 (~20%).

    Args:
        img: HxWxC float image (3 channels are required by the correlated case)
        noise_level1: lowest noise level
        noise_level2: highest noise level
    Returns:
        new noisy image clipped to [0, 1]; the input is not modified
    """
    noise_level = random.randint(noise_level1, noise_level2)
    selector = np.random.rand()
    if selector > 0.6:  # color Gaussian noise
        noise = np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
    elif selector < 0.4:  # grayscale Gaussian noise, broadcast over the channels
        noise = np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
    else:  # channel-correlated noise
        L = noise_level2 / 255.
        D = np.diag(np.random.rand(3))
        U = orth(np.random.rand(3, 3))
        conv = np.dot(np.dot(np.transpose(U), D), U)
        noise = np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
    return np.clip(img + noise, 0.0, 1.0)
388
+
389
+
390
def add_speckle_noise(img, noise_level1=2, noise_level2=25):
    """Add multiplicative (speckle) Gaussian noise of a random type and strength.

    The image is clipped to [0, 1] and then perturbed by ``img * gain`` where
    the gain is, at random, independent per channel (~40%), shared by all
    channels (~40%), or channel-correlated with a random 3x3 covariance
    scaled by noise_level2 (~20%).

    Args:
        img: HxWxC float image (3 channels are required by the correlated case)
        noise_level1: lowest noise level, on a 0-255 scale
        noise_level2: highest noise level, on a 0-255 scale
    Returns:
        noisy image clipped to [0, 1]
    """
    noise_level = random.randint(noise_level1, noise_level2)
    img = np.clip(img, 0.0, 1.0)
    selector = random.random()
    if selector > 0.6:
        gain = np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
    elif selector < 0.4:
        gain = np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
    else:
        L = noise_level2 / 255.
        D = np.diag(np.random.rand(3))
        U = orth(np.random.rand(3, 3))
        conv = np.dot(np.dot(np.transpose(U), D), U)
        gain = np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
    # in-place on the clipped copy created above, not on the caller's array
    img += img * gain
    return np.clip(img, 0.0, 1.0)
406
+
407
+
408
def add_Poisson_noise(img):
    """Add Poisson (shot) noise to an image.

    The image is first quantised to 256 levels. A random scale between
    10**2 and 10**4 is drawn; with probability 0.5 every channel gets its own
    Poisson sample, otherwise a single noise map is derived from the
    luma (0.299, 0.587, 0.114 weights on the first three channels) and added
    to all channels.

    Args:
        img: HxWxC float array; values are clipped to [0, 1] after quantisation.
    Returns:
        noisy image clipped to [0, 1]
    """
    quantized = np.clip((img * 255.0).round(), 0, 255) / 255.
    # exponent is uniform in [2, 4)
    peak = 10 ** (2 * random.random() + 2.0)
    per_channel = random.random() < 0.5
    if per_channel:
        noisy = np.random.poisson(quantized * peak).astype(np.float32) / peak
        return np.clip(noisy, 0.0, 1.0)

    luma = np.dot(quantized[..., :3], [0.299, 0.587, 0.114])
    luma = np.clip((luma * 255.0).round(), 0, 255) / 255.
    luma_noise = np.random.poisson(luma * peak).astype(np.float32) / peak - luma
    # in-place add on the freshly created quantised copy (the caller's array is untouched)
    quantized += luma_noise[:, :, np.newaxis]
    return np.clip(quantized, 0.0, 1.0)
420
+
421
+
422
def add_JPEG_noise(img):
    """Round-trip an RGB image through in-memory JPEG compression.

    The quality factor is drawn uniformly from the integers 80..95.

    Args:
        img: RGB image accepted by ``util.single2uint``
             (presumably float in [0, 1] -- confirm against utils_image).
    Returns:
        the decoded image, converted back with ``util.uint2single`` and
        returned in RGB channel order.
    """
    quality_factor = random.randint(80, 95)
    bgr_uint = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR)
    encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor]
    _, jpeg_buffer = cv2.imencode('.jpg', bgr_uint, encode_params)
    # flag 1 requests a 3-channel colour decode
    decoded = cv2.imdecode(jpeg_buffer, 1)
    return cv2.cvtColor(util.uint2single(decoded), cv2.COLOR_BGR2RGB)
429
+
430
+
431
def random_crop(lq, hq, sf=4, lq_patchsize=64):
    """Cut a random low-quality patch and the aligned high-quality patch.

    Args:
        lq: low-quality image, HxWxC
        hq: high-quality image, expected to be ``sf`` times larger than ``lq``
        sf: scale factor between ``hq`` and ``lq``
        lq_patchsize: side length of the square patch taken from ``lq``
    Returns:
        (lq_patch, hq_patch); the hq patch starts at the lq offset times
        ``sf`` and has side length ``lq_patchsize * sf``.
    """
    lq_h, lq_w = lq.shape[:2]
    top = random.randint(0, lq_h - lq_patchsize)
    left = random.randint(0, lq_w - lq_patchsize)
    lq_patch = lq[top:top + lq_patchsize, left:left + lq_patchsize, :]

    hq_top = int(top * sf)
    hq_left = int(left * sf)
    hq_side = lq_patchsize * sf
    hq_patch = hq[hq_top:hq_top + hq_side, hq_left:hq_left + hq_side, :]
    return lq_patch, hq_patch
440
+
441
+
442
def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
    """
    This is the degradation model of BSRGAN from the paper
    "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
    (light version: weaker Gaussian noise than the original model)
    ----------
    img: HxWxC, [0, 1], its size should be at least (lq_patchsize x sf) x (lq_patchsize x sf)
    sf: scale factor
    lq_patchsize: side length of the low-quality patch cropped at the end
    isp_model: camera ISP model; the ISP step is skipped when it is None
    Returns
    -------
    img: low-quality patch, size: lq_patchsize x lq_patchsize x C, range: [0, 1]
    hq: corresponding high-quality patch, size: (lq_patchsize x sf) x (lq_patchsize x sf) x C, range: [0, 1]
    Raises
    ------
    ValueError: if the mod-cropped image is smaller than lq_patchsize * sf in height or width
    """
    isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25
    sf_ori = sf

    h1, w1 = img.shape[:2]
    # mod crop: axis 0 is the height and axis 1 the width, so each axis is
    # trimmed by its own remainder (the two were swapped before, which
    # mis-cropped non-square images)
    img = img.copy()[:h1 - h1 % sf, :w1 - w1 % sf, ...]
    h, w = img.shape[:2]

    if h < lq_patchsize * sf or w < lq_patchsize * sf:
        raise ValueError(f'img size ({h1}X{w1}) is too small!')

    hq = img.copy()

    if sf == 4 and random.random() < scale2_prob:  # downsample1
        if np.random.rand() < 0.5:
            # interpolation codes 1, 2, 3 are cv2's linear, cubic and area modes
            img = cv2.resize(img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])),
                             interpolation=random.choice([1, 2, 3]))
        else:
            img = util.imresize_np(img, 1 / 2, True)
        img = np.clip(img, 0.0, 1.0)
        sf = 2

    shuffle_order = random.sample(range(7), 7)
    idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
    if idx1 > idx2:  # step 3 reads the size recorded by step 2, so step 2 must run first
        shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]

    for i in shuffle_order:

        if i == 0:
            img = add_blur(img, sf=sf)

        elif i == 1:
            img = add_blur(img, sf=sf)

        elif i == 2:
            # remember (width, height) before downsample2; downsample3 targets 1/sf of it
            a, b = img.shape[1], img.shape[0]
            # downsample2
            if random.random() < 0.75:
                sf1 = random.uniform(1, 2 * sf)
                img = cv2.resize(img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])),
                                 interpolation=random.choice([1, 2, 3]))
            else:
                k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
                k_shifted = shift_pixel(k, sf)
                k_shifted = k_shifted / k_shifted.sum()  # blur with shifted kernel
                # ndimage.convolve is the public name; ndimage.filters is a deprecated namespace
                img = ndimage.convolve(img, np.expand_dims(k_shifted, axis=2), mode='mirror')
                img = img[0::sf, 0::sf, ...]  # nearest downsampling
            img = np.clip(img, 0.0, 1.0)

        elif i == 3:
            # downsample3
            img = cv2.resize(img, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
            img = np.clip(img, 0.0, 1.0)

        elif i == 4:
            # add Gaussian noise
            img = add_Gaussian_noise(img, noise_level1=2, noise_level2=8)

        elif i == 5:
            # add JPEG noise
            if random.random() < jpeg_prob:
                img = add_JPEG_noise(img)

        elif i == 6:
            # add processed camera sensor noise
            if random.random() < isp_prob and isp_model is not None:
                with torch.no_grad():
                    img, hq = isp_model.forward(img.copy(), hq)

    # add final JPEG compression noise
    img = add_JPEG_noise(img)

    # random crop
    img, hq = random_crop(img, hq, sf_ori, lq_patchsize)

    return img, hq
531
+
532
+
533
+ # todo no isp_model?
534
def degradation_bsrgan_variant(image, sf=4, isp_model=None):
    """
    Light variant of the BSRGAN degradation model from the paper
    "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
    that degrades the whole image (no patch cropping, single blur step,
    very weak Gaussian noise, no ISP step).
    ----------
    image: HxWxC image accepted by util.uint2single (presumably uint8 -- confirm against utils_image)
    sf: scale factor
    isp_model: unused; kept so the signature matches degradation_bsrgan
    Returns
    -------
    dict with a single key "image": the degraded image after util.single2uint,
    roughly 1/sf of the mod-cropped input size
    """
    image = util.uint2single(image)
    jpeg_prob, scale2_prob = 0.9, 0.25

    h1, w1 = image.shape[:2]
    # mod crop: axis 0 is the height and axis 1 the width, so each axis is
    # trimmed by its own remainder (the two were swapped before, which
    # mis-cropped non-square images)
    image = image.copy()[:h1 - h1 % sf, :w1 - w1 % sf, ...]

    if sf == 4 and random.random() < scale2_prob:  # downsample1
        if np.random.rand() < 0.5:
            # interpolation codes 1, 2, 3 are cv2's linear, cubic and area modes
            image = cv2.resize(image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])),
                               interpolation=random.choice([1, 2, 3]))
        else:
            image = util.imresize_np(image, 1 / 2, True)
        image = np.clip(image, 0.0, 1.0)
        sf = 2

    shuffle_order = random.sample(range(7), 7)
    idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
    if idx1 > idx2:  # step 3 reads the size recorded by step 2, so step 2 must run first
        shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]

    for i in shuffle_order:

        if i == 0:
            image = add_blur(image, sf=sf)

        # i == 1 (second blur) is intentionally disabled in the light variant

        elif i == 2:
            # remember (width, height) before downsample2; downsample3 targets 1/sf of it
            a, b = image.shape[1], image.shape[0]
            # downsample2
            if random.random() < 0.8:
                sf1 = random.uniform(1, 2 * sf)
                image = cv2.resize(image, (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])),
                                   interpolation=random.choice([1, 2, 3]))
            else:
                k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
                k_shifted = shift_pixel(k, sf)
                k_shifted = k_shifted / k_shifted.sum()  # blur with shifted kernel
                # ndimage.convolve is the public name; ndimage.filters is a deprecated namespace
                image = ndimage.convolve(image, np.expand_dims(k_shifted, axis=2), mode='mirror')
                image = image[0::sf, 0::sf, ...]  # nearest downsampling

            image = np.clip(image, 0.0, 1.0)

        elif i == 3:
            # downsample3
            image = cv2.resize(image, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
            image = np.clip(image, 0.0, 1.0)

        elif i == 4:
            # add Gaussian noise
            image = add_Gaussian_noise(image, noise_level1=1, noise_level2=2)

        elif i == 5:
            # add JPEG noise
            if random.random() < jpeg_prob:
                image = add_JPEG_noise(image)

        # i == 6 (camera ISP noise) is intentionally a no-op in this variant

    # add final JPEG compression noise
    image = add_JPEG_noise(image)
    image = util.single2uint(image)
    example = {"image": image}
    return example
622
+
623
+
624
+
625
+
626
if __name__ == '__main__':
    # Smoke test: degrade a 448x448 crop of the test image 20 times and save,
    # for each run, [bicubic LQ | degraded LQ | HQ] side by side.
    print("hey")
    img = util.imread_uint('utils/test.png', 3)
    img = img[:448, :448]
    h = img.shape[0] // 4
    print("resizing to", h)
    sf = 4
    deg_fn = partial(degradation_bsrgan_variant, sf=sf)
    for i in range(20):
        print(i)
        # the degradation takes the image as loaded and returns {"image": ...}
        img_lq = deg_fn(img)["image"]
        img_hq = util.uint2single(img)
        img_lq = util.uint2single(img_lq)
        print(img_lq)
        bicubic_down = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)
        img_lq_bicubic = bicubic_down(image=img_hq)["image"]
        print(img_lq.shape)
        print("bicubic", img_lq_bicubic.shape)
        print(img_hq.shape)
        # interpolation=0 is nearest-neighbour, so the upscaled previews show the raw LQ pixels
        preview_size = (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0]))
        lq_nearest = cv2.resize(util.single2uint(img_lq), preview_size, interpolation=0)
        lq_bicubic_nearest = cv2.resize(util.single2uint(img_lq_bicubic), preview_size, interpolation=0)
        img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1)
        util.imsave(img_concat, str(i) + '.png')
ldmlib/modules/image_degradation/utils/test.png ADDED
ldmlib/modules/image_degradation/utils_image.py ADDED
@@ -0,0 +1,916 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import math
3
+ import random
4
+ import numpy as np
5
+ import torch
6
+ import cv2
7
+ from torchvision.utils import make_grid
8
+ from datetime import datetime
9
+ #import matplotlib.pyplot as plt # TODO: check with Dominik, also bsrgan.py vs bsrgan_light.py
10
+
11
+
12
+ os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
13
+
14
+
15
+ '''
16
+ # --------------------------------------------
17
+ # Kai Zhang (github: https://github.com/cszn)
18
+ # 03/Mar/2019
19
+ # --------------------------------------------
20
+ # https://github.com/twhui/SRGAN-pyTorch
21
+ # https://github.com/xinntao/BasicSR
22
+ # --------------------------------------------
23
+ '''
24
+
25
+
26
+ IMG_EXTENSIONS = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP', '.tif']
27
+
28
+
29
def is_image_file(filename):
    """Return True when ``filename`` ends with one of the IMG_EXTENSIONS suffixes."""
    for suffix in IMG_EXTENSIONS:
        if filename.endswith(suffix):
            return True
    return False
31
+
32
+
33
def get_timestamp():
    """Return the current local time formatted as ``yymmdd-HHMMSS``."""
    now = datetime.now()
    return now.strftime('%y%m%d-%H%M%S')
35
+
36
+
37
def imshow(x, title=None, cbar=False, figsize=None):
    """Show ``x`` in a new matplotlib figure using a gray colormap.

    NOTE(review): ``plt`` is not bound in this module -- the
    ``import matplotlib.pyplot as plt`` line at the top of the file is
    commented out -- so calling this function as-is raises NameError.

    Args:
        x: image data; singleton dimensions are removed with ``np.squeeze``.
        title: optional figure title (skipped when falsy).
        cbar: when true, add a colorbar.
        figsize: forwarded to ``plt.figure``.
    """
    plt.figure(figsize=figsize)
    plt.imshow(np.squeeze(x), interpolation='nearest', cmap='gray')
    if title:
        plt.title(title)
    if cbar:
        plt.colorbar()
    plt.show()
45
+
46
+
47
def surf(Z, cmap='rainbow', figsize=None):
    """Draw ``Z`` as a 3-D surface in a new matplotlib figure.

    NOTE(review): ``plt`` is not bound in this module (the matplotlib import
    at the top of the file is commented out), so this raises NameError as-is.
    NOTE(review): ``np.meshgrid(xx, yy)`` yields arrays of shape (h, w) while
    ``Z`` is (w, h); this looks like it only lines up for square ``Z`` -- confirm.

    Args:
        Z: 2-D array of surface heights.
        cmap: colormap name forwarded to ``plot_surface``.
        figsize: forwarded to ``plt.figure``.
    """
    plt.figure(figsize=figsize)
    ax3 = plt.axes(projection='3d')

    w, h = Z.shape[:2]
    xx = np.arange(0,w,1)
    yy = np.arange(0,h,1)
    X, Y = np.meshgrid(xx, yy)
    ax3.plot_surface(X,Y,Z,cmap=cmap)
    #ax3.contour(X,Y,Z, zdim='z',offset=-2,cmap=cmap)
    plt.show()
58
+
59
+
60
+ '''
61
+ # --------------------------------------------
62
+ # get image paths
63
+ # --------------------------------------------
64
+ '''
65
+
66
+
67
def get_image_paths(dataroot):
    """Return the sorted image paths found under ``dataroot``.

    Returns None when ``dataroot`` is None.
    """
    if dataroot is None:
        return None
    return sorted(_get_paths_from_images(dataroot))
72
+
73
+
74
+ def _get_paths_from_images(path):
75
+ assert os.path.isdir(path), '{:s} is not a valid directory'.format(path)
76
+ images = []
77
+ for dirpath, _, fnames in sorted(os.walk(path)):
78
+ for fname in sorted(fnames):
79
+ if is_image_file(fname):
80
+ img_path = os.path.join(dirpath, fname)
81
+ images.append(img_path)
82
+ assert images, '{:s} has no valid image file'.format(path)
83
+ return images
84
+
85
+
86
+ '''
87
+ # --------------------------------------------
88
+ # split large images into small images
89
+ # --------------------------------------------
90
+ '''
91
+
92
+
93
def patches_from_image(img, p_size=512, p_overlap=64, p_max=800):
    """Split a large image into overlapping square patches.

    Args:
        img: numpy image; only its first two dimensions are tiled, so both
            2-D and 3-D (with channels last) arrays are accepted.
        p_size: side length of every patch.
        p_overlap: overlap between neighbouring patches.
        p_max: the image is split only when both of its first two dimensions
            exceed this value; otherwise it is returned unchanged.

    Returns:
        A list of patches (views into ``img``), or ``[img]`` when the image
        is not large enough to be split.
    """
    w, h = img.shape[:2]
    if not (w > p_max and h > p_max):
        return [img]

    step = p_size - p_overlap
    # The builtin ``int`` replaces the ``np.int`` alias removed in NumPy 1.24.
    w1 = list(np.arange(0, w - p_size, step, dtype=int))
    h1 = list(np.arange(0, h - p_size, step, dtype=int))
    # Always add a final patch flush with the far edge of the image.
    w1.append(w - p_size)
    h1.append(h - p_size)
    return [img[i:i + p_size, j:j + p_size] for i in w1 for j in h1]
110
+
111
+
112
def imssave(imgs, img_path):
    """
    Write every image of ``imgs`` next to ``img_path`` as ``<name>_sNNNN.png``.

    imgs: list, N images of size WxHxC
    """
    stem = os.path.splitext(os.path.basename(img_path))[0]
    out_dir = os.path.dirname(img_path)

    for idx, patch in enumerate(imgs):
        if patch.ndim == 3:
            # reverse the first three channels before handing the array to cv2
            patch = patch[:, :, [2, 1, 0]]
        suffix = str('_s{:04d}'.format(idx))
        cv2.imwrite(os.path.join(out_dir, stem + suffix + '.png'), patch)
123
+
124
+
125
def split_imageset(original_dataroot, taget_dataroot, n_channels=3, p_size=800, p_overlap=96, p_max=1000):
    """
    Cut the large images under original_dataroot into overlapping
    (p_size)x(p_size) patches and write them into taget_dataroot. Only images
    larger than (p_max)x(p_max) are split; smaller ones are saved as one patch.

    Args:
        original_dataroot: directory that is searched for images.
        taget_dataroot: directory that receives the patches.
        n_channels: channel count passed to imread_uint.
        p_size: size of small images
        p_overlap: patch size in training is a good choice
        p_max: images with smaller size than (p_max)x(p_max) keep unchanged.
    """
    for src_path in get_image_paths(original_dataroot):
        image = imread_uint(src_path, n_channels=n_channels)
        pieces = patches_from_image(image, p_size, p_overlap, p_max)
        dst_path = os.path.join(taget_dataroot, os.path.basename(src_path))
        imssave(pieces, dst_path)
145
+
146
+ '''
147
+ # --------------------------------------------
148
+ # makedir
149
+ # --------------------------------------------
150
+ '''
151
+
152
+
153
def mkdir(path):
    """Create the directory ``path`` (and missing parents) if nothing exists there.

    ``exist_ok=True`` keeps the call from failing when another process or
    worker creates the same directory between the existence check and the
    ``makedirs`` call.
    """
    if not os.path.exists(path):
        os.makedirs(path, exist_ok=True)
156
+
157
+
158
def mkdirs(paths):
    """Create one directory (when ``paths`` is a str) or every directory in ``paths``."""
    targets = [paths] if isinstance(paths, str) else paths
    for target in targets:
        mkdir(target)
164
+
165
+
166
def mkdir_and_rename(path):
    """Create a fresh directory at ``path``.

    Anything already present at ``path`` is first renamed to
    ``<path>_archived_<timestamp>``.
    """
    already_there = os.path.exists(path)
    if already_there:
        archived = path + '_archived_' + get_timestamp()
        print('Path already exists. Rename it to [{:s}]'.format(archived))
        os.rename(path, archived)
    os.makedirs(path)
172
+
173
+
174
+ '''
175
+ # --------------------------------------------
176
+ # read image from path
177
+ # opencv is fast, but read BGR numpy image
178
+ # --------------------------------------------
179
+ '''
180
+
181
+
182
+ # --------------------------------------------
183
+ # get uint8 image of size HxWxn_channels (RGB)
184
+ # --------------------------------------------
185
def imread_uint(path, n_channels=3):
    """Read an image with OpenCV.

    Args:
        path: image file path.
        n_channels: 1 for a grayscale HxWx1 result, 3 for an HxWx3 result
            (RGB, or the gray channel repeated three times).

    Returns:
        The image as a numpy array.

    Raises:
        ValueError: ``n_channels`` is neither 1 nor 3 (previously this fell
            through to an UnboundLocalError).
        OSError: OpenCV could not read the file (``cv2.imread`` returned None).
    """
    if n_channels not in (1, 3):
        raise ValueError('n_channels must be 1 or 3, got {}'.format(n_channels))

    if n_channels == 1:
        img = cv2.imread(path, 0)  # cv2.IMREAD_GRAYSCALE
    else:
        img = cv2.imread(path, cv2.IMREAD_UNCHANGED)  # BGR or G
    if img is None:
        raise OSError('Failed to read image: {}'.format(path))

    if n_channels == 1:
        return np.expand_dims(img, axis=2)  # HxWx1
    if img.ndim == 2:
        return cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)  # GGG
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # RGB
198
+
199
+
200
+ # --------------------------------------------
201
+ # matlab's imwrite
202
+ # --------------------------------------------
203
def imsave(img, img_path):
    """Write ``img`` to ``img_path`` with OpenCV.

    Singleton dimensions are squeezed away first; a 3-D result has its first
    three channels reversed before it is written.
    """
    arr = np.squeeze(img)
    is_color = arr.ndim == 3
    if is_color:
        arr = arr[:, :, [2, 1, 0]]
    cv2.imwrite(img_path, arr)
208
+
209
def imwrite(img, img_path):
    """Write ``img`` to ``img_path`` with OpenCV (same steps as ``imsave``).

    Singleton dimensions are squeezed away first; a 3-D result has its first
    three channels reversed before it is written.
    """
    squeezed = np.squeeze(img)
    out = squeezed[:, :, [2, 1, 0]] if squeezed.ndim == 3 else squeezed
    cv2.imwrite(img_path, out)
214
+
215
+
216
+
217
+ # --------------------------------------------
218
+ # get single image of size HxWxn_channels (BGR)
219
+ # --------------------------------------------
220
def read_img(path):
    """Read an image with OpenCV and scale it by 1/255.

    Returns:
        Numpy float32, HWC, BGR, [0,1]; a 2-D image gets a trailing channel
        axis and only the first three channels of wider images are kept.

    Raises:
        OSError: OpenCV could not read the file (``cv2.imread`` returned
            None, which previously surfaced as an AttributeError).
    """
    img = cv2.imread(path, cv2.IMREAD_UNCHANGED)  # cv2.IMREAD_GRAYSCALE
    if img is None:
        raise OSError('Failed to read image: {}'.format(path))
    img = img.astype(np.float32) / 255.
    if img.ndim == 2:
        img = np.expand_dims(img, axis=2)
    # some images have 4 channels
    if img.shape[2] > 3:
        img = img[:, :, :3]
    return img
231
+
232
+
233
+ '''
234
+ # --------------------------------------------
235
+ # image format conversion
236
+ # --------------------------------------------
237
+ # numpy(single) <---> numpy(uint)
238
+ # numpy(single) <---> tensor
239
+ # numpy(unit) <---> tensor
240
+ # --------------------------------------------
241
+ '''
242
+
243
+
244
+ # --------------------------------------------
245
+ # numpy(single) [0, 1] <---> numpy(unit)
246
+ # --------------------------------------------
247
+
248
+
249
def uint2single(img):
    """Divide ``img`` by 255 and return the result as float32."""
    scaled = img / 255.
    return np.float32(scaled)
252
+
253
+
254
def single2uint(img):
    """Clip ``img`` to [0, 1], scale by 255, round and return it as uint8."""
    clipped = img.clip(0, 1)
    scaled = (clipped * 255.).round()
    return np.uint8(scaled)
257
+
258
+
259
def uint162single(img):
    """Divide ``img`` by 65535 and return the result as float32."""
    scaled = img / 65535.
    return np.float32(scaled)
262
+
263
+
264
def single2uint16(img):
    """Clip ``img`` to [0, 1], scale by 65535, round and return it as uint16."""
    clipped = img.clip(0, 1)
    scaled = (clipped * 65535.).round()
    return np.uint16(scaled)
267
+
268
+
269
+ # --------------------------------------------
270
+ # numpy(unit) (HxWxC or HxW) <---> tensor
271
+ # --------------------------------------------
272
+
273
+
274
+ # convert uint to 4-dimensional torch tensor
275
def uint2tensor4(img):
    """Convert an HxW or HxWxC array to a 1xCxHxW float tensor divided by 255."""
    if img.ndim == 2:
        img = np.expand_dims(img, axis=2)
    chw = torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1)
    scaled = chw.float().div(255.)
    return scaled.unsqueeze(0)
279
+
280
+
281
+ # convert uint to 3-dimensional torch tensor
282
def uint2tensor3(img):
    """Convert an HxW or HxWxC array to a CxHxW float tensor divided by 255."""
    if img.ndim == 2:
        img = np.expand_dims(img, axis=2)
    chw = torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1)
    return chw.float().div(255.)
286
+
287
+
288
+ # convert 2/3/4-dimensional torch tensor to uint
289
def tensor2uint(img):
    """Convert a 2/3/4-dimensional tensor to a uint8 numpy image.

    The tensor is squeezed, clamped to [0, 1], scaled by 255 and rounded.
    A squeezed 3-D result is transposed from CxHxW to HxWxC.

    The clamp is done out of place: the previous in-place ``clamp_`` wrote
    through ``.data`` and silently modified the caller's float32 tensor.
    """
    img = img.data.squeeze().float().clamp(0, 1).cpu().numpy()
    if img.ndim == 3:
        img = np.transpose(img, (1, 2, 0))
    return np.uint8((img*255.0).round())
294
+
295
+
296
+ # --------------------------------------------
297
+ # numpy(single) (HxWxC) <---> tensor
298
+ # --------------------------------------------
299
+
300
+
301
+ # convert single (HxWxC) to 3-dimensional torch tensor
302
def single2tensor3(img):
    """Convert an HxWxC array to a CxHxW float tensor."""
    hwc = torch.from_numpy(np.ascontiguousarray(img))
    return hwc.permute(2, 0, 1).float()
304
+
305
+
306
+ # convert single (HxWxC) to 4-dimensional torch tensor
307
def single2tensor4(img):
    """Convert an HxWxC array to a 1xCxHxW float tensor."""
    hwc = torch.from_numpy(np.ascontiguousarray(img))
    chw = hwc.permute(2, 0, 1).float()
    return chw.unsqueeze(0)
309
+
310
+
311
+ # convert torch tensor to single
312
def tensor2single(img):
    """Squeeze a tensor and return it as a float numpy array.

    A squeezed 3-D result is transposed from CxHxW to HxWxC.
    """
    arr = img.data.squeeze().float().cpu().numpy()
    return np.transpose(arr, (1, 2, 0)) if arr.ndim == 3 else arr
318
+
319
+ # convert torch tensor to single
320
def tensor2single3(img):
    """Squeeze a tensor and return it as a float numpy array with a channel axis.

    A squeezed 3-D result is transposed from CxHxW to HxWxC; a squeezed 2-D
    result gets a trailing singleton axis (HxWx1).
    """
    arr = img.data.squeeze().float().cpu().numpy()
    if arr.ndim == 3:
        return np.transpose(arr, (1, 2, 0))
    if arr.ndim == 2:
        return np.expand_dims(arr, axis=2)
    return arr
327
+
328
+
329
def single2tensor5(img):
    """Convert a 4-D array to a 5-D float tensor.

    Axes are reordered with permute(2, 0, 1, 3) and a leading singleton axis
    is added.
    """
    source = torch.from_numpy(np.ascontiguousarray(img))
    reordered = source.permute(2, 0, 1, 3).float()
    return reordered.unsqueeze(0)
331
+
332
+
333
def single32tensor5(img):
    """Convert an array to a float tensor with two leading singleton axes."""
    source = torch.from_numpy(np.ascontiguousarray(img)).float()
    return source.unsqueeze(0).unsqueeze(0)
335
+
336
+
337
def single42tensor4(img):
    """Convert a 4-D array to a float tensor with axes reordered by permute(2, 0, 1, 3)."""
    source = torch.from_numpy(np.ascontiguousarray(img))
    return source.permute(2, 0, 1, 3).float()
339
+
340
+
341
+ # from skimage.io import imread, imsave
342
def tensor2img(tensor, out_type=np.uint8, min_max=(0, 1)):
    '''
    Converts a torch Tensor into an image Numpy array of BGR channel order
    Input: 4D(B,(3/1),H,W), 3D(C,H,W), or 2D(H,W), any range, RGB channel order
    Output: 3D(H,W,C) or 2D(H,W), [0,255], np.uint8 (default)

    The tensor is squeezed, clamped to ``min_max`` and rescaled to [0, 1].
    The dimensionality checks below apply to the *squeezed* tensor. A 4D
    batch is tiled into one grid with ``make_grid``.

    The clamp is done out of place: the previous in-place ``clamp_`` modified
    the caller's tensor whenever it already was a float32 CPU tensor.

    Raises:
        TypeError: the squeezed tensor is not 2D, 3D or 4D.
    '''
    tensor = tensor.detach().squeeze().float().cpu().clamp(*min_max)  # squeeze first, then clamp
    tensor = (tensor - min_max[0]) / (min_max[1] - min_max[0])  # to range [0,1]
    n_dim = tensor.dim()
    if n_dim == 4:
        n_img = len(tensor)
        img_np = make_grid(tensor, nrow=int(math.sqrt(n_img)), normalize=False).numpy()
        img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0))  # HWC, BGR
    elif n_dim == 3:
        img_np = tensor.numpy()
        img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0))  # HWC, BGR
    elif n_dim == 2:
        img_np = tensor.numpy()
    else:
        raise TypeError(
            'Only support 4D, 3D and 2D tensor. But received with dimension: {:d}'.format(n_dim))
    if out_type == np.uint8:
        img_np = (img_np * 255.0).round()
        # Important. Unlike matlab, numpy.uint8() WILL NOT round by default.
    return img_np.astype(out_type)
367
+
368
+
369
+ '''
370
+ # --------------------------------------------
371
+ # Augmentation, flip and/or rotate
372
+ # --------------------------------------------
373
+ # The following two are enough.
374
+ # (1) augment_img: numpy image of WxHxC or WxH
375
+ # (2) augment_img_tensor4: tensor image 1xCxWxH
376
+ # --------------------------------------------
377
+ '''
378
+
379
+
380
def augment_img(img, mode=0):
    '''Kai Zhang (github: https://github.com/cszn)

    Apply one of eight rotate/flip combinations to a numpy image.
    Mode 0 returns ``img`` itself; modes 1-7 combine ``np.rot90`` with an
    optional ``np.flipud``; any other mode returns None.
    '''
    if mode == 0:
        return img
    # (number of quarter turns, flip up-down afterwards) for modes 1..7
    recipes = ((1, True), (0, True), (3, False), (2, True), (1, False), (2, False), (3, True))
    for code, (turns, flip) in enumerate(recipes, start=1):
        if mode == code:
            rotated = np.rot90(img, k=turns) if turns else img
            return np.flipud(rotated) if flip else rotated
    return None
399
+
400
+
401
def augment_img_tensor4(img, mode=0):
    '''Kai Zhang (github: https://github.com/cszn)

    Apply one of eight rotate/flip combinations over dims 2 and 3 of a tensor.
    Mode 0 returns ``img`` itself; modes 1-7 combine ``rot90`` over [2, 3]
    with an optional ``flip([2])``; any other mode returns None.
    '''
    if mode == 0:
        return img
    # (number of quarter turns, flip dim 2 afterwards) for modes 1..7
    recipes = ((1, True), (0, True), (3, False), (2, True), (1, False), (2, False), (3, True))
    for code, (turns, flip) in enumerate(recipes, start=1):
        if mode == code:
            rotated = img.rot90(turns, [2, 3]) if turns else img
            return rotated.flip([2]) if flip else rotated
    return None
420
+
421
+
422
def augment_img_tensor(img, mode=0):
    '''Kai Zhang (github: https://github.com/cszn)

    Augment a tensor by round-tripping through numpy and ``augment_img``.
    3-D and 4-D tensors are permuted so the two spatial axes come first before
    augmentation and permuted back afterwards; other ranks are passed as-is.
    The result has the same type as ``img``.
    '''
    # rank -> (axes for numpy transpose, axes for the tensor permute back)
    layouts = {3: ((1, 2, 0), (2, 0, 1)), 4: ((2, 3, 1, 0), (3, 2, 0, 1))}
    layout = layouts.get(len(img.size()))

    arr = img.data.cpu().numpy()
    if layout is not None:
        arr = np.transpose(arr, layout[0])
    arr = augment_img(arr, mode=mode)
    result = torch.from_numpy(np.ascontiguousarray(arr))
    if layout is not None:
        result = result.permute(*layout[1])

    return result.type_as(img)
439
+
440
+
441
def augment_img_np3(img, mode=0):
    """Apply one of eight flip/transpose combinations to a 3-D numpy image.

    Mode 0 returns ``img`` itself; modes 1-7 reverse axis 1 and/or axis 0 and
    optionally swap the first two axes; any other mode returns None.
    """
    if mode == 0:
        return img
    # (reverse axis 1, reverse axis 0, swap the first two axes) for modes 1..7
    recipes = (
        (False, False, True),
        (False, True, False),
        (False, True, True),
        (True, False, False),
        (True, False, True),
        (True, True, False),
        (True, True, True),
    )
    for code, (rev_axis1, rev_axis0, swap) in enumerate(recipes, start=1):
        if mode == code:
            out = img
            if rev_axis1:
                out = out[:, ::-1, :]
            if rev_axis0:
                out = out[::-1, :, :]
            if swap:
                out = out.transpose(1, 0, 2)
            return out
    return None
467
+
468
+
469
def augment_imgs(img_list, hflip=True, rot=True):
    """Apply one shared random flip/transpose choice to every image in ``img_list``.

    Each of the three operations is enabled with probability 0.5: reversing
    axis 1 (only when ``hflip``), reversing axis 0 and swapping the first two
    axes (both only when ``rot``). The random draws happen once per call.
    """
    # Draw order matters for reproducibility with a seeded ``random``.
    do_hflip = hflip and random.random() < 0.5
    do_vflip = rot and random.random() < 0.5
    do_swap = rot and random.random() < 0.5

    augmented = []
    for img in img_list:
        if do_hflip:
            img = img[:, ::-1, :]
        if do_vflip:
            img = img[::-1, :, :]
        if do_swap:
            img = img.transpose(1, 0, 2)
        augmented.append(img)
    return augmented
485
+
486
+
487
+ '''
488
+ # --------------------------------------------
489
+ # modcrop and shave
490
+ # --------------------------------------------
491
+ '''
492
+
493
+
494
def modcrop(img_in, scale):
    """Return a copy of ``img_in`` cropped so H and W are multiples of ``scale``.

    img_in: Numpy, HWC or HW. Raises ValueError for any other dimensionality.
    """
    img = np.copy(img_in)
    if img.ndim not in (2, 3):
        raise ValueError('Wrong img ndim: [{:d}].'.format(img.ndim))
    rows, cols = img.shape[:2]
    keep_rows = rows - rows % scale
    keep_cols = cols - cols % scale
    return img[:keep_rows, :keep_cols]
508
+
509
+
510
def shave(img_in, border=0):
    """Return a copy of ``img_in`` with ``border`` pixels removed from each side.

    img_in: Numpy, HWC or HW
    """
    img = np.copy(img_in)
    rows, cols = img.shape[:2]
    row_span = slice(border, rows - border)
    col_span = slice(border, cols - border)
    return img[row_span, col_span]
516
+
517
+
518
+ '''
519
+ # --------------------------------------------
520
+ # image processing process on numpy image
521
+ # channel_convert(in_c, tar_type, img_list):
522
+ # rgb2ycbcr(img, only_y=True):
523
+ # bgr2ycbcr(img, only_y=True):
524
+ # ycbcr2rgb(img):
525
+ # --------------------------------------------
526
+ '''
527
+
528
+
529
def rgb2ycbcr(img, only_y=True):
    '''same as matlab rgb2ycbcr
    only_y: only return Y channel
    Input:
        uint8, [0, 255]
        float, [0, 1]
    Output:
        array of the input dtype; uint8 results are rounded, float results
        are scaled back to [0, 1].

    The input array is left untouched: the previous ``img *= 255.`` scaled a
    float input in place, so the caller's image was multiplied by 255 after
    every call (and the preceding ``img.astype`` result was discarded).
    '''
    in_img_type = img.dtype
    # Scale into a new array instead of mutating the caller's data.
    scaled = img if in_img_type == np.uint8 else img * 255.
    # convert
    if only_y:
        rlt = np.dot(scaled, [65.481, 128.553, 24.966]) / 255.0 + 16.0
    else:
        rlt = np.matmul(scaled, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786],
                                 [24.966, 112.0, -18.214]]) / 255.0 + [16, 128, 128]
    if in_img_type == np.uint8:
        rlt = rlt.round()
    else:
        rlt /= 255.
    return rlt.astype(in_img_type)
551
+
552
+
553
def ycbcr2rgb(img):
    '''same as matlab ycbcr2rgb
    Input:
        uint8, [0, 255]
        float, [0, 1]
    Output:
        array of the input dtype; uint8 results are rounded and saturated to
        [0, 255], float results are scaled back to [0, 1].

    The input array is left untouched: the previous ``img *= 255.`` scaled a
    float input in place (and the preceding ``img.astype`` result was
    discarded). uint8 results are clipped before the cast so out-of-gamut
    values saturate instead of wrapping around.
    '''
    in_img_type = img.dtype
    # Scale into a new array instead of mutating the caller's data.
    scaled = img if in_img_type == np.uint8 else img * 255.
    # convert
    rlt = np.matmul(scaled, [[0.00456621, 0.00456621, 0.00456621], [0, -0.00153632, 0.00791071],
                             [0.00625893, -0.00318811, 0]]) * 255.0 + [-222.921, 135.576, -276.836]
    if in_img_type == np.uint8:
        rlt = np.clip(rlt.round(), 0, 255)
    else:
        rlt /= 255.
    return rlt.astype(in_img_type)
571
+
572
+
573
def bgr2ycbcr(img, only_y=True):
    '''bgr version of rgb2ycbcr
    only_y: only return Y channel
    Input:
        uint8, [0, 255]
        float, [0, 1]
    Output:
        array of the input dtype; uint8 results are rounded, float results
        are scaled back to [0, 1].

    The input array is left untouched: the previous ``img *= 255.`` scaled a
    float input in place, so the caller's image was multiplied by 255 after
    every call (and the preceding ``img.astype`` result was discarded).
    '''
    in_img_type = img.dtype
    # Scale into a new array instead of mutating the caller's data.
    scaled = img if in_img_type == np.uint8 else img * 255.
    # convert
    if only_y:
        rlt = np.dot(scaled, [24.966, 128.553, 65.481]) / 255.0 + 16.0
    else:
        rlt = np.matmul(scaled, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786],
                                 [65.481, -37.797, 112.0]]) / 255.0 + [16, 128, 128]
    if in_img_type == np.uint8:
        rlt = rlt.round()
    else:
        rlt /= 255.
    return rlt.astype(in_img_type)
595
+
596
+
597
def channel_convert(in_c, tar_type, img_list):
    """Convert a list of images among BGR, gray and y.

    Supported combinations: 3 channels -> 'gray', 3 channels -> 'y', and
    1 channel -> 'RGB' (done with cv2.COLOR_GRAY2BGR). Any other combination
    returns ``img_list`` unchanged.
    """
    if in_c == 3 and tar_type == 'gray':  # BGR to gray
        converted = [cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) for img in img_list]
        return [np.expand_dims(img, axis=2) for img in converted]
    if in_c == 3 and tar_type == 'y':  # BGR to y
        converted = [bgr2ycbcr(img, only_y=True) for img in img_list]
        return [np.expand_dims(img, axis=2) for img in converted]
    if in_c == 1 and tar_type == 'RGB':  # gray/y to BGR
        return [cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) for img in img_list]
    return img_list
609
+
610
+
611
+ '''
612
+ # --------------------------------------------
613
+ # metric, PSNR and SSIM
614
+ # --------------------------------------------
615
+ '''
616
+
617
+
618
+ # --------------------------------------------
619
+ # PSNR
620
+ # --------------------------------------------
621
def calculate_psnr(img1, img2, border=0):
    """Return the PSNR in dB between two images with range [0, 255].

    ``border`` pixels are cropped from every side before comparing.
    Returns ``inf`` for identical images; raises ValueError when the shapes
    differ.
    """
    if img1.shape != img2.shape:
        raise ValueError('Input images must have the same dimensions.')
    rows, cols = img1.shape[:2]
    crop = (slice(border, rows - border), slice(border, cols - border))

    first = img1[crop].astype(np.float64)
    second = img2[crop].astype(np.float64)
    mse = np.mean((first - second)**2)
    if mse == 0:
        return float('inf')
    return 20 * math.log10(255.0 / math.sqrt(mse))
637
+
638
+
639
+ # --------------------------------------------
640
+ # SSIM
641
+ # --------------------------------------------
642
def calculate_ssim(img1, img2, border=0):
    '''calculate SSIM
    the same outputs as MATLAB's
    img1, img2: [0, 255]

    ``border`` pixels are cropped from every side first. 2-D images and
    HxWx1 images are scored directly; HxWx3 images are scored per channel
    and averaged.

    Raises:
        ValueError: the shapes differ, or the input is neither 2-D nor 3-D
            with 1 or 3 channels (a 3-D input with another channel count
            previously fell through and returned None).
    '''
    if not img1.shape == img2.shape:
        raise ValueError('Input images must have the same dimensions.')
    h, w = img1.shape[:2]
    img1 = img1[border:h-border, border:w-border]
    img2 = img2[border:h-border, border:w-border]

    if img1.ndim == 2:
        return ssim(img1, img2)
    if img1.ndim == 3:
        if img1.shape[2] == 3:
            ssims = [ssim(img1[:, :, i], img2[:, :, i]) for i in range(3)]
            return np.array(ssims).mean()
        if img1.shape[2] == 1:
            return ssim(np.squeeze(img1), np.squeeze(img2))
    raise ValueError('Wrong input image dimensions.')
667
+
668
+
669
def ssim(img1, img2):
    """Return the mean SSIM of two single-channel images.

    Uses an 11x11 Gaussian window (sigma 1.5) and the constants
    C1 = (0.01 * 255)**2 and C2 = (0.03 * 255)**2; a 5-pixel margin of each
    filtered map is dropped so only the valid region is kept.
    """
    C1 = (0.01 * 255)**2
    C2 = (0.03 * 255)**2

    first = img1.astype(np.float64)
    second = img2.astype(np.float64)
    kernel = cv2.getGaussianKernel(11, 1.5)
    window = np.outer(kernel, kernel.transpose())

    def _local_mean(arr):
        # windowed mean, keeping only the valid (unpadded) region
        return cv2.filter2D(arr, -1, window)[5:-5, 5:-5]

    mu1 = _local_mean(first)
    mu2 = _local_mean(second)
    mu1_sq = mu1**2
    mu2_sq = mu2**2
    mu1_mu2 = mu1 * mu2
    sigma1_sq = _local_mean(first**2) - mu1_sq
    sigma2_sq = _local_mean(second**2) - mu2_sq
    sigma12 = _local_mean(first * second) - mu1_mu2

    numerator = (2 * mu1_mu2 + C1) * (2 * sigma12 + C2)
    denominator = (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)
    ssim_map = numerator / denominator
    return ssim_map.mean()
690
+
691
+
692
+ '''
693
+ # --------------------------------------------
694
+ # matlab's bicubic imresize (numpy and torch) [0, 1]
695
+ # --------------------------------------------
696
+ '''
697
+
698
+
699
+ # matlab 'imresize' function, now only support 'bicubic'
700
def cubic(x):
    """Evaluate the piecewise cubic interpolation kernel element-wise on tensor ``x``.

    One polynomial is used where |x| <= 1, another where 1 < |x| <= 2, and
    the result is zero elsewhere.
    """
    absx = torch.abs(x)
    absx2 = absx**2
    absx3 = absx**3
    inner_mask = (absx <= 1).type_as(absx)
    outer_mask = ((absx > 1)*(absx <= 2)).type_as(absx)
    inner_poly = 1.5*absx3 - 2.5*absx2 + 1
    outer_poly = -0.5*absx3 + 2.5*absx2 - 4*absx + 2
    return inner_poly * inner_mask + outer_poly * outer_mask
706
+
707
+
708
def calculate_weights_indices(in_length, out_length, scale, kernel, kernel_width, antialiasing):
    """Compute the per-output-pixel weights and input indices for a 1-D cubic resize.

    Args:
        in_length: number of input samples along the resized dimension.
        out_length: number of output samples along that dimension.
        scale: resize factor.
        kernel: not read by this function; the cubic kernel is always applied.
        kernel_width: support width of the kernel; widened by 1/scale when
            downscaling with antialiasing.
        antialiasing: when true and scale < 1, stretch the kernel so the
            resize also low-pass filters.

    Returns:
        (weights, indices, sym_len_s, sym_len_e): row k of ``weights`` and
        ``indices`` describes output pixel k; ``indices`` are shifted so they
        address an input padded by ``sym_len_s`` samples at the start and
        ``sym_len_e`` samples at the end.
    """
    if (scale < 1) and (antialiasing):
        # Use a modified kernel to simultaneously interpolate and antialias- larger kernel width
        kernel_width = kernel_width / scale

    # Output-space coordinates
    x = torch.linspace(1, out_length, out_length)

    # Input-space coordinates. Calculate the inverse mapping such that 0.5
    # in output space maps to 0.5 in input space, and 0.5+scale in output
    # space maps to 1.5 in input space.
    u = x / scale + 0.5 * (1 - 1 / scale)

    # What is the left-most pixel that can be involved in the computation?
    left = torch.floor(u - kernel_width / 2)

    # What is the maximum number of pixels that can be involved in the
    # computation?  Note: it's OK to use an extra pixel here; if the
    # corresponding weights are all zero, it will be eliminated at the end
    # of this function.
    P = math.ceil(kernel_width) + 2

    # The indices of the input pixels involved in computing the k-th output
    # pixel are in row k of the indices matrix.
    indices = left.view(out_length, 1).expand(out_length, P) + torch.linspace(0, P - 1, P).view(
        1, P).expand(out_length, P)

    # The weights used to compute the k-th output pixel are in row k of the
    # weights matrix.
    distance_to_center = u.view(out_length, 1).expand(out_length, P) - indices
    # apply cubic kernel
    if (scale < 1) and (antialiasing):
        weights = scale * cubic(distance_to_center * scale)
    else:
        weights = cubic(distance_to_center)
    # Normalize the weights matrix so that each row sums to 1.
    weights_sum = torch.sum(weights, 1).view(out_length, 1)
    weights = weights / weights_sum.expand(out_length, P)

    # If a column in weights is all zero, get rid of it. only consider the first and last column.
    # NOTE(review): the checks below drop a column as soon as it contains any
    # zero weight (count != 0), not only when the whole column is zero -- confirm intent.
    weights_zero_tmp = torch.sum((weights == 0), 0)
    if not math.isclose(weights_zero_tmp[0], 0, rel_tol=1e-6):
        indices = indices.narrow(1, 1, P - 2)
        weights = weights.narrow(1, 1, P - 2)
    if not math.isclose(weights_zero_tmp[-1], 0, rel_tol=1e-6):
        indices = indices.narrow(1, 0, P - 2)
        weights = weights.narrow(1, 0, P - 2)
    weights = weights.contiguous()
    indices = indices.contiguous()
    # Padding needed before the first / after the last input sample, then
    # shift the indices so they start at 0 in the padded signal.
    sym_len_s = -indices.min() + 1
    sym_len_e = indices.max() - in_length
    indices = indices + sym_len_s - 1
    return weights, indices, int(sym_len_s), int(sym_len_e)
761
+
762
+
763
+ # --------------------------------------------
764
+ # imresize for tensor image [0, 1]
765
+ # --------------------------------------------
766
def imresize(img, scale, antialiasing=True):
    """Bicubic resize of a tensor image, first along H and then along W.

    Now the scale should be the same for H and W
    input: img: pytorch tensor, CHW or HW [0,1]
    output: CHW or HW [0,1] w/o round

    A 2-D input gets a temporary channel axis via an out-of-place
    ``unsqueeze``: the previous in-place ``unsqueeze_`` turned the caller's
    HW tensor into a 1xHxW tensor as a side effect.
    """
    need_squeeze = img.dim() == 2
    if need_squeeze:
        img = img.unsqueeze(0)
    in_C, in_H, in_W = img.size()
    out_C, out_H, out_W = in_C, math.ceil(in_H * scale), math.ceil(in_W * scale)
    kernel_width = 4
    kernel = 'cubic'

    # Return the desired dimension order for performing the resize.  The
    # strategy is to perform the resize first along the dimension with the
    # smallest scale factor.
    # Now we do not support this.

    # get weights and indices
    weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices(
        in_H, out_H, scale, kernel, kernel_width, antialiasing)
    weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices(
        in_W, out_W, scale, kernel, kernel_width, antialiasing)
    # process H dimension
    # symmetric copying
    img_aug = torch.FloatTensor(in_C, in_H + sym_len_Hs + sym_len_He, in_W)
    img_aug.narrow(1, sym_len_Hs, in_H).copy_(img)

    # mirror the first rows into the leading padding
    sym_patch = img[:, :sym_len_Hs, :]
    inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long()
    sym_patch_inv = sym_patch.index_select(1, inv_idx)
    img_aug.narrow(1, 0, sym_len_Hs).copy_(sym_patch_inv)

    # mirror the last rows into the trailing padding
    sym_patch = img[:, -sym_len_He:, :]
    inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long()
    sym_patch_inv = sym_patch.index_select(1, inv_idx)
    img_aug.narrow(1, sym_len_Hs + in_H, sym_len_He).copy_(sym_patch_inv)

    out_1 = torch.FloatTensor(in_C, out_H, in_W)
    kernel_width = weights_H.size(1)
    for i in range(out_H):
        idx = int(indices_H[i][0])
        for j in range(out_C):
            out_1[j, i, :] = img_aug[j, idx:idx + kernel_width, :].transpose(0, 1).mv(weights_H[i])

    # process W dimension
    # symmetric copying
    out_1_aug = torch.FloatTensor(in_C, out_H, in_W + sym_len_Ws + sym_len_We)
    out_1_aug.narrow(2, sym_len_Ws, in_W).copy_(out_1)

    # mirror the first columns into the leading padding
    sym_patch = out_1[:, :, :sym_len_Ws]
    inv_idx = torch.arange(sym_patch.size(2) - 1, -1, -1).long()
    sym_patch_inv = sym_patch.index_select(2, inv_idx)
    out_1_aug.narrow(2, 0, sym_len_Ws).copy_(sym_patch_inv)

    # mirror the last columns into the trailing padding
    sym_patch = out_1[:, :, -sym_len_We:]
    inv_idx = torch.arange(sym_patch.size(2) - 1, -1, -1).long()
    sym_patch_inv = sym_patch.index_select(2, inv_idx)
    out_1_aug.narrow(2, sym_len_Ws + in_W, sym_len_We).copy_(sym_patch_inv)

    out_2 = torch.FloatTensor(in_C, out_H, out_W)
    kernel_width = weights_W.size(1)
    for i in range(out_W):
        idx = int(indices_W[i][0])
        for j in range(out_C):
            out_2[j, :, i] = out_1_aug[j, :, idx:idx + kernel_width].mv(weights_W[i])
    if need_squeeze:
        out_2.squeeze_()
    return out_2
834
+
835
+
836
+ # --------------------------------------------
837
+ # imresize for numpy image [0, 1]
838
+ # --------------------------------------------
839
def imresize_np(img, scale, antialiasing=True):
    """Bicubic resize of a numpy image, first along H and then along W.

    The array is wrapped in a torch tensor, resized with the weights from
    ``calculate_weights_indices`` and returned as a numpy array.
    """
    # Now the scale should be the same for H and W
    # input: img: Numpy, HWC or HW [0,1]
    # output: HWC or HW [0,1] w/o round
    img = torch.from_numpy(img)
    need_squeeze = True if img.dim() == 2 else False
    if need_squeeze:
        # add a trailing channel axis so the rest of the code can assume HWC
        img.unsqueeze_(2)

    in_H, in_W, in_C = img.size()
    out_C, out_H, out_W = in_C, math.ceil(in_H * scale), math.ceil(in_W * scale)
    kernel_width = 4
    kernel = 'cubic'

    # Return the desired dimension order for performing the resize.  The
    # strategy is to perform the resize first along the dimension with the
    # smallest scale factor.
    # Now we do not support this.

    # get weights and indices
    weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices(
        in_H, out_H, scale, kernel, kernel_width, antialiasing)
    weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices(
        in_W, out_W, scale, kernel, kernel_width, antialiasing)
    # process H dimension
    # symmetric copying
    img_aug = torch.FloatTensor(in_H + sym_len_Hs + sym_len_He, in_W, in_C)
    img_aug.narrow(0, sym_len_Hs, in_H).copy_(img)

    # mirror the first rows into the leading padding
    sym_patch = img[:sym_len_Hs, :, :]
    inv_idx = torch.arange(sym_patch.size(0) - 1, -1, -1).long()
    sym_patch_inv = sym_patch.index_select(0, inv_idx)
    img_aug.narrow(0, 0, sym_len_Hs).copy_(sym_patch_inv)

    # mirror the last rows into the trailing padding
    sym_patch = img[-sym_len_He:, :, :]
    inv_idx = torch.arange(sym_patch.size(0) - 1, -1, -1).long()
    sym_patch_inv = sym_patch.index_select(0, inv_idx)
    img_aug.narrow(0, sym_len_Hs + in_H, sym_len_He).copy_(sym_patch_inv)

    out_1 = torch.FloatTensor(out_H, in_W, in_C)
    kernel_width = weights_H.size(1)
    # each output row is a weighted sum of kernel_width padded input rows
    for i in range(out_H):
        idx = int(indices_H[i][0])
        for j in range(out_C):
            out_1[i, :, j] = img_aug[idx:idx + kernel_width, :, j].transpose(0, 1).mv(weights_H[i])

    # process W dimension
    # symmetric copying
    out_1_aug = torch.FloatTensor(out_H, in_W + sym_len_Ws + sym_len_We, in_C)
    out_1_aug.narrow(1, sym_len_Ws, in_W).copy_(out_1)

    # mirror the first columns into the leading padding
    sym_patch = out_1[:, :sym_len_Ws, :]
    inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long()
    sym_patch_inv = sym_patch.index_select(1, inv_idx)
    out_1_aug.narrow(1, 0, sym_len_Ws).copy_(sym_patch_inv)

    # mirror the last columns into the trailing padding
    sym_patch = out_1[:, -sym_len_We:, :]
    inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long()
    sym_patch_inv = sym_patch.index_select(1, inv_idx)
    out_1_aug.narrow(1, sym_len_Ws + in_W, sym_len_We).copy_(sym_patch_inv)

    out_2 = torch.FloatTensor(out_H, out_W, in_C)
    kernel_width = weights_W.size(1)
    # each output column is a weighted sum of kernel_width padded columns
    for i in range(out_W):
        idx = int(indices_W[i][0])
        for j in range(out_C):
            out_2[:, i, j] = out_1_aug[:, idx:idx + kernel_width, j].mv(weights_W[i])
    if need_squeeze:
        out_2.squeeze_()

    return out_2.numpy()
910
+
911
+
912
# Script entry point: only prints a separator; the lines below are a disabled usage example.
if __name__ == '__main__':
    print('---')
#    img = imread_uint('test.bmp', 3)
#    img = uint2single(img)
#    img_bicubic = imresize_np(img, 1/4)
ldmlib/modules/losses/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from ldmlib.modules.losses.contperceptual import LPIPSWithDiscriminator
ldmlib/modules/losses/contperceptual.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from taming.modules.losses.vqperceptual import * # TODO: taming dependency yes/no?
5
+
6
+
7
class LPIPSWithDiscriminator(nn.Module):
    """Autoencoder loss: L1 + optional LPIPS reconstruction, KL and adversarial terms.

    ``forward`` is called once per optimizer: ``optimizer_idx == 0`` yields the
    generator (autoencoder) loss, ``optimizer_idx == 1`` the discriminator loss.
    Any other index falls through and returns ``None``.
    """

    def __init__(self, disc_start, logvar_init=0.0, kl_weight=1.0, pixelloss_weight=1.0,
                 disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
                 perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
                 disc_loss="hinge"):
        super().__init__()
        assert disc_loss in ["hinge", "vanilla"]

        # Reconstruction / regularisation weights.
        self.kl_weight = kl_weight
        self.pixel_weight = pixelloss_weight
        self.perceptual_loss = LPIPS().eval()
        self.perceptual_weight = perceptual_weight
        # Learnable scalar log-variance of the reconstruction likelihood.
        self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init)

        # Adversarial part.
        discriminator = NLayerDiscriminator(
            input_nc=disc_in_channels,
            n_layers=disc_num_layers,
            use_actnorm=use_actnorm,
        )
        self.discriminator = discriminator.apply(weights_init)
        self.discriminator_iter_start = disc_start
        if disc_loss == "hinge":
            self.disc_loss = hinge_d_loss
        else:
            self.disc_loss = vanilla_d_loss
        self.disc_factor = disc_factor
        self.discriminator_weight = disc_weight
        self.disc_conditional = disc_conditional

    def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
        """Balance the generator loss against the NLL via gradient norms at one layer.

        Uses ``last_layer`` when given, otherwise ``self.last_layer[0]``.
        The ratio is clamped to [0, 1e4], detached, and scaled by
        ``self.discriminator_weight``.
        """
        target = last_layer if last_layer is not None else self.last_layer[0]
        nll_grads = torch.autograd.grad(nll_loss, target, retain_graph=True)[0]
        g_grads = torch.autograd.grad(g_loss, target, retain_graph=True)[0]

        ratio = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
        ratio = torch.clamp(ratio, 0.0, 1e4).detach()
        return ratio * self.discriminator_weight

    def forward(self, inputs, reconstructions, posteriors, optimizer_idx,
                global_step, last_layer=None, cond=None, split="train",
                weights=None):
        """Return ``(loss, log_dict)`` for the optimizer selected by ``optimizer_idx``."""
        real = inputs.contiguous()
        fake = reconstructions.contiguous()

        rec_loss = torch.abs(real - fake)
        if self.perceptual_weight > 0:
            p_loss = self.perceptual_loss(real, fake)
            rec_loss = rec_loss + self.perceptual_weight * p_loss

        nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar
        weighted_nll_loss = nll_loss if weights is None else weights * nll_loss
        # Sums are normalised by the size of the leading dimension.
        weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
        nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
        kl_loss = posteriors.kl()
        kl_loss = torch.sum(kl_loss) / kl_loss.shape[0]

        # now the GAN part
        if optimizer_idx == 0:
            # generator update
            if cond is None:
                assert not self.disc_conditional
                disc_input = fake
            else:
                assert self.disc_conditional
                disc_input = torch.cat((fake, cond), dim=1)
            logits_fake = self.discriminator(disc_input)
            g_loss = -torch.mean(logits_fake)

            if self.disc_factor > 0.0:
                try:
                    d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
                except RuntimeError:
                    # A RuntimeError from autograd.grad is only tolerated outside training.
                    assert not self.training
                    d_weight = torch.tensor(0.0)
            else:
                d_weight = torch.tensor(0.0)

            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
            loss = weighted_nll_loss + self.kl_weight * kl_loss + d_weight * disc_factor * g_loss

            log = {
                f"{split}/total_loss": loss.clone().detach().mean(),
                f"{split}/logvar": self.logvar.detach(),
                f"{split}/kl_loss": kl_loss.detach().mean(),
                f"{split}/nll_loss": nll_loss.detach().mean(),
                f"{split}/rec_loss": rec_loss.detach().mean(),
                f"{split}/d_weight": d_weight.detach(),
                f"{split}/disc_factor": torch.tensor(disc_factor),
                f"{split}/g_loss": g_loss.detach().mean(),
            }
            return loss, log

        if optimizer_idx == 1:
            # second pass for discriminator update
            real_in = real.detach()
            fake_in = fake.detach()
            if cond is not None:
                real_in = torch.cat((real_in, cond), dim=1)
                fake_in = torch.cat((fake_in, cond), dim=1)
            logits_real = self.discriminator(real_in)
            logits_fake = self.discriminator(fake_in)

            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
            d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)

            log = {
                f"{split}/disc_loss": d_loss.clone().detach().mean(),
                f"{split}/logits_real": logits_real.detach().mean(),
                f"{split}/logits_fake": logits_fake.detach().mean(),
            }
            return d_loss, log
ldmlib/modules/losses/vqperceptual.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+ from einops import repeat
5
+
6
+ from taming.modules.discriminator.model import NLayerDiscriminator, weights_init
7
+ from taming.modules.losses.lpips import LPIPS
8
+ from taming.modules.losses.vqperceptual import hinge_d_loss, vanilla_d_loss
9
+
10
+
11
def hinge_d_loss_with_exemplar_weights(logits_real, logits_fake, weights):
    """Hinge discriminator loss with a per-sample weight.

    Each sample's hinge term is averaged over dims 1-3, then the batch is
    combined as a weighted mean using ``weights``.
    """
    assert weights.shape[0] == logits_real.shape[0] == logits_fake.shape[0]
    per_sample_real = F.relu(1. - logits_real).mean(dim=[1, 2, 3])
    per_sample_fake = F.relu(1. + logits_fake).mean(dim=[1, 2, 3])
    total_weight = weights.sum()
    real_term = (weights * per_sample_real).sum() / total_weight
    fake_term = (weights * per_sample_fake).sum() / total_weight
    return 0.5 * (real_term + fake_term)
19
+
20
def adopt_weight(weight, global_step, threshold=0, value=0.):
    """Return ``value`` while ``global_step`` is below ``threshold``, else ``weight``."""
    return value if global_step < threshold else weight
24
+
25
+
26
def measure_perplexity(predicted_indices, n_embed):
    """Return ``(perplexity, cluster_use)`` of the predicted codebook indices.

    ``cluster_use`` counts the codes that appear at least once.
    """
    # src: https://github.com/karpathy/deep-vector-quantization/blob/main/model.py
    # eval cluster perplexity. when perplexity == num_embeddings then all clusters are used exactly equally
    one_hot = F.one_hot(predicted_indices, n_embed).float().reshape(-1, n_embed)
    usage = one_hot.mean(0)
    entropy = -(usage * torch.log(usage + 1e-10)).sum()
    return entropy.exp(), torch.sum(usage > 0)
34
+
35
def l1(x, y):
    """Element-wise absolute difference (no reduction)."""
    return (x - y).abs()
37
+
38
+
39
def l2(x, y):
    """Element-wise squared difference (no reduction)."""
    difference = x - y
    return torch.pow(difference, 2)
41
+
42
+
43
class VQLPIPSWithDiscriminator(nn.Module):
    """VQ autoencoder loss: pixel + optional LPIPS reconstruction, codebook and adversarial terms.

    ``forward`` is called once per optimizer: ``optimizer_idx == 0`` yields the
    generator (autoencoder) loss, ``optimizer_idx == 1`` the discriminator loss.
    Any other index falls through and returns ``None``.
    """

    def __init__(self, disc_start, codebook_weight=1.0, pixelloss_weight=1.0,
                 disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
                 perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
                 disc_ndf=64, disc_loss="hinge", n_classes=None, perceptual_loss="lpips",
                 pixel_loss="l1"):
        super().__init__()
        assert disc_loss in ["hinge", "vanilla"]
        assert perceptual_loss in ["lpips", "clips", "dists"]
        assert pixel_loss in ["l1", "l2"]
        self.codebook_weight = codebook_weight
        self.pixel_weight = pixelloss_weight
        # Only "lpips" is implemented; the other accepted names raise below.
        if perceptual_loss == "lpips":
            print(f"{self.__class__.__name__}: Running with LPIPS.")
            self.perceptual_loss = LPIPS().eval()
        else:
            raise ValueError(f"Unknown perceptual loss: >> {perceptual_loss} <<")
        self.perceptual_weight = perceptual_weight

        if pixel_loss == "l1":
            self.pixel_loss = l1
        else:
            self.pixel_loss = l2

        self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels,
                                                 n_layers=disc_num_layers,
                                                 use_actnorm=use_actnorm,
                                                 ndf=disc_ndf
                                                 ).apply(weights_init)
        self.discriminator_iter_start = disc_start
        if disc_loss == "hinge":
            self.disc_loss = hinge_d_loss
        elif disc_loss == "vanilla":
            self.disc_loss = vanilla_d_loss
        else:
            raise ValueError(f"Unknown GAN loss '{disc_loss}'.")
        print(f"VQLPIPSWithDiscriminator running with {disc_loss} loss.")
        self.disc_factor = disc_factor
        self.discriminator_weight = disc_weight
        self.disc_conditional = disc_conditional
        self.n_classes = n_classes

    def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
        """Balance the generator loss against the NLL via gradient norms at one layer.

        Uses ``last_layer`` when given, otherwise ``self.last_layer[0]``.
        The ratio is clamped to [0, 1e4], detached, and scaled by
        ``self.discriminator_weight``.
        """
        if last_layer is not None:
            nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
            g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
        else:
            nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0]
            g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0]

        d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
        d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
        d_weight = d_weight * self.discriminator_weight
        return d_weight

    def forward(self, codebook_loss, inputs, reconstructions, optimizer_idx,
                global_step, last_layer=None, cond=None, split="train", predicted_indices=None):
        """Return ``(loss, log_dict)`` for the optimizer selected by ``optimizer_idx``.

        ``codebook_loss`` may be ``None``, in which case it is treated as zero.
        """
        # The original called an `exists` helper that this module never imports
        # or defines, which raised NameError; test for None directly instead.
        if codebook_loss is None:
            codebook_loss = torch.tensor([0.]).to(inputs.device)
        rec_loss = self.pixel_loss(inputs.contiguous(), reconstructions.contiguous())
        if self.perceptual_weight > 0:
            p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous())
            rec_loss = rec_loss + self.perceptual_weight * p_loss
        else:
            # Placeholder so the log entry below always has a value.
            p_loss = torch.tensor([0.0])

        nll_loss = rec_loss
        nll_loss = torch.mean(nll_loss)

        # now the GAN part
        if optimizer_idx == 0:
            # generator update
            if cond is None:
                assert not self.disc_conditional
                logits_fake = self.discriminator(reconstructions.contiguous())
            else:
                assert self.disc_conditional
                logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1))
            g_loss = -torch.mean(logits_fake)

            try:
                d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
            except RuntimeError:
                # A RuntimeError from autograd.grad is only tolerated outside training.
                assert not self.training
                d_weight = torch.tensor(0.0)

            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
            loss = nll_loss + d_weight * disc_factor * g_loss + self.codebook_weight * codebook_loss.mean()

            log = {"{}/total_loss".format(split): loss.clone().detach().mean(),
                   "{}/quant_loss".format(split): codebook_loss.detach().mean(),
                   "{}/nll_loss".format(split): nll_loss.detach().mean(),
                   "{}/rec_loss".format(split): rec_loss.detach().mean(),
                   "{}/p_loss".format(split): p_loss.detach().mean(),
                   "{}/d_weight".format(split): d_weight.detach(),
                   "{}/disc_factor".format(split): torch.tensor(disc_factor),
                   "{}/g_loss".format(split): g_loss.detach().mean(),
                   }
            if predicted_indices is not None:
                assert self.n_classes is not None
                with torch.no_grad():
                    perplexity, cluster_usage = measure_perplexity(predicted_indices, self.n_classes)
                log[f"{split}/perplexity"] = perplexity
                log[f"{split}/cluster_usage"] = cluster_usage
            return loss, log

        if optimizer_idx == 1:
            # second pass for discriminator update
            if cond is None:
                logits_real = self.discriminator(inputs.contiguous().detach())
                logits_fake = self.discriminator(reconstructions.contiguous().detach())
            else:
                logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1))
                logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1))

            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
            d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)

            log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(),
                   "{}/logits_real".format(split): logits_real.detach().mean(),
                   "{}/logits_fake".format(split): logits_fake.detach().mean()
                   }
            return d_loss, log
ldmlib/modules/x_transformer.py ADDED
@@ -0,0 +1,641 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """shout-out to https://github.com/lucidrains/x-transformers/tree/main/x_transformers"""
2
+ import torch
3
+ from torch import nn, einsum
4
+ import torch.nn.functional as F
5
+ from functools import partial
6
+ from inspect import isfunction
7
+ from collections import namedtuple
8
+ from einops import rearrange, repeat, reduce
9
+
10
+ # constants
11
+
12
# Default per-head width, used when no explicit `dim_head` is supplied.
DEFAULT_DIM_HEAD = 64

# Attention scores captured by one attention block, before and after softmax.
Intermediates = namedtuple('Intermediates', [
    'pre_softmax_attn',
    'post_softmax_attn'
])

# Per-stack record of hidden states and the per-block Intermediates.
# The typename matches the variable name (it was 'Intermediates') so the repr
# is unambiguous and instances can be pickled by qualified name.
LayerIntermediates = namedtuple('LayerIntermediates', [
    'hiddens',
    'attn_intermediates'
])
23
+
24
+
25
class AbsolutePositionalEmbedding(nn.Module):
    """Learned embedding indexed by position along dimension 1 of the input."""

    def __init__(self, dim, max_seq_len):
        super().__init__()
        self.emb = nn.Embedding(max_seq_len, dim)
        self.init_()

    def init_(self):
        """Re-initialise the embedding table from N(0, 0.02^2)."""
        nn.init.normal_(self.emb.weight, std=0.02)

    def forward(self, x):
        """Return embeddings for positions ``0..x.shape[1]-1`` with a leading axis of size 1."""
        positions = torch.arange(x.shape[1], device=x.device)
        return self.emb(positions).unsqueeze(0)
37
+
38
+
39
class FixedPositionalEmbedding(nn.Module):
    """Non-learned sinusoidal position embedding (sines then cosines)."""

    def __init__(self, dim):
        super().__init__()
        exponents = torch.arange(0, dim, 2).float() / dim
        self.register_buffer('inv_freq', 1. / (10000 ** exponents))

    def forward(self, x, seq_dim=1, offset=0):
        """Embed positions ``offset..offset+x.shape[seq_dim]-1``; output has a leading axis of size 1."""
        positions = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq) + offset
        angles = torch.einsum('i , j -> i j', positions, self.inv_freq)
        embedding = torch.cat((angles.sin(), angles.cos()), dim=-1)
        return embedding.unsqueeze(0)
50
+
51
+
52
+ # helpers
53
+
54
def exists(val):
    """Return True unless ``val`` is ``None``."""
    return not (val is None)
56
+
57
+
58
def default(val, d):
    """Return ``val`` unless it is ``None``; otherwise the fallback ``d``.

    A fallback that is a plain Python function is called to produce the value.
    """
    if val is None:
        return d() if isfunction(d) else d
    return val
62
+
63
+
64
def always(val):
    """Build a callable that ignores its arguments and returns ``val``."""
    def constant(*_args, **_kwargs):
        return val
    return constant
68
+
69
+
70
def not_equals(val):
    """Build a one-argument predicate computing ``x != val``."""
    def differs(x):
        return x != val
    return differs
74
+
75
+
76
def equals(val):
    """Build a one-argument predicate computing ``x == val``."""
    def matches(x):
        return x == val
    return matches
80
+
81
+
82
def max_neg_value(tensor):
    """Return the negated largest finite value of ``tensor``'s floating dtype."""
    limits = torch.finfo(tensor.dtype)
    return -limits.max
84
+
85
+
86
+ # keyword argument helpers
87
+
88
def pick_and_pop(keys, d):
    """Remove ``keys`` from ``d`` (in place) and return them as a new dict."""
    popped = [d.pop(key) for key in keys]
    return dict(zip(keys, popped))
91
+
92
+
93
def group_dict_by_key(cond, d):
    """Split ``d`` into ``(matching, rest)`` according to ``cond(key)``."""
    matching, rest = dict(), dict()
    for key, value in d.items():
        bucket = matching if cond(key) else rest
        bucket[key] = value
    return (matching, rest)
100
+
101
+
102
def string_begins_with(prefix, str):
    """Return whether ``str`` starts with ``prefix`` (prefix first, for use with ``partial``)."""
    starts = str.startswith(prefix)
    return starts
104
+
105
+
106
def group_by_key_prefix(prefix, d):
    """Split ``d`` into ``(keys starting with prefix, all other keys)``."""
    has_prefix = partial(string_begins_with, prefix)
    return group_dict_by_key(has_prefix, d)
108
+
109
+
110
def groupby_prefix_and_trim(prefix, d):
    """Split ``d`` by key prefix and strip the prefix from the matching keys.

    Returns ``(prefixed entries with the prefix removed, remaining entries)``.
    """
    prefixed, remaining = group_dict_by_key(partial(string_begins_with, prefix), d)
    cut = len(prefix)
    trimmed = {key[cut:]: value for key, value in prefixed.items()}
    return trimmed, remaining
114
+
115
+
116
+ # classes
117
class Scale(nn.Module):
    """Wrap ``fn`` and multiply its (primary) output by a constant ``value``."""

    def __init__(self, value, fn):
        super().__init__()
        self.value = value
        self.fn = fn

    def forward(self, x, **kwargs):
        """Call ``fn`` and scale its output.

        A bare tensor result is scaled and returned as a tensor. Any other
        result is unpacked as a sequence: the first element is scaled and the
        rest passed through, returned as a tuple.
        """
        out = self.fn(x, **kwargs)
        # Unpacking a bare tensor would split it along dim 0, so handle it first.
        if isinstance(out, torch.Tensor):
            return out * self.value
        head, *rest = out
        return (head * self.value, *rest)
126
+
127
+
128
class Rezero(nn.Module):
    """Wrap ``fn`` and scale its first output by a learned gate initialised to zero."""

    def __init__(self, fn):
        super().__init__()
        self.fn = fn
        self.g = nn.Parameter(torch.zeros(1))

    def forward(self, x, **kwargs):
        """Call ``fn``; return a tuple with the first element gated and the rest unchanged."""
        result = self.fn(x, **kwargs)
        primary, *extras = result
        return (primary * self.g, *extras)
137
+
138
+
139
class ScaleNorm(nn.Module):
    """Divide by the scaled L2 norm of the last axis, times one learned scalar gain."""

    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.scale = dim ** -0.5
        self.eps = eps
        self.g = nn.Parameter(torch.ones(1))

    def forward(self, x):
        scaled_norm = x.norm(dim=-1, keepdim=True) * self.scale
        # Clamp so an all-zero vector does not divide by zero.
        denominator = scaled_norm.clamp(min=self.eps)
        return x / denominator * self.g
149
+
150
+
151
class RMSNorm(nn.Module):
    """Divide by the scaled L2 norm of the last axis, times a learned per-feature gain."""

    def __init__(self, dim, eps=1e-8):
        super().__init__()
        self.scale = dim ** -0.5
        self.eps = eps
        self.g = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        scaled_norm = x.norm(dim=-1, keepdim=True) * self.scale
        # Clamp so an all-zero vector does not divide by zero.
        denominator = scaled_norm.clamp(min=self.eps)
        return x / denominator * self.g
161
+
162
+
163
class Residual(nn.Module):
    """Plain additive skip connection."""

    def forward(self, x, residual):
        combined = x + residual
        return combined
166
+
167
+
168
class GRUGating(nn.Module):
    """Combine a block output with its residual through a GRU cell instead of addition."""

    def __init__(self, dim):
        super().__init__()
        self.gru = nn.GRUCell(dim, dim)

    def forward(self, x, residual):
        # Flatten batch and sequence so each token is gated independently:
        # the block output is the GRU input, the residual its hidden state.
        flat_x = rearrange(x, 'b n d -> (b n) d')
        flat_residual = rearrange(residual, 'b n d -> (b n) d')
        gated = self.gru(flat_x, flat_residual)
        return gated.reshape_as(x)
180
+
181
+
182
+ # feedforward
183
+
184
class GEGLU(nn.Module):
    """GELU-gated linear unit: one projection split into a value half and a gate half."""

    def __init__(self, dim_in, dim_out):
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out * 2)

    def forward(self, x):
        projected = self.proj(x)
        value, gate = projected.chunk(2, dim=-1)
        return value * F.gelu(gate)
192
+
193
+
194
class FeedForward(nn.Module):
    """Position-wise MLP: project in (GELU or GEGLU), dropout, project out.

    The hidden width is ``int(dim * mult)``; ``dim_out`` defaults to ``dim``.
    """

    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
        super().__init__()
        hidden_dim = int(dim * mult)
        if dim_out is None:
            dim_out = dim

        if glu:
            project_in = GEGLU(dim, hidden_dim)
        else:
            project_in = nn.Sequential(
                nn.Linear(dim, hidden_dim),
                nn.GELU()
            )

        self.net = nn.Sequential(
            project_in,
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim_out)
        )

    def forward(self, x):
        return self.net(x)
212
+
213
+
214
+ # attention.
215
class Attention(nn.Module):
    """Multi-head dot-product attention with optional extras.

    Optional features: causal masking, talking-heads projections, explicit
    top-k sparsification, learned memory key/values, and an
    "attention on attention" GLU output projection.
    """

    def __init__(
            self,
            dim,
            dim_head=DEFAULT_DIM_HEAD,
            heads=8,
            causal=False,
            mask=None,
            talking_heads=False,
            sparse_topk=None,
            use_entmax15=False,
            num_mem_kv=0,
            dropout=0.,
            on_attn=False
    ):
        super().__init__()
        if use_entmax15:
            raise NotImplementedError("Check out entmax activation instead of softmax activation!")

        self.scale = dim_head ** -0.5
        self.heads = heads
        self.causal = causal
        self.mask = mask

        inner_dim = dim_head * heads

        # Bias-free projections to queries, keys and values.
        self.to_q = nn.Linear(dim, inner_dim, bias=False)
        self.to_k = nn.Linear(dim, inner_dim, bias=False)
        self.to_v = nn.Linear(dim, inner_dim, bias=False)
        self.dropout = nn.Dropout(dropout)

        # talking heads: mix attention maps across heads before/after softmax
        self.talking_heads = talking_heads
        if talking_heads:
            self.pre_softmax_proj = nn.Parameter(torch.randn(heads, heads))
            self.post_softmax_proj = nn.Parameter(torch.randn(heads, heads))

        # explicit topk sparse attention
        self.sparse_topk = sparse_topk

        # softmax is the only supported attention activation here
        self.attn_fn = F.softmax

        # add memory key / values
        self.num_mem_kv = num_mem_kv
        if num_mem_kv > 0:
            self.mem_k = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head))
            self.mem_v = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head))

        # attention on attention
        self.attn_on_attn = on_attn
        if on_attn:
            self.to_out = nn.Sequential(nn.Linear(inner_dim, dim * 2), nn.GLU())
        else:
            self.to_out = nn.Linear(inner_dim, dim)

    def forward(
            self,
            x,
            context=None,
            mask=None,
            context_mask=None,
            rel_pos=None,
            sinusoidal_emb=None,
            prev_attn=None,
            mem=None
    ):
        """Attend from ``x`` to ``context`` (or to ``x`` itself when no context is given).

        Returns ``(output, Intermediates)`` holding the pre- and post-softmax attention.
        """
        b, n, _ = x.shape
        heads = self.heads
        use_talking_heads = self.talking_heads
        device = x.device

        kv_source = default(context, x)
        q_input = x
        k_input = kv_source
        v_input = kv_source

        if exists(mem):
            k_input = torch.cat((mem, k_input), dim=-2)
            v_input = torch.cat((mem, v_input), dim=-2)

        if exists(sinusoidal_emb):
            # in shortformer, the query would start at a position offset depending on the past cached memory
            offset = k_input.shape[-2] - q_input.shape[-2]
            q_input = q_input + sinusoidal_emb(q_input, offset=offset)
            k_input = k_input + sinusoidal_emb(k_input)

        q = self.to_q(q_input)
        k = self.to_k(k_input)
        v = self.to_v(v_input)

        # Split the feature axis into heads.
        q, k, v = (rearrange(t, 'b n (h d) -> b h n d', h=heads) for t in (q, k, v))

        input_mask = None
        if exists(mask) or exists(context_mask):
            q_mask = default(mask, lambda: torch.ones((b, n), device=device).bool())
            k_mask = context_mask if exists(context) else q_mask
            k_mask = default(k_mask, lambda: torch.ones((b, k.shape[-2]), device=device).bool())
            q_mask = rearrange(q_mask, 'b i -> b () i ()')
            k_mask = rearrange(k_mask, 'b j -> b () () j')
            input_mask = q_mask * k_mask

        if self.num_mem_kv > 0:
            mem_k, mem_v = (repeat(t, 'h n d -> b h n d', b=b) for t in (self.mem_k, self.mem_v))
            k = torch.cat((mem_k, k), dim=-2)
            v = torch.cat((mem_v, v), dim=-2)
            if exists(input_mask):
                # Memory slots are always attendable.
                input_mask = F.pad(input_mask, (self.num_mem_kv, 0), value=True)

        scores = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale
        mask_value = max_neg_value(scores)

        if exists(prev_attn):
            scores = scores + prev_attn

        # NOTE: this aliases `scores`; later in-place fills also affect it
        # unless `scores` is rebound first (talking heads / rel_pos).
        pre_softmax_attn = scores

        if use_talking_heads:
            scores = einsum('b h i j, h k -> b k i j', scores, self.pre_softmax_proj).contiguous()

        if exists(rel_pos):
            scores = rel_pos(scores)

        if exists(input_mask):
            scores.masked_fill_(~input_mask, mask_value)
            del input_mask

        if self.causal:
            i, j = scores.shape[-2:]
            r = torch.arange(i, device=device)
            causal_mask = rearrange(r, 'i -> () () i ()') < rearrange(r, 'j -> () () () j')
            causal_mask = F.pad(causal_mask, (j - i, 0), value=False)
            scores.masked_fill_(causal_mask, mask_value)
            del causal_mask

        if exists(self.sparse_topk) and self.sparse_topk < scores.shape[-1]:
            top, _ = scores.topk(self.sparse_topk, dim=-1)
            # Smallest of the top-k scores per row is the cut-off.
            cutoff = top[..., -1].unsqueeze(-1).expand_as(scores)
            below_cutoff = scores < cutoff
            scores.masked_fill_(below_cutoff, mask_value)
            del below_cutoff

        attn = self.attn_fn(scores, dim=-1)
        post_softmax_attn = attn

        attn = self.dropout(attn)

        if use_talking_heads:
            attn = einsum('b h i j, h k -> b k i j', attn, self.post_softmax_proj).contiguous()

        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')

        intermediates = Intermediates(
            pre_softmax_attn=pre_softmax_attn,
            post_softmax_attn=post_softmax_attn
        )

        return self.to_out(out), intermediates
368
+
369
+
370
class AttentionLayers(nn.Module):
    """Stack of attention ('a'), cross-attention ('c') and feed-forward ('f') layers.

    Keyword arguments prefixed ``ff_`` go to ``FeedForward`` and those
    prefixed ``attn_`` go to ``Attention``, with the prefix stripped.
    """

    def __init__(
            self,
            dim,
            depth,
            heads=8,
            causal=False,
            cross_attend=False,
            only_cross=False,
            use_scalenorm=False,
            use_rmsnorm=False,
            use_rezero=False,
            rel_pos_num_buckets=32,
            rel_pos_max_distance=128,
            position_infused_attn=False,
            custom_layers=None,
            sandwich_coef=None,
            par_ratio=None,
            residual_attn=False,
            cross_residual_attn=False,
            macaron=False,
            pre_norm=True,
            gate_residual=False,
            **kwargs
    ):
        super().__init__()
        ff_kwargs, kwargs = groupby_prefix_and_trim('ff_', kwargs)
        attn_kwargs, _ = groupby_prefix_and_trim('attn_', kwargs)

        self.dim = dim
        self.depth = depth
        self.layers = nn.ModuleList([])

        self.has_pos_emb = position_infused_attn
        self.pia_pos_emb = FixedPositionalEmbedding(dim) if position_infused_attn else None
        self.rotary_pos_emb = always(None)

        assert rel_pos_num_buckets <= rel_pos_max_distance, 'number of relative position buckets must be less than the relative position max distance'
        self.rel_pos = None

        self.pre_norm = pre_norm

        self.residual_attn = residual_attn
        self.cross_residual_attn = cross_residual_attn

        # Rezero replaces normalisation with a learned gate on attention branches.
        if use_rezero:
            norm_fn = nn.Identity
            branch_fn = Rezero
        else:
            if use_rmsnorm:
                norm_class = RMSNorm
            elif use_scalenorm:
                norm_class = ScaleNorm
            else:
                norm_class = nn.LayerNorm
            norm_fn = partial(norm_class, dim)
            branch_fn = None

        # Pattern of layer types that is repeated `depth` times by default.
        if not cross_attend:
            default_block = ('a', 'f')
        elif only_cross:
            default_block = ('c', 'f')
        else:
            default_block = ('a', 'c', 'f')

        if macaron:
            default_block = ('f',) + default_block

        if exists(custom_layers):
            layer_types = custom_layers
        elif exists(par_ratio):
            par_depth = depth * len(default_block)
            assert 1 < par_ratio <= par_depth, 'par ratio out of range'
            default_block = tuple(filter(not_equals('f'), default_block))
            par_attn = par_depth // par_ratio
            depth_cut = par_depth * 2 // 3  # 2 / 3 attention layer cutoff suggested by PAR paper
            par_width = (depth_cut + depth_cut // par_attn) // par_attn
            assert len(default_block) <= par_width, 'default block is too large for par_ratio'
            par_block = default_block + ('f',) * (par_width - len(default_block))
            par_head = par_block * par_attn
            layer_types = par_head + ('f',) * (par_depth - len(par_head))
        elif exists(sandwich_coef):
            assert sandwich_coef > 0 and sandwich_coef <= depth, 'sandwich coefficient should be less than the depth'
            layer_types = ('a',) * sandwich_coef + default_block * (depth - sandwich_coef) + ('f',) * sandwich_coef
        else:
            layer_types = default_block * depth

        self.layer_types = layer_types
        self.num_attn_layers = len(list(filter(equals('a'), layer_types)))

        for kind in self.layer_types:
            if kind == 'a':
                layer = Attention(dim, heads=heads, causal=causal, **attn_kwargs)
            elif kind == 'c':
                layer = Attention(dim, heads=heads, **attn_kwargs)
            elif kind == 'f':
                layer = FeedForward(dim, **ff_kwargs)
                if macaron:
                    layer = Scale(0.5, layer)
            else:
                raise Exception(f'invalid layer type {kind}')

            if isinstance(layer, Attention) and exists(branch_fn):
                layer = branch_fn(layer)

            residual_fn = GRUGating(dim) if gate_residual else Residual()

            self.layers.append(nn.ModuleList([
                norm_fn(),
                layer,
                residual_fn
            ]))

    def forward(
            self,
            x,
            context=None,
            mask=None,
            context_mask=None,
            mems=None,
            return_hiddens=False
    ):
        """Run ``x`` through every layer.

        Returns ``x``, or ``(x, LayerIntermediates)`` when ``return_hiddens`` is set.
        """
        hiddens = []
        intermediates = []
        prev_attn = None
        prev_cross_attn = None

        # One memory entry is consumed per self-attention layer.
        mems = mems.copy() if exists(mems) else [None] * self.num_attn_layers

        last_index = len(self.layers) - 1
        for ind, (layer_type, (norm, block, residual_fn)) in enumerate(zip(self.layer_types, self.layers)):
            is_last = ind == last_index

            if layer_type == 'a':
                hiddens.append(x)
                layer_mem = mems.pop(0)

            residual = x

            if self.pre_norm:
                x = norm(x)

            if layer_type == 'a':
                out, inter = block(x, mask=mask, sinusoidal_emb=self.pia_pos_emb, rel_pos=self.rel_pos,
                                   prev_attn=prev_attn, mem=layer_mem)
            elif layer_type == 'c':
                out, inter = block(x, context=context, mask=mask, context_mask=context_mask, prev_attn=prev_cross_attn)
            elif layer_type == 'f':
                out = block(x)

            x = residual_fn(out, residual)

            if layer_type in ('a', 'c'):
                intermediates.append(inter)

            # Optionally carry pre-softmax scores forward to the next layer of the same kind.
            if layer_type == 'a' and self.residual_attn:
                prev_attn = inter.pre_softmax_attn
            elif layer_type == 'c' and self.cross_residual_attn:
                prev_cross_attn = inter.pre_softmax_attn

            if not self.pre_norm and not is_last:
                x = norm(x)

        if not return_hiddens:
            return x

        intermediates = LayerIntermediates(
            hiddens=hiddens,
            attn_intermediates=intermediates
        )
        return x, intermediates
539
+
540
+
541
class Encoder(AttentionLayers):
    """``AttentionLayers`` stack fixed to non-causal attention."""

    def __init__(self, **kwargs):
        assert not ('causal' in kwargs), 'cannot set causality on encoder'
        super().__init__(causal=False, **kwargs)
545
+
546
+
547
+
548
class TransformerWrapper(nn.Module):
    """Token-level wrapper around an ``AttentionLayers`` stack.

    Adds token and positional embeddings, optional learned memory tokens,
    a final LayerNorm and a projection to vocabulary logits.
    """

    def __init__(
            self,
            *,
            num_tokens,
            max_seq_len,
            attn_layers,
            emb_dim=None,
            max_mem_len=0.,
            emb_dropout=0.,
            num_memory_tokens=None,
            tie_embedding=False,
            use_pos_emb=True
    ):
        super().__init__()
        assert isinstance(attn_layers, AttentionLayers), 'attention layers must be one of Encoder or Decoder'

        dim = attn_layers.dim
        emb_dim = default(emb_dim, dim)

        self.max_seq_len = max_seq_len
        self.max_mem_len = max_mem_len
        self.num_tokens = num_tokens

        self.token_emb = nn.Embedding(num_tokens, emb_dim)
        # Skip the learned positional table when the attention stack already
        # injects positions itself (has_pos_emb); always(0) adds nothing.
        self.pos_emb = AbsolutePositionalEmbedding(emb_dim, max_seq_len) if (
                use_pos_emb and not attn_layers.has_pos_emb) else always(0)
        self.emb_dropout = nn.Dropout(emb_dropout)

        self.project_emb = nn.Linear(emb_dim, dim) if emb_dim != dim else nn.Identity()
        self.attn_layers = attn_layers
        self.norm = nn.LayerNorm(dim)

        self.init_()

        # With tied embeddings the logits reuse the transposed token table.
        self.to_logits = nn.Linear(dim, num_tokens) if not tie_embedding else lambda t: t @ self.token_emb.weight.t()

        # memory tokens (like [cls]) from Memory Transformers paper
        num_memory_tokens = default(num_memory_tokens, 0)
        self.num_memory_tokens = num_memory_tokens
        if num_memory_tokens > 0:
            self.memory_tokens = nn.Parameter(torch.randn(num_memory_tokens, dim))

            # let funnel encoder know number of memory tokens, if specified
            if hasattr(attn_layers, 'num_memory_tokens'):
                attn_layers.num_memory_tokens = num_memory_tokens

    def init_(self):
        """Re-initialise the token embedding table from N(0, 0.02^2)."""
        nn.init.normal_(self.token_emb.weight, std=0.02)

    def forward(
            self,
            x,
            return_embeddings=False,
            mask=None,
            return_mems=False,
            return_attn=False,
            mems=None,
            **kwargs
    ):
        """Embed the token ids in ``x``, run the attention stack, and project.

        Returns logits, or the normalised embeddings when ``return_embeddings``
        is set. With ``return_mems`` the result is ``(out, new_mems)``; with
        ``return_attn`` it is ``(out, attn_maps)``. ``return_mems`` takes
        precedence when both are set.
        """
        b, n, device, num_mem = *x.shape, x.device, self.num_memory_tokens
        x = self.token_emb(x)
        x += self.pos_emb(x)
        x = self.emb_dropout(x)

        x = self.project_emb(x)

        if num_mem > 0:
            mem = repeat(self.memory_tokens, 'n d -> b n d', b=b)
            x = torch.cat((mem, x), dim=1)

            # auto-handle masking after appending memory tokens
            if exists(mask):
                mask = F.pad(mask, (num_mem, 0), value=True)

        x, intermediates = self.attn_layers(x, mask=mask, mems=mems, return_hiddens=True, **kwargs)
        x = self.norm(x)

        # Strip the prepended memory tokens again.
        mem, x = x[:, :num_mem], x[:, num_mem:]

        out = self.to_logits(x) if not return_embeddings else x

        if return_mems:
            hiddens = intermediates.hiddens
            new_mems = list(map(lambda pair: torch.cat(pair, dim=-2), zip(mems, hiddens))) if exists(mems) else hiddens
            # max_mem_len defaults to the float 0., which is not a valid slice
            # bound; coerce to int. A length of 0 keeps the full sequence,
            # because -0 == 0.
            mem_len = int(self.max_mem_len)
            new_mems = list(map(lambda t: t[..., -mem_len:, :].detach(), new_mems))
            return out, new_mems

        if return_attn:
            attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates))
            return out, attn_maps

        return out
641
+
ldmlib/util.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib
2
+
3
+ import torch
4
+ import numpy as np
5
+ from collections import abc
6
+ from einops import rearrange
7
+ from functools import partial
8
+
9
+ import multiprocessing as mp
10
+ from threading import Thread
11
+ from queue import Queue
12
+
13
+ from inspect import isfunction
14
+ from PIL import Image, ImageDraw, ImageFont
15
+
16
+
17
def log_txt_as_img(wh, xc, size=10):
    """Render each caption in ``xc`` as black text on a white RGB image.

    Args:
        wh: tuple of ``(width, height)`` for every rendered image.
        xc: list of caption strings to plot, one image per caption.
        size: font size passed to ``ImageFont.truetype``.

    Returns:
        A ``torch`` tensor stacking all rendered images, channel-first, with
        pixel values rescaled from ``[0, 255]`` to ``[-1, 1]``.

    Raises:
        OSError: if ``data/DejaVuSans.ttf`` cannot be loaded.
        ValueError: if ``xc`` is empty (nothing to stack).
    """
    # Loop-invariant work: load the font and compute the wrap width once
    # instead of once per caption.
    font = ImageFont.truetype('data/DejaVuSans.ttf', size=size)
    # Characters per line scale with the image width (40 chars at 256 px).
    # Clamp to 1 so very narrow images do not produce a zero range() step.
    nc = max(1, int(40 * (wh[0] / 256)))

    txts = []
    for caption in xc:
        txt = Image.new("RGB", wh, color="white")
        draw = ImageDraw.Draw(txt)
        lines = "\n".join(caption[start:start + nc] for start in range(0, len(caption), nc))

        try:
            draw.text((0, 0), lines, fill="black", font=font)
        except UnicodeEncodeError:
            # Best effort: keep the blank image for captions that cannot be drawn.
            print("Cant encode string for logging. Skipping.")

        # HWC uint8 -> CHW float in [-1, 1]
        txts.append(np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0)

    return torch.tensor(np.stack(txts))
39
+
40
+
41
def ismap(x):
    """Return True when ``x`` is a 4-D tensor with more than 3 entries along dim 1."""
    return isinstance(x, torch.Tensor) and x.dim() == 4 and x.shape[1] > 3
45
+
46
+
47
def isimage(x):
    """Return True when ``x`` is a 4-D tensor with exactly 1 or 3 entries along dim 1."""
    if isinstance(x, torch.Tensor) and x.dim() == 4:
        return x.shape[1] in (3, 1)
    return False
51
+
52
+
53
def exists(x):
    """Return True for every value except ``None`` (falsy values still count)."""
    return x is not None
55
+
56
+
57
def default(val, d):
    """Return ``val`` unless it is missing, otherwise fall back to ``d``.

    ``d`` is called to produce the fallback when ``inspect.isfunction`` says
    it is a plain Python function (including lambdas); any other object is
    returned as-is.
    """
    if exists(val):
        return val
    if isfunction(d):
        return d()
    return d
61
+
62
+
63
def mean_flat(tensor):
    """
    Average a tensor over every dimension except the first (batch) one.

    Adapted from
    https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86
    """
    non_batch_dims = list(range(1, tensor.dim()))
    return tensor.mean(dim=non_batch_dims)
69
+
70
+
71
def count_params(model, verbose=False):
    """Return the total number of elements across ``model.parameters()``.

    When ``verbose`` is true, the count is also printed in millions together
    with the model's class name.
    """
    total_params = 0
    for param in model.parameters():
        total_params += param.numel()

    if verbose:
        print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.")
    return total_params
76
+
77
+
78
def instantiate_from_config(config):
    """Build the object described by ``config``.

    ``config["target"]`` is a dotted import path resolved with
    ``get_obj_from_str``; the resolved callable is invoked with
    ``config["params"]`` as keyword arguments (no arguments when absent).

    The two sentinel strings ``'__is_first_stage__'`` and
    ``"__is_unconditional__"`` yield ``None``.

    Raises:
        KeyError: if ``config`` has no ``target`` entry and is not one of
            the sentinel strings.
    """
    if "target" in config:
        factory = get_obj_from_str(config["target"])
        return factory(**config.get("params", dict()))

    # No target: only the two sentinel values are acceptable.
    if config == '__is_first_stage__' or config == "__is_unconditional__":
        return None
    raise KeyError("Expected key `target` to instantiate.")
86
+
87
+
88
def get_obj_from_str(string, reload=False):
    """Resolve a dotted path such as ``"package.module.Name"`` to the named object.

    Everything before the last dot is imported as a module and the part after
    it is looked up as an attribute of that module. With ``reload=True`` the
    module is re-imported via ``importlib.reload`` before the lookup.

    Raises:
        ValueError: if ``string`` contains no dot.
    """
    module_path, attr_name = string.rsplit(".", 1)
    if reload:
        importlib.reload(importlib.import_module(module_path))
    module_obj = importlib.import_module(module_path, package=None)
    return getattr(module_obj, attr_name)
94
+
95
+
96
def _do_parallel_data_prefetch(func, Q, data, idx, idx_to_fn=False):
    """Worker body: run ``func`` on one chunk and report the result through ``Q``.

    Puts ``[idx, result]`` on the queue, followed by the string ``"Done"``.
    When ``idx_to_fn`` is true, ``idx`` is passed to ``func`` as the
    ``worker_id`` keyword argument.
    """
    res = func(data, worker_id=idx) if idx_to_fn else func(data)
    Q.put([idx, res])
    Q.put("Done")


def parallel_data_prefetch(
    func: callable, data, n_proc, target_data_type="ndarray", cpu_intensive=True, use_worker_id=False
):
    """Split ``data`` into chunks, apply ``func`` to each chunk in parallel and merge.

    Args:
        func: callable applied to one chunk; it additionally receives
            ``worker_id=<chunk index>`` when ``use_worker_id`` is true.
        data: an ``np.ndarray`` or any iterable. For a dict only the values
            are used.
        n_proc: requested number of workers. In list mode fewer workers are
            started when the data splits into fewer chunks.
        target_data_type: ``"ndarray"`` concatenates the per-chunk results
            along axis 0, ``"list"`` extends them into one flat list, and any
            other value returns the list of per-chunk results unchanged.
        cpu_intensive: use ``multiprocessing.Process`` when true, otherwise
            ``threading.Thread``.
        use_worker_id: forward the chunk index to ``func`` (see above).

    Returns:
        The merged results, ordered by chunk index.

    Raises:
        ValueError: if ``data`` is an ndarray but ``target_data_type`` is ``"list"``.
        TypeError: if ``data`` is not iterable.
    """
    import time

    # if target_data_type not in ["ndarray", "list"]:
    #     raise ValueError(
    #         "Data, which is passed to parallel_data_prefetch has to be either of type list or ndarray."
    #     )
    if isinstance(data, np.ndarray) and target_data_type == "list":
        raise ValueError("list expected but function got ndarray.")
    elif isinstance(data, abc.Iterable):
        if isinstance(data, dict):
            print(
                'WARNING:"data" argument passed to parallel_data_prefetch is a dict: Using only its values and disregarding keys.'
            )
            data = list(data.values())
        if target_data_type == "ndarray":
            data = np.asarray(data)
        else:
            data = list(data)
    else:
        raise TypeError(
            f"The data, that shall be processed parallel has to be either an np.ndarray or an Iterable, but is actually {type(data)}."
        )

    if cpu_intensive:
        Q = mp.Queue(1000)
        proc = mp.Process
    else:
        Q = Queue(1000)
        proc = Thread

    # Split the data into per-worker chunks.
    if target_data_type == "ndarray":
        chunks = np.array_split(data, n_proc)
    else:
        # Ceiling division; clamped to 1 so empty input yields zero chunks
        # instead of a zero range() step.
        step = max(1, -(-len(data) // n_proc))
        chunks = [data[i: i + step] for i in range(0, len(data), step)]

    arguments = [
        [func, Q, part, i, use_worker_id]
        for i, part in enumerate(chunks)
    ]
    # One worker per chunk. List mode can yield fewer chunks than n_proc
    # (e.g. 4 items over 3 workers -> 2 chunks), so do not index by n_proc.
    processes = [
        proc(target=_do_parallel_data_prefetch, args=worker_args)
        for worker_args in arguments
    ]

    # start processes
    print("Start prefetching...")
    start = time.time()
    gather_res = [[] for _ in processes]
    started = []
    try:
        for p in processes:
            p.start()
            started.append(p)

        finished = 0
        while finished < len(processes):
            # Each worker sends its [idx, result] pair and then "Done".
            res = Q.get()
            if isinstance(res, str) and res == "Done":
                finished += 1
            else:
                gather_res[res[0]] = res[1]

    except Exception as e:
        print("Exception: ", e)
        for p in started:
            # Threads cannot be terminated; only processes expose terminate().
            terminate = getattr(p, "terminate", None)
            if terminate is not None:
                terminate()

        raise
    finally:
        # Only join workers that were actually started; joining an unstarted
        # worker raises.
        for p in started:
            p.join()
        print(f"Prefetching complete. [{time.time() - start} sec.]")

    if target_data_type == 'ndarray':
        # np.asarray is a no-op for results that are already arrays.
        return np.concatenate([np.asarray(r) for r in gather_res], axis=0)
    elif target_data_type == 'list':
        out = []
        for r in gather_res:
            out.extend(r)
        return out
    else:
        return gather_res
modules/app.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import json
4
+ from io import BytesIO
5
+
6
+ from fastapi import FastAPI
7
+ from fastapi.staticfiles import StaticFiles
8
+ from fastapi.responses import FileResponse, StreamingResponse
9
+
10
+ from modules.inference import infer_t5
11
+ from modules.dataset import query_emotion
12
+
13
# Hugging Face API token: taken from the AUTH_TOKEN environment variable,
# falling back to the first line of the local token file.
# https://huggingface.co/settings/tokens
# https://huggingface.co/spaces/{username}/{space}/settings
API_TOKEN = os.getenv("AUTH_TOKEN")
if not API_TOKEN:
    with open('/root/.huggingface/token') as f:
        lines = f.readlines()
        # Strip the trailing newline: the token is interpolated into an HTTP
        # Authorization header, where a line break makes the header invalid.
        API_TOKEN = lines[0].strip()

# Interactive API docs are disabled (docs_url / redoc_url set to None).
app = FastAPI(docs_url=None, redoc_url=None)

# Serve the files in ./static under the /static URL prefix.
app.mount("/static", StaticFiles(directory="static"), name="static")
24
+
25
+
26
@app.head("/")
@app.get("/")
def index() -> FileResponse:
    """Serve ``static/index.html`` as HTML for GET and HEAD requests to ``/``."""
    landing_page = "static/index.html"
    return FileResponse(path=landing_page, media_type="text/html")
30
+
31
+
32
@app.get("/infer_biggan")
def biggan(input):
    """Proxy ``input`` to the hosted BigGAN model and relay its response.

    ``input`` is JSON-encoded and POSTed to the Hugging Face Inference API
    using ``API_TOKEN`` as bearer token. On success the response body is
    returned as ``image/png``. On an upstream error the upstream status code
    and content type are passed through instead of labelling the error
    payload as a PNG with status 200.

    Raises:
        requests.exceptions.RequestException: on connection failure or if
            the upstream does not answer within the timeout.
    """
    output = requests.request(
        "POST",
        "https://api-inference.huggingface.co/models/osanseviero/BigGAN-deep-128",
        headers={"Authorization": f"Bearer {API_TOKEN}"},
        data=json.dumps(input),
        # Without a timeout, requests waits forever on a stalled upstream.
        timeout=60,
    )
    if output.ok:
        media_type = "image/png"
    else:
        media_type = output.headers.get("content-type", "application/json")
    return StreamingResponse(
        BytesIO(output.content),
        status_code=output.status_code,
        media_type=media_type,
    )
42
+
43
+
44
@app.get("/infer_t5")
def t5(input):
    """Run ``infer_t5`` on ``input`` and wrap the result under the ``output`` key."""
    return {"output": infer_t5(input)}
49
+
50
+
51
@app.get("/query_emotion")
def emotion(start, end):
    """Return the ``query_emotion`` result for rows ``start`` to ``end``.

    Both arguments are converted with ``int`` before the lookup, so a value
    that is not a valid integer raises ``ValueError``.
    """
    first_row, stop_row = int(start), int(end)
    return {"output": query_emotion(first_row, stop_row)}
modules/dataset.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+
3
# Loaded once at import time: the "train" split of the "emotion" dataset.
dataset = load_dataset("emotion", split="train")

# Class names of the "label" feature, indexed by label id.
emotions = dataset.info.features["label"].names


def query_emotion(start, end):
    """Return the rows ``dataset[start:end]`` as a list of text/emotion dicts.

    Each entry has the form ``{"text": <text>, "emotion": <label name>}``.

    NOTE(review): assumes the slice exposes exactly two columns, the text
    column first and the label column second -- confirm against the dataset.
    """
    rows = dataset[start:end]
    texts, labels = (rows[column] for column in rows.keys())

    return [
        {
            "text": text,
            "emotion": emotions[label],
        }
        for text, label in zip(texts, labels)
    ]
modules/inference.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
2
+
3
# Tokenizer and model for the "t5-small" checkpoint, loaded once at import time.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")


def infer_t5(input):
    """Generate text for ``input`` with the module-level T5 model.

    The input is tokenized to PyTorch tensors, passed to ``model.generate``,
    and the first generated sequence is decoded with special tokens removed.
    """
    encoded = tokenizer(input, return_tensors="pt")
    generated = model.generate(encoded.input_ids)
    first_sequence = generated[0]

    return tokenizer.decode(first_sequence, skip_special_tokens=True)
optimizedSD/LICENSE ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2022 Robin Rombach and Patrick Esser and contributors
2
+
3
+ CreativeML Open RAIL-M
4
+ dated August 22, 2022
5
+
6
+ Section I: PREAMBLE
7
+
8
+ Multimodal generative models are being widely adopted and used, and have the potential to transform the way artists, among other individuals, conceive and benefit from AI or ML technologies as a tool for content creation.
9
+
10
+ Notwithstanding the current and potential benefits that these artifacts can bring to society at large, there are also concerns about potential misuses of them, either due to their technical limitations or ethical considerations.
11
+
12
+ In short, this license strives for both the open and responsible downstream use of the accompanying model. When it comes to the open character, we took inspiration from open source permissive licenses regarding the grant of IP rights. Referring to the downstream responsible use, we added use-based restrictions not permitting the use of the Model in very specific scenarios, in order for the licensor to be able to enforce the license in case potential misuses of the Model may occur. At the same time, we strive to promote open and responsible research on generative models for art and content generation.
13
+
14
+ Even though downstream derivative versions of the model could be released under different licensing terms, the latter will always have to include - at minimum - the same use-based restrictions as the ones in the original license (this license). We believe in the intersection between open and responsible AI development; thus, this License aims to strike a balance between both in order to enable responsible open-science in the field of AI.
15
+
16
+ This License governs the use of the model (and its derivatives) and is informed by the model card associated with the model.
17
+
18
+ NOW THEREFORE, You and Licensor agree as follows:
19
+
20
+ 1. Definitions
21
+
22
+ - "License" means the terms and conditions for use, reproduction, and Distribution as defined in this document.
23
+ - "Data" means a collection of information and/or content extracted from the dataset used with the Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not licensed under this License.
24
+ - "Output" means the results of operating a Model as embodied in informational content resulting therefrom.
25
+ - "Model" means any accompanying machine-learning based assemblies (including checkpoints), consisting of learnt weights, parameters (including optimizer states), corresponding to the model architecture as embodied in the Complementary Material, that have been trained or tuned, in whole or in part on the Data, using the Complementary Material.
26
+ - "Derivatives of the Model" means all modifications to the Model, works based on the Model, or any other model which is created or initialized by transfer of patterns of the weights, parameters, activations or output of the Model, to the other model, in order to cause the other model to perform similarly to the Model, including - but not limited to - distillation methods entailing the use of intermediate data representations or methods based on the generation of synthetic data by the Model for training the other model.
27
+ - "Complementary Material" means the accompanying source code and scripts used to define, run, load, benchmark or evaluate the Model, and used to prepare data for training or evaluation, if any. This includes any accompanying documentation, tutorials, examples, etc, if any.
28
+ - "Distribution" means any transmission, reproduction, publication or other sharing of the Model or Derivatives of the Model to a third party, including providing the Model as a hosted service made available by electronic or other remote means - e.g. API-based or web access.
29
+ - "Licensor" means the copyright owner or entity authorized by the copyright owner that is granting the License, including the persons or entities that may have rights in the Model and/or distributing the Model.
30
+ - "You" (or "Your") means an individual or Legal Entity exercising permissions granted by this License and/or making use of the Model for whichever purpose and in any field of use, including usage of the Model in an end-use application - e.g. chatbot, translator, image generator.
31
+ - "Third Parties" means individuals or legal entities that are not under common control with Licensor or You.
32
+ - "Contribution" means any work of authorship, including the original version of the Model and any modifications or additions to that Model or Derivatives of the Model thereof, that is intentionally submitted to Licensor for inclusion in the Model by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Model, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
33
+ - "Contributor" means Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Model.
34
+
35
+ Section II: INTELLECTUAL PROPERTY RIGHTS
36
+
37
+ Both copyright and patent grants apply to the Model, Derivatives of the Model and Complementary Material. The Model and Derivatives of the Model are subject to additional terms as described in Section III.
38
+
39
+ 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare, publicly display, publicly perform, sublicense, and distribute the Complementary Material, the Model, and Derivatives of the Model.
40
+ 3. Grant of Patent License. Subject to the terms and conditions of this License and where and as applicable, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this paragraph) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Model and the Complementary Material, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Model to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model and/or Complementary Material or a Contribution incorporated within the Model and/or Complementary Material constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for the Model and/or Work shall terminate as of the date such litigation is asserted or filed.
41
+
42
+ Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION
43
+
44
+ 4. Distribution and Redistribution. You may host for Third Party remote access purposes (e.g. software-as-a-service), reproduce and distribute copies of the Model or Derivatives of the Model thereof in any medium, with or without modifications, provided that You meet the following conditions:
45
+ Use-based restrictions as referenced in paragraph 5 MUST be included as an enforceable provision by You in any type of legal agreement (e.g. a license) governing the use and/or distribution of the Model or Derivatives of the Model, and You shall give notice to subsequent users You Distribute to, that the Model or Derivatives of the Model are subject to paragraph 5. This provision does not apply to the use of Complementary Material.
46
+ You must give any Third Party recipients of the Model or Derivatives of the Model a copy of this License;
47
+ You must cause any modified files to carry prominent notices stating that You changed the files;
48
+ You must retain all copyright, patent, trademark, and attribution notices excluding those notices that do not pertain to any part of the Model, Derivatives of the Model.
49
+ You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions - respecting paragraph 4.a. - for use, reproduction, or Distribution of Your modifications, or for any such Derivatives of the Model as a whole, provided Your use, reproduction, and Distribution of the Model otherwise complies with the conditions stated in this License.
50
+ 5. Use-based restrictions. The restrictions set forth in Attachment A are considered Use-based restrictions. Therefore You cannot use the Model and the Derivatives of the Model for the specified restricted uses. You may use the Model subject to this License, including only for lawful purposes and in accordance with the License. Use may include creating any content with, finetuning, updating, running, training, evaluating and/or reparametrizing the Model. You shall require all of Your users who use the Model or a Derivative of the Model to comply with the terms of this paragraph (paragraph 5).
51
+ 6. The Output You Generate. Except as set forth herein, Licensor claims no rights in the Output You generate using the Model. You are accountable for the Output you generate and its subsequent uses. No use of the output can contravene any provision as stated in the License.
52
+
53
+ Section IV: OTHER PROVISIONS
54
+
55
+ 7. Updates and Runtime Restrictions. To the maximum extent permitted by law, Licensor reserves the right to restrict (remotely or otherwise) usage of the Model in violation of this License, update the Model through electronic means, or modify the Output of the Model based on updates. You shall undertake reasonable efforts to use the latest version of the Model.
56
+ 8. Trademarks and related. Nothing in this License permits You to make use of Licensors’ trademarks, trade names, logos or to otherwise suggest endorsement or misrepresent the relationship between the parties; and any rights not expressly granted herein are reserved by the Licensors.
57
+ 9. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Model and the Complementary Material (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Model, Derivatives of the Model, and the Complementary Material and assume any risks associated with Your exercise of permissions under this License.
58
+ 10. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Model and the Complementary Material (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
59
+ 11. Accepting Warranty or Additional Liability. While redistributing the Model, Derivatives of the Model and the Complementary Material thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
60
+ 12. If any provision of this License is held to be invalid, illegal or unenforceable, the remaining provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.
61
+
62
+ END OF TERMS AND CONDITIONS
63
+
64
+ Attachment A
65
+
66
+ Use Restrictions
67
+
68
+ You agree not to use the Model or Derivatives of the Model:
69
+
70
+ - In any way that violates any applicable national, federal, state, local or international law or regulation;
71
+ - For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
72
+ - To generate or disseminate verifiably false information and/or content with the purpose of harming others;
73
+ - To generate or disseminate personal identifiable information that can be used to harm an individual;
74
+ - To defame, disparage or otherwise harass others;
75
+ - For fully automated decision making that adversely impacts an individual’s legal rights or otherwise creates or modifies a binding, enforceable obligation;
76
+ - For any use intended to or which has the effect of discriminating against or harming individuals or groups based on online or offline social behavior or known or predicted personal or personality characteristics;
77
+ - To exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
78
+ - For any use intended to or which has the effect of discriminating against individuals or groups based on legally protected characteristics or categories;
79
+ - To provide medical advice and medical results interpretation;
80
+ - To generate or disseminate information for the purpose to be used for administration of justice, law enforcement, immigration or asylum processes, such as predicting an individual will commit fraud/crime commitment (e.g. by text profiling, drawing causal relationships between assertions made in documents, indiscriminate and arbitrarily-targeted use).