Tu Bui committed on
Commit 6142a25
0 Parent(s):

first commit

Dockerfile ADDED
@@ -0,0 +1,10 @@
+ FROM tuvbui/torchcpu:torch111
+ ADD cldm ./cldm
+ ADD flae ./flae
+ ADD ldm ./ldm
+ ADD tools ./tools
+ ADD pages ./pages
+ ADD Embed_Secret.py .
+
+ EXPOSE 7860
+ CMD streamlit run Embed_Secret.py --server.enableXsrfProtection=false --server.port 7860 -- --weight https://kahlan.cvssp.org/data/Flickr25K/tubui/stega/unet100b_croprs/epoch=000070-step=000219999.ckpt --config https://kahlan.cvssp.org/data/Flickr25K/tubui/stega/unet100b_croprs/-project.yaml
Embed_Secret.py ADDED
@@ -0,0 +1,265 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ """
+ streamlit app demo
+ how to run:
+ streamlit run app.py --server.port 8501
+
+ @author: Tu Bui @surrey.ac.uk
+ """
+ import os, sys, torch
+ import argparse
+ from pathlib import Path
+ import numpy as np
+ import pickle
+ import pytorch_lightning as pl
+ from torchvision import transforms
+ from ldm.util import instantiate_from_config
+ from omegaconf import OmegaConf
+ from PIL import Image
+ from tools.augment_imagenetc import RandomImagenetC
+ from io import BytesIO
+ from tools.helpers import welcome_message
+ from tools.ecc import BCH, RSC
+
+ import streamlit as st
+ from streamlit.source_util import (
+     page_icon_and_name,
+     calc_md5,
+     get_pages,
+     _on_pages_changed
+ )
+
+ model_names = ['UNet']
+ SECRET_LEN = 100
+
+
+ def delete_page(main_script_path_str, page_name):
+
+     current_pages = get_pages(main_script_path_str)
+
+     for key, value in current_pages.items():
+         print(value['page_name'])
+         if value['page_name'] == page_name:
+             del current_pages[key]
+             break
+         else:
+             pass
+     _on_pages_changed.send()
+
+
+ def add_page(main_script_path_str, page_name):
+
+     pages = get_pages(main_script_path_str)
+     main_script_path = Path(main_script_path_str)
+     pages_dir = main_script_path.parent / "pages"
+     # st.write(list(pages_dir.glob("*.py"))+list(main_script_path.parent.glob("*.py")))
+     script_path = [f for f in list(pages_dir.glob("*.py"))+list(main_script_path.parent.glob("*.py")) if f.name.find(page_name) != -1][0]
+     script_path_str = str(script_path.resolve())
+     pi, pn = page_icon_and_name(script_path)
+     psh = calc_md5(script_path_str)
+     pages[psh] = {
+         "page_script_hash": psh,
+         "page_name": pn,
+         "icon": pi,
+         "script_path": script_path_str,
+     }
+     _on_pages_changed.send()
+
+ def unormalize(x):
+     # convert x in range [-1, 1], (B,C,H,W), tensor to [0, 255], uint8, numpy, (B,H,W,C)
+     x = torch.clamp((x + 1) * 127.5, 0, 255).permute(0, 2, 3, 1).cpu().numpy().astype(np.uint8)
+     return x
+
+ def to_bytes(x, mime):
+     x = Image.fromarray(x)
+     buf = BytesIO()
+     f = "JPEG" if mime == 'image/jpeg' else "PNG"
+     x.save(buf, format=f)
+     byte_im = buf.getvalue()
+     return byte_im
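
As a quick sanity check of the [-1, 1] to uint8 convention that unormalize relies on, a minimal self-contained sketch (values chosen to hit the extremes and the midpoint):

    import numpy as np
    import torch

    x = torch.tensor([[-1.0, 0.0, 1.0]])    # extremes and midpoint of the model's output range
    u = torch.clamp((x + 1) * 127.5, 0, 255).numpy().astype(np.uint8)
    print(u)                                # [[  0 127 255]]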
+
+
+ def load_UNet(args):
+     print('args: ', args)
+     # # crop safe model
+     # config_file = '/mnt/fast/nobackup/scratch4weeks/tb0035/projects/diffsteg/FLAE/simple_tform2/configs/-project.yaml'
+     # weight_file = '/mnt/fast/nobackup/scratch4weeks/tb0035/projects/diffsteg/FLAE/simple_tform2/checkpoints/epoch=000060-step=000189999.ckpt'
+
+     # # resized crop safe model
+     # config_file = '/mnt/fast/nobackup/scratch4weeks/tb0035/projects/diffsteg/FLAE/simple_t2_croprs/configs/-project.yaml'
+     # weight_file = '/mnt/fast/nobackup/scratch4weeks/tb0035/projects/diffsteg/FLAE/simple_t2_croprs/checkpoints/epoch=000070-step=000219999.ckpt'
+
+     config_file = args.config  # argparse flags --config/--weight from parse_st_args()
+     weight_file = args.weight
+     device = 'cuda' if torch.cuda.is_available() else 'cpu'
+     if weight_file.startswith('http'):  # download from url
+         weight_dir = Path('./weights')
+         weight_dir.mkdir(exist_ok=True)
+         weight_path = weight_dir / weight_file.split('/')[-1]
+         config_path = weight_dir / config_file.split('/')[-1]
+         if not weight_path.exists():
+             import wget
+             print(f'Downloading {weight_file}...')
+             with st.spinner("Downloading model... this may take awhile!"):
+                 wget.download(weight_file, str(weight_path))
+                 wget.download(config_file, str(config_path))
+         weight_file = str(weight_path)
+         config_file = str(config_path)
+
+     config = OmegaConf.load(config_file).model
+     secret_len = config.params.secret_len
+     assert SECRET_LEN == secret_len
+     model = instantiate_from_config(config)
+     state_dict = torch.load(weight_file, map_location=torch.device('cpu'))
+     if 'global_step' in state_dict:
+         print(f'Global step: {state_dict["global_step"]}, epoch: {state_dict["epoch"]}')
+
+     if 'state_dict' in state_dict:
+         state_dict = state_dict['state_dict']
+     misses, ignores = model.load_state_dict(state_dict, strict=False)
+     print(f'Missed keys: {misses}\nIgnore keys: {ignores}')
+     model = model.to(device)
+     model.eval()
+     return model
+
+ def embed_secret(model_name, model, cover, tform, secret):
+     if model_name == 'UNet':
+         w, h = cover.size
+         with torch.no_grad():
+             im = tform(cover).unsqueeze(0).cuda()  # 1, 3, 256, 256
+             stego, _ = model(im, secret)  # 1, 3, 256, 256
+             res = (stego.clamp(-1,1) - im)  # (1,3,256,256) residual
+             res = torch.nn.functional.interpolate(res, (h,w), mode='bilinear')
+             res = res.permute(0,2,3,1).cpu().numpy()  # (1,256,256,3)
+             stego_uint8 = np.clip(res[0] + np.array(cover)/127.5-1., -1,1)*127.5+127.5  # (256, 256, 3), ndarray, uint8
+             stego_uint8 = stego_uint8.astype(np.uint8)
+     else:
+         raise NotImplementedError
+     return stego_uint8
+
+ def identity(x):
+     return x
+
+ def decode_secret(model_name, model, im, tform):
+     if model_name in ['RoSteALS', 'UNet']:
+         with torch.no_grad():
+             im = tform(im).unsqueeze(0).cuda()  # 1, 3, 256, 256
+             secret_pred = (model.decoder(im) > 0).cpu().numpy()  # 1, 100
+     else:
+         raise NotImplementedError
+     return secret_pred
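
Outside Streamlit, the two helpers above compose into a simple embed/decode round trip. A hedged sketch, assuming a CUDA device (both helpers call .cuda()), that model, tform_emb and tform_det come from load_model below, and that cover.png is a placeholder path:

    import torch
    from PIL import Image

    cover = Image.open('cover.png').convert('RGB')                # placeholder cover image
    secret = torch.randint(0, 2, (1, SECRET_LEN)).float().cuda()  # random 100-bit payload
    stego = embed_secret('UNet', model, cover, tform_emb, secret)
    pred = decode_secret('UNet', model, Image.fromarray(stego), tform_det)
    print('bit accuracy:', (pred == secret.cpu().numpy()).mean())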
+
+
+ @st.cache_resource
+ def load_model(model_name, _args):
+     if model_name == 'UNet':
+         tform_emb = transforms.Compose([
+             transforms.Resize((256,256)),
+             transforms.ToTensor(),
+             transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+         ])
+         tform_det = transforms.Compose([
+             transforms.Resize((224,224)),
+             transforms.ToTensor(),
+             transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+         ])
+         model = load_UNet(_args)
+     else:
+         raise NotImplementedError
+     return model, tform_emb, tform_det
+
+
+ @st.cache_resource
+ def load_ecc(ecc_name):
+     if ecc_name == 'BCH':
+         # ecc = BCH(285, 10, SECRET_LEN, verbose=True)
+         ecc = BCH(payload_len=SECRET_LEN, verbose=True)
+     elif ecc_name == 'RSC':
+         ecc = RSC(data_bytes=16, ecc_bytes=4, verbose=True)
+     return ecc
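
The error-correcting code is what turns a short text into the 100-bit payload and lets it survive a few flipped bits. A sketch of the round trip, using only the tools.ecc methods this script already relies on (encode_text, decode_text, data_len, get_total_len); the exact number of correctable bits depends on the BCH parameters:

    ecc = load_ecc('BCH')
    print(ecc.data_len, ecc.get_total_len())    # max characters per secret vs. total payload bits (100 here)
    bits = ecc.encode_text(['A secret'])        # (1, 100) array of {0, 1}
    bits[0, :3] = 1 - bits[0, :3]               # flip a few bits to mimic decoder errors
    print(ecc.decode_text(bits)[0])             # expected to still recover 'A secret' if within BCH capacity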
+
+
+ class Resize(object):
+     def __init__(self, size=None) -> None:
+         self.size = size
+     def __call__(self, x, size=None):
+         if isinstance(x, np.ndarray):
+             x = Image.fromarray(x)
+         new_size = size if size is not None else self.size
+         if min(x.size) > min(new_size):  # downsample
+             x = x.resize(new_size, Image.LANCZOS)
+         else:  # upsample
+             x = x.resize(new_size, Image.BILINEAR)
+         x = np.array(x)
+         return x
+
+
+ def parse_st_args():
+     # usage: streamlit run app.py -- --arg1 val1 --arg2 val2
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--weight', default='/mnt/fast/nobackup/scratch4weeks/tb0035/projects/diffsteg/FLAE/simple_t2_croprs/checkpoints/epoch=000070-step=000219999.ckpt')
+     parser.add_argument('--config', default='/mnt/fast/nobackup/scratch4weeks/tb0035/projects/diffsteg/FLAE/simple_t2_croprs/configs/-project.yaml')
+     # parser.add_argument('--cpu', action='store_true')
+     args = parser.parse_args()
+     return args
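
Everything after the bare -- separator on the command line is passed through by Streamlit and parsed here, which is how the Dockerfile's CMD points the app at a hosted checkpoint instead of the defaults above, e.g.:

    streamlit run Embed_Secret.py --server.port 7860 -- --weight <path-or-url-to-ckpt> --config <path-or-url-to-yaml>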
+
+
+ def app(args):
+     # delete_page('Embed_Secret', 'Extract_Secret')
+     st.title('Watermarking Demo')
+     # setup model
+     model_name = st.selectbox("Choose the model", model_names)
+     model, tform_emb, tform_det = load_model(model_name, args)
+     display_width = 300
+
+     # ecc
+     ecc = load_ecc('BCH')
+     assert ecc.get_total_len() == SECRET_LEN
+
+     # setup st
+     st.subheader("Input")
+     image_file = st.file_uploader("Upload an image", type=["png","jpg","jpeg"])
+     if image_file is not None:
+         print('Image: ', image_file.name)
+         ext = image_file.name.split('.')[-1]
+         im = Image.open(image_file).convert('RGB')
+         size0 = im.size
+         st.image(im, width=display_width)
+     secret_text = st.text_input(f'Input the secret (max {ecc.data_len} chars)', 'A secret')
+     assert len(secret_text) <= ecc.data_len
+
+     # embed
+     st.subheader("Embed results")
+     status = st.empty()
+     prep = transforms.Compose([
+         transforms.Resize((256,256)),
+         transforms.CenterCrop((224,224))
+     ])
+     if image_file is not None and secret_text is not None:
+         secret = ecc.encode_text([secret_text])  # (1, len)
+         secret = torch.from_numpy(secret).float().cuda()
+         # im = tform(im).unsqueeze(0).cuda()  # (1,3,H,W)
+         stego = embed_secret(model_name, model, im, tform_emb, secret)
+         st.image(stego, width=display_width)
+
+         # download button
+         mime='image/jpeg' if ext=='jpg' else f'image/{ext}'
+         stego_bytes = to_bytes(stego, mime)
+         st.download_button(label='Download image', data=stego_bytes, file_name=f'stego.{ext}', mime=mime)
+
+         # verify secret
+         stego_processed = prep(Image.fromarray(stego))
+         secret_pred = decode_secret(model_name, model, stego_processed, tform_det)
+         bit_acc = (secret_pred == secret.cpu().numpy()).mean()
+         secret_pred = ecc.decode_text(secret_pred)[0]
+         status.markdown('**Secret recovery check:** ' + secret_pred, unsafe_allow_html=True)
+         status.markdown('**Bit accuracy:** ' + str(bit_acc), unsafe_allow_html=True)
+
+ if __name__ == '__main__':
+     args = parse_st_args()
+     app(args)
+
+
+
cldm/ae.py ADDED
@@ -0,0 +1,727 @@
1
+ import numpy as np
2
+ import einops
3
+ import torch
4
+ import torch as th
5
+ import torch.nn as nn
6
+ from torch.nn import functional as thf
7
+ import pytorch_lightning as pl
8
+ import torchvision
9
+ from copy import deepcopy
10
+ from ldm.modules.diffusionmodules.util import (
11
+ conv_nd,
12
+ linear,
13
+ zero_module,
14
+ timestep_embedding,
15
+ )
16
+ from contextlib import contextmanager, nullcontext
17
+ from einops import rearrange, repeat
18
+ from torchvision.utils import make_grid
19
+ from ldm.modules.attention import SpatialTransformer
20
+ from ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample, AttentionBlock
21
+ from ldm.models.diffusion.ddpm import LatentDiffusion
22
+ from ldm.util import log_txt_as_img, exists, instantiate_from_config, default
23
+ from ldm.models.diffusion.ddim import DDIMSampler
24
+ from ldm.modules.ema import LitEma
25
+ from ldm.modules.distributions.distributions import normal_kl, DiagonalGaussianDistribution
26
+ from ldm.modules.diffusionmodules.model import Encoder
27
+ import lpips
28
+ import kornia
29
+ from kornia import color
30
+
31
+ def disabled_train(self, mode=True):
32
+ """Overwrite model.train with this function to make sure train/eval mode
33
+ does not change anymore."""
34
+ return self
35
+
36
+ class View(nn.Module):
37
+ def __init__(self, *shape):
38
+ super().__init__()
39
+ self.shape = shape
40
+
41
+ def forward(self, x):
42
+ return x.view(*self.shape)
43
+
44
+
45
+ class SecretEncoder3(nn.Module):
46
+ def __init__(self, secret_len, base_res=16, resolution=64) -> None:
47
+ super().__init__()
48
+ log_resolution = int(np.log2(resolution))
49
+ log_base = int(np.log2(base_res))
50
+ self.secret_len = secret_len
51
+ self.secret_scaler = nn.Sequential(
52
+ nn.Linear(secret_len, base_res*base_res*3),
53
+ nn.SiLU(),
54
+ View(-1, 3, base_res, base_res),
55
+ nn.Upsample(scale_factor=(2**(log_resolution-log_base), 2**(log_resolution-log_base))), # chx16x16 -> chx256x256
56
+ zero_module(conv_nd(2, 3, 3, 3, padding=1))
57
+ ) # secret len -> ch x res x res
58
+
59
+ def copy_encoder_weight(self, ae_model):
60
+ # misses, ignores = self.load_state_dict(ae_state_dict, strict=False)
61
+ return None
62
+
63
+ def encode(self, x):
64
+ x = self.secret_scaler(x)
65
+ return x
66
+
67
+ def forward(self, x, c):
68
+ # x: [B, C, H, W], c: [B, secret_len]
69
+ c = self.encode(c)
70
+ return c, None
71
+
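
As a shape check of the secret pathway above: a secret_len-bit vector is lifted to a base_res x base_res map with 3 channels, upsampled to the working resolution, and passed through a zero-initialised conv so the additive offset starts out as exactly zero. A minimal standalone sketch (zero_module is re-implemented locally here since the real one lives in ldm; sizes assume secret_len=100, base_res=16, resolution=64):

    import torch
    import torch.nn as nn

    def zero_module(m):
        # assumed behaviour: zero all parameters so the branch is initially a no-op
        for p in m.parameters():
            nn.init.zeros_(p)
        return m

    class View(nn.Module):
        def __init__(self, *shape):
            super().__init__()
            self.shape = shape
        def forward(self, x):
            return x.view(*self.shape)

    enc = nn.Sequential(
        nn.Linear(100, 16 * 16 * 3),
        nn.SiLU(),
        View(-1, 3, 16, 16),
        nn.Upsample(scale_factor=(4, 4)),             # 16x16 -> 64x64
        zero_module(nn.Conv2d(3, 3, 3, padding=1)),
    )
    secret = torch.randint(0, 2, (2, 100)).float()
    out = enc(secret)
    print(out.shape, out.abs().max().item())          # torch.Size([2, 3, 64, 64]) 0.0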
72
+
73
+ class SecretEncoder4(nn.Module):
74
+ """same as SecretEncoder3 but with ch as input"""
75
+ def __init__(self, secret_len, ch=3, base_res=16, resolution=64) -> None:
76
+ super().__init__()
77
+ log_resolution = int(np.log2(resolution))
78
+ log_base = int(np.log2(base_res))
79
+ self.secret_len = secret_len
80
+ self.secret_scaler = nn.Sequential(
81
+ nn.Linear(secret_len, base_res*base_res*ch),
82
+ nn.SiLU(),
83
+ View(-1, ch, base_res, base_res),
84
+ nn.Upsample(scale_factor=(2**(log_resolution-log_base), 2**(log_resolution-log_base))), # chx16x16 -> chx256x256
85
+ zero_module(conv_nd(2, ch, ch, 3, padding=1))
86
+ ) # secret len -> ch x res x res
87
+
88
+ def copy_encoder_weight(self, ae_model):
89
+ # misses, ignores = self.load_state_dict(ae_state_dict, strict=False)
90
+ return None
91
+
92
+ def encode(self, x):
93
+ x = self.secret_scaler(x)
94
+ return x
95
+
96
+ def forward(self, x, c):
97
+ # x: [B, C, H, W], c: [B, secret_len]
98
+ c = self.encode(c)
99
+ return c, None
100
+
101
+ class SecretEncoder6(nn.Module):
102
+ """join img emb with secret emb"""
103
+ def __init__(self, secret_len, ch=3, base_res=16, resolution=64, emode='c3') -> None:
104
+ super().__init__()
105
+ assert emode in ['c3', 'c2', 'm3']
106
+
107
+ if emode == 'c3': # c3: concat c and x each has ch channels
108
+ secret_ch = ch
109
+ join_ch = 2*ch
110
+ elif emode == 'c2': # c2: concat c (2) and x ave (1)
111
+ secret_ch = 2
112
+ join_ch = ch
113
+ elif emode == 'm3': # m3: multiply c (ch) and x (ch)
114
+ secret_ch = ch
115
+ join_ch = ch
116
+
117
+ # m3: multiply c (ch) and x ave (1)
118
+ log_resolution = int(np.log2(resolution))
119
+ log_base = int(np.log2(base_res))
120
+ self.secret_len = secret_len
121
+ self.emode = emode
122
+ self.resolution = resolution
123
+ self.secret_scaler = nn.Sequential(
124
+ nn.Linear(secret_len, base_res*base_res*secret_ch),
125
+ nn.SiLU(),
126
+ View(-1, secret_ch, base_res, base_res),
127
+ nn.Upsample(scale_factor=(2**(log_resolution-log_base), 2**(log_resolution-log_base))), # chx16x16 -> chx256x256
128
+ ) # secret len -> ch x res x res
129
+ self.join_encoder = nn.Sequential(
130
+ conv_nd(2, join_ch, join_ch, 3, padding=1),
131
+ nn.SiLU(),
132
+ conv_nd(2, join_ch, ch, 3, padding=1),
133
+ nn.SiLU(),
134
+ conv_nd(2, ch, ch, 3, padding=1),
135
+ nn.SiLU()
136
+ )
137
+ self.out_layer = zero_module(conv_nd(2, ch, ch, 3, padding=1))
138
+
139
+ def copy_encoder_weight(self, ae_model):
140
+ # misses, ignores = self.load_state_dict(ae_state_dict, strict=False)
141
+ return None
142
+
143
+ def encode(self, x):
144
+ x = self.secret_scaler(x)
145
+ return x
146
+
147
+ def forward(self, x, c):
148
+ # x: [B, C, H, W], c: [B, secret_len]
149
+ c = self.encode(c)
150
+ if self.emode == 'c3':
151
+ x = torch.cat([x, c], dim=1)
152
+ elif self.emode == 'c2':
153
+ x = torch.cat([x.mean(dim=1, keepdim=True), c], dim=1)
154
+ elif self.emode == 'm3':
155
+ x = x * c
156
+ dx = self.join_encoder(x)
157
+ dx = self.out_layer(dx)
158
+ return dx, None
159
+
160
+ class SecretEncoder5(nn.Module):
161
+ """same as SecretEncoder3 but with ch as input"""
162
+ def __init__(self, secret_len, ch=3, base_res=16, resolution=64, joint=False) -> None:
163
+ super().__init__()
164
+ log_resolution = int(np.log2(resolution))
165
+ log_base = int(np.log2(base_res))
166
+ self.secret_len = secret_len
167
+ self.joint = joint
168
+ self.resolution = resolution
169
+ self.secret_scaler = nn.Sequential(
170
+ nn.Linear(secret_len, base_res*base_res*ch),
171
+ nn.SiLU(),
172
+ View(-1, ch, base_res, base_res),
173
+ nn.Upsample(scale_factor=(2**(log_resolution-log_base), 2**(log_resolution-log_base))), # chx16x16 -> chx256x256
174
+ ) # secret len -> ch x res x res
175
+ if joint:
176
+ self.join_encoder = nn.Sequential(
177
+ conv_nd(2, 2*ch, 2*ch, 3, padding=1),
178
+ nn.SiLU(),
179
+ conv_nd(2, 2*ch, ch, 3, padding=1),
180
+ nn.SiLU()
181
+ )
182
+ self.out_layer = zero_module(conv_nd(2, ch, ch, 3, padding=1))
183
+
184
+ def copy_encoder_weight(self, ae_model):
185
+ # misses, ignores = self.load_state_dict(ae_state_dict, strict=False)
186
+ return None
187
+
188
+ def encode(self, x):
189
+ x = self.secret_scaler(x)
190
+ return x
191
+
192
+ def forward(self, x, c):
193
+ # x: [B, C, H, W], c: [B, secret_len]
194
+ c = self.encode(c)
195
+ if self.joint:
196
+ x = thf.interpolate(x, size=(self.resolution, self.resolution), mode="bilinear", align_corners=False, antialias=True)
197
+ c = self.join_encoder(torch.cat([x, c], dim=1))
198
+ c = self.out_layer(c)
199
+ return c, None
200
+
201
+
202
+ class SecretEncoder2(nn.Module):
203
+ def __init__(self, secret_len, embed_dim, ddconfig, ckpt_path=None,
204
+ ignore_keys=[],
205
+ image_key="image",
206
+ colorize_nlabels=None,
207
+ monitor=None,
208
+ ema_decay=None,
209
+ learn_logvar=False) -> None:
210
+ super().__init__()
211
+ log_resolution = int(np.log2(ddconfig.resolution))
212
+ self.secret_len = secret_len
213
+ self.learn_logvar = learn_logvar
214
+ self.image_key = image_key
215
+ self.encoder = Encoder(**ddconfig)
216
+ self.encoder.conv_out = zero_module(self.encoder.conv_out)
217
+ self.embed_dim = embed_dim
218
+
219
+ if colorize_nlabels is not None:
220
+ assert type(colorize_nlabels)==int
221
+ self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
222
+
223
+ if monitor is not None:
224
+ self.monitor = monitor
225
+
226
+ self.secret_scaler = nn.Sequential(
227
+ nn.Linear(secret_len, 32*32*ddconfig.out_ch),
228
+ nn.SiLU(),
229
+ View(-1, ddconfig.out_ch, 32, 32),
230
+ nn.Upsample(scale_factor=(2**(log_resolution-5), 2**(log_resolution-5))), # chx16x16 -> chx256x256
231
+ # zero_module(conv_nd(2, ddconfig.out_ch, ddconfig.out_ch, 3, padding=1))
232
+ ) # secret len -> ch x res x res
233
+ # out_resolution = ddconfig.resolution//(len(ddconfig.ch_mult)-1)
234
+ # self.out_layer = zero_module(conv_nd(2, ddconfig.out_ch, ddconfig.out_ch, 3, padding=1))
235
+
236
+ self.use_ema = ema_decay is not None
237
+ if self.use_ema:
238
+ self.ema_decay = ema_decay
239
+ assert 0. < ema_decay < 1.
240
+ self.model_ema = LitEma(self, decay=ema_decay)
241
+ print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
242
+
243
+ if ckpt_path is not None:
244
+ self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
245
+
246
+
247
+ def init_from_ckpt(self, path, ignore_keys=list()):
248
+ sd = torch.load(path, map_location="cpu")["state_dict"]
249
+ keys = list(sd.keys())
250
+ for k in keys:
251
+ for ik in ignore_keys:
252
+ if k.startswith(ik):
253
+ print("Deleting key {} from state_dict.".format(k))
254
+ del sd[k]
255
+ misses, ignores = self.load_state_dict(sd, strict=False)
256
+ print(f"[SecretEncoder] Restored from {path}, misses: {misses}, ignores: {ignores}")
257
+
258
+ def copy_encoder_weight(self, ae_model):
259
+ # misses, ignores = self.load_state_dict(ae_state_dict, strict=False)
260
+ return None
261
+ self.encoder.load_state_dict(ae_model.encoder.state_dict())
262
+ self.quant_conv.load_state_dict(ae_model.quant_conv.state_dict())
263
+
264
+ @contextmanager
265
+ def ema_scope(self, context=None):
266
+ if self.use_ema:
267
+ self.model_ema.store(self.parameters())
268
+ self.model_ema.copy_to(self)
269
+ if context is not None:
270
+ print(f"{context}: Switched to EMA weights")
271
+ try:
272
+ yield None
273
+ finally:
274
+ if self.use_ema:
275
+ self.model_ema.restore(self.parameters())
276
+ if context is not None:
277
+ print(f"{context}: Restored training weights")
278
+
279
+ def on_train_batch_end(self, *args, **kwargs):
280
+ if self.use_ema:
281
+ self.model_ema(self)
282
+
283
+ def encode(self, x):
284
+ h = self.encoder(x)
285
+ posterior = h
286
+ return posterior
287
+
288
+ def forward(self, x, c):
289
+ # x: [B, C, H, W], c: [B, secret_len]
290
+ c = self.secret_scaler(c)
291
+ x = torch.cat([x, c], dim=1)
292
+ z = self.encode(x)
293
+ # z = self.out_layer(z)
294
+ return z, None
295
+
296
+
297
+ class SecretEncoder7(nn.Module):
298
+ def __init__(self, secret_len, ddconfig, ckpt_path=None,
299
+ ignore_keys=[],embed_dim=3,
300
+ ema_decay=None) -> None:
301
+ super().__init__()
302
+ log_resolution = int(np.log2(ddconfig.resolution))
303
+ self.secret_len = secret_len
304
+ self.encoder = Encoder(**ddconfig)
305
+ # self.encoder.conv_out = zero_module(self.encoder.conv_out)
306
+ self.quant_conv = nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
307
+
308
+ self.secret_scaler = nn.Sequential(
309
+ nn.Linear(secret_len, 32*32*2),
310
+ nn.SiLU(),
311
+ View(-1, 2, 32, 32),
312
+ # nn.Upsample(scale_factor=(2**(log_resolution-5), 2**(log_resolution-5))), # chx16x16 -> chx256x256
313
+ # zero_module(conv_nd(2, ddconfig.out_ch, ddconfig.out_ch, 3, padding=1))
314
+ ) # secret len -> ch x res x res
315
+ # out_resolution = ddconfig.resolution//(len(ddconfig.ch_mult)-1)
316
+ # self.out_layer = zero_module(conv_nd(2, ddconfig.out_ch, ddconfig.out_ch, 3, padding=1))
317
+
318
+ self.use_ema = ema_decay is not None
319
+ if self.use_ema:
320
+ self.ema_decay = ema_decay
321
+ assert 0. < ema_decay < 1.
322
+ self.model_ema = LitEma(self, decay=ema_decay)
323
+ print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
324
+
325
+ if ckpt_path is not None:
326
+ self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
327
+
328
+
329
+ def init_from_ckpt(self, path, ignore_keys=list()):
330
+ sd = torch.load(path, map_location="cpu")["state_dict"]
331
+ keys = list(sd.keys())
332
+ for k in keys:
333
+ for ik in ignore_keys:
334
+ if k.startswith(ik):
335
+ print("Deleting key {} from state_dict.".format(k))
336
+ del sd[k]
337
+ misses, ignores = self.load_state_dict(sd, strict=False)
338
+ print(f"[SecretEncoder7] Restored from {path}, misses: {len(misses)}, ignores: {len(ignores)}. Do not worry as we are not using the decoder and the secret encoder is a novel module.")
339
+
340
+ def copy_encoder_weight(self, ae_model):
341
+ # misses, ignores = self.load_state_dict(ae_state_dict, strict=False)
342
+ # return None
343
+ self.encoder.load_state_dict(deepcopy(ae_model.encoder.state_dict()))
344
+ self.quant_conv.load_state_dict(deepcopy(ae_model.quant_conv.state_dict()))
345
+
346
+ @contextmanager
347
+ def ema_scope(self, context=None):
348
+ if self.use_ema:
349
+ self.model_ema.store(self.parameters())
350
+ self.model_ema.copy_to(self)
351
+ if context is not None:
352
+ print(f"{context}: Switched to EMA weights")
353
+ try:
354
+ yield None
355
+ finally:
356
+ if self.use_ema:
357
+ self.model_ema.restore(self.parameters())
358
+ if context is not None:
359
+ print(f"{context}: Restored training weights")
360
+
361
+ def on_train_batch_end(self, *args, **kwargs):
362
+ if self.use_ema:
363
+ self.model_ema(self)
364
+
365
+ def encode(self, x):
366
+ h = self.encoder(x)
367
+ h = self.quant_conv(h)
368
+ return h
369
+
370
+ def forward(self, x, c):
371
+ # x: [B, C, H, W], c: [B, secret_len]
372
+ c = self.secret_scaler(c) # [B, 2, 32, 32]
373
+ # c = thf.interpolate(c, size=x.shape[-2:], mode="bilinear", align_corners=False)
374
+ c = thf.interpolate(c, size=x.shape[-2:], mode="nearest")
375
+ x = 0.2125 * x[:,0,...] + 0.7154 *x[:,1,...] + 0.0721 * x[:,2,...]
376
+ x = torch.cat([x.unsqueeze(1), c], dim=1)
377
+ z = self.encode(x)
378
+ # z = self.out_layer(z)
379
+ return z, None
380
+
381
+ class SecretEncoder(nn.Module):
382
+ def __init__(self, secret_len, embed_dim, ddconfig, ckpt_path=None,
383
+ ignore_keys=[],
384
+ image_key="image",
385
+ colorize_nlabels=None,
386
+ monitor=None,
387
+ ema_decay=None,
388
+ learn_logvar=False) -> None:
389
+ super().__init__()
390
+ log_resolution = int(np.log2(ddconfig.resolution))
391
+ self.secret_len = secret_len
392
+ self.learn_logvar = learn_logvar
393
+ self.image_key = image_key
394
+ self.encoder = Encoder(**ddconfig)
395
+ assert ddconfig["double_z"]
396
+ self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
397
+ self.embed_dim = embed_dim
398
+
399
+ if colorize_nlabels is not None:
400
+ assert type(colorize_nlabels)==int
401
+ self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
402
+
403
+ if monitor is not None:
404
+ self.monitor = monitor
405
+
406
+ self.use_ema = ema_decay is not None
407
+ if self.use_ema:
408
+ self.ema_decay = ema_decay
409
+ assert 0. < ema_decay < 1.
410
+ self.model_ema = LitEma(self, decay=ema_decay)
411
+ print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
412
+
413
+ if ckpt_path is not None:
414
+ self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
415
+
416
+ self.secret_scaler = nn.Sequential(
417
+ nn.Linear(secret_len, 32*32*ddconfig.out_ch),
418
+ nn.SiLU(),
419
+ View(-1, ddconfig.out_ch, 32, 32),
420
+ nn.Upsample(scale_factor=(2**(log_resolution-5), 2**(log_resolution-5))), # chx16x16 -> chx256x256
421
+ zero_module(conv_nd(2, ddconfig.out_ch, ddconfig.out_ch, 3, padding=1))
422
+ ) # secret len -> ch x res x res
423
+ # out_resolution = ddconfig.resolution//(len(ddconfig.ch_mult)-1)
424
+ self.out_layer = zero_module(conv_nd(2, ddconfig.out_ch, ddconfig.out_ch, 3, padding=1))
425
+
426
+ def init_from_ckpt(self, path, ignore_keys=list()):
427
+ sd = torch.load(path, map_location="cpu")["state_dict"]
428
+ keys = list(sd.keys())
429
+ for k in keys:
430
+ for ik in ignore_keys:
431
+ if k.startswith(ik):
432
+ print("Deleting key {} from state_dict.".format(k))
433
+ del sd[k]
434
+ misses, ignores = self.load_state_dict(sd, strict=False)
435
+ print(f"[SecretEncoder] Restored from {path}, misses: {misses}, ignores: {ignores}")
436
+
437
+ def copy_encoder_weight(self, ae_model):
438
+ # misses, ignores = self.load_state_dict(ae_state_dict, strict=False)
439
+ self.encoder.load_state_dict(ae_model.encoder.state_dict())
440
+ self.quant_conv.load_state_dict(ae_model.quant_conv.state_dict())
441
+
442
+ @contextmanager
443
+ def ema_scope(self, context=None):
444
+ if self.use_ema:
445
+ self.model_ema.store(self.parameters())
446
+ self.model_ema.copy_to(self)
447
+ if context is not None:
448
+ print(f"{context}: Switched to EMA weights")
449
+ try:
450
+ yield None
451
+ finally:
452
+ if self.use_ema:
453
+ self.model_ema.restore(self.parameters())
454
+ if context is not None:
455
+ print(f"{context}: Restored training weights")
456
+
457
+ def on_train_batch_end(self, *args, **kwargs):
458
+ if self.use_ema:
459
+ self.model_ema(self)
460
+
461
+ def encode(self, x):
462
+ h = self.encoder(x)
463
+ moments = self.quant_conv(h)
464
+ posterior = DiagonalGaussianDistribution(moments)
465
+ return posterior
466
+
467
+ def forward(self, x, c):
468
+ # x: [B, C, H, W], c: [B, secret_len]
469
+ c = self.secret_scaler(c)
470
+ x = x + c
471
+ posterior = self.encode(x)
472
+ z = posterior.sample()
473
+ z = self.out_layer(z)
474
+ return z, posterior
475
+
476
+
477
+ class ControlAE(pl.LightningModule):
478
+ def __init__(self,
479
+ first_stage_key,
480
+ first_stage_config,
481
+ control_key,
482
+ control_config,
483
+ decoder_config,
484
+ loss_config,
485
+ noise_config='__none__',
486
+ use_ema=False,
487
+ secret_warmup=False,
488
+ scale_factor=1.,
489
+ ckpt_path="__none__",
490
+ ):
491
+ super().__init__()
492
+ self.scale_factor = scale_factor
493
+ self.control_key = control_key
494
+ self.first_stage_key = first_stage_key
495
+ self.ae = instantiate_from_config(first_stage_config)
496
+ self.control = instantiate_from_config(control_config)
497
+ self.decoder = instantiate_from_config(decoder_config)
498
+ self.crop = kornia.augmentation.CenterCrop((224, 224), cropping_mode="resample") # early training phase
499
+ if noise_config != '__none__':
500
+ print('Using noise')
501
+ self.noise = instantiate_from_config(noise_config)
502
+ # copy weights from first stage
503
+ self.control.copy_encoder_weight(self.ae)
504
+ # freeze first stage
505
+ self.ae.eval()
506
+ self.ae.train = disabled_train
507
+ for p in self.ae.parameters():
508
+ p.requires_grad = False
509
+
510
+ self.loss_layer = instantiate_from_config(loss_config)
511
+
512
+ # early training phase
513
+ # self.fixed_input = True
514
+ self.fixed_x = None
515
+ self.fixed_img = None
516
+ self.fixed_input_recon = None
517
+ self.fixed_control = None
518
+ self.register_buffer("fixed_input", torch.tensor(True))
519
+
520
+ # secret warmup
521
+ self.secret_warmup = secret_warmup
522
+ self.secret_baselen = 2
523
+ self.secret_len = control_config.params.secret_len
524
+ if self.secret_warmup:
525
+ assert self.secret_len == 2**(int(np.log2(self.secret_len)))
526
+
527
+ self.use_ema = use_ema
528
+ if self.use_ema:
529
+ print('Using EMA')
530
+ self.control_ema = LitEma(self.control)
531
+ self.decoder_ema = LitEma(self.decoder)
532
+ print(f"Keeping EMAs of {len(list(self.control_ema.buffers()) + list(self.decoder_ema.buffers()))}.")
533
+
534
+ if ckpt_path != '__none__':
535
+ self.init_from_ckpt(ckpt_path, ignore_keys=[])
536
+
537
+ def get_warmup_secret(self, old_secret):
538
+ # old_secret: [B, secret_len]
539
+ # new_secret: [B, secret_len]
540
+ if self.secret_warmup:
541
+ bsz = old_secret.shape[0]
542
+ nrepeats = self.secret_len // self.secret_baselen
543
+ new_secret = torch.zeros((bsz, self.secret_baselen), dtype=torch.float).random_(0, 2).repeat_interleave(nrepeats, dim=1)
544
+ return new_secret.to(old_secret.device)
545
+ else:
546
+ return old_secret
547
+
548
+ def init_from_ckpt(self, path, ignore_keys=list()):
549
+ sd = torch.load(path, map_location="cpu")["state_dict"]
550
+ keys = list(sd.keys())
551
+ for k in keys:
552
+ for ik in ignore_keys:
553
+ if k.startswith(ik):
554
+ print("Deleting key {} from state_dict.".format(k))
555
+ del sd[k]
556
+ self.load_state_dict(sd, strict=False)
557
+ print(f"Restored from {path}")
558
+
559
+ @contextmanager
560
+ def ema_scope(self, context=None):
561
+ if self.use_ema:
562
+ self.control_ema.store(self.control.parameters())
563
+ self.decoder_ema.store(self.decoder.parameters())
564
+ self.control_ema.copy_to(self.control)
565
+ self.decoder_ema.copy_to(self.decoder)
566
+ if context is not None:
567
+ print(f"{context}: Switched to EMA weights")
568
+ try:
569
+ yield None
570
+ finally:
571
+ if self.use_ema:
572
+ self.control_ema.restore(self.control.parameters())
573
+ self.decoder_ema.restore(self.decoder.parameters())
574
+ if context is not None:
575
+ print(f"{context}: Restored training weights")
576
+
577
+ def on_train_batch_end(self, *args, **kwargs):
578
+ if self.use_ema:
579
+ self.control_ema(self.control)
580
+ self.decoder_ema(self.decoder)
581
+
582
+ def compute_loss(self, pred, target):
583
+ # return thf.mse_loss(pred, target, reduction="none").mean(dim=(1, 2, 3))
584
+ lpips_loss = self.lpips_loss(pred, target).mean(dim=[1,2,3])
585
+ pred_yuv = color.rgb_to_yuv((pred + 1) / 2)
586
+ target_yuv = color.rgb_to_yuv((target + 1) / 2)
587
+ yuv_loss = torch.mean((pred_yuv - target_yuv)**2, dim=[2,3])
588
+ yuv_loss = 1.5*torch.mm(yuv_loss, self.yuv_scales).squeeze(1)
589
+ return lpips_loss + yuv_loss
590
+
591
+ def forward(self, x, image, c):
592
+ if self.control.__class__.__name__ == 'SecretEncoder6':
593
+ eps, posterior = self.control(x, c)
594
+ else:
595
+ eps, posterior = self.control(image, c)
596
+ return x + eps, posterior
597
+
598
+ @torch.no_grad()
599
+ def get_input(self, batch, return_first_stage=False, bs=None):
600
+ image = batch[self.first_stage_key]
601
+ control = batch[self.control_key]
602
+ control = self.get_warmup_secret(control)
603
+ if bs is not None:
604
+ image = image[:bs]
605
+ control = control[:bs]
606
+ else:
607
+ bs = image.shape[0]
608
+ # encode image 1st stage
609
+ image = einops.rearrange(image, "b h w c -> b c h w").contiguous()
610
+ x = self.encode_first_stage(image).detach()
611
+ image_rec = self.decode_first_stage(x).detach()
612
+
613
+ # check if using fixed input (early training phase)
614
+ # if self.training and self.fixed_input:
615
+ if self.fixed_input:
616
+ if self.fixed_x is None: # first iteration
617
+ print('[TRAINING] Warmup - using fixed input image for now!')
618
+ self.fixed_x = x.detach().clone()[:bs]
619
+ self.fixed_img = image.detach().clone()[:bs]
620
+ self.fixed_input_recon = image_rec.detach().clone()[:bs]
621
+ self.fixed_control = control.detach().clone()[:bs] # use for log_images with fixed_input option only
622
+ x, image, image_rec = self.fixed_x, self.fixed_img, self.fixed_input_recon
623
+
624
+ out = [x, control]
625
+ if return_first_stage:
626
+ out.extend([image, image_rec])
627
+ return out
628
+
629
+ def decode_first_stage(self, z):
630
+ z = 1./self.scale_factor * z
631
+ image_rec = self.ae.decode(z)
632
+ return image_rec
633
+
634
+ def encode_first_stage(self, image):
635
+ encoder_posterior = self.ae.encode(image)
636
+ if isinstance(encoder_posterior, DiagonalGaussianDistribution):
637
+ z = encoder_posterior.sample()
638
+ elif isinstance(encoder_posterior, torch.Tensor):
639
+ z = encoder_posterior
640
+ else:
641
+ raise NotImplementedError(f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented")
642
+ return self.scale_factor * z
643
+
644
+ def shared_step(self, batch):
645
+ x, c, img, _ = self.get_input(batch, return_first_stage=True)
646
+ # import pdb; pdb.set_trace()
647
+ x, posterior = self(x, img, c)
648
+ image_rec = self.decode_first_stage(x)
649
+ # resize
650
+ if img.shape[-1] > 256:
651
+ img = thf.interpolate(img, size=(256, 256), mode='bilinear', align_corners=False).detach()
652
+ image_rec = thf.interpolate(image_rec, size=(256, 256), mode='bilinear', align_corners=False)
653
+ if hasattr(self, 'noise') and self.noise.is_activated():
654
+ image_rec_noised = self.noise(image_rec, self.global_step, p=0.9)
655
+ else:
656
+ image_rec_noised = self.crop(image_rec) # center crop
657
+ image_rec_noised = torch.clamp(image_rec_noised, -1, 1)
658
+ pred = self.decoder(image_rec_noised)
659
+
660
+ loss, loss_dict = self.loss_layer(img, image_rec, posterior, c, pred, self.global_step)
661
+ bit_acc = loss_dict["bit_acc"]
662
+
663
+ bit_acc_ = bit_acc.item()
664
+
665
+ if (bit_acc_ > 0.98) and (not self.fixed_input) and self.noise.is_activated():
666
+ self.loss_layer.activate_ramp(self.global_step)
667
+
668
+ if (bit_acc_ > 0.95) and (not self.fixed_input): # ramp up image loss at late training stage
669
+ if hasattr(self, 'noise') and (not self.noise.is_activated()):
670
+ self.noise.activate(self.global_step)
671
+
672
+ if (bit_acc_ > 0.9) and self.fixed_input: # execute only once
673
+ print(f'[TRAINING] High bit acc ({bit_acc_}) achieved, switch to full image dataset training.')
674
+ self.fixed_input = ~self.fixed_input
675
+ return loss, loss_dict
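
shared_step drives a three-stage curriculum keyed on the decoder's bit accuracy: leave the fixed-input warmup once accuracy passes 0.9, activate the noise model past 0.95, and start ramping the image loss past 0.98 once noise is active. A condensed sketch of just that schedule (the flag names are illustrative, not the module's actual attributes):

    def curriculum_update(bit_acc, fixed_input, noise_on, ramp_on):
        # Thresholds mirror shared_step above; returns the updated flags.
        if bit_acc > 0.98 and not fixed_input and noise_on:
            ramp_on = True          # ramp up the image-quality loss weight
        if bit_acc > 0.95 and not fixed_input and not noise_on:
            noise_on = True         # switch on the noise/augmentation model
        if bit_acc > 0.9 and fixed_input:
            fixed_input = False     # leave single-batch warmup, train on the full dataset
        return fixed_input, noise_on, ramp_on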
676
+
677
+ def training_step(self, batch, batch_idx):
678
+ loss, loss_dict = self.shared_step(batch)
679
+ loss_dict = {f"train/{key}": val for key, val in loss_dict.items()}
680
+ self.log_dict(loss_dict, prog_bar=True,
681
+ logger=True, on_step=True, on_epoch=True)
682
+
683
+ self.log("global_step", self.global_step,
684
+ prog_bar=True, logger=True, on_step=True, on_epoch=False)
685
+ # if self.use_scheduler:
686
+ # lr = self.optimizers().param_groups[0]['lr']
687
+ # self.log('lr_abs', lr, prog_bar=True, logger=True, on_step=True, on_epoch=False)
688
+
689
+ return loss
690
+
691
+ @torch.no_grad()
692
+ def validation_step(self, batch, batch_idx):
693
+ _, loss_dict_no_ema = self.shared_step(batch)
694
+ loss_dict_no_ema = {f"val/{key}": val for key, val in loss_dict_no_ema.items() if key != 'img_lw'}
695
+ with self.ema_scope():
696
+ _, loss_dict_ema = self.shared_step(batch)
697
+ loss_dict_ema = {'val/' + key + '_ema': loss_dict_ema[key] for key in loss_dict_ema}
698
+ self.log_dict(loss_dict_no_ema, prog_bar=False, logger=True, on_step=False, on_epoch=True)
699
+ self.log_dict(loss_dict_ema, prog_bar=False, logger=True, on_step=False, on_epoch=True)
700
+
701
+ @torch.no_grad()
702
+ def log_images(self, batch, fixed_input=False, **kwargs):
703
+ log = dict()
704
+ if fixed_input and self.fixed_img is not None:
705
+ x, c, img, img_recon = self.fixed_x, self.fixed_control, self.fixed_img, self.fixed_input_recon
706
+ else:
707
+ x, c, img, img_recon = self.get_input(batch, return_first_stage=True)
708
+ x, _ = self(x, img, c)
709
+ image_out = self.decode_first_stage(x)
710
+ if hasattr(self, 'noise') and self.noise.is_activated():
711
+ img_noise = self.noise(image_out, self.global_step, p=1.0)
712
+ log['noised'] = img_noise
713
+ log['input'] = img
714
+ log['output'] = image_out
715
+ log['recon'] = img_recon
716
+ return log
717
+
718
+ def configure_optimizers(self):
719
+ lr = self.learning_rate
720
+ params = list(self.control.parameters()) + list(self.decoder.parameters())
721
+ optimizer = torch.optim.AdamW(params, lr=lr)
722
+ return optimizer
723
+
724
+
725
+
726
+
727
+
cldm/cldm.py ADDED
@@ -0,0 +1,517 @@
1
+ import numpy as np
2
+ import einops
3
+ import torch
4
+ import torch as th
5
+ import torch.nn as nn
6
+ import torchvision
7
+ from ldm.modules.diffusionmodules.util import (
8
+ conv_nd,
9
+ linear,
10
+ zero_module,
11
+ timestep_embedding,
12
+ )
13
+
14
+ from einops import rearrange, repeat
15
+ from torchvision.utils import make_grid
16
+ from ldm.modules.attention import SpatialTransformer
17
+ from ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample, AttentionBlock
18
+ from ldm.models.diffusion.ddpm import LatentDiffusion
19
+ from ldm.util import log_txt_as_img, exists, instantiate_from_config
20
+ from ldm.models.diffusion.ddim import DDIMSampler
21
+
22
+
23
+ class ControlledUnetModel(UNetModel):
24
+ def forward(self, x, timesteps=None, context=None, control=None, only_mid_control=False, **kwargs):
25
+ hs = []
26
+ with torch.no_grad():
27
+ t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
28
+ emb = self.time_embed(t_emb)
29
+ h = x.type(self.dtype)
30
+ for module in self.input_blocks:
31
+ h = module(h, emb, context)
32
+ hs.append(h)
33
+ h = self.middle_block(h, emb, context)
34
+
35
+ h += control.pop()
36
+
37
+ for i, module in enumerate(self.output_blocks):
38
+ if only_mid_control:
39
+ h = torch.cat([h, hs.pop()], dim=1)
40
+ else:
41
+ h = torch.cat([h, hs.pop() + control.pop()], dim=1)
42
+ h = module(h, emb, context)
43
+
44
+ h = h.type(x.dtype)
45
+ return self.out(h)
46
+
47
+ class View(nn.Module):
48
+ def __init__(self, *shape):
49
+ super().__init__()
50
+ self.shape = shape
51
+
52
+ def forward(self, x):
53
+ return x.view(*self.shape)
54
+
55
+ class ControlNet(nn.Module):
56
+ def __init__(
57
+ self,
58
+ image_size,
59
+ in_channels,
60
+ model_channels,
61
+ hint_channels,
62
+ num_res_blocks,
63
+ attention_resolutions,
64
+ dropout=0,
65
+ channel_mult=(1, 2, 4, 8),
66
+ conv_resample=True,
67
+ dims=2,
68
+ use_checkpoint=False,
69
+ use_fp16=False,
70
+ num_heads=-1,
71
+ num_head_channels=-1,
72
+ num_heads_upsample=-1,
73
+ use_scale_shift_norm=False,
74
+ resblock_updown=False,
75
+ use_new_attention_order=False,
76
+ use_spatial_transformer=False, # custom transformer support
77
+ transformer_depth=1, # custom transformer support
78
+ context_dim=None, # custom transformer support
79
+ n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model
80
+ legacy=True,
81
+ disable_self_attentions=None,
82
+ num_attention_blocks=None,
83
+ disable_middle_self_attn=False,
84
+ use_linear_in_transformer=False,
85
+ secret_len = 0,
86
+ ):
87
+ super().__init__()
88
+ if use_spatial_transformer:
89
+ assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
90
+
91
+ if context_dim is not None:
92
+ assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
93
+ from omegaconf.listconfig import ListConfig
94
+ if type(context_dim) == ListConfig:
95
+ context_dim = list(context_dim)
96
+
97
+ if num_heads_upsample == -1:
98
+ num_heads_upsample = num_heads
99
+
100
+ if num_heads == -1:
101
+ assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
102
+
103
+ if num_head_channels == -1:
104
+ assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
105
+
106
+ self.dims = dims
107
+ self.image_size = image_size
108
+ self.in_channels = in_channels
109
+ self.model_channels = model_channels
110
+ if isinstance(num_res_blocks, int):
111
+ self.num_res_blocks = len(channel_mult) * [num_res_blocks]
112
+ else:
113
+ if len(num_res_blocks) != len(channel_mult):
114
+ raise ValueError("provide num_res_blocks either as an int (globally constant) or "
115
+ "as a list/tuple (per-level) with the same length as channel_mult")
116
+ self.num_res_blocks = num_res_blocks
117
+ if disable_self_attentions is not None:
118
+ # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
119
+ assert len(disable_self_attentions) == len(channel_mult)
120
+ if num_attention_blocks is not None:
121
+ assert len(num_attention_blocks) == len(self.num_res_blocks)
122
+ assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks))))
123
+ print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
124
+ f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
125
+ f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
126
+ f"attention will still not be set.")
127
+
128
+ self.attention_resolutions = attention_resolutions
129
+ self.dropout = dropout
130
+ self.channel_mult = channel_mult
131
+ self.conv_resample = conv_resample
132
+ self.use_checkpoint = use_checkpoint
133
+ self.dtype = th.float16 if use_fp16 else th.float32
134
+ self.num_heads = num_heads
135
+ self.num_head_channels = num_head_channels
136
+ self.num_heads_upsample = num_heads_upsample
137
+ self.predict_codebook_ids = n_embed is not None
138
+
139
+ time_embed_dim = model_channels * 4
140
+ self.time_embed = nn.Sequential(
141
+ linear(model_channels, time_embed_dim),
142
+ nn.SiLU(),
143
+ linear(time_embed_dim, time_embed_dim),
144
+ )
145
+
146
+ self.input_blocks = nn.ModuleList(
147
+ [
148
+ TimestepEmbedSequential(
149
+ conv_nd(dims, in_channels, model_channels, 3, padding=1)
150
+ )
151
+ ]
152
+ )
153
+ self.zero_convs = nn.ModuleList([self.make_zero_conv(model_channels)])
154
+ self.secret_len = secret_len
155
+ if secret_len > 0:
156
+ log_resolution = int(np.log2(64))
157
+ self.input_hint_block = TimestepEmbedSequential(
158
+ nn.Linear(secret_len, 16*16*4),
159
+ nn.SiLU(),
160
+ View(-1, 4, 16, 16),
161
+ nn.Upsample(scale_factor=(2**(log_resolution-4), 2**(log_resolution-4))),
162
+ conv_nd(dims, 4, 64, 3, padding=1),
163
+ nn.SiLU(),
164
+ conv_nd(dims, 64, 256, 3, padding=1),
165
+ nn.SiLU(),
166
+ zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))
167
+ )
168
+ else:
169
+ self.input_hint_block = TimestepEmbedSequential(
170
+ conv_nd(dims, hint_channels, 16, 3, padding=1),
171
+ nn.SiLU(),
172
+ conv_nd(dims, 16, 16, 3, padding=1),
173
+ nn.SiLU(),
174
+ conv_nd(dims, 16, 32, 3, padding=1, stride=2),
175
+ nn.SiLU(),
176
+ conv_nd(dims, 32, 32, 3, padding=1),
177
+ nn.SiLU(),
178
+ conv_nd(dims, 32, 96, 3, padding=1, stride=2),
179
+ nn.SiLU(),
180
+ conv_nd(dims, 96, 96, 3, padding=1),
181
+ nn.SiLU(),
182
+ conv_nd(dims, 96, 256, 3, padding=1, stride=2),
183
+ nn.SiLU(),
184
+ zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))
185
+ )
186
+
187
+ self._feature_size = model_channels
188
+ input_block_chans = [model_channels]
189
+ ch = model_channels
190
+ ds = 1
191
+ for level, mult in enumerate(channel_mult):
192
+ for nr in range(self.num_res_blocks[level]):
193
+ layers = [
194
+ ResBlock(
195
+ ch,
196
+ time_embed_dim,
197
+ dropout,
198
+ out_channels=mult * model_channels,
199
+ dims=dims,
200
+ use_checkpoint=use_checkpoint,
201
+ use_scale_shift_norm=use_scale_shift_norm,
202
+ )
203
+ ]
204
+ ch = mult * model_channels
205
+ if ds in attention_resolutions:
206
+ if num_head_channels == -1:
207
+ dim_head = ch // num_heads
208
+ else:
209
+ num_heads = ch // num_head_channels
210
+ dim_head = num_head_channels
211
+ if legacy:
212
+ #num_heads = 1
213
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
214
+ if exists(disable_self_attentions):
215
+ disabled_sa = disable_self_attentions[level]
216
+ else:
217
+ disabled_sa = False
218
+
219
+ if not exists(num_attention_blocks) or nr < num_attention_blocks[level]:
220
+ layers.append(
221
+ AttentionBlock(
222
+ ch,
223
+ use_checkpoint=use_checkpoint,
224
+ num_heads=num_heads,
225
+ num_head_channels=dim_head,
226
+ use_new_attention_order=use_new_attention_order,
227
+ ) if not use_spatial_transformer else SpatialTransformer(
228
+ ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
229
+ disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
230
+ use_checkpoint=use_checkpoint
231
+ )
232
+ )
233
+ self.input_blocks.append(TimestepEmbedSequential(*layers))
234
+ self.zero_convs.append(self.make_zero_conv(ch))
235
+ self._feature_size += ch
236
+ input_block_chans.append(ch)
237
+ if level != len(channel_mult) - 1:
238
+ out_ch = ch
239
+ self.input_blocks.append(
240
+ TimestepEmbedSequential(
241
+ ResBlock(
242
+ ch,
243
+ time_embed_dim,
244
+ dropout,
245
+ out_channels=out_ch,
246
+ dims=dims,
247
+ use_checkpoint=use_checkpoint,
248
+ use_scale_shift_norm=use_scale_shift_norm,
249
+ down=True,
250
+ )
251
+ if resblock_updown
252
+ else Downsample(
253
+ ch, conv_resample, dims=dims, out_channels=out_ch
254
+ )
255
+ )
256
+ )
257
+ ch = out_ch
258
+ input_block_chans.append(ch)
259
+ self.zero_convs.append(self.make_zero_conv(ch))
260
+ ds *= 2
261
+ self._feature_size += ch
262
+
263
+ if num_head_channels == -1:
264
+ dim_head = ch // num_heads
265
+ else:
266
+ num_heads = ch // num_head_channels
267
+ dim_head = num_head_channels
268
+ if legacy:
269
+ #num_heads = 1
270
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
271
+ self.middle_block = TimestepEmbedSequential(
272
+ ResBlock(
273
+ ch,
274
+ time_embed_dim,
275
+ dropout,
276
+ dims=dims,
277
+ use_checkpoint=use_checkpoint,
278
+ use_scale_shift_norm=use_scale_shift_norm,
279
+ ),
280
+ AttentionBlock(
281
+ ch,
282
+ use_checkpoint=use_checkpoint,
283
+ num_heads=num_heads,
284
+ num_head_channels=dim_head,
285
+ use_new_attention_order=use_new_attention_order,
286
+ ) if not use_spatial_transformer else SpatialTransformer( # always uses a self-attn
287
+ ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
288
+ disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
289
+ use_checkpoint=use_checkpoint
290
+ ),
291
+ ResBlock(
292
+ ch,
293
+ time_embed_dim,
294
+ dropout,
295
+ dims=dims,
296
+ use_checkpoint=use_checkpoint,
297
+ use_scale_shift_norm=use_scale_shift_norm,
298
+ ),
299
+ )
300
+ self.middle_block_out = self.make_zero_conv(ch)
301
+ self._feature_size += ch
302
+
303
+ def make_zero_conv(self, channels):
304
+ return TimestepEmbedSequential(zero_module(conv_nd(self.dims, channels, channels, 1, padding=0)))
305
+
306
+ def forward(self, x, hint, timesteps, context, **kwargs):
307
+ t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
308
+ emb = self.time_embed(t_emb)
309
+ # import pdb; pdb.set_trace()
310
+ guided_hint = self.input_hint_block(hint, emb, context)
311
+
312
+ outs = []
313
+
314
+ h = x.type(self.dtype)
315
+ for module, zero_conv in zip(self.input_blocks, self.zero_convs):
316
+ if guided_hint is not None:
317
+ h = module(h, emb, context)
318
+ h += guided_hint
319
+ guided_hint = None
320
+ else:
321
+ h = module(h, emb, context)
322
+ outs.append(zero_conv(h, emb, context))
323
+
324
+ h = self.middle_block(h, emb, context)
325
+ outs.append(self.middle_block_out(h, emb, context))
326
+
327
+ return outs
328
+
329
+
330
+ class SecretDecoder(nn.Module):
331
+ def __init__(self, arch='CNN', act='ReLU', norm='none', resolution=256, in_channels=3, secret_len=100):
332
+ super().__init__()
333
+ self.resolution = resolution
334
+ self.arch = arch
335
+ print(f'SecretDecoder arch: {arch}')
336
+ def activation(name = 'ReLU'):
337
+ if name == 'ReLU':
338
+ return nn.ReLU()
339
+ elif name == 'LeakyReLU':
340
+ return nn.LeakyReLU()
341
+ elif name == 'SiLU':
342
+ return nn.SiLU()
343
+
344
+ def normalisation(name, n):
345
+ if name == 'none':
346
+ return nn.Identity()
347
+ elif name == 'BatchNorm2D':
348
+ return nn.BatchNorm2d(n)
349
+ elif name == 'BatchNorm1d':
350
+ return nn.BatchNorm1d(n)
351
+ elif name == 'LayerNorm':
352
+ return nn.LayerNorm(n)
353
+
354
+ if arch=='CNN':
355
+ self.decoder = nn.Sequential(
356
+ nn.Conv2d(in_channels, 32, (3, 3), 2, 1), # 128
357
+ activation(act),
358
+ nn.Conv2d(32, 32, 3, 1, 1),
359
+ activation(act),
360
+ nn.Conv2d(32, 64, 3, 2, 1), # 64
361
+ activation(act),
362
+ nn.Conv2d(64, 64, 3, 1, 1),
363
+ activation(act),
364
+ nn.Conv2d(64, 64, 3, 2, 1), # 32
365
+ activation(act),
366
+ nn.Conv2d(64, 128, 3, 2, 1), # 16
367
+ activation(act),
368
+ nn.Conv2d(128, 128, (3, 3), 2, 1), # 8
369
+ activation(act),
370
+ )
371
+ self.dense = nn.Sequential(
372
+ nn.Linear(resolution * resolution * 128 // 32 // 32, 512),
373
+ activation(act),
374
+ nn.Linear(512, secret_len)
375
+ )
376
+ elif arch == 'resnet50':
377
+ self.decoder = torchvision.models.resnet50(pretrained=True, progress=False)
378
+ self.decoder.fc = nn.Linear(self.decoder.fc.in_features, secret_len)
379
+ else:
380
+ raise NotImplementedError
381
+
382
+ def forward(self, image):
383
+ x = self.decoder(image)
384
+ if self.arch == 'CNN':
385
+ x = x.view(-1, self.resolution * self.resolution * 128 // 32 // 32)
386
+ x = self.dense(x)
387
+ return x
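
The dense head above assumes the CNN branch shrinks a resolution x resolution input by a factor of 32 (five stride-2 convolutions), which is where the resolution * resolution * 128 // 32 // 32 flatten size comes from. A quick check of that arithmetic for the default 256x256 input:

    resolution = 256
    stride2_convs = 5                        # stride-2 layers in the CNN branch above
    feat = resolution // (2 ** stride2_convs)
    print(feat, feat * feat * 128)           # 8 8192  (== 256 * 256 * 128 // 32 // 32)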
388
+
389
+
390
+ class ControlLDM(LatentDiffusion):
391
+
392
+ def __init__(self, control_stage_config, control_key, only_mid_control, secret_decoder_config, *args, **kwargs):
393
+ super().__init__(*args, **kwargs)
394
+ self.control_model = instantiate_from_config(control_stage_config)
395
+ self.control_key = control_key
396
+ self.only_mid_control = only_mid_control
397
+ if secret_decoder_config != 'none':
398
+ self.secret_decoder = instantiate_from_config(secret_decoder_config)
399
+
400
+ @torch.no_grad()
401
+ def get_input(self, batch, k, bs=None, *args, **kwargs):
402
+ x, c = super().get_input(batch, self.first_stage_key, *args, **kwargs)
403
+ control = batch[self.control_key]
404
+ if bs is not None:
405
+ control = control[:bs]
406
+ control = control.to(self.device)
407
+ if self.control_key == 'hint':
408
+ control = einops.rearrange(control, 'b h w c -> b c h w')
409
+ control = control.to(memory_format=torch.contiguous_format).float()
410
+ return x, dict(c_crossattn=[c], c_concat=[control])
411
+
412
+ def apply_model(self, x_noisy, t, cond, *args, **kwargs):
413
+ assert isinstance(cond, dict)
414
+ diffusion_model = self.model.diffusion_model
415
+ cond_txt = torch.cat(cond['c_crossattn'], 1)
416
+ cond_hint = torch.cat(cond['c_concat'], 1)
417
+
418
+ control = self.control_model(x=x_noisy, hint=cond_hint, timesteps=t, context=cond_txt)
419
+ eps = diffusion_model(x=x_noisy, timesteps=t, context=cond_txt, control=control, only_mid_control=self.only_mid_control)
420
+
421
+ return eps
422
+
423
+ @torch.no_grad()
424
+ def get_unconditional_conditioning(self, N):
425
+ return self.get_learned_conditioning([""] * N)
426
+
427
+ @torch.no_grad()
428
+ def log_images(self, batch, N=4, n_row=2, sample=False, ddim_steps=50, ddim_eta=0.0, return_keys=None,
429
+ quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
430
+ plot_diffusion_rows=False, unconditional_guidance_scale=9.0, unconditional_guidance_label=None,
431
+ use_ema_scope=True,
432
+ **kwargs):
433
+ use_ddim = ddim_steps is not None
434
+
435
+ log = dict()
436
+ z, c = self.get_input(batch, self.first_stage_key, bs=N)
437
+ c_cat, c = c["c_concat"][0][:N], c["c_crossattn"][0][:N]
438
+ N = min(z.shape[0], N)
439
+ n_row = min(z.shape[0], n_row)
440
+ log["reconstruction"] = self.decode_first_stage(z)
441
+ log["control"] = c_cat * 2.0 - 1.0
442
+ log["conditioning"] = log_txt_as_img((512, 512), batch[self.cond_stage_key], size=16)
443
+
444
+ if plot_diffusion_rows:
445
+ # get diffusion row
446
+ diffusion_row = list()
447
+ z_start = z[:n_row]
448
+ for t in range(self.num_timesteps):
449
+ if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
450
+ t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
451
+ t = t.to(self.device).long()
452
+ noise = torch.randn_like(z_start)
453
+ z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
454
+ diffusion_row.append(self.decode_first_stage(z_noisy))
455
+
456
+ diffusion_row = torch.stack(diffusion_row) # n_log_step, n_row, C, H, W
457
+ diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
458
+ diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
459
+ diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
460
+ log["diffusion_row"] = diffusion_grid
461
+
462
+ if sample:
463
+ # get denoise row
464
+ samples, z_denoise_row = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
465
+ batch_size=N, ddim=use_ddim,
466
+ ddim_steps=ddim_steps, eta=ddim_eta)
467
+ x_samples = self.decode_first_stage(samples)
468
+ log["samples"] = x_samples
469
+ if plot_denoise_rows:
470
+ denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
471
+ log["denoise_row"] = denoise_grid
472
+ # import pudb; pudb.set_trace()
473
+ if unconditional_guidance_scale > 1.0:
474
+ uc_cross = self.get_unconditional_conditioning(N)
475
+ uc_cat = c_cat # torch.zeros_like(c_cat)
476
+ uc_full = {"c_concat": [uc_cat], "c_crossattn": [uc_cross]}
477
+ samples_cfg, _ = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
478
+ batch_size=N, ddim=use_ddim,
479
+ ddim_steps=ddim_steps, eta=ddim_eta,
480
+ unconditional_guidance_scale=unconditional_guidance_scale,
481
+ unconditional_conditioning=uc_full,
482
+ )
483
+ x_samples_cfg = self.decode_first_stage(samples_cfg)
484
+ log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg
485
+
486
+ return log
487
+
488
+ @torch.no_grad()
489
+ def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs):
490
+ ddim_sampler = DDIMSampler(self)
491
+ # import pdb; pdb.set_trace()
492
+ # b, c, h, w = cond["c_concat"][0].shape
493
+ b, c, h, w = cond["c_concat"][0].shape[0], self.channels, self.image_size*8, self.image_size*8
494
+ shape = (self.channels, h // 8, w // 8)
495
+ samples, intermediates = ddim_sampler.sample(ddim_steps, batch_size, shape, cond, verbose=False, **kwargs)
496
+ return samples, intermediates
497
+
498
+ def configure_optimizers(self):
499
+ lr = self.learning_rate
500
+ params = list(self.control_model.parameters())
501
+ if not self.sd_locked:
502
+ params += list(self.model.diffusion_model.output_blocks.parameters())
503
+ params += list(self.model.diffusion_model.out.parameters())
504
+ opt = torch.optim.AdamW(params, lr=lr)
505
+ return opt
506
+
507
+ def low_vram_shift(self, is_diffusing):
508
+ if is_diffusing:
509
+ self.model = self.model.cuda()
510
+ self.control_model = self.control_model.cuda()
511
+ self.first_stage_model = self.first_stage_model.cpu()
512
+ self.cond_stage_model = self.cond_stage_model.cpu()
513
+ else:
514
+ self.model = self.model.cpu()
515
+ self.control_model = self.control_model.cpu()
516
+ self.first_stage_model = self.first_stage_model.cuda()
517
+ self.cond_stage_model = self.cond_stage_model.cuda()
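The piece worth noting in ControlLDM.apply_model above is how the control branch's outputs are spliced into the frozen diffusion UNet: the control model returns one zero-conv residual per encoder stage plus one for the middle block, and the UNet consumes them in reverse with the same hs.pop() + control.pop() pairing. A purely illustrative toy of that pattern (all shapes are invented, not the repo's real dimensions):

import torch

# one zero-conv residual per encoder stage, with the middle-block residual appended last
control = [torch.zeros(1, 4, 32, 32), torch.zeros(1, 4, 16, 16),
           torch.zeros(1, 4, 8, 8),      # deepest encoder stage
           torch.zeros(1, 4, 8, 8)]      # middle-block residual
skips = [torch.randn(1, 4, 32, 32), torch.randn(1, 4, 16, 16), torch.randn(1, 4, 8, 8)]

h = torch.randn(1, 4, 8, 8) + control.pop()   # middle residual is consumed first
while skips:
    fused = skips.pop() + control.pop()       # deepest remaining skip pairs with its residual
    # `fused` is what gets concatenated with h and run through the next decoder block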
cldm/diffsteg.py ADDED
@@ -0,0 +1,782 @@
1
+ import numpy as np
2
+ import einops
3
+ import torch
4
+ import torch as th
5
+ import torch.nn as nn
6
+ from torch.nn import functional as thf
7
+ import torchvision
8
+ from ldm.modules.diffusionmodules.util import (
9
+ conv_nd,
10
+ linear,
11
+ zero_module,
12
+ timestep_embedding,
13
+ )
14
+
15
+ from einops import rearrange, repeat
16
+ from torchvision.utils import make_grid
17
+ from ldm.modules.attention import SpatialTransformer
18
+ from ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample, AttentionBlock
19
+ from ldm.models.diffusion.ddpm import LatentDiffusion
20
+ from ldm.util import log_txt_as_img, exists, instantiate_from_config, default
21
+ from ldm.models.diffusion.ddim import DDIMSampler
22
+
23
+
24
+ # class CUNetModel(nn.Module):
25
+ # def forward(self, x, timesteps=None, context=None, control=None, only_mid_control=False, **kwargs):
26
+ # hs = []
27
+ # with torch.no_grad():
28
+ # t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
29
+ # emb = self.time_embed(t_emb)
30
+
31
+ # h = x.type(self.dtype)
32
+ # for module in self.input_blocks:
33
+ # h = module(h, emb, context)
34
+ # hs.append(h)
35
+
36
+ # h = self.middle_block(h, emb, context)
37
+ # h += control.pop(0)
38
+ # for module in self.output_blocks:
39
+ # if only_mid_control:
40
+ # h = th.cat([h, hs.pop()], dim=1)
41
+ # else:
42
+ # h = torch.cat([h, hs.pop() + control.pop(0)], dim=1)
43
+ # h = module(h, emb, context)
44
+ # h = h.type(x.dtype)
45
+ # return self.out(h)
46
+
47
+ class SecretNet(nn.Module):
48
+ def __init__(
49
+ self,
50
+ image_size,
51
+ in_channels,
52
+ model_channels,
53
+ hint_channels,
54
+ num_res_blocks,
55
+ attention_resolutions,
56
+ dropout=0,
57
+ channel_mult=(1, 2, 4, 8),
58
+ conv_resample=True,
59
+ dims=2,
60
+ use_checkpoint=False,
61
+ use_fp16=False,
62
+ num_heads=-1,
63
+ num_head_channels=-1,
64
+ num_heads_upsample=-1,
65
+ use_scale_shift_norm=False,
66
+ resblock_updown=False,
67
+ use_new_attention_order=False,
68
+ use_spatial_transformer=False, # custom transformer support
69
+ transformer_depth=1, # custom transformer support
70
+ context_dim=None, # custom transformer support
71
+ n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model
72
+ legacy=True,
73
+ disable_self_attentions=None,
74
+ num_attention_blocks=None,
75
+ disable_middle_self_attn=False,
76
+ use_linear_in_transformer=False,
77
+ secret_len = 0,
78
+ ):
79
+ super().__init__()
80
+ if use_spatial_transformer:
81
+ assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
82
+
83
+ if context_dim is not None:
84
+ assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
85
+ from omegaconf.listconfig import ListConfig
86
+ if type(context_dim) == ListConfig:
87
+ context_dim = list(context_dim)
88
+
89
+ if num_heads_upsample == -1:
90
+ num_heads_upsample = num_heads
91
+
92
+ if num_heads == -1:
93
+ assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
94
+
95
+ if num_head_channels == -1:
96
+ assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
97
+
98
+ self.dims = dims
99
+ self.image_size = image_size
100
+ self.in_channels = in_channels
101
+ self.model_channels = model_channels
102
+ if isinstance(num_res_blocks, int):
103
+ self.num_res_blocks = len(channel_mult) * [num_res_blocks]
104
+ else:
105
+ if len(num_res_blocks) != len(channel_mult):
106
+ raise ValueError("provide num_res_blocks either as an int (globally constant) or "
107
+ "as a list/tuple (per-level) with the same length as channel_mult")
108
+ self.num_res_blocks = num_res_blocks
109
+ if disable_self_attentions is not None:
110
+ # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
111
+ assert len(disable_self_attentions) == len(channel_mult)
112
+ if num_attention_blocks is not None:
113
+ assert len(num_attention_blocks) == len(self.num_res_blocks)
114
+ assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks))))
115
+ print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
116
+ f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
117
+ f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
118
+ f"attention will still not be set.")
119
+
120
+ self.attention_resolutions = attention_resolutions
121
+ self.dropout = dropout
122
+ self.channel_mult = channel_mult
123
+ self.conv_resample = conv_resample
124
+ self.use_checkpoint = use_checkpoint
125
+ self.dtype = th.float16 if use_fp16 else th.float32
126
+ self.num_heads = num_heads
127
+ self.num_head_channels = num_head_channels
128
+ self.num_heads_upsample = num_heads_upsample
129
+ self.predict_codebook_ids = n_embed is not None
130
+
131
+ time_embed_dim = model_channels * 4
132
+ self.time_embed = nn.Sequential(
133
+ linear(model_channels, time_embed_dim),
134
+ nn.SiLU(),
135
+ linear(time_embed_dim, time_embed_dim),
136
+ )
137
+
138
+ # self.input_blocks = nn.ModuleList(
139
+ # [
140
+ # TimestepEmbedSequential(
141
+ # conv_nd(dims, in_channels, model_channels, 3, padding=1)
142
+ # )
143
+ # ]
144
+ # )
145
+ self.zero_convs = nn.ModuleList([self.make_zero_conv(model_channels)])
146
+ self.secret_len = secret_len
147
+ if secret_len > 0: # TODO: update for dec
148
+ log_resolution = int(np.log2(64))
149
+ self.input_hint_block = TimestepEmbedSequential(
150
+ nn.Linear(secret_len, 16*16*4),
151
+ nn.SiLU(),
152
+ View(-1, 4, 16, 16),
153
+ nn.Upsample(scale_factor=(2**(log_resolution-4), 2**(log_resolution-4))),
154
+ conv_nd(dims, 4, 64, 3, padding=1),
155
+ nn.SiLU(),
156
+ conv_nd(dims, 64, 256, 3, padding=1),
157
+ nn.SiLU(),
158
+ zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))
159
+ )
160
+
161
+ self._feature_size = model_channels
162
+ input_block_chans = [model_channels]
163
+ ch = model_channels
164
+ ds = 1
165
+ for level, mult in enumerate(channel_mult):
166
+ for nr in range(self.num_res_blocks[level]):
167
+ layers = []
168
+ ch = mult * model_channels
169
+ if ds in attention_resolutions:
170
+ if num_head_channels == -1:
171
+ dim_head = ch // num_heads
172
+ else:
173
+ num_heads = ch // num_head_channels
174
+ dim_head = num_head_channels
175
+ if legacy:
176
+ #num_heads = 1
177
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
178
+ if exists(disable_self_attentions):
179
+ disabled_sa = disable_self_attentions[level]
180
+ else:
181
+ disabled_sa = False
182
+
183
+ if not exists(num_attention_blocks) or nr < num_attention_blocks[level]:
184
+ layers.append(0)
185
+ # self.input_blocks.append(TimestepEmbedSequential(*layers))
186
+ # self.zero_convs.append(self.make_zero_conv(ch))
187
+ self._feature_size += ch
188
+ input_block_chans.append(ch)
189
+ if level != len(channel_mult) - 1:
190
+ out_ch = ch
191
+ self.input_blocks.append(
192
+ 0
193
+ )
194
+ ch = out_ch
195
+ input_block_chans.append(ch)
196
+ # self.zero_convs.append(self.make_zero_conv(ch))
197
+ ds *= 2
198
+ self._feature_size += ch
199
+
200
+ if num_head_channels == -1:
201
+ dim_head = ch // num_heads
202
+ else:
203
+ num_heads = ch // num_head_channels
204
+ dim_head = num_head_channels
205
+ if legacy:
206
+ #num_heads = 1
207
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
208
+ self.middle_block = TimestepEmbedSequential(
209
+ ResBlock(
210
+ ch,
211
+ time_embed_dim,
212
+ dropout,
213
+ dims=dims,
214
+ use_checkpoint=use_checkpoint,
215
+ use_scale_shift_norm=use_scale_shift_norm,
216
+ ),
217
+ AttentionBlock(
218
+ ch,
219
+ use_checkpoint=use_checkpoint,
220
+ num_heads=num_heads,
221
+ num_head_channels=dim_head,
222
+ use_new_attention_order=use_new_attention_order,
223
+ ) if not use_spatial_transformer else SpatialTransformer( # always uses a self-attn
224
+ ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
225
+ disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
226
+ use_checkpoint=use_checkpoint
227
+ ),
228
+ ResBlock(
229
+ ch,
230
+ time_embed_dim,
231
+ dropout,
232
+ dims=dims,
233
+ use_checkpoint=use_checkpoint,
234
+ use_scale_shift_norm=use_scale_shift_norm,
235
+ ),
236
+ )
237
+ self.middle_block_out = self.make_zero_conv(ch)
238
+ self._feature_size += ch
239
+
240
+ def make_zero_conv(self, channels):
241
+ return TimestepEmbedSequential(zero_module(conv_nd(self.dims, channels, channels, 1, padding=0)))
242
+
243
+ def forward(self, x, hint, timesteps, context, **kwargs):
244
+ t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
245
+ emb = self.time_embed(t_emb)
246
+ guided_hint = self.input_hint_block(hint, emb, context)
247
+ # import pdb; pdb.set_trace()
248
+ outs = []
249
+
250
+ h = x.type(self.dtype)
251
+ for module, zero_conv in zip(self.input_blocks, self.zero_convs):
252
+ if guided_hint is not None:
253
+ h = module(h, emb, context)
254
+ h += guided_hint
255
+ guided_hint = None
256
+ else:
257
+ h = module(h, emb, context)
258
+ outs.append(zero_conv(h, emb, context))
259
+
260
+ h = self.middle_block(h, emb, context)
261
+ outs.append(self.middle_block_out(h, emb, context))
262
+
263
+ return outs
264
+
265
+ class ControlledUnetModel(UNetModel):
266
+ def forward(self, x, timesteps=None, context=None, control=None, only_mid_control=False, **kwargs):
267
+ hs = []
268
+ with torch.no_grad():
269
+ t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
270
+ emb = self.time_embed(t_emb)
271
+ h = x.type(self.dtype)
272
+ for module in self.input_blocks:
273
+ h = module(h, emb, context)
274
+ hs.append(h)
275
+ h = self.middle_block(h, emb, context)
276
+
277
+ h += control.pop()
278
+
279
+ for i, module in enumerate(self.output_blocks):
280
+ if only_mid_control:
281
+ h = torch.cat([h, hs.pop()], dim=1)
282
+ else:
283
+ h = torch.cat([h, hs.pop() + control.pop()], dim=1)
284
+ h = module(h, emb, context)
285
+
286
+ h = h.type(x.dtype)
287
+ return self.out(h)
288
+
289
+ class View(nn.Module):
290
+ def __init__(self, *shape):
291
+ super().__init__()
292
+ self.shape = shape
293
+
294
+ def forward(self, x):
295
+ return x.view(*self.shape)
296
+
297
+ class ControlNet(nn.Module):
298
+ def __init__(
299
+ self,
300
+ image_size,
301
+ in_channels,
302
+ model_channels,
303
+ hint_channels,
304
+ num_res_blocks,
305
+ attention_resolutions,
306
+ dropout=0,
307
+ channel_mult=(1, 2, 4, 8),
308
+ conv_resample=True,
309
+ dims=2,
310
+ use_checkpoint=False,
311
+ use_fp16=False,
312
+ num_heads=-1,
313
+ num_head_channels=-1,
314
+ num_heads_upsample=-1,
315
+ use_scale_shift_norm=False,
316
+ resblock_updown=False,
317
+ use_new_attention_order=False,
318
+ use_spatial_transformer=False, # custom transformer support
319
+ transformer_depth=1, # custom transformer support
320
+ context_dim=None, # custom transformer support
321
+ n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model
322
+ legacy=True,
323
+ disable_self_attentions=None,
324
+ num_attention_blocks=None,
325
+ disable_middle_self_attn=False,
326
+ use_linear_in_transformer=False,
327
+ secret_len = 0,
328
+ ):
329
+ super().__init__()
330
+ if use_spatial_transformer:
331
+ assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
332
+
333
+ if context_dim is not None:
334
+ assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
335
+ from omegaconf.listconfig import ListConfig
336
+ if type(context_dim) == ListConfig:
337
+ context_dim = list(context_dim)
338
+
339
+ if num_heads_upsample == -1:
340
+ num_heads_upsample = num_heads
341
+
342
+ if num_heads == -1:
343
+ assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
344
+
345
+ if num_head_channels == -1:
346
+ assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
347
+
348
+ self.dims = dims
349
+ self.image_size = image_size
350
+ self.in_channels = in_channels
351
+ self.model_channels = model_channels
352
+ if isinstance(num_res_blocks, int):
353
+ self.num_res_blocks = len(channel_mult) * [num_res_blocks]
354
+ else:
355
+ if len(num_res_blocks) != len(channel_mult):
356
+ raise ValueError("provide num_res_blocks either as an int (globally constant) or "
357
+ "as a list/tuple (per-level) with the same length as channel_mult")
358
+ self.num_res_blocks = num_res_blocks
359
+ if disable_self_attentions is not None:
360
+ # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
361
+ assert len(disable_self_attentions) == len(channel_mult)
362
+ if num_attention_blocks is not None:
363
+ assert len(num_attention_blocks) == len(self.num_res_blocks)
364
+ assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks))))
365
+ print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
366
+ f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
367
+ f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
368
+ f"attention will still not be set.")
369
+
370
+ self.attention_resolutions = attention_resolutions
371
+ self.dropout = dropout
372
+ self.channel_mult = channel_mult
373
+ self.conv_resample = conv_resample
374
+ self.use_checkpoint = use_checkpoint
375
+ self.dtype = th.float16 if use_fp16 else th.float32
376
+ self.num_heads = num_heads
377
+ self.num_head_channels = num_head_channels
378
+ self.num_heads_upsample = num_heads_upsample
379
+ self.predict_codebook_ids = n_embed is not None
380
+
381
+ time_embed_dim = model_channels * 4
382
+ self.time_embed = nn.Sequential(
383
+ linear(model_channels, time_embed_dim),
384
+ nn.SiLU(),
385
+ linear(time_embed_dim, time_embed_dim),
386
+ )
387
+
388
+ self.input_blocks = nn.ModuleList(
389
+ [
390
+ TimestepEmbedSequential(
391
+ conv_nd(dims, in_channels, model_channels, 3, padding=1)
392
+ )
393
+ ]
394
+ )
395
+ self.zero_convs = nn.ModuleList([self.make_zero_conv(model_channels)])
396
+ self.secret_len = secret_len
397
+ if secret_len > 0:
398
+ log_resolution = int(np.log2(64))
399
+ self.input_hint_block = TimestepEmbedSequential(
400
+ nn.Linear(secret_len, 16*16*4),
401
+ nn.SiLU(),
402
+ View(-1, 4, 16, 16),
403
+ nn.Upsample(scale_factor=(2**(log_resolution-4), 2**(log_resolution-4))),
404
+ conv_nd(dims, 4, 64, 3, padding=1),
405
+ nn.SiLU(),
406
+ conv_nd(dims, 64, 256, 3, padding=1),
407
+ nn.SiLU(),
408
+ zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))
409
+ )
410
+ else:
411
+ self.input_hint_block = TimestepEmbedSequential(
412
+ conv_nd(dims, hint_channels, 16, 3, padding=1),
413
+ nn.SiLU(),
414
+ conv_nd(dims, 16, 16, 3, padding=1),
415
+ nn.SiLU(),
416
+ conv_nd(dims, 16, 32, 3, padding=1, stride=2),
417
+ nn.SiLU(),
418
+ conv_nd(dims, 32, 32, 3, padding=1),
419
+ nn.SiLU(),
420
+ conv_nd(dims, 32, 96, 3, padding=1, stride=2),
421
+ nn.SiLU(),
422
+ conv_nd(dims, 96, 96, 3, padding=1),
423
+ nn.SiLU(),
424
+ conv_nd(dims, 96, 256, 3, padding=1, stride=2),
425
+ nn.SiLU(),
426
+ zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))
427
+ )
428
+
429
+ self._feature_size = model_channels
430
+ input_block_chans = [model_channels]
431
+ ch = model_channels
432
+ ds = 1
433
+ for level, mult in enumerate(channel_mult):
434
+ for nr in range(self.num_res_blocks[level]):
435
+ layers = [
436
+ ResBlock(
437
+ ch,
438
+ time_embed_dim,
439
+ dropout,
440
+ out_channels=mult * model_channels,
441
+ dims=dims,
442
+ use_checkpoint=use_checkpoint,
443
+ use_scale_shift_norm=use_scale_shift_norm,
444
+ )
445
+ ]
446
+ ch = mult * model_channels
447
+ if ds in attention_resolutions:
448
+ if num_head_channels == -1:
449
+ dim_head = ch // num_heads
450
+ else:
451
+ num_heads = ch // num_head_channels
452
+ dim_head = num_head_channels
453
+ if legacy:
454
+ #num_heads = 1
455
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
456
+ if exists(disable_self_attentions):
457
+ disabled_sa = disable_self_attentions[level]
458
+ else:
459
+ disabled_sa = False
460
+
461
+ if not exists(num_attention_blocks) or nr < num_attention_blocks[level]:
462
+ layers.append(
463
+ AttentionBlock(
464
+ ch,
465
+ use_checkpoint=use_checkpoint,
466
+ num_heads=num_heads,
467
+ num_head_channels=dim_head,
468
+ use_new_attention_order=use_new_attention_order,
469
+ ) if not use_spatial_transformer else SpatialTransformer(
470
+ ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
471
+ disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
472
+ use_checkpoint=use_checkpoint
473
+ )
474
+ )
475
+ self.input_blocks.append(TimestepEmbedSequential(*layers))
476
+ self.zero_convs.append(self.make_zero_conv(ch))
477
+ self._feature_size += ch
478
+ input_block_chans.append(ch)
479
+ if level != len(channel_mult) - 1:
480
+ out_ch = ch
481
+ self.input_blocks.append(
482
+ TimestepEmbedSequential(
483
+ ResBlock(
484
+ ch,
485
+ time_embed_dim,
486
+ dropout,
487
+ out_channels=out_ch,
488
+ dims=dims,
489
+ use_checkpoint=use_checkpoint,
490
+ use_scale_shift_norm=use_scale_shift_norm,
491
+ down=True,
492
+ )
493
+ if resblock_updown
494
+ else Downsample(
495
+ ch, conv_resample, dims=dims, out_channels=out_ch
496
+ )
497
+ )
498
+ )
499
+ ch = out_ch
500
+ input_block_chans.append(ch)
501
+ self.zero_convs.append(self.make_zero_conv(ch))
502
+ ds *= 2
503
+ self._feature_size += ch
504
+
505
+ if num_head_channels == -1:
506
+ dim_head = ch // num_heads
507
+ else:
508
+ num_heads = ch // num_head_channels
509
+ dim_head = num_head_channels
510
+ if legacy:
511
+ #num_heads = 1
512
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
513
+ self.middle_block = TimestepEmbedSequential(
514
+ ResBlock(
515
+ ch,
516
+ time_embed_dim,
517
+ dropout,
518
+ dims=dims,
519
+ use_checkpoint=use_checkpoint,
520
+ use_scale_shift_norm=use_scale_shift_norm,
521
+ ),
522
+ AttentionBlock(
523
+ ch,
524
+ use_checkpoint=use_checkpoint,
525
+ num_heads=num_heads,
526
+ num_head_channels=dim_head,
527
+ use_new_attention_order=use_new_attention_order,
528
+ ) if not use_spatial_transformer else SpatialTransformer( # always uses a self-attn
529
+ ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
530
+ disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
531
+ use_checkpoint=use_checkpoint
532
+ ),
533
+ ResBlock(
534
+ ch,
535
+ time_embed_dim,
536
+ dropout,
537
+ dims=dims,
538
+ use_checkpoint=use_checkpoint,
539
+ use_scale_shift_norm=use_scale_shift_norm,
540
+ ),
541
+ )
542
+ self.middle_block_out = self.make_zero_conv(ch)
543
+ self._feature_size += ch
544
+
545
+ def make_zero_conv(self, channels):
546
+ return TimestepEmbedSequential(zero_module(conv_nd(self.dims, channels, channels, 1, padding=0)))
547
+
548
+ def forward(self, x, hint, timesteps, context, **kwargs):
549
+ t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
550
+ emb = self.time_embed(t_emb)
551
+ guided_hint = self.input_hint_block(hint, emb, context)
552
+ # import pdb; pdb.set_trace()
553
+ outs = []
554
+
555
+ h = x.type(self.dtype)
556
+ for module, zero_conv in zip(self.input_blocks, self.zero_convs):
557
+ if guided_hint is not None:
558
+ h = module(h, emb, context)
559
+ h += guided_hint
560
+ guided_hint = None
561
+ else:
562
+ h = module(h, emb, context)
563
+ outs.append(zero_conv(h, emb, context))
564
+
565
+ h = self.middle_block(h, emb, context)
566
+ outs.append(self.middle_block_out(h, emb, context))
567
+
568
+ return outs
569
+
570
+
571
+ class SecretDecoder(nn.Module):
572
+ def __init__(self, arch='resnet50', secret_len=100):
573
+ super().__init__()
574
+ self.arch = arch
575
+ print(f'SecretDecoder arch: {arch}')
576
+ self.resolution = 224
577
+ if arch == 'resnet50':
578
+ self.decoder = torchvision.models.resnet50(pretrained=True, progress=False)
579
+ self.decoder.fc = nn.Linear(self.decoder.fc.in_features, secret_len)
580
+ elif arch == 'resnet18':
581
+ self.decoder = torchvision.models.resnet18(pretrained=True, progress=False)
582
+ self.decoder.fc = nn.Linear(self.decoder.fc.in_features, secret_len)
583
+ else:
584
+ raise NotImplementedError
585
+
586
+ def forward(self, image):
587
+ if self.arch in ['resnet50', 'resnet18'] and image.shape[-1] > self.resolution:
588
+ image = thf.interpolate(image, size=(self.resolution, self.resolution), mode='bilinear', align_corners=False)
589
+ x = self.decoder(image)
590
+ return x
591
+
592
+
593
+ class ControlLDM(LatentDiffusion):
594
+
595
+ def __init__(self, control_stage_config, control_key, only_mid_control, secret_decoder_config, *args, **kwargs):
596
+ super().__init__(*args, **kwargs)
597
+ self.control_model = instantiate_from_config(control_stage_config)
598
+ self.control_key = control_key
599
+ self.only_mid_control = only_mid_control
600
+
601
+ self.secret_decoder = None if secret_decoder_config == 'none' else instantiate_from_config(secret_decoder_config)
602
+ self.secret_loss_layer = nn.BCEWithLogitsLoss()
603
+
604
+ @torch.no_grad()
605
+ def get_input(self, batch, k, bs=None, *args, **kwargs):
606
+ x, c = super().get_input(batch, self.first_stage_key, *args, **kwargs)
607
+ control = batch[self.control_key]
608
+ if bs is not None:
609
+ control = control[:bs]
610
+ control = control.to(self.device)
611
+ if self.control_key == 'hint':
612
+ control = einops.rearrange(control, 'b h w c -> b c h w')
613
+ control = control.to(memory_format=torch.contiguous_format).float()
614
+ return x, dict(c_crossattn=[c], c_concat=[control])
615
+
616
+ def apply_model(self, x_noisy, t, cond, *args, **kwargs):
617
+ assert isinstance(cond, dict)
618
+ diffusion_model = self.model.diffusion_model
619
+ cond_txt = torch.cat(cond['c_crossattn'], 1)
620
+ cond_hint = torch.cat(cond['c_concat'], 1)
621
+
622
+ control = self.control_model(x=x_noisy, hint=cond_hint, timesteps=t, context=cond_txt)
623
+ eps = diffusion_model(x=x_noisy, timesteps=t, context=cond_txt, control=control, only_mid_control=self.only_mid_control)
624
+
625
+ return eps
626
+
627
+ def p_losses(self, x_start, cond, t, noise=None):
628
+ noise = default(noise, lambda: torch.randn_like(x_start))
629
+ x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
630
+ model_output = self.apply_model(x_noisy, t, cond)
631
+ loss_dict = {}
632
+ prefix = 'train' if self.training else 'val'
633
+
634
+ if self.parameterization == "x0":
635
+ target = x_start
636
+ x_recon = model_output
637
+ elif self.parameterization == "eps":
638
+ target = noise
639
+ x_recon = self.predict_start_from_noise(x_noisy, t, noise=model_output)
640
+ elif self.parameterization == "v":
641
+ target = self.get_v(x_start, noise, t)
642
+ else:
643
+ raise NotImplementedError()
644
+
645
+ loss_simple = self.get_loss(model_output, target, mean=False).mean([1, 2, 3])
646
+ loss_dict.update({f'{prefix}/loss_simple': loss_simple.mean()})
647
+
648
+ logvar_t = self.logvar[t].to(self.device)
649
+ loss = loss_simple / torch.exp(logvar_t) + logvar_t
650
+ # loss = loss_simple / torch.exp(self.logvar) + self.logvar
651
+ if self.learn_logvar:
652
+ loss_dict.update({f'{prefix}/loss_gamma': loss.mean()})
653
+ loss_dict.update({'logvar': self.logvar.data.mean()})
654
+
655
+ loss = self.l_simple_weight * loss.mean()
656
+
657
+ loss_vlb = self.get_loss(model_output, target, mean=False).mean(dim=(1, 2, 3))
658
+ loss_vlb = (self.lvlb_weights[t] * loss_vlb).mean()
659
+ loss_dict.update({f'{prefix}/loss_vlb': loss_vlb})
660
+ loss += (self.original_elbo_weight * loss_vlb)
661
+ # secret decode
662
+ if self.secret_decoder is not None:
663
+ simple_loss_weight = 0.1
664
+ x_recon = self.differentiable_decode_first_stage(x_recon)
665
+ secret_pred = self.secret_decoder(x_recon)
666
+ secret = cond['c_concat'][0]
667
+ loss_secret = self.secret_loss_layer(secret_pred, secret)
668
+ bit_acc = ((secret_pred.detach() > 0).float() == secret).float().mean()
669
+ loss_dict.update({f'{prefix}/bit_acc': bit_acc})
670
+ loss_dict.update({f'{prefix}/loss_secret': loss_secret})
671
+ loss = (loss*simple_loss_weight + loss_secret) / (simple_loss_weight + 1)
672
+
673
+ loss_dict.update({f'{prefix}/loss': loss})
674
+ return loss, loss_dict
675
+
676
+ def differentiable_decode_first_stage(self, z, predict_cids=False, force_not_quantize=False):
677
+ if predict_cids:
678
+ if z.dim() == 4:
679
+ z = torch.argmax(z.exp(), dim=1).long()
680
+ z = self.first_stage_model.quantize.get_codebook_entry(z, shape=None)
681
+ z = rearrange(z, 'b h w c -> b c h w').contiguous()
682
+
683
+ z = 1. / self.scale_factor * z
684
+ return self.first_stage_model.decode(z)
685
+
686
+ @torch.no_grad()
687
+ def get_unconditional_conditioning(self, N):
688
+ return self.get_learned_conditioning([""] * N)
689
+
690
+ @torch.no_grad()
691
+ def log_images(self, batch, N=4, n_row=2, sample=False, ddim_steps=50, ddim_eta=0.0, return_keys=None,
692
+ quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
693
+ plot_diffusion_rows=False, unconditional_guidance_scale=9.0, unconditional_guidance_label=None,
694
+ use_ema_scope=True,
695
+ **kwargs):
696
+ use_ddim = ddim_steps is not None
697
+
698
+ log = dict()
699
+ z, c = self.get_input(batch, self.first_stage_key, bs=N)
700
+ c_cat, c = c["c_concat"][0][:N], c["c_crossattn"][0][:N]
701
+ N = min(z.shape[0], N)
702
+ n_row = min(z.shape[0], n_row)
703
+ log["reconstruction"] = self.decode_first_stage(z)
704
+ # log["control"] = c_cat * 2.0 - 1.0
705
+ log["conditioning"] = log_txt_as_img((512, 512), batch[self.cond_stage_key], size=16)
706
+
707
+ if plot_diffusion_rows:
708
+ # get diffusion row
709
+ diffusion_row = list()
710
+ z_start = z[:n_row]
711
+ for t in range(self.num_timesteps):
712
+ if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
713
+ t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
714
+ t = t.to(self.device).long()
715
+ noise = torch.randn_like(z_start)
716
+ z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
717
+ diffusion_row.append(self.decode_first_stage(z_noisy))
718
+
719
+ diffusion_row = torch.stack(diffusion_row) # n_log_step, n_row, C, H, W
720
+ diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
721
+ diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
722
+ diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
723
+ log["diffusion_row"] = diffusion_grid
724
+
725
+ if sample:
726
+ # get denoise row
727
+ samples, z_denoise_row = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
728
+ batch_size=N, ddim=use_ddim,
729
+ ddim_steps=ddim_steps, eta=ddim_eta)
730
+ x_samples = self.decode_first_stage(samples)
731
+ log["samples"] = x_samples
732
+ if plot_denoise_rows:
733
+ denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
734
+ log["denoise_row"] = denoise_grid
735
+ # import pudb; pudb.set_trace()
736
+ if unconditional_guidance_scale > 1.0:
737
+ uc_cross = self.get_unconditional_conditioning(N)
738
+ uc_cat = c_cat # torch.zeros_like(c_cat)
739
+ uc_full = {"c_concat": [uc_cat], "c_crossattn": [uc_cross]}
740
+ samples_cfg, _ = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
741
+ batch_size=N, ddim=use_ddim,
742
+ ddim_steps=ddim_steps, eta=ddim_eta,
743
+ unconditional_guidance_scale=unconditional_guidance_scale,
744
+ unconditional_conditioning=uc_full,
745
+ )
746
+ x_samples_cfg = self.decode_first_stage(samples_cfg)
747
+ log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg
748
+
749
+ return log
750
+
751
+ @torch.no_grad()
752
+ def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs):
753
+ ddim_sampler = DDIMSampler(self)
754
+ # import pdb; pdb.set_trace()
755
+ # b, c, h, w = cond["c_concat"][0].shape
756
+ b, c, h, w = cond["c_concat"][0].shape[0], self.channels, self.image_size*8, self.image_size*8
757
+ shape = (self.channels, h // 8, w // 8)
758
+ samples, intermediates = ddim_sampler.sample(ddim_steps, batch_size, shape, cond, verbose=False, **kwargs)
759
+ return samples, intermediates
760
+
761
+ def configure_optimizers(self):
762
+ lr = self.learning_rate
763
+ params = list(self.control_model.parameters())
764
+ if self.secret_decoder is not None:
765
+ params += list(self.secret_decoder.parameters())
766
+ if not self.sd_locked:
767
+ params += list(self.model.diffusion_model.output_blocks.parameters())
768
+ params += list(self.model.diffusion_model.out.parameters())
769
+ opt = torch.optim.AdamW(params, lr=lr)
770
+ return opt
771
+
772
+ def low_vram_shift(self, is_diffusing):
773
+ if is_diffusing:
774
+ self.model = self.model.cuda()
775
+ self.control_model = self.control_model.cuda()
776
+ self.first_stage_model = self.first_stage_model.cpu()
777
+ self.cond_stage_model = self.cond_stage_model.cpu()
778
+ else:
779
+ self.model = self.model.cpu()
780
+ self.control_model = self.control_model.cpu()
781
+ self.first_stage_model = self.first_stage_model.cuda()
782
+ self.cond_stage_model = self.cond_stage_model.cuda()
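A quick shape-check sketch for the SecretDecoder defined above (batch size, input resolution and value range are made up; pretrained=True means torchvision downloads ImageNet weights on first use):

import torch
from cldm.diffsteg import SecretDecoder

dec = SecretDecoder(arch='resnet18', secret_len=100).eval()
img = torch.rand(2, 3, 256, 256) * 2 - 1      # stego images in [-1, 1]
with torch.no_grad():
    logits = dec(img)                         # resized to 224x224 internally
bits = (logits > 0).float()                   # recovered secret, shape (2, 100)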
cldm/hack.py ADDED
@@ -0,0 +1,113 @@
1
+ import torch
2
+ import einops
3
+
4
+ import ldm.modules.encoders.modules
5
+ import ldm.modules.attention
6
+
7
+ from transformers import logging
8
+ from ldm.modules.attention import default
9
+ import warnings
10
+
11
+ def disable_verbosity():
12
+ logging.set_verbosity_error()
13
+ warnings.filterwarnings(action='ignore', category=DeprecationWarning)
14
+ warnings.filterwarnings(action='ignore', category=UserWarning)
15
+ print('logging improved.')
16
+ return
17
+
18
+
19
+ def enable_sliced_attention():
20
+ ldm.modules.attention.CrossAttention.forward = _hacked_sliced_attentin_forward
21
+ print('Enabled sliced_attention.')
22
+ return
23
+
24
+
25
+ def hack_everything(clip_skip=0):
26
+ disable_verbosity()
27
+ ldm.modules.encoders.modules.FrozenCLIPEmbedder.forward = _hacked_clip_forward
28
+ ldm.modules.encoders.modules.FrozenCLIPEmbedder.clip_skip = clip_skip
29
+ print('Enabled clip hacks.')
30
+ return
31
+
32
+
33
+ # Written by Lvmin
34
+ def _hacked_clip_forward(self, text):
35
+ PAD = self.tokenizer.pad_token_id
36
+ EOS = self.tokenizer.eos_token_id
37
+ BOS = self.tokenizer.bos_token_id
38
+
39
+ def tokenize(t):
40
+ return self.tokenizer(t, truncation=False, add_special_tokens=False)["input_ids"]
41
+
42
+ def transformer_encode(t):
43
+ if self.clip_skip > 1:
44
+ rt = self.transformer(input_ids=t, output_hidden_states=True)
45
+ return self.transformer.text_model.final_layer_norm(rt.hidden_states[-self.clip_skip])
46
+ else:
47
+ return self.transformer(input_ids=t, output_hidden_states=False).last_hidden_state
48
+
49
+ def split(x):
50
+ return x[75 * 0: 75 * 1], x[75 * 1: 75 * 2], x[75 * 2: 75 * 3]
51
+
52
+ def pad(x, p, i):
53
+ return x[:i] if len(x) >= i else x + [p] * (i - len(x))
54
+
55
+ raw_tokens_list = tokenize(text)
56
+ tokens_list = []
57
+
58
+ for raw_tokens in raw_tokens_list:
59
+ raw_tokens_123 = split(raw_tokens)
60
+ raw_tokens_123 = [[BOS] + raw_tokens_i + [EOS] for raw_tokens_i in raw_tokens_123]
61
+ raw_tokens_123 = [pad(raw_tokens_i, PAD, 77) for raw_tokens_i in raw_tokens_123]
62
+ tokens_list.append(raw_tokens_123)
63
+
64
+ tokens_list = torch.IntTensor(tokens_list).to(self.device)
65
+
66
+ feed = einops.rearrange(tokens_list, 'b f i -> (b f) i')
67
+ y = transformer_encode(feed)
68
+ z = einops.rearrange(y, '(b f) i c -> b (f i) c', f=3)
69
+
70
+ return z
71
+
72
+
73
+ # Stolen from https://github.com/basujindal/stable-diffusion/blob/main/optimizedSD/splitAttention.py
74
+ def _hacked_sliced_attentin_forward(self, x, context=None, mask=None):
75
+ h = self.heads
76
+
77
+ q = self.to_q(x)
78
+ context = default(context, x)
79
+ k = self.to_k(context)
80
+ v = self.to_v(context)
81
+ del context, x
82
+
83
+ q, k, v = map(lambda t: einops.rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
84
+
85
+ limit = k.shape[0]
86
+ att_step = 1
87
+ q_chunks = list(torch.tensor_split(q, limit // att_step, dim=0))
88
+ k_chunks = list(torch.tensor_split(k, limit // att_step, dim=0))
89
+ v_chunks = list(torch.tensor_split(v, limit // att_step, dim=0))
90
+
91
+ q_chunks.reverse()
92
+ k_chunks.reverse()
93
+ v_chunks.reverse()
94
+ sim = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device)
95
+ del k, q, v
96
+ for i in range(0, limit, att_step):
97
+ q_buffer = q_chunks.pop()
98
+ k_buffer = k_chunks.pop()
99
+ v_buffer = v_chunks.pop()
100
+ sim_buffer = torch.einsum('b i d, b j d -> b i j', q_buffer, k_buffer) * self.scale
101
+
102
+ del k_buffer, q_buffer
103
+ # attention, what we cannot get enough of, by chunks
104
+
105
+ sim_buffer = sim_buffer.softmax(dim=-1)
106
+
107
+ sim_buffer = torch.einsum('b i j, b j d -> b i d', sim_buffer, v_buffer)
108
+ del v_buffer
109
+ sim[i:i + att_step, :, :] = sim_buffer
110
+
111
+ del sim_buffer
112
+ sim = einops.rearrange(sim, '(b h) n d -> b n (h d)', h=h)
113
+ return self.to_out(sim)
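A sketch of the intended call order for the patches above: silence transformers logging before building models, then optionally apply the CLIP and sliced-attention monkey-patches (clip_skip=2 is just an example value):

from cldm.hack import disable_verbosity, hack_everything, enable_sliced_attention

disable_verbosity()
hack_everything(clip_skip=2)   # replaces FrozenCLIPEmbedder.forward
enable_sliced_attention()      # replaces CrossAttention.forward for low-VRAM runs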
cldm/logger.py ADDED
@@ -0,0 +1,149 @@
1
+ import os
2
+ from omegaconf import OmegaConf
3
+ import numpy as np
4
+ import torch
5
+ import torchvision
6
+ from PIL import Image
7
+ from pytorch_lightning.callbacks import Callback
8
+ from pytorch_lightning.utilities.distributed import rank_zero_only
9
+ from pytorch_lightning.utilities import rank_zero_info
10
+ import time
11
+
12
+
13
+ class CUDACallback(Callback):
14
+ # see https://github.com/SeanNaren/minGPT/blob/master/mingpt/callback.py
15
+ def on_train_epoch_start(self, trainer, pl_module):
16
+ # Reset the memory use counter
17
+ torch.cuda.reset_peak_memory_stats(trainer.root_gpu)
18
+ torch.cuda.synchronize(trainer.root_gpu)
19
+ self.start_time = time.time()
20
+
21
+ def on_train_epoch_end(self, trainer, pl_module, outputs):
22
+ torch.cuda.synchronize(trainer.root_gpu)
23
+ max_memory = torch.cuda.max_memory_allocated(trainer.root_gpu) / 2 ** 20
24
+ epoch_time = (time.time() - self.start_time)/3600
25
+
26
+ try:
27
+ max_memory = trainer.training_type_plugin.reduce(max_memory)
28
+ epoch_time = trainer.training_type_plugin.reduce(epoch_time)
29
+
30
+ rank_zero_info(f"Average Epoch time: {epoch_time:.2f} hours")
31
+ rank_zero_info(f"Average Peak memory {max_memory:.2f}MiB")
32
+ except AttributeError:
33
+ pass
34
+
35
+
36
+ class SetupCallback(Callback):
37
+ def __init__(self, resume, now, logdir, ckptdir, cfgdir, config, lightning_config):
38
+ super().__init__()
39
+ self.resume = resume
40
+ self.now = now
41
+ self.logdir = logdir
42
+ self.ckptdir = ckptdir
43
+ self.cfgdir = cfgdir
44
+ self.config = config
45
+ self.lightning_config = lightning_config
46
+
47
+ def on_keyboard_interrupt(self, trainer, pl_module):
48
+ if trainer.global_rank == 0:
49
+ print("Summoning checkpoint.")
50
+ ckpt_path = os.path.join(self.ckptdir, "last.ckpt")
51
+ trainer.save_checkpoint(ckpt_path)
52
+
53
+ def on_pretrain_routine_start(self, trainer, pl_module):
54
+ if trainer.global_rank == 0:
55
+ # Create logdirs and save configs
56
+ os.makedirs(self.logdir, exist_ok=True)
57
+ os.makedirs(self.ckptdir, exist_ok=True)
58
+ os.makedirs(self.cfgdir, exist_ok=True)
59
+
60
+ if "callbacks" in self.lightning_config:
61
+ if 'metrics_over_trainsteps_checkpoint' in self.lightning_config['callbacks']:
62
+ os.makedirs(os.path.join(self.ckptdir, 'trainstep_checkpoints'), exist_ok=True)
63
+ print("Project config")
64
+ print(OmegaConf.to_yaml(self.config))
65
+ OmegaConf.save(self.config,
66
+ os.path.join(self.cfgdir, "{}-project.yaml".format(self.now)))
67
+
68
+ print("Lightning config")
69
+ print(OmegaConf.to_yaml(self.lightning_config))
70
+ OmegaConf.save(OmegaConf.create({"lightning": self.lightning_config}),
71
+ os.path.join(self.cfgdir, "{}-lightning.yaml".format(self.now)))
72
+
73
+ else:
74
+ # ModelCheckpoint callback created log directory --- remove it
75
+ if not self.resume and os.path.exists(self.logdir):
76
+ dst, name = os.path.split(self.logdir)
77
+ dst = os.path.join(dst, "child_runs", name)
78
+ os.makedirs(os.path.split(dst)[0], exist_ok=True)
79
+ try:
80
+ os.rename(self.logdir, dst)
81
+ except FileNotFoundError:
82
+ pass
83
+
84
+ class ImageLogger(Callback):
85
+ def __init__(self, batch_frequency=2000, max_images=4, clamp=True, increase_log_steps=True,
86
+ rescale=True, disabled=False, log_on_batch_idx=False, log_first_step=False,
87
+ log_images_kwargs=None, fixed_input=False):
88
+ super().__init__()
89
+ self.rescale = rescale
90
+ self.batch_freq = batch_frequency
91
+ self.max_images = max_images
92
+ if not increase_log_steps:
93
+ self.log_steps = [self.batch_freq]
94
+ self.clamp = clamp
95
+ self.disabled = disabled
96
+ self.log_on_batch_idx = log_on_batch_idx
97
+ self.log_images_kwargs = log_images_kwargs if log_images_kwargs else {}
98
+ self.log_first_step = log_first_step
99
+ self.fixed_input = fixed_input
100
+
101
+ @rank_zero_only
102
+ def log_local(self, save_dir, split, images, global_step, current_epoch, batch_idx):
103
+ root = os.path.join(save_dir, "image_log", split)
104
+ for k in images:
105
+ grid = torchvision.utils.make_grid(images[k], nrow=4)
106
+ if self.rescale:
107
+ grid = (grid + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w
108
+ grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1)
109
+ grid = grid.numpy()
110
+ grid = (grid * 255).astype(np.uint8)
111
+ filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format(k, global_step, current_epoch, batch_idx)
112
+ path = os.path.join(root, filename)
113
+ os.makedirs(os.path.split(path)[0], exist_ok=True)
114
+ Image.fromarray(grid).save(path)
115
+
116
+ def log_img(self, pl_module, batch, batch_idx, split="train"):
117
+ check_idx = batch_idx # if self.log_on_batch_idx else pl_module.global_step
118
+ if (self.check_frequency(check_idx) and # batch_idx % self.batch_freq == 0
119
+ hasattr(pl_module, "log_images") and
120
+ callable(pl_module.log_images) and
121
+ self.max_images > 0):
122
+ logger = type(pl_module.logger)
123
+
124
+ is_train = pl_module.training
125
+ if is_train:
126
+ pl_module.eval()
127
+
128
+ with torch.no_grad():
129
+ images = pl_module.log_images(batch, fixed_input=self.fixed_input, split=split, **self.log_images_kwargs)
130
+
131
+ for k in images:
132
+ N = min(images[k].shape[0], self.max_images)
133
+ images[k] = images[k][:N]
134
+ if isinstance(images[k], torch.Tensor):
135
+ images[k] = images[k].detach().cpu()
136
+ if self.clamp:
137
+ images[k] = torch.clamp(images[k], -1., 1.)
138
+ self.log_local(pl_module.logger.save_dir, split, images,
139
+ pl_module.global_step, pl_module.current_epoch, batch_idx)
140
+
141
+ if is_train:
142
+ pl_module.train()
143
+
144
+ def check_frequency(self, check_idx):
145
+ return check_idx % self.batch_freq == 0
146
+
147
+ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
148
+ if not self.disabled:
149
+ self.log_img(pl_module, batch, batch_idx, split="train")
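A sketch of wiring these callbacks into a PyTorch Lightning 1.x trainer (the rank_zero/root_gpu APIs used in this file are from that series); the batch frequency, GPU count and dataloader are placeholders:

import pytorch_lightning as pl
from cldm.logger import ImageLogger, CUDACallback

callbacks = [
    ImageLogger(batch_frequency=1000, max_images=4),  # dumps grids under image_log/<split>/
    CUDACallback(),                                   # reports epoch time and peak GPU memory
]
trainer = pl.Trainer(gpus=1, callbacks=callbacks, max_epochs=10)
# trainer.fit(model, train_dataloader)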
cldm/loss.py ADDED
@@ -0,0 +1,78 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from lpips import LPIPS
4
+ from kornia import color
5
+ # from taming.modules.losses.vqperceptual import *
6
+
7
+ class ImageSecretLoss(nn.Module):
8
+ def __init__(self, recon_type='rgb', recon_weight=1., perceptual_weight=1.0, secret_weight=10., kl_weight=0.000001, logvar_init=0.0, ramp=100000, max_image_weight_ratio=2.) -> None:
9
+ super().__init__()
10
+ self.recon_type = recon_type
11
+ assert recon_type in ['rgb', 'yuv']
12
+ if recon_type == 'yuv':
13
+ self.register_buffer('yuv_scales', torch.tensor([1,100,100]).unsqueeze(1).float()) # [3,1]
14
+ self.recon_weight = recon_weight
15
+ self.perceptual_weight = perceptual_weight
16
+ self.secret_weight = secret_weight
17
+ self.kl_weight = kl_weight
18
+
19
+ self.ramp = ramp
20
+ self.max_image_weight = max_image_weight_ratio * secret_weight - 1
21
+ self.register_buffer('ramp_on', torch.tensor(False))
22
+ self.register_buffer('step0', torch.tensor(1e9)) # large number
23
+
24
+ self.perceptual_loss = LPIPS().eval()
25
+ self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init)
26
+ self.bce = nn.BCEWithLogitsLoss(reduction="none")
27
+
28
+ def activate_ramp(self, global_step):
29
+ if not self.ramp_on: # do not activate ramp twice
30
+ self.step0 = torch.tensor(global_step)
31
+ self.ramp_on = ~self.ramp_on
32
+ print('[TRAINING] Activate ramp for image loss at step ', global_step)
33
+
34
+ def compute_recon_loss(self, inputs, reconstructions):
35
+ if self.recon_type == 'rgb':
36
+ rec_loss = torch.abs(inputs - reconstructions).mean(dim=[1,2,3])
37
+ elif self.recon_type == 'yuv':
38
+ reconstructions_yuv = color.rgb_to_yuv((reconstructions + 1) / 2)
39
+ inputs_yuv = color.rgb_to_yuv((inputs + 1) / 2)
40
+ yuv_loss = torch.mean((reconstructions_yuv - inputs_yuv)**2, dim=[2,3])
41
+ rec_loss = torch.mm(yuv_loss, self.yuv_scales).squeeze(1)
42
+ else:
43
+ raise ValueError(f"Unknown recon type {self.recon_type}")
44
+ return rec_loss
45
+
46
+ def forward(self, inputs, reconstructions, posteriors, secret_gt, secret_pred, global_step):
47
+ loss_dict = {}
48
+ rec_loss = self.compute_recon_loss(inputs.contiguous(), reconstructions.contiguous())
49
+
50
+ loss = rec_loss*self.recon_weight
51
+
52
+ if self.perceptual_weight > 0:
53
+ p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous()).mean(dim=[1,2,3])
54
+ loss += self.perceptual_weight * p_loss
55
+ loss_dict['p_loss'] = p_loss.mean()
56
+
57
+ loss = loss / torch.exp(self.logvar) + self.logvar
58
+ if self.kl_weight > 0:
59
+ kl_loss = posteriors.kl()
60
+ loss += kl_loss*self.kl_weight
61
+ loss_dict['kl_loss'] = kl_loss.mean()
62
+
63
+ image_weight = 1 + min(self.max_image_weight, max(0., self.max_image_weight*(global_step - self.step0.item())/self.ramp))
64
+
65
+ secret_loss = self.bce(secret_pred, secret_gt).mean(dim=1)
66
+ loss = (loss*image_weight + secret_loss*self.secret_weight) / (image_weight+self.secret_weight)
67
+
68
+ # loss dict update
69
+ bit_acc = ((secret_pred.detach() > 0).float() == secret_gt).float().mean()
70
+ loss_dict['bit_acc'] = bit_acc
71
+ loss_dict['loss'] = loss.mean()
72
+ loss_dict['img_lw'] = image_weight/self.secret_weight
73
+ loss_dict['rec_loss'] = rec_loss.mean()
74
+ loss_dict['secret_loss'] = secret_loss.mean()
75
+
76
+ return loss.mean(), loss_dict
77
+
78
+
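A minimal sketch of calling ImageSecretLoss on dummy tensors; kl_weight=0 is an assumption so that no posterior object is needed, and LPIPS downloads its weights on first use. All tensor shapes are made up:

import torch
from cldm.loss import ImageSecretLoss

crit = ImageSecretLoss(recon_type='yuv', kl_weight=0.0)
x = torch.rand(2, 3, 256, 256) * 2 - 1          # cover images in [-1, 1]
x_hat = x + 0.01 * torch.randn_like(x)          # stego reconstructions
secret = torch.randint(0, 2, (2, 100)).float()  # ground-truth bits
logits = torch.randn(2, 100)                    # decoder predictions

loss, log = crit(x, x_hat, None, secret, logits, global_step=0)
print(log['bit_acc'], log['rec_loss'], log['secret_loss'])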
cldm/loss_weight_scheduler.py ADDED
@@ -0,0 +1,17 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+
5
+ @author: Tu Bui @University of Surrey
6
+ """
7
+
8
+ class SimpleLossWeightScheduler(object):
9
+ def __init__(self, simple_loss_weight_max=10., wait_steps=50000, ramp=100000) -> None:
10
+ self.simple_loss_weight_max = simple_loss_weight_max
11
+ self.wait_steps = wait_steps
12
+ self.ramp = ramp
13
+
14
+ def __call__(self, step):
15
+ max_weight = self.simple_loss_weight_max - 1
16
+ w = 1 + min(max_weight, max(0., max_weight*(step - self.wait_steps)/self.ramp))
17
+ return w
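With the defaults above (simple_loss_weight_max=10, wait_steps=50000, ramp=100000) the weight stays at 1 for the first 50k steps, then ramps linearly to 10 over the next 100k. A small check:

from cldm.loss_weight_scheduler import SimpleLossWeightScheduler

sched = SimpleLossWeightScheduler()
print([sched(s) for s in (0, 50_000, 100_000, 150_000, 200_000)])
# [1.0, 1.0, 5.5, 10.0, 10.0]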
cldm/model.py ADDED
@@ -0,0 +1,28 @@
1
+ import os
2
+ import torch
3
+
4
+ from omegaconf import OmegaConf
5
+ from ldm.util import instantiate_from_config
6
+
7
+
8
+ def get_state_dict(d):
9
+ return d.get('state_dict', d)
10
+
11
+
12
+ def load_state_dict(ckpt_path, location='cpu'):
13
+ _, extension = os.path.splitext(ckpt_path)
14
+ if extension.lower() == ".safetensors":
15
+ import safetensors.torch
16
+ state_dict = safetensors.torch.load_file(ckpt_path, device=location)
17
+ else:
18
+ state_dict = get_state_dict(torch.load(ckpt_path, map_location=torch.device(location)))
19
+ state_dict = get_state_dict(state_dict)
20
+ print(f'Loaded state_dict from [{ckpt_path}]')
21
+ return state_dict
22
+
23
+
24
+ def create_model(config_path):
25
+ config = OmegaConf.load(config_path)
26
+ model = instantiate_from_config(config.model).cpu()
27
+ print(f'Loaded model config from [{config_path}]')
28
+ return model
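A sketch of restoring a trained checkpoint with the helpers above; both paths are placeholders, and strict=False is only needed if the checkpoint and config do not match key-for-key:

from cldm.model import create_model, load_state_dict

model = create_model('path/to/config.yaml')                       # placeholder path
sd = load_state_dict('path/to/checkpoint.ckpt', location='cpu')   # placeholder path
model.load_state_dict(sd, strict=False)
model = model.eval()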
cldm/plms.py ADDED
@@ -0,0 +1,1481 @@
1
+ """SAMPLING ONLY."""
2
+ import os
3
+ import torch
4
+ from torch import nn
5
+ import torchvision
6
+ import numpy as np
7
+ from tqdm import tqdm
8
+ from functools import partial
9
+ from PIL import Image
10
+ import shutil
11
+
12
+ from ldm.modules.diffusionmodules.util import (
13
+ make_ddim_sampling_parameters,
14
+ make_ddim_timesteps,
15
+ noise_like,
16
+ )
17
+ import clip
18
+ from einops import rearrange
19
+ import random
20
+
21
+
22
+ class VGGPerceptualLoss(torch.nn.Module):
23
+ def __init__(self, resize=True):
24
+ super(VGGPerceptualLoss, self).__init__()
25
+ blocks = []
26
+ blocks.append(torchvision.models.vgg16(pretrained=True).features[:4].eval())
27
+ blocks.append(torchvision.models.vgg16(pretrained=True).features[4:9].eval())
28
+ blocks.append(torchvision.models.vgg16(pretrained=True).features[9:16].eval())
29
+ blocks.append(torchvision.models.vgg16(pretrained=True).features[16:23].eval())
30
+ for bl in blocks:
31
+ for p in bl.parameters():
32
+ p.requires_grad = False
33
+ self.blocks = torch.nn.ModuleList(blocks)
34
+ self.transform = torch.nn.functional.interpolate
35
+ self.resize = resize
36
+ self.register_buffer(
37
+ "mean", torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
38
+ )
39
+ self.register_buffer(
40
+ "std", torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
41
+ )
42
+
43
+ def forward(self, input, target, feature_layers=[0, 1, 2, 3], style_layers=[]):
44
+ input = (input - self.mean) / self.std
45
+ target = (target - self.mean) / self.std
46
+ if self.resize:
47
+ input = self.transform(
48
+ input, mode="bilinear", size=(224, 224), align_corners=False
49
+ )
50
+ target = self.transform(
51
+ target, mode="bilinear", size=(224, 224), align_corners=False
52
+ )
53
+ loss = 0.0
54
+ x = input
55
+ y = target
56
+ for i, block in enumerate(self.blocks):
57
+ x = block(x)
58
+ y = block(y)
59
+ if i in feature_layers:
60
+ loss += torch.nn.functional.l1_loss(x, y)
61
+ if i in style_layers:
62
+ act_x = x.reshape(x.shape[0], x.shape[1], -1)
63
+ act_y = y.reshape(y.shape[0], y.shape[1], -1)
64
+ gram_x = act_x @ act_x.permute(0, 2, 1)
65
+ gram_y = act_y @ act_y.permute(0, 2, 1)
66
+ loss += torch.nn.functional.l1_loss(gram_x, gram_y)
67
+ return loss
68
+
69
+
70
+ class DCLIPLoss(torch.nn.Module):
71
+ def __init__(self):
72
+ super(DCLIPLoss, self).__init__()
73
+ self.model, self.preprocess = clip.load("ViT-B/32", device="cuda")
74
+ self.upsample = torch.nn.Upsample(scale_factor=7)
75
+ self.avg_pool = torch.nn.AvgPool2d(kernel_size=16)
76
+
77
+ def forward(self, image1, image2, text1, text2):
78
+ text1 = clip.tokenize([text1]).to("cuda")
79
+ text2 = clip.tokenize([text2]).to("cuda")
80
+ image1 = image1.unsqueeze(0).cuda()
81
+ image2 = image2.unsqueeze(0)
82
+ image1 = self.avg_pool(self.upsample(image1))
83
+ image2 = self.avg_pool(self.upsample(image2))
84
+ image1_feat = self.model.encode_image(image1)
85
+ image2_feat = self.model.encode_image(image2)
86
+ text1_feat = self.model.encode_text(text1)
87
+ text2_feat = self.model.encode_text(text2)
88
+ d_image_feat = image1_feat - image2_feat
89
+ d_text_feat = text1_feat - text2_feat
90
+ similarity = torch.nn.CosineSimilarity()(d_image_feat, d_text_feat)
91
+ return 1 - similarity
92
+
93
+
94
+ class PLMSSampler(object):
95
+ def __init__(self, model, schedule="linear", **kwargs):
96
+ super().__init__()
97
+ self.model = model
98
+ self.ddpm_num_timesteps = model.num_timesteps
99
+ self.schedule = schedule
100
+
101
+ def register_buffer(self, name, attr):
102
+ if type(attr) == torch.Tensor:
103
+ if attr.device != torch.device("cuda"):
104
+ attr = attr.to(torch.device("cuda"))
105
+ setattr(self, name, attr)
106
+
107
+ def make_schedule(
108
+ self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0.0, verbose=True
109
+ ):
110
+ if ddim_eta != 0:
111
+ raise ValueError("ddim_eta must be 0 for PLMS")
112
+ self.ddim_timesteps = make_ddim_timesteps(
113
+ ddim_discr_method=ddim_discretize,
114
+ num_ddim_timesteps=ddim_num_steps,
115
+ num_ddpm_timesteps=self.ddpm_num_timesteps,
116
+ verbose=verbose,
117
+ )
118
+ alphas_cumprod = self.model.alphas_cumprod
119
+ assert (
120
+ alphas_cumprod.shape[0] == self.ddpm_num_timesteps
121
+ ), "alphas have to be defined for each timestep"
122
+ to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
123
+
124
+ self.register_buffer("betas", to_torch(self.model.betas))
125
+ self.register_buffer("alphas_cumprod", to_torch(alphas_cumprod))
126
+ self.register_buffer(
127
+ "alphas_cumprod_prev", to_torch(self.model.alphas_cumprod_prev)
128
+ )
129
+
130
+ # calculations for diffusion q(x_t | x_{t-1}) and others
131
+ self.register_buffer(
132
+ "sqrt_alphas_cumprod", to_torch(np.sqrt(alphas_cumprod.cpu()))
133
+ )
134
+ self.register_buffer(
135
+ "sqrt_one_minus_alphas_cumprod",
136
+ to_torch(np.sqrt(1.0 - alphas_cumprod.cpu())),
137
+ )
138
+ self.register_buffer(
139
+ "log_one_minus_alphas_cumprod", to_torch(np.log(1.0 - alphas_cumprod.cpu()))
140
+ )
141
+ self.register_buffer(
142
+ "sqrt_recip_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod.cpu()))
143
+ )
144
+ self.register_buffer(
145
+ "sqrt_recipm1_alphas_cumprod",
146
+ to_torch(np.sqrt(1.0 / alphas_cumprod.cpu() - 1)),
147
+ )
148
+
149
+ # ddim sampling parameters
150
+ ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(
151
+ alphacums=alphas_cumprod.cpu(),
152
+ ddim_timesteps=self.ddim_timesteps,
153
+ eta=0.0,
154
+ verbose=verbose,
155
+ )
156
+ self.register_buffer("ddim_sigmas", ddim_sigmas)
157
+ self.register_buffer("ddim_alphas", ddim_alphas)
158
+ self.register_buffer("ddim_alphas_prev", ddim_alphas_prev)
159
+ self.register_buffer("ddim_sqrt_one_minus_alphas", np.sqrt(1.0 - ddim_alphas))
160
+ sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
161
+ (1 - self.alphas_cumprod_prev)
162
+ / (1 - self.alphas_cumprod)
163
+ * (1 - self.alphas_cumprod / self.alphas_cumprod_prev)
164
+ )
165
+ self.register_buffer(
166
+ "ddim_sigmas_for_original_num_steps", sigmas_for_original_sampling_steps
167
+ )
168
+
169
+ @torch.no_grad()
170
+ def sample(self,
171
+ S,
172
+ batch_size,
173
+ shape,
174
+ conditioning=None,
175
+ callback=None,
176
+ normals_sequence=None,
177
+ img_callback=None,
178
+ quantize_x0=False,
179
+ eta=0.,
180
+ mask=None,
181
+ x0=None,
182
+ temperature=1.,
183
+ noise_dropout=0.,
184
+ score_corrector=None,
185
+ corrector_kwargs=None,
186
+ verbose=True,
187
+ x_T=None,
188
+ log_every_t=100,
189
+ unconditional_guidance_scale=1.,
190
+ unconditional_conditioning=None,
191
+ # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
192
+ dynamic_threshold=None,
193
+ **kwargs
194
+ ):
195
+ if conditioning is not None:
196
+ if isinstance(conditioning, dict):
197
+ cbs = conditioning[list(conditioning.keys())[0]][0].shape[0]
198
+ if cbs != batch_size:
199
+ print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
200
+ else:
201
+ if conditioning.shape[0] != batch_size:
202
+ print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
203
+
204
+ self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
205
+ # sampling
206
+ C, H, W = shape
207
+ size = (batch_size, C, H, W)
208
+ print(f'Data shape for PLMS sampling is {size}')
209
+
210
+ samples, intermediates = self.plms_sampling(conditioning, size,
211
+ callback=callback,
212
+ img_callback=img_callback,
213
+ quantize_denoised=quantize_x0,
214
+ mask=mask, x0=x0,
215
+ ddim_use_original_steps=False,
216
+ noise_dropout=noise_dropout,
217
+ temperature=temperature,
218
+ score_corrector=score_corrector,
219
+ corrector_kwargs=corrector_kwargs,
220
+ x_T=x_T,
221
+ log_every_t=log_every_t,
222
+ unconditional_guidance_scale=unconditional_guidance_scale,
223
+ unconditional_conditioning=unconditional_conditioning,
224
+ )
225
+ return samples, intermediates
226
+
227
+ @torch.no_grad()
228
+ def plms_sampling(
229
+ self,
230
+ cond,
231
+ shape,
232
+ x_T=None,
233
+ ddim_use_original_steps=False,
234
+ callback=None,
235
+ timesteps=None,
236
+ quantize_denoised=False,
237
+ mask=None,
238
+ x0=None,
239
+ img_callback=None,
240
+ log_every_t=100,
241
+ temperature=1.0,
242
+ noise_dropout=0.0,
243
+ score_corrector=None,
244
+ corrector_kwargs=None,
245
+ unconditional_guidance_scale=1.0,
246
+ unconditional_conditioning=None,
247
+ ):
248
+ device = self.model.betas.device
249
+ b = shape[0]
250
+ if x_T is None:
251
+ img = torch.randn(shape, device=device)
252
+ else:
253
+ img = x_T
254
+
255
+ if timesteps is None:
256
+ timesteps = (
257
+ self.ddpm_num_timesteps
258
+ if ddim_use_original_steps
259
+ else self.ddim_timesteps
260
+ )
261
+ elif timesteps is not None and not ddim_use_original_steps:
262
+ subset_end = (
263
+ int(
264
+ min(timesteps / self.ddim_timesteps.shape[0], 1)
265
+ * self.ddim_timesteps.shape[0]
266
+ )
267
+ - 1
268
+ )
269
+ timesteps = self.ddim_timesteps[:subset_end]
270
+
271
+ intermediates = {"x_inter": [img], "pred_x0": [img]}
272
+ time_range = (
273
+ list(reversed(range(0, timesteps)))
274
+ if ddim_use_original_steps
275
+ else np.flip(timesteps)
276
+ )
277
+ total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
278
+ print(f"Running PLMS Sampling with {total_steps} timesteps")
279
+
280
+ iterator = tqdm(time_range, desc="PLMS Sampler", total=total_steps)
281
+ old_eps = []
282
+
283
+ for i, step in enumerate(iterator):
284
+ index = total_steps - i - 1
285
+ ts = torch.full((b,), step, device=device, dtype=torch.long)
286
+ ts_next = torch.full(
287
+ (b,),
288
+ time_range[min(i + 1, len(time_range) - 1)],
289
+ device=device,
290
+ dtype=torch.long,
291
+ )
292
+
293
+ if mask is not None:
294
+ assert x0 is not None
295
+ # import ipdb; ipdb.set_trace()
296
+ img_orig = self.model.q_sample(
297
+ x0, ts
298
+ ) # TODO: deterministic forward pass?
299
+ img = img_orig * mask + (1.0 - mask) * img
300
+
301
+ outs = self.p_sample_plms(
302
+ img,
303
+ cond,
304
+ ts,
305
+ index=index,
306
+ use_original_steps=ddim_use_original_steps,
307
+ quantize_denoised=quantize_denoised,
308
+ temperature=temperature,
309
+ noise_dropout=noise_dropout,
310
+ score_corrector=score_corrector,
311
+ corrector_kwargs=corrector_kwargs,
312
+ unconditional_guidance_scale=unconditional_guidance_scale,
313
+ unconditional_conditioning=unconditional_conditioning,
314
+ old_eps=old_eps,
315
+ t_next=ts_next,
316
+ )
317
+ img, pred_x0, e_t = outs
318
+ old_eps.append(e_t)
319
+ if len(old_eps) >= 4:
320
+ old_eps.pop(0)
321
+ if callback:
322
+ callback(i)
323
+ if img_callback:
324
+ img_callback(pred_x0, i)
325
+
326
+ if index % 1 == 0 or index == total_steps - 1:
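+ # index % 1 == 0 is always true, so every intermediate latent is kept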
327
+ intermediates["x_inter"].append(img)
328
+ intermediates["pred_x0"].append(pred_x0)
329
+
330
+ return img, intermediates
331
+
332
+ @torch.no_grad()
333
+ def p_sample_plms(
334
+ self,
335
+ x,
336
+ c,
337
+ t,
338
+ index,
339
+ repeat_noise=False,
340
+ use_original_steps=False,
341
+ quantize_denoised=False,
342
+ temperature=1.0,
343
+ noise_dropout=0.0,
344
+ score_corrector=None,
345
+ corrector_kwargs=None,
346
+ unconditional_guidance_scale=1.0,
347
+ unconditional_conditioning=None,
348
+ old_eps=None,
349
+ t_next=None,
350
+ ):
351
+ b, *_, device = *x.shape, x.device
352
+
353
+ def get_model_output(x, t):
354
+ if (
355
+ unconditional_conditioning is None
356
+ or unconditional_guidance_scale == 1.0
357
+ ):
358
+ e_t = self.model.apply_model(x, t, c)
359
+ else:
360
+ x_in = torch.cat([x] * 2)
361
+ t_in = torch.cat([t] * 2)
362
+ if isinstance(c, dict):
363
+ c_in = {key: [torch.cat([unconditional_conditioning[key][0], c[key][0]])] for key in c}
364
+ else:
365
+ c_in = torch.cat([unconditional_conditioning, c])
366
+ e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
367
+ e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
368
+
369
+ if score_corrector is not None:
370
+ assert self.model.parameterization == "eps"
371
+ e_t = score_corrector.modify_score(
372
+ self.model, e_t, x, t, c, **corrector_kwargs
373
+ )
374
+
375
+ return e_t
376
+
377
+ alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
378
+ alphas_prev = (
379
+ self.model.alphas_cumprod_prev
380
+ if use_original_steps
381
+ else self.ddim_alphas_prev
382
+ )
383
+ sqrt_one_minus_alphas = (
384
+ self.model.sqrt_one_minus_alphas_cumprod
385
+ if use_original_steps
386
+ else self.ddim_sqrt_one_minus_alphas
387
+ )
388
+ sigmas = (
389
+ self.model.ddim_sigmas_for_original_num_steps
390
+ if use_original_steps
391
+ else self.ddim_sigmas
392
+ )
393
+
394
+ def get_x_prev_and_pred_x0(e_t, index):
395
+ # select parameters corresponding to the currently considered timestep
396
+ a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
397
+ a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
398
+ sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
399
+ sqrt_one_minus_at = torch.full(
400
+ (b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device
401
+ )
402
+
403
+ # current prediction for x_0
404
+ pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
405
+ if quantize_denoised:
406
+ pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
407
+ # direction pointing to x_t
408
+ dir_xt = (1.0 - a_prev - sigma_t ** 2).sqrt() * e_t
409
+ noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
410
+ if noise_dropout > 0.0:
411
+ noise = torch.nn.functional.dropout(noise, p=noise_dropout)
412
+ x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
413
+ return x_prev, pred_x0
414
+
415
+ e_t = get_model_output(x, t)
416
+ if len(old_eps) == 0:
417
+ # Pseudo Improved Euler (2nd order)
418
+ x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
419
+ e_t_next = get_model_output(x_prev, t_next)
420
+ e_t_prime = (e_t + e_t_next) / 2
421
+ elif len(old_eps) == 1:
422
+ # 2nd order Pseudo Linear Multistep (Adams-Bashforth)
423
+ e_t_prime = (3 * e_t - old_eps[-1]) / 2
424
+ elif len(old_eps) == 2:
425
+ # 3nd order Pseudo Linear Multistep (Adams-Bashforth)
426
+ e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
427
+ elif len(old_eps) >= 3:
428
+ # 4nd order Pseudo Linear Multistep (Adams-Bashforth)
429
+ e_t_prime = (
430
+ 55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]
431
+ ) / 24
432
+
433
+ x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)
434
+
435
+ return x_prev, pred_x0, e_t
436
+
437
+ ###### Above are original stable-diffusion code ############
438
+
439
+ ###### Encode Image ########################################
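+ # The methods below invert an input latent along the PLMS trajectory, caching the noised versions of the image
+ # at every timestep so the reconstruction (and later edits) can reuse the exact same noise.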
440
+
441
+ @torch.no_grad()
442
+ def sample_encode_save_noise(
443
+ self,
444
+ S,
445
+ batch_size,
446
+ shape,
447
+ conditioning=None,
448
+ callback=None,
449
+ normals_sequence=None,
450
+ img_callback=None,
451
+ quantize_x0=False,
452
+ eta=0.0,
453
+ mask=None,
454
+ x0=None,
455
+ temperature=1.0,
456
+ noise_dropout=0.0,
457
+ score_corrector=None,
458
+ corrector_kwargs=None,
459
+ verbose=True,
460
+ x_T=None,
461
+ log_every_t=100,
462
+ unconditional_guidance_scale=1.0,
463
+ unconditional_conditioning=None,
464
+ input_image=None,
465
+ noise_save_path=None,
466
+ # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
467
+ **kwargs,
468
+ ):
469
+ assert conditioning is not None
470
+ # assert not isinstance(conditioning, dict)
471
+
472
+ self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
473
+ # sampling
474
+ C, H, W = shape
475
+ size = (batch_size, C, H, W)
476
+ if verbose:
477
+ print(f"Data shape for PLMS sampling is {size}")
478
+
479
+ samples, intermediates, x0_loop = self.plms_sampling_enc_save_noise(
480
+ conditioning,
481
+ size,
482
+ callback=callback,
483
+ img_callback=img_callback,
484
+ quantize_denoised=quantize_x0,
485
+ mask=mask,
486
+ x0=x0,
487
+ ddim_use_original_steps=False,
488
+ noise_dropout=noise_dropout,
489
+ temperature=temperature,
490
+ score_corrector=score_corrector,
491
+ corrector_kwargs=corrector_kwargs,
492
+ x_T=x_T,
493
+ log_every_t=log_every_t,
494
+ unconditional_guidance_scale=unconditional_guidance_scale,
495
+ unconditional_conditioning=unconditional_conditioning,
496
+ input_image=input_image,
497
+ noise_save_path=noise_save_path,
498
+ verbose=verbose
499
+ )
500
+ return samples, intermediates, x0_loop
501
+
502
+ @torch.no_grad()
503
+ def plms_sampling_enc_save_noise(
504
+ self,
505
+ cond,
506
+ shape,
507
+ x_T=None,
508
+ ddim_use_original_steps=False,
509
+ callback=None,
510
+ timesteps=None,
511
+ quantize_denoised=False,
512
+ mask=None,
513
+ x0=None,
514
+ img_callback=None,
515
+ log_every_t=100,
516
+ temperature=1.0,
517
+ noise_dropout=0.0,
518
+ score_corrector=None,
519
+ corrector_kwargs=None,
520
+ unconditional_guidance_scale=1.0,
521
+ unconditional_conditioning=None,
522
+ input_image=None,
523
+ noise_save_path=None,
524
+ verbose=True,
525
+ ):
526
+ device = self.model.betas.device
527
+
528
+ b = shape[0]
529
+ if x_T is None:
530
+ img = torch.randn(shape, device=device)
531
+ else:
532
+ img = x_T
533
+
534
+ if timesteps is None:
535
+ timesteps = (
536
+ self.ddpm_num_timesteps
537
+ if ddim_use_original_steps
538
+ else self.ddim_timesteps
539
+ )
540
+ elif timesteps is not None and not ddim_use_original_steps:
541
+ subset_end = (
542
+ int(
543
+ min(timesteps / self.ddim_timesteps.shape[0], 1)
544
+ * self.ddim_timesteps.shape[0]
545
+ )
546
+ - 1
547
+ )
548
+ timesteps = self.ddim_timesteps[:subset_end]
549
+
550
+ intermediates = {"x_inter": [img], "pred_x0": [img]}
551
+ time_range = (
552
+ list(reversed(range(0, timesteps)))
553
+ if ddim_use_original_steps
554
+ else np.flip(timesteps)
555
+ )
556
+ time_range = list(range(0, timesteps)) if ddim_use_original_steps else timesteps
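+ # the encoding pass walks forward in time (small t -> large t); this overrides the reversed range built above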
557
+ total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
558
+ if verbose:
559
+ print(f"Running PLMS Sampling with {total_steps} timesteps")
560
+ iterator = tqdm(time_range[:-1], desc='PLMS Sampler', total=total_steps)
561
+ else:
562
+ iterator = time_range[:-1]
563
+ old_eps = []
564
+ noise_images = []
565
+ for each_time in time_range:
566
+ noised_image = self.model.q_sample(
567
+ input_image, torch.tensor([each_time]).to(device)
568
+ )
569
+ noise_images.append(noised_image)
570
+ # torch.save(noised_image, noise_save_path + "_image_time%d.pt" % (each_time))
571
+ # import pudb; pudb.set_trace()
572
+ x0_loop = input_image.clone()
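+ # x0_loop starts from the clean input latent and is pushed step by step toward x_T (deterministic inversion)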
573
+ alphas = (
574
+ self.model.alphas_cumprod if ddim_use_original_steps else self.ddim_alphas
575
+ )
576
+ alphas_prev = (
577
+ self.model.alphas_cumprod_prev
578
+ if ddim_use_original_steps
579
+ else self.ddim_alphas_prev
580
+ )
581
+ sqrt_one_minus_alphas = (
582
+ self.model.sqrt_one_minus_alphas_cumprod
583
+ if ddim_use_original_steps
584
+ else self.ddim_sqrt_one_minus_alphas
585
+ )
586
+ sigmas = (
587
+ self.model.ddim_sigmas_for_original_num_steps
588
+ if ddim_use_original_steps
589
+ else self.ddim_sigmas
590
+ )
591
+
592
+ def get_model_output(x, t):
593
+ x_in = torch.cat([x] * 2)
594
+ t_in = torch.cat([t] * 2)
595
+ if isinstance(cond, dict):
596
+ c_in = {key: [torch.cat([unconditional_conditioning[key][0], cond[key][0]])] for key in cond}
597
+ else:
598
+ c_in = torch.cat([unconditional_conditioning, cond])
599
+ e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
600
+ e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
601
+ return e_t
602
+
603
+ def get_x_prev_and_pred_x0(e_t, index, curr_x0):
604
+ # select parameters corresponding to the currently considered timestep
605
+ a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
606
+ a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
607
+ sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
608
+ sqrt_one_minus_at = torch.full(
609
+ (b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device
610
+ )
611
+
612
+ # current prediction for x_0
613
+ pred_x0 = (curr_x0 - sqrt_one_minus_at * e_t) / a_t.sqrt()
614
+
615
+ a_t = torch.full((b, 1, 1, 1), alphas[index + 1], device=device)
616
+ a_prev = torch.full((b, 1, 1, 1), alphas_prev[index + 1], device=device)
617
+ sigma_t = torch.full((b, 1, 1, 1), sigmas[index + 1], device=device)
618
+ sqrt_one_minus_at = torch.full(
619
+ (b, 1, 1, 1), sqrt_one_minus_alphas[index + 1], device=device
620
+ )
621
+
622
+ dir_xt = (1.0 - a_t - sigma_t ** 2).sqrt() * e_t
623
+
624
+ x_prev = a_t.sqrt() * pred_x0 + dir_xt
625
+
626
+ return x_prev, pred_x0
627
+
628
+ for i, step in enumerate(iterator):
629
+ index = i
630
+ ts = torch.full((b,), step, device=device, dtype=torch.long)
631
+ ts_next = torch.full(
632
+ (b,),
633
+ time_range[min(i + 1, len(time_range) - 1)],
634
+ device=device,
635
+ dtype=torch.long,
636
+ )
637
+ e_t = get_model_output(x0_loop, ts)
638
+ x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index, x0_loop)
639
+ x0_loop = x_prev
640
+ # torch.save(x0_loop, noise_save_path + "_final_latent.pt")
641
+
642
+ # Reconstruction
643
+ img = x0_loop.clone()
644
+ time_range = (
645
+ list(reversed(range(0, timesteps)))
646
+ if ddim_use_original_steps
647
+ else np.flip(timesteps)
648
+ )
649
+ total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
650
+ if verbose:
651
+ print(f"Running PLMS Sampling with {total_steps} timesteps")
652
+ iterator = tqdm(time_range, desc="PLMS Sampler", total=total_steps, miniters=total_steps+1, mininterval=600)
653
+ else:
654
+ iterator = time_range
655
+ old_eps = []
656
+ for i, step in enumerate(iterator):
657
+ index = total_steps - i - 1
658
+ ts = torch.full((b,), step, device=device, dtype=torch.long)
659
+ ts_next = torch.full(
660
+ (b,),
661
+ time_range[min(i + 1, len(time_range) - 1)],
662
+ device=device,
663
+ dtype=torch.long,
664
+ )
665
+
666
+ if mask is not None:
667
+ assert x0 is not None
668
+ img_orig = self.model.q_sample(
669
+ x0, ts
670
+ ) # TODO: deterministic forward pass?
671
+ img = img_orig * mask + (1.0 - mask) * img
672
+
673
+ outs = self.p_sample_plms_dec_save_noise(
674
+ img,
675
+ cond,
676
+ ts,
677
+ index=index,
678
+ use_original_steps=ddim_use_original_steps,
679
+ quantize_denoised=quantize_denoised,
680
+ temperature=temperature,
681
+ noise_dropout=noise_dropout,
682
+ score_corrector=score_corrector,
683
+ corrector_kwargs=corrector_kwargs,
684
+ unconditional_guidance_scale=unconditional_guidance_scale,
685
+ unconditional_conditioning=unconditional_conditioning,
686
+ old_eps=old_eps,
687
+ t_next=ts_next,
688
+ input_image=input_image,
689
+ noise_save_path=noise_save_path,
690
+ noise_image=noise_images.pop(),
691
+ )
692
+ img, pred_x0, e_t = outs
693
+
694
+ old_eps.append(e_t)
695
+ if len(old_eps) >= 4:
696
+ old_eps.pop(0)
697
+ if callback:
698
+ callback(i)
699
+ if img_callback:
700
+ img_callback(pred_x0, i)
701
+
702
+ if index % log_every_t == 0 or index == total_steps - 1:
703
+ intermediates["x_inter"].append(img)
704
+ intermediates["pred_x0"].append(pred_x0)
705
+
706
+ return img, intermediates, x0_loop
707
+
708
+ @torch.no_grad()
709
+ def p_sample_plms_dec_save_noise(
710
+ self,
711
+ x,
712
+ c1,
713
+ t,
714
+ index,
715
+ repeat_noise=False,
716
+ use_original_steps=False,
717
+ quantize_denoised=False,
718
+ temperature=1.0,
719
+ noise_dropout=0.0,
720
+ score_corrector=None,
721
+ corrector_kwargs=None,
722
+ unconditional_guidance_scale=1.0,
723
+ unconditional_conditioning=None,
724
+ old_eps=None,
725
+ t_next=None,
726
+ input_image=None,
727
+ noise_save_path=None,
728
+ noise_image=None,
729
+ ):
730
+ b, *_, device = *x.shape, x.device
731
+
732
+ def get_model_output(x, t):
733
+ if (
734
+ unconditional_conditioning is None
735
+ or unconditional_guidance_scale == 1.0
736
+ ):
737
+ e_t = self.model.apply_model(x, t, c1)
738
+ else:
739
+ x_in = torch.cat([x] * 2)
740
+ t_in = torch.cat([t] * 2)
741
+ if isinstance(c1, dict):
742
+ c_in = {key: [torch.cat([unconditional_conditioning[key][0], c1[key][0]])] for key in c1}
743
+ else:
744
+ c_in = torch.cat([unconditional_conditioning, c1])
745
+ e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
746
+ e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
747
+ return e_t
748
+
749
+ alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
750
+ alphas_prev = (
751
+ self.model.alphas_cumprod_prev
752
+ if use_original_steps
753
+ else self.ddim_alphas_prev
754
+ )
755
+ sqrt_one_minus_alphas = (
756
+ self.model.sqrt_one_minus_alphas_cumprod
757
+ if use_original_steps
758
+ else self.ddim_sqrt_one_minus_alphas
759
+ )
760
+ sigmas = (
761
+ self.model.ddim_sigmas_for_original_num_steps
762
+ if use_original_steps
763
+ else self.ddim_sigmas
764
+ )
765
+
766
+ def get_x_prev_and_pred_x0(e_t, index):
767
+ # select parameters corresponding to the currently considered timestep
768
+ a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
769
+ a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
770
+ sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
771
+ sqrt_one_minus_at = torch.full(
772
+ (b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device
773
+ )
774
+
775
+ # current prediction for x_0
776
+ pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
777
+ if quantize_denoised:
778
+ pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
779
+ # direction pointing to x_t
780
+ dir_xt = (1.0 - a_prev - sigma_t ** 2).sqrt() * e_t
781
+ time_curr = index * 20 + 1
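+ # maps the DDIM index back to a DDPM timestep (assumes 50 uniform steps over 1000 timesteps)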
782
+ # img_prev = torch.load(noise_save_path + "_image_time%d.pt" % (time_curr))
783
+ img_prev = noise_image
784
+ noise = img_prev - a_prev.sqrt() * pred_x0 - dir_xt
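+ # residual noise that makes the deterministic update land exactly on the cached noised image for this timestep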
785
+ # torch.save(noise, noise_save_path + "_time%d.pt" % (time_curr))
786
+
787
+ x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
788
+ return x_prev, pred_x0
789
+
790
+ e_t = get_model_output(x, t)
791
+ if len(old_eps) == 0:
792
+ # Pseudo Improved Euler (2nd order)
793
+ x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
794
+ e_t_next = get_model_output(x_prev, t_next)
795
+ e_t_prime = (e_t + e_t_next) / 2
796
+ elif len(old_eps) == 1:
797
+ # 2nd order Pseudo Linear Multistep (Adams-Bashforth)
798
+ e_t_prime = (3 * e_t - old_eps[-1]) / 2
799
+ elif len(old_eps) == 2:
800
+ # 3nd order Pseudo Linear Multistep (Adams-Bashforth)
801
+ e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
802
+ elif len(old_eps) >= 3:
803
+ # 4nd order Pseudo Linear Multistep (Adams-Bashforth)
804
+ e_t_prime = (
805
+ 55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]
806
+ ) / 24
807
+
808
+ x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)
809
+
810
+ return x_prev, pred_x0, e_t
811
+
812
+ ################## Encode Image End ###############################
813
+
814
+ def p_sample_plms_sampling(
815
+ self,
816
+ x,
817
+ c1,
818
+ c2,
819
+ t,
820
+ index,
821
+ repeat_noise=False,
822
+ use_original_steps=False,
823
+ quantize_denoised=False,
824
+ temperature=1.0,
825
+ noise_dropout=0.0,
826
+ score_corrector=None,
827
+ corrector_kwargs=None,
828
+ unconditional_guidance_scale=1.0,
829
+ unconditional_conditioning=None,
830
+ old_eps=None,
831
+ t_next=None,
832
+ input_image=None,
833
+ optimizing_weight=None,
834
+ noise_save_path=None,
835
+ ):
836
+ b, *_, device = *x.shape, x.device
837
+
838
+ def optimize_model_output(x, t):
839
+ # weight_for_pencil = torch.nn.Sigmoid()(optimizing_weight)
840
+ # condition = weight_for_pencil * c1 + (1 - weight_for_pencil) * c2
841
+ condition = optimizing_weight * c1 + (1 - optimizing_weight) * c2
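+ # interpolate the two text conditionings with the (learnable) per-step weight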
842
+ if (
843
+ unconditional_conditioning is None
844
+ or unconditional_guidance_scale == 1.0
845
+ ):
846
+ e_t = self.model.apply_model(x, t, condition)
847
+ else:
848
+ x_in = torch.cat([x] * 2)
849
+ t_in = torch.cat([t] * 2)
850
+ if isinstance(condition, dict):
851
+ c_in = {key: [torch.cat([unconditional_conditioning[key][0], condition[key][0]])] for key in condition}
852
+ else:
853
+ c_in = torch.cat([unconditional_conditioning, condition])
854
+ e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
855
+ e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
856
+ return e_t
857
+
858
+ alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
859
+ alphas_prev = (
860
+ self.model.alphas_cumprod_prev
861
+ if use_original_steps
862
+ else self.ddim_alphas_prev
863
+ )
864
+ sqrt_one_minus_alphas = (
865
+ self.model.sqrt_one_minus_alphas_cumprod
866
+ if use_original_steps
867
+ else self.ddim_sqrt_one_minus_alphas
868
+ )
869
+ sigmas = (
870
+ self.model.ddim_sigmas_for_original_num_steps
871
+ if use_original_steps
872
+ else self.ddim_sigmas
873
+ )
874
+
875
+ def get_x_prev_and_pred_x0(e_t, index):
876
+ # select parameters corresponding to the currently considered timestep
877
+ a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
878
+ a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
879
+ sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
880
+ sqrt_one_minus_at = torch.full(
881
+ (b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device
882
+ )
883
+
884
+ # current prediction for x_0
885
+ pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
886
+ if quantize_denoised:
887
+ pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
888
+ # direction pointing to x_t
889
+ dir_xt = (1.0 - a_prev - sigma_t ** 2).sqrt() * e_t
890
+ time_curr = index * 20 + 1
891
+ if noise_save_path and index > 16:
892
+ noise = torch.load(noise_save_path + "_time%d.pt" % (time_curr))[:1]
893
+ else:
894
+ noise = torch.zeros_like(dir_xt)
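+ # saved noise is injected only at the early, high-noise steps (index > 16); the remaining steps stay deterministic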
895
+ x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
896
+ return x_prev, pred_x0
897
+
898
+ e_t = optimize_model_output(x, t)
899
+ if len(old_eps) == 0:
900
+ # Pseudo Improved Euler (2nd order)
901
+ x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
902
+ # e_t_next = get_model_output(x_prev, t_next)
903
+ e_t_next = optimize_model_output(x_prev, t_next)
904
+ e_t_prime = (e_t + e_t_next) / 2
905
+ elif len(old_eps) == 1:
906
+ # 2nd order Pseudo Linear Multistep (Adams-Bashforth)
907
+ e_t_prime = (3 * e_t - old_eps[-1]) / 2
908
+ elif len(old_eps) == 2:
909
+ # 3nd order Pseudo Linear Multistep (Adams-Bashforth)
910
+ e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
911
+ elif len(old_eps) >= 3:
912
+ # 4nd order Pseudo Linear Multistep (Adams-Bashforth)
913
+ e_t_prime = (
914
+ 55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]
915
+ ) / 24
916
+
917
+ x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)
918
+
919
+ return x_prev, pred_x0, e_t
920
+
921
+ ################## Edit Input Image ###############################
922
+
923
+ def sample_optimize_intrinsic_edit(
924
+ self,
925
+ S,
926
+ batch_size,
927
+ shape,
928
+ conditioning1=None,
929
+ conditioning2=None,
930
+ callback=None,
931
+ normals_sequence=None,
932
+ img_callback=None,
933
+ quantize_x0=False,
934
+ eta=0.0,
935
+ mask=None,
936
+ x0=None,
937
+ temperature=1.0,
938
+ noise_dropout=0.0,
939
+ score_corrector=None,
940
+ corrector_kwargs=None,
941
+ verbose=True,
942
+ x_T=None,
943
+ log_every_t=100,
944
+ unconditional_guidance_scale=1.0,
945
+ unconditional_conditioning=None,
946
+ input_image=None,
947
+ noise_save_path=None,
948
+ lambda_t=None,
949
+ lambda_save_path=None,
950
+ image_save_path=None,
951
+ original_text=None,
952
+ new_text=None,
953
+ otext=None,
954
+ noise_saved_path=None,
955
+ # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
956
+ **kwargs,
957
+ ):
958
+ assert conditioning1 is not None
959
+ assert conditioning2 is not None
960
+
961
+ self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
962
+ # sampling
963
+ C, H, W = shape
964
+ size = (batch_size, C, H, W)
965
+ print(f"Data shape for PLMS sampling is {size}")
966
+
967
+ self.plms_sampling_optimize_intrinsic_edit(
968
+ conditioning1,
969
+ conditioning2,
970
+ size,
971
+ callback=callback,
972
+ img_callback=img_callback,
973
+ quantize_denoised=quantize_x0,
974
+ mask=mask,
975
+ x0=x0,
976
+ ddim_use_original_steps=False,
977
+ noise_dropout=noise_dropout,
978
+ temperature=temperature,
979
+ score_corrector=score_corrector,
980
+ corrector_kwargs=corrector_kwargs,
981
+ x_T=x_T,
982
+ log_every_t=log_every_t,
983
+ unconditional_guidance_scale=unconditional_guidance_scale,
984
+ unconditional_conditioning=unconditional_conditioning,
985
+ input_image=input_image,
986
+ noise_save_path=noise_save_path,
987
+ lambda_t=lambda_t,
988
+ lambda_save_path=lambda_save_path,
989
+ image_save_path=image_save_path,
990
+ original_text=original_text,
991
+ new_text=new_text,
992
+ otext=otext,
993
+ noise_saved_path=noise_saved_path,
994
+ )
995
+ return None
996
+
997
+ def plms_sampling_optimize_intrinsic_edit(
998
+ self,
999
+ cond1,
1000
+ cond2,
1001
+ shape,
1002
+ x_T=None,
1003
+ ddim_use_original_steps=False,
1004
+ callback=None,
1005
+ timesteps=None,
1006
+ quantize_denoised=False,
1007
+ mask=None,
1008
+ x0=None,
1009
+ img_callback=None,
1010
+ log_every_t=100,
1011
+ temperature=1.0,
1012
+ noise_dropout=0.0,
1013
+ score_corrector=None,
1014
+ corrector_kwargs=None,
1015
+ unconditional_guidance_scale=1.0,
1016
+ unconditional_conditioning=None,
1017
+ input_image=None,
1018
+ noise_save_path=None,
1019
+ lambda_t=None,
1020
+ lambda_save_path=None,
1021
+ image_save_path=None,
1022
+ original_text=None,
1023
+ new_text=None,
1024
+ otext=None,
1025
+ noise_saved_path=None,
1026
+ ):
1027
+ # Different from above, the intrinsic edit version additionally takes the noise tensors saved during encoding (noise_saved_path)
1028
+ device = self.model.betas.device
1029
+
1030
+ b = shape[0]
1031
+ if x_T is None:
1032
+ img = torch.randn(shape, device=device)
1033
+ else:
1034
+ img = x_T
1035
+ img_clone = img.clone()
1036
+
1037
+ if timesteps is None:
1038
+ timesteps = (
1039
+ self.ddpm_num_timesteps
1040
+ if ddim_use_original_steps
1041
+ else self.ddim_timesteps
1042
+ )
1043
+ elif timesteps is not None and not ddim_use_original_steps:
1044
+ subset_end = (
1045
+ int(
1046
+ min(timesteps / self.ddim_timesteps.shape[0], 1)
1047
+ * self.ddim_timesteps.shape[0]
1048
+ )
1049
+ - 1
1050
+ )
1051
+ timesteps = self.ddim_timesteps[:subset_end]
1052
+
1053
+ intermediates = {"x_inter": [img], "pred_x0": [img]}
1054
+ time_range = (
1055
+ list(reversed(range(0, timesteps)))
1056
+ if ddim_use_original_steps
1057
+ else np.flip(timesteps)
1058
+ )
1059
+
1060
+ weighting_parameter = lambda_t
1061
+ weighting_parameter.requires_grad = True
1062
+ from torch import optim
1063
+
1064
+ optimizer = optim.Adam([weighting_parameter], lr=0.05)
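+ # lambda_t is optimized directly with Adam; gradients flow through the whole sampling loop into the per-step weights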
1065
+
1066
+ print("Original image")
1067
+ with torch.no_grad():
1068
+ img = img_clone.clone()
1069
+ total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
1070
+ iterator = time_range
1071
+ old_eps = []
1072
+
1073
+ for i, step in enumerate(iterator):
1074
+ index = total_steps - i - 1
1075
+ ts = torch.full((b,), step, device=device, dtype=torch.long)
1076
+ ts_next = torch.full(
1077
+ (b,),
1078
+ time_range[min(i + 1, len(time_range) - 1)],
1079
+ device=device,
1080
+ dtype=torch.long,
1081
+ )
1082
+
1083
+ outs = self.p_sample_plms_sampling(
1084
+ img,
1085
+ cond1,
1086
+ cond2,
1087
+ ts,
1088
+ index=index,
1089
+ use_original_steps=ddim_use_original_steps,
1090
+ quantize_denoised=quantize_denoised,
1091
+ temperature=temperature,
1092
+ noise_dropout=noise_dropout,
1093
+ score_corrector=score_corrector,
1094
+ corrector_kwargs=corrector_kwargs,
1095
+ unconditional_guidance_scale=unconditional_guidance_scale,
1096
+ unconditional_conditioning=unconditional_conditioning,
1097
+ old_eps=old_eps,
1098
+ t_next=ts_next,
1099
+ input_image=input_image,
1100
+ optimizing_weight=torch.ones(50)[i],
1101
+ noise_save_path=noise_saved_path,
1102
+ )
1103
+ img, pred_x0, e_t = outs
1104
+ old_eps.append(e_t)
1105
+ if len(old_eps) >= 4:
1106
+ old_eps.pop(0)
1107
+ img_temp = self.model.decode_first_stage(img)
1108
+ img_temp_ddim = torch.clamp((img_temp + 1.0) / 2.0, min=0.0, max=1.0)
1109
+ img_temp_ddim = img_temp_ddim.cpu().permute(0, 2, 3, 1).permute(0, 3, 1, 2)
1110
+ # save image
1111
+ with torch.no_grad():
1112
+ x_sample = 255.0 * rearrange(
1113
+ img_temp_ddim[0].detach().cpu().numpy(), "c h w -> h w c"
1114
+ )
1115
+ imgsave = Image.fromarray(x_sample.astype(np.uint8))
1116
+ imgsave.save(image_save_path + "original.png")
1117
+ readed_image = (
1118
+ torchvision.io.read_image(image_save_path + "original.png").float()
1119
+ / 255
1120
+ )
1121
+ print("Optimizing start")
1122
+ for epoch in tqdm(range(10)):
1123
+ img = img_clone.clone()
1124
+ total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
1125
+ iterator = time_range
1126
+ old_eps = []
1127
+
1128
+ for i, step in enumerate(iterator):
1129
+ index = total_steps - i - 1
1130
+ ts = torch.full((b,), step, device=device, dtype=torch.long)
1131
+ ts_next = torch.full(
1132
+ (b,),
1133
+ time_range[min(i + 1, len(time_range) - 1)],
1134
+ device=device,
1135
+ dtype=torch.long,
1136
+ )
1137
+
1138
+ outs = self.p_sample_plms_sampling(
1139
+ img,
1140
+ cond1,
1141
+ cond2,
1142
+ ts,
1143
+ index=index,
1144
+ use_original_steps=ddim_use_original_steps,
1145
+ quantize_denoised=quantize_denoised,
1146
+ temperature=temperature,
1147
+ noise_dropout=noise_dropout,
1148
+ score_corrector=score_corrector,
1149
+ corrector_kwargs=corrector_kwargs,
1150
+ unconditional_guidance_scale=unconditional_guidance_scale,
1151
+ unconditional_conditioning=unconditional_conditioning,
1152
+ old_eps=old_eps,
1153
+ t_next=ts_next,
1154
+ input_image=input_image,
1155
+ optimizing_weight=weighting_parameter[i],
1156
+ noise_save_path=noise_saved_path,
1157
+ )
1158
+ img, pred_x0, e_t = outs
1159
+ old_eps.append(e_t)
1160
+ if len(old_eps) >= 4:
1161
+ old_eps.pop(0)
1162
+ img_temp = self.model.decode_first_stage(img)
1163
+ img_temp_ddim = torch.clamp((img_temp + 1.0) / 2.0, min=0.0, max=1.0)
1164
+ img_temp_ddim = img_temp_ddim.cpu()
1165
+
1166
+ # save image
1167
+ # with torch.no_grad():
1168
+ # x_sample = 255.0 * rearrange(
1169
+ # img_temp_ddim[0].detach().cpu().numpy(), "c h w -> h w c"
1170
+ # )
1171
+ # imgsave = Image.fromarray(x_sample.astype(np.uint8))
1172
+ # imgsave.save(image_save_path + "/%d.png" % (epoch))
1173
+
1174
+ loss1 = VGGPerceptualLoss()(img_temp_ddim[0], readed_image)
1175
+ loss2 = DCLIPLoss()(
1176
+ readed_image, img_temp_ddim[0].float().cuda(), otext, new_text
1177
+ )
1178
+ loss = 0.05 * loss1 + loss2
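+ # perceptual term keeps the edit close to the original render; directional CLIP term pushes it toward the new text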
1179
+ optimizer.zero_grad()
1180
+ loss.backward()
1181
+ optimizer.step()
1182
+ # torch.save(
1183
+ # weighting_parameter, lambda_save_path + "/weightingParam%d.pt" % (epoch)
1184
+ # )
1185
+ if epoch < 9:
1186
+ del img
1187
+ else:
1188
+ # save image
1189
+ with torch.no_grad():
1190
+ x_sample = 255.0 * rearrange(
1191
+ img_temp_ddim[0].detach().cpu().numpy(), "c h w -> h w c"
1192
+ )
1193
+ imgsave = Image.fromarray(x_sample.astype(np.uint8))
1194
+ imgsave.save(image_save_path + "/final.png")
1195
+ torch.save(
1196
+ weighting_parameter, lambda_save_path + "/weightingParam_final.pt"
1197
+ )
1198
+
1199
+ torch.cuda.empty_cache()
1200
+ # shutil.rmtree("noise")
1201
+ return None
1202
+
1203
+ ################ Edit Image End ######################
1204
+
1205
+ ################ Disentangle #########################
1206
+
1207
+ def sample_optimize_intrinsic(
1208
+ self,
1209
+ S,
1210
+ batch_size,
1211
+ shape,
1212
+ conditioning1=None,
1213
+ conditioning2=None,
1214
+ callback=None,
1215
+ normals_sequence=None,
1216
+ img_callback=None,
1217
+ quantize_x0=False,
1218
+ eta=0.0,
1219
+ mask=None,
1220
+ x0=None,
1221
+ temperature=1.0,
1222
+ noise_dropout=0.0,
1223
+ score_corrector=None,
1224
+ corrector_kwargs=None,
1225
+ verbose=True,
1226
+ x_T=None,
1227
+ log_every_t=100,
1228
+ unconditional_guidance_scale=1.0,
1229
+ unconditional_conditioning=None,
1230
+ input_image=None,
1231
+ noise_save_path=None,
1232
+ lambda_t=None,
1233
+ lambda_save_path=None,
1234
+ image_save_path=None,
1235
+ original_text=None,
1236
+ new_text=None,
1237
+ otext=None,
1238
+ # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
1239
+ **kwargs,
1240
+ ):
1241
+ assert conditioning1 is not None
1242
+ assert conditioning2 is not None
1243
+
1244
+ self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
1245
+ # sampling
1246
+ C, H, W = shape
1247
+ size = (batch_size, C, H, W)
1248
+ print(f"Data shape for PLMS sampling is {size}")
1249
+
1250
+ self.plms_sampling_optimize_intrinsic(
1251
+ conditioning1,
1252
+ conditioning2,
1253
+ size,
1254
+ callback=callback,
1255
+ img_callback=img_callback,
1256
+ quantize_denoised=quantize_x0,
1257
+ mask=mask,
1258
+ x0=x0,
1259
+ ddim_use_original_steps=False,
1260
+ noise_dropout=noise_dropout,
1261
+ temperature=temperature,
1262
+ score_corrector=score_corrector,
1263
+ corrector_kwargs=corrector_kwargs,
1264
+ x_T=x_T,
1265
+ log_every_t=log_every_t,
1266
+ unconditional_guidance_scale=unconditional_guidance_scale,
1267
+ unconditional_conditioning=unconditional_conditioning,
1268
+ input_image=input_image,
1269
+ noise_save_path=noise_save_path,
1270
+ lambda_t=lambda_t,
1271
+ lambda_save_path=lambda_save_path,
1272
+ image_save_path=image_save_path,
1273
+ original_text=original_text,
1274
+ new_text=new_text,
1275
+ otext=otext,
1276
+ )
1277
+ return None
1278
+
1279
+ def plms_sampling_optimize_intrinsic(
1280
+ self,
1281
+ cond1,
1282
+ cond2,
1283
+ shape,
1284
+ x_T=None,
1285
+ ddim_use_original_steps=False,
1286
+ callback=None,
1287
+ timesteps=None,
1288
+ quantize_denoised=False,
1289
+ mask=None,
1290
+ x0=None,
1291
+ img_callback=None,
1292
+ log_every_t=100,
1293
+ temperature=1.0,
1294
+ noise_dropout=0.0,
1295
+ score_corrector=None,
1296
+ corrector_kwargs=None,
1297
+ unconditional_guidance_scale=1.0,
1298
+ unconditional_conditioning=None,
1299
+ input_image=None,
1300
+ noise_save_path=None,
1301
+ lambda_t=None,
1302
+ lambda_save_path=None,
1303
+ image_save_path=None,
1304
+ original_text=None,
1305
+ new_text=None,
1306
+ otext=None,
1307
+ ):
1308
+ device = self.model.betas.device
1309
+
1310
+ b = shape[0]
1311
+ if x_T is None:
1312
+ img = torch.randn(shape, device=device)
1313
+ else:
1314
+ img = x_T
1315
+ img_clone = img.clone()
1316
+
1317
+ if timesteps is None:
1318
+ timesteps = (
1319
+ self.ddpm_num_timesteps
1320
+ if ddim_use_original_steps
1321
+ else self.ddim_timesteps
1322
+ )
1323
+ elif timesteps is not None and not ddim_use_original_steps:
1324
+ subset_end = (
1325
+ int(
1326
+ min(timesteps / self.ddim_timesteps.shape[0], 1)
1327
+ * self.ddim_timesteps.shape[0]
1328
+ )
1329
+ - 1
1330
+ )
1331
+ timesteps = self.ddim_timesteps[:subset_end]
1332
+
1333
+ time_range = (
1334
+ list(reversed(range(0, timesteps)))
1335
+ if ddim_use_original_steps
1336
+ else np.flip(timesteps)
1337
+ )
1338
+ weighting_parameter = lambda_t
1339
+ weighting_parameter.requires_grad = True
1340
+ from torch import optim
1341
+
1342
+ optimizer = optim.Adam([weighting_parameter], lr=0.05)
1343
+
1344
+ print("Original image")
1345
+ with torch.no_grad():
1346
+ img = img_clone.clone()
1347
+ total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
1348
+ iterator = time_range
1349
+ old_eps = []
1350
+
1351
+ for i, step in enumerate(iterator):
1352
+ index = total_steps - i - 1
1353
+ ts = torch.full((b,), step, device=device, dtype=torch.long)
1354
+ ts_next = torch.full(
1355
+ (b,),
1356
+ time_range[min(i + 1, len(time_range) - 1)],
1357
+ device=device,
1358
+ dtype=torch.long,
1359
+ )
1360
+
1361
+ outs = self.p_sample_plms_sampling(
1362
+ img,
1363
+ cond1,
1364
+ cond2,
1365
+ ts,
1366
+ index=index,
1367
+ use_original_steps=ddim_use_original_steps,
1368
+ quantize_denoised=quantize_denoised,
1369
+ temperature=temperature,
1370
+ noise_dropout=noise_dropout,
1371
+ score_corrector=score_corrector,
1372
+ corrector_kwargs=corrector_kwargs,
1373
+ unconditional_guidance_scale=unconditional_guidance_scale,
1374
+ unconditional_conditioning=unconditional_conditioning,
1375
+ old_eps=old_eps,
1376
+ t_next=ts_next,
1377
+ input_image=input_image,
1378
+ optimizing_weight=torch.ones(50)[i],
1379
+ noise_save_path=noise_save_path,
1380
+ )
1381
+ img, pred_x0, e_t = outs
1382
+ old_eps.append(e_t)
1383
+ if len(old_eps) >= 4:
1384
+ old_eps.pop(0)
1385
+ img_temp = self.model.decode_first_stage(img)
1386
+ del img
1387
+ img_temp_ddim = torch.clamp((img_temp + 1.0) / 2.0, min=0.0, max=1.0)
1388
+ img_temp_ddim = img_temp_ddim.cpu().permute(0, 2, 3, 1).permute(0, 3, 1, 2)
1389
+ # save image
1390
+ with torch.no_grad():
1391
+ x_sample = 255.0 * rearrange(
1392
+ img_temp_ddim[0].detach().cpu().numpy(), "c h w -> h w c"
1393
+ )
1394
+ imgsave = Image.fromarray(x_sample.astype(np.uint8))
1395
+ imgsave.save(image_save_path + "original.png")
1396
+
1397
+ readed_image = (
1398
+ torchvision.io.read_image(image_save_path + "original.png").float()
1399
+ / 255
1400
+ )
1401
+
1402
+ print("Optimizing start")
1403
+ for epoch in tqdm(range(10)):
1404
+ img = img_clone.clone()
1405
+ total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
1406
+ iterator = time_range
1407
+ old_eps = []
1408
+
1409
+ for i, step in enumerate(iterator):
1410
+ index = total_steps - i - 1
1411
+ ts = torch.full((b,), step, device=device, dtype=torch.long)
1412
+ ts_next = torch.full(
1413
+ (b,),
1414
+ time_range[min(i + 1, len(time_range) - 1)],
1415
+ device=device,
1416
+ dtype=torch.long,
1417
+ )
1418
+
1419
+ outs = self.p_sample_plms_sampling(
1420
+ img,
1421
+ cond1,
1422
+ cond2,
1423
+ ts,
1424
+ index=index,
1425
+ use_original_steps=ddim_use_original_steps,
1426
+ quantize_denoised=quantize_denoised,
1427
+ temperature=temperature,
1428
+ noise_dropout=noise_dropout,
1429
+ score_corrector=score_corrector,
1430
+ corrector_kwargs=corrector_kwargs,
1431
+ unconditional_guidance_scale=unconditional_guidance_scale,
1432
+ unconditional_conditioning=unconditional_conditioning,
1433
+ old_eps=old_eps,
1434
+ t_next=ts_next,
1435
+ input_image=input_image,
1436
+ optimizing_weight=weighting_parameter[i],
1437
+ noise_save_path=noise_save_path,
1438
+ )
1439
+ img, _, e_t = outs
1440
+ old_eps.append(e_t)
1441
+ if len(old_eps) >= 4:
1442
+ old_eps.pop(0)
1443
+ img_temp = self.model.decode_first_stage(img)
1444
+ del img
1445
+ img_temp_ddim = torch.clamp((img_temp + 1.0) / 2.0, min=0.0, max=1.0)
1446
+ img_temp_ddim = img_temp_ddim.cpu()
1447
+
1448
+ # # save image
1449
+ # with torch.no_grad():
1450
+ # x_sample = 255. * rearrange(img_temp_ddim[0].detach().cpu().numpy(), 'c h w -> h w c')
1451
+ # imgsave = Image.fromarray(x_sample.astype(np.uint8))
1452
+ # imgsave.save(image_save_path + "/%d.png"%(epoch))
1453
+
1454
+ loss1 = VGGPerceptualLoss()(img_temp_ddim[0], readed_image)
1455
+ loss2 = DCLIPLoss()(
1456
+ readed_image, img_temp_ddim[0].float().cuda(), otext, new_text
1457
+ )
1458
+ loss = (
1459
+ 0.05 * loss1 + loss2
1460
+ ) # 0.05 or 0.03. Adjust according to attributes on scenes or people.
1461
+ optimizer.zero_grad()
1462
+ loss.backward()
1463
+ optimizer.step()
1464
+ # torch.save(weighting_parameter, lambda_save_path+"/weightingParam%d.pt"%(epoch))
1465
+ with torch.no_grad():
1466
+ if epoch == 9:
1467
+ # save image
1468
+ x_sample = 255.0 * rearrange(
1469
+ img_temp_ddim[0].detach().cpu().numpy(), "c h w -> h w c"
1470
+ )
1471
+ imgsave = Image.fromarray(x_sample.astype(np.uint8))
1472
+ imgsave.save(image_save_path + "/final.png")
1473
+ torch.save(
1474
+ weighting_parameter,
1475
+ lambda_save_path + "/weightingParam_final.pt",
1476
+ )
1477
+ torch.cuda.empty_cache()
1478
+ return None
1479
+
1480
+
1481
+ ################ Disentangle End #########################
cldm/tmp.py ADDED
@@ -0,0 +1,340 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ use kornia and albumentations for transformations
5
+ @author: Tu Bui @University of Surrey
6
+ """
7
+ import os
8
+ from . import utils
9
+ import torch
10
+ import numpy as np
11
+ from torch import nn
12
+ import torch.nn.functional as F
13
+ from PIL import Image
14
+ import kornia as ko
15
+ import albumentations as ab
16
+
17
+
18
+ class IdentityAugment(nn.Module):
19
+ def __init__(self):
20
+ super().__init__()
21
+
22
+ def forward(self, x, **kwargs):
23
+ return x
24
+
25
+
26
+ class RandomCompress(nn.Module):
27
+ def __init__(self, severity='medium', p=0.5):
28
+ super().__init__()
29
+ self.p = p
30
+ if severity == 'low':
31
+ self.jpeg_quality = 70
32
+ elif severity == 'medium':
33
+ self.jpeg_quality = 50
34
+ elif severity == 'high':
35
+ self.jpeg_quality = 40
36
+
37
+ def forward(self, x, ramp=1.):
38
+ # x (B, C, H, W) in range [0, 1]
39
+ # ramp: adjust the ramping of the compression, 1.0 means min quality = self.jpeg_quality
40
+ if torch.rand(1)[0] >= self.p:
41
+ return x
42
+ jpeg_quality = 100. - torch.rand(1)[0] * ramp * (100. - self.jpeg_quality)
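+ # sample a quality in [self.jpeg_quality, 100]; ramp < 1 keeps early-training compression mild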
43
+ x = utils.jpeg_compress_decompress(x, rounding=utils.round_only_at_0, quality=jpeg_quality)
44
+ return x
45
+
46
+
47
+ class RandomBoxBlur(nn.Module):
48
+ def __init__(self, severity='medium', border_type='reflect', normalize=True, p=0.5):
49
+ super().__init__()
50
+ self.p = p
51
+ if severity == 'low':
52
+ kernel_size = 3
53
+ elif severity == 'medium':
54
+ kernel_size = 5
55
+ elif severity == 'high':
56
+ kernel_size = 7
57
+
58
+ self.tform = ko.augmentation.RandomBoxBlur(kernel_size=(kernel_size, kernel_size), border_type=border_type, normalize=normalize, p=self.p)
59
+
60
+ def forward(self, x, **kwargs):
61
+ return self.tform(x)
62
+
63
+ class RandomMedianBlur(nn.Module):
64
+ def __init__(self, severity='medium', p=0.5):
65
+ super().__init__()
66
+ self.p = p
67
+ self.tform = ko.augmentation.RandomMedianBlur(kernel_size=(3,3), p=p)
68
+
69
+ def forward(self, x, **kwargs):
70
+ return self.tform(x)
71
+
72
+
73
+ class RandomBrightness(nn.Module):
74
+ def __init__(self, severity='medium', p=0.5):
75
+ super().__init__()
76
+ self.p = p
77
+ if severity == 'low':
78
+ brightness = (0.9, 1.1)
79
+ elif severity == 'medium':
80
+ brightness = (0.75, 1.25)
81
+ elif severity == 'high':
82
+ brightness = (0.5, 1.5)
83
+ self.tform = ko.augmentation.RandomBrightness(brightness=brightness, p=p)
84
+
85
+ def forward(self, x, **kwargs):
86
+ return self.tform(x)
87
+
88
+
89
+ class RandomContrast(nn.Module):
90
+ def __init__(self, severity='medium', p=0.5):
91
+ super().__init__()
92
+ self.p = p
93
+ if severity == 'low':
94
+ contrast = (0.9, 1.1)
95
+ elif severity == 'medium':
96
+ contrast = (0.75, 1.25)
97
+ elif severity == 'high':
98
+ contrast = (0.5, 1.5)
99
+ self.tform = ko.augmentation.RandomContrast(contrast=contrast, p=p)
100
+
101
+ def forward(self, x, **kwargs):
102
+ return self.tform(x)
103
+
104
+
105
+ class RandomSaturation(nn.Module):
106
+ def __init__(self, severity='medium', p=0.5):
107
+ super().__init__()
108
+ self.p = p
109
+ if severity == 'low':
110
+ sat = (0.9, 1.1)
111
+ elif severity == 'medium':
112
+ sat = (0.75, 1.25)
113
+ elif severity == 'high':
114
+ sat = (0.5, 1.5)
115
+ self.tform = ko.augmentation.RandomSaturation(saturation=sat, p=p)
116
+
117
+ def forward(self, x, **kwargs):
118
+ return self.tform(x)
119
+
120
+ class RandomSharpness(nn.Module):
121
+ def __init__(self, severity='medium', p=0.5):
122
+ super().__init__()
123
+ self.p = p
124
+ if severity == 'low':
125
+ sharpness = 0.5
126
+ elif severity == 'medium':
127
+ sharpness = 1.0
128
+ elif severity == 'high':
129
+ sharpness = 2.5
130
+ self.tform = ko.augmentation.RandomSharpness(sharpness=sharpness, p=p)
131
+
132
+ def forward(self, x, **kwargs):
133
+ return self.tform(x)
134
+
135
+ class RandomColorJiggle(nn.Module):
136
+ def __init__(self, severity='medium', p=0.5):
137
+ super().__init__()
138
+ self.p = p
139
+ if severity == 'low':
140
+ factor = (0.05, 0.05, 0.05, 0.01)
141
+ elif severity == 'medium':
142
+ factor = (0.1, 0.1, 0.1, 0.02)
143
+ elif severity == 'high':
144
+ factor = (0.1, 0.1, 0.1, 0.05)
145
+ self.tform = ko.augmentation.ColorJiggle(*factor, p=p)
146
+
147
+ def forward(self, x, **kwargs):
148
+ return self.tform(x)
149
+
150
+ class RandomHue(nn.Module):
151
+ def __init__(self, severity='medium', p=0.5):
152
+ super().__init__()
153
+ self.p = p
154
+ if severity == 'low':
155
+ hue = 0.01
156
+ elif severity == 'medium':
157
+ hue = 0.02
158
+ elif severity == 'high':
159
+ hue = 0.05
160
+ self.tform = ko.augmentation.RandomHue(hue=(-hue, hue), p=p)
161
+
162
+ def forward(self, x, **kwargs):
163
+ return self.tform(x)
164
+
165
+ class RandomGamma(nn.Module):
166
+ def __init__(self, severity='medium', p=0.5):
167
+ super().__init__()
168
+ self.p = p
169
+ if severity == 'low':
170
+ gamma, gain = (0.9, 1.1), (0.9,1.1)
171
+ elif severity == 'medium':
172
+ gamma, gain = (0.75, 1.25), (0.75,1.25)
173
+ elif severity == 'high':
174
+ gamma, gain = (0.5, 1.5), (0.5,1.5)
175
+ self.tform = ko.augmentation.RandomGamma(gamma, gain, p=p)
176
+
177
+ def forward(self, x, **kwargs):
178
+ return self.tform(x)
179
+
180
+ class RandomGaussianBlur(nn.Module):
181
+ def __init__(self, severity='medium', p=0.5):
182
+ super().__init__()
183
+ self.p = p
184
+ if severity == 'low':
185
+ kernel_size, sigma = 3, (0.1, 1.0)
186
+ elif severity == 'medium':
187
+ kernel_size, sigma = 5, (0.1, 1.5)
188
+ elif severity == 'high':
189
+ kernel_size, sigma = 7, (0.1, 2.0)
190
+ self.tform = ko.augmentation.RandomGaussianBlur(kernel_size=(kernel_size, kernel_size), sigma=sigma, p=self.p)
191
+
192
+ def forward(self, x, **kwargs):
193
+ return self.tform(x)
194
+
195
+ class RandomGaussianNoise(nn.Module):
196
+ def __init__(self, severity='medium', p=0.5):
197
+ super().__init__()
198
+ self.p = p
199
+ if severity == 'low':
200
+ std = 0.02
201
+ elif severity == 'medium':
202
+ std = 0.04
203
+ elif severity == 'high':
204
+ std = 0.08
205
+ self.tform = ko.augmentation.RandomGaussianNoise(mean=0., std=std, p=p)
206
+
207
+ def forward(self, x, **kwargs):
208
+ return self.tform(x)
209
+
210
+ class RandomMotionBlur(nn.Module):
211
+ def __init__(self, severity='medium', p=0.5):
212
+ super().__init__()
213
+ self.p = p
214
+ if severity == 'low':
215
+ kernel_size, angle, direction = (3, 5), (-25, 25), (-0.25, 0.25)
216
+ elif severity == 'medium':
217
+ kernel_size, angle, direction = (3, 7), (-45, 45), (-0.5, 0.5)
218
+ elif severity == 'high':
219
+ kernel_size, angle, direction = (3, 9), (-90, 90), (-1.0, 1.0)
220
+ self.tform = ko.augmentation.RandomMotionBlur(kernel_size, angle, direction, p=p)
221
+
222
+ def forward(self, x, **kwargs):
223
+ return self.tform(x)
224
+
225
+ class RandomPosterize(nn.Module):
226
+ def __init__(self, severity='medium', p=0.5):
227
+ super().__init__()
228
+ self.p = p
229
+ if severity == 'low':
230
+ bits = 5
231
+ elif severity == 'medium':
232
+ bits = 4
233
+ elif severity == 'high':
234
+ bits = 3
235
+ self.tform = ko.augmentation.RandomPosterize(bits=bits, p=p)
236
+
237
+ def forward(self, x, **kwargs):
238
+ return self.tform(x)
239
+
240
+ class RandomRGBShift(nn.Module):
241
+ def __init__(self, severity='medium', p=0.5):
242
+ super().__init__()
243
+ self.p = p
244
+ if severity == 'low':
245
+ rgb = 0.02
246
+ elif severity == 'medium':
247
+ rgb = 0.05
248
+ elif severity == 'high':
249
+ rgb = 0.1
250
+ self.tform = ko.augmentation.RandomRGBShift(r_shift_limit=rgb, g_shift_limit=rgb, b_shift_limit=rgb, p=p)
251
+
252
+ def forward(self, x, **kwargs):
253
+ return self.tform(x)
254
+
255
+
256
+
257
+ class TransformNet(nn.Module):
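+ # Noise/augmentation layer: always applies flip + crop, then n_optional randomly picked distortions per call.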
258
+ def __init__(self, flip=True, crop_mode='random_crop', compress=True, brightness=True, contrast=True, color_jiggle=True, gamma=True, grayscale=True, gaussian_blur=True, gaussian_noise=True, hue=True, motion_blur=True, posterize=True, rgb_shift=True, saturation=True, sharpness=True, median_blur=True, severity='medium', n_optional=2, ramp=1000, p=0.5):
259
+ super().__init__()
260
+ self.n_optional = n_optional
261
+ self.p = p
262
+ p_flip = 0.5 if flip else 0
263
+ rnd_flip_layer = ko.augmentation.RandomHorizontalFlip(p_flip)
264
+ self.ramp = ramp
265
+ self.register_buffer('step0', torch.tensor(0))
266
+
267
+ assert crop_mode in ['random_crop', 'resized_crop']
268
+ if crop_mode == 'random_crop':
269
+ rnd_crop_layer = ko.augmentation.RandomCrop((224,224), cropping_mode="resample")
270
+ elif crop_mode == 'resized_crop':
271
+ rnd_crop_layer = ko.augmentation.RandomResizedCrop(size=(224,224), scale=(0.7, 1.0), ratio=(3.0/4, 4.0/3), cropping_mode='resample')
272
+
273
+ self.fixed_transforms = [rnd_flip_layer, rnd_crop_layer]
274
+ self.optional_transforms = []
275
+ if compress:
276
+ self.optional_transforms.append(RandomCompress(severity, p=p))
277
+ if brightness:
278
+ self.optional_transforms.append(RandomBrightness(severity, p=p))
279
+ if contrast:
280
+ self.optional_transforms.append(RandomContrast(severity, p=p))
281
+ if color_jiggle:
282
+ self.optional_transforms.append(RandomColorJiggle(severity, p=p))
283
+ if gamma:
284
+ self.optional_transforms.append(RandomGamma(severity, p=p))
285
+ if grayscale:
286
+ self.optional_transforms.append(ko.augmentation.RandomGrayscale(p=p/4))
287
+ if gaussian_blur:
288
+ self.optional_transforms.append(RandomGaussianBlur(severity, p=p))
289
+ if gaussian_noise:
290
+ self.optional_transforms.append(RandomGaussianNoise(severity, p=p))
291
+ if hue:
292
+ self.optional_transforms.append(RandomHue(severity, p=p))
293
+ if motion_blur:
294
+ self.optional_transforms.append(RandomMotionBlur(severity, p=p))
295
+ if posterize:
296
+ self.optional_transforms.append(RandomPosterize(severity, p=p))
297
+ if rgb_shift:
298
+ self.optional_transforms.append(RandomRGBShift(severity, p=p))
299
+ if saturation:
300
+ self.optional_transforms.append(RandomSaturation(severity, p=p))
301
+ if sharpness:
302
+ self.optional_transforms.append(RandomSharpness(severity, p=p))
303
+ if median_blur:
304
+ self.optional_transforms.append(RandomMedianBlur(severity, p=p))
305
+
306
+ def activate(self, global_step):
307
+ if self.step0 == 0:
308
+ print(f'[TRAINING] Activating TransformNet at step {global_step}')
309
+ self.step0 = torch.tensor(global_step)
310
+
311
+ def is_activated(self):
312
+ return self.step0 > 0
313
+
314
+ def forward(self, x, global_step, p=0.9):
315
+ # x: [batch_size, 3, H, W] in range [-1, 1]
316
+ x = x * 0.5 + 0.5 # [-1, 1] -> [0, 1]
317
+ # fixed transforms
318
+ for tform in self.fixed_transforms:
319
+ x = tform(x)
320
+ if isinstance(x, tuple):
321
+ x = x[0]
322
+
323
+ # optional transforms
324
+ ramp = np.min([(global_step-self.step0.cpu().item()) / self.ramp, 1.])
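+ # ramp grows linearly from 0 to 1 over self.ramp steps after activation (only RandomCompress actually uses it)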
325
+ try:
326
+ if len(self.optional_transforms) > 0:
327
+ tform_ids = torch.randint(len(self.optional_transforms), (self.n_optional,)).numpy()
328
+ for tform_id in tform_ids:
329
+ tform = self.optional_transforms[tform_id]
330
+ x = tform(x, ramp=ramp)
331
+ if isinstance(x, tuple):
332
+ x = x[0]
333
+ except Exception as e:
334
+ print(tform_id, ramp)
335
+ import pdb; pdb.set_trace()
336
+ return x * 2 - 1 # [0, 1] -> [-1, 1]
337
+
338
+
339
+ if __name__ == '__main__':
340
+ pass
cldm/transformations.py ADDED
@@ -0,0 +1,127 @@
1
+ import os
2
+ from . import utils
3
+ import torch
4
+ import numpy as np
5
+ from torch import nn
6
+ import torch.nn.functional as F
7
+ from tools.augment_imagenetc import RandomImagenetC
8
+ from PIL import Image
9
+ import kornia as ko
10
+ # from kornia.augmentation import RandomHorizontalFlip, RandomCrop
11
+
12
+
13
+ class TransformNet(nn.Module):
14
+ def __init__(self, rnd_bri=0.3, rnd_hue=0.1, do_jpeg=False, jpeg_quality=50, rnd_noise=0.02, rnd_sat=1.0, rnd_trans=0.1,contrast=[0.5, 1.5], rnd_flip=False, ramp=1000, imagenetc_level=0, crop_mode='crop') -> None:
15
+ super().__init__()
16
+ self.rnd_bri = rnd_bri
17
+ self.rnd_hue = rnd_hue
18
+ self.jpeg_quality = jpeg_quality
19
+ self.rnd_noise = rnd_noise
20
+ self.rnd_sat = rnd_sat
21
+ self.rnd_trans = rnd_trans
22
+ self.contrast_low, self.contrast_high = contrast
23
+ self.do_jpeg = do_jpeg
24
+ p_flip = 0.5 if rnd_flip else 0
25
+ self.rnd_flip = ko.augmentation.RandomHorizontalFlip(p_flip)
26
+ self.ramp = ramp
27
+ self.register_buffer('step0', torch.tensor(0)) # 0 until activate() records the activation step
28
+ assert crop_mode in ['crop', 'resized_crop']
29
+ if crop_mode == 'crop':
30
+ self.rnd_crop = ko.augmentation.RandomCrop((224,224), cropping_mode="resample")
31
+ elif crop_mode == 'resized_crop':
32
+ self.rnd_crop = ko.augmentation.RandomResizedCrop(size=(224,224), scale=(0.7, 1.0), ratio=(3.0/4, 4.0/3), cropping_mode='resample')
33
+ if imagenetc_level > 0:
34
+ self.imagenetc = ImagenetCTransform(max_severity=imagenetc_level)
35
+
36
+ def activate(self, global_step):
37
+ if self.step0 == 0:
38
+ print(f'[TRAINING] Activating TransformNet at step {global_step}')
39
+ self.step0 = torch.tensor(global_step)
40
+
41
+ def is_activated(self):
42
+ return self.step0 > 0
43
+
44
+ def forward(self, x, global_step, p=0.9):
45
+ # x: [batch_size, 3, H, W] in range [-1, 1]
46
+ x = x * 0.5 + 0.5 # [-1, 1] -> [0, 1]
47
+
48
+ # flip
49
+ x = self.rnd_flip(x)
50
+ # random crop
51
+ x = self.rnd_crop(x)
52
+ if isinstance(x, tuple):
53
+ x = x[0] # weird bug in kornia 0.6.0 that returns transform matrix occasionally
54
+
55
+ if torch.rand(1)[0] >= p:
56
+ return x * 2 - 1 # [0, 1] -> [-1, 1]
57
+ if hasattr(self, 'imagenetc') and torch.rand(1)[0] < 0.5:
58
+ x = self.imagenetc(x * 2 - 1) # [0, 1] -> [-1, 1])
59
+ return x
60
+
61
+ batch_size, sh, device = x.shape[0], x.size(), x.device
62
+ # x0 = x.clone().detach()
63
+ ramp_fn = lambda ramp: np.min([(global_step-self.step0.cpu().item()) / ramp, 1.])
64
+
65
+ rnd_bri = ramp_fn(self.ramp) * self.rnd_bri
66
+ rnd_hue = ramp_fn(self.ramp) * self.rnd_hue
67
+ rnd_brightness = utils.get_rnd_brightness_torch(rnd_bri, rnd_hue, batch_size).to(device) # [batch_size, 3, 1, 1]
68
+ rnd_noise = torch.rand(1)[0] * ramp_fn(self.ramp) * self.rnd_noise
69
+
70
+ contrast_low = 1. - (1. - self.contrast_low) * ramp_fn(self.ramp)
71
+ contrast_high = 1. + (self.contrast_high - 1.) * ramp_fn(self.ramp)
72
+ contrast_params = [contrast_low, contrast_high]
73
+
74
+ # blur
75
+ N_blur = 7
76
+ f = utils.random_blur_kernel(probs=[.25, .25], N_blur=N_blur, sigrange_gauss=[1., 3.], sigrange_line=[.25, 1.],
77
+ wmin_line=3).to(device)
78
+ x = F.conv2d(x, f, bias=None, padding=int((N_blur - 1) / 2))
79
+
80
+ # noise
81
+ noise = torch.normal(mean=0, std=rnd_noise, size=x.size(), dtype=torch.float32).to(device)
82
+ x = x + noise
83
+ x = torch.clamp(x, 0, 1)
84
+
85
+ # contrast & brightness
86
+ contrast_scale = torch.Tensor(x.size()[0]).uniform_(contrast_params[0], contrast_params[1])
87
+ contrast_scale = contrast_scale.reshape(x.size()[0], 1, 1, 1).to(device)
88
+ x = x * contrast_scale
89
+ x = x + rnd_brightness
90
+ x = torch.clamp(x, 0, 1)
91
+
92
+ # saturation
93
+ # rnd_sat = torch.rand(1)[0] * ramp_fn(self.ramp) * self.rnd_sat
94
+ # sat_weight = torch.FloatTensor([.3, .6, .1]).reshape(1, 3, 1, 1).to(device)
95
+ # encoded_image_lum = torch.mean(x * sat_weight, dim=1).unsqueeze_(1)
96
+ # x = (1 - rnd_sat) * x + rnd_sat * encoded_image_lum
97
+ rnd_sat = (torch.rand(1)[0]*2.0 - 1.0)*ramp_fn(self.ramp) * self.rnd_sat + 1.0
98
+ x = ko.enhance.adjust.adjust_saturation(x, rnd_sat)
99
+
100
+ # jpeg
101
+ x = x.reshape(sh)
102
+ if self.do_jpeg:
103
+ jpeg_quality = 100. - torch.rand(1)[0] * ramp_fn(self.ramp) * (100. - self.jpeg_quality)
104
+ x = utils.jpeg_compress_decompress(x, rounding=utils.round_only_at_0, quality=jpeg_quality)
105
+
106
+ x = x * 2 - 1 # [0, 1] -> [-1, 1]
107
+ return x
108
+
109
+
110
+ class ImagenetCTransform(nn.Module):
111
+ def __init__(self, max_severity=5) -> None:
112
+ super().__init__()
113
+ self.max_severity = max_severity
114
+ self.tform = RandomImagenetC(max_severity=max_severity, phase='train')
115
+
116
+ def forward(self, x):
117
+ # x: [batch_size, 3, H, W] in range [-1, 1]
118
+ img0 = x.detach().cpu().numpy()
119
+ img = img0 * 127.5 + 127.5 # [-1, 1] -> [0, 255]
120
+ img = img.transpose(0, 2, 3, 1).astype(np.uint8)
121
+ img = [Image.fromarray(i) for i in img]
122
+ img = [self.tform(i) for i in img]
123
+ img = np.array([np.array(i) for i in img], dtype=np.float32)
124
+ img = img.transpose(0, 3, 1, 2) / 127.5 - 1. # [0, 255] -> [-1, 1]
125
+ residual = torch.from_numpy(img - img0).to(x.device)
126
+ x = x + residual
127
+ return x
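A minimal usage sketch for this noise layer (not part of the commit; assumes the repo's cldm package and kornia are installed, and the step values and tensor sizes are illustrative). Note that ImagenetCTransform applies the non-differentiable PIL corruptions off-graph and adds them back as a residual, so gradients still flow through the stego image:

import torch
from cldm.transformations import TransformNet

net = TransformNet(do_jpeg=True, rnd_flip=True, crop_mode='resized_crop', imagenetc_level=0)
net.activate(global_step=1000)                # record step0; distortion strength ramps up from here
stego = torch.rand(2, 3, 256, 256) * 2 - 1    # dummy batch in [-1, 1]
noised = net(stego, global_step=1500, p=0.9)  # (2, 3, 224, 224), still in [-1, 1]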
cldm/transformations2.py ADDED
@@ -0,0 +1,414 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ use kornia and albumentations for transformations
5
+ @author: Tu Bui @University of Surrey
6
+ """
7
+ import os
8
+ from . import utils
9
+ import torch
10
+ import numpy as np
11
+ from torch import nn
12
+ import torch.nn.functional as thf
13
+ from PIL import Image
14
+ import kornia as ko
15
+ import albumentations as ab
16
+ from torchvision import transforms
17
+
18
+
19
+ class IdentityAugment(nn.Module):
20
+ def __init__(self):
21
+ super().__init__()
22
+
23
+ def forward(self, x, **kwargs):
24
+ return x
25
+
26
+
27
+ class RandomCompress(nn.Module):
28
+ def __init__(self, severity='medium', p=0.5):
29
+ super().__init__()
30
+ self.p = p
31
+ if severity == 'low':
32
+ self.jpeg_quality = 70
33
+ elif severity == 'medium':
34
+ self.jpeg_quality = 50
35
+ elif severity == 'high':
36
+ self.jpeg_quality = 40
37
+
38
+ def forward(self, x, ramp=1.):
39
+ # x (B, C, H, W) in range [0, 1]
40
+ # ramp: adjust the ramping of the compression, 1.0 means min quality = self.jpeg_quality
41
+ if torch.rand(1)[0] >= self.p:
42
+ return x
43
+ jpeg_quality = 100. - torch.rand(1)[0] * ramp * (100. - self.jpeg_quality)
44
+ x = utils.jpeg_compress_decompress(x, rounding=utils.round_only_at_0, quality=jpeg_quality)
45
+ return x
46
+
47
+
48
+ class RandomBoxBlur(nn.Module):
49
+ def __init__(self, severity='medium', border_type='reflect', normalized=True, p=0.5):
50
+ super().__init__()
51
+ self.p = p
52
+ if severity == 'low':
53
+ kernel_size = 3
54
+ elif severity == 'medium':
55
+ kernel_size = 5
56
+ elif severity == 'high':
57
+ kernel_size = 7
58
+
59
+ self.tform = ko.augmentation.RandomBoxBlur(kernel_size=(kernel_size, kernel_size), border_type=border_type, normalized=normalized, p=self.p)
60
+
61
+ def forward(self, x, **kwargs):
62
+ return self.tform(x)
63
+
64
+ class RandomMedianBlur(nn.Module):
65
+ def __init__(self, severity='medium', p=0.5):
66
+ super().__init__()
67
+ self.p = p
68
+ self.tform = ko.augmentation.RandomMedianBlur(kernel_size=(3,3), p=p)
69
+
70
+ def forward(self, x, **kwargs):
71
+ return self.tform(x)
72
+
73
+
74
+ class RandomBrightness(nn.Module):
75
+ def __init__(self, severity='medium', p=0.5):
76
+ super().__init__()
77
+ self.p = p
78
+ if severity == 'low':
79
+ brightness = (0.9, 1.1)
80
+ elif severity == 'medium':
81
+ brightness = (0.75, 1.25)
82
+ elif severity == 'high':
83
+ brightness = (0.5, 1.5)
84
+ self.tform = ko.augmentation.RandomBrightness(brightness=brightness, p=p)
85
+
86
+ def forward(self, x, **kwargs):
87
+ return self.tform(x)
88
+
89
+
90
+ class RandomContrast(nn.Module):
91
+ def __init__(self, severity='medium', p=0.5):
92
+ super().__init__()
93
+ self.p = p
94
+ if severity == 'low':
95
+ contrast = (0.9, 1.1)
96
+ elif severity == 'medium':
97
+ contrast = (0.75, 1.25)
98
+ elif severity == 'high':
99
+ contrast = (0.5, 1.5)
100
+ self.tform = ko.augmentation.RandomContrast(contrast=contrast, p=p)
101
+
102
+ def forward(self, x, **kwargs):
103
+ return self.tform(x)
104
+
105
+
106
+ class RandomSaturation(nn.Module):
107
+ def __init__(self, severity='medium', p=0.5):
108
+ super().__init__()
109
+ self.p = p
110
+ if severity == 'low':
111
+ sat = (0.9, 1.1)
112
+ elif severity == 'medium':
113
+ sat = (0.75, 1.25)
114
+ elif severity == 'high':
115
+ sat = (0.5, 1.5)
116
+ self.tform = ko.augmentation.RandomSaturation(saturation=sat, p=p)
117
+
118
+ def forward(self, x, **kwargs):
119
+ return self.tform(x)
120
+
121
+ class RandomSharpness(nn.Module):
122
+ def __init__(self, severity='medium', p=0.5):
123
+ super().__init__()
124
+ self.p = p
125
+ if severity == 'low':
126
+ sharpness = 0.5
127
+ elif severity == 'medium':
128
+ sharpness = 1.0
129
+ elif severity == 'high':
130
+ sharpness = 2.5
131
+ self.tform = ko.augmentation.RandomSharpness(sharpness=sharpness, p=p)
132
+
133
+ def forward(self, x, **kwargs):
134
+ return self.tform(x)
135
+
136
+ class RandomColorJiggle(nn.Module):
137
+ def __init__(self, severity='medium', p=0.5):
138
+ super().__init__()
139
+ self.p = p
140
+ if severity == 'low':
141
+ factor = (0.05, 0.05, 0.05, 0.01)
142
+ elif severity == 'medium':
143
+ factor = (0.1, 0.1, 0.1, 0.02)
144
+ elif severity == 'high':
145
+ factor = (0.1, 0.1, 0.1, 0.05)
146
+ self.tform = ko.augmentation.ColorJiggle(*factor, p=p)
147
+
148
+ def forward(self, x, **kwargs):
149
+ return self.tform(x)
150
+
151
+ class RandomHue(nn.Module):
152
+ def __init__(self, severity='medium', p=0.5):
153
+ super().__init__()
154
+ self.p = p
155
+ if severity == 'low':
156
+ hue = 0.01
157
+ elif severity == 'medium':
158
+ hue = 0.02
159
+ elif severity == 'high':
160
+ hue = 0.05
161
+ self.tform = ko.augmentation.RandomHue(hue=(-hue, hue), p=p)
162
+
163
+ def forward(self, x, **kwargs):
164
+ return self.tform(x)
165
+
166
+ class RandomGamma(nn.Module):
167
+ def __init__(self, severity='medium', p=0.5):
168
+ super().__init__()
169
+ self.p = p
170
+ if severity == 'low':
171
+ gamma, gain = (0.9, 1.1), (0.9,1.1)
172
+ elif severity == 'medium':
173
+ gamma, gain = (0.75, 1.25), (0.75,1.25)
174
+ elif severity == 'high':
175
+ gamma, gain = (0.5, 1.5), (0.5,1.5)
176
+ self.tform = ko.augmentation.RandomGamma(gamma, gain, p=p)
177
+
178
+ def forward(self, x, **kwargs):
179
+ return self.tform(x)
180
+
181
+ class RandomGaussianBlur(nn.Module):
182
+ def __init__(self, severity='medium', p=0.5):
183
+ super().__init__()
184
+ self.p = p
185
+ if severity == 'low':
186
+ kernel_size, sigma = 3, (0.1, 1.0)
187
+ elif severity == 'medium':
188
+ kernel_size, sigma = 5, (0.1, 1.5)
189
+ elif severity == 'high':
190
+ kernel_size, sigma = 7, (0.1, 2.0)
191
+ self.tform = ko.augmentation.RandomGaussianBlur(kernel_size=(kernel_size, kernel_size), sigma=sigma, p=self.p)
192
+
193
+ def forward(self, x, **kwargs):
194
+ return self.tform(x)
195
+
196
+ class RandomGaussianNoise(nn.Module):
197
+ def __init__(self, severity='medium', p=0.5):
198
+ super().__init__()
199
+ self.p = p
200
+ if severity == 'low':
201
+ std = 0.02
202
+ elif severity == 'medium':
203
+ std = 0.04
204
+ elif severity == 'high':
205
+ std = 0.08
206
+ self.tform = ko.augmentation.RandomGaussianNoise(mean=0., std=std, p=p)
207
+
208
+ def forward(self, x, **kwargs):
209
+ return self.tform(x)
210
+
211
+ class RandomMotionBlur(nn.Module):
212
+ def __init__(self, severity='medium', p=0.5):
213
+ super().__init__()
214
+ self.p = p
215
+ if severity == 'low':
216
+ kernel_size, angle, direction = (3, 5), (-25, 25), (-0.25, 0.25)
217
+ elif severity == 'medium':
218
+ kernel_size, angle, direction = (3, 7), (-45, 45), (-0.5, 0.5)
219
+ elif severity == 'high':
220
+ kernel_size, angle, direction = (3, 9), (-90, 90), (-1.0, 1.0)
221
+ self.tform = ko.augmentation.RandomMotionBlur(kernel_size, angle, direction, p=p)
222
+
223
+ def forward(self, x, **kwargs):
224
+ return self.tform(x)
225
+
226
+ class RandomPosterize(nn.Module):
227
+ def __init__(self, severity='medium', p=0.5):
228
+ super().__init__()
229
+ self.p = p
230
+ if severity == 'low':
231
+ bits = 5
232
+ elif severity == 'medium':
233
+ bits = 4
234
+ elif severity == 'high':
235
+ bits = 3
236
+ self.tform = ko.augmentation.RandomPosterize(bits=bits, p=p)
237
+
238
+ def forward(self, x, **kwargs):
239
+ return self.tform(x)
240
+
241
+ class RandomRGBShift(nn.Module):
242
+ def __init__(self, severity='medium', p=0.5):
243
+ super().__init__()
244
+ self.p = p
245
+ if severity == 'low':
246
+ rgb = 0.02
247
+ elif severity == 'medium':
248
+ rgb = 0.05
249
+ elif severity == 'high':
250
+ rgb = 0.1
251
+ self.tform = ko.augmentation.RandomRGBShift(r_shift_limit=rgb, g_shift_limit=rgb, b_shift_limit=rgb, p=p)
252
+
253
+ def forward(self, x, **kwargs):
254
+ return self.tform(x)
255
+
256
+
257
+
258
+ class TransformNet(nn.Module):
259
+ def __init__(self, flip=True, crop_mode='random_crop', compress=True, brightness=True, contrast=True, color_jiggle=True, gamma=False, grayscale=True, gaussian_blur=True, gaussian_noise=True, hue=True, motion_blur=True, posterize=True, rgb_shift=True, saturation=True, sharpness=True, median_blur=True, box_blur=True, severity='medium', n_optional=2, ramp=1000, p=0.5):
260
+ super().__init__()
261
+ self.n_optional = n_optional
262
+ self.p = p
263
+ p_flip = 0.5 if flip else 0
264
+ rnd_flip_layer = ko.augmentation.RandomHorizontalFlip(p_flip)
265
+ self.ramp = ramp
266
+ self.register_buffer('step0', torch.tensor(0))
267
+
268
+ self.crop_mode = crop_mode
269
+ assert crop_mode in ['random_crop', 'resized_crop']
270
+ if crop_mode == 'random_crop':
271
+ rnd_crop_layer = ko.augmentation.RandomCrop((224,224), cropping_mode="resample")
272
+ elif crop_mode == 'resized_crop':
273
+ rnd_crop_layer = ko.augmentation.RandomResizedCrop(size=(224,224), scale=(0.7, 1.0), ratio=(3.0/4, 4.0/3), cropping_mode='resample')
274
+
275
+ self.fixed_transforms = [rnd_flip_layer, rnd_crop_layer]
276
+ if compress:
277
+ self.register(RandomCompress(severity, p=p), 'Random Compress')
278
+ if brightness:
279
+ self.register(RandomBrightness(severity, p=p), 'Random Brightness')
280
+ if contrast:
281
+ self.register(RandomContrast(severity, p=p), 'Random Contrast')
282
+ if color_jiggle:
283
+ self.register(RandomColorJiggle(severity, p=p), 'Random Color')
284
+ if gamma:
285
+ self.register(RandomGamma(severity, p=p), 'Random Gamma')
286
+ if grayscale:
287
+ self.register(ko.augmentation.RandomGrayscale(p=p), 'Grayscale')
288
+ if gaussian_blur:
289
+ self.register(RandomGaussianBlur(severity, p=p), 'Random Gaussian Blur')
290
+ if gaussian_noise:
291
+ self.register(RandomGaussianNoise(severity, p=p), 'Random Gaussian Noise')
292
+ if hue:
293
+ self.register(RandomHue(severity, p=p), 'Random Hue')
294
+ if motion_blur:
295
+ self.register(RandomMotionBlur(severity, p=p), 'Random Motion Blur')
296
+ if posterize:
297
+ self.register(RandomPosterize(severity, p=p), 'Random Posterize')
298
+ if rgb_shift:
299
+ self.register(RandomRGBShift(severity, p=p), 'Random RGB Shift')
300
+ if saturation:
301
+ self.register(RandomSaturation(severity, p=p), 'Random Saturation')
302
+ if sharpness:
303
+ self.register(RandomSharpness(severity, p=p), 'Random Sharpness')
304
+ if median_blur:
305
+ self.register(RandomMedianBlur(severity, p=p), 'Random Median Blur')
306
+ if box_blur:
307
+ self.register(RandomBoxBlur(severity, p=p), 'Random Box Blur')
308
+
309
+ def register(self, tform, name):
310
+ # register a new (optional) transform
311
+ if not hasattr(self, 'optional_transforms'):
312
+ self.optional_transforms = []
313
+ self.optional_names = []
314
+ self.optional_transforms.append(tform)
315
+ self.optional_names.append(name)
316
+
317
+ def activate(self, global_step):
318
+ if self.step0 == 0:
319
+ print(f'[TRAINING] Activating TransformNet at step {global_step}')
320
+ self.step0 = torch.tensor(global_step)
321
+
322
+ def is_activated(self):
323
+ return self.step0 > 0
324
+
325
+ def forward(self, x, global_step, p=0.9):
326
+ # x: [batch_size, 3, H, W] in range [-1, 1]
327
+ x = x * 0.5 + 0.5 # [-1, 1] -> [0, 1]
328
+ # fixed transforms
329
+ for tform in self.fixed_transforms:
330
+ x = tform(x)
331
+ if isinstance(x, tuple):
332
+ x = x[0]
333
+
334
+ # optional transforms
335
+ ramp = np.min([(global_step-self.step0.cpu().item()) / self.ramp, 1.])
336
+ if len(self.optional_transforms) > 0:
337
+ tform_ids = torch.randint(len(self.optional_transforms), (self.n_optional,)).numpy()
338
+ for tform_id in tform_ids:
339
+ tform = self.optional_transforms[tform_id]
340
+ x = tform(x, ramp=ramp)
341
+ if isinstance(x, tuple):
342
+ x = x[0]
343
+
344
+ return x * 2 - 1 # [0, 1] -> [-1, 1]
345
+
346
+ def transform_by_id(self, x, tform_id):
347
+ # x: [batch_size, 3, H, W] in range [-1, 1]
348
+ x = x * 0.5 + 0.5 # [-1, 1] -> [0, 1]
349
+ # fixed transforms
350
+ for tform in self.fixed_transforms:
351
+ x = tform(x)
352
+ if isinstance(x, tuple):
353
+ x = x[0]
354
+
355
+ # optional transforms
356
+ tform = self.optional_transforms[tform_id]
357
+ x = tform(x)
358
+ if isinstance(x, tuple):
359
+ x = x[0]
360
+ return x * 2 - 1 # [0, 1] -> [-1, 1]
361
+
362
+ def transform_by_name(self, x, tform_name):
363
+ assert tform_name in self.optional_names
364
+ tform_id = self.optional_names.index(tform_name)
365
+ return self.transform_by_id(x, tform_id)
366
+
367
+ def apply_transform_on_pil_image(self, x, tform_name):
368
+ # x: PIL image
369
+ # return: PIL image
370
+ assert tform_name in self.optional_names + ['Random Crop', 'Random Flip']
371
+ # if tform_name == 'Random Crop': # the only transform dependent on image size
372
+ # # crop equivalent to 224/256
373
+ # w, h = x.size
374
+ # new_w, new_h = int(224 / 256 * w), int(224 / 256 * h)
375
+ # x = transforms.RandomCrop((new_h, new_w))(x)
376
+ # return x
377
+
378
+ # x = np.array(x).astype(np.float32) / 255. # [0, 255] -> [0, 1]
379
+ # x = torch.from_numpy(x).permute(2, 0, 1).unsqueeze(0) # [1, 3, H, W]
380
+ # if tform_name == 'Random Flip':
381
+ # x = self.fixed_transforms[0](x)
382
+ # else:
383
+ # tform_id = self.optional_names.index(tform_name)
384
+ # tform = self.optional_transforms[tform_id]
385
+ # x = tform(x)
386
+ # if isinstance(x, tuple):
387
+ # x = x[0]
388
+ # x = x.detach().squeeze(0).permute(1, 2, 0).numpy() * 255 # [0, 1] -> [0, 255]
389
+ # return Image.fromarray(x.astype(np.uint8))
390
+
391
+ w, h = x.size
392
+ x = x.resize((256, 256), Image.BILINEAR)
393
+ x = np.array(x).astype(np.float32) / 255. # [0, 255] -> [0, 1]
394
+ x = torch.from_numpy(x).permute(2, 0, 1).unsqueeze(0) # [1, 3, H, W]
395
+ if tform_name == 'Random Flip':
396
+ x = self.fixed_transforms[0](x)
397
+ elif tform_name == 'Random Crop':
398
+ x = self.fixed_transforms[1](x)
399
+ else:
400
+ tform_id = self.optional_names.index(tform_name)
401
+ tform = self.optional_transforms[tform_id]
402
+ x = tform(x)
403
+ if isinstance(x, tuple):
404
+ x = x[0]
405
+ x = x.detach().squeeze(0).permute(1, 2, 0).numpy() * 255 # [0, 1] -> [0, 255]
406
+ x = Image.fromarray(x.astype(np.uint8))
407
+ if (tform_name == 'Random Crop') and (self.crop_mode == 'random_crop'):
408
+ w, h = int(224 / 256 * w), int(224 / 256 * h)
409
+ x = x.resize((w, h), Image.BILINEAR)
410
+ return x
411
+
412
+
413
+ if __name__ == '__main__':
414
+ pass
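A minimal sketch of the registry-based TransformNet in this file (not part of the commit; assumes cldm.transformations2, kornia and albumentations are importable, with illustrative sizes and steps). Each optional transform is registered under a human-readable name, so individual distortions can also be applied by name, e.g. for evaluation:

import torch
from cldm.transformations2 import TransformNet

tnet = TransformNet(severity='medium', n_optional=2, ramp=1000, p=0.5)
print(tnet.optional_names)                    # registered distortions, e.g. 'Random Compress', ...
tnet.activate(global_step=100)
x = torch.rand(4, 3, 256, 256) * 2 - 1        # dummy stego batch in [-1, 1]
y = tnet(x, global_step=600)                  # flip + crop to 224, then 2 random optional transforms
y_blur = tnet.transform_by_name(x, 'Random Gaussian Blur')  # a single named distortion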
cldm/utils.py ADDED
@@ -0,0 +1,539 @@
1
+ import cv2
2
+ import itertools
3
+ import numpy as np
4
+ import random
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import torch.nn as nn
8
+
9
+ from PIL import Image, ImageOps
10
+ import matplotlib.pyplot as plt
11
+
12
+ def random_blur_kernel(probs, N_blur, sigrange_gauss, sigrange_line, wmin_line):
13
+ N = N_blur
14
+ coords = torch.from_numpy(np.stack(np.meshgrid(range(N_blur), range(N_blur), indexing='ij'), axis=-1)) - (0.5 * (N-1)) # (7,7,2)
15
+ manhat = torch.sum(torch.abs(coords), dim=-1) # (7, 7)
16
+
17
+ # nothing, default
18
+ vals_nothing = (manhat < 0.5).float() # (7, 7)
19
+
20
+ # gauss
21
+ sig_gauss = torch.rand(1)[0] * (sigrange_gauss[1] - sigrange_gauss[0]) + sigrange_gauss[0]
22
+ vals_gauss = torch.exp(-torch.sum(coords ** 2, dim=-1) /2. / sig_gauss ** 2)
23
+
24
+ # line
25
+ theta = torch.rand(1)[0] * 2.* np.pi
26
+ v = torch.FloatTensor([torch.cos(theta), torch.sin(theta)]) # (2)
27
+ dists = torch.sum(coords * v, dim=-1) # (7, 7)
28
+
29
+ sig_line = torch.rand(1)[0] * (sigrange_line[1] - sigrange_line[0]) + sigrange_line[0]
30
+ w_line = torch.rand(1)[0] * (0.5 * (N-1) + 0.1 - wmin_line) + wmin_line
31
+
32
+ vals_line = torch.exp(-dists ** 2 / 2. / sig_line ** 2) * (manhat < w_line) # (7, 7)
33
+
34
+ t = torch.rand(1)[0]
35
+ vals = vals_nothing
36
+ if t < (probs[0] + probs[1]):
37
+ vals = vals_line
38
+ else:
39
+ vals = vals
40
+ if t < probs[0]:
41
+ vals = vals_gauss
42
+ else:
43
+ vals = vals
44
+
45
+ v = vals / torch.sum(vals) # normalize so the kernel sums to 1, (7, 7)
46
+ z = torch.zeros_like(v)
47
+ f = torch.stack([v,z,z, z,v,z, z,z,v], dim=0).reshape([3, 3, N, N])
48
+ return f
49
+
50
+
51
+ def get_rand_transform_matrix(image_size, d, batch_size):
52
+ Ms = np.zeros((batch_size, 2, 3, 3))
53
+ for i in range(batch_size):
54
+ tl_x = random.uniform(-d, d) # Top left corner, top
55
+ tl_y = random.uniform(-d, d) # Top left corner, left
56
+ bl_x = random.uniform(-d, d) # Bot left corner, bot
57
+ bl_y = random.uniform(-d, d) # Bot left corner, left
58
+ tr_x = random.uniform(-d, d) # Top right corner, top
59
+ tr_y = random.uniform(-d, d) # Top right corner, right
60
+ br_x = random.uniform(-d, d) # Bot right corner, bot
61
+ br_y = random.uniform(-d, d) # Bot right corner, right
62
+
63
+ rect = np.array([
64
+ [tl_x, tl_y],
65
+ [tr_x + image_size, tr_y],
66
+ [br_x + image_size, br_y + image_size],
67
+ [bl_x, bl_y + image_size]], dtype = "float32")
68
+
69
+ dst = np.array([
70
+ [0, 0],
71
+ [image_size, 0],
72
+ [image_size, image_size],
73
+ [0, image_size]], dtype = "float32")
74
+
75
+ M = cv2.getPerspectiveTransform(rect, dst)
76
+ M_inv = np.linalg.inv(M)
77
+ Ms[i, 0, :, :] = M_inv
78
+ Ms[i, 1, :, :] = M
79
+ Ms = torch.from_numpy(Ms).float()
80
+
81
+ return Ms
82
+
83
+
84
+ def get_rnd_brightness_torch(rnd_bri, rnd_hue, batch_size):
85
+ rnd_hue = torch.FloatTensor(batch_size, 3, 1, 1).uniform_(-rnd_hue, rnd_hue)
86
+ rnd_brightness = torch.FloatTensor(batch_size, 1, 1, 1).uniform_(-rnd_bri, rnd_bri)
87
+ return rnd_hue + rnd_brightness
88
+
89
+
90
+ # reference: https://github.com/mlomnitz/DiffJPEG.git
91
+ y_table = np.array(
92
+ [[16, 11, 10, 16, 24, 40, 51, 61], [12, 12, 14, 19, 26, 58, 60,
93
+ 55], [14, 13, 16, 24, 40, 57, 69, 56],
94
+ [14, 17, 22, 29, 51, 87, 80, 62], [18, 22, 37, 56, 68, 109, 103,
95
+ 77], [24, 35, 55, 64, 81, 104, 113, 92],
96
+ [49, 64, 78, 87, 103, 121, 120, 101], [72, 92, 95, 98, 112, 100, 103, 99]],
97
+ dtype=np.float32).T
98
+
99
+ y_table = nn.Parameter(torch.from_numpy(y_table))
100
+ c_table = np.empty((8, 8), dtype=np.float32)
101
+ c_table.fill(99)
102
+ c_table[:4, :4] = np.array([[17, 18, 24, 47], [18, 21, 26, 66],
103
+ [24, 26, 56, 99], [47, 66, 99, 99]]).T
104
+ c_table = nn.Parameter(torch.from_numpy(c_table))
105
+
106
+ # 1. RGB -> YCbCr
107
+ class rgb_to_ycbcr_jpeg(nn.Module):
108
+ """ Converts RGB image to YCbCr
109
+ Input:
110
+ image(tensor): batch x 3 x height x width
111
+ Outpput:
112
+ result(tensor): batch x height x width x 3
113
+ """
114
+ def __init__(self):
115
+ super(rgb_to_ycbcr_jpeg, self).__init__()
116
+ matrix = np.array(
117
+ [[0.299, 0.587, 0.114], [-0.168736, -0.331264, 0.5],
118
+ [0.5, -0.418688, -0.081312]], dtype=np.float32).T
119
+ self.shift = nn.Parameter(torch.tensor([0., 128., 128.]))
120
+ self.matrix = nn.Parameter(torch.from_numpy(matrix))
121
+
122
+ def forward(self, image):
123
+ image = image.permute(0, 2, 3, 1)
124
+ result = torch.tensordot(image, self.matrix, dims=1) + self.shift
125
+ result.view(image.shape)
126
+ return result
127
+
128
+ # 2. Chroma subsampling
129
+ class chroma_subsampling(nn.Module):
130
+ """ Chroma subsampling on CbCv channels
131
+ Input:
132
+ image(tensor): batch x height x width x 3
133
+ Output:
134
+ y(tensor): batch x height x width
135
+ cb(tensor): batch x height/2 x width/2
136
+ cr(tensor): batch x height/2 x width/2
137
+ """
138
+ def __init__(self):
139
+ super(chroma_subsampling, self).__init__()
140
+
141
+ def forward(self, image):
142
+ image_2 = image.permute(0, 3, 1, 2).clone()
143
+ avg_pool = nn.AvgPool2d(kernel_size=2, stride=(2, 2),
144
+ count_include_pad=False)
145
+ cb = avg_pool(image_2[:, 1, :, :].unsqueeze(1))
146
+ cr = avg_pool(image_2[:, 2, :, :].unsqueeze(1))
147
+ cb = cb.permute(0, 2, 3, 1)
148
+ cr = cr.permute(0, 2, 3, 1)
149
+ return image[:, :, :, 0], cb.squeeze(3), cr.squeeze(3)
150
+
151
+ # 3. Block splitting
152
+ class block_splitting(nn.Module):
153
+ """ Splitting image into patches
154
+ Input:
155
+ image(tensor): batch x height x width
156
+ Output:
157
+ patch(tensor): batch x h*w/64 x h x w
158
+ """
159
+ def __init__(self):
160
+ super(block_splitting, self).__init__()
161
+ self.k = 8
162
+
163
+ def forward(self, image):
164
+ height, width = image.shape[1:3]
165
+ batch_size = image.shape[0]
166
+ image_reshaped = image.view(batch_size, height // self.k, self.k, -1, self.k)
167
+ image_transposed = image_reshaped.permute(0, 1, 3, 2, 4)
168
+ return image_transposed.contiguous().view(batch_size, -1, self.k, self.k)
169
+
170
+ # 4. DCT
171
+ class dct_8x8(nn.Module):
172
+ """ Discrete Cosine Transformation
173
+ Input:
174
+ image(tensor): batch x height x width
175
+ Output:
176
+ dcp(tensor): batch x height x width
177
+ """
178
+ def __init__(self):
179
+ super(dct_8x8, self).__init__()
180
+ tensor = np.zeros((8, 8, 8, 8), dtype=np.float32)
181
+ for x, y, u, v in itertools.product(range(8), repeat=4):
182
+ tensor[x, y, u, v] = np.cos((2 * x + 1) * u * np.pi / 16) * np.cos(
183
+ (2 * y + 1) * v * np.pi / 16)
184
+ alpha = np.array([1. / np.sqrt(2)] + [1] * 7)
185
+ #
186
+ self.tensor = nn.Parameter(torch.from_numpy(tensor).float())
187
+ self.scale = nn.Parameter(torch.from_numpy(np.outer(alpha, alpha) * 0.25).float() )
188
+
189
+ def forward(self, image):
190
+ image = image - 128
191
+ result = self.scale * torch.tensordot(image, self.tensor, dims=2)
192
+ result.view(image.shape)
193
+ return result
194
+
195
+ # 5. Quantization
196
+ class y_quantize(nn.Module):
197
+ """ JPEG Quantization for Y channel
198
+ Input:
199
+ image(tensor): batch x height x width
200
+ rounding(function): rounding function to use
201
+ factor(float): Degree of compression
202
+ Output:
203
+ image(tensor): batch x height x width
204
+ """
205
+ def __init__(self, rounding, factor=1):
206
+ super(y_quantize, self).__init__()
207
+ self.rounding = rounding
208
+ self.factor = factor
209
+ self.y_table = y_table
210
+
211
+ def forward(self, image):
212
+ image = image.float() / (self.y_table * self.factor)
213
+ image = self.rounding(image)
214
+ return image
215
+
216
+
217
+ class c_quantize(nn.Module):
218
+ """ JPEG Quantization for CrCb channels
219
+ Input:
220
+ image(tensor): batch x height x width
221
+ rounding(function): rounding function to use
222
+ factor(float): Degree of compression
223
+ Output:
224
+ image(tensor): batch x height x width
225
+ """
226
+ def __init__(self, rounding, factor=1):
227
+ super(c_quantize, self).__init__()
228
+ self.rounding = rounding
229
+ self.factor = factor
230
+ self.c_table = c_table
231
+
232
+ def forward(self, image):
233
+ image = image.float() / (self.c_table * self.factor)
234
+ image = self.rounding(image)
235
+ return image
236
+
237
+
238
+ class compress_jpeg(nn.Module):
239
+ """ Full JPEG compression algortihm
240
+ Input:
241
+ imgs(tensor): batch x 3 x height x width
242
+ rounding(function): rounding function to use
243
+ factor(float): Compression factor
244
+ Ouput:
245
+ compressed(dict(tensor)): batch x h*w/64 x 8 x 8
246
+ """
247
+ def __init__(self, rounding=torch.round, factor=1):
248
+ super(compress_jpeg, self).__init__()
249
+ self.l1 = nn.Sequential(
250
+ rgb_to_ycbcr_jpeg(),
251
+ chroma_subsampling()
252
+ )
253
+ self.l2 = nn.Sequential(
254
+ block_splitting(),
255
+ dct_8x8()
256
+ )
257
+ self.c_quantize = c_quantize(rounding=rounding, factor=factor)
258
+ self.y_quantize = y_quantize(rounding=rounding, factor=factor)
259
+
260
+ def forward(self, image):
261
+ y, cb, cr = self.l1(image*255)
262
+ components = {'y': y, 'cb': cb, 'cr': cr}
263
+ for k in components.keys():
264
+ comp = self.l2(components[k])
265
+ if k in ('cb', 'cr'):
266
+ comp = self.c_quantize(comp)
267
+ else:
268
+ comp = self.y_quantize(comp)
269
+
270
+ components[k] = comp
271
+
272
+ return components['y'], components['cb'], components['cr']
273
+
274
+ # -5. Dequantization
275
+ class y_dequantize(nn.Module):
276
+ """ Dequantize Y channel
277
+ Inputs:
278
+ image(tensor): batch x height x width
279
+ factor(float): compression factor
280
+ Outputs:
281
+ image(tensor): batch x height x width
282
+ """
283
+ def __init__(self, factor=1):
284
+ super(y_dequantize, self).__init__()
285
+ self.y_table = y_table
286
+ self.factor = factor
287
+
288
+ def forward(self, image):
289
+ return image * (self.y_table * self.factor)
290
+
291
+
292
+ class c_dequantize(nn.Module):
293
+ """ Dequantize CbCr channel
294
+ Inputs:
295
+ image(tensor): batch x height x width
296
+ factor(float): compression factor
297
+ Outputs:
298
+ image(tensor): batch x height x width
299
+ """
300
+ def __init__(self, factor=1):
301
+ super(c_dequantize, self).__init__()
302
+ self.factor = factor
303
+ self.c_table = c_table
304
+
305
+ def forward(self, image):
306
+ return image * (self.c_table * self.factor)
307
+
308
+ # -4. Inverse DCT
309
+ class idct_8x8(nn.Module):
310
+ """ Inverse discrete Cosine Transformation
311
+ Input:
312
+ dcp(tensor): batch x height x width
313
+ Output:
314
+ image(tensor): batch x height x width
315
+ """
316
+ def __init__(self):
317
+ super(idct_8x8, self).__init__()
318
+ alpha = np.array([1. / np.sqrt(2)] + [1] * 7)
319
+ self.alpha = nn.Parameter(torch.from_numpy(np.outer(alpha, alpha)).float())
320
+ tensor = np.zeros((8, 8, 8, 8), dtype=np.float32)
321
+ for x, y, u, v in itertools.product(range(8), repeat=4):
322
+ tensor[x, y, u, v] = np.cos((2 * u + 1) * x * np.pi / 16) * np.cos(
323
+ (2 * v + 1) * y * np.pi / 16)
324
+ self.tensor = nn.Parameter(torch.from_numpy(tensor).float())
325
+
326
+ def forward(self, image):
327
+ image = image * self.alpha
328
+ result = 0.25 * torch.tensordot(image, self.tensor, dims=2) + 128
329
+ result.view(image.shape)
330
+ return result
331
+
332
+ # -3. Block joining
333
+ class block_merging(nn.Module):
334
+ """ Merge pathces into image
335
+ Inputs:
336
+ patches(tensor) batch x height*width/64, height x width
337
+ height(int)
338
+ width(int)
339
+ Output:
340
+ image(tensor): batch x height x width
341
+ """
342
+ def __init__(self):
343
+ super(block_merging, self).__init__()
344
+
345
+ def forward(self, patches, height, width):
346
+ k = 8
347
+ batch_size = patches.shape[0]
348
+ image_reshaped = patches.view(batch_size, height//k, width//k, k, k)
349
+ image_transposed = image_reshaped.permute(0, 1, 3, 2, 4)
350
+ return image_transposed.contiguous().view(batch_size, height, width)
351
+
352
+ # -2. Chroma upsampling
353
+ class chroma_upsampling(nn.Module):
354
+ """ Upsample chroma layers
355
+ Input:
356
+ y(tensor): y channel image
357
+ cb(tensor): cb channel
358
+ cr(tensor): cr channel
359
+ Ouput:
360
+ image(tensor): batch x height x width x 3
361
+ """
362
+ def __init__(self):
363
+ super(chroma_upsampling, self).__init__()
364
+
365
+ def forward(self, y, cb, cr):
366
+ def repeat(x, k=2):
367
+ height, width = x.shape[1:3]
368
+ x = x.unsqueeze(-1)
369
+ x = x.repeat(1, 1, k, k)
370
+ x = x.view(-1, height * k, width * k)
371
+ return x
372
+
373
+ cb = repeat(cb)
374
+ cr = repeat(cr)
375
+
376
+ return torch.cat([y.unsqueeze(3), cb.unsqueeze(3), cr.unsqueeze(3)], dim=3)
377
+
378
+ # -1: YCbCr -> RGB
379
+ class ycbcr_to_rgb_jpeg(nn.Module):
380
+ """ Converts YCbCr image to RGB JPEG
381
+ Input:
382
+ image(tensor): batch x height x width x 3
383
+ Outpput:
384
+ result(tensor): batch x 3 x height x width
385
+ """
386
+ def __init__(self):
387
+ super(ycbcr_to_rgb_jpeg, self).__init__()
388
+
389
+ matrix = np.array(
390
+ [[1., 0., 1.402], [1, -0.344136, -0.714136], [1, 1.772, 0]],
391
+ dtype=np.float32).T
392
+ self.shift = nn.Parameter(torch.tensor([0, -128., -128.]))
393
+ self.matrix = nn.Parameter(torch.from_numpy(matrix))
394
+
395
+ def forward(self, image):
396
+ result = torch.tensordot(image + self.shift, self.matrix, dims=1)
397
+ result.view(image.shape)
398
+ return result.permute(0, 3, 1, 2)
399
+
400
+
401
+ class decompress_jpeg(nn.Module):
402
+ """ Full JPEG decompression algortihm
403
+ Input:
404
+ compressed(dict(tensor)): batch x h*w/64 x 8 x 8
405
+ rounding(function): rounding function to use
406
+ factor(float): Compression factor
407
+ Ouput:
408
+ image(tensor): batch x 3 x height x width
409
+ """
410
+ def __init__(self, height, width, rounding=torch.round, factor=1):
411
+ super(decompress_jpeg, self).__init__()
412
+ self.c_dequantize = c_dequantize(factor=factor)
413
+ self.y_dequantize = y_dequantize(factor=factor)
414
+ self.idct = idct_8x8()
415
+ self.merging = block_merging()
416
+ self.chroma = chroma_upsampling()
417
+ self.colors = ycbcr_to_rgb_jpeg()
418
+
419
+ self.height, self.width = height, width
420
+
421
+ def forward(self, y, cb, cr):
422
+ components = {'y': y, 'cb': cb, 'cr': cr}
423
+ for k in components.keys():
424
+ if k in ('cb', 'cr'):
425
+ comp = self.c_dequantize(components[k])
426
+ height, width = int(self.height/2), int(self.width/2)
427
+ else:
428
+ comp = self.y_dequantize(components[k])
429
+ height, width = self.height, self.width
430
+ comp = self.idct(comp)
431
+ components[k] = self.merging(comp, height, width)
432
+ #
433
+ image = self.chroma(components['y'], components['cb'], components['cr'])
434
+ image = self.colors(image)
435
+
436
+ image = torch.min(255*torch.ones_like(image),
437
+ torch.max(torch.zeros_like(image), image))
438
+ return image/255
439
+
440
+ def diff_round(x):
441
+ """ Differentiable rounding function
442
+ Input:
443
+ x(tensor)
444
+ Output:
445
+ x(tensor)
446
+ """
447
+ return torch.round(x) + (x - torch.round(x))**3
448
+
449
+ def round_only_at_0(x):
450
+ cond = (torch.abs(x) < 0.5).float()
451
+ return cond * (x ** 3) + (1 - cond) * x
452
+
453
+ def quality_to_factor(quality):
454
+ """ Calculate factor corresponding to quality
455
+ Input:
456
+ quality(float): Quality for jpeg compression
457
+ Output:
458
+ factor(float): Compression factor
459
+ """
460
+ if quality < 50:
461
+ quality = 5000. / quality
462
+ else:
463
+ quality = 200. - quality*2
464
+ return quality / 100.
465
+
466
+ def jpeg_compress_decompress(image,
467
+ # downsample_c=True,
468
+ rounding=round_only_at_0,
469
+ quality=80):
470
+ # image_r = image * 255
471
+ height, width = image.shape[2:4]
472
+ # orig_height, orig_width = height, width
473
+ # if height % 16 != 0 or width % 16 != 0:
474
+ # # Round up to next multiple of 16
475
+ # height = ((height - 1) // 16 + 1) * 16
476
+ # width = ((width - 1) // 16 + 1) * 16
477
+
478
+ # vpad = height - orig_height
479
+ # wpad = width - orig_width
480
+ # top = vpad // 2
481
+ # bottom = vpad - top
482
+ # left = wpad // 2
483
+ # right = wpad - left
484
+ # #image = tf.pad(image, [[0, 0], [top, bottom], [left, right], [0, 0]], 'SYMMETRIC')
485
+ # image = torch.pad(image, [[0, 0], [0, vpad], [0, wpad], [0, 0]], 'reflect')
486
+
487
+ factor = quality_to_factor(quality)
488
+
489
+ compress = compress_jpeg(rounding=rounding, factor=factor).to(image.device)
490
+ decompress = decompress_jpeg(height, width, rounding=rounding, factor=factor).to(image.device)
491
+
492
+ y, cb, cr = compress(image)
493
+ recovered = decompress(y, cb, cr)
494
+
495
+ return recovered.contiguous()
496
+
497
+
498
+ if __name__ == '__main__':
499
+ ''' test JPEG compress and decompress'''
500
+ # img = Image.open('house.jpg')
501
+ # img = np.array(img) / 255.
502
+ # img_r = np.transpose(img, [2, 0, 1])
503
+ # img_tensor = torch.from_numpy(img_r).unsqueeze(0).float()
504
+
505
+ # recover = jpeg_compress_decompress(img_tensor)
506
+
507
+ # recover_arr = recover.detach().squeeze(0).numpy()
508
+ # recover_arr = np.transpose(recover_arr, [1, 2, 0])
509
+
510
+ # plt.subplot(121)
511
+ # plt.imshow(img)
512
+ # plt.subplot(122)
513
+ # plt.imshow(recover_arr)
514
+ # plt.show()
515
+
516
+ ''' test blur '''
517
+ # blur
518
+
519
+ img = Image.open('house.jpg')
520
+ img = np.array(img) / 255.
521
+ img_r = np.transpose(img, [2, 0, 1])
522
+ img_tensor = torch.from_numpy(img_r).unsqueeze(0).float()
523
+ print(img_tensor.shape)
524
+
525
+ N_blur=7
526
+ f = random_blur_kernel(probs=[.25, .25], N_blur=N_blur, sigrange_gauss=[1., 3.], sigrange_line=[.25, 1.], wmin_line=3)
527
+ # print(f.shape)
528
+ # print(type(f))
529
+ encoded_image = F.conv2d(img_tensor, f, bias=None, padding=int((N_blur-1)/2))
530
+
531
+ encoded_image = encoded_image.detach().squeeze(0).numpy()
532
+ encoded_image = np.transpose(encoded_image, [1, 2, 0])
533
+
534
+ plt.subplot(121)
535
+ plt.imshow(img)
536
+ plt.subplot(122)
537
+ plt.imshow(encoded_image)
538
+ plt.show()
539
+
flae/models.py ADDED
@@ -0,0 +1,325 @@
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as thf
5
+ import pytorch_lightning as pl
6
+ from ldm.util import instantiate_from_config
7
+ import einops
8
+ import kornia
9
+ import numpy as np
10
+ import torchvision
11
+ from contextlib import contextmanager
12
+ from ldm.modules.ema import LitEma
13
+
14
+
15
+ class FlAE(pl.LightningModule):
16
+ def __init__(self,
17
+ cover_key,
18
+ secret_key,
19
+ secret_len,
20
+ resolution,
21
+ secret_encoder_config,
22
+ secret_decoder_config,
23
+ loss_config,
24
+ noise_config='__none__',
25
+ ckpt_path="__none__",
26
+ use_ema=False
27
+ ):
28
+ super().__init__()
29
+ self.cover_key = cover_key
30
+ self.secret_key = secret_key
31
+ secret_encoder_config.params.secret_len = secret_len
32
+ secret_decoder_config.params.secret_len = secret_len
33
+ secret_encoder_config.params.resolution = resolution
34
+ secret_decoder_config.params.resolution = 224
35
+ self.encoder = instantiate_from_config(secret_encoder_config)
36
+ self.decoder = instantiate_from_config(secret_decoder_config)
37
+ self.loss_layer = instantiate_from_config(loss_config)
38
+ if noise_config != '__none__':
39
+ print('Using noise')
40
+ self.noise = instantiate_from_config(noise_config)
41
+
42
+ self.use_ema = use_ema
43
+ if self.use_ema:
44
+ print('Using EMA')
45
+ self.encoder_ema = LitEma(self.encoder)
46
+ self.decoder_ema = LitEma(self.decoder)
47
+ print(f"Keeping EMAs of {len(list(self.encoder_ema.buffers()) + list(self.decoder_ema.buffers()))}.")
48
+
49
+ if ckpt_path != "__none__":
50
+ self.init_from_ckpt(ckpt_path, ignore_keys=[])
51
+
52
+ # early training phase
53
+ self.fixed_img = None
54
+ self.fixed_secret = None
55
+ self.register_buffer("fixed_input", torch.tensor(True))
56
+ self.crop = kornia.augmentation.CenterCrop((224, 224), cropping_mode="resample") # early training phase
57
+
58
+ def init_from_ckpt(self, path, ignore_keys=list()):
59
+ sd = torch.load(path, map_location="cpu")["state_dict"]
60
+ keys = list(sd.keys())
61
+ for k in keys:
62
+ for ik in ignore_keys:
63
+ if k.startswith(ik):
64
+ print("Deleting key {} from state_dict.".format(k))
65
+ del sd[k]
66
+ self.load_state_dict(sd, strict=False)
67
+ print(f"Restored from {path}")
68
+
69
+ @contextmanager
70
+ def ema_scope(self, context=None):
71
+ if self.use_ema:
72
+ self.encoder_ema.store(self.encoder.parameters())
73
+ self.decoder_ema.store(self.decoder.parameters())
74
+ self.encoder_ema.copy_to(self.encoder)
75
+ self.decoder_ema.copy_to(self.decoder)
76
+ if context is not None:
77
+ print(f"{context}: Switched to EMA weights")
78
+ try:
79
+ yield None
80
+ finally:
81
+ if self.use_ema:
82
+ self.encoder_ema.restore(self.encoder.parameters())
83
+ self.decoder_ema.restore(self.decoder.parameters())
84
+ if context is not None:
85
+ print(f"{context}: Restored training weights")
86
+
87
+ def on_train_batch_end(self, *args, **kwargs):
88
+ if self.use_ema:
89
+ self.encoder_ema(self.encoder)
90
+ self.decoder_ema(self.decoder)
91
+
92
+ @torch.no_grad()
93
+ def get_input(self, batch, bs=None):
94
+ image = batch[self.cover_key]
95
+ secret = batch[self.secret_key]
96
+ if bs is not None:
97
+ image = image[:bs]
98
+ secret = secret[:bs]
99
+ else:
100
+ bs = image.shape[0]
101
+ # encode image 1st stage
102
+ image = einops.rearrange(image, "b h w c -> b c h w").contiguous()
103
+
104
+ # check if using fixed input (early training phase)
105
+ # if self.training and self.fixed_input:
106
+ if self.fixed_input:
107
+ if self.fixed_img is None: # first iteration
108
+ print('[TRAINING] Warmup - using fixed input image for now!')
109
+ self.fixed_img = image.detach().clone()[:bs]
110
+ self.fixed_secret = secret.detach().clone()[:bs] # use for log_images with fixed_input option only
111
+ image = self.fixed_img
112
+ new_bs = min(secret.shape[0], image.shape[0])
113
+ image, secret = image[:new_bs], secret[:new_bs]
114
+
115
+ out = [image, secret]
116
+ return out
117
+
118
+ def forward(self, cover, secret):
119
+ # return a tuple (stego, residual)
120
+ enc_out = self.encoder(cover, secret)
121
+ if self.encoder.return_residual:
122
+ return cover + enc_out, enc_out
123
+ else:
124
+ return enc_out, enc_out - cover
125
+
126
+ def shared_step(self, batch):
127
+ x, s = self.get_input(batch)
128
+ stego, residual = self(x, s)
129
+ if hasattr(self, "noise") and self.noise.is_activated():
130
+ stego_noised = self.noise(stego, self.global_step, p=0.9)
131
+ else:
132
+ stego_noised = self.crop(stego)
133
+ stego_noised = torch.clamp(stego_noised, -1, 1)
134
+ spred = self.decoder(stego_noised)
135
+
136
+ loss, loss_dict = self.loss_layer(x, stego, None, s, spred, self.global_step)
137
+ bit_acc = loss_dict["bit_acc"]
138
+
139
+ bit_acc_ = bit_acc.item()
140
+
141
+ if (bit_acc_ > 0.98) and (not self.fixed_input) and self.noise.is_activated():
142
+ self.loss_layer.activate_ramp(self.global_step)
143
+
144
+ if (bit_acc_ > 0.95) and (not self.fixed_input): # ramp up image loss at late training stage
145
+ if hasattr(self, 'noise') and (not self.noise.is_activated()):
146
+ self.noise.activate(self.global_step)
147
+
148
+ if (bit_acc_ > 0.9) and self.fixed_input: # execute only once
149
+ print(f'[TRAINING] High bit acc ({bit_acc_}) achieved, switch to full image dataset training.')
150
+ self.fixed_input = ~self.fixed_input
151
+ return loss, loss_dict
152
+
153
+ def training_step(self, batch, batch_idx):
154
+ loss, loss_dict = self.shared_step(batch)
155
+ loss_dict = {f"train/{key}": val for key, val in loss_dict.items()}
156
+ self.log_dict(loss_dict, prog_bar=True,
157
+ logger=True, on_step=True, on_epoch=True)
158
+
159
+ self.log("global_step", self.global_step,
160
+ prog_bar=True, logger=True, on_step=True, on_epoch=False)
161
+ # if self.use_scheduler:
162
+ # lr = self.optimizers().param_groups[0]['lr']
163
+ # self.log('lr_abs', lr, prog_bar=True, logger=True, on_step=True, on_epoch=False)
164
+
165
+ return loss
166
+
167
+ @torch.no_grad()
168
+ def validation_step(self, batch, batch_idx):
169
+ _, loss_dict_no_ema = self.shared_step(batch)
170
+ loss_dict_no_ema = {f"val/{key}": val for key, val in loss_dict_no_ema.items() if key != 'img_lw'}
171
+ with self.ema_scope():
172
+ _, loss_dict_ema = self.shared_step(batch)
173
+ loss_dict_ema = {'val/' + key + '_ema': loss_dict_ema[key] for key in loss_dict_ema}
174
+ self.log_dict(loss_dict_no_ema, prog_bar=False, logger=True, on_step=False, on_epoch=True)
175
+ self.log_dict(loss_dict_ema, prog_bar=False, logger=True, on_step=False, on_epoch=True)
176
+
177
+ @torch.no_grad()
178
+ def log_images(self, batch, fixed_input=False, **kwargs):
179
+ log = dict()
180
+ if fixed_input and self.fixed_img is not None:
181
+ x, s = self.fixed_img, self.fixed_secret
182
+ else:
183
+ x, s = self.get_input(batch)
184
+ stego, residual = self(x, s)
185
+ if hasattr(self, 'noise') and self.noise.is_activated():
186
+ img_noise = self.noise(stego, self.global_step, p=1.0)
187
+ log['noised'] = img_noise
188
+ log['input'] = x
189
+ log['stego'] = stego
190
+ log['residual'] = (residual - residual.min()) / (residual.max() - residual.min() + 1e-8)*2 - 1
191
+ return log
192
+
193
+ def configure_optimizers(self):
194
+ lr = self.learning_rate
195
+ params = list(self.encoder.parameters()) + list(self.decoder.parameters())
196
+ optimizer = torch.optim.AdamW(params, lr=lr)
197
+ return optimizer
198
+
199
+
200
+
201
+
202
+ class SecretEncoder(nn.Module):
203
+ def __init__(self, resolution=256, secret_len=100, return_residual=False, act='tanh') -> None:
204
+ super().__init__()
205
+ self.secret_len = secret_len
206
+ self.return_residual = return_residual
207
+ self.act_fn = lambda x: torch.tanh(x) if act == 'tanh' else thf.sigmoid(x) * 2.0 -1.0
208
+ self.secret_dense = nn.Linear(secret_len, 16*16*3)
209
+ log_resolution = int(math.log(resolution, 2))
210
+ assert resolution == 2 ** log_resolution, f"Image resolution must be a power of 2, got {resolution}."
211
+ self.secret_upsample = nn.Upsample(scale_factor=(2**(log_resolution-4), 2**(log_resolution-4)))
212
+ self.conv1 = nn.Conv2d(2 * 3, 32, 3, 1, 1)
213
+ self.conv2 = nn.Conv2d(32, 32, 3, 2, 1)
214
+ self.conv3 = nn.Conv2d(32, 64, 3, 2, 1)
215
+ self.conv4 = nn.Conv2d(64, 128, 3, 2, 1)
216
+ self.conv5 = nn.Conv2d(128, 256, 3, 2, 1)
217
+ self.pad6 = nn.ZeroPad2d((0, 1, 0, 1))
218
+ self.up6 = nn.Conv2d(256, 128, 2, 1)
219
+ self.upsample6 = nn.Upsample(scale_factor=(2, 2))
220
+ self.conv6 = nn.Conv2d(128 + 128, 128, 3, 1, 1)
221
+ self.pad7 = nn.ZeroPad2d((0, 1, 0, 1))
222
+ self.up7 = nn.Conv2d(128, 64, 2, 1)
223
+ self.upsample7 = nn.Upsample(scale_factor=(2, 2))
224
+ self.conv7 = nn.Conv2d(64 + 64, 64, 3, 1, 1)
225
+ self.pad8 = nn.ZeroPad2d((0, 1, 0, 1))
226
+ self.up8 = nn.Conv2d(64, 32, 2, 1)
227
+ self.upsample8 = nn.Upsample(scale_factor=(2, 2))
228
+ self.conv8 = nn.Conv2d(32 + 32, 32, 3, 1, 1)
229
+ self.pad9 = nn.ZeroPad2d((0, 1, 0, 1))
230
+ self.up9 = nn.Conv2d(32, 32, 2, 1)
231
+ self.upsample9 = nn.Upsample(scale_factor=(2, 2))
232
+ self.conv9 = nn.Conv2d(32 + 32 + 2 * 3, 32, 3, 1, 1)
233
+ self.conv10 = nn.Conv2d(32, 32, 3, 1, 1)
234
+ self.residual = nn.Conv2d(32, 3, 1)
235
+
236
+ def forward(self, image, secret):
237
+ fingerprint = thf.relu(self.secret_dense(secret))
238
+ fingerprint = fingerprint.view((-1, 3, 16, 16))
239
+ fingerprint_enlarged = self.secret_upsample(fingerprint)
240
+ # try:
241
+ inputs = torch.cat([fingerprint_enlarged, image], dim=1)
242
+ # except:
243
+ # print(fingerprint_enlarged.shape, image.shape, fingerprint.shape)
244
+ # import pdb; pdb.set_trace()
245
+ conv1 = thf.relu(self.conv1(inputs))
246
+ conv2 = thf.relu(self.conv2(conv1))
247
+ conv3 = thf.relu(self.conv3(conv2))
248
+ conv4 = thf.relu(self.conv4(conv3))
249
+ conv5 = thf.relu(self.conv5(conv4))
250
+ up6 = thf.relu(self.up6(self.pad6(self.upsample6(conv5))))
251
+ merge6 = torch.cat([conv4, up6], dim=1)
252
+ conv6 = thf.relu(self.conv6(merge6))
253
+ up7 = thf.relu(self.up7(self.pad7(self.upsample7(conv6))))
254
+ merge7 = torch.cat([conv3, up7], dim=1)
255
+ conv7 = thf.relu(self.conv7(merge7))
256
+ up8 = thf.relu(self.up8(self.pad8(self.upsample8(conv7))))
257
+ merge8 = torch.cat([conv2, up8], dim=1)
258
+ conv8 = thf.relu(self.conv8(merge8))
259
+ up9 = thf.relu(self.up9(self.pad9(self.upsample9(conv8))))
260
+ merge9 = torch.cat([conv1, up9, inputs], dim=1)
261
+ conv9 = thf.relu(self.conv9(merge9))
262
+ conv10 = thf.relu(self.conv10(conv9))
263
+ residual = self.residual(conv10)
264
+ residual = self.act_fn(residual)
265
+ return residual
266
+
267
+
268
+ class SecretEncoder1(nn.Module):
269
+ def __init__(self, resolution=256, secret_len=100) -> None:
270
+ pass
271
+
272
+ class SecretDecoder(nn.Module):
273
+ def __init__(self, arch='resnet18', resolution=224, secret_len=100):
274
+ super().__init__()
275
+ self.resolution = resolution
276
+ self.arch = arch
277
+ if arch == 'resnet18':
278
+ self.decoder = torchvision.models.resnet18(pretrained=True, progress=False)
279
+ self.decoder.fc = nn.Linear(self.decoder.fc.in_features, secret_len)
280
+ elif arch == 'resnet50':
281
+ self.decoder = torchvision.models.resnet50(pretrained=True, progress=False)
282
+ self.decoder.fc = nn.Linear(self.decoder.fc.in_features, secret_len)
283
+ elif arch == 'simple':
284
+ self.decoder = SimpleCNN(resolution, secret_len)
285
+ else:
286
+ raise ValueError('Unknown architecture')
287
+
288
+ def forward(self, image):
289
+ if self.arch in ['resnet50', 'resnet18'] and image.shape[-1] > self.resolution:
290
+ image = thf.interpolate(image, size=(self.resolution, self.resolution), mode='bilinear', align_corners=False)
291
+ x = self.decoder(image)
292
+ return x
293
+
294
+
295
+ class SimpleCNN(nn.Module):
296
+ def __init__(self, resolution=224, secret_len=100):
297
+ super().__init__()
298
+ self.resolution = resolution
299
+ self.IMAGE_CHANNELS = 3
300
+ self.decoder = nn.Sequential(
301
+ nn.Conv2d(self.IMAGE_CHANNELS, 32, (3, 3), 2, 1), # resolution / 2
302
+ nn.ReLU(),
303
+ nn.Conv2d(32, 32, 3, 1, 1),
304
+ nn.ReLU(),
305
+ nn.Conv2d(32, 64, 3, 2, 1), # resolution / 4
306
+ nn.ReLU(),
307
+ nn.Conv2d(64, 64, 3, 1, 1),
308
+ nn.ReLU(),
309
+ nn.Conv2d(64, 64, 3, 2, 1), # resolution / 8
310
+ nn.ReLU(),
311
+ nn.Conv2d(64, 128, 3, 2, 1), # resolution / 16
312
+ nn.ReLU(),
313
+ nn.Conv2d(128, 128, (3, 3), 2, 1), # resolution / 32
314
+ nn.ReLU(),
315
+ )
316
+ self.dense = nn.Sequential(
317
+ nn.Linear(resolution * resolution * 128 // 32 // 32, 512),
318
+ nn.ReLU(),
319
+ nn.Linear(512, secret_len),
320
+ )
321
+
322
+ def forward(self, image):
323
+ x = self.decoder(image)
324
+ x = x.view(-1, self.resolution * self.resolution * 128 // 32 // 32)
325
+ return self.dense(x)
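A minimal round-trip sketch with the raw encoder/decoder modules above (not part of the commit; assumes flae.models is importable, torchvision downloads the pretrained resnet18 weights, the covers and payload bits are random, and thresholding the logits at 0 is only a stand-in for the trained bit decision):

import torch
from flae.models import SecretEncoder, SecretDecoder

enc = SecretEncoder(resolution=256, secret_len=100, return_residual=True)
dec = SecretDecoder(arch='resnet18', resolution=224, secret_len=100)

cover = torch.rand(2, 3, 256, 256) * 2 - 1      # dummy covers in [-1, 1]
secret = torch.randint(0, 2, (2, 100)).float()  # 100 random payload bits per image
residual = enc(cover, secret)                   # tanh-bounded residual, same shape as cover
stego = (cover + residual).clamp(-1, 1)
logits = dec(stego)                             # (2, 100); resized to 224x224 inside the decoder
bit_acc = ((logits > 0).float() == secret).float().mean()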
flae/munit.py ADDED
@@ -0,0 +1,576 @@
1
+ """
2
+ Copyright (C) 2018 NVIDIA Corporation. All rights reserved.
3
+ Licensed under the CC BY-NC-SA 4.0 license (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode).
4
+ """
5
+ from torch import nn
6
+ from torch.autograd import Variable
7
+ import torch
8
+ import torch.nn.functional as F
9
+ try:
10
+ from itertools import izip as zip
11
+ except ImportError: # will be 3.x series
12
+ pass
13
+
14
+ ##################################################################################
15
+ # Discriminator
16
+ ##################################################################################
17
+
18
+ class MsImageDis(nn.Module):
19
+ # Multi-scale discriminator architecture
20
+ def __init__(self, input_dim, params):
21
+ super(MsImageDis, self).__init__()
22
+ self.n_layer = params['n_layer']
23
+ self.gan_type = params['gan_type']
24
+ self.dim = params['dim']
25
+ self.norm = params['norm']
26
+ self.activ = params['activ']
27
+ self.num_scales = params['num_scales']
28
+ self.pad_type = params['pad_type']
29
+ self.input_dim = input_dim
30
+ self.downsample = nn.AvgPool2d(3, stride=2, padding=[1, 1], count_include_pad=False)
31
+ self.cnns = nn.ModuleList()
32
+ for _ in range(self.num_scales):
33
+ self.cnns.append(self._make_net())
34
+
35
+ def _make_net(self):
36
+ dim = self.dim
37
+ cnn_x = []
38
+ cnn_x += [Conv2dBlock(self.input_dim, dim, 4, 2, 1, norm='none', activation=self.activ, pad_type=self.pad_type)]
39
+ for i in range(self.n_layer - 1):
40
+ cnn_x += [Conv2dBlock(dim, dim * 2, 4, 2, 1, norm=self.norm, activation=self.activ, pad_type=self.pad_type)]
41
+ dim *= 2
42
+ cnn_x += [nn.Conv2d(dim, 1, 1, 1, 0)]
43
+ cnn_x = nn.Sequential(*cnn_x)
44
+ return cnn_x
45
+
46
+ def forward(self, x):
47
+ outputs = []
48
+ for model in self.cnns:
49
+ outputs.append(model(x))
50
+ x = self.downsample(x)
51
+ return outputs
52
+
53
+ def calc_dis_loss(self, input_fake, input_real):
54
+ # calculate the loss to train D
55
+ outs0 = self.forward(input_fake)
56
+ outs1 = self.forward(input_real)
57
+ loss = 0
58
+
59
+ for it, (out0, out1) in enumerate(zip(outs0, outs1)):
60
+ if self.gan_type == 'lsgan':
61
+ loss += torch.mean((out0 - 0)**2) + torch.mean((out1 - 1)**2)
62
+ elif self.gan_type == 'nsgan':
63
+ all0 = Variable(torch.zeros_like(out0.data).cuda(), requires_grad=False)
64
+ all1 = Variable(torch.ones_like(out1.data).cuda(), requires_grad=False)
65
+ loss += torch.mean(F.binary_cross_entropy(F.sigmoid(out0), all0) +
66
+ F.binary_cross_entropy(F.sigmoid(out1), all1))
67
+ else:
68
+ assert 0, "Unsupported GAN type: {}".format(self.gan_type)
69
+ return loss
70
+
71
+ def calc_gen_loss(self, input_fake):
72
+ # calculate the loss to train G
73
+ outs0 = self.forward(input_fake)
74
+ loss = 0
75
+ for it, (out0) in enumerate(outs0):
76
+ if self.gan_type == 'lsgan':
77
+ loss += torch.mean((out0 - 1)**2) # LSGAN
78
+ elif self.gan_type == 'nsgan':
79
+ all1 = Variable(torch.ones_like(out0.data).cuda(), requires_grad=False)
80
+ loss += torch.mean(F.binary_cross_entropy(F.sigmoid(out0), all1))
81
+ else:
82
+ assert 0, "Unsupported GAN type: {}".format(self.gan_type)
83
+ return loss
84
+
85
+ ##################################################################################
86
+ # Generator
87
+ ##################################################################################
88
+
89
+ class AdaINGen(nn.Module):
90
+ # AdaIN auto-encoder architecture
91
+ def __init__(self, input_dim, params):
92
+ super(AdaINGen, self).__init__()
93
+ dim = params['dim']
94
+ style_dim = params['style_dim']
95
+ n_downsample = params['n_downsample']
96
+ n_res = params['n_res']
97
+ activ = params['activ']
98
+ pad_type = params['pad_type']
99
+ mlp_dim = params['mlp_dim']
100
+
101
+ # style encoder
102
+ self.enc_style = StyleEncoder(4, input_dim, dim, style_dim, norm='none', activ=activ, pad_type=pad_type)
103
+
104
+ # content encoder
105
+ self.enc_content = ContentEncoder(n_downsample, n_res, input_dim, dim, 'in', activ, pad_type=pad_type)
106
+ self.dec = Decoder(n_downsample, n_res, self.enc_content.output_dim, input_dim, res_norm='adain', activ=activ, pad_type=pad_type)
107
+
108
+ # MLP to generate AdaIN parameters
109
+ self.mlp = MLP(style_dim, self.get_num_adain_params(self.dec), mlp_dim, 3, norm='none', activ=activ)
110
+
111
+ def forward(self, images):
112
+ # reconstruct an image
113
+ content, style_fake = self.encode(images)
114
+ images_recon = self.decode(content, style_fake)
115
+ return images_recon
116
+
117
+ def encode(self, images):
118
+ # encode an image to its content and style codes
119
+ style_fake = self.enc_style(images)
120
+ content = self.enc_content(images)
121
+ return content, style_fake
122
+
123
+ def decode(self, content, style):
124
+ # decode content and style codes to an image
125
+ adain_params = self.mlp(style)
126
+ self.assign_adain_params(adain_params, self.dec)
127
+ images = self.dec(content)
128
+ return images
129
+
130
+ def assign_adain_params(self, adain_params, model):
131
+ # assign the adain_params to the AdaIN layers in model
132
+ for m in model.modules():
133
+ if m.__class__.__name__ == "AdaptiveInstanceNorm2d":
134
+ mean = adain_params[:, :m.num_features]
135
+ std = adain_params[:, m.num_features:2*m.num_features]
136
+ m.bias = mean.contiguous().view(-1)
137
+ m.weight = std.contiguous().view(-1)
138
+ if adain_params.size(1) > 2*m.num_features:
139
+ adain_params = adain_params[:, 2*m.num_features:]
140
+
141
+ def get_num_adain_params(self, model):
142
+ # return the number of AdaIN parameters needed by the model
143
+ num_adain_params = 0
144
+ for m in model.modules():
145
+ if m.__class__.__name__ == "AdaptiveInstanceNorm2d":
146
+ num_adain_params += 2*m.num_features
147
+ return num_adain_params
148
+
149
+
150
+ class VAEGen(nn.Module):
151
+ # VAE architecture
152
+ def __init__(self, input_dim, params):
153
+ super(VAEGen, self).__init__()
154
+ dim = params['dim']
155
+ n_downsample = params['n_downsample']
156
+ n_res = params['n_res']
157
+ activ = params['activ']
158
+ pad_type = params['pad_type']
159
+
160
+ # content encoder
161
+ self.enc = ContentEncoder(n_downsample, n_res, input_dim, dim, 'in', activ, pad_type=pad_type)
162
+ self.dec = Decoder(n_downsample, n_res, self.enc.output_dim, input_dim, res_norm='in', activ=activ, pad_type=pad_type)
163
+
164
+ def forward(self, images):
165
+ # This is a reduced VAE implementation where we assume the outputs follow a multivariate Gaussian distribution with mean = hiddens and std_dev = all ones.
166
+ hiddens, _ = self.encode(images)  # encode() returns (hiddens, noise); only hiddens is needed here
167
+ if self.training == True:
168
+ noise = Variable(torch.randn(hiddens.size()).cuda(hiddens.data.get_device()))
169
+ images_recon = self.decode(hiddens + noise)
170
+ else:
171
+ images_recon = self.decode(hiddens)
172
+ return images_recon, hiddens
173
+
174
+ def encode(self, images):
175
+ hiddens = self.enc(images)
176
+ noise = Variable(torch.randn(hiddens.size()).cuda(hiddens.data.get_device()))
177
+ return hiddens, noise
178
+
179
+ def decode(self, hiddens):
180
+ images = self.dec(hiddens)
181
+ return images
182
+
183
+
184
+ ##################################################################################
185
+ # Encoder and Decoders
186
+ ##################################################################################
187
+
188
+ class StyleEncoder(nn.Module):
189
+ def __init__(self, n_downsample, input_dim, dim, style_dim, norm, activ, pad_type):
190
+ super(StyleEncoder, self).__init__()
191
+ self.model = []
192
+ self.model += [Conv2dBlock(input_dim, dim, 7, 1, 3, norm=norm, activation=activ, pad_type=pad_type)]
193
+ for i in range(2):
194
+ self.model += [Conv2dBlock(dim, 2 * dim, 4, 2, 1, norm=norm, activation=activ, pad_type=pad_type)]
195
+ dim *= 2
196
+ for i in range(n_downsample - 2):
197
+ self.model += [Conv2dBlock(dim, dim, 4, 2, 1, norm=norm, activation=activ, pad_type=pad_type)]
198
+ self.model += [nn.AdaptiveAvgPool2d(1)] # global average pooling
199
+ self.model += [nn.Conv2d(dim, style_dim, 1, 1, 0)]
200
+ self.model = nn.Sequential(*self.model)
201
+ self.output_dim = dim
202
+
203
+ def forward(self, x):
204
+ return self.model(x)
205
+
206
+ class ContentEncoder(nn.Module):
207
+ def __init__(self, n_downsample, n_res, input_dim, dim, norm, activ, pad_type):
208
+ super(ContentEncoder, self).__init__()
209
+ self.model = []
210
+ self.model += [Conv2dBlock(input_dim, dim, 7, 1, 3, norm=norm, activation=activ, pad_type=pad_type)]
211
+ # downsampling blocks
212
+ for i in range(n_downsample):
213
+ self.model += [Conv2dBlock(dim, 2 * dim, 4, 2, 1, norm=norm, activation=activ, pad_type=pad_type)]
214
+ dim *= 2
215
+ # residual blocks
216
+ self.model += [ResBlocks(n_res, dim, norm=norm, activation=activ, pad_type=pad_type)]
217
+ self.model = nn.Sequential(*self.model)
218
+ self.output_dim = dim
219
+
220
+ def forward(self, x):
221
+ return self.model(x)
222
+
223
+ class Decoder(nn.Module):
224
+ def __init__(self, n_upsample, n_res, dim, output_dim, res_norm='adain', activ='relu', pad_type='zero'):
225
+ super(Decoder, self).__init__()
226
+
227
+ self.model = []
228
+ # AdaIN residual blocks
229
+ self.model += [ResBlocks(n_res, dim, res_norm, activ, pad_type=pad_type)]
230
+ # upsampling blocks
231
+ for i in range(n_upsample):
232
+ self.model += [nn.Upsample(scale_factor=2),
233
+ Conv2dBlock(dim, dim // 2, 5, 1, 2, norm='ln', activation=activ, pad_type=pad_type)]
234
+ dim //= 2
235
+ # use reflection padding in the last conv layer
236
+ self.model += [Conv2dBlock(dim, output_dim, 7, 1, 3, norm='none', activation='tanh', pad_type=pad_type)]
237
+ self.model = nn.Sequential(*self.model)
238
+
239
+ def forward(self, x):
240
+ return self.model(x)
241
+
242
+ ##################################################################################
243
+ # Sequential Models
244
+ ##################################################################################
245
+ class ResBlocks(nn.Module):
246
+ def __init__(self, num_blocks, dim, norm='in', activation='relu', pad_type='zero'):
247
+ super(ResBlocks, self).__init__()
248
+ self.model = []
249
+ for i in range(num_blocks):
250
+ self.model += [ResBlock(dim, norm=norm, activation=activation, pad_type=pad_type)]
251
+ self.model = nn.Sequential(*self.model)
252
+
253
+ def forward(self, x):
254
+ return self.model(x)
255
+
256
+ class MLP(nn.Module):
257
+ def __init__(self, input_dim, output_dim, dim, n_blk, norm='none', activ='relu'):
258
+
259
+ super(MLP, self).__init__()
260
+ self.model = []
261
+ self.model += [LinearBlock(input_dim, dim, norm=norm, activation=activ)]
262
+ for i in range(n_blk - 2):
263
+ self.model += [LinearBlock(dim, dim, norm=norm, activation=activ)]
264
+ self.model += [LinearBlock(dim, output_dim, norm='none', activation='none')] # no output activations
265
+ self.model = nn.Sequential(*self.model)
266
+
267
+ def forward(self, x):
268
+ return self.model(x.view(x.size(0), -1))
269
+
270
+ ##################################################################################
271
+ # Basic Blocks
272
+ ##################################################################################
273
+ class ResBlock(nn.Module):
274
+ def __init__(self, dim, norm='in', activation='relu', pad_type='zero'):
275
+ super(ResBlock, self).__init__()
276
+
277
+ model = []
278
+ model += [Conv2dBlock(dim, dim, 3, 1, 1, norm=norm, activation=activation, pad_type=pad_type)]
279
+ model += [Conv2dBlock(dim, dim, 3, 1, 1, norm=norm, activation='none', pad_type=pad_type)]
280
+ self.model = nn.Sequential(*model)
281
+
282
+ def forward(self, x):
283
+ residual = x
284
+ out = self.model(x)
285
+ out += residual
286
+ return out
287
+
288
+ class Conv2dBlock(nn.Module):
289
+ def __init__(self, input_dim, output_dim, kernel_size, stride,
290
+ padding=0, norm='none', activation='relu', pad_type='zero'):
291
+ super(Conv2dBlock, self).__init__()
292
+ self.use_bias = True
293
+ # initialize padding
294
+ if pad_type == 'reflect':
295
+ self.pad = nn.ReflectionPad2d(padding)
296
+ elif pad_type == 'replicate':
297
+ self.pad = nn.ReplicationPad2d(padding)
298
+ elif pad_type == 'zero':
299
+ self.pad = nn.ZeroPad2d(padding)
300
+ else:
301
+ assert 0, "Unsupported padding type: {}".format(pad_type)
302
+
303
+ # initialize normalization
304
+ norm_dim = output_dim
305
+ if norm == 'bn':
306
+ self.norm = nn.BatchNorm2d(norm_dim)
307
+ elif norm == 'in':
308
+ #self.norm = nn.InstanceNorm2d(norm_dim, track_running_stats=True)
309
+ self.norm = nn.InstanceNorm2d(norm_dim)
310
+ elif norm == 'ln':
311
+ self.norm = LayerNorm(norm_dim)
312
+ elif norm == 'adain':
313
+ self.norm = AdaptiveInstanceNorm2d(norm_dim)
314
+ elif norm == 'none' or norm == 'sn':
315
+ self.norm = None
316
+ else:
317
+ assert 0, "Unsupported normalization: {}".format(norm)
318
+
319
+ # initialize activation
320
+ if activation == 'relu':
321
+ self.activation = nn.ReLU(inplace=True)
322
+ elif activation == 'lrelu':
323
+ self.activation = nn.LeakyReLU(0.2, inplace=True)
324
+ elif activation == 'prelu':
325
+ self.activation = nn.PReLU()
326
+ elif activation == 'selu':
327
+ self.activation = nn.SELU(inplace=True)
328
+ elif activation == 'tanh':
329
+ self.activation = nn.Tanh()
330
+ elif activation == 'none':
331
+ self.activation = None
332
+ else:
333
+ assert 0, "Unsupported activation: {}".format(activation)
334
+
335
+ # initialize convolution
336
+ if norm == 'sn':
337
+ self.conv = SpectralNorm(nn.Conv2d(input_dim, output_dim, kernel_size, stride, bias=self.use_bias))
338
+ else:
339
+ self.conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride, bias=self.use_bias)
340
+
341
+ def forward(self, x):
342
+ x = self.conv(self.pad(x))
343
+ if self.norm:
344
+ x = self.norm(x)
345
+ if self.activation:
346
+ x = self.activation(x)
347
+ return x
348
+
349
+ class LinearBlock(nn.Module):
350
+ def __init__(self, input_dim, output_dim, norm='none', activation='relu'):
351
+ super(LinearBlock, self).__init__()
352
+ use_bias = True
353
+ # initialize fully connected layer
354
+ if norm == 'sn':
355
+ self.fc = SpectralNorm(nn.Linear(input_dim, output_dim, bias=use_bias))
356
+ else:
357
+ self.fc = nn.Linear(input_dim, output_dim, bias=use_bias)
358
+
359
+ # initialize normalization
360
+ norm_dim = output_dim
361
+ if norm == 'bn':
362
+ self.norm = nn.BatchNorm1d(norm_dim)
363
+ elif norm == 'in':
364
+ self.norm = nn.InstanceNorm1d(norm_dim)
365
+ elif norm == 'ln':
366
+ self.norm = LayerNorm(norm_dim)
367
+ elif norm == 'none' or norm == 'sn':
368
+ self.norm = None
369
+ else:
370
+ assert 0, "Unsupported normalization: {}".format(norm)
371
+
372
+ # initialize activation
373
+ if activation == 'relu':
374
+ self.activation = nn.ReLU(inplace=True)
375
+ elif activation == 'lrelu':
376
+ self.activation = nn.LeakyReLU(0.2, inplace=True)
377
+ elif activation == 'prelu':
378
+ self.activation = nn.PReLU()
379
+ elif activation == 'selu':
380
+ self.activation = nn.SELU(inplace=True)
381
+ elif activation == 'tanh':
382
+ self.activation = nn.Tanh()
383
+ elif activation == 'none':
384
+ self.activation = None
385
+ else:
386
+ assert 0, "Unsupported activation: {}".format(activation)
387
+
388
+ def forward(self, x):
389
+ out = self.fc(x)
390
+ if self.norm:
391
+ out = self.norm(out)
392
+ if self.activation:
393
+ out = self.activation(out)
394
+ return out
395
+
396
+ ##################################################################################
397
+ # VGG network definition
398
+ ##################################################################################
399
+ class Vgg16(nn.Module):
400
+ def __init__(self):
401
+ super(Vgg16, self).__init__()
402
+ self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)
403
+ self.conv1_2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
404
+
405
+ self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
406
+ self.conv2_2 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1)
407
+
408
+ self.conv3_1 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1)
409
+ self.conv3_2 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
410
+ self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
411
+
412
+ self.conv4_1 = nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1)
413
+ self.conv4_2 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
414
+ self.conv4_3 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
415
+
416
+ self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
417
+ self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
418
+ self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
419
+
420
+ def forward(self, X):
421
+ h = F.relu(self.conv1_1(X), inplace=True)
422
+ h = F.relu(self.conv1_2(h), inplace=True)
423
+ # relu1_2 = h
424
+ h = F.max_pool2d(h, kernel_size=2, stride=2)
425
+
426
+ h = F.relu(self.conv2_1(h), inplace=True)
427
+ h = F.relu(self.conv2_2(h), inplace=True)
428
+ # relu2_2 = h
429
+ h = F.max_pool2d(h, kernel_size=2, stride=2)
430
+
431
+ h = F.relu(self.conv3_1(h), inplace=True)
432
+ h = F.relu(self.conv3_2(h), inplace=True)
433
+ h = F.relu(self.conv3_3(h), inplace=True)
434
+ # relu3_3 = h
435
+ h = F.max_pool2d(h, kernel_size=2, stride=2)
436
+
437
+ h = F.relu(self.conv4_1(h), inplace=True)
438
+ h = F.relu(self.conv4_2(h), inplace=True)
439
+ h = F.relu(self.conv4_3(h), inplace=True)
440
+ # relu4_3 = h
441
+
442
+ h = F.relu(self.conv5_1(h), inplace=True)
443
+ h = F.relu(self.conv5_2(h), inplace=True)
444
+ h = F.relu(self.conv5_3(h), inplace=True)
445
+ relu5_3 = h
446
+
447
+ return relu5_3
448
+ # return [relu1_2, relu2_2, relu3_3, relu4_3]
449
+
450
+ ##################################################################################
451
+ # Normalization layers
452
+ ##################################################################################
453
+ class AdaptiveInstanceNorm2d(nn.Module):
454
+ def __init__(self, num_features, eps=1e-5, momentum=0.1):
455
+ super(AdaptiveInstanceNorm2d, self).__init__()
456
+ self.num_features = num_features
457
+ self.eps = eps
458
+ self.momentum = momentum
459
+ # weight and bias are dynamically assigned
460
+ self.weight = None
461
+ self.bias = None
462
+ # just dummy buffers, not used
463
+ self.register_buffer('running_mean', torch.zeros(num_features))
464
+ self.register_buffer('running_var', torch.ones(num_features))
465
+
466
+ def forward(self, x):
467
+ assert self.weight is not None and self.bias is not None, "Please assign weight and bias before calling AdaIN!"
468
+ b, c = x.size(0), x.size(1)
469
+ running_mean = self.running_mean.repeat(b)
470
+ running_var = self.running_var.repeat(b)
471
+
472
+ # Apply instance norm
473
+ x_reshaped = x.contiguous().view(1, b * c, *x.size()[2:])
474
+
475
+ out = F.batch_norm(
476
+ x_reshaped, running_mean, running_var, self.weight, self.bias,
477
+ True, self.momentum, self.eps)
478
+
479
+ return out.view(b, c, *x.size()[2:])
480
+
481
+ def __repr__(self):
482
+ return self.__class__.__name__ + '(' + str(self.num_features) + ')'
483
+
484
+
485
+ class LayerNorm(nn.Module):
486
+ def __init__(self, num_features, eps=1e-5, affine=True):
487
+ super(LayerNorm, self).__init__()
488
+ self.num_features = num_features
489
+ self.affine = affine
490
+ self.eps = eps
491
+
492
+ if self.affine:
493
+ self.gamma = nn.Parameter(torch.Tensor(num_features).uniform_())
494
+ self.beta = nn.Parameter(torch.zeros(num_features))
495
+
496
+ def forward(self, x):
497
+ shape = [-1] + [1] * (x.dim() - 1)
498
+ # print(x.size())
499
+ if x.size(0) == 1:
500
+ # These two lines run much faster in pytorch 0.4 than the two lines listed below.
501
+ mean = x.view(-1).mean().view(*shape)
502
+ std = x.view(-1).std().view(*shape)
503
+ else:
504
+ mean = x.view(x.size(0), -1).mean(1).view(*shape)
505
+ std = x.view(x.size(0), -1).std(1).view(*shape)
506
+
507
+ x = (x - mean) / (std + self.eps)
508
+
509
+ if self.affine:
510
+ shape = [1, -1] + [1] * (x.dim() - 2)
511
+ x = x * self.gamma.view(*shape) + self.beta.view(*shape)
512
+ return x
513
+
514
+ def l2normalize(v, eps=1e-12):
515
+ return v / (v.norm() + eps)
516
+
517
+
518
+ class SpectralNorm(nn.Module):
519
+ """
520
+ Based on the paper "Spectral Normalization for Generative Adversarial Networks" by Takeru Miyato, Toshiki Kataoka, Masanori Koyama, Yuichi Yoshida
521
+ and the Pytorch implementation https://github.com/christiancosgrove/pytorch-spectral-normalization-gan
522
+ """
523
+ def __init__(self, module, name='weight', power_iterations=1):
524
+ super(SpectralNorm, self).__init__()
525
+ self.module = module
526
+ self.name = name
527
+ self.power_iterations = power_iterations
528
+ if not self._made_params():
529
+ self._make_params()
530
+
531
+ def _update_u_v(self):
532
+ u = getattr(self.module, self.name + "_u")
533
+ v = getattr(self.module, self.name + "_v")
534
+ w = getattr(self.module, self.name + "_bar")
535
+
536
+ height = w.data.shape[0]
537
+ for _ in range(self.power_iterations):
538
+ v.data = l2normalize(torch.mv(torch.t(w.view(height,-1).data), u.data))
539
+ u.data = l2normalize(torch.mv(w.view(height,-1).data, v.data))
540
+
541
+ # sigma = torch.dot(u.data, torch.mv(w.view(height,-1).data, v.data))
542
+ sigma = u.dot(w.view(height, -1).mv(v))
543
+ setattr(self.module, self.name, w / sigma.expand_as(w))
544
+
545
+ def _made_params(self):
546
+ try:
547
+ u = getattr(self.module, self.name + "_u")
548
+ v = getattr(self.module, self.name + "_v")
549
+ w = getattr(self.module, self.name + "_bar")
550
+ return True
551
+ except AttributeError:
552
+ return False
553
+
554
+
555
+ def _make_params(self):
556
+ w = getattr(self.module, self.name)
557
+
558
+ height = w.data.shape[0]
559
+ width = w.view(height, -1).data.shape[1]
560
+
561
+ u = nn.Parameter(w.data.new(height).normal_(0, 1), requires_grad=False)
562
+ v = nn.Parameter(w.data.new(width).normal_(0, 1), requires_grad=False)
563
+ u.data = l2normalize(u.data)
564
+ v.data = l2normalize(v.data)
565
+ w_bar = nn.Parameter(w.data)
566
+
567
+ del self.module._parameters[self.name]
568
+
569
+ self.module.register_parameter(self.name + "_u", u)
570
+ self.module.register_parameter(self.name + "_v", v)
571
+ self.module.register_parameter(self.name + "_bar", w_bar)
572
+
573
+
574
+ def forward(self, *args):
575
+ self._update_u_v()
576
+ return self.module.forward(*args)
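Note (not part of the commit): the MUNIT blocks above are self-contained, so a quick CPU smoke test is possible. The sketch below assumes the file lives at flae/munit.py (it is imported that way by flae/unet.py further down); the hyperparameter dicts are arbitrary values chosen only for illustration, not the ones used in training.

    import torch
    from flae.munit import MsImageDis, AdaINGen

    # hypothetical hyperparameters, only for the smoke test
    dis_params = {'n_layer': 4, 'gan_type': 'lsgan', 'dim': 64, 'norm': 'none',
                  'activ': 'lrelu', 'num_scales': 3, 'pad_type': 'reflect'}
    gen_params = {'dim': 64, 'style_dim': 8, 'n_downsample': 2, 'n_res': 4,
                  'activ': 'relu', 'pad_type': 'reflect', 'mlp_dim': 256}

    x = torch.randn(1, 3, 256, 256)                  # fake RGB batch
    dis = MsImageDis(input_dim=3, params=dis_params)
    gen = AdaINGen(input_dim=3, params=gen_params)

    with torch.no_grad():
        logits = dis(x)                              # one logit map per scale
        recon = gen(x)                               # encode -> AdaIN decode round trip
    print([o.shape for o in logits], recon.shape)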
flae/unet.py ADDED
@@ -0,0 +1,123 @@
1
+ from torch import nn
2
+ from torch.autograd import Variable
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from .munit import ResBlocks, Conv2dBlock
6
+ import math
7
+
8
+
9
+ class Unet(nn.Module):
10
+ def __init__(self, resolution=256, secret_len=100, return_residual=False) -> None:
11
+ super().__init__()
12
+ self.secret_len = secret_len
13
+ self.return_residual = return_residual
14
+ self.secret_dense = nn.Linear(secret_len, 16*16*3)
15
+ log_resolution = int(math.log(resolution, 2))
16
+ assert resolution == 2 ** log_resolution, f"Image resolution must be a power of 2, got {resolution}."
17
+ self.secret_upsample = nn.Upsample(scale_factor=(2**(log_resolution-4), 2**(log_resolution-4)))
18
+
19
+ self.enc = Encoder(2, 4, 6, 64, 'bn', 'relu', 'reflect')
20
+ self.dec = Decoder(2, 4, self.enc.output_dim, 3, 'bn', 'relu', 'reflect')
21
+
22
+ def forward(self, image, secret):
23
+ # import pdb; pdb.set_trace()
24
+ fingerprint = F.relu(self.secret_dense(secret))
25
+ fingerprint = fingerprint.view((-1, 3, 16, 16))
26
+ fingerprint_enlarged = self.secret_upsample(fingerprint)
27
+ inputs = torch.cat([fingerprint_enlarged, image], dim=1)
28
+ emb = self.enc(inputs)
29
+ # import pdb; pdb.set_trace()
30
+ out = self.dec(emb)
31
+ return out
32
+
33
+ class Encoder(nn.Module):
34
+ def __init__(self, n_downsample, n_res, input_dim, dim, norm, activ, pad_type):
35
+ super().__init__()
36
+ self.model = []
37
+ self.model += [Conv2dBlock(input_dim, dim, 7, 1, 3, norm=norm, activation=activ, pad_type=pad_type)]
38
+ # downsampling blocks
39
+ for i in range(n_downsample):
40
+ self.model += [Conv2dBlock(dim, 2 * dim, 4, 2, 1, norm=norm, activation=activ, pad_type=pad_type)]
41
+ dim *= 2
42
+ # residual blocks
43
+ self.model += [ResBlocks(n_res, dim, norm=norm, activation=activ, pad_type=pad_type)]
44
+ # self.model = nn.(*self.model)
45
+ self.model = nn.ModuleList(self.model)
46
+ self.output_dim = dim
47
+
48
+ def forward(self, x):
49
+ out = []
50
+ for block in self.model:
51
+ x = block(x)
52
+ out.append(x)
53
+ # print(x.shape)
54
+ return out
55
+
56
+
57
+ class Decoder(nn.Module):
58
+ def __init__(self, n_upsample, n_res, dim, output_dim, res_norm='adain', activ='relu', pad_type='zero'):
59
+ super(Decoder, self).__init__()
60
+
61
+ self.model = []
62
+ # AdaIN residual blocks
63
+ self.model += [DecoderBlock('resblock', n_res, dim, res_norm, activ, pad_type=pad_type)]
64
+ # upsampling blocks
65
+ for i in range(n_upsample):
66
+ self.model += [DecoderBlock('upsample', dim, dim//2, 'bn', activ, pad_type)
67
+ ]
68
+ dim //= 2
69
+ # use reflection padding in the last conv layer
70
+ self.output_layer = Conv2dBlock(dim, output_dim, 7, 1, 3, norm='none', activation='tanh', pad_type=pad_type)
71
+ # self.model = nn.Sequential(*self.model)
72
+ self.model = nn.ModuleList(self.model)
73
+
74
+ def forward(self, x):
75
+ x1 = x.pop()
76
+ for block in self.model:
77
+ x2 = x.pop()
78
+ # print(x1.shape, x2.shape)
79
+ x1 = block(x1, x2)
80
+ x1 = self.output_layer(x1)
81
+ return x1
82
+
83
+
84
+ class Merge(nn.Module):
85
+ def __init__(self, dim, activation='relu'):
86
+ super().__init__()
87
+ self.conv = nn.Conv2d(2*dim, dim, 3, 1, 1)
88
+ # initialize activation
89
+ if activation == 'relu':
90
+ self.activation = nn.ReLU(inplace=True)
91
+ elif activation == 'lrelu':
92
+ self.activation = nn.LeakyReLU(0.2, inplace=True)
93
+ elif activation == 'prelu':
94
+ self.activation = nn.PReLU()
95
+ elif activation == 'selu':
96
+ self.activation = nn.SELU(inplace=True)
97
+ elif activation == 'tanh':
98
+ self.activation = nn.Tanh()
99
+ elif activation == 'none':
100
+ self.activation = None
101
+ else:
102
+ assert 0, "Unsupported activation: {}".format(activation)
103
+ def forward(self, x1, x2):
104
+ x = torch.cat([x1, x2], dim=1) # 2xdim
105
+ x = self.conv(x) # B,dim,H,W
106
+ x = self.activation(x)
107
+ return x
108
+
109
+ class DecoderBlock(nn.Module):
110
+ def __init__(self, block_type, in_dim, out_dim, norm, activ='relu', pad_type='reflect'):
111
+ super().__init__()
112
+ assert block_type in ['resblock', 'upsample']
113
+ if block_type == 'resblock':
114
+ self.core_layer = ResBlocks(in_dim, out_dim, norm, activ, pad_type=pad_type)
115
+ else:
116
+ assert out_dim == in_dim//2
117
+ self.core_layer = nn.Sequential(nn.Upsample(scale_factor=2),
118
+ Conv2dBlock(in_dim, out_dim, 5, 1, 2, norm=norm, activation=activ, pad_type=pad_type))
119
+ self.merge = Merge(out_dim, activ)
120
+
121
+ def forward(self, x1, x2):
122
+ x1 = self.core_layer(x1)
123
+ return self.merge(x1, x2)
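Note (not part of the commit): a minimal sketch of driving the secret-embedding Unet above with random data. The 256x256 resolution and 100-bit secret match the constructor defaults; the inputs are random placeholders.

    import torch
    from flae.unet import Unet

    model = Unet(resolution=256, secret_len=100)
    image = torch.rand(1, 3, 256, 256) * 2 - 1        # cover image, roughly in [-1, 1]
    secret = torch.randint(0, 2, (1, 100)).float()    # 100 random bits

    with torch.no_grad():
        stego = model(image, secret)                  # (1, 3, 256, 256), tanh output
    print(stego.shape, float(stego.min()), float(stego.max()))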
ldm/util.py ADDED
@@ -0,0 +1,197 @@
1
+ import importlib
2
+
3
+ import torch
4
+ from torch import optim
5
+ import numpy as np
6
+
7
+ from inspect import isfunction
8
+ from PIL import Image, ImageDraw, ImageFont
9
+
10
+
11
+ def log_txt_as_img(wh, xc, size=10):
12
+ # wh a tuple of (width, height)
13
+ # xc a list of captions to plot
14
+ b = len(xc)
15
+ txts = list()
16
+ for bi in range(b):
17
+ txt = Image.new("RGB", wh, color="white")
18
+ draw = ImageDraw.Draw(txt)
19
+ font = ImageFont.truetype('data/DejaVuSans.ttf', size=size)
20
+ nc = int(40 * (wh[0] / 256))
21
+ lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc))
22
+
23
+ try:
24
+ draw.text((0, 0), lines, fill="black", font=font)
25
+ except UnicodeEncodeError:
26
+ print("Can't encode string for logging. Skipping.")
27
+
28
+ txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
29
+ txts.append(txt)
30
+ txts = np.stack(txts)
31
+ txts = torch.tensor(txts)
32
+ return txts
33
+
34
+
35
+ def ismap(x):
36
+ if not isinstance(x, torch.Tensor):
37
+ return False
38
+ return (len(x.shape) == 4) and (x.shape[1] > 3)
39
+
40
+
41
+ def isimage(x):
42
+ if not isinstance(x,torch.Tensor):
43
+ return False
44
+ return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1)
45
+
46
+
47
+ def exists(x):
48
+ return x is not None
49
+
50
+
51
+ def default(val, d):
52
+ if exists(val):
53
+ return val
54
+ return d() if isfunction(d) else d
55
+
56
+
57
+ def mean_flat(tensor):
58
+ """
59
+ https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86
60
+ Take the mean over all non-batch dimensions.
61
+ """
62
+ return tensor.mean(dim=list(range(1, len(tensor.shape))))
63
+
64
+
65
+ def count_params(model, verbose=False):
66
+ total_params = sum(p.numel() for p in model.parameters())
67
+ if verbose:
68
+ print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
69
+ return total_params
70
+
71
+
72
+ def instantiate_from_config(config):
73
+ if not "target" in config:
74
+ if config == '__is_first_stage__':
75
+ return None
76
+ elif config == "__is_unconditional__":
77
+ return None
78
+ raise KeyError("Expected key `target` to instantiate.")
79
+ return get_obj_from_str(config["target"])(**config.get("params", dict()))
80
+
81
+
82
+ def get_obj_from_str(string, reload=False):
83
+ module, cls = string.rsplit(".", 1)
84
+ if reload:
85
+ module_imp = importlib.import_module(module)
86
+ importlib.reload(module_imp)
87
+ return getattr(importlib.import_module(module, package=None), cls)
88
+
89
+
90
+ class AdamWwithEMAandWings(optim.Optimizer):
91
+ # credit to https://gist.github.com/crowsonkb/65f7265353f403714fce3b2595e0b298
92
+ def __init__(self, params, lr=1.e-3, betas=(0.9, 0.999), eps=1.e-8, # TODO: check hyperparameters before using
93
+ weight_decay=1.e-2, amsgrad=False, ema_decay=0.9999, # ema decay to match previous code
94
+ ema_power=1., param_names=()):
95
+ """AdamW that saves EMA versions of the parameters."""
96
+ if not 0.0 <= lr:
97
+ raise ValueError("Invalid learning rate: {}".format(lr))
98
+ if not 0.0 <= eps:
99
+ raise ValueError("Invalid epsilon value: {}".format(eps))
100
+ if not 0.0 <= betas[0] < 1.0:
101
+ raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
102
+ if not 0.0 <= betas[1] < 1.0:
103
+ raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
104
+ if not 0.0 <= weight_decay:
105
+ raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
106
+ if not 0.0 <= ema_decay <= 1.0:
107
+ raise ValueError("Invalid ema_decay value: {}".format(ema_decay))
108
+ defaults = dict(lr=lr, betas=betas, eps=eps,
109
+ weight_decay=weight_decay, amsgrad=amsgrad, ema_decay=ema_decay,
110
+ ema_power=ema_power, param_names=param_names)
111
+ super().__init__(params, defaults)
112
+
113
+ def __setstate__(self, state):
114
+ super().__setstate__(state)
115
+ for group in self.param_groups:
116
+ group.setdefault('amsgrad', False)
117
+
118
+ @torch.no_grad()
119
+ def step(self, closure=None):
120
+ """Performs a single optimization step.
121
+ Args:
122
+ closure (callable, optional): A closure that reevaluates the model
123
+ and returns the loss.
124
+ """
125
+ loss = None
126
+ if closure is not None:
127
+ with torch.enable_grad():
128
+ loss = closure()
129
+
130
+ for group in self.param_groups:
131
+ params_with_grad = []
132
+ grads = []
133
+ exp_avgs = []
134
+ exp_avg_sqs = []
135
+ ema_params_with_grad = []
136
+ state_sums = []
137
+ max_exp_avg_sqs = []
138
+ state_steps = []
139
+ amsgrad = group['amsgrad']
140
+ beta1, beta2 = group['betas']
141
+ ema_decay = group['ema_decay']
142
+ ema_power = group['ema_power']
143
+
144
+ for p in group['params']:
145
+ if p.grad is None:
146
+ continue
147
+ params_with_grad.append(p)
148
+ if p.grad.is_sparse:
149
+ raise RuntimeError('AdamW does not support sparse gradients')
150
+ grads.append(p.grad)
151
+
152
+ state = self.state[p]
153
+
154
+ # State initialization
155
+ if len(state) == 0:
156
+ state['step'] = 0
157
+ # Exponential moving average of gradient values
158
+ state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
159
+ # Exponential moving average of squared gradient values
160
+ state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
161
+ if amsgrad:
162
+ # Maintains max of all exp. moving avg. of sq. grad. values
163
+ state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
164
+ # Exponential moving average of parameter values
165
+ state['param_exp_avg'] = p.detach().float().clone()
166
+
167
+ exp_avgs.append(state['exp_avg'])
168
+ exp_avg_sqs.append(state['exp_avg_sq'])
169
+ ema_params_with_grad.append(state['param_exp_avg'])
170
+
171
+ if amsgrad:
172
+ max_exp_avg_sqs.append(state['max_exp_avg_sq'])
173
+
174
+ # update the steps for each param group update
175
+ state['step'] += 1
176
+ # record the step after step update
177
+ state_steps.append(state['step'])
178
+
179
+ optim._functional.adamw(params_with_grad,
180
+ grads,
181
+ exp_avgs,
182
+ exp_avg_sqs,
183
+ max_exp_avg_sqs,
184
+ state_steps,
185
+ amsgrad=amsgrad,
186
+ beta1=beta1,
187
+ beta2=beta2,
188
+ lr=group['lr'],
189
+ weight_decay=group['weight_decay'],
190
+ eps=group['eps'],
191
+ maximize=False)
192
+
193
+ cur_ema_decay = min(ema_decay, 1 - state['step'] ** -ema_power)
194
+ for param, ema_param in zip(params_with_grad, ema_params_with_grad):
195
+ ema_param.mul_(cur_ema_decay).add_(param.float(), alpha=1 - cur_ema_decay)
196
+
197
+ return loss
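Note (not part of the commit): instantiate_from_config above is the construction pattern used throughout the repo; "target" is a dotted import path and "params" are the constructor kwargs, normally coming from the OmegaConf-loaded YAML config. A self-contained sketch with an arbitrary target:

    from ldm.util import instantiate_from_config

    cfg = {
        'target': 'torch.nn.Linear',                  # any importable class works
        'params': {'in_features': 100, 'out_features': 768},
    }
    layer = instantiate_from_config(cfg)
    print(layer)    # Linear(in_features=100, out_features=768, bias=True)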
pages/Extract_Secret.py ADDED
@@ -0,0 +1,110 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ streamlit app demo
5
+ how to run:
6
+ streamlit run app.py --server.port 8501
7
+
8
+ @author: Tu Bui @surrey.ac.uk
9
+ """
10
+ import os, sys, torch
11
+ import inspect
12
+ cdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
13
+ sys.path.insert(1, os.path.join(cdir, '../'))
14
+ import argparse
15
+ from pathlib import Path
16
+ import numpy as np
17
+ import pickle
18
+ import pytorch_lightning as pl
19
+ from torchvision import transforms
20
+ import argparse
21
+ from ldm.util import instantiate_from_config
22
+ from omegaconf import OmegaConf
23
+ from PIL import Image
24
+ from tools.augment_imagenetc import RandomImagenetC
25
+ from cldm.transformations2 import TransformNet
26
+ from io import BytesIO
27
+ from tools.helpers import welcome_message
28
+ from tools.ecc import BCH, RSC
29
+ import streamlit as st
30
+ from Embed_Secret import load_ecc, load_model, decode_secret, to_bytes, model_names, SECRET_LEN
31
+
32
+
33
+ # model_names = ['RoSteALS', 'UNet']
34
+ # SECRET_LEN = 100
35
+
36
+ def app():
37
+ st.title('Watermarking Demo')
38
+ # setup model
39
+ model_name = st.selectbox("Choose the model", model_names)
40
+ model, tform_emb, tform_det = load_model(model_name)
41
+ display_width = 300
42
+ ecc = load_ecc('BCH')
43
+ noise = TransformNet(p=1.0, crop_mode='resized_crop')
44
+ noise_names = noise.optional_names
45
+
46
+ # setup st
47
+ st.subheader("Input")
48
+ image_file = None
49
+ image_file = st.file_uploader("Upload stego image", type=["png","jpg","jpeg"])
50
+ if image_file is not None:
51
+ im = Image.open(image_file).convert('RGB')
52
+ ext = image_file.name.split('.')[-1]
53
+ st.image(im, width=display_width)
54
+
55
+
56
+ # add crop
57
+ st.subheader("Corruptions")
58
+ crop_button = st.button('Regenerate Crop', key='crop')
59
+ if image_file is not None:
60
+ im_crop = noise.apply_transform_on_pil_image(im, 'Random Crop')
61
+ if crop_button:
62
+ im_crop = noise.apply_transform_on_pil_image(im, 'Random Crop')
63
+ # st.image(im_crop, width=display_width)
64
+
65
+ # add noise source 1
66
+ corrupt_method1 = st.selectbox("Choose noise source #1", ['None'] + noise_names, key='noise1')
67
+ if image_file is not None:
68
+ if corrupt_method1=='None':
69
+ im_noise1 = im_crop
70
+ else:
71
+ im_noise1 = noise.apply_transform_on_pil_image(im_crop, corrupt_method1)
72
+ # st.image(im_noise1, width=display_width)
73
+
74
+ # add noise source 2
75
+ corrupt_method2 = st.selectbox("Choose noise source #2", ['None'] + noise_names, key='noise2')
76
+ if image_file is not None:
77
+ if corrupt_method2=='None':
78
+ im_noise2 = im_noise1
79
+ else:
80
+ im_noise2 = noise.apply_transform_on_pil_image(im_noise1, corrupt_method2)
81
+
82
+ st.subheader("Output")
83
+ if image_file is not None:
84
+ st.image(im_noise2, width=display_width)
85
+ mime='image/jpeg' if ext=='jpg' else f'image/{ext}'
86
+ im_noise2_bytes = to_bytes(np.uint8(im_noise2), mime)
87
+ st.download_button(label='Download image', data=im_noise2_bytes, file_name=f'corrupted.{ext}', mime=mime)
88
+
89
+ # prediction
90
+ st.subheader('Extract Secret From Output')
91
+ status = st.empty()
92
+ if image_file is not None:
93
+ secret_pred = decode_secret(model_name, model, im_noise2, tform_det)
94
+ secret_decoded = ecc.decode_text(secret_pred)[0]
95
+ status.markdown(f'Predicted secret: **{secret_decoded}**', unsafe_allow_html=True)
96
+
97
+ # bit acc
98
+ st.subheader('Accuracy')
99
+ secret_text = st.text_input('Input groundtruth secret')
100
+ bit_acc_status = st.empty()
101
+ if image_file is not None and secret_text:
102
+ secret = ecc.encode_text([secret_text]) # (1, 100)
103
+ bit_acc = (secret_pred == secret).mean()
104
+ # bit_acc_status.markdown('**Bit Accuracy**: {:.2f}%'.format(bit_acc*100), unsafe_allow_html=True)
105
+ word_acc = int(secret_decoded == secret_text)
106
+ bit_acc_status.markdown(f'Bit Accuracy: **{bit_acc*100:.2f}%**<br />Word Accuracy: **{word_acc}**', unsafe_allow_html=True)
107
+
108
+ if __name__ == '__main__':
109
+ app()
110
+
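Note (not part of the commit): the accuracy block above can be reproduced offline. This sketch assumes it is run from the repo root (so Embed_Secret and tools are importable) and that ecc.encode_text returns a (1, 100) array of 0/1 bits, as the comments above indicate; the flipped-bit count is arbitrary.

    import numpy as np
    from Embed_Secret import load_ecc

    ecc = load_ecc('BCH')
    secret = np.array(ecc.encode_text(['hello world']))   # (1, 100) groundtruth bits
    secret_pred = secret.copy()
    secret_pred[0, :5] = 1 - secret_pred[0, :5]           # simulate 5 corrupted bits
    bit_acc = (secret_pred == secret).mean()
    decoded = ecc.decode_text(secret_pred)[0]             # error correction may still recover the text
    print(f'bit acc: {bit_acc*100:.2f}%, decoded: {decoded}')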
tools/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .helpers import *
2
+ from .hparams import HParams
3
+ from .slack_bot import Notifier
tools/augment_imagenetc.py ADDED
@@ -0,0 +1,155 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ wrapper for imagenet-c transformations
5
+ @author: Tu Bui @surrey.ac.uk
6
+ """
7
+ from __future__ import absolute_import
8
+ from __future__ import division
9
+ from __future__ import print_function
10
+ import os
11
+ import sys
12
+ import random
13
+ import numpy as np
14
+ from PIL import Image
15
+ from imagenet_c import corrupt, corruption_dict
16
+
17
+
18
+ class IdentityAugment(object):
19
+ def __call__(self, x):
20
+ return x
21
+
22
+ def __repr__(self):
23
+ s = f'()'
24
+ return self.__class__.__name__ + s
25
+
26
+ class RandomImagenetC(object):
27
+ # transform id 5 (motion blur) and 7 (snow) requires WandImage which is not fork-safe, while id 4 (glass blur) and 6 (zoom blur) are super slow thus we move it to validation (unseen), 12 (elastic transform) is non realistic
28
+ methods = {'train': np.array([0,1,2,3,8,9,10,11,13,14,15, 16, 17, 18]),#np.arange(15),
29
+ 'val': np.array([4, 5, 6, 7, 12]),
30
+ 'test': np.array([0,1,2,3,8,9,10,11,13,14,15, 16, 17, 18])
31
+ }
32
+ method_names = list(corruption_dict.keys())
33
+ def __init__(self, min_severity=1, max_severity=5, phase='all', p=1.0,n=19):
34
+ assert phase in ['train', 'val', 'test', 'all'], ValueError(f'{phase} not recognised. Must be one of [train, val, test, all]')
35
+ if phase == 'all':
36
+ self.corrupt_ids = np.concatenate(list(self.methods.values()))
37
+ else:
38
+ self.corrupt_ids = self.methods[phase]
39
+ self.corrupt_ids = self.corrupt_ids[:n] # first n tforms
40
+ self.phase = phase
41
+ self.severity = np.arange(min_severity, max_severity+1)
42
+ self.p = p # probability to apply a transformation
43
+
44
+ def __call__(self, x, corrupt_id=None, corrupt_strength=None):
45
+ # input: x PIL image
46
+ if corrupt_id is None:
47
+ if len(self.corrupt_ids)==0: # do nothing
48
+ return x
49
+ corrupt_id = np.random.choice(self.corrupt_ids)
50
+ else:
51
+ assert corrupt_id in range(19)
52
+
53
+ severity = np.random.choice(self.severity) if corrupt_strength is None else corrupt_strength
54
+ assert severity in self.severity, f"Error! Corrupt strength {severity} isn't supported."
55
+
56
+ if np.random.rand() < self.p:
57
+ org_size = x.size
58
+ x = np.asarray(x.convert('RGB').resize((224, 224), Image.BILINEAR))[:,:,::-1]
59
+ x = corrupt(x, severity, corruption_number=corrupt_id)
60
+ x = Image.fromarray(x[:,:,::-1])
61
+ if x.size != org_size:
62
+ x = x.resize(org_size, Image.BILINEAR)
63
+ return x
64
+
65
+ def transform_with_fixed_severity(self, x, severity, corrupt_id=None):
66
+ if corrupt_id is None:
67
+ corrupt_id = np.random.choice(self.corrupt_ids)
68
+ else:
69
+ assert corrupt_id in self.corrupt_ids
70
+ assert severity > 0 and severity < 6
71
+ org_size = x.size
72
+ x = np.asarray(x.convert('RGB').resize((224, 224), Image.BILINEAR))[:,:,::-1]
73
+ x = corrupt(x, severity, corruption_number=corrupt_id)
74
+ x = Image.fromarray(x[:,:,::-1])
75
+ if x.size != org_size:
76
+ x = x.resize(org_size, Image.BILINEAR)
77
+ return x
78
+
79
+ def __repr__(self):
80
+ s = f'(severity={self.severity}, phase={self.phase}, p={self.p},ids={self.corrupt_ids})'
81
+ return self.__class__.__name__ + s
82
+
83
+
84
+ class NoiseResidual(object):
85
+ def __init__(self, k=16):
86
+ self.k = k
87
+ def __call__(self, x):
88
+ h, w = x.height, x.width
89
+ x1 = x.resize((w//self.k,h//self.k), Image.BILINEAR).resize((w, h), Image.BILINEAR)
90
+ x1 = np.abs(np.array(x).astype(np.float32) - np.array(x1).astype(np.float32))
91
+ x1 = (x1 - x1.min())/(x1.max() - x1.min() + np.finfo(np.float32).eps)
92
+ x1 = Image.fromarray((x1*255).astype(np.uint8))
93
+ return x1
94
+ def __repr__(self):
95
+ s = f'(k={self.k}'
96
+ return self.__class__.__name__ + s
97
+
98
+
99
+ def get_transforms(img_mean=[0.5, 0.5, 0.5], img_std=[0.5, 0.5, 0.5], rsize=256, csize=224, pertubation=True, dct=False, residual=False, max_c=19):
100
+ from torchvision import transforms
101
+ prep = transforms.Compose([
102
+ transforms.Resize(rsize),
103
+ transforms.RandomHorizontalFlip(),
104
+ transforms.RandomCrop(csize)])
105
+ if pertubation:
106
+ pertubation_train = RandomImagenetC(max_severity=5, phase='train', p=0.95,n=max_c)
107
+ pertubation_val = RandomImagenetC(max_severity=5, phase='train', p=1.0,n=max_c)
108
+ pertubation_test = RandomImagenetC(max_severity=5, phase='val', p=1.0,n=max_c)
109
+ else:
110
+ pertubation_train = pertubation_val = pertubation_test = IdentityAugment()
111
+ if dct:
112
+ from .image_tools import DCT
113
+ norm = [
114
+ DCT(),
115
+ transforms.ToTensor(),
116
+ transforms.Normalize(mean=img_mean, std=img_std)]
117
+ else:
118
+ norm = [
119
+ transforms.ToTensor(),
120
+ transforms.Normalize(mean=img_mean, std=img_std)]
121
+ if residual:
122
+ norm.insert(0, NoiseResidual())
123
+
124
+ preprocess = {
125
+ 'train': [prep, pertubation_train, transforms.Compose(norm)],
126
+
127
+ 'val': [prep, pertubation_val, transforms.Compose(norm)],
128
+
129
+ 'test_unseen': [prep, pertubation_test, transforms.Compose(norm)],
130
+
131
+ 'clean': transforms.Compose([transforms.Resize(csize)] + norm)
132
+ }
133
+ return preprocess
134
+
135
+
136
+ # ## example
137
+ # from PIL import Image
138
+ # import numpy as np
139
+ # import time
140
+ # from imagenet_c import corrupt, corruption_dict
141
+ # im = Image.open('/vol/research/tubui1/projects/gan_prov/gan_models/stargan2/test.jpg').convert('RGB').resize((224,224), Image.BILINEAR)
142
+ # im.save('original.jpg')
143
+ # im = np.array(im)[:,:,::-1] # BRG
144
+ # t = np.zeros(19)
145
+ # for i, key in enumerate(corruption_dict.keys()):
146
+ # begin = time.time()
147
+ # for j in range(10):
148
+ # out = corrupt(im, 5, corruption_number=i)
149
+ # end = time.time()
150
+ # t[i] = end-begin
151
+ # # Image.fromarray(out[:,:,::-1]).save(f'imc_{key}.jpg')
152
+ # print(f'{i} - {key}: {end-begin}')
153
+
154
+ # for i,k in enumerate(corruption_dict.keys()):
155
+ # print(i, k, t[i])
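Note (not part of the commit): a short sketch of applying the ImageNet-C wrapper above to a single image. It assumes the imagenet_c package is installed; input.jpg is a placeholder path.

    from PIL import Image
    from tools.augment_imagenetc import RandomImagenetC

    aug = RandomImagenetC(min_severity=1, max_severity=5, phase='train', p=1.0)
    im = Image.open('input.jpg').convert('RGB')
    out = aug(im)                                          # random train-phase corruption
    out_fixed = aug.transform_with_fixed_severity(im, severity=5)
    out.save('corrupted.jpg')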
tools/base_lmdb.py ADDED
@@ -0,0 +1,588 @@
1
+ from typing import Any, Optional, Union
2
+ from pathlib import Path
3
+ import os
4
+ import io
5
+ import lmdb
6
+ import pickle
7
+ import gzip
8
+ import bz2
9
+ import lzma
10
+ import shutil
11
+ from tqdm import tqdm
12
+ import pandas as pd
13
+ import numpy as np
14
+ from numpy import ndarray
15
+ import time
16
+ import torch
17
+ from torch import Tensor
18
+ from distutils.dir_util import copy_tree
19
+ from PIL import Image
20
+ from PIL import ImageFile
21
+
22
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
23
+
24
+
25
+ def _default_encode(data: Any, protocol: int) -> bytes:
26
+ return pickle.dumps(data, protocol=protocol)
27
+
28
+
29
+ def _ascii_encode(data: str) -> bytes:
30
+ return data.encode("ascii")
31
+
32
+
33
+ def _default_decode(data: bytes) -> Any:
34
+ return pickle.loads(data)
35
+
36
+
37
+ def _default_decompress(data: bytes) -> bytes:
38
+ return data
39
+
40
+
41
+ def _decompress(compression: Optional[str]):
42
+ if compression is None:
43
+ _decompress = _default_decompress
44
+ elif compression == "gzip":
45
+ _decompress = gzip.decompress
46
+ elif compression == "bz2":
47
+ _decompress = bz2.decompress
48
+ elif compression == "lzma":
49
+ _decompress = lzma.decompress
50
+ else:
51
+ raise ValueError(f"Unknown compression algorithm: {compression}")
52
+
53
+ return _decompress
54
+
55
+
56
+ class BaseLMDB(object):
57
+ _database = None
58
+ _protocol = None
59
+ _length = None
60
+
61
+ def __init__(
62
+ self,
63
+ path: Union[str, Path],
64
+ readahead: bool = False,
65
+ pre_open: bool = False,
66
+ compression: Optional[str] = None
67
+ ):
68
+ """
69
+ Base class for LMDB-backed databases.
70
+
71
+ :param path: Path to the database.
72
+ :param readahead: Enables the filesystem readahead mechanism.
73
+ :param pre_open: If set to True, the first iterations will be faster, but it will raise an error when doing multi-GPU training. If set to False, the database will open when you retrieve the first item.
74
+ """
75
+ if not isinstance(path, str):
76
+ path = str(path)
77
+
78
+ self.path = path
79
+ self.readahead = readahead
80
+ self.pre_open = pre_open
81
+ self._decompress = _decompress(compression)
82
+ self._has_fetched_an_item = False
83
+
84
+ @property
85
+ def database(self):
86
+ if self._database is None:
87
+ self._database = lmdb.open(
88
+ path=self.path,
89
+ readonly=True,
90
+ readahead=self.readahead,
91
+ max_spare_txns=256,
92
+ lock=False,
93
+ )
94
+ return self._database
95
+
96
+ @database.deleter
97
+ def database(self):
98
+ if self._database is not None:
99
+ self._database.close()
100
+ self._database = None
101
+
102
+ @property
103
+ def protocol(self):
104
+ """
105
+ Read the pickle protocol contained in the database.
106
+
107
+ :return: The pickle protocol used to serialise the stored values.
108
+ """
109
+ if self._protocol is None:
110
+ self._protocol = self._get(
111
+ item="protocol",
112
+ encode_key=_ascii_encode,
113
+ decompress_value=_default_decompress,
114
+ decode_value=_default_decode,
115
+ )
116
+ return self._protocol
117
+
118
+ @property
119
+ def keys(self):
120
+ """
121
+ Read the keys contained in the database.
122
+
123
+ :return: The set of available keys.
124
+ """
125
+ protocol = self.protocol
126
+ keys = self._get(
127
+ item="keys",
128
+ encode_key=lambda key: _default_encode(key, protocol=protocol),
129
+ decompress_value=_default_decompress,
130
+ decode_value=_default_decode,
131
+ )
132
+ return keys
133
+
134
+ def __len__(self):
135
+ """
136
+ Returns the number of keys available in the database.
137
+
138
+ :return: The number of keys.
139
+ """
140
+ if self._length is None:
141
+ self._length = len(self.keys)
142
+ return self._length
143
+
144
+ def __getitem__(self, item):
145
+ """
146
+ Retrieves an item or a list of items from the database.
147
+
148
+ :param item: A key or a list of keys.
149
+ :return: A value or a list of values.
150
+ """
151
+ self._has_fetched_an_item = True
152
+ if not isinstance(item, list):
153
+ item = self._get(
154
+ item=item,
155
+ encode_key=self._encode_key,
156
+ decompress_value=self._decompress_value,
157
+ decode_value=self._decode_value,
158
+ )
159
+ else:
160
+ item = self._gets(
161
+ items=item,
162
+ encode_keys=self._encode_keys,
163
+ decompress_values=self._decompress_values,
164
+ decode_values=self._decode_values,
165
+ )
166
+ return item
167
+
168
+ def _get(self, item, encode_key, decompress_value, decode_value):
169
+ """
170
+ Instantiates a transaction and its associated cursor to fetch an item.
171
+
172
+ :param item: A key.
173
+ :param encode_key:
174
+ :param decode_value:
175
+ :return:
176
+ """
177
+ with self.database.begin() as txn:
178
+ with txn.cursor() as cursor:
179
+ item = self._fetch(
180
+ cursor=cursor,
181
+ key=item,
182
+ encode_key=encode_key,
183
+ decompress_value=decompress_value,
184
+ decode_value=decode_value,
185
+ )
186
+ self._keep_database()
187
+ return item
188
+
189
+ def _gets(self, items, encode_keys, decompress_values, decode_values):
190
+ """
191
+ Instantiates a transaction and its associated cursor to fetch a list of items.
192
+
193
+ :param items: A list of keys.
194
+ :param encode_keys:
195
+ :param decode_values:
196
+ :return:
197
+ """
198
+ with self.database.begin() as txn:
199
+ with txn.cursor() as cursor:
200
+ items = self._fetchs(
201
+ cursor=cursor,
202
+ keys=items,
203
+ encode_keys=encode_keys,
204
+ decompress_values=decompress_values,
205
+ decode_values=decode_values,
206
+ )
207
+ self._keep_database()
208
+ return items
209
+
210
+ def _fetch(self, cursor, key, encode_key, decompress_value, decode_value):
211
+ """
212
+ Retrieve a value given a key.
213
+
214
+ :param cursor:
215
+ :param key: A key.
216
+ :param encode_key:
217
+ :param decode_value:
218
+ :return: A value.
219
+ """
220
+ key = encode_key(key)
221
+ value = cursor.get(key)
222
+ value = decompress_value(value)
223
+ value = decode_value(value)
224
+ return value
225
+
226
+ def _fetchs(self, cursor, keys, encode_keys, decompress_values, decode_values):
227
+ """
228
+ Retrieve a list of values given a list of keys.
229
+
230
+ :param cursor:
231
+ :param keys: A list of keys.
232
+ :param encode_keys:
233
+ :param decode_values:
234
+ :return: A list of values.
235
+ """
236
+ keys = encode_keys(keys)
237
+ _, values = list(zip(*cursor.getmulti(keys)))
238
+ values = decompress_values(values)
239
+ values = decode_values(values)
240
+ return values
241
+
242
+ def _encode_key(self, key: Any) -> bytes:
243
+ """
244
+ Converts a key into a byte key.
245
+
246
+ :param key: A key.
247
+ :return: A byte key.
248
+ """
249
+ return pickle.dumps(key, protocol=self.protocol)
250
+
251
+ def _encode_keys(self, keys: list) -> list:
252
+ """
253
+ Converts keys into byte keys.
254
+
255
+ :param keys: A list of keys.
256
+ :return: A list of byte keys.
257
+ """
258
+ return [self._encode_key(key=key) for key in keys]
259
+
260
+ def _decompress_value(self, value: bytes) -> bytes:
261
+ return self._decompress(value)
262
+
263
+ def _decompress_values(self, values: list) -> list:
264
+ return [self._decompress_value(value=value) for value in values]
265
+
266
+ def _decode_value(self, value: bytes) -> Any:
267
+ """
268
+ Converts a byte value back into a value.
269
+
270
+ :param value: A byte value.
271
+ :return: A value
272
+ """
273
+ return pickle.loads(value)
274
+
275
+ def _decode_values(self, values: list) -> list:
276
+ """
277
+ Converts bytes values back into values.
278
+
279
+ :param values: A list of byte values.
280
+ :return: A list of values.
281
+ """
282
+ return [self._decode_value(value=value) for value in values]
283
+
284
+ def _keep_database(self):
285
+ """
286
+ Checks if the database must be deleted.
287
+
288
+ :return:
289
+ """
290
+ if not self.pre_open and not self._has_fetched_an_item:
291
+ del self.database
292
+
293
+ def __iter__(self):
294
+ """
295
+ Provides an iterator over the keys when iterating over the database.
296
+
297
+ :return: An iterator on the keys.
298
+ """
299
+ return iter(self.keys)
300
+
301
+ def __del__(self):
302
+ """
303
+ Closes the database properly.
304
+ """
305
+ del self.database
306
+
307
+ @staticmethod
308
+ def write(data_lst, indir, outdir):
309
+ raise NotImplementedError
310
+
311
+
312
+ class PILlmdb(BaseLMDB):
313
+ def __init__(
314
+ self,
315
+ lmdb_dir: Union[str, Path],
316
+ image_list: Union[str, Path, pd.DataFrame]=None,
317
+ index_key='id',
318
+ **kwargs
319
+ ):
320
+ super().__init__(path=lmdb_dir, **kwargs)
321
+ if image_list is None:
322
+ self.ids = list(range(len(self.keys)))
323
+ self.labels = list(range(len(self.ids)))
324
+ else:
325
+ df = pd.read_csv(str(image_list))
326
+ assert index_key in df, f'[PILlmdb] Error! {image_list} must have id keys.'
327
+ self.ids = df[index_key].tolist()
328
+ assert max(self.ids) < len(self.keys)
329
+ if 'label' in df:
330
+ self.labels = df['label'].tolist()
331
+ else: # all numeric keys other than 'id' are labels
332
+ keys = [key for key in df if (key!=index_key and type(df[key][0]) in [int, np.int64])]
333
+ # df = df.drop('id', axis=1)
334
+ self.labels = df[keys].to_numpy()
335
+ self._length = len(self.ids)
336
+
337
+ def __len__(self):
338
+ return self._length
339
+
340
+ def __iter__(self):
341
+ return iter([self.keys[i] for i in self.ids])
342
+
343
+ def __getitem__(self, index):
344
+ key = self.keys[self.ids[index]]
345
+ return super().__getitem__(key)
346
+
347
+ def set_ids(self, ids):
348
+ self.ids = [self.ids[i] for i in ids]
349
+ self.labels = [self.labels[i] for i in ids]
350
+ self._length = len(self.ids)
351
+
352
+ def _decode_value(self, value: bytes):
353
+ """
354
+ Converts a byte image back into a PIL Image.
355
+
356
+ :param value: A byte image.
357
+ :return: A PIL Image image.
358
+ """
359
+ return Image.open(io.BytesIO(value))
360
+
361
+ @staticmethod
362
+ def write(indir, outdir, data_lst=None, transform=None):
363
+ """
364
+ create lmdb given data directory and list of image paths; or an iterator
365
+ :param data_lst None or csv file containing 'path' key to store relative paths to the images
366
+ :param indir root directory of the images
367
+ :param outdir output lmdb, data.mdb and lock.mdb will be written here
368
+ """
369
+
370
+ outdir = Path(outdir)
371
+ outdir.mkdir(parents=True, exist_ok=True)
372
+ tmp_dir = Path("/tmp") / f"TEMP_{time.time()}"
373
+ tmp_dir.mkdir(parents=True, exist_ok=True)
374
+ dtype = {'str': False, 'pil': False}
375
+ if isinstance(indir, str) or isinstance(indir, Path):
376
+ indir = Path(indir)
377
+ if data_lst is None: # grab all images in this dir
378
+ lst = list(indir.glob('**/*.jpg')) + list(indir.glob('**/*.png'))
379
+ else:
380
+ lst = pd.read_csv(data_lst)['path'].tolist()
381
+ lst = [indir/p for p in lst]
382
+ assert len(lst) > 0, f"Couldn't find any image in {indir} (only .jpg and .png are supported) or in the list (it must have a path field)."
383
+ n = len(lst)
384
+ dtype['str'] = True
385
+ else: # iterator
386
+ n = len(indir)
387
+ lst = iter(indir)
388
+ dtype['pil'] = True
389
+
390
+ with lmdb.open(path=str(tmp_dir), map_size=2 ** 40) as env:
391
+ # Add the protocol to the database.
392
+ with env.begin(write=True) as txn:
393
+ key = "protocol".encode("ascii")
394
+ value = pickle.dumps(pickle.DEFAULT_PROTOCOL)
395
+ txn.put(key=key, value=value, dupdata=False)
396
+ # Add the keys to the database.
397
+ with env.begin(write=True) as txn:
398
+ key = pickle.dumps("keys")
399
+ value = pickle.dumps(list(range(n)))
400
+ txn.put(key=key, value=value, dupdata=False)
401
+ # Add the images to the database.
402
+ for key, value in tqdm(enumerate(lst), total=n, miniters=n//100, mininterval=300):
403
+ with env.begin(write=True) as txn:
404
+ key = pickle.dumps(key)
405
+ if dtype['str']:
406
+ with value.open("rb") as file:
407
+ byteimg = file.read()
408
+ else: # PIL
409
+ data = io.BytesIO()
410
+ value.save(data, 'png')
411
+ byteimg = data.getvalue()
412
+
413
+ if transform is not None:
414
+ im = Image.open(io.BytesIO(byteimg))
415
+ im = transform(im)
416
+ data = io.BytesIO()
417
+ im.save(data, 'png')
418
+ byteimg = data.getvalue()
419
+ txn.put(key=key, value=byteimg, dupdata=False)
420
+
421
+ # Move the database to its destination.
422
+ copy_tree(str(tmp_dir), str(outdir))
423
+ shutil.rmtree(str(tmp_dir))
424
+
425
+
426
+
427
+ class MaskDatabase(PILlmdb):
428
+ def _decode_value(self, value: bytes):
429
+ """
430
+ Converts a byte image back into a PIL Image.
431
+
432
+ :param value: A byte image.
433
+ :return: A PIL Image image.
434
+ """
435
+ return Image.open(io.BytesIO(value)).convert("1")
436
+
437
+
438
+ class LabelDatabase(BaseLMDB):
439
+ pass
440
+
441
+
442
+ class ArrayDatabase(BaseLMDB):
443
+ _dtype = None
444
+ _shape = None
445
+
446
+ def __init__(
447
+ self,
448
+ lmdb_dir: Union[str, Path],
449
+ image_list: Union[str, Path, pd.DataFrame]=None,
450
+ **kwargs
451
+ ):
452
+ super().__init__(path=lmdb_dir, **kwargs)
453
+ if image_list is None:
454
+ self.ids = list(range(len(self.keys)))
455
+ self.labels = list(range(len(self.ids)))
456
+ else:
457
+ df = pd.read_csv(str(image_list))
458
+ assert 'id' in df, f'[ArrayDatabase] Error! {image_list} must have id keys.'
459
+ self.ids = df['id'].tolist()
460
+ assert max(self.ids) < len(self.keys)
461
+ if 'label' in df:
462
+ self.labels = df['label'].tolist()
463
+ else: # all numeric keys other than 'id' are labels
464
+ keys = [key for key in df if (key!='id' and type(df[key][0]) in [int, np.int64])]
465
+ # df = df.drop('id', axis=1)
466
+ self.labels = df[keys].to_numpy()
467
+ self._length = len(self.ids)
468
+
469
+ def set_ids(self, ids):
470
+ self.ids = [self.ids[i] for i in ids]
471
+ self.labels = [self.labels[i] for i in ids]
472
+ self._length = len(self.ids)
473
+
474
+ def __len__(self):
475
+ return self._length
476
+
477
+ def __iter__(self):
478
+ return iter([self.keys[i] for i in self.ids])
479
+
480
+ def __getitem__(self, index):
481
+ key = self.keys[self.ids[index]]
482
+ return super().__getitem__(key)
483
+
484
+ @property
485
+ def dtype(self):
486
+ if self._dtype is None:
487
+ protocol = self.protocol
488
+ self._dtype = self._get(
489
+ item="dtype",
490
+ encode_key=lambda key: _default_encode(key, protocol=protocol),
491
+ decompress_value=_default_decompress,
492
+ decode_value=_default_decode,
493
+ )
494
+ return self._dtype
495
+
496
+ @property
497
+ def shape(self):
498
+ if self._shape is None:
499
+ protocol = self.protocol
500
+ self._shape = self._get(
501
+ item="shape",
502
+ encode_key=lambda key: _default_encode(key, protocol=protocol),
503
+ decompress_value=_default_decompress,
504
+ decode_value=_default_decode,
505
+ )
506
+ return self._shape
507
+
508
+ def _decode_value(self, value: bytes) -> ndarray:
509
+ value = super()._decode_value(value)
510
+ return np.frombuffer(value, dtype=self.dtype).reshape(self.shape)
511
+
512
+ def _decode_values(self, values: list) -> ndarray:
513
+ shape = (len(values),) + self.shape
514
+ return np.frombuffer(b"".join(values), dtype=self.dtype).reshape(shape)
515
+
516
+ @staticmethod
517
+ def write(diter, outdir):
518
+ """
519
+ diter is an iterable that has a __len__ method, e.g.
520
+ class Myiter():
521
+ def __init__(self, data):
522
+ self.data = data
523
+ def __iter__(self):
524
+ self.counter = 0
525
+ return self
526
+ def __len__(self):
527
+ return len(self.data)
528
+ def __next__(self):
529
+ if self.counter < len(self):
530
+ out = self.data[self.counter]
531
+ self.counter+=1
532
+ return out
533
+ else:
534
+ raise StopIteration
535
+ a = iter(Myiter([1,2,3]))
536
+ for i in a:
537
+ print(i)
538
+ """
539
+ outdir = Path(outdir)
540
+ outdir.mkdir(parents=True, exist_ok=True)
541
+ tmp_dir = Path("/tmp") / f"TEMP_{time.time()}"
542
+ tmp_dir.mkdir(parents=True, exist_ok=True)
543
+ # Create the database.
544
+ n = len(diter)
545
+ with lmdb.open(path=str(tmp_dir), map_size=2 ** 40) as env:
546
+ # Add the protocol to the database.
547
+ with env.begin(write=True) as txn:
548
+ key = "protocol".encode("ascii")
549
+ value = pickle.dumps(pickle.DEFAULT_PROTOCOL)
550
+ txn.put(key=key, value=value, dupdata=False)
551
+ # Add the keys to the database.
552
+ with env.begin(write=True) as txn:
553
+ key = pickle.dumps("keys")
554
+ value = pickle.dumps(list(range(n)))
555
+ txn.put(key=key, value=value, dupdata=False)
556
+ # Extract the shape and dtype of the values.
557
+ value = next(iter(diter))
558
+ shape = value.shape
559
+ dtype = value.dtype
560
+ # Add the shape to the database.
561
+ with env.begin(write=True) as txn:
562
+ key = pickle.dumps("shape")
563
+ value = pickle.dumps(shape)
564
+ txn.put(key=key, value=value, dupdata=False)
565
+ # Add the dtype to the database.
566
+ with env.begin(write=True) as txn:
567
+ key = pickle.dumps("dtype")
568
+ value = pickle.dumps(dtype)
569
+ txn.put(key=key, value=value, dupdata=False)
570
+ # Add the values to the database.
571
+ with env.begin(write=True) as txn:
572
+ for key, value in tqdm(enumerate(iter(diter)), total=n, miniters=n//100, mininterval=300):
573
+ key = pickle.dumps(key)
574
+ value = pickle.dumps(value)
575
+ txn.put(key=key, value=value, dupdata=False)
576
+
577
+ # Move the database to its destination.
578
+ copy_tree(str(tmp_dir), str(outdir))
579
+ shutil.rmtree(str(tmp_dir))
580
+
581
+
582
+
583
+ class TensorDatabase(ArrayDatabase):
584
+ def _decode_value(self, value: bytes) -> Tensor:
585
+ return torch.from_numpy(super(TensorDatabase, self)._decode_value(value))
586
+
587
+ def _decode_values(self, values: list) -> Tensor:
588
+ return torch.from_numpy(super(TensorDatabase, self)._decode_values(values))
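A short usage sketch for ArrayDatabase above, assuming this module is importable as tools.image_dataset (the import path and output directory are placeholders):

import numpy as np
from tools.image_dataset import ArrayDatabase  # assumed import path

feats = np.random.rand(8, 128).astype(np.float32)   # any iterable with __len__ works
ArrayDatabase.write(feats, '/tmp/feats_lmdb')        # stores dtype, shape and 8 keyed rows
db = ArrayDatabase('/tmp/feats_lmdb')                # ids/labels default to 0..7
row = db[0]                                          # decoded back using the stored dtype/shape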
tools/ecc.py ADDED
@@ -0,0 +1,281 @@
1
+ import bchlib
2
+ import numpy as np
3
+ from typing import List, Tuple
4
+ import random
5
+ from copy import deepcopy
6
+
7
+ class RSC(object):
8
+ def __init__(self, data_bytes=16, ecc_bytes=4, verbose=False, **kwargs):
9
+ from reedsolo import RSCodec
10
+ self.rs = RSCodec(ecc_bytes)
11
+ if verbose:
12
+ print(f'Reed-Solomon ECC len: {ecc_bytes*8} bits')
13
+ self.data_len = data_bytes
14
+ self.dlen = data_bytes * 8 # data length in bits
15
+ self.ecc_len = ecc_bytes * 8 # ecc length in bits
16
+
17
+ def get_total_len(self):
18
+ return self.dlen + self.ecc_len
19
+
20
+ def encode_text(self, text: List[str]):
21
+ return np.array([self._encode_text(t) for t in text])
22
+
23
+ def _encode_text(self, text: str):
24
+ text = text + ' ' * (self.dlen // 8 - len(text))
25
+ out = self.rs.encode(text.encode('utf-8')) # bytearray
26
+ out = ''.join(format(x, '08b') for x in out) # bit string
27
+ out = np.array([int(x) for x in out], dtype=np.float32)
28
+ return out
29
+
30
+ def decode_text(self, data: np.array):
31
+ assert len(data.shape)==2
32
+ return [self._decode_text(d) for d in data]
33
+
34
+ def _decode_text(self, data: np.array):
35
+ assert len(data.shape)==1
36
+ data = ''.join([str(int(bit)) for bit in data])
37
+ data = bytes(int(data[i: i + 8], 2) for i in range(0, len(data), 8))
38
+ data = bytearray(data)
39
+ try:
40
+ data = self.rs.decode(data)[0]
41
+ data = data.decode('utf-8').strip()
42
+ except Exception:
43
+ print('Error: Decode failed')
44
+ data = get_random_unicode(self.get_total_len()//8)
45
+
46
+ return data
47
+
48
+ def get_random_unicode(length):
49
+ # Update this to include code point ranges to be sampled
50
+ include_ranges = [
51
+ ( 0x0021, 0x0021 ),
52
+ ( 0x0023, 0x0026 ),
53
+ ( 0x0028, 0x007E ),
54
+ ( 0x00A1, 0x00AC ),
55
+ ( 0x00AE, 0x00FF ),
56
+ ( 0x0100, 0x017F ),
57
+ ( 0x0180, 0x024F ),
58
+ ( 0x2C60, 0x2C7F ),
59
+ ( 0x16A0, 0x16F0 ),
60
+ ( 0x0370, 0x0377 ),
61
+ ( 0x037A, 0x037E ),
62
+ ( 0x0384, 0x038A ),
63
+ ( 0x038C, 0x038C ),
64
+ ]
65
+ alphabet = [
66
+ chr(code_point) for current_range in include_ranges
67
+ for code_point in range(current_range[0], current_range[1] + 1)
68
+ ]
69
+ return ''.join(random.choice(alphabet) for i in range(length))
70
+
71
+
72
+ class BCH(object):
73
+ def __init__(self, BCH_POLYNOMIAL = 137, BCH_BITS = 5, payload_len=100, verbose=True,**kwargs):
74
+ self.bch = bchlib.BCH(BCH_POLYNOMIAL, BCH_BITS)
75
+ self.payload_len = payload_len # in bits
76
+ self.data_len = (self.payload_len - self.bch.ecc_bytes*8)//7 # in ascii characters
77
+ assert self.data_len*7+self.bch.ecc_bytes*8 <= self.bch.n, f'Error! BCH with poly {BCH_POLYNOMIAL} and bits {BCH_BITS} can only encode max {self.bch.n//8} bytes of total payload'
78
+ if verbose:
79
+ print(f'BCH: POLYNOMIAL={BCH_POLYNOMIAL}, protected bits={BCH_BITS}, payload_len={payload_len} bits, data_len={self.data_len*7} bits ({self.data_len} ascii chars), ecc len={self.bch.ecc_bytes*8} bits')
80
+
81
+ def get_total_len(self):
82
+ return self.payload_len
83
+
84
+ def encode_text(self, text: List[str]):
85
+ return np.array([self._encode_text(t) for t in text])
86
+
87
+ def _encode_text(self, text: str):
88
+ text = text + ' ' * (self.data_len - len(text))
89
+ # data = text.encode('utf-8') # bytearray
90
+ data = encode_text_ascii(text) # bytearray
91
+ ecc = self.bch.encode(data) # bytearray
92
+ packet = data + ecc # payload in bytearray
93
+ packet = ''.join(format(x, '08b') for x in packet)
94
+ packet = [int(x) for x in packet]
95
+ packet.extend([0]*(self.payload_len - len(packet)))
96
+ packet = np.array(packet, dtype=np.float32)
97
+ return packet
98
+
99
+ def decode_text(self, data: np.array):
100
+ assert len(data.shape)==2
101
+ return [self._decode_text(d) for d in data]
102
+
103
+ def _decode_text(self, packet: np.array):
104
+ assert len(packet.shape)==1
105
+ packet = ''.join([str(int(bit)) for bit in packet]) # bit string
106
+ packet = packet[:(len(packet)//8*8)] # trim to multiple of 8 bits
107
+ packet = bytes(int(packet[i: i + 8], 2) for i in range(0, len(packet), 8))
108
+ packet = bytearray(packet)
109
+ # assert len(packet) == self.data_len + self.bch.ecc_bytes
110
+ data, ecc = packet[:-self.bch.ecc_bytes], packet[-self.bch.ecc_bytes:]
111
+ data0 = decode_text_ascii(deepcopy(data)).strip()
112
+ bitflips = self.bch.decode_inplace(data, ecc)
113
+ if bitflips == -1: # decode failed, fall back to the uncorrected text
114
+ data = data0
115
+ else:
116
+ # data = data.decode('utf-8').strip()
117
+ data = decode_text_ascii(data).strip()
118
+ return data
119
+
120
+
121
+ def encode_text_ascii(text: str):
122
+ # encode text to 7-bit ascii
123
+ # input: text, str
124
+ # output: encoded text, bytearray
125
+ text_int7 = [ord(t) & 127 for t in text]
126
+ text_bitstr = ''.join(format(t,'07b') for t in text_int7)
127
+ if len(text_bitstr) % 8 != 0:
128
+ text_bitstr = '0'*(8-len(text_bitstr)%8) + text_bitstr # pad to multiple of 8
129
+ text_int8 = [int(text_bitstr[i:i+8], 2) for i in range(0, len(text_bitstr), 8)]
130
+ return bytearray(text_int8)
131
+
132
+
133
+ def decode_text_ascii(text: bytearray):
134
+ # decode text from 7-bit ascii
135
+ # input: text, bytearray
136
+ # output: decoded text, str
137
+ text_bitstr = ''.join(format(t,'08b') for t in text) # bit string
138
+ pad = len(text_bitstr) % 7
139
+ if pad != 0: # has padding, remove
140
+ text_bitstr = text_bitstr[pad:]
141
+ text_int7 = [int(text_bitstr[i:i+7], 2) for i in range(0, len(text_bitstr), 7)]
142
+ text_bytes = bytes(text_int7)
143
+ return text_bytes.decode('utf-8')
144
+
145
+
146
+ class ECC(object):
147
+ def __init__(self, BCH_POLYNOMIAL = 137, BCH_BITS = 5, **kwargs):
148
+ self.bch = bchlib.BCH(BCH_POLYNOMIAL, BCH_BITS)
149
+
150
+ def get_total_len(self):
151
+ return 100
152
+
153
+ def _encode(self, x):
154
+ # x: 56 bits, {0, 1}, np.array
155
+ # return: 100 bits, {0, 1}, np.array
156
+ dlen = len(x)
157
+ data_str = ''.join(str(x) for x in x.astype(int))
158
+ packet = bytes(int(data_str[i: i + 8], 2) for i in range(0, dlen, 8))
159
+ packet = bytearray(packet)
160
+ ecc = self.bch.encode(packet)
161
+ packet = packet + ecc # 96 bits
162
+ packet = ''.join(format(x, '08b') for x in packet)
163
+ packet = [int(x) for x in packet]
164
+ packet.extend([0, 0, 0, 0])
165
+ packet = np.array(packet, dtype=np.float32) # 100
166
+ return packet
167
+
168
+ def _decode(self, x):
169
+ # x: 100 bits, {0, 1}, np.array
170
+ # return: 56 bits, {0, 1}, np.array
171
+ packet_binary = "".join([str(int(bit)) for bit in x])
172
+ packet = bytes(int(packet_binary[i: i + 8], 2) for i in range(0, len(packet_binary), 8))
173
+ packet = bytearray(packet)
174
+
175
+ data, ecc = packet[:-self.bch.ecc_bytes], packet[-self.bch.ecc_bytes:]
176
+ bitflips = self.bch.decode_inplace(data, ecc)
177
+ if bitflips == -1: # error, return random data
178
+ data = np.random.binomial(1, .5, 56)
179
+ else:
180
+ data = ''.join(format(x, '08b') for x in data)
181
+ data = np.array([int(x) for x in data], dtype=np.float32)
182
+ return data # 56 bits
183
+
184
+ def _generate(self):
185
+ dlen = 56
186
+ data= np.random.binomial(1, .5, dlen)
187
+ packet = self._encode(data)
188
+ return packet, data
189
+
190
+ def generate(self, nsamples=1):
191
+ # generate random 56 bit secret
192
+ data = [self._generate() for _ in range(nsamples)]
193
+ data = (np.array([d[0] for d in data]), np.array([d[1] for d in data]))
194
+ return data # data with ecc, data org
195
+
196
+ def _to_text(self, data):
197
+ # data: {0, 1}, np.array
198
+ # return: str
199
+ data = ''.join([str(int(bit)) for bit in data])
200
+ all_bytes = [ data[i: i+8] for i in range(0, len(data), 8) ]
201
+ text = ''.join([chr(int(byte, 2)) for byte in all_bytes])
202
+ return text.strip()
203
+
204
+ def _to_binary(self, s):
205
+ if isinstance(s, str):
206
+ out = ''.join([ format(ord(i), "08b") for i in s ])
207
+ elif isinstance(s, bytes):
208
+ out = ''.join([ format(i, "08b") for i in s ])
209
+ elif isinstance(s, np.ndarray) and s.dtype is np.dtype(bool):
210
+ out = ''.join([chr(int(i)) for i in s])
211
+ elif isinstance(s, int) or isinstance(s, np.uint8):
212
+ out = format(s, "08b")
213
+ elif isinstance(s, np.ndarray):
214
+ out = [ format(i, "08b") for i in s ]
215
+ else:
216
+ raise TypeError("Type not supported.")
217
+
218
+ return np.array([float(i) for i in out], dtype=np.float32)
219
+
220
+ def _encode_text(self, s):
221
+ s = s + ' '*(7-len(s)) # 7 chars
222
+ s = self._to_binary(s) # 56 bits
223
+ packet = self._encode(s) # 100 bits
224
+ return packet, s
225
+
226
+ def encode_text(self, secret_list, return_pre_ecc=False):
227
+ """encode secret with BCH ECC.
228
+ Input: secret (list of strings)
229
+ Output: secret (np array) with shape (B, 100) type float32, val {0,1}"""
230
+ assert np.all(np.array([len(s) for s in secret_list]) <= 7), 'Error! all strings must be less than 7 characters'
231
+ secret_list = [self._encode_text(s) for s in secret_list]
232
+ ecc = np.array([s[0] for s in secret_list], dtype=np.float32)
233
+ if return_pre_ecc:
234
+ return ecc, np.array([s[1] for s in secret_list], dtype=np.float32)
235
+ return ecc
236
+
237
+ def decode_text(self, data):
238
+ """Decode secret with BCH ECC and convert to string.
239
+ Input: secret (torch.tensor) with shape (B, 100) type bool
240
+ Output: list of B decoded strings"""
241
+ data = self.decode(data)
242
+ data = [self._to_text(d) for d in data]
243
+ return data
244
+
245
+ def decode(self, data):
246
+ """Decode secret with BCH ECC and convert to string.
247
+ Input: secret (torch.tensor) with shape (B, 100) type bool
248
+ Output: secret (B, 56)"""
249
+ data = data[:, :96]
250
+ data = [self._decode(d) for d in data]
251
+ return np.array(data)
252
+
253
+ def test_ecc():
254
+ ecc = ECC()
255
+ batch_size = 10
256
+ secret_ecc, secret_org = ecc.generate(batch_size) # 10x100 ecc secret, 10x56 org secret
257
+ # modify secret_ecc
258
+ secret_pred = secret_ecc.copy()
259
+ secret_pred[:,3:6] = 1 - secret_pred[:,3:6]
260
+ # pass secret_ecc to model and get predicted as secret_pred
261
+ secret_pred_org = ecc.decode(secret_pred) # 10x56
262
+ assert np.all(secret_pred_org == secret_org) # 10
263
+
264
+
265
+ def test_bch():
266
+ # test 100 bit
267
+ def check(text, poly, k, l):
268
+ bch = BCH(poly, k, l)
269
+ # text = 'secrets'
270
+ encode = bch.encode_text([text])
271
+ for ind in np.random.choice(l, k):
272
+ encode[0, ind] = 1 - encode[0, ind]
273
+ text_recon = bch.decode_text(encode)[0]
274
+ assert text==text_recon
275
+
276
+ check('secrets', 137, 5, 100)
277
+ check('some secret', 285, 10, 160)
278
+
279
+ if __name__ == '__main__':
280
+ test_ecc()
281
+ test_bch()
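For reference, a hedged round-trip sketch for the BCH codec above, mirroring test_bch() with the default parameters (requires bchlib):

bch = BCH(payload_len=100, verbose=True)   # BCH_POLYNOMIAL=137, BCH_BITS=5 by default
bits = bch.encode_text(['secrets'])        # float32 array of shape (1, 100), values in {0, 1}
bits[0, :3] = 1 - bits[0, :3]              # flip a few bits to simulate channel noise
print(bch.decode_text(bits))               # -> ['secrets'] while within the correction capacity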
tools/eval_metrics.py ADDED
@@ -0,0 +1,130 @@
1
+ import torch
2
+ import numpy as np
3
+ import skimage.metrics
4
+ import lpips
5
+ from PIL import Image
6
+ from .sifid import SIFID
7
+
8
+
9
+ def resize_array(x, size=256):
10
+ """
11
+ Resize image array to given size.
12
+ Args:
13
+ x (np.ndarray): Image array of shape (N, H, W, C) in range [0, 255].
14
+ size (int): Size of output image.
15
+ Returns:
16
+ (np.ndarray): Image array of shape (N, H, W, C) in range [0, 255].
17
+ """
18
+ if x.shape[1] != size:
19
+ x = [Image.fromarray(x[i]).resize((size, size), resample=Image.BILINEAR) for i in range(x.shape[0])]
20
+ x = np.array([np.array(i) for i in x])
21
+ return x
22
+
23
+
24
+ def resize_tensor(x, size=256):
25
+ """
26
+ Resize image tensor to given size.
27
+ Args:
28
+ x (torch.Tensor): Image tensor of shape (N, C, H, W) in range [-1, 1].
29
+ size (int): Size of output image.
30
+ Returns:
31
+ (torch.Tensor): Image tensor of shape (N, C, H, W) in range [-1, 1].
32
+ """
33
+ if x.shape[2] != size:
34
+ x = torch.nn.functional.interpolate(x, size=(size, size), mode='bilinear', align_corners=False)
35
+ return x
36
+
37
+
38
+ def normalise(x):
39
+ """
40
+ Normalise an image array to the range [-1, 1] and convert it to a tensor.
41
+ Args:
42
+ x (np.ndarray): Image array of shape (N, H, W, C) in range [0, 255].
43
+ Returns:
44
+ (torch.Tensor): Image tensor of shape (N, C, H, W) in range [-1, 1].
45
+ """
46
+ x = x.astype(np.float32)
47
+ x = x / 255
48
+ x = (x - 0.5) / 0.5
49
+ x = torch.from_numpy(x)
50
+ x = x.permute(0, 3, 1, 2)
51
+ return x
52
+
53
+
54
+ def unormalise(x, vrange=[-1, 1]):
55
+ """
56
+ Unnormalise an image tensor to the range [0, 255] and convert it to an RGB uint8 array.
57
+ Args:
58
+ x (torch.Tensor): Image tensor of shape (N, C, H, W) in range [-1, 1].
59
+ Returns:
60
+ (np.ndarray): Image array of shape (N, H, W, C) in range [0, 255].
61
+ """
62
+ x = (x - vrange[0])/(vrange[1] - vrange[0])
63
+ x = x * 255
64
+ x = x.permute(0, 2, 3, 1)
65
+ x = x.cpu().numpy().astype(np.uint8)
66
+ return x
67
+
68
+
69
+ def compute_mse(x, y):
70
+ """
71
+ Compute mean squared error between two image arrays.
72
+ Args:
73
+ x (np.ndarray): Image of shape (N, H, W, C) in range [0, 255].
74
+ y (np.ndarray): Image of shape (N, H, W, C) in range [0, 255].
75
+ Returns:
76
+ (1darray): Mean squared error.
77
+ """
78
+ return np.square(x.astype(np.float64) - y.astype(np.float64)).reshape(x.shape[0], -1).mean(axis=1)  # cast first so uint8 inputs do not wrap around
79
+
80
+
81
+ def compute_psnr(x, y):
82
+ """
83
+ Compute peak signal-to-noise ratio between two images.
84
+ Args:
85
+ x (np.ndarray): Image of shape (N, H, W, C) in range [0, 255].
86
+ y (np.ndarray): Image of shape (N, H, W, C) in range [0, 255].
87
+ Returns:
88
+ (1darray): Peak signal-to-noise ratio per image.
89
+ """
90
+ return 10 * np.log10(255 ** 2 / compute_mse(x, y))
91
+
92
+
93
+ def compute_ssim(x, y):
94
+ """
95
+ Compute structural similarity index between two images.
96
+ Args:
97
+ x (np.ndarray): Image of shape (N, H, W, C) in range [0, 255].
98
+ y (np.ndarray): Image of shape (N, H, W, C) in range [0, 255].
99
+ Returns:
100
+ (1darray): Structural similarity index per image.
101
+ """
102
+ return np.array([skimage.metrics.structural_similarity(xi, yi, channel_axis=2, gaussian_weights=True, sigma=1.5, use_sample_covariance=False, data_range=255) for xi, yi in zip(x, y)])
103
+
104
+
105
+ def compute_lpips(x, y, net='alex'):
106
+ """
107
+ Compute LPIPS between two images.
108
+ Args:
109
+ x (torch.Tensor): Image tensor of shape (N, C, H, W) in range [-1, 1].
110
+ y (torch.Tensor): Image tensor of shape (N, C, H, W) in range [-1, 1].
111
+ Returns:
112
+ (float): LPIPS.
113
+ """
114
+ lpips_fn = lpips.LPIPS(net=net, verbose=False).cuda() if isinstance(net, str) else net
115
+ x, y = x.cuda(), y.cuda()
116
+ return lpips_fn(x, y).detach().cpu().numpy().squeeze()
117
+
118
+
119
+ def compute_sifid(x, y, net=None):
120
+ """
121
+ Compute SIFID between two images.
122
+ Args:
123
+ x (torch.Tensor): Image tensor of shape (N, C, H, W) in range [-1, 1].
124
+ y (torch.Tensor): Image tensor of shape (N, C, H, W) in range [-1, 1].
125
+ Returns:
126
+ (float): SIFID.
127
+ """
128
+ fn = SIFID() if net is None else net
129
+ out = [fn(xi, yi) for xi, yi in zip(x, y)]
130
+ return np.array(out)
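A small illustrative sketch for the pixel metrics above (random data only; LPIPS and SIFID are omitted here because they need a GPU and extra weights):

import numpy as np

x = np.random.randint(0, 256, (2, 256, 256, 3), dtype=np.uint8)   # two 'cover' images
y = np.clip(x.astype(np.int16) + np.random.randint(-2, 3, x.shape), 0, 255).astype(np.uint8)
print(compute_mse(x, y))    # per-image MSE, shape (2,)
print(compute_psnr(x, y))   # per-image PSNR in dB
print(compute_ssim(x, y))   # per-image SSIM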
tools/fid.py ADDED
@@ -0,0 +1,672 @@
1
+ """Calculates the Frechet Inception Distance (FID) to evalulate GANs
2
+
3
+ The FID metric calculates the distance between two distributions of images.
4
+ Typically, we have summary statistics (mean & covariance matrix) of one
5
+ of these distributions, while the 2nd distribution is given by a GAN.
6
+
7
+ When run as a stand-alone program, it compares the distribution of
8
+ images that are stored as PNG/JPEG at a specified location with a
9
+ distribution given by summary statistics (in pickle format).
10
+
11
+ The FID is calculated by assuming that X_1 and X_2 are the activations of
12
+ the pool_3 layer of the inception net for generated samples and real world
13
+ samples respectively.
14
+
15
+ See --help to see further details.
16
+
17
+ Code adapted from https://github.com/bioinf-jku/TTUR to use PyTorch instead
18
+ of Tensorflow
19
+
20
+ Copyright 2018 Institute of Bioinformatics, JKU Linz
21
+
22
+ Licensed under the Apache License, Version 2.0 (the "License");
23
+ you may not use this file except in compliance with the License.
24
+ You may obtain a copy of the License at
25
+
26
+ http://www.apache.org/licenses/LICENSE-2.0
27
+
28
+ Unless required by applicable law or agreed to in writing, software
29
+ distributed under the License is distributed on an "AS IS" BASIS,
30
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
31
+ See the License for the specific language governing permissions and
32
+ limitations under the License.
33
+ """
34
+ import os
35
+ import pathlib
36
+ from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
37
+
38
+ import numpy as np
39
+ import torch
40
+ import torchvision.transforms as TF
41
+ from PIL import Image
42
+ from scipy import linalg
43
+ from torch.nn.functional import adaptive_avg_pool2d
44
+ import torch.nn as nn
45
+ import torch.nn.functional as F
46
+ import torchvision
47
+
48
+ try:
49
+ from tqdm import tqdm
50
+ except ImportError:
51
+ # If tqdm is not available, provide a mock version of it
52
+ def tqdm(x):
53
+ return x
54
+
55
+
56
+ IMAGE_EXTENSIONS = {'bmp', 'jpg', 'jpeg', 'pgm', 'png', 'ppm',
57
+ 'tif', 'tiff', 'webp'}
58
+
59
+
60
+ try:
61
+ from torchvision.models.utils import load_state_dict_from_url
62
+ except ImportError:
63
+ from torch.utils.model_zoo import load_url as load_state_dict_from_url
64
+
65
+ # Inception weights ported to Pytorch from
66
+ # http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
67
+ FID_WEIGHTS_URL = 'https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth' # noqa: E501
68
+
69
+
70
+ class InceptionV3(nn.Module):
71
+ """Pretrained InceptionV3 network returning feature maps"""
72
+
73
+ # Index of default block of inception to return,
74
+ # corresponds to output of final average pooling
75
+ DEFAULT_BLOCK_INDEX = 3
76
+
77
+ # Maps feature dimensionality to their output blocks indices
78
+ BLOCK_INDEX_BY_DIM = {
79
+ 64: 0, # First max pooling features
80
+ 192: 1, # Second max pooling features
81
+ 768: 2, # Pre-aux classifier features
82
+ 2048: 3 # Final average pooling features
83
+ }
84
+
85
+ def __init__(self,
86
+ output_blocks=(DEFAULT_BLOCK_INDEX,),
87
+ resize_input=True,
88
+ normalize_input=True,
89
+ requires_grad=False,
90
+ use_fid_inception=True):
91
+ """Build pretrained InceptionV3
92
+
93
+ Parameters
94
+ ----------
95
+ output_blocks : list of int
96
+ Indices of blocks to return features of. Possible values are:
97
+ - 0: corresponds to output of first max pooling
98
+ - 1: corresponds to output of second max pooling
99
+ - 2: corresponds to output which is fed to aux classifier
100
+ - 3: corresponds to output of final average pooling
101
+ resize_input : bool
102
+ If true, bilinearly resizes input to width and height 299 before
103
+ feeding input to model. As the network without fully connected
104
+ layers is fully convolutional, it should be able to handle inputs
105
+ of arbitrary size, so resizing might not be strictly needed
106
+ normalize_input : bool
107
+ If true, scales the input from range (0, 1) to the range the
108
+ pretrained Inception network expects, namely (-1, 1)
109
+ requires_grad : bool
110
+ If true, parameters of the model require gradients. Possibly useful
111
+ for finetuning the network
112
+ use_fid_inception : bool
113
+ If true, uses the pretrained Inception model used in Tensorflow's
114
+ FID implementation. If false, uses the pretrained Inception model
115
+ available in torchvision. The FID Inception model has different
116
+ weights and a slightly different structure from torchvision's
117
+ Inception model. If you want to compute FID scores, you are
118
+ strongly advised to set this parameter to true to get comparable
119
+ results.
120
+ """
121
+ super(InceptionV3, self).__init__()
122
+
123
+ self.resize_input = resize_input
124
+ self.normalize_input = normalize_input
125
+ self.output_blocks = sorted(output_blocks)
126
+ self.last_needed_block = max(output_blocks)
127
+
128
+ assert self.last_needed_block <= 3, \
129
+ 'Last possible output block index is 3'
130
+
131
+ self.blocks = nn.ModuleList()
132
+
133
+ if use_fid_inception:
134
+ inception = fid_inception_v3()
135
+ else:
136
+ inception = _inception_v3(weights='DEFAULT')
137
+
138
+ # Block 0: input to maxpool1
139
+ block0 = [
140
+ inception.Conv2d_1a_3x3,
141
+ inception.Conv2d_2a_3x3,
142
+ inception.Conv2d_2b_3x3,
143
+ nn.MaxPool2d(kernel_size=3, stride=2)
144
+ ]
145
+ self.blocks.append(nn.Sequential(*block0))
146
+
147
+ # Block 1: maxpool1 to maxpool2
148
+ if self.last_needed_block >= 1:
149
+ block1 = [
150
+ inception.Conv2d_3b_1x1,
151
+ inception.Conv2d_4a_3x3,
152
+ nn.MaxPool2d(kernel_size=3, stride=2)
153
+ ]
154
+ self.blocks.append(nn.Sequential(*block1))
155
+
156
+ # Block 2: maxpool2 to aux classifier
157
+ if self.last_needed_block >= 2:
158
+ block2 = [
159
+ inception.Mixed_5b,
160
+ inception.Mixed_5c,
161
+ inception.Mixed_5d,
162
+ inception.Mixed_6a,
163
+ inception.Mixed_6b,
164
+ inception.Mixed_6c,
165
+ inception.Mixed_6d,
166
+ inception.Mixed_6e,
167
+ ]
168
+ self.blocks.append(nn.Sequential(*block2))
169
+
170
+ # Block 3: aux classifier to final avgpool
171
+ if self.last_needed_block >= 3:
172
+ block3 = [
173
+ inception.Mixed_7a,
174
+ inception.Mixed_7b,
175
+ inception.Mixed_7c,
176
+ nn.AdaptiveAvgPool2d(output_size=(1, 1))
177
+ ]
178
+ self.blocks.append(nn.Sequential(*block3))
179
+
180
+ for param in self.parameters():
181
+ param.requires_grad = requires_grad
182
+
183
+ def forward(self, inp):
184
+ """Get Inception feature maps
185
+
186
+ Parameters
187
+ ----------
188
+ inp : torch.autograd.Variable
189
+ Input tensor of shape Bx3xHxW. Values are expected to be in
190
+ range (0, 1)
191
+
192
+ Returns
193
+ -------
194
+ List of torch.autograd.Variable, corresponding to the selected output
195
+ block, sorted ascending by index
196
+ """
197
+ outp = []
198
+ x = inp
199
+
200
+ if self.resize_input:
201
+ x = F.interpolate(x,
202
+ size=(299, 299),
203
+ mode='bilinear',
204
+ align_corners=False)
205
+
206
+ if self.normalize_input:
207
+ x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1)
208
+
209
+ for idx, block in enumerate(self.blocks):
210
+ x = block(x)
211
+ if idx in self.output_blocks:
212
+ outp.append(x)
213
+
214
+ if idx == self.last_needed_block:
215
+ break
216
+
217
+ return outp
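A hedged usage sketch of the wrapper above (the first call downloads the FID Inception weights; input values are expected in (0, 1)):

block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[2048]     # pool3 features, the standard FID choice
model = InceptionV3([block_idx]).eval()
with torch.no_grad():
    feats = model(torch.rand(2, 3, 299, 299))[0]
print(feats.shape)                                    # -> torch.Size([2, 2048, 1, 1])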
218
+
219
+
220
+ def _inception_v3(*args, **kwargs):
221
+ """Wraps `torchvision.models.inception_v3`"""
222
+ try:
223
+ version = tuple(map(int, torchvision.__version__.split('.')[:2]))
224
+ except ValueError:
225
+ # Just a caution against weird version strings
226
+ version = (0,)
227
+
228
+ # Skips default weight inititialization if supported by torchvision
229
+ # version. See https://github.com/mseitzer/pytorch-fid/issues/28.
230
+ if version >= (0, 6):
231
+ kwargs['init_weights'] = False
232
+
233
+ # Backwards compatibility: `weights` argument was handled by `pretrained`
234
+ # argument prior to version 0.13.
235
+ if version < (0, 13) and 'weights' in kwargs:
236
+ if kwargs['weights'] == 'DEFAULT':
237
+ kwargs['pretrained'] = True
238
+ elif kwargs['weights'] is None:
239
+ kwargs['pretrained'] = False
240
+ else:
241
+ raise ValueError(
242
+ 'weights=={} not supported in torchvision {}'.format(
243
+ kwargs['weights'], torchvision.__version__
244
+ )
245
+ )
246
+ del kwargs['weights']
247
+
248
+ return torchvision.models.inception_v3(*args, **kwargs)
249
+
250
+
251
+ def fid_inception_v3():
252
+ """Build pretrained Inception model for FID computation
253
+
254
+ The Inception model for FID computation uses a different set of weights
255
+ and has a slightly different structure than torchvision's Inception.
256
+
257
+ This method first constructs torchvision's Inception and then patches the
258
+ necessary parts that are different in the FID Inception model.
259
+ """
260
+ inception = _inception_v3(num_classes=1008,
261
+ aux_logits=False,
262
+ weights=None)
263
+ inception.Mixed_5b = FIDInceptionA(192, pool_features=32)
264
+ inception.Mixed_5c = FIDInceptionA(256, pool_features=64)
265
+ inception.Mixed_5d = FIDInceptionA(288, pool_features=64)
266
+ inception.Mixed_6b = FIDInceptionC(768, channels_7x7=128)
267
+ inception.Mixed_6c = FIDInceptionC(768, channels_7x7=160)
268
+ inception.Mixed_6d = FIDInceptionC(768, channels_7x7=160)
269
+ inception.Mixed_6e = FIDInceptionC(768, channels_7x7=192)
270
+ inception.Mixed_7b = FIDInceptionE_1(1280)
271
+ inception.Mixed_7c = FIDInceptionE_2(2048)
272
+
273
+ state_dict = load_state_dict_from_url(FID_WEIGHTS_URL, progress=True)
274
+ inception.load_state_dict(state_dict)
275
+ return inception
276
+
277
+
278
+ class FIDInceptionA(torchvision.models.inception.InceptionA):
279
+ """InceptionA block patched for FID computation"""
280
+ def __init__(self, in_channels, pool_features):
281
+ super(FIDInceptionA, self).__init__(in_channels, pool_features)
282
+
283
+ def forward(self, x):
284
+ branch1x1 = self.branch1x1(x)
285
+
286
+ branch5x5 = self.branch5x5_1(x)
287
+ branch5x5 = self.branch5x5_2(branch5x5)
288
+
289
+ branch3x3dbl = self.branch3x3dbl_1(x)
290
+ branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
291
+ branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
292
+
293
+ # Patch: Tensorflow's average pool does not use the padded zeros in
294
+ # its average calculation
295
+ branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
296
+ count_include_pad=False)
297
+ branch_pool = self.branch_pool(branch_pool)
298
+
299
+ outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool]
300
+ return torch.cat(outputs, 1)
301
+
302
+
303
+ class FIDInceptionC(torchvision.models.inception.InceptionC):
304
+ """InceptionC block patched for FID computation"""
305
+ def __init__(self, in_channels, channels_7x7):
306
+ super(FIDInceptionC, self).__init__(in_channels, channels_7x7)
307
+
308
+ def forward(self, x):
309
+ branch1x1 = self.branch1x1(x)
310
+
311
+ branch7x7 = self.branch7x7_1(x)
312
+ branch7x7 = self.branch7x7_2(branch7x7)
313
+ branch7x7 = self.branch7x7_3(branch7x7)
314
+
315
+ branch7x7dbl = self.branch7x7dbl_1(x)
316
+ branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl)
317
+ branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl)
318
+ branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl)
319
+ branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl)
320
+
321
+ # Patch: Tensorflow's average pool does not use the padded zeros in
322
+ # its average calculation
323
+ branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
324
+ count_include_pad=False)
325
+ branch_pool = self.branch_pool(branch_pool)
326
+
327
+ outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool]
328
+ return torch.cat(outputs, 1)
329
+
330
+
331
+ class FIDInceptionE_1(torchvision.models.inception.InceptionE):
332
+ """First InceptionE block patched for FID computation"""
333
+ def __init__(self, in_channels):
334
+ super(FIDInceptionE_1, self).__init__(in_channels)
335
+
336
+ def forward(self, x):
337
+ branch1x1 = self.branch1x1(x)
338
+
339
+ branch3x3 = self.branch3x3_1(x)
340
+ branch3x3 = [
341
+ self.branch3x3_2a(branch3x3),
342
+ self.branch3x3_2b(branch3x3),
343
+ ]
344
+ branch3x3 = torch.cat(branch3x3, 1)
345
+
346
+ branch3x3dbl = self.branch3x3dbl_1(x)
347
+ branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
348
+ branch3x3dbl = [
349
+ self.branch3x3dbl_3a(branch3x3dbl),
350
+ self.branch3x3dbl_3b(branch3x3dbl),
351
+ ]
352
+ branch3x3dbl = torch.cat(branch3x3dbl, 1)
353
+
354
+ # Patch: Tensorflow's average pool does not use the padded zero's in
355
+ # its average calculation
356
+ branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
357
+ count_include_pad=False)
358
+ branch_pool = self.branch_pool(branch_pool)
359
+
360
+ outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
361
+ return torch.cat(outputs, 1)
362
+
363
+
364
+ class FIDInceptionE_2(torchvision.models.inception.InceptionE):
365
+ """Second InceptionE block patched for FID computation"""
366
+ def __init__(self, in_channels):
367
+ super(FIDInceptionE_2, self).__init__(in_channels)
368
+
369
+ def forward(self, x):
370
+ branch1x1 = self.branch1x1(x)
371
+
372
+ branch3x3 = self.branch3x3_1(x)
373
+ branch3x3 = [
374
+ self.branch3x3_2a(branch3x3),
375
+ self.branch3x3_2b(branch3x3),
376
+ ]
377
+ branch3x3 = torch.cat(branch3x3, 1)
378
+
379
+ branch3x3dbl = self.branch3x3dbl_1(x)
380
+ branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
381
+ branch3x3dbl = [
382
+ self.branch3x3dbl_3a(branch3x3dbl),
383
+ self.branch3x3dbl_3b(branch3x3dbl),
384
+ ]
385
+ branch3x3dbl = torch.cat(branch3x3dbl, 1)
386
+
387
+ # Patch: The FID Inception model uses max pooling instead of average
388
+ # pooling. This is likely an error in this specific Inception
389
+ # implementation, as other Inception models use average pooling here
390
+ # (which matches the description in the paper).
391
+ branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1)
392
+ branch_pool = self.branch_pool(branch_pool)
393
+
394
+ outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
395
+ return torch.cat(outputs, 1)
396
+
397
+ class ImagePathDataset(torch.utils.data.Dataset):
398
+ def __init__(self, files, transforms=None):
399
+ self.files = files
400
+ self.transforms = transforms
401
+
402
+ def __len__(self):
403
+ return len(self.files)
404
+
405
+ def __getitem__(self, i):
406
+ path = self.files[i]
407
+ img = Image.open(path).convert('RGB')
408
+ if self.transforms is not None:
409
+ img = self.transforms(img)
410
+ return img
411
+
412
+
413
+ def get_activations(files, model, batch_size=50, dims=2048, device='cpu',
414
+ num_workers=1, resize=0):
415
+ """Calculates the activations of the pool_3 layer for all images.
416
+
417
+ Params:
418
+ -- files : List of image files paths
419
+ -- model : Instance of inception model
420
+ -- batch_size : Batch size of images for the model to process at once.
421
+ Make sure that the number of samples is a multiple of
422
+ the batch size, otherwise some samples are ignored. This
423
+ behavior is retained to match the original FID score
424
+ implementation.
425
+ -- dims : Dimensionality of features returned by Inception
426
+ -- device : Device to run calculations
427
+ -- num_workers : Number of parallel dataloader workers
428
+
429
+ Returns:
430
+ -- A numpy array of dimension (num images, dims) that contains the
431
+ activations of the given tensor when feeding inception with the
432
+ query tensor.
433
+ """
434
+ model.eval()
435
+
436
+ if batch_size > len(files):
437
+ print(('Warning: batch size is bigger than the data size. '
438
+ 'Setting batch size to data size'))
439
+ batch_size = len(files)
440
+ if resize > 0:
441
+ tform = TF.Compose([TF.Resize((resize, resize)), TF.ToTensor()])
442
+ else:
443
+ tform = TF.ToTensor()
444
+ dataset = ImagePathDataset(files, transforms=tform)
445
+ dataloader = torch.utils.data.DataLoader(dataset,
446
+ batch_size=batch_size,
447
+ shuffle=False,
448
+ drop_last=False,
449
+ num_workers=num_workers)
450
+
451
+ pred_arr = np.empty((len(files), dims))
452
+
453
+ start_idx = 0
454
+
455
+ for batch in tqdm(dataloader):
456
+ batch = batch.to(device)
457
+
458
+ with torch.no_grad():
459
+ pred = model(batch)[0]
460
+
461
+ # If model output is not scalar, apply global spatial average pooling.
462
+ # This happens if you choose a dimensionality not equal 2048.
463
+ if pred.size(2) != 1 or pred.size(3) != 1:
464
+ pred = adaptive_avg_pool2d(pred, output_size=(1, 1))
465
+
466
+ pred = pred.squeeze(3).squeeze(2).cpu().numpy()
467
+
468
+ pred_arr[start_idx:start_idx + pred.shape[0]] = pred
469
+
470
+ start_idx = start_idx + pred.shape[0]
471
+
472
+ return pred_arr
473
+
474
+
475
+ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
476
+ """Numpy implementation of the Frechet Distance.
477
+ The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
478
+ and X_2 ~ N(mu_2, C_2) is
479
+ d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
480
+
481
+ Stable version by Dougal J. Sutherland.
482
+
483
+ Params:
484
+ -- mu1 : Numpy array containing the activations of a layer of the
485
+ inception net (like returned by the function 'get_predictions')
486
+ for generated samples.
487
+ -- mu2 : The sample mean over activations, precalculated on a
488
+ representative data set.
489
+ -- sigma1: The covariance matrix over activations for generated samples.
490
+ -- sigma2: The covariance matrix over activations, precalculated on a
491
+ representative data set.
492
+
493
+ Returns:
494
+ -- : The Frechet Distance.
495
+ """
496
+
497
+ mu1 = np.atleast_1d(mu1)
498
+ mu2 = np.atleast_1d(mu2)
499
+
500
+ sigma1 = np.atleast_2d(sigma1)
501
+ sigma2 = np.atleast_2d(sigma2)
502
+
503
+ assert mu1.shape == mu2.shape, \
504
+ 'Training and test mean vectors have different lengths'
505
+ assert sigma1.shape == sigma2.shape, \
506
+ 'Training and test covariances have different dimensions'
507
+
508
+ diff = mu1 - mu2
509
+
510
+ # Product might be almost singular
511
+ covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
512
+ if not np.isfinite(covmean).all():
513
+ msg = ('fid calculation produces singular product; '
514
+ 'adding %s to diagonal of cov estimates') % eps
515
+ print(msg)
516
+ offset = np.eye(sigma1.shape[0]) * eps
517
+ covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
518
+
519
+ # Numerical error might give slight imaginary component
520
+ if np.iscomplexobj(covmean):
521
+ if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
522
+ m = np.max(np.abs(covmean.imag))
523
+ raise ValueError('Imaginary component {}'.format(m))
524
+ covmean = covmean.real
525
+
526
+ tr_covmean = np.trace(covmean)
527
+
528
+ return (diff.dot(diff) + np.trace(sigma1)
529
+ + np.trace(sigma2) - 2 * tr_covmean)
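Two quick sanity checks of the helper above (illustrative only):

mu, sigma = np.zeros(4), np.eye(4)
print(calculate_frechet_distance(mu, sigma, mu, sigma))         # identical Gaussians -> ~0.0
print(calculate_frechet_distance(mu, sigma, mu + 1.0, sigma))   # unit mean shift in 4 dims -> ~4.0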
530
+
531
+
532
+ def calculate_activation_statistics(files, model, batch_size=50, dims=2048,
533
+ device='cpu', num_workers=1, resize=0):
534
+ """Calculation of the statistics used by the FID.
535
+ Params:
536
+ -- files : List of image files paths
537
+ -- model : Instance of inception model
538
+ -- batch_size : The images numpy array is split into batches with
539
+ batch size batch_size. A reasonable batch size
540
+ depends on the hardware.
541
+ -- dims : Dimensionality of features returned by Inception
542
+ -- device : Device to run calculations
543
+ -- num_workers : Number of parallel dataloader workers
544
+
545
+ Returns:
546
+ -- mu : The mean over samples of the activations of the pool_3 layer of
547
+ the inception model.
548
+ -- sigma : The covariance matrix of the activations of the pool_3 layer of
549
+ the inception model.
550
+ """
551
+ act = get_activations(files, model, batch_size, dims, device, num_workers, resize)
552
+ mu = np.mean(act, axis=0)
553
+ sigma = np.cov(act, rowvar=False)
554
+ return mu, sigma
555
+
556
+
557
+ def compute_statistics_of_path(path, model, batch_size, dims, device,
558
+ num_workers=1, nimages=None, resize=0):
559
+ if path.endswith('.npz'):
560
+ with np.load(path) as f:
561
+ m, s = f['mu'][:], f['sigma'][:]
562
+ else:
563
+ path = pathlib.Path(path)
564
+
565
+ files = sorted([file for ext in IMAGE_EXTENSIONS
566
+ for file in path.glob('**/*.{}'.format(ext))])
567
+ nfiles = len(files)
568
+ n = nfiles if nimages is None else min(nimages, nfiles)
569
+ print(f'Found {nfiles} images. Computing FID with {n} images.')
570
+ files = files[:n]
571
+ m, s = calculate_activation_statistics(files, model, batch_size,
572
+ dims, device, num_workers, resize)
573
+
574
+ return m, s
575
+
576
+
577
+ def calculate_fid_given_paths(paths, batch_size, device, dims, num_workers=1, nimages=None, resize=0):
578
+ """Calculates the FID of two paths"""
579
+ for p in paths:
580
+ if not os.path.exists(p):
581
+ raise RuntimeError('Invalid path: %s' % p)
582
+
583
+ block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
584
+
585
+ model = InceptionV3([block_idx]).to(device)
586
+
587
+ m1, s1 = compute_statistics_of_path(paths[0], model, batch_size,
588
+ dims, device, num_workers, nimages, resize)
589
+ m2, s2 = compute_statistics_of_path(paths[1], model, batch_size,
590
+ dims, device, num_workers, nimages, resize)
591
+ fid_value = calculate_frechet_distance(m1, s1, m2, s2)
592
+
593
+ return fid_value
594
+
595
+
596
+ def save_fid_stats(paths, batch_size, device, dims, num_workers=1, nimages=None, resize=0):
597
+ """Calculates the FID of two paths"""
598
+ if not os.path.exists(paths[0]):
599
+ raise RuntimeError('Invalid path: %s' % paths[0])
600
+
601
+ if os.path.exists(paths[1]):
602
+ raise RuntimeError('Existing output file: %s' % paths[1])
603
+
604
+ block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
605
+
606
+ model = InceptionV3([block_idx]).to(device)
607
+
608
+ print(f"Saving statistics for {paths[0]}")
609
+
610
+ m1, s1 = compute_statistics_of_path(paths[0], model, batch_size,
611
+ dims, device, num_workers, nimages, resize=0)
612
+
613
+ np.savez_compressed(paths[1], mu=m1, sigma=s1)
614
+
615
+
616
+ def main():
617
+ parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
618
+ parser.add_argument('--batch-size', type=int, default=20,
619
+ help='Batch size to use')
620
+ parser.add_argument('--num-workers', type=int,
621
+ help=('Number of processes to use for data loading. '
622
+ 'Defaults to `min(8, num_cpus)`'))
623
+ parser.add_argument('--device', type=str, default='cuda:0',
624
+ help='Device to use. Like cuda, cuda:0 or cpu')
625
+ parser.add_argument('--dims', type=int, default=2048,
626
+ choices=list(InceptionV3.BLOCK_INDEX_BY_DIM),
627
+ help=('Dimensionality of Inception features to use. '
628
+ 'By default, uses pool3 features'))
629
+ parser.add_argument('--nimages', type=int, default=50000, help='max number of images to use')
630
+ parser.add_argument('--resize', type=int, default=0, help='resize images to this size; 0 means keep original size')
631
+ parser.add_argument('--save-stats', action='store_true',
632
+ help=('Generate an npz archive from a directory of samples. '
633
+ 'The first path is used as input and the second as output.'))
634
+ parser.add_argument('path', type=str, nargs=2,
635
+ help=('Paths to the generated images or '
636
+ 'to .npz statistic files'))
637
+ args = parser.parse_args()
638
+
639
+ if args.device is None:
640
+ device = torch.device('cuda' if (torch.cuda.is_available()) else 'cpu')
641
+ else:
642
+ device = torch.device(args.device)
643
+
644
+ if args.num_workers is None:
645
+ try:
646
+ num_cpus = len(os.sched_getaffinity(0))
647
+ except AttributeError:
648
+ # os.sched_getaffinity is not available under Windows, use
649
+ # os.cpu_count instead (which may not return the *available* number
650
+ # of CPUs).
651
+ num_cpus = os.cpu_count()
652
+
653
+ num_workers = min(num_cpus, 8) if num_cpus is not None else 0
654
+ else:
655
+ num_workers = args.num_workers
656
+
657
+ if args.save_stats:
658
+ save_fid_stats(args.path, args.batch_size, device, args.dims, num_workers, args.nimages, args.resize)
659
+ return
660
+
661
+ fid_value = calculate_fid_given_paths(args.path,
662
+ args.batch_size,
663
+ device,
664
+ args.dims,
665
+ num_workers,
666
+ args.nimages,
667
+ args.resize)
668
+ print('FID: ', fid_value)
669
+
670
+
671
+ if __name__ == '__main__':
672
+ main()
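For completeness, a hedged sketch of calling the module programmatically rather than through the CLI above (both paths are placeholders):

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
fid = calculate_fid_given_paths(['/data/real_images', '/data/stego_images'],
                                batch_size=20, device=device, dims=2048,
                                num_workers=2, nimages=5000, resize=256)
print('FID:', fid)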
tools/fid_lmdb.py ADDED
@@ -0,0 +1,683 @@
1
+ """Calculates the Frechet Inception Distance (FID) to evalulate GANs
2
+
3
+ The FID metric calculates the distance between two distributions of images.
4
+ Typically, we have summary statistics (mean & covariance matrix) of one
5
+ of these distributions, while the 2nd distribution is given by a GAN.
6
+
7
+ When run as a stand-alone program, it compares the distribution of
8
+ images that are stored as PNG/JPEG at a specified location with a
9
+ distribution given by summary statistics (in pickle format).
10
+
11
+ The FID is calculated by assuming that X_1 and X_2 are the activations of
12
+ the pool_3 layer of the inception net for generated samples and real world
13
+ samples respectively.
14
+
15
+ See --help to see further details.
16
+
17
+ Code adapted from https://github.com/bioinf-jku/TTUR to use PyTorch instead
18
+ of Tensorflow
19
+
20
+ Copyright 2018 Institute of Bioinformatics, JKU Linz
21
+
22
+ Licensed under the Apache License, Version 2.0 (the "License");
23
+ you may not use this file except in compliance with the License.
24
+ You may obtain a copy of the License at
25
+
26
+ http://www.apache.org/licenses/LICENSE-2.0
27
+
28
+ Unless required by applicable law or agreed to in writing, software
29
+ distributed under the License is distributed on an "AS IS" BASIS,
30
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
31
+ See the License for the specific language governing permissions and
32
+ limitations under the License.
33
+ """
34
+ import os
35
+ import pathlib
36
+ from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
37
+
38
+ import numpy as np
39
+ import torch
40
+ import torchvision.transforms as TF
41
+ from PIL import Image
42
+ from scipy import linalg
43
+ from torch.nn.functional import adaptive_avg_pool2d
44
+ import torch.nn as nn
45
+ import torch.nn.functional as F
46
+ import torchvision
47
+ import sys
48
+ sys.path.insert(1, '/mnt/fast/nobackup/users/tb0035/projects/diffsteg/ControlNet')
49
+ from tools.image_dataset import ImageDataset
50
+ try:
51
+ from tqdm import tqdm
52
+ except ImportError:
53
+ # If tqdm is not available, provide a mock version of it
54
+ def tqdm(x):
55
+ return x
56
+
57
+
58
+ IMAGE_EXTENSIONS = {'bmp', 'jpg', 'jpeg', 'pgm', 'png', 'ppm',
59
+ 'tif', 'tiff', 'webp'}
60
+
61
+
62
+ try:
63
+ from torchvision.models.utils import load_state_dict_from_url
64
+ except ImportError:
65
+ from torch.utils.model_zoo import load_url as load_state_dict_from_url
66
+
67
+ # Inception weights ported to Pytorch from
68
+ # http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
69
+ FID_WEIGHTS_URL = 'https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth' # noqa: E501
70
+
71
+
72
+ class InceptionV3(nn.Module):
73
+ """Pretrained InceptionV3 network returning feature maps"""
74
+
75
+ # Index of default block of inception to return,
76
+ # corresponds to output of final average pooling
77
+ DEFAULT_BLOCK_INDEX = 3
78
+
79
+ # Maps feature dimensionality to their output blocks indices
80
+ BLOCK_INDEX_BY_DIM = {
81
+ 64: 0, # First max pooling features
82
+ 192: 1, # Second max pooling features
83
+ 768: 2, # Pre-aux classifier features
84
+ 2048: 3 # Final average pooling features
85
+ }
86
+
87
+ def __init__(self,
88
+ output_blocks=(DEFAULT_BLOCK_INDEX,),
89
+ resize_input=True,
90
+ normalize_input=True,
91
+ requires_grad=False,
92
+ use_fid_inception=True):
93
+ """Build pretrained InceptionV3
94
+
95
+ Parameters
96
+ ----------
97
+ output_blocks : list of int
98
+ Indices of blocks to return features of. Possible values are:
99
+ - 0: corresponds to output of first max pooling
100
+ - 1: corresponds to output of second max pooling
101
+ - 2: corresponds to output which is fed to aux classifier
102
+ - 3: corresponds to output of final average pooling
103
+ resize_input : bool
104
+ If true, bilinearly resizes input to width and height 299 before
105
+ feeding input to model. As the network without fully connected
106
+ layers is fully convolutional, it should be able to handle inputs
107
+ of arbitrary size, so resizing might not be strictly needed
108
+ normalize_input : bool
109
+ If true, scales the input from range (0, 1) to the range the
110
+ pretrained Inception network expects, namely (-1, 1)
111
+ requires_grad : bool
112
+ If true, parameters of the model require gradients. Possibly useful
113
+ for finetuning the network
114
+ use_fid_inception : bool
115
+ If true, uses the pretrained Inception model used in Tensorflow's
116
+ FID implementation. If false, uses the pretrained Inception model
117
+ available in torchvision. The FID Inception model has different
118
+ weights and a slightly different structure from torchvision's
119
+ Inception model. If you want to compute FID scores, you are
120
+ strongly advised to set this parameter to true to get comparable
121
+ results.
122
+ """
123
+ super(InceptionV3, self).__init__()
124
+
125
+ self.resize_input = resize_input
126
+ self.normalize_input = normalize_input
127
+ self.output_blocks = sorted(output_blocks)
128
+ self.last_needed_block = max(output_blocks)
129
+
130
+ assert self.last_needed_block <= 3, \
131
+ 'Last possible output block index is 3'
132
+
133
+ self.blocks = nn.ModuleList()
134
+
135
+ if use_fid_inception:
136
+ inception = fid_inception_v3()
137
+ else:
138
+ inception = _inception_v3(weights='DEFAULT')
139
+
140
+ # Block 0: input to maxpool1
141
+ block0 = [
142
+ inception.Conv2d_1a_3x3,
143
+ inception.Conv2d_2a_3x3,
144
+ inception.Conv2d_2b_3x3,
145
+ nn.MaxPool2d(kernel_size=3, stride=2)
146
+ ]
147
+ self.blocks.append(nn.Sequential(*block0))
148
+
149
+ # Block 1: maxpool1 to maxpool2
150
+ if self.last_needed_block >= 1:
151
+ block1 = [
152
+ inception.Conv2d_3b_1x1,
153
+ inception.Conv2d_4a_3x3,
154
+ nn.MaxPool2d(kernel_size=3, stride=2)
155
+ ]
156
+ self.blocks.append(nn.Sequential(*block1))
157
+
158
+ # Block 2: maxpool2 to aux classifier
159
+ if self.last_needed_block >= 2:
160
+ block2 = [
161
+ inception.Mixed_5b,
162
+ inception.Mixed_5c,
163
+ inception.Mixed_5d,
164
+ inception.Mixed_6a,
165
+ inception.Mixed_6b,
166
+ inception.Mixed_6c,
167
+ inception.Mixed_6d,
168
+ inception.Mixed_6e,
169
+ ]
170
+ self.blocks.append(nn.Sequential(*block2))
171
+
172
+ # Block 3: aux classifier to final avgpool
173
+ if self.last_needed_block >= 3:
174
+ block3 = [
175
+ inception.Mixed_7a,
176
+ inception.Mixed_7b,
177
+ inception.Mixed_7c,
178
+ nn.AdaptiveAvgPool2d(output_size=(1, 1))
179
+ ]
180
+ self.blocks.append(nn.Sequential(*block3))
181
+
182
+ for param in self.parameters():
183
+ param.requires_grad = requires_grad
184
+
185
+ def forward(self, inp):
186
+ """Get Inception feature maps
187
+
188
+ Parameters
189
+ ----------
190
+ inp : torch.autograd.Variable
191
+ Input tensor of shape Bx3xHxW. Values are expected to be in
192
+ range (0, 1)
193
+
194
+ Returns
195
+ -------
196
+ List of torch.autograd.Variable, corresponding to the selected output
197
+ block, sorted ascending by index
198
+ """
199
+ outp = []
200
+ x = inp
201
+
202
+ if self.resize_input:
203
+ x = F.interpolate(x,
204
+ size=(299, 299),
205
+ mode='bilinear',
206
+ align_corners=False)
207
+
208
+ if self.normalize_input:
209
+ x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1)
210
+
211
+ for idx, block in enumerate(self.blocks):
212
+ x = block(x)
213
+ if idx in self.output_blocks:
214
+ outp.append(x)
215
+
216
+ if idx == self.last_needed_block:
217
+ break
218
+
219
+ return outp
220
+
221
+
222
+ def _inception_v3(*args, **kwargs):
223
+ """Wraps `torchvision.models.inception_v3`"""
224
+ try:
225
+ version = tuple(map(int, torchvision.__version__.split('.')[:2]))
226
+ except ValueError:
227
+ # Just a caution against weird version strings
228
+ version = (0,)
229
+
230
+ # Skips default weight initialization if supported by torchvision
231
+ # version. See https://github.com/mseitzer/pytorch-fid/issues/28.
232
+ if version >= (0, 6):
233
+ kwargs['init_weights'] = False
234
+
235
+ # Backwards compatibility: `weights` argument was handled by `pretrained`
236
+ # argument prior to version 0.13.
237
+ if version < (0, 13) and 'weights' in kwargs:
238
+ if kwargs['weights'] == 'DEFAULT':
239
+ kwargs['pretrained'] = True
240
+ elif kwargs['weights'] is None:
241
+ kwargs['pretrained'] = False
242
+ else:
243
+ raise ValueError(
244
+ 'weights=={} not supported in torchvision {}'.format(
245
+ kwargs['weights'], torchvision.__version__
246
+ )
247
+ )
248
+ del kwargs['weights']
249
+
250
+ return torchvision.models.inception_v3(*args, **kwargs)
251
+
252
+
253
+ def fid_inception_v3():
254
+ """Build pretrained Inception model for FID computation
255
+
256
+ The Inception model for FID computation uses a different set of weights
257
+ and has a slightly different structure than torchvision's Inception.
258
+
259
+ This method first constructs torchvision's Inception and then patches the
260
+ necessary parts that are different in the FID Inception model.
261
+ """
262
+ inception = _inception_v3(num_classes=1008,
263
+ aux_logits=False,
264
+ weights=None)
265
+ inception.Mixed_5b = FIDInceptionA(192, pool_features=32)
266
+ inception.Mixed_5c = FIDInceptionA(256, pool_features=64)
267
+ inception.Mixed_5d = FIDInceptionA(288, pool_features=64)
268
+ inception.Mixed_6b = FIDInceptionC(768, channels_7x7=128)
269
+ inception.Mixed_6c = FIDInceptionC(768, channels_7x7=160)
270
+ inception.Mixed_6d = FIDInceptionC(768, channels_7x7=160)
271
+ inception.Mixed_6e = FIDInceptionC(768, channels_7x7=192)
272
+ inception.Mixed_7b = FIDInceptionE_1(1280)
273
+ inception.Mixed_7c = FIDInceptionE_2(2048)
274
+
275
+ state_dict = load_state_dict_from_url(FID_WEIGHTS_URL, progress=True)
276
+ inception.load_state_dict(state_dict)
277
+ return inception
278
+
279
+
280
+ class FIDInceptionA(torchvision.models.inception.InceptionA):
281
+ """InceptionA block patched for FID computation"""
282
+ def __init__(self, in_channels, pool_features):
283
+ super(FIDInceptionA, self).__init__(in_channels, pool_features)
284
+
285
+ def forward(self, x):
286
+ branch1x1 = self.branch1x1(x)
287
+
288
+ branch5x5 = self.branch5x5_1(x)
289
+ branch5x5 = self.branch5x5_2(branch5x5)
290
+
291
+ branch3x3dbl = self.branch3x3dbl_1(x)
292
+ branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
293
+ branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
294
+
295
+ # Patch: Tensorflow's average pool does not use the padded zeros in
296
+ # its average calculation
297
+ branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
298
+ count_include_pad=False)
299
+ branch_pool = self.branch_pool(branch_pool)
300
+
301
+ outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool]
302
+ return torch.cat(outputs, 1)
303
+
304
+
305
+ class FIDInceptionC(torchvision.models.inception.InceptionC):
306
+ """InceptionC block patched for FID computation"""
307
+ def __init__(self, in_channels, channels_7x7):
308
+ super(FIDInceptionC, self).__init__(in_channels, channels_7x7)
309
+
310
+ def forward(self, x):
311
+ branch1x1 = self.branch1x1(x)
312
+
313
+ branch7x7 = self.branch7x7_1(x)
314
+ branch7x7 = self.branch7x7_2(branch7x7)
315
+ branch7x7 = self.branch7x7_3(branch7x7)
316
+
317
+ branch7x7dbl = self.branch7x7dbl_1(x)
318
+ branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl)
319
+ branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl)
320
+ branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl)
321
+ branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl)
322
+
323
+ # Patch: Tensorflow's average pool does not use the padded zeros in
324
+ # its average calculation
325
+ branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
326
+ count_include_pad=False)
327
+ branch_pool = self.branch_pool(branch_pool)
328
+
329
+ outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool]
330
+ return torch.cat(outputs, 1)
331
+
332
+
333
+ class FIDInceptionE_1(torchvision.models.inception.InceptionE):
334
+ """First InceptionE block patched for FID computation"""
335
+ def __init__(self, in_channels):
336
+ super(FIDInceptionE_1, self).__init__(in_channels)
337
+
338
+ def forward(self, x):
339
+ branch1x1 = self.branch1x1(x)
340
+
341
+ branch3x3 = self.branch3x3_1(x)
342
+ branch3x3 = [
343
+ self.branch3x3_2a(branch3x3),
344
+ self.branch3x3_2b(branch3x3),
345
+ ]
346
+ branch3x3 = torch.cat(branch3x3, 1)
347
+
348
+ branch3x3dbl = self.branch3x3dbl_1(x)
349
+ branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
350
+ branch3x3dbl = [
351
+ self.branch3x3dbl_3a(branch3x3dbl),
352
+ self.branch3x3dbl_3b(branch3x3dbl),
353
+ ]
354
+ branch3x3dbl = torch.cat(branch3x3dbl, 1)
355
+
356
+ # Patch: Tensorflow's average pool does not use the padded zeros in
357
+ # its average calculation
358
+ branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
359
+ count_include_pad=False)
360
+ branch_pool = self.branch_pool(branch_pool)
361
+
362
+ outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
363
+ return torch.cat(outputs, 1)
364
+
365
+
366
+ class FIDInceptionE_2(torchvision.models.inception.InceptionE):
367
+ """Second InceptionE block patched for FID computation"""
368
+ def __init__(self, in_channels):
369
+ super(FIDInceptionE_2, self).__init__(in_channels)
370
+
371
+ def forward(self, x):
372
+ branch1x1 = self.branch1x1(x)
373
+
374
+ branch3x3 = self.branch3x3_1(x)
375
+ branch3x3 = [
376
+ self.branch3x3_2a(branch3x3),
377
+ self.branch3x3_2b(branch3x3),
378
+ ]
379
+ branch3x3 = torch.cat(branch3x3, 1)
380
+
381
+ branch3x3dbl = self.branch3x3dbl_1(x)
382
+ branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
383
+ branch3x3dbl = [
384
+ self.branch3x3dbl_3a(branch3x3dbl),
385
+ self.branch3x3dbl_3b(branch3x3dbl),
386
+ ]
387
+ branch3x3dbl = torch.cat(branch3x3dbl, 1)
388
+
389
+ # Patch: The FID Inception model uses max pooling instead of average
390
+ # pooling. This is likely an error in this specific Inception
391
+ # implementation, as other Inception models use average pooling here
392
+ # (which matches the description in the paper).
393
+ branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1)
394
+ branch_pool = self.branch_pool(branch_pool)
395
+
396
+ outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
397
+ return torch.cat(outputs, 1)
398
+
399
+ class ImagePathDataset(torch.utils.data.Dataset):
400
+ def __init__(self, files, transforms=None):
401
+ self.files = files
402
+ self.transforms = transforms
403
+
404
+ def __len__(self):
405
+ return len(self.files)
406
+
407
+ def __getitem__(self, i):
408
+ path = self.files[i]
409
+ img = Image.open(path).convert('RGB')
410
+ if self.transforms is not None:
411
+ img = self.transforms(img)
412
+ return img
413
+
414
+
415
+ def get_activations(files, model, batch_size=50, dims=2048, device='cpu',
416
+ num_workers=1, resize=0):
417
+ """Calculates the activations of the pool_3 layer for all images.
418
+
419
+ Params:
420
+ -- files : List of image files paths
421
+ -- model : Instance of inception model
422
+ -- batch_size : Batch size of images for the model to process at once.
423
+ Make sure that the number of samples is a multiple of
424
+ the batch size, otherwise some samples are ignored. This
425
+ behavior is retained to match the original FID score
426
+ implementation.
427
+ -- dims : Dimensionality of features returned by Inception
428
+ -- device : Device to run calculations
429
+ -- num_workers : Number of parallel dataloader workers
430
+
431
+ Returns:
432
+ -- A numpy array of dimension (num images, dims) that contains the
433
+ activations of the given tensor when feeding inception with the
434
+ query tensor.
435
+ """
436
+ model.eval()
437
+
438
+ if batch_size > len(files):
439
+ print(('Warning: batch size is bigger than the data size. '
440
+ 'Setting batch size to data size'))
441
+ batch_size = len(files)
442
+ if resize > 0:
443
+ tform = TF.Compose([TF.Resize((resize, resize)), TF.ToTensor()])
444
+ else:
445
+ tform = TF.ToTensor()
446
+ if isinstance(files, list):
447
+ dataset = ImagePathDataset(files, transforms=tform)
448
+ else:
449
+ files.set_transform(tform)
450
+ dataset = files
451
+ dataloader = torch.utils.data.DataLoader(dataset,
452
+ batch_size=batch_size,
453
+ shuffle=False,
454
+ drop_last=False,
455
+ num_workers=num_workers)
456
+
457
+ pred_arr = np.empty((len(files), dims))
458
+
459
+ start_idx = 0
460
+
461
+ for batch in tqdm(dataloader):
462
+ batch = (batch['image'] if isinstance(batch, dict) else batch).to(device)  # lmdb ImageDataset yields dicts, ImagePathDataset yields plain tensors
463
+
464
+ with torch.no_grad():
465
+ pred = model(batch)[0]
466
+
467
+ # If model output is not scalar, apply global spatial average pooling.
468
+ # This happens if you choose a dimensionality not equal 2048.
469
+ if pred.size(2) != 1 or pred.size(3) != 1:
470
+ pred = adaptive_avg_pool2d(pred, output_size=(1, 1))
471
+
472
+ pred = pred.squeeze(3).squeeze(2).cpu().numpy()
473
+
474
+ pred_arr[start_idx:start_idx + pred.shape[0]] = pred
475
+
476
+ start_idx = start_idx + pred.shape[0]
477
+
478
+ return pred_arr
479
+
480
+
481
+ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
482
+ """Numpy implementation of the Frechet Distance.
483
+ The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
484
+ and X_2 ~ N(mu_2, C_2) is
485
+ d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
486
+
487
+ Stable version by Dougal J. Sutherland.
488
+
489
+ Params:
490
+ -- mu1 : Numpy array containing the activations of a layer of the
491
+ inception net (like returned by the function 'get_predictions')
492
+ for generated samples.
493
+ -- mu2 : The sample mean over activations, precalculated on an
494
+ representative data set.
495
+ -- sigma1: The covariance matrix over activations for generated samples.
496
+ -- sigma2: The covariance matrix over activations, precalculated on an
497
+ representative data set.
498
+
499
+ Returns:
500
+ -- : The Frechet Distance.
501
+ """
502
+
503
+ mu1 = np.atleast_1d(mu1)
504
+ mu2 = np.atleast_1d(mu2)
505
+
506
+ sigma1 = np.atleast_2d(sigma1)
507
+ sigma2 = np.atleast_2d(sigma2)
508
+
509
+ assert mu1.shape == mu2.shape, \
510
+ 'Training and test mean vectors have different lengths'
511
+ assert sigma1.shape == sigma2.shape, \
512
+ 'Training and test covariances have different dimensions'
513
+
514
+ diff = mu1 - mu2
515
+
516
+ # Product might be almost singular
517
+ covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
518
+ if not np.isfinite(covmean).all():
519
+ msg = ('fid calculation produces singular product; '
520
+ 'adding %s to diagonal of cov estimates') % eps
521
+ print(msg)
522
+ offset = np.eye(sigma1.shape[0]) * eps
523
+ covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
524
+
525
+ # Numerical error might give slight imaginary component
526
+ if np.iscomplexobj(covmean):
527
+ if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
528
+ m = np.max(np.abs(covmean.imag))
529
+ raise ValueError('Imaginary component {}'.format(m))
530
+ covmean = covmean.real
531
+
532
+ tr_covmean = np.trace(covmean)
533
+
534
+ return (diff.dot(diff) + np.trace(sigma1)
535
+ + np.trace(sigma2) - 2 * tr_covmean)
536
+
537
+
538
+ def calculate_activation_statistics(files, model, batch_size=50, dims=2048,
539
+ device='cpu', num_workers=1, resize=0):
540
+ """Calculation of the statistics used by the FID.
541
+ Params:
542
+ -- files : List of image files paths
543
+ -- model : Instance of inception model
544
+ -- batch_size : The images numpy array is split into batches with
545
+ batch size batch_size. A reasonable batch size
546
+ depends on the hardware.
547
+ -- dims : Dimensionality of features returned by Inception
548
+ -- device : Device to run calculations
549
+ -- num_workers : Number of parallel dataloader workers
550
+
551
+ Returns:
552
+ -- mu : The mean over samples of the activations of the pool_3 layer of
553
+ the inception model.
554
+ -- sigma : The covariance matrix of the activations of the pool_3 layer of
555
+ the inception model.
556
+ """
557
+ act = get_activations(files, model, batch_size, dims, device, num_workers, resize)
558
+ mu = np.mean(act, axis=0)
559
+ sigma = np.cov(act, rowvar=False)
560
+ return mu, sigma
561
+
562
+
563
+ def compute_statistics_of_path(path, model, batch_size, dims, device,
564
+ num_workers=1, nimages=None, resize=0):
565
+ if path.endswith('.npz'):
566
+ with np.load(path) as f:
567
+ m, s = f['mu'][:], f['sigma'][:]
568
+ else:
569
+ path = pathlib.Path(path)
570
+ if (path/'data.mdb').exists():
571
+ files = ImageDataset(path, None)
572
+ nfiles = len(files)
573
+ n = nfiles if nimages is None else min(nimages, nfiles)
574
+ files.set_ids(range(n))
575
+ else:
576
+ files = sorted([file for ext in IMAGE_EXTENSIONS
577
+ for file in path.glob('**/*.{}'.format(ext))])
578
+ nfiles = len(files)
579
+ n = nfiles if nimages is None else min(nimages, nfiles)
580
+ files = files[:n]
581
+ print(f'Found {nfiles} images. Computing FID with {n} images.')
582
+ m, s = calculate_activation_statistics(files, model, batch_size,
583
+ dims, device, num_workers, resize)
584
+
585
+ return m, s
586
+
587
+
588
+ def calculate_fid_given_paths(paths, batch_size, device, dims, num_workers=1, nimages=None, resize=0):
589
+ """Calculates the FID of two paths"""
590
+ for p in paths:
591
+ if not os.path.exists(p):
592
+ raise RuntimeError('Invalid path: %s' % p)
593
+
594
+ block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
595
+
596
+ model = InceptionV3([block_idx]).to(device)
597
+
598
+ m1, s1 = compute_statistics_of_path(paths[0], model, batch_size,
599
+ dims, device, num_workers, nimages, resize)
600
+ m2, s2 = compute_statistics_of_path(paths[1], model, batch_size,
601
+ dims, device, num_workers, nimages, resize)
602
+ fid_value = calculate_frechet_distance(m1, s1, m2, s2)
603
+
604
+ return fid_value
605
+
606
+
607
+ def save_fid_stats(paths, batch_size, device, dims, num_workers=1, nimages=None, resize=0):
608
+ """Calculates the FID of two paths"""
609
+ if not os.path.exists(paths[0]):
610
+ raise RuntimeError('Invalid path: %s' % paths[0])
611
+
612
+ if os.path.exists(paths[1]):
613
+ raise RuntimeError('Existing output file: %s' % paths[1])
614
+
615
+ block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
616
+
617
+ model = InceptionV3([block_idx]).to(device)
618
+
619
+ print(f"Saving statistics for {paths[0]}")
620
+
621
+ m1, s1 = compute_statistics_of_path(paths[0], model, batch_size,
622
+ dims, device, num_workers, nimages, resize=0)
623
+
624
+ np.savez_compressed(paths[1], mu=m1, sigma=s1)
625
+
626
+
627
+ def main():
628
+ parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
629
+ parser.add_argument('--batch-size', type=int, default=20,
630
+ help='Batch size to use')
631
+ parser.add_argument('--num-workers', type=int,
632
+ help=('Number of processes to use for data loading. '
633
+ 'Defaults to `min(8, num_cpus)`'))
634
+ parser.add_argument('--device', type=str, default='cuda:0',
635
+ help='Device to use. Like cuda, cuda:0 or cpu')
636
+ parser.add_argument('--dims', type=int, default=2048,
637
+ choices=list(InceptionV3.BLOCK_INDEX_BY_DIM),
638
+ help=('Dimensionality of Inception features to use. '
639
+ 'By default, uses pool3 features'))
640
+ parser.add_argument('--nimages', type=int, default=50000, help='max number of images to use')
641
+ parser.add_argument('--resize', type=int, default=0, help='resize images to this size; 0 means keep the original size')
642
+ parser.add_argument('--save-stats', action='store_true',
643
+ help=('Generate an npz archive from a directory of samples. '
644
+ 'The first path is used as input and the second as output.'))
645
+ parser.add_argument('path', type=str, nargs=2,
646
+ help=('Paths to the generated images or '
647
+ 'to .npz statistic files'))
648
+ args = parser.parse_args()
649
+
650
+ if args.device is None:
651
+ device = torch.device('cuda' if (torch.cuda.is_available()) else 'cpu')
652
+ else:
653
+ device = torch.device(args.device)
654
+
655
+ if args.num_workers is None:
656
+ try:
657
+ num_cpus = len(os.sched_getaffinity(0))
658
+ except AttributeError:
659
+ # os.sched_getaffinity is not available under Windows, use
660
+ # os.cpu_count instead (which may not return the *available* number
661
+ # of CPUs).
662
+ num_cpus = os.cpu_count()
663
+
664
+ num_workers = min(num_cpus, 8) if num_cpus is not None else 0
665
+ else:
666
+ num_workers = args.num_workers
667
+
668
+ if args.save_stats:
669
+ save_fid_stats(args.path, args.batch_size, device, args.dims, num_workers, args.nimages, args.resize)
670
+ return
671
+
672
+ fid_value = calculate_fid_given_paths(args.path,
673
+ args.batch_size,
674
+ device,
675
+ args.dims,
676
+ num_workers,
677
+ args.nimages,
678
+ args.resize)
679
+ print('FID: ', fid_value)
680
+
681
+
682
+ if __name__ == '__main__':
683
+ main()
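
A minimal usage sketch for the FID utilities added above; the import path `tools.fid` and the two folder names are assumptions, not part of the commit:

```python
# Sketch only: compute FID between two image folders with the functions above.
import torch
from tools.fid import calculate_fid_given_paths  # module path assumed

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

fid = calculate_fid_given_paths(
    ['data/real_images', 'data/generated_images'],  # [reference, generated] (placeholders)
    batch_size=20,       # matches the CLI default above
    device=device,
    dims=2048,           # pool3 features, the standard FID setting
    num_workers=2,
    nimages=50000,       # cap on images per folder, as in the CLI default
    resize=0,            # 0 keeps the original resolution
)
print('FID:', fid)
```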
tools/gradcam.py ADDED
@@ -0,0 +1,152 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ gradcam visualisation for each GAN class
5
+ @author: Tu Bui @surrey.ac.uk
6
+ """
7
+ from __future__ import absolute_import
8
+ from __future__ import division
9
+ from __future__ import print_function
10
+ import os
11
+ import sys
12
+ import inspect
13
+ import argparse
14
+ import torch
15
+ import numpy as np
16
+ import matplotlib
17
+ matplotlib.use('Agg')
18
+ import matplotlib.pyplot as plt
19
+ import cv2
20
+ from PIL import Image, ImageDraw, ImageFont
21
+ import torch
22
+ import torchvision
23
+ from torch.autograd import Function
24
+ import torch.nn.functional as F
25
+
26
+
27
+ def show_cam_on_image(img, cam, cmap='jet'):
28
+ """
29
+ Args:
30
+ img PIL image (H,W,3)
31
+ cam heatmap (H, W), range [0,1]
32
+ Returns:
33
+ PIL image with heatmap applied.
34
+ """
35
+ cm = plt.get_cmap(cmap)
36
+ cam = cm(cam)[...,:3] # RGB [0,1]
37
+ cam = np.array(img, dtype=np.float32)/255. + cam
38
+ cam /= cam.max()
39
+ cam = np.uint8(cam*255)
40
+ return Image.fromarray(cam)
41
+
42
+
43
+ class HookedModel(object):
44
+ def __init__(self, model, feature_layer_name):
45
+ self.model = model
46
+ self.feature_trees = feature_layer_name.split('.')
47
+
48
+ def __call__(self, x):
49
+ x = feedforward(x, self.model, self.feature_trees)
50
+ return x
51
+
52
+
53
+ def feedforward(x, module, layer_names):
54
+ for name, submodule in module._modules.items():
55
+ # print(f'Forwarding {name} ...')
56
+ if name == layer_names[0]:
57
+ if len(layer_names) == 1: # leaf node reached
58
+ # print(f' Hook {name}')
59
+ x = submodule(x)
60
+ x.register_hook(save_gradients)
61
+ save_features(x)
62
+ else:
63
+ # print(f' Stepping into {name}:')
64
+ x = feedforward(x, submodule, layer_names[1:])
65
+ else:
66
+ x = submodule(x)
67
+ if name == 'avgpool': # specific for resnet50
68
+ x = x.view(x.size(0), -1)
69
+ return x
70
+
71
+
72
+ basket = dict(grads=[], feature_maps=[]) # global variable to hold the gradients and output features of the layers of interest
73
+
74
+ def empty_basket():
75
+ basket['grads'].clear()  # clear in place; re-binding a local would not reset the module-level basket
+ basket['feature_maps'].clear()
76
+
77
+ def save_gradients(grad):
78
+ basket['grads'].append(grad)
79
+
80
+ def save_features(feat):
81
+ basket['feature_maps'].append(feat)
82
+
83
+
84
+ class GradCam(object):
85
+ def __init__(self, model, feature_layer_name, use_cuda=True):
86
+ self.model = model
87
+ self.hooked_model = HookedModel(model, feature_layer_name)
88
+ self.cuda = use_cuda
89
+ if self.cuda:
90
+ self.model = model.cuda()
91
+ self.model.eval()
92
+
93
+ def __call__(self, x, target, act=None):
94
+ empty_basket()
95
+ target = torch.as_tensor(target, dtype=torch.float)
96
+ if self.cuda:
97
+ x = x.cuda()
98
+ target = target.cuda()
99
+ z = self.hooked_model(x)
100
+ if act is not None:
101
+ z = act(z)
102
+ criteria = F.cosine_similarity(z, target)
103
+ self.model.zero_grad()
104
+ criteria.backward(retain_graph=True)
105
+ gradients = [grad.cpu().data.numpy() for grad in basket['grads'][::-1]] # gradients appear in reversed order
106
+ feature_maps = [feat.cpu().data.numpy() for feat in basket['feature_maps']]
107
+ cams = []
108
+ for feat, grad in zip(feature_maps, gradients):
109
+ # feat and grad have shape (1, C, H, W)
110
+ weight = np.mean(grad, axis=(2,3), keepdims=True)[0] # (C,1,1)
111
+ cam = np.sum(weight * feat[0], axis=0) # (H,w)
112
+ cam = cv2.resize(cam, x.shape[2:])
113
+ cam = cam - np.min(cam)
114
+ cam = cam / (np.max(cam) + np.finfo(np.float32).eps)
115
+ cams.append(cam)
116
+ cams = np.array(cams).mean(axis=0) # (H,W)
117
+ return cams
118
+
119
+
120
+ def gradcam_demo():
121
+ from torchvision import transforms
122
+ model = torchvision.models.resnet50(pretrained=True)
123
+ model.eval()
124
+ gradcam = GradCam(model, 'layer4.2', True)
125
+ tform = [
126
+ transforms.Resize((224, 224)),
127
+ # transforms.CenterCrop(224),
128
+ transforms.ToTensor(),
129
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
130
+ ]
131
+ preprocess = transforms.Compose(tform)
132
+ im0 = Image.open('/mnt/fast/nobackup/users/tb0035/projects/diffsteg/ControlNet/examples/catdog.jpg').convert('RGB')
133
+ im = preprocess(im0).unsqueeze(0)
134
+ target = np.zeros((1,1000), dtype=np.float32)
135
+ target[0, 285] = 1 # cat
136
+ cam = gradcam(im, target)
137
+
138
+ im0 = tform[0](im0)
139
+ out = show_cam_on_image(im0, cam)
140
+ out.save('test.jpg')
141
+ print('done')
142
+
143
+
144
+ def make_target_vector(nclass, target_class_id):
145
+ out = np.zeros((1, nclass), dtype=np.float32)
146
+ out[0, target_class_id] = 1
147
+ return out
148
+
149
+
150
+
151
+ if __name__ == '__main__':
152
+ gradcam_demo()
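
A short sketch of driving the GradCam class above outside of gradcam_demo(); the import path `tools.gradcam`, the image path and the output file name are placeholders:

```python
# Sketch only: Grad-CAM overlay for a stock ResNet-50, CPU-only.
import torchvision
from PIL import Image
from torchvision import transforms
from tools.gradcam import GradCam, make_target_vector, show_cam_on_image  # path assumed

model = torchvision.models.resnet50(pretrained=True).eval()
cam_engine = GradCam(model, 'layer4.2', use_cuda=False)

tform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
img = Image.open('example.jpg').convert('RGB')                 # placeholder image
x = tform(img).unsqueeze(0)                                    # (1, 3, 224, 224)
target = make_target_vector(nclass=1000, target_class_id=285)  # ImageNet cat, as in gradcam_demo()

heatmap = cam_engine(x, target)                                # (224, 224) array in [0, 1]
show_cam_on_image(img.resize((224, 224)), heatmap).save('cam_overlay.jpg')
```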
tools/helpers.py ADDED
@@ -0,0 +1,416 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Tue Jul 12 11:05:57 2016
4
+ some help functions to perform basic tasks
5
+ @author: tb00083
6
+ """
7
+ import os
8
+ import sys
9
+ import csv
10
+ import socket
11
+ import numpy as np
12
+ import json
13
+ import pickle # python3.x
14
+ import time
15
+ from datetime import timedelta, datetime
16
+ from typing import Any, List, Tuple, Union
17
+ import subprocess
18
+ import struct
19
+ import errno
20
+ from pprint import pprint
21
+ import glob
22
+ from threading import Thread
23
+
24
+
25
+ def welcome_message():
26
+ """
27
+ get welcome message including hostname and command line arguments
28
+ """
29
+ hostname = socket.gethostname()
30
+ all_args = ' '.join(sys.argv)
31
+ out_text = 'On server {}: {}\n'.format(hostname, all_args)
32
+ return out_text
33
+
34
+
35
+ class EasyDict(dict):
36
+ """Convenience class that behaves like a dict but allows access with the attribute syntax."""
37
+ def __init__(self, dict_to_convert=None):
38
+ if dict_to_convert is not None:
39
+ for key, val in dict_to_convert.items():
40
+ self[key] = val
41
+
42
+ def __getattr__(self, name: str) -> Any:
43
+ try:
44
+ return self[name]
45
+ except KeyError:
46
+ raise AttributeError(name)
47
+
48
+ def __setattr__(self, name: str, value: Any) -> None:
49
+ self[name] = value
50
+
51
+ def __delattr__(self, name: str) -> None:
52
+ del self[name]
53
+
54
+
55
+ def get_time_id_str():
56
+ """
57
+ returns a string in DDHHM format, where M is the minutes truncated to the tens
58
+ """
59
+ now = datetime.now()
60
+ time_str = "{:02d}{:02d}{:02d}".format(now.day, now.hour, now.minute)
61
+ time_str = time_str[:-1]
62
+ return time_str
63
+
64
+
65
+ def time_format(t):
66
+ m, s = divmod(t, 60)
67
+ h, m = divmod(m, 60)
68
+ m, h, s = int(m), int(h), int(s)
69
+
70
+ if m == 0 and h == 0:
71
+ return "{}s".format(s)
72
+ elif h == 0:
73
+ return "{}m{}s".format(m, s)
74
+ else:
75
+ return "{}h{}m{}s".format(h, m, s)
76
+
77
+
78
+ def get_all_files(dir_path, trim=0, extension=''):
79
+ """
80
+ Recursively get list of all files in the given directory
81
+ trim = 1 : trim the dir_path from results, 0 otherwise
82
+ extension: get files with specific format
83
+ """
84
+ file_paths = [] # List which will store all of the full filepaths.
85
+
86
+ # Walk the tree.
87
+ for root, directories, files in os.walk(dir_path):
88
+ for filename in files:
89
+ # Join the two strings in order to form the full filepath.
90
+ filepath = os.path.join(root, filename)
91
+ file_paths.append(filepath) # Add it to the list.
92
+
93
+ if trim == 1: # trim dir_path from results
94
+ if dir_path[-1] != os.sep:
95
+ dir_path += os.sep
96
+ trim_len = len(dir_path)
97
+ file_paths = [x[trim_len:] for x in file_paths]
98
+
99
+ if extension: # select only file with specific extension
100
+ extension = extension.lower()
101
+ tlen = len(extension)
102
+ file_paths = [x for x in file_paths if x[-tlen:] == extension]
103
+
104
+ return file_paths # Self-explanatory.
105
+
106
+
107
+ def get_all_dirs(dir_path, trim=0):
108
+ """
109
+ Recursively get list of all directories in the given directory
110
+ excluding the '.' and '..' directories
111
+ trim = 1 : trim the dir_path from results, 0 otherwise
112
+ """
113
+ out = []
114
+ # Walk the tree.
115
+ for root, directories, files in os.walk(dir_path):
116
+ for dirname in directories:
117
+ # Join the two strings in order to form the full filepath.
118
+ dir_full = os.path.join(root, dirname)
119
+ out.append(dir_full) # Add it to the list.
120
+
121
+ if trim == 1: # trim dir_path from results
122
+ if dir_path[-1] != os.sep:
123
+ dir_path += os.sep
124
+ trim_len = len(dir_path)
125
+ out = [x[trim_len:] for x in out]
126
+
127
+ return out
128
+
129
+
130
+ def read_list(file_path, delimeter=' ', keep_original=True):
131
+ """
132
+ read list column wise
133
+ deprecated, should use pandas instead
134
+ """
135
+ out = []
136
+ with open(file_path, 'r') as f:
137
+ reader = csv.reader(f, delimiter=delimeter)
138
+ for row in reader:
139
+ out.append(row)
140
+ out = list(zip(*out))  # materialise so the columns can be indexed below (Python 3)
141
+
142
+ if not keep_original:
143
+ for col in range(len(out)):
144
+ if out[col][0].isdigit(): # attempt to convert to numerical array
145
+ out[col] = np.array(out[col]).astype(np.int64)
146
+
147
+ return out
148
+
149
+
150
+ def save_pickle2(file_path, **kwargs):
151
+ """
152
+ save variables to file (using pickle)
153
+ """
154
+ # check if any variable is a dict
155
+ var_count = 0
156
+ for key in kwargs:
157
+ var_count += 1
158
+ if isinstance(kwargs[key], dict):
159
+ sys.stderr.write('Oops! Cannot write a dictionary into pickle')
160
+ sys.exit(1)
161
+ with open(file_path, 'wb') as f:
162
+ pickler = pickle.Pickler(f, -1)
163
+ pickler.dump(var_count)
164
+ for key in kwargs:
165
+ pickler.dump(key)
166
+ pickler.dump(kwargs[key])
167
+
168
+
169
+ def load_pickle2(file_path, varnum=0):
170
+ """
171
+ load variables that previously saved using self.save()
172
+ varnum : number of variables u want to load (0 mean it will load all)
173
+ Note: if you are loading class instance(s), you must have it defined in advance
174
+ """
175
+ with open(file_path, 'rb') as f:
176
+ pickler = pickle.Unpickler(f)
177
+ var_count = pickler.load()
178
+ if varnum:
179
+ var_count = min([var_count, varnum])
180
+ out = {}
181
+ for i in range(var_count):
182
+ key = pickler.load()
183
+ out[key] = pickler.load()
184
+
185
+ return out
186
+
187
+
188
+ def save_pickle(path, obj):
189
+ """
190
+ simple method to save a picklable object
191
+ :param path: path to save
192
+ :param obj: a picklable object
193
+ :return: None
194
+ """
195
+ with open(path, 'wb') as f:
196
+ pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
197
+
198
+
199
+ def load_pickle(path):
200
+ """
201
+ load a pickled object
202
+ :param path: .pkl path
203
+ :return: the pickled object
204
+ """
205
+ with open(path, 'rb') as f:
206
+ return pickle.load(f)
207
+
208
+
209
+ def make_new_dir(dir_path, remove_existing=False, mode=511):
210
+ """note: default mode in ubuntu is 511"""
211
+ if not os.path.exists(dir_path):
212
+ try:
213
+ if mode == 777:
214
+ oldmask = os.umask(000)
215
+ os.makedirs(dir_path, 0o777)
216
+ os.umask(oldmask)
217
+ else:
218
+ os.makedirs(dir_path, mode)
219
+ except OSError as exc: # Python >2.5
220
+ if exc.errno == errno.EEXIST and os.path.isdir(dir_path):
221
+ pass
222
+ else:
223
+ raise
224
+ if remove_existing:
225
+ for file_obj in os.listdir(dir_path):
226
+ file_path = os.path.join(dir_path, file_obj)
227
+ if os.path.isfile(file_path):
228
+ os.unlink(file_path)
229
+
230
+
231
+ def get_latest_file(root, pattern):
232
+ """
233
+ get the latest file in a directory that match the provided pattern
234
+ useful for getting the last checkpoint
235
+ :param root: search directory
236
+ :param pattern: search pattern containing 1 wild card representing a number e.g. 'ckpt_*.tar'
237
+ :return: full path of the file with largest number in wild card, None if not found
238
+ """
239
+ out = None
240
+ parts = pattern.split('*')
241
+ max_id = - np.inf
242
+ for path in glob.glob(os.path.join(root, pattern)):
243
+ id_ = os.path.basename(path)
244
+ for part in parts:
245
+ id_ = id_.replace(part, '')
246
+ try:
247
+ id_ = int(id_)
248
+ if id_ > max_id:
249
+ max_id = id_
250
+ out = path
251
+ except:
252
+ continue
253
+ return out
254
+
255
+
256
+ class Locker(object):
257
+ """place a lock file in specified location
258
+ useful for distributed computing"""
259
+
260
+ def __init__(self, name='lock.txt', mode=511):
261
+ """INPUT: name default file name to be created as a lock
262
+ mode if a directory has to be created, set its permission to mode"""
263
+ self.name = name
264
+ self.mode = mode
265
+
266
+ def lock(self, path):
267
+ make_new_dir(path, False, self.mode)
268
+ with open(os.path.join(path, self.name), 'w') as f:
269
+ f.write('progress')
270
+
271
+ def finish(self, path):
272
+ make_new_dir(path, False, self.mode)
273
+ with open(os.path.join(path, self.name), 'w') as f:
274
+ f.write('finish')
275
+
276
+ def customise(self, path, text):
277
+ make_new_dir(path, False, self.mode)
278
+ with open(os.path.join(path, self.name), 'w') as f:
279
+ f.write(text)
280
+
281
+ def is_locked(self, path):
282
+ out = False
283
+ check_path = os.path.join(path, self.name)
284
+ if os.path.exists(check_path):
285
+ text = open(check_path, 'r').readline().strip()
286
+ out = True if text == 'progress' else False
287
+ return out
288
+
289
+ def is_finished(self, path):
290
+ out = False
291
+ check_path = os.path.join(path, self.name)
292
+ if os.path.exists(check_path):
293
+ text = open(check_path, 'r').readline().strip()
294
+ out = True if text == 'finish' else False
295
+ return out
296
+
297
+ def is_locked_or_finished(self, path):
298
+ return self.is_locked(path) | self.is_finished(path)
299
+
300
+ def clean(self, path):
301
+ check_path = os.path.join(path, self.name)
302
+ if os.path.exists(check_path):
303
+ try:
304
+ os.remove(check_path)
305
+ except Exception as e:
306
+ print('Unable to remove %s: %s.' % (check_path, e))
307
+
308
+
309
+ class ProgressBar(object):
310
+ """show progress"""
311
+
312
+ def __init__(self, total, increment=5):
313
+ self.total = total
314
+ self.point = self.total / 100.0
315
+ self.increment = increment
316
+ self.interval = int(self.total * self.increment / 100)
317
+ self.milestones = list(range(0, total, self.interval)) + [self.total, ]
318
+ self.id = 0
319
+
320
+ def show_progress(self, i):
321
+ if i >= self.milestones[self.id]:
322
+ while i >= self.milestones[self.id]:
323
+ self.id += 1
324
+ sys.stdout.write("\r[" + "=" * int(i / self.interval) +
325
+ " " * int((self.total - i) / self.interval) + "]" + str(int((i + 1) / self.point)) + "%")
326
+ sys.stdout.flush()
327
+
328
+
329
+ class Timer(object):
330
+
331
+ def __init__(self):
332
+ self.start_t = time.time()
333
+ self.last_t = self.start_t
334
+
335
+ def time(self, lap=False):
336
+ end_t = time.time()
337
+ if lap:
338
+ out = timedelta(seconds=int(end_t - self.last_t)) # count from last stop point
339
+ else:
340
+ out = timedelta(seconds=int(end_t - self.start_t)) # count from beginning
341
+ self.last_t = end_t
342
+ return out
343
+
344
+
345
+ class ExThread(Thread):
346
+ def run(self):
347
+ self.exc = None
348
+ try:
349
+ if hasattr(self, '_Thread__target'):
350
+ # Thread uses name mangling prior to Python 3.
351
+ self.ret = self._Thread__target(*self._Thread__args, **self._Thread__kwargs)
352
+ else:
353
+ self.ret = self._target(*self._args, **self._kwargs)
354
+ except BaseException as e:
355
+ self.exc = e
356
+
357
+ def join(self):
358
+ super(ExThread, self).join()
359
+ if self.exc:
360
+ raise RuntimeError('Exception in thread.') from self.exc
361
+ return self.ret
362
+
363
+
364
+ def get_gpu_free_mem():
365
+ """return a list of free GPU memory"""
366
+ sp = subprocess.Popen(['nvidia-smi', '-q'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
367
+ out_str = sp.communicate()
368
+ out_list = out_str[0].decode("utf-8").split('\n')
369
+
370
+ out = []
371
+ for i in range(len(out_list)):
372
+ item = out_list[i]
373
+ if item.strip() == 'FB Memory Usage':
374
+ free_mem = int(out_list[i + 3].split(':')[1].strip().split(' ')[0])
375
+ out.append(free_mem)
376
+ return out
377
+
378
+
379
+ def float2hex(x):
380
+ """
381
+ x: a vector
382
+ return: x in hex
383
+ """
384
+ f = np.float32(x)
385
+ out = ''
386
+ if f.size == 1: # just a single number
387
+ f = [f, ]
388
+ for e in f:
389
+ h = hex(struct.unpack('<I', struct.pack('<f', e))[0])
390
+ out += h[2:].zfill(8)
391
+ return out
392
+
393
+
394
+ def hex2float(x):
395
+ """
396
+ x: a string with len divided by 8
397
+ return x as array of float32
398
+ """
399
+ assert len(x) % 8 == 0, 'Error! string len = {} not divided by 8'.format(len(x))
400
+ l = len(x) // 8  # integer division so np.empty gets an int
401
+ out = np.empty(l, dtype=np.float32)
402
+ x = [x[i:i + 8] for i in range(0, len(x), 8)]
403
+ for i, e in enumerate(x):
404
+ out[i] = struct.unpack('!f', bytes.fromhex(e))[0]  # str.decode('hex') was removed in Python 3
405
+ return out
406
+
407
+
408
+ def nice_print(inputs, stream=sys.stdout):
409
+ """print a list of string to file stream"""
410
+ if type(inputs) is not list:
411
+ tstrings = inputs.split('\n')
412
+ pprint(tstrings, stream=stream)
413
+ else:
414
+ for string in inputs:
415
+ nice_print(string, stream=stream)
416
+ stream.flush()
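
A few of the helpers above in action; a sketch assuming the file is importable as `tools.helpers`, with placeholder file and directory names:

```python
# Sketch only: exercising a handful of helpers from tools/helpers.py.
from tools.helpers import Timer, save_pickle, load_pickle, get_latest_file, welcome_message

print(welcome_message())            # "On server <hostname>: <command line>"

timer = Timer()
save_pickle('stats.pkl', {'fid': 12.3, 'step': 1000})    # any picklable object
print(load_pickle('stats.pkl'))                          # -> {'fid': 12.3, 'step': 1000}

# newest checkpoint matching a numeric wildcard, or None if nothing matches
print(get_latest_file('checkpoints', 'ckpt_*.tar'))

print('elapsed:', timer.time())     # timedelta since the Timer was created
```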
tools/hparams.py ADDED
@@ -0,0 +1,743 @@
1
+ # coding=utf-8
2
+ # Copyright 2019 The Tensor2Tensor Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # source: https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/hparam.py
16
+ # Forked with minor changes from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/training/python/training/hparam.py pylint: disable=line-too-long
17
+ """Hyperparameter values."""
18
+ from __future__ import absolute_import
19
+ from __future__ import division
20
+ from __future__ import print_function
21
+
22
+ import json
23
+ import numbers
24
+ import re
25
+ import six
26
+ import numpy as np
27
+
28
+ # Define the regular expression for parsing a single clause of the input
29
+ # (delimited by commas). A legal clause looks like:
30
+ # <variable name>[<index>]? = <rhs>
31
+ # where <rhs> is either a single token or [] enclosed list of tokens.
32
+ # For example: "var[1] = a" or "x = [1,2,3]"
33
+ PARAM_RE = re.compile(r"""
34
+ (?P<name>[a-zA-Z][\w\.]*) # variable name: "var" or "x"
35
+ (\[\s*(?P<index>\d+)\s*\])? # (optional) index: "1" or None
36
+ \s*=\s*
37
+ ((?P<val>[^,\[]*) # single value: "a" or None
38
+ |
39
+ \[(?P<vals>[^\]]*)\]) # list of values: None or "1,2,3"
40
+ ($|,\s*)""", re.VERBOSE)
41
+
42
+
43
+ def copy_hparams(hparams):
44
+ """Return a copy of an HParams instance."""
45
+ return HParams(**hparams.values())
46
+
47
+
48
+ def print_config(hps):
49
+ for key, val in six.iteritems(hps.values()):
50
+ print('%s = %s' % (key, str(val)))
51
+
52
+
53
+ def save_config(output_file, hps, verbose=True):
54
+ def convert(o): # json cannot serialize integer in np.int64 format
55
+ if isinstance(o, np.int64):
56
+ return int(o)
57
+ raise TypeError
58
+ if verbose:
59
+ print_config(hps)
60
+ with open(output_file, 'w') as f:
61
+ json.dump(hps.values(), f, indent=True, default=convert)
62
+
63
+
64
+ def load_config(hps, config_file, verbose=True):
65
+ """
66
+ parse hparams from config file
67
+ :param hps: hparams object whose values to be updated
68
+ :param config_file: json config file
69
+ :param verbose: print out values
70
+ """
71
+ try:
72
+ with open(config_file, 'r') as fin:
73
+ hps.parse_json(fin.read())
74
+ if verbose:
75
+ print_config(hps)
76
+ except Exception as e:
77
+ print('Error reading config file %s: %s.\nConfig will not be updated.' % (config_file, e))
78
+ # return hps
79
+
80
+
81
+ def _parse_fail(name, var_type, value, values):
82
+ """Helper function for raising a value error for bad assignment."""
83
+ raise ValueError(
84
+ 'Could not parse hparam \'%s\' of type \'%s\' with value \'%s\' in %s' %
85
+ (name, var_type.__name__, value, values))
86
+
87
+
88
+ def _reuse_fail(name, values):
89
+ """Helper function for raising a value error for reuse of name."""
90
+ raise ValueError('Multiple assignments to variable \'%s\' in %s' % (name,
91
+ values))
92
+
93
+
94
+ def _process_scalar_value(name, parse_fn, var_type, m_dict, values,
95
+ results_dictionary):
96
+ """Update results_dictionary with a scalar value.
97
+
98
+ Used to update the results_dictionary to be returned by parse_values when
99
+ encountering a clause with a scalar RHS (e.g. "s=5" or "arr[0]=5".)
100
+
101
+ Mutates results_dictionary.
102
+
103
+ Args:
104
+ name: Name of variable in assignment ("s" or "arr").
105
+ parse_fn: Function for parsing the actual value.
106
+ var_type: Type of named variable.
107
+ m_dict: Dictionary constructed from regex parsing.
108
+ m_dict['val']: RHS value (scalar)
109
+ m_dict['index']: List index value (or None)
110
+ values: Full expression being parsed
111
+ results_dictionary: The dictionary being updated for return by the parsing
112
+ function.
113
+
114
+ Raises:
115
+ ValueError: If the name has already been used.
116
+ """
117
+ try:
118
+ parsed_value = parse_fn(m_dict['val'])
119
+ except ValueError:
120
+ _parse_fail(name, var_type, m_dict['val'], values)
121
+
122
+ # If no index is provided
123
+ if not m_dict['index']:
124
+ if name in results_dictionary:
125
+ _reuse_fail(name, values)
126
+ results_dictionary[name] = parsed_value
127
+ else:
128
+ if name in results_dictionary:
129
+ # If the name has already been used as a scalar, it
130
+ # will be in this dictionary and map to a non-dictionary.
131
+ if not isinstance(results_dictionary.get(name), dict):
132
+ _reuse_fail(name, values)
133
+ else:
134
+ results_dictionary[name] = {}
135
+
136
+ index = int(m_dict['index'])
137
+ # Make sure the index position hasn't already been assigned a value.
138
+ if index in results_dictionary[name]:
139
+ _reuse_fail('{}[{}]'.format(name, index), values)
140
+ results_dictionary[name][index] = parsed_value
141
+
142
+
143
+ def _process_list_value(name, parse_fn, var_type, m_dict, values,
144
+ results_dictionary):
145
+ """Update results_dictionary from a list of values.
146
+
147
+ Used to update results_dictionary to be returned by parse_values when
148
+ encountering a clause with a list RHS (e.g. "arr=[1,2,3]".)
149
+
150
+ Mutates results_dictionary.
151
+
152
+ Args:
153
+ name: Name of variable in assignment ("arr").
154
+ parse_fn: Function for parsing individual values.
155
+ var_type: Type of named variable.
156
+ m_dict: Dictionary constructed from regex parsing.
157
+ m_dict['val']: RHS value (scalar)
158
+ values: Full expression being parsed
159
+ results_dictionary: The dictionary being updated for return by the parsing
160
+ function.
161
+
162
+ Raises:
163
+ ValueError: If the name has an index or the values cannot be parsed.
164
+ """
165
+ if m_dict['index'] is not None:
166
+ raise ValueError('Assignment of a list to a list index.')
167
+ elements = filter(None, re.split('[ ,]', m_dict['vals']))
168
+ # Make sure the name hasn't already been assigned a value
169
+ if name in results_dictionary:
170
+ raise _reuse_fail(name, values)
171
+ try:
172
+ results_dictionary[name] = [parse_fn(e) for e in elements]
173
+ except ValueError:
174
+ _parse_fail(name, var_type, m_dict['vals'], values)
175
+
176
+
177
+ def _cast_to_type_if_compatible(name, param_type, value):
178
+ """Cast hparam to the provided type, if compatible.
179
+
180
+ Args:
181
+ name: Name of the hparam to be cast.
182
+ param_type: The type of the hparam.
183
+ value: The value to be cast, if compatible.
184
+
185
+ Returns:
186
+ The result of casting `value` to `param_type`.
187
+
188
+ Raises:
189
+ ValueError: If the type of `value` is not compatible with param_type.
190
+ * If `param_type` is a string type, but `value` is not.
191
+ * If `param_type` is a boolean, but `value` is not, or vice versa.
192
+ * If `param_type` is an integer type, but `value` is not.
193
+ * If `param_type` is a float type, but `value` is not a numeric type.
194
+ """
195
+ fail_msg = (
196
+ "Could not cast hparam '%s' of type '%s' from value %r" %
197
+ (name, param_type, value))
198
+
199
+ # Some callers use None, for which we can't do any casting/checking. :(
200
+ if issubclass(param_type, type(None)):
201
+ return value
202
+
203
+ # Avoid converting a non-string type to a string.
204
+ if (issubclass(param_type, (six.string_types, six.binary_type)) and
205
+ not isinstance(value, (six.string_types, six.binary_type))):
206
+ raise ValueError(fail_msg)
207
+
208
+ # Avoid converting a number or string type to a boolean or vice versa.
209
+ if issubclass(param_type, bool) != isinstance(value, bool):
210
+ raise ValueError(fail_msg)
211
+
212
+ # Avoid converting float to an integer (the reverse is fine).
213
+ if (issubclass(param_type, numbers.Integral) and
214
+ not isinstance(value, numbers.Integral)):
215
+ raise ValueError(fail_msg)
216
+
217
+ # Avoid converting a non-numeric type to a numeric type.
218
+ if (issubclass(param_type, numbers.Number) and
219
+ not isinstance(value, numbers.Number)):
220
+ raise ValueError(fail_msg)
221
+
222
+ return param_type(value)
223
+
224
+
225
+ def parse_values(values, type_map, ignore_unknown=False):
226
+ """Parses hyperparameter values from a string into a python map.
227
+
228
+ `values` is a string containing comma-separated `name=value` pairs.
229
+ For each pair, the value of the hyperparameter named `name` is set to
230
+ `value`.
231
+
232
+ If a hyperparameter name appears multiple times in `values`, a ValueError
233
+ is raised (e.g. 'a=1,a=2', 'a[1]=1,a[1]=2').
234
+
235
+ If a hyperparameter name appears in both an index assignment and a scalar assignment,
236
+ a ValueError is raised. (e.g. 'a=[1,2,3],a[0] = 1').
237
+
238
+ The hyperparameter name may contain '.' symbols, which will result in an
239
+ attribute name that is only accessible through the getattr and setattr
240
+ functions. (And must first be explicitly added through add_hparam.)
241
+
242
+ WARNING: Use of '.' in your variable names is allowed, but is not well
243
+ supported and not recommended.
244
+
245
+ The `value` in `name=value` must follow the syntax according to the
246
+ type of the parameter:
247
+
248
+ * Scalar integer: A Python-parsable integer point value. E.g.: 1,
249
+ 100, -12.
250
+ * Scalar float: A Python-parsable floating point value. E.g.: 1.0,
251
+ -.54e89.
252
+ * Boolean: Either true or false.
253
+ * Scalar string: A non-empty sequence of characters, excluding comma,
254
+ spaces, and square brackets. E.g.: foo, bar_1.
255
+ * List: A comma separated list of scalar values of the parameter type
256
+ enclosed in square brackets. E.g.: [1,2,3], [1.0,1e-12], [high,low].
257
+
258
+ When index assignment is used, the corresponding type_map key should be the
259
+ list name. E.g. for "arr[1]=0" the type_map must have the key "arr" (not
260
+ "arr[1]").
261
+
262
+ Args:
263
+ values: String. Comma separated list of `name=value` pairs where
264
+ 'value' must follow the syntax described above.
265
+ type_map: A dictionary mapping hyperparameter names to types. Note every
266
+ parameter name in values must be a key in type_map. The values must
267
+ conform to the types indicated, where a value V is said to conform to a
268
+ type T if either V has type T, or V is a list of elements of type T.
269
+ Hence, for a multidimensional parameter 'x' taking float values,
270
+ 'x=[0.1,0.2]' will parse successfully if type_map['x'] = float.
271
+ ignore_unknown: Bool. Whether values that are missing a type in type_map
272
+ should be ignored. If set to True, a ValueError will not be raised for
273
+ unknown hyperparameter type.
274
+
275
+ Returns:
276
+ A python map mapping each name to either:
277
+ * A scalar value.
278
+ * A list of scalar values.
279
+ * A dictionary mapping index numbers to scalar values.
280
+ (e.g. "x=5,L=[1,2],arr[1]=3" results in {'x':5,'L':[1,2],'arr':{1:3}}")
281
+
282
+ Raises:
283
+ ValueError: If there is a problem with input.
284
+ * If `values` cannot be parsed.
285
+ * If a list is assigned to a list index (e.g. 'a[1] = [1,2,3]').
286
+ * If the same rvalue is assigned two different values (e.g. 'a=1,a=2',
287
+ 'a[1]=1,a[1]=2', or 'a=1,a=[1]')
288
+ """
289
+ results_dictionary = {}
290
+ pos = 0
291
+ while pos < len(values):
292
+ m = PARAM_RE.match(values, pos)
293
+ if not m:
294
+ raise ValueError('Malformed hyperparameter value: %s' % values[pos:])
295
+ # Check that there is a comma between parameters and move past it.
296
+ pos = m.end()
297
+ # Parse the values.
298
+ m_dict = m.groupdict()
299
+ name = m_dict['name']
300
+ if name not in type_map:
301
+ if ignore_unknown:
302
+ continue
303
+ raise ValueError('Unknown hyperparameter type for %s' % name)
304
+ type_ = type_map[name]
305
+
306
+ # Set up correct parsing function (depending on whether type_ is a bool)
307
+ if type_ == bool:
308
+ def parse_bool(value):
309
+ if value in ['true', 'True']:
310
+ return True
311
+ elif value in ['false', 'False']:
312
+ return False
313
+ else:
314
+ try:
315
+ return bool(int(value))
316
+ except ValueError:
317
+ _parse_fail(name, type_, value, values)
318
+
319
+ parse = parse_bool
320
+ else:
321
+ parse = type_
322
+
323
+ # If a single value is provided
324
+ if m_dict['val'] is not None:
325
+ _process_scalar_value(name, parse, type_, m_dict, values,
326
+ results_dictionary)
327
+
328
+ # If the assigned value is a list:
329
+ elif m_dict['vals'] is not None:
330
+ _process_list_value(name, parse, type_, m_dict, values,
331
+ results_dictionary)
332
+
333
+ else: # Not assigned a list or value
334
+ _parse_fail(name, type_, '', values)
335
+
336
+ return results_dictionary
337
+
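
To make the clause syntax documented above concrete, a small sketch of parse_values; the hyperparameter names and type_map entries below are illustrative only:

```python
# Sketch only: parsing a comma-separated hparam override string.
from tools.hparams import parse_values  # import path assumed

overrides = 'learning_rate=0.3,num_layers=4,activations=[relu,tanh],arr[1]=7'
type_map = {'learning_rate': float, 'num_layers': int, 'activations': str, 'arr': int}

print(parse_values(overrides, type_map))
# -> {'learning_rate': 0.3, 'num_layers': 4,
#     'activations': ['relu', 'tanh'], 'arr': {1: 7}}
```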
338
+
339
+ class HParams(object):
340
+ """Class to hold a set of hyperparameters as name-value pairs.
341
+
342
+ A `HParams` object holds hyperparameters used to build and train a model,
343
+ such as the number of hidden units in a neural net layer or the learning rate
344
+ to use when training.
345
+
346
+ You first create a `HParams` object by specifying the names and values of the
347
+ hyperparameters.
348
+
349
+ To make them easily accessible the parameter names are added as direct
350
+ attributes of the class. A typical usage is as follows:
351
+
352
+ ```python
353
+ # Create a HParams object specifying names and values of the model
354
+ # hyperparameters:
355
+ hparams = HParams(learning_rate=0.1, num_hidden_units=100)
356
+
357
+ # The hyperparameter are available as attributes of the HParams object:
358
+ hparams.learning_rate ==> 0.1
359
+ hparams.num_hidden_units ==> 100
360
+ ```
361
+
362
+ Hyperparameters have type, which is inferred from the type of their value
363
+ passed at construction time. The currently supported types are: integer,
364
+ float, boolean, string, and list of integer, float, boolean, or string.
365
+
366
+ You can override hyperparameter values by calling the
367
+ [`parse()`](#HParams.parse) method, passing a string of comma separated
368
+ `name=value` pairs. This is intended to make it possible to override
369
+ any hyperparameter values from a single command-line flag to which
370
+ the user passes 'hyper-param=value' pairs. It avoids having to define
371
+ one flag for each hyperparameter.
372
+
373
+ The syntax expected for each value depends on the type of the parameter.
374
+ See `parse()` for a description of the syntax.
375
+
376
+ Example:
377
+
378
+ ```python
379
+ # Define a command line flag to pass name=value pairs.
380
+ # For example using argparse:
381
+ import argparse
382
+ parser = argparse.ArgumentParser(description='Train my model.')
383
+ parser.add_argument('--hparams', type=str,
384
+ help='Comma separated list of "name=value" pairs.')
385
+ args = parser.parse_args()
386
+ ...
387
+ def my_program():
388
+ # Create a HParams object specifying the names and values of the
389
+ # model hyperparameters:
390
+ hparams = tf.HParams(learning_rate=0.1, num_hidden_units=100,
391
+ activations=['relu', 'tanh'])
392
+
393
+ # Override hyperparameters values by parsing the command line
394
+ hparams.parse(args.hparams)
395
+
396
+ # If the user passed `--hparams=learning_rate=0.3` on the command line
397
+ # then 'hparams' has the following attributes:
398
+ hparams.learning_rate ==> 0.3
399
+ hparams.num_hidden_units ==> 100
400
+ hparams.activations ==> ['relu', 'tanh']
401
+
402
+ # If the hyperparameters are in json format use parse_json:
403
+ hparams.parse_json('{"learning_rate": 0.3, "activations": "relu"}')
404
+ ```
405
+ """
406
+
407
+ _HAS_DYNAMIC_ATTRIBUTES = True # Required for pytype checks.
408
+
409
+ def __init__(self, model_structure=None, **kwargs):
410
+ """Create an instance of `HParams` from keyword arguments.
411
+
412
+ The keyword arguments specify name-values pairs for the hyperparameters.
413
+ The parameter types are inferred from the type of the values passed.
414
+
415
+ The parameter names are added as attributes of `HParams` object, so they
416
+ can be accessed directly with the dot notation `hparams._name_`.
417
+
418
+ Example:
419
+
420
+ ```python
421
+ # Define 3 hyperparameters: 'learning_rate' is a float parameter,
422
+ # 'num_hidden_units' an integer parameter, and 'activation' a string
423
+ # parameter.
424
+ hparams = tf.HParams(
425
+ learning_rate=0.1, num_hidden_units=100, activation='relu')
426
+
427
+ hparams.activation ==> 'relu'
428
+ ```
429
+
430
+ Note that a few names are reserved and cannot be used as hyperparameter
431
+ names. If you use one of the reserved names the constructor raises a
432
+ `ValueError`.
433
+
434
+ Args:
435
+ model_structure: An instance of ModelStructure, defining the feature
436
+ crosses to be used in the Trial.
437
+ **kwargs: Key-value pairs where the key is the hyperparameter name and
438
+ the value is the value for the parameter.
439
+
440
+ Raises:
441
+ ValueError: If both `hparam_def` and initialization values are provided,
442
+ or if one of the arguments is invalid.
443
+
444
+ """
445
+ # Register the hyperparameters and their type in _hparam_types.
446
+ # This simplifies the implementation of parse().
447
+ # _hparam_types maps the parameter name to a tuple (type, bool).
448
+ # The type value is the type of the parameter for scalar hyperparameters,
449
+ # or the type of the list elements for multidimensional hyperparameters.
450
+ # The bool value is True if the value is a list, False otherwise.
451
+ self._hparam_types = {}
452
+ self._model_structure = model_structure
453
+ for name, value in six.iteritems(kwargs):
454
+ self.add_hparam(name, value)
455
+
456
+ def __add__(self, other):
457
+ """
458
+ addition operation keeping key order
459
+ """
460
+ out = HParams()
461
+ for key in self._hparam_types.keys():
462
+ out.add_hparam(key, getattr(self, key))
463
+ for key in other._hparam_types.keys():
464
+ if getattr(out, key, None) is None: # add new param
465
+ out.add_hparam(key, getattr(other, key))
466
+ else: # update existing param
467
+ out.set_hparam(key, getattr(other, key))
468
+ return out
469
+
470
+ def __str__(self):
471
+ s = 'HParams(\n'
472
+ for key, val in six.iteritems(self.values()):
473
+ s += f'\t{key} = {val}\n'
474
+ # print('%s = %s' % (key, str(val)))
475
+ s += ')'
476
+ return s
477
+
478
+ def __repr__(self):
479
+ return self.__str__()
480
+
481
+ def add_hparam(self, name, value):
482
+ """Adds {name, value} pair to hyperparameters.
483
+
484
+ Args:
485
+ name: Name of the hyperparameter.
486
+ value: Value of the hyperparameter. Can be one of the following types:
487
+ int, float, string, int list, float list, or string list.
488
+
489
+ Raises:
490
+ ValueError: if one of the arguments is invalid.
491
+ """
492
+ # Keys in kwargs are unique, but 'name' could be the name of a pre-existing
493
+ # attribute of this object. In that case we refuse to use it as a
494
+ # hyperparameter name.
495
+ if getattr(self, name, None) is not None:
496
+ raise ValueError('Hyperparameter name is reserved: %s' % name)
497
+ if isinstance(value, (list, tuple)):
498
+ if not value:
499
+ raise ValueError(
500
+ 'Multi-valued hyperparameters cannot be empty: %s' % name)
501
+ self._hparam_types[name] = (type(value[0]), True)
502
+ else:
503
+ self._hparam_types[name] = (type(value), False)
504
+ setattr(self, name, value)
505
+
506
+ def set_hparam(self, name, value):
507
+ """Set the value of an existing hyperparameter.
508
+
509
+ This function verifies that the type of the value matches the type of the
510
+ existing hyperparameter.
511
+
512
+ Args:
513
+ name: Name of the hyperparameter.
514
+ value: New value of the hyperparameter.
515
+
516
+ Raises:
517
+ KeyError: If the hyperparameter doesn't exist.
518
+ ValueError: If there is a type mismatch.
519
+ """
520
+ param_type, is_list = self._hparam_types[name]
521
+ if isinstance(value, list):
522
+ if not is_list:
523
+ raise ValueError(
524
+ 'Must not pass a list for single-valued parameter: %s' % name)
525
+ setattr(self, name, [
526
+ _cast_to_type_if_compatible(name, param_type, v) for v in value])
527
+ else:
528
+ if is_list:
529
+ raise ValueError(
530
+ 'Must pass a list for multi-valued parameter: %s.' % name)
531
+ setattr(self, name, _cast_to_type_if_compatible(name, param_type, value))
532
+
533
+ def del_hparam(self, name):
534
+ """Removes the hyperparameter with key 'name'.
535
+
536
+ Does nothing if it isn't present.
537
+
538
+ Args:
539
+ name: Name of the hyperparameter.
540
+ """
541
+ if hasattr(self, name):
542
+ delattr(self, name)
543
+ del self._hparam_types[name]
544
+
545
+ def parse(self, values):
546
+ """Override existing hyperparameter values, parsing new values from a string.
547
+
548
+ See parse_values for more detail on the allowed format for values.
549
+
550
+ Args:
551
+ values: String. Comma separated list of `name=value` pairs where 'value'
552
+ must follow the syntax described above.
553
+
554
+ Returns:
555
+ The `HParams` instance.
556
+
557
+ Raises:
558
+ ValueError: If `values` cannot be parsed or a hyperparameter in `values`
559
+ doesn't exist.
560
+ """
561
+ type_map = {}
562
+ for name, t in self._hparam_types.items():
563
+ param_type, _ = t
564
+ type_map[name] = param_type
565
+
566
+ values_map = parse_values(values, type_map)
567
+ return self.override_from_dict(values_map)
568
+
569
+ def override_from_dict(self, values_dict):
570
+ """Override existing hyperparameter values, parsing new values from a dictionary.
571
+
572
+ Args:
573
+ values_dict: Dictionary of name:value pairs.
574
+
575
+ Returns:
576
+ The `HParams` instance.
577
+
578
+ Raises:
579
+ KeyError: If a hyperparameter in `values_dict` doesn't exist.
580
+ ValueError: If `values_dict` cannot be parsed.
581
+ """
582
+ for name, value in values_dict.items():
583
+ self.set_hparam(name, value)
584
+ return self
585
+
586
+ def set_model_structure(self, model_structure):
587
+ self._model_structure = model_structure
588
+
589
+ def get_model_structure(self):
590
+ return self._model_structure
591
+
592
+ def to_json(self, indent=None, separators=None, sort_keys=False):
593
+ """Serializes the hyperparameters into JSON.
594
+
595
+ Args:
596
+ indent: If a non-negative integer, JSON array elements and object members
597
+ will be pretty-printed with that indent level. An indent level of 0, or
598
+ negative, will only insert newlines. `None` (the default) selects the
599
+ most compact representation.
600
+ separators: Optional `(item_separator, key_separator)` tuple. Default is
601
+ `(', ', ': ')`.
602
+ sort_keys: If `True`, the output dictionaries will be sorted by key.
603
+
604
+ Returns:
605
+ A JSON string.
606
+ """
607
+ def remove_callables(x):
608
+ """Omit callable elements from input with arbitrary nesting."""
609
+ if isinstance(x, dict):
610
+ return {k: remove_callables(v) for k, v in six.iteritems(x)
611
+ if not callable(v)}
612
+ elif isinstance(x, list):
613
+ return [remove_callables(i) for i in x if not callable(i)]
614
+ return x
615
+ return json.dumps(
616
+ remove_callables(self.values()),
617
+ indent=indent,
618
+ separators=separators,
619
+ sort_keys=sort_keys)
620
+
621
+ def parse_json(self, values_json):
622
+ """Override existing hyperparameter values, parsing new values from a json object.
623
+
624
+ Args:
625
+ values_json: String containing a json object of name:value pairs.
626
+
627
+ Returns:
628
+ The `HParams` instance.
629
+
630
+ Raises:
631
+ KeyError: If a hyperparameter in `values_json` doesn't exist.
632
+ ValueError: If `values_json` cannot be parsed.
633
+ """
634
+ values_map = json.loads(values_json)
635
+ return self.override_from_dict(values_map)
636
+
637
+ def values(self):
638
+ """Return the hyperparameter values as a Python dictionary.
639
+
640
+ Returns:
641
+ A dictionary with hyperparameter names as keys. The values are the
642
+ hyperparameter values.
643
+ """
644
+ return {n: getattr(self, n) for n in self._hparam_types.keys()}
645
+
646
+ def get(self, key, default=None):
647
+ """Returns the value of `key` if it exists, else `default`."""
648
+ if key in self._hparam_types:
649
+ # Ensure that default is compatible with the parameter type.
650
+ if default is not None:
651
+ param_type, is_param_list = self._hparam_types[key]
652
+ type_str = 'list<%s>' % param_type if is_param_list else str(param_type)
653
+ fail_msg = ("Hparam '%s' of type '%s' is incompatible with "
654
+ 'default=%s' % (key, type_str, default))
655
+
656
+ is_default_list = isinstance(default, list)
657
+ if is_param_list != is_default_list:
658
+ raise ValueError(fail_msg)
659
+
660
+ try:
661
+ if is_default_list:
662
+ for value in default:
663
+ _cast_to_type_if_compatible(key, param_type, value)
664
+ else:
665
+ _cast_to_type_if_compatible(key, param_type, default)
666
+ except ValueError as e:
667
+ raise ValueError('%s. %s' % (fail_msg, e))
668
+
669
+ return getattr(self, key)
670
+
671
+ return default
672
+
673
+ def __contains__(self, key):
674
+ return key in self._hparam_types
675
+
676
+ @staticmethod
677
+ def _get_kind_name(param_type, is_list):
678
+ """Returns the field name given parameter type and is_list.
679
+
680
+ Args:
681
+ param_type: Data type of the hparam.
682
+ is_list: Whether this is a list.
683
+
684
+ Returns:
685
+ A string representation of the field name.
686
+
687
+ Raises:
688
+ ValueError: If parameter type is not recognized.
689
+ """
690
+ if issubclass(param_type, bool):
691
+ # This check must happen before issubclass(param_type, six.integer_types),
692
+ # since Python considers bool to be a subclass of int.
693
+ typename = 'bool'
694
+ elif issubclass(param_type, six.integer_types):
695
+ # Setting 'int' and 'long' types to be 'int64' to ensure the type is
696
+ # compatible with both Python2 and Python3.
697
+ typename = 'int64'
698
+ elif issubclass(param_type, (six.string_types, six.binary_type)):
699
+ # Setting 'string' and 'bytes' types to be 'bytes' to ensure the type is
700
+ # compatible with both Python2 and Python3.
701
+ typename = 'bytes'
702
+ elif issubclass(param_type, float):
703
+ typename = 'float'
704
+ else:
705
+ raise ValueError('Unsupported parameter type: %s' % str(param_type))
706
+
707
+ suffix = 'list' if is_list else 'value'
708
+ return '_'.join([typename, suffix])
709
+
710
711
+ def save_config(self, output_file, verbose=True):
712
+ def convert(o): # json cannot serialize integer in np.int64 format
713
+ if isinstance(o, np.int64):
714
+ return int(o)
715
+ raise TypeError
716
+ if verbose:
717
+ print(self)
718
+ with open(output_file, 'w') as f:
719
+ json.dump(self.values(), f, indent=True, default=convert)
720
+
721
+ @staticmethod
722
+ def load_config(config_file, verbose=True):
723
+ """
724
+ parse hparams from config file
725
+ :param config_file: json config file
726
+ :param verbose: print out values
727
+ """
728
+ try:
729
+ with open(config_file, 'r') as fin:
730
+ json_dict = json.loads(fin.read())
731
+ hps = HParams(**json_dict)
732
+ if verbose:
733
+ print_config(hps)
734
+ except Exception as e:
735
+ print('Error reading config file %s: %s.\nConfig will not be updated.' % (config_file, e))
+ hps = None
736
+ return hps
737
+
738
739
+ def clone(self):
740
+ """
741
+ return a deep copy of this object
742
+ """
743
+ return HParams(**self.values())
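
A minimal usage sketch for the HParams helper above; the hyperparameter names, values and output path below are illustrative placeholders, not taken from the repo:

hp = HParams(lr=1e-4, batch_size=8, model='unet')
hp.add_hparam('secret_len', 100)           # register a new hyperparameter
hp.parse('lr=2e-4,batch_size=16')          # override from a comma-separated string
print(hp.get('lr'), hp.batch_size)         # 0.0002 16
print(hp.to_json(indent=2))                # serialize the current values
hp.save_config('hparams.json')             # dump to disk via json.dump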
tools/image_dataset.py ADDED
@@ -0,0 +1,184 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ imagefolder loader
5
+ inspired from https://github.com/adambielski/siamese-triplet/blob/master/datasets.py
6
+ @author: Tu Bui @surrey.ac.uk
7
+ """
8
+ from __future__ import absolute_import
9
+ from __future__ import division
10
+ from __future__ import print_function
11
+ import os
12
+ import sys
13
+ import io
14
+ import time
15
+ import pandas as pd
16
+ import numpy as np
17
+ import random
18
+ from PIL import Image
19
+ from typing import Any, Callable, List, Optional, Tuple
20
+ import torch
21
+ from torchvision import transforms
22
+ from .base_lmdb import PILlmdb, ArrayDatabase
23
+ # from . import debug
24
+
25
+
26
+ def worker_init_fn(worker_id):
27
+ # to be passed to torch.utils.data.DataLoader to fix the
28
+ # random seed issue with numpy in multi-worker settings
29
+ torch_seed = torch.initial_seed()
30
+ random.seed(torch_seed + worker_id)
31
+ if torch_seed >= 2**30: # make sure torch_seed + worker_id < 2**32
32
+ torch_seed = torch_seed % 2**30
33
+ np.random.seed(torch_seed + worker_id)
34
+
35
+
36
+ def pil_loader(path: str) -> Image.Image:
37
+ # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
38
+ with open(path, 'rb') as f:
39
+ img = Image.open(f)
40
+ return img.convert('RGB')
41
+
42
+
43
+ def dataset_wrapper(data_dir, data_list, **kwargs):
44
+ if os.path.exists(os.path.join(data_dir, 'data.mdb')):
45
+ return ImageDataset(data_dir, data_list, **kwargs)
46
+ else:
47
+ return ImageFolder(data_dir, data_list, **kwargs)
48
+
49
+
50
+ class ImageFolder(torch.utils.data.Dataset):
51
+ _repr_indent = 4
52
+ def __init__(self, data_dir, data_list, secret_len=100, resize=256, transform=None, **kwargs):
53
+ super().__init__()
54
+ self.transform = transforms.RandomResizedCrop((resize, resize), scale=(0.8, 1.0), ratio=(0.75, 1.3333333333333333)) if transform is None else transform
55
+ self.build_data(data_dir, data_list, **kwargs)
56
+ self.kwargs = kwargs
57
+ self.secret_len = secret_len
58
+
59
+ def build_data(self, data_dir, data_list, **kwargs):
60
+ self.data_dir = data_dir
61
+ if isinstance(data_list, list):
62
+ self.data_list = data_list
63
+ elif isinstance(data_list, str):
64
+ self.data_list = pd.read_csv(data_list)['path'].tolist()
65
+ elif isinstance(data_list, pd.DataFrame):
66
+ self.data_list = data_list['path'].tolist()
67
+ else:
68
+ raise ValueError('data_list must be a list, str or pd.DataFrame')
69
+ self.N = len(self.data_list)
70
+
71
+ def __getitem__(self, index):
72
+ path = self.data_list[index]
73
+ img = pil_loader(os.path.join(self.data_dir, path))
74
+ img = self.transform(img)
75
+ img = np.array(img, dtype=np.float32)/127.5-1. # [-1, 1]
76
+ secret = torch.zeros(self.secret_len, dtype=torch.float).random_(0, 2)
77
+ return {'image': img, 'secret': secret} # {'img': x, 'index': index}
78
+
79
+ def __len__(self) -> int:
80
+ # raise NotImplementedError
81
+ return self.N
82
+
83
+ class ImageDataset(torch.utils.data.Dataset):
84
+ r"""
85
+ Customised Image Folder class for pytorch.
86
+ Accept lmdb and a csv list as the input.
87
+ Usage:
88
+ dataset = ImageDataset(img_dir, img_list)
89
+ dataset.set_transform(some_pytorch_transforms)
90
+ loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True,
91
+ num_workers=4, worker_init_fn=worker_init_fn)
92
+ for x,y in loader:
93
+ # x and y are the input and target (dict); the keys can be customised.
94
+ """
95
+ _repr_indent = 4
96
+ def __init__(self, data_dir, data_list, secret_len=100, resize=None, transform=None, target_transform=None, **kwargs):
97
+ super().__init__()
98
+ if resize is not None:
99
+ self.resize = transforms.Resize((resize, resize))
100
+ self.set_transform(transform, target_transform)
101
+ self.build_data(data_dir, data_list, **kwargs)
102
+ self.secret_len = secret_len
103
+ self.kwargs = kwargs
104
+
105
+ def set_transform(self, transform, target_transform=None):
106
+ self.transform, self.target_transform = transform, target_transform
107
+
108
+ def build_data(self, data_dir, data_list, **kwargs):
109
+ """
110
+ Args:
111
+ data_list (text file) must have at least 3 fields: id, path and label
112
+
113
+ This method must create an attribute self.samples containing ID, input and target samples; and another attribute N storing the dataset size
114
+
115
+ Optional attributes: classes (list of unique classes), group (useful for
116
+ metric learning)
117
+ """
118
+ self.data_dir, self.list = data_dir, data_list
119
+ if ('dtype' in kwargs) and (kwargs['dtype'].lower() == 'array'):
120
+ data = ArrayDatabase(data_dir, data_list)
121
+ else:
122
+ data = PILlmdb(data_dir, data_list, **kwargs)
123
+ self.N = len(data)
124
+ self.classes = np.unique(data.labels)
125
+ self.samples = {'x': data, 'y': data.labels}
126
+ # assert isinstance(data_list, str) or isinstance(data_list, pd.DataFrame)
127
+ # df = pd.read_csv(data_list) if isinstance(data_list, str) else data_list
128
+ # assert 'id' in df and 'label' in df, f'[DATA] Error! {data_list} must contains "id" and "label".'
129
+ # ids = df['id'].tolist()
130
+ # labels = np.array(df['label'].tolist())
131
+ # data = PILlmdb(data_dir)
132
+ # assert set(ids).issubset(set(data.keys)) # ids should exist in lmdb
133
+ # self.N = len(ids)
134
+ # self.classes, inds = np.unique(labels, return_index=True)
135
+ # self.samples = {'id': ids, 'x': data, 'y': labels}
136
+
137
+ def set_ids(self, ids):
138
+ self.samples['x'].set_ids(ids)
139
+ self.samples['y'] = [self.samples['y'][i] for i in ids]
140
+ self.N = len(self.samples['x'])
141
+
142
+ def __getitem__(self, index: int) -> Any:
143
+ """
144
+ Args:
145
+ index (int): Index
146
+ Returns:
147
+ dict: (x: sample, y: target, **kwargs)
148
+ """
149
+ x, y = self.samples['x'][index], self.samples['y'][index]
150
+ if hasattr(self, 'resize'):
151
+ x = self.resize(x)
152
+ if self.transform is not None:
153
+ x = self.transform(x)
154
+ if self.target_transform is not None:
155
+ y = self.target_transform(y)
156
+ x = np.array(x, dtype=np.float32)/127.5-1.
157
+ secret = torch.zeros(self.secret_len, dtype=torch.float).random_(0, 2)
158
+ return {'image': x, 'secret': secret} # {'img': x, 'index': index}
159
+
160
+ def __len__(self) -> int:
161
+ # raise NotImplementedError
162
+ return self.N
163
+
164
+ def __repr__(self) -> str:
165
+ head = "\nDataset " + self.__class__.__name__
166
+ body = ["Number of datapoints: {}".format(self.__len__())]
167
+ if hasattr(self, 'data_dir') and self.data_dir is not None:
168
+ body.append("data_dir location: {}".format(self.data_dir))
169
+ if hasattr(self, 'kwargs'):
170
+ body.append(f'kwargs: {self.kwargs}')
171
+ body += self.extra_repr().splitlines()
172
+ if hasattr(self, "transform") and self.transform is not None:
173
+ body += [repr(self.transform)]
174
+ lines = [head] + [" " * self._repr_indent + line for line in body]
175
+ return '\n'.join(lines)
176
+
177
+ def _format_transform_repr(self, transform: Callable, head: str) -> List[str]:
178
+ lines = transform.__repr__().splitlines()
179
+ return (["{}{}".format(head, lines[0])] +
180
+ ["{}{}".format(" " * len(head), line) for line in lines[1:]])
181
+
182
+ def extra_repr(self) -> str:
183
+ return ""
184
+
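
For reference, a short sketch of how the ImageFolder / dataset_wrapper classes above might be driven; the image folder, the csv (assumed to have a 'path' column) and the batch size are placeholders:

from torch.utils.data import DataLoader

dataset = dataset_wrapper('data/images', 'data/list.csv', secret_len=100, resize=256)
loader = DataLoader(dataset, batch_size=8, shuffle=True,
                    num_workers=4, worker_init_fn=worker_init_fn)
batch = next(iter(loader))
print(batch['image'].shape, batch['secret'].shape)  # e.g. (8, 256, 256, 3) and (8, 100)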
tools/image_dataset_generic.py ADDED
@@ -0,0 +1,157 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ imagefolder loader
5
+ inspired from https://github.com/adambielski/siamese-triplet/blob/master/datasets.py
6
+ @author: Tu Bui @surrey.ac.uk
7
+ """
8
+ from __future__ import absolute_import
9
+ from __future__ import division
10
+ from __future__ import print_function
11
+ import os
12
+ import sys
13
+ import io
14
+ import time
15
+ import pandas as pd
16
+ import numpy as np
17
+ import random
18
+ from PIL import Image
19
+ from typing import Any, Callable, List, Optional, Tuple
20
+ import torch
21
+ from .base_lmdb import PILlmdb, ArrayDatabase
22
+ from torchvision import transforms
23
+ # from . import debug
24
+
25
+
26
+ def worker_init_fn(worker_id):
27
+ # to be passed to torch.utils.data.DataLoader to fix the
28
+ # random seed issue with numpy in multi-worker settings
29
+ torch_seed = torch.initial_seed()
30
+ random.seed(torch_seed + worker_id)
31
+ if torch_seed >= 2**30: # make sure torch_seed + worker_id < 2**32
32
+ torch_seed = torch_seed % 2**30
33
+ np.random.seed(torch_seed + worker_id)
34
+
35
+
36
+ def pil_loader(path: str) -> Image.Image:
37
+ # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
38
+ with open(path, 'rb') as f:
39
+ img = Image.open(f)
40
+ return img.convert('RGB')
41
+
42
+
43
+ class ImageDataset(torch.utils.data.Dataset):
44
+ r"""
45
+ Customised Image Folder class for pytorch.
46
+ Accept lmdb and a csv list as the input.
47
+ Usage:
48
+ dataset = ImageDataset(img_dir, img_list)
49
+ dataset.set_transform(some_pytorch_transforms)
50
+ loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True,
51
+ num_workers=4, worker_init_fn=worker_init_fn)
52
+ for x,y in loader:
53
+ # x and y are the input and target (dict); the keys can be customised.
54
+ """
55
+ _repr_indent = 4
56
+ def __init__(self, data_dir, data_list, secret_len=100, transform=None, target_transform=None, **kwargs):
57
+ super().__init__()
58
+ self.set_transform(transform, target_transform)
59
+ self.build_data(data_dir, data_list, **kwargs)
60
+ self.secret_len = secret_len
61
+ self.kwargs = kwargs
62
+
63
+ def set_transform(self, transform, target_transform=None):
64
+ self.transform, self.target_transform = transform, target_transform
65
+
66
+ def build_data(self, data_dir, data_list, **kwargs):
67
+ """
68
+ Args:
69
+ data_list (text file) must have at least 3 fields: id, path and label
70
+
71
+ This method must create an attribute self.samples containing ID, input and target samples; and another attribute N storing the dataset size
72
+
73
+ Optional attributes: classes (list of unique classes), group (useful for
74
+ metric learning)
75
+ """
76
+ self.data_dir, self.list = data_dir, data_list
77
+ if ('dtype' in kwargs) and (kwargs['dtype'].lower() == 'array'):
78
+ data = ArrayDatabase(data_dir, data_list)
79
+ else:
80
+ data = PILlmdb(data_dir, data_list, **kwargs)
81
+ self.N = len(data)
82
+ self.classes = np.unique(data.labels)
83
+ self.samples = {'x': data, 'y': data.labels}
84
+
85
+ def __getitem__(self, index: int) -> Any:
86
+ """
87
+ Args:
88
+ index (int): Index
89
+ Returns:
90
+ dict: (x: sample, y: target, **kwargs)
91
+ """
92
+ x, y = self.samples['x'][index], self.samples['y'][index]
93
+ if self.transform is not None:
94
+ x = self.transform(x)
95
+ if self.target_transform is not None:
96
+ y = self.target_transform(y)
97
+ x = np.array(x, dtype=np.float32)/127.5-1.
98
+ secret = torch.zeros(self.secret_len, dtype=torch.float).random_(0, 2)
99
+ return {'image': x, 'secret': secret} # {'img': x, 'index': index}
100
+
101
+ def __len__(self) -> int:
102
+ # raise NotImplementedError
103
+ return self.N
104
+
105
+ def __repr__(self) -> str:
106
+ head = "\nDataset " + self.__class__.__name__
107
+ body = ["Number of datapoints: {}".format(self.__len__())]
108
+ if hasattr(self, 'data_dir') and self.data_dir is not None:
109
+ body.append("data_dir location: {}".format(self.data_dir))
110
+ if hasattr(self, 'kwargs'):
111
+ body.append(f'kwargs: {self.kwargs}')
112
+ body += self.extra_repr().splitlines()
113
+ if hasattr(self, "transform") and self.transform is not None:
114
+ body += [repr(self.transform)]
115
+ lines = [head] + [" " * self._repr_indent + line for line in body]
116
+ return '\n'.join(lines)
117
+
118
+ def _format_transform_repr(self, transform: Callable, head: str) -> List[str]:
119
+ lines = transform.__repr__().splitlines()
120
+ return (["{}{}".format(head, lines[0])] +
121
+ ["{}{}".format(" " * len(head), line) for line in lines[1:]])
122
+
123
+ def extra_repr(self) -> str:
124
+ return ""
125
+
126
+ class ImageFolder(torch.utils.data.Dataset):
127
+ _repr_indent = 4
128
+ def __init__(self, data_dir, data_list, secret_len=100, resize=256, transform=None, **kwargs):
129
+ super().__init__()
130
+ self.transform = transforms.Resize((resize, resize)) if transform is None else transform
131
+ self.build_data(data_dir, data_list, **kwargs)
132
+ self.kwargs = kwargs
133
+ self.secret_len = secret_len
134
+
135
+ def build_data(self, data_dir, data_list, **kwargs):
136
+ self.data_dir = data_dir
137
+ if isinstance(data_list, list):
138
+ self.data_list = data_list
139
+ elif isinstance(data_list, str):
140
+ self.data_list = pd.read_csv(data_list)['path'].tolist()
141
+ elif isinstance(data_list, pd.DataFrame):
142
+ self.data_list = data_list['path'].tolist()
143
+ else:
144
+ raise ValueError('data_list must be a list, str or pd.DataFrame')
145
+ self.N = len(self.data_list)
146
+
147
+ def __getitem__(self, index):
148
+ path = self.data_list[index]
149
+ img = pil_loader(os.path.join(self.data_dir, path))
150
+ img = self.transform(img)
151
+ img = np.array(img, dtype=np.float32)/127.5-1. # [-1, 1]
152
+ secret = torch.zeros(self.secret_len, dtype=torch.float).random_(0, 2) # not used
153
+ return {'image': img, 'secret': secret} # {'img': x, 'index': index}
154
+
155
+ def __len__(self) -> int:
156
+ # raise NotImplementedError
157
+ return self.N
tools/image_tools.py ADDED
@@ -0,0 +1,164 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+
5
+ @author: Tu Bui @surrey.ac.uk
6
+ """
7
+ from __future__ import absolute_import
8
+ from __future__ import division
9
+ from __future__ import print_function
10
+ from scipy import fftpack
11
+ import sys, os
12
+ from pathlib import Path
13
+ import numpy as np
14
+ import random
15
+ import glob
16
+ import json
17
+ import time
18
+ import importlib
19
+ import pandas as pd
20
+ from tqdm import tqdm
21
+ # from IPython.display import display
22
+ # import seaborn as sns
23
+ import matplotlib
24
+ # matplotlib.use('Agg') # headless run
25
+ import matplotlib.pyplot as plt
26
+ import matplotlib.patches as mpatches
27
+ from PIL import Image, ImageDraw, ImageFont
28
+ cmap = plt.get_cmap("tab10") # cmap as function
29
+ cmap = plt.rcParams['axes.prop_cycle'].by_key()['color'] # cmap
30
+
31
+ FONT = '/vol/research/tubui1/_base/utils/FreeSans.ttf'
32
+
33
+ # def imshow(im):
34
+ # if type(im) is np.ndarray:
35
+ # im = Image.fromarray(im)
36
+ # display(im)
37
+
38
+ def make_grid(array_list, gsize=(3,3)):
39
+ """
40
+ make a grid image from a list of image array (RGB)
41
+ return: array RGB
42
+ """
43
+ assert len(gsize)==2 and gsize[0]*gsize[1]==len(array_list)
44
+ h,w,c = array_list[0].shape
45
+ out = np.array(array_list).reshape(gsize[0], gsize[1], h, w, c).transpose(0, 2, 1, 3, 4).reshape(gsize[0]*h, gsize[1]*w, c)
46
+ return out
47
+
48
+ def collage(im_list, size=None, pad=0, color=255):
49
+ """
50
+ generalised function of make_grid()
51
+ work on PIL/numpy images of arbitrary size
52
+ """
53
+ if size is None:
54
+ size=(1, len(im_list))
55
+ assert len(size)==2
56
+ if isinstance(im_list[0], np.ndarray):
57
+ im_list = [Image.fromarray(im) for im in im_list]
58
+ h, w = size
59
+ n = len(im_list)
60
+ canvas = []
61
+ for i in range(h):
62
+ start, end = i*w, min((i+1)*w, n)
63
+ row = combine_horz(im_list[start:end], pad, color)
64
+ canvas.append(row)
65
+ canvas = combine_vert(canvas, pad, color)
66
+ return canvas
67
+
68
+ def combine_horz(pil_ims, pad=0, c=255):
69
+ """
70
+ Combines multiple pil_ims into a single side-by-side PIL image object.
71
+ """
72
+ widths, heights = zip(*(i.size for i in pil_ims))
73
+ total_width = sum(widths) + (len(pil_ims)-1) * pad
74
+ max_height = max(heights)
75
+ color = (c,c,c)
76
+ new_im = Image.new('RGB', (total_width, max_height), color)
77
+ x_offset = 0
78
+ for im in pil_ims:
79
+ new_im.paste(im, (x_offset,0))
80
+ x_offset += (im.size[0] + pad)
81
+ return new_im
82
+
83
+
84
+ def combine_vert(pil_ims, pad=0, c=255):
85
+ """
86
+ Combines multiple pil_ims into a single vertical PIL image object.
87
+ """
88
+ widths, heights = zip(*(i.size for i in pil_ims))
89
+ max_width = max(widths)
90
+ total_height = sum(heights) + (len(pil_ims)-1)*pad
91
+ color = (c,c,c)
92
+ new_im = Image.new('RGB', (max_width, total_height), color)
93
+ y_offset = 0
94
+ for im in pil_ims:
95
+ new_im.paste(im, (0,y_offset))
96
+ y_offset += (im.size[1] + pad)
97
+ return new_im
98
+
99
+ def make_text_image(img_shape=(100,20), text='hello', font_path=FONT, offset=(0,0), font_size=16):
100
+ """
101
+ make a text image with given width/height and font size
102
+ Args:
103
+ img_shape, offset tuple (width, height)
104
+ font_path path to font file (TrueType)
105
+ font_size max font size, actual may be smaller
106
+
107
+ Return:
108
+ pil image
109
+ """
110
+ im = Image.new('RGB', tuple(img_shape), (255,255,255))
111
+ draw = ImageDraw.Draw(im)
112
+
113
+ def get_font_size(max_font_size):
114
+ font = ImageFont.truetype(font_path, max_font_size)
115
+ text_size = font.getsize(text) # (w,h)
116
+ start_w = int((img_shape[0] - text_size[0]) / 2)
117
+ start_h = int((img_shape[1] - text_size[1])/2)
118
+ if start_h <0 or start_w < 0:
119
+ return get_font_size(max_font_size-2)
120
+ else:
121
+ return font, (start_w, start_h)
122
+ font, pos = get_font_size(font_size)
123
+ pos = (pos[0]+offset[0], pos[1]+offset[1])
124
+ draw.text(pos, text, font=font, fill=0)
125
+ return im
126
+
127
+
128
+ def log_scale(array, epsilon=1e-12):
129
+ """Log scale the input array.
130
+ """
131
+ array = np.abs(array)
132
+ array += epsilon # no zero in log
133
+ array = np.log(array)
134
+ return array
135
+
136
+ def dct2(array):
137
+ """2D DCT"""
138
+ array = fftpack.dct(array, type=2, norm="ortho", axis=0)
139
+ array = fftpack.dct(array, type=2, norm="ortho", axis=1)
140
+ return array
141
+
142
+ def idct2(array):
143
+ """inverse 2D DCT"""
144
+ array = fftpack.idct(array, type=2, norm="ortho", axis=0)
145
+ array = fftpack.idct(array, type=2, norm="ortho", axis=1)
146
+ return array
147
+
148
+
149
+ class DCT(object):
150
+ def __init__(self, log=True):
151
+ self.log = log
152
+
153
+ def __call__(self, x):
154
+ x = np.array(x)
155
+ x = dct2(x)
156
+ if self.log:
157
+ x = log_scale(x)
158
+ # normalize
159
+ x = np.clip((x - x.min())/(x.max() - x.min()) * 255, 0, 255).astype(np.uint8)
160
+ return Image.fromarray(x)
161
+
162
+ def __repr__(self):
163
+ s = f'(Discrete Cosine Transform, logarithm={self.log})'
164
+ return self.__class__.__name__ + s
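
A short sketch exercising the grid/collage helpers above on dummy tiles; the random arrays and output filename are placeholders:

import numpy as np

tiles = [np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8) for _ in range(4)]
grid = make_grid(tiles, gsize=(2, 2))        # (128, 128, 3) numpy array
canvas = collage(tiles, size=(2, 2), pad=4)  # PIL image with 4-pixel white padding
canvas.save('collage.png')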
tools/imgcap_dataset.py ADDED
@@ -0,0 +1,163 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Dataset class for image-caption
5
+ @author: Tu Bui @University of Surrey
6
+ """
7
+ import json
8
+ from PIL import Image
9
+ import numpy as np
10
+ from pathlib import Path
11
+ import torch
12
+ from torch.utils.data import Dataset, DataLoader
13
+ from functools import partial
14
+ import pytorch_lightning as pl
15
+ from ldm.util import instantiate_from_config
16
+ import pandas as pd
17
+
18
+
19
+ def worker_init_fn(_):
20
+ worker_info = torch.utils.data.get_worker_info()
21
+ worker_id = worker_info.id
22
+ return np.random.seed(np.random.get_state()[1][0] + worker_id)
23
+
24
+
25
+ class WrappedDataset(Dataset):
26
+ """Wraps an arbitrary object with __len__ and __getitem__ into a pytorch dataset"""
27
+
28
+ def __init__(self, dataset):
29
+ self.data = dataset
30
+
31
+ def __len__(self):
32
+ return len(self.data)
33
+
34
+ def __getitem__(self, idx):
35
+ return self.data[idx]
36
+
37
+
38
+ class DataModuleFromConfig(pl.LightningDataModule):
39
+ def __init__(self, batch_size, train=None, validation=None, test=None, predict=None, wrap=False, num_workers=None, shuffle_test_loader=False, use_worker_init_fn=False,
40
+ shuffle_val_dataloader=False):
41
+ super().__init__()
42
+ self.batch_size = batch_size
43
+ self.dataset_configs = dict()
44
+ self.num_workers = num_workers if num_workers is not None else batch_size * 2
45
+ self.use_worker_init_fn = use_worker_init_fn
46
+ if train is not None:
47
+ self.dataset_configs["train"] = train
48
+ self.train_dataloader = self._train_dataloader
49
+ if validation is not None:
50
+ self.dataset_configs["validation"] = validation
51
+ self.val_dataloader = partial(self._val_dataloader, shuffle=shuffle_val_dataloader)
52
+ if test is not None:
53
+ self.dataset_configs["test"] = test
54
+ self.test_dataloader = partial(self._test_dataloader, shuffle=shuffle_test_loader)
55
+ if predict is not None:
56
+ self.dataset_configs["predict"] = predict
57
+ self.predict_dataloader = self._predict_dataloader
58
+ self.wrap = wrap
59
+
60
+ def prepare_data(self):
61
+ for data_cfg in self.dataset_configs.values():
62
+ instantiate_from_config(data_cfg)
63
+
64
+ def setup(self, stage=None):
65
+ self.datasets = dict(
66
+ (k, instantiate_from_config(self.dataset_configs[k]))
67
+ for k in self.dataset_configs)
68
+ if self.wrap:
69
+ for k in self.datasets:
70
+ self.datasets[k] = WrappedDataset(self.datasets[k])
71
+
72
+ def _train_dataloader(self):
73
+ if self.use_worker_init_fn:
74
+ init_fn = worker_init_fn
75
+ else:
76
+ init_fn = None
77
+ return DataLoader(self.datasets["train"], batch_size=self.batch_size,
78
+ num_workers=self.num_workers, shuffle=True,
79
+ worker_init_fn=init_fn)
80
+
81
+ def _val_dataloader(self, shuffle=False):
82
+ if self.use_worker_init_fn:
83
+ init_fn = worker_init_fn
84
+ else:
85
+ init_fn = None
86
+ return DataLoader(self.datasets["validation"],
87
+ batch_size=self.batch_size,
88
+ num_workers=self.num_workers,
89
+ worker_init_fn=init_fn,
90
+ shuffle=shuffle)
91
+
92
+ def _test_dataloader(self, shuffle=False):
93
+ if self.use_worker_init_fn:
94
+ init_fn = worker_init_fn
95
+ else:
96
+ init_fn = None
97
+
98
+ return DataLoader(self.datasets["test"], batch_size=self.batch_size,
99
+ num_workers=self.num_workers, worker_init_fn=init_fn, shuffle=shuffle)
100
+
101
+ def _predict_dataloader(self, shuffle=False):
102
+ if self.use_worker_init_fn:
103
+ init_fn = worker_init_fn
104
+ else:
105
+ init_fn = None
106
+ return DataLoader(self.datasets["predict"], batch_size=self.batch_size,
107
+ num_workers=self.num_workers, worker_init_fn=init_fn)
108
+
109
+
110
+ class ImageCaptionRaw(Dataset):
111
+ def __init__(self, image_dir, caption_file, secret_len=100, transform=None):
112
+ super().__init__()
113
+ self.image_dir = Path(image_dir)
114
+ self.data = []
115
+ with open(caption_file, 'rt') as f:
116
+ for line in f:
117
+ self.data.append(json.loads(line))
118
+ self.secret_len = secret_len
119
+ self.transform = transform
120
+
121
+ def __len__(self):
122
+ return len(self.data)
123
+
124
+ def __getitem__(self, idx):
125
+ item = self.data[idx]
126
+ image = Image.open(self.image_dir/item['image']).convert('RGB').resize((512,512))
127
+ caption = item['captions']
128
+ cid = torch.randint(0, len(caption), (1,)).item()
129
+ caption = caption[cid]
130
+ if self.transform is not None:
131
+ image = self.transform(image)
132
+
133
+ image = np.array(image, dtype=np.float32)/ 255.0 # normalize to [0, 1]
134
+ target = image * 2.0 - 1.0 # normalize to [-1, 1]
135
+ secret = torch.zeros(self.secret_len, dtype=torch.float).random_(0, 2)
136
+ return dict(image=image, caption=caption, target=target, secret=secret)
137
+
138
+
139
+ class BAMFG(Dataset):
140
+ def __init__(self, style_dir, gt_dir, data_list, transform=None):
141
+ super().__init__()
142
+ self.style_dir = Path(style_dir)
143
+ self.gt_dir = Path(gt_dir)
144
+ self.data = pd.read_csv(data_list)
145
+ self.transform = transform
146
+
147
+ def __len__(self):
148
+ return len(self.data)
149
+
150
+ def __getitem__(self, idx):
151
+ item = self.data.iloc[idx]
152
+ gt_img = Image.open(self.gt_dir/item['gt_img']).convert('RGB').resize((512,512))
153
+ style_img = Image.open(self.style_dir/item['style_img']).convert('RGB').resize((512,512))
154
+ txt = item['prompt']
155
+ if self.transform is not None:
156
+ gt_img = self.transform(gt_img)
157
+ style_img = self.transform(style_img)
158
+
159
+ gt_img = np.array(gt_img, dtype=np.float32)/ 255.0 # normalize to [0, 1]
160
+ style_img = np.array(style_img, dtype=np.float32)/ 255.0 # normalize to [0, 1]
161
+ target = gt_img * 2.0 - 1.0 # normalize to [-1, 1]
162
+
163
+ return dict(image=gt_img, txt=txt, hint=style_img)
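
A sketch of wiring ImageCaptionRaw into DataModuleFromConfig; the 'target'/'params' layout is the dict convention expected by ldm.util.instantiate_from_config, while the dotted class path, data locations and batch size below are placeholders:

train_cfg = {
    'target': 'tools.imgcap_dataset.ImageCaptionRaw',
    'params': {'image_dir': 'data/images',
               'caption_file': 'data/captions.jsonl',
               'secret_len': 100},
}
dm = DataModuleFromConfig(batch_size=4, train=train_cfg, num_workers=4)
dm.prepare_data()
dm.setup()
batch = next(iter(dm.train_dataloader()))
print(batch['image'].shape, batch['secret'].shape)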
tools/sifid.py ADDED
@@ -0,0 +1,246 @@
1
+ import numpy as np
2
+ import torch
3
+ from scipy import linalg
4
+ import torchvision
5
+ from torchvision import transforms
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ from PIL import Image
9
+
10
+
11
+ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
12
+ """Numpy implementation of the Frechet Distance.
13
+ The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
14
+ and X_2 ~ N(mu_2, C_2) is
15
+ d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
16
+ Stable version by Dougal J. Sutherland.
17
+ Params:
18
+ -- mu1 : Numpy array containing the activations of a layer of the
19
+ inception net (like returned by the function 'get_predictions')
20
+ for generated samples.
21
+ -- mu2 : The sample mean over activations, precalculated on a
22
+ representative data set.
23
+ -- sigma1: The covariance matrix over activations for generated samples.
24
+ -- sigma2: The covariance matrix over activations, precalculated on a
25
+ representative data set.
26
+ Returns:
27
+ -- : The Frechet Distance.
28
+ """
29
+
30
+ mu1 = np.atleast_1d(mu1)
31
+ mu2 = np.atleast_1d(mu2)
32
+
33
+ sigma1 = np.atleast_2d(sigma1)
34
+ sigma2 = np.atleast_2d(sigma2)
35
+
36
+ assert mu1.shape == mu2.shape, \
37
+ 'Training and test mean vectors have different lengths'
38
+ assert sigma1.shape == sigma2.shape, \
39
+ 'Training and test covariances have different dimensions'
40
+
41
+ diff = mu1 - mu2
42
+
43
+ # Product might be almost singular
44
+ covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
45
+ if not np.isfinite(covmean).all():
46
+ msg = ('fid calculation produces singular product; '
47
+ 'adding %s to diagonal of cov estimates') % eps
48
+ print(msg)
49
+ offset = np.eye(sigma1.shape[0]) * eps
50
+ covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
51
+
52
+ # Numerical error might give slight imaginary component
53
+ if np.iscomplexobj(covmean):
54
+ if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
55
+ m = np.max(np.abs(covmean.imag))
56
+ raise ValueError('Imaginary component {}'.format(m))
57
+ covmean = covmean.real
58
+
59
+ tr_covmean = np.trace(covmean)
60
+
61
+ return (diff.dot(diff) + np.trace(sigma1) +
62
+ np.trace(sigma2) - 2 * tr_covmean)
63
+
64
+
65
+ class SIFID(object):
66
+ def __init__(self, dims=64) -> None:
67
+ block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
68
+ self.model = InceptionV3([block_idx]).cuda()
69
+ self.model.eval()
70
+ self.dims = dims
71
+
72
+ def calculate_activation_statistics(self, x):
73
+ act = self.get_activations(x)
74
+ mu = np.mean(act, axis=0)
75
+ sigma = np.cov(act, rowvar=False)
76
+ return mu, sigma
77
+
78
+ def get_activations(self, x):
79
+ # x tensor (B, C, H, W) in range [0, 1]
80
+ batch_size = x.shape[0]
81
+ with torch.no_grad():
82
+ pred = self.model(x)[0]
83
+ pred = pred.cpu().numpy()
84
+ pred = pred.transpose(0, 2, 3, 1).reshape(batch_size*pred.shape[2]*pred.shape[3],-1)
85
+ return pred
86
+
87
+ def __call__(self, x1, x2):
88
+ # x1, x2: tensors (C, H, W) in range [-1, 1]; a batch dim is added below
89
+ x1, x2 = (x1 + 1.)/2, (x2 + 1.)/2 # [-1, 1] -> [0, 1]
90
+ m1, s1 = self.calculate_activation_statistics(x1.unsqueeze(0).cuda())
91
+ m2, s2 = self.calculate_activation_statistics(x2.unsqueeze(0).cuda())
92
+ return calculate_frechet_distance(m1, s1, m2, s2)
93
+
94
+
95
+ class InceptionV3(nn.Module):
96
+ """Pretrained InceptionV3 network returning feature maps"""
97
+
98
+ # Index of default block of inception to return,
99
+ # corresponds to output of final average pooling
100
+ DEFAULT_BLOCK_INDEX = 3
101
+
102
+ # Maps feature dimensionality to their output blocks indices
103
+ BLOCK_INDEX_BY_DIM = {
104
+ 64: 0, # First max pooling features
105
+ 192: 1, # Second max pooling features
106
+ 768: 2, # Pre-aux classifier features
107
+ 2048: 3 # Final average pooling features
108
+ }
109
+
110
+ def __init__(self,
111
+ output_blocks=[DEFAULT_BLOCK_INDEX],
112
+ resize_input=False,
113
+ normalize_input=True,
114
+ requires_grad=False):
115
+ """Build pretrained InceptionV3
116
+ Parameters
117
+ ----------
118
+ output_blocks : list of int
119
+ Indices of blocks to return features of. Possible values are:
120
+ - 0: corresponds to output of first max pooling
121
+ - 1: corresponds to output of second max pooling
122
+ - 2: corresponds to output which is fed to aux classifier
123
+ - 3: corresponds to output of final average pooling
124
+ resize_input : bool
125
+ If true, bilinearly resizes input to width and height 299 before
126
+ feeding input to model. As the network without fully connected
127
+ layers is fully convolutional, it should be able to handle inputs
128
+ of arbitrary size, so resizing might not be strictly needed
129
+ normalize_input : bool
130
+ If true, scales the input from range (0, 1) to the range the
131
+ pretrained Inception network expects, namely (-1, 1)
132
+ requires_grad : bool
133
+ If true, parameters of the model require gradient. Possibly useful
134
+ for finetuning the network
135
+ """
136
+ super(InceptionV3, self).__init__()
137
+
138
+ self.resize_input = resize_input
139
+ self.normalize_input = normalize_input
140
+ self.output_blocks = sorted(output_blocks)
141
+ self.last_needed_block = max(output_blocks)
142
+
143
+ assert self.last_needed_block <= 3, \
144
+ 'Last possible output block index is 3'
145
+
146
+ self.blocks = nn.ModuleList()
147
+
148
+ inception = torchvision.models.inception_v3(pretrained=True)
149
+
150
+ # Block 0: input to maxpool1
151
+ block0 = [
152
+ inception.Conv2d_1a_3x3,
153
+ inception.Conv2d_2a_3x3,
154
+ inception.Conv2d_2b_3x3,
155
+ ]
156
+
157
+
158
+ self.blocks.append(nn.Sequential(*block0))
159
+
160
+ # Block 1: maxpool1 to maxpool2
161
+ if self.last_needed_block >= 1:
162
+ block1 = [
163
+ nn.MaxPool2d(kernel_size=3, stride=2),
164
+ inception.Conv2d_3b_1x1,
165
+ inception.Conv2d_4a_3x3,
166
+ ]
167
+ self.blocks.append(nn.Sequential(*block1))
168
+
169
+ # Block 2: maxpool2 to aux classifier
170
+ if self.last_needed_block >= 2:
171
+ block2 = [
172
+ nn.MaxPool2d(kernel_size=3, stride=2),
173
+ inception.Mixed_5b,
174
+ inception.Mixed_5c,
175
+ inception.Mixed_5d,
176
+ inception.Mixed_6a,
177
+ inception.Mixed_6b,
178
+ inception.Mixed_6c,
179
+ inception.Mixed_6d,
180
+ inception.Mixed_6e,
181
+ ]
182
+ self.blocks.append(nn.Sequential(*block2))
183
+
184
+ # Block 3: aux classifier to final avgpool
185
+ if self.last_needed_block >= 3:
186
+ block3 = [
187
+ inception.Mixed_7a,
188
+ inception.Mixed_7b,
189
+ inception.Mixed_7c,
190
+ ]
191
+ self.blocks.append(nn.Sequential(*block3))
192
+
193
+ if self.last_needed_block >= 4:
194
+ block4 = [
195
+ nn.AdaptiveAvgPool2d(output_size=(1, 1))
196
+ ]
197
+ self.blocks.append(nn.Sequential(*block4))
198
+
199
+ for param in self.parameters():
200
+ param.requires_grad = requires_grad
201
+
202
+ def forward(self, inp):
203
+ """Get Inception feature maps
204
+ Parameters
205
+ ----------
206
+ inp : torch.autograd.Variable
207
+ Input tensor of shape Bx3xHxW. Values are expected to be in
208
+ range (0, 1)
209
+ Returns
210
+ -------
211
+ List of torch.autograd.Variable, corresponding to the selected output
212
+ block, sorted ascending by index
213
+ """
214
+ outp = []
215
+ x = inp
216
+
217
+ if self.resize_input:
218
+ x = F.interpolate(x,
219
+ size=(299, 299),
220
+ mode='bilinear',
221
+ align_corners=False)
222
+
223
+ if self.normalize_input:
224
+ x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1)
225
+
226
+ for idx, block in enumerate(self.blocks):
227
+ x = block(x)
228
+ if idx in self.output_blocks:
229
+ outp.append(x)
230
+
231
+ if idx == self.last_needed_block:
232
+ break
233
+
234
+ return outp
235
+
236
+ if __name__ == '__main__':
237
+ tform = transforms.Compose([transforms.Resize((256,256)),
238
+ transforms.ToTensor(),
239
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])])
240
+ im1 = Image.open('test1.jpg')
241
+ im2 = Image.open('test2.jpg')
242
+ im1 = tform(im1) # 3xHxW in [-1, 1]
243
+ im2 = tform(im2)
244
+ sifid_model = SIFID()
245
+ sifid_score = sifid_model(im1, im2)
246
+ print(sifid_score)
tools/slack_bot.py ADDED
@@ -0,0 +1,157 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ slack_bot.py
5
+ Created on May 02 2020 11:02
6
+ a bot to send message/image during program run
7
+ @author: Tu Bui tu@surrey.ac.uk
8
+ """
9
+
10
+ from __future__ import absolute_import
11
+ from __future__ import division
12
+ from __future__ import print_function
13
+ import os
14
+ import sys
15
+ import requests
16
+ import socket
17
+ from slack import WebClient
18
+ from slack.errors import SlackApiError
19
+ import threading
20
+
21
+
22
+ SLACK_MAX_PRINT_ERROR = 3
23
+ SLACK_ERROR_CODE = {'not_active': 1,
24
+ 'API': 2}
25
+
26
+
27
+ def welcome_message():
28
+ hostname = socket.gethostname()
29
+ all_args = ' '.join(sys.argv)
30
+ out_text = 'On server {}: {}\n'.format(hostname, all_args)
31
+ return out_text
32
+
33
+
34
+ class Notifier(object):
35
+ """
36
+ A slack bot to send text/image to a given workspace channel.
37
+ This class initializes with a text file as input, the text file should contain 2 lines:
38
+ slack token
39
+ slack channel
40
+
41
+ Usage:
42
+ msg = Notifier(token_file)
43
+ msg.send_initial_text(' '.join(sys.argv))
44
+ msg.send_text('hi, this text is inside slack thread')
45
+ msg.send_file(your_file, 'file title')
46
+ """
47
+ def __init__(self, token_file):
48
+ """
49
+ setup slack
50
+ :param token_file: path to slack token file
51
+ """
52
+ self.active = True
53
+ self.thread_id = None
54
+ self.counter = 0 # count number of errors during Web API call
55
+ if not os.path.exists(token_file):
56
+ print('[SLACK] token file not found. You will not be notified.')
57
+ self.active = False
58
+ else:
59
+ try:
60
+ with open(token_file, 'r') as f:
61
+ lines = f.readlines()
62
+ self.token = lines[0].strip()
63
+ self.channel = lines[1].strip()
64
+ except Exception as e:
65
+ print(e)
66
+ print('[SLACK] fail to read token file. You will not be notified.')
67
+ self.active = False
68
+
69
+ def _handel_error(self, e):
70
+ assert e.response["ok"] is False
71
+ assert e.response["error"] # str like 'invalid_auth', 'channel_not_found'
72
+ self.counter += 1
73
+ if self.counter <= SLACK_MAX_PRINT_ERROR:
74
+ print(f"Got the following error, you will not be notified: {e.response['error']}")
75
+
76
+ def send_init_text(self, text=None):
77
+ """
78
+ start a new thread with a main message and register the thread id
79
+ :param text: initial message for this thread
80
+ :return:
81
+ """
82
+ if not self.active:
83
+ return SLACK_ERROR_CODE['not_active']
84
+ try:
85
+ if text is None:
86
+ text = welcome_message()
87
+ sc = WebClient(self.token)
88
+ response = sc.chat_postMessage(channel=self.channel, text=text)
89
+ self.thread_id = response['ts']
90
+ except SlackApiError as e:
91
+ self._handel_error(e)
92
+ return SLACK_ERROR_CODE['API']
93
+ print('[SLACK] sent initial text. Chat ID %s. Message %s' % (self.thread_id, text))
94
+ return 0
95
+
96
+ def send_init_file(self, file_path, title=''):
97
+ """
98
+ start a new thread with a file and register thread id
99
+ :param file_path: path to file
100
+ :param title: title of this file
101
+ :return: 0 if success otherwise error code
102
+ """
103
+ if not self.active:
104
+ return SLACK_ERROR_CODE['not_active']
105
+ try:
106
+ response = sc.files_upload(title=title, channels=self.channel, file=file_path)
107
+ self.thread_id = response['ts']
108
+ except SlackApiError as e:
109
+ self._handel_error(e)
110
+ return SLACK_ERROR_CODE['API']
111
+ print('[SLACK] sent initial file. Chat ID %s.' % self.thread_id)
112
+ return 0
113
+
114
+ def send_text(self, text, reply_broadcast=False):
115
+ """
116
+ send text as a thread if one is registered in self.thread_id.
117
+ Otherwise send as a new message
118
+ :param text: message to send.
119
+ :return: 0 if success, error code otherwise
120
+ """
121
+ print(text)
122
+ if not self.active:
123
+ return SLACK_ERROR_CODE['not_active']
124
+ if self.thread_id is None:
125
+ self.send_init_text(text)
126
+ else:
127
+ try:
128
+ sc = WebClient(self.token)
129
+ response = sc.chat_postMessage(channel=self.channel, text=text,
130
+ thread_ts=self.thread_id, as_user=True,
131
+ reply_broadcast=reply_broadcast)
132
+ except SlackApiError as e:
133
+ self._handel_error(e)
134
+ return SLACK_ERROR_CODE['API']
135
+ return 0
136
+
137
+ def _send_file(self, file_path, title='', reply_broadcast=False):
138
+ """can be multithread target"""
139
+ try:
140
+ sc = WebClient(self.token)
141
+ sc.files_upload(title=title, channels=self.channel,
142
+ thread_ts=self.thread_id, file=file_path,
143
+ reply_broadcast=reply_broadcast)
144
+ except SlackApiError as e:
145
+ self._handel_error(e)
146
+ return SLACK_ERROR_CODE['API']
147
+ return 0
148
+
149
+ def send_file(self, file_path, title='', reply_broadcast=False):
150
+ if not self.active:
151
+ return SLACK_ERROR_CODE['not_active']
152
+ if self.thread_id is None:
153
+ return self.send_init_file(file_path, title)
154
+ else:
155
+ os_thread = threading.Thread(target=self._send_file, args=(file_path, title, reply_broadcast))
156
+ os_thread.start()
157
+ return 0 # may still have error if _send_file() fails
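
A sketch of how the Notifier above is typically used; the token file path, channel and file names are placeholders (per the class docstring, the token file holds the Slack token on its first line and the channel on its second):

bot = Notifier('slack_token.txt')             # line 1: xoxb-... token, line 2: channel
bot.send_init_text('training started')        # opens a new thread
bot.send_text('epoch 1 done, loss=0.12')      # replies inside the thread
bot.send_file('loss_curve.png', 'loss curve') # uploads a file to the same thread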